Made xpath search return a scalar if possible, added name in the xpath to get topics, modified the dynamic sitemap handling to conform to the Tagesspiegel implementation, removed redundant free access check
flairNLP · MaxDall · Jan 2, 2025 · Dec 30, 2024 · Dec 30, 2024 · Dec 30, 2024
commit 06f2dcc6fade740177f837ce32d7cf902c5664f8
diff --git a/src/fundus/publishers/it/__init__.py b/src/fundus/publishers/it/__init__.py
@@ -1,29 +1,25 @@
 from datetime import datetime, timedelta
 
+from dateutil.rrule import MONTHLY, rrule
+
 from fundus.publishers.base_objects import Publisher, PublisherGroup
 from fundus.publishers.it.la_repubblica import LaRepubblicaParser
 from fundus.scraping.url import RSSFeed, Sitemap
 
-start_month = "2020-01"
-# end month is the next month
-end_month = (datetime.now() + timedelta(days=30)).strftime("%Y-%m")
-
-sitemap_urls = []
-# urls in the format https://www.repubblica.it/sitemap-<year>-<month>.xml
-# like https://www.repubblica.it/sitemap-2000-01.xml
-for year in range(int(start_month.split("-")[0]), int(end_month.split("-")[0]) + 1):
-    for month in range(1, 13):
-        # month needs to be in the format 01, 02, 03, etc.
-        month_str = f"{month:02d}"
-        sitemap_urls.append(f"https://www.repubblica.it/sitemap-{year}-{month_str}.xml")
-sitemap_urls.reverse()
-
 
 class IT(metaclass=PublisherGroup):
     LaRepubblica = Publisher(
         name="La Repubblica",
         domain="https://www.repubblica.it",
         parser=LaRepubblicaParser,
-        sources=[Sitemap(sitemap_url, reverse=False, recursive=False) for sitemap_url in sitemap_urls]
-        + [RSSFeed("https://www.repubblica.it/rss/homepage/rss2.0.xml")],
+        sources=[
+            RSSFeed("https://www.repubblica.it/rss/homepage/rss2.0.xml"),
+        ]
+        + [
+            # Monthly sitemaps (sitemap-<year>-<month>.xml) from 2020-01 up to 30 days ahead, newest first.
+            Sitemap(f"https://www.repubblica.it/sitemap-{date.strftime('%Y-%m')}.xml")
+            for date in reversed(
+                list(rrule(MONTHLY, dtstart=datetime(2020, 1, 1), until=datetime.now() + timedelta(days=30)))
+            )
+        ],
     )
diff --git a/src/fundus/publishers/it/la_repubblica.py b/src/fundus/publishers/it/la_repubblica.py
@@ -43,21 +43,12 @@ def authors(self) -> List[str]:
 
         @attribute
         def publishing_date(self) -> Optional[datetime]:
-            # Extract publishing date from schema.org NewsArticle data
-            date_str = self.precomputed.ld.xpath_search("//NewsArticle/datePublished")
-            return generic_date_parsing(date_str[0] if date_str else None)
+            # Read NewsArticle/datePublished from the JSON-LD; scalar=True requests a single value, not a list
+            date_str = self.precomputed.ld.xpath_search("//NewsArticle/datePublished", scalar=True)
+            return generic_date_parsing(date_str)
 
         @attribute
         def topics(self) -> List[str]:
-            # Extract topics from schema.org NewsArticle data
-            topics = self.precomputed.ld.xpath_search("//NewsArticle/about")
-            if topics:
-                return generic_topic_parsing([topic.get("name") for topic in topics if topic.get("name")])
-            return []
-
-        @attribute
-        def free_access(self) -> bool:
-            # Check if article is freely accessible from schema.org NewsArticle data
-            is_free = self.precomputed.ld.xpath_search("//NewsArticle/isAccessibleForFree")
-            free = True if is_free[0] == "True" else False
-            return free
+            # Select the "name" of each NewsArticle/about entry in the XPath itself; no match yields an empty list
+            topics = self.precomputed.ld.xpath_search("//NewsArticle/about/name")
+            return generic_topic_parsing(topics) if topics else []