Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial support for Italian publishers, starting with La Repubblica #670

Merged
merged 12 commits into from
Jan 2, 2025
Prev Previous commit
Next Next commit
Made xpath search return a scalar if possible, added name in the xpath to get topics, modified the dynamic sitemap handling to conform to the Tagesspiegel implementation, removed redundant free access check
  • Loading branch information
ruggsea committed Dec 30, 2024
commit 06f2dcc6fade740177f837ce32d7cf902c5664f8
30 changes: 13 additions & 17 deletions src/fundus/publishers/it/__init__.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,25 @@
from datetime import datetime, timedelta
from dateutil.rrule import MONTHLY, rrule

from fundus.publishers.base_objects import Publisher, PublisherGroup
from fundus.publishers.it.la_repubblica import LaRepubblicaParser
from fundus.scraping.url import RSSFeed, Sitemap

# NOTE(review): this block appears to be the pre-change sitemap construction
# shown by the diff view (it feeds the list comprehension over `sitemap_urls`
# in the first `sources=` argument further down) — confirm.
# Only the year component of `start_month` / `end_month` is used by the loop.
start_month = "2020-01"
# "next month" is approximated as now + 30 days, formatted as YYYY-MM
end_month = (datetime.now() + timedelta(days=30)).strftime("%Y-%m")

sitemap_urls = []
# urls in the format https://www.repubblica.it/sitemap-<year>-<month>.xml
# like https://www.repubblica.it/sitemap-2000-01.xml
# Every year from the start year through the end year (inclusive) is covered.
for year in range(int(start_month.split("-")[0]), int(end_month.split("-")[0]) + 1):
# All twelve months are emitted for each year, so the final year also gets
# URLs for months later than `end_month`.
for month in range(1, 13):
# month needs to be in the format 01, 02, 03, etc.
month_str = f"{month:02d}"
sitemap_urls.append(f"https://www.repubblica.it/sitemap-{year}-{month_str}.xml")
# Flip the list so the most recent year/month comes first.
sitemap_urls.reverse()


# Publisher group (metaclass PublisherGroup) holding the La Repubblica publisher
# definition: display name, domain, parser class and the list of URL sources.
class IT(metaclass=PublisherGroup):
LaRepubblica = Publisher(
name="La Repubblica",
domain="https://www.repubblica.it",
parser=LaRepubblicaParser,
# NOTE(review): two `sources=` arguments are visible because this is a diff
# rendering without +/- markers. The first one (one Sitemap per entry of
# `sitemap_urls`, followed by the RSS feed) looks like the removed version — confirm.
sources=[Sitemap(sitemap_url, reverse=False, recursive=False) for sitemap_url in sitemap_urls]
+ [RSSFeed("https://www.repubblica.it/rss/homepage/rss2.0.xml")],
# Second variant: the RSS feed first, then one Sitemap per month generated by
# rrule(MONTHLY) from 2020-01-01 up to now + 30 days, reversed so the newest
# month comes first. Sitemap is called with its default arguments here.
sources=[
RSSFeed("https://www.repubblica.it/rss/homepage/rss2.0.xml"),
] + [
Sitemap(f"https://www.repubblica.it/sitemap-{date.strftime('%Y-%m')}.xml")
for date in reversed(list(rrule(
MONTHLY,
dtstart=datetime(2020, 1, 1),
until=datetime.now()+timedelta(days=30)
)))
],
)

# NOTE(review): prints the publisher's source mapping; this looks like leftover
# debug output that would run whenever the module is imported — confirm and
# consider removing before release.
print(IT.LaRepubblica.source_mapping)
21 changes: 6 additions & 15 deletions src/fundus/publishers/it/la_repubblica.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,21 +43,12 @@ def authors(self) -> List[str]:

@attribute
def publishing_date(self) -> Optional[datetime]:
# Parses the article's publishing date from the NewsArticle `datePublished`
# entry of the precomputed linked data.
# NOTE(review): two interleaved variants of the body are shown (diff view
# without +/- markers); the first pair of statements looks like the removed one.
# Extract publishing date from schema.org NewsArticle data
date_str = self.precomputed.ld.xpath_search("//NewsArticle/datePublished")
# Variant 1: take the first match, or pass None when the search result is empty.
return generic_date_parsing(date_str[0] if date_str else None)
# Use scalar parameter for direct value
# Variant 2: the result is forwarded unchanged; presumably `scalar=True` makes
# xpath_search return a single value (or None) instead of a list — confirm.
date_str = self.precomputed.ld.xpath_search("//NewsArticle/datePublished", scalar=True)
return generic_date_parsing(date_str)

@attribute
def topics(self) -> List[str]:
# Returns the article topics taken from the NewsArticle `about` entries of the
# precomputed linked data, or an empty list when the search yields nothing.
# Extract topics from schema.org NewsArticle data
topics = self.precomputed.ld.xpath_search("//NewsArticle/about")
if topics:
# Each entry is accessed with `.get("name")`; entries whose name is missing or
# falsy are skipped before the names are handed to generic_topic_parsing.
return generic_topic_parsing([topic.get("name") for topic in topics if topic.get("name")])
return []

@attribute
def free_access(self) -> bool:
# Reports whether the NewsArticle `isAccessibleForFree` entry equals the string "True".
# Check if article is freely accessible from schema.org NewsArticle data
is_free = self.precomputed.ld.xpath_search("//NewsArticle/isAccessibleForFree")
# NOTE(review): `is_free[0]` is indexed without an emptiness check, so an empty
# search result would presumably raise IndexError — confirm. The comparison is
# against the exact string "True"; any other value yields False.
free = True if is_free[0] == "True" else False
return free
# NOTE(review): these lines appear to be the replacement body of `topics` shown
# by the diff view — confirm. The xpath now selects the `name` child directly.
# Simplified topic extraction using name in xpath
topics = self.precomputed.ld.xpath_search("//NewsArticle/about/name")
# Falls back to an empty list when the search returns nothing.
return generic_topic_parsing(topics) if topics else []