Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial support for Italian publishers, starting with La Repubblica #670

Merged
merged 12 commits into from
Jan 2, 2025
Prev Previous commit
Next Next commit
Adding dynamic sitemap generation for La Repubblica
  • Loading branch information
ruggsea committed Dec 30, 2024
commit f76166438c35a0cc47fea2df2786397b8b4ea7d5
22 changes: 18 additions & 4 deletions src/fundus/publishers/it/__init__.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,29 @@
from datetime import datetime, timedelta

from fundus.publishers.base_objects import Publisher, PublisherGroup
from fundus.publishers.it.la_repubblica import LaRepubblicaParser
from fundus.scraping.url import RSSFeed, Sitemap

def _monthly_sitemap_urls(first_month: str, last_month: str) -> list:
    """Return La Repubblica's monthly sitemap URLs, newest month first.

    URLs have the form https://www.repubblica.it/sitemap-<year>-<month>.xml,
    e.g. https://www.repubblica.it/sitemap-2000-01.xml.

    Args:
        first_month: Oldest month to include, formatted as "YYYY-MM".
        last_month: Newest month to include, formatted as "YYYY-MM".

    Returns:
        One URL per month in the inclusive range, ordered from ``last_month``
        down to ``first_month``. Empty if ``last_month`` precedes ``first_month``.
    """
    first_year, first_mon = (int(part) for part in first_month.split("-"))
    last_year, last_mon = (int(part) for part in last_month.split("-"))
    # Number the months linearly so ranges crossing a year boundary need no special casing.
    first_index = first_year * 12 + (first_mon - 1)
    last_index = last_year * 12 + (last_mon - 1)
    urls = []
    for index in range(last_index, first_index - 1, -1):
        year, month_offset = divmod(index, 12)
        # month needs to be in the format 01, 02, 03, etc.
        urls.append(f"https://www.repubblica.it/sitemap-{year}-{month_offset + 1:02d}.xml")
    return urls


start_month = "2020-01"
# end month is the next month; computed with calendar arithmetic because adding
# 30 days can stay inside the current month (Jan 1) or skip one (Jan 31).
_today = datetime.now()
_next_month_index = _today.year * 12 + _today.month  # (month - 1) + 1
end_month = f"{_next_month_index // 12}-{_next_month_index % 12 + 1:02d}"

# Only months between start_month and end_month are listed (newest first), so no
# sitemap URLs are generated for months that lie beyond end_month.
sitemap_urls = _monthly_sitemap_urls(start_month, end_month)
MaxDall marked this conversation as resolved.
Show resolved Hide resolved


class IT(metaclass=PublisherGroup):
    # Publisher group for Italian publishers; currently only La Repubblica.
    LaRepubblica = Publisher(
        name="La Repubblica",
        domain="https://www.repubblica.it",
        parser=LaRepubblicaParser,
        # `sources` may only be passed once: the generated monthly sitemaps
        # (taken in the order of `sitemap_urls`) replace the former static
        # sitemap-n.xml entry, and the homepage RSS feed is appended after them.
        sources=[Sitemap(sitemap_url, reverse=False, recursive=False) for sitemap_url in sitemap_urls]
        + [RSSFeed("https://www.repubblica.it/rss/homepage/rss2.0.xml")],
    )
3 changes: 2 additions & 1 deletion src/fundus/publishers/it/la_repubblica.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,4 +59,5 @@ def topics(self) -> List[str]:
def free_access(self) -> bool:
    """Return whether the article is marked as freely accessible.

    Reads ``isAccessibleForFree`` from the schema.org ``NewsArticle`` linked
    data. Returns ``False`` when the property is absent.
    """
    is_free = self.precomputed.ld.xpath_search("//NewsArticle/isAccessibleForFree")
    if not is_free:
        # Property missing: avoid indexing into an empty result.
        return False
    # bool("False") is True, so compare the text instead of relying on truthiness.
    # str() + casefold accepts a real boolean as well as "True"/"true" strings.
    # NOTE(review): confirm which of these forms xpath_search actually yields.
    return str(is_free[0]).strip().casefold() == "true"
Loading