Skip to content

Commit

Permalink
Added Krautreporter
Browse files Browse the repository at this point in the history
  • Loading branch information
dkm1006 committed Aug 22, 2024
1 parent e55ecb6 commit 56fb213
Show file tree
Hide file tree
Showing 6 changed files with 289 additions and 0 deletions.
15 changes: 15 additions & 0 deletions docs/supported_publishers.md
Original file line number Diff line number Diff line change
Expand Up @@ -533,6 +533,21 @@
</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>Krautreporter</code>
</td>
<td>
<div>Krautreporter</div>
</td>
<td>
<a href="https://krautreporter.de/">
<span>krautreporter.de</span>
</a>
</td>
<td>&#160;</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>MitteldeutscheZeitung</code>
Expand Down
16 changes: 16 additions & 0 deletions src/fundus/publishers/de/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from .hessenschau import HessenschauParser
from .junge_welt import JungeWeltParser
from .kicker import KickerParser
from .krautreporter import KrautreporterParser
from .mdr import MDRParser
from .merkur import MerkurParser
from .morgenpost_berlin import BerlinerMorgenpostParser
Expand Down Expand Up @@ -463,6 +464,21 @@ class DE(metaclass=PublisherGroup):
url_filter=regex_filter("/slideshow|/video"),
)

Krautreporter = Publisher(
    name="Krautreporter",
    domain="https://krautreporter.de/",
    parser=KrautreporterParser,
    sources=[
        # NOTE: robots.txt mentions that it reserves the right of use for text & data mining (§ 44 b UrhG),
        # but this is not done in machine readable format, so it is null & void
        # TODO: Maybe we have to implement a sitemap_filter here (archiv and alle-artikel)
        # NOTE: both sitemap and news are identical, so the NewsMap below stays disabled
        Sitemap("https://krautreporter.de/sitemap.xml", reverse=True, sitemap_filter=regex_filter("archiv")),
        # NewsMap("https://krautreporter.de/news.xml"),
        RSSFeed("https://krautreporter.de/feeds.rss"),
    ],
)

FrankfurterRundschau = Publisher(
name="Frankfurter Rundschau",
domain="https://www.fr.de",
Expand Down
58 changes: 58 additions & 0 deletions src/fundus/publishers/de/krautreporter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import json
from datetime import datetime
from typing import List, Optional

from lxml.cssselect import CSSSelector

from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute, utility


class KrautreporterParser(ParserProxy):
    """Parser proxy for articles from Krautreporter (krautreporter.de)."""

    class V1(BaseParser):
        # CSS selectors for the article page markup.
        _summary_selector = CSSSelector("p.article-headers-standard__teaser")
        _subheadline_selector = CSSSelector("div.article-markdown > h2")
        _paragraph_selector = CSSSelector("div.article-markdown > p")
        _json_ld_selector = CSSSelector('script[type="application/ld+json"]')
        _topic_selector = CSSSelector("div.article-headers-shared-topic")

        @attribute
        def title(self) -> Optional[str]:
            """Return the article title from the ``og:title`` meta tag, if present."""
            return self.precomputed.meta.get("og:title")

        @attribute
        def body(self) -> ArticleBody:
            """Extract teaser, subheadlines and paragraphs from the article markup."""
            article_body = utility.extract_article_body_with_selector(
                self.precomputed.doc,
                summary_selector=self._summary_selector,
                subheadline_selector=self._subheadline_selector,
                paragraph_selector=self._paragraph_selector,
            )
            return article_body

        @attribute
        def authors(self) -> List[str]:
            """Parse the ``author`` meta tag into a list of author names."""
            author_string = self.precomputed.meta.get("author")
            return utility.generic_author_parsing(author_string)

        @attribute
        def publishing_date(self) -> Optional[datetime]:
            """Read ``datePublished`` from the first node of the JSON-LD ``@graph``.

            Returns None when the page carries no usable JSON-LD graph.
            """
            graph = self._get_json_ld_dict().get("@graph", [])
            if not graph:
                # No JSON-LD graph on the page -> no publishing date available.
                return None
            return utility.generic_date_parsing(graph[0].get("datePublished"))

        @attribute
        def topics(self) -> List[str]:
            """Parse the topic element's text into a list of topics; empty if absent."""
            topic_elements = self._topic_selector(self.precomputed.doc)
            if not topic_elements:
                return []
            return utility.generic_topic_parsing(topic_elements[0].text_content())

        def _get_json_ld_dict(self) -> dict:
            """Return the page's JSON-LD as a dict, or an empty dict if absent.

            Since the JSON-LD is wrapped in a CDATA block we need to strip the
            CDATA markers before parsing.
            """
            # NOTE: Maybe cleaner to override BaseParser._base_setup (also because of free_access attribute)
            json_ld_elements = self._json_ld_selector(self.precomputed.doc.head)
            if not json_ld_elements:
                return {}
            json_ld_string = json_ld_elements[0].text_content()
            json_ld_string = json_ld_string.replace("//<![CDATA[", "").replace("//]]>", "")
            return json.loads(json_ld_string)
Loading

0 comments on commit 56fb213

Please sign in to comment.