Skip to content

Commit

Permalink
Added Krautreporter
Browse files Browse the repository at this point in the history
  • Loading branch information
dkm1006 committed Aug 22, 2024
1 parent e55ecb6 commit 56fb213
Show file tree
Hide file tree
Showing 6 changed files with 289 additions and 0 deletions.
15 changes: 15 additions & 0 deletions docs/supported_publishers.md
Original file line number Diff line number Diff line change
Expand Up @@ -533,6 +533,21 @@
</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>Krautreporter</code>
</td>
<td>
<div>Krautreporter</div>
</td>
<td>
<a href="https://krautreporter.de/">
<span>krautreporter.de</span>
</a>
</td>
<td>&#160;</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>MitteldeutscheZeitung</code>
Expand Down
16 changes: 16 additions & 0 deletions src/fundus/publishers/de/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from .hessenschau import HessenschauParser
from .junge_welt import JungeWeltParser
from .kicker import KickerParser
from .krautreporter import KrautreporterParser
from .mdr import MDRParser
from .merkur import MerkurParser
from .morgenpost_berlin import BerlinerMorgenpostParser
Expand Down Expand Up @@ -463,6 +464,21 @@ class DE(metaclass=PublisherGroup):
url_filter=regex_filter("/slideshow|/video"),
)

Krautreporter = Publisher(
    name="Krautreporter",
    domain="https://krautreporter.de/",
    parser=KrautreporterParser,
    sources=[
        # NOTE: robots.txt mentions that it reserves the right of use for text & data mining (§ 44 b UrhG),
        # but this is not done in machine readable format, so it is null & void
        # TODO: Maybe we have to implement a sitemap_filter here (archiv and alle-artikel)
        # NOTE: both sitemap and news are identical, so the NewsMap below stays disabled
        Sitemap("https://krautreporter.de/sitemap.xml", reverse=True, sitemap_filter=regex_filter("archiv")),
        # NewsMap("https://krautreporter.de/news.xml"),
        RSSFeed("https://krautreporter.de/feeds.rss"),
    ],
)

FrankfurterRundschau = Publisher(
name="Frankfurter Rundschau",
domain="https://www.fr.de",
Expand Down
58 changes: 58 additions & 0 deletions src/fundus/publishers/de/krautreporter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import json
from datetime import datetime
from typing import List, Optional

from lxml.cssselect import CSSSelector

from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute, utility


class KrautreporterParser(ParserProxy):
    """Parser proxy for articles from Krautreporter (krautreporter.de)."""

    class V1(BaseParser):
        # CSS selectors for the article page markup.
        _summary_selector = CSSSelector("p.article-headers-standard__teaser")
        _subheadline_selector = CSSSelector("div.article-markdown > h2")
        _paragraph_selector = CSSSelector("div.article-markdown > p")
        _json_ld_selector = CSSSelector('script[type="application/ld+json"]')
        _topic_selector = CSSSelector("div.article-headers-shared-topic")

        @attribute
        def title(self) -> Optional[str]:
            """Return the article title from the ``og:title`` meta tag, if present."""
            return self.precomputed.meta.get("og:title")

        @attribute
        def body(self) -> ArticleBody:
            """Extract teaser, subheadlines and paragraphs from the article markup."""
            article_body = utility.extract_article_body_with_selector(
                self.precomputed.doc,
                summary_selector=self._summary_selector,
                subheadline_selector=self._subheadline_selector,
                paragraph_selector=self._paragraph_selector,
            )
            return article_body

        @attribute
        def authors(self) -> List[str]:
            """Parse the ``author`` meta tag into a list of author names."""
            author_string = self.precomputed.meta.get("author")
            return utility.generic_author_parsing(author_string)

        @attribute
        def publishing_date(self) -> Optional[datetime]:
            """Read ``datePublished`` from the first node of the JSON-LD ``@graph``.

            Returns None when the page carries no usable JSON-LD graph.
            """
            graph = self._get_json_ld_dict().get("@graph", [])
            if not graph:
                # No JSON-LD graph on the page -> no publishing date available.
                return None
            return utility.generic_date_parsing(graph[0].get("datePublished"))

        @attribute
        def topics(self) -> List[str]:
            """Parse the topic element's text into a list of topics; empty if absent."""
            topic_elements = self._topic_selector(self.precomputed.doc)
            if not topic_elements:
                return []
            return utility.generic_topic_parsing(topic_elements[0].text_content())

        def _get_json_ld_dict(self) -> dict:
            """Return the page's JSON-LD as a dict, or an empty dict if absent.

            Since the JSON-LD is wrapped in a CDATA block we need to strip the
            CDATA markers before parsing.
            """
            # NOTE: Maybe cleaner to override BaseParser._base_setup (also because of free_access attribute)
            json_ld_elements = self._json_ld_selector(self.precomputed.doc.head)
            if not json_ld_elements:
                return {}
            json_ld_string = json_ld_elements[0].text_content()
            json_ld_string = json_ld_string.replace("//<![CDATA[", "").replace("//]]>", "")
            return json.loads(json_ld_string)
Loading

0 comments on commit 56fb213

Please sign in to comment.