add LesEchos

flairNLP · Jul 18, 2024 · 248a497 · 248a497
1 parent 72e7ff0
commit 248a497
Show file tree

Hide file tree

Showing 4 changed files with 89 additions and 0 deletions.
diff --git a/docs/supported_publishers.md b/docs/supported_publishers.md
@@ -899,6 +899,21 @@
       <td>&#160;</td>
       <td>&#160;</td>
     </tr>
+    <tr>
+      <td>
+        <code>LesEchos</code>
+      </td>
+      <td>
+        <div>Les &#201;chos</div>
+      </td>
+      <td>
+        <a href="https://www.lesechos.fr/">
+          <span>www.lesechos.fr</span>
+        </a>
+      </td>
+      <td>&#160;</td>
+      <td>&#160;</td>
+    </tr>
   </tbody>
 </table>
 

diff --git a/src/fundus/parser/utility.py b/src/fundus/parser/utility.py
@@ -194,6 +194,12 @@ def strip_nodes_to_text(text_nodes: List[lxml.html.HtmlElement], join_on: str =
     return join_on.join(([re.sub(r"\n+", " ", node.text_content()) for node in text_nodes])).strip()
 
 
+def generic_nodes_to_text(nodes: List[lxml.html.HtmlElement]) -> List[str]:
+    if not nodes:
+        return []
+    return [str(node.text_content() for node in nodes)]
+
+
 def apply_substitution_pattern_over_list(
     input_list: List[str], pattern: Pattern[str], replacement: Union[str, Callable[[Match[str]], str]] = ""
 ) -> List[str]:

diff --git a/src/fundus/publishers/fr/__init__.py b/src/fundus/publishers/fr/__init__.py
@@ -4,6 +4,7 @@
 from ..shared import EuronewsParser
 from .le_figaro import LeFigaroParser
 from .le_monde import LeMondeParser
+from .les_echos import LesEchosParser
 
 
 class FR(metaclass=PublisherGroup):
@@ -36,3 +37,13 @@ class FR(metaclass=PublisherGroup):
             NewsMap("https://www.lefigaro.fr/sitemap_news.xml"),
         ],
     )
+
+    # Publisher entry for Les Échos: pages are parsed with LesEchosParser and
+    # article URLs come from the two sources below (a sitemap index and a news sitemap).
+    LesEchos = Publisher(
+        name="Les Échos",
+        domain="https://www.lesechos.fr/",
+        parser=LesEchosParser,
+        sources=[
+            Sitemap("https://sitemap.lesechos.fr/sitemap_index.xml", reverse=True),
+            NewsMap("https://www.lesechos.fr/sitemap_news.xml"),
+        ],
+    )
diff --git a/src/fundus/publishers/fr/les_echos.py b/src/fundus/publishers/fr/les_echos.py
@@ -0,0 +1,57 @@
+import datetime
+from typing import List, Optional
+
+from lxml.cssselect import CSSSelector
+from lxml.etree import XPath
+
+from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
+from fundus.parser.utility import (
+    extract_article_body_with_selector,
+    generic_author_parsing,
+    generic_date_parsing,
+    generic_nodes_to_text,
+)
+
+
+class LesEchosParser(ParserProxy):
+    class V1(BaseParser):
+        _summary_selector = CSSSelector("article header > p")
+        _subheadline_selector = CSSSelector("article div.post-paywall > h3")
+
+        _bloat_regex_ = r"^Pour ne rien rater de l'actualité politique"
+
+        _paragraph_selector = XPath(
+            f'//article //div[contains(@class, "post-paywall")] /p[not(re:test(string(), "{_bloat_regex_}"))]',
+            namespaces={"re": "http://exslt.org/regular-expressions"},
+        )
+
+        _topic_selector = CSSSelector("header div.sc-108qdzy-3 div.sc-108qdzy-2 > div")
+
+        @attribute
+        def body(self) -> ArticleBody:
+            return extract_article_body_with_selector(
+                self.precomputed.doc,
+                paragraph_selector=self._paragraph_selector,
+                summary_selector=self._summary_selector,
+                subheadline_selector=self._subheadline_selector,
+            )
+
+        @attribute
+        def title(self) -> Optional[str]:
+            # Use the `get` function to retrieve data from the `meta` precomputed attribute
+            return self.precomputed.meta.get("og:title")
+
+        @attribute
+        def topics(self) -> List[str]:
+            topic_nodes = self._topic_selector(self.precomputed.doc)
+            return generic_nodes_to_text(topic_nodes)
+
+        @attribute
+        def publishing_date(self) -> Optional[datetime.datetime]:
+            return generic_date_parsing(
+                self.precomputed.meta.get("article:published_time") or self.precomputed.ld.bf_search("datePublished")
+            )
+
+        @attribute
+        def authors(self) -> List[str]:
+            return generic_author_parsing(self.precomputed.ld.bf_search("author"))