Skip to content

Commit

Permalink
add LesEchos
Browse files Browse the repository at this point in the history
  • Loading branch information
MaxDall committed Jul 18, 2024
1 parent 72e7ff0 commit 248a497
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 0 deletions.
15 changes: 15 additions & 0 deletions docs/supported_publishers.md
Original file line number Diff line number Diff line change
Expand Up @@ -899,6 +899,21 @@
<td>&#160;</td>
<td>&#160;</td>
</tr>
<tr>
<td>
<code>LesEchos</code>
</td>
<td>
<div>Les &#201;chos</div>
</td>
<td>
<a href="https://www.lesechos.fr/">
<span>www.lesechos.fr</span>
</a>
</td>
<td>&#160;</td>
<td>&#160;</td>
</tr>
</tbody>
</table>

Expand Down
6 changes: 6 additions & 0 deletions src/fundus/parser/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,12 @@ def strip_nodes_to_text(text_nodes: List[lxml.html.HtmlElement], join_on: str =
return join_on.join(([re.sub(r"\n+", " ", node.text_content()) for node in text_nodes])).strip()


def generic_nodes_to_text(nodes: List[lxml.html.HtmlElement]) -> List[str]:
    """Return the text content of every node as a list of plain strings.

    Args:
        nodes: The elements to read text from. May be empty.

    Returns:
        One string per node, in input order, each obtained from the node's
        ``text_content()``. An empty list if ``nodes`` is empty (or falsy).
    """
    if not nodes:
        return []
    # Apply str() to each node's text individually. Wrapping the whole
    # generator expression in str() would yield a single
    # "<generator object ...>" entry instead of the actual texts.
    return [str(node.text_content()) for node in nodes]


def apply_substitution_pattern_over_list(
input_list: List[str], pattern: Pattern[str], replacement: Union[str, Callable[[Match[str]], str]] = ""
) -> List[str]:
Expand Down
11 changes: 11 additions & 0 deletions src/fundus/publishers/fr/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from ..shared import EuronewsParser
from .le_figaro import LeFigaroParser
from .le_monde import LeMondeParser
from .les_echos import LesEchosParser


class FR(metaclass=PublisherGroup):
Expand Down Expand Up @@ -36,3 +37,13 @@ class FR(metaclass=PublisherGroup):
NewsMap("https://www.lefigaro.fr/sitemap_news.xml"),
],
)

# Publisher entry for Les Échos (www.lesechos.fr); articles are parsed with LesEchosParser.
LesEchos = Publisher(
name="Les Échos",
domain="https://www.lesechos.fr/",
parser=LesEchosParser,
sources=[
# Two URL sources are registered: the sitemap index and the news sitemap.
# NOTE(review): reverse=True presumably changes the order in which the sitemap index
# is traversed -- confirm against the Sitemap implementation.
Sitemap("https://sitemap.lesechos.fr/sitemap_index.xml", reverse=True),
NewsMap("https://www.lesechos.fr/sitemap_news.xml"),
],
)
57 changes: 57 additions & 0 deletions src/fundus/publishers/fr/les_echos.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import datetime
from typing import List, Optional

from lxml.cssselect import CSSSelector
from lxml.etree import XPath

from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
from fundus.parser.utility import (
extract_article_body_with_selector,
generic_author_parsing,
generic_date_parsing,
generic_nodes_to_text,
)


class LesEchosParser(ParserProxy):
    """Parser proxy for articles published by Les Échos (lesechos.fr)."""

    class V1(BaseParser):
        """Extracts body, title, topics, publishing date and authors of an article."""

        # Article layout: summary paragraphs sit in the header, subheadlines
        # and paragraphs inside the `post-paywall` container.
        _summary_selector = CSSSelector("article header > p")
        _subheadline_selector = CSSSelector("article div.post-paywall > h3")

        # Paragraphs whose text matches this pattern are excluded by the
        # paragraph selector below.
        _bloat_regex_ = r"^Pour ne rien rater de l'actualité politique"

        # The EXSLT `re` namespace provides `re:test` for the regex filter.
        _paragraph_selector = XPath(
            f'//article //div[contains(@class, "post-paywall")] /p[not(re:test(string(), "{_bloat_regex_}"))]',
            namespaces={"re": "http://exslt.org/regular-expressions"},
        )

        _topic_selector = CSSSelector("header div.sc-108qdzy-3 div.sc-108qdzy-2 > div")

        @attribute
        def body(self) -> ArticleBody:
            """Build the article body from summary, subheadline and paragraph nodes."""
            document = self.precomputed.doc
            return extract_article_body_with_selector(
                document,
                summary_selector=self._summary_selector,
                subheadline_selector=self._subheadline_selector,
                paragraph_selector=self._paragraph_selector,
            )

        @attribute
        def title(self) -> Optional[str]:
            """Return the `og:title` meta value, or None when it is absent."""
            meta = self.precomputed.meta
            return meta.get("og:title")

        @attribute
        def topics(self) -> List[str]:
            """Return the texts of the nodes matched by the topic selector."""
            return generic_nodes_to_text(self._topic_selector(self.precomputed.doc))

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Parse the publishing date from meta, falling back to the LD data."""
            raw_date = self.precomputed.meta.get("article:published_time")
            if not raw_date:
                # Meta tag missing or empty: use the linked-data entry instead.
                raw_date = self.precomputed.ld.bf_search("datePublished")
            return generic_date_parsing(raw_date)

        @attribute
        def authors(self) -> List[str]:
            """Parse the author names from the `author` entry of the LD data."""
            author_entry = self.precomputed.ld.bf_search("author")
            return generic_author_parsing(author_entry)

0 comments on commit 248a497

Please sign in to comment.