Skip to content

Commit

Permalink
add Times Of India
Browse files Browse the repository at this point in the history
  • Loading branch information
addie9800 committed Jul 19, 2024
1 parent 0e9e500 commit cf15ac9
Show file tree
Hide file tree
Showing 8 changed files with 200 additions and 15 deletions.
56 changes: 44 additions & 12 deletions docs/supported_publishers.md

Large diffs are not rendered by default.

12 changes: 9 additions & 3 deletions src/fundus/parser/utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,7 @@ def generic_author_parsing(
List[Dict[str, str]],
],
split_on: Optional[List[str]] = None,
default_authors=None,
) -> List[str]:
"""This function tries to parse the given <value> to a list of authors (List[str]) based on the type of value.
Expand All @@ -232,11 +233,16 @@ def generic_author_parsing(
value: An input value representing author(s) which get parsed based on type
split_on: Only relevant for type(<value>) = str. If set, split <value> on <split_on>,
else (default) split <value> on common delimiters
default_authors: Allows to filter out default authors like 'NewsDesk' or 'Redaktion'
Returns:
A parsed and striped list of authors
@param default_authors:
"""

if default_authors is None:
default_authors = list()

def parse_author_dict(author_dict: Dict[str, str]) -> Optional[str]:
if (author_name := author_dict.get("name")) is not None:
return author_name
Expand All @@ -262,7 +268,7 @@ def parse_author_dict(author_dict: Dict[str, str]) -> Optional[str]:
authors = list(filter(bool, re.split(r"|".join(split_on or common_delimiters), value)))

elif isinstance(value, dict):
if author := parse_author_dict(value):
if (author := parse_author_dict(value)) and author not in default_authors:
return [author]
else:
return []
Expand All @@ -274,7 +280,7 @@ def parse_author_dict(author_dict: Dict[str, str]) -> Optional[str]:

elif isinstance(value[0], dict):
value = cast(List[Dict[str, str]], value)
authors = [name for author in value if (name := parse_author_dict(author))]
authors = [name for author in value if (name := parse_author_dict(author)) and name not in default_authors]

else:
raise parameter_type_error
Expand All @@ -284,7 +290,7 @@ def parse_author_dict(author_dict: Dict[str, str]) -> Optional[str]:

authors = list(more_itertools.collapse(authors, base_type=str))

return [name.strip() for name in authors]
return [name.strip() for name in authors if name.strip() not in default_authors]


def generic_text_extraction_with_css(doc, selector: XPath) -> Optional[str]:
Expand Down
2 changes: 2 additions & 0 deletions src/fundus/publishers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from fundus.publishers.cn import CN
from fundus.publishers.de import DE
from fundus.publishers.fr import FR
from fundus.publishers.ind import IND
from fundus.publishers.lt import LT
from fundus.publishers.my import MY
from fundus.publishers.na import NA
Expand Down Expand Up @@ -59,3 +60,4 @@ class PublisherCollection(metaclass=PublisherCollectionMeta):
cn = CN
tr = TR
my = MY
ind = IND
17 changes: 17 additions & 0 deletions src/fundus/publishers/ind/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from fundus.publishers.base_objects import Publisher, PublisherGroup
from fundus.publishers.ind.times_of_india import TimesOfIndiaParser
from fundus.scraping.url import NewsMap, RSSFeed, Sitemap


class IND(metaclass=PublisherGroup):
    """Publisher group holding the Times Of India publisher specification."""

    # Times Of India: articles are parsed with TimesOfIndiaParser and URLs are
    # discovered through two sitemaps (today / yesterday) and two RSS feeds
    # (top stories / most recent).
    # NOTE(review): `domain` uses the `www.` host while every source URL below
    # uses the bare `timesofindia.indiatimes.com` host -- confirm this is intended.
    TimesOfIndia = Publisher(
        name="Times Of India",
        domain="https://www.timesofindia.indiatimes.com",
        parser=TimesOfIndiaParser,
        sources=[
            Sitemap("https://timesofindia.indiatimes.com/sitemap/today"),
            Sitemap("https://timesofindia.indiatimes.com/sitemap/yesterday"),
            RSSFeed("https://timesofindia.indiatimes.com/rssfeedstopstories.cms"),
            RSSFeed("https://timesofindia.indiatimes.com/rssfeedmostrecent.cms"),
        ],
    )
80 changes: 80 additions & 0 deletions src/fundus/publishers/ind/times_of_india.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import datetime
import re
from typing import List, Optional

from lxml.cssselect import CSSSelector
from lxml.etree import XPath
from lxml.html import fromstring, tostring

from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute
from fundus.parser.utility import (
extract_article_body_with_selector,
generic_author_parsing,
generic_date_parsing,
generic_topic_parsing,
)


class TimesOfIndiaParser(ParserProxy):
    """Parser proxy for Times Of India articles."""

    class V1(BaseParser):
        """First parser version for Times Of India article pages.

        All body selectors are anchored on the first ``<div class='_s30J clearfix '>``
        element of the document. ``body`` rewrites the serialized markup before
        selecting, so that the text inside this container ends up in ``<p>``
        elements the selectors can match.
        """

        # Sub-headlines: <b> or <h2> elements nested one <div> below the container.
        _subheadline_selector = XPath(
            "(//div[@class='_s30J clearfix '])[1]/div/b |" "(//div[@class='_s30J clearfix '])[1]/div/h2"
        )
        # Regular paragraphs: every <p> directly below the container except the intro.
        _paragraph_selector = XPath("(//div[@class='_s30J clearfix '])[1]/p[not(@class='intro')]")
        # Summary: the <p class='intro'> that `body` injects right after the container opens.
        _summary_selector = XPath("(//div[@class='_s30J clearfix '])[1]/p[@class='intro']")

        @attribute
        def body(self) -> ArticleBody:
            """Extract the article body (summary, sub-headlines and paragraphs).

            The document is serialized to a string, rewritten with a series of
            regular-expression substitutions that turn ``<br>``-separated text
            into ``<p>`` elements, re-parsed, and then handed to
            ``extract_article_body_with_selector`` together with the class-level
            selectors.

            Returns:
                The ArticleBody produced by ``extract_article_body_with_selector``.
            """
            html_as_string = tostring(self.precomputed.doc).decode("utf-8")

            # NOTE(review): the `(:?` groups below literally match an optional ':'
            # and were possibly meant to be non-capturing groups `(?:` -- the
            # patterns are kept unchanged here; confirm before altering them.

            # A closing </div> followed by a line break and <br> starts a new paragraph.
            html_as_string = re.sub(r"(</div>)((\r\n|\r|\n)<br>)", "</div><p>", html_as_string)
            # Two or three closing </div>s not directly followed by another tag
            # are followed by text, so a paragraph is opened there.
            html_as_string = re.sub(r"</div></div>(?!<)", "</div></div><p>", html_as_string)
            html_as_string = re.sub(r"</div></div></div>(?!<)", "</div></div></div><p>", html_as_string)
            # <br>, a line break and the start of a <div are replaced by a closing </p>.
            html_as_string = re.sub(r"<br>(\r\n|\r|\n)(:?<div)", "</p>", html_as_string)
            # Literal '::before' / '::after' markers become paragraph boundaries.
            html_as_string = re.sub(r"(:?::before)(\r\n|\r|\n)", "<p>", html_as_string)
            html_as_string = re.sub(r"(\r\n|\r|\n)(:?::after)", "</p>", html_as_string)
            # Every remaining <br> separates two paragraphs.
            html_as_string = re.sub(r"<br>", "</p><p>", html_as_string)
            # Open an intro paragraph right after the container's opening tag so
            # the leading text can be selected as the summary.
            html_as_string = re.sub(
                r"<div class=\"_s30J clearfix \">", "<div class=\"_s30J clearfix \"><p class='intro'>", html_as_string
            )

            # The rewritten markup is deliberately NOT written to disk: a leftover
            # debug dump to "test.html" used to create a file in the current
            # working directory for every parsed article.
            return extract_article_body_with_selector(
                fromstring(html_as_string),  # type: ignore
                summary_selector=self._summary_selector,
                paragraph_selector=self._paragraph_selector,
                subheadline_selector=self._subheadline_selector,
            )

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Return the publishing date parsed from the ld+json ``datePublished`` entry."""
            return generic_date_parsing(self.precomputed.ld.bf_search("datePublished"))

        @attribute
        def authors(self) -> List[str]:
            """Return the authors from the ld+json ``author`` entry.

            The generic TOI desk names are passed as ``default_authors`` so that
            they are filtered out of the result.
            """
            return generic_author_parsing(
                self.precomputed.ld.bf_search("author"),
                default_authors=["TOI Sports Desk", "TOI News Desk", "TOI Tech Desk"],
            )

        @attribute
        def title(self) -> Optional[str]:
            """Return the ``og:title`` meta value without its trailing site suffix.

            Everything from " - Times" or " | India" onwards is removed.
            Returns None when the meta value is missing or empty.
            """
            if title := self.precomputed.meta.get("og:title"):
                return re.sub(r"( - Times.*| \| India.*)", "", title)
            return None

        @attribute
        def topics(self) -> List[str]:
            """Return the ``news_keywords`` topics without generic site-wide keywords."""
            # Keywords listed here are dropped from the parsed topics.
            bloat_topics = [
                "India",
                "News",
                "Google News",
                "India Breaking News",
                "India news",
                "Live News India",
                "Top news in India",
            ]
            return [
                topic
                for topic in generic_topic_parsing(self.precomputed.meta.get("news_keywords"))
                if topic not in bloat_topics
            ]
42 changes: 42 additions & 0 deletions tests/resources/parser/test_data/ind/TimesOfIndia.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
{
"V1": {
"authors": [
"Chidanand Rajghatta"
],
"body": {
"summary": [
"WASHINGTON: Secluded at his home in Delaware recovering from Covid and increasingly isolated in his own party, US President Joe Biden is grimly hanging in for the November 2024 election even as Democratic operatives who profess love and admiration for him are trying to prise the party nomination from him."
],
"sections": [
{
"headline": [],
"paragraphs": [
"Biden is “absolutely” staying in the presidential race, the Democratic campaign chair Jen O’Malley Dillon insisted on MSNBC on Friday, calling him the “best person to take on Donald Trump.” The assessment is shared by very few people, and surveys and opeds are starting to point otherwise, causing more and more lawmakers and party operatives to urge the beleaguered President to bow out.",
"On Friday, four more lawmakers, including Congressman and constitutional scholar Jamie Raskin from Maryland and Sean Casten from Illinois (both heavily Democratic states from where they will get re-elected comfortably) urged Biden to stand down, their appeal prefaced by copious praise for his service.",
"Contrasting Biden (“a fundamentally kind, empathetic and decent human being”) with Trump (“a twice-impeached convicted felon and adjudicated rapist who has promised to be a ‘dictator on day one.’\") Casten nevertheless urged Biden to \"manage an exit with all the dignity and decency that has guided his half-century of public service.”",
"“Politics, like life, isn’t fair. And as long as this election is instead litigated over which candidate is more likely to be held accountable for public gaffes and ‘senior moments,’ I believe that Biden is not only going to lose but is also uniquely incapable of shifting that conversation,” Casten wrote in an oped.",
"But with no strong alternative emerging and little public confidence in his vice-president Kamala Harris taking on the mantle, a stalemate has ensued in the party, with first indications of fissures that could result in an open primary at the party convention in Chicago next month, the first such free-for-all since 1968.",
"In fact, some lawmakers, notably Alexandra Ocasio-Cortez, fear that if Biden is ousted from the nomination process, some of her colleagues would seek to jettison Kamala Harris too from the ticket. The Congressional Black Caucus and most Latino lawmakers remain strongly behind the Biden-Harris ticket.",
"If Biden decides to bow out, then his own preferred choice is Harris. He has implicitly endorsed her, saying several times that he wouldn’t have picked her as his vice president “unless I thought she was qualified to be president from the very beginning.”",
"But the handover is far from guaranteed. Unlike a resignation or departure during term from the White House, which automatically promotes the vice-president to the post, giving up the nomination (which in any case has not been formalized) does not automatically confer it on the running mate. The party will have to go through the political process that involves voting for by more than 3000 delegates to choose a nominee.",
"That process was expected to be a formality to nominate Biden. But if he drops out, then party stalwarts can first try and coalesce support around Harris, or someone else acceptable to a majority of delegates, before the DNC that begins on August 19 in Chicago.",
"If there is no consensus, then it will result in an open convention in Chicago where an actual, truly democratic process could unfold. In either case, there is plenty of backroom intrigue and drama in store for the party."
]
}
]
},
"publishing_date": "2024-07-19 21:27:00+05:30",
"title": "Biden hangs in grimly as more party reps urge him to bow out",
"topics": [
"World news",
"World news today",
"World news live",
"Top news in US",
"Breaking newsJoe Biden",
"Donald Trump",
"Democratic operatives",
"Democratic campaign chair",
"Biden"
]
}
}
Binary file not shown.
6 changes: 6 additions & 0 deletions tests/resources/parser/test_data/ind/meta.info
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
{
"TimesOfIndia_2024_07_19.html.gz": {
"url": "https://timesofindia.indiatimes.com/world/us/biden-hangs-in-grimly-as-more-party-reps-urge-him-to-bow-out/articleshow/111868705.cms",
"crawl_date": "2024-07-19 19:33:21.248239"
}
}

0 comments on commit cf15ac9

Please sign in to comment.