-
Notifications
You must be signed in to change notification settings - Fork 79
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
8 changed files
with
200 additions
and
15 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
from fundus.publishers.base_objects import Publisher, PublisherGroup | ||
from fundus.publishers.ind.times_of_india import TimesOfIndiaParser | ||
from fundus.scraping.url import NewsMap, RSSFeed, Sitemap | ||
|
||
|
||
class IND(metaclass=PublisherGroup):
    # Publisher group that registers the Times Of India publisher.
    # Plain comments are used here (instead of a docstring) so that the
    # namespace handed to the PublisherGroup metaclass stays unchanged.

    TimesOfIndia = Publisher(
        name="Times Of India",
        domain="https://www.timesofindia.indiatimes.com",
        parser=TimesOfIndiaParser,
        # URL sources for this publisher: two sitemaps (today / yesterday)
        # followed by two RSS feeds (top stories / most recent).
        sources=[
            Sitemap("https://timesofindia.indiatimes.com/sitemap/today"),
            Sitemap("https://timesofindia.indiatimes.com/sitemap/yesterday"),
            RSSFeed("https://timesofindia.indiatimes.com/rssfeedstopstories.cms"),
            RSSFeed("https://timesofindia.indiatimes.com/rssfeedmostrecent.cms"),
        ],
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
import datetime | ||
import re | ||
from typing import List, Optional | ||
|
||
from lxml.cssselect import CSSSelector | ||
from lxml.etree import XPath | ||
from lxml.html import fromstring, tostring | ||
|
||
from fundus.parser import ArticleBody, BaseParser, ParserProxy, attribute | ||
from fundus.parser.utility import ( | ||
extract_article_body_with_selector, | ||
generic_author_parsing, | ||
generic_date_parsing, | ||
generic_topic_parsing, | ||
) | ||
|
||
|
||
class TimesOfIndiaParser(ParserProxy):
    """Parser proxy for Times Of India article pages."""

    class V1(BaseParser):
        """Parser version 1 for Times Of India articles."""

        # All three selectors are scoped to the first
        # ``<div class="_s30J clearfix ">`` element of the document.
        # The trailing space in the class value is part of the exact match.
        _subheadline_selector = XPath(
            "(//div[@class='_s30J clearfix '])[1]/div/b |"
            "(//div[@class='_s30J clearfix '])[1]/div/h2"
        )
        _paragraph_selector = XPath("(//div[@class='_s30J clearfix '])[1]/p[not(@class='intro')]")
        _summary_selector = XPath("(//div[@class='_s30J clearfix '])[1]/p[@class='intro']")

        @attribute
        def body(self) -> ArticleBody:
            """Extract the article body (summary, subheadlines, paragraphs).

            The selectors of this parser look for ``<p>`` elements, so the
            serialized document is first rewritten with a fixed sequence of
            regex substitutions that turn ``<br>`` separators and certain
            ``</div>`` boundaries into ``<p>``/``</p>`` tags. The rewritten
            markup is then re-parsed and handed to
            ``extract_article_body_with_selector``.

            Returns:
                The ``ArticleBody`` produced by
                ``extract_article_body_with_selector``.
            """
            html_as_string = tostring(self.precomputed.doc).decode("utf-8")

            # (pattern, replacement) pairs, applied strictly in this order:
            # later rules operate on the output of earlier ones.
            # NOTE(review): the ``(:?`` groups below look like a typo for the
            # non-capturing ``(?:``; as written they match an optional literal
            # colon. Left unchanged to keep the extraction result stable.
            rewrites = (
                # "</div>", a line break, then "<br>" -> open a paragraph.
                (r"(</div>)((\r\n|\r|\n)<br>)", "</div><p>"),
                # Closing div runs not directly followed by "<" -> open a paragraph.
                (r"</div></div>(?!<)", "</div></div><p>"),
                (r"</div></div></div>(?!<)", "</div></div></div><p>"),
                # "<br>", a line break, then "<div" -> close the paragraph.
                (r"<br>(\r\n|\r|\n)(:?<div)", "</p>"),
                # Literal "::before" / "::after" markers next to a line break.
                (r"(:?::before)(\r\n|\r|\n)", "<p>"),
                (r"(\r\n|\r|\n)(:?::after)", "</p>"),
                # Every remaining "<br>" becomes a paragraph boundary.
                (r"<br>", "</p><p>"),
                # The text at the start of the container becomes the
                # "intro" paragraph that ``_summary_selector`` picks up.
                (r"<div class=\"_s30J clearfix \">", "<div class=\"_s30J clearfix \"><p class='intro'>"),
            )
            for pattern, replacement in rewrites:
                html_as_string = re.sub(pattern, replacement, html_as_string)

            # The rewritten markup is intentionally NOT written to disk: the
            # former debug dump to "test.html" was a side effect on every parse.
            return extract_article_body_with_selector(
                fromstring(html_as_string),  # type: ignore
                summary_selector=self._summary_selector,
                paragraph_selector=self._paragraph_selector,
                subheadline_selector=self._subheadline_selector,
            )

        @attribute
        def publishing_date(self) -> Optional[datetime.datetime]:
            """Return the parsed ``datePublished`` value found in the LD data."""
            return generic_date_parsing(self.precomputed.ld.bf_search("datePublished"))

        @attribute
        def authors(self) -> List[str]:
            """Return the authors parsed from the ``author`` entry of the LD data.

            The TOI desk names are passed as ``default_authors`` to
            ``generic_author_parsing`` (presumably so that these generic
            bylines are not reported as authors; confirm against the helper).
            """
            return generic_author_parsing(
                self.precomputed.ld.bf_search("author"),
                default_authors=["TOI Sports Desk", "TOI News Desk", "TOI Tech Desk"],
            )

        @attribute
        def title(self) -> Optional[str]:
            """Return the ``og:title`` meta value without the site suffix.

            Everything from `` - Times`` or `` | India`` onwards is removed.
            Returns ``None`` when the meta value is missing or empty.
            """
            if title := self.precomputed.meta.get("og:title"):
                return re.sub(r"( - Times.*| \| India.*)", "", title)
            return None

        @attribute
        def topics(self) -> List[str]:
            """Return the ``news_keywords`` topics minus generic site-wide keywords."""
            # Generic keywords that are filtered out of the topic list.
            bloat_topics = {
                "India",
                "News",
                "Google News",
                "India Breaking News",
                "India news",
                "Live News India",
                "Top news in India",
            }
            return [
                topic
                for topic in generic_topic_parsing(self.precomputed.meta.get("news_keywords"))
                if topic not in bloat_topics
            ]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
{ | ||
"V1": { | ||
"authors": [ | ||
"Chidanand Rajghatta" | ||
], | ||
"body": { | ||
"summary": [ | ||
"WASHINGTON: Secluded at his home in Delaware recovering from Covid and increasingly isolated in his own party, US President Joe Biden is grimly hanging in for the November 2024 election even as Democratic operatives who profess love and admiration for him are trying to prise the party nomination from him." | ||
], | ||
"sections": [ | ||
{ | ||
"headline": [], | ||
"paragraphs": [ | ||
"Biden is “absolutely” staying in the presidential race, the Democratic campaign chair Jen O’Malley Dillon insisted on MSNBC on Friday, calling him the “best person to take on Donald Trump.” The assessment is shared by very few people, and surveys and opeds are starting to point otherwise, causing more and more lawmakers and party operatives to urge the beleaguered President to bow out.", | ||
"On Friday, four more lawmakers, including Congressman and constitutional scholar Jamie Raskin from Maryland and Sean Casten from Illinois (both heavily Democratic states from where they will get re-elected comfortably) urged Biden to stand down, their appeal prefaced by copious praise for his service.", | ||
"Contrasting Biden (“a fundamentally kind, empathetic and decent human being”) with Trump (“a twice-impeached convicted felon and adjudicated rapist who has promised to be a ‘dictator on day one.’\") Casten nevertheless urged Biden to \"manage an exit with all the dignity and decency that has guided his half-century of public service.”", | ||
"“Politics, like life, isn’t fair. And as long as this election is instead litigated over which candidate is more likely to be held accountable for public gaffes and ‘senior moments,’ I believe that Biden is not only going to lose but is also uniquely incapable of shifting that conversation,” Casten wrote in an oped.", | ||
"But with no strong alternative emerging and little public confidence in his vice-president Kamala Harris taking on the mantle, a stalemate has ensued in the party, with first indications of fissures that could result in an open primary at the party convention in Chicago next month, the first such free-for-all since 1968.", | ||
"In fact, some lawmakers, notably Alexandra Ocasio-Cortez, fear that if Biden is ousted from the nomination process, some of her colleagues would seek to jettison Kamala Harris too from the ticket. The Congressional Black Caucus and most Latino lawmakers remain strongly behind the Biden-Harris ticket.", | ||
"If Biden decides to bow out, then his own preferred choice is Harris. He has implicitly endorsed her, saying several times that he wouldn’t have picked her as his vice president “unless I thought she was qualified to be president from the very beginning.”", | ||
"But the handover is far from guaranteed. Unlike a resignation or departure during term from the White House, which automatically promotes the vice-president to the post, giving up the nomination (which in any case has not been formalized) does not automatically confer it on the running mate. The party will have to go through the political process that involves voting for by more than 3000 delegates to choose a nominee.", | ||
"That process was expected to be a formality to nominate Biden. But if he drops out, then party stalwarts can first try and coalesce support around Harris, or someone else acceptable to a majority of delegates, before the DNC that begins on August 19 in Chicago.", | ||
"If there is no consensus, then it will result in an open convention in Chicago where an actual, truly democratic process could unfold. In either case, there is plenty of backroom intrigue and drama in store for the party." | ||
] | ||
} | ||
] | ||
}, | ||
"publishing_date": "2024-07-19 21:27:00+05:30", | ||
"title": "Biden hangs in grimly as more party reps urge him to bow out", | ||
"topics": [ | ||
"World news", | ||
"World news today", | ||
"World news live", | ||
"Top news in US", | ||
"Breaking newsJoe Biden", | ||
"Donald Trump", | ||
"Democratic operatives", | ||
"Democratic campaign chair", | ||
"Biden" | ||
] | ||
} | ||
} |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
{ | ||
"TimesOfIndia_2024_07_19.html.gz": { | ||
"url": "https://timesofindia.indiatimes.com/world/us/biden-hangs-in-grimly-as-more-party-reps-urge-him-to-bow-out/articleshow/111868705.cms", | ||
"crawl_date": "2024-07-19 19:33:21.248239" | ||
} | ||
} |