Skip to content

Commit

Permalink
Added Google News Search Engine (#81)
Browse files Browse the repository at this point in the history
  • Loading branch information
devajithvs authored Apr 22, 2020
1 parent 5704bde commit 0324d6a
Show file tree
Hide file tree
Showing 5 changed files with 83 additions and 1 deletion.
1 change: 1 addition & 0 deletions docs/supported_engines.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,4 @@ Below is the list of supported engines with their summaries
11|YouTube|titles, links, descriptions, channels, [single videos only: durations, views, upload_dates]
12|MyAnimeList|titles, links, descriptions, number of episodes, type of result (OVA, series, movie, etc.), ratings
13|GoogleScholar|titles, links, descriptions, type of results ([BOOK], [CITATION], etc.), links of files
14|GoogleNews|titles, links, descriptions, image links, date, news source
3 changes: 2 additions & 1 deletion search_engine_parser/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,6 @@
GitHubSearch,
YouTubeSearch,
AskSearch,
MyAnimeListSearch
MyAnimeListSearch,
GoogleNewsSearch
)
2 changes: 2 additions & 0 deletions search_engine_parser/core/engines/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from .ask import AskSearch
from .youtube import YouTubeSearch
from .myanimelist import MyAnimeListSearch
from .googlenews import GoogleNewsSearch


ENGINE_DICT = {
Expand All @@ -27,4 +28,5 @@
'aol': AolSearch,
'myanimelist': MyAnimeListSearch,
'googlescholar': GoogleScholarSearch,
'googlenews': GoogleNewsSearch,
}
67 changes: 67 additions & 0 deletions search_engine_parser/core/engines/googlenews.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
"""@desc
Parser for google news search results
"""

from search_engine_parser.core.base import BaseSearch


class GoogleNewsSearch(BaseSearch):
    """
    Searches Google News for string
    """
    name = "GoogleNews"
    search_url = "https://www.google.com/search?"
    summary = "\tGoogle News is a news aggregator app developed by Google. It presents a "\
        "continuous, customizable flow of articles organized from thousands of publishers "\
        "and magazines. Google News is available as an app on Android, iOS, and the Web. "\
        "Google released a beta version in September 2002 and the official app in January 2006."

    def get_params(self, query=None, offset=None, page=None, **kwargs):
        """
        Builds the query-string parameters for a Google News search

        :param query: search string
        :param offset: accepted for interface compatibility; not used here
        :param page: 1-based page number; a missing or falsy value means the first page
        :return: parameters to append to ``search_url``
        :rtype: dict
        """
        results_per_page = 10
        page_number = int(page) if page else 1
        params = {}
        params["num"] = results_per_page
        # Google's ``start`` is a zero-based result offset, not a page number:
        # page 1 -> 0, page 2 -> 10, ... Sending the raw page number skipped the
        # first hit on page 1 and made consecutive pages overlap.
        params["start"] = max(page_number - 1, 0) * results_per_page
        params["q"] = query
        params["client"] = "ubuntu"
        # tbm=nws restricts the search to the News vertical
        params["tbm"] = "nws"
        return params

    def parse_soup(self, soup):
        """
        Parses Google News Search Soup for results
        """
        # find all class_='g' => each result
        return soup.find_all('div', class_='g')

    def parse_single_result(self, single_result):
        """
        Parses the source code to return

        :param single_result: single result found in <div class="g">
        :type single_result: `bs4.element.Tag`
        :return: parsed title, link, description, image link, news source, date of single result
        :rtype: dict
        :raises AttributeError: if the result has no ``<h3>`` title or no ``<a>`` link
        """

        def text_or_empty(tag):
            # ``find`` returns None when nothing matches; map that to an empty string
            return tag.text if tag is not None else ""

        link_tag = single_result.find('a')
        title_tag = single_result.find('h3')
        desc_tag = single_result.find('div', class_='st')
        img_tag = single_result.find('img', class_='th')
        news_source_tag = single_result.find('span', class_='e8fRJf')
        date_tag = single_result.find('span', class_='f')

        # Title and link are what make a block a result at all, so a missing
        # one still raises AttributeError exactly as before.
        title = title_tag.text
        raw_link = link_tag.get('href')

        # The remaining fields are optional (e.g. a hit without a thumbnail);
        # a missing one must not discard the whole result.
        desc = text_or_empty(desc_tag)
        img = img_tag.get('src') if img_tag is not None else None
        news_source = text_or_empty(news_source_tag)
        date = text_or_empty(date_tag)

        rdict = {
            "titles": title,
            "links": raw_link,
            "descriptions": desc,
            "image_url": img,
            "news_source": news_source,
            "date": date,
        }
        return rdict
11 changes: 11 additions & 0 deletions search_engine_parser/tests/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
YahooSearch,
GoogleSearch,
GoogleScholarSearch,
GoogleNewsSearch,
BingSearch,
DuckDuckGoSearch,
AolSearch,
Expand Down Expand Up @@ -41,6 +42,16 @@ def test_returned_results(self):
self.assertTrue(len(self.results['result_types']) >= 10)
self.assertTrue(len(self.results['files_links']) >= 10)

class GoogleNewsEngineTest(EngineTestBase, EngineTests):
    # Engine under test; the inherited mixins are defined outside this view.
    engine_class = GoogleNewsSearch

    def test_returned_results(self):
        # Each result field must hold at least ten entries; the fields are
        # checked in the same order as before so the first failure is unchanged.
        expected_fields = (
            'titles',
            'links',
            'descriptions',
            'image_url',
            'news_source',
            'date',
        )
        for field in expected_fields:
            self.assertTrue(len(self.results[field]) >= 10)

class BingEngineTest(EngineTestBase, EngineTests):
engine_class = BingSearch
Expand Down

0 comments on commit 0324d6a

Please sign in to comment.