Added Google News Search Engine (#81)

bisohns · Apr 22, 2020 · 0324d6a · 0324d6a
1 parent 5704bde
commit 0324d6a
Show file tree

Hide file tree

Showing 5 changed files with 83 additions and 1 deletion.
diff --git a/docs/supported_engines.md b/docs/supported_engines.md
@@ -21,3 +21,4 @@ Below is the list of supported engines with their summaries
 11|YouTube|titles, links, descriptions, channels, [single videos only: durations, views, upload_dates]
 12|MyAnimeList|titles, links, descriptions, number of episodes, type of result (OVA, series, movie, etc.), ratings
 13|GoogleScholar|titles, links, descriptions, type of results ([BOOK], [CITATION], etc.), links of files
+14|GoogleNews|titles, links, descriptions, image links, date, news source
diff --git a/search_engine_parser/core/__init__.py b/search_engine_parser/core/__init__.py
@@ -13,5 +13,6 @@
                                                GitHubSearch,
                                                YouTubeSearch,
                                                AskSearch,
-                                               MyAnimeListSearch
+                                               MyAnimeListSearch,
+                                               GoogleNewsSearch
                                                )
diff --git a/search_engine_parser/core/engines/__init__.py b/search_engine_parser/core/engines/__init__.py
@@ -11,6 +11,7 @@
 from .ask import AskSearch
 from .youtube import YouTubeSearch
 from .myanimelist import MyAnimeListSearch
+from .googlenews import GoogleNewsSearch
 
 
 ENGINE_DICT = {
@@ -27,4 +28,5 @@
     'aol': AolSearch,
     'myanimelist': MyAnimeListSearch,
     'googlescholar': GoogleScholarSearch,
+    'googlenews': GoogleNewsSearch,
 }
diff --git a/search_engine_parser/core/engines/googlenews.py b/search_engine_parser/core/engines/googlenews.py
@@ -0,0 +1,67 @@
+"""@desc
+		Parser for google news search results
+"""
+
+from search_engine_parser.core.base import BaseSearch
+
+
+class GoogleNewsSearch(BaseSearch):
+    """
+    Searches Google News for string
+
+    Queries the standard Google search URL with ``tbm=nws`` and treats each
+    ``<div class="g">`` of the returned page as one result.
+    """
+    name = "GoogleNews"
+    search_url = "https://www.google.com/search?"
+    summary = "\tGoogle News is a news aggregator app developed by Google. It presents a "\
+        "continuous, customizable flow of articles organized from thousands of publishers "\
+        "and magazines. Google News is available as an app on Android, iOS, and the Web. "\
+        "Google released a beta version in September 2002 and the official app in January 2006."
+
+    def get_params(self, query=None, offset=None, page=None, **kwargs):
+        """
+        Builds the query parameters for one page of news results
+
+        :param query: search string
+        :param offset: not used by this engine
+        :param page: 1-based page number; ``None`` falls back to the first page
+        :return: query parameters for the search request
+        :rtype: dict
+        """
+        # Google's ``start`` is a zero-based result offset, not a page number:
+        # page 1 must map to 0, page 2 to 10, and so on.
+        start = (page - 1) * 10 if page else 0
+        return {"num": 10, "start": start, "q": query, "client": "ubuntu", "tbm": "nws"}
+
+    def parse_soup(self, soup):
+        """
+        Parses Google News Search Soup for results
+        """
+        # each result lives in its own <div class="g">
+        return soup.find_all('div', class_='g')
+
+    def parse_single_result(self, single_result):
+        """
+        Parses the source code to return
+
+        :param single_result: single result found in <div class="g">
+        :type single_result: `bs4.element.Tag`
+        :return: parsed title, link, description, image link, news source, date of single result
+        :rtype: dict
+        :raises AttributeError: if any of the looked-up tags is absent from the result
+        """
+        title = single_result.find('h3').text
+        raw_link = single_result.find('a').get('href')
+        desc = single_result.find('div', class_='st').text
+        img = single_result.find('img', class_='th').get('src')
+        news_source = single_result.find('span', class_='e8fRJf').text
+        date = single_result.find('span', class_='f').text
+        return {
+            "titles": title,
+            "links": raw_link,
+            "descriptions": desc,
+            "image_url": img,
+            "news_source": news_source,
+            "date": date,
+        }
diff --git a/search_engine_parser/tests/test_search.py b/search_engine_parser/tests/test_search.py
@@ -6,6 +6,7 @@
     YahooSearch,
     GoogleSearch,
     GoogleScholarSearch,
+    GoogleNewsSearch,
     BingSearch,
     DuckDuckGoSearch,
     AolSearch,
@@ -41,6 +42,16 @@ def test_returned_results(self):
         self.assertTrue(len(self.results['result_types']) >= 10)
         self.assertTrue(len(self.results['files_links']) >= 10)
 
+class GoogleNewsEngineTest(EngineTestBase, EngineTests):
+    """Runs the shared engine tests against GoogleNewsSearch."""
+    engine_class = GoogleNewsSearch
+
+    def test_returned_results(self):
+        """Every listed key of ``self.results`` must hold at least ten entries."""
+        expected_fields = ('titles', 'links', 'descriptions',
+                           'image_url', 'news_source', 'date')
+        for field in expected_fields:
+            self.assertTrue(len(self.results[field]) >= 10)
 
 class BingEngineTest(EngineTestBase, EngineTests):
     engine_class = BingSearch