Tools logic merging to v3 #39

Merged 13 commits on May 18, 2023
5 changes: 4 additions & 1 deletion README.MD
@@ -53,6 +53,9 @@ Run multiple agents simultaneously, maximizing efficiency and achieving parallel
### 💾 **Resource Manager:**
Read and store files generated by agents, facilitating data management and analysis.

# 🛣 Roadmap
[Click here to checkout the latest roadmap 🔗](https://github.com/TransformerOptimus/SuperAGI/wiki/Roadmap-%F0%9F%9B%A3)

# ⚙️ Setting up

1. Download the repo using `git clone https://github.com/TransformerOptimus/SuperAGI.git` in your terminal, or download it directly from the GitHub page as a ZIP and unzip it into your desired folder
@@ -86,4 +89,4 @@ This project is under active development and may still have issues. We appreciat

# ⭐Star History

[![Star History Chart](https://api.star-history.com/svg?repos=TransformerOptimus/SuperAGI&type=Date)](https://star-history.com/#TransformerOptimus/SuperAGI&Date)
[![Star History Chart](https://api.star-history.com/svg?repos=TransformerOptimus/SuperAGI&type=Date)](https://star-history.com/#TransformerOptimus/SuperAGI&Date)
79 changes: 79 additions & 0 deletions superagi/helper/google_search.py
@@ -0,0 +1,79 @@
import requests
import time
from superagi.helper.webpage_extractor import WebpageExtractor


class GoogleSearchWrap:

def __init__(self, api_key, search_engine_id, num_results=10, num_pages=1, num_extracts=3):
self.api_key = api_key
self.search_engine_id = search_engine_id
self.num_results = num_results
self.num_pages = num_pages
self.num_extracts = num_extracts
self.extractor = WebpageExtractor()

def search_run(self, query):
all_snippets = []
links = []
for page in range(1, self.num_pages * self.num_results, self.num_results):
url = "https://www.googleapis.com/customsearch/v1"
params = {
"key": self.api_key,
"cx": self.search_engine_id,
"q": query,
"num": self.num_results,
"start": page
}
response = requests.get(url, params=params, timeout=100)

if response.status_code == 200:
try:
json_data = response.json()
if "items" in json_data:
for item in json_data["items"]:
all_snippets.append(item["snippet"])
links.append(item["link"])
else:
print("No items found in the response.")
except ValueError as e:
print(f"Error while parsing JSON data: {e}")
else:
print(f"Error: {response.status_code}")

return all_snippets, links, response.status_code

def get_result(self, query):
snippets, links, error_code = self.search_run(query)

webpages = []
attempts = 0
while snippets == [] and attempts < 2:
attempts += 1
print("Google blocked the request. Trying again...")
time.sleep(3)
snippets, links, error_code = self.search_run(query)

if links:
for i in range(0, min(self.num_extracts, len(links))):  # guard against fewer links than num_extracts
time.sleep(3)
content = self.extractor.extract_with_3k(links[i])
attempts = 0
while content == "" and attempts < 2:
attempts += 1
content = self.extractor.extract_with_3k(links[i])
if content == "":
time.sleep(3)
content = self.extractor.extract_with_bs4(links[i])
attempts = 0
while content == "" and attempts < 2:
attempts += 1
content = self.extractor.extract_with_bs4(links[i])
webpages.append(content)
else:
snippets = ["", "", ""]
links = ["", "", ""]
webpages = ["", "", ""]

return snippets, webpages, links
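
A minimal usage sketch for GoogleSearchWrap (illustrative, not part of this diff); it assumes the GOOGLE_API_KEY and SEARCH_ENGINE_ID environment variables are set and that the superagi package is importable from the repository root:

import os
from superagi.helper.google_search import GoogleSearchWrap

# Credentials are read from the environment (assumed to be configured).
wrap = GoogleSearchWrap(
    api_key=os.environ["GOOGLE_API_KEY"],
    search_engine_id=os.environ["SEARCH_ENGINE_ID"],
    num_results=5,
    num_pages=1,
    num_extracts=2,
)

# get_result() returns parallel lists of snippets, extracted page text, and links.
snippets, webpages, links = wrap.get_result("open source autonomous agents")
print(links[:2])
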
73 changes: 73 additions & 0 deletions superagi/helper/google_serp.py
@@ -0,0 +1,73 @@
import time
from serpapi import GoogleSearch
from superagi.helper.webpage_extractor import WebpageExtractor

class GoogleSerpApiWrap:
def __init__(self, api_key, num_results=10, num_pages=1, num_extracts=3):
self.api_key = api_key
self.num_results = num_results
self.num_pages = num_pages
self.num_extracts = num_extracts
self.extractor = WebpageExtractor()

def search_run(self, query):
all_snippets = []
links = []

params = {
"api_key": self.api_key,
"engine": 'google',
"num": self.num_results,
"start": 0,
"q": query
}

for page in range(self.num_pages):
params["start"] = page * self.num_results
search = GoogleSearch(params)
results = search.get_dict()

if "organic_results" in results:
for result in results["organic_results"]:
all_snippets.append(result.get("snippet", ""))  # some organic results carry no snippet
links.append(result["link"])
else:
print("No organic results found in the response.")

return all_snippets, links

def get_result(self, query):
snippets, links = self.search_run(query)

webpages = []
attempts = 0
while snippets == [] and attempts < 2:
attempts += 1
print("Google blocked the request. Trying again...")
time.sleep(3)
snippets, links = self.search_run(query)

if links:
for i in range(0, min(self.num_extracts, len(links))):  # guard against fewer links than num_extracts
time.sleep(3)
content = self.extractor.extract_with_3k(links[i])
attempts = 0
while content == "" and attempts < 2:
attempts += 1
content = self.extractor.extract_with_3k(links[i])
if content == "":
time.sleep(3)
content = self.extractor.extract_with_bs4(links[i])
attempts = 0
while content == "" and attempts < 2:
attempts += 1
content = self.extractor.extract_with_bs4(links[i])
webpages.append(content)
else:
snippets = ["", "", ""]
links = ["", "", ""]
webpages = ["", "", ""]

return snippets, webpages, links
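
A comparable sketch for GoogleSerpApiWrap (illustrative, not part of this diff); it assumes a valid SerpApi key in SERP_API_KEY and that the google-search-results package providing the serpapi module is installed:

import os
from superagi.helper.google_serp import GoogleSerpApiWrap

# The wrapper mirrors GoogleSearchWrap but paginates through SerpApi instead of the Custom Search API.
serp = GoogleSerpApiWrap(api_key=os.environ["SERP_API_KEY"], num_results=5, num_pages=1, num_extracts=2)
snippets, webpages, links = serp.get_result("SuperAGI framework")
print(snippets[:2])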

132 changes: 132 additions & 0 deletions superagi/helper/webpage_extractor.py
@@ -0,0 +1,132 @@
from io import BytesIO
from PyPDF2 import PdfFileReader
import requests
import re
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
from newspaper import Article, ArticleException, Config
from requests_html import HTMLSession
import time
import random
from lxml import html

USER_AGENTS = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.1 Safari/605.1.15",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Safari/605.1.15",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36",
"Mozilla/5.0 (iPhone; CPU iPhone OS 13_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.4 Mobile/15E148 Safari/604.1",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:73.0) Gecko/20100101 Firefox/73.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36"
]

class WebpageExtractor:

def __init__(self, num_extracts=3):
self.num_extracts = num_extracts

def extract_with_3k(self, url):
try:
if url.lower().endswith(".pdf"):
response = requests.get(url, timeout=10)  # avoid hanging indefinitely on slow PDF hosts
response.raise_for_status()

with BytesIO(response.content) as pdf_data:
reader = PdfFileReader(pdf_data)
content = " ".join([reader.getPage(i).extract_text() for i in range(reader.getNumPages())])

else:
config = Config()
config.browser_user_agent = random.choice(USER_AGENTS)
config.request_timeout = 5
session = HTMLSession()

response = session.get(url)
response.html.render(timeout=config.request_timeout)
html_content = response.html.html

article = Article(url, config=config)
article.set_html(html_content)
article.parse()
content = article.text.replace('\t', ' ').replace('\n', ' ').strip()

return content[:1500]

except ArticleException as ae:
print(f"Error while extracting text from HTML (newspaper3k): {str(ae)}")
return ""

except RequestException as request_error:  # avoid shadowing the re module
print(f"Error while making the request to the URL (newspaper3k): {str(request_error)}")
return ""

except Exception as e:
print(f"Unknown error while extracting text from HTML (newspaper3k): {str(e)}")
return ""

def extract_with_bs4(self, url):
headers = {
"User-Agent": random.choice(USER_AGENTS)
}

try:
response = requests.get(url, headers=headers, timeout=10)
if response.status_code == 200:
soup = BeautifulSoup(response.text, 'html.parser')
for tag in soup(['script', 'style', 'nav', 'footer', 'head', 'link', 'meta', 'noscript']):
tag.decompose()

main_content_areas = soup.find_all(['main', 'article', 'section', 'div'])
if main_content_areas:
main_content = max(main_content_areas, key=lambda x: len(x.text))
content_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
content = ' '.join([tag.text.strip() for tag in main_content.find_all(content_tags)])
else:
content = ' '.join([tag.text.strip() for tag in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])])

content = re.sub(r'\t', ' ', content)
content = re.sub(r'\s+', ' ', content)
return content[:1500]
else:
print(f"Error while extracting text from HTML (bs4): {response.status_code}")
return ""

except Exception as e:
print(f"Unknown error while extracting text from HTML (bs4): {str(e)}")
return ""

def extract_with_lxml(self, url):
try:
config = Config()
config.browser_user_agent = random.choice(USER_AGENTS)
config.request_timeout = 5
session = HTMLSession()

response = session.get(url)
response.html.render(timeout=config.request_timeout)
html_content = response.html.html

tree = html.fromstring(html_content)
paragraphs = tree.cssselect('p, h1, h2, h3, h4, h5, h6')
content = ' '.join([para.text_content() for para in paragraphs if para.text_content()])
content = content.replace('\t', ' ').replace('\n', ' ').strip()

return content[:1600]

except ArticleException as ae:
print(f"Error while extracting text from HTML (lxml): {str(ae)}")
return ""

except RequestException as request_error:  # avoid shadowing the re module
print(f"Error while making the request to the URL (lxml): {str(request_error)}")
return ""

except Exception as e:
print(f"Unknown error while extracting text from HTML (lxml): {str(e)}")
return ""
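
A short sketch of how the extractor's fallbacks can be chained (illustrative, not part of this diff); the URL is a placeholder:

from superagi.helper.webpage_extractor import WebpageExtractor

extractor = WebpageExtractor()
url = "https://example.com"  # placeholder URL

# Try the newspaper3k/requests-html path first, then fall back to plain BeautifulSoup parsing.
text = extractor.extract_with_3k(url)
if not text:
    text = extractor.extract_with_bs4(url)
print(text[:200])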

Empty file.
40 changes: 40 additions & 0 deletions superagi/tools/google_search/tools.py
@@ -0,0 +1,40 @@
from typing import Type
from pydantic import BaseModel, Field
from superagi.tools.base_tool import BaseTool
from superagi.helper.google_search import GoogleSearchWrap
import os
import json


class GoogleSearchSchema(BaseModel):
query: str = Field(
...,
description="The search query for Google search.",
)


class GoogleSearchTool(BaseTool):
name = "GoogleSearch"
description = (
"A tool for performing a Google search and extracting snippets and webpages."
"Input should be a search query."
)
args_schema: Type[GoogleSearchSchema] = GoogleSearchSchema

def execute(self, query: str) -> str:
api_key = os.environ.get("GOOGLE_API_KEY")
search_engine_id = os.environ.get("SEARCH_ENGINE_ID")
num_results = 10
num_pages = 1
num_extracts = 3

google_search = GoogleSearchWrap(api_key, search_engine_id, num_results, num_pages, num_extracts)
snippets, webpages, links = google_search.get_result(query)

result = {
"snippets": snippets,
"webpages": webpages,
"links": links
}

return json.dumps(result)
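
A hedged sketch of invoking the tool directly (illustrative, not part of this diff); it assumes BaseTool allows construction without extra arguments and that the Google API credentials are already exported in the environment:

import json
from superagi.tools.google_search.tools import GoogleSearchTool

tool = GoogleSearchTool()
# execute() returns a JSON string with "snippets", "webpages", and "links" keys.
payload = json.loads(tool.execute("latest LLM agent frameworks"))
print(payload["links"])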
Empty file.
38 changes: 38 additions & 0 deletions superagi/tools/google_serp_search/tools.py
@@ -0,0 +1,38 @@
from typing import Type
from pydantic import BaseModel, Field
from superagi.tools.base_tool import BaseTool
from superagi.helper.google_serp import GoogleSerpApiWrap
import os
import json

class GoogleSerpSchema(BaseModel):
query: str = Field(
...,
description="The search query for Google SERP.",
)


class GoogleSerpTool(BaseTool):
name = "GoogleSerp"
description = (
"A tool for performing a Google SERP search and extracting snippets and webpages."
"Input should be a search query."
)
args_schema: Type[GoogleSerpSchema] = GoogleSerpSchema

def execute(self, query: str) -> str:
api_key = os.environ.get("SERP_API_KEY")
num_results = 10
num_pages = 1
num_extracts = 3

serp_api = GoogleSerpApiWrap(api_key, num_results, num_pages, num_extracts)
snippets, webpages, links = serp_api.get_result(query)

result = {
"snippets": snippets,
"webpages": webpages,
"links": links
}

return json.dumps(result)
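
And the SerpApi-backed tool used the same way (illustrative, not part of this diff); SERP_API_KEY must be set beforehand, and BaseTool is again assumed to construct without arguments:

import json
import os
from superagi.tools.google_serp_search.tools import GoogleSerpTool

os.environ.setdefault("SERP_API_KEY", "<your-serpapi-key>")  # hypothetical placeholder key
tool = GoogleSerpTool()
result = json.loads(tool.execute("autonomous agent benchmarks"))
print(result["snippets"][:3])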