diff --git a/README.MD b/README.MD
index ff1069413..296ca2c14 100644
--- a/README.MD
+++ b/README.MD
@@ -53,6 +53,9 @@ Run multiple agents simultaneously, maximizing efficiency and achieving parallel
 ### 💾 **Resource Manager:**
 Read and store files generated by agents, facilitating data management and analysis.
 
+# 🛣 Roadmap
+[Click here to check out the latest roadmap 🔗](https://github.com/TransformerOptimus/SuperAGI/wiki/Roadmap-%F0%9F%9B%A3)
+
 # ⚙️ Setting up
 
 1. Download the repo using `git clone https://github.com/TransformerOptimus/SuperAGI.git` in your terminal or directly from github page in zip format and unzip in your desired folder
@@ -86,4 +89,4 @@ This project is under active development and may still have issues. We appreciat
 
 # ⭐Star History
 
-[![Star History Chart](https://api.star-history.com/svg?repos=TransformerOptimus/SuperAGI&type=Date)](https://star-history.com/#TransformerOptimus/SuperAGI&Date)
\ No newline at end of file
+[![Star History Chart](https://api.star-history.com/svg?repos=TransformerOptimus/SuperAGI&type=Date)](https://star-history.com/#TransformerOptimus/SuperAGI&Date)
diff --git a/superagi/helper/google_search.py b/superagi/helper/google_search.py
new file mode 100644
index 000000000..4f75155f0
--- /dev/null
+++ b/superagi/helper/google_search.py
@@ -0,0 +1,79 @@
+import requests
+import time
+from pydantic import BaseModel
+from superagi.helper.webpage_extractor import WebpageExtractor
+
+
+class GoogleSearchWrap:
+
+    def __init__(self, api_key, search_engine_id, num_results=10, num_pages=1, num_extracts=3):
+        self.api_key = api_key
+        self.search_engine_id = search_engine_id
+        self.num_results = num_results
+        self.num_pages = num_pages
+        self.num_extracts = num_extracts
+        self.extractor = WebpageExtractor()
+
+    def search_run(self, query):
+        all_snippets = []
+        links = []
+        for page in range(1, self.num_pages * self.num_results, self.num_results):
+            url = "https://www.googleapis.com/customsearch/v1"
+            params = {
+                "key": self.api_key,
+                "cx": self.search_engine_id,
+                "q": query,
+                "num": self.num_results,
+                "start": page
+            }
+            response = requests.get(url, params=params, timeout=100)
+
+            if response.status_code == 200:
+                try:
+                    json_data = response.json()
+                    if "items" in json_data:
+                        for item in json_data["items"]:
+                            all_snippets.append(item["snippet"])
+                            links.append(item["link"])
+                    else:
+                        print("No items found in the response.")
+                except ValueError as e:
+                    print(f"Error while parsing JSON data: {e}")
+            else:
+                print(f"Error: {response.status_code}")
+
+        return all_snippets, links, response.status_code
+
+    def get_result(self, query):
+        snippets, links, error_code = self.search_run(query)
+
+        webpages = []
+        attempts = 0
+        while snippets == [] and attempts < 2:
+            attempts += 1
+            print("Google blocked the request. Trying again...")
+            time.sleep(3)
+            snippets, links, error_code = self.search_run(query)
+
+        if links:
+            for i in range(0, self.num_extracts):
+                time.sleep(3)
+                content = self.extractor.extract_with_3k(links[i])
+                attempts = 0
+                while content == "" and attempts < 2:
+                    attempts += 1
+                    content = self.extractor.extract_with_3k(links[i])
+                if content == "":
+                    time.sleep(3)
+                    content = self.extractor.extract_with_bs4(links[i])
+                    attempts = 0
+                    while content == "" and attempts < 2:
+                        attempts += 1
+                        content = self.extractor.extract_with_bs4(links[i])
+                webpages.append(content)
+        else:
+            snippets = ["", "", ""]
+            links = ["", "", ""]
+            webpages = ["", "", ""]
+
+        return snippets, webpages, links
\ No newline at end of file
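For reference, a minimal standalone sketch of driving `GoogleSearchWrap` directly. It assumes the package-style import path used above and that `GOOGLE_API_KEY` and `SEARCH_ENGINE_ID` are exported, matching the environment variable names the `GoogleSearchTool` later in this diff reads; the query string is only an example.

```python
import os

from superagi.helper.google_search import GoogleSearchWrap

# Assumes GOOGLE_API_KEY and SEARCH_ENGINE_ID are set in the environment,
# the same names GoogleSearchTool reads further down in this diff.
wrap = GoogleSearchWrap(
    api_key=os.environ["GOOGLE_API_KEY"],
    search_engine_id=os.environ["SEARCH_ENGINE_ID"],
    num_results=10,
    num_pages=1,
    num_extracts=3,
)

snippets, webpages, links = wrap.get_result("SuperAGI autonomous agents")
for link, snippet in zip(links, snippets):
    print(f"{link}: {snippet[:80]}")
```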
diff --git a/superagi/helper/google_serp.py b/superagi/helper/google_serp.py
new file mode 100644
index 000000000..8caf64a77
--- /dev/null
+++ b/superagi/helper/google_serp.py
@@ -0,0 +1,73 @@
+import time
+from pydantic import BaseModel
+from superagi.helper.webpage_extractor import WebpageExtractor
+from serpapi import GoogleSearch
+
+class GoogleSerpApiWrap:
+    def __init__(self, api_key, num_results=10, num_pages=1, num_extracts=3):
+        self.api_key = api_key
+        self.num_results = num_results
+        self.num_pages = num_pages
+        self.num_extracts = num_extracts
+        self.extractor = WebpageExtractor()
+
+    def search_run(self, query):
+        all_snippets = []
+        links = []
+
+        params = {
+            "api_key": self.api_key,
+            "engine": 'google',
+            "num": self.num_results,
+            "start": 0,
+            "q": query
+        }
+
+        for page in range(self.num_pages):
+            params["start"] = page * self.num_results
+            search = GoogleSearch(params)
+            results = search.get_dict()
+
+            if "organic_results" in results:
+                for result in results["organic_results"]:
+                    all_snippets.append(result["snippet"])
+                    links.append(result["link"])
+            else:
+                print("No organic results found in the response.")
+
+        return all_snippets, links
+
+    def get_result(self, query):
+        snippets, links = self.search_run(query)
+
+        webpages = []
+        attempts = 0
+        while snippets == [] and attempts < 2:
+            attempts += 1
+            print("Google blocked the request. Trying again...")
+            time.sleep(3)
+            snippets, links = self.search_run(query)
+
+        if links:
+            for i in range(0, self.num_extracts):
+                time.sleep(3)
+                content = self.extractor.extract_with_3k(links[i])
+                attempts = 0
+                while content == "" and attempts < 2:
+                    attempts += 1
+                    content = self.extractor.extract_with_3k(links[i])
+                if content == "":
+                    time.sleep(3)
+                    content = self.extractor.extract_with_bs4(links[i])
+                    attempts = 0
+                    while content == "" and attempts < 2:
+                        attempts += 1
+                        content = self.extractor.extract_with_bs4(links[i])
+                webpages.append(content)
+        else:
+            snippets = ["", "", ""]
+            links = ["", "", ""]
+            webpages = ["", "", ""]
+
+        return snippets, webpages, links
+
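Both wrappers repeat the same blocked-request handling in `get_result`: if `search_run` comes back with no snippets, sleep three seconds and retry up to two more times. One possible refactor (a sketch, not part of this diff) is to pull that loop into a shared helper that accepts any search callable returning `(snippets, links)`, as `GoogleSerpApiWrap.search_run` does.

```python
import time
from typing import Callable, List, Tuple

SearchFn = Callable[[str], Tuple[List[str], List[str]]]


def retry_search(search_fn: SearchFn, query: str,
                 retries: int = 2, delay: float = 3.0) -> Tuple[List[str], List[str]]:
    """Retry a search callable until it yields snippets or retries run out.

    Mirrors the while-loop in GoogleSerpApiWrap.get_result; suggested refactor only.
    """
    snippets, links = search_fn(query)
    attempts = 0
    while not snippets and attempts < retries:
        attempts += 1
        time.sleep(delay)  # same fixed back-off the wrappers use
        snippets, links = search_fn(query)
    return snippets, links
```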
diff --git a/superagi/helper/webpage_extractor.py b/superagi/helper/webpage_extractor.py
new file mode 100644
index 000000000..6faaa95ff
--- /dev/null
+++ b/superagi/helper/webpage_extractor.py
@@ -0,0 +1,132 @@
+from io import BytesIO
+from PyPDF2 import PdfFileReader
+import requests
+import re
+from requests.exceptions import RequestException
+from bs4 import BeautifulSoup
+from newspaper import Article, ArticleException, Config
+from requests_html import HTMLSession
+import time
+import random
+from lxml import html
+
+USER_AGENTS = [
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.1 Safari/605.1.15",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.0",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Safari/605.1.15",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0",
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36",
+    "Mozilla/5.0 (iPhone; CPU iPhone OS 13_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.4 Mobile/15E148 Safari/604.1",
+    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:73.0) Gecko/20100101 Firefox/73.0",
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36"
+]
+
+class WebpageExtractor:
+
+    def __init__(self, num_extracts=3):
+        self.num_extracts = num_extracts
+
+    def extract_with_3k(self, url):
+        try:
+            if url.lower().endswith(".pdf"):
+                response = requests.get(url)
+                response.raise_for_status()
+
+                with BytesIO(response.content) as pdf_data:
+                    reader = PdfFileReader(pdf_data)
+                    content = " ".join([reader.getPage(i).extract_text() for i in range(reader.getNumPages())])
+
+            else:
+                config = Config()
+                config.browser_user_agent = random.choice(USER_AGENTS)
+                config.request_timeout = 5
+                session = HTMLSession()
+
+                response = session.get(url)
+                response.html.render(timeout=config.request_timeout)
+                html_content = response.html.html
+
+                article = Article(url, config=config)
+                article.set_html(html_content)
+                article.parse()
+                content = article.text.replace('\t', ' ').replace('\n', ' ').strip()
+
+            return content[:1500]
+
+        except ArticleException as ae:
+            print(f"Error while extracting text from HTML (newspaper3k): {str(ae)}")
+            return ""
+
+        except RequestException as re:
+            print(f"Error while making the request to the URL (newspaper3k): {str(re)}")
+            return ""
+
+        except Exception as e:
+            print(f"Unknown error while extracting text from HTML (newspaper3k): {str(e)}")
+            return ""
+
+    def extract_with_bs4(self, url):
+        headers = {
+            "User-Agent": random.choice(USER_AGENTS)
+        }
+
+        try:
+            response = requests.get(url, headers=headers, timeout=10)
+            if response.status_code == 200:
+                soup = BeautifulSoup(response.text, 'html.parser')
+                for tag in soup(['script', 'style', 'nav', 'footer', 'head', 'link', 'meta', 'noscript']):
+                    tag.decompose()
+
+                main_content_areas = soup.find_all(['main', 'article', 'section', 'div'])
+                if main_content_areas:
+                    main_content = max(main_content_areas, key=lambda x: len(x.text))
+                    content_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
+                    content = ' '.join([tag.text.strip() for tag in main_content.find_all(content_tags)])
+                else:
+                    content = ' '.join([tag.text.strip() for tag in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])])
+
+                content = re.sub(r'\t', ' ', content)
+                content = re.sub(r'\s+', ' ', content)
+                return content[:1500]
+            else:
+                print(f"Error while extracting text from HTML (bs4): {response.status_code}")
+                return ""
+
+        except Exception as e:
+            print(f"Unknown error while extracting text from HTML (bs4): {str(e)}")
+            return ""
+
+    def extract_with_lxml(self, url):
+        try:
+            config = Config()
+            config.browser_user_agent = random.choice(USER_AGENTS)
+            config.request_timeout = 5
+            session = HTMLSession()
+
+            response = session.get(url)
+            response.html.render(timeout=config.request_timeout)
+            html_content = response.html.html
+
+            tree = html.fromstring(html_content)
+            paragraphs = tree.cssselect('p, h1, h2, h3, h4, h5, h6')
+            content = ' '.join([para.text_content() for para in paragraphs if para.text_content()])
+            content = content.replace('\t', ' ').replace('\n', ' ').strip()
+
+            return content[:1600]
+
+        except ArticleException as ae:
+            print(f"Error while extracting text from HTML (lxml): {str(ae)}")
+            return ""
+
+        except RequestException as re:
+            print(f"Error while making the request to the URL (lxml): {str(re)}")
+            return ""
+
+        except Exception as e:
+            print(f"Unknown error while extracting text from HTML (lxml): {str(e)}")
+            return ""
+
\ No newline at end of file
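`WebpageExtractor` provides three strategies: newspaper3k over a rendered requests-html session, BeautifulSoup, and lxml; the search wrappers chain only the first two. A rough sketch of that fallback order, assuming the package-style import path used elsewhere in this diff and that newspaper3k, requests-html, PyPDF2, bs4 and lxml are installed:

```python
from superagi.helper.webpage_extractor import WebpageExtractor


def extract_first_nonempty(url: str) -> str:
    """Mirror the fallback order used by the search wrappers:
    newspaper3k-based extraction first, BeautifulSoup second."""
    extractor = WebpageExtractor()
    content = extractor.extract_with_3k(url)
    if content == "":
        content = extractor.extract_with_bs4(url)
    return content


if __name__ == "__main__":
    # Example URL only; both extractors truncate their output to ~1500 characters.
    print(extract_first_nonempty("https://example.com"))
```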
diff --git a/superagi/tools/google_search/__init__.py b/superagi/tools/google_search/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/superagi/tools/google_search/tools.py b/superagi/tools/google_search/tools.py
new file mode 100644
index 000000000..a9fa6b761
--- /dev/null
+++ b/superagi/tools/google_search/tools.py
@@ -0,0 +1,40 @@
+from typing import Type, List
+from pydantic import BaseModel, Field
+from superagi.tools.base_tool import BaseTool
+from superagi.helper.google_search import GoogleSearchWrap
+import os
+import json
+
+
+class GoogleSearchSchema(BaseModel):
+    query: str = Field(
+        ...,
+        description="The search query for Google search.",
+    )
+
+
+class GoogleSearchTool(BaseTool):
+    name = "GoogleSearch"
+    description = (
+        "A tool for performing a Google search and extracting snippets and webpages. "
+        "Input should be a search query."
+    )
+    args_schema: Type[GoogleSearchSchema] = GoogleSearchSchema
+
+    def execute(self, query: str) -> str:
+        api_key = os.environ.get("GOOGLE_API_KEY")
+        search_engine_id = os.environ.get("SEARCH_ENGINE_ID")
+        num_results = 10
+        num_pages = 1
+        num_extracts = 3
+
+        google_search = GoogleSearchWrap(api_key, search_engine_id, num_results, num_pages, num_extracts)
+        snippets, webpages, links = google_search.get_result(query)
+
+        result = {
+            "snippets": snippets,
+            "webpages": webpages,
+            "links": links
+        }
+
+        return json.dumps(result)
\ No newline at end of file
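A quick way to exercise the tool by hand is sketched below. How SuperAGI's agent loop actually constructs and invokes `BaseTool` subclasses may differ, and the credential values are placeholders.

```python
import json
import os

from superagi.tools.google_search.tools import GoogleSearchTool

# Placeholder credentials; substitute a real Custom Search API key and engine id.
os.environ.setdefault("GOOGLE_API_KEY", "<google-api-key>")
os.environ.setdefault("SEARCH_ENGINE_ID", "<search-engine-id>")

tool = GoogleSearchTool()
raw = tool.execute(query="SuperAGI roadmap")

result = json.loads(raw)
print(result["links"])
```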
diff --git a/superagi/tools/google_serp_search/__init__.py b/superagi/tools/google_serp_search/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/superagi/tools/google_serp_search/tools.py b/superagi/tools/google_serp_search/tools.py
new file mode 100644
index 000000000..57c2fba48
--- /dev/null
+++ b/superagi/tools/google_serp_search/tools.py
@@ -0,0 +1,38 @@
+from typing import Type
+from pydantic import BaseModel, Field
+from superagi.tools.base_tool import BaseTool
+from superagi.helper.google_serp import GoogleSerpApiWrap  # Import the GoogleSerpApiWrap class
+import os
+import json
+
+class GoogleSerpSchema(BaseModel):
+    query: str = Field(
+        ...,
+        description="The search query for Google SERP.",
+    )
+
+
+class GoogleSerpTool(BaseTool):
+    name = "GoogleSerp"
+    description = (
+        "A tool for performing a Google SERP search and extracting snippets and webpages. "
+        "Input should be a search query."
+    )
+    args_schema: Type[GoogleSerpSchema] = GoogleSerpSchema
+
+    def execute(self, query: str) -> str:
+        api_key = os.environ.get("SERP_API_KEY")
+        num_results = 10
+        num_pages = 1
+        num_extracts = 3
+
+        serp_api = GoogleSerpApiWrap(api_key, num_results, num_pages, num_extracts)
+        snippets, webpages, links = serp_api.get_result(query)
+
+        result = {
+            "snippets": snippets,
+            "webpages": webpages,
+            "links": links
+        }
+
+        return json.dumps(result)
\ No newline at end of file
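Neither wrapper comes with tests. A sketch of how `GoogleSearchWrap.search_run` could be unit-tested with the HTTP call mocked out; the module path matches the imports assumed above, the response payload is invented for the test, and the extractor's dependencies still need to be importable.

```python
from unittest import mock

from superagi.helper.google_search import GoogleSearchWrap


def test_search_run_parses_items():
    # Fake Custom Search API response carrying a single result item.
    fake_response = mock.Mock(status_code=200)
    fake_response.json.return_value = {
        "items": [{"snippet": "a snippet", "link": "https://example.com"}]
    }

    with mock.patch("superagi.helper.google_search.requests.get",
                    return_value=fake_response):
        wrap = GoogleSearchWrap(api_key="test-key", search_engine_id="test-cx")
        snippets, links, status = wrap.search_run("anything")

    assert snippets == ["a snippet"]
    assert links == ["https://example.com"]
    assert status == 200
```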