Tools logic merging to v3 #39

Merged 13 commits on May 18, 2023
5 changes: 4 additions & 1 deletion README.MD
@@ -53,6 +53,9 @@ Run multiple agents simultaneously, maximizing efficiency and achieving parallel
### 💾 **Resource Manager:**
Read and store files generated by agents, facilitating data management and analysis.

# 🛣 Roadmap
[Click here to checkout the latest roadmap 🔗](https://github.com/TransformerOptimus/SuperAGI/wiki/Roadmap-%F0%9F%9B%A3)

# ⚙️ Setting up

1. Download the repo using `git clone https://github.com/TransformerOptimus/SuperAGI.git` in your terminal, or download it directly from the GitHub page as a ZIP and unzip it into your desired folder
@@ -86,4 +89,4 @@ This project is under active development and may still have issues. We appreciat

# ⭐Star History

[![Star History Chart](https://api.star-history.com/svg?repos=TransformerOptimus/SuperAGI&type=Date)](https://star-history.com/#TransformerOptimus/SuperAGI&Date)
[![Star History Chart](https://api.star-history.com/svg?repos=TransformerOptimus/SuperAGI&type=Date)](https://star-history.com/#TransformerOptimus/SuperAGI&Date)
79 changes: 79 additions & 0 deletions superagi/helper/google_search.py
@@ -0,0 +1,79 @@
import requests
import time
from superagi.helper.webpage_extractor import WebpageExtractor


class GoogleSearchWrap:

def __init__(self, api_key, search_engine_id, num_results=10, num_pages=1, num_extracts=3):
self.api_key = api_key
self.search_engine_id = search_engine_id
self.num_results = num_results
self.num_pages = num_pages
self.num_extracts = num_extracts
self.extractor = WebpageExtractor()

def search_run(self, query):
all_snippets = []
links = []
for page in range(1, self.num_pages * self.num_results, self.num_results):
url = "https://www.googleapis.com/customsearch/v1"
params = {
"key": self.api_key,
"cx": self.search_engine_id,
"q": query,
"num": self.num_results,
"start": page
}
response = requests.get(url, params=params, timeout=100)

if response.status_code == 200:
try:
json_data = response.json()
if "items" in json_data:
for item in json_data["items"]:
all_snippets.append(item["snippet"])
links.append(item["link"])
else:
print("No items found in the response.")
except ValueError as e:
print(f"Error while parsing JSON data: {e}")
else:
print(f"Error: {response.status_code}")

return all_snippets, links, response.status_code

def get_result(self, query):
snippets, links, error_code = self.search_run(query)

webpages = []
attempts = 0
while snippets == [] and attempts < 2:
attempts += 1
print("Google blocked the request. Trying again...")
time.sleep(3)
snippets, links, error_code = self.search_run(query)

if links:
for i in range(0, min(self.num_extracts, len(links))):  # guard against fewer links than num_extracts
time.sleep(3)
content = self.extractor.extract_with_3k(links[i])
attempts = 0
while content == "" and attempts < 2:
attempts += 1
content = self.extractor.extract_with_3k(links[i])
if content == "":
time.sleep(3)
content = self.extractor.extract_with_bs4(links[i])
attempts = 0
while content == "" and attempts < 2:
attempts += 1
content = self.extractor.extract_with_bs4(links[i])
webpages.append(content)
else:
snippets = ["", "", ""]
links = ["", "", ""]
webpages = ["", "", ""]

return snippets, webpages, links
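
A minimal usage sketch for GoogleSearchWrap (illustrative, not part of this diff); it assumes the GOOGLE_API_KEY and SEARCH_ENGINE_ID environment variables are set and that the superagi package is importable from the repository root:

import os
from superagi.helper.google_search import GoogleSearchWrap

# Credentials are read from the environment (assumed to be configured).
wrap = GoogleSearchWrap(
    api_key=os.environ["GOOGLE_API_KEY"],
    search_engine_id=os.environ["SEARCH_ENGINE_ID"],
    num_results=5,
    num_pages=1,
    num_extracts=2,
)

# get_result() returns parallel lists of snippets, extracted page text, and links.
snippets, webpages, links = wrap.get_result("open source autonomous agents")
print(links[:2])
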
73 changes: 73 additions & 0 deletions superagi/helper/google_serp.py
@@ -0,0 +1,73 @@
import time
from serpapi import GoogleSearch
from superagi.helper.webpage_extractor import WebpageExtractor

class GoogleSerpApiWrap:
def __init__(self, api_key, num_results=10, num_pages=1, num_extracts=3):
self.api_key = api_key
self.num_results = num_results
self.num_pages = num_pages
self.num_extracts = num_extracts
self.extractor = WebpageExtractor()

def search_run(self, query):
all_snippets = []
links = []

params = {
"api_key": self.api_key,
"engine": 'google',
"num": self.num_results,
"start": 0,
"q": query
}

for page in range(self.num_pages):
params["start"] = page * self.num_results
search = GoogleSearch(params)
results = search.get_dict()

if "organic_results" in results:
for result in results["organic_results"]:
all_snippets.append(result.get("snippet", ""))  # some organic results carry no snippet
links.append(result["link"])
else:
print("No organic results found in the response.")

return all_snippets, links

def get_result(self, query):
snippets, links = self.search_run(query)

webpages = []
attempts = 0
while snippets == [] and attempts < 2:
attempts += 1
print("Google blocked the request. Trying again...")
time.sleep(3)
snippets, links = self.search_run(query)

if links:
for i in range(0, min(self.num_extracts, len(links))):  # guard against fewer links than num_extracts
time.sleep(3)
content = self.extractor.extract_with_3k(links[i])
attempts = 0
while content == "" and attempts < 2:
attempts += 1
content = self.extractor.extract_with_3k(links[i])
if content == "":
time.sleep(3)
content = self.extractor.extract_with_bs4(links[i])
attempts = 0
while content == "" and attempts < 2:
attempts += 1
content = self.extractor.extract_with_bs4(links[i])
webpages.append(content)
else:
snippets = ["", "", ""]
links = ["", "", ""]
webpages = ["", "", ""]

return snippets, webpages, links
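
A comparable sketch for GoogleSerpApiWrap (illustrative, not part of this diff); it assumes a valid SerpApi key in SERP_API_KEY and that the google-search-results package providing the serpapi module is installed:

import os
from superagi.helper.google_serp import GoogleSerpApiWrap

# The wrapper mirrors GoogleSearchWrap but paginates through SerpApi instead of the Custom Search API.
serp = GoogleSerpApiWrap(api_key=os.environ["SERP_API_KEY"], num_results=5, num_pages=1, num_extracts=2)
snippets, webpages, links = serp.get_result("SuperAGI framework")
print(snippets[:2])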

132 changes: 132 additions & 0 deletions superagi/helper/webpage_extractor.py
@@ -0,0 +1,132 @@
from io import BytesIO
from PyPDF2 import PdfFileReader
import requests
import re
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
from newspaper import Article, ArticleException, Config
from requests_html import HTMLSession
import time
import random
from lxml import html

USER_AGENTS = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.1 Safari/605.1.15",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.0",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Safari/605.1.15",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:80.0) Gecko/20100101 Firefox/80.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36",
"Mozilla/5.0 (iPhone; CPU iPhone OS 13_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.4 Mobile/15E148 Safari/604.1",
"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:73.0) Gecko/20100101 Firefox/73.0",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36"
]

class WebpageExtractor:

def __init__(self, num_extracts=3):
self.num_extracts = num_extracts

def extract_with_3k(self, url):
try:
if url.lower().endswith(".pdf"):
response = requests.get(url, timeout=10)  # avoid hanging indefinitely on slow PDF hosts
response.raise_for_status()

with BytesIO(response.content) as pdf_data:
reader = PdfFileReader(pdf_data)
content = " ".join([reader.getPage(i).extract_text() for i in range(reader.getNumPages())])

else:
config = Config()
config.browser_user_agent = random.choice(USER_AGENTS)
config.request_timeout = 5
session = HTMLSession()

response = session.get(url)
response.html.render(timeout=config.request_timeout)
html_content = response.html.html

article = Article(url, config=config)
article.set_html(html_content)
article.parse()
content = article.text.replace('\t', ' ').replace('\n', ' ').strip()

return content[:1500]

except ArticleException as ae:
print(f"Error while extracting text from HTML (newspaper3k): {str(ae)}")
return ""

except RequestException as request_error:  # avoid shadowing the re module
print(f"Error while making the request to the URL (newspaper3k): {str(request_error)}")
return ""

except Exception as e:
print(f"Unknown error while extracting text from HTML (newspaper3k): {str(e)}")
return ""

def extract_with_bs4(self, url):
headers = {
"User-Agent": random.choice(USER_AGENTS)
}

try:
response = requests.get(url, headers=headers, timeout=10)
if response.status_code == 200:
soup = BeautifulSoup(response.text, 'html.parser')
for tag in soup(['script', 'style', 'nav', 'footer', 'head', 'link', 'meta', 'noscript']):
tag.decompose()

main_content_areas = soup.find_all(['main', 'article', 'section', 'div'])
if main_content_areas:
main_content = max(main_content_areas, key=lambda x: len(x.text))
content_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
content = ' '.join([tag.text.strip() for tag in main_content.find_all(content_tags)])
else:
content = ' '.join([tag.text.strip() for tag in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'])])

content = re.sub(r'\t', ' ', content)
content = re.sub(r'\s+', ' ', content)
return content[:1500]
else:
print(f"Error while extracting text from HTML (bs4): {response.status_code}")
return ""

except Exception as e:
print(f"Unknown error while extracting text from HTML (bs4): {str(e)}")
return ""

def extract_with_lxml(self, url):
try:
config = Config()
config.browser_user_agent = random.choice(USER_AGENTS)
config.request_timeout = 5
session = HTMLSession()

response = session.get(url)
response.html.render(timeout=config.request_timeout)
html_content = response.html.html

tree = html.fromstring(html_content)
paragraphs = tree.cssselect('p, h1, h2, h3, h4, h5, h6')
content = ' '.join([para.text_content() for para in paragraphs if para.text_content()])
content = content.replace('\t', ' ').replace('\n', ' ').strip()

return content[:1600]

except ArticleException as ae:
print(f"Error while extracting text from HTML (lxml): {str(ae)}")
return ""

except RequestException as request_error:  # avoid shadowing the re module
print(f"Error while making the request to the URL (lxml): {str(request_error)}")
return ""

except Exception as e:
print(f"Unknown error while extracting text from HTML (lxml): {str(e)}")
return ""
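
A short sketch of how the extractor's fallbacks can be chained (illustrative, not part of this diff); the URL is a placeholder:

from superagi.helper.webpage_extractor import WebpageExtractor

extractor = WebpageExtractor()
url = "https://example.com"  # placeholder URL

# Try the newspaper3k/requests-html path first, then fall back to plain BeautifulSoup parsing.
text = extractor.extract_with_3k(url)
if not text:
    text = extractor.extract_with_bs4(url)
print(text[:200])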

Empty file.
40 changes: 40 additions & 0 deletions superagi/tools/google_search/tools.py
@@ -0,0 +1,40 @@
from typing import Type
from pydantic import BaseModel, Field
from superagi.tools.base_tool import BaseTool
from superagi.helper.google_search import GoogleSearchWrap
import os
import json


class GoogleSearchSchema(BaseModel):
query: str = Field(
...,
description="The search query for Google search.",
)


class GoogleSearchTool(BaseTool):
name = "GoogleSearch"
description = (
"A tool for performing a Google search and extracting snippets and webpages."
"Input should be a search query."
)
args_schema: Type[GoogleSearchSchema] = GoogleSearchSchema

def execute(self, query: str) -> str:
api_key = os.environ.get("GOOGLE_API_KEY")
search_engine_id = os.environ.get("SEARCH_ENGINE_ID")
num_results = 10
num_pages = 1
num_extracts = 3

google_search = GoogleSearchWrap(api_key, search_engine_id, num_results, num_pages, num_extracts)
snippets, webpages, links = google_search.get_result(query)

result = {
"snippets": snippets,
"webpages": webpages,
"links": links
}

return json.dumps(result)
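
A hedged sketch of invoking the tool directly (illustrative, not part of this diff); it assumes BaseTool allows construction without extra arguments and that the Google API credentials are already exported in the environment:

import json
from superagi.tools.google_search.tools import GoogleSearchTool

tool = GoogleSearchTool()
# execute() returns a JSON string with "snippets", "webpages", and "links" keys.
payload = json.loads(tool.execute("latest LLM agent frameworks"))
print(payload["links"])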
Empty file.
38 changes: 38 additions & 0 deletions superagi/tools/google_serp_search/tools.py
@@ -0,0 +1,38 @@
from typing import Type
from pydantic import BaseModel, Field
from superagi.tools.base_tool import BaseTool
from superagi.helper.google_serp import GoogleSerpApiWrap
import os
import json

class GoogleSerpSchema(BaseModel):
query: str = Field(
...,
description="The search query for Google SERP.",
)


class GoogleSerpTool(BaseTool):
name = "GoogleSerp"
description = (
"A tool for performing a Google SERP search and extracting snippets and webpages."
"Input should be a search query."
)
args_schema: Type[GoogleSerpSchema] = GoogleSerpSchema

def execute(self, query: str) -> str:
api_key = os.environ.get("SERP_API_KEY")
num_results = 10
num_pages = 1
num_extracts = 3

serp_api = GoogleSerpApiWrap(api_key, num_results, num_pages, num_extracts)
snippets, webpages, links = serp_api.get_result(query)

result = {
"snippets": snippets,
"webpages": webpages,
"links": links
}

return json.dumps(result)
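
And the SerpApi-backed tool used the same way (illustrative, not part of this diff); SERP_API_KEY must be set beforehand, and BaseTool is again assumed to construct without arguments:

import json
import os
from superagi.tools.google_serp_search.tools import GoogleSerpTool

os.environ.setdefault("SERP_API_KEY", "<your-serpapi-key>")  # hypothetical placeholder key
tool = GoogleSerpTool()
result = json.loads(tool.execute("autonomous agent benchmarks"))
print(result["snippets"][:3])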