forked from TransformerOptimus/SuperAGI
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Searx is a free internet metasearch engine which aggregates results from more than 70 search services. I added a simple beautifulsoup scraper that allows many of the instances on https://searx.space/ to be used without an API key. Uses the bs4, httpx, and pydantic packages which are already in the `requirements.txt`.
- Loading branch information
1 parent
3fc82cd
commit dc6eaff
Showing
4 changed files
with
132 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
<p align=center> | ||
<a href="https://superagi.co"><img src=https://superagi.co/wp-content/uploads/2023/05/SuperAGI_icon.png></a> | ||
</p> | ||
|
||
# SuperAGI Searx Search Tool | ||
|
||
The SuperAGI Searx Search Tool helps users perform a Searx search and extract snippets and webpages. We parse the HTML response pages because most Searx instances do not support the JSON response format without an API key. | ||
|
||
## ⚙️ Installation | ||
|
||
### 🛠 **Setting Up of SuperAGI** | ||
Set up SuperAGI by following the instructions given here: https://github.com/TransformerOptimus/SuperAGI/blob/main/README.MD
|
||
## Running the SuperAGI Searx Search Tool
|
||
You can simply ask your agent for the latest information about anything in the world, and your agent will be able to browse the internet to get that information for you.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
import random | ||
from typing import List | ||
import httpx | ||
from bs4 import BeautifulSoup | ||
from pydantic import BaseModel | ||
|
||
|
||
# Public Searx instances that serve HTML results without an API key.
# Picked at random per request (see search()); see https://searx.space/ for more.
searx_hosts = ["https://search.ononoki.org", "https://searx.be", "https://search.us.projectsegfau.lt"]
|
||
class SearchResult(BaseModel):
    """One scraped Searx search result.

    Attributes:
        id: 1-based position of the result on the page.
        title: Result heading text.
        link: Target URL of the result.
        description: Whitespace-normalized snippet text.
        sources: Names of the search engines that produced this result.
    """

    id: int
    title: str
    link: str
    description: str
    sources: List[str]

    def __str__(self):
        # Rendered as "N. Title - URL" followed by the snippet on the next line.
        header_line = f"{self.id}. {self.title} - {self.link}"
        return f"{header_line}\n{self.description}"
|
||
def search(query):
    """Fetch the raw HTML of a Searx search result page for *query*.

    A host is picked at random from ``searx_hosts``; raises ``Exception``
    on any non-200 response.
    """
    # TODO: use a better strategy for choosing hosts. Could use this list: https://searx.space/data/instances.json
    host = random.choice(searx_hosts)
    # A browser-like User-Agent: some instances reject unknown clients.
    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux i686; rv:109.0) Gecko/20100101 Firefox/114.0"}
    res = httpx.get(host + "/search", params={"q": query}, headers=headers)
    if res.status_code != 200:
        print(res.status_code, host)
        raise Exception(f"Searx returned {res.status_code} status code")
    return res.text
|
||
def clean_whitespace(s: str):
    """Collapse every run of whitespace in *s* to a single space and trim the ends."""
    tokens = s.split()
    return " ".join(tokens)
|
||
|
||
def scrape_results(html):
    """Convert a raw Searx HTML results page into a list of SearchResult objects.

    Searx instance markup varies, so result blocks missing a header, a link,
    or a snippet are skipped (or defaulted) instead of raising AttributeError
    when ``find`` returns None.
    """
    soup = BeautifulSoup(html, "html.parser")
    result_divs = soup.find_all(attrs={"class": "result"})

    result_list = []
    n = 1
    for result_div in result_divs:
        # Needed to work on multiple versions of Searx
        header = result_div.find(["h4", "h3"])
        if header is None:
            continue  # not a regular result block (e.g. an answer/map widget)
        anchor = header.find("a")
        if anchor is None or not anchor.has_attr("href"):
            continue  # no usable link; skip rather than crash
        link = anchor["href"]
        title = header.text.strip()

        # Some results carry no <p> snippet; fall back to an empty description.
        snippet = result_div.find("p")
        description = clean_whitespace(snippet.text) if snippet is not None else ""

        # Needed to work on multiple versions of Searx
        sources_container = result_div.find(
            attrs={"class": "pull-right"}
        ) or result_div.find(attrs={"class": "engines"})
        sources = []
        if sources_container is not None:
            sources = [span.text.strip() for span in sources_container.find_all("span")]

        result = SearchResult(
            id=n, title=title, link=link, description=description, sources=sources
        )
        result_list.append(result)
        n += 1

    return result_list
|
||
|
||
def search_results(query):
    """Run a Searx search for *query* and return all results as one text blob.

    Each result is rendered via SearchResult.__str__ and entries are
    separated by blank lines.
    """
    rendered = [str(result) for result in scrape_results(search(query))]
    return "\n\n".join(rendered)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
from typing import Type, Optional | ||
from pydantic import BaseModel, Field | ||
from superagi.llms.base_llm import BaseLlm | ||
from superagi.tools.base_tool import BaseTool | ||
from superagi.tools.searx.search_scraper import search_results | ||
|
||
|
||
class SearxSearchSchema(BaseModel):
    """Input schema for SearxSearchTool: a single free-text search query."""

    query: str = Field(
        ...,
        description="The search query for the Searx search engine.",
    )
|
||
class SearxSearchTool(BaseTool):
    """SuperAGI tool that searches the web via a public Searx instance and
    returns an LLM-written summary of the scraped result snippets.
    """

    llm: Optional[BaseLlm] = None
    name = "SearxSearch"
    description = (
        "A tool for performing a Searx search and extracting snippets and webpages."
        "Input should be a search query."
    )
    args_schema: Type[SearxSearchSchema] = SearxSearchSchema

    class Config:
        arbitrary_types_allowed = True

    def _execute(self, query: str) -> str:
        """Search for *query* and return the LLM summary of the results.

        Fixed the return annotation: the method returns the summary string,
        not a tuple.
        """
        snippets = search_results(query)
        summary = self.summarise_result(query, snippets)

        return summary

    def summarise_result(self, query, snippets):
        """Ask the LLM to condense raw search snippets into an answer to *query*.

        Returns the ``content`` field of the chat-completion response.
        """
        summarize_prompt = """Summarize the following text `{snippets}`
        Write a concise or as descriptive as necessary and attempt to
        answer the query: `{query}` as best as possible. Use markdown formatting for
        longer responses."""

        # str.replace (not str.format) so literal braces inside snippets are inert.
        summarize_prompt = summarize_prompt.replace("{snippets}", str(snippets))
        summarize_prompt = summarize_prompt.replace("{query}", query)

        messages = [{"role": "system", "content": summarize_prompt}]
        # NOTE(review): max_token_limit is not declared on this class; presumably
        # inherited from BaseTool — confirm, otherwise this raises AttributeError.
        result = self.llm.chat_completion(messages, max_tokens=self.max_token_limit)
        return result["content"]