Adds support for searx search
Searx is a free internet metasearch engine which aggregates
results from more than 70 search services. I added a simple
BeautifulSoup scraper that allows many of the instances on
https://searx.space/ to be used without an API key. It uses the
bs4, httpx, and pydantic packages, which are already in
`requirements.txt`.
alexkreidler committed Jun 9, 2023
1 parent 3fc82cd commit dc6eaff
Showing 4 changed files with 132 additions and 0 deletions.
16 changes: 16 additions & 0 deletions superagi/tools/searx/README.MD
@@ -0,0 +1,16 @@
<p align=center>
<a href="https://superagi.co"><img src=https://superagi.co/wp-content/uploads/2023/05/SuperAGI_icon.png></a>
</p>

# SuperAGI Searx Search Tool

The SuperAGI Searx Search Tool helps users perform a Searx search and extract snippets and webpages. We parse the HTML response pages because most Searx instances do not support the JSON response format without an API key.

## ⚙️ Installation

### 🛠 **Setting Up SuperAGI**
Set up SuperAGI by following the instructions given [here](https://github.com/TransformerOptimus/SuperAGI/blob/main/README.MD).

## Running the SuperAGI Searx Search Tool

You can simply ask your agent for the latest information about anything in the world, and it will browse the internet to get that information for you.
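
For a quick check outside of an agent run, the scraper behind the tool can also be called directly. A minimal sketch, assuming SuperAGI is importable and at least one of the public instances is reachable (the query string is just an example):

```python
from superagi.tools.searx.search_scraper import search_results

# Prints numbered title/link/snippet blocks scraped from a randomly chosen instance
print(search_results("open source autonomous agents"))
```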
0 changes: 0 additions & 0 deletions superagi/tools/searx/__init__.py
Empty file.
73 changes: 73 additions & 0 deletions superagi/tools/searx/search_scraper.py
@@ -0,0 +1,73 @@
import random
from typing import List
import httpx
from bs4 import BeautifulSoup
from pydantic import BaseModel


searx_hosts = ["https://search.ononoki.org", "https://searx.be", "https://search.us.projectsegfau.lt"]

class SearchResult(BaseModel):
    id: int
    title: str
    link: str
    description: str
    sources: List[str]

    def __str__(self):
        return f"""{self.id}. {self.title} - {self.link}
{self.description}"""


def search(query):
    '''Gets the raw HTML of a Searx search result page'''
    # TODO: use a better strategy for choosing hosts. Could use this list: https://searx.space/data/instances.json
    searx_url = random.choice(searx_hosts)
    res = httpx.get(
        searx_url + "/search",
        params={"q": query},
        headers={"User-Agent": "Mozilla/5.0 (X11; Linux i686; rv:109.0) Gecko/20100101 Firefox/114.0"},
    )
    if res.status_code != 200:
        print(res.status_code, searx_url)
        raise Exception(f"Searx returned {res.status_code} status code")

    return res.text


def clean_whitespace(s: str):
    return " ".join(s.split())


def scrape_results(html):
    '''Converts raw HTML into a list of SearchResult objects'''
    soup = BeautifulSoup(html, "html.parser")
    result_divs = soup.find_all(attrs={"class": "result"})

    result_list = []
    n = 1
    for result_div in result_divs:
        # Needed to work on multiple versions of Searx
        header = result_div.find(["h4", "h3"])
        link = header.find("a")["href"]
        title = header.text.strip()

        description = clean_whitespace(result_div.find("p").text)

        # Needed to work on multiple versions of Searx
        sources_container = result_div.find(
            attrs={"class": "pull-right"}
        ) or result_div.find(attrs={"class": "engines"})
        source_spans = sources_container.find_all("span")
        sources = []
        for s in source_spans:
            sources.append(s.text.strip())

        result = SearchResult(
            id=n, title=title, link=link, description=description, sources=sources
        )
        result_list.append(result)
        n += 1

    return result_list


def search_results(query):
    '''Returns a text summary of the search results via the SearchResult.__str__ method'''
    return "\n\n".join(str(result) for result in scrape_results(search(query)))
43 changes: 43 additions & 0 deletions superagi/tools/searx/searx.py
@@ -0,0 +1,43 @@
from typing import Type, Optional
from pydantic import BaseModel, Field
from superagi.llms.base_llm import BaseLlm
from superagi.tools.base_tool import BaseTool
from superagi.tools.searx.search_scraper import search_results


class SearxSearchSchema(BaseModel):
    query: str = Field(
        ...,
        description="The search query for the Searx search engine.",
    )


class SearxSearchTool(BaseTool):
    llm: Optional[BaseLlm] = None
    name = "SearxSearch"
    description = (
        "A tool for performing a Searx search and extracting snippets and webpages. "
        "Input should be a search query."
    )
    args_schema: Type[SearxSearchSchema] = SearxSearchSchema

    class Config:
        arbitrary_types_allowed = True

    def _execute(self, query: str) -> str:
        snippets = search_results(query)
        summary = self.summarise_result(query, snippets)

        return summary

    def summarise_result(self, query, snippets):
        summarize_prompt = """Summarize the following text `{snippets}`
        Write a summary, as concise or as descriptive as necessary, and attempt to
        answer the query: `{query}` as best as possible. Use markdown formatting for
        longer responses."""

        # Substitute the scraped snippets and the query into the prompt
        summarize_prompt = summarize_prompt.replace("{snippets}", str(snippets))
        summarize_prompt = summarize_prompt.replace("{query}", query)

        messages = [{"role": "system", "content": summarize_prompt}]
        result = self.llm.chat_completion(messages, max_tokens=self.max_token_limit)
        return result["content"]
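
For reference, `args_schema` is what validates the agent-supplied arguments before `_execute` runs. A minimal sketch of that validation step (the query value is illustrative):

```python
from superagi.tools.searx.searx import SearxSearchSchema

# Pydantic validates the arguments before the tool executes;
# a missing query raises a ValidationError
args = SearxSearchSchema(query="open source metasearch engines")
print(args.query)
```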
