Adds support for searx search
Searx is a free internet metasearch engine which aggregates
results from more than 70 search services. I added a simple
BeautifulSoup scraper that allows many of the instances on
https://searx.space/ to be used without an API key. It uses the
bs4, httpx, and pydantic packages, which are already in
`requirements.txt`.
alexkreidler committed Jun 9, 2023
1 parent 3fc82cd commit dc6eaff
Showing 4 changed files with 132 additions and 0 deletions.
16 changes: 16 additions & 0 deletions superagi/tools/searx/README.MD
@@ -0,0 +1,16 @@
<p align=center>
<a href="https://superagi.co"><img src=https://superagi.co/wp-content/uploads/2023/05/SuperAGI_icon.png></a>
</p>

# SuperAGI Searx Search Tool

The SuperAGI Searx Search Tool helps users perform a Searx search and extract snippets and webpages. We parse the HTML response pages because most Searx instances do not support the JSON response format without an API key.

## ⚙️ Installation

### 🛠 **Setting Up SuperAGI**
Set up SuperAGI by following the instructions given [here](https://github.com/TransformerOptimus/SuperAGI/blob/main/README.MD).

## Running the SuperAGI Searx Search Tool

You can simply ask your agent for the latest information about anything in the world, and it will browse the internet to get that information for you.
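
For a quick check outside of an agent run, the scraper behind the tool can also be called directly. A minimal sketch, assuming SuperAGI is importable and at least one of the public instances is reachable (the query string is just an example):

```python
from superagi.tools.searx.search_scraper import search_results

# Prints numbered title/link/snippet blocks scraped from a randomly chosen instance
print(search_results("open source autonomous agents"))
```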
0 changes: 0 additions & 0 deletions superagi/tools/searx/__init__.py
Empty file.
73 changes: 73 additions & 0 deletions superagi/tools/searx/search_scraper.py
@@ -0,0 +1,73 @@
import random
from typing import List
import httpx
from bs4 import BeautifulSoup
from pydantic import BaseModel


searx_hosts = ["https://search.ononoki.org", "https://searx.be", "https://search.us.projectsegfau.lt"]

class SearchResult(BaseModel):
    id: int
    title: str
    link: str
    description: str
    sources: List[str]

    def __str__(self):
        return f"""{self.id}. {self.title} - {self.link}
{self.description}"""


def search(query):
    '''Gets the raw HTML of a Searx search result page'''
    # TODO: use a better strategy for choosing hosts. Could use this list: https://searx.space/data/instances.json
    searx_url = random.choice(searx_hosts)
    res = httpx.get(
        searx_url + "/search",
        params={"q": query},
        headers={"User-Agent": "Mozilla/5.0 (X11; Linux i686; rv:109.0) Gecko/20100101 Firefox/114.0"},
    )
    if res.status_code != 200:
        print(res.status_code, searx_url)
        raise Exception(f"Searx returned {res.status_code} status code")

    return res.text


def clean_whitespace(s: str):
    return " ".join(s.split())


def scrape_results(html):
    '''Converts raw HTML into a list of SearchResult objects'''
    soup = BeautifulSoup(html, "html.parser")
    result_divs = soup.find_all(attrs={"class": "result"})

    result_list = []
    n = 1
    for result_div in result_divs:
        # Needed to work on multiple versions of Searx
        header = result_div.find(["h4", "h3"])
        link = header.find("a")["href"]
        title = header.text.strip()

        description = clean_whitespace(result_div.find("p").text)

        # Needed to work on multiple versions of Searx
        sources_container = result_div.find(
            attrs={"class": "pull-right"}
        ) or result_div.find(attrs={"class": "engines"})
        source_spans = sources_container.find_all("span")
        sources = []
        for s in source_spans:
            sources.append(s.text.strip())

        result = SearchResult(
            id=n, title=title, link=link, description=description, sources=sources
        )
        result_list.append(result)
        n += 1

    return result_list


def search_results(query):
    '''Returns a text summary of the search results via the SearchResult.__str__ method'''
    return "\n\n".join(str(result) for result in scrape_results(search(query)))
43 changes: 43 additions & 0 deletions superagi/tools/searx/searx.py
@@ -0,0 +1,43 @@
from typing import Type, Optional
from pydantic import BaseModel, Field
from superagi.llms.base_llm import BaseLlm
from superagi.tools.base_tool import BaseTool
from superagi.tools.searx.search_scraper import search_results


class SearxSearchSchema(BaseModel):
    query: str = Field(
        ...,
        description="The search query for the Searx search engine.",
    )


class SearxSearchTool(BaseTool):
    llm: Optional[BaseLlm] = None
    name = "SearxSearch"
    description = (
        "A tool for performing a Searx search and extracting snippets and webpages. "
        "Input should be a search query."
    )
    args_schema: Type[SearxSearchSchema] = SearxSearchSchema

    class Config:
        arbitrary_types_allowed = True

    def _execute(self, query: str) -> str:
        snippets = search_results(query)
        summary = self.summarise_result(query, snippets)

        return summary

    def summarise_result(self, query, snippets):
        summarize_prompt = """Summarize the following text `{snippets}`
        Write a summary, as concise or as descriptive as necessary, and attempt to
        answer the query: `{query}` as best as possible. Use markdown formatting for
        longer responses."""

        # Substitute the scraped snippets and the query into the prompt
        summarize_prompt = summarize_prompt.replace("{snippets}", str(snippets))
        summarize_prompt = summarize_prompt.replace("{query}", query)

        messages = [{"role": "system", "content": summarize_prompt}]
        result = self.llm.chat_completion(messages, max_tokens=self.max_token_limit)
        return result["content"]
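
For reference, `args_schema` is what validates the agent-supplied arguments before `_execute` runs. A minimal sketch of that validation step (the query value is illustrative):

```python
from superagi.tools.searx.searx import SearxSearchSchema

# Pydantic validates the arguments before the tool executes;
# a missing query raises a ValidationError
args = SearxSearchSchema(query="open source metasearch engines")
print(args.query)
```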
