* Combined summary.py, web.py and browse.py because they were duplicating each other's efforts in many aspects.

* added a new config parameter in .env to control which kind of browser the user wants: headless or full Selenium with Chrome (a short usage sketch follows after this list)
* restored browse_website() to commands.py
* PR Significant-Gravitas#1397 introduced a working Selenium adapter, but inadvertently clobbered PR Significant-Gravitas#968 and replicated most of browse.py based on an old version, without any merge conflicts. This is now rectified by moving the Selenium code into browse.py and reducing duplication as much as possible.
* fixed a small bug where the link scraper also returned an object reference along with the links.
* listed the pros and cons of each browser in the source code
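
For orientation, here is a minimal usage sketch of the new browser selection (an editor's illustration, not part of the commit; the URL is a placeholder, and it assumes the environment variable is set before autogpt.browse is first imported):

import os

# Choose the backend before autogpt.browse builds its Config()
# (normally this comes from the .env file; set here only for illustration).
os.environ["BROWSER_AUTOMATION"] = "SeleniumChrome"  # or "HeadlessBarebones"

from autogpt.browse import get_browser_instance, scrape_text

# All scraping now goes through a singleton derived from BrowserBase:
browser = get_browser_instance()
page_source = browser.get_page_source("https://example.com")  # placeholder URL

# Higher-level helpers like scrape_text() use the same backend internally:
text = scrape_text("https://example.com")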
batyu committed Apr 15, 2023
1 parent 6a93537 commit a9fb5dd
Showing 7 changed files with 161 additions and 198 deletions.
2 changes: 2 additions & 0 deletions .env.template
@@ -7,6 +7,8 @@ EXECUTE_LOCAL_COMMANDS=False
BROWSE_CHUNK_MAX_LENGTH=8192
# BROWSE_SUMMARY_MAX_TOKEN - Define the maximum length of the summary generated by GPT agent when browsing website
BROWSE_SUMMARY_MAX_TOKEN=300
# BROWSER_AUTOMATION - Can be "HeadlessBarebones" or "SeleniumChrome" for now
BROWSER_AUTOMATION=SeleniumChrome
# USER_AGENT - Define the user-agent used by the requests library to browse website (string)
# USER_AGENT="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"
# AI_SETTINGS_FILE - Specifies which AI Settings file to use (defaults to ai_settings.yaml)
183 changes: 142 additions & 41 deletions autogpt/browse.py
@@ -1,18 +1,24 @@
+import logging
+from abc import ABC, abstractmethod
+from pathlib import Path
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.support.wait import WebDriverWait
+from webdriver_manager.chrome import ChromeDriverManager

from autogpt.config import Config
from autogpt.llm_utils import create_chat_completion
from autogpt.memory import get_memory

cfg = Config()
memory = get_memory(cfg)

-session = requests.Session()
-session.headers.update({"User-Agent": cfg.user_agent})
+_browser_instance = None

# Function to check if the URL is valid
def is_valid_url(url):
@@ -39,44 +45,28 @@ def check_local_file_access(url):
    return any(url.startswith(prefix) for prefix in local_prefixes)


-def get_response(url, timeout=10):
-    try:
-        # Restrict access to local files
-        if check_local_file_access(url):
-            raise ValueError("Access to local files is restricted")
-
-        # Most basic check if the URL is valid:
-        if not url.startswith("http://") and not url.startswith("https://"):
-            raise ValueError("Invalid URL format")
-
-        sanitized_url = sanitize_url(url)
-
-        response = session.get(sanitized_url, timeout=timeout)
-
-        # Check if the response contains an HTTP error
-        if response.status_code >= 400:
-            return None, "Error: HTTP " + str(response.status_code) + " error"
-
-        return response, None
-    except ValueError as ve:
-        # Handle invalid URL format
-        return None, "Error: " + str(ve)
-
-    except requests.exceptions.RequestException as re:
-        # Handle exceptions related to the HTTP request
-        # (e.g., connection errors, timeouts, etc.)
-        return None, "Error: " + str(re)
+def check_and_sanitize_url(url):
+    # Restrict access to local files
+    if check_local_file_access(url):
+        raise ValueError("Access to local files is restricted")
+
+    # Most basic check if the URL is valid:
+    if not url.startswith("http://") and not url.startswith("https://"):
+        raise ValueError("Invalid URL format")
+
+    sanitized_url = sanitize_url(url)
+    return sanitized_url


def scrape_text(url):
    """Scrape text from a webpage"""
-    response, error_message = get_response(url)
-    if error_message:
-        return error_message
-    if not response:
-        return "Error: Could not get response"
+    browser = get_browser_instance()
+    try:
+        page_source = browser.get_page_source(url)
+    except ValueError as ve:
+        return str(ve)

-    soup = BeautifulSoup(response.text, "html.parser")
+    soup = BeautifulSoup(page_source, "html.parser")

    for script in soup(["script", "style"]):
        script.extract()
@@ -107,12 +97,13 @@ def format_hyperlinks(hyperlinks):

def scrape_links(url):
    """Scrape links from a webpage"""
-    response, error_message = get_response(url)
-    if error_message:
-        return error_message
-    if not response:
-        return "Error: Could not get response"
-    soup = BeautifulSoup(response.text, "html.parser")
+    browser = get_browser_instance()
+    try:
+        page_source = browser.get_page_source(url)
+    except ValueError as ve:
+        return str(ve)
+
+    soup = BeautifulSoup(page_source, "html.parser")

    for script in soup(["script", "style"]):
        script.extract()
@@ -196,3 +187,113 @@ def summarize_text(url, text, question):
    )

    return final_summary


def _initialize_requests_session():
    session = requests.Session()
    session.headers.update({"User-Agent": cfg.user_agent})
    return session


def get_browser_instance():
    # Singleton derived from the "BrowserBase" class
    global _browser_instance

    if _browser_instance is not None:
        return _browser_instance
    else:
        browser_cls = {
            "HeadlessBarebones": BrowserBareBonesHeadless,
            "SeleniumChrome": BrowserSeleniumChrome,
        }
        assert cfg.browser_automation in browser_cls, \
            "ERROR: Unknown browser setting for BROWSER_AUTOMATION in .env config file."
        cls = browser_cls[cfg.browser_automation]
        _browser_instance = cls()
        return _browser_instance


class BrowserBase(ABC):

    @abstractmethod
    def get_page_source(self, url):
        # Return the body of the HTML page, in a format that can be digested
        # by BeautifulSoup's html.parser.
        # If an error is encountered, raise a ValueError for simplicity.
        pass


class BrowserBareBonesHeadless(BrowserBase):
    """
    Advantages: headless, so it runs on servers too; does not need Chrome installed; faster.
    Disadvantage: may miss some content, because the Javascript parts of the website are not executed.
    """
    session = _initialize_requests_session()

    def __init__(self):
        pass

    def get_page_source(self, url):
        """Fetch the raw page source of a webpage"""
        response, error_message = self.get_response(url)
        if error_message:
            raise ValueError(error_message)

        if not response:
            raise ValueError("Error: Could not get response")

        return response.text

    def get_response(self, url, timeout=10):
        try:
            sanitized_url = check_and_sanitize_url(url)
            response = BrowserBareBonesHeadless.session.get(sanitized_url, timeout=timeout)

            # Check if the response contains an HTTP error
            if response.status_code >= 400:
                return None, "Error: HTTP " + str(response.status_code) + " error"

            return response, None
        except ValueError as ve:
            # Handle invalid URL format
            return None, "Error: " + str(ve)

        except requests.exceptions.RequestException as re:
            # Handle exceptions related to the HTTP request
            # (e.g., connection errors, timeouts, etc.)
            return None, "Error: " + str(re)


class BrowserSeleniumChrome(BrowserBase):
    """
    Advantage: loads a website with Javascript running, which many modern websites need for proper content.
    Disadvantage: can be slower than BrowserBareBonesHeadless.
    """
    file_dir = Path(__file__).parent

    def __init__(self):
        logging.getLogger("selenium").setLevel(logging.CRITICAL)
        self.options = Options()
        self.options.add_argument(f"user-agent={cfg.user_agent}")

    def get_page_source(self, url):
        # TODO: re-use a session in Selenium, instead of starting a new one every time
        sanitized_url = check_and_sanitize_url(url)
        with webdriver.Chrome(executable_path=ChromeDriverManager().install(),
                              options=self.options) as driver:
            driver.get(sanitized_url)
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "body"))
            )
            # Get the HTML content directly from the browser's DOM
            page_source = driver.execute_script("return document.body.outerHTML;")

            # Add graphical overlay
            self.add_header(driver)

        # The context manager quits the browser on exit, so no explicit
        # driver.quit() is needed.
        return page_source

    def add_header(self, driver):
        with open(f"{self.file_dir}/js/overlay.js", "r") as overlay_file:
            driver.execute_script(overlay_file.read())
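
The new class hierarchy also gives a clear extension point. A hypothetical sketch of a third backend (an editor's illustration, not part of the commit; the Firefox class name and dict entry are invented for the example):

class BrowserSeleniumFirefox(BrowserBase):  # hypothetical backend
    """
    Same contract as the other backends: return the page source, or raise a ValueError on failure.
    """

    def get_page_source(self, url):
        sanitized_url = check_and_sanitize_url(url)
        # webdriver is already imported at the top of browse.py; requires geckodriver.
        with webdriver.Firefox() as driver:
            driver.get(sanitized_url)
            return driver.page_source

# To make it selectable via BROWSER_AUTOMATION, the browser_cls dict in
# get_browser_instance() would also need a matching 'SeleniumFirefox' entry.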
17 changes: 15 additions & 2 deletions autogpt/commands.py
@@ -17,8 +17,6 @@
)
from autogpt.memory import get_memory
from autogpt.speak import say_text
-from autogpt.web import browse_website
-

cfg = Config()

@@ -206,6 +204,21 @@ def google_official_search(query, num_results=8):
    return search_results_links


def browse_website(url, question):
    """Browse a website and return the summary and links"""

    summary = get_text_summary(url, question)
    links = get_hyperlinks(url)

    # Limit links to 5
    if len(links) > 5:
        links = links[:5]

    result = f"""Website Content Summary: {summary}\n\nLinks: {links}"""

    return result


def get_text_summary(url, question):
    """Return a summary of the text scraped from a website"""
    text = scrape_text(url)
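
For reference, a call of the restored command could look like this (an editor's illustration; the URL and question are placeholders):

result = browse_website("https://example.com", "What is this page about?")
# result is a single string of the form:
# "Website Content Summary: <summary>\n\nLinks: [<up to 5 links>]"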
1 change: 1 addition & 0 deletions autogpt/config.py
@@ -47,6 +47,7 @@ def __init__(self):
        self.smart_token_limit = int(os.getenv("SMART_TOKEN_LIMIT", 8000))
        self.browse_chunk_max_length = int(os.getenv("BROWSE_CHUNK_MAX_LENGTH", 8192))
        self.browse_summary_max_token = int(os.getenv("BROWSE_SUMMARY_MAX_TOKEN", 300))
        self.browser_automation = os.getenv("BROWSER_AUTOMATION", "HeadlessBarebones")

        self.openai_api_key = os.getenv("OPENAI_API_KEY")
        self.temperature = float(os.getenv("TEMPERATURE", "1"))
69 changes: 0 additions & 69 deletions autogpt/summary.py

This file was deleted.

