fix: Augment the information getting fetched from a webpage
mayurdb committed May 10, 2024
1 parent 0ca52b1 commit f8ce3d5
Showing 2 changed files with 25 additions and 7 deletions.
21 changes: 18 additions & 3 deletions scrapegraphai/nodes/fetch_node.py
@@ -6,7 +6,9 @@
 from langchain_community.document_loaders import AsyncChromiumLoader
 from langchain_core.documents import Document
 from .base_node import BaseNode
-from ..utils.remover import remover
+from ..utils.cleanup_html import cleanup_html
+import requests
+from bs4 import BeautifulSoup
 
 
 class FetchNode(BaseNode):
@@ -32,6 +34,7 @@ class FetchNode(BaseNode):
     def __init__(self, input: str, output: List[str], node_config: Optional[dict]=None, node_name: str = "Fetch"):
         super().__init__(node_name, "node", input, output, 1)
 
+        self.useSoup = True if node_config is None else node_config.get("useSoup", True)
         self.headless = True if node_config is None else node_config.get("headless", True)
         self.verbose = False if node_config is None else node_config.get("verbose", False)
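The new flag defaults to True and is read from node_config like the existing options, so callers opt out rather than in. A hypothetical construction for illustration (the input/output strings and the config keys other than useSoup are placeholders, not taken from this commit):

# Hypothetical: build a FetchNode that skips the requests/BeautifulSoup path
# and falls back to the browser-based loader instead.
fetch_node = FetchNode(
    input="url | local_dir",
    output=["doc"],
    node_config={"useSoup": False, "headless": True, "verbose": False},
)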

@@ -67,10 +70,22 @@ def execute(self, state):
             })]
         # if it is a local directory
         elif not source.startswith("http"):
-            compressed_document = [Document(page_content=remover(source), metadata={
+            compressed_document = [Document(page_content=cleanup_html(source), metadata={
                 "source": "local_dir"
             })]
 
+        elif self.useSoup:
+            response = requests.get(source)
+            if response.status_code == 200:
+                soup = BeautifulSoup(response.text, 'html.parser')
+                links = soup.find_all('a')
+                link_urls = []
+                for link in links:
+                    if 'href' in link.attrs:
+                        link_urls.append(link['href'])
+                compressed_document = [Document(page_content=cleanup_html(soup.prettify(), link_urls))]
+            else:
+                print(f"Failed to retrieve contents from the webpage at url: {source}")
         else:
             if self.node_config is not None and self.node_config.get("endpoint") is not None:
 
@@ -87,7 +102,7 @@ def execute(self, state):
 
             document = loader.load()
             compressed_document = [
-                Document(page_content=remover(str(document[0].page_content)))]
+                Document(page_content=cleanup_html(str(document[0].page_content)))]
 
         state.update({self.output[0]: compressed_document})
         return state
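Stripped of the node plumbing, the new useSoup branch is a plain requests + BeautifulSoup fetch that collects every href on the page before handing the prettified HTML to cleanup_html. A minimal standalone sketch of that logic (the helper name and return shape are illustrative, not part of the commit):

# Illustrative sketch of the useSoup fetch path; fetch_with_soup is not a real
# helper in the repository.
import requests
from bs4 import BeautifulSoup

def fetch_with_soup(source: str):
    """Return the prettified HTML of `source` and the hrefs of all <a> tags."""
    response = requests.get(source)
    response.raise_for_status()  # the node itself only prints a message on failure
    soup = BeautifulSoup(response.text, "html.parser")
    link_urls = [a["href"] for a in soup.find_all("a") if "href" in a.attrs]
    return soup.prettify(), link_urls

html, links = fetch_with_soup("https://example.com")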
11 changes: 7 additions & 4 deletions scrapegraphai/utils/cleanup_html.py
@@ -5,7 +5,7 @@
 from minify_html import minify
 
 
-def remover(html_content: str) -> str:
+def cleanup_html(html_content: str, urls: list = []) -> str:
     """
     Processes HTML content by removing unnecessary tags, minifying the HTML, and extracting the title and body content.
@@ -17,7 +17,7 @@ def remover(html_content: str) -> str:
     Example:
     >>> html_content = "<html><head><title>Example</title></head><body><p>Hello World!</p></body></html>"
-    >>> remover(html_content)
+    >>> cleanup_html(html_content)
     'Title: Example, Body: <body><p>Hello World!</p></body>'
 
     This function is particularly useful for preparing HTML content for environments where bandwidth usage needs to be minimized.
@@ -35,9 +35,12 @@ def remover(html_content: str) -> str:
 
     # Body Extraction (if it exists)
     body_content = soup.find('body')
+    urls_content = ""
+    if urls:
+        urls_content = f", URLs in page: {urls}"
     if body_content:
         # Minify the HTML within the body tag
         minimized_body = minify(str(body_content))
-        return "Title: " + title + ", Body: " + minimized_body
+        return "Title: " + title + ", Body: " + minimized_body + urls_content
 
-    return "Title: " + title + ", Body: No body content found"
+    return "Title: " + title + ", Body: No body content found" + urls_content
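With the extra urls parameter, the doctest-style call from the docstring can be extended to show the appended link list. A hypothetical invocation, assuming the module lives at scrapegraphai/utils/cleanup_html.py as the new import in fetch_node.py suggests:

from scrapegraphai.utils.cleanup_html import cleanup_html

html = "<html><head><title>Example</title></head><body><p>Hello World!</p><a href='/about'>About</a></body></html>"
print(cleanup_html(html, urls=["/about"]))
# Expected shape: Title: Example, Body: <body>...</body>, URLs in page: ['/about']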
