Skip to content

Commit

Permalink
feat(fetch): add playwright support
Browse files Browse the repository at this point in the history
  • Loading branch information
PeriniM committed Apr 30, 2024
1 parent 450291f commit 42ab0aa
Show file tree
Hide file tree
Showing 8 changed files with 28 additions and 8 deletions.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ The reference page for Scrapegraph-ai is available on the official page of PyPI:
```bash
pip install scrapegraphai
```
You will also need to run `playwright install` to download the browser binaries required for JavaScript-based scraping:
```bash
playwright install
```
## 🔍 Demo
Official streamlit demo:

Expand Down
3 changes: 2 additions & 1 deletion examples/mixed_models/smart_scraper_mixed.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@
"model": "ollama/nomic-embed-text",
"temperature": 0,
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
}
},
"headless": False
}

# ************************************************
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ google = "3.0.0"
minify-html = "0.15.0"
free-proxy = "1.1.1"
langchain-groq = "0.1.3"
playwright = "^1.43.0"

[tool.poetry.dev-dependencies]
pytest = "8.0.0"
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ google==3.0.0
minify-html==0.15.0
free-proxy==1.1.1
langchain-groq==0.1.3
playwright==1.43.0
1 change: 1 addition & 0 deletions scrapegraphai/graphs/search_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ def _create_graph(self):
fetch_node = FetchNode(
input="url | local_dir",
output=["doc"],
node_config={"headless": True if self.config is None else self.config.get("headless", True)}
)
parse_node = ParseNode(
input="doc",
Expand Down
2 changes: 2 additions & 0 deletions scrapegraphai/graphs/smart_scraper_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,15 @@ def __init__(self, prompt: str, source: str, config: dict):

self.input_key = "url" if source.startswith("http") else "local_dir"


def _create_graph(self):
"""
Creates the graph of nodes representing the workflow for web scraping.
"""
fetch_node = FetchNode(
input="url | local_dir",
output=["doc"],
node_config={"headless": True if self.config is None else self.config.get("headless", True)}
)
parse_node = ParseNode(
input="doc",
Expand Down
1 change: 1 addition & 0 deletions scrapegraphai/graphs/speech_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def _create_graph(self):
fetch_node = FetchNode(
input="url | local_dir",
output=["doc"],
node_config={"headless": True if self.config is None else self.config.get("headless", True)}
)
parse_node = ParseNode(
input="doc",
Expand Down
23 changes: 16 additions & 7 deletions scrapegraphai/nodes/fetch_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
Module for fetching the HTML node
"""

from typing import List
from langchain_community.document_loaders import AsyncHtmlLoader
from typing import List, Optional
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_core.documents import Document
from .base_node import BaseNode
from ..utils.remover import remover
Expand Down Expand Up @@ -37,7 +37,7 @@ class FetchNode(BaseNode):
to succeed.
"""

def __init__(self, input: str, output: List[str], node_name: str = "Fetch"):
def __init__(self, input: str, output: List[str], node_config: Optional[dict], node_name: str = "Fetch"):
"""
Initializes the FetchHTMLNode with a node name and node type.
Arguments:
Expand All @@ -46,6 +46,8 @@ def __init__(self, input: str, output: List[str], node_name: str = "Fetch"):
"""
super().__init__(node_name, "node", input, output, 1)

self.headless = True if node_config is None else node_config.get("headless", True)

def execute(self, state):
"""
Executes the node's logic to fetch HTML content from a specified URL and
Expand Down Expand Up @@ -79,14 +81,21 @@ def execute(self, state):

else:
if self.node_config is not None and self.node_config.get("endpoint") is not None:
loader = AsyncHtmlLoader(
source, proxies={"http": self.node_config["endpoint"]})

loader = AsyncChromiumLoader(
[source],
proxies={"http": self.node_config["endpoint"]},
headless=self.headless,
)
else:
loader = AsyncHtmlLoader(source)
loader = AsyncChromiumLoader(
[source],
headless=self.headless,
)

document = loader.load()
compressed_document = [
Document(page_content=remover(str(document)))]
Document(page_content=remover(str(document[0].page_content)))]

state.update({self.output[0]: compressed_document})
return state

0 comments on commit 42ab0aa

Please sign in to comment.