Skip to content

Commit

Permalink
feat(fetch): add playwright support
Browse files Browse the repository at this point in the history
  • Loading branch information
PeriniM committed Apr 30, 2024
1 parent 450291f commit 42ab0aa
Show file tree
Hide file tree
Showing 8 changed files with 28 additions and 8 deletions.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ The reference page for Scrapegraph-ai is available on the official page of PyPI:
```bash
pip install scrapegraphai
```
You will also need to run `playwright install` to download the browser binaries required for JavaScript-based scraping:
```bash
playwright install
```
## 🔍 Demo
Official streamlit demo:

Expand Down
3 changes: 2 additions & 1 deletion examples/mixed_models/smart_scraper_mixed.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@
"model": "ollama/nomic-embed-text",
"temperature": 0,
"base_url": "http://localhost:11434", # set ollama URL arbitrarily
}
},
"headless": False
}

# ************************************************
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ google = "3.0.0"
minify-html = "0.15.0"
free-proxy = "1.1.1"
langchain-groq = "0.1.3"
playwright = "^1.43.0"

[tool.poetry.dev-dependencies]
pytest = "8.0.0"
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ google==3.0.0
minify-html==0.15.0
free-proxy==1.1.1
langchain-groq==0.1.3
playwright==1.43.0
1 change: 1 addition & 0 deletions scrapegraphai/graphs/search_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ def _create_graph(self):
fetch_node = FetchNode(
input="url | local_dir",
output=["doc"],
node_config={"headless": True if self.config is None else self.config.get("headless", True)}
)
parse_node = ParseNode(
input="doc",
Expand Down
2 changes: 2 additions & 0 deletions scrapegraphai/graphs/smart_scraper_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,15 @@ def __init__(self, prompt: str, source: str, config: dict):

self.input_key = "url" if source.startswith("http") else "local_dir"


def _create_graph(self):
"""
Creates the graph of nodes representing the workflow for web scraping.
"""
fetch_node = FetchNode(
input="url | local_dir",
output=["doc"],
node_config={"headless": True if self.config is None else self.config.get("headless", True)}
)
parse_node = ParseNode(
input="doc",
Expand Down
1 change: 1 addition & 0 deletions scrapegraphai/graphs/speech_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def _create_graph(self):
fetch_node = FetchNode(
input="url | local_dir",
output=["doc"],
node_config={"headless": True if self.config is None else self.config.get("headless", True)}
)
parse_node = ParseNode(
input="doc",
Expand Down
23 changes: 16 additions & 7 deletions scrapegraphai/nodes/fetch_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
Module for fetching the HTML node
"""

from typing import List
from langchain_community.document_loaders import AsyncHtmlLoader
from typing import List, Optional
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_core.documents import Document
from .base_node import BaseNode
from ..utils.remover import remover
Expand Down Expand Up @@ -37,7 +37,7 @@ class FetchNode(BaseNode):
to succeed.
"""

def __init__(self, input: str, output: List[str], node_name: str = "Fetch"):
def __init__(self, input: str, output: List[str], node_config: Optional[dict], node_name: str = "Fetch"):
"""
Initializes the FetchHTMLNode with a node name and node type.
Arguments:
Expand All @@ -46,6 +46,8 @@ def __init__(self, input: str, output: List[str], node_name: str = "Fetch"):
"""
super().__init__(node_name, "node", input, output, 1)

self.headless = True if node_config is None else node_config.get("headless", True)

def execute(self, state):
"""
Executes the node's logic to fetch HTML content from a specified URL and
Expand Down Expand Up @@ -79,14 +81,21 @@ def execute(self, state):

else:
if self.node_config is not None and self.node_config.get("endpoint") is not None:
loader = AsyncHtmlLoader(
source, proxies={"http": self.node_config["endpoint"]})

loader = AsyncChromiumLoader(
[source],
proxies={"http": self.node_config["endpoint"]},
headless=self.headless,
)
else:
loader = AsyncHtmlLoader(source)
loader = AsyncChromiumLoader(
[source],
headless=self.headless,
)

document = loader.load()
compressed_document = [
Document(page_content=remover(str(document)))]
Document(page_content=remover(str(document[0].page_content)))]

state.update({self.output[0]: compressed_document})
return state

0 comments on commit 42ab0aa

Please sign in to comment.