Commit

fix: examples and graphs
VinciGit00 committed May 2, 2024
1 parent ba2b24b commit 5cf4e4f
Showing 12 changed files with 95 additions and 21 deletions.
1 change: 1 addition & 0 deletions CONTRIBUTING.md
@@ -51,6 +51,7 @@ Please make sure to format your code accordingly before submitting a pull request
 - [Style Guide for Python Code](https://www.python.org/dev/peps/pep-0008/)
 - [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html)
 - [The Hitchhiker's Guide to Python](https://docs.python-guide.org/writing/style/)
+- [Pylint style of code for the documentation](https://pylint.pycqa.org/en/1.6.0/tutorial.html)
 
 ## Submitting a Pull Request
 
3 changes: 3 additions & 0 deletions examples/single_node/fetch_node.py
@@ -12,6 +12,9 @@
 robots_node = FetchNode(
     input="url | local_dir",
     output=["doc"],
+    node_config={
+        "headless": False
+    }
 )
 
 # ************************************************
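
For reference, a minimal usage sketch of the node configured above (not part of this commit). It assumes FetchNode.execute accepts a plain state dict keyed by the node's declared inputs and returns it with the "doc" output filled in; the URL is a placeholder.

# Hedged sketch: exercise a FetchNode with the new headless flag.
from scrapegraphai.nodes import FetchNode

fetch_node = FetchNode(
    input="url | local_dir",
    output=["doc"],
    node_config={
        "headless": False  # open a visible browser window while fetching
    }
)

# Assumption: execute() reads "url" (or "local_dir") from the state and writes "doc" back.
state = fetch_node.execute({"url": "https://example.com"})
print(state["doc"])
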
4 changes: 3 additions & 1 deletion examples/single_node/robot_node.py
@@ -26,7 +26,9 @@
 robots_node = RobotsNode(
     input="url",
     output=["is_scrapable"],
-    node_config={"llm": llm_model}
+    node_config={"llm": llm_model,
+                 "headless": False
+                 }
 )
 
 # ************************************************
5 changes: 3 additions & 2 deletions scrapegraphai/graphs/json_scraper_graph.py
@@ -21,7 +21,8 @@ class JSONScraperGraph(AbstractGraph):
         source (str): The source of the graph.
         config (dict): Configuration parameters for the graph.
         llm_model: An instance of a language model client, configured for generating answers.
-        embedder_model: An instance of an embedding model client, configured for generating embeddings.
+        embedder_model: An instance of an embedding model client,
+        configured for generating embeddings.
         verbose (bool): A flag indicating whether to show print statements during execution.
         headless (bool): A flag indicating whether to run the graph in headless mode.

@@ -47,7 +48,7 @@ def __init__(self, prompt: str, source: str, config: dict):
     def _create_graph(self) -> BaseGraph:
         """
         Creates the graph of nodes representing the workflow for web scraping.
 
         Returns:
             BaseGraph: A graph instance representing the web scraping workflow.
         """
 
19 changes: 12 additions & 7 deletions scrapegraphai/graphs/script_creator_graph.py
@@ -21,7 +21,8 @@ class ScriptCreatorGraph(AbstractGraph):
         source (str): The source of the graph.
         config (dict): Configuration parameters for the graph.
         llm_model: An instance of a language model client, configured for generating answers.
-        embedder_model: An instance of an embedding model client, configured for generating embeddings.
+        embedder_model: An instance of an embedding model client,
+        configured for generating embeddings.
         verbose (bool): A flag indicating whether to show print statements during execution.
         headless (bool): A flag indicating whether to run the graph in headless mode.
         model_token (int): The token limit for the language model.

@@ -44,7 +45,7 @@ class ScriptCreatorGraph(AbstractGraph):
     def __init__(self, prompt: str, source: str, config: dict):
 
         self.library = config['library']
 
         super().__init__(prompt, config, source)
 
         self.input_key = "url" if source.startswith("http") else "local_dir"

@@ -61,25 +62,29 @@ def _create_graph(self) -> BaseGraph:
             input="url | local_dir",
             output=["doc"],
             node_config={
-                "headless": True if self.config is None else self.config.get("headless", True)}
+                "headless": True if self.config is None else self.config.get("headless", True),
+                "verbose": self.verbose}
         )
         parse_node = ParseNode(
             input="doc",
             output=["parsed_doc"],
-            node_config={"chunk_size": self.model_token}
+            node_config={"chunk_size": self.model_token,
+                         "verbose": self.verbose}
         )
         rag_node = RAGNode(
             input="user_prompt & (parsed_doc | doc)",
             output=["relevant_chunks"],
             node_config={
                 "llm": self.llm_model,
-                "embedder_model": self.embedder_model
+                "embedder_model": self.embedder_model,
+                "verbose": self.verbose
             }
         )
         generate_scraper_node = GenerateScraperNode(
             input="user_prompt & (relevant_chunks | parsed_doc | doc)",
             output=["answer"],
-            node_config={"llm": self.llm_model},
+            node_config={"llm": self.llm_model,
+                         "verbose": self.verbose},
             library=self.library,
             website=self.source
         )

@@ -106,7 +111,7 @@ def run(self) -> str:
         Returns:
             str: The answer to the prompt.
         """
 
         inputs = {"user_prompt": self.prompt, self.input_key: self.source}
         self.final_state, self.execution_info = self.graph.execute(inputs)
 
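
A hedged configuration sketch (not from this commit) showing the keys the updated ScriptCreatorGraph reads: "library" is required by __init__, while "headless" and "verbose" are optional and are now forwarded into each node's node_config. The model name, server URL, target site, and library value below are illustrative assumptions.

from scrapegraphai.graphs import ScriptCreatorGraph

graph_config = {
    "llm": {
        "model": "ollama/mistral",             # placeholder model
        "base_url": "http://localhost:11434",  # assumes a local Ollama server
    },
    "library": "beautifulsoup",  # required: library the generated script should target
    "verbose": True,             # now passed through to the fetch/parse/RAG/generate nodes
    "headless": False,           # forwarded to the FetchNode's browser fetch
}

script_creator = ScriptCreatorGraph(
    prompt="Create a script that lists all the article titles",
    source="https://example.com",  # placeholder URL
    config=graph_config,
)

# run() returns the generated scraping script as a string.
print(script_creator.run())
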
8 changes: 5 additions & 3 deletions scrapegraphai/graphs/smart_scraper_graph.py
@@ -14,15 +14,17 @@
 
 class SmartScraperGraph(AbstractGraph):
     """
-    SmartScraper is a scraping pipeline that automates the process of extracting information from web pages
+    SmartScraper is a scraping pipeline that automates the process of
+    extracting information from web pages
     using a natural language model to interpret and answer prompts.
 
     Attributes:
         prompt (str): The prompt for the graph.
         source (str): The source of the graph.
         config (dict): Configuration parameters for the graph.
         llm_model: An instance of a language model client, configured for generating answers.
-        embedder_model: An instance of an embedding model client, configured for generating embeddings.
+        embedder_model: An instance of an embedding model client,
+        configured for generating embeddings.
         verbose (bool): A flag indicating whether to show print statements during execution.
         headless (bool): A flag indicating whether to run the graph in headless mode.

@@ -45,7 +47,7 @@ def __init__(self, prompt: str, source: str, config: dict):
         super().__init__(prompt, config, source)
 
         self.input_key = "url" if source.startswith("http") else "local_dir"
 
     def _create_graph(self) -> BaseGraph:
         """
         Creates the graph of nodes representing the workflow for web scraping.
 
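
A compact usage sketch (not part of the commit) for the class documented above. The config mirrors the Ollama-based test configuration added elsewhere in this commit; the model names and URL are placeholders, and a source that does not start with "http" would select the "local_dir" input key instead of "url".

from scrapegraphai.graphs import SmartScraperGraph

graph_config = {
    "llm": {
        "model": "ollama/mistral",
        "temperature": 0,
        "format": "json",
        "base_url": "http://localhost:11434",
    },
    "embeddings": {
        "model": "ollama/nomic-embed-text",
        "base_url": "http://localhost:11434",
    },
    "verbose": True,
    "headless": False,
}

smart_scraper = SmartScraperGraph(
    prompt="List me all the article titles on the page",
    source="https://example.com",  # an http(s) source selects the "url" input key
    config=graph_config,
)

print(smart_scraper.run())
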
7 changes: 4 additions & 3 deletions scrapegraphai/graphs/xml_scraper_graph.py
@@ -22,7 +22,8 @@ class XMLScraperGraph(AbstractGraph):
         source (str): The source of the graph.
         config (dict): Configuration parameters for the graph.
         llm_model: An instance of a language model client, configured for generating answers.
-        embedder_model: An instance of an embedding model client, configured for generating embeddings.
+        embedder_model: An instance of an embedding model client,
+        configured for generating embeddings.
         verbose (bool): A flag indicating whether to show print statements during execution.
         headless (bool): A flag indicating whether to run the graph in headless mode.
         model_token (int): The token limit for the language model.

@@ -49,7 +50,7 @@ def __init__(self, prompt: str, source: str, config: dict):
     def _create_graph(self) -> BaseGraph:
         """
         Creates the graph of nodes representing the workflow for web scraping.
 
         Returns:
             BaseGraph: A graph instance representing the web scraping workflow.
         """

@@ -110,7 +111,7 @@ def run(self) -> str:
         Returns:
             str: The answer to the prompt.
         """
 
         inputs = {"user_prompt": self.prompt, self.input_key: self.source}
         self.final_state, self.execution_info = self.graph.execute(inputs)
 
56 changes: 56 additions & 0 deletions tests/graphs/scrape_json_ollama.py
@@ -0,0 +1,56 @@
+"""
+Module for scraping json documents
+"""
+import os
+import pytest
+from scrapegraphai.graphs import JSONScraperGraph
+
+
+@pytest.fixture
+def sample_json():
+    """
+    Example of text
+    """
+    file_name = "inputs/example.json"
+    curr_dir = os.path.dirname(os.path.realpath(__file__))
+    file_path = os.path.join(curr_dir, file_name)
+
+    with open(file_path, 'r', encoding="utf-8") as file:
+        text = file.read()
+
+    return text
+
+
+@pytest.fixture
+def graph_config():
+    """
+    Configuration of the graph
+    """
+    return {
+        "llm": {
+            "model": "ollama/mistral",
+            "temperature": 0,
+            "format": "json",
+            "base_url": "http://localhost:11434",
+        },
+        "embeddings": {
+            "model": "ollama/nomic-embed-text",
+            "temperature": 0,
+            "base_url": "http://localhost:11434",
+        }
+    }
+
+
+def test_scraping_pipeline(sample_json: str, graph_config: dict):
+    """
+    Start of the scraping pipeline
+    """
+    smart_scraper_graph = JSONScraperGraph(
+        prompt="List me all the titles",
+        source=sample_json,
+        config=graph_config
+    )
+
+    result = smart_scraper_graph.run()
+
+    assert result is not None
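
A hedged note on running the new test in isolation (an assumption, not part of the commit): it expects an Ollama server at http://localhost:11434 serving the two models named in graph_config, plus a tests/graphs/inputs/example.json file. One way to invoke just this file from Python:

# Equivalent to `pytest tests/graphs/scrape_json_ollama.py` from the repository root.
import pytest

pytest.main(["tests/graphs/scrape_json_ollama.py"])
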
4 changes: 2 additions & 2 deletions tests/graphs/scrape_xml_ollama_test.py
@@ -3,7 +3,7 @@
 """
 import os
 import pytest
-from scrapegraphai.graphs import SmartScraperGraph
+from scrapegraphai.graphs import XMLScraperGraph
 
 
 @pytest.fixture

@@ -45,7 +45,7 @@ def test_scraping_pipeline(sample_xml: str, graph_config: dict):
     """
     Start of the scraping pipeline
     """
-    smart_scraper_graph = SmartScraperGraph(
+    smart_scraper_graph = XMLScraperGraph(
         prompt="List me all the authors, title and genres of the books",
         source=sample_xml,
         config=graph_config
 
2 changes: 0 additions & 2 deletions tests/graphs/script_generator_test.py
@@ -46,6 +46,4 @@ def test_script_creator_graph(graph_config: dict):
 
     assert graph_exec_info is not None
 
-    assert isinstance(graph_exec_info, dict)
-
     print(prettify_exec_info(graph_exec_info))
 
3 changes: 3 additions & 0 deletions tests/nodes/fetch_node_test.py
@@ -17,6 +17,9 @@ def setup():
     robots_node = FetchNode(
         input="url | local_dir",
         output=["doc"],
+        node_config={
+            "headless": False
+        }
     )
 
     return robots_node
 
4 changes: 3 additions & 1 deletion tests/nodes/robot_node_test.py
@@ -32,7 +32,9 @@ def setup():
     robots_node = RobotsNode(
         input="url",
         output=["is_scrapable"],
-        node_config={"llm": llm_model}
+        node_config={"llm": llm_model,
+                     "headless": False
+                     }
     )
 
     return robots_node
