fix(fetch_node): bug in handling local files
PeriniM committed May 14, 2024
1 parent fcb3abb commit a6e1813
Showing 11 changed files with 34 additions and 135 deletions.
113 changes: 0 additions & 113 deletions examples/openai/custom_graph_openai copy.py

This file was deleted.

2 changes: 1 addition & 1 deletion examples/openai/omni_scraper_openai.py
@@ -19,7 +19,7 @@
 graph_config = {
     "llm": {
         "api_key": openai_key,
-        "model": "gpt-4o",
+        "model": "gpt-4-turbo",
     },
     "verbose": True,
     "headless": True,
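For context, a minimal sketch of how the updated graph_config is consumed in this example; the prompt and source below are illustrative placeholders, not taken from the diff:

    from scrapegraphai.graphs import OmniScraperGraph

    graph_config = {
        "llm": {
            "api_key": openai_key,  # assumes openai_key is defined earlier in the example
            "model": "gpt-4-turbo",
        },
        "verbose": True,
        "headless": True,
    }

    # Hypothetical prompt and source, for illustration only.
    omni_scraper_graph = OmniScraperGraph(
        prompt="List me all the projects with their descriptions",
        source="https://example.com/projects",
        config=graph_config,
    )
    result = omni_scraper_graph.run()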
4 changes: 2 additions & 2 deletions scrapegraphai/graphs/csv_scraper_graph.py
@@ -30,8 +30,8 @@ def _create_graph(self):
         Creates the graph of nodes representing the workflow for web scraping.
         """
         fetch_node = FetchNode(
-            input="csv",
-            output=["doc"],
+            input="csv | csv_dir",
+            output=["doc", "link_urls", "img_urls"],
         )
         parse_node = ParseNode(
             input="doc",
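The new "csv | csv_dir" input is a pipe expression: FetchNode reads whichever of the two keys is actually present in the graph state. A minimal sketch of that resolution, simplified from the library's expression handling (which also supports & and nesting):

    def resolve_input_keys(expression: str, state: dict) -> list:
        # Keep only the alternatives that exist in the current state.
        alternatives = [alt.strip() for alt in expression.split("|")]
        return [key for key in alternatives if key in state]

    state = {"csv_dir": "inputs/", "user_prompt": "Summarize the files"}
    print(resolve_input_keys("csv | csv_dir", state))  # ['csv_dir']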
2 changes: 1 addition & 1 deletion scrapegraphai/graphs/deep_scraper_graph.py
@@ -61,7 +61,7 @@ def _create_graph(self) -> BaseGraph:
         """
         fetch_node = FetchNode(
             input="url | local_dir",
-            output=["doc"]
+            output=["doc", "link_urls", "img_urls"]
         )
         parse_node = ParseNode(
             input="doc",
4 changes: 2 additions & 2 deletions scrapegraphai/graphs/json_scraper_graph.py
@@ -54,8 +54,8 @@ def _create_graph(self) -> BaseGraph:
         """
 
         fetch_node = FetchNode(
-            input="json",
-            output=["doc"],
+            input="json | json_dir",
+            output=["doc", "link_urls", "img_urls"],
         )
         parse_node = ParseNode(
             input="doc",
4 changes: 2 additions & 2 deletions scrapegraphai/graphs/pdf_scraper_graph.py
@@ -56,8 +56,8 @@ def _create_graph(self) -> BaseGraph:
         """
 
         fetch_node = FetchNode(
-            input='pdf',
-            output=["doc"],
+            input='pdf | pdf_dir',
+            output=["doc", "link_urls", "img_urls"],
         )
         parse_node = ParseNode(
             input="doc",
2 changes: 1 addition & 1 deletion scrapegraphai/graphs/script_creator_graph.py
@@ -59,7 +59,7 @@ def _create_graph(self) -> BaseGraph:
 
         fetch_node = FetchNode(
             input="url | local_dir",
-            output=["doc"],
+            output=["doc", "link_urls", "img_urls"],
         )
         parse_node = ParseNode(
             input="doc",
2 changes: 1 addition & 1 deletion scrapegraphai/graphs/smart_scraper_graph.py
@@ -57,7 +57,7 @@ def _create_graph(self) -> BaseGraph:
         """
         fetch_node = FetchNode(
             input="url | local_dir",
-            output=["doc"],
+            output=["doc", "link_urls", "img_urls"],
             node_config={
                 "loader_kwargs": self.config.get("loader_kwargs", {}),
             }
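This graph also forwards loader_kwargs from the user config into the FetchNode. A hedged sketch of the caller's side; the proxy entry is an assumed example, since valid keys depend on the underlying page loader:

    # Hypothetical config for illustration; "loader_kwargs" is forwarded
    # verbatim to the fetcher, so valid keys depend on the loader in use.
    graph_config = {
        "llm": {
            "api_key": openai_key,
            "model": "gpt-4-turbo",
        },
        "loader_kwargs": {
            "proxy": {"server": "http://localhost:8030"},  # assumed example value
        },
        "verbose": True,
    }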
2 changes: 1 addition & 1 deletion scrapegraphai/graphs/speech_graph.py
@@ -56,7 +56,7 @@ def _create_graph(self) -> BaseGraph:
 
         fetch_node = FetchNode(
             input="url | local_dir",
-            output=["doc"]
+            output=["doc", "link_urls", "img_urls"]
         )
         parse_node = ParseNode(
             input="doc",
4 changes: 2 additions & 2 deletions scrapegraphai/graphs/xml_scraper_graph.py
@@ -56,8 +56,8 @@ def _create_graph(self) -> BaseGraph:
         """
 
         fetch_node = FetchNode(
-            input="xml",
-            output=["doc"]
+            input="xml | xml_dir",
+            output=["doc", "link_urls", "img_urls"]
         )
         parse_node = ParseNode(
             input="doc",
30 changes: 21 additions & 9 deletions scrapegraphai/nodes/fetch_node.py
@@ -83,37 +83,49 @@ def execute(self, state):
 
         source = input_data[0]
         if (
-            self.input == "json_dir"
-            or self.input == "xml_dir"
-            or self.input == "csv_dir"
+            input_keys[0] == "json_dir"
+            or input_keys[0] == "xml_dir"
+            or input_keys[0] == "csv_dir"
         ):
             compressed_document = [
                 Document(page_content=source, metadata={"source": "local_dir"})
             ]
             # if it is a local directory
+
+            state.update({self.output[0]: compressed_document})
+            return state
 
         # handling for pdf
-        elif self.input == "pdf":
+        elif input_keys[0] == "pdf":
             loader = PyPDFLoader(source)
             compressed_document = loader.load()
+            state.update({self.output[0]: compressed_document})
+            return state
 
-        elif self.input == "csv":
+        elif input_keys[0] == "csv":
             compressed_document = [
                 Document(
                     page_content=str(pd.read_csv(source)), metadata={"source": "csv"}
                 )
             ]
-        elif self.input == "json":
+            state.update({self.output[0]: compressed_document})
+            return state
+
+        elif input_keys[0] == "json":
             f = open(source)
             compressed_document = [
                 Document(page_content=str(json.load(f)), metadata={"source": "json"})
             ]
-        elif self.input == "xml":
+            state.update({self.output[0]: compressed_document})
+            return state
+
+        elif input_keys[0] == "xml":
             with open(source, "r", encoding="utf-8") as f:
                 data = f.read()
             compressed_document = [
                 Document(page_content=data, metadata={"source": "xml"})
             ]
+            state.update({self.output[0]: compressed_document})
+            return state
 
         elif self.input == "pdf_dir":
             pass
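Taken together with the graph changes above, this is the substance of the fix: the graphs now pass pipe expressions such as "csv | csv_dir" as the node's input, so self.input holds the whole expression and checks like self.input == "csv" could never be true, letting local files fall through to the wrong branch. Branching on input_keys[0], the key actually resolved from the state, restores the intended dispatch, and each local-file branch now writes its result into the state and returns early. A minimal sketch of the before/after comparison, with the key resolution simplified for illustration:

    node_input = "csv | csv_dir"   # the expression stored in self.input
    state = {"csv": "projects.csv", "user_prompt": "List the projects"}

    # Old check: comparing the whole expression to one alternative never matches.
    assert (node_input == "csv") is False

    # New check: resolve which alternative the state actually contains, then branch.
    input_keys = [k for k in (a.strip() for a in node_input.split("|")) if k in state]
    assert input_keys[0] == "csv"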
