Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/custom changes #2095

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Previous commit
Next commit
[fix] fix calling readers during runtime
  • Loading branch information
zoazhyga committed Aug 12, 2024
commit 00b64b9a4a1886feb8cf321f1213625accd718f1
70 changes: 36 additions & 34 deletions private_gpt/components/ingest/ingest_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,16 @@

logger = logging.getLogger(__name__)

LLMSHERPA_API_URL = (
"http://localhost:5010/api/parseDocument?renderFormat=all&useNewIndentParser=yes"
)


# Inspired by the `llama_index.core.readers.file.base` module
def _try_loading_included_file_formats(
llmsherpa_api_url: str = None,
) -> dict[str, type[BaseReader]]:
simple_pdf_extractor = None
if llmsherpa_api_url is not None:
def _try_loading_included_file_formats() -> dict[str, type[BaseReader]]:
if LLMSHERPA_API_URL is not None:
try:
from llama_index.readers.smart_pdf_loader import SmartPDFLoader

# llmsherpa_api_url = "http://localhost:5010/api/parseDocument?renderFormat=all&useNewIndentParser=yes"
simple_pdf_extractor = SmartPDFLoader(
llmsherpa_api_url=llmsherpa_api_url,
)
except ImportError as e:
raise ImportError(
"`llama-index-readers-smart-pdf-loader` package not found"
Expand All @@ -47,33 +43,31 @@ def _try_loading_included_file_formats(
raise ImportError("`llama-index-readers-file` package not found") from e

default_file_reader_cls: dict[str, type[BaseReader]] = {
".hwp": HWPReader(),
".hwp": HWPReader,
# ".pdf": simple_pdf_extractor if simple_pdf_extractor else PDFReader,
".pdf": PDFReader(),
".docx": simple_pdf_extractor if simple_pdf_extractor else DocxReader(),
".pptx": PptxReader(),
".ppt": PptxReader(),
".pptm": PptxReader(),
".jpg": ImageReader(),
".png": ImageReader(),
".jpeg": ImageReader(),
# ".mp3": VideoAudioReader(),
# ".mp4": VideoAudioReader(),
".csv": simple_pdf_extractor if simple_pdf_extractor else PandasCSVReader(),
".xls": simple_pdf_extractor if simple_pdf_extractor else None,
".xlsx": simple_pdf_extractor if simple_pdf_extractor else None,
".epub": EpubReader(),
".md": MarkdownReader(),
".mbox": MboxReader(),
".ipynb": IPYNBReader(),
".pdf": PDFReader,
".docx": SmartPDFLoader if LLMSHERPA_API_URL else DocxReader,
".pptx": PptxReader,
".ppt": PptxReader,
".pptm": PptxReader,
".jpg": ImageReader,
".png": ImageReader,
".jpeg": ImageReader,
# ".mp3": VideoAudioReader,
# ".mp4": VideoAudioReader,
".csv": SmartPDFLoader if LLMSHERPA_API_URL else PandasCSVReader,
".xls": SmartPDFLoader if LLMSHERPA_API_URL else None,
".xlsx": SmartPDFLoader if LLMSHERPA_API_URL else None,
".epub": EpubReader,
".md": MarkdownReader,
".mbox": MboxReader,
".ipynb": IPYNBReader,
}
return default_file_reader_cls


# Patching the default file reader to support other file types
FILE_READER_CLS = _try_loading_included_file_formats(
"http://localhost:5010/api/parseDocument?renderFormat=all&useNewIndentParser=yes"
)
FILE_READER_CLS = _try_loading_included_file_formats()
FILE_READER_CLS.update(
{
".json": JSONReader(),
Expand Down Expand Up @@ -109,13 +103,21 @@ def _load_file_to_documents(file_name: str, file_data: Path) -> list[Document]:
extension,
)
# Read as a plain text
string_reader = StringIterableReader()
return string_reader.load_data([file_data.read_text()])
try:
string_reader = StringIterableReader()
return string_reader.load_data([file_data.read_text()])
except Exception as e:
logger.error(f"Error reading file as plain text: {e}")

logger.debug(
f"Specific reader found for extension=%s, {reader_cls=}", extension
)
return reader_cls.load_data(file_data.as_posix())
if reader_cls.__name__ == "SmartPDFLoader":
return reader_cls(llmsherpa_api_url=LLMSHERPA_API_URL).load_data(
file_data.as_posix()
)
else:
return reader_cls().load_data(file_data)

@staticmethod
def _exclude_metadata(documents: list[Document]) -> None:
Expand Down
2 changes: 1 addition & 1 deletion settings-huglama.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ server:
env_name: ${APP_ENV:huglama}

data:
local_data_folder: local_data/private_gpt
# local_data_folder: local_data/private_gpt
local_ingestion:
enabled: ${LOCAL_INGESTION_ENABLED:true}
allow_ingest_from: ["local_data/input_raw"]
Expand Down