Skip to content

Commit

Permalink
v0.0.2: add gpt_worker to speed up parsing speed.
Browse files Browse the repository at this point in the history
  • Loading branch information
CosmosShadow committed Jun 29, 2024
1 parent e36ac0d commit 4bf2c6d
Show file tree
Hide file tree
Showing 5 changed files with 27 additions and 13 deletions.
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ Using VLLM (like GPT-4o) to parse PDF into markdown.

Our approach is very simple (only 293 lines of code), but can almost perfectly parse typography, math formulas, tables, pictures, charts, etc.

Average price per page: $0.013
Average cost per page: $0.013

This package uses the [GeneralAgent](https://github.com/CosmosShadow/GeneralAgent) library to interact with the OpenAI API.

Expand Down Expand Up @@ -68,4 +68,6 @@ parse pdf file to markdown file, and return markdown content and all image paths

- **model**: OpenAI Vision LLM model, default is 'gpt-4o'. You can also use qwen-vl-max

- **verbose**: verbose mode
- **verbose**: verbose mode

- **gpt_worker**: number of GPT parsing worker threads. Default is 1. If your machine's performance is good, you can increase it appropriately to improve parsing speed.
4 changes: 3 additions & 1 deletion README_CN.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,4 +68,6 @@ print(content)

- **model**:OpenAI Vision LLM 模型,默认为“gpt-4o”。您也可以使用 qwen-vl-max

- **verbose**:详细模式
- **verbose**:详细模式

- **gpt_worker**:gpt解析工作线程数,默认为1。如果您的机器性能较好,可以适当调高,以提高解析速度。
26 changes: 18 additions & 8 deletions gptpdf/parse.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import fitz # PyMuPDF
import shapely.geometry as sg
from shapely.validation import explain_validity
import concurrent.futures


def _is_near(rect1, rect2, distance=20):
Expand Down Expand Up @@ -211,7 +212,7 @@ def _parse_pdf_to_images(pdf_path, output_dir='./'):
return image_infos


def _gpt_parse_images(image_infos, output_dir='./', api_key=None, base_url=None, model='gpt-4o', verbose=False):
def _gpt_parse_images(image_infos, output_dir='./', api_key=None, base_url=None, model='gpt-4o', verbose=False, gpt_worker=1):
"""
parse images to markdown content
:param image_infos: [(page_image, rect_images)]
Expand All @@ -236,16 +237,23 @@ def _gpt_parse_images(image_infos, output_dir='./', api_key=None, base_url=None,
"""

role = '你是一个PDF文档解析器,使用markdown和latex语法输出图片的内容。'
agent = Agent(role=role, api_key=api_key, base_url=base_url, model=model, disable_python_run=True)
contents = []
for index, (page_image, rect_images) in enumerate(image_infos):

def _process_page(index, image_info):
print(f'gpt parse page: {index}')
agent.clear()
agent = Agent(role=role, api_key=api_key, base_url=base_url, model=model, disable_python_run=True)
page_image, rect_images = image_info
local_prompt = prompt
if rect_images:
local_prompt += rect_prompt % ', '.join(rect_images)
content = agent.run([local_prompt, {'image': page_image}], show_stream=verbose)
contents.append(content)
return index, content

contents = [None] * len(image_infos)
with concurrent.futures.ThreadPoolExecutor(max_workers=gpt_worker) as executor:
futures = [executor.submit(_process_page, index, image_info) for index, image_info in enumerate(image_infos)]
for future in concurrent.futures.as_completed(futures):
index, content = future.result()
contents[index] = content

# 输出结果
output_path = os.path.join(output_dir, 'output.md')
Expand All @@ -255,7 +263,7 @@ def _gpt_parse_images(image_infos, output_dir='./', api_key=None, base_url=None,
return '\n\n'.join(contents)


def parse_pdf(pdf_path, output_dir='./', api_key=None, base_url=None, model='gpt-4o', verbose=False):
def parse_pdf(pdf_path, output_dir='./', api_key=None, base_url=None, model='gpt-4o', verbose=False, gpt_worker=1):
"""
parse pdf file to markdown file
:param pdf_path: pdf file path
Expand All @@ -264,6 +272,7 @@ def parse_pdf(pdf_path, output_dir='./', api_key=None, base_url=None, model='gpt
:param base_url: OpenAI Base URL. (optional). If not provided, Use OPENAI_BASE_URL environment variable.
:param model: OpenAI Vison LLM Model, default is 'gpt-4o'. You also can use qwen-vl-max
:param verbose: verbose mode
:param gpt_worker: gpt parse worker number
:return: markdown content with ![](path/to/image.png) and all rect image (image, table, chart, ...) paths.
"""
"""
Expand All @@ -274,14 +283,15 @@ def parse_pdf(pdf_path, output_dir='./', api_key=None, base_url=None, model='gpt
:param base_url: OpenAI Base URL。 (可选)。如果未提供,则使用OPENAI_BASE_URL环境变量。
:param model: OpenAI Vison LLM Model,默认为'gpt-4o'。您还可以使用qwen-vl-max
:param verbose: 详细模式,默认为False
:param gpt_worker: gpt解析工作线程数,默认为1
:return: (content, all_rect_images), markdown内容,带有![](path/to/image.png) 和 所有矩形图像(图像、表格、图表等)路径列表。
"""
import os
if not os.path.exists(output_dir):
os.makedirs(output_dir)

image_infos = _parse_pdf_to_images(pdf_path, output_dir=output_dir)
content = _gpt_parse_images(image_infos, output_dir=output_dir, api_key=api_key, base_url=base_url, model=model, verbose=verbose)
content = _gpt_parse_images(image_infos, output_dir=output_dir, api_key=api_key, base_url=base_url, model=model, verbose=verbose, gpt_worker=gpt_worker)

# 删除每页的图片 & 保留所有的矩形图片
all_rect_images = []
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "gptpdf"
version = "0.0.1"
version = "0.0.2"
description = "Using GPT to parse PDF"
authors = ["Chen Li <lichenarthurdata@gmail.com>"]
license = "Apache 2.0"
Expand Down
2 changes: 1 addition & 1 deletion test/test.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def test_use_api_key():
api_key = os.getenv('OPENAI_API_KEY')
base_url = os.getenv('OPENAI_API_BASE')
# Manually provide OPENAI_API_KEY and OPEN_API_BASE
content, image_paths = parse_pdf(pdf_path, output_dir=output_dir, api_key=api_key, base_url=base_url, model='gpt-4o')
content, image_paths = parse_pdf(pdf_path, output_dir=output_dir, api_key=api_key, base_url=base_url, model='gpt-4o', gpt_worker=6)
print(content)
print(image_paths)
# also output_dir/output.md is generated
Expand Down

0 comments on commit 4bf2c6d

Please sign in to comment.