Skip to content

Commit

Permalink
v0.0.1
Browse files Browse the repository at this point in the history
  • Loading branch information
CosmosShadow committed Jun 28, 2024
1 parent 43eeca1 commit 63be262
Show file tree
Hide file tree
Showing 20 changed files with 476 additions and 30 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@

test/.env
gp/*
*.pyc
dist/*
34 changes: 28 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,16 @@
# gptpdf
Using GPT to parse PDF

## Introduction

This package uses OpenAI's GPT-4o to parse PDFs into Markdown.
It perfectly parses text, images, math equations, charts, and tables.
It costs about $0.013 per page.


## DEMO

See [examples/attention_is_all_you_need/output.md](examples/attention_is_all_you_need/output.md)

## Installation

Expand All @@ -13,11 +22,24 @@ pip install gptpdf

```python
from gptpdf import parse_pdf
pdf_path = '../examples/attention_is_all_you_need.pdf'
output_dir = '../examples/attention_is_all_you_need/'
api_key = os.getenv('OPENAI_API_KEY')
base_url = os.getenv('OPENAI_API_BASE')
# Manually provide OPENAI_API_KEY and OPENAI_API_BASE
content, image_paths = parse_pdf(pdf_path, output_dir=output_dir, api_key=api_key, base_url=base_url, model='gpt-4o')
print(content)
print(image_paths)
# also output_dir/output.md is generated
```

pdf_path = "./examples/appattention_is_all_you_need.pdf"
api_key = "your-openai-api-key"
# base_url = "https://api.openai.com/v1"
model = 'gpt-4o'

content, image_paths = parse_pdf(pdf_path)
```python
from gptpdf import parse_pdf
pdf_path = '../examples/attention_is_all_you_need.pdf'
output_dir = '../examples/attention_is_all_you_need/'
# Use OPENAI_API_KEY and OPENAI_API_BASE from environment variables
content, image_paths = parse_pdf(pdf_path, output_dir=output_dir, model='gpt-4o', verbose=True)
print(content)
print(image_paths)
# also output_dir/output.md is generated
```
27 changes: 27 additions & 0 deletions docs/develop.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# 发布

```bash
# 发布pip库
poetry build -f sdist
poetry publish
```

# 测试

```shell
# 新建python环境
python -m venv gp
source gp/bin/activate

# 临时取消python别名 (如果有)
unalias python

# 安装依赖
pip install .

# 测试
cd test
# 导出环境变量
export $(grep -v '^#' .env | sed 's/^export //g' | xargs)
python test.py
```
Binary file added examples/attention_is_all_you_need/0_0.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added examples/attention_is_all_you_need/12_0.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added examples/attention_is_all_you_need/13_0.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added examples/attention_is_all_you_need/14_0.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added examples/attention_is_all_you_need/2_0.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added examples/attention_is_all_you_need/3_0.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added examples/attention_is_all_you_need/3_1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added examples/attention_is_all_you_need/3_2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added examples/attention_is_all_you_need/5_0.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added examples/attention_is_all_you_need/7_0.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added examples/attention_is_all_you_need/8_0.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added examples/attention_is_all_you_need/9_0.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
374 changes: 374 additions & 0 deletions examples/attention_is_all_you_need/output.md

Large diffs are not rendered by default.

35 changes: 24 additions & 11 deletions gptpdf/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def _parse_rects(page):
return _parse_images(page) + _parse_drawings(page)


def _parse_pdf_to_images(pdf_path, output_dir):
def _parse_pdf_to_images(pdf_path, output_dir='./'):
"""
parse pdf to images and save to output_dir
:param pdf_path: pdf file path
Expand All @@ -163,6 +163,7 @@ def _parse_pdf_to_images(pdf_path, output_dir):

image_infos = []
for page_index, page in enumerate(pdf_document):
print(f'parse page: {page_index}')
# 保存页面为图片
page_image = page.get_pixmap(matrix=fitz.Matrix(3, 3))

Expand All @@ -173,7 +174,7 @@ def _parse_pdf_to_images(pdf_path, output_dir):
rects = _parse_rects(page)
# rects = _parse_tables(page)
for index, rect in enumerate(rects):
print(page_index, index, rect)
# print(page_index, index, rect)
fitz_rect = fitz.Rect(rect)
pix = page.get_pixmap(clip=fitz_rect, matrix=fitz.Matrix(4, 4))
name = f'{page_index}_{index}.png'
Expand Down Expand Up @@ -210,7 +211,7 @@ def _parse_pdf_to_images(pdf_path, output_dir):
return image_infos


def _gpt_parse_images(image_infos, output_dir, api_key=None, base_url=None, model='gpt-4o'):
def _gpt_parse_images(image_infos, output_dir='./', api_key=None, base_url=None, model='gpt-4o', verbose=False):
"""
parse images to markdown content
:param image_infos: [(page_image, rect_images)]
Expand Down Expand Up @@ -238,43 +239,55 @@ def _gpt_parse_images(image_infos, output_dir, api_key=None, base_url=None, mode
agent = Agent(role=role, api_key=api_key, base_url=base_url, model=model, disable_python_run=True)
contents = []
for index, (page_image, rect_images) in enumerate(image_infos):
print(f'Processing page: {index}')
print(f'gpt parse page: {index}')
agent.clear()
local_prompt = prompt
if rect_images:
local_prompt += rect_prompt % ', '.join(rect_images)
content = agent.run([local_prompt, {'image': page_image}])
content = agent.run([local_prompt, {'image': page_image}], show_stream=verbose)
contents.append(content)

# 输出结果
os.path.join(output_dir, 'output.md')
with open('output.md', 'w') as f:
output_path = os.path.join(output_dir, 'output.md')
with open(output_path, 'w') as f:
f.write('\n\n'.join(contents))

return '\n\n'.join(contents)


def parse_pdf(pdf_path, output_dir='./', api_key=None, base_url=None, model='gpt-4o'):
def parse_pdf(pdf_path, output_dir='./', api_key=None, base_url=None, model='gpt-4o', verbose=False):
"""
parse pdf file to markdown file
:param pdf_path: pdf file path
:param output_dir: output directory. store all images and markdown file
:param api_key: OpenAI API Key (optional). If not provided, the OPENAI_API_KEY environment variable is used.
:param base_url: OpenAI Base URL (optional). If not provided, the OPENAI_BASE_URL environment variable is used.
:param model: OpenAI Vision LLM model, default is 'gpt-4o'. You can also use qwen-vl-max
:param verbose: verbose mode
:return: markdown content with ![](path/to/image.png) and all rect image (image, table, chart, ...) paths.
"""
"""
解析PDF文件到markdown文件
:param pdf_path: pdf文件路径
:param output_dir: 输出目录。存储所有的图片和markdown文件
:param api_key: OpenAI API Key(可选)。如果未提供,则使用OPENAI_API_KEY环境变量。
:param base_url: OpenAI Base URL。 (可选)。如果未提供,则使用OPENAI_BASE_URL环境变量。
:param model: OpenAI Vision LLM Model,默认为'gpt-4o'。您还可以使用qwen-vl-max
:param verbose: 详细模式,默认为False
:return: (content, all_rect_images), markdown内容,带有![](path/to/image.png) 和 所有矩形图像(图像、表格、图表等)路径列表。
"""
import os
if not os.path.exists(output_dir):
os.makedirs(output_dir)

image_infos = _parse_pdf_to_images(pdf_path, output_dir)
content = _gpt_parse_images(image_infos, api_key, base_url, model)
image_infos = _parse_pdf_to_images(pdf_path, output_dir=output_dir)
content = _gpt_parse_images(image_infos, output_dir=output_dir, api_key=api_key, base_url=base_url, model=model, verbose=verbose)

# 删除每页的图片 & 保留所有的矩形图片
all_rect_images = []
for page_image, rect_images in image_infos:
os.remove(page_image)
if os.path.exists(page_image):
os.remove(page_image)
all_rect_images.extend(rect_images)

return content, all_rect_images
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ packages = [
GeneralAgent = "^0.3.11"
shapely = "^2.0.1"
pymupdf = "^1.24.7"
python-dotenv = "^1.0.0"

[tool.poetry.group.dev.dependencies]
pytest = "^7.4.3"
Expand Down
File renamed without changes.
32 changes: 19 additions & 13 deletions test/test.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,33 @@
import os
from gptpdf import parse_pdf

path_load = lambda x: os.path.abspath(os.path.join(os.path.dirname(__file__), x))
pdf_path = path_load('../examples/attention_is_all_you_need.pdf')
output_dir = path_load('../examples/attention_is_all_you_need/')

# load environment variables from .env file
import dotenv
dotenv.load_dotenv()

def test_use_api_key():
api_key = 'your'
base_url = 'your'
content, image_paths = parse_pdf(pdf_path, output_dir, api_key, base_url)
from gptpdf import parse_pdf
pdf_path = '../examples/attention_is_all_you_need.pdf'
output_dir = '../examples/attention_is_all_you_need/'
api_key = os.getenv('OPENAI_API_KEY')
base_url = os.getenv('OPENAI_API_BASE')
# Manually provide OPENAI_API_KEY and OPENAI_API_BASE
content, image_paths = parse_pdf(pdf_path, output_dir=output_dir, api_key=api_key, base_url=base_url, model='gpt-4o')
print(content)
print(image_paths)
# also output_dir/output.md is generated


def test_use_env():
import dotenv
dotenv.load_dotenv(path_load('./.env'))
content, image_paths = parse_pdf(pdf_path, output_dir)
from gptpdf import parse_pdf
pdf_path = '../examples/attention_is_all_you_need.pdf'
output_dir = '../examples/attention_is_all_you_need/'
# Use OPENAI_API_KEY and OPENAI_API_BASE from environment variables
content, image_paths = parse_pdf(pdf_path, output_dir=output_dir, model='gpt-4o', verbose=True)
print(content)
print(image_paths)
# also output_dir/output.md is generated


if __name__ == '__main__':
# test_use_api_key()
test_use_env()
test_use_api_key()
# test_use_env()

0 comments on commit 63be262

Please sign in to comment.