Skip to content

Commit

Permalink
Optimize extraction of rects
Browse files Browse the repository at this point in the history
  • Loading branch information
CosmosShadow committed Jul 5, 2024
1 parent e996b53 commit bc616f8
Showing 1 changed file with 51 additions and 37 deletions.
88 changes: 51 additions & 37 deletions gptpdf/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,25 +11,31 @@

# The default prompts below are written in Chinese and can be replaced with prompts in other languages.

DEFAULT_PROMPT = """使用markdown语法,将图片中识别到的文字转换为markdown格式输出。你必须做到:
DEFAULT_PROMPT = """
使用markdown语法,将图片中识别到的文字转换为markdown格式输出。你必须做到:
1. 输出和使用识别到的图片的相同的语言,例如,识别到英语的字段,输出的内容必须是英语。
2. 不要解释和输出无关的文字,直接输出图片中的内容。例如,严禁输出 “以下是我根据图片内容生成的markdown文本:”这样的例子,而是应该直接输出markdown。
3. 内容不要包含在```markdown ```中、段落公式使用 $$ $$ 的形式、行内公式使用 $ $ 的形式、忽略掉长直线、忽略掉页码。
2. 不要解释和输出无关的文字,直接输出图片中的内容。例如,严禁输出 “以下是我根据图片内容生成的markdown文本:”这样的例子,而是应该直接输出markdown内容
3. 内容不要包含在```markdown ```中
4. 段落公式使用 $$ $$ 的形式、行内公式使用 $ $ 的形式。
5. 忽略掉图片中的直线、页码、左右上下和正文无关的文字。
再次强调,不要解释和输出无关的文字,直接输出图片中的内容。
"""
DEFAULT_RECT_PROMPT = """图片中用红色框和名称(%s)标注出了一些区域。
如果区域是表格或者图片,使用 ![]() 的形式插入到输出内容中,否则直接输出文字内容。

DEFAULT_RECT_PROMPT = """
图片中用红色框和名称(%s)标注出了一些区域(已经抠出),使用 ![]() 的形式插入到输出内容中。例如,![](path/to/image.png)。
"""
DEFAULT_ROLE_PROMPT = """你是一个PDF文档解析器,使用markdown和latex语法输出图片的内容。

DEFAULT_ROLE_PROMPT = """
你是一个PDF文档解析器,使用markdown和latex语法输出图片的内容。
"""


def _is_near(rect1: BaseGeometry, rect2: BaseGeometry, distance: float = 20) -> bool:
"""
Check whether two rectangles are near each other.

Both geometries are buffered by a small radius, and they count as near when the
distance between the buffered shapes is less than ``distance``.

:param rect1: first geometry.
:param rect2: second geometry.
:param distance: threshold below which the two geometries count as near.
:return: True if the buffered geometries are closer than ``distance``.
"""
# NOTE(review): the two return lines below differ only in the buffer radius (1 vs 0.1).
# They look like the before/after lines of a rendered diff; only one of them belongs
# in the real function -- confirm against the repository before relying on either.
return rect1.buffer(1).distance(rect2.buffer(1)) < distance
return rect1.buffer(0.1).distance(rect2.buffer(0.1)) < distance


def _is_horizontal_near(rect1: BaseGeometry, rect2: BaseGeometry, distance: float = 100) -> bool:
Expand Down Expand Up @@ -88,55 +94,55 @@ def _adsorb_rects_to_rects(source_rects: List[BaseGeometry], target_rects: List[
break
if not adsorbed:
new_source_rects.append(text_area_rect)
return target_rects, new_source_rects
return new_source_rects, target_rects


# NOTE(review): lines from several versions of these helpers appear interleaved below with no
# add/remove markers and no indentation (this looks like a rendered diff). The comments describe
# each statement as written; confirm the real function boundaries against the repository.
def _parse_drawings(page: fitz.Page) -> List[Tuple[float, float, float, float]]:
def _parse_rects(page: fitz.Page) -> List[Tuple[float, float, float, float]]:
"""
Parse drawings in the page and merge adjacent rectangles.

Returns the ``bounds`` tuple of every rectangle that remains after merging and filtering.
"""

# Extract the drawings on the page
drawings = page.get_drawings()
rect_list = [drawing['rect'] for drawing in drawings]
rect_list = [sg.box(rect[0], rect[1], rect[2], rect[3]) for rect in rect_list]

merged_rects = _merge_rects(rect_list, distance=10, horizontal_distance=100)
merged_rects = [rect for rect in merged_rects if explain_validity(rect) == 'Valid Geometry']
text_area_rects = [sg.box(text_area['bbox'][0], text_area['bbox'][1], text_area['bbox'][2], text_area['bbox'][3])
for text_area in page.get_text('dict')['blocks']]
merged_rects, text_area_rects = _adsorb_rects_to_rects(text_area_rects, merged_rects, distance=5)
merged_rects = _merge_rects(merged_rects, distance=10)
merged_rects = [rect for rect in merged_rects if
rect.bounds[2] - rect.bounds[0] > 10 and rect.bounds[3] - rect.bounds[1] > 10]
return [rect.bounds for rect in merged_rects]
# Ignore horizontal lines shorter than 30 (height below 1 and width below 30)
is_short_line = lambda x: abs(x['rect'][3] - x['rect'][1]) < 1 and abs(x['rect'][2] - x['rect'][0]) < 30
drawings = [drawing for drawing in drawings if not is_short_line(drawing)]

# Convert to shapely rectangles
rect_list = [sg.box(*drawing['rect']) for drawing in drawings]

def _parse_images(page: fitz.Page) -> List[Tuple[float, float, float, float]]:
"""
Parse images in the page.

Returns the ``bbox`` of every entry reported by ``page.get_image_info()``.
"""
# Extract the image regions
images = page.get_image_info()
return [image['bbox'] for image in images]
image_rects = [sg.box(*image['bbox']) for image in images]

# Combine the drawing rectangles and the image rectangles
rect_list += image_rects

def _parse_tables(page: fitz.Page) -> List[Tuple[float, float, float, float]]:
"""
Parse tables in the page.

Returns the ``bbox`` of every table found by ``page.find_tables(snap_tolerance=20)``.
"""
tables = page.find_tables(snap_tolerance=20)
return [table.bbox for table in tables]
merged_rects = _merge_rects(rect_list, distance=10, horizontal_distance=100)
merged_rects = [rect for rect in merged_rects if explain_validity(rect) == 'Valid Geometry']

# Handle large and small text areas separately: large text areas are merged when they
# intersect a rectangle, small text areas are merged when they are merely close to one.
# A block counts as "large" when its text (x[4]) averages more than 5 characters per line.
is_large_content = lambda x: (len(x[4]) / max(1, len(x[4].split('\n')))) > 5
small_text_area_rects = [sg.box(*x[:4]) for x in page.get_text('blocks') if not is_large_content(x)]
large_text_area_rects = [sg.box(*x[:4]) for x in page.get_text('blocks') if is_large_content(x)]
_, merged_rects = _adsorb_rects_to_rects(large_text_area_rects, merged_rects, distance=0.1) # full intersection
_, merged_rects = _adsorb_rects_to_rects(small_text_area_rects, merged_rects, distance=5) # nearby

# Merge the rectangles with each other once more
merged_rects = _merge_rects(merged_rects, distance=10)

# Filter out rectangles that are too small (keep only width > 20 and height > 20)
merged_rects = [rect for rect in merged_rects if rect.bounds[2] - rect.bounds[0] > 20 and rect.bounds[3] - rect.bounds[1] > 20]

def _parse_rects(page: fitz.Page) -> List[Tuple[float, float, float, float]]:
"""
Parse rectangles in the page.

As written, concatenates the results of ``_parse_images`` and ``_parse_drawings``.
"""
return _parse_images(page) + _parse_drawings(page)
return [rect.bounds for rect in merged_rects]


def _parse_pdf_to_images(pdf_path: str, output_dir: str = './') -> List[Tuple[str, List[str]]]:
"""
Parse PDF to images and save to output_dir.
"""
# 打开PDF文件
pdf_document = fitz.open(pdf_path)
image_infos = []

Expand All @@ -146,16 +152,24 @@ def _parse_pdf_to_images(pdf_path: str, output_dir: str = './') -> List[Tuple[st
rects = _parse_rects(page)
for index, rect in enumerate(rects):
fitz_rect = fitz.Rect(rect)
# 保存页面为图片
pix = page.get_pixmap(clip=fitz_rect, matrix=fitz.Matrix(4, 4))
name = f'{page_index}_{index}.png'
pix.save(os.path.join(output_dir, name))
rect_images.append(name)
# # 在页面上绘制红色矩形
big_fitz_rect = fitz.Rect(fitz_rect.x0 - 1, fitz_rect.y0 - 1, fitz_rect.x1 + 1, fitz_rect.y1 + 1)
page.draw_rect(big_fitz_rect, color=(1, 0, 0), width=1)
# 空心矩形
# page.draw_rect(big_fitz_rect, color=(1, 0, 0), width=1)
# 画矩形区域(实心)
page.draw_rect(big_fitz_rect, color=(1, 0, 0), fill=(1, 0, 0))
# 在矩形内的左上角写上矩形的索引name,添加一些偏移量
text_x = fitz_rect.x0 + 2
text_y = fitz_rect.y0 + 10
text_rect = fitz.Rect(text_x, text_y - 9, text_x + 80, text_y + 2)
# 绘制白色背景矩形
page.draw_rect(text_rect, color=(1, 1, 1), fill=(1, 1, 1))
# 插入带有白色背景的文字
page.insert_text((text_x, text_y), name, fontsize=10, color=(1, 0, 0))
page_image_with_rects = page.get_pixmap(matrix=fitz.Matrix(3, 3))
page_image = os.path.join(output_dir, f'{page_index}.png')
Expand Down

0 comments on commit bc616f8

Please sign in to comment.