Skip to content

Commit

Permalink
Merge pull request #15 from VikParuchuri/dev
Browse files — browse the repository at this point in the history
Fix error with character coordinates
  • Loading branch information
VikParuchuri authored Oct 18, 2024
2 parents c6a85c6 + b356e3b commit c88e23c
Show file tree
Hide file tree
Showing 6 changed files with 37 additions and 52 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ temp.json
notebooks
results
.DS_Store
profile_output*

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
35 changes: 17 additions & 18 deletions pdftext/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,18 @@
from pdftext.pdf.utils import LINE_BREAKS, TABS, SPACES
from pdftext.settings import settings


def update_current(current, new_char):
bbox = new_char["bbox"]
if "bbox" not in current:
current["bbox"] = bbox.copy()
current_bbox = bbox.copy()
current["bbox"] = current_bbox
else:
current_bbox = current["bbox"]
current_bbox[0] = min(bbox[0], current_bbox[0])
current_bbox[1] = min(bbox[1], current_bbox[1])
current_bbox[2] = max(bbox[2], current_bbox[2])
current_bbox[3] = max(bbox[3], current_bbox[3])
current_bbox = current["bbox"]

current["center_x"] = (current_bbox[0] + current_bbox[2]) / 2
current["center_y"] = (current_bbox[1] + current_bbox[3]) / 2

Expand Down Expand Up @@ -45,7 +45,7 @@ def create_training_row(char_info, prev_char, currblock, currline):

is_space = char in SPACES or char in TABS

return np.array([
return [
char_center_x - currblock["center_x"],
char_x1 - currblock_bbox[2],
char_x1 - currblock_bbox[0],
Expand All @@ -65,17 +65,12 @@ def create_training_row(char_info, prev_char, currblock, currline):
char_x2 - prev_x1,
y_gap,
char_y2 - prev_y1
], dtype=np.float32)

return training_row
]


def update_span(line, span):
if span["chars"]:
first_char = span["chars"][0]
span["font"] = first_char["font"]
span["rotation"] = first_char["rotation"]

char_bboxes = [char["bbox"] for char in span["chars"]]
min_x, min_y, max_x, max_y = char_bboxes[0]

Expand All @@ -85,14 +80,19 @@ def update_span(line, span):
max_x = max(max_x, bbox[2])
max_y = max(max_y, bbox[3])

span["bbox"] = [min_x, min_y, max_x, max_y]
span["text"] = "".join(char["char"] for char in span["chars"])
span["char_start_idx"] = first_char["char_idx"]
span["char_end_idx"] = span["chars"][-1]["char_idx"]
span.update({
"font": first_char["font"],
"rotation": first_char["rotation"],
"bbox": [min_x, min_y, max_x, max_y],
"text": "".join(char["char"] for char in span["chars"]),
"char_start_idx": first_char["char_idx"],
"char_end_idx": span["chars"][-1]["char_idx"]
})

# Remove unneeded keys from the characters
char_keys = list(first_char.keys())
for char in span["chars"]:
for key in list(char.keys()):
for key in char_keys:
if key not in ["char", "bbox"]:
del char[key]

Expand Down Expand Up @@ -150,7 +150,6 @@ def normalized_diff(a, b, mult=1, use_abs=True):
char_center_y = (char_bbox[1] + char_bbox[3]) / 2
return normalized_diff(char_center_y, line_center_y)


def infer_single_page(text_chars, block_threshold=settings.BLOCK_THRESHOLD):
prev_char = None
prev_font_info = None
Expand Down Expand Up @@ -241,7 +240,7 @@ def inference(text_chars, model):

training_idxs = sorted(training_data.keys())
training_rows = [training_data[idx] for idx in training_idxs]
training_rows = np.stack(training_rows, axis=0)
training_rows = np.stack(training_rows, axis=0, dtype=np.float32)

# Run inference
predictions = model.run([output_name], {input_name: training_rows})[0]
Expand All @@ -250,4 +249,4 @@ def inference(text_chars, model):
sorted_keys = sorted(page_blocks.keys())
page_blocks = [page_blocks[key] for key in sorted_keys]
assert len(page_blocks) == len(text_chars)
return page_blocks
return page_blocks
10 changes: 6 additions & 4 deletions pdftext/pdf/chars.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import math
from collections import defaultdict
from typing import Dict, List

import pypdfium2.raw as pdfium_c
Expand Down Expand Up @@ -41,7 +42,6 @@ def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings
page = pdf.get_page(page_idx)

text_page = page.get_textpage()
mediabox = page.get_mediabox()
page_rotation = page.get_rotation()
bbox = page.get_bbox()
page_width = math.ceil(abs(bbox[2] - bbox[0]))
Expand All @@ -56,8 +56,6 @@ def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings
if page_rotation == 90 or page_rotation == 270:
page_width, page_height = page_height, page_width

bl_origin = (mediabox[0] == 0 and mediabox[1] == 0)

text_chars = {
"page": page_idx,
"rotation": page_rotation,
Expand All @@ -66,6 +64,10 @@ def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings
"height": page_height,
}

# For pypdfium bbox function later
page_width = math.ceil(page_width)
page_height = math.ceil(page_height)

fontname = None
fontflags = None
total_chars = text_page.count_chars()
Expand All @@ -87,7 +89,7 @@ def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings
rotation = pdfium_c.FPDFText_GetCharAngle(text_page, i)
rotation = rotation * rad_to_deg # convert from radians to degrees
coords = text_page.get_charbox(i, loose=rotation == 0) # Loose doesn't work properly when charbox is rotated
device_coords = page_bbox_to_device_bbox(page, coords, page_width, page_height, bl_origin, page_rotation, normalize=True)
device_coords = page_bbox_to_device_bbox(page, coords, page_width, page_height, page_rotation, normalize=True)

char_info = {
"font": {
Expand Down
39 changes: 11 additions & 28 deletions pdftext/pdf/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import pypdfium2.raw as pdfium_c
import ctypes
import math

LINE_BREAKS = ["\n", "\u000D", "\u000A"]
TABS = ["\t", "\u0009", "\x09"]
Expand Down Expand Up @@ -32,7 +31,7 @@ def get_fontname(textpage, char_index):
return decoded, flag_buffer.value


def page_to_device(page, x, y, page_width, page_height, page_rotation: int):
def page_to_device(page, x, y, page_width, page_height, page_rotation: int, device_x, device_y):
if page_rotation == 90:
page_rotation = 1
elif page_rotation == 180:
Expand All @@ -41,44 +40,28 @@ def page_to_device(page, x, y, page_width, page_height, page_rotation: int):
page_rotation = 3
else:
page_rotation = 0
width = math.ceil(page_width)
height = math.ceil(page_height)
device_x = ctypes.c_int()
device_y = ctypes.c_int()
pdfium_c.FPDF_PageToDevice(page, 0, 0, width, height, page_rotation, x, y, device_x, device_y)
x = device_x.value
y = device_y.value
return x, y
pdfium_c.FPDF_PageToDevice(page, 0, 0, page_width, page_height, page_rotation, x, y, device_x, device_y)
return device_x.value, device_y.value


def pdfium_page_bbox_to_device_bbox(page, bbox, page_width, page_height, page_rotation):
left_bottom = page_to_device(page, *bbox[:2], page_width, page_height, page_rotation)
top_right = page_to_device(page, *bbox[2:], page_width, page_height, page_rotation)
device_x = ctypes.c_int()
device_y = ctypes.c_int()
left_bottom = page_to_device(page, *bbox[:2], page_width, page_height, page_rotation, device_x, device_y)
top_right = page_to_device(page, *bbox[2:], page_width, page_height, page_rotation, device_x, device_y)

dev_bbox = [left_bottom[0], top_right[1], top_right[0], left_bottom[1]]
return dev_bbox


def fast_page_bbox_to_device_bbox(page, bbox, page_width, page_height):
left, bottom, right, top = bbox

dev_bbox = [left, page_height-top, right, page_height-bottom]
return dev_bbox


def page_bbox_to_device_bbox(page, bbox, page_width: int, page_height: int, bl_origin: bool, page_rotation: int, normalize=False):
def page_bbox_to_device_bbox(page, bbox, page_width: int, page_height: int, page_rotation: int, normalize=False):
orig_page_height, orig_page_width = page_height, page_width
if page_rotation in [90, 270]:
orig_page_height, orig_page_width = page_width, page_height

if bl_origin:
bbox = fast_page_bbox_to_device_bbox(page, bbox, page_width, page_height)
if page_rotation > 0:
bbox = rotate_page_bbox(bbox, page_rotation, page_width, page_height)
else:
bbox = pdfium_page_bbox_to_device_bbox(page, bbox, orig_page_width, orig_page_height, page_rotation)
if page_rotation > 0:
bbox = rotate_pdfium_bbox(bbox, page_rotation, page_width, page_height)
bbox = pdfium_page_bbox_to_device_bbox(page, bbox, orig_page_width, orig_page_height, page_rotation)
if page_rotation > 0:
bbox = rotate_pdfium_bbox(bbox, page_rotation, page_width, page_height)

if normalize:
bbox = [bbox[0] / page_width, bbox[1] / page_height, bbox[2] / page_width, bbox[3] / page_height]
Expand Down
2 changes: 1 addition & 1 deletion pdftext/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ class Settings(BaseSettings):
MODEL_PATH: str = os.path.join(BASE_PATH, "models", "dt.onnx")

# Fonts
FONTNAME_SAMPLE_FREQ: int = 4
FONTNAME_SAMPLE_FREQ: int = 6

# Inference
BLOCK_THRESHOLD: float = 0.8 # Confidence threshold for block detection
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pdftext"
version = "0.3.16"
version = "0.3.17"
description = "Extract structured text from pdfs quickly"
authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
license = "Apache-2.0"
Expand Down

0 comments on commit c88e23c

Please sign in to comment.