Skip to content

Commit

Permalink
Always use pdfium coords
Browse files (browse the repository at this point in the history)
VikParuchuri committed Oct 18, 2024
1 parent 46ba8bf commit 1263a1e
Showing 4 changed files with 10 additions and 26 deletions.
8 changes: 3 additions & 5 deletions pdftext/inference.py
Original file line number Diff line number Diff line change
@@ -45,7 +45,7 @@ def create_training_row(char_info, prev_char, currblock, currline):

is_space = char in SPACES or char in TABS

- return np.array([
+ return [
char_center_x - currblock["center_x"],
char_x1 - currblock_bbox[2],
char_x1 - currblock_bbox[0],
@@ -65,9 +65,7 @@ def create_training_row(char_info, prev_char, currblock, currline):
char_x2 - prev_x1,
y_gap,
char_y2 - prev_y1
- ], dtype=np.float32)
-
- return training_row
+ ]


def update_span(line, span):
@@ -241,7 +239,7 @@ def inference(text_chars, model):

training_idxs = sorted(training_data.keys())
training_rows = [training_data[idx] for idx in training_idxs]
- training_rows = np.stack(training_rows, axis=0)
+ training_rows = np.stack(training_rows, axis=0, dtype=np.float32)

# Run inference
predictions = model.run([output_name], {input_name: training_rows})[0]
6 changes: 2 additions & 4 deletions pdftext/pdf/chars.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import math
+ from collections import defaultdict
from typing import Dict, List

import pypdfium2.raw as pdfium_c
@@ -41,7 +42,6 @@ def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings
page = pdf.get_page(page_idx)

text_page = page.get_textpage()
- mediabox = page.get_mediabox()
page_rotation = page.get_rotation()
bbox = page.get_bbox()
page_width = math.ceil(abs(bbox[2] - bbox[0]))
@@ -56,8 +56,6 @@ def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings
if page_rotation == 90 or page_rotation == 270:
page_width, page_height = page_height, page_width

- bl_origin = (mediabox[0] == 0 and mediabox[1] == 0)
-
text_chars = {
"page": page_idx,
"rotation": page_rotation,
@@ -87,7 +85,7 @@ def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings
rotation = pdfium_c.FPDFText_GetCharAngle(text_page, i)
rotation = rotation * rad_to_deg # convert from radians to degrees
coords = text_page.get_charbox(i, loose=rotation == 0) # Loose doesn't work properly when charbox is rotated
- device_coords = page_bbox_to_device_bbox(page, coords, page_width, page_height, bl_origin, page_rotation, normalize=True)
+ device_coords = page_bbox_to_device_bbox(page, coords, page_width, page_height, page_rotation, normalize=True)

char_info = {
"font": {
20 changes: 4 additions & 16 deletions pdftext/pdf/utils.py
Original file line number Diff line number Diff line change
@@ -59,26 +59,14 @@ def pdfium_page_bbox_to_device_bbox(page, bbox, page_width, page_height, page_ro
return dev_bbox


- def fast_page_bbox_to_device_bbox(page, bbox, page_width, page_height):
- left, bottom, right, top = bbox
-
- dev_bbox = [left, page_height-top, right, page_height-bottom]
- return dev_bbox
-
-
- def page_bbox_to_device_bbox(page, bbox, page_width: int, page_height: int, bl_origin: bool, page_rotation: int, normalize=False):
+ def page_bbox_to_device_bbox(page, bbox, page_width: int, page_height: int, page_rotation: int, normalize=False):
orig_page_height, orig_page_width = page_height, page_width
if page_rotation in [90, 270]:
orig_page_height, orig_page_width = page_width, page_height

- if bl_origin:
- bbox = fast_page_bbox_to_device_bbox(page, bbox, page_width, page_height)
- if page_rotation > 0:
- bbox = rotate_page_bbox(bbox, page_rotation, page_width, page_height)
- else:
- bbox = pdfium_page_bbox_to_device_bbox(page, bbox, orig_page_width, orig_page_height, page_rotation)
- if page_rotation > 0:
- bbox = rotate_pdfium_bbox(bbox, page_rotation, page_width, page_height)
+ bbox = pdfium_page_bbox_to_device_bbox(page, bbox, orig_page_width, orig_page_height, page_rotation)
+ if page_rotation > 0:
+ bbox = rotate_pdfium_bbox(bbox, page_rotation, page_width, page_height)

if normalize:
bbox = [bbox[0] / page_width, bbox[1] / page_height, bbox[2] / page_width, bbox[3] / page_height]
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pdftext"
- version = "0.3.16"
+ version = "0.3.17"
description = "Extract structured text from pdfs quickly"
authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
license = "Apache-2.0"

0 comments on commit 1263a1e

Please sign in to comment.