Merge pull request #15 from VikParuchuri/dev

Fix error with character coordinates
VikParuchuri · Oct 18, 2024 · c88e23c
2 parents c6a85c6 + b356e3b
commit c88e23c
Show file tree

Hide file tree

Showing 6 changed files with 37 additions and 52 deletions.
diff --git a/.gitignore b/.gitignore
@@ -4,6 +4,7 @@ temp.json
 notebooks
 results
 .DS_Store
+profile_output*
 
 # Byte-compiled / optimized / DLL files
 __pycache__/

diff --git a/pdftext/inference.py b/pdftext/inference.py
@@ -3,18 +3,18 @@
 from pdftext.pdf.utils import LINE_BREAKS, TABS, SPACES
 from pdftext.settings import settings
 
-
 def update_current(current, new_char):
     bbox = new_char["bbox"]
     if "bbox" not in current:
-        current["bbox"] = bbox.copy()
+        current_bbox = bbox.copy()
+        current["bbox"] = current_bbox
     else:
         current_bbox = current["bbox"]
         current_bbox[0] = min(bbox[0], current_bbox[0])
         current_bbox[1] = min(bbox[1], current_bbox[1])
         current_bbox[2] = max(bbox[2], current_bbox[2])
         current_bbox[3] = max(bbox[3], current_bbox[3])
-    current_bbox = current["bbox"]
+
     current["center_x"] = (current_bbox[0] + current_bbox[2]) / 2
     current["center_y"] = (current_bbox[1] + current_bbox[3]) / 2
 
@@ -45,7 +45,7 @@ def create_training_row(char_info, prev_char, currblock, currline):
 
     is_space = char in SPACES or char in TABS
 
-    return np.array([
+    return [
         char_center_x - currblock["center_x"],
         char_x1 - currblock_bbox[2],
         char_x1 - currblock_bbox[0],
@@ -65,17 +65,12 @@ def create_training_row(char_info, prev_char, currblock, currline):
         char_x2 - prev_x1,
         y_gap,
         char_y2 - prev_y1
-    ], dtype=np.float32)
-
-    return training_row
+    ]
 
 
 def update_span(line, span):
     if span["chars"]:
         first_char = span["chars"][0]
-        span["font"] = first_char["font"]
-        span["rotation"] = first_char["rotation"]
-
         char_bboxes = [char["bbox"] for char in span["chars"]]
         min_x, min_y, max_x, max_y = char_bboxes[0]
 
@@ -85,14 +80,19 @@ def update_span(line, span):
             max_x = max(max_x, bbox[2])
             max_y = max(max_y, bbox[3])
 
-        span["bbox"] = [min_x, min_y, max_x, max_y]
-        span["text"] = "".join(char["char"] for char in span["chars"])
-        span["char_start_idx"] = first_char["char_idx"]
-        span["char_end_idx"] = span["chars"][-1]["char_idx"]
+        span.update({
+            "font": first_char["font"],
+            "rotation": first_char["rotation"],
+            "bbox": [min_x, min_y, max_x, max_y],
+            "text": "".join(char["char"] for char in span["chars"]),
+            "char_start_idx": first_char["char_idx"],
+            "char_end_idx": span["chars"][-1]["char_idx"]
+        })
 
         # Remove unneeded keys from the characters
+        char_keys = list(first_char.keys())
         for char in span["chars"]:
-            for key in list(char.keys()):
+            for key in char_keys:
                 if key not in ["char", "bbox"]:
                     del char[key]
 
@@ -150,7 +150,6 @@ def normalized_diff(a, b, mult=1, use_abs=True):
         char_center_y = (char_bbox[1] + char_bbox[3]) / 2
         return normalized_diff(char_center_y, line_center_y)
 
-
 def infer_single_page(text_chars, block_threshold=settings.BLOCK_THRESHOLD):
     prev_char = None
     prev_font_info = None
@@ -241,7 +240,7 @@ def inference(text_chars, model):
 
         training_idxs = sorted(training_data.keys())
         training_rows = [training_data[idx] for idx in training_idxs]
-        training_rows = np.stack(training_rows, axis=0)
+        training_rows = np.stack(training_rows, axis=0, dtype=np.float32)
 
         # Run inference
         predictions = model.run([output_name], {input_name: training_rows})[0]
@@ -250,4 +249,4 @@ def inference(text_chars, model):
     sorted_keys = sorted(page_blocks.keys())
     page_blocks = [page_blocks[key] for key in sorted_keys]
     assert len(page_blocks) == len(text_chars)
-    return page_blocks
+    return page_blocks
diff --git a/pdftext/pdf/chars.py b/pdftext/pdf/chars.py
@@ -1,4 +1,5 @@
 import math
+from collections import defaultdict
 from typing import Dict, List
 
 import pypdfium2.raw as pdfium_c
@@ -41,7 +42,6 @@ def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings
             page = pdf.get_page(page_idx)
 
         text_page = page.get_textpage()
-        mediabox = page.get_mediabox()
         page_rotation = page.get_rotation()
         bbox = page.get_bbox()
         page_width = math.ceil(abs(bbox[2] - bbox[0]))
@@ -56,8 +56,6 @@ def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings
         if page_rotation == 90 or page_rotation == 270:
             page_width, page_height = page_height, page_width
 
-        bl_origin = (mediabox[0] == 0 and mediabox[1] == 0)
-
         text_chars = {
             "page": page_idx,
             "rotation": page_rotation,
@@ -66,6 +64,10 @@ def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings
             "height": page_height,
         }
 
+        # For pypdfium bbox function later
+        page_width = math.ceil(page_width)
+        page_height = math.ceil(page_height)
+
         fontname = None
         fontflags = None
         total_chars = text_page.count_chars()
@@ -87,7 +89,7 @@ def get_pdfium_chars(pdf, page_range, flatten_pdf, fontname_sample_freq=settings
             rotation = pdfium_c.FPDFText_GetCharAngle(text_page, i)
             rotation = rotation * rad_to_deg # convert from radians to degrees
             coords = text_page.get_charbox(i, loose=rotation == 0) # Loose doesn't work properly when charbox is rotated
-            device_coords = page_bbox_to_device_bbox(page, coords, page_width, page_height, bl_origin, page_rotation, normalize=True)
+            device_coords = page_bbox_to_device_bbox(page, coords, page_width, page_height, page_rotation, normalize=True)
 
             char_info = {
                 "font": {

diff --git a/pdftext/pdf/utils.py b/pdftext/pdf/utils.py
@@ -1,6 +1,5 @@
 import pypdfium2.raw as pdfium_c
 import ctypes
-import math
 
 LINE_BREAKS = ["\n", "\u000D", "\u000A"]
 TABS = ["\t", "\u0009", "\x09"]
@@ -32,7 +31,7 @@ def get_fontname(textpage, char_index):
     return decoded, flag_buffer.value
 
 
-def page_to_device(page, x, y, page_width, page_height, page_rotation: int):
+def page_to_device(page, x, y, page_width, page_height, page_rotation: int, device_x, device_y):
     if page_rotation == 90:
         page_rotation = 1
     elif page_rotation == 180:
@@ -41,44 +40,28 @@ def page_to_device(page, x, y, page_width, page_height, page_rotation: int):
         page_rotation = 3
     else:
         page_rotation = 0
-    width = math.ceil(page_width)
-    height = math.ceil(page_height)
-    device_x = ctypes.c_int()
-    device_y = ctypes.c_int()
-    pdfium_c.FPDF_PageToDevice(page, 0, 0, width, height, page_rotation, x, y, device_x, device_y)
-    x = device_x.value
-    y = device_y.value
-    return x, y
+    pdfium_c.FPDF_PageToDevice(page, 0, 0, page_width, page_height, page_rotation, x, y, device_x, device_y)
+    return device_x.value, device_y.value
 
 
 def pdfium_page_bbox_to_device_bbox(page, bbox, page_width, page_height, page_rotation):
-    left_bottom = page_to_device(page, *bbox[:2], page_width, page_height, page_rotation)
-    top_right = page_to_device(page, *bbox[2:], page_width, page_height, page_rotation)
+    device_x = ctypes.c_int()
+    device_y = ctypes.c_int()
+    left_bottom = page_to_device(page, *bbox[:2], page_width, page_height, page_rotation, device_x, device_y)
+    top_right = page_to_device(page, *bbox[2:], page_width, page_height, page_rotation, device_x, device_y)
 
     dev_bbox = [left_bottom[0], top_right[1], top_right[0], left_bottom[1]]
     return dev_bbox
 
 
-def fast_page_bbox_to_device_bbox(page, bbox, page_width, page_height):
-    left, bottom, right, top = bbox
-
-    dev_bbox = [left, page_height-top, right, page_height-bottom]
-    return dev_bbox
-
-
-def page_bbox_to_device_bbox(page, bbox, page_width: int, page_height: int, bl_origin: bool, page_rotation: int, normalize=False):
+def page_bbox_to_device_bbox(page, bbox, page_width: int, page_height: int, page_rotation: int, normalize=False):
     orig_page_height, orig_page_width = page_height, page_width
     if page_rotation in [90, 270]:
         orig_page_height, orig_page_width = page_width, page_height
 
-    if bl_origin:
-        bbox = fast_page_bbox_to_device_bbox(page, bbox, page_width, page_height)
-        if page_rotation > 0:
-            bbox = rotate_page_bbox(bbox, page_rotation, page_width, page_height)
-    else:
-        bbox = pdfium_page_bbox_to_device_bbox(page, bbox, orig_page_width, orig_page_height, page_rotation)
-        if page_rotation > 0:
-            bbox = rotate_pdfium_bbox(bbox, page_rotation, page_width, page_height)
+    bbox = pdfium_page_bbox_to_device_bbox(page, bbox, orig_page_width, orig_page_height, page_rotation)
+    if page_rotation > 0:
+        bbox = rotate_pdfium_bbox(bbox, page_rotation, page_width, page_height)
 
     if normalize:
         bbox = [bbox[0] / page_width, bbox[1] / page_height, bbox[2] / page_width, bbox[3] / page_height]

diff --git a/pdftext/settings.py b/pdftext/settings.py
@@ -8,7 +8,7 @@ class Settings(BaseSettings):
     MODEL_PATH: str = os.path.join(BASE_PATH, "models", "dt.onnx")
 
     # Fonts
-    FONTNAME_SAMPLE_FREQ: int = 4
+    FONTNAME_SAMPLE_FREQ: int = 6
 
     # Inference
     BLOCK_THRESHOLD: float = 0.8 # Confidence threshold for block detection

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pdftext"
-version = "0.3.16"
+version = "0.3.17"
 description = "Extract structured text from pdfs quickly"
 authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
 license = "Apache-2.0"