Improve speed

wocclyl · Apr 25, 2024 · 7353d62 · 7353d62
1 parent d4721bf
commit 7353d62
Show file tree

Hide file tree

Showing 9 changed files with 118 additions and 109 deletions.
diff --git a/README.md b/README.md
@@ -1,22 +1,19 @@
 # PDFText
 
-Extracts text from pdfs in a similar way to [PymuPDF](https://github.com/pymupdf/PyMuPDF), but without the AGPL license.  Built on [pypdfium2](https://github.com/pypdfium2-team/pypdfium2).
+Text extraction like PyMuPDF, but without the AGPL license.  PDFText extracts plain text or structured blocks and lines, similar to [PyMuPDF](https://github.com/pymupdf/PyMuPDF).  It's built on [pypdfium2](https://github.com/pypdfium2-team/pypdfium2), so it's [fast, accurate](https://github.com/py-pdf/benchmarks), and Apache licensed.
 
 # Installation
 
-You'll need python 3.9+.
-
-Install with:
+You'll need Python 3.9+ first.  Then run:
 
 ```shell
 pip install pdftext
 ```
 
-# Usage
+# CLI Usage
 
 - Inspect the settings in `pdftext/settings.py`.  You can override any settings with environment variables.
 
-
 ## Plain text
 
 This command will write out a text file with the extracted plain text.
@@ -53,12 +50,29 @@ The output will be a json list, with each item in the list corresponding to a si
       - `char` - the actual character, encoded in utf-8
       - `rotation` - how much the character is rotated, in degrees
       - `bbox` - the character bbox, in [x1, y1, x2, y2] format
-      - `origin` - the original pdf coordinate origin
       - `char_idx` - the index of the character on the page (from 0 to number of characters, in original pdf order)
       - `font` - this is font info straight from the pdf, see [this pdfium code](https://pdfium.googlesource.com/pdfium/+/refs/heads/main/public/fpdf_text.h)
         - `size` - the size of the font used for the character
         - `weight` - font weight
         - `name` - font name, may be None
         - `flags` - font flags, in the format of the `PDF spec 1.7 Section 5.7.1 Font Descriptor Flags`
 
-- 
+# Programmatic usage
+
+Extract plain text:
+
+```python
+from pdftext.extraction import plain_text_output
+
+text = plain_text_output(PDF_PATH, sort=False)
+```
+
+Extract structured blocks and lines:
+
+```python
+from pdftext.extraction import dictionary_output
+
+text = dictionary_output(PDF_PATH)
+```
+
+
diff --git a/extract_text.py b/extract_text.py
@@ -17,7 +17,7 @@ def main():
     if args.output_type == "plain_text":
         text = plain_text_output(args.pdf_path, sort=args.sort)
     elif args.output_type == "json":
-        text = dictionary_output(args.pdf_path)
+        text = dictionary_output(args.pdf_path, sort=args.sort)
         text = json.dumps(text)
 
     if args.out_path is None:

diff --git a/models/dt.joblib b/models/dt.joblib
diff --git a/pdftext/extraction.py b/pdftext/extraction.py
@@ -1,6 +1,3 @@
-import copy
-from itertools import chain
-
 from pdftext.inference import inference
 from pdftext.model import get_model
 from pdftext.pdf.chars import get_pdfium_chars
@@ -31,12 +28,22 @@ def dictionary_output(pdf_path, sort=False):
             for key in bad_keys:
                 del block[key]
             for line in block["lines"]:
-                line["bbox"] = unnormalize_bbox(line["bbox"], page["bbox"])
+                line_box = None
                 bad_keys = [key for key in line.keys() if key not in ["chars", "bbox"]]
                 for key in bad_keys:
                     del line[key]
                 for char in line["chars"]:
                     char["bbox"] = unnormalize_bbox(char["bbox"], page["bbox"])
+                    if line_box is None:
+                        line_box = char["bbox"]
+                    else:
+                        line_box = [
+                            min(line_box[0], char["bbox"][0]),
+                            min(line_box[1], char["bbox"][1]),
+                            max(line_box[2], char["bbox"][2]),
+                            max(line_box[3], char["bbox"][3]),
+                        ]
+                line["bbox"] = line_box
             block["bbox"] = unnormalize_bbox(block["bbox"], page["bbox"])
         if sort:
             page["blocks"] = sort_blocks(page["blocks"])

diff --git a/pdftext/inference.py b/pdftext/inference.py
@@ -1,108 +1,101 @@
-import operator
-from collections import defaultdict
 from itertools import chain
 
-from pdftext.pdf.utils import SPACES, TABS, LINE_BREAKS, HYPHEN
-from pdftext.utils import replace_zero
+from pdftext.pdf.utils import LINE_BREAKS
 
 
 def update_current(current, new_char):
     bbox = new_char["bbox"]
     if "bbox" not in current:
-        current["bbox"] = list(bbox)
+        current_bbox = bbox
+        current["bbox"] = current_bbox
     else:
-        current["bbox"][0] = min(bbox[0], current["bbox"][0])
-        current["bbox"][1] = min(bbox[1], current["bbox"][1])
-        current["bbox"][2] = max(bbox[2], current["bbox"][2])
-        current["bbox"][3] = max(bbox[3], current["bbox"][3])
-    current["height"] = current["bbox"][2] - current["bbox"][0]
-    current["center_x"] = (current["bbox"][0] + current["bbox"][2]) / 2
-    current["center_y"] = (current["bbox"][1] + current["bbox"][3]) / 2
-    if "length" not in current:
-        current["length"] = 0
-    current["length"] += 1
+        current_bbox = current["bbox"]
+        current_bbox[0] = min(bbox[0], current_bbox[0])
+        current_bbox[1] = min(bbox[1], current_bbox[1])
+        current_bbox[2] = max(bbox[2], current_bbox[2])
+        current_bbox[3] = max(bbox[3], current_bbox[3])
+    current["center_x"] = (current_bbox[0] + current_bbox[2]) / 2
+    current["center_y"] = (current_bbox[1] + current_bbox[3]) / 2
     return current
 
 
-def create_training_row(char_info, prev_char, currspan, currline, currblock):
+def create_training_row(char_info, prev_char, currspan, currblock, avg_x_gap, avg_y_gap):
     char = char_info["char"]
     char_center_x = (char_info["bbox"][2] + char_info["bbox"][0]) / 2
     char_center_y = (char_info["bbox"][3] + char_info["bbox"][1]) / 2
     prev_char_center_x = (prev_char["bbox"][2] + prev_char["bbox"][0]) / 2
     prev_char_center_y = (prev_char["bbox"][3] + prev_char["bbox"][1]) / 2
-    char_height = char_info["bbox"][3] - char_info["bbox"][1]
-    char_width = char_info["bbox"][2] - char_info["bbox"][0]
-    training_row = {"is_space": char.isspace() or char in SPACES,
-                    "is_newline": char in LINE_BREAKS, "is_printable": char.isprintable(), "is_hyphen": char == HYPHEN,
-                    "char_x1": char_info["bbox"][0], "char_y1": char_info["bbox"][1],
-                    "char_x2": char_info["bbox"][2], "char_y2": char_info["bbox"][3],
-                    "prev_char_x1": prev_char["bbox"][0], "prev_char_y1": prev_char["bbox"][1],
-                    "prev_char_x2": prev_char["bbox"][2], "prev_char_y2": prev_char["bbox"][3],
-                    "x_gap": char_info["bbox"][0] - prev_char["bbox"][2],
-                    "y_gap": char_info["bbox"][1] - prev_char["bbox"][3],
-                    "x_center_gap": char_center_x - prev_char_center_x,
-                    "y_center_gap": char_center_y - prev_char_center_y,
-                    "span_len": len(currspan),
-                    "line_len": len(currline), "block_len": len(currblock), "height": char_height,
-                    "width": char_width,
-                    "width_ratio": char_width / replace_zero(prev_char["bbox"][2] - prev_char["bbox"][0]),
-                    "height_ratio": char_width / replace_zero(prev_char["bbox"][3] - prev_char["bbox"][1]),
-                    "block_x_center_gap": char_center_x - currblock["center_x"],
-                    "block_y_center_gap": char_center_y - currblock["center_y"],
-                    "line_x_center_gap": char_center_x - currline["center_x"],
-                    "line_y_center_gap": char_center_y - currblock["center_y"],
-                    "span_x_center_gap": char_center_x - currspan["center_x"],
-                    "span_y_center_gap": char_center_y - currspan["center_y"],
-                    "block_x_gap": char_info["bbox"][0] - currblock["bbox"][2],
-                    "block_y_gap": char_info["bbox"][1] - currblock["bbox"][3]}
+    x_gap = char_info["bbox"][0] - prev_char["bbox"][2]
+    y_gap = char_info["bbox"][1] - prev_char["bbox"][3]
+
+    training_row = {
+        "is_newline": char in LINE_BREAKS,
+        "x_gap": x_gap,
+        "y_gap": y_gap,
+        "x_outer_gap": char_info["bbox"][2] - prev_char["bbox"][0],
+        "y_outer_gap": char_info["bbox"][3] - prev_char["bbox"][1],
+        "x_gap_ratio": x_gap / avg_x_gap if avg_x_gap > 0 else 0,
+        "y_gap_ratio": y_gap / avg_y_gap if avg_y_gap > 0 else 0,
+        "x_center_gap": char_center_x - prev_char_center_x,
+        "y_center_gap": char_center_y - prev_char_center_y,
+        "block_x_center_gap": char_center_x - currblock["center_x"],
+        "block_y_center_gap": char_center_y - currblock["center_y"],
+        "span_x_center_gap": char_center_x - currspan["center_x"],
+        "span_y_center_gap": char_center_y - currspan["center_y"],
+        "block_x_gap": char_info["bbox"][0] - currblock["bbox"][2],
+        "block_y_gap": char_info["bbox"][1] - currblock["bbox"][3]
+    }
+
     return training_row
 
 
 def infer_single_page(text_chars):
     prev_char = None
 
-    blocks = defaultdict(list)
-    block = defaultdict(list)
-    line = defaultdict(list)
-    span = defaultdict(list)
+    blocks = {"blocks": []}
+    block = {"lines": []}
+    line = {"spans": []}
+    span = {"chars": []}
     for i, char_info in enumerate(text_chars["chars"]):
         if prev_char:
-            training_row = create_training_row(char_info, prev_char, span, line, block)
-            training_row = [v for k, v in sorted(training_row.items(), key=operator.itemgetter(0))]
+            training_row = create_training_row(char_info, prev_char, span, block, text_chars["avg_x_gap"], text_chars["avg_y_gap"])
+            training_row = [v for _, v in sorted(training_row.items())]
 
             prediction = yield training_row
             if prediction == 0:
                 pass
             elif prediction == 1:
                 line["spans"].append(span)
-                span = defaultdict(list)
+                span = {"chars": []}
             elif prediction == 2:
                 line["spans"].append(span)
-                line["chars"] = list(chain.from_iterable([s["chars"] for s in line["spans"]]))
+                line["chars"] = list(chain.from_iterable(s["chars"] for s in line["spans"]))
                 del line["spans"]
                 block["lines"].append(line)
-                line = defaultdict(list)
-                span = defaultdict(list)
+                line = {"spans": []}
+                span = {"chars": []}
             else:
                 line["spans"].append(span)
-                line["chars"] = list(chain.from_iterable([s["chars"] for s in line["spans"]]))
+                line["chars"] = list(chain.from_iterable(s["chars"] for s in line["spans"]))
                 del line["spans"]
                 block["lines"].append(line)
                 blocks["blocks"].append(block)
-                block = defaultdict(list)
-                line = defaultdict(list)
-                span = defaultdict(list)
+                block = {"lines": []}
+                line = {"spans": []}
+                span = {"chars": []}
 
         span["chars"].append(char_info)
         span = update_current(span, char_info)
-        line = update_current(line, char_info)
         block = update_current(block, char_info)
 
         prev_char = char_info
     if len(span["chars"]) > 0:
-        line["chars"] = list(chain.from_iterable([s["chars"] for s in line["spans"]]))
+        line["chars"] = list(chain.from_iterable(s["chars"] for s in line["spans"]))
+        del line["spans"]
+    if "spans" in line and len(line["spans"]) > 0:
+        line["chars"] = list(chain.from_iterable(s["chars"] for s in line["spans"]))
         del line["spans"]
-    if len(line["chars"]) > 0:
+    if "chars" in line and len(line["chars"]) > 0:
         block["lines"].append(line)
     if len(block["lines"]) > 0:
         blocks["blocks"].append(block)
@@ -139,14 +132,14 @@ def inference(text_chars, model):
         if len(page_blocks) == len(generators):
             break
 
-        training_list = sorted(training_data.items(), key=operator.itemgetter(0))
+        training_list = sorted(training_data.items())
         training_rows = [tl[1] for tl in training_list]
         training_idxs = [tl[0] for tl in training_list]
 
         predictions = model.predict(training_rows)
         for pred, page_idx in zip(predictions, training_idxs):
             next_prediction[page_idx] = pred
-    page_blocks = sorted(page_blocks.items(), key=operator.itemgetter(0))
+    page_blocks = sorted(page_blocks.items())
     page_blocks = [p[1] for p in page_blocks]
     assert len(page_blocks) == len(text_chars)
     return page_blocks
diff --git a/pdftext/pdf/chars.py b/pdftext/pdf/chars.py
@@ -1,28 +1,36 @@
+import decimal
 import math
 from collections import defaultdict
 
-from pdftext.pdf.utils import get_fontname, page_to_device, page_bbox_to_device_bbox, pdfium_page_bbox_to_device_bbox
+from pdftext.pdf.utils import get_fontname, pdfium_page_bbox_to_device_bbox
 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
 
 
 def get_pdfium_chars(pdf_path):
     pdf = pdfium.PdfDocument(pdf_path)
     blocks = []
+
     for page_idx in range(len(pdf)):
         page = pdf.get_page(page_idx)
         text_page = page.get_textpage()
 
-        text_chars = defaultdict(list)
-        text_chars["page"] = page_idx
-        text_chars["rotation"] = page.get_rotation()
         bbox = page.get_bbox()
         page_width = math.ceil(bbox[2] - bbox[0])
         page_height = math.ceil(abs(bbox[1] - bbox[3]))
-        #text_chars["bbox"] = page_bbox_to_device_bbox(bbox, page_width, page_height)
-        text_chars["bbox"] = pdfium_page_bbox_to_device_bbox(page, bbox, page_width, page_height)
 
-        for i in range(text_page.count_chars()):
+        text_chars = {
+            "chars": [],
+            "page": page_idx,
+            "rotation": page.get_rotation(),
+            "bbox": pdfium_page_bbox_to_device_bbox(page, bbox, page_width, page_height)
+        }
+
+        prev_bbox = None
+        x_gaps = decimal.Decimal(0)
+        y_gaps = decimal.Decimal(0)
+        total_chars = text_page.count_chars()
+        for i in range(total_chars):
             char = pdfium_c.FPDFText_GetUnicode(text_page, i)
             char = chr(char)
             fontsize = pdfium_c.FPDFText_GetFontSize(text_page, i)
@@ -31,8 +39,8 @@ def get_pdfium_chars(pdf_path):
             rotation = pdfium_c.FPDFText_GetCharAngle(text_page, i)
             rotation = rotation * 180 / math.pi # convert from radians to degrees
             coords = text_page.get_charbox(i, loose=True)
-            #device_coords = page_bbox_to_device_bbox(coords, page_width, page_height, normalize=True)
             device_coords = pdfium_page_bbox_to_device_bbox(page, coords, page_width, page_height, normalize=True)
+
             char_info = {
                 "font": {
                     "size": fontsize,
@@ -42,10 +50,18 @@ def get_pdfium_chars(pdf_path):
                 },
                 "rotation": rotation,
                 "char": char,
-                "origin": coords,
                 "bbox": device_coords,
                 "char_idx": i
             }
             text_chars["chars"].append(char_info)
+
+            if prev_bbox:
+                x_gaps += decimal.Decimal(device_coords[0] - prev_bbox[2])
+                y_gaps += decimal.Decimal(device_coords[1] - prev_bbox[3])
+            prev_bbox = device_coords
+
+        text_chars["avg_x_gap"] = float(x_gaps / total_chars) if total_chars > 0 else 0
+        text_chars["avg_y_gap"] = float(y_gaps / total_chars) if total_chars > 0 else 0
+        text_chars["total_chars"] = total_chars
         blocks.append(text_chars)
     return blocks
diff --git a/pdftext/pdf/utils.py b/pdftext/pdf/utils.py
@@ -67,30 +67,12 @@ def page_to_device(page, x, y, page_width, page_height):
     return x, y
 
 
-def page_bbox_to_device_bbox(pdf_bbox, page_width, page_height, normalize=False):
-    left, bottom, right, top = pdf_bbox
-
-    device_top = page_height - bottom
-    device_bottom = page_height - top
-    if normalize:
-        device_bbox = [left / page_width, device_top / page_height, right / page_width, device_bottom / page_height]
-    else:
-        device_bbox = [left, device_top, right, device_bottom]
-    return device_bbox
-
-
-def pdfium_page_bbox_to_device_bbox2(page, bbox, page_width, page_height, normalize=False):
-    dev_bbox = page_to_device(page, *bbox[:2], normalize=normalize) + page_to_device(page, *bbox[2:], normalize=normalize)
-    dev_bbox = (dev_bbox[0], dev_bbox[3], dev_bbox[2], dev_bbox[1])  # Convert to ltrb
-    return dev_bbox
-
-
 def pdfium_page_bbox_to_device_bbox(page, bbox, page_width, page_height, normalize=False):
     bbox_width = bbox[2] - bbox[0]
     bbox_height = bbox[3] - bbox[1]
     left_bottom = page_to_device(page, *bbox[:2], page_width, page_height)
 
-    dev_bbox = (left_bottom[0], left_bottom[1] - bbox_height, left_bottom[0] + bbox_width, left_bottom[1])   # Convert to ltrb
+    dev_bbox = [left_bottom[0], left_bottom[1] - bbox_height, left_bottom[0] + bbox_width, left_bottom[1]]   # Convert to ltrb
     if normalize:
         dev_bbox = [dev_bbox[0] / page_width, dev_bbox[1] / page_height, dev_bbox[2] / page_width, dev_bbox[3] / page_height]
     return dev_bbox