Improve speed

VikParuchuri · Apr 24, 2024 · d4721bf · d4721bf
1 parent 13e2722
commit d4721bf
Show file tree

Hide file tree

Showing 9 changed files with 340 additions and 40 deletions.
diff --git a/.gitignore b/.gitignore
@@ -3,6 +3,7 @@ temp.txt
 temp.json
 notebooks
 results
+.DS_Store
 
 # Byte-compiled / optimized / DLL files
 __pycache__/

diff --git a/benchmark.py b/benchmark.py
@@ -7,6 +7,7 @@
 
 import fitz as pymupdf
 import datasets
+import pdfplumber
 
 from pdftext.extraction import dictionary_output
 from pdftext.settings import settings
@@ -22,6 +23,16 @@ def pymupdf_inference(pdf_path):
     return pages
 
 
+def pdfplumber_inference(pdf_path):
+    """Extract the text of every page of ``pdf_path`` with pdfplumber.
+
+    Returns a list holding one ``page.extract_text()`` result per page,
+    in page order. The document is closed when the ``with`` block exits.
+    """
+    with pdfplumber.open(pdf_path) as pdf:
+        page_texts = [page.extract_text() for page in pdf.pages]
+    return page_texts
+
+
 def main():
     parser = argparse.ArgumentParser(description="Benchmark pdf extraction.")
     parser.add_argument("--result_path", type=str, help="Path to the output text file, defaults to stdout", default=None)
@@ -35,6 +46,7 @@ def main():
 
     mu_times = []
     pdftext_times = []
+    pdfplumber_times = []
     for i in range(len(dataset)):
         row = dataset[i]
         pdf = row["pdf"]
@@ -52,12 +64,18 @@ def main():
             pdftext_pages = dictionary_output(pdf_path)
             pdftext_times.append(time.time() - start)
 
+            start = time.time()
+            pdfplumber_pages = pdfplumber_inference(pdf_path)
+            pdfplumber_times.append(time.time() - start)
+
     print(f"MuPDF avg time: {mean(mu_times):.2f}")
+    print(f"pdfplumber avg time: {mean(pdfplumber_times):.2f}")
     print(f"pdftext avg time: {mean(pdftext_times):.2f}")
 
     results = {
         "mu_times": mu_times,
-        "pdftext_times": pdftext_times
+        "pdftext_times": pdftext_times,
+        "pdfplumber_times": pdfplumber_times
     }
 
     result_path = args.result_path

diff --git a/extract_text.py b/extract_text.py
@@ -23,7 +23,7 @@ def main():
     if args.out_path is None:
         print(text)
     else:
-        with open(args.out_path, "w") as f:
+        with open(args.out_path, "w+") as f:
             f.write(text)
 
 

diff --git a/pdftext/extraction.py b/pdftext/extraction.py
@@ -25,26 +25,19 @@ def plain_text_output(pdf_path, sort=False):
 
 def dictionary_output(pdf_path, sort=False):
+    """Return the pages of ``pdf_path`` as dictionaries, trimmed for output.
+
+    The page dictionaries returned by ``_get_pages`` are modified in place:
+    every block keeps only its "lines" and "bbox" keys, every line keeps only
+    its "chars" and "bbox" keys, and the bbox of each block, line and char is
+    passed through ``unnormalize_bbox`` together with the page bbox.
+
+    When ``sort`` is true, each page's "blocks" list is replaced by the
+    result of ``sort_blocks``. The same list of page dictionaries is returned.
+    """
     pages = _get_pages(pdf_path)
-    merged_pages = []
     for page in pages:
-        merged_page = {
-            "page_idx": page["page"],
-            "rotation": page["rotation"],
-            "bbox": page["bbox"],
-            "blocks": []
-        }
         for block in page["blocks"]:
-            merged_lines = []
+            # Collect the keys first; deleting while iterating the dict would fail.
+            bad_keys = [key for key in block.keys() if key not in ["lines", "bbox"]]
+            for key in bad_keys:
+                del block[key]
             for line in block["lines"]:
-                chars = [s["chars"] for s in line["spans"]]
-                chars = chain.from_iterable(chars)
-                line["chars"] = chars
-                del line["spans"]
                 line["bbox"] = unnormalize_bbox(line["bbox"], page["bbox"])
-            block["lines"] = merged_lines
+                # Lines are expected to already carry a flat "chars" list here.
+                bad_keys = [key for key in line.keys() if key not in ["chars", "bbox"]]
+                for key in bad_keys:
+                    del line[key]
+                for char in line["chars"]:
+                    char["bbox"] = unnormalize_bbox(char["bbox"], page["bbox"])
             block["bbox"] = unnormalize_bbox(block["bbox"], page["bbox"])
-            merged_page["blocks"].append(block)
         if sort:
-            merged_page["blocks"] = sort_blocks(merged_page["blocks"])
-        merged_pages.append(merged_page)
-    return merged_pages
+            page["blocks"] = sort_blocks(page["blocks"])
+    return pages
diff --git a/pdftext/inference.py b/pdftext/inference.py
@@ -1,4 +1,7 @@
+import operator
 from collections import defaultdict
+from itertools import chain
+
 from pdftext.pdf.utils import SPACES, TABS, LINE_BREAKS, HYPHEN
 from pdftext.utils import replace_zero
 
@@ -65,7 +68,7 @@ def infer_single_page(text_chars):
     for i, char_info in enumerate(text_chars["chars"]):
         if prev_char:
             training_row = create_training_row(char_info, prev_char, span, line, block)
-            training_row = [v for k, v in sorted(training_row.items(), key=lambda x: x[0])]
+            training_row = [v for k, v in sorted(training_row.items(), key=operator.itemgetter(0))]
 
             prediction = yield training_row
             if prediction == 0:
@@ -75,11 +78,15 @@ def infer_single_page(text_chars):
                 span = defaultdict(list)
             elif prediction == 2:
                 line["spans"].append(span)
+                line["chars"] = list(chain.from_iterable([s["chars"] for s in line["spans"]]))
+                del line["spans"]
                 block["lines"].append(line)
                 line = defaultdict(list)
                 span = defaultdict(list)
             else:
                 line["spans"].append(span)
+                line["chars"] = list(chain.from_iterable([s["chars"] for s in line["spans"]]))
+                del line["spans"]
                 block["lines"].append(line)
                 blocks["blocks"].append(block)
                 block = defaultdict(list)
@@ -92,11 +99,12 @@ def infer_single_page(text_chars):
         block = update_current(block, char_info)
 
         prev_char = char_info
-    if len(span) > 0:
-        line["spans"].append(span)
-    if len(line) > 0:
+    if len(span["chars"]) > 0:
+        line["chars"] = list(chain.from_iterable([s["chars"] for s in line["spans"]]))
+        del line["spans"]
+    if len(line["chars"]) > 0:
         block["lines"].append(line)
-    if len(block) > 0:
+    if len(block["lines"]) > 0:
         blocks["blocks"].append(block)
 
     blocks["page"] = text_chars["page"]
@@ -108,7 +116,7 @@ def infer_single_page(text_chars):
 def inference(text_chars, model):
     # Create generators and get first training row from each
     generators = [infer_single_page(text_page) for text_page in text_chars]
-    next_prediction = {idx: next(gen) for idx, gen in enumerate(generators)}
+    next_prediction = {}
 
     page_blocks = {}
     while len(page_blocks) < len(generators):
@@ -118,8 +126,11 @@ def inference(text_chars, model):
                 continue
 
             try:
-                training_row = page_generator.send(next_prediction[page_idx])
-                del next_prediction[page_idx]
+                if page_idx not in next_prediction:
+                    training_row = next(page_generator)
+                else:
+                    training_row = page_generator.send(next_prediction[page_idx])
+                    del next_prediction[page_idx]
                 training_data[page_idx] = training_row
             except StopIteration as e:
                 blocks = e.value
@@ -128,14 +139,14 @@ def inference(text_chars, model):
         if len(page_blocks) == len(generators):
             break
 
-        training_list = sorted(training_data.items(), key=lambda x: x[0])
+        training_list = sorted(training_data.items(), key=operator.itemgetter(0))
         training_rows = [tl[1] for tl in training_list]
         training_idxs = [tl[0] for tl in training_list]
 
         predictions = model.predict(training_rows)
         for pred, page_idx in zip(predictions, training_idxs):
             next_prediction[page_idx] = pred
-    page_blocks = sorted(page_blocks.items(), key=lambda x: x[0])
+    page_blocks = sorted(page_blocks.items(), key=operator.itemgetter(0))
     page_blocks = [p[1] for p in page_blocks]
     assert len(page_blocks) == len(text_chars)
     return page_blocks
diff --git a/pdftext/pdf/chars.py b/pdftext/pdf/chars.py
@@ -1,7 +1,7 @@
 import math
 from collections import defaultdict
 
-from pdftext.pdf.utils import get_fontname, page_to_device, page_bbox_to_device_bbox
+from pdftext.pdf.utils import get_fontname, page_to_device, page_bbox_to_device_bbox, pdfium_page_bbox_to_device_bbox
 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
 
@@ -17,7 +17,10 @@ def get_pdfium_chars(pdf_path):
         text_chars["page"] = page_idx
         text_chars["rotation"] = page.get_rotation()
         bbox = page.get_bbox()
-        text_chars["bbox"] = page_bbox_to_device_bbox(page, bbox, normalize=False)
+        page_width = math.ceil(bbox[2] - bbox[0])
+        page_height = math.ceil(abs(bbox[1] - bbox[3]))
+        #text_chars["bbox"] = page_bbox_to_device_bbox(bbox, page_width, page_height)
+        text_chars["bbox"] = pdfium_page_bbox_to_device_bbox(page, bbox, page_width, page_height)
 
         for i in range(text_page.count_chars()):
             char = pdfium_c.FPDFText_GetUnicode(text_page, i)
@@ -28,7 +31,8 @@ def get_pdfium_chars(pdf_path):
             rotation = pdfium_c.FPDFText_GetCharAngle(text_page, i)
             rotation = rotation * 180 / math.pi # convert from radians to degrees
             coords = text_page.get_charbox(i, loose=True)
-            device_coords = page_bbox_to_device_bbox(page, coords)
+            #device_coords = page_bbox_to_device_bbox(coords, page_width, page_height, normalize=True)
+            device_coords = pdfium_page_bbox_to_device_bbox(page, coords, page_width, page_height, normalize=True)
             char_info = {
                 "font": {
                     "size": fontsize,

diff --git a/pdftext/pdf/utils.py b/pdftext/pdf/utils.py
@@ -53,24 +53,44 @@ def get_fontname(textpage, char_index):
     return decoded, flag_buffer.value
 
 
-def page_to_device(page, x, y, normalize=True):
+def page_to_device(page, x, y, page_width, page_height):
+    """Convert the page-space point ``(x, y)`` with pdfium's ``FPDF_PageToDevice``.
+
+    ``page_width`` and ``page_height`` are rounded up with ``math.ceil`` and
+    passed as the size arguments of the call, with a start offset of (0, 0)
+    and the rotation reported by ``FPDFPage_GetRotation``.
+
+    Returns the resulting ``(x, y)`` pair as plain ints; the values are not
+    normalised.
+    """
+    # Output slots that FPDF_PageToDevice fills through the pointers below.
     device_x = ctypes.c_int()
     device_y = ctypes.c_int()
     device_x_ptr = ctypes.pointer(device_x)
     device_y_ptr = ctypes.pointer(device_y)
     rotation = pdfium_c.FPDFPage_GetRotation(page)
-    width = math.ceil(page.get_width())
-    height = math.ceil(page.get_height())
+    width = math.ceil(page_width)
+    height = math.ceil(page_height)
     pdfium_c.FPDF_PageToDevice(page, 0, 0, width, height, rotation, x, y, device_x_ptr, device_y_ptr)
     x = device_x.value
     y = device_y.value
-    if normalize:
-        x = x / width # Normalise to 0-1
-        y = y / height # Normalise to 0-1
     return x, y
 
 
-def page_bbox_to_device_bbox(page, bbox, normalize=True):
+def page_bbox_to_device_bbox(pdf_bbox, page_width, page_height, normalize=False):
+    """Flip a PDF-space box into top-left-origin device space without calling pdfium.
+
+    Args:
+        pdf_bbox: ``(left, bottom, right, top)`` with the y axis pointing up.
+        page_width: Page width used as the divisor when normalising x values.
+        page_height: Page height used to flip the y axis and to normalise y values.
+        normalize: When true, divide x values by ``page_width`` and y values
+            by ``page_height``.
+
+    Returns:
+        ``[left, top, right, bottom]`` in device space (y axis pointing down),
+        so the second element is never larger than the fourth for a
+        well-formed input box.
+    """
+    left, bottom, right, top = pdf_bbox
+
+    # Flipping the y axis turns the PDF top edge into the smaller device y value.
+    device_top = page_height - top
+    device_bottom = page_height - bottom
+    if normalize:
+        device_bbox = [left / page_width, device_top / page_height, right / page_width, device_bottom / page_height]
+    else:
+        device_bbox = [left, device_top, right, device_bottom]
+    return device_bbox
+
+
+def pdfium_page_bbox_to_device_bbox2(page, bbox, page_width, page_height, normalize=False):
+    """Convert a page-space box to device space by mapping both corners through pdfium.
+
+    Args:
+        page: pdfium page handle forwarded to ``page_to_device``.
+        bbox: ``(x0, y0, x1, y1)`` in page space; each corner is converted
+            separately.
+        page_width: Page width forwarded to ``page_to_device`` and used as the
+            x divisor when normalising.
+        page_height: Page height forwarded to ``page_to_device`` and used as
+            the y divisor when normalising.
+        normalize: When true, divide the device coordinates by the page size.
+
+    Returns:
+        The device box reordered to ltrb: a tuple, or a list when
+        ``normalize`` is true (matching ``pdfium_page_bbox_to_device_bbox``).
+    """
-    dev_bbox = page_to_device(page, *bbox[:2], normalize=normalize) + page_to_device(page, *bbox[2:], normalize=normalize)
+    # page_to_device no longer accepts ``normalize``; it needs the page size instead.
+    dev_bbox = page_to_device(page, *bbox[:2], page_width, page_height) + page_to_device(page, *bbox[2:], page_width, page_height)
     dev_bbox = (dev_bbox[0], dev_bbox[3], dev_bbox[2], dev_bbox[1])  # Convert to ltrb
+    if normalize:
+        dev_bbox = [dev_bbox[0] / page_width, dev_bbox[1] / page_height, dev_bbox[2] / page_width, dev_bbox[3] / page_height]
+    return dev_bbox
+
+
+def pdfium_page_bbox_to_device_bbox(page, bbox, page_width, page_height, normalize=False):
+    """Build a device-space box from one converted corner plus the box's page-space size.
+
+    Only the first corner ``bbox[:2]`` goes through ``page_to_device``; the
+    opposite corner is derived by applying the page-space width and height
+    to that point. Returns an ltrb tuple, or a list divided by
+    ``page_width`` / ``page_height`` when ``normalize`` is true.
+
+    NOTE(review): the width and height are applied unrotated; confirm this
+    is acceptable for pages with a non-zero rotation.
+    """
+    span_x = bbox[2] - bbox[0]
+    span_y = bbox[3] - bbox[1]
+    anchor_x, anchor_y = page_to_device(page, *bbox[:2], page_width, page_height)
+
+    # ltrb order: the converted corner supplies the left and bottom edges.
+    dev_bbox = (anchor_x, anchor_y - span_y, anchor_x + span_x, anchor_y)
+    if normalize:
+        divisors = (page_width, page_height, page_width, page_height)
+        dev_bbox = [coord / size for coord, size in zip(dev_bbox, divisors)]
     return dev_bbox