Improve model quality

VikParuchuri · Apr 25, 2024 · cc6a6e4
1 parent 1501377
commit cc6a6e4
Show file tree

Hide file tree

Showing 5 changed files with 21 additions and 14 deletions.
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # PDFText
 
-Text extraction like [PyMuPDF]((https://github.com/pymupdf/PyMuPDF), but without the AGPL license.  PDFText extracts plain text or structured blocks and lines.  It's built on [pypdfium2](https://github.com/pypdfium2-team/pypdfium2), so it's [fast, accurate](#benchmarks), and Apache licensed.
+Text extraction like [PyMuPDF](https://github.com/pymupdf/PyMuPDF), but without the AGPL license.  PDFText extracts plain text or structured blocks and lines.  It's built on [pypdfium2](https://github.com/pypdfium2-team/pypdfium2), so it's [fast, accurate](#benchmarks), and Apache licensed.
 
 # Installation
 
@@ -81,13 +81,11 @@ I benchmarked extraction speed and accuracy of [pymupdf](https://pymupdf.readthe
 
 Here are the scores:
 
-+------------+-------------------+-----------------------------------------+
-|  Library   | Time (s per page) | Alignment Score (% accuracy vs pymupdf) |
-+------------+-------------------+-----------------------------------------+
-|  pymupdf   |       0.31        |                   --                    |
-|  pdftext   |       1.45        |                  95.64                  |
-| pdfplumber |       2.97        |                  89.88                  |
-+------------+-------------------+-----------------------------------------+
+| Library    | Time (s per page) | Alignment Score (% accuracy vs pymupdf) |
+|------------|-------------------|-----------------------------------------|
+| pymupdf    | 0.32              | --                                      |
+| pdftext    | 1.79              | 96.22                                   |
+| pdfplumber | 3.0               | 89.88                                   |
 
 pdftext is approximately 2x slower than using pypdfium2 alone (if you were to extract all the same information).
 
@@ -127,6 +125,6 @@ This is built on some amazing open source work, including:
 
 - [pypdfium2](https://github.com/pypdfium2-team/pypdfium2)
 - [scikit-learn](https://scikit-learn.org/stable/index.html)
-- [pypdf2](https://github.com/py-pdf/benchmarks) for very thorough and fair benchmarks
+- [pypdf](https://github.com/py-pdf/benchmarks) for very thorough and fair benchmarks
 
 Thank you to the [pymupdf](https://github.com/pymupdf/PyMuPDF) devs for creating such a great library - I just wish it had a simpler license!
diff --git a/benchmark.py b/benchmark.py
@@ -96,7 +96,7 @@ def main():
     table_alignments.insert(0, "--")
 
     table = [(tool, time, alignment) for tool, time, alignment in zip(times_tools, table_times, table_alignments)]
-    table = tabulate.tabulate(table, tablefmt="pretty", headers=headers)
+    table = tabulate.tabulate(table, tablefmt="github", headers=headers)
     print(table)
 
     results = {

diff --git a/models/dt.joblib b/models/dt.joblib
diff --git a/pdftext/inference.py b/pdftext/inference.py
@@ -19,7 +19,7 @@ def update_current(current, new_char):
     return current
 
 
-def create_training_row(char_info, prev_char, currblock):
+def create_training_row(char_info, prev_char, currblock, currline):
     char = char_info["char"]
     char_center_x = (char_info["bbox"][2] + char_info["bbox"][0]) / 2
     char_center_y = (char_info["bbox"][3] + char_info["bbox"][1]) / 2
@@ -42,10 +42,18 @@ def create_training_row(char_info, prev_char, currblock):
         "font_match": font_match,
         "x_outer_gap": char_info["bbox"][2] - prev_char["bbox"][0],
         "y_outer_gap": char_info["bbox"][3] - prev_char["bbox"][1],
+        "line_x_center_gap": char_center_x - currline["center_x"],
+        "line_y_center_gap": char_center_y - currline["center_y"],
+        "line_x_gap": char_info["bbox"][0] - currline["bbox"][2],
+        "line_y_gap": char_info["bbox"][1] - currline["bbox"][3],
+        "line_x_start_gap": char_info["bbox"][0] - currline["bbox"][0],
+        "line_y_start_gap": char_info["bbox"][1] - currline["bbox"][1],
         "block_x_center_gap": char_center_x - currblock["center_x"],
         "block_y_center_gap": char_center_y - currblock["center_y"],
         "block_x_gap": char_info["bbox"][0] - currblock["bbox"][2],
-        "block_y_gap": char_info["bbox"][1] - currblock["bbox"][3]
+        "block_y_gap": char_info["bbox"][1] - currblock["bbox"][3],
+        "block_x_start_gap": char_info["bbox"][0] - currblock["bbox"][0],
+        "block_y_start_gap": char_info["bbox"][1] - currblock["bbox"][1]
     }
 
     return training_row
@@ -80,7 +88,7 @@ def infer_single_page(text_chars):
     span = {"chars": []}
     for i, char_info in enumerate(text_chars["chars"]):
         if prev_char:
-            training_row = create_training_row(char_info, prev_char, block)
+            training_row = create_training_row(char_info, prev_char, block, line)
             training_row = [v for _, v in sorted(training_row.items())]
 
             prediction = yield training_row
@@ -97,6 +105,7 @@ def infer_single_page(text_chars):
                 block = update_block(blocks, block)
 
         span["chars"].append(char_info)
+        line = update_current(line, char_info)
         block = update_current(block, char_info)
 
         prev_char = char_info

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pdftext"
-version = "0.1.0"
+version = "0.1.1"
 description = "Extract structured text from pdfs quickly"
 authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
 license = "Apache-2.0"