Enable option to keep individual characters

wocclyl · May 2, 2024 · 3cd1706 · 3cd1706
1 parent 1a56805
commit 3cd1706
Show file tree

Hide file tree

Showing 5 changed files with 17 additions and 5 deletions.
diff --git a/README.md b/README.md
@@ -41,6 +41,7 @@ pdftext PDF_PATH --out_path output.txt --json
 - `--json` specifies json output
 - `--sort` will attempt to sort in reading order if specified.
 - `--pages` will specify pages (comma separated) to extract
+- `--keep_chars` will keep individual characters (each with its `char` and `bbox`) in the json output; it only has an effect together with `--json`
 
 The output will be a json list, with each item in the list corresponding to a single page in the input pdf (in order).  Each page will include the following keys:
 
@@ -84,7 +85,7 @@ import pypdfium2 as pdfium
 from pdftext.extraction import dictionary_output
 
 pdf = pdfium.PdfDocument(PDF_PATH)
-text = dictionary_output(pdf, sort=False, page_range=[1,2,3]) # Optional arguments explained above
+text = dictionary_output(pdf, sort=False, page_range=[1,2,3], keep_chars=False) # Optional arguments explained above
 ```
 
 If you want more customization, check out the `pdftext.extraction._get_pages` function for a starting point to dig deeper.  pdftext is a pretty thin wrapper around [pypdfium2](https://pypdfium2.readthedocs.io/en/stable/), so you might want to look at the documentation for that as well.

diff --git a/extract_text.py b/extract_text.py
@@ -13,6 +13,7 @@ def main():
     parser.add_argument("--sort", action="store_true", help="Attempt to sort the text by reading order", default=False)
     parser.add_argument("--keep_hyphens", action="store_true", help="Keep hyphens in words", default=False)
     parser.add_argument("--pages", type=str, help="Comma separated pages to extract, like 1,2,3", default=None)
+    parser.add_argument("--keep_chars", action="store_true", help="Keep character level information", default=False)
     args = parser.parse_args()
 
     pdf_doc = pdfium.PdfDocument(args.pdf_path)
@@ -22,7 +23,7 @@ def main():
         assert all(p <= len(pdf_doc) for p in pages), "Invalid page number(s) provided"
 
     if args.json:
-        text = dictionary_output(pdf_doc, sort=args.sort, page_range=pages)
+        text = dictionary_output(pdf_doc, sort=args.sort, page_range=pages, keep_chars=args.keep_chars)
         text = json.dumps(text)
     else:
         text = plain_text_output(pdf_doc, sort=args.sort, hyphens=args.keep_hyphens, page_range=pages)

diff --git a/pdftext/extraction.py b/pdftext/extraction.py
@@ -28,7 +28,7 @@ def paginated_plain_text_output(pdf_doc, sort=False, model=None, hyphens=False,
     return text
 
 
-def dictionary_output(pdf_doc, sort=False, model=None, page_range=None):
+def dictionary_output(pdf_doc, sort=False, model=None, page_range=None, keep_chars=False):
     pages = _get_pages(pdf_doc, model, page_range)
     for page in pages:
         for block in page["blocks"]:
@@ -44,6 +44,12 @@ def dictionary_output(pdf_doc, sort=False, model=None, page_range=None):
                     span["text"] = postprocess_text(span["text"])
                     span["text"] = handle_hyphens(span["text"], keep_hyphens=True)
 
+                    if not keep_chars:
+                        del span["chars"]
+                    else:
+                        for char in span["chars"]:
+                            char["bbox"] = unnormalize_bbox(char["bbox"], page["width"], page["height"])
+
                 line["bbox"] = unnormalize_bbox(line["bbox"], page["width"], page["height"])
             block["bbox"] = unnormalize_bbox(block["bbox"], page["width"], page["height"])
         if sort:

diff --git a/pdftext/inference.py b/pdftext/inference.py
@@ -81,7 +81,11 @@ def update_span(line, span):
         span["char_start_idx"] = span["chars"][0]["char_idx"]
         span["char_end_idx"] = span["chars"][-1]["char_idx"]
 
-    del span["chars"]
+    # Remove unneeded keys from the characters
+    for char in span["chars"]:
+        del_keys = [k for k in list(char.keys()) if k not in ["char", "bbox"]]
+        for key in del_keys:
+            del char[key]
     line["spans"].append(span)
     span = {"chars": []}
     return span

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pdftext"
-version = "0.3.2"
+version = "0.3.3"
 description = "Extract structured text from pdfs quickly"
 authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
 license = "Apache-2.0"