Skip to content

Commit

Permalink
Enable option to keep individual characters
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed May 2, 2024
1 parent 1a56805 commit 3cd1706
Show file tree
Hide file tree
Showing 5 changed files with 17 additions and 5 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ pdftext PDF_PATH --out_path output.txt --json
- `--json` specifies json output
- `--sort` will attempt to sort in reading order if specified.
- `--pages` specifies which pages to extract, as a comma-separated list (for example `1,2,3`)
- `--keep_chars` will keep character-level information (each character and its bounding box) in the json output; it only takes effect together with `--json`

The output will be a json list, with each item in the list corresponding to a single page in the input pdf (in order). Each page will include the following keys:

Expand Down Expand Up @@ -84,7 +85,7 @@ import pypdfium2 as pdfium
from pdftext.extraction import dictionary_output

pdf = pdfium.PdfDocument(PDF_PATH)
text = dictionary_output(pdf, sort=False, page_range=[1,2,3]) # Optional arguments explained above
text = dictionary_output(pdf, sort=False, page_range=[1,2,3], keep_chars=False) # Optional arguments explained above
```

If you want more customization, check out the `pdftext.extraction._get_pages` function for a starting point to dig deeper. pdftext is a pretty thin wrapper around [pypdfium2](https://pypdfium2.readthedocs.io/en/stable/), so you might want to look at the documentation for that as well.
Expand Down
3 changes: 2 additions & 1 deletion extract_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ def main():
parser.add_argument("--sort", action="store_true", help="Attempt to sort the text by reading order", default=False)
parser.add_argument("--keep_hyphens", action="store_true", help="Keep hyphens in words", default=False)
parser.add_argument("--pages", type=str, help="Comma separated pages to extract, like 1,2,3", default=None)
parser.add_argument("--keep_chars", action="store_true", help="Keep character level information", default=False)
args = parser.parse_args()

pdf_doc = pdfium.PdfDocument(args.pdf_path)
Expand All @@ -22,7 +23,7 @@ def main():
assert all(p <= len(pdf_doc) for p in pages), "Invalid page number(s) provided"

if args.json:
text = dictionary_output(pdf_doc, sort=args.sort, page_range=pages)
text = dictionary_output(pdf_doc, sort=args.sort, page_range=pages, keep_chars=args.keep_chars)
text = json.dumps(text)
else:
text = plain_text_output(pdf_doc, sort=args.sort, hyphens=args.keep_hyphens, page_range=pages)
Expand Down
8 changes: 7 additions & 1 deletion pdftext/extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ def paginated_plain_text_output(pdf_doc, sort=False, model=None, hyphens=False,
return text


def dictionary_output(pdf_doc, sort=False, model=None, page_range=None):
def dictionary_output(pdf_doc, sort=False, model=None, page_range=None, keep_chars=False):
pages = _get_pages(pdf_doc, model, page_range)
for page in pages:
for block in page["blocks"]:
Expand All @@ -44,6 +44,12 @@ def dictionary_output(pdf_doc, sort=False, model=None, page_range=None):
span["text"] = postprocess_text(span["text"])
span["text"] = handle_hyphens(span["text"], keep_hyphens=True)

if not keep_chars:
del span["chars"]
else:
for char in span["chars"]:
char["bbox"] = unnormalize_bbox(char["bbox"], page["width"], page["height"])

line["bbox"] = unnormalize_bbox(line["bbox"], page["width"], page["height"])
block["bbox"] = unnormalize_bbox(block["bbox"], page["width"], page["height"])
if sort:
Expand Down
6 changes: 5 additions & 1 deletion pdftext/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,11 @@ def update_span(line, span):
span["char_start_idx"] = span["chars"][0]["char_idx"]
span["char_end_idx"] = span["chars"][-1]["char_idx"]

del span["chars"]
# Remove unneeded keys from the characters
for char in span["chars"]:
del_keys = [k for k in list(char.keys()) if k not in ["char", "bbox"]]
for key in del_keys:
del char[key]
line["spans"].append(span)
span = {"chars": []}
return span
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "pdftext"
version = "0.3.2"
version = "0.3.3"
description = "Extract structured text from pdfs quickly"
authors = ["Vik Paruchuri <vik.paruchuri@gmail.com>"]
license = "Apache-2.0"
Expand Down

0 comments on commit 3cd1706

Please sign in to comment.