Skip to content

Commit

Permalink
Improve speed
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Apr 25, 2024
1 parent d4721bf commit 7353d62
Show file tree
Hide file tree
Showing 9 changed files with 118 additions and 109 deletions.
30 changes: 22 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,22 +1,19 @@
# PDFText

Extracts text from PDFs in a similar way to [PyMuPDF](https://github.com/pymupdf/PyMuPDF), but without the AGPL license. Built on [pypdfium2](https://github.com/pypdfium2-team/pypdfium2).
Text extraction like PyMuPDF, but without the AGPL license. PDFText extracts plain text or structured blocks and lines, similar to [PyMuPDF](https://github.com/pymupdf/PyMuPDF). It's built on [pypdfium2](https://github.com/pypdfium2-team/pypdfium2), so it's [fast, accurate](https://github.com/py-pdf/benchmarks), and Apache licensed.

# Installation

You'll need Python 3.9+.

Install with:
You'll need Python 3.9+ first. Then run:

```shell
pip install pdftext
```

# Usage
# CLI Usage

- Inspect the settings in `pdftext/settings.py`. You can override any settings with environment variables.


## Plain text

This command will write out a text file with the extracted plain text.
Expand Down Expand Up @@ -53,12 +50,29 @@ The output will be a json list, with each item in the list corresponding to a si
- `char` - the actual character, encoded in utf-8
- `rotation` - how much the character is rotated, in degrees
- `bbox` - the character bbox, in [x1, y1, x2, y2] format
- `origin` - the original pdf coordinate origin
- `char_idx` - the index of the character on the page (from 0 to number of characters, in original pdf order)
- `font` - font info straight from the PDF; see [this pdfium code](https://pdfium.googlesource.com/pdfium/+/refs/heads/main/public/fpdf_text.h)
- `size` - the size of the font used for the character
- `weight` - font weight
- `name` - font name, may be None
- `flags` - font flags, in the format of the `PDF spec 1.7 Section 5.7.1 Font Descriptor Flags`

-
# Programmatic usage

Extract plain text:

```python
from pdftext.extraction import plain_text_output

text = plain_text_output(PDF_PATH, sort=False)
```

Extract structured blocks and lines:

```python
from pdftext.extraction import dictionary_output

text = dictionary_output(PDF_PATH)
```


2 changes: 1 addition & 1 deletion extract_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def main():
if args.output_type == "plain_text":
text = plain_text_output(args.pdf_path, sort=args.sort)
elif args.output_type == "json":
text = dictionary_output(args.pdf_path)
text = dictionary_output(args.pdf_path, sort=args.sort)
text = json.dumps(text)

if args.out_path is None:
Expand Down
Binary file modified models/dt.joblib
Binary file not shown.
15 changes: 11 additions & 4 deletions pdftext/extraction.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,3 @@
import copy
from itertools import chain

from pdftext.inference import inference
from pdftext.model import get_model
from pdftext.pdf.chars import get_pdfium_chars
Expand Down Expand Up @@ -31,12 +28,22 @@ def dictionary_output(pdf_path, sort=False):
for key in bad_keys:
del block[key]
for line in block["lines"]:
line["bbox"] = unnormalize_bbox(line["bbox"], page["bbox"])
line_box = None
bad_keys = [key for key in line.keys() if key not in ["chars", "bbox"]]
for key in bad_keys:
del line[key]
for char in line["chars"]:
char["bbox"] = unnormalize_bbox(char["bbox"], page["bbox"])
if line_box is None:
line_box = char["bbox"]
else:
line_box = [
min(line_box[0], char["bbox"][0]),
min(line_box[1], char["bbox"][1]),
max(line_box[2], char["bbox"][2]),
max(line_box[3], char["bbox"][3]),
]
line["bbox"] = line_box
block["bbox"] = unnormalize_bbox(block["bbox"], page["bbox"])
if sort:
page["blocks"] = sort_blocks(page["blocks"])
Expand Down
113 changes: 53 additions & 60 deletions pdftext/inference.py
Original file line number Diff line number Diff line change
@@ -1,108 +1,101 @@
import operator
from collections import defaultdict
from itertools import chain

from pdftext.pdf.utils import SPACES, TABS, LINE_BREAKS, HYPHEN
from pdftext.utils import replace_zero
from pdftext.pdf.utils import LINE_BREAKS


def update_current(current, new_char):
bbox = new_char["bbox"]
if "bbox" not in current:
current["bbox"] = list(bbox)
current_bbox = bbox
current["bbox"] = current_bbox
else:
current["bbox"][0] = min(bbox[0], current["bbox"][0])
current["bbox"][1] = min(bbox[1], current["bbox"][1])
current["bbox"][2] = max(bbox[2], current["bbox"][2])
current["bbox"][3] = max(bbox[3], current["bbox"][3])
current["height"] = current["bbox"][2] - current["bbox"][0]
current["center_x"] = (current["bbox"][0] + current["bbox"][2]) / 2
current["center_y"] = (current["bbox"][1] + current["bbox"][3]) / 2
if "length" not in current:
current["length"] = 0
current["length"] += 1
current_bbox = current["bbox"]
current_bbox[0] = min(bbox[0], current_bbox[0])
current_bbox[1] = min(bbox[1], current_bbox[1])
current_bbox[2] = max(bbox[2], current_bbox[2])
current_bbox[3] = max(bbox[3], current_bbox[3])
current["center_x"] = (current_bbox[0] + current_bbox[2]) / 2
current["center_y"] = (current_bbox[1] + current_bbox[3]) / 2
return current


def create_training_row(char_info, prev_char, currspan, currline, currblock):
def create_training_row(char_info, prev_char, currspan, currblock, avg_x_gap, avg_y_gap):
char = char_info["char"]
char_center_x = (char_info["bbox"][2] + char_info["bbox"][0]) / 2
char_center_y = (char_info["bbox"][3] + char_info["bbox"][1]) / 2
prev_char_center_x = (prev_char["bbox"][2] + prev_char["bbox"][0]) / 2
prev_char_center_y = (prev_char["bbox"][3] + prev_char["bbox"][1]) / 2
char_height = char_info["bbox"][3] - char_info["bbox"][1]
char_width = char_info["bbox"][2] - char_info["bbox"][0]
training_row = {"is_space": char.isspace() or char in SPACES,
"is_newline": char in LINE_BREAKS, "is_printable": char.isprintable(), "is_hyphen": char == HYPHEN,
"char_x1": char_info["bbox"][0], "char_y1": char_info["bbox"][1],
"char_x2": char_info["bbox"][2], "char_y2": char_info["bbox"][3],
"prev_char_x1": prev_char["bbox"][0], "prev_char_y1": prev_char["bbox"][1],
"prev_char_x2": prev_char["bbox"][2], "prev_char_y2": prev_char["bbox"][3],
"x_gap": char_info["bbox"][0] - prev_char["bbox"][2],
"y_gap": char_info["bbox"][1] - prev_char["bbox"][3],
"x_center_gap": char_center_x - prev_char_center_x,
"y_center_gap": char_center_y - prev_char_center_y,
"span_len": len(currspan),
"line_len": len(currline), "block_len": len(currblock), "height": char_height,
"width": char_width,
"width_ratio": char_width / replace_zero(prev_char["bbox"][2] - prev_char["bbox"][0]),
"height_ratio": char_width / replace_zero(prev_char["bbox"][3] - prev_char["bbox"][1]),
"block_x_center_gap": char_center_x - currblock["center_x"],
"block_y_center_gap": char_center_y - currblock["center_y"],
"line_x_center_gap": char_center_x - currline["center_x"],
"line_y_center_gap": char_center_y - currblock["center_y"],
"span_x_center_gap": char_center_x - currspan["center_x"],
"span_y_center_gap": char_center_y - currspan["center_y"],
"block_x_gap": char_info["bbox"][0] - currblock["bbox"][2],
"block_y_gap": char_info["bbox"][1] - currblock["bbox"][3]}
x_gap = char_info["bbox"][0] - prev_char["bbox"][2]
y_gap = char_info["bbox"][1] - prev_char["bbox"][3]

training_row = {
"is_newline": char in LINE_BREAKS,
"x_gap": x_gap,
"y_gap": y_gap,
"x_outer_gap": char_info["bbox"][2] - prev_char["bbox"][0],
"y_outer_gap": char_info["bbox"][3] - prev_char["bbox"][1],
"x_gap_ratio": x_gap / avg_x_gap if avg_x_gap > 0 else 0,
"y_gap_ratio": y_gap / avg_y_gap if avg_y_gap > 0 else 0,
"x_center_gap": char_center_x - prev_char_center_x,
"y_center_gap": char_center_y - prev_char_center_y,
"block_x_center_gap": char_center_x - currblock["center_x"],
"block_y_center_gap": char_center_y - currblock["center_y"],
"span_x_center_gap": char_center_x - currspan["center_x"],
"span_y_center_gap": char_center_y - currspan["center_y"],
"block_x_gap": char_info["bbox"][0] - currblock["bbox"][2],
"block_y_gap": char_info["bbox"][1] - currblock["bbox"][3]
}

return training_row


def infer_single_page(text_chars):
prev_char = None

blocks = defaultdict(list)
block = defaultdict(list)
line = defaultdict(list)
span = defaultdict(list)
blocks = {"blocks": []}
block = {"lines": []}
line = {"spans": []}
span = {"chars": []}
for i, char_info in enumerate(text_chars["chars"]):
if prev_char:
training_row = create_training_row(char_info, prev_char, span, line, block)
training_row = [v for k, v in sorted(training_row.items(), key=operator.itemgetter(0))]
training_row = create_training_row(char_info, prev_char, span, block, text_chars["avg_x_gap"], text_chars["avg_y_gap"])
training_row = [v for _, v in sorted(training_row.items())]

prediction = yield training_row
if prediction == 0:
pass
elif prediction == 1:
line["spans"].append(span)
span = defaultdict(list)
span = {"chars": []}
elif prediction == 2:
line["spans"].append(span)
line["chars"] = list(chain.from_iterable([s["chars"] for s in line["spans"]]))
line["chars"] = list(chain.from_iterable(s["chars"] for s in line["spans"]))
del line["spans"]
block["lines"].append(line)
line = defaultdict(list)
span = defaultdict(list)
line = {"spans": []}
span = {"chars": []}
else:
line["spans"].append(span)
line["chars"] = list(chain.from_iterable([s["chars"] for s in line["spans"]]))
line["chars"] = list(chain.from_iterable(s["chars"] for s in line["spans"]))
del line["spans"]
block["lines"].append(line)
blocks["blocks"].append(block)
block = defaultdict(list)
line = defaultdict(list)
span = defaultdict(list)
block = {"lines": []}
line = {"spans": []}
span = {"chars": []}

span["chars"].append(char_info)
span = update_current(span, char_info)
line = update_current(line, char_info)
block = update_current(block, char_info)

prev_char = char_info
if len(span["chars"]) > 0:
line["chars"] = list(chain.from_iterable([s["chars"] for s in line["spans"]]))
line["chars"] = list(chain.from_iterable(s["chars"] for s in line["spans"]))
del line["spans"]
if "spans" in line and len(line["spans"]) > 0:
line["chars"] = list(chain.from_iterable(s["chars"] for s in line["spans"]))
del line["spans"]
if len(line["chars"]) > 0:
if "chars" in line and len(line["chars"]) > 0:
block["lines"].append(line)
if len(block["lines"]) > 0:
blocks["blocks"].append(block)
Expand Down Expand Up @@ -139,14 +132,14 @@ def inference(text_chars, model):
if len(page_blocks) == len(generators):
break

training_list = sorted(training_data.items(), key=operator.itemgetter(0))
training_list = sorted(training_data.items())
training_rows = [tl[1] for tl in training_list]
training_idxs = [tl[0] for tl in training_list]

predictions = model.predict(training_rows)
for pred, page_idx in zip(predictions, training_idxs):
next_prediction[page_idx] = pred
page_blocks = sorted(page_blocks.items(), key=operator.itemgetter(0))
page_blocks = sorted(page_blocks.items())
page_blocks = [p[1] for p in page_blocks]
assert len(page_blocks) == len(text_chars)
return page_blocks
34 changes: 25 additions & 9 deletions pdftext/pdf/chars.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,36 @@
import decimal
import math
from collections import defaultdict

from pdftext.pdf.utils import get_fontname, page_to_device, page_bbox_to_device_bbox, pdfium_page_bbox_to_device_bbox
from pdftext.pdf.utils import get_fontname, pdfium_page_bbox_to_device_bbox
import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c


def get_pdfium_chars(pdf_path):
pdf = pdfium.PdfDocument(pdf_path)
blocks = []

for page_idx in range(len(pdf)):
page = pdf.get_page(page_idx)
text_page = page.get_textpage()

text_chars = defaultdict(list)
text_chars["page"] = page_idx
text_chars["rotation"] = page.get_rotation()
bbox = page.get_bbox()
page_width = math.ceil(bbox[2] - bbox[0])
page_height = math.ceil(abs(bbox[1] - bbox[3]))
#text_chars["bbox"] = page_bbox_to_device_bbox(bbox, page_width, page_height)
text_chars["bbox"] = pdfium_page_bbox_to_device_bbox(page, bbox, page_width, page_height)

for i in range(text_page.count_chars()):
text_chars = {
"chars": [],
"page": page_idx,
"rotation": page.get_rotation(),
"bbox": pdfium_page_bbox_to_device_bbox(page, bbox, page_width, page_height)
}

prev_bbox = None
x_gaps = decimal.Decimal(0)
y_gaps = decimal.Decimal(0)
total_chars = text_page.count_chars()
for i in range(total_chars):
char = pdfium_c.FPDFText_GetUnicode(text_page, i)
char = chr(char)
fontsize = pdfium_c.FPDFText_GetFontSize(text_page, i)
Expand All @@ -31,8 +39,8 @@ def get_pdfium_chars(pdf_path):
rotation = pdfium_c.FPDFText_GetCharAngle(text_page, i)
rotation = rotation * 180 / math.pi # convert from radians to degrees
coords = text_page.get_charbox(i, loose=True)
#device_coords = page_bbox_to_device_bbox(coords, page_width, page_height, normalize=True)
device_coords = pdfium_page_bbox_to_device_bbox(page, coords, page_width, page_height, normalize=True)

char_info = {
"font": {
"size": fontsize,
Expand All @@ -42,10 +50,18 @@ def get_pdfium_chars(pdf_path):
},
"rotation": rotation,
"char": char,
"origin": coords,
"bbox": device_coords,
"char_idx": i
}
text_chars["chars"].append(char_info)

if prev_bbox:
x_gaps += decimal.Decimal(device_coords[0] - prev_bbox[2])
y_gaps += decimal.Decimal(device_coords[1] - prev_bbox[3])
prev_bbox = device_coords

text_chars["avg_x_gap"] = float(x_gaps / total_chars) if total_chars > 0 else 0
text_chars["avg_y_gap"] = float(y_gaps / total_chars) if total_chars > 0 else 0
text_chars["total_chars"] = total_chars
blocks.append(text_chars)
return blocks
20 changes: 1 addition & 19 deletions pdftext/pdf/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,30 +67,12 @@ def page_to_device(page, x, y, page_width, page_height):
return x, y


def page_bbox_to_device_bbox(pdf_bbox, page_width, page_height, normalize=False):
left, bottom, right, top = pdf_bbox

device_top = page_height - bottom
device_bottom = page_height - top
if normalize:
device_bbox = [left / page_width, device_top / page_height, right / page_width, device_bottom / page_height]
else:
device_bbox = [left, device_top, right, device_bottom]
return device_bbox


def pdfium_page_bbox_to_device_bbox2(page, bbox, page_width, page_height, normalize=False):
dev_bbox = page_to_device(page, *bbox[:2], normalize=normalize) + page_to_device(page, *bbox[2:], normalize=normalize)
dev_bbox = (dev_bbox[0], dev_bbox[3], dev_bbox[2], dev_bbox[1]) # Convert to ltrb
return dev_bbox


def pdfium_page_bbox_to_device_bbox(page, bbox, page_width, page_height, normalize=False):
bbox_width = bbox[2] - bbox[0]
bbox_height = bbox[3] - bbox[1]
left_bottom = page_to_device(page, *bbox[:2], page_width, page_height)

dev_bbox = (left_bottom[0], left_bottom[1] - bbox_height, left_bottom[0] + bbox_width, left_bottom[1]) # Convert to ltrb
dev_bbox = [left_bottom[0], left_bottom[1] - bbox_height, left_bottom[0] + bbox_width, left_bottom[1]] # Convert to ltrb
if normalize:
dev_bbox = [dev_bbox[0] / page_width, dev_bbox[1] / page_height, dev_bbox[2] / page_width, dev_bbox[3] / page_height]
return dev_bbox
Loading

0 comments on commit 7353d62

Please sign in to comment.