Skip to content

Commit

Permalink
Improve speed
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Apr 24, 2024
1 parent 13e2722 commit d4721bf
Show file tree
Hide file tree
Showing 9 changed files with 340 additions and 40 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ temp.txt
temp.json
notebooks
results
.DS_Store

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down
20 changes: 19 additions & 1 deletion benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import fitz as pymupdf
import datasets
import pdfplumber

from pdftext.extraction import dictionary_output
from pdftext.settings import settings
Expand All @@ -22,6 +23,16 @@ def pymupdf_inference(pdf_path):
return pages


def pdfplumber_inference(pdf_path):
    """Extract the text of every page in a PDF using pdfplumber.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        A list with one entry per page, in page order, holding whatever
        ``page.extract_text()`` returned for that page.
    """
    # The context manager closes the document once every page has been read.
    with pdfplumber.open(pdf_path) as pdf:
        # Iterate the pages directly instead of indexing through range(len(...)).
        pages = [page.extract_text() for page in pdf.pages]
    return pages


def main():
parser = argparse.ArgumentParser(description="Benchmark pdf extraction.")
parser.add_argument("--result_path", type=str, help="Path to the output text file, defaults to stdout", default=None)
Expand All @@ -35,6 +46,7 @@ def main():

mu_times = []
pdftext_times = []
pdfplumber_times = []
for i in range(len(dataset)):
row = dataset[i]
pdf = row["pdf"]
Expand All @@ -52,12 +64,18 @@ def main():
pdftext_pages = dictionary_output(pdf_path)
pdftext_times.append(time.time() - start)

start = time.time()
pdfplumber_pages = pdfplumber_inference(pdf_path)
pdfplumber_times.append(time.time() - start)

print(f"MuPDF avg time: {mean(mu_times):.2f}")
print(f"pdfplumber avg time: {mean(pdfplumber_times):.2f}")
print(f"pdftext avg time: {mean(pdftext_times):.2f}")

results = {
"mu_times": mu_times,
"pdftext_times": pdftext_times
"pdftext_times": pdftext_times,
"pdfplumber_times": pdfplumber_times
}

result_path = args.result_path
Expand Down
2 changes: 1 addition & 1 deletion extract_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def main():
if args.out_path is None:
print(text)
else:
with open(args.out_path, "w") as f:
with open(args.out_path, "w+") as f:
f.write(text)


Expand Down
27 changes: 10 additions & 17 deletions pdftext/extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,26 +25,19 @@ def plain_text_output(pdf_path, sort=False):

def dictionary_output(pdf_path, sort=False):
pages = _get_pages(pdf_path)
merged_pages = []
for page in pages:
merged_page = {
"page_idx": page["page"],
"rotation": page["rotation"],
"bbox": page["bbox"],
"blocks": []
}
for block in page["blocks"]:
merged_lines = []
bad_keys = [key for key in block.keys() if key not in ["lines", "bbox"]]
for key in bad_keys:
del block[key]
for line in block["lines"]:
chars = [s["chars"] for s in line["spans"]]
chars = chain.from_iterable(chars)
line["chars"] = chars
del line["spans"]
line["bbox"] = unnormalize_bbox(line["bbox"], page["bbox"])
block["lines"] = merged_lines
bad_keys = [key for key in line.keys() if key not in ["chars", "bbox"]]
for key in bad_keys:
del line[key]
for char in line["chars"]:
char["bbox"] = unnormalize_bbox(char["bbox"], page["bbox"])
block["bbox"] = unnormalize_bbox(block["bbox"], page["bbox"])
merged_page["blocks"].append(block)
if sort:
merged_page["blocks"] = sort_blocks(merged_page["blocks"])
merged_pages.append(merged_page)
return merged_pages
page["blocks"] = sort_blocks(page["blocks"])
return pages
31 changes: 21 additions & 10 deletions pdftext/inference.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
import operator
from collections import defaultdict
from itertools import chain

from pdftext.pdf.utils import SPACES, TABS, LINE_BREAKS, HYPHEN
from pdftext.utils import replace_zero

Expand Down Expand Up @@ -65,7 +68,7 @@ def infer_single_page(text_chars):
for i, char_info in enumerate(text_chars["chars"]):
if prev_char:
training_row = create_training_row(char_info, prev_char, span, line, block)
training_row = [v for k, v in sorted(training_row.items(), key=lambda x: x[0])]
training_row = [v for k, v in sorted(training_row.items(), key=operator.itemgetter(0))]

prediction = yield training_row
if prediction == 0:
Expand All @@ -75,11 +78,15 @@ def infer_single_page(text_chars):
span = defaultdict(list)
elif prediction == 2:
line["spans"].append(span)
line["chars"] = list(chain.from_iterable([s["chars"] for s in line["spans"]]))
del line["spans"]
block["lines"].append(line)
line = defaultdict(list)
span = defaultdict(list)
else:
line["spans"].append(span)
line["chars"] = list(chain.from_iterable([s["chars"] for s in line["spans"]]))
del line["spans"]
block["lines"].append(line)
blocks["blocks"].append(block)
block = defaultdict(list)
Expand All @@ -92,11 +99,12 @@ def infer_single_page(text_chars):
block = update_current(block, char_info)

prev_char = char_info
if len(span) > 0:
line["spans"].append(span)
if len(line) > 0:
if len(span["chars"]) > 0:
line["chars"] = list(chain.from_iterable([s["chars"] for s in line["spans"]]))
del line["spans"]
if len(line["chars"]) > 0:
block["lines"].append(line)
if len(block) > 0:
if len(block["lines"]) > 0:
blocks["blocks"].append(block)

blocks["page"] = text_chars["page"]
Expand All @@ -108,7 +116,7 @@ def infer_single_page(text_chars):
def inference(text_chars, model):
# Create generators and get first training row from each
generators = [infer_single_page(text_page) for text_page in text_chars]
next_prediction = {idx: next(gen) for idx, gen in enumerate(generators)}
next_prediction = {}

page_blocks = {}
while len(page_blocks) < len(generators):
Expand All @@ -118,8 +126,11 @@ def inference(text_chars, model):
continue

try:
training_row = page_generator.send(next_prediction[page_idx])
del next_prediction[page_idx]
if page_idx not in next_prediction:
training_row = next(page_generator)
else:
training_row = page_generator.send(next_prediction[page_idx])
del next_prediction[page_idx]
training_data[page_idx] = training_row
except StopIteration as e:
blocks = e.value
Expand All @@ -128,14 +139,14 @@ def inference(text_chars, model):
if len(page_blocks) == len(generators):
break

training_list = sorted(training_data.items(), key=lambda x: x[0])
training_list = sorted(training_data.items(), key=operator.itemgetter(0))
training_rows = [tl[1] for tl in training_list]
training_idxs = [tl[0] for tl in training_list]

predictions = model.predict(training_rows)
for pred, page_idx in zip(predictions, training_idxs):
next_prediction[page_idx] = pred
page_blocks = sorted(page_blocks.items(), key=lambda x: x[0])
page_blocks = sorted(page_blocks.items(), key=operator.itemgetter(0))
page_blocks = [p[1] for p in page_blocks]
assert len(page_blocks) == len(text_chars)
return page_blocks
10 changes: 7 additions & 3 deletions pdftext/pdf/chars.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import math
from collections import defaultdict

from pdftext.pdf.utils import get_fontname, page_to_device, page_bbox_to_device_bbox
from pdftext.pdf.utils import get_fontname, page_to_device, page_bbox_to_device_bbox, pdfium_page_bbox_to_device_bbox
import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c

Expand All @@ -17,7 +17,10 @@ def get_pdfium_chars(pdf_path):
text_chars["page"] = page_idx
text_chars["rotation"] = page.get_rotation()
bbox = page.get_bbox()
text_chars["bbox"] = page_bbox_to_device_bbox(page, bbox, normalize=False)
page_width = math.ceil(bbox[2] - bbox[0])
page_height = math.ceil(abs(bbox[1] - bbox[3]))
#text_chars["bbox"] = page_bbox_to_device_bbox(bbox, page_width, page_height)
text_chars["bbox"] = pdfium_page_bbox_to_device_bbox(page, bbox, page_width, page_height)

for i in range(text_page.count_chars()):
char = pdfium_c.FPDFText_GetUnicode(text_page, i)
Expand All @@ -28,7 +31,8 @@ def get_pdfium_chars(pdf_path):
rotation = pdfium_c.FPDFText_GetCharAngle(text_page, i)
rotation = rotation * 180 / math.pi # convert from radians to degrees
coords = text_page.get_charbox(i, loose=True)
device_coords = page_bbox_to_device_bbox(page, coords)
#device_coords = page_bbox_to_device_bbox(coords, page_width, page_height, normalize=True)
device_coords = pdfium_page_bbox_to_device_bbox(page, coords, page_width, page_height, normalize=True)
char_info = {
"font": {
"size": fontsize,
Expand Down
34 changes: 27 additions & 7 deletions pdftext/pdf/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,24 +53,44 @@ def get_fontname(textpage, char_index):
return decoded, flag_buffer.value


def page_to_device(page, x, y, normalize=True):
def page_to_device(page, x, y, page_width, page_height):
device_x = ctypes.c_int()
device_y = ctypes.c_int()
device_x_ptr = ctypes.pointer(device_x)
device_y_ptr = ctypes.pointer(device_y)
rotation = pdfium_c.FPDFPage_GetRotation(page)
width = math.ceil(page.get_width())
height = math.ceil(page.get_height())
width = math.ceil(page_width)
height = math.ceil(page_height)
pdfium_c.FPDF_PageToDevice(page, 0, 0, width, height, rotation, x, y, device_x_ptr, device_y_ptr)
x = device_x.value
y = device_y.value
if normalize:
x = x / width # Normalise to 0-1
y = y / height # Normalise to 0-1
return x, y


def page_bbox_to_device_bbox(page, bbox, normalize=True):
def page_bbox_to_device_bbox(pdf_bbox, page_width, page_height, normalize=False):
    """Convert a page-space bbox to a device-space (left, top, right, bottom) bbox.

    The y axis is flipped by subtracting from ``page_height``; x values are
    passed through unchanged.

    Args:
        pdf_bbox: Four values unpacked as ``(left, bottom, right, top)``.
        page_width: Page width, used as the x divisor when normalizing.
        page_height: Page height, used for the y flip and as the y divisor
            when normalizing.
        normalize: When true, divide x values by ``page_width`` and y values
            by ``page_height``.

    Returns:
        A list ``[left, top, right, bottom]`` in device space, so that
        ``top <= bottom`` whenever the input has ``bottom <= top``.
    """
    left, bottom, right, top = pdf_bbox

    # Flipping the y axis turns the page-space top edge into the smaller
    # device-space y value. The earlier version had these two swapped and so
    # returned the box with top and bottom reversed.
    device_top = page_height - top
    device_bottom = page_height - bottom

    if normalize:
        return [
            left / page_width,
            device_top / page_height,
            right / page_width,
            device_bottom / page_height,
        ]
    return [left, device_top, right, device_bottom]


def pdfium_page_bbox_to_device_bbox2(page, bbox, page_width, page_height, normalize=False):
    """Convert a page-space bbox to device space by projecting both corners.

    Both corners of ``bbox`` are mapped through ``page_to_device`` and the
    results are reordered into (left, top, right, bottom).

    Args:
        page: Page handle forwarded to ``page_to_device``.
        bbox: Four values; the first two and last two are each projected as
            an (x, y) point.
            NOTE(review): assumes (left, bottom, right, top) order - confirm
            against the caller.
        page_width: Page width forwarded to ``page_to_device`` and used as the
            x divisor when normalizing.
        page_height: Page height forwarded to ``page_to_device`` and used as
            the y divisor when normalizing.
        normalize: When true, divide x values by ``page_width`` and y values
            by ``page_height``.

    Returns:
        A 4-tuple ``(left, top, right, bottom)`` in device space.
    """
    # page_to_device takes the page dimensions and has no ``normalize``
    # keyword, so passing normalize= raised TypeError. Normalization is
    # applied here instead.
    left, bottom = page_to_device(page, *bbox[:2], page_width, page_height)
    right, top = page_to_device(page, *bbox[2:], page_width, page_height)

    dev_bbox = (left, top, right, bottom)  # Convert to ltrb
    if normalize:
        dev_bbox = (
            left / page_width,
            top / page_height,
            right / page_width,
            bottom / page_height,
        )
    return dev_bbox


def pdfium_page_bbox_to_device_bbox(page, bbox, page_width, page_height, normalize=False):
    """Convert a page-space bbox to a device-space (left, top, right, bottom) bbox.

    Only the first corner of ``bbox`` is projected through ``page_to_device``;
    the opposite corner is derived from the bbox's own width and height.

    Args:
        page: Page handle forwarded to ``page_to_device``.
        bbox: Four values; ``bbox[:2]`` is the projected corner and
            ``bbox[2:]`` is used only to measure width and height.
        page_width: Page width forwarded to ``page_to_device`` and used as the
            x divisor when normalizing.
        page_height: Page height forwarded to ``page_to_device`` and used as
            the y divisor when normalizing.
        normalize: When true, divide x values by ``page_width`` and y values
            by ``page_height``.

    Returns:
        A tuple ``(left, top, right, bottom)`` when ``normalize`` is false,
        or a list of the same four values scaled by the page size when true.
    """
    width = bbox[2] - bbox[0]
    height = bbox[3] - bbox[1]
    anchor_x, anchor_y = page_to_device(page, *bbox[:2], page_width, page_height)

    # The projected corner supplies the left and bottom edges; the other two
    # edges are offset from it by the bbox size. Convert to ltrb.
    ltrb = (anchor_x, anchor_y - height, anchor_x + width, anchor_y)
    if not normalize:
        return ltrb
    return [
        ltrb[0] / page_width,
        ltrb[1] / page_height,
        ltrb[2] / page_width,
        ltrb[3] / page_height,
    ]
Loading

0 comments on commit d4721bf

Please sign in to comment.