Skip to content

Commit

Permalink
Fix benchmark
Browse files (browse the repository at this point in the history)
  • Loading branch information
VikParuchuri committed Apr 29, 2024
1 parent 2c163db commit dc483f9
Showing 1 changed file with 10 additions and 4 deletions.
14 changes: 10 additions & 4 deletions benchmark.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -13,6 +13,7 @@
from rapidfuzz import fuzz
import tabulate
from tqdm import tqdm
import pypdfium2 as pdfium

from pdftext.extraction import paginated_plain_text_output
from pdftext.model import get_model
Expand All @@ -31,7 +32,7 @@ def pymupdf_inference(pdf_path):
for span in line["spans"]:
text += span["text"]
text = text.rstrip() + "\n"
text = text.rstrip() + "\n\n"
text = text.rstrip() + "\n"
pages.append(text)
return pages

Expand All @@ -49,6 +50,11 @@ def pdfplumber_inference(pdf_path):
return pages


def pdftext_inference(pdf_path, model):
    """Run pdftext extraction on the PDF at ``pdf_path``.

    The file is opened as a pypdfium2 ``PdfDocument`` and passed, together
    with ``model``, to ``paginated_plain_text_output``.

    Args:
        pdf_path: Path of the PDF file to open.
        model: Forwarded unchanged as the ``model`` keyword argument.

    Returns:
        Whatever ``paginated_plain_text_output`` returns for the document.
    """
    document = pdfium.PdfDocument(pdf_path)
    pages = paginated_plain_text_output(document, model=model)
    return pages


def compare_docs(doc1: str, doc2: str):
    """Score two document strings against each other with ``fuzz.ratio``.

    Args:
        doc1: First document text.
        doc2: Second document text.

    Returns:
        The value produced by rapidfuzz's ``fuzz.ratio(doc1, doc2)``.
    """
    similarity = fuzz.ratio(doc1, doc2)
    return similarity

Expand All @@ -70,7 +76,7 @@ def main():
times_tools = ["pymupdf", "pdftext", "pdfplumber"]
alignment_tools = ["pdftext", "pdfplumber"]
if args.pdftext_only:
times_tools = ["pdftext", "pymupdf"]
times_tools = ["pymupdf", "pdftext"]
alignment_tools = ["pdftext"]
model = get_model()
for i in tqdm(range(len(dataset)), desc="Benchmarking"):
Expand All @@ -82,8 +88,8 @@ def main():
f.seek(0)
pdf_path = f.name

pdftext_inference = partial(paginated_plain_text_output, model=model)
inference_funcs = [pymupdf_inference, pdftext_inference, pdfplumber_inference]
pdftext_inference_model = partial(pdftext_inference, model=model)
inference_funcs = [pymupdf_inference, pdftext_inference_model, pdfplumber_inference]
for tool, inference_func in zip(times_tools, inference_funcs):
start = time.time()
pages = inference_func(pdf_path)
Expand Down

0 comments on commit dc483f9

Please sign in to comment.