-
Notifications
You must be signed in to change notification settings - Fork 38
/
Copy pathbenchmark.py
92 lines (71 loc) · 2.48 KB
/
benchmark.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import argparse
import tempfile
import time
from statistics import mean
import os
import json
import fitz as pymupdf
import datasets
import pdfplumber
from pdftext.extraction import dictionary_output
from pdftext.settings import settings
def pymupdf_inference(pdf_path):
    """Extract every page of a PDF with PyMuPDF in "dict" format.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A list with one ``page.get_text("dict")`` result per page, in page
        order.
    """
    # Open the document as a context manager so it is closed as soon as
    # extraction finishes; otherwise one open handle is left behind for every
    # file the benchmark processes.
    with pymupdf.open(pdf_path) as doc:
        return [page.get_text("dict") for page in doc]
def pdfplumber_inference(pdf_path):
    """Extract the text of every page of a PDF using pdfplumber.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A list holding the ``extract_text()`` result of each page, in page
        order.
    """
    with pdfplumber.open(pdf_path) as document:
        return [page.extract_text() for page in document.pages]
def _timed(func, *args):
    """Return the seconds elapsed while running ``func(*args)``.

    The call's return value is discarded; only the duration matters here.
    """
    # perf_counter is monotonic and high resolution, so the measured interval
    # cannot be skewed by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    func(*args)
    return time.perf_counter() - start


def main():
    """Benchmark PyMuPDF, pdftext and pdfplumber on the benchmark dataset.

    Command-line arguments:
        --result_path: Directory that receives ``results.json``. When omitted,
            ``settings.RESULTS_FOLDER`` is used.
        --max: Upper bound on the number of dataset rows loaded, applied as a
            ``train[:N]`` split slice. A value of 0 is treated as "not set".

    For each dataset row, the bytes under the ``"pdf"`` key are written to a
    temporary ``.pdf`` file and each extractor is timed on that file. The
    average time per extractor is printed, and the raw per-document timings are
    written as JSON to ``results.json`` inside the result directory.
    """
    parser = argparse.ArgumentParser(description="Benchmark pdf extraction.")
    parser.add_argument(
        "--result_path",
        type=str,
        # The value is used as a directory (see os.makedirs below), not as a
        # text file, and nothing is ever written to stdout in its place.
        help="Directory to write results.json into, defaults to settings.RESULTS_FOLDER",
        default=None,
    )
    parser.add_argument("--max", type=int, help="Maximum number of pages to process.", default=None)
    args = parser.parse_args()

    split = "train"
    if args.max:
        split = f"train[:{args.max}]"
    dataset = datasets.load_dataset(settings.BENCH_DATASET_NAME, split=split)

    mu_times = []
    pdftext_times = []
    pdfplumber_times = []
    for row in dataset:
        # NOTE(review): assumes row["pdf"] holds the raw PDF bytes, since it is
        # written to a binary-mode temporary file - confirm against the dataset.
        pdf = row["pdf"]
        with tempfile.NamedTemporaryFile(suffix=".pdf") as f:
            f.write(pdf)
            # The extractors reopen the file by name, so the buffered bytes
            # must reach the file before any of them runs.
            f.flush()
            pdf_path = f.name

            mu_times.append(_timed(pymupdf_inference, pdf_path))
            pdftext_times.append(_timed(dictionary_output, pdf_path))
            pdfplumber_times.append(_timed(pdfplumber_inference, pdf_path))

    if mu_times:
        print(f"MuPDF avg time: {mean(mu_times):.2f}")
        print(f"pdfplumber avg time: {mean(pdfplumber_times):.2f}")
        print(f"pdftext avg time: {mean(pdftext_times):.2f}")
    else:
        # statistics.mean raises StatisticsError on empty input; an empty
        # dataset should produce an empty report instead of a crash.
        print("No documents were benchmarked; skipping averages.")

    results = {
        "mu_times": mu_times,
        "pdftext_times": pdftext_times,
        "pdfplumber_times": pdfplumber_times,
    }

    result_path = args.result_path
    if result_path is None:
        result_path = settings.RESULTS_FOLDER

    os.makedirs(result_path, exist_ok=True)

    with open(os.path.join(result_path, "results.json"), "w") as f:
        json.dump(results, f)
# Run the benchmark only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()