Skip to content

Commit

Permalink
process_pdf only returns text (file is saved by process_batch)
Browse files Browse the repository at this point in the history
Enables interacting with the API without having to store files
  • Loading branch information
gchers committed Mar 19, 2021
1 parent 60ad3f3 commit 5571763
Showing 1 changed file with 44 additions and 42 deletions.
86 changes: 44 additions & 42 deletions grobid_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,22 @@ def _load_config(self, path="./config.json"):
else:
print("GROBID server is up and running")

def _output_file_name(self, pdf_file, input_path, output):
    """Return the path of the ``.tei.xml`` file that corresponds to *pdf_file*.

    Args:
        pdf_file: Path of the PDF input file.
        input_path: Directory the PDF path is made relative to; only
            consulted when *output* is given.
        output: Destination directory, or ``None`` to place the result in
            the same directory as the PDF.

    Returns:
        The target path: the PDF's extension replaced by ``.tei.xml``,
        located either under *output* (keeping the PDF's path relative to
        *input_path*) or alongside the PDF itself.
    """
    if output is None:
        # No output directory: the result sits beside the PDF. ntpath does
        # the directory/base-name split so that Windows-style paths are
        # handled too.
        target_dir = ntpath.dirname(pdf_file)
        relative_pdf = ntpath.basename(pdf_file)
    else:
        # Keep the PDF's location relative to input_path under output.
        target_dir = output
        absolute_pdf = os.path.abspath(pdf_file)
        relative_pdf = str(os.path.relpath(absolute_pdf, input_path))

    stem, _extension = os.path.splitext(relative_pdf)
    return os.path.join(target_dir, stem + ".tei.xml")

def process(
self,
service,
Expand Down Expand Up @@ -138,60 +154,57 @@ def process_batch(
):
if verbose:
print(len(pdf_files), "PDF files to process in current batch")

# with concurrent.futures.ThreadPoolExecutor(max_workers=n) as executor:
with concurrent.futures.ProcessPoolExecutor(max_workers=n) as executor:
results = []
for pdf_file in pdf_files:
executor.submit(
# check if TEI file is already produced
filename = self._output_file_name(pdf_file, input_path, output)
if not force and os.path.isfile(filename):
print(f"{filename} already exist, skipping... (use --force"
" to reprocess pdf input files)")
continue

r = executor.submit(
self.process_pdf,
service,
pdf_file,
input_path,
output,
generateIDs,
consolidate_header,
consolidate_citations,
include_raw_citations,
include_raw_affiliations,
teiCoordinates,
force,
verbose,
)
results.append(r)

for r in concurrent.futures.as_completed(results):
pdf_file, status, text = r.result()
filename = self._output_file_name(pdf_file, input_path, output)

if text is None:
print(f"Processing of {pdf_file} failed with error {str(status)}")
else:
# writing TEI file
try:
pathlib.Path(os.path.dirname(filename)).mkdir(parents=True, exist_ok=True)
with open(filename,'w',encoding='utf8') as tei_file:
tei_file.write(text)
except OSError:
print("Writing resulting TEI XML file {filename} failed")

def process_pdf(
self,
service,
pdf_file,
input_path,
output,
generateIDs,
consolidate_header,
consolidate_citations,
include_raw_citations,
include_raw_affiliations,
teiCoordinates,
force,
verbose=False,
):
# check if TEI file is already produced
# we use ntpath here to be sure it will work on Windows too
if output is not None:
pdf_file_name = str(os.path.relpath(os.path.abspath(pdf_file), input_path))
filename = os.path.join(
output, os.path.splitext(pdf_file_name)[0] + ".tei.xml"
)
else:
pdf_file_name = ntpath.basename(pdf_file)
filename = os.path.join(
ntpath.dirname(pdf_file),
os.path.splitext(pdf_file_name)[0] + ".tei.xml",
)

if not force and os.path.isfile(filename):
print(
filename,
"already exist, skipping... (use --force to reprocess pdf input files)",
)
return

files = {
"input": (
Expand Down Expand Up @@ -241,19 +254,8 @@ def process_pdf(
force,
teiCoordinates,
)
elif status != 200:
print("Processing failed with error " + str(status))
else:
# writing TEI file
try:
pathlib.Path(os.path.dirname(filename)).mkdir(
parents=True, exist_ok=True
)
with io.open(filename, "w", encoding="utf8") as tei_file:
tei_file.write(res.text)
except OSError:
print("Writing resulting TEI XML file %s failed" % filename)
pass

return (pdf_file, status, res.text)


if __name__ == "__main__":
Expand Down

0 comments on commit 5571763

Please sign in to comment.