Skip to content

Commit

Permalink
add process citation list call
Browse files Browse the repository at this point in the history
  • Loading branch information
kermitt2 committed Jul 18, 2021
1 parent 4bce8b5 commit 55afc5f
Show file tree
Hide file tree
Showing 13 changed files with 3,593 additions and 1,193 deletions.
26 changes: 14 additions & 12 deletions Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,32 +17,33 @@ Get the github repo:
```
git clone https://github.com/kermitt2/grobid_client_python
cd grobid_client_python
python setup.py install
python3 setup.py install
```



There is nothing more to do to start using the Python command lines; see the next section.

## Usage and options

```
usage: grobid_client [-h] [--input INPUT] [--output OUTPUT]
[--config CONFIG] [--n N] [--generateIDs]
[--consolidate_header] [--consolidate_citations]
[--include_raw_citations] [--include_raw_affiliations]
[--force] [--teiCoordinates] [--verbose]
service
usage: grobid_client [-h] [--input INPUT] [--output OUTPUT] [--config CONFIG]
[--n N] [--generateIDs] [--consolidate_header]
[--consolidate_citations] [--include_raw_citations]
[--include_raw_affiliations] [--force] [--teiCoordinates]
[--verbose]
service
Client for GROBID services
positional arguments:
service one of [processFulltextDocument,
processHeaderDocument, processReferences]
service one of ['processFulltextDocument',
'processHeaderDocument', 'processReferences',
'processCitationList']
optional arguments:
-h, --help show this help message and exit
--input INPUT path to the directory containing PDF to process
--input INPUT path to the directory containing PDF files or .txt
(for processCitationList only, one reference per line)
to process
--output OUTPUT path to the directory where to put the results
(optional)
--config CONFIG path to the config file, default is ./config.json
Expand All @@ -65,6 +66,7 @@ optional arguments:
the extracted elements
--verbose print information about processed files in the console
```

Examples:
Expand Down
107 changes: 79 additions & 28 deletions grobid_client/grobid_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,18 +54,18 @@ def _load_config(self, path="./config.json"):
else:
print("GROBID server is up and running")

def _output_file_name(self, pdf_file, input_path, output):
def _output_file_name(self, input_file, input_path, output):
# we use ntpath here to be sure it will work on Windows too
if output is not None:
pdf_file_name = str(os.path.relpath(os.path.abspath(pdf_file), input_path))
input_file_name = str(os.path.relpath(os.path.abspath(input_file), input_path))
filename = os.path.join(
output, os.path.splitext(pdf_file_name)[0] + ".tei.xml"
output, os.path.splitext(input_file_name)[0] + ".tei.xml"
)
else:
pdf_file_name = ntpath.basename(pdf_file)
input_file_name = ntpath.basename(input_file)
filename = os.path.join(
ntpath.dirname(pdf_file),
os.path.splitext(pdf_file_name)[0] + ".tei.xml",
ntpath.dirname(input_file),
os.path.splitext(input_file_name)[0] + ".tei.xml",
)

return filename
Expand All @@ -86,23 +86,24 @@ def process(
verbose=False,
):
batch_size_pdf = self.config["batch_size"]
pdf_files = []
input_files = []

for (dirpath, dirnames, filenames) in os.walk(input_path):
for filename in filenames:
if filename.endswith(".pdf") or filename.endswith(".PDF"):
if filename.endswith(".pdf") or filename.endswith(".PDF") or \
(service == 'processCitationList' and (filename.endswith(".txt") or filename.endswith(".TXT"))):
if verbose:
try:
print(filename)
except Exception:
# may happen on linux see https://stackoverflow.com/questions/27366479/python-3-os-walk-file-paths-unicodeencodeerror-utf-8-codec-cant-encode-s
pass
pdf_files.append(os.sep.join([dirpath, filename]))
input_files.append(os.sep.join([dirpath, filename]))

if len(pdf_files) == batch_size_pdf:
if len(input_files) == batch_size_pdf:
self.process_batch(
service,
pdf_files,
input_files,
input_path,
output,
n,
Expand All @@ -115,13 +116,13 @@ def process(
force,
verbose,
)
pdf_files = []
input_files = []

# last batch
if len(pdf_files) > 0:
if len(input_files) > 0:
self.process_batch(
service,
pdf_files,
input_files,
input_path,
output,
n,
Expand All @@ -138,7 +139,7 @@ def process(
def process_batch(
self,
service,
pdf_files,
input_files,
input_path,
output,
n,
Expand All @@ -152,37 +153,41 @@ def process_batch(
verbose=False,
):
if verbose:
print(len(pdf_files), "PDF files to process in current batch")
print(len(input_files), "files to process in current batch")

# with concurrent.futures.ThreadPoolExecutor(max_workers=n) as executor:
with concurrent.futures.ProcessPoolExecutor(max_workers=n) as executor:
results = []
for pdf_file in pdf_files:
for input_file in input_files:
# check if TEI file is already produced
filename = self._output_file_name(pdf_file, input_path, output)
filename = self._output_file_name(input_file, input_path, output)
if not force and os.path.isfile(filename):
print(filename, "already exist, skipping... (use --force to reprocess pdf input files)")
continue

selected_process = self.process_pdf
if service == 'processCitationList':
selected_process = self.process_txt

r = executor.submit(
self.process_pdf,
selected_process,
service,
pdf_file,
input_file,
generateIDs,
consolidate_header,
consolidate_citations,
include_raw_citations,
include_raw_affiliations,
teiCoordinates,
)
teiCoordinates)

results.append(r)

for r in concurrent.futures.as_completed(results):
pdf_file, status, text = r.result()
filename = self._output_file_name(pdf_file, input_path, output)
input_file, status, text = r.result()
filename = self._output_file_name(input_file, input_path, output)

if text is None:
print("Processing of", pdf_file, "failed with error", str(status))
print("Processing of", input_file, "failed with error", str(status))
else:
# writing TEI file
try:
Expand All @@ -203,7 +208,6 @@ def process_pdf(
include_raw_affiliations,
teiCoordinates,
):

files = {
"input": (
pdf_file,
Expand Down Expand Up @@ -252,21 +256,68 @@ def process_pdf(

return (pdf_file, status, res.text)

def process_txt(
    self,
    service,
    txt_file,
    generateIDs,
    consolidate_header,
    consolidate_citations,
    include_raw_citations,
    include_raw_affiliations,
    teiCoordinates,
):
    """Send a .txt file of raw citations to the GROBID citation service.

    The file is expected to contain one raw bibliographical reference per
    line (as used by the ``processCitationList`` service). The unused
    parameters (``generateIDs``, ``consolidate_header``,
    ``include_raw_affiliations``, ``teiCoordinates``) are accepted so the
    signature mirrors ``process_pdf`` and both can be dispatched
    interchangeably from ``process_batch``.

    Returns:
        tuple: ``(txt_file, status, text)`` where ``status`` is the HTTP
        status code and ``text`` is the response body (presumably TEI XML
        on success — the caller writes it to the output file).
    """
    # Read one raw reference string per line. An explicit encoding keeps
    # the result independent of the platform's default locale encoding.
    with open(txt_file, encoding="utf-8") as f:
        references = [line.rstrip() for line in f]

    the_url = "http://" + self.config["grobid_server"]
    if len(self.config["grobid_port"]) > 0:
        the_url += ":" + self.config["grobid_port"]
    the_url += "/api/" + service

    # Set the GROBID parameters; only the citation-related options apply
    # to this service.
    the_data = {}
    if consolidate_citations:
        the_data["consolidateCitations"] = "1"
    if include_raw_citations:
        the_data["includeRawCitations"] = "1"
    the_data["citations"] = references

    res, status = self.post(
        url=the_url, data=the_data, headers={"Accept": "text/plain"}
    )

    # 503 means the server is temporarily saturated: wait and retry.
    # Retrying in a loop (instead of recursing) avoids unbounded stack
    # growth during a long outage and avoids re-reading the input file.
    while status == 503:
        time.sleep(self.config["sleep_time"])
        res, status = self.post(
            url=the_url, data=the_data, headers={"Accept": "text/plain"}
        )

    return (txt_file, status, res.text)

def main():
valid_services = [
"processFulltextDocument",
"processHeaderDocument",
"processReferences",
"processCitationList"
]

parser = argparse.ArgumentParser(description="Client for GROBID services")
parser.add_argument(
"service",
help="one of [processFulltextDocument, processHeaderDocument, processReferences]",
help="one of " + str(valid_services),
)
parser.add_argument(
"--input", default=None, help="path to the directory containing PDF to process"
"--input", default=None, help="path to the directory containing PDF files or .txt (for processCitationList only, one reference per line) to process"
)
parser.add_argument(
"--output",
Expand Down
Binary file not shown.
Loading

0 comments on commit 55afc5f

Please sign in to comment.