# extract_text.py
# Forked from VikParuchuri/pdftext.
import argparse
import json
from pdftext.extraction import plain_text_output, dictionary_output
def main():
    """Command-line entry point: extract text from a PDF and emit it.

    Parses the command line for the PDF path and options, then calls
    ``plain_text_output`` (default) or ``dictionary_output`` (serialized with
    ``json.dumps``) with the requested ``sort`` flag. The result is printed
    to stdout, or written to ``--out_path`` when that option is given.

    Raises:
        SystemExit: raised by argparse on invalid arguments, including an
            ``--output_type`` other than ``plain_text`` or ``json``.
    """
    parser = argparse.ArgumentParser(description="Extract plain text from PDF. Not guaranteed to be in order.")
    parser.add_argument("pdf_path", type=str, help="Path to the PDF file")
    parser.add_argument("--out_path", type=str, help="Path to the output text file, defaults to stdout", default=None)
    # Validate with ``choices`` rather than ``assert``: asserts are stripped
    # under ``python -O``, which would let an invalid value fall through and
    # leave ``text`` unbound. argparse reports a usage error instead.
    parser.add_argument(
        "--output_type",
        type=str,
        choices=["plain_text", "json"],
        help="Type of output to generate",
        default="plain_text",
    )
    parser.add_argument("--sort", action="store_true", help="Attempt to sort the text by reading order", default=False)
    args = parser.parse_args()

    if args.output_type == "json":
        text = json.dumps(dictionary_output(args.pdf_path, sort=args.sort))
    else:
        # ``choices`` guarantees the only other value is "plain_text".
        text = plain_text_output(args.pdf_path, sort=args.sort)

    if args.out_path is None:
        print(text)
    else:
        # Explicit UTF-8 so non-ASCII PDF text does not fail on platforms
        # whose default locale encoding cannot represent it.
        with open(args.out_path, "w", encoding="utf-8") as f:
            f.write(text)
# Run the CLI only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()