-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathtestExtraction.py
139 lines (124 loc) · 4.99 KB
/
testExtraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import os
from time import perf_counter
import fitz
import pdftotext
import tika
from tika import parser
import PyPDF2
from nltk.tokenize import RegexpTokenizer
# Absolute paths of the PDF articles handed to every extractor.
# 2002 article - 6 pages with 2 figures and 5 tables
arqPDF1 = "/Volumes/SD-64-Interno/artigosPDFbmc/1471-2318-2-3.pdf"
# 2007 article - 12 pages with 1 figure and 3 tables
arqPDF2 = "/Volumes/SD-64-Interno/artigosPDFbmc/1471-2318-7-23.pdf"
# 2009 article - 9 pages with 2 figures and 3 tables
arqPDF3 = "/Volumes/SD-64-Interno/artigosPDFbmc/1471-2318-9-12.pdf"
# 2016 article - 14 pages with 1 figure
arqPDF4 = "/Volumes/SD-64-Interno/artigosPDFbmc/s12877-016-0361-8.pdf"
# 2019 article - 8 pages with 1 figure and 4 tables
arqPDF5 = "/Volumes/SD-64-Interno/artigosPDFbmc/s12877-019-1283-z.pdf"
# 2019 article - 12 pages with 1 figure
arqPDF6 = "/Volumes/SD-64-Interno/artigosPDFbmc/s12877-019-1372-z.pdf"

# List of files, in the order they are processed
PDFlist = [
    arqPDF1,
    arqPDF2,
    arqPDF3,
    arqPDF4,
    arqPDF5,
    arqPDF6,
]
def saveText(texto, fileName, nameLib):
    """Save the text in a file named "<fileName>-<nameLib>.txt"

    The file is created or overwritten; when fileName carries no directory
    part it lands in the current working directory.

    Arguments:
        texto {str} -- text in str format
        fileName {str} -- filename (without path in this code)
        nameLib {str} -- name of extractor project
    """
    outputName = fileName + "-" + nameLib + ".txt"
    # Explicit UTF-8: text pulled out of PDFs regularly contains characters
    # the platform default encoding cannot represent.
    # The context manager closes the handle even if write() raises.
    with open(outputName, "w", encoding="utf-8") as arq:
        arq.write(texto)
def printMiniReport(texto, fileName, nameLib, timeConversion):
    """Print three summary lines used to compare extractors on one file

    Each line is prefixed with the extractor name and the file name and
    reports, in order: character count, count of word-like tokens
    (regex \\w+) and the extraction time.

    Arguments:
        texto {str} -- text in str format
        fileName {str} -- filename (without path in this code)
        nameLib {str} -- name of extractor project
        timeConversion {float} -- time in seconds
    """
    words = RegexpTokenizer(r'\w+').tokenize(texto)
    # (label, value) pairs, printed in this order.
    rows = (
        (" - Total of chars: ", len(texto)),
        (" - Total of tokens: ", len(words)),
        (" - Extract time in seconds: ", timeConversion),
    )
    for label, value in rows:
        print(nameLib, " - ", fileName, label, str(value))
def extractPDFwithPyMuPDF(arqs):
    """Using PyMuPDF to extract PDF text - https://pypi.org/project/PyMuPDF/

    For each file: joins the plain text of every page, prints a mini report
    and saves the text through saveText with the "PyMuPDF" suffix. The
    measured time covers opening the document and extracting all pages;
    reporting and saving are not timed. A separator line is printed once
    all files are done.

    Arguments:
        arqs {list} -- A list of filenames with path
    """
    for arq in arqs:
        timeIni = perf_counter()
        # Context manager releases the document even if a page fails.
        with fitz.open(arq) as doc:
            pageTexts = []
            for page in doc:
                # Current PyMuPDF exposes get_text; older releases only have
                # the camelCase getText. Use whichever this install provides.
                extractPage = getattr(page, "get_text", None) or page.getText
                pageTexts.append(extractPage("text"))
            # Single join instead of repeated string concatenation.
            textoCompleto = "".join(pageTexts)
            fileName = os.path.basename(arq)
            timeEnd = perf_counter()
        timeTotal = timeEnd - timeIni
        printMiniReport(textoCompleto, fileName, "PyMuPDF", timeTotal)
        saveText(textoCompleto, fileName, "PyMuPDF")
    print("--- PyMuPDF ---")
def extractPDFwithPdfToText(arqs):
    """Using PdfToText to extract PDF text - https://pypi.org/project/pdftotext/

    For each file: joins the text of every page, prints a mini report and
    saves the text through saveText with the "PdfToText" suffix. The
    measured time covers opening the file and extracting all pages;
    reporting and saving are not timed. A separator line is printed once
    all files are done.

    Arguments:
        arqs {list} -- A list of filenames with path
    """
    for arq in arqs:
        timeIni = perf_counter()
        # The with-block already closes the file; no explicit close() needed.
        with open(arq, "rb") as f:
            doc = pdftotext.PDF(f)
            # Iterating the PDF object yields one str per page; a single join
            # replaces the repeated string concatenation.
            textoCompleto = "".join(doc)
        fileName = os.path.basename(arq)
        timeEnd = perf_counter()
        timeTotal = timeEnd - timeIni
        printMiniReport(textoCompleto, fileName, "PdfToText", timeTotal)
        saveText(textoCompleto, fileName, "PdfToText")
    print("--- PdfToText ---")
def extractPDFwithTika(arqs):
    """Using Apache Tika to extract PDF text - https://pypi.org/project/tika/

    For each file: parses it with Tika, prints a mini report and saves the
    extracted text through saveText with the "Tika" suffix. The measured
    time covers the parse call only; reporting and saving are not timed.
    A separator line is printed once all files are done.

    Arguments:
        arqs {list} -- A list of filenames with path
    """
    # the time to load the Tika .jar server impacts the first use
    tika.initVM()
    for arq in arqs:
        timeIni = perf_counter()
        parsed = parser.from_file(arq)
        fileName = os.path.basename(arq)
        timeEnd = perf_counter()
        timeTotal = timeEnd - timeIni
        # "content" can be None or missing when Tika extracts no text;
        # normalise to "" so len() and file.write() downstream do not raise.
        textoCompleto = parsed.get("content") or ""
        printMiniReport(textoCompleto, fileName, "Tika", timeTotal)
        saveText(textoCompleto, fileName, "Tika")
    print("--- Tika ---")
def _extractPyPDF2Text(f):
    """Return the joined text of every page of the open binary PDF file f."""
    # PyPDF2 3.x raises on the legacy PdfFileReader / extractText names, while
    # 1.x only has those; prefer the new names and fall back to the old ones.
    readerClass = getattr(PyPDF2, "PdfReader", None) or PyPDF2.PdfFileReader
    doc = readerClass(f)
    pageTexts = []
    for page in doc.pages:
        extractPage = getattr(page, "extract_text", None) or page.extractText
        pageTexts.append(extractPage())
    # Single join instead of repeated string concatenation.
    return "".join(pageTexts)


def extractPDFwithPyPDF2(arqs):
    """Using PyPDF2 to extract PDF text - https://pypi.org/project/PyPDF2/

    For each file: joins the text of every page, prints a mini report and
    saves the text through saveText with the "PyPDF2" suffix. The measured
    time covers opening the file and extracting all pages; reporting and
    saving are not timed. A separator line is printed once all files are
    done.

    Arguments:
        arqs {list} -- A list of filenames with path
    """
    for arq in arqs:
        timeIni = perf_counter()
        # Pages are read from the file on demand, so extraction must finish
        # before the with-block closes it.
        with open(arq, "rb") as f:
            textoCompleto = _extractPyPDF2Text(f)
        fileName = os.path.basename(arq)
        timeEnd = perf_counter()
        timeTotal = timeEnd - timeIni
        printMiniReport(textoCompleto, fileName, "PyPDF2", timeTotal)
        saveText(textoCompleto, fileName, "PyPDF2")
    print("--- PyPDF2 ---")
if __name__ == "__main__":
    # Show the inputs, then run every extractor over the same file list,
    # in a fixed order.
    print(PDFlist)
    for runExtractor in (
        extractPDFwithPyMuPDF,
        extractPDFwithPdfToText,
        extractPDFwithTika,
        extractPDFwithPyPDF2,
    ):
        runExtractor(PDFlist)