model.py
from imports import *
from mail import *

# Load the pre-trained T5 model and tokenizer used for abstractive summarization.
model = T5ForConditionalGeneration.from_pretrained('t5-small')
tokenizer = T5Tokenizer.from_pretrained('t5-small')
device = torch.device('cpu')

# Make sure the NLTK punkt tokenizer data is available before tokenizing.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
def cleanText(text):
    """Strip handles, URLs and stray characters from a line of extracted text."""
    text = re.sub(r"@[A-Za-z0-9]+", ' ', text)             # remove @-mentions
    text = re.sub(r"https?://[A-Za-z0-9./]+", ' ', text)   # remove URLs
    text = re.sub(r"[^a-zA-Z.!?'0-9 ]", ' ', text)         # keep letters, digits and basic punctuation
    text = re.sub(r"\t", ' ', text)
    text = re.sub(r" +", ' ', text)                        # collapse repeated spaces
    return text
def getSummary(text, tokenizer):
    """Summarize one chunk of text with T5 using beam search."""
    preprocess_text = text.strip().replace("\n", "")
    t5_prepared_text = "summarize: " + preprocess_text
    tokenized_text = tokenizer.encode(t5_prepared_text, return_tensors="pt").to(device)
    summary_ids = model.generate(tokenized_text,
                                 num_beams=5,
                                 no_repeat_ngram_size=2,
                                 min_length=30,
                                 max_length=96,
                                 early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)
def sentenceCorrection(text):
    """Run each sentence through GingerIt for grammar and spelling correction."""
    correctedText = ""
    parser = GingerIt()
    sentences = sent_tokenize(text, language='english')
    for sentence in sentences:
        sentenceDict = parser.parse(sentence)
        # Join corrected sentences with a space so they do not run together.
        correctedText += str(sentenceDict['result']) + " "
    return correctedText.strip()
def summaryGeneration(mailid=None):
    """Summarize every extracted chapter text file and mail the results as a zip."""
    try:
        # Collect the chapter text files produced by the PDF-splitting step.
        txtFiles = []
        for filename in os.listdir(app.config["PDF_UPLOADS"]):
            if fnmatch.fnmatch(filename, 'pdf_fileChapter*.txt'):
                print(filename)
                txtFiles.append(filename)

        for fname in txtFiles:
            print("Summarising: ", fname)

            # Read the chapter and clean it line by line into one string.
            text = ""
            with open(os.path.join(app.config['PDF_UPLOADS'], fname), 'r', encoding="utf-8") as f:
                for line in f.readlines():
                    line = cleanText(line)
                    text += line.replace("\n", " ")

            # Split the chapter into chunks of at most 400 word tokens,
            # since t5-small cannot handle an entire chapter in one pass.
            textTokens = word_tokenize(text)
            maxTokenLen = 400
            chunkList = []
            for start in range(0, len(textTokens), maxTokenLen):
                chunkList.append(' '.join(textTokens[start:start + maxTokenLen]))

            # Summarize each chunk, then run the combined summary through grammar correction.
            summary = ""
            for chunk in chunkList:
                summary += getSummary(chunk, tokenizer) + " "
            summary = sentenceCorrection(summary)
            print("Summarisation complete!")

            # Write the summary next to the source file, then delete the source.
            fileName = fname[:-4] + "_summary.txt"
            with open(os.path.join(app.config['PDF_UPLOADS'], fileName), 'w', encoding="utf-8") as f1:
                f1.write(summary)
            print("Summary written to file!")
            os.remove(os.path.join(app.config['PDF_UPLOADS'], fname))

        makezipAndCleanUp(mailid)
    except Exception as e:
        print(e)
        send_fail(mailid)
def makezipAndCleanUp(mailid=None):
    # Compress all chapter summaries into a single zip file, empty the upload
    # folder, then mail the archive (or report its location if no address was given).
    shutil.make_archive('summarized_chapters', 'zip', app.config['PDF_UPLOADS'])
    for file in os.listdir(app.config['PDF_UPLOADS']):
        os.remove(os.path.join(app.config['PDF_UPLOADS'], file))
    if mailid is not None:
        send_mail('summarized_chapters.zip', mailid)
    else:
        print('\nChapter-wise Summaries stored in summarized_chapters.zip')
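

# --- Usage sketch (illustrative, not part of the original module) ---
# Assuming `app.config["PDF_UPLOADS"]` (from the star-imported Flask app in
# `imports`) points at a folder containing pdf_fileChapter*.txt files, the
# whole pipeline can be driven with a single call. The email address below
# is a hypothetical placeholder.
if __name__ == "__main__":
    summaryGeneration(mailid="reader@example.com")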