forked from halxp1/seq2seq_chatbot
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_process.py
125 lines (96 loc) · 3.02 KB
/
data_process.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import re
import sys
import pickle
from tqdm import tqdm
from word_sequence import WordSequence
def make_split(line):
    """Return the separator tokens needed when appending text after *line*.

    *line* is a sequence of string tokens. If the joined text already ends
    with one of the punctuation characters (or a space) listed in the
    pattern, no separator is needed and an empty list is returned; otherwise
    a one-element list holding a comma is returned, ready to be concatenated
    between two token lists.
    """
    joined = ''.join(line)
    ends_with_punctuation = re.match(r'.*([,。…?!\.,!? ])$', joined) is not None
    return [] if ends_with_punctuation else [',']
def good_line(line):
    """Tell whether *line* is acceptable as training text.

    *line* is a sequence of string tokens. The line is rejected (``False``)
    when its joined text contains more than two ASCII letters or digits;
    otherwise ``True`` is returned.
    """
    text = ''.join(line)
    ascii_alnum_count = sum(1 for _ in re.finditer(r'[a-zA-Z0-9]', text))
    return ascii_alnum_count <= 2
def regular(sen):
    """Normalize runs of punctuation in the string *sen* and return the result.

    The substitutions are applied in the order listed, and the order matters:
    runs of three or more dots must become an ellipsis before the remaining
    dots are rewritten as full stops.
    """
    substitutions = (
        (r'\.{3,100}', '…'),    # "..." and longer -> a single ellipsis
        (r'…{2,100}', '…'),     # repeated ellipses -> a single ellipsis
        (r'[,]{1,100}', ','),   # comma runs -> a single comma
        (r'[\.]{1,100}', '。'),  # remaining dot runs -> an ideographic full stop
        (r'[\?]{1,100}', '?'),  # question-mark runs -> a single question mark
        (r'[!]{1,100}', '!'),   # exclamation-mark runs -> a single exclamation mark
    )
    normalized = sen
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)
    return normalized
def _read_groups(path):
    """Read the conversation file at *path* and return its dialogue groups.

    Lines starting with ``'M '`` are utterances; any other line closes the
    group collected so far. Each utterance is stored as a list of characters
    after being normalized by ``regular``. Returns a list of groups, each a
    non-empty list of utterances.
    """
    groups = []
    group = []
    # The context manager guarantees the corpus file is closed, even on error.
    with open(path, 'r', encoding='utf-8') as fp:
        for line in tqdm(fp):
            if line.startswith('M '):
                line = line.replace('\n', '')
                if '/' in line:
                    tokens = line[2:].split('/')
                else:
                    tokens = list(line[2:])
                # The last element is always dropped. For '/'-terminated lines
                # this removes the empty string produced by the trailing '/'.
                # NOTE(review): for lines without a trailing '/' this discards
                # a real token -- confirm against the corpus format.
                tokens = tokens[:-1]
                group.append(list(regular(''.join(tokens))))
            elif group:
                groups.append(group)
                group = []
    # Flush the final group when the file does not end with a separator line;
    # otherwise the last conversation would be silently lost.
    if group:
        groups.append(group)
    return groups


def _usable_line(group, index):
    """Return ``group[index]`` if it exists and passes ``good_line``, else None."""
    if 0 <= index < len(group):
        candidate = group[index]
        if good_line(candidate):
            return candidate
    return None


def _build_pairs(groups):
    """Build parallel question/answer lists from dialogue *groups*.

    For every utterance with a usable following utterance, up to three pairs
    are produced: (line -> next), (previous + line -> next) and
    (line -> next + next-next). Returns ``(x_data, y_data)`` as two lists of
    equal length.
    """
    x_data = []
    y_data = []
    for group in tqdm(groups):
        for i, line in enumerate(group):
            last_line = _usable_line(group, i - 1)
            next_line = _usable_line(group, i + 1)
            next_next_line = _usable_line(group, i + 2)
            # Truthiness is deliberate: an empty utterance is treated the
            # same as a missing one.
            if next_line:
                x_data.append(line)
                y_data.append(next_line)
            if last_line and next_line:
                x_data.append(last_line + make_split(last_line) + line)
                y_data.append(next_line)
            if next_line and next_next_line:
                x_data.append(line)
                y_data.append(next_line + make_split(next_line) + next_next_line)
    return x_data, y_data


def main(limit=20, x_limit=3, y_limit=6):
    """Preprocess the raw conversation corpus into training data.

    Reads ``./data/dgk_shooter_min.conv``, builds question/answer pairs,
    keeps the pairs whose lengths fall within the given bounds, fits a
    ``WordSequence`` on the kept data, and pickles the pairs to
    ``./data/chatbot.pkl`` and the fitted ``WordSequence`` to
    ``./data/ws.pkl``.

    Args:
        limit: Exclusive upper bound on the token length of both the
            question and the answer.
        x_limit: Inclusive lower bound on the token length of the question.
        y_limit: Inclusive lower bound on the token length of the answer.

    Raises:
        ValueError: If no pair survives the length filter.
    """
    print("extract lines")
    groups = _read_groups("./data/dgk_shooter_min.conv")

    print('extract group')
    x_data, y_data = _build_pairs(groups)
    print(len(x_data), len(y_data))

    # Show a small sample so the extraction can be checked by eye.
    for ask, answer in zip(x_data[:20], y_data[:20]):
        print(''.join(ask))
        print(''.join(answer))
        print('-' * 20)

    data = [
        (x, y)
        for x, y in zip(x_data, y_data)
        if x_limit <= len(x) < limit and y_limit <= len(y) < limit
    ]
    if not data:
        # zip(*[]) cannot be unpacked into two names; fail with a clear message.
        raise ValueError('no question/answer pairs left after length filtering')
    x_data, y_data = zip(*data)

    print('fit word_sequence')
    ws_input = WordSequence()
    # x_data and y_data are tuples here, so `+` concatenates them into one
    # sequence covering both questions and answers.
    ws_input.fit(x_data + y_data)

    print('dump')
    # Context managers make sure the pickle files are flushed and closed.
    with open('./data/chatbot.pkl', 'wb') as out:
        pickle.dump((x_data, y_data), out)
    with open('./data/ws.pkl', 'wb') as out:
        pickle.dump(ws_input, out)
    print('done')
# Run the preprocessing pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()