This repository has been archived by the owner on Nov 22, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 3
/
extractAnswer.py
118 lines (81 loc) · 2.87 KB
/
extractAnswer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import re
import os
import numpy as np
import cPickle
import subprocess
from collections import defaultdict
# NOTE(review): presumably the vocabulary index reserved for unknown words; it
# is not referenced anywhere in the visible part of this file -- confirm
# against other users before removing.
UNKNOWN_WORD_IDX = 0
# Template for a single <top> topic entry. The two %s slots are filled, in
# order, with the topic number and the title text (see the `xml%(count, ...)`
# calls in the __main__ section). The <desc> and <narr> fields are emitted
# with no content.
xml='''<top>
<num> Number: %s
<title> %s
<desc> Description:
<narr> Narrative:
</top>
'''
def load_data(fname):
    """Parse a QA-pairs markup file into four parallel lists.

    The parser is line-oriented and decides what the current line means by
    looking at the *previous* line:

    * a line matching ``<QApairs id='...'>`` sets the current question id;
    * the line following a ``<question>`` tag becomes the current question;
    * the line following a ``<positive>`` or ``<negative>`` tag is an answer
      to the current question.

    Question and answer lines are lower-cased and split on tab characters.

    Args:
        fname: Path of the file to read.

    Returns:
        A tuple ``(qids, questions, answers, labels)`` of equal-length lists
        holding one entry per answer: the enclosing question id (string), the
        question tokens, the answer tokens, and the label (``1`` for
        ``<positive>``, ``0`` for ``<negative>``).

    Raises:
        NameError: (``UnboundLocalError``) if an answer appears before any
            ``<QApairs>`` or ``<question>`` line has been seen.
    """
    qid_re = re.compile(r"<QApairs id='(.*)'>")
    label_re = re.compile(r'^<(positive|negative)>')

    qids, questions, answers, labels = [], [], [], []
    # No answer is currently filtered out, so this stays 0; it is still
    # reported below to keep the script's console output unchanged.
    num_skipped = 0
    prev = ''

    # Use a context manager so the handle is closed even if parsing raises.
    with open(fname) as handle:
        for raw_line in handle:
            line = raw_line.strip()

            qid_match = qid_re.match(line)
            if qid_match:
                qid = qid_match.group(1)

            if prev.startswith('<question>'):
                question = line.lower().split('\t')

            label_match = label_re.match(prev)
            if label_match:
                answer = line.lower().split('\t')
                labels.append(1 if label_match.group(1) == 'positive' else 0)
                answers.append(answer)
                questions.append(question)
                qids.append(qid)

            prev = line

    # Single-argument print with %-formatting emits the same text under both
    # Python 2 and Python 3.
    print('num_skipped %d' % num_skipped)
    return qids, questions, answers, labels
if __name__ == '__main__':
    # Script entry point: copy each training file to a scratch location,
    # parse it with load_data(), and write every answer and its question out
    # both as <top> topic entries (*.topics.xml) and as tab-separated rows
    # (*.topics.list: qid, running number, label, text).

    # Only the "train-all" split is processed; 'data/train.xml' was defined
    # alongside it originally but left out of the list.
    train_files = ['data/train-all.xml']
    all_fname = "/tmp/trec-merged.txt"

    for train in train_files:
        print(train)

        # Copy the input with /bin/cat, passing an argument list and
        # redirecting stdout in Python rather than building a shell string.
        with open(all_fname, 'wb') as merged:
            subprocess.call(['/bin/cat', train], stdout=merged)

        qids, questions, answers, labels = load_data(all_fname)

        # Context managers guarantee all four outputs are flushed and closed
        # even if a write fails part-way through.
        with open("allQAAnswers.topics.xml", 'w') as writeAnsF, \
                open("allQAAnswers.topics.list", 'w') as writeAnsListF, \
                open("allQAQuestions.topics.xml", 'w') as writeQueF, \
                open("allQAQuestions.topics.list", 'w') as writeQueListF:
            # `count` is a 1-based running number shared by an answer and
            # its question, so the two outputs can be joined on it.
            rows = zip(qids, answers, questions, labels)
            for count, (qid, ans, que, label) in enumerate(rows, 1):
                ans_text = ' '.join([str(x) for x in ans])
                que_text = ' '.join([str(x) for x in que])
                row_prefix = qid + '\t' + str(count) + '\t' + str(label) + '\t'

                writeAnsF.write(xml % (count, ans_text))
                writeAnsListF.write(row_prefix + ans_text + '\n')
                writeQueF.write(xml % (count, que_text))
                writeQueListF.write(row_prefix + que_text + '\n')