# export_emr.py
#-*- coding: UTF-8 -*-
import re
import os
def _split_sentences(text):
    """Split *text* into sentences ending at the full stop '。'.

    A '。' that is immediately followed by a closing quote '”' does not end
    a sentence.  Text remaining after the last splitting '。' is kept as a
    final sentence.

    Returns:
        A ``(sentences, periods)`` tuple: the list of sentence strings and
        the list of indices in *text* of every '。' that ended a sentence.
    """
    sentences = []
    periods = []
    text_len = len(text)
    last = 0
    for i, ch in enumerate(text):
        if ch != '。':
            continue
        if i < text_len - 1 and text[i + 1] == '”':
            # Full stop inside a quotation: keep the sentence going.
            continue
        periods.append(i)
        sentences.append(text[last:i + 1])
        last = i + 1
    if last != text_len:
        sentences.append(text[last:text_len])
    return sentences, periods


def read_single_file(ann_file, raw_file):
    """Align the entity annotations of one document with its sentences.

    The raw text is read, its line breaks are removed and the result is split
    into sentences (see ``_split_sentences``).  Every line of *ann_file* whose
    first field starts with ``'T'`` is parsed as
    ``id <field> start end text`` (tabs are treated like spaces), its offsets
    are mapped onto the sentence that contains it, and each sentence gets a
    per-character ``B``/``I``/``O`` label list.

    Args:
        ann_file: Path of the annotation file (UTF-8).
        raw_file: Path of the raw text file (UTF-8).

    Returns:
        A dict keyed by sentence text.  Each value is a dict with ``'text'``
        (the sentence), ``'label'`` (list of ``'B'``/``'I'``/``'O'``, one per
        character) and, when at least one entity was attached, ``'entities'``
        (list of dicts with ``'id'``, ``'start'``, ``'length'``, ``'text'``,
        where ``'start'`` is relative to the sentence).  Identical sentences
        share a single entry because the sentence text is the key.

    Raises:
        IndexError, ValueError: If a ``T`` line has fewer than five fields or
            non-integer offsets.
    """
    with open(raw_file, encoding='utf-8') as r:
        sentence = r.read()

    # Map annotation offsets to offsets in the text without line breaks.
    # Each preceding line break shifts an offset by 2 -- presumably the
    # annotation offsets count two characters per line break (e.g. CRLF);
    # TODO confirm against the corpus.
    # NOTE: the first line's key starts at 1, so an entity starting at
    # offset 0 matches no span; its offsets need no adjustment anyway, but
    # the "no line span" warning below is printed for it.
    rn_indices = [m.start() for m in re.finditer('\n', sentence)]
    spans_diff = {}
    if rn_indices:
        spans = zip([-1] + rn_indices,
                    rn_indices + [len(sentence) + len(rn_indices)])
        for i, (before, curr) in enumerate(spans):
            spans_diff[(before + 2, curr)] = i * 2
    sentence = sentence.replace('\n', '')

    sentences, periods = _split_sentences(sentence)
    sentence_dict = {k: {'text': k} for k in sentences}

    # (start, end) of every sentence in the newline-free text -> its start.
    # Built unconditionally: with no splitting full stop the whole text is a
    # single sentence and its entities must still be attached.
    period_spans = {}
    for s, e in zip([-1] + periods, periods + [len(sentence)]):
        period_spans[(s + 1, e + 1)] = s + 1

    with open(ann_file, encoding='utf-8') as a:
        lines = a.read().replace('\t', ' ').splitlines()

    for line in lines:
        entry = line.strip().split(' ')
        entity_id = entry[0]
        if not entity_id.startswith('T'):
            continue
        start = int(entry[2])
        end = int(entry[3])
        text = entry[4]

        if rn_indices:
            for (s, e), diff in spans_diff.items():
                if s <= start and end <= e:
                    start -= diff
                    end -= diff
                    break
            else:
                print('{}: no line span contains entity {} ({}, {}); '
                      'offsets left unadjusted'.format(
                          ann_file, entity_id, start, end))

        # Drop annotations whose offsets do not reproduce their text.
        if sentence[start:end] != text:
            continue

        for (s, e), sentence_start in period_spans.items():
            if not (s <= start and end <= e):
                continue
            new_sentence = sentence[s:e]
            if new_sentence not in sentence_dict:
                print('{}: entity {} falls outside every known sentence; '
                      'skipped'.format(ann_file, entity_id))
                break
            start -= sentence_start
            end -= sentence_start
            if new_sentence[start:end] != text:
                print('{}: entity {} does not match its sentence text'.format(
                    ann_file, entity_id))
            entity = {'id': entity_id, 'start': start,
                      'length': end - start, 'text': text}
            sentence_dict[new_sentence].setdefault('entities', []).append(entity)
            break

    for sent, info in sentence_dict.items():
        labels = ['O'] * len(sent)
        for entity in info.get('entities', ()):
            start = entity['start']
            end = start + entity['length']
            labels[start] = 'B'
            if end - start > 1:
                labels[start + 1:end] = ['I'] * (end - start - 1)
        info['label'] = labels
    return sentence_dict
def read_emr(directory, dest_file):
    """Export every annotated document in *directory* as one CoNLL-style file.

    For each file stem that has a ``.ann`` or ``.txt`` file in *directory*,
    ``read_single_file`` is called with ``<stem>.ann`` and ``<stem>.txt``.
    Every resulting sentence is written as one ``"<char> <label>"`` line per
    character, followed by a blank line.

    Stems are processed in sorted order so the output is identical from run
    to run.  Directory entries with other extensions are ignored.

    Args:
        directory: Directory containing the ``.ann`` / ``.txt`` pairs.
        dest_file: Path of the output file (written as UTF-8, overwritten).

    Raises:
        OSError: If *directory* cannot be listed, if one file of a pair is
            missing, or if *dest_file* cannot be written.
    """
    stems = set()
    for name in os.listdir(directory):
        stem, ext = os.path.splitext(name)
        if ext in ('.ann', '.txt'):
            stems.add(stem)

    sentences = []
    for stem in sorted(stems):
        base = os.path.join(directory, stem)
        sentences.extend(
            read_single_file(base + '.ann', base + '.txt').values())

    blocks = []
    for sentence in sentences:
        rows = '\n'.join(
            ' '.join(pair)
            for pair in zip(sentence['text'], sentence['label']))
        blocks.append(rows + '\n\n')

    with open(dest_file, 'w', encoding='utf-8') as f:
        f.write(''.join(blocks))
if __name__ == '__main__':
    # (annotated corpus directory, CoNLL output file) for each data split.
    for corpus_dir, conll_path in (
            ('corpus/emr_paper/train/', 'corpus/emr_paper/emr_training.conll'),
            ('corpus/emr_paper/test/', 'corpus/emr_paper/emr_test.conll'),
    ):
        read_emr(corpus_dir, conll_path)