-
Notifications
You must be signed in to change notification settings - Fork 18
/
Copy pathcrf.py
92 lines (82 loc) · 3.19 KB
/
crf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# -*- coding: UTF-8 -*-
import os
def _entity_spans(labels):
    """Map the start index of every entity in *labels* to its token length.

    Labels are the integer codes used by ``estimate_ner``: ``1`` opens an
    entity (B), ``2`` continues the entity that is currently open (I), and
    any other value is treated as outside (O).
    """
    spans = {}
    start = None  # start index of the entity being extended, if any
    for index, label in enumerate(labels):
        if label == 1:
            # B always opens a new entity, even directly after another one.
            start = index
            spans[start] = 1
        elif label == 2 and start is not None:
            spans[start] += 1
        else:
            # O closes the open entity. An I with no open entity is ignored
            # rather than being attached to a stale start index.
            start = None
    return spans


def estimate_ner(current_labels, correct_labels):
    """Count exactly matching entities between predicted and gold labels.

    An entity is a ``1`` (B) followed by zero or more ``2`` (I) labels.  A
    predicted entity counts as correct only when a gold entity has the same
    start index *and* the same length, so a span with the right start but
    the wrong end is not a match.

    Args:
        current_labels: Predicted label codes (0 = O, 1 = B, 2 = I).
        correct_labels: Gold label codes, aligned position by position with
            ``current_labels``.

    Returns:
        A tuple ``(corr_count, prec_length, recall_length)``: the number of
        exactly matching entities, the number of predicted entities (the
        precision denominator) and the number of gold entities (the recall
        denominator).
    """
    # Only positions present in both sequences are scored, so a length
    # mismatch truncates to the shorter sequence instead of raising.
    paired = list(zip(current_labels, correct_labels))
    curr_dict = _entity_spans([curr for curr, _ in paired])
    corr_dict = _entity_spans([corr for _, corr in paired])

    corr_count = sum(
        1 for start, length in curr_dict.items()
        if corr_dict.get(start) == length
    )
    return corr_count, len(curr_dict), len(corr_dict)
def prepare_for_crfpp(folder, output_name):
    """Convert ``.txt``/``.ann`` document pairs under *folder* into one data file.

    Every character of each ``.txt`` file becomes one output line of the form
    ``<char>\\t<label>`` where the label is ``B`` (first character of an
    annotated span), ``I`` (remaining characters of the span) or ``O``.
    An extra blank line is emitted after each ``。`` so that it acts as a
    sentence separator.

    Args:
        folder: Directory that is walked recursively for ``.txt``/``.ann``
            files.  A trailing path separator is optional.
        output_name: Path of the file to write (UTF-8, overwritten).

    Raises:
        FileNotFoundError: If a document has only one of its ``.txt`` /
            ``.ann`` files.
    """
    # Collect extension-less paths of every document.  Joining with the
    # directory reported by os.walk keeps files in sub-directories reachable
    # and does not depend on *folder* ending with a separator.  Files with
    # other extensions are ignored.
    base_paths = set()
    for root, _, names in os.walk(folder):
        for filename in names:
            stem, ext = os.path.splitext(filename)
            if ext.lower() in ('.txt', '.ann'):
                base_paths.add(os.path.join(root, stem))

    content = []
    # Sorted so the output is reproducible between runs.
    for path in sorted(base_paths):
        with open(path + '.txt', encoding='utf-8') as src_file:
            # Text mode hands back '\n' line endings; they are re-expanded to
            # '\r\n' before indexing with the annotation offsets.
            # NOTE(review): presumably the offsets were computed on CRLF
            # text -- confirm against the annotation tool.
            raw_text = src_file.read().replace('\n', '\r\n')
        labels = ['O'] * len(raw_text)

        with open(path + '.ann', encoding='utf-8') as ann_file:
            ann_items = ann_file.read().splitlines()
        for item in ann_items:
            sections = item.split('\t')
            # Only lines whose id starts with 'T' carry a character span
            # (layout looks like brat standoff -- TODO confirm).
            if not sections[0].startswith('T'):
                continue
            pos = sections[1].split(' ')
            start, end = int(pos[1]), int(pos[2])
            labels[start] = 'B'
            if end - start - 1 > 0:
                labels[start + 1:end] = ['I'] * (end - start - 1)

        for ch, label in zip(raw_text, labels):
            if ch == '\r':
                # Re-inserted only to keep offsets aligned; never emitted.
                continue
            if ch == '。':
                # Trailing newline yields a blank line after the join below.
                content.append(ch + '\t' + label + '\n')
            else:
                content.append(ch + '\t' + label)

    with open(output_name, mode='w', encoding='utf-8') as o:
        o.write('\n'.join(content))
def evaluate_ner(path):
    """Print entity-level precision, recall and F1 for a tagged result file.

    The file is read as UTF-8 with one token per non-empty line and
    tab-separated columns.  The second column is used as the gold label and
    the third as the predicted label; both must be one of ``O``, ``B``, ``I``.

    Args:
        path: Path of the result file.

    Raises:
        IndexError: If the file has no non-empty line or fewer than three
            columns.
        KeyError: If a label other than ``O``/``B``/``I`` is encountered.
    """
    with open(path, encoding='utf-8') as f:
        rows = [line.split('\t') for line in f.read().splitlines() if line]
    # Transpose rows into columns: columns[1] = gold, columns[2] = predicted.
    columns = list(zip(*rows))

    label_map = {'O': 0, 'B': 1, 'I': 2}
    correct = [label_map[tag] for tag in columns[1]]
    current = [label_map[tag] for tag in columns[2]]

    corr, p_count, r_count = estimate_ner(current, correct)
    # A metric with an empty denominator is reported as 0.0 instead of
    # raising ZeroDivisionError (no predicted / no gold / no matching entity).
    p = corr / p_count if p_count else 0.0
    r = corr / r_count if r_count else 0.0
    f1 = 2 * p * r / (p + r) if p + r else 0.0
    print('precision:', p)
    print('recall:', r)
    print('f1', f1)
if __name__ == '__main__':
    # Script entry point.  The commented lines show how the CRF++ train/test
    # files are generated; the paths are specific to the author's machine.
    # train_folder = 'corpus/emr_paper/train/'
    # test_folder = 'corpus/emr_paper/test/'
    # prepare_for_crfpp(test_folder,'corpus/test.data')
    # prepare_for_crfpp(train_folder, 'corpus/train.data')
    # evaluate_ner(r'D:\Learning\master_project\clinicalText\CRF++-0.58\res.data')
    # Raw string: the Windows path contains backslash sequences such as
    # '\L' and '\m' that are invalid escapes in a normal string literal.
    evaluate_ner(r'D:\Learning\master_project\clinicalText\CRF++-0.58\res_slim.data')