-
Notifications
You must be signed in to change notification settings - Fork 0
/
unittest_bio_tags.py
39 lines (27 loc) · 1.01 KB
/
unittest_bio_tags.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
'''
Testing BIO labeling function for correctness
'''
def get_bio_tags(context_tokens, annotation_tokens):
    '''
    Build a BIO tag sequence for a context from annotated index spans.

    context_tokens: sized collection of tokens; only its length is used,
        to size the output.
    annotation_tokens: iterable of annotation units, each a sequence of
        integer positions into the output (positions index the result
        list directly).

    Returns a list of "B"/"I"/"O" strings, one per context token. The first
    position of each unit is tagged "B", the remaining positions "I", and
    every position not covered by any unit stays "O". Units are applied in
    the order given, so a later unit overwrites tags written by an earlier
    one where they overlap.
    '''
    labels = ["O"] * len(context_tokens)
    for span in annotation_tokens:
        # One "B" for the span start, then "I" for every following position.
        span_tags = ["B"] + ["I"] * (len(span) - 1)
        for position, tag in zip(span, span_tags):
            labels[position] = tag
    return labels
# Shared six-token context used by every case below.
context = list(range(6))

# Case 1: two annotations separated by an unannotated token.
separated = [[0, 1], [3, 4]]
separated_tags = list("BIOBIO")
separated_res = get_bio_tags(context, separated)
assert separated_res == separated_tags

# Case 2: two annotations that touch; the second must still open with "B".
adjacent = [[0, 1, 2], [3, 4]]
adjacent_tags = list("BIIBIO")
adjacent_res = get_bio_tags(context, adjacent)
assert adjacent_res == adjacent_tags

# Case 3: two annotations sharing token 2.
# Letting the later annotation overwrite the tags of the preceding one is the
# best choice here: overlaps are rare and are an annotation artifact, and we
# assume that a "B" boundary annotated later in the text is the more
# intentional one.
overlap = [[0, 1, 2], [2, 3, 4]]
overlap_tags = list("BIBIIO")
overlap_res = get_bio_tags(context, overlap)
assert overlap_res == overlap_tags