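"""Tests for pycantonese.parse_text.

The parametrized cases below cover empty input, custom participants, custom
word segmentation, custom POS tagging, and user-specified utterance
segmentation, each checked against the CHAT-style output of ``to_strs``.
"""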
import pytest
from pycantonese import parse_text
from pycantonese.word_segmentation import Segmenter


@pytest.mark.parametrize(
    "text, segment_kwargs, pos_tag_kwargs, participant, expected",
    [
        ("", None, None, None, ""),
        (None, None, None, None, ""),
        (
            # The canonical case
            "學廣東話",
            None,
            None,
            None,
            "*X: 學 廣東話\n%mor: VERB|hok6 PROPN|gwong2dung1waa2\n",
        ),
        (
            # Custom participant
            "學廣東話",
            None,
            None,
            "Foo",
            "*Foo: 學 廣東話\n%mor: VERB|hok6 PROPN|gwong2dung1waa2\n",
        ),
        (
            # Unseen "word", so no jyutping in the output
            "135",
            None,
            None,
            None,
            "*X: 135\n%mor: X|\n",
        ),
        (
            # Custom POS tagging
            "學廣東話",
            None,
            {"tagset": "hkcancor"},
            None,
            "*X: 學 廣東話\n%mor: V|hok6 NZ|gwong2dung1waa2\n",
        ),
        (
            # Custom word segmentation
            "學廣東話",
            {"segmenter": Segmenter(disallow={"廣東話"})},
            None,
            None,
            (
                "*X: 學 廣東 話\n"
                "%mor: VERB|hok6 PROPN|gwong2dung1 VERB|waa6\n"
            ),
        ),
        (
            # Extra whitespace characters should be ignored
            "學廣東話\n\n\n\n\n\n學廣東話",
            None,
            None,
            None,
            (
                "*X: 學 廣東話\n"
                "%mor: VERB|hok6 PROPN|gwong2dung1waa2\n"
                "*X: 學 廣東話\n"
                "%mor: VERB|hok6 PROPN|gwong2dung1waa2\n"
            ),
        ),
        (
            # Let utterance segmentation do its thing
            "廣東話好難學?都唔係吖!",
            None,
            None,
            None,
            (
                "*X: 廣東話 好 難 學 ?\n"
                "%mor: PROPN|gwong2dung1waa2 ADV|hou2 ADJ|naan4 VERB|hok6 ?\n"
                "*X: 都 唔係 吖 !\n"
                "%mor: ADV|dou1 VERB|m4hai6 PART|aa1 !\n"
            ),
        ),
        (
            # User-specified utterance segmentation with a list
            ["廣東話好難學?都唔係吖!"],
            None,
            None,
            None,
            (
                "*X: 廣東話 好 難 學 ? 都 唔係 吖 !\n"  # noqa: E501
                "%mor: PROPN|gwong2dung1waa2 ADV|hou2 ADJ|naan4 VERB|hok6 ? ADV|dou1 VERB|m4hai6 PART|aa1 !\n"  # noqa: E501
            ),
        ),
        (
            # User-specified utterance segmentation with a list, with an empty utterance
            ["廣東話好難學?都唔係吖!", None],
            None,
            None,
            None,
            (
                "*X: 廣東話 好 難 學 ? 都 唔係 吖 !\n"  # noqa: E501
                "%mor: PROPN|gwong2dung1waa2 ADV|hou2 ADJ|naan4 VERB|hok6 ? ADV|dou1 VERB|m4hai6 PART|aa1 !\n"  # noqa: E501
                "*X:\t\n"
            ),
        ),
        (
            # User-specified participants
            [("小芬", "你食咗飯未呀?"), ("小明", "我食咗喇。")],
            None,
            None,
            None,
            (
                "*小芬: 你 食 咗 飯 未 呀 ?\n"  # noqa: E501
                "%mor: PRON|nei5 VERB|sik6 PART|zo2 NOUN|faan6 ADV|mei6 PART|aa4 ?\n"  # noqa: E501
                "*小明: 我 食 咗 喇 。\n"
                "%mor: PRON|ngo5 VERB|sik6 PART|zo2 PART|laa1 。\n"
            ),
        ),
    ],
)
def test_parse_text(text, segment_kwargs, pos_tag_kwargs, participant, expected):
    corpus = parse_text(
        text,
        segment_kwargs=segment_kwargs,
        pos_tag_kwargs=pos_tag_kwargs,
        participant=participant,
    )
    actual = "\n".join(corpus.to_strs())
    assert actual == expected
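

# A minimal standalone sketch, not part of the parametrized test above, showing
# the same parse_text -> to_strs round trip outside pytest. The input string and
# the "hkcancor" tagset choice simply reuse values from the test cases.
if __name__ == "__main__":
    demo_corpus = parse_text("學廣東話", pos_tag_kwargs={"tagset": "hkcancor"})
    print("\n".join(demo_corpus.to_strs()))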