forked from espnet/espnet
-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathtest_sentencepiece.py
38 lines (31 loc) · 1.14 KB
/
test_sentencepiece.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import os
import sentencepiece as spm
# Directory containing this test module; the fixture files used by the tests
# below (tedlium2.txt, tedlium2.vocab) are looked up relative to it.
root = os.path.split(os.path.abspath(__file__))[0]
def test_spm_compatibility():
    """Test the sentencepiece Python API against legacy C++ tool outputs.

    The test trains a small unigram model on ``tedlium2.txt`` (located next
    to this file), checks that the generated ``.vocab`` file is identical to
    the reference ``tedlium2.vocab``, and finally round-trips a sentence
    through ``EncodeAsPieces`` / ``DecodePieces``.

    NOTE: hard-coded strings are generated by spm v0.1.82

    Side effects:
        Writes ``test_spm.model`` and ``test_spm.vocab`` into the current
        working directory (``--model_prefix`` is a relative name).
    """
    testfile = os.path.join(root, "tedlium2.txt")
    nbpe = 100
    bpemode = "unigram"
    bpemodel = "test_spm"

    # test train
    # Adjacent literals are concatenated at compile time, so the flag string
    # does not pick up any source-code indentation.
    train_args = (
        f"--input={testfile} --vocab_size={nbpe} --model_type={bpemode} "
        f"--model_prefix={bpemodel} --input_sentence_size=100000000 "
        f"--character_coverage=1.0 --bos_id=-1 --eos_id=-1 "
        f"--unk_id=0 --user_defined_symbols=[laughter],[noise],[vocalized-noise]"
    )
    spm.SentencePieceTrainer.Train(train_args)

    # The vocab files contain the non-ASCII piece marker, so decode them
    # explicitly as UTF-8 instead of relying on the platform default codec.
    with open(f"{bpemodel}.vocab", "r", encoding="utf-8") as fa, open(
        os.path.join(root, "tedlium2.vocab"), "r", encoding="utf-8"
    ) as fb:
        trained_vocab = fa.readlines()
        reference_vocab = fb.readlines()
    # Compare the complete line lists: a plain zip() would stop at the shorter
    # file and let a truncated (or empty) vocabulary pass unnoticed.
    assert trained_vocab == reference_vocab

    # test encode and decode
    sp = spm.SentencePieceProcessor()
    sp.Load(f"{bpemodel}.model")
    txt = "test sentencepiece.[noise]"
    actual = sp.EncodeAsPieces(txt)
    expect = "▁ te s t ▁ s en t en c e p ie c e . [noise]".split()
    assert actual == expect
    assert sp.DecodePieces(actual) == txt