-
Notifications
You must be signed in to change notification settings - Fork 0
/
basics.py
86 lines (68 loc) · 2.47 KB
/
basics.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import pynini
import pynini as pn
from pynini.lib import pynutil, rewrite
chars = (
[chr(i) for i in range(1, 91)]
+ [r"\[", r"\\", r"\]"]
+ [chr(i) for i in range(94, 256)]
)
sigma_star = pynini.union(*chars).closure()
def example():
s = "How are you"
x = pn.acceptor(s)
print(x)
ASCII_STRINGS = ["ha ha", "who?", "Capitals!", "`1234567890~!@#$%^&*()"]
NON_ASCII_CHARS = ["é", "ב", "क", "€", "д", "零"]
UTF8_STRINGS = ["Who?", "¿Quién?", "ארה״ב", "हिन्दी", "今日はそれがググりました。"]
def fsa_a2b(input: str):
# contruct FSA containing the inputs strings
sigma = pn.union("a", "b", "c").closure()
# construct a FST representing the rule
tau = pn.cross("a", "b")
# compute compostion A@B extract the path
rule = pn.cdrewrite(tau, "c", "c", sigma)
lattice = pn.compose(input, rule)
print(lattice.string())
def string_tagging(input_string: str):
# mention of a various types of cheese
# output: Do you have <cheese>Camembert</cheese> or <cheese>Edam</cheese>?
cheeses = (
"Boursin",
"Camembert",
"Cheddar",
"Edam",
"Gruyere",
"Ilchester",
"Jarlsberg",
"Red Leicester",
"Stilton",
)
# construct transducers insert the left and right tags
fst_target = pynini.string_map(cheeses)
ltag = pynini.cross("", "<cheese>")
rtag = pynini.cross("", "</cheese>")
substitution = ltag + fst_target + rtag
rewrite = pynini.cdrewrite(substitution, "", "", sigma_star)
output = pynini.compose(input_string, rewrite).string()
# output = pynini.shortestpath(pynini.compose(input_string, rewrite)).string()
print(output)
def test(input_string):
graph_digit = pn.string_file("data/en.tsv")
graph_hundred = pynini.cross("trăm", "")
NEMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", u"\u00A0").optimize()
delete_space = pynutil.delete(pynini.closure(NEMO_WHITE_SPACE))
graph_hundred_component = pynini.union(
graph_digit + delete_space + graph_hundred, pynutil.insert("0")
)
rewrite = pynini.cdrewrite(graph_hundred_component, "", "", sigma_star)
output = pynini.compose(input_string, rewrite).string()
print(output)
if __name__ == "__main__":
# example()
input_string = "Do you have Camembert or Edam?"
string_tagging(input_string)
#
# input_string = "caccac"
# fsa_a2b(input_string)
# error????
# test("một trăm nghìn")