import pynini
import pynini as pn
from pynini.lib import pynutil, rewrite
# Symbols for the module-wide alphabet: code points 1-255 in order, with
# "[", "\" and "]" (code points 91-93) given in backslash-escaped form
# because brackets and backslashes are special to Pynini's string compiler.
chars = [
    *map(chr, range(1, 91)),
    r"\[",
    r"\\",
    r"\]",
    *map(chr, range(94, 256)),
]
# Sigma* used as the alphabet closure by the rewrite rules in this module:
# zero or more arbitrary non-zero bytes.
#
# Pynini compiles Python strings to UTF-8 bytes by default, so an alphabet
# built from chr(128)..chr(255) only admits the two-byte encodings of
# U+0080-U+00FF and rejects every other multi-byte character (Hebrew,
# Devanagari, CJK, many Vietnamese letters, ...).  A bracketed integer such
# as "[200]" names a raw byte label directly, so this closure accepts any
# UTF-8 input.  Label 0 is left out because it is reserved for epsilon.
sigma_star = (
    pynini.union(*(f"[{byte}]" for byte in range(1, 256))).closure().optimize()
)
def example():
    """Compile a sample sentence into an acceptor and print the result."""
    sentence = "How are you"
    # Pynini 2.1 renamed ``acceptor`` to ``accep``.  The rest of this module
    # already relies on the 2.1+ API (``cross``, ``Fst.string``), so the old
    # name is not available here.
    fsa = pn.accep(sentence)
    print(fsa)
# Sample strings made up of ASCII characters only.
ASCII_STRINGS = [
    "ha ha",
    "who?",
    "Capitals!",
    "`1234567890~!@#$%^&*()",
]

# Individual characters from outside the ASCII range, one per script.
NON_ASCII_CHARS = [
    "é",
    "ב",
    "क",
    "€",
    "д",
    "零",
]

# Sample strings in several scripts (the first one is plain ASCII).
UTF8_STRINGS = [
    "Who?",
    "¿Quién?",
    "ארה״ב",
    "हिन्दी",
    "今日はそれがググりました。",
]
def fsa_a2b(input: str):
    """Rewrite "a" as "b" wherever it stands between two "c"s and print it.

    Args:
        input: String to push through the rule.  The rule's alphabet is
            the closure over "a", "b" and "c".
    """
    # Alphabet closure the rule is defined over.
    alphabet_star = pn.union("a", "b", "c").closure()
    # The change itself: a -> b.
    a_to_b = pn.cross("a", "b")
    # Context-dependent rule: a -> b / c __ c.
    between_cs = pn.cdrewrite(a_to_b, "c", "c", alphabet_star)
    # Compose the input with the rule and print the resulting string.
    print(pn.compose(input, between_cs).string())
def string_tagging(
    input_string: str,
    left_tag: str = "<cheese>",
    right_tag: str = "</cheese>",
):
    """Print ``input_string`` with every known cheese name wrapped in tags.

    With the default tags, "Do you have Camembert or Edam?" is printed as
    "Do you have <cheese>Camembert</cheese> or <cheese>Edam</cheese>?".

    Args:
        input_string: Text to search for cheese names.
        left_tag: Text inserted immediately before each match.
        right_tag: Text inserted immediately after each match.

    Both tags are compiled by Pynini's string compiler, so square brackets
    and backslashes inside them must be escaped.
    """
    cheeses = (
        "Boursin",
        "Camembert",
        "Cheddar",
        "Edam",
        "Gruyere",
        "Ilchester",
        "Jarlsberg",
        "Red Leicester",
        "Stilton",
    )
    # Acceptor matching any one of the cheese names.
    cheese_names = pynini.string_map(cheeses)
    # Insert-only transducers: empty input, tag text as output.  With empty
    # output on both sides the rule would copy its input unchanged.
    open_tag = pynini.cross("", left_tag)
    close_tag = pynini.cross("", right_tag)
    tag_cheese = open_tag + cheese_names + close_tag
    # Named ``tagger`` rather than ``rewrite`` so the ``pynini.lib.rewrite``
    # module imported at file level is not shadowed inside this function.
    tagger = pynini.cdrewrite(tag_cheese, "", "", sigma_star)
    # ``string()`` only succeeds when the composed lattice is a single path.
    tagged = pynini.compose(input_string, tagger).string()
    print(tagged)
def test(input_string):
    """Apply an experimental "<digit> trăm" rewrite to the input and print it.

    The rule is the union of two alternatives: an entry from
    ``data/en.tsv`` followed by optional whitespace and the word "trăm"
    (whitespace and "trăm" are deleted), or an inserted "0".

    NOTE(review): ``data/en.tsv`` presumably maps number words to digits;
    confirm against the data file.
    NOTE(review): the inserted-"0" alternative matches the empty string, so
    the composed lattice may hold more than one path, in which case
    ``string()`` would fail - confirm this is the intended rule.

    Args:
        input_string: Text to rewrite.
    """
    digit_map = pn.string_file("data/en.tsv")
    drop_hundred_word = pynini.cross("trăm", "")
    # Space, tab, newline, carriage return and no-break space.
    whitespace = pynini.union(" ", "\t", "\n", "\r", u"\u00A0").optimize()
    drop_spaces = pynutil.delete(pynini.closure(whitespace))
    hundred_component = pynini.union(
        digit_map + drop_spaces + drop_hundred_word,
        pynutil.insert("0"),
    )
    rule = pynini.cdrewrite(hundred_component, "", "", sigma_star)
    print(pynini.compose(input_string, rule).string())
if __name__ == "__main__":
    # Active demo: tag the cheese names in a sample sentence.
    input_string = "Do you have Camembert or Edam?"
    string_tagging(input_string)

    # Other demos, disabled by default:
    #   example()
    #   fsa_a2b("caccac")
    #   test("một trăm nghìn")  # reported as erroring; cause not yet known