[fix] prepare_dict support english and chinese in one lexicon.txt (#1693)

* [fix] prepare_dict support english and chinese in one lexicon.txt
* add comment
* formatting adjustment
* formatting adjustment
* warnings.warn stacklevel keyword set stacklevel of 2

---------

Co-authored-by: cuidongcai1035 <cuidongcai1035@wezhuiyi.com>
wenet-e2e · Feb 15, 2023 · a983da9
1 parent b3b82c8
commit a983da9
Show file tree

Hide file tree

Showing 2 changed files with 13 additions and 6 deletions.
diff --git a/tools/fst/prepare_dict.py b/tools/fst/prepare_dict.py
@@ -41,7 +41,14 @@ def contain_oov(units):
             if word in lexicon_table:
                 continue
             if bpemode:
-                pieces = sp.EncodeAsPieces(word)
+                # We assume that the lexicon contains no code-switched words,
+                # i.e. no single word mixes English and Chinese characters.
+                # see PR https://github.com/wenet-e2e/wenet/pull/1693
+                # and Issue https://github.com/wenet-e2e/wenet/issues/1653
+                if word.encode('utf8').isalpha():
+                    pieces = sp.EncodeAsPieces(word)
+                else:
+                    pieces = word
                 if contain_oov(pieces):
                     print(
                         'Ignoring words {}, which contains oov unit'.format(

diff --git a/wenet/utils/scheduler.py b/wenet/utils/scheduler.py
@@ -112,7 +112,7 @@ def get_lr(self):
             warnings.warn(
                 "To get the last learning rate computed "
                 "by the scheduler, please use `get_last_lr()`.",
-                UserWarning
+                UserWarning, stacklevel=2
             )
 
         step = self.last_epoch
@@ -173,7 +173,7 @@ def get_lr(self):
             warnings.warn(
                 "To get the last learning rate computed "
                 "by the scheduler, please use `get_last_lr()`.",
-                UserWarning
+                UserWarning, stacklevel=2
             )
 
         step = self.last_epoch
@@ -255,7 +255,7 @@ def get_lr(self):
             warnings.warn(
                 "To get the last learning rate computed by the scheduler,"
                 " " "please use `get_last_lr()`.",
-                UserWarning
+                UserWarning, stacklevel=2
             )
 
         step = self.last_epoch
@@ -336,7 +336,7 @@ def get_lr(self):
             warnings.warn(
                 "To get the last learning rate computed "
                 "by the scheduler, please use `get_last_lr()`.",
-                UserWarning
+                UserWarning, stacklevel=2
             )
 
         step = self.last_epoch
@@ -554,7 +554,7 @@ def get_lr(self):
             warnings.warn(
                 "To get the last learning rate computed "
                 "by the scheduler, please use `get_last_lr()`.",
-                UserWarning
+                UserWarning, stacklevel=2
             )
 
         step = max(1, self.last_epoch)