Update: Add the language identification pattern to config

Artrajz · Jul 24, 2024 · 68cddda · 68cddda
1 parent d4de56c
commit 68cddda
Show file tree

Hide file tree

Showing 2 changed files with 6 additions and 13 deletions.
diff --git a/config.py b/config.py
@@ -42,10 +42,6 @@
 it's recommended to specify model paths in config.yaml.
 """
 
-model_list = [
-    # {"model_path": "model_name/G_9000.pth", "config_path": "model_name/config.json"},
-]
-
 
 @dataclass
 class AsDictMixin:
@@ -391,6 +387,9 @@ class LanguageIdentification(AsDictMixin):
     espeak_library: str = r"C:/Program Files/eSpeak NG/libespeak-ng.dll" if "win" in sys.platform else ""
     # zh ja ko en... If it is empty, it will be read based on the text_cleaners specified in the config.json.
     language_automatic_detect: list = field(default_factory=list)
+    split_pattern: str = r'[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`' \
+                         r'\！？。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」' \
+                         r'『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+'
 
 
 @dataclass
@@ -476,9 +475,6 @@ def load_config():
                 else:
                     logging.info("config.yaml is empty, initializing config.yaml...")
 
-                # Load default models from config.py.
-                # config.update_config(model_list)
-
                 # If parameters are incomplete, they will be automatically filled in upon saving.
                 Config.save_config(config)
 

diff --git a/utils/sentence.py b/utils/sentence.py
@@ -2,6 +2,7 @@
 
 import regex as re
 
+from contants import config
 from utils.data_utils import check_is_none
 from utils.classify_language import classify_language, split_alpha_nonalpha
 
@@ -18,9 +19,7 @@ def _expand_hyphens(text):
 
 
 def markup_language(text: str, target_languages: list = None) -> str:
-    pattern = r'[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`' \
-              r'\！？。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」' \
-              r'『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+'
+    pattern = config.language_identification.split_pattern
     sentences = re.split(pattern, text)
 
     pre_lang = ""
@@ -51,9 +50,7 @@ def markup_language(text: str, target_languages: list = None) -> str:
 
 def split_languages(text: str, target_languages: list = None, segment_size: int = 50,
                     expand_abbreviations: bool = False, expand_hyphens: bool = False) -> list:
-    pattern = r'[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`' \
-              r'\！？\。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」' \
-              r'『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+'
+    pattern = config.language_identification.split_pattern
     sentences = re.split(pattern, text)
 
     pre_lang = ""