Skip to content

Commit

Permalink
Update: Add the language identification pattern to config
Browse files: browse the repository at this point in the history
  • Loading branch information
Artrajz committed Jul 24, 2024
1 parent d4de56c commit 68cddda
Show file tree
Hide file tree
Showing 2 changed files with 6 additions and 13 deletions.
10 changes: 3 additions & 7 deletions config.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,6 @@
it's recommended to specify model paths in config.yaml.
"""

model_list = [
# {"model_path": "model_name/G_9000.pth", "config_path": "model_name/config.json"},
]


@dataclass
class AsDictMixin:
Expand Down Expand Up @@ -391,6 +387,9 @@ class LanguageIdentification(AsDictMixin):
espeak_library: str = r"C:/Program Files/eSpeak NG/libespeak-ng.dll" if "win" in sys.platform else ""
# zh ja ko en... If it is empty, it will be read based on the text_cleaners specified in the config.json.
language_automatic_detect: list = field(default_factory=list)
split_pattern: str = r'[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`' \
r'\!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」' \
r'『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+'


@dataclass
Expand Down Expand Up @@ -476,9 +475,6 @@ def load_config():
else:
logging.info("config.yaml is empty, initializing config.yaml...")

# Load default models from config.py.
# config.update_config(model_list)

# If parameters are incomplete, they will be automatically filled in upon saving.
Config.save_config(config)

Expand Down
9 changes: 3 additions & 6 deletions utils/sentence.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import regex as re

from contants import config
from utils.data_utils import check_is_none
from utils.classify_language import classify_language, split_alpha_nonalpha

Expand All @@ -18,9 +19,7 @@ def _expand_hyphens(text):


def markup_language(text: str, target_languages: list = None) -> str:
pattern = r'[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`' \
r'\!?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」' \
r'『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+'
pattern = config.LanguageIdentification.split_pattern
sentences = re.split(pattern, text)

pre_lang = ""
Expand Down Expand Up @@ -51,9 +50,7 @@ def markup_language(text: str, target_languages: list = None) -> str:

def split_languages(text: str, target_languages: list = None, segment_size: int = 50,
expand_abbreviations: bool = False, expand_hyphens: bool = False) -> list:
pattern = r'[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`' \
r'\!?\。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」' \
r'『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+'
pattern = config.language_identification.split_pattern
sentences = re.split(pattern, text)

pre_lang = ""
Expand Down

0 comments on commit 68cddda

Please sign in to comment.