subdivide long words and sort the dataset
keonlee9420 committed Oct 12, 2021
1 parent bf23771 commit 84fad6f
Showing 8 changed files with 12,956 additions and 12,924 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -120,6 +120,7 @@ to serve TensorBoard on your localhost.
# Notes

- For vocoder, **HiFi-GAN** and **MelGAN** are supported.
- Speed up the convergence of word-to-phoneme alignment in **LinguisticEncoder** by dividing long words into subwords and sorting the dataset by mel-spectrogram frame length (see the sketch below).
- No ReLU activation and LayerNorm in **VariationalGenerator** to avoid mashed output.
- Will be extended to a **multi-speaker TTS**.
<!-- - Two options for embedding for the **multi-speaker TTS** setting: training speaker embedder from scratch or using a pre-trained [philipperemy's DeepSpeaker](https://github.com/philipperemy/deep-speaker) model (as [STYLER](https://github.com/keonlee9420/STYLER) did). You can toggle it by setting the config (between `'none'` and `'DeepSpeaker'`).
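For illustration, a minimal sketch of the subdivision rule described in the note above, using the word_level_subdivision helper added to utils/tools.py in this commit (the phoneme counts are made-up example values):

from utils.tools import word_level_subdivision

# Number of phonemes in each word of one utterance (illustrative values).
phones_per_word = [3, 16, 7, 1]

# Words longer than max_phoneme_num are split into subwords of at most
# max_phoneme_num phonemes; shorter words are left untouched.
print(word_level_subdivision(phones_per_word, max_phoneme_num=7))
# -> [3, 7, 7, 2, 7, 1]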
3 changes: 3 additions & 0 deletions config/LJSpeech/preprocess.yaml
@@ -7,10 +7,13 @@ path:
preprocessed_path: "./preprocessed_data/LJSpeech"

preprocessing:
sort_data: True # sort data by mel frame length
val_size: 512
text:
text_cleaners: ["english_cleaners"]
language: "en"
sub_divide_word: True
max_phoneme_num: 7
audio:
sampling_rate: 22050
max_wav_value: 32768.0
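The three new keys can be read straight from this file before the Preprocessor is built; a minimal sketch, assuming the config is loaded with PyYAML as elsewhere in the repo:

import yaml

# Load the updated preprocess config and read the new keys.
with open("config/LJSpeech/preprocess.yaml") as f:
    preprocess_config = yaml.safe_load(f)

sort_data = preprocess_config["preprocessing"]["sort_data"]                       # True
sub_divide_word = preprocess_config["preprocessing"]["text"]["sub_divide_word"]   # True
max_phoneme_num = preprocess_config["preprocessing"]["text"]["max_phoneme_num"]   # 7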
2 changes: 1 addition & 1 deletion config/LJSpeech/train.yaml
@@ -8,7 +8,7 @@ path:
log_path: "./output/log/LJSpeech"
result_path: "./output/result/LJSpeech"
optimizer:
batch_size: 8
batch_size: 64
betas: [0.9, 0.98]
eps: 0.000000001
weight_decay: 0.0
24,900 changes: 12,450 additions & 12,450 deletions preprocessed_data/LJSpeech/train.txt

Large diffs are not rendered by default.

940 changes: 470 additions & 470 deletions preprocessed_data/LJSpeech/val.txt

Large diffs are not rendered by default.

14 changes: 13 additions & 1 deletion preprocessor/preprocessor.py
@@ -14,7 +14,7 @@

import audio as Audio
from model import PreDefinedEmbedder
from utils.tools import plot_embedding
from utils.tools import plot_embedding, word_level_subdivision


class Preprocessor:
@@ -28,6 +28,9 @@ def __init__(self, preprocess_config, model_config, train_config):
self.val_size = preprocess_config["preprocessing"]["val_size"]
self.sampling_rate = preprocess_config["preprocessing"]["audio"]["sampling_rate"]
self.hop_length = preprocess_config["preprocessing"]["stft"]["hop_length"]
self.sort_data = preprocess_config["preprocessing"]["sort_data"]
self.sub_divide_word = preprocess_config["preprocessing"]["text"]["sub_divide_word"]
self.max_phoneme_num = preprocess_config["preprocessing"]["text"]["max_phoneme_num"]

self.STFT = Audio.stft.TacotronSTFT(
preprocess_config["preprocessing"]["stft"]["filter_length"],
@@ -78,6 +81,7 @@ def build_from_path(self):
val = list()
n_frames = 0
max_seq_len = -float('inf')
mel_frame_len_dict = dict()

skip_speakers = set()
for embedding_name in os.listdir(embedding_dir):
@@ -120,6 +124,7 @@ def build_from_path(self):
max_seq_len = n

n_frames += n
mel_frame_len_dict[basename] = n

# Calculate and save mean speaker embedding of this speaker
if save_speaker_emb:
@@ -162,6 +167,10 @@ def build_from_path(self):
train = out[self.val_size:]
val = out[: self.val_size]

if self.sort_data:
train.sort(key=lambda x: mel_frame_len_dict[x.split("|")[0]])
val.sort(key=lambda x: mel_frame_len_dict[x.split("|")[0]])

# Write metadata
with open(os.path.join(self.out_dir, "train.txt"), "w", encoding="utf-8") as f:
for m in train:
@@ -184,6 +193,9 @@ def process_utterance(self, tg_path, speaker, basename, save_speaker_emb):
textgrid.get_tier_by_name("phones"),
textgrid.get_tier_by_name("words"),
)
if self.sub_divide_word:
phones_per_word = word_level_subdivision(
phones_per_word, self.max_phoneme_num)
text = "{" + " ".join(phone) + "}"
if start >= end:
return None
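Taken on its own, the new sorting step orders the metadata lines by the mel-spectrogram frame count recorded for each basename; a small standalone sketch with illustrative values, assuming the usual "|"-separated metadata layout whose first field is the basename (which is what x.split("|")[0] extracts):

# Illustrative mapping from basename to mel frame count, as collected
# in mel_frame_len_dict during build_from_path.
mel_frame_len_dict = {"LJ001-0001": 812, "LJ001-0002": 113, "LJ001-0003": 460}

# Illustrative metadata lines (basename|speaker|phones|raw_text).
train = [
    "LJ001-0001|LJSpeech|{ P R IH1 N T IH0 NG }|Printing",
    "LJ001-0002|LJSpeech|{ IH0 N }|in",
    "LJ001-0003|LJSpeech|{ F AO1 R }|for",
]

# Shortest utterances come first, so each batch holds similar-length samples.
train.sort(key=lambda x: mel_frame_len_dict[x.split("|")[0]])
# -> order: LJ001-0002, LJ001-0003, LJ001-0001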
6 changes: 5 additions & 1 deletion synthesize.py
@@ -12,7 +12,7 @@
# from pypinyin import pinyin, Style

from utils.model import get_model, get_vocoder
from utils.tools import get_configs_of, to_device, synth_samples
from utils.tools import get_configs_of, to_device, synth_samples, word_level_subdivision
from dataset import TextDataset
from text import text_to_sequence

@@ -49,6 +49,10 @@ def preprocess_english(text, preprocess_config):
phones = re.sub(r"\{[^\w\s]?\}", "{sp}", phones)
phones = phones.replace("}{", " ")

if preprocess_config["preprocessing"]["text"]["sub_divide_word"]:
word_boundaries = word_level_subdivision(
word_boundaries, preprocess_config["preprocessing"]["text"]["max_phoneme_num"])

print("Raw Text Sequence: {}".format(text))
print("Phoneme Sequence: {}".format(phones))
sequence = np.array(text_to_sequence(
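At synthesis time the same subdivision has to be applied so that word boundaries match what the LinguisticEncoder saw during training; a minimal sketch, assuming word_boundaries holds the per-word phoneme counts produced by preprocess_english and max_phoneme_num matches the preprocessing config:

from utils.tools import word_level_subdivision

# Illustrative per-word phoneme counts for one input sentence.
word_boundaries = [2, 9, 4]
max_phoneme_num = 7  # should match preprocessing.text.max_phoneme_num

print(word_level_subdivision(word_boundaries, max_phoneme_num))
# -> [2, 7, 2, 4]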
14 changes: 13 additions & 1 deletion utils/tools.py
@@ -163,7 +163,8 @@ def synth_one_sample(model, targets, predictions, vocoder, model_config, preproc
:mel_len].float().detach()
# Variational Generator Reconstruction
residual = predictions[11][0, :mel_len].unsqueeze(0).detach()
out_residual = model.variational_generator.residual_layer(mel_reconst_vg.unsqueeze(0))
out_residual = model.variational_generator.residual_layer(
mel_reconst_vg.unsqueeze(0))
mel_reconst_vg = mel_reconst_vg.transpose(0, 1)

# PostNet Inference on the reconstruction
@@ -415,3 +416,14 @@ def word_level_pooling(src_seq, src_len, wb, src_w_len, reduce="sum"):
raise ValueError()
batch.append(m)
return pad(batch).to(device)


def word_level_subdivision(phones_per_word, max_phoneme_num):
res = []
for l in phones_per_word:
if l <= max_phoneme_num:
res.append(l)
else:
s, r = l//max_phoneme_num, l % max_phoneme_num
res += [max_phoneme_num]*s + ([r] if r else [])
return res
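The helper only redraws word boundaries: the total number of phonemes is preserved and every resulting subword fits under the cap, which a quick check makes explicit (a sketch, not part of the commit):

from utils.tools import word_level_subdivision

for phones_per_word in ([1, 7, 8, 15, 21], [30], []):
    out = word_level_subdivision(phones_per_word, max_phoneme_num=7)
    assert sum(out) == sum(phones_per_word)   # phoneme count unchanged
    assert all(1 <= n <= 7 for n in out)      # every subword fits the cap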
