subdivide long words and sort the dataset
keonlee9420 committed Oct 12, 2021
1 parent bf23771 commit 84fad6f
Showing 8 changed files with 12,956 additions and 12,924 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -120,6 +120,7 @@ to serve TensorBoard on your localhost.
# Notes

- For vocoder, **HiFi-GAN** and **MelGAN** are supported.
- Speed up the convergence of word-to-phoneme alignment in **LinguisticEncoder** by dividing long words into subwords and sorting the dataset by mel-spectrogram frame length (see the sketch below).
- No ReLU activation and LayerNorm in **VariationalGenerator** to avoid mashed output.
- Will be extended to a **multi-speaker TTS**.
<!-- - Two options for embedding for the **multi-speaker TTS** setting: training speaker embedder from scratch or using a pre-trained [philipperemy's DeepSpeaker](https://github.com/philipperemy/deep-speaker) model (as [STYLER](https://github.com/keonlee9420/STYLER) did). You can toggle it by setting the config (between `'none'` and `'DeepSpeaker'`).
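For illustration, a minimal sketch of the subdivision rule described in the note above, using the word_level_subdivision helper added to utils/tools.py in this commit (the phoneme counts are made-up example values):

from utils.tools import word_level_subdivision

# Number of phonemes in each word of one utterance (illustrative values).
phones_per_word = [3, 16, 7, 1]

# Words longer than max_phoneme_num are split into subwords of at most
# max_phoneme_num phonemes; shorter words are left untouched.
print(word_level_subdivision(phones_per_word, max_phoneme_num=7))
# -> [3, 7, 7, 2, 7, 1]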
3 changes: 3 additions & 0 deletions config/LJSpeech/preprocess.yaml
@@ -7,10 +7,13 @@ path:
preprocessed_path: "./preprocessed_data/LJSpeech"

preprocessing:
sort_data: True # sort data by mel frame length
val_size: 512
text:
text_cleaners: ["english_cleaners"]
language: "en"
sub_divide_word: True
max_phoneme_num: 7
audio:
sampling_rate: 22050
max_wav_value: 32768.0
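The three new keys can be read straight from this file before the Preprocessor is built; a minimal sketch, assuming the config is loaded with PyYAML as elsewhere in the repo:

import yaml

# Load the updated preprocess config and read the new keys.
with open("config/LJSpeech/preprocess.yaml") as f:
    preprocess_config = yaml.safe_load(f)

sort_data = preprocess_config["preprocessing"]["sort_data"]                       # True
sub_divide_word = preprocess_config["preprocessing"]["text"]["sub_divide_word"]   # True
max_phoneme_num = preprocess_config["preprocessing"]["text"]["max_phoneme_num"]   # 7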
2 changes: 1 addition & 1 deletion config/LJSpeech/train.yaml
@@ -8,7 +8,7 @@ path:
log_path: "./output/log/LJSpeech"
result_path: "./output/result/LJSpeech"
optimizer:
batch_size: 8
batch_size: 64
betas: [0.9, 0.98]
eps: 0.000000001
weight_decay: 0.0
24,900 changes: 12,450 additions & 12,450 deletions preprocessed_data/LJSpeech/train.txt

Large diffs are not rendered by default.

940 changes: 470 additions & 470 deletions preprocessed_data/LJSpeech/val.txt

Large diffs are not rendered by default.

14 changes: 13 additions & 1 deletion preprocessor/preprocessor.py
@@ -14,7 +14,7 @@

import audio as Audio
from model import PreDefinedEmbedder
from utils.tools import plot_embedding
from utils.tools import plot_embedding, word_level_subdivision


class Preprocessor:
@@ -28,6 +28,9 @@ def __init__(self, preprocess_config, model_config, train_config):
self.val_size = preprocess_config["preprocessing"]["val_size"]
self.sampling_rate = preprocess_config["preprocessing"]["audio"]["sampling_rate"]
self.hop_length = preprocess_config["preprocessing"]["stft"]["hop_length"]
self.sort_data = preprocess_config["preprocessing"]["sort_data"]
self.sub_divide_word = preprocess_config["preprocessing"]["text"]["sub_divide_word"]
self.max_phoneme_num = preprocess_config["preprocessing"]["text"]["max_phoneme_num"]

self.STFT = Audio.stft.TacotronSTFT(
preprocess_config["preprocessing"]["stft"]["filter_length"],
@@ -78,6 +81,7 @@ def build_from_path(self):
val = list()
n_frames = 0
max_seq_len = -float('inf')
mel_frame_len_dict = dict()

skip_speakers = set()
for embedding_name in os.listdir(embedding_dir):
@@ -120,6 +124,7 @@ def build_from_path(self):
max_seq_len = n

n_frames += n
mel_frame_len_dict[basename] = n

# Calculate and save mean speaker embedding of this speaker
if save_speaker_emb:
@@ -162,6 +167,10 @@ def build_from_path(self):
train = out[self.val_size:]
val = out[: self.val_size]

if self.sort_data:
train.sort(key=lambda x: mel_frame_len_dict[x.split("|")[0]])
val.sort(key=lambda x: mel_frame_len_dict[x.split("|")[0]])

# Write metadata
with open(os.path.join(self.out_dir, "train.txt"), "w", encoding="utf-8") as f:
for m in train:
@@ -184,6 +193,9 @@ def process_utterance(self, tg_path, speaker, basename, save_speaker_emb):
textgrid.get_tier_by_name("phones"),
textgrid.get_tier_by_name("words"),
)
if self.sub_divide_word:
phones_per_word = word_level_subdivision(
phones_per_word, self.max_phoneme_num)
text = "{" + " ".join(phone) + "}"
if start >= end:
return None
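Taken on its own, the new sorting step orders the metadata lines by the mel-spectrogram frame count recorded for each basename; a small standalone sketch with illustrative values, assuming the usual "|"-separated metadata layout whose first field is the basename (which is what x.split("|")[0] extracts):

# Illustrative mapping from basename to mel frame count, as collected
# in mel_frame_len_dict during build_from_path.
mel_frame_len_dict = {"LJ001-0001": 812, "LJ001-0002": 113, "LJ001-0003": 460}

# Illustrative metadata lines (basename|speaker|phones|raw_text).
train = [
    "LJ001-0001|LJSpeech|{ P R IH1 N T IH0 NG }|Printing",
    "LJ001-0002|LJSpeech|{ IH0 N }|in",
    "LJ001-0003|LJSpeech|{ F AO1 R }|for",
]

# Shortest utterances come first, so each batch holds similar-length samples.
train.sort(key=lambda x: mel_frame_len_dict[x.split("|")[0]])
# -> order: LJ001-0002, LJ001-0003, LJ001-0001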
6 changes: 5 additions & 1 deletion synthesize.py
@@ -12,7 +12,7 @@
# from pypinyin import pinyin, Style

from utils.model import get_model, get_vocoder
from utils.tools import get_configs_of, to_device, synth_samples
from utils.tools import get_configs_of, to_device, synth_samples, word_level_subdivision
from dataset import TextDataset
from text import text_to_sequence

@@ -49,6 +49,10 @@ def preprocess_english(text, preprocess_config):
phones = re.sub(r"\{[^\w\s]?\}", "{sp}", phones)
phones = phones.replace("}{", " ")

if preprocess_config["preprocessing"]["text"]["sub_divide_word"]:
word_boundaries = word_level_subdivision(
word_boundaries, preprocess_config["preprocessing"]["text"]["max_phoneme_num"])

print("Raw Text Sequence: {}".format(text))
print("Phoneme Sequence: {}".format(phones))
sequence = np.array(text_to_sequence(
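At synthesis time the same subdivision has to be applied so that word boundaries match what the LinguisticEncoder saw during training; a minimal sketch, assuming word_boundaries holds the per-word phoneme counts produced by preprocess_english and max_phoneme_num matches the preprocessing config:

from utils.tools import word_level_subdivision

# Illustrative per-word phoneme counts for one input sentence.
word_boundaries = [2, 9, 4]
max_phoneme_num = 7  # should match preprocessing.text.max_phoneme_num

print(word_level_subdivision(word_boundaries, max_phoneme_num))
# -> [2, 7, 2, 4]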
14 changes: 13 additions & 1 deletion utils/tools.py
@@ -163,7 +163,8 @@ def synth_one_sample(model, targets, predictions, vocoder, model_config, preproc
:mel_len].float().detach()
# Variational Generator Reconstruction
residual = predictions[11][0, :mel_len].unsqueeze(0).detach()
out_residual = model.variational_generator.residual_layer(mel_reconst_vg.unsqueeze(0))
out_residual = model.variational_generator.residual_layer(
mel_reconst_vg.unsqueeze(0))
mel_reconst_vg = mel_reconst_vg.transpose(0, 1)

# PostNet Inference on the reconstruction
@@ -415,3 +416,14 @@ def word_level_pooling(src_seq, src_len, wb, src_w_len, reduce="sum"):
raise ValueError()
batch.append(m)
return pad(batch).to(device)


def word_level_subdivision(phones_per_word, max_phoneme_num):
res = []
for l in phones_per_word:
if l <= max_phoneme_num:
res.append(l)
else:
s, r = l//max_phoneme_num, l % max_phoneme_num
res += [max_phoneme_num]*s + ([r] if r else [])
return res
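The helper only redraws word boundaries: the total number of phonemes is preserved and every resulting subword fits under the cap, which a quick check makes explicit (a sketch, not part of the commit):

from utils.tools import word_level_subdivision

for phones_per_word in ([1, 7, 8, 15, 21], [30], []):
    out = word_level_subdivision(phones_per_word, max_phoneme_num=7)
    assert sum(out) == sum(phones_per_word)   # phoneme count unchanged
    assert all(1 <= n <= 7 for n in out)      # every subword fits the cap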
