update Gaussian Upsampling and configs
keonlee9420 committed Jul 13, 2021
1 parent ba7715d commit 523ec24
Showing 6 changed files with 57 additions and 95 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -132,4 +132,5 @@ to serve TensorBoard on your localhost.
# References
- [ivanvovk's WaveGrad](https://github.com/ivanvovk/WaveGrad)
- [yanggeng1995's EATS](https://github.com/yanggeng1995/EATS)
- [ming024's FastSpeech2](https://github.com/ming024/FastSpeech2)
- [ming024's FastSpeech2](https://github.com/ming024/FastSpeech2)
- [mindslab-ai's wavegrad2](https://github.com/mindslab-ai/wavegrad2)
11 changes: 6 additions & 5 deletions config/LJSpeech/train.yaml
@@ -1,26 +1,27 @@
path:
ckpt_path: "./output/ckpt/LJSpeech_wg2_TEST_down______logging_phone"
log_path: "./output/log/LJSpeech_wg2_TEST_down______logging_phone"
result_path: "./output/result/LJSpeech_wg2_TEST_down______logging_phone"
ckpt_path: "./output/ckpt/LJSpeech"
log_path: "./output/log/LJSpeech"
result_path: "./output/result/LJSpeech"
noise_schedule_path: "./noise_schedule"
optimizer:
batch_size: 12
betas: [0.9, 0.98]
eps: 0.000000001
weight_decay: 0.0
weight_decay: 0.000001
grad_clip_thresh: 1.0
grad_acc_step: 1
warm_up_step: 4000
anneal_steps: [300000, 400000, 500000]
anneal_rate: 0.3
init_lr: 0.0003
step:
total_step: 900000
log_step: 100
synth_step: 1000
val_step: 1000
save_step: 50000
window:
segment_length: 19200 # [19200, 76800]
segment_length: 76800 # [19200, 76800]
noise_iter: 1000 # [6, 7, 12, 25, 50, 100, 1000]
# training_noise_schedule: {
# "n_iter": 1000,
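For orientation, a minimal sketch of reading the updated training config with PyYAML (keys as in the snippet above; this is not the repository's own loader):

```python
# Hedged sketch: load config/LJSpeech/train.yaml and read the values changed in
# this commit. Keys match the diff above; the repo's actual loading code may differ.
import yaml

with open("config/LJSpeech/train.yaml") as f:
    train_config = yaml.safe_load(f)

opt = train_config["optimizer"]
print(opt["init_lr"], opt["weight_decay"])   # 0.0003, 1e-06

window = train_config["window"]
print(window["segment_length"])              # 76800 audio samples per training window
print(window["noise_iter"])                  # 1000 noise iterations
```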
87 changes: 43 additions & 44 deletions model/modules.py
@@ -100,39 +100,38 @@ class GaussianUpsampling(nn.Module):

def __init__(self, model_config):
super(GaussianUpsampling, self).__init__()
# self.range_param_predictor = RangeParameterPredictor(model_config)
self.range_param_predictor = RangeParameterPredictor(model_config)

def get_alignment_energies(self, gaussian, frames):
"""
See https://github.com/mindslab-ai/wavegrad2
"""
energies = gaussian.log_prob(frames).exp() # [B, L, T]
return energies

def forward(self, encoder_outputs, duration, mask):
device = encoder_outputs.device

# range_param = self.range_param_predictor(encoder_outputs, duration, mask)

t = torch.sum(duration, dim=-1, keepdim=True) #[B, 1]

e = torch.cumsum(duration, dim=-1).float() #[B, L]
c = e - 0.5 * duration #[B, L]
t = torch.arange(1, torch.max(t).item()+1, device=device) # (1, ..., T)
t = t.unsqueeze(0).unsqueeze(1) #[1, 1, T]
c = c.unsqueeze(2)
s = self.range_param_predictor(encoder_outputs, duration, mask).unsqueeze(-1)

# print(range_param, 0.1*(range_param ** 2))

# w_1 = torch.exp(-0.1*(range_param.unsqueeze(-1) ** -2) * (t - c) ** 2) # [B, L, T]
# w_2 = torch.sum(torch.exp(-0.1*(range_param.unsqueeze(-1) ** -2) * (t - c) ** 2), dim=1, keepdim=True) # [B, 1, T]
w_1 = torch.exp(-0.1 * (t - c) ** 2) # [B, L, T]
w_2 = torch.sum(torch.exp(-0.1 * (t - c) ** 2), dim=1, keepdim=True) # [B, 1, T]
w_2[w_2==0.] = 1.
g = torch.distributions.normal.Normal(loc=c, scale=s)

# w_1 = self.normpdf(t, c, range_param.unsqueeze(-1)) # [B, L, T]
# w_1 = torch.distributions.normal.Normal(c, 0.1).log_prob(t) # [B, L, T]
# w_2 = torch.sum(w_1, dim=1, keepdim=True) # [B, 1, T]
# w_2[w_2==0.] = 1.
w = self.get_alignment_energies(g, t) # [B, L, T]

w = w_1 / w_2
if mask is not None:
w = w.masked_fill(mask.unsqueeze(-1), 0.0)

out = torch.matmul(w.transpose(1, 2), encoder_outputs)
attn = w / (torch.sum(w, dim=1).unsqueeze(1) + 1e-8) # [B, L, T]
out = torch.bmm(attn.transpose(1, 2), encoder_outputs)

return out, w
return out, attn
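The rewritten forward pass places a Normal distribution at each phoneme's duration midpoint (c = cumsum(d) − 0.5·d), uses the output of the re-enabled RangeParameterPredictor as its scale, and turns the resulting log-probabilities into a normalized attention map over frames. A self-contained sketch of the same computation, with a fixed scale standing in for the predicted range parameter and mask handling omitted:

```python
# Standalone sketch of the new Gaussian upsampling; shapes follow the diff above.
# The fixed scale of 1.0 is a placeholder for the predicted range parameter.
import torch

B, L, H = 2, 5, 8                                   # batch, phoneme count, hidden size
encoder_outputs = torch.randn(B, L, H)
duration = torch.randint(1, 4, (B, L)).float()      # frames assigned to each phoneme

T = int(duration.sum(dim=-1).max().item())          # longest total frame count in the batch
t = torch.arange(1, T + 1).float().view(1, 1, T)    # [1, 1, T] frame indices
c = (torch.cumsum(duration, dim=-1) - 0.5 * duration).unsqueeze(2)  # [B, L, 1] Gaussian centers
s = torch.full_like(c, 1.0)                         # [B, L, 1] fixed scale (predictor omitted)

g = torch.distributions.Normal(loc=c, scale=s)
w = g.log_prob(t).exp()                             # [B, L, T] alignment energies
attn = w / (w.sum(dim=1, keepdim=True) + 1e-8)      # normalize over phonemes
out = torch.bmm(attn.transpose(1, 2), encoder_outputs)  # [B, T, H] upsampled encoder features
```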


class DurationPredictor(nn.Module):
@@ -163,33 +162,33 @@ def forward(self, encoder_output, mask):
return duration_prediction


# class RangeParameterPredictor(nn.Module):
# """ Range Parameter Predictor """

# def __init__(self, model_config):
# super(RangeParameterPredictor, self).__init__()
# encoder_hidden = model_config["transformer"]["encoder_hidden"]
# variance_hidden = model_config["variance_predictor"]["variance_hidden"]

# self.range_param_lstm = nn.LSTM(
# encoder_hidden + 1,
# int(variance_hidden / 2), 2,
# batch_first=True, bidirectional=True
# )
# self.range_param_proj = nn.Sequential(
# LinearNorm(variance_hidden, 1),
# nn.Softplus(),
# )

# def forward(self, encoder_output, duration, mask):
# range_param_input = torch.cat([encoder_output, duration.unsqueeze(-1)], dim=-1)
# range_param_prediction, _ = self.range_param_lstm(range_param_input)
# range_param_prediction = self.range_param_proj(range_param_prediction)
# range_param_prediction = range_param_prediction.squeeze(-1) # [B, L]
# if mask is not None:
# range_param_prediction = range_param_prediction.masked_fill(mask, 0.0)

# return range_param_prediction
class RangeParameterPredictor(nn.Module):
""" Range Parameter Predictor """

def __init__(self, model_config):
super(RangeParameterPredictor, self).__init__()
encoder_hidden = model_config["transformer"]["encoder_hidden"]
variance_hidden = model_config["variance_predictor"]["variance_hidden"]

self.range_param_lstm = nn.LSTM(
encoder_hidden + 1,
int(variance_hidden / 2), 2,
batch_first=True, bidirectional=True
)
self.range_param_proj = nn.Sequential(
LinearNorm(variance_hidden, 1),
nn.Softplus(),
)

def forward(self, encoder_output, duration, mask):
range_param_input = torch.cat([encoder_output, duration.unsqueeze(-1)], dim=-1)
range_param_prediction, _ = self.range_param_lstm(range_param_input)
range_param_prediction = self.range_param_proj(range_param_prediction)
range_param_prediction = range_param_prediction.squeeze(-1) # [B, L]
if mask is not None:
range_param_prediction = range_param_prediction.masked_fill(mask, 0.0)

return range_param_prediction
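The Range Parameter Predictor (a bi-LSTM over encoder states concatenated with durations, followed by a Softplus projection) is back in use. A hypothetical call with placeholder hidden sizes, assuming the class above is importable (it also relies on the repository's LinearNorm):

```python
# Hypothetical usage of the re-enabled RangeParameterPredictor. The config values
# below are illustrative placeholders, not the repository's defaults.
import torch
from model.modules import RangeParameterPredictor  # assumed import path

model_config = {
    "transformer": {"encoder_hidden": 256},
    "variance_predictor": {"variance_hidden": 256},
}
predictor = RangeParameterPredictor(model_config)

B, L = 2, 5
encoder_output = torch.randn(B, L, 256)             # [B, L, encoder_hidden]
duration = torch.randint(1, 4, (B, L)).float()      # per-phoneme frame counts
mask = torch.zeros(B, L, dtype=torch.bool)          # True would mark padded phonemes

scale = predictor(encoder_output, duration, mask)   # [B, L], positive thanks to Softplus
```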


class SamplingWindow(nn.Module):
Expand Down
29 changes: 5 additions & 24 deletions model/optimizer.py
@@ -5,22 +5,18 @@
class ScheduledOptim:
""" A simple wrapper class for learning rate scheduling """

def __init__(self, model, train_config, model_config, current_step):
def __init__(self, model, train_config):

self._optimizer = torch.optim.Adam(
model.parameters(),
betas=train_config["optimizer"]["betas"],
eps=train_config["optimizer"]["eps"],
weight_decay=train_config["optimizer"]["weight_decay"],
)
self.n_warmup_steps = train_config["optimizer"]["warm_up_step"]
self.anneal_steps = train_config["optimizer"]["anneal_steps"]
self.anneal_rate = train_config["optimizer"]["anneal_rate"]
self.current_step = current_step
self.init_lr = np.power(model_config["transformer"]["encoder_hidden"], -0.5)
self.init_lr = train_config["optimizer"]["init_lr"]
self._init_learning_rate()

def step_and_update_lr(self):
self._update_learning_rate()
self._optimizer.step()

def zero_grad(self):
@@ -30,22 +26,7 @@ def zero_grad(self):
def load_state_dict(self, path):
self._optimizer.load_state_dict(path)

def _get_lr_scale(self):
lr = np.min(
[
np.power(self.current_step, -0.5),
np.power(self.n_warmup_steps, -1.5) * self.current_step,
]
)
for s in self.anneal_steps:
if self.current_step > s:
lr = lr * self.anneal_rate
return lr

def _update_learning_rate(self):
""" Learning rate scheduling per step """
self.current_step += 1
lr = self.init_lr * self._get_lr_scale()

def _init_learning_rate(self):
lr = self.init_lr
for param_group in self._optimizer.param_groups:
param_group["lr"] = lr
20 changes: 0 additions & 20 deletions preprocessor/preprocessor.py
@@ -23,7 +23,6 @@ def __init__(self, config):
def build_from_path(self):
os.makedirs((os.path.join(self.out_dir, "wav")), exist_ok=True)
os.makedirs((os.path.join(self.out_dir, "duration")), exist_ok=True)
# os.makedirs((os.path.join(self.out_dir, "duration_up")), exist_ok=True)

print("Processing Data ...")
out = list()
@@ -96,15 +95,6 @@ def process_utterance(self, speaker, basename):
int(self.sampling_rate * start) : int(self.sampling_rate * end)
].astype(np.float32)

# # A single frame padding
# if sum(duration_up) > wav.shape[0]:
# wav = np.concatenate((wav, np.array([0.])))
# elif sum(duration_up) < wav.shape[0]:
# wav = wav[:-1]

# assert sum(duration_up) == wav.shape[0], \
# "{} is wrongly processed in length!: {} != {}".format(basename, sum(duration_up), wav.shape[0])

# Read raw text
with open(text_path, "r") as f:
raw_text = f.readline().strip("\n")
@@ -113,9 +103,6 @@
dur_filename = "{}-duration-{}.npy".format(speaker, basename)
np.save(os.path.join(self.out_dir, "duration", dur_filename), duration)

# dur_up_filename = "{}-duration_up-{}.npy".format(speaker, basename)
# np.save(os.path.join(self.out_dir, "duration_up", dur_up_filename), duration_up)

wav_filename = "{}-wav-{}.wav".format(speaker, basename)
wavfile.write(
os.path.join(self.out_dir, "wav", wav_filename),
@@ -162,16 +149,9 @@ def get_alignment(self, tier):
- np.round(s * self.sampling_rate / self.hop_length)
)
)
# durations_up.append(
# int(
# np.round(e * self.sampling_rate)
# - np.round(s * self.sampling_rate)
# )
# )

# Trim tailing silences
phones = phones[:end_idx]
durations = durations[:end_idx]
# durations_up = durations_up[:end_idx]

return phones, durations, durations_up, start_time, end_time
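The duration arithmetic that survives in get_alignment converts each phone's TextGrid interval into a count of hop-sized frames. A small worked example with assumed sampling rate and hop length (not necessarily the repo's values):

```python
# Worked example of the frame-duration formula kept above:
# frames = round(end * sr / hop) - round(start * sr / hop).
# The sampling rate, hop length, and interval times are assumed for illustration.
import numpy as np

sampling_rate, hop_length = 22050, 256
intervals = [(0.00, 0.12), (0.12, 0.31), (0.31, 0.45)]   # (start, end) seconds per phone

durations = [
    int(np.round(e * sampling_rate / hop_length) - np.round(s * sampling_rate / hop_length))
    for s, e in intervals
]
print(durations)  # [10, 17, 12]
```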
2 changes: 1 addition & 1 deletion utils/model.py
@@ -20,7 +20,7 @@ def get_model(args, configs, device, train=False):

if train:
scheduled_optim = ScheduledOptim(
model, train_config, model_config, args.restore_step
model, train_config
)
if args.restore_step:
scheduled_optim.load_state_dict(ckpt["optimizer"])
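A hypothetical sketch of the updated restore path, reusing the model and train_config placeholders from the optimizer sketch above; the checkpoint directory follows the new config and the filename pattern is an assumption:

```python
# Hedged sketch: ScheduledOptim now takes only (model, train_config), and the Adam
# state is reloaded from the checkpoint when resuming from args.restore_step.
import torch
from model.optimizer import ScheduledOptim  # assumed import path

restore_step = 50000
ckpt_path = f"./output/ckpt/LJSpeech/{restore_step}.pth.tar"  # filename pattern assumed
ckpt = torch.load(ckpt_path, map_location="cpu")

scheduled_optim = ScheduledOptim(model, train_config)  # new two-argument signature
scheduled_optim.load_state_dict(ckpt["optimizer"])     # resume the Adam state
```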
