update Gaussian Upsampling and configs
keonlee9420 committed Jul 13, 2021
1 parent ba7715d commit 523ec24
Showing 6 changed files with 57 additions and 95 deletions.
3 changes: 2 additions & 1 deletion README.md
@@ -132,4 +132,5 @@ to serve TensorBoard on your localhost.
# References
- [ivanvovk's WaveGrad](https://github.com/ivanvovk/WaveGrad)
- [yanggeng1995's EATS](https://github.com/yanggeng1995/EATS)
- [ming024's FastSpeech2](https://github.com/ming024/FastSpeech2)
- [ming024's FastSpeech2](https://github.com/ming024/FastSpeech2)
- [mindslab-ai's wavegrad2](https://github.com/mindslab-ai/wavegrad2)
11 changes: 6 additions & 5 deletions config/LJSpeech/train.yaml
@@ -1,26 +1,27 @@
path:
ckpt_path: "./output/ckpt/LJSpeech_wg2_TEST_down______logging_phone"
log_path: "./output/log/LJSpeech_wg2_TEST_down______logging_phone"
result_path: "./output/result/LJSpeech_wg2_TEST_down______logging_phone"
ckpt_path: "./output/ckpt/LJSpeech"
log_path: "./output/log/LJSpeech"
result_path: "./output/result/LJSpeech"
noise_schedule_path: "./noise_schedule"
optimizer:
batch_size: 12
betas: [0.9, 0.98]
eps: 0.000000001
weight_decay: 0.0
weight_decay: 0.000001
grad_clip_thresh: 1.0
grad_acc_step: 1
warm_up_step: 4000
anneal_steps: [300000, 400000, 500000]
anneal_rate: 0.3
init_lr: 0.0003
step:
total_step: 900000
log_step: 100
synth_step: 1000
val_step: 1000
save_step: 50000
window:
segment_length: 19200 # [19200, 76800]
segment_length: 76800 # [19200, 76800]
noise_iter: 1000 # [6, 7, 12, 25, 50, 100, 1000]
# training_noise_schedule: {
# "n_iter": 1000,
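For orientation, a minimal sketch of reading the updated training config with PyYAML (keys as in the snippet above; this is not the repository's own loader):

```python
# Hedged sketch: load config/LJSpeech/train.yaml and read the values changed in
# this commit. Keys match the diff above; the repo's actual loading code may differ.
import yaml

with open("config/LJSpeech/train.yaml") as f:
    train_config = yaml.safe_load(f)

opt = train_config["optimizer"]
print(opt["init_lr"], opt["weight_decay"])   # 0.0003, 1e-06

window = train_config["window"]
print(window["segment_length"])              # 76800 audio samples per training window
print(window["noise_iter"])                  # 1000 noise iterations
```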
87 changes: 43 additions & 44 deletions model/modules.py
@@ -100,39 +100,38 @@ class GaussianUpsampling(nn.Module):

def __init__(self, model_config):
super(GaussianUpsampling, self).__init__()
# self.range_param_predictor = RangeParameterPredictor(model_config)
self.range_param_predictor = RangeParameterPredictor(model_config)

def get_alignment_energies(self, gaussian, frames):
"""
See https://github.com/mindslab-ai/wavegrad2
"""
energies = gaussian.log_prob(frames).exp() # [B, L, T]
return energies

def forward(self, encoder_outputs, duration, mask):
device = encoder_outputs.device

# range_param = self.range_param_predictor(encoder_outputs, duration, mask)

t = torch.sum(duration, dim=-1, keepdim=True) #[B, 1]

e = torch.cumsum(duration, dim=-1).float() #[B, L]
c = e - 0.5 * duration #[B, L]
t = torch.arange(1, torch.max(t).item()+1, device=device) # (1, ..., T)
t = t.unsqueeze(0).unsqueeze(1) #[1, 1, T]
c = c.unsqueeze(2)
s = self.range_param_predictor(encoder_outputs, duration, mask).unsqueeze(-1)

# print(range_param, 0.1*(range_param ** 2))

# w_1 = torch.exp(-0.1*(range_param.unsqueeze(-1) ** -2) * (t - c) ** 2) # [B, L, T]
# w_2 = torch.sum(torch.exp(-0.1*(range_param.unsqueeze(-1) ** -2) * (t - c) ** 2), dim=1, keepdim=True) # [B, 1, T]
w_1 = torch.exp(-0.1 * (t - c) ** 2) # [B, L, T]
w_2 = torch.sum(torch.exp(-0.1 * (t - c) ** 2), dim=1, keepdim=True) # [B, 1, T]
w_2[w_2==0.] = 1.
g = torch.distributions.normal.Normal(loc=c, scale=s)

# w_1 = self.normpdf(t, c, range_param.unsqueeze(-1)) # [B, L, T]
# w_1 = torch.distributions.normal.Normal(c, 0.1).log_prob(t) # [B, L, T]
# w_2 = torch.sum(w_1, dim=1, keepdim=True) # [B, 1, T]
# w_2[w_2==0.] = 1.
w = self.get_alignment_energies(g, t) # [B, L, T]

w = w_1 / w_2
if mask is not None:
w = w.masked_fill(mask.unsqueeze(-1), 0.0)

out = torch.matmul(w.transpose(1, 2), encoder_outputs)
attn = w / (torch.sum(w, dim=1).unsqueeze(1) + 1e-8) # [B, L, T]
out = torch.bmm(attn.transpose(1, 2), encoder_outputs)

return out, w
return out, attn
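The rewritten forward pass places a Normal distribution at each phoneme's duration midpoint (c = cumsum(d) − 0.5·d), uses the output of the re-enabled RangeParameterPredictor as its scale, and turns the resulting log-probabilities into a normalized attention map over frames. A self-contained sketch of the same computation, with a fixed scale standing in for the predicted range parameter and mask handling omitted:

```python
# Standalone sketch of the new Gaussian upsampling; shapes follow the diff above.
# The fixed scale of 1.0 is a placeholder for the predicted range parameter.
import torch

B, L, H = 2, 5, 8                                   # batch, phoneme count, hidden size
encoder_outputs = torch.randn(B, L, H)
duration = torch.randint(1, 4, (B, L)).float()      # frames assigned to each phoneme

T = int(duration.sum(dim=-1).max().item())          # longest total frame count in the batch
t = torch.arange(1, T + 1).float().view(1, 1, T)    # [1, 1, T] frame indices
c = (torch.cumsum(duration, dim=-1) - 0.5 * duration).unsqueeze(2)  # [B, L, 1] Gaussian centers
s = torch.full_like(c, 1.0)                         # [B, L, 1] fixed scale (predictor omitted)

g = torch.distributions.Normal(loc=c, scale=s)
w = g.log_prob(t).exp()                             # [B, L, T] alignment energies
attn = w / (w.sum(dim=1, keepdim=True) + 1e-8)      # normalize over phonemes
out = torch.bmm(attn.transpose(1, 2), encoder_outputs)  # [B, T, H] upsampled encoder features
```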


class DurationPredictor(nn.Module):
@@ -163,33 +162,33 @@ def forward(self, encoder_output, mask):
return duration_prediction


# class RangeParameterPredictor(nn.Module):
# """ Range Parameter Predictor """

# def __init__(self, model_config):
# super(RangeParameterPredictor, self).__init__()
# encoder_hidden = model_config["transformer"]["encoder_hidden"]
# variance_hidden = model_config["variance_predictor"]["variance_hidden"]

# self.range_param_lstm = nn.LSTM(
# encoder_hidden + 1,
# int(variance_hidden / 2), 2,
# batch_first=True, bidirectional=True
# )
# self.range_param_proj = nn.Sequential(
# LinearNorm(variance_hidden, 1),
# nn.Softplus(),
# )

# def forward(self, encoder_output, duration, mask):
# range_param_input = torch.cat([encoder_output, duration.unsqueeze(-1)], dim=-1)
# range_param_prediction, _ = self.range_param_lstm(range_param_input)
# range_param_prediction = self.range_param_proj(range_param_prediction)
# range_param_prediction = range_param_prediction.squeeze(-1) # [B, L]
# if mask is not None:
# range_param_prediction = range_param_prediction.masked_fill(mask, 0.0)

# return range_param_prediction
class RangeParameterPredictor(nn.Module):
""" Range Parameter Predictor """

def __init__(self, model_config):
super(RangeParameterPredictor, self).__init__()
encoder_hidden = model_config["transformer"]["encoder_hidden"]
variance_hidden = model_config["variance_predictor"]["variance_hidden"]

self.range_param_lstm = nn.LSTM(
encoder_hidden + 1,
int(variance_hidden / 2), 2,
batch_first=True, bidirectional=True
)
self.range_param_proj = nn.Sequential(
LinearNorm(variance_hidden, 1),
nn.Softplus(),
)

def forward(self, encoder_output, duration, mask):
range_param_input = torch.cat([encoder_output, duration.unsqueeze(-1)], dim=-1)
range_param_prediction, _ = self.range_param_lstm(range_param_input)
range_param_prediction = self.range_param_proj(range_param_prediction)
range_param_prediction = range_param_prediction.squeeze(-1) # [B, L]
if mask is not None:
range_param_prediction = range_param_prediction.masked_fill(mask, 0.0)

return range_param_prediction
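The Range Parameter Predictor (a bi-LSTM over encoder states concatenated with durations, followed by a Softplus projection) is back in use. A hypothetical call with placeholder hidden sizes, assuming the class above is importable (it also relies on the repository's LinearNorm):

```python
# Hypothetical usage of the re-enabled RangeParameterPredictor. The config values
# below are illustrative placeholders, not the repository's defaults.
import torch
from model.modules import RangeParameterPredictor  # assumed import path

model_config = {
    "transformer": {"encoder_hidden": 256},
    "variance_predictor": {"variance_hidden": 256},
}
predictor = RangeParameterPredictor(model_config)

B, L = 2, 5
encoder_output = torch.randn(B, L, 256)             # [B, L, encoder_hidden]
duration = torch.randint(1, 4, (B, L)).float()      # per-phoneme frame counts
mask = torch.zeros(B, L, dtype=torch.bool)          # True would mark padded phonemes

scale = predictor(encoder_output, duration, mask)   # [B, L], positive thanks to Softplus
```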


class SamplingWindow(nn.Module):
Expand Down
29 changes: 5 additions & 24 deletions model/optimizer.py
@@ -5,22 +5,18 @@
class ScheduledOptim:
""" A simple wrapper class for learning rate scheduling """

def __init__(self, model, train_config, model_config, current_step):
def __init__(self, model, train_config):

self._optimizer = torch.optim.Adam(
model.parameters(),
betas=train_config["optimizer"]["betas"],
eps=train_config["optimizer"]["eps"],
weight_decay=train_config["optimizer"]["weight_decay"],
)
self.n_warmup_steps = train_config["optimizer"]["warm_up_step"]
self.anneal_steps = train_config["optimizer"]["anneal_steps"]
self.anneal_rate = train_config["optimizer"]["anneal_rate"]
self.current_step = current_step
self.init_lr = np.power(model_config["transformer"]["encoder_hidden"], -0.5)
self.init_lr = train_config["optimizer"]["init_lr"]
self._init_learning_rate()

def step_and_update_lr(self):
self._update_learning_rate()
self._optimizer.step()

def zero_grad(self):
@@ -30,22 +26,7 @@ def zero_grad(self):
def load_state_dict(self, path):
self._optimizer.load_state_dict(path)

def _get_lr_scale(self):
lr = np.min(
[
np.power(self.current_step, -0.5),
np.power(self.n_warmup_steps, -1.5) * self.current_step,
]
)
for s in self.anneal_steps:
if self.current_step > s:
lr = lr * self.anneal_rate
return lr

def _update_learning_rate(self):
""" Learning rate scheduling per step """
self.current_step += 1
lr = self.init_lr * self._get_lr_scale()

def _init_learning_rate(self):
lr = self.init_lr
for param_group in self._optimizer.param_groups:
param_group["lr"] = lr
20 changes: 0 additions & 20 deletions preprocessor/preprocessor.py
@@ -23,7 +23,6 @@ def __init__(self, config):
def build_from_path(self):
os.makedirs((os.path.join(self.out_dir, "wav")), exist_ok=True)
os.makedirs((os.path.join(self.out_dir, "duration")), exist_ok=True)
# os.makedirs((os.path.join(self.out_dir, "duration_up")), exist_ok=True)

print("Processing Data ...")
out = list()
@@ -96,15 +95,6 @@ def process_utterance(self, speaker, basename):
int(self.sampling_rate * start) : int(self.sampling_rate * end)
].astype(np.float32)

# # A single frame padding
# if sum(duration_up) > wav.shape[0]:
# wav = np.concatenate((wav, np.array([0.])))
# elif sum(duration_up) < wav.shape[0]:
# wav = wav[:-1]

# assert sum(duration_up) == wav.shape[0], \
# "{} is wrongly processed in length!: {} != {}".format(basename, sum(duration_up), wav.shape[0])

# Read raw text
with open(text_path, "r") as f:
raw_text = f.readline().strip("\n")
@@ -113,9 +103,6 @@
dur_filename = "{}-duration-{}.npy".format(speaker, basename)
np.save(os.path.join(self.out_dir, "duration", dur_filename), duration)

# dur_up_filename = "{}-duration_up-{}.npy".format(speaker, basename)
# np.save(os.path.join(self.out_dir, "duration_up", dur_up_filename), duration_up)

wav_filename = "{}-wav-{}.wav".format(speaker, basename)
wavfile.write(
os.path.join(self.out_dir, "wav", wav_filename),
@@ -162,16 +149,9 @@ def get_alignment(self, tier):
- np.round(s * self.sampling_rate / self.hop_length)
)
)
# durations_up.append(
# int(
# np.round(e * self.sampling_rate)
# - np.round(s * self.sampling_rate)
# )
# )

# Trim tailing silences
phones = phones[:end_idx]
durations = durations[:end_idx]
# durations_up = durations_up[:end_idx]

return phones, durations, durations_up, start_time, end_time
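The duration arithmetic that survives in get_alignment converts each phone's TextGrid interval into a count of hop-sized frames. A small worked example with assumed sampling rate and hop length (not necessarily the repo's values):

```python
# Worked example of the frame-duration formula kept above:
# frames = round(end * sr / hop) - round(start * sr / hop).
# The sampling rate, hop length, and interval times are assumed for illustration.
import numpy as np

sampling_rate, hop_length = 22050, 256
intervals = [(0.00, 0.12), (0.12, 0.31), (0.31, 0.45)]   # (start, end) seconds per phone

durations = [
    int(np.round(e * sampling_rate / hop_length) - np.round(s * sampling_rate / hop_length))
    for s, e in intervals
]
print(durations)  # [10, 17, 12]
```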
2 changes: 1 addition & 1 deletion utils/model.py
@@ -20,7 +20,7 @@ def get_model(args, configs, device, train=False):

if train:
scheduled_optim = ScheduledOptim(
model, train_config, model_config, args.restore_step
model, train_config
)
if args.restore_step:
scheduled_optim.load_state_dict(ckpt["optimizer"])
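A hypothetical sketch of the updated restore path, reusing the model and train_config placeholders from the optimizer sketch above; the checkpoint directory follows the new config and the filename pattern is an assumption:

```python
# Hedged sketch: ScheduledOptim now takes only (model, train_config), and the Adam
# state is reloaded from the checkpoint when resuming from args.restore_step.
import torch
from model.optimizer import ScheduledOptim  # assumed import path

restore_step = 50000
ckpt_path = f"./output/ckpt/LJSpeech/{restore_step}.pth.tar"  # filename pattern assumed
ckpt = torch.load(ckpt_path, map_location="cpu")

scheduled_optim = ScheduledOptim(model, train_config)  # new two-argument signature
scheduled_optim.load_state_dict(ckpt["optimizer"])     # resume the Adam state
```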
