Skip to content

Commit

Permalink
train.py: using amp for mixed precision training
Browse files Browse the repository at this point in the history
  • Loading branch information
rafaelvalle committed Apr 3, 2019
1 parent bb20035 commit 0274619
Showing 1 changed file with 16 additions and 22 deletions.
38 changes: 16 additions & 22 deletions train.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,24 +10,13 @@
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data import DataLoader

from fp16_optimizer import FP16_Optimizer

from model import Tacotron2
from data_utils import TextMelLoader, TextMelCollate
from loss_function import Tacotron2Loss
from logger import Tacotron2Logger
from hparams import create_hparams


def batchnorm_to_float(module):
"""Converts batch norm modules to FP32"""
if isinstance(module, torch.nn.modules.batchnorm._BatchNorm):
module.float()
for child in module.children():
batchnorm_to_float(child)
return module


def reduce_tensor(tensor, n_gpus):
rt = tensor.clone()
dist.all_reduce(rt, op=dist.reduce_op.SUM)
Expand Down Expand Up @@ -80,8 +69,7 @@ def prepare_directories_and_logger(output_directory, log_directory, rank):
def load_model(hparams):
model = Tacotron2(hparams).cuda()
if hparams.fp16_run:
model = batchnorm_to_float(model.half())
model.decoder.attention_layer.score_mask_value = float(finfo('float16').min)
model.decoder.attention_layer.score_mask_value = finfo('float16').min

if hparams.distributed_run:
model = apply_gradient_allreduce(model)
Expand Down Expand Up @@ -177,9 +165,11 @@ def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
learning_rate = hparams.learning_rate
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
weight_decay=hparams.weight_decay)

if hparams.fp16_run:
optimizer = FP16_Optimizer(
optimizer, dynamic_loss_scale=hparams.dynamic_loss_scaling)
from apex import amp
model, optimizer = amp.initialize(
model, optimizer, opt_level='O2')

if hparams.distributed_run:
model = apply_gradient_allreduce(model)
Expand Down Expand Up @@ -207,6 +197,7 @@ def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
epoch_offset = max(0, int(iteration / len(train_loader)))

model.train()
is_overflow = False
# ================ MAIN TRAINNIG LOOP! ===================
for epoch in range(epoch_offset, hparams.epochs):
print("Epoch: {}".format(epoch))
Expand All @@ -224,27 +215,30 @@ def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
reduced_loss = reduce_tensor(loss.data, n_gpus).item()
else:
reduced_loss = loss.item()

if hparams.fp16_run:
optimizer.backward(loss)
grad_norm = optimizer.clip_fp32_grads(hparams.grad_clip_thresh)
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
else:
loss.backward()

if hparams.fp16_run:
grad_norm = torch.nn.utils.clip_grad_norm_(
amp.master_params(optimizer), hparams.grad_clip_thresh)
is_overflow = math.isnan(grad_norm)
else:
grad_norm = torch.nn.utils.clip_grad_norm_(
model.parameters(), hparams.grad_clip_thresh)

optimizer.step()

overflow = optimizer.overflow if hparams.fp16_run else False

if not overflow and not math.isnan(reduced_loss) and rank == 0:
if not is_overflow and rank == 0:
duration = time.perf_counter() - start
print("Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
iteration, reduced_loss, grad_norm, duration))
logger.log_training(
reduced_loss, grad_norm, learning_rate, duration, iteration)

if not overflow and (iteration % hparams.iters_per_checkpoint == 0):
if not is_overflow and (iteration % hparams.iters_per_checkpoint == 0):
validate(model, criterion, valset, iteration,
hparams.batch_size, n_gpus, collate_fn, logger,
hparams.distributed_run, rank)
Expand Down

0 comments on commit 0274619

Please sign in to comment.