Merge branch 'master' into v.0.6.0

iamanigeeit · Nov 15, 2019 · 5d4a838 · 5d4a838
2 parents bd15d88 + f10be24
commit 5d4a838
Show file tree

Hide file tree

Showing 19 changed files with 126 additions and 30 deletions.
diff --git a/.gitignore b/.gitignore
@@ -41,6 +41,7 @@ egs/*/*/wav*
 tools/bin
 tools/include
 tools/lib
+tools/lib64
 tools/bats-core
 tools/chainer_ctc/
 tools/kaldi*

diff --git a/README.md b/README.md
@@ -385,9 +385,9 @@ We list the character error rate (CER) and word error rate (WER) of major ASR ta
 | Aishell test           | 6.7     | N/A     | same as above                                                                                                                                                         |
 | Common Voice dev       | 1.7     | 2.2     | [link](https://github.com/espnet/espnet/blob/master/egs/commonvoice/asr1/RESULTS.md#first-results-default-pytorch-transformer-setting-with-bpe-100-epochs-single-gpu) |
 | Common Voice test      | 1.8     | 2.3     | same as above                                                                                                                                                         |
-| CSJ eval1              | 5.7     | N/A     | N/A                                                                                                                                                                   |
-| CSJ eval2              | 4.1     | N/A     | N/A                                                                                                                                                                   |
-| CSJ eval3              | 4.5     | N/A     | N/A                                                                                                                                                                   |
+| CSJ eval1              | 5.7     | N/A     | [link](https://github.com/espnet/espnet/blob/master/egs/csj/asr1/RESULTS.md#pytorch-backend-transformer-without-any-hyperparameter-tuning)                            |
+| CSJ eval2              | 3.8     | N/A     | same as above                                                                                                                                                         |
+| CSJ eval3              | 4.2     | N/A     | same as above                                                                                                                                                         |
 | HKUST dev              | 23.5    | N/A     | [link](https://github.com/espnet/espnet/blob/master/egs/hkust/asr1/RESULTS.md#transformer-only-20-epochs)                                                             |
 | Librispeech dev_clean  | N/A     | 2.2     | [link](https://github.com/espnet/espnet/blob/master/egs/librispeech/asr1/RESULTS.md#pytorch-large-transformer-with-specaug-4-gpus--large-lm)                          |
 | Librispeech dev_other  | N/A     | 5.6     | same as above                                                                                                                                                         |
@@ -425,6 +425,7 @@ Available pretrained models in the demo script are listed as below.
 | [tedlium3.transformer.v1](https://drive.google.com/open?id=1wYYTwgvbB7uy6agHywhQfnuVWWW_obmO)    | Joint-CTC attention Transformer trained on Tedlium 3       |
 | [librispeech.transformer.v1](https://drive.google.com/open?id=1BtQvAnsFvVi-dp_qsaFP7n4A_5cwnlR6) | Joint-CTC attention Transformer trained on Librispeech     |
 | [commonvoice.transformer.v1](https://drive.google.com/open?id=1tWccl6aYU67kbtkm8jv5H6xayqg1rzjh) | Joint-CTC attention Transformer trained on CommonVoice     |
+| [csj.transformer.v1](https://drive.google.com/open?id=120nUQcSsKeY5dpyMWw_kI33ooMRGT2uF)         | Joint-CTC attention Transformer trained on CSJ             |
 
 
 ### TTS results

diff --git a/egs/csj/asr1/RESULTS.md b/egs/csj/asr1/RESULTS.md
@@ -1,10 +1,26 @@
 # Transformer results
 ## Pytorch backend Transformer without any hyperparameter tuning
-|dataset| Snt | Wrd| Corr | Sub | Del | Ins | Err | S.Err|
-|---|---|---|---|---|---|---|---|---|
-|exp/train_nodup_sp_pytorch_transformer_conv2d_e12_unit2048_d6_unit2048_aheads4_dim256_mtlalpha0.3_noam_sampprob0.0_ngpu2_bs32_lr10.0_warmup25000_mli512_mlo150_epochs50_accum2_lennormFalse_lsmunigram0.1/decode_eval1_beam20_emodel.last10.avg.best_p0.0_len0.0-0.0_ctcw0.3_rnnlm0.3_2layer_unit650_sgd_bs256/|1272|43897|95.1|3.1|1.8|0.8|5.7|52.9|
-|exp/train_nodup_sp_pytorch_transformer_conv2d_e12_unit2048_d6_unit2048_aheads4_dim256_mtlalpha0.3_noam_sampprob0.0_ngpu2_bs32_lr10.0_warmup25000_mli512_mlo150_epochs50_accum2_lennormFalse_lsmunigram0.1/decode_eval2_beam20_emodel.last10.avg.best_p0.0_len0.0-0.0_ctcw0.3_rnnlm0.3_2layer_unit650_sgd_bs256/|1292|43623|96.4|2.4|1.2|0.5|4.1|51.9|
-|exp/train_nodup_sp_pytorch_transformer_conv2d_e12_unit2048_d6_unit2048_aheads4_dim256_mtlalpha0.3_noam_sampprob0.0_ngpu2_bs32_lr10.0_warmup25000_mli512_mlo150_epochs50_accum2_lennormFalse_lsmunigram0.1/decode_eval3_beam20_emodel.last10.avg.best_p0.0_len0.0-0.0_ctcw0.3_rnnlm0.3_2layer_unit650_sgd_bs256/|1385|28225|96.3|2.5|1.2|0.8|4.5|36.6|
+  - Model files (archived to transformer.v1.tar.gz by `$ pack_model.sh`)
+    - model link: https://drive.google.com/open?id=120nUQcSsKeY5dpyMWw_kI33ooMRGT2uF
+    - training config file: `conf/train.yaml`
+    - decoding config file: `conf/decode.yaml`
+    - cmvn file: `data/train_nodup_sp/cmvn.ark`
+    - e2e file: `exp/train_nodup_sp_pytorch_train/results/model.acc.best`
+    - e2e JSON file: `exp/train_nodup_sp_pytorch_train/results/model.json`
+    - lm file: `exp/train_rnnlm_pytorch_lm/rnnlm.model.best`
+    - lm JSON file: `exp/train_rnnlm_pytorch_lm/model.json`
+  - Results (paste them in yourself, or obtain them with `$ pack_model.sh --results <results>`)
+```
+exp/train_nodup_sp_pytorch_train/decode_eval1_decode_lm/result.txt
+     | SPKR     | # Snt  # Wrd | Corr    Sub    Del    Ins    Err  S.Err |
+     | Sum/Avg  | 1272   43897 | 95.1    3.1    1.7    0.8    5.7   53.5 |
+exp/train_nodup_sp_pytorch_train/decode_eval2_decode_lm/result.txt
+     | SPKR     | # Snt  # Wrd | Corr    Sub    Del    Ins    Err  S.Err |
+     | Sum/Avg  | 1292   43623 | 96.7    2.1    1.1    0.5    3.8   49.6 |
+exp/train_nodup_sp_pytorch_train/decode_eval3_decode_lm/result.txt
+     | SPKR     | # Snt  # Wrd | Corr    Sub    Del    Ins    Err  S.Err |
+     | Sum/Avg  | 1385   28225 | 96.6    2.3    1.1    0.8    4.2   35.9 |
+```
 
 # RNN results
 ## Deep VGGBLSTM with pytorch backend + Dropout + Speed perturbation + CTC joint decoding + LM rescoring

diff --git a/espnet/asr/chainer_backend/asr.py b/espnet/asr/chainer_backend/asr.py
@@ -119,7 +119,7 @@ def train(args):
         for gid in six.moves.xrange(1, ngpu):
             devices['sub_%d' % gid] = gid
         logging.info('multi gpu calculation (#gpus = %d).' % ngpu)
-        logging.info('batch size is automatically increased (%d -> %d)' % (
+        logging.warning('batch size is automatically increased (%d -> %d)' % (
             args.batch_size, args.batch_size * args.ngpu))
     else:
         gpu_id = -1

diff --git a/espnet/asr/pytorch_backend/asr.py b/espnet/asr/pytorch_backend/asr.py
@@ -392,7 +392,7 @@ def train(args):
     # check the use of multi-gpu
     if args.ngpu > 1:
         if args.batch_size != 0:
-            logging.info('batch size is automatically increased (%d -> %d)' % (
+            logging.warning('batch size is automatically increased (%d -> %d)' % (
                 args.batch_size, args.batch_size * args.ngpu))
             args.batch_size *= args.ngpu
         if args.num_encs > 1:

diff --git a/espnet/asr/pytorch_backend/asr_mix.py b/espnet/asr/pytorch_backend/asr_mix.py
@@ -177,7 +177,7 @@ def train(args):
     # check the use of multi-gpu
     if args.ngpu > 1:
         if args.batch_size != 0:
-            logging.info('batch size is automatically increased (%d -> %d)' % (
+            logging.warning('batch size is automatically increased (%d -> %d)' % (
                 args.batch_size, args.batch_size * args.ngpu))
             args.batch_size *= args.ngpu
 

diff --git a/espnet/bin/asr_train.py b/espnet/bin/asr_train.py
@@ -4,7 +4,7 @@
 # Copyright 2017 Tomoki Hayashi (Nagoya University)
 #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
 
-"""End-to-end speech recognition model training script."""
+"""Automatic speech recognition model training script."""
 
 import logging
 import multiprocessing as mp
@@ -13,12 +13,17 @@
 import subprocess
 import sys
 
+from distutils.version import LooseVersion
+
 import configargparse
 import numpy as np
+import torch
 
 from espnet.utils.cli_utils import strtobool
 from espnet.utils.training.batchfy import BATCH_COUNT_CHOICES
 
+is_torch_1_2_plus = LooseVersion(torch.__version__) >= LooseVersion('1.2')
+
 
 # NOTE: you need this func to generate our sphinx doc
 def get_parser(parser=None, required=True):
@@ -318,6 +323,9 @@ def main(cmd_args):
             else:
                 ngpu = len(p.stderr.decode().split('\n')) - 1
     else:
+        if is_torch_1_2_plus:
+            assert args.ngpu == 1, "There are some bugs with multi-GPU processing in PyTorch 1.2+" \
+                                   " (see https://github.com/pytorch/pytorch/issues/21108)"
         ngpu = args.ngpu
     logging.info(f"ngpu: {ngpu}")
 

diff --git a/espnet/mt/pytorch_backend/mt.py b/espnet/mt/pytorch_backend/mt.py
@@ -142,7 +142,7 @@ def train(args):
     # check the use of multi-gpu
     if args.ngpu > 1:
         if args.batch_size != 0:
-            logging.info('batch size is automatically increased (%d -> %d)' % (
+            logging.warning('batch size is automatically increased (%d -> %d)' % (
                 args.batch_size, args.batch_size * args.ngpu))
             args.batch_size *= args.ngpu
 

diff --git a/espnet/nets/pytorch_backend/ctc.py b/espnet/nets/pytorch_backend/ctc.py
@@ -40,7 +40,10 @@ def __init__(self, odim, eprojs, dropout_rate, ctc_type='warpctc', reduce=True):
     def loss_fn(self, th_pred, th_target, th_ilen, th_olen):
         if self.ctc_type == 'builtin':
             th_pred = th_pred.log_softmax(2)
-            loss = self.ctc_loss(th_pred, th_target, th_ilen, th_olen)
+            # Use the deterministic CuDNN implementation of CTC loss to avoid
+            #  [issue#17798](https://github.com/pytorch/pytorch/issues/17798)
+            with torch.backends.cudnn.flags(deterministic=True):
+                loss = self.ctc_loss(th_pred, th_target, th_ilen, th_olen)
             # Batch-size average
             loss = loss / th_pred.size(1)
             return loss
@@ -83,6 +86,9 @@ def forward(self, hs_pad, hlens, ys_pad):
         if self.ctc_type == "warpctc":
             # warpctc only supports float32
             ys_hat = ys_hat.to(dtype=torch.float32)
+        else:
+            # use GPU when using the cuDNN implementation
+            ys_true = to_device(self, ys_true)
         self.loss = to_device(self, self.loss_fn(ys_hat, ys_true, hlens, olens)).to(dtype=dtype)
         if self.reduce:
             # NOTE: sum() is needed to keep consistency since warpctc return as tensor w/ shape (1,)

diff --git a/espnet/nets/pytorch_backend/e2e_asr_transducer.py b/espnet/nets/pytorch_backend/e2e_asr_transducer.py
@@ -100,20 +100,22 @@ def add_arguments(parser):
         # prediction
         group.add_argument('--dec-embed-dim', default=320, type=int,
                            help='Number of decoder embeddings dimensions')
-        parser.add_argument('--dropout-rate-embed-decoder', default=0.0, type=float,
-                            help='Dropout rate for the decoder embeddings')
+        group.add_argument('--dropout-rate-embed-decoder', default=0.0, type=float,
+                           help='Dropout rate for the decoder embeddings')
         # general
         group.add_argument('--rnnt_type', default='warp-transducer', type=str,
                            choices=['warp-transducer'],
                            help='Type of transducer implementation to calculate loss.')
-        parser.add_argument('--rnnt-mode', default='rnnt', type=str, choices=['rnnt', 'rnnt-att'],
-                            help='RNN-Transducing mode')
-        parser.add_argument('--joint-dim', default=320, type=int,
-                            help='Number of dimensions in joint space')
+        group.add_argument('--rnnt-mode', default='rnnt', type=str, choices=['rnnt', 'rnnt-att'],
+                           help='RNN-Transducing mode')
+        group.add_argument('--joint-dim', default=320, type=int,
+                           help='Number of dimensions in joint space')
         # decoding
-        parser.add_argument('--score-norm-transducer', type=strtobool, nargs='?',
-                            default=True,
-                            help='Normalize transducer scores by length')
+        group.add_argument('--score-norm-transducer', type=strtobool, nargs='?',
+                           default=True,
+                           help='Normalize transducer scores by length')
+
+        return parser
 
     def __init__(self, idim, odim, args):
         """Initialize transducer modules.

diff --git a/espnet/nets/pytorch_backend/e2e_tts_fastspeech.py b/espnet/nets/pytorch_backend/e2e_tts_fastspeech.py
@@ -501,6 +501,14 @@ def _integrate_with_spk_embed(self, hs, spembs):
     def _source_mask(self, ilens):
         """Make masks for self-attention.
 
+        Args:
+            ilens (LongTensor or List): Batch of lengths (B,).
+
+        Returns:
+            Tensor: Mask tensor for self-attention.
+                    dtype=torch.uint8 in PyTorch 1.2-
+                    dtype=torch.bool in PyTorch 1.2+ (including 1.2)
+
         Examples:
             >>> ilens = [5, 3]
             >>> self._source_mask(ilens)

diff --git a/espnet/nets/pytorch_backend/e2e_tts_tacotron2.py b/espnet/nets/pytorch_backend/e2e_tts_tacotron2.py
@@ -122,6 +122,15 @@ def _make_guided_attention_mask(ilen, olen, sigma):
     def _make_masks(ilens, olens):
         """Make masks indicating non-padded part.
 
+        Args:
+            ilens (LongTensor or List): Batch of lengths (B,).
+            olens (LongTensor or List): Batch of lengths (B,).
+
+        Returns:
+            Tensor: Mask tensor indicating non-padded part.
+                    dtype=torch.uint8 in PyTorch 1.2-
+                    dtype=torch.bool in PyTorch 1.2+ (including 1.2)
+
         Examples:
             >>> ilens, olens = [5, 2], [8, 5]
             >>> _make_masks(ilens, olens)

diff --git a/espnet/nets/pytorch_backend/e2e_tts_transformer.py b/espnet/nets/pytorch_backend/e2e_tts_transformer.py
@@ -818,6 +818,14 @@ def _integrate_with_spk_embed(self, hs, spembs):
     def _source_mask(self, ilens):
         """Make masks for self-attention.
 
+        Args:
+            ilens (LongTensor or List): Batch of lengths (B,).
+
+        Returns:
+            Tensor: Mask tensor for self-attention.
+                    dtype=torch.uint8 in PyTorch 1.2-
+                    dtype=torch.bool in PyTorch 1.2+ (including 1.2)
+
         Examples:
             >>> ilens = [5, 3]
             >>> self._source_mask(ilens)
@@ -839,6 +847,14 @@ def _source_mask(self, ilens):
     def _target_mask(self, olens):
         """Make masks for masked self-attention.
 
+        Args:
+            olens (LongTensor or List): Batch of lengths (B,).
+
+        Returns:
+            Tensor: Mask tensor for masked self-attention.
+                    dtype=torch.uint8 in PyTorch 1.2-
+                    dtype=torch.bool in PyTorch 1.2+ (including 1.2)
+
         Examples:
             >>> olens = [5, 3]
             >>> self._target_mask(olens)
@@ -861,6 +877,15 @@ def _target_mask(self, olens):
     def _source_to_target_mask(self, ilens, olens):
         """Make masks for encoder-decoder attention.
 
+        Args:
+            ilens (LongTensor or List): Batch of lengths (B,).
+            olens (LongTensor or List): Batch of lengths (B,).
+
+        Returns:
+            Tensor: Mask tensor for encoder-decoder attention.
+                    dtype=torch.uint8 in PyTorch 1.2-
+                    dtype=torch.bool in PyTorch 1.2+ (including 1.2)
+
         Examples:
             >>> ilens = [4, 2]
             >>> olens = [5, 3]

diff --git a/espnet/nets/pytorch_backend/frontends/dnn_beamformer.py b/espnet/nets/pytorch_backend/frontends/dnn_beamformer.py
@@ -1,3 +1,4 @@
+from distutils.version import LooseVersion
 from typing import Tuple
 
 import torch
@@ -12,6 +13,8 @@
 from espnet.nets.pytorch_backend.frontends.mask_estimator import MaskEstimator
 from torch_complex.tensor import ComplexTensor
 
+is_torch_1_2_plus = LooseVersion(torch.__version__) >= LooseVersion('1.2')
+
 
 class DNN_Beamformer(torch.nn.Module):
     """DNN mask based Beamformer
@@ -143,7 +146,8 @@ def forward(self, psd_in: ComplexTensor, ilens: torch.LongTensor,
         B, _, C = psd_in.size()[:3]
         assert psd_in.size(2) == psd_in.size(3), psd_in.size()
         # psd_in: (B, F, C, C)
-        psd = psd_in.masked_fill(torch.eye(C, dtype=torch.uint8,
+        datatype = torch.bool if is_torch_1_2_plus else torch.uint8
+        psd = psd_in.masked_fill(torch.eye(C, dtype=datatype,
                                            device=psd_in.device), 0)
         # psd: (B, F, C, C) -> (B, C, F)
         psd = (psd.sum(dim=-1) / (C - 1)).transpose(-1, -2)

diff --git a/espnet/nets/pytorch_backend/nets_utils.py b/espnet/nets/pytorch_backend/nets_utils.py
@@ -62,6 +62,8 @@ def make_pad_mask(lengths, xs=None, length_dim=-1):
 
     Returns:
         Tensor: Mask tensor containing indices of padded part.
+                dtype=torch.uint8 in PyTorch 1.2-
+                dtype=torch.bool in PyTorch 1.2+ (including 1.2)
 
     Examples:
         With only lengths.
@@ -172,6 +174,8 @@ def make_non_pad_mask(lengths, xs=None, length_dim=-1):
 
     Returns:
         ByteTensor: mask tensor containing indices of padded part.
+                    dtype=torch.uint8 in PyTorch 1.2-
+                    dtype=torch.bool in PyTorch 1.2+ (including 1.2)
 
     Examples:
         With only lengths.

diff --git a/espnet/nets/pytorch_backend/transformer/decoder.py b/espnet/nets/pytorch_backend/transformer/decoder.py
@@ -98,9 +98,13 @@ def forward(self, tgt, tgt_mask, memory, memory_mask):
 
         :param torch.Tensor tgt: input token ids, int64 (batch, maxlen_out) if input_layer == "embed"
                                  input tensor (batch, maxlen_out, #mels) in the other cases
-        :param torch.Tensor tgt_mask: input token mask, uint8  (batch, maxlen_out)
+        :param torch.Tensor tgt_mask: input token mask,  (batch, maxlen_out)
+                                      dtype=torch.uint8 in PyTorch 1.2-
+                                      dtype=torch.bool in PyTorch 1.2+ (including 1.2)
         :param torch.Tensor memory: encoded memory, float32  (batch, maxlen_in, feat)
-        :param torch.Tensor memory_mask: encoded memory mask, uint8  (batch, maxlen_in)
+        :param torch.Tensor memory_mask: encoded memory mask,  (batch, maxlen_in)
+                                         dtype=torch.uint8 in PyTorch 1.2-
+                                         dtype=torch.bool in PyTorch 1.2+ (including 1.2)
         :return x: decoded token score before softmax (batch, maxlen_out, token) if use_output_layer is True,
                    final block outputs (batch, maxlen_out, attention_dim) in the other cases
         :rtype: torch.Tensor
@@ -119,7 +123,9 @@ def forward_one_step(self, tgt, tgt_mask, memory, cache=None):
         """Forward one step.
 
         :param torch.Tensor tgt: input token ids, int64 (batch, maxlen_out)
-        :param torch.Tensor tgt_mask: input token mask, uint8  (batch, maxlen_out)
+        :param torch.Tensor tgt_mask: input token mask,  (batch, maxlen_out)
+                                      dtype=torch.uint8 in PyTorch 1.2-
+                                      dtype=torch.bool in PyTorch 1.2+ (including 1.2)
         :param torch.Tensor memory: encoded memory, float32  (batch, maxlen_in, feat)
         :param List[torch.Tensor] cache: cached output list of (batch, max_time_out-1, size)
         :return y, cache: NN output value and cache per `self.decoders`.

diff --git a/espnet/nets/pytorch_backend/transformer/mask.py b/espnet/nets/pytorch_backend/transformer/mask.py
@@ -1,15 +1,19 @@
-#!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
 # Copyright 2019 Shigeki Karita
 #  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
 
 """Mask module."""
 
+from distutils.version import LooseVersion
+
 import torch
 
+is_torch_1_2_plus = LooseVersion(torch.__version__) >= LooseVersion('1.2.0')
+datatype = torch.bool if is_torch_1_2_plus else torch.uint8
+
 
-def subsequent_mask(size, device="cpu", dtype=torch.uint8):
+def subsequent_mask(size, device="cpu", dtype=datatype):
     """Create mask for subsequent steps (1, size, size).
 
     :param int size: size of mask

diff --git a/espnet/tts/pytorch_backend/tts.py b/espnet/tts/pytorch_backend/tts.py
@@ -310,7 +310,7 @@ def train(args):
     if args.ngpu > 1:
         model = torch.nn.DataParallel(model, device_ids=list(range(args.ngpu)))
         if args.batch_size != 0:
-            logging.info('batch size is automatically increased (%d -> %d)' % (
+            logging.warning('batch size is automatically increased (%d -> %d)' % (
                 args.batch_size, args.batch_size * args.ngpu))
             args.batch_size *= args.ngpu