merge master to v.0.6.0

iamanigeeit · Oct 8, 2019 · d2ab386 · d2ab386
2 parents 2626d0c + 2b14d3b
commit d2ab386
Show file tree

Hide file tree

Showing 109 changed files with 4,332 additions and 755 deletions.
diff --git a/.gitignore b/.gitignore
@@ -38,10 +38,12 @@ egs/*/*/tensorboard
 egs/*/*/wav*
 
 # tools related
+tools/bin
+tools/include
+tools/lib
 tools/bats-core
 tools/chainer_ctc/
 tools/kaldi*
-tools/mecab/
 tools/miniconda.sh
 tools/moses/
 tools/mwerSegmenter/
@@ -53,3 +55,6 @@ tools/warp-ctc/
 tools/warp-transducer/
 tools/*.done
 tools/PESQ*
+tools/hts_engine_API*
+tools/open_jtalk*
+tools/pyopenjtalk*
diff --git a/README.md b/README.md
@@ -458,24 +458,24 @@ Available pretrained models are listed as follows:
 | [ljspeech.fastspeech.v2](https://drive.google.com/open?id=1zD-2GMrWM3thaDpS3h3rkTU4jIC0wc5B) | Feed-forward Transformer with CNN instead of position-wise FFN |
 | [libritts.transformer.v1 (New!)](https://drive.google.com/open?id=1Xj73mDPuuPH8GsyNO8GnOC3mn0_OK4g3) | Multi-speaker Transformer with reduction factor = 2 |
 
-Waveform synthesis is performed with Griffin-Lim algorithm as default, but we also support a pretrained WaveNet vocoder based on [kan-bayashi/PytorchWaveNetVocoder](https://github.com/kan-bayashi/PytorchWaveNetVocoder).  
+Waveform synthesis is performed with Griffin-Lim algorithm as default, but we also support a pretrained WaveNet vocoder.  
 You can try it by extending the `stop_stage` as follows:
 ```
 ../../../utils/synth_wav.sh --stop_stage 4 example.txt
 ```
 You can change the pretrained vocoder model as follows:
 ```
-../../../utils/synth_wav.sh --stop_stage 4 --vocoder_models ljspeech.wavenet.ns.v1.1000k_iters example.txt
+../../../utils/synth_wav.sh --stop_stage 4 --vocoder_models ljspeech.wavenet.softmax.ns.v1 example.txt
 ```
 
 Available pretrained vocoder models are listed as follows:
 
 | Model | Notes |
 |:------|:------|
-| [ljspeech.wavenet.ns.v1.100k_iters](https://drive.google.com/open?id=1eA1VcRS9jzFa-DovyTgJLQ_jmwOLIi8L) | WaveNet vocoder with noise shaping @ 100k iters |
-| [ljspeech.wavenet.ns.v1.1000k_iters](https://drive.google.com/open?id=1NlG47iTVsBhIDklJALXgRtZPI8ST1Tzd) | WaveNet vocoder with noise shaping @ 1000k iters |
+| [ljspeech.wavenet.softmax.ns.v1](https://drive.google.com/open?id=1eA1VcRS9jzFa-DovyTgJLQ_jmwOLIi8L) | 8 bit Softmax WaveNet w/ noise shaping trained by [kan-bayashi/PytorchWaveNetVocoder](https://github.com/kan-bayashi/PytorchWaveNetVocoder) |
+| [ljspeech.wavenet.mol.v1](https://drive.google.com/open?id=1sY7gEUg39QaO1szuN62-Llst9TrFno2t) | 16 bit MoL WaveNet trained by [r9y9/wavenet_vocoder](https://github.com/r9y9/wavenet_vocoder) |
 
-If you want to build your own WaveNet vocoder, please check [kan-bayashi/PytorchWaveNetVocoder](https://github.com/kan-bayashi/PytorchWaveNetVocoder).
+If you want to build your own WaveNet vocoder, please check [kan-bayashi/PytorchWaveNetVocoder](https://github.com/kan-bayashi/PytorchWaveNetVocoder) or [r9y9/wavenet_vocoder](https://github.com/r9y9/wavenet_vocoder).
 
 
 ## Chainer and Pytorch backends

diff --git a/ci/install_kaldi.sh b/ci/install_kaldi.sh
@@ -3,7 +3,7 @@
 set -euo pipefail
 
 # install kaldi
-git clone https://github.com/kaldi-asr/kaldi --depth 1 tools/kaldi
+[ ! -d tools/kaldi ] && git clone https://github.com/kaldi-asr/kaldi --depth 1 tools/kaldi
 (
     cd ./tools/kaldi/tools || exit 1
     echo "" > extras/check_dependencies.sh

diff --git a/ci/test_flake8.sh b/ci/test_flake8.sh
@@ -9,12 +9,9 @@ set -euo pipefail
 # TODO: remove files from this list!
 flake8_black_list="\
 espnet/__init__.py
-espnet/asr/asr_mix_utils.py
 espnet/asr/asr_utils.py
 espnet/asr/chainer_backend/asr.py
 espnet/asr/pytorch_backend/asr.py
-espnet/asr/pytorch_backend/asr_init.py
-espnet/asr/pytorch_backend/asr_mix.py
 espnet/bin/asr_enhance.py
 espnet/lm/chainer_backend/extlm.py
 espnet/lm/chainer_backend/lm.py
@@ -30,8 +27,6 @@ espnet/nets/chainer_backend/rnn/training.py
 espnet/nets/ctc_prefix_score.py
 espnet/nets/e2e_asr_common.py
 espnet/nets/pytorch_backend/ctc.py
-espnet/nets/pytorch_backend/e2e_asr_mix.py
-espnet/nets/pytorch_backend/e2e_asr_transducer.py
 espnet/nets/pytorch_backend/e2e_asr_transformer.py
 espnet/nets/pytorch_backend/frontends/beamformer.py
 espnet/nets/pytorch_backend/frontends/dnn_beamformer.py

diff --git a/ci/test_integration.sh b/ci/test_integration.sh
@@ -14,6 +14,14 @@
     echo "==== ASR (backend=chainer) ==="
     ./run.sh --stage 3 --backend chainer
 )
+# test asr_mix recipe
+(
+    set -euo pipefail
+
+    cd ./egs/mini_an4/asr_mix1 || exit 1
+    echo "==== ASR Mix (backend=pytorch) ==="
+    ./run.sh
+)
 # test tts recipe
 (
     set -euo pipefail
@@ -23,4 +31,4 @@
     ./run.sh
 )
 
-# TODO(karita): test asr_mix, mt, st?
+# TODO(karita): test mt, st?
diff --git a/ci/test_shell.sh b/ci/test_shell.sh
@@ -1,5 +1,9 @@
 #!/usr/bin/env bash
 
+if [ ! -e tools/kaldi ]; then
+    git clone https://github.com/kaldi-asr/kaldi --depth 1 tools/kaldi
+fi
+
 PATH=$(pwd)/bats-core/bin:$(pwd)/shellcheck-stable:$PATH
 if ! [ -x "$(command -v bats)" ]; then
     echo "=== install bats ==="

diff --git a/doc/notebook b/doc/notebook
diff --git a/docker/prebuilt/local/Dockerfile b/docker/prebuilt/local/Dockerfile
@@ -27,8 +27,8 @@ WORKDIR /espnet/tools
 RUN if [ -z "$( nvcc -V )" ]; then \
         make KALDI=/kaldi CUPY_VERSION=''; \
     else \
-        sed -i '159s|install.py|install.py --no-cupy|' Makefile && \
-        sed -i '19s|nvidia-smi|nvcc|' Makefile && \
+        sed -i '161s|install.py|install.py --no-cupy|' Makefile && \
+        sed -i '21s|nvidia-smi|nvcc|' Makefile && \
         make KALDI=/kaldi CUDA_VERSION=${CUDA_VER}; \
     fi
 

diff --git a/egs/README.md b/egs/README.md
@@ -13,6 +13,7 @@
 | cmu_wilderness          | CMU Wilderness Multilingual Speech Dataset                   | Multilingual ASR                           | ~100 Languages | https://github.com/festvox/datasets-CMU_Wilderness           |                               |
 | commonvoice             | The Mozilla Common Voice corpus v1.                          | ASR                                        | EN             | https://voice.mozilla.org/datasets                           |                               |
 | csj                     | Corpus of Spontaneous Japanese                               | ASR                                        | JP             | https://pj.ninjal.ac.jp/corpus_center/csj/en/                |                               |
+| dirha_wsj               | Distant-speech Interaction for Robust Home Applications      | Multi-Array ASR                            | EN             | https://dirha.fbk.eu/, https://github.com/SHINE-FBK/DIRHA_English_wsj|                               |
 | fisher_callhome_spanish | Fisher and CALLHOME Spanish--English Speech Translation      | ASR/Machine Translation/Speech Translation | ES->EN         | https://catalog.ldc.upenn.edu/LDC2014T23                     |                               |
 | fisher_swbd             | Fisher English Training Speech, Switchboard-1 Release 2      | ASR                                        | EN             | https://catalog.ldc.upenn.edu/LDC2004S13, https://catalog.ldc.upenn.edu/LDC2005S13, https://catalog.ldc.upenn.edu/LDC97S62 |                               |
 | hkust                   | HKUST Mandarin Telephone Speech                              | ASR                                        | ZH             | [https://catalog.ldc.upenn.edu/LDC2005S15, https://catalog.ldc.upenn.edu/LDC2005T32](https://catalog.ldc.upenn.edu/LDC2005S15) |                               |

diff --git a/egs/dirha_wsj/asr1/RESULTS.md b/egs/dirha_wsj/asr1/RESULTS.md
@@ -0,0 +1,63 @@
+# RNN results
+## Mic: Beam_Circular_Array
+### (pytorch) 2-layer vggblstmp, add attention, batchsize 15;
+#### CER
+|dataset| Snt | Wrd| Corr | Sub | Del | Ins | Err | S.Err|
+|---|---|---|---|---|---|---|---|---|
+|exp/train_si284_Beam_Circular_Array_pytorch_train_no_preprocess/decode_dirha_real_Beam_Circular_Array_decode_lm_word65000/result.txt:| 409 | 39842 | 82.5 | 9.3 | 8.2 | 3.8 | 21.3 | 83.4 |
+#### WER
+|dataset| Snt | Wrd| Corr | Sub | Del | Ins | Err | S.Err|
+|---|---|---|---|---|---|---|---|---|
+|exp/train_si284_Beam_Circular_Array_pytorch_train_no_preprocess/decode_dirha_real_Beam_Circular_Array_decode_lm_word65000/result.wrd.txt:| 409 | 6762 | 69.3 | 25.6 | 5.0 | 4.3 | 35.0 | 83.4 |
+
+### add SpecAug;
+#### CER
+|dataset| Snt | Wrd| Corr | Sub | Del | Ins | Err | S.Err|
+|---|---|---|---|---|---|---|---|---|
+|exp/train_si284_Beam_Circular_Array_pytorch_train_specaug/decode_dirha_real_Beam_Circular_Array_decode_lm_word65000/result.txt:| 409 | 39842 | 83.9 | 7.2 | 8.9 | 2.8 | 19.0 | 80.9 |
+#### WER
+|dataset| Snt | Wrd| Corr | Sub | Del | Ins | Err | S.Err|
+|---|---|---|---|---|---|---|---|---|
+|exp/train_si284_Beam_Circular_Array_pytorch_train_specaug/decode_dirha_real_Beam_Circular_Array_decode_lm_word65000/result.wrd.txt:| 409 | 6762 | 72.6 | 20.6 | 6.8 | 2.3 | 29.7 | 80.9 |
+
+## Mic: Beam_Linear_Array
+### (pytorch) 2-layer vggblstmp, add attention, batchsize 15;
+#### CER
+|dataset| Snt | Wrd| Corr | Sub | Del | Ins | Err | S.Err|
+|---|---|---|---|---|---|---|---|---|
+|exp/train_si284_Beam_Linear_Array_pytorch_train_no_preprocess/decode_dirha_real_Beam_Linear_Array_decode_lm_word65000/result.txt:| 409 | 39842 | 83.7 | 9.9 | 6.4 | 5.2 | 21.6 | 85.6 |
+#### WER
+|dataset| Snt | Wrd| Corr | Sub | Del | Ins | Err | S.Err|
+|---|---|---|---|---|---|---|---|---|
+|exp/train_si284_Beam_Linear_Array_pytorch_train_no_preprocess/decode_dirha_real_Beam_Linear_Array_decode_lm_word65000/result.wrd.txt:| 409 | 6762 | 69.1 | 27.6 | 3.3 | 5.9 | 36.8 | 85.6 |
+
+### add SpecAug;
+#### CER
+|dataset| Snt | Wrd| Corr | Sub | Del | Ins | Err | S.Err|
+|---|---|---|---|---|---|---|---|---|
+|exp/train_si284_Beam_Linear_Array_pytorch_train_specaug/decode_dirha_real_Beam_Linear_Array_decode_lm_word65000/result.txt:| 409 | 39842 | 87.2 | 6.9 | 5.9 | 3.3 | 16.2 | 76.5 |
+#### WER
+|dataset| Snt | Wrd| Corr | Sub | Del | Ins | Err | S.Err|
+|---|---|---|---|---|---|---|---|---|
+|exp/train_si284_Beam_Linear_Array_pytorch_train_specaug/decode_dirha_real_Beam_Linear_Array_decode_lm_word65000/result.wrd.txt:| 409 | 6762 | 76.2 | 19.7 | 4.1 | 3.2 | 27.0 | 76.5 |
+
+## Mic: L1C
+### (pytorch) 2-layer vggblstmp, add attention, batchsize 15;
+#### CER
+|dataset| Snt | Wrd| Corr | Sub | Del | Ins | Err | S.Err|
+|---|---|---|---|---|---|---|---|---|
+|exp/train_si284_L1C_pytorch_train_no_preprocess/decode_dirha_real_L1C_decode_lm_word65000/result.txt:| 409 | 39842 | 84.1 | 11.3 | 4.6 | 8.6 | 24.6 | 83.6 |
+#### WER
+|dataset| Snt | Wrd| Corr | Sub | Del | Ins | Err | S.Err|
+|---|---|---|---|---|---|---|---|---|
+|exp/train_si284_L1C_pytorch_train_no_preprocess/decode_dirha_real_L1C_decode_lm_word65000/result.wrd.txt:| 409 | 6762 | 69.3 | 28.5 | 2.2 | 9.3 | 39.9 | 83.6 |
+
+### add SpecAug;
+#### CER
+|dataset| Snt | Wrd| Corr | Sub | Del | Ins | Err | S.Err|
+|---|---|---|---|---|---|---|---|---|
+|exp/train_si284_L1C_pytorch_train_specaug/decode_dirha_real_L1C_decode_lm_word65000/result.txt:| 409 | 39842 | 86.2 | 9.3 | 4.5 | 6.4 | 20.2 | 81.2 |
+#### WER
+|dataset| Snt | Wrd| Corr | Sub | Del | Ins | Err | S.Err|
+|---|---|---|---|---|---|---|---|---|
+|exp/train_si284_L1C_pytorch_train_specaug/decode_dirha_real_L1C_decode_lm_word65000/result.wrd.txt:| 409 | 6762 | 73.7 | 23.2 | 3.1 | 5.3 | 31.6 | 81.2 |
diff --git a/egs/dirha_wsj/asr1/cmd.sh b/egs/dirha_wsj/asr1/cmd.sh
@@ -0,0 +1,89 @@
+# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
+# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
+# e.g.
+#   run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
+#
+# Options:
+#   --time <time>: Limit the maximum time to execute.
+#   --mem <mem>: Limit the maximum memory usage.
+#   --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
+#   --num-threads <nthreads>: Specify the number of CPU cores.
+#   --gpu <ngpu>: Specify the number of GPU devices.
+#   --config: Change the configuration file from default.
+#
+# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
+# The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
+# e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
+# Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
+#
+# run.pl, queue.pl, slurm.pl, and ssh.pl have unified interface, not depending on its backend.
+# These options are mapped to specific options for each backend and
+# it is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
+# If jobs failed, your configuration might be wrong for your environment.
+#
+#
+# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
+#   "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
+# =========================================================~
+
+
+# Select the backend used by run.sh from "local", "sge", "slurm", or "ssh"
+cmd_backend='local'
+
+# Local machine, without any Job scheduling system
+if [ "${cmd_backend}" = local ]; then
+
+    # The other usage
+    export train_cmd="run.pl"
+    # Used for "*_train.py": "--gpu" is appended optionally by run.sh
+    export cuda_cmd="run.pl"
+    # Used for "*_recog.py"
+    export decode_cmd="run.pl"
+
+# "qsub" (SGE, Torque, PBS, etc.)
+elif [ "${cmd_backend}" = sge ]; then
+    # The default setting is written in conf/queue.conf.
+    # You must change "-q g.q" for the "queue" for your environment.
+    # To know the "queue" names, type "qhost -q"
+    # Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.
+
+    export train_cmd="queue.pl"
+    export cuda_cmd="queue.pl"
+    export decode_cmd="queue.pl"
+
+# "sbatch" (Slurm)
+elif [ "${cmd_backend}" = slurm ]; then
+    # The default setting is written in conf/slurm.conf.
+    # You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
+    # To know the "partition" names, type "sinfo".
+    # You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
+    # The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
+
+    export train_cmd="slurm.pl"
+    export cuda_cmd="slurm.pl"
+    export decode_cmd="slurm.pl"
+
+elif [ "${cmd_backend}" = ssh ]; then
+    # You have to create ".queue/machines" to specify the host to execute jobs.
+    # e.g. .queue/machines
+    #   host1
+    #   host2
+    #   host3
+    # Assuming you can log in to them without any password, i.e. you have to set up ssh keys.
+
+    export train_cmd="ssh.pl"
+    export cuda_cmd="ssh.pl"
+    export decode_cmd="ssh.pl"
+
+# This is an example of specifying several unique options in the JHU CLSP cluster setup.
+# Users can modify/add their own command options according to their cluster environments.
+elif [ "${cmd_backend}" = jhu ]; then
+
+    export train_cmd="queue.pl --mem 2G"
+    export cuda_cmd="queue-freegpu.pl --mem 2G --gpu 1 --config conf/gpu.conf"
+    export decode_cmd="queue.pl --mem 4G"
+
+else
+    echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
+    return 1
+fi
diff --git a/egs/dirha_wsj/asr1/conf/decode.yaml b/egs/dirha_wsj/asr1/conf/decode.yaml
@@ -0,0 +1 @@
+./tuning/decode_rnn.yaml
diff --git a/egs/dirha_wsj/asr1/conf/fbank.conf b/egs/dirha_wsj/asr1/conf/fbank.conf
@@ -0,0 +1,2 @@
+--sample-frequency=16000 
+--num-mel-bins=80
diff --git a/egs/dirha_wsj/asr1/conf/gpu.conf b/egs/dirha_wsj/asr1/conf/gpu.conf
@@ -0,0 +1,10 @@
+# Default configuration
+command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
+option mem=* -l mem_free=$0,ram_free=$0
+option mem=0          # Do not add anything to qsub_opts
+option num_threads=* -pe smp $0
+option num_threads=1  # Do not add anything to qsub_opts
+option max_jobs_run=* -tc $0
+default gpu=0
+option gpu=0
+option gpu=* -l 'hostname=b1[12345678]*|c*,gpu=$0' -q g.q
diff --git a/egs/dirha_wsj/asr1/conf/lm.yaml b/egs/dirha_wsj/asr1/conf/lm.yaml
@@ -0,0 +1,8 @@
+layer: 1         # 2 for character LMs
+unit: 1000       # 650 for character LMs
+opt: sgd          # adam for character LMs
+sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
+batchsize: 300    # 1024 for character LMs
+epoch: 20        # number of epochs
+patience: 3
+maxlen: 40        # 150 for character LMs
diff --git a/egs/dirha_wsj/asr1/conf/no_preprocess.yaml b/egs/dirha_wsj/asr1/conf/no_preprocess.yaml
@@ -0,0 +1,2 @@
+process:
+  - type: "identity"
diff --git a/egs/dirha_wsj/asr1/conf/pitch.conf b/egs/dirha_wsj/asr1/conf/pitch.conf
@@ -0,0 +1 @@
+--sample-frequency=16000
diff --git a/egs/dirha_wsj/asr1/conf/queue.conf b/egs/dirha_wsj/asr1/conf/queue.conf
@@ -0,0 +1,10 @@
+# Default configuration
+command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
+option mem=* -l mem_free=$0,ram_free=$0
+option mem=0          # Do not add anything to qsub_opts
+option num_threads=* -pe smp $0
+option num_threads=1  # Do not add anything to qsub_opts
+option max_jobs_run=* -tc $0
+default gpu=0
+option gpu=0
+option gpu=* -l 'hostname=c*,gpu=$0' -q g.q
diff --git a/egs/dirha_wsj/asr1/conf/slurm.conf b/egs/dirha_wsj/asr1/conf/slurm.conf
@@ -0,0 +1,12 @@
+# Default configuration
+command sbatch --export=PATH  --ntasks-per-node=1
+option time=* --time $0
+option mem=* --mem-per-cpu $0
+option mem=0          # Do not add anything to qsub_opts
+option num_threads=* --cpus-per-task $0 --ntasks-per-node=1
+option num_threads=1 --cpus-per-task 1  --ntasks-per-node=1 # Do not add anything to qsub_opts
+default gpu=0
+option gpu=0 -p cpu
+option gpu=* -p gpu --gres=gpu:$0
+# note: the --max-jobs-run option is supported as a special case
+# by slurm.pl and you don't have to handle it in the config file.
diff --git a/egs/dirha_wsj/asr1/conf/specaug.yaml b/egs/dirha_wsj/asr1/conf/specaug.yaml
@@ -0,0 +1,16 @@
+process:
+  # these three processes are a.k.a. SpecAugment
+  - type: "time_warp"
+    max_time_warp: 5
+    inplace: true
+    mode: "PIL"
+  - type: "freq_mask"
+    F: 30
+    n_mask: 2
+    inplace: true
+    replace_with_zero: false
+  - type: "time_mask"
+    T: 40
+    n_mask: 2
+    inplace: true
+    replace_with_zero: false
diff --git a/egs/dirha_wsj/asr1/conf/train.yaml b/egs/dirha_wsj/asr1/conf/train.yaml
@@ -0,0 +1 @@
+./tuning/train_rnn.yaml
diff --git a/egs/dirha_wsj/asr1/conf/tuning/decode_rnn.yaml b/egs/dirha_wsj/asr1/conf/tuning/decode_rnn.yaml
@@ -0,0 +1,6 @@
+lm-weight: 1.0
+beam-size: 30
+penalty: 0.0
+maxlenratio: 0.0
+minlenratio: 0.0
+ctc-weight: 0.3