Commit 0ab198d: Merge branch 'master' into time_sync_asr_pr

Authored by brianyan918 on Dec 22, 2022
2 parents c7dac1b + aa5cc02

Showing 16 changed files with 652 additions and 11 deletions.
README.md (3 changes: 2 additions & 1 deletion)

@@ -285,7 +285,7 @@ We list the character error rate (CER) and word error rate (WER) of major ASR ta
| Task | CER (%) | WER (%) | Pretrained model |
| ----------------------------------------------------------------- | :-------------: | :-------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
| Aishell dev/test | 4.6/5.1 | N/A | [link](https://github.com/espnet/espnet/blob/master/egs/aishell/asr1/RESULTS.md#conformer-kernel-size--15--specaugment--lm-weight--00-result) |
- | **ESPnet2** Aishell dev/test | 4.4/4.7 | N/A | [link](https://github.com/espnet/espnet/tree/master/egs2/aishell/asr1#conformer--specaug--speed-perturbation-featsraw-n_fft512-hop_length128) |
+ | **ESPnet2** Aishell dev/test | 4.1/4.4 | N/A | [link](https://github.com/espnet/espnet/tree/master/egs2/aishell/asr1#branchformer-initial) |
| Common Voice dev/test | 1.7/1.8 | 2.2/2.3 | [link](https://github.com/espnet/espnet/blob/master/egs/commonvoice/asr1/RESULTS.md#first-results-default-pytorch-transformer-setting-with-bpe-100-epochs-single-gpu) |
| CSJ eval1/eval2/eval3 | 5.7/3.8/4.2 | N/A | [link](https://github.com/espnet/espnet/blob/master/egs/csj/asr1/RESULTS.md#pytorch-backend-transformer-without-any-hyperparameter-tuning) |
| **ESPnet2** CSJ eval1/eval2/eval3 | 4.5/3.3/3.6 | N/A | [link](https://github.com/espnet/espnet/tree/master/egs2/csj/asr1#initial-conformer-results) |
@@ -295,6 +295,7 @@ We list the character error rate (CER) and word error rate (WER) of major ASR ta
| **ESPnet2** Librispeech dev_clean/dev_other/test_clean/test_other | 0.6/1.5/0.6/1.4 | 1.7/3.4/1.8/3.6 | [link](https://github.com/espnet/espnet/tree/master/egs2/librispeech/asr1#self-supervised-learning-features-hubert_large_ll60k-conformer-utt_mvn-with-transformer-lm) |
| Switchboard (eval2000) callhm/swbd | N/A | 14.0/6.8 | [link](https://github.com/espnet/espnet/blob/master/egs/swbd/asr1/RESULTS.md#conformer-with-bpe-2000-specaug-speed-perturbation-transformer-lm-decoding) |
| TEDLIUM2 dev/test | N/A | 8.6/7.2 | [link](https://github.com/espnet/espnet/blob/master/egs/tedlium2/asr1/RESULTS.md#conformer-large-model--specaug--speed-perturbation--rnnlm) |
| **ESPnet2** TEDLIUM2 dev/test | N/A | 7.3/7.1 | [link](https://github.com/espnet/espnet/blob/master/egs2/tedlium2/asr1/README.md#e-branchformer-12-encoder-layers) |
| TEDLIUM3 dev/test | N/A | 9.6/7.6 | [link](https://github.com/espnet/espnet/blob/master/egs/tedlium3/asr1/RESULTS.md) |
| WSJ dev93/eval92 | 3.2/2.1 | 7.0/4.7 | N/A |
| **ESPnet2** WSJ dev93/eval92 | 1.1/0.8 | 2.8/1.8 | [link](https://github.com/espnet/espnet/tree/master/egs2/wsj/asr1#self-supervised-learning-features-wav2vec2_large_ll60k-conformer-utt_mvn-with-transformer-lm) |
egs2/aishell/asr1/README.md (25 changes: 25 additions & 0 deletions)

@@ -42,6 +42,31 @@
|decode_asr_streaming_lm_lm_train_lm_transformer_zh_char_valid.loss.ave_asr_model_valid.acc.ave/test|7176|104765|93.0|6.7|0.2|0.8|7.8|50.7|


# E-Branchformer

## Environments
- date: `Sun Dec 18 12:21:46 CST 2022`
- python version: `3.9.15 (main, Nov 24 2022, 14:31:59) [GCC 11.2.0]`
- espnet version: `espnet 202209`
- pytorch version: `pytorch 1.12.1`
- Git hash: `26f432bc859e5e40cac1a86042d498ba7baffbb0`
- Commit date: `Fri Dec 9 02:16:01 2022 +0000`

## Without LM

- ASR config: [conf/tuning/train_asr_e_branchformer_e12_mlp1024_linear1024_mactrue_amp.yaml](conf/tuning/train_asr_e_branchformer_e12_mlp1024_linear1024_mactrue_amp.yaml)
- #Params: 37.88 M
- Model link: [https://huggingface.co/pyf98/aishell_e_branchformer](https://huggingface.co/pyf98/aishell_e_branchformer)

### CER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|decode_asr_branchformer_asr_model_valid.acc.ave/dev|14326|205341|95.9|4.0|0.1|0.1|4.2|33.1|
|decode_asr_branchformer_asr_model_valid.acc.ave/test|7176|104765|95.6|4.3|0.1|0.1|4.5|34.6|
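The Corr/Sub/Del/Ins columns in these tables come from a minimum-edit-distance alignment between each hypothesis and its reference, with Err = (Sub + Del + Ins) / N over N reference tokens. A minimal sketch of that bookkeeping (illustrative only; the recipes themselves produce these tables with sclite):

```python
# Levenshtein alignment that reports the Corr/Sub/Del/Ins counts behind a
# CER/WER table row. Hypothetical standalone helper, not ESPnet's scorer;
# shown only to illustrate the column semantics.
def align_counts(ref, hyp):
    """Return (corr, sub, dele, ins) for token lists ref vs hyp."""
    m, n = len(ref), len(hyp)
    # dp[i][j] = edit distance between ref[:i] and hyp[:j]
    dp = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        dp[i][0] = i
    for j in range(n + 1):
        dp[0][j] = j
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            dp[i][j] = min(dp[i - 1][j] + 1,          # deletion
                           dp[i][j - 1] + 1,          # insertion
                           dp[i - 1][j - 1] + cost)   # match / substitution
    # Backtrack to count each operation type.
    corr = sub = dele = ins = 0
    i, j = m, n
    while i > 0 or j > 0:
        if i > 0 and j > 0 and dp[i][j] == dp[i - 1][j - 1] + (
                0 if ref[i - 1] == hyp[j - 1] else 1):
            if ref[i - 1] == hyp[j - 1]:
                corr += 1
            else:
                sub += 1
            i, j = i - 1, j - 1
        elif i > 0 and dp[i][j] == dp[i - 1][j] + 1:
            dele += 1
            i -= 1
        else:
            ins += 1
            j -= 1
    return corr, sub, dele, ins

def error_rate(ref, hyp):
    """Err column: 100 * (S + D + I) / N."""
    corr, sub, dele, ins = align_counts(ref, hyp)
    return 100.0 * (sub + dele + ins) / len(ref)

# Character-level example: one deletion and one substitution.
print(align_counts(list("hello world"), list("helo wurld")))  # (9, 1, 1, 0)
```

For CER the tokens are characters; for WER they are words; S.Err is the fraction of sentences with at least one error.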




# Branchformer: initial

@@ -0,0 +1,83 @@
# network architecture
# encoder related
encoder: e_branchformer
encoder_conf:
    output_size: 256
    attention_heads: 4
    attention_layer_type: rel_selfattn
    pos_enc_layer_type: rel_pos
    rel_pos_type: latest
    cgmlp_linear_units: 1024
    cgmlp_conv_kernel: 31
    use_linear_after_conv: false
    gate_activation: identity
    num_blocks: 12
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: conv2d
    layer_drop_rate: 0.0
    linear_units: 1024
    positionwise_layer_type: linear
    use_ffn: true
    macaron_ffn: true
    merge_conv_kernel: 31

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.
    src_attention_dropout_rate: 0.

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false

# minibatch related
batch_type: numel
batch_bins: 25000000

# optimization related
accum_grad: 1
grad_clip: 5
max_epoch: 60
best_model_criterion:
-   - valid
    - acc
    - max
keep_nbest_models: 10

optim: adam
optim_conf:
    lr: 0.001
    weight_decay: 0.000001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 35000

num_workers: 4            # num of workers of data loader
use_amp: true             # automatic mixed precision
unused_parameters: false  # set as true if some params are unused in DDP

specaug: specaug
specaug_conf:
    apply_time_warp: true
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
    - 0
    - 27
    num_freq_mask: 2
    apply_time_mask: true
    time_mask_width_ratio_range:
    - 0.
    - 0.05
    num_time_mask: 10
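The `warmuplr` scheduler in this config ramps the learning rate up for `warmup_steps` and then decays it as the inverse square root of the step, peaking at the configured `lr`. A sketch of that shape (assumed Noam-style form; the exact implementation lives in ESPnet's scheduler code):

```python
def warmup_lr(step, base_lr=0.001, warmup_steps=35000):
    """Inverse-sqrt learning-rate schedule with linear warmup.

    Assumed shape of the `warmuplr` scheduler above: linear ramp up to
    `warmup_steps`, then step**-0.5 decay; the peak value is `base_lr`.
    """
    step = max(step, 1)
    return base_lr * warmup_steps ** 0.5 * min(
        step ** -0.5,                 # decay branch
        step * warmup_steps ** -1.5,  # warmup branch
    )

# Peak is reached exactly at step == warmup_steps.
print(warmup_lr(35000))  # 0.001
```

Halfway through warmup the rate is half the peak, and four warmup periods later it has decayed back to half the peak, which is why larger `warmup_steps` values pair naturally with larger batch settings.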
egs2/librispeech_100/asr1/README.md (45 changes: 44 additions & 1 deletion)

@@ -1,5 +1,48 @@
<!-- Generated by scripts/utils/show_asr_result.sh -->
# RESULTS

## Environments
- date: `Mon Dec 12 06:50:58 CST 2022`
- python version: `3.9.15 (main, Nov 24 2022, 14:31:59) [GCC 11.2.0]`
- espnet version: `espnet 202209`
- pytorch version: `pytorch 1.12.1`
- Git hash: `26f432bc859e5e40cac1a86042d498ba7baffbb0`
- Commit date: `Fri Dec 9 02:16:01 2022 +0000`

## asr_train_asr_e_branchformer_size256_mlp1024_linear1024_e12_mactrue_edrop0.0_ddrop0.0_raw_en_bpe5000_sp

- Config: [conf/tuning/train_asr_e_branchformer_size256_mlp1024_linear1024_e12_mactrue_edrop0.0_ddrop0.0.yaml](conf/tuning/train_asr_e_branchformer_size256_mlp1024_linear1024_e12_mactrue_edrop0.0_ddrop0.0.yaml)
- Params: 38.47 M
- Model: [https://huggingface.co/pyf98/librispeech_100_e_branchformer](https://huggingface.co/pyf98/librispeech_100_e_branchformer)

### WER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|decode_asr_asr_model_valid.acc.ave/dev_clean|2703|54402|94.6|5.0|0.3|0.8|6.1|55.4|
|decode_asr_asr_model_valid.acc.ave/dev_other|2864|50948|85.3|13.3|1.4|2.1|16.7|78.9|
|decode_asr_asr_model_valid.acc.ave/test_clean|2620|52576|94.4|5.1|0.4|0.8|6.3|56.1|
|decode_asr_asr_model_valid.acc.ave/test_other|2939|52343|85.0|13.6|1.4|2.0|17.0|80.3|

### CER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|decode_asr_asr_model_valid.acc.ave/dev_clean|2703|288456|98.3|1.0|0.7|0.7|2.4|55.4|
|decode_asr_asr_model_valid.acc.ave/dev_other|2864|265951|93.6|4.0|2.4|2.0|8.3|78.9|
|decode_asr_asr_model_valid.acc.ave/test_clean|2620|281530|98.2|1.1|0.8|0.6|2.5|56.1|
|decode_asr_asr_model_valid.acc.ave/test_other|2939|272758|93.7|3.8|2.5|1.9|8.2|80.3|

### TER

|dataset|Snt|Wrd|Corr|Sub|Del|Ins|Err|S.Err|
|---|---|---|---|---|---|---|---|---|
|decode_asr_asr_model_valid.acc.ave/dev_clean|2703|69558|92.2|4.9|2.9|0.6|8.4|55.4|
|decode_asr_asr_model_valid.acc.ave/dev_other|2864|64524|81.9|12.8|5.2|2.3|20.4|78.9|
|decode_asr_asr_model_valid.acc.ave/test_clean|2620|66983|92.2|4.9|2.9|0.6|8.4|56.1|
|decode_asr_asr_model_valid.acc.ave/test_other|2939|66650|81.5|13.0|5.5|2.2|20.7|80.3|
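The models reported above are trained with the hybrid CTC/attention objective configured elsewhere in this commit (`ctc_weight: 0.3`). The combination itself is a simple interpolation; a sketch (not ESPnet's code):

```python
def hybrid_ctc_attention_loss(loss_ctc, loss_att, ctc_weight=0.3):
    """Hybrid CTC/attention training objective: L = w * L_ctc + (1 - w) * L_att.

    With ctc_weight = 0.3 as in these configs, the attention branch dominates
    the gradient while the CTC branch regularizes the encoder's alignments.
    """
    return ctc_weight * loss_ctc + (1.0 - ctc_weight) * loss_att

print(hybrid_ctc_attention_loss(2.0, 1.0))  # 0.3 * 2.0 + 0.7 * 1.0 = 1.3
```

The same weight is typically reused at decoding time to combine the two branches' scores during beam search.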



## Environments
- date: `Mon Feb 7 21:28:00 EST 2022`
- python version: `3.9.7 (default, Sep 16 2021, 13:09:58) [GCC 7.5.0]`
egs2/librispeech_100/asr1/conf/train_asr.yaml (2 changes: 1 addition & 1 deletion)

@@ -0,0 +1,83 @@
# Trained with A40 (48 GB) x 1 GPUs.
encoder: e_branchformer
encoder_conf:
    output_size: 256
    attention_heads: 4
    attention_layer_type: rel_selfattn
    pos_enc_layer_type: rel_pos
    rel_pos_type: latest
    cgmlp_linear_units: 1024
    cgmlp_conv_kernel: 31
    use_linear_after_conv: false
    gate_activation: identity
    num_blocks: 12
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: conv2d
    layer_drop_rate: 0.0
    linear_units: 1024
    positionwise_layer_type: linear
    use_ffn: true
    macaron_ffn: true
    merge_conv_kernel: 31

decoder: transformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1
    layer_drop_rate: 0.0

model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1
    length_normalized_loss: false

frontend_conf:
    n_fft: 512
    win_length: 400
    hop_length: 160

seed: 2022
num_workers: 4
batch_type: numel
batch_bins: 16000000
accum_grad: 4
max_epoch: 70
patience: none
init: none
best_model_criterion:
-   - valid
    - acc
    - max
keep_nbest_models: 10
use_amp: true

optim: adam
optim_conf:
    lr: 0.002
    weight_decay: 0.000001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 15000

specaug: specaug
specaug_conf:
    apply_time_warp: true
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
    - 0
    - 27
    num_freq_mask: 2
    apply_time_mask: true
    time_mask_width_ratio_range:
    - 0.
    - 0.05
    num_time_mask: 5
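The `specaug_conf` block above masks random frequency bands and time spans of each spectrogram during training. A minimal numpy sketch of those two mask types, mirroring the config fields (hypothetical standalone code, not ESPnet's implementation; time warping is omitted):

```python
import numpy as np

rng = np.random.default_rng(0)

def spec_augment(spec, num_freq_mask=2, freq_width=(0, 27),
                 num_time_mask=10, time_ratio=(0.0, 0.05)):
    """Zero out random frequency and time bands of a (time, freq) spectrogram.

    Parameter names follow the specaug_conf fields above; this is a sketch
    of the masking step only.
    """
    spec = spec.copy()
    T, F = spec.shape
    for _ in range(num_freq_mask):
        w = int(rng.integers(freq_width[0], freq_width[1] + 1))
        f0 = int(rng.integers(0, max(F - w, 1)))
        spec[:, f0:f0 + w] = 0.0          # frequency mask
    max_t = int(T * time_ratio[1])
    for _ in range(num_time_mask):
        w = int(rng.integers(0, max_t + 1))
        t0 = int(rng.integers(0, max(T - w, 1)))
        spec[t0:t0 + w, :] = 0.0          # time mask
    return spec

x = np.ones((1000, 80))  # e.g. 1000 frames of 80-dim log-mel features
y = spec_augment(x)
```

Note this config draws time-mask widths as a ratio of utterance length (`time_mask_width_ratio_range`) rather than a fixed frame count, so long and short utterances are masked proportionally.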
egs2/librispeech_100/asr1/run.sh (10 changes: 3 additions & 7 deletions)

@@ -9,25 +9,21 @@ train_set="train_clean_100"
valid_set="dev"
test_sets="test_clean test_other dev_clean dev_other"

asr_tag=conformer_lr2e-3_warmup15k_amp_nondeterministic
asr_config=conf/train_asr.yaml
inference_config=conf/decode_asr.yaml

./asr.sh \
    --skip_data_prep false \
    --skip_train false \
    --skip_eval false \
    --lang en \
    --ngpu 1 \
-   --nj 32 \
-   --inference_nj 32 \
+   --nj 16 \
+   --gpu_inference true \
+   --inference_nj 2 \
    --nbpe 5000 \
    --max_wav_duration 30 \
    --speed_perturb_factors "0.9 1.0 1.1" \
    --audio_format "flac.ark" \
    --feats_type raw \
    --use_lm false \
    --asr_tag "${asr_tag}" \
    --asr_config "${asr_config}" \
    --inference_config "${inference_config}" \
    --train_set "${train_set}" \
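The `--speed_perturb_factors "0.9 1.0 1.1"` flag triples the training data by resampling each utterance at three playback speeds (the recipes delegate the actual work to sox). A rough numpy sketch of what one factor does to a waveform, via linear interpolation:

```python
import numpy as np

def speed_perturb(x, factor):
    """Resample a 1-D signal to simulate playback at `factor` x speed.

    Playing at speed f shortens the signal by a factor of 1/f. This is an
    illustrative linear-interpolation sketch, not production resampling.
    """
    n_out = int(round(len(x) / factor))
    t_in = np.arange(len(x))
    t_out = np.linspace(0, len(x) - 1, n_out)
    return np.interp(t_out, t_in, x)

x = np.sin(np.linspace(0, 10, 16000))   # 1 s of audio at 16 kHz
fast = speed_perturb(x, 1.1)            # shorter
slow = speed_perturb(x, 0.9)            # longer
```

Because the perturbed copies have different durations and pitch contours, they act as cheap data augmentation for the acoustic model.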