From 9b32e01c836c99d6dcd8ebcbc4d80ff1ecffeb93 Mon Sep 17 00:00:00 2001 From: Binbin Zhang Date: Fri, 19 Aug 2022 11:38:41 +0800 Subject: [PATCH] [examples] remove deprecated recipes in aishell and librispeech (#1392) --- examples/aishell/s0_deprecated/README.md | 133 --------- .../s0_deprecated/conf/train_conformer.yaml | 86 ------ .../conf/train_conformer_no_pos.yaml | 86 ------ .../s0_deprecated/conf/train_transformer.yaml | 80 ------ .../conf/train_u2++_conformer.yaml | 94 ------- .../conf/train_u2++_transformer.yaml | 87 ------ .../conf/train_unified_conformer.yaml | 90 ------ .../conf/train_unified_transformer.yaml | 83 ------ .../s0_deprecated/local/aishell_data_prep.sh | 65 ----- .../s0_deprecated/local/aishell_train_lms.sh | 59 ---- .../s0_deprecated/local/download_and_untar.sh | 105 ------- examples/aishell/s0_deprecated/path.sh | 8 - examples/aishell/s0_deprecated/run.sh | 263 ------------------ examples/aishell/s0_deprecated/tools | 1 - examples/aishell/s0_deprecated/wenet | 1 - examples/aishell/s1/README.md | 84 ------ examples/aishell/s1/cmd.sh | 5 - examples/aishell/s1/conf/fbank.conf | 5 - examples/aishell/s1/conf/pitch.conf | 1 - examples/aishell/s1/conf/train_conformer.yaml | 76 ----- .../aishell/s1/conf/train_transformer.yaml | 67 ----- .../s1/conf/train_unified_conformer.yaml | 75 ----- .../aishell/s1/local/aishell_data_prep.sh | 68 ----- .../aishell/s1/local/download_and_untar.sh | 105 ------- examples/aishell/s1/path.sh | 15 - examples/aishell/s1/run.sh | 183 ------------ examples/aishell/s1/tools | 1 - examples/aishell/s1/wenet | 1 - examples/librispeech/s1/README.md | 46 --- examples/librispeech/s1/cmd.sh | 5 - examples/librispeech/s1/conf/fbank.conf | 2 - examples/librispeech/s1/conf/pitch.conf | 1 - .../s1/conf/train_conformer_large.yaml | 71 ----- .../s1/conf/train_unified_conformer.yaml | 75 ----- examples/librispeech/s1/local/data_prep.sh | 85 ------ .../s1/local/download_and_untar.sh | 97 ------- examples/librispeech/s1/path.sh | 17 -- examples/librispeech/s1/run.sh | 214 -------------- examples/librispeech/s1/steps | 1 - examples/librispeech/s1/tools | 1 - examples/librispeech/s1/utils | 1 - examples/librispeech/s1/wenet | 1 - 42 files changed, 2544 deletions(-) delete mode 100644 examples/aishell/s0_deprecated/README.md delete mode 100644 examples/aishell/s0_deprecated/conf/train_conformer.yaml delete mode 100644 examples/aishell/s0_deprecated/conf/train_conformer_no_pos.yaml delete mode 100644 examples/aishell/s0_deprecated/conf/train_transformer.yaml delete mode 100644 examples/aishell/s0_deprecated/conf/train_u2++_conformer.yaml delete mode 100644 examples/aishell/s0_deprecated/conf/train_u2++_transformer.yaml delete mode 100644 examples/aishell/s0_deprecated/conf/train_unified_conformer.yaml delete mode 100644 examples/aishell/s0_deprecated/conf/train_unified_transformer.yaml delete mode 100755 examples/aishell/s0_deprecated/local/aishell_data_prep.sh delete mode 100755 examples/aishell/s0_deprecated/local/aishell_train_lms.sh delete mode 100755 examples/aishell/s0_deprecated/local/download_and_untar.sh delete mode 100644 examples/aishell/s0_deprecated/path.sh delete mode 100644 examples/aishell/s0_deprecated/run.sh delete mode 120000 examples/aishell/s0_deprecated/tools delete mode 120000 examples/aishell/s0_deprecated/wenet delete mode 100644 examples/aishell/s1/README.md delete mode 100644 examples/aishell/s1/cmd.sh delete mode 100644 examples/aishell/s1/conf/fbank.conf delete mode 100644 examples/aishell/s1/conf/pitch.conf delete mode 100644 
examples/aishell/s1/conf/train_conformer.yaml delete mode 100644 examples/aishell/s1/conf/train_transformer.yaml delete mode 100644 examples/aishell/s1/conf/train_unified_conformer.yaml delete mode 100755 examples/aishell/s1/local/aishell_data_prep.sh delete mode 100755 examples/aishell/s1/local/download_and_untar.sh delete mode 100644 examples/aishell/s1/path.sh delete mode 100644 examples/aishell/s1/run.sh delete mode 120000 examples/aishell/s1/tools delete mode 120000 examples/aishell/s1/wenet delete mode 100644 examples/librispeech/s1/README.md delete mode 100644 examples/librispeech/s1/cmd.sh delete mode 100644 examples/librispeech/s1/conf/fbank.conf delete mode 100644 examples/librispeech/s1/conf/pitch.conf delete mode 100644 examples/librispeech/s1/conf/train_conformer_large.yaml delete mode 100644 examples/librispeech/s1/conf/train_unified_conformer.yaml delete mode 100755 examples/librispeech/s1/local/data_prep.sh delete mode 100755 examples/librispeech/s1/local/download_and_untar.sh delete mode 100644 examples/librispeech/s1/path.sh delete mode 100644 examples/librispeech/s1/run.sh delete mode 120000 examples/librispeech/s1/steps delete mode 120000 examples/librispeech/s1/tools delete mode 120000 examples/librispeech/s1/utils delete mode 120000 examples/librispeech/s1/wenet diff --git a/examples/aishell/s0_deprecated/README.md b/examples/aishell/s0_deprecated/README.md deleted file mode 100644 index 96675cbf4..000000000 --- a/examples/aishell/s0_deprecated/README.md +++ /dev/null @@ -1,133 +0,0 @@ -# Performance Record - -## Conformer Result - -* Feature info: using fbank feature, dither, cmvn, online speed perturb -* Training info: lr 0.002, batch size 18, 4 gpu, acc_grad 4, 240 epochs, dither 0.1 -* Decoding info: ctc_weight 0.5, average_num 20 -* Git hash: 919f07c4887ac500168ba84b39b535fd8e58918a - -| decoding mode | CER | -|---------------------------|-------| -| attention decoder | 5.18 | -| ctc greedy search | 4.94 | -| ctc prefix beam search | 4.94 | -| attention rescoring | 4.61 | -| LM + attention rescoring | 4.36 | - -## U2++ Conformer Result - -* Feature info: using fbank feature, dither=1.0, cmvn, online speed perturb -* Training info: lr 0.001, batch size 16, 8 gpu, acc_grad 1, 360 epochs -* Decoding info: ctc_weight 0.3, reverse_weight 0.5, average_num 30 -* Git hash: 5a1342312668e7a5abb83aed1e53256819cebf95 - -| decoding mode/chunk size | full | 16 | -|---------------------------|-------|-------| -| ctc greedy search | 5.19 | 5.81 | -| ctc prefix beam search | 5.17 | 5.81 | -| attention rescoring | 4.63 | 5.05 | -| LM + attention rescoring | 4.40 | 4.75 | - -## Unified Conformer Result - -* Feature info: using fbank feature, dither=0, cmvn, online speed perturb -* Training info: lr 0.001, batch size 16, 8 gpu, acc_grad 1, 180 epochs, dither 0.0 -* Decoding info: ctc_weight 0.5, average_num 20 -* Git hash: 919f07c4887ac500168ba84b39b535fd8e58918a - -| decoding mode/chunk size | full | 16 | 8 | 4 | -|---------------------------|-------|-------|-------|-------| -| attention decoder | 5.40 | 5.60 | 5.74 | 5.86 | -| ctc greedy search | 5.56 | 6.29 | 6.68 | 7.10 | -| ctc prefix beam search | 5.57 | 6.30 | 6.67 | 7.10 | -| attention rescoring | 5.05 | 5.45 | 5.69 | 5.91 | -| LM + attention rescoring | 4.73 | 5.08 | 5.22 | 5.38 | - -## U2++ Transformer Result - -* Feature info: using fbank feature, dither, cmvn, online speed perturb.
-* Training info: lr 0.001, batch size 26, 8 gpu, acc_grad 1, 360 epochs, dither 0.1 -* Decoding info: ctc_weight 0.2, reverse_weight 0.5, average_num 30 -* Git hash: 65270043fc8c2476d1ab95e7c39f730017a670e0 - -| decoding mode/chunk size | full | 16 | -|---------------------------|-------|-------| -| ctc greedy search | 6.05 | 6.92 | -| ctc prefix beam search | 6.05 | 6.90 | -| attention rescoring | 5.11 | 5.63 | -| LM + attention rescoring | 4.82 | 5.24 | - -## Transformer Result - -* Feature info: using fbank feature, dither, with cmvn, online speed perturb. -* Training info: lr 0.002, batch size 26, 4 gpu, acc_grad 4, 240 epochs, dither 0.1 -* Decoding info: ctc_weight 0.5, average_num 20 -* Git hash: 919f07c4887ac500168ba84b39b535fd8e58918a - -| decoding mode | CER | -|---------------------------|-------| -| attention decoder | 5.69 | -| ctc greedy search | 5.92 | -| ctc prefix beam search | 5.91 | -| attention rescoring | 5.30 | -| LM + attention rescoring | 5.04 | - -## Unified Transformer Result - -* Feature info: using fbank feature, dither=0, with cmvn, online speed perturb. -* Training info: lr 0.002, batch size 16, 4 gpu, acc_grad 1, 240 epochs, dither 0.1 -* Decoding info: ctc_weight 0.5, average_num 20 -* Git hash: 919f07c4887ac500168ba84b39b535fd8e58918a - -| decoding mode/chunk size | full | 16 | 8 | 4 | -|---------------------------|-------|-------|-------|-------| -| attention decoder | 6.04 | 6.35 | 6.45 | 6.70 | -| ctc greedy search | 6.28 | 6.99 | 7.39 | 7.89 | -| ctc prefix beam search | 6.28 | 6.98 | 7.40 | 7.89 | -| attention rescoring | 5.52 | 6.05 | 6.28 | 6.62 | -| LM + attention rescoring | 5.11 | 5.59 | 5.86 | 6.17 | - -## AMP Training Transformer Result - -* Feature info: using fbank feature, dither, cmvn, online speed perturb -* Training info: lr 0.002, batch size, 4 gpus, acc_grad 4, 240 epochs, dither 0.1, warm up steps 25000 -* Decoding info: ctc_weight 0.5, average_num 20 -* Git hash: 1bb4e5a269c535340fae5b0739482fa47733d2c1 - -| decoding mode | CER | -|------------------------|------| -| attention decoder | 5.73 | -| ctc greedy search | 5.92 | -| ctc prefix beam search | 5.92 | -| attention rescoring | 5.31 | - - -## Multi-machine Training Conformer Result - -* Feature info: using fbank feature, dither, cmvn, online speed perturb -* Training info: lr 0.004, batch size 16, 2 machines, 8\*2=16 gpus, acc_grad 4, 240 epochs, dither 0.1, warm up steps 10000 -* Decoding info: ctc_weight 0.5, average_num 20 -* Git hash: f6b1409023440da1998d31abbcc3826dd40aaf35 - -| decoding mode | CER | -|------------------------|------| -| attention decoder | 4.90 | -| ctc greedy search | 5.07 | -| ctc prefix beam search | 5.06 | -| attention rescoring | 4.65 | - - -## Conformer with/without Position Encoding Result - -* Feature info: using fbank feature, dither, cmvn, online speed perturb -* Training info: lr 0.002, batch size 16, 8 gpu, acc_grad 4, 240 epochs, dither 0.1 -* Decoding info: ctc_weight 0.5, average_num 20 - -| decoding mode | with PE | without PE | -|------------------------|---------|------------| -| attention decoder | 5.18 | 5.73 | -| ctc greedy search | 4.94 | 4.97 | -| ctc prefix beam search | 4.94 | 4.97 | -| attention rescoring | 4.61 | 4.69 | - diff --git a/examples/aishell/s0_deprecated/conf/train_conformer.yaml b/examples/aishell/s0_deprecated/conf/train_conformer.yaml deleted file mode 100644 index 5a34ccf20..000000000 --- a/examples/aishell/s0_deprecated/conf/train_conformer.yaml +++ /dev/null @@ -1,86 +0,0 @@ -# network architecture -# encoder
related -encoder: conformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 15 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - -# decoder related -decoder: transformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -# use raw_wav or kaldi feature -raw_wav: true - -# feature extraction -collate_conf: - # waveform level config - wav_distortion_conf: - wav_dither: 0.1 - wav_distortion_rate: 0.0 - distortion_methods: [] - speed_perturb: true - feature_extraction_conf: - feature_type: 'fbank' - mel_bins: 80 - frame_shift: 10 - frame_length: 25 - using_pitch: false - # spec level config - # spec_swap: false - feature_dither: 0.0 # add dither [-feature_dither,feature_dither] on fbank feature - spec_aug: true - spec_aug_conf: - warp_for_time: False - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - max_w: 80 - - -# dataset related -dataset_conf: - max_length: 40960 - min_length: 0 - batch_type: 'static' # static or dynamic - # the size of batch_size should be set according to your gpu memory size, here we used 2080ti gpu whose memory size is 11GB - batch_size: 16 - sort: true - -grad_clip: 5 -accum_grad: 4 -max_epoch: 240 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.002 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/examples/aishell/s0_deprecated/conf/train_conformer_no_pos.yaml b/examples/aishell/s0_deprecated/conf/train_conformer_no_pos.yaml deleted file mode 100644 index 60bbb0034..000000000 --- a/examples/aishell/s0_deprecated/conf/train_conformer_no_pos.yaml +++ /dev/null @@ -1,86 +0,0 @@ -# network architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 15 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'no_pos' - selfattention_layer_type: 'rel_selfattn' - -# decoder related -decoder: transformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -# use raw_wav or kaldi feature -raw_wav: true - -# feature extraction -collate_conf: - # waveform level config - wav_distortion_conf: - wav_dither: 0.1 - wav_distortion_rate: 0.0 - distortion_methods: [] - speed_perturb: true - feature_extraction_conf: - feature_type:
'fbank' - mel_bins: 80 - frame_shift: 10 - frame_length: 25 - using_pitch: false - # spec level config - # spec_swap: false - feature_dither: 0.0 # add dither [-feature_dither,feature_dither] on fbank feature - spec_aug: true - spec_aug_conf: - warp_for_time: False - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - max_w: 80 - - -# dataset related -dataset_conf: - max_length: 40960 - min_length: 0 - batch_type: 'static' # static or dynamic - # the size of batch_size should be set according to your gpu memory size, here we used 2080ti gpu whose memory size is 11GB - batch_size: 16 - sort: true - -grad_clip: 5 -accum_grad: 4 -max_epoch: 240 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.002 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/examples/aishell/s0_deprecated/conf/train_transformer.yaml b/examples/aishell/s0_deprecated/conf/train_transformer.yaml deleted file mode 100644 index f2b608d85..000000000 --- a/examples/aishell/s0_deprecated/conf/train_transformer.yaml +++ /dev/null @@ -1,80 +0,0 @@ -# network architecture -# encoder related -encoder: transformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder architecture type - normalize_before: true - -# decoder related -decoder: transformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -# use raw_wav or kaldi feature -raw_wav: true - -# feature extraction -collate_conf: - # waveform level config - wav_distortion_conf: - wav_dither: 0.1 - wav_distortion_rate: 0.0 - distortion_methods: [] - speed_perturb: true - feature_extraction_conf: - feature_type: 'fbank' - mel_bins: 80 - frame_shift: 10 - frame_length: 25 - using_pitch: false - # spec level config - feature_dither: 0.0 # add dither [-feature_dither,feature_dither] on fbank feature - spec_aug: true - spec_aug_conf: - warp_for_time: False - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - max_w: 80 - - -# dataset related -dataset_conf: - max_length: 40960 - min_length: 0 - batch_type: 'static' # static or dynamic - # the size of batch_size should be set according to your gpu memory size, here we used 2080ti gpu whose memory size is 11GB - batch_size: 26 - sort: true - -grad_clip: 5 -accum_grad: 1 -max_epoch: 240 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.002 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/examples/aishell/s0_deprecated/conf/train_u2++_conformer.yaml b/examples/aishell/s0_deprecated/conf/train_u2++_conformer.yaml deleted file mode 100644 index 10ba5e4a4..000000000 --- a/examples/aishell/s0_deprecated/conf/train_u2++_conformer.yaml +++ /dev/null @@ -1,94 +0,0 @@ -# network architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 
0.1 - input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 8 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - causal: true - use_dynamic_chunk: true - cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster - use_dynamic_left_chunk: false - -# decoder related -decoder: bitransformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 3 - r_num_blocks: 3 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.1 - src_attention_dropout_rate: 0.1 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - reverse_weight: 0.3 - -# use raw_wav or kaldi feature -raw_wav: true - -# feature extraction -collate_conf: - # waveform level config - wav_distortion_conf: - wav_dither: 1.0 - wav_distortion_rate: 0.0 - distortion_methods: [] - speed_perturb: true - feature_extraction_conf: - feature_type: 'fbank' - mel_bins: 80 - frame_shift: 10 - frame_length: 25 - using_pitch: false - # spec level config - # spec_swap: false - feature_dither: 0.0 # add dither [-feature_dither,feature_dither] on fbank feature - spec_aug: true - spec_aug_conf: - warp_for_time: True - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - max_w: 80 - spec_sub: true - spec_sub_conf: - num_t_sub: 3 - max_t: 30 -# dataset related -dataset_conf: - max_length: 40960 - min_length: 0 - batch_type: 'static' # static or dynamic - # the size of batch_size should be set according to your gpu memory size, here we used 2080ti gpu whose memory size is 11GB - batch_size: 16 - sort: true - -grad_clip: 5 -accum_grad: 1 -max_epoch: 360 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.001 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/examples/aishell/s0_deprecated/conf/train_u2++_transformer.yaml b/examples/aishell/s0_deprecated/conf/train_u2++_transformer.yaml deleted file mode 100644 index 8729eb0ed..000000000 --- a/examples/aishell/s0_deprecated/conf/train_u2++_transformer.yaml +++ /dev/null @@ -1,87 +0,0 @@ -# network architecture -# encoder related -encoder: transformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder architecture type - normalize_before: true - use_dynamic_chunk: true - use_dynamic_left_chunk: false - -# decoder related -decoder: bitransformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 3 - r_num_blocks: 3 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - reverse_weight: 0.3 -# use raw_wav or kaldi feature -raw_wav: true - -# feature extraction -collate_conf: - # waveform level config - wav_distortion_conf: - wav_dither: 1.0 - wav_distortion_rate: 0.0 - distortion_methods: [] - speed_perturb: True - feature_extraction_conf: - feature_type: 'fbank' - mel_bins: 80 - frame_shift: 10 - frame_length: 25 - using_pitch: false - # spec level config - # spec_swap: false - feature_dither:
0.0 # add dither [-feature_dither,feature_dither] on fbank feature - spec_aug: true - spec_aug_conf: - warp_for_time: False - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - max_w: 80 - spec_sub: true - spec_sub_conf: - num_t_sub: 3 - max_t: 30 - -# dataset related -dataset_conf: - max_length: 40960 - min_length: 0 - batch_type: 'static' # static or dynamic - # the size of batch_size should be set according to your gpu memory size, here we used 2080ti gpu whose memory size is 11GB - batch_size: 26 - sort: true - -grad_clip: 5 -accum_grad: 1 -max_epoch: 360 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.001 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/examples/aishell/s0_deprecated/conf/train_unified_conformer.yaml b/examples/aishell/s0_deprecated/conf/train_unified_conformer.yaml deleted file mode 100644 index 54a0a42d6..000000000 --- a/examples/aishell/s0_deprecated/conf/train_unified_conformer.yaml +++ /dev/null @@ -1,90 +0,0 @@ -# network architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 15 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - causal: true - use_dynamic_chunk: true - cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster - use_dynamic_left_chunk: false - -# decoder related -decoder: transformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - - -# use raw_wav or kaldi feature -raw_wav: true - -# feature extraction -collate_conf: - # waveform level config - wav_distortion_conf: - wav_dither: 1.0 - wav_distortion_rate: 0.0 - distortion_methods: [] - speed_perturb: true - feature_extraction_conf: - feature_type: 'fbank' - mel_bins: 80 - frame_shift: 10 - frame_length: 25 - using_pitch: false - # spec level config - # spec_swap: false - feature_dither: 0.0 # add dither [-feature_dither,feature_dither] on fbank feature - spec_aug: true - spec_aug_conf: - warp_for_time: False - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - max_w: 80 - -# dataset related -dataset_conf: - max_length: 40960 - min_length: 0 - batch_type: 'static' # static or dynamic - # the size of batch_size should be set according to your gpu memory size, here we used 2080ti gpu whose memory size is 11GB - batch_size: 16 - sort: true - -grad_clip: 5 -accum_grad: 1 -max_epoch: 180 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.001 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/examples/aishell/s0_deprecated/conf/train_unified_transformer.yaml b/examples/aishell/s0_deprecated/conf/train_unified_transformer.yaml deleted file mode 100644 index 7dbb80c95..000000000 --- a/examples/aishell/s0_deprecated/conf/train_unified_transformer.yaml +++ /dev/null @@ -1,83 +0,0 @@ -# network architecture -# encoder
related -encoder: transformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder architecture type - normalize_before: true - use_dynamic_chunk: true - use_dynamic_left_chunk: false - -# decoder related -decoder: transformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -# use raw_wav or kaldi feature -raw_wav: true - -# feature extraction -collate_conf: - # waveform level config - wav_distortion_conf: - wav_dither: 0.0 - wav_distortion_rate: 0.0 - distortion_methods: [] - speed_perturb: false - feature_extraction_conf: - feature_type: 'fbank' - mel_bins: 80 - frame_shift: 10 - frame_length: 25 - using_pitch: false - # spec level config - # spec_swap: false - feature_dither: 0.0 # add dither [-feature_dither,feature_dither] on fbank feature - spec_aug: true - spec_aug_conf: - warp_for_time: False - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - max_w: 80 - - -# dataset related -dataset_conf: - max_length: 40960 - min_length: 0 - batch_type: 'static' # static or dynamic - # the size of batch_size should be set according to your gpu memory size, here we used 2080ti gpu whose memory size is 11GB - batch_size: 16 - sort: true - -grad_clip: 5 -accum_grad: 1 -max_epoch: 180 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.002 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/examples/aishell/s0_deprecated/local/aishell_data_prep.sh b/examples/aishell/s0_deprecated/local/aishell_data_prep.sh deleted file mode 100755 index fb4d5fb0a..000000000 --- a/examples/aishell/s0_deprecated/local/aishell_data_prep.sh +++ /dev/null @@ -1,65 +0,0 @@ -#!/bin/bash - -# Copyright 2017 Xingyu Na -# Apache 2.0 - -. ./path.sh || exit 1; - -if [ $# != 2 ]; then - echo "Usage: $0 <audio-path> <text-path>" - echo " $0 /export/a05/xna/data/data_aishell/wav /export/a05/xna/data/data_aishell/transcript" - exit 1; -fi - -aishell_audio_dir=$1 -aishell_text=$2/aishell_transcript_v0.8.txt - -train_dir=data/local/train -dev_dir=data/local/dev -test_dir=data/local/test -tmp_dir=data/local/tmp - -mkdir -p $train_dir -mkdir -p $dev_dir -mkdir -p $test_dir -mkdir -p $tmp_dir - -# data directory check -if [ ! -d $aishell_audio_dir ] || [ ! -f $aishell_text ]; then - echo "Error: $0 requires two directory arguments" - exit 1; -fi - -# find wav audio file for train, dev and test resp.
-find $aishell_audio_dir -iname "*.wav" > $tmp_dir/wav.flist -n=`cat $tmp_dir/wav.flist | wc -l` -[ $n -ne 141925 ] && \ - echo Warning: expected 141925 data files, found $n - -grep -i "wav/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1; -grep -i "wav/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1; -grep -i "wav/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1; - -rm -r $tmp_dir - -# Transcriptions preparation -for dir in $train_dir $dev_dir $test_dir; do - echo Preparing $dir transcriptions - sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list - paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all - tools/filter_scp.pl -f 1 $dir/utt.list $aishell_text > $dir/transcripts.txt - awk '{print $1}' $dir/transcripts.txt > $dir/utt.list - tools/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp - sort -u $dir/transcripts.txt > $dir/text -done - -mkdir -p data/train data/dev data/test - -for f in wav.scp text; do - cp $train_dir/$f data/train/$f || exit 1; - cp $dev_dir/$f data/dev/$f || exit 1; - cp $test_dir/$f data/test/$f || exit 1; -done - -echo "$0: AISHELL data preparation succeeded" -exit 0; diff --git a/examples/aishell/s0_deprecated/local/aishell_train_lms.sh b/examples/aishell/s0_deprecated/local/aishell_train_lms.sh deleted file mode 100755 index 30ffb7973..000000000 --- a/examples/aishell/s0_deprecated/local/aishell_train_lms.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/bin/bash - - -# To be run from one directory above this script. -. ./path.sh - -text=data/local/lm/text -lexicon=data/local/dict/lexicon.txt - -for f in "$text" "$lexicon"; do - [ ! -f $f ] && echo "$0: No such file $f" && exit 1; -done - -# Check SRILM tools -if ! which ngram-count > /dev/null; then - echo "SRILM tools are not found, please download and install them from: " - echo "http://www.speech.sri.com/projects/srilm/download.html" - echo "Then add the tools to your PATH" - exit 1 -fi - -# This script takes no arguments. It assumes you have already run -# aishell_data_prep.sh. -# It takes as input the files -# data/local/lm/text -# data/local/dict/lexicon.txt -dir=data/local/lm -mkdir -p $dir - - -cleantext=$dir/text.no_oov - -cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } } - {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \ - > $cleantext || exit 1; - -cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \ - sort -nr > $dir/word.counts || exit 1; - -# Get counts from acoustic training transcripts, and add one-count -# for each word in the lexicon (but not silence, we don't want it -# in the LM-- we'll add it optionally later).
-cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \ - cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \ - sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1; - -cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist - -heldout_sent=10000 # Don't change this if you want result to be comparable with - # kaldi_lm results -mkdir -p $dir -cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | head -$heldout_sent > $dir/heldout -cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | tail -n +$heldout_sent > $dir/train - -ngram-count -text $dir/train -order 3 -limit-vocab -vocab $dir/wordlist -unk \ - -map-unk "<UNK>" -kndiscount -interpolate -lm $dir/lm.arpa -ngram -lm $dir/lm.arpa -ppl $dir/heldout diff --git a/examples/aishell/s0_deprecated/local/download_and_untar.sh b/examples/aishell/s0_deprecated/local/download_and_untar.sh deleted file mode 100755 index 58a278241..000000000 --- a/examples/aishell/s0_deprecated/local/download_and_untar.sh +++ /dev/null @@ -1,105 +0,0 @@ -#!/bin/bash - -# Copyright 2014 Johns Hopkins University (author: Daniel Povey) -# 2017 Xingyu Na -# Apache 2.0 - -remove_archive=false - -if [ "$1" == --remove-archive ]; then - remove_archive=true - shift -fi - -if [ $# -ne 3 ]; then - echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>" - echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell" - echo "With --remove-archive it will remove the archive after successfully un-tarring it." - echo "<corpus-part> can be one of: data_aishell, resource_aishell." -fi - -data=$1 -url=$2 -part=$3 - -if [ ! -d "$data" ]; then - echo "$0: no such directory $data" - exit 1; -fi - -part_ok=false -list="data_aishell resource_aishell" -for x in $list; do - if [ "$part" == $x ]; then part_ok=true; fi -done -if ! $part_ok; then - echo "$0: expected <corpus-part> to be one of $list, but got '$part'" - exit 1; -fi - -if [ -z "$url" ]; then - echo "$0: empty URL base." - exit 1; -fi - -if [ -f $data/$part/.complete ]; then - echo "$0: data part $part was already successfully extracted, nothing to do." - exit 0; -fi - -# sizes of the archive files in bytes. -sizes="15582913665 1246920" - -if [ -f $data/$part.tgz ]; then - size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}') - size_ok=false - for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done - if ! $size_ok; then - echo "$0: removing existing file $data/$part.tgz because its size in bytes $size" - echo "does not equal the size of one of the archives." - rm $data/$part.tgz - else - echo "$data/$part.tgz exists and appears to be complete." - fi -fi - -if [ ! -f $data/$part.tgz ]; then - if ! which wget >/dev/null; then - echo "$0: wget is not installed." - exit 1; - fi - full_url=$url/$part.tgz - echo "$0: downloading data from $full_url. This may take some time, please be patient." - - cd $data - if ! wget --no-check-certificate $full_url; then - echo "$0: error executing wget $full_url" - exit 1; - fi -fi - -cd $data - -if ! tar -xvzf $part.tgz; then - echo "$0: error un-tarring archive $data/$part.tgz" - exit 1; -fi - -touch $data/$part/.complete - -if [ $part == "data_aishell" ]; then - cd $data/$part/wav - for wav in ./*.tar.gz; do - echo "Extracting wav from $wav" - tar -zxf $wav && rm $wav - done -fi - -echo "$0: Successfully downloaded and un-tarred $data/$part.tgz" - -if $remove_archive; then - echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied."
- rm $data/$part.tgz -fi - -exit 0; diff --git a/examples/aishell/s0_deprecated/path.sh b/examples/aishell/s0_deprecated/path.sh deleted file mode 100644 index 8d4c9092e..000000000 --- a/examples/aishell/s0_deprecated/path.sh +++ /dev/null @@ -1,8 +0,0 @@ -export WENET_DIR=$PWD/../../.. -export BUILD_DIR=${WENET_DIR}/runtime/libtorch/build -export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix -export PATH=$PWD:${BUILD_DIR}:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH - -# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C -export PYTHONIOENCODING=UTF-8 -export PYTHONPATH=../../../:$PYTHONPATH diff --git a/examples/aishell/s0_deprecated/run.sh b/examples/aishell/s0_deprecated/run.sh deleted file mode 100644 index fa79d0d63..000000000 --- a/examples/aishell/s0_deprecated/run.sh +++ /dev/null @@ -1,263 +0,0 @@ -#!/bin/bash - -# Copyright 2019 Mobvoi Inc. All Rights Reserved. -. ./path.sh || exit 1; - -# Use this to control how many gpus you use. It's 1-gpu training if you specify -# just 1 gpu, otherwise it's multi-gpu training based on DDP in pytorch -export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" -# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl -# communication. More details can be found in -# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html -# export NCCL_SOCKET_IFNAME=ens4f1 -export NCCL_DEBUG=INFO -stage=0 # start from 0 if you need to start from data preparation -stop_stage=6 -# The num of nodes or machines used for multi-machine training -# Default 1 for single machine/node -# NFS will be needed if you want to run multi-machine training -num_nodes=1 -# The rank of each node or machine, range from 0 to num_nodes - 1 -# The first node/machine sets node_rank 0, the second one sets node_rank 1, -# the third one sets node_rank 2, and so on. Default 0 -node_rank=0 -# data -data=/export/data/asr-data/OpenSLR/33/ -data_url=www.openslr.org/resources/33 - -nj=16 -feat_dir=raw_wav -dict=data/dict/lang_char.txt - -train_set=train -# Optional train_config -# 1. conf/train_transformer.yaml: Standard transformer -# 2. conf/train_conformer.yaml: Standard conformer -# 3. conf/train_unified_conformer.yaml: Unified dynamic chunk causal conformer -# 4. conf/train_unified_transformer.yaml: Unified dynamic chunk transformer -# 5. conf/train_conformer_no_pos.yaml: Conformer without relative positional encoding -# 6. conf/train_u2++_conformer.yaml: U2++ conformer -# 7. conf/train_u2++_transformer.yaml: U2++ transformer -train_config=conf/train_conformer.yaml -cmvn=true -dir=exp/conformer -checkpoint= - -# using average_checkpoint will get a better result -average_checkpoint=true -decode_checkpoint=$dir/final.pt -average_num=30 -decode_modes="ctc_greedy_search ctc_prefix_beam_search attention attention_rescoring" - -.
tools/parse_options.sh || exit 1; - -if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then - echo "stage -1: Data Download" - local/download_and_untar.sh ${data} ${data_url} data_aishell - local/download_and_untar.sh ${data} ${data_url} resource_aishell -fi - -if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then - # Data preparation - local/aishell_data_prep.sh ${data}/data_aishell/wav ${data}/data_aishell/transcript -fi - - -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - # remove the space between the text labels for Mandarin dataset - for x in train dev test; do - cp data/${x}/text data/${x}/text.org - paste -d " " <(cut -f 1 -d" " data/${x}/text.org) <(cut -f 2- -d" " data/${x}/text.org | tr -d " ") \ - > data/${x}/text - rm data/${x}/text.org - done - # For wav feature, just copy the data. Fbank extraction is done in training - mkdir -p $feat_dir - for x in ${train_set} dev test; do - cp -r data/$x $feat_dir - done - - tools/compute_cmvn_stats_deprecated.py --num_workers 16 --train_config $train_config \ - --in_scp data/${train_set}/wav.scp \ - --out_cmvn $feat_dir/$train_set/global_cmvn - -fi - -if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then - # Make train dict - echo "Make a dictionary" - mkdir -p $(dirname $dict) - echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC - echo "<unk> 1" >> ${dict} # <unk> must be 1 - tools/text2token.py -s 1 -n 1 data/train/text | cut -f 2- -d" " | tr " " "\n" \ - | sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0 " " NR+1}' >> ${dict} - num_token=$(cat $dict | wc -l) - echo "<sos/eos> $num_token" >> $dict # <eos> -fi - -if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - nj=32 - # Prepare wenet required data - echo "Prepare data, prepare required format" - for x in dev test ${train_set}; do - tools/format_data.sh --nj ${nj} \ - --feat-type wav --feat $feat_dir/$x/wav.scp \ - $feat_dir/$x ${dict} > $feat_dir/$x/format.data.tmp - - tools/remove_longshortdata.py \ - --min_input_len 0.5 \ - --max_input_len 20 \ - --max_output_len 400 \ - --max_output_input_ratio 10.0 \ - --data_file $feat_dir/$x/format.data.tmp \ - --output_data_file $feat_dir/$x/format.data - done -fi - -if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - # Training - mkdir -p $dir - INIT_FILE=$dir/ddp_init - # You had better rm it manually before you start run.sh on the first node. - # rm -f $INIT_FILE # delete old one before starting - init_method=file://$(readlink -f $INIT_FILE) - echo "$0: init method is $init_method" - # The number of gpus running on each node/machine - num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') - # Use "nccl" if it works, otherwise use "gloo" - dist_backend="nccl" - # The total number of processes/gpus, so that the master knows - # how many workers to wait for. - # More details about ddp can be found in - # https://pytorch.org/tutorials/intermediate/dist_tuto.html - world_size=`expr $num_gpus \* $num_nodes` - echo "total gpus is: $world_size" - cmvn_opts= - $cmvn && cp ${feat_dir}/${train_set}/global_cmvn $dir - $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" - # train.py will write $train_config to $dir/train.yaml with model input - # and output dimension, train.yaml will be used for inference or model - # export later - for ((i = 0; i < $num_gpus; ++i)); do - { - gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) - # Rank of each gpu/process used for knowing whether it is - # the master or a worker.
- rank=`expr $node_rank \* $num_gpus + $i` - python wenet/bin/train_deprecated.py --gpu $gpu_id \ - --config $train_config \ - --train_data $feat_dir/$train_set/format.data \ - --cv_data $feat_dir/dev/format.data \ - ${checkpoint:+--checkpoint $checkpoint} \ - --model_dir $dir \ - --ddp.init_method $init_method \ - --ddp.world_size $world_size \ - --ddp.rank $rank \ - --ddp.dist_backend $dist_backend \ - --num_workers 2 \ - $cmvn_opts \ - --pin_memory - } & - done - wait -fi - -if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then - # Test model, please specify the model you want to test by --checkpoint - if [ ${average_checkpoint} == true ]; then - decode_checkpoint=$dir/avg_${average_num}.pt - echo "do model average and final checkpoint is $decode_checkpoint" - python wenet/bin/average_model.py \ - --dst_model $decode_checkpoint \ - --src_path $dir \ - --num ${average_num} \ - --val_best - fi - # Specify decoding_chunk_size if it's a unified dynamic chunk trained model - # -1 for full chunk - decoding_chunk_size= - ctc_weight=0.5 - reverse_weight=0.0 - for mode in ${decode_modes}; do - { - test_dir=$dir/test_${mode} - mkdir -p $test_dir - python wenet/bin/recognize_deprecated.py --gpu 0 \ - --mode $mode \ - --config $dir/train.yaml \ - --test_data $feat_dir/test/format.data \ - --checkpoint $decode_checkpoint \ - --beam_size 10 \ - --batch_size 1 \ - --penalty 0.0 \ - --dict $dict \ - --ctc_weight $ctc_weight \ - --reverse_weight $reverse_weight \ - --result_file $test_dir/text \ - ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size} - python tools/compute-wer.py --char=1 --v=1 \ - $feat_dir/test/text $test_dir/text > $test_dir/wer - } & - done - wait - -fi - -if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then - # Export the best model you want - python wenet/bin/export_jit.py \ - --config $dir/train.yaml \ - --checkpoint $dir/avg_${average_num}.pt \ - --output_file $dir/final.zip \ - --output_quant_file $dir/final_quant.zip -fi - -# Optionally, you can add LM and test it with runtime. -if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then - # 7.1 Prepare dict - unit_file=$dict - mkdir -p data/local/dict - cp $unit_file data/local/dict/units.txt - tools/fst/prepare_dict.py $unit_file ${data}/resource_aishell/lexicon.txt \ - data/local/dict/lexicon.txt - # 7.2 Train lm - lm=data/local/lm - mkdir -p $lm - tools/filter_scp.pl data/train/text \ - $data/data_aishell/transcript/aishell_transcript_v0.8.txt > $lm/text - local/aishell_train_lms.sh - # 7.3 Build decoding TLG - tools/fst/compile_lexicon_token_fst.sh \ - data/local/dict data/local/tmp data/local/lang - tools/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1; - # 7.4 Decoding with runtime - # reverse_weight only works for the u2++ model, and only the left-to-right decoder is used when it is set to 0.0.
- reverse_weight=0.0 - chunk_size=-1 - ./tools/decode.sh --nj 16 \ - --beam 15.0 --lattice_beam 7.5 --max_active 7000 \ - --blank_skip_thresh 0.98 --ctc_weight 0.5 --rescoring_weight 1.0 \ - --reverse_weight $reverse_weight --chunk_size $chunk_size \ - --fst_path data/lang_test/TLG.fst \ - --dict_path data/lang_test/words.txt \ - data/test/wav.scp data/test/text $dir/final.zip \ - data/lang_test/units.txt $dir/lm_with_runtime - # See $dir/lm_with_runtime for wer -fi - -if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then - # Test model, please specify the model you want to use by --checkpoint - # alignment input - ali_format=$feat_dir/test/format.data - # alignment output - ali_result=$dir/ali - python wenet/bin/alignment_deprecated.py --gpu -1 \ - --config $dir/train.yaml \ - --input_file $ali_format \ - --checkpoint $checkpoint \ - --batch_size 1 \ - --dict $dict \ - --result_file $ali_result \ - --gen_praat -fi - diff --git a/examples/aishell/s0_deprecated/tools b/examples/aishell/s0_deprecated/tools deleted file mode 120000 index 570c2efd6..000000000 --- a/examples/aishell/s0_deprecated/tools +++ /dev/null @@ -1 +0,0 @@ -../../../tools/ \ No newline at end of file diff --git a/examples/aishell/s0_deprecated/wenet b/examples/aishell/s0_deprecated/wenet deleted file mode 120000 index 5f46eee4d..000000000 --- a/examples/aishell/s0_deprecated/wenet +++ /dev/null @@ -1 +0,0 @@ -../../../wenet/ \ No newline at end of file diff --git a/examples/aishell/s1/README.md b/examples/aishell/s1/README.md deleted file mode 100644 index c6b4210b1..000000000 --- a/examples/aishell/s1/README.md +++ /dev/null @@ -1,84 +0,0 @@ -# ASR Benchmark Results on AISHELL-1 - -## Standard E2E Results - -Conformer -* feature: fbank -* config: conf/train_conformer.yaml -* beam: 10 -* num of gpu: 4 -* learning rate: 0.002 -* ctc weight (used for attention rescoring): 0.8 -* num of averaged model: 20 -* use spec substitute - -| decoding mode/chunk size | full | -|--------------------------|------| -| attention decoder | 5.15 | -| ctc greedy search | 4.85 | -| ctc prefix beam search | 4.85 | -| attention rescoring | 4.48 | - -Conformer -* feature: fbank -* config: conf/train_conformer.yaml -* beam: 10 -* num of gpu: 4 -* learning rate: 0.002 -* ctc weight (used for attention rescoring): 0.6 -* num of averaged model: 20 - -| decoding mode/chunk size | full | -|--------------------------|------| -| attention decoder | 5.20 | -| ctc greedy search | 4.92 | -| ctc prefix beam search | 4.92 | -| attention rescoring | 4.61 | - -Conformer -* feature: fbank & pitch -* config: conf/train_conformer.yaml -* beam: 10 -* num of gpu: 4 -* learning rate: 0.002 -* ctc weight (used for attention rescoring): 0.7 -* num of averaged model: 20 - -| decoding mode/chunk size | full | -|--------------------------|------| -| attention decoder | 4.92 | -| ctc greedy search | 4.93 | -| ctc prefix beam search | 4.93 | -| attention rescoring | 4.64 | - - -Transformer -* config: conf/train_transformer.yaml -* beam: 10 -* num of gpu: 8 -* ctc weight (used for attention rescoring): 0.5 - -| decoding mode/chunk size | full | -|--------------------------|------| -| attention decoder | 5.67 | -| ctc greedy search | 5.88 | -| ctc prefix beam search | 5.88 | -| attention rescoring | 5.30 | - - - -## Unified Dynamic Chunk Results - -Conformer (causal convolution) -* config: conf/train_unified_conformer.yaml -* beam: 10 -* num of gpu: 8 -* ctc weight (used for attention rescoring): 0.5 - -| decoding mode/chunk size | full | 16 | 8 | 4 | 1 | 
-|--------------------------|------|------|------|------|------| -| attention decoder | 5.27 | 5.51 | 5.67 | 5.72 | 5.88 | -| ctc greedy search | 5.49 | 6.08 | 6.41 | 6.64 | 7.58 | -| ctc prefix beam search | 5.49 | 6.08 | 6.41 | 6.64 | 7.58 | -| attention rescoring | 4.90 | 5.33 | 5.52 | 5.71 | 6.23 | - diff --git a/examples/aishell/s1/cmd.sh b/examples/aishell/s1/cmd.sh deleted file mode 100644 index b4e9058f4..000000000 --- a/examples/aishell/s1/cmd.sh +++ /dev/null @@ -1,5 +0,0 @@ -export train_cmd="run.pl" -export decode_cmd="run.pl" -export mkgraph_cmd="run.pl" -export cuda_cmd="run.pl" -export other_cmd="run.pl" diff --git a/examples/aishell/s1/conf/fbank.conf b/examples/aishell/s1/conf/fbank.conf deleted file mode 100644 index 19bfba514..000000000 --- a/examples/aishell/s1/conf/fbank.conf +++ /dev/null @@ -1,5 +0,0 @@ ---sample-frequency=16000 ---num-mel-bins=80 ---dither=1.0 ---allow-upsample=true ---allow-downsample=true diff --git a/examples/aishell/s1/conf/pitch.conf b/examples/aishell/s1/conf/pitch.conf deleted file mode 100644 index e959a19d5..000000000 --- a/examples/aishell/s1/conf/pitch.conf +++ /dev/null @@ -1 +0,0 @@ ---sample-frequency=16000 diff --git a/examples/aishell/s1/conf/train_conformer.yaml b/examples/aishell/s1/conf/train_conformer.yaml deleted file mode 100644 index 6703875de..000000000 --- a/examples/aishell/s1/conf/train_conformer.yaml +++ /dev/null @@ -1,76 +0,0 @@ -# network architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 15 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - -# decoder related -decoder: transformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -# use raw_wav or kaldi feature -raw_wav: false - -# feature extraction -collate_conf: - # spec level config - feature_dither: 0.0 # add dither [-feature_dither,feature_dither] on fbank feature - spec_aug: true - spec_aug_conf: - warp_for_time: False - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - max_w: 80 - spec_sub: true - spec_sub_conf: - num_t_sub: 3 - max_t: 20 - -# dataset related -dataset_conf: - max_length: 10240 - min_length: 0 - batch_type: 'static' # static or dynamic - # the size of batch_size should be set according to your gpu memory size, here we used titan xp gpu whose memory size is 12GB - batch_size: 20 - sort: true - -grad_clip: 5 -accum_grad: 4 -max_epoch: 80 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.002 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/examples/aishell/s1/conf/train_transformer.yaml b/examples/aishell/s1/conf/train_transformer.yaml deleted file mode 100644 index a008a77c3..000000000 --- a/examples/aishell/s1/conf/train_transformer.yaml +++ /dev/null @@ -1,67 +0,0 @@ -# network architecture -# encoder related -encoder: transformer
-encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder architecture type - normalize_before: true - -# decoder related -decoder: transformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -# use raw_wav or kaldi feature -raw_wav: false - -# feature extraction -collate_conf: - # spec level config - feature_dither: 0.0 # add dither [-feature_dither,feature_dither] on fbank feature - spec_aug: true - spec_aug_conf: - warp_for_time: False - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - max_w: 80 - -# dataset related -dataset_conf: - max_length: 10240 - min_length: 0 - batch_type: 'static' # static or dynamic - # the size of batch_size should be set according to your gpu memory size, here we used titan xp gpu whose memory size is 12GB - batch_size: 20 - sort: true - -grad_clip: 5 -accum_grad: 4 -max_epoch: 1 -log_interval: 100 - -optim: adam -optim_conf: - lr: 0.002 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/examples/aishell/s1/conf/train_unified_conformer.yaml b/examples/aishell/s1/conf/train_unified_conformer.yaml deleted file mode 100644 index 25da3547f..000000000 --- a/examples/aishell/s1/conf/train_unified_conformer.yaml +++ /dev/null @@ -1,75 +0,0 @@ -# network architecture -# encoder related -encoder: conformer -encoder_conf: - output_size: 256 # dimension of attention - attention_heads: 4 - linear_units: 2048 # the number of units of position-wise feed forward - num_blocks: 12 # the number of encoder blocks - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - attention_dropout_rate: 0.0 - input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8 - normalize_before: true - cnn_module_kernel: 15 - use_cnn_module: True - activation_type: 'swish' - pos_enc_layer_type: 'rel_pos' - selfattention_layer_type: 'rel_selfattn' - causal: true - use_dynamic_chunk: true - cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster - -# decoder related -decoder: transformer -decoder_conf: - attention_heads: 4 - linear_units: 2048 - num_blocks: 6 - dropout_rate: 0.1 - positional_dropout_rate: 0.1 - self_attention_dropout_rate: 0.0 - src_attention_dropout_rate: 0.0 - -# hybrid CTC/attention -model_conf: - ctc_weight: 0.3 - lsm_weight: 0.1 # label smoothing option - length_normalized_loss: false - -# use raw_wav or kaldi feature -raw_wav: false - -# feature extraction -collate_conf: - # spec level config - feature_dither: 0.0 # add dither [-feature_dither,feature_dither] on fbank feature - spec_aug: true - spec_aug_conf: - warp_for_time: False - num_t_mask: 2 - num_f_mask: 2 - max_t: 50 - max_f: 10 - max_w: 80 - -# dataset related -dataset_conf: - max_length: 10240 - min_length: 0 - batch_type: 'static' # static or dynamic - # the size of batch_size should be set according to your gpu memory size, here we used titan xp gpu whose memory size is 12GB - batch_size: 20 - sort: true - -grad_clip: 5 -accum_grad: 1 -max_epoch: 80 -log_interval: 100 - -optim: adam
-optim_conf: - lr: 0.001 -scheduler: warmuplr # pytorch v1.1.0+ required -scheduler_conf: - warmup_steps: 25000 diff --git a/examples/aishell/s1/local/aishell_data_prep.sh b/examples/aishell/s1/local/aishell_data_prep.sh deleted file mode 100755 index 4747e4f4d..000000000 --- a/examples/aishell/s1/local/aishell_data_prep.sh +++ /dev/null @@ -1,68 +0,0 @@ -#!/bin/bash - -# Copyright 2017 Xingyu Na -# Apache 2.0 - -. ./path.sh || exit 1; - -if [ $# != 2 ]; then - echo "Usage: $0 <audio-path> <text-path>" - echo " $0 /export/a05/xna/data/data_aishell/wav /export/a05/xna/data/data_aishell/transcript" - exit 1; -fi - -aishell_audio_dir=$1 -aishell_text=$2/aishell_transcript_v0.8.txt - -train_dir=data/local/train -dev_dir=data/local/dev -test_dir=data/local/test -tmp_dir=data/local/tmp - -mkdir -p $train_dir -mkdir -p $dev_dir -mkdir -p $test_dir -mkdir -p $tmp_dir - -# data directory check -if [ ! -d $aishell_audio_dir ] || [ ! -f $aishell_text ]; then - echo "Error: $0 requires two directory arguments" - exit 1; -fi - -# find wav audio file for train, dev and test resp. -find $aishell_audio_dir -iname "*.wav" > $tmp_dir/wav.flist -n=`cat $tmp_dir/wav.flist | wc -l` -[ $n -ne 141925 ] && \ - echo Warning: expected 141925 data files, found $n - -grep -i "wav/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1; -grep -i "wav/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1; -grep -i "wav/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1; - -rm -r $tmp_dir - -# Transcriptions preparation -for dir in $train_dir $dev_dir $test_dir; do - echo Preparing $dir transcriptions - sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list - sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{i=NF-1;printf("%s %s\n",$NF,$i)}' > $dir/utt2spk_all - paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all - utils/filter_scp.pl -f 1 $dir/utt.list $aishell_text > $dir/transcripts.txt - awk '{print $1}' $dir/transcripts.txt > $dir/utt.list - utils/filter_scp.pl -f 1 $dir/utt.list $dir/utt2spk_all | sort -u > $dir/utt2spk - utils/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp - sort -u $dir/transcripts.txt > $dir/text - utils/utt2spk_to_spk2utt.pl $dir/utt2spk > $dir/spk2utt -done - -mkdir -p data/train data/dev data/test - -for f in spk2utt utt2spk wav.scp text; do - cp $train_dir/$f data/train/$f || exit 1; - cp $dev_dir/$f data/dev/$f || exit 1; - cp $test_dir/$f data/test/$f || exit 1; -done - -echo "$0: AISHELL data preparation succeeded" -exit 0; diff --git a/examples/aishell/s1/local/download_and_untar.sh b/examples/aishell/s1/local/download_and_untar.sh deleted file mode 100755 index 58a278241..000000000 --- a/examples/aishell/s1/local/download_and_untar.sh +++ /dev/null @@ -1,105 +0,0 @@ -#!/bin/bash - -# Copyright 2014 Johns Hopkins University (author: Daniel Povey) -# 2017 Xingyu Na -# Apache 2.0 - -remove_archive=false - -if [ "$1" == --remove-archive ]; then - remove_archive=true - shift -fi - -if [ $# -ne 3 ]; then - echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>" - echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell" - echo "With --remove-archive it will remove the archive after successfully un-tarring it." - echo "<corpus-part> can be one of: data_aishell, resource_aishell." -fi - -data=$1 -url=$2 -part=$3 - -if [ ! -d "$data" ]; then - echo "$0: no such directory $data" - exit 1; -fi - -part_ok=false -list="data_aishell resource_aishell" -for x in $list; do - if [ "$part" == $x ]; then part_ok=true; fi -done -if !
diff --git a/examples/aishell/s1/local/download_and_untar.sh b/examples/aishell/s1/local/download_and_untar.sh
deleted file mode 100755
index 58a278241..000000000
--- a/examples/aishell/s1/local/download_and_untar.sh
+++ /dev/null
@@ -1,105 +0,0 @@
-#!/bin/bash
-
-# Copyright 2014 Johns Hopkins University (author: Daniel Povey)
-#           2017 Xingyu Na
-# Apache 2.0
-
-remove_archive=false
-
-if [ "$1" == --remove-archive ]; then
-  remove_archive=true
-  shift
-fi
-
-if [ $# -ne 3 ]; then
-  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
-  echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell"
-  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
-  echo "<corpus-part> can be one of: data_aishell, resource_aishell."
-fi
-
-data=$1
-url=$2
-part=$3
-
-if [ ! -d "$data" ]; then
-  echo "$0: no such directory $data"
-  exit 1;
-fi
-
-part_ok=false
-list="data_aishell resource_aishell"
-for x in $list; do
-  if [ "$part" == $x ]; then part_ok=true; fi
-done
-if ! $part_ok; then
-  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
-  exit 1;
-fi
-
-if [ -z "$url" ]; then
-  echo "$0: empty URL base."
-  exit 1;
-fi
-
-if [ -f $data/$part/.complete ]; then
-  echo "$0: data part $part was already successfully extracted, nothing to do."
-  exit 0;
-fi
-
-# sizes of the archive files in bytes.
-sizes="15582913665 1246920"
-
-if [ -f $data/$part.tgz ]; then
-  size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}')
-  size_ok=false
-  for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done
-  if ! $size_ok; then
-    echo "$0: removing existing file $data/$part.tgz because its size in bytes $size"
-    echo "does not equal the size of one of the archives."
-    rm $data/$part.tgz
-  else
-    echo "$data/$part.tgz exists and appears to be complete."
-  fi
-fi
-
-if [ ! -f $data/$part.tgz ]; then
-  if ! which wget >/dev/null; then
-    echo "$0: wget is not installed."
-    exit 1;
-  fi
-  full_url=$url/$part.tgz
-  echo "$0: downloading data from $full_url. This may take some time, please be patient."
-
-  cd $data
-  if ! wget --no-check-certificate $full_url; then
-    echo "$0: error executing wget $full_url"
-    exit 1;
-  fi
-fi
-
-cd $data
-
-if ! tar -xvzf $part.tgz; then
-  echo "$0: error un-tarring archive $data/$part.tgz"
-  exit 1;
-fi
-
-touch $data/$part/.complete
-
-if [ $part == "data_aishell" ]; then
-  cd $data/$part/wav
-  for wav in ./*.tar.gz; do
-    echo "Extracting wav from $wav"
-    tar -zxf $wav && rm $wav
-  done
-fi
-
-echo "$0: Successfully downloaded and un-tarred $data/$part.tgz"
-
-if $remove_archive; then
-  echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied."
-  rm $data/$part.tgz
-fi
-
-exit 0;
diff --git a/examples/aishell/s1/path.sh b/examples/aishell/s1/path.sh
deleted file mode 100644
index 55c5b8213..000000000
--- a/examples/aishell/s1/path.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-KALDI_ROOT=/export/maryland/binbinzhang/kaldi
-
-[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
-export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sctk/bin:$PWD:$PATH
-[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
-. $KALDI_ROOT/tools/config/common_path.sh
-
-[ ! -d utils ] && ln -s $KALDI_ROOT/egs/wsj/s5/utils
-[ ! -d steps ] && ln -s $KALDI_ROOT/egs/wsj/s5/steps
-
-export LC_ALL=C
-
-# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=../../../:$PYTHONPATH
diff --git a/examples/aishell/s1/run.sh b/examples/aishell/s1/run.sh
deleted file mode 100644
index 8c40a2508..000000000
--- a/examples/aishell/s1/run.sh
+++ /dev/null
@@ -1,183 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 Mobvoi Inc. All Rights Reserved.
-
-. ./path.sh || exit 1;
-. ./cmd.sh || exit 1;
-
-# Use this to control how many gpus you use. It's 1-gpu training if you specify
-# just 1 gpu; otherwise it's multi-gpu training based on DDP in PyTorch.
-export CUDA_VISIBLE_DEVICES="0,1,2,3"
-stage=4 # start from 0 if you need to start from data preparation
-stop_stage=6
-# The aishell dataset location, please change this to your own path
-# make sure to use an absolute path. DO-NOT-USE a relative path!
-data=/export/data/asr-data/OpenSLR/33/
-data_url=www.openslr.org/resources/33
-
-nj=16
-feat_dir=fbank
-dict=data/dict/lang_char.txt
-
-train_set=train_sp
-# Optional train_config
-# 1. conf/train_transformer.yaml: Standard transformer
-# 2. conf/train_conformer.yaml: Standard conformer
-# 3. conf/train_unified_conformer.yaml: Unified dynamic chunk causal conformer
-train_config=conf/train_conformer.yaml
-cmvn=true
-compress=true
-fbank_conf=conf/fbank.conf
-dir=exp/fbank_sp
-checkpoint=
-
-# using average_checkpoint will get a better result
-average_checkpoint=true
-decode_checkpoint=$dir/final.pt
-average_num=20
-decode_modes="ctc_greedy_search ctc_prefix_beam_search attention attention_rescoring"
-
-. utils/parse_options.sh || exit 1;
-
-if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
-  echo "stage -1: Data Download"
-  local/download_and_untar.sh ${data} ${data_url} data_aishell
-  local/download_and_untar.sh ${data} ${data_url} resource_aishell
-fi
-
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-  # Data preparation
-  local/aishell_data_prep.sh ${data}/data_aishell/wav ${data}/data_aishell/transcript
-  utils/perturb_data_dir_speed.sh 0.9 data/train data/train_sp0.9
-  utils/perturb_data_dir_speed.sh 1.1 data/train data/train_sp1.1
-  utils/combine_data.sh data/train_sp data/train data/train_sp0.9 data/train_sp1.1
-  # Remove the space in Mandarin text
-  for x in train_sp dev test; do
-    cp data/${x}/text data/${x}/text.org
-    paste -d " " <(cut -f 1 -d" " data/${x}/text.org) <(cut -f 2- -d" " data/${x}/text.org | tr -d " ") \
-      > data/${x}/text
-    rm data/${x}/text.org
-  done
-
-fi
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-  # Feature extraction
-  mkdir -p $feat_dir
-  for x in ${train_set} dev test; do
-    cp -r data/$x $feat_dir
-    steps/make_fbank.sh --cmd "$train_cmd" --nj $nj \
-      --write_utt2num_frames true --fbank_config $fbank_conf --compress $compress $feat_dir/$x
-  done
-  if $cmvn; then
-    compute-cmvn-stats --binary=false scp:$feat_dir/$train_set/feats.scp \
-      $feat_dir/$train_set/global_cmvn
-  fi
-fi
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-  # Make train dict
-  echo "Make a dictionary"
-  mkdir -p $(dirname $dict)
-  echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
-  echo "<unk> 1" >> ${dict} # <unk> must be 1
-  tools/text2token.py -s 1 -n 1 data/train/text | cut -f 2- -d" " | tr " " "\n" \
-    | sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0 " " NR+1}' >> ${dict}
-  num_token=$(cat $dict | wc -l)
-  echo "<sos/eos> $num_token" >> $dict # <eos>
-fi
-
-if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
-  # Prepare wenet required data
-  echo "Prepare data, prepare required format"
-  for x in dev test ${train_set}; do
-    tools/format_data.sh --nj ${nj} --feat $feat_dir/$x/feats.scp \
-      $feat_dir/$x ${dict} > $feat_dir/$x/format.data
-  done
-fi
-
-if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
-  # Training
-  mkdir -p $dir
-  INIT_FILE=$dir/ddp_init
-  rm -f $INIT_FILE # delete old one before starting
-  init_method=file://$(readlink -f $INIT_FILE)
-  echo "$0: init method is $init_method"
-  num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
-  # Use "nccl" if it works, otherwise use "gloo"
-  dist_backend="nccl"
-  cp ${feat_dir}/${train_set}/global_cmvn $dir
-  cmvn_opts=
-  $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
-  # train.py will write $train_config to $dir/train.yaml with model input
-  # and output dimension, train.yaml will be used for inference or model
-  # export later
-  for ((i = 0; i < $num_gpus; ++i)); do
-  {
-    gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
-    python wenet/bin/train_deprecated.py --gpu $gpu_id \
-      --config $train_config \
-      --train_data $feat_dir/$train_set/format.data \
-      --cv_data $feat_dir/dev/format.data \
-      ${checkpoint:+--checkpoint $checkpoint} \
-      --model_dir $dir \
-      --ddp.init_method $init_method \
-      --ddp.world_size $num_gpus \
-      --ddp.rank $i \
-      --ddp.dist_backend $dist_backend \
-      --num_workers 2 \
-      $cmvn_opts
-  } &
-  done
-  wait
-fi
-
-if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
-  # Test model, please specify the model you want to test by --checkpoint
-  # TODO, Add model average here
-  mkdir -p $dir/test
-  if [ ${average_checkpoint} == true ]; then
-    decode_checkpoint=$dir/avg_${average_num}.pt
-    echo "do model average and final checkpoint is $decode_checkpoint"
-    python wenet/bin/average_model.py \
-      --dst_model $decode_checkpoint \
-      --src_path $dir \
-      --num ${average_num} \
-      --val_best
-  fi
-  # Specify decoding_chunk_size if it's a unified dynamic chunk trained model
-  # -1 for full chunk
-  decoding_chunk_size=
-  ctc_weight=0.5
-  for mode in ${decode_modes}; do
-  {
-    test_dir=$dir/test_${mode}
-    mkdir -p $test_dir
-    python wenet/bin/recognize_deprecated.py --gpu 0 \
-      --mode $mode \
-      --config $dir/train.yaml \
-      --test_data $feat_dir/test/format.data \
-      --checkpoint $decode_checkpoint \
-      --beam_size 10 \
-      --batch_size 1 \
-      --penalty 0.0 \
-      --dict $dict \
-      --ctc_weight $ctc_weight \
-      --result_file $test_dir/text \
-      ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
-    python tools/compute-wer.py --char=1 --v=1 \
-      $feat_dir/test/text $test_dir/text > $test_dir/wer
-  } &
-  done
-  wait
-
-fi
-
-if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
-  # Export the best model you want
-  python wenet/bin/export_jit.py \
-    --config $dir/train.yaml \
-    --checkpoint $dir/avg_${average_num}.pt \
-    --output_file $dir/final.zip
-fi
-
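
Stage 2 of the run script above builds the modeling-unit dictionary directly from the training transcripts, one character per line with its integer id. A sketch of what data/dict/lang_char.txt ends up looking like (the characters and the final id are illustrative; the real vocabulary size depends on the transcripts):

    <blank> 0
    <unk> 1
    一 2
    丁 3
    ...
    <sos/eos> 4232
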
diff --git a/examples/aishell/s1/tools b/examples/aishell/s1/tools
deleted file mode 120000
index c92f4172d..000000000
--- a/examples/aishell/s1/tools
+++ /dev/null
@@ -1 +0,0 @@
-../../../tools
\ No newline at end of file
diff --git a/examples/aishell/s1/wenet b/examples/aishell/s1/wenet
deleted file mode 120000
index 702de77db..000000000
--- a/examples/aishell/s1/wenet
+++ /dev/null
@@ -1 +0,0 @@
-../../../wenet
\ No newline at end of file
diff --git a/examples/librispeech/s1/README.md b/examples/librispeech/s1/README.md
deleted file mode 100644
index 97daf2be1..000000000
--- a/examples/librispeech/s1/README.md
+++ /dev/null
@@ -1,46 +0,0 @@
-# ASR Benchmark Results on LibriSpeech
-
-## Standard E2E Results
-
-Conformer without speed perturb and LM
-* config: conf/train_conformer_large.yaml
-* beam: 10
-* num of gpu: 8
-* num of averaged model: 20
-* ctc weight (used for attention rescoring): 0.5
-
-test clean (chunk size = full)
-| decoding mode            | WER  |
-|--------------------------|------|
-| attention rescoring      | 2.85 |
-
-test other (chunk size = full)
-| decoding mode            | WER  |
-|--------------------------|------|
-| attention rescoring      | 7.24 |
-
-
-## Unified Dynamic Chunk Results
-
-Conformer (causal convolution)
-* config: conf/train_unified_conformer.yaml
-* beam: 10
-* num of gpu: 8
-* ctc weight (used for attention rescoring): 0.5
-* num of averaged model: 30
-
-test clean
-| decoding mode/chunk size | full | 16   |
-|--------------------------|------|------|
-| attention decoder        | 5.17 | 5.21 |
-| ctc greedy search        | 3.99 | 4.74 |
-| ctc prefix beam search   | 3.99 | 4.74 |
-| attention rescoring      | 3.39 | 3.94 |
-
-test other
-| decoding mode/chunk size | full | 16    |
-|--------------------------|------|-------|
-| attention decoder        | 9.41 | 10.75 |
-| ctc greedy search        | 9.80 | 11.86 |
-| ctc prefix beam search   | 9.80 | 11.85 |
-| attention rescoring      | 8.64 | 10.52 |
diff --git a/examples/librispeech/s1/cmd.sh b/examples/librispeech/s1/cmd.sh
deleted file mode 100644
index b4e9058f4..000000000
--- a/examples/librispeech/s1/cmd.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-export train_cmd="run.pl"
-export decode_cmd="run.pl"
-export mkgraph_cmd="run.pl"
-export cuda_cmd="run.pl"
-export other_cmd="run.pl"
diff --git a/examples/librispeech/s1/conf/fbank.conf b/examples/librispeech/s1/conf/fbank.conf
deleted file mode 100644
index 752323586..000000000
--- a/examples/librispeech/s1/conf/fbank.conf
+++ /dev/null
@@ -1,2 +0,0 @@
---sample-frequency=16000
---num-mel-bins=80
diff --git a/examples/librispeech/s1/conf/pitch.conf b/examples/librispeech/s1/conf/pitch.conf
deleted file mode 100644
index e959a19d5..000000000
--- a/examples/librispeech/s1/conf/pitch.conf
+++ /dev/null
@@ -1 +0,0 @@
---sample-frequency=16000
diff --git a/examples/librispeech/s1/conf/train_conformer_large.yaml b/examples/librispeech/s1/conf/train_conformer_large.yaml
deleted file mode 100644
index 59220e614..000000000
--- a/examples/librispeech/s1/conf/train_conformer_large.yaml
+++ /dev/null
@@ -1,71 +0,0 @@
-# network architecture
-# encoder related
-encoder: conformer
-encoder_conf:
-    output_size: 512 # dimension of attention
-    attention_heads: 8
-    linear_units: 2048 # the number of units of position-wise feed forward
-    num_blocks: 12 # the number of encoder blocks
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.0
-    attention_dropout_rate: 0.0
-    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
-    normalize_before: true
-    cnn_module_kernel: 31
-    use_cnn_module: True
-    activation_type: 'swish'
-    pos_enc_layer_type: 'rel_pos'
-    selfattention_layer_type: 'rel_selfattn'
-
-# decoder related
-decoder: transformer
-decoder_conf:
-    attention_heads: 8
-    linear_units: 2048
-    num_blocks: 6
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.0
-    self_attention_dropout_rate: 0.0
-    src_attention_dropout_rate: 0.0
-
-# hybrid CTC/attention
-model_conf:
-    ctc_weight: 0.3
-    lsm_weight: 0.1 # label smoothing option
-    length_normalized_loss: false
-
-# use raw_wav or kaldi feature
-raw_wav: false
-
-collate_conf:
-    # spec level config
-    feature_dither: 0.0 # add dither [-feature_dither,feature_dither] on fbank feature
-    spec_aug: true
-    spec_aug_conf:
-        warp_for_time: False
-        num_t_mask: 2
-        num_f_mask: 2
-        max_t: 50
-        max_f: 10
-        max_w: 80
-
-# dataset related
-dataset_conf:
-    max_length: 10240
-    min_length: 0
-    # with dynamic batching, max_frames_in_batch should be set according to your GPU memory size; here we used a Titan Xp GPU with 12GB memory
-    max_frames_in_batch: 12000
-    batch_type: 'dynamic' # static or dynamic
-    sort: true
-
-grad_clip: 5
-accum_grad: 4
-max_epoch: 120
-log_interval: 100
-
-optim: adam
-optim_conf:
-    lr: 0.002
-scheduler: warmuplr # pytorch v1.1.0+ required
-scheduler_conf:
-    warmup_steps: 25000
diff --git a/examples/librispeech/s1/conf/train_unified_conformer.yaml b/examples/librispeech/s1/conf/train_unified_conformer.yaml
deleted file mode 100644
index db6601fa4..000000000
--- a/examples/librispeech/s1/conf/train_unified_conformer.yaml
+++ /dev/null
@@ -1,75 +0,0 @@
-# network architecture
-# encoder related
-encoder: conformer
-encoder_conf:
-    output_size: 512 # dimension of attention
-    attention_heads: 8
-    linear_units: 2048 # the number of units of position-wise feed forward
-    num_blocks: 12 # the number of encoder blocks
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.1
-    attention_dropout_rate: 0.0
-    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
-    normalize_before: true
-    cnn_module_kernel: 15
-    use_cnn_module: True
-    activation_type: 'swish'
-    pos_enc_layer_type: 'rel_pos'
-    selfattention_layer_type: 'rel_selfattn'
-    causal: true
-    use_dynamic_chunk: true
-    cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
-
-# decoder related
-decoder: transformer
-decoder_conf:
-    attention_heads: 8
-    linear_units: 2048
-    num_blocks: 6
-    dropout_rate: 0.1
-    positional_dropout_rate: 0.1
-    self_attention_dropout_rate: 0.0
-    src_attention_dropout_rate: 0.0
-
-# hybrid CTC/attention
-model_conf:
-    ctc_weight: 0.3
-    lsm_weight: 0.1 # label smoothing option
-    length_normalized_loss: false
-
-# use raw_wav or kaldi feature
-raw_wav: false
-
-collate_conf:
-    # spec level config
-    feature_dither: 0.0 # add dither [-feature_dither,feature_dither] on fbank feature
-    spec_aug: true
-    spec_aug_conf:
-        warp_for_time: False
-        num_t_mask: 2
-        num_f_mask: 2
-        max_t: 50
-        max_f: 10
-        max_w: 80
-
-# dataset related
-dataset_conf:
-    max_length: 10240
-    min_length: 0
-    batch_type: 'dynamic' # static or dynamic
-    # batch_size should be set according to your GPU memory size; here we used a Titan Xp GPU with 12GB memory
-    max_frames_in_batch: 12000
-    batch_size: 10
-    sort: true
-
-grad_clip: 5
-accum_grad: 1
-max_epoch: 160
-log_interval: 100
-
-optim: adam
-optim_conf:
-    lr: 0.002
-scheduler: warmuplr # pytorch v1.1.0+ required
-scheduler_conf:
-    warmup_steps: 25000
diff --git a/examples/librispeech/s1/local/data_prep.sh b/examples/librispeech/s1/local/data_prep.sh
deleted file mode 100755
index 38c628558..000000000
--- a/examples/librispeech/s1/local/data_prep.sh
+++ /dev/null
@@ -1,85 +0,0 @@
-#!/bin/bash
-
-# Copyright 2014 Vassil Panayotov
-#           2014 Johns Hopkins University (author: Daniel Povey)
-# Apache 2.0
-
-if [ "$#" -ne 2 ]; then
-  echo "Usage: $0 <src-dir> <dst-dir>"
-  echo "e.g.: $0 /export/a15/vpanayotov/data/LibriSpeech/dev-clean data/dev-clean"
-  exit 1
-fi
-
-src=$1
-dst=$2
-
-# all utterances are FLAC compressed
-if ! which flac >&/dev/null; then
-  echo "Please install 'flac' on ALL worker nodes!"
-  exit 1
-fi
-
-spk_file=$src/../SPEAKERS.TXT
-
-mkdir -p $dst || exit 1
-
-[ ! -d $src ] && echo "$0: no such directory $src" && exit 1
-[ ! -f $spk_file ] && echo "$0: expected file $spk_file to exist" && exit 1
-
-
-wav_scp=$dst/wav.scp; [[ -f "$wav_scp" ]] && rm $wav_scp
-trans=$dst/text; [[ -f "$trans" ]] && rm $trans
-utt2spk=$dst/utt2spk; [[ -f "$utt2spk" ]] && rm $utt2spk
-spk2gender=$dst/spk2gender; [[ -f $spk2gender ]] && rm $spk2gender
-
-for reader_dir in $(find -L $src -mindepth 1 -maxdepth 1 -type d | sort); do
-  reader=$(basename $reader_dir)
-  if ! [ $reader -eq $reader ]; then # not integer.
-    echo "$0: unexpected subdirectory name $reader"
-    exit 1
-  fi
-
-  reader_gender=$(egrep "^$reader[ ]+\|" $spk_file | awk -F'|' '{gsub(/[ ]+/, ""); print tolower($2)}')
-  if [ "$reader_gender" != 'm' ] && [ "$reader_gender" != 'f' ]; then
-    echo "Unexpected gender: '$reader_gender'"
-    exit 1
-  fi
-
-  for chapter_dir in $(find -L $reader_dir/ -mindepth 1 -maxdepth 1 -type d | sort); do
-    chapter=$(basename $chapter_dir)
[ "$chapter" -eq "$chapter" ]; then - echo "$0: unexpected chapter-subdirectory name $chapter" - exit 1 - fi - - find -L $chapter_dir/ -iname "*.flac" | sort | xargs -I% basename % .flac | \ - awk -v "dir=$chapter_dir" '{printf "%s flac -c -d -s %s/%s.flac |\n", $0, dir, $0}' >>$wav_scp|| exit 1 - - chapter_trans=$chapter_dir/${reader}-${chapter}.trans.txt - [ ! -f $chapter_trans ] && echo "$0: expected file $chapter_trans to exist" && exit 1 - cat $chapter_trans >>$trans - - # NOTE: For now we are using per-chapter utt2spk. That is each chapter is considered - # to be a different speaker. This is done for simplicity and because we want - # e.g. the CMVN to be calculated per-chapter - awk -v "reader=$reader" -v "chapter=$chapter" '{printf "%s %s-%s\n", $1, reader, chapter}' \ - <$chapter_trans >>$utt2spk || exit 1 - - # reader -> gender map (again using per-chapter granularity) - echo "${reader}-${chapter} $reader_gender" >>$spk2gender - done -done - -spk2utt=$dst/spk2utt -utils/utt2spk_to_spk2utt.pl <$utt2spk >$spk2utt || exit 1 - -ntrans=$(wc -l <$trans) -nutt2spk=$(wc -l <$utt2spk) -! [ "$ntrans" -eq "$nutt2spk" ] && \ - echo "Inconsistent #transcripts($ntrans) and #utt2spk($nutt2spk)" && exit 1 - -utils/validate_data_dir.sh --no-feats $dst || exit 1 - -echo "$0: successfully prepared data in $dst" - -exit 0 diff --git a/examples/librispeech/s1/local/download_and_untar.sh b/examples/librispeech/s1/local/download_and_untar.sh deleted file mode 100755 index cd32fb6b9..000000000 --- a/examples/librispeech/s1/local/download_and_untar.sh +++ /dev/null @@ -1,97 +0,0 @@ -#!/bin/bash - -# Copyright 2014 Johns Hopkins University (author: Daniel Povey) -# Apache 2.0 - -remove_archive=false - -if [ "$1" == --remove-archive ]; then - remove_archive=true - shift -fi - -if [ $# -ne 3 ]; then - echo "Usage: $0 [--remove-archive] " - echo "e.g.: $0 /export/a15/vpanayotov/data www.openslr.org/resources/11 dev-clean" - echo "With --remove-archive it will remove the archive after successfully un-tarring it." - echo " can be one of: dev-clean, test-clean, dev-other, test-other," - echo " train-clean-100, train-clean-360, train-other-500." - exit 1 -fi - -data=$1 -url=$2 -part=$3 - -if [ ! -d "$data" ]; then - echo "$0: no such directory $data" - exit 1 -fi - -part_ok=false -list="dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500" -for x in $list; do - if [ "$part" == $x ]; then part_ok=true; fi -done -if ! $part_ok; then - echo "$0: expected to be one of $list, but got '$part'" - exit 1 -fi - -if [ -z "$url" ]; then - echo "$0: empty URL base." - exit 1 -fi - -if [ -f $data/LibriSpeech/$part/.complete ]; then - echo "$0: data part $part was already successfully extracted, nothing to do." - exit 0 -fi - - -# sizes of the archive files in bytes. This is some older versions. -sizes_old="371012589 347390293 379743611 361838298 6420417880 23082659865 30626749128" -# sizes_new is the archive file sizes of the final release. Some of these sizes are of -# things we probably won't download. -sizes_new="337926286 314305928 695964615 297279345 87960560420 33373768 346663984 328757843 6387309499 23049477885 30593501606" - -if [ -f $data/$part.tar.gz ]; then - size=$(/bin/ls -l $data/$part.tar.gz | awk '{print $5}') - size_ok=false - for s in $sizes_old $sizes_new; do if [ $s == $size ]; then size_ok=true; fi; done - if ! $size_ok; then - echo "$0: removing existing file $data/$part.tar.gz because its size in bytes $size" - echo "does not equal the size of one of the archives." 
diff --git a/examples/librispeech/s1/local/download_and_untar.sh b/examples/librispeech/s1/local/download_and_untar.sh
deleted file mode 100755
index cd32fb6b9..000000000
--- a/examples/librispeech/s1/local/download_and_untar.sh
+++ /dev/null
@@ -1,97 +0,0 @@
-#!/bin/bash
-
-# Copyright 2014 Johns Hopkins University (author: Daniel Povey)
-# Apache 2.0
-
-remove_archive=false
-
-if [ "$1" == --remove-archive ]; then
-  remove_archive=true
-  shift
-fi
-
-if [ $# -ne 3 ]; then
-  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
-  echo "e.g.: $0 /export/a15/vpanayotov/data www.openslr.org/resources/11 dev-clean"
-  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
-  echo "<corpus-part> can be one of: dev-clean, test-clean, dev-other, test-other,"
-  echo "  train-clean-100, train-clean-360, train-other-500."
-  exit 1
-fi
-
-data=$1
-url=$2
-part=$3
-
-if [ ! -d "$data" ]; then
-  echo "$0: no such directory $data"
-  exit 1
-fi
-
-part_ok=false
-list="dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500"
-for x in $list; do
-  if [ "$part" == $x ]; then part_ok=true; fi
-done
-if ! $part_ok; then
-  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
-  exit 1
-fi
-
-if [ -z "$url" ]; then
-  echo "$0: empty URL base."
-  exit 1
-fi
-
-if [ -f $data/LibriSpeech/$part/.complete ]; then
-  echo "$0: data part $part was already successfully extracted, nothing to do."
-  exit 0
-fi
-
-
-# sizes of the archive files in bytes. These are from some older versions.
-sizes_old="371012589 347390293 379743611 361838298 6420417880 23082659865 30626749128"
-# sizes_new contains the archive file sizes of the final release. Some of these sizes are for
-# things we probably won't download.
-sizes_new="337926286 314305928 695964615 297279345 87960560420 33373768 346663984 328757843 6387309499 23049477885 30593501606"
-
-if [ -f $data/$part.tar.gz ]; then
-  size=$(/bin/ls -l $data/$part.tar.gz | awk '{print $5}')
-  size_ok=false
-  for s in $sizes_old $sizes_new; do if [ $s == $size ]; then size_ok=true; fi; done
-  if ! $size_ok; then
-    echo "$0: removing existing file $data/$part.tar.gz because its size in bytes $size"
-    echo "does not equal the size of one of the archives."
-    rm $data/$part.tar.gz
-  else
-    echo "$data/$part.tar.gz exists and appears to be complete."
-  fi
-fi
-
-if [ ! -f $data/$part.tar.gz ]; then
-  if ! which wget >/dev/null; then
-    echo "$0: wget is not installed."
-    exit 1
-  fi
-  full_url=$url/$part.tar.gz
-  echo "$0: downloading data from $full_url. This may take some time, please be patient."
-
-  if ! wget -P $data --no-check-certificate $full_url; then
-    echo "$0: error executing wget $full_url"
-    exit 1
-  fi
-fi
-
-if ! tar -C $data -xvzf $data/$part.tar.gz; then
-  echo "$0: error un-tarring archive $data/$part.tar.gz"
-  exit 1
-fi
-
-touch $data/LibriSpeech/$part/.complete
-
-echo "$0: Successfully downloaded and un-tarred $data/$part.tar.gz"
-
-if $remove_archive; then
-  echo "$0: removing $data/$part.tar.gz file since --remove-archive option was supplied."
-  rm $data/$part.tar.gz
-fi
diff --git a/examples/librispeech/s1/path.sh b/examples/librispeech/s1/path.sh
deleted file mode 100644
index a7a89e4aa..000000000
--- a/examples/librispeech/s1/path.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-KALDI_ROOT=../../../kaldi
-ANACONDA_ROOT=/home/binbinzhang/miniconda3
-
-[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
-export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$KALDI_ROOT/tools/sctk/bin:$PWD:$PATH
-[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present -> Exit!" && exit 1
-. $KALDI_ROOT/tools/config/common_path.sh
-
-[ ! -d utils ] && ln -s $KALDI_ROOT/egs/wsj/s5/utils
-[ ! -d steps ] && ln -s $KALDI_ROOT/egs/wsj/s5/steps
-
-export LC_ALL=C
-
-# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
-source $ANACONDA_ROOT/bin/activate py3
-export PYTHONIOENCODING=UTF-8
-export PYTHONPATH=../../../:$PYTHONPATH
diff --git a/examples/librispeech/s1/run.sh b/examples/librispeech/s1/run.sh
deleted file mode 100644
index 9a1a10ebf..000000000
--- a/examples/librispeech/s1/run.sh
+++ /dev/null
@@ -1,214 +0,0 @@
-#!/bin/bash
-
-# Copyright 2019 Mobvoi Inc. All Rights Reserved.
-
-. ./path.sh || exit 1;
-. ./cmd.sh || exit 1;
-
-# Use this to control how many gpus you use. It's 1-gpu training if you specify
-# just 1 gpu; otherwise it's multi-gpu training based on DDP in PyTorch.
-export CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
-stage=0 # start from 0 if you need to start from data preparation
-stop_stage=5
-# data
-data_url=www.openslr.org/resources/12
-# use your own data path
-datadir=/nfsa/diwu/open-dir
-nj=16
-# Optional train_config
-# 1. conf/train_conformer_large.yaml: Standard conformer
-train_config=conf/train_conformer_large.yaml
-checkpoint=
-cmvn=true
-do_delta=false
-
-dir=exp/sp_spec_aug
-
-# using average_checkpoint will get a better result
-average_checkpoint=true
-decode_checkpoint=$dir/final.pt
-# you may need to adjust it if you cannot get results close to those in README.md
-average_num=20
-decode_modes="attention_rescoring ctc_greedy_search ctc_prefix_beam_search attention"
-
-. utils/parse_options.sh || exit 1;
-
-# bpemode (unigram or bpe)
-nbpe=5000
-bpemode=unigram
-
-set -e
-set -u
-set -o pipefail
-
-train_set=train_960
-train_dev=dev
-recog_set="test_clean test_other dev_clean dev_other"
-recog_set="test_clean"
-if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
-  echo "stage -1: Data Download"
-  for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500; do
-    local/download_and_untar.sh ${datadir} ${data_url} ${part}
-  done
-fi
-
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-  ### Task dependent. You have to make the following data preparation part by yourself.
-  ### But you can utilize Kaldi recipes in most cases
-  echo "stage 0: Data preparation"
-  for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500; do
-    # use underscore-separated names in data directories.
-    local/data_prep.sh ${datadir}/LibriSpeech/${part} data/${part//-/_}
-  done
-fi
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-  ### Task dependent. You have to design training and dev sets by yourself.
-  ### But you can utilize Kaldi recipes in most cases
-  echo "stage 1: Feature Generation"
-  fbankdir=fbank
-  # Generate the fbank features; by default 80-dimensional fbanks with pitch on each frame
-  for x in dev_clean test_clean dev_other test_other train_clean_100 train_clean_360 train_other_500; do
-    steps/make_fbank_pitch.sh --cmd "$train_cmd" --nj ${nj} --write_utt2num_frames true \
-      data/${x} exp/make_fbank/${x} ${fbankdir}
-    utils/fix_data_dir.sh data/${x}
-  done
-
-  utils/combine_data.sh --extra_files utt2num_frames data/${train_set} data/train_clean_100 data/train_clean_360 data/train_other_500
-  utils/combine_data.sh --extra_files utt2num_frames data/${train_dev} data/dev_clean data/dev_other
-
-  # compute global CMVN
-  compute-cmvn-stats --binary=false scp:data/${train_set}/feats.scp data/${train_set}/global_cmvn
-
-fi
-
-
-dict=data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt
-bpemodel=data/lang_char/${train_set}_${bpemode}${nbpe}
-echo "dictionary: ${dict}"
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-  ### Task dependent. You have to check non-linguistic symbols used in the corpus.
-  echo "stage 2: Dictionary and Json Data Preparation"
-  mkdir -p data/lang_char/
-
-  echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
-  echo "<unk> 1" >> ${dict} # <unk> must be 1
-
-  # we borrowed the BPE-related code and scripts from ESPnet.
-  cut -f 2- -d" " data/${train_set}/text > data/lang_char/input.txt
-  tools/spm_train --input=data/lang_char/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000
-  tools/spm_encode --model=${bpemodel}.model --output_format=piece < data/lang_char/input.txt | tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+1}' >> ${dict}
-  num_token=$(cat $dict | wc -l)
-  echo "<sos/eos> $num_token" >> $dict # <eos>
-  wc -l ${dict}
-fi
-
-if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
-  # Prepare wenet required data
-  echo "Prepare data, prepare required format"
-  for x in dev ${recog_set} ${train_set}; do
-    tools/format_data.sh --nj ${nj} --feat data/$x/feats.scp --bpecode ${bpemodel}.model \
-      data/$x ${dict} > data/$x/format.data.tmp
-
-    # remove utt having more than 3000 frames
-    # remove utt having more than 400 characters
-    tools/remove_longshortdata.py \
-      --min_input_len 0.5 \
-      --max_input_len 20 \
-      --max_output_len 400 \
-      --max_output_input_ratio 10.0 \
-      --data_file data/$x/format.data.tmp \
-      --output_data_file data/$x/format.data
-
-  done
-fi
-
-
-if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
-  # Training
-  mkdir -p $dir
-  INIT_FILE=$dir/ddp_init
-  rm -f $INIT_FILE # delete old one before starting
-  init_method=file://$(readlink -f $INIT_FILE)
-  echo "$0: init method is $init_method"
-  num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
-  # Use "nccl" if it works, otherwise use "gloo"
-  dist_backend="nccl"
-  cmvn_opts=
-  $cmvn && cmvn_opts="--cmvn data/${train_set}/global_cmvn"
-  # train.py will write $train_config to $dir/train.yaml with model input
-  # and output dimension, train.yaml will be used for inference or model
-  # export later
-  for ((i = 0; i < $num_gpus; ++i)); do
-  {
-    gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
-    python wenet/bin/train_deprecated.py --gpu $gpu_id \
-      --config $train_config \
-      --train_data data/$train_set/format.data \
-      --cv_data data/dev/format.data \
-      ${checkpoint:+--checkpoint $checkpoint} \
-      --model_dir $dir \
-      --ddp.init_method $init_method \
-      --ddp.world_size $num_gpus \
-      --ddp.rank $i \
-      --ddp.dist_backend $dist_backend \
-      --num_workers 1 \
-      $cmvn_opts
-  } &
-  done
-  wait
-fi
-
-if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
-  # Test model, please specify the model you want to test by --checkpoint
-  # TODO, Add model average here
-  mkdir -p $dir/test
-  if [ ${average_checkpoint} == true ]; then
-    decode_checkpoint=$dir/avg_${average_num}.pt
-    echo "do model average and final checkpoint is $decode_checkpoint"
-    python wenet/bin/average_model.py \
-      --dst_model $decode_checkpoint \
-      --src_path $dir \
-      --num ${average_num} \
-      --val_best
-  fi
-  # a static dataloader is needed for attention_rescoring decode
-  # Specify decoding_chunk_size if it's a unified dynamic chunk trained model
-  # -1 for full chunk
-  decoding_chunk_size=
-  ctc_weight=0.5
-  for test in $recog_set; do
-    for mode in ${decode_modes}; do
-    {
-      test_dir=$dir/${test}_${mode}
-      mkdir -p $test_dir
-      python wenet/bin/recognize_deprecated.py --gpu 0 \
-        --mode $mode \
-        --config $dir/train.yaml \
-        --test_data data/$test/format.data \
-        --checkpoint $decode_checkpoint \
-        --beam_size 10 \
-        --batch_size 1 \
-        --penalty 0.0 \
-        --dict $dict \
-        --result_file $test_dir/text_bpe \
-        --ctc_weight $ctc_weight \
-        ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
-      tools/spm_decode --model=${bpemodel}.model --input_format=piece < $test_dir/text_bpe | sed -e "s/▁/ /g" > $test_dir/text
-      python tools/compute-wer.py --char=1 --v=1 \
-        data/$test/text $test_dir/text > $test_dir/wer
-    } &
-    done
-  done
-  wait
-
-fi
-
-if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
-  # Export the best model you want
-  python wenet/bin/export_jit.py \
-    --config $dir/train.yaml \
-    --checkpoint $dir/avg_${average_num}.pt \
-    --output_file $dir/final.zip
-fi
-
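
In stage 5 above, the recognizer writes hypotheses as BPE pieces to text_bpe; spm_decode plus the trailing sed turn them back into words, with the sed normalizing any remaining sentencepiece word-boundary marker ▁ into a space. An illustrative run of just that marker replacement (pieces made up for the example):

    echo "▁IT ▁WAS ▁THE ▁FIRST ▁DAY" | sed -e "s/▁/ /g"
    # -> " IT WAS THE FIRST DAY"  (the leading space should be harmless to the whitespace-based WER scoring that follows)
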
diff --git a/examples/librispeech/s1/steps b/examples/librispeech/s1/steps
deleted file mode 120000
index aee6627a6..000000000
--- a/examples/librispeech/s1/steps
+++ /dev/null
@@ -1 +0,0 @@
-../../../kaldi/egs/wsj/s5/steps
\ No newline at end of file
diff --git a/examples/librispeech/s1/tools b/examples/librispeech/s1/tools
deleted file mode 120000
index 9c9686a0f..000000000
--- a/examples/librispeech/s1/tools
+++ /dev/null
@@ -1 +0,0 @@
-../../aishell/s0/tools
\ No newline at end of file
diff --git a/examples/librispeech/s1/utils b/examples/librispeech/s1/utils
deleted file mode 120000
index 036b77dc8..000000000
--- a/examples/librispeech/s1/utils
+++ /dev/null
@@ -1 +0,0 @@
-../../../kaldi/egs/wsj/s5/utils
\ No newline at end of file
diff --git a/examples/librispeech/s1/wenet b/examples/librispeech/s1/wenet
deleted file mode 120000
index 5f46eee4d..000000000
--- a/examples/librispeech/s1/wenet
+++ /dev/null
@@ -1 +0,0 @@
-../../../wenet/
\ No newline at end of file