Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add implementations of USES2 speech enhancement models #5761

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
Next commit
Add implementations of USES2 speech enhancement models (Swin and Comp)
  • Loading branch information
Emrys365 committed Apr 24, 2024
commit 9699f000aa87cc062d91bdd254f4fa8a64487f5c
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
use_amp: false
optim: adam
init: none
unused_parameters: true
max_epoch: 20
batch_type: folded
batch_size: 4
iterator_type: chunk
chunk_length: 32000
chunk_default_fs: 8000
num_iters_per_epoch: 8000
num_workers: 4
grad_clip: 5.0
optim_conf:
  lr: 4.0e-04
  eps: 1.0e-08
  weight_decay: 1.0e-05
patience: 20
val_scheduler_criterion:
  - valid
  - loss
best_model_criterion:
  - - valid
    - loss
    - min
keep_nbest_models: 1
scheduler: warmupReducelronplateau
scheduler_conf:
  # for WarmupLR
  warmup_steps: 25000
  # for ReduceLROnPlateau
  mode: min
  factor: 0.5
  patience: 2
freeze_param:
  - separator.uses.atf_blocks.0.channel_nn
  - separator.uses.atf_blocks.1.channel_nn

allow_multi_rates: true

preprocessor: enh
force_single_channel: true
channel_reordering: true
# The categories list order must be the same everywhere in this config
categories:
  - 1ch_16k
  - 1ch_48k
  # NOTE(review): "1ch_16k" appears twice in this list — confirm intended
  - 1ch_16k
  - 2ch_16k
  - 2ch_16k_both
  - 5ch_16k
  - 8ch_16k_reverb
num_spk: 1

model_conf:
  extract_feats_in_collect_stats: false
  normalize_variance_per_ch: true
  # The categories list order must be the same everywhere in this config
  categories:
    - 1ch_16k
    - 1ch_48k
    # NOTE(review): "1ch_16k" appears twice in this list — confirm intended
    - 1ch_16k
    - 2ch_16k
    - 2ch_16k_both
    - 5ch_16k
    - 8ch_16k_reverb

encoder: stft
encoder_conf:
  n_fft: 256
  hop_length: 128
  use_builtin_complex: false
  default_fs: 8000
decoder: stft
decoder_conf:
  n_fft: 256
  hop_length: 128
  default_fs: 8000
separator: uses2
separator_conf:
  num_spk: 1
  enc_channels: 256
  bottleneck_size: 64
  num_blocks: 4
  num_spatial_blocks: 2
  segment_size: 64
  ref_channel: 0
  tf_mode: comp
  memory_size: 20
  memory_types: 2
  # Transformer-related arguments
  input_resolution: [130, 64]
  window_size: [10, 8]
  use_checkpoint: false
  rnn_type: lstm
  bidirectional: true
  hidden_size: 128
  att_heads: 4
  dropout: 0.0
  norm_type: cLN
  activation: relu
  ch_mode: att_tac
  ch_att_dim: 256
  eps: 1.0e-5

# A list for criterions
# The overall loss in the multi-task learning will be:
#   loss = weight_1 * loss_1 + ... + weight_N * loss_N
# The default `weight` for each sub-loss is 1.0
criterions:
  # The first criterion
  - name: mr_l1_tfd
    conf:
      window_sz: [256, 512, 768, 1024]
      hop_sz: null
      eps: 1.0e-8
      time_domain_weight: 0.5
      normalize_variance: true
    wrapper: fixed_order
    wrapper_conf:
      weight: 1.0
  # The second criterion
  - name: si_snr
    conf:
      eps: 1.0e-7
    wrapper: fixed_order
    wrapper_conf:
      weight: 0.0
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
use_amp: false
optim: adam
init: none
unused_parameters: true
max_epoch: 20
batch_type: folded
batch_size: 4
iterator_type: chunk
chunk_length: 32000
chunk_default_fs: 8000
num_iters_per_epoch: 8000
num_workers: 4
grad_clip: 5.0
optim_conf:
  lr: 4.0e-04
  eps: 1.0e-08
  weight_decay: 1.0e-05
patience: 20
val_scheduler_criterion:
  - valid
  - loss
best_model_criterion:
  - - valid
    - loss
    - min
keep_nbest_models: 1
scheduler: warmupReducelronplateau
scheduler_conf:
  # for WarmupLR
  warmup_steps: 25000
  # for ReduceLROnPlateau
  mode: min
  factor: 0.5
  patience: 2
init_param:
  - "exp/enh_train_enh_uses2_comp_refch0_2mem_stage1_raw/valid.loss.best.pth"
freeze_param:
  - encoder
  - separator.post_encoder
  - separator.uses.layer_norm
  - separator.uses.bottleneck_conv1x1
  - separator.uses.atf_blocks.0.freq_nn
  - separator.uses.atf_blocks.0.temporal_nn
  - separator.uses.atf_blocks.0.tf_nn
  - separator.uses.atf_blocks.1.freq_nn
  - separator.uses.atf_blocks.1.temporal_nn
  - separator.uses.atf_blocks.1.tf_nn
  - separator.uses.atf_blocks.2
  - separator.uses.atf_blocks.3
  - separator.uses.atf_blocks.4
  - separator.uses.atf_blocks.5
  - separator.uses.memory_tokens
  - separator.uses.output
  - separator.pre_decoder
  - decoder

allow_multi_rates: true

preprocessor: enh
force_single_channel: false
channel_reordering: true
# The categories list order must be the same everywhere in this config
categories:
  - 1ch_16k
  - 1ch_48k
  # NOTE(review): "1ch_16k" appears twice in this list — confirm intended
  - 1ch_16k
  - 2ch_16k
  - 2ch_16k_both
  - 5ch_16k
  - 8ch_16k_reverb
num_spk: 1

model_conf:
  extract_feats_in_collect_stats: false
  normalize_variance_per_ch: true
  # The categories list order must be the same everywhere in this config
  categories:
    - 1ch_16k
    - 1ch_48k
    # NOTE(review): "1ch_16k" appears twice in this list — confirm intended
    - 1ch_16k
    - 2ch_16k
    - 2ch_16k_both
    - 5ch_16k
    - 8ch_16k_reverb

encoder: stft
encoder_conf:
  n_fft: 256
  hop_length: 128
  use_builtin_complex: false
  default_fs: 8000
decoder: stft
decoder_conf:
  n_fft: 256
  hop_length: 128
  default_fs: 8000
separator: uses2
separator_conf:
  num_spk: 1
  enc_channels: 256
  bottleneck_size: 64
  num_blocks: 4
  num_spatial_blocks: 2
  segment_size: 64
  ref_channel: 0
  tf_mode: comp
  memory_size: 20
  memory_types: 2
  # Transformer-related arguments
  input_resolution: [130, 64]
  window_size: [10, 8]
  use_checkpoint: false
  rnn_type: lstm
  bidirectional: true
  hidden_size: 128
  att_heads: 4
  dropout: 0.0
  norm_type: cLN
  activation: relu
  ch_mode: att_tac
  ch_att_dim: 256
  eps: 1.0e-5

# A list for criterions
# The overall loss in the multi-task learning will be:
#   loss = weight_1 * loss_1 + ... + weight_N * loss_N
# The default `weight` for each sub-loss is 1.0
criterions:
  # The first criterion
  - name: mr_l1_tfd
    conf:
      window_sz: [256, 512, 768, 1024]
      hop_sz: null
      eps: 1.0e-8
      time_domain_weight: 0.5
      normalize_variance: true
    wrapper: fixed_order
    wrapper_conf:
      weight: 1.0
  # The second criterion
  - name: si_snr
    conf:
      eps: 1.0e-7
    wrapper: fixed_order
    wrapper_conf:
      weight: 0.0
Loading