Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add implementations of USES2 speech enhancement models #5761

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
Next commit
Add implementations of USES2 speech enhancement models (Swin and Comp)
  • Loading branch information
Emrys365 committed Apr 24, 2024
commit 9699f000aa87cc062d91bdd254f4fa8a64487f5c
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
use_amp: false
optim: adam
init: none
unused_parameters: true
max_epoch: 20
batch_type: folded
batch_size: 4
iterator_type: chunk
chunk_length: 32000
chunk_default_fs: 8000
num_iters_per_epoch: 8000
num_workers: 4
grad_clip: 5.0
optim_conf:
  lr: 4.0e-04
  eps: 1.0e-08
  weight_decay: 1.0e-05
patience: 20
val_scheduler_criterion:
  - valid
  - loss
best_model_criterion:
  - - valid
    - loss
    - min
keep_nbest_models: 1
scheduler: warmupReducelronplateau
scheduler_conf:
  # for WarmupLR
  warmup_steps: 25000
  # for ReduceLROnPlateau
  mode: min
  factor: 0.5
  patience: 2
freeze_param:
  - separator.uses.atf_blocks.0.channel_nn
  - separator.uses.atf_blocks.1.channel_nn

allow_multi_rates: true

preprocessor: enh
force_single_channel: true
channel_reordering: true
# The categories list order must be the same everywhere in this config
categories:
  - 1ch_16k
  - 1ch_48k
  # NOTE(review): "1ch_16k" appears twice in this list — confirm intended
  - 1ch_16k
  - 2ch_16k
  - 2ch_16k_both
  - 5ch_16k
  - 8ch_16k_reverb
num_spk: 1

model_conf:
  extract_feats_in_collect_stats: false
  normalize_variance_per_ch: true
  # The categories list order must be the same everywhere in this config
  categories:
    - 1ch_16k
    - 1ch_48k
    # NOTE(review): "1ch_16k" appears twice in this list — confirm intended
    - 1ch_16k
    - 2ch_16k
    - 2ch_16k_both
    - 5ch_16k
    - 8ch_16k_reverb

encoder: stft
encoder_conf:
  n_fft: 256
  hop_length: 128
  use_builtin_complex: false
  default_fs: 8000
decoder: stft
decoder_conf:
  n_fft: 256
  hop_length: 128
  default_fs: 8000
separator: uses2
separator_conf:
  num_spk: 1
  enc_channels: 256
  bottleneck_size: 64
  num_blocks: 4
  num_spatial_blocks: 2
  segment_size: 64
  ref_channel: 0
  tf_mode: comp
  memory_size: 20
  memory_types: 2
  # Transformer-related arguments
  input_resolution: [130, 64]
  window_size: [10, 8]
  use_checkpoint: false
  rnn_type: lstm
  bidirectional: true
  hidden_size: 128
  att_heads: 4
  dropout: 0.0
  norm_type: cLN
  activation: relu
  ch_mode: att_tac
  ch_att_dim: 256
  eps: 1.0e-5

# A list for criterions
# The overall loss in the multi-task learning will be:
#   loss = weight_1 * loss_1 + ... + weight_N * loss_N
# The default `weight` for each sub-loss is 1.0
criterions:
  # The first criterion
  - name: mr_l1_tfd
    conf:
      window_sz: [256, 512, 768, 1024]
      hop_sz: null
      eps: 1.0e-8
      time_domain_weight: 0.5
      normalize_variance: true
    wrapper: fixed_order
    wrapper_conf:
      weight: 1.0
  # The second criterion
  - name: si_snr
    conf:
      eps: 1.0e-7
    wrapper: fixed_order
    wrapper_conf:
      weight: 0.0
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
use_amp: false
optim: adam
init: none
unused_parameters: true
max_epoch: 20
batch_type: folded
batch_size: 4
iterator_type: chunk
chunk_length: 32000
chunk_default_fs: 8000
num_iters_per_epoch: 8000
num_workers: 4
grad_clip: 5.0
optim_conf:
  lr: 4.0e-04
  eps: 1.0e-08
  weight_decay: 1.0e-05
patience: 20
val_scheduler_criterion:
  - valid
  - loss
best_model_criterion:
  - - valid
    - loss
    - min
keep_nbest_models: 1
scheduler: warmupReducelronplateau
scheduler_conf:
  # for WarmupLR
  warmup_steps: 25000
  # for ReduceLROnPlateau
  mode: min
  factor: 0.5
  patience: 2
init_param:
  - "exp/enh_train_enh_uses2_comp_refch0_2mem_stage1_raw/valid.loss.best.pth"
freeze_param:
  - encoder
  - separator.post_encoder
  - separator.uses.layer_norm
  - separator.uses.bottleneck_conv1x1
  - separator.uses.atf_blocks.0.freq_nn
  - separator.uses.atf_blocks.0.temporal_nn
  - separator.uses.atf_blocks.0.tf_nn
  - separator.uses.atf_blocks.1.freq_nn
  - separator.uses.atf_blocks.1.temporal_nn
  - separator.uses.atf_blocks.1.tf_nn
  - separator.uses.atf_blocks.2
  - separator.uses.atf_blocks.3
  - separator.uses.atf_blocks.4
  - separator.uses.atf_blocks.5
  - separator.uses.memory_tokens
  - separator.uses.output
  - separator.pre_decoder
  - decoder

allow_multi_rates: true

preprocessor: enh
force_single_channel: false
channel_reordering: true
# The categories list order must be the same everywhere in this config
categories:
  - 1ch_16k
  - 1ch_48k
  # NOTE(review): "1ch_16k" appears twice in this list — confirm intended
  - 1ch_16k
  - 2ch_16k
  - 2ch_16k_both
  - 5ch_16k
  - 8ch_16k_reverb
num_spk: 1

model_conf:
  extract_feats_in_collect_stats: false
  normalize_variance_per_ch: true
  # The categories list order must be the same everywhere in this config
  categories:
    - 1ch_16k
    - 1ch_48k
    # NOTE(review): "1ch_16k" appears twice in this list — confirm intended
    - 1ch_16k
    - 2ch_16k
    - 2ch_16k_both
    - 5ch_16k
    - 8ch_16k_reverb

encoder: stft
encoder_conf:
  n_fft: 256
  hop_length: 128
  use_builtin_complex: false
  default_fs: 8000
decoder: stft
decoder_conf:
  n_fft: 256
  hop_length: 128
  default_fs: 8000
separator: uses2
separator_conf:
  num_spk: 1
  enc_channels: 256
  bottleneck_size: 64
  num_blocks: 4
  num_spatial_blocks: 2
  segment_size: 64
  ref_channel: 0
  tf_mode: comp
  memory_size: 20
  memory_types: 2
  # Transformer-related arguments
  input_resolution: [130, 64]
  window_size: [10, 8]
  use_checkpoint: false
  rnn_type: lstm
  bidirectional: true
  hidden_size: 128
  att_heads: 4
  dropout: 0.0
  norm_type: cLN
  activation: relu
  ch_mode: att_tac
  ch_att_dim: 256
  eps: 1.0e-5

# A list for criterions
# The overall loss in the multi-task learning will be:
#   loss = weight_1 * loss_1 + ... + weight_N * loss_N
# The default `weight` for each sub-loss is 1.0
criterions:
  # The first criterion
  - name: mr_l1_tfd
    conf:
      window_sz: [256, 512, 768, 1024]
      hop_sz: null
      eps: 1.0e-8
      time_domain_weight: 0.5
      normalize_variance: true
    wrapper: fixed_order
    wrapper_conf:
      weight: 1.0
  # The second criterion
  - name: si_snr
    conf:
      eps: 1.0e-7
    wrapper: fixed_order
    wrapper_conf:
      weight: 0.0
Loading