Add implementations of USES2 speech enhancement models #5761

Open · wants to merge 11 commits into base: master
Add a recipe: egs2/urgent24/enh1
Emrys365 committed Jun 9, 2024
commit 9d480c852480a3552dd524107b7fe4c0eb6d8a7a
1 change: 1 addition & 0 deletions egs2/README.md
@@ -178,6 +178,7 @@ See: https://espnet.github.io/espnet/espnet2_tutorial.html#recipes-using-espnet2
| totonac | Highland Totonac corpus (endangered language in central Mexico) | ASR | TOS | http://www.openslr.org/107/ | |
| tsukuyomi | つくよみちゃんコーパス | TTS | JPN | https://tyc.rei-yumesaki.net/material/corpus | |
| universal_se_v1 | Combination of Multi-condition English Corpora (vctk_noisy, dns_ins20, chime4, reverb, whamr) | SE | ENG | | |
| urgent2024 | Multi-domain simulated speech enhancement data for the URGENT 2024 Challenge | SE | ENG | https://urgent-challenge.github.io/urgent2024/data/ | |
| vctk | English Multi-speaker Corpus for CSTR Voice Cloning Toolkit | ASR/TTS | ENG | http://www.udialogue.org/download/cstr-vctk-corpus.html | |
| vctk_reverb | Reverberant speech database (48kHz) | SE | ENG | https://datashare.ed.ac.uk/handle/10283/2826 | |
| vctk_noisyreverb | Noisy reverberant speech database (48kHz) | SE | ENG | https://datashare.ed.ac.uk/handle/10283/2826 | |
110 changes: 110 additions & 0 deletions egs2/urgent24/enh1/cmd.sh
@@ -0,0 +1,110 @@
# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
# e.g.
# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
#
# Options:
# --time <time>: Limit the maximum time to execute.
# --mem <mem>: Limit the maximum memory usage.
# --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
# --num-threads <nthread>: Specify the number of CPU cores.
# --gpu <ngpu>: Specify the number of GPU devices.
# --config: Change the configuration file from default.
#
# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
# The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
# e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
# Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
#
# run.pl, queue.pl, slurm.pl, and ssh.pl have a unified interface that does not depend on the backend.
# These options are mapped to backend-specific options, as configured
# in "conf/queue.conf" and "conf/slurm.conf" by default.
# If jobs fail, your configuration might be wrong for your environment.
#
#
# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
# "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
# =========================================================
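# Illustrative expansion (assuming standard Kaldi-style behavior):
#     run.pl JOB=1:3 exp/demo/log/echo.JOB.log echo JOB
# launches 3 parallel jobs; the 2nd job runs "echo 2" and writes its
# output to exp/demo/log/echo.2.log.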


# Select the backend used by run.sh from "local", "stdout", "sge", "slurm", or "ssh"
cmd_backend='local'

# Local machine, without any Job scheduling system
if [ "${cmd_backend}" = local ]; then

# Used for other general jobs
export train_cmd="run.pl"
# Used for "*_train.py": "--gpu" is appended optionally by run.sh
export cuda_cmd="run.pl"
# Used for "*_recog.py"
export decode_cmd="run.pl"

# Local machine logging to stdout and log file, without any Job scheduling system
elif [ "${cmd_backend}" = stdout ]; then

# Used for other general jobs
export train_cmd="stdout.pl"
# Used for "*_train.py": "--gpu" is appended optionally by run.sh
export cuda_cmd="stdout.pl"
# Used for "*_recog.py"
export decode_cmd="stdout.pl"


# "qsub" (Sun Grid Engine, or derivation of it)
elif [ "${cmd_backend}" = sge ]; then
# The default setting is written in conf/queue.conf.
# You must change "-q g.q" for the "queue" for your environment.
# To know the "queue" names, type "qhost -q"
# Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.

export train_cmd="queue.pl"
export cuda_cmd="queue.pl"
export decode_cmd="queue.pl"


# "qsub" (Torque/PBS.)
elif [ "${cmd_backend}" = pbs ]; then
# The default setting is written in conf/pbs.conf.

export train_cmd="pbs.pl"
export cuda_cmd="pbs.pl"
export decode_cmd="pbs.pl"


# "sbatch" (Slurm)
elif [ "${cmd_backend}" = slurm ]; then
# The default setting is written in conf/slurm.conf.
# You must change "-p cpu" and "-p gpu" for the "partition" for your environment.
# To know the "partion" names, type "sinfo".
# You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
# The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".

export train_cmd="slurm.pl"
export cuda_cmd="slurm.pl"
export decode_cmd="slurm.pl"

elif [ "${cmd_backend}" = ssh ]; then
# You have to create ".queue/machines" to specify the hosts on which to execute jobs.
# e.g. .queue/machines
# host1
# host2
# host3
# It is assumed that you can log in to them without a password, i.e. you have to set up SSH keys.

export train_cmd="ssh.pl"
export cuda_cmd="ssh.pl"
export decode_cmd="ssh.pl"

# This is an example of specifying several site-specific options for the JHU CLSP cluster.
# Users can modify/add their own command options according to their cluster environment.
elif [ "${cmd_backend}" = jhu ]; then

export train_cmd="queue.pl --mem 2G"
export cuda_cmd="queue-freegpu.pl --mem 2G --gpu 1 --config conf/queue.conf"
export decode_cmd="queue.pl --mem 4G"

else
echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
return 1
fi
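
# Illustrative usage (hypothetical paths): recipe scripts source this file and
# submit every job through the exported variables, e.g.
#     . ./cmd.sh
#     ${cuda_cmd} --gpu 1 exp/enh_train/train.log python3 -m espnet2.bin.enh_train ...
# so changing cmd_backend above redirects all jobs to the chosen scheduler
# without modifying the recipe scripts themselves.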
11 changes: 11 additions & 0 deletions egs2/urgent24/enh1/conf/pbs.conf
@@ -0,0 +1,11 @@
# Default configuration
command qsub -V -v PATH -S /bin/bash
option name=* -N $0
option mem=* -l mem=$0
option mem=0 # Do not add anything to qsub_opts
option num_threads=* -l ncpus=$0
option num_threads=1 # Do not add anything to qsub_opts
option num_nodes=* -l nodes=$0:ppn=1
default gpu=0
option gpu=0
option gpu=* -l ngpus=$0
12 changes: 12 additions & 0 deletions egs2/urgent24/enh1/conf/queue.conf
@@ -0,0 +1,12 @@
# Default configuration
command qsub -v PATH -cwd -S /bin/bash -j y -l arch=*64*
option name=* -N $0
option mem=* -l mem_free=$0,ram_free=$0
option mem=0 # Do not add anything to qsub_opts
option num_threads=* -pe smp $0
option num_threads=1 # Do not add anything to qsub_opts
option max_jobs_run=* -tc $0
option num_nodes=* -pe mpi $0 # You must set this PE as allocation_rule=1
default gpu=0
option gpu=0
option gpu=* -l gpu=$0 -q g.q
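# Illustrative mapping (assuming standard Kaldi queue.pl behavior): a call like
#     queue.pl --mem 4G --num-threads 2 JOB=1:10 exp/log/a.JOB.log <command>
# is translated by the rules above into qsub options such as
#     -l mem_free=4G,ram_free=4G -pe smp 2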
14 changes: 14 additions & 0 deletions egs2/urgent24/enh1/conf/slurm.conf
@@ -0,0 +1,14 @@
# Default configuration
command sbatch --export=PATH
option name=* --job-name $0
option time=* --time $0
option mem=* --mem-per-cpu $0
option mem=0
option num_threads=* --cpus-per-task $0
option num_threads=1 --cpus-per-task 1
option num_nodes=* --nodes $0
default gpu=0
option gpu=0 -p cpu
option gpu=* -p gpu --gres=gpu:$0 -c $0  # Recommended: allocate at least as many CPUs as GPUs
# note: the --max-jobs-run option is supported as a special case
# by slurm.pl and you don't have to handle it in the config file.
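# e.g. (illustrative, assuming standard slurm.pl behavior):
#     slurm.pl --max-jobs-run 8 JOB=1:100 exp/log/a.JOB.log <command>
# limits the 100-task array to at most 8 concurrently running tasks,
# using Slurm's array throttle (as in "--array=1-100%8").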
103 changes: 103 additions & 0 deletions egs2/urgent24/enh1/conf/tuning/train_enh_bsrnn_large_noncausal.yaml
@@ -0,0 +1,103 @@
use_amp: false
optim: adam
init: none
unused_parameters: true
max_epoch: 100
batch_type: folded
batch_size: 4
iterator_type: chunk
chunk_length: 200 # 4s
chunk_default_fs: 50 # GCD among all possible sampling frequencies
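# Note (assumed semantics): chunk_length is counted in samples at
# chunk_default_fs, so 200 samples at 50 Hz correspond to 4 s of audio;
# at 48000 Hz the same chunk spans 200 * 48000 / 50 = 192000 samples.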
num_iters_per_epoch: 8000
num_workers: 4
grad_clip: 5.0
optim_conf:
    lr: 1.0e-03
    eps: 1.0e-08
    weight_decay: 1.0e-05
patience: 40
val_scheduler_criterion:
- valid
- loss
best_model_criterion:
-   - valid
    - loss
    - min
keep_nbest_models: 1
scheduler: steplr
scheduler_conf:
    step_size: 2
    gamma: 0.99

allow_multi_rates: true

preprocessor: enh
force_single_channel: true
channel_reordering: true
# The categories list order must be the same everywhere in this config
categories:
- 1ch_8000Hz
- 1ch_16000Hz
- 1ch_22050Hz
- 1ch_24000Hz
- 1ch_32000Hz
- 1ch_44100Hz
- 1ch_48000Hz
num_spk: 1

model_conf:
    normalize_variance_per_ch: true
    #always_forward_in_48k: true
    # The categories list order must be the same everywhere in this config
    categories:
    - 1ch_8000Hz
    - 1ch_16000Hz
    - 1ch_22050Hz
    - 1ch_24000Hz
    - 1ch_32000Hz
    - 1ch_44100Hz
    - 1ch_48000Hz

encoder: stft
encoder_conf:
    n_fft: 960
    hop_length: 480
    use_builtin_complex: true
    default_fs: 48000
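# 960 samples at 48 kHz give a 20 ms STFT window with a 10 ms (480-sample) hop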
decoder: stft
decoder_conf:
    n_fft: 960
    hop_length: 480
    default_fs: 48000
separator: bsrnn
separator_conf:
    num_spk: 1
    num_channels: 196
    num_layers: 6
    target_fs: 48000
    ref_channel: 0
    causal: false

# A list of criterions
# The overall loss in multi-task learning will be:
#   loss = weight_1 * loss_1 + ... + weight_N * loss_N
# The default `weight` for each sub-loss is 1.0
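# With the weights below, the total reduces to loss = 1.0 * mr_l1_tfd + 0.0 * si_snr,
# i.e. si_snr adds nothing to the gradient and presumably serves only as a
# validation/monitoring metric.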
criterions:
  # The first criterion
  - name: mr_l1_tfd
    conf:
      window_sz: [256, 512, 768, 1024]
      hop_sz: null
      eps: 1.0e-8
      time_domain_weight: 0.5
      normalize_variance: true
    wrapper: fixed_order
    wrapper_conf:
      weight: 1.0
  # The second criterion
  - name: si_snr
    conf:
      eps: 1.0e-7
    wrapper: fixed_order
    wrapper_conf:
      weight: 0.0
104 changes: 104 additions & 0 deletions egs2/urgent24/enh1/conf/tuning/train_enh_conv_tasnet.yaml
@@ -0,0 +1,104 @@
use_amp: false
optim: adam
init: none
unused_parameters: true
max_epoch: 100
batch_type: folded
iterator_type: chunk
chunk_length: 200 # 4s
chunk_default_fs: 50 # GCD among all possible sampling frequencies
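# Note (assumed semantics): 200 samples at the 50 Hz reference rate = 4 s of
# audio; at 48000 Hz this scales to 200 * 48000 / 50 = 192000 samples per chunk.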
num_iters_per_epoch: 8000
num_workers: 4
grad_clip: 5.0
optim_conf:
    lr: 1.0e-03
    eps: 1.0e-08
    weight_decay: 1.0e-05
patience: 40
val_scheduler_criterion:
- valid
- loss
best_model_criterion:
-   - valid
    - loss
    - min
keep_nbest_models: 1
scheduler: steplr
scheduler_conf:
    step_size: 2
    gamma: 0.99

allow_multi_rates: true

preprocessor: enh
force_single_channel: true
channel_reordering: true
# The categories list order must be the same everywhere in this config
categories:
- 1ch_8000Hz
- 1ch_16000Hz
- 1ch_22050Hz
- 1ch_24000Hz
- 1ch_32000Hz
- 1ch_44100Hz
- 1ch_48000Hz
num_spk: 1

model_conf:
    normalize_variance_per_ch: true
    always_forward_in_48k: true
    # The categories list order must be the same everywhere in this config
    categories:
    - 1ch_8000Hz
    - 1ch_16000Hz
    - 1ch_22050Hz
    - 1ch_24000Hz
    - 1ch_32000Hz
    - 1ch_44100Hz
    - 1ch_48000Hz

encoder: conv
encoder_conf:
    channel: 1536 # for 48000 Hz input
    kernel_size: 120
    stride: 60
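# a 120-sample kernel at 48 kHz spans 2.5 ms, advancing 1.25 ms (60 samples) per frame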
decoder: conv
decoder_conf:
    channel: 1536 # for 48000 Hz input
    kernel_size: 120
    stride: 60
separator: tcn
separator_conf:
    num_spk: 1
    layer: 8
    stack: 4
    bottleneck_dim: 256
    hidden_dim: 512
    kernel: 3
    causal: false
    norm_type: "gLN"
    nonlinear: relu

# A list of criterions
# The overall loss in multi-task learning will be:
#   loss = weight_1 * loss_1 + ... + weight_N * loss_N
# The default `weight` for each sub-loss is 1.0
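# With the weights below: loss = 1.0 * mr_l1_tfd + 0.0 * si_snr, so si_snr
# contributes nothing to training and presumably serves only for monitoring.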
criterions:
  # The first criterion
  - name: mr_l1_tfd
    conf:
      window_sz: [256, 512, 768, 1024]
      hop_sz: null
      eps: 1.0e-8
      time_domain_weight: 0.5
      normalize_variance: true
    wrapper: fixed_order
    wrapper_conf:
      weight: 1.0
  # The second criterion
  - name: si_snr
    conf:
      eps: 1.0e-7
    wrapper: fixed_order
    wrapper_conf:
      weight: 0.0