add centralized data preparation for OWSM #5478

Merged (31 commits, Dec 5, 2023)

Changes from 1 commit

Commits (31)
3be1f40
add whisper data.sh for v1 and v2
jctian98 Oct 17, 2023
37ab173
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 17, 2023
92bf631
add OWSM v3 data recipe
jctian98 Oct 17, 2023
ac8e423
Merge commit 'FETCH_HEAD' into owsm_data
jctian98 Oct 17, 2023
a3c24bd
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 17, 2023
063dc3f
fix ci issues
jctian98 Oct 18, 2023
5e14a62
update with ci issues
jctian98 Oct 18, 2023
7b707cd
change egs name from mixed_v* to owsm_v*
jctian98 Oct 23, 2023
14204e2
v3 should be ready except wsj
jctian98 Oct 30, 2023
ae05a6c
add wsj
jctian98 Oct 30, 2023
c515f76
update db.sh
jctian98 Oct 30, 2023
b53ce47
Merge branch 'master' into owsm_data
jctian98 Oct 30, 2023
ec109e2
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 30, 2023
31ad173
almost finish all scripts
jctian98 Nov 10, 2023
8a09625
fix small problems
jctian98 Nov 10, 2023
952acf6
Merge commit 'FETCH_HEAD' into owsm_data
jctian98 Nov 10, 2023
2fd2668
merge master
jctian98 Nov 10, 2023
c53afd0
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 10, 2023
bdaf344
update the language mapping
jctian98 Nov 11, 2023
d379fd0
Merge commit 'FETCH_HEAD' into owsm_data
jctian98 Nov 11, 2023
b2cb427
Merge branch 'master' into owsm_data
jctian98 Nov 11, 2023
f5e5414
Merge commit 'FETCH_HEAD' into owsm_data
jctian98 Nov 11, 2023
51e3691
fix CI issue
jctian98 Nov 11, 2023
7f75d15
Merge commit 'FETCH_HEAD' into owsm_data
jctian98 Nov 11, 2023
66176bc
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 11, 2023
3d89d78
update wsj and commonvoice
jctian98 Nov 26, 2023
8f1e0fa
Merge commit 'FETCH_HEAD' into owsm_data
jctian98 Nov 26, 2023
77fe14b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 26, 2023
c391765
update wsj text norm script
jctian98 Nov 26, 2023
642fd22
update wsj text norm 2
jctian98 Nov 26, 2023
ee00c6c
revise voxpopuli
jctian98 Nov 29, 2023
merge master
jctian98 committed Nov 10, 2023
commit 2fd2668b81729e3d994e5200d85718b65aa0a4ab
16 changes: 14 additions & 2 deletions ci/test_integration_espnet2.sh
@@ -285,8 +285,10 @@ cd ./egs2/mini_an4/spk1
gen_dummy_coverage
echo "==== [ESPnet2] SPK ==="
./run.sh --ngpu 0 --stage 0 --stop-stage 4 --feats-type "raw" --python "${python}" --spk-args "--num_workers 0"
./run.sh --ngpu 0 --stage 4 --stop-stage 4 --feats-type "raw" --python "${python}" --spk_config conf/train_rawnet3_dataaug_debug.yaml --spk-args "--num_workers 0"
./run.sh --ngpu 0 --stage 4 --stop-stage 4 --feats-type "raw" --python "${python}" --spk_config conf/train_rawnet3_sampler.yaml --spk-args "--num_workers 0"
./run.sh --ngpu 0 --stage 5 --stop-stage 5 --feats-type "raw" --python "${python}" --spk_config conf/train_rawnet3_dataaug_debug.yaml --spk-args "--num_workers 0"
./run.sh --ngpu 0 --stage 5 --stop-stage 5 --feats-type "raw" --python "${python}" --spk_config conf/train_rawnet3_sampler.yaml --spk-args "--num_workers 0"
./run.sh --ngpu 0 --stage 5 --stop-stage 5 --feats-type "raw" --python "${python}" --spk_config conf/train_ecapa.yaml --spk-args "--num_workers 0"
./run.sh --ngpu 0 --stage 6 --stop-stage 7 --feats-type "raw" --python "${python}" --spk_config conf/train_rawnet3_sampler.yaml --spk-args "--num_workers 0" --inference_model "valid.eer.ave.pth"
# Remove generated files in order to reduce the disk usage
rm -rf exp dump data
cd "${cwd}"
@@ -300,6 +302,16 @@ echo "==== [ESPnet2] S2T ==="
rm -rf exp dump data
cd "${cwd}"

# [ESPnet2] test s2st1 recipe
cd ./egs2/mini_an4/s2st1
gen_dummy_coverage
echo "==== [ESPnet2] S2ST ==="
./run.sh --ngpu 0 --stage 1 --stop_stage 8 --use_discrete_unit false --s2st_config conf/s2st_spec_debug.yaml --python "${python}"
./run.sh --ngpu 0 --stage 1 --stop_stage 8 --python "${python}" --use_discrete_unit true --s2st_config conf/train_s2st_discrete_unit_debug.yaml --clustering_num_threads 2 --feature_num_clusters 5
# Remove generated files in order to reduce the disk usage
rm -rf exp dump data ckpt .cache
cd "${cwd}"

echo "=== report ==="
coverage combine egs2/*/*/.coverage
coverage report
36 changes: 1 addition & 35 deletions doc/paper/espnet-se++/paper.bib
@@ -83,7 +83,7 @@ @inproceedings{Takahashi:2019
booktitle = {Interspeech 2019}
}

@inproceedings{Luo:2019a,
@inproceedings{Luo:2019,
doi = {10.1109/asru46091.2019.9003849},
url = {https://doi.org/10.1109%2Fasru46091.2019.9003849},
pages={260--267},
@@ -248,30 +248,6 @@ @inproceedings{Povey:2011
organization={IEEE Signal Processing Society}
}


@inproceedings{Luo:2020,
doi = {10.1109/icassp40776.2020.9054266},
title={Dual-path {RNN}: efficient long sequence modeling for time-domain single-channel speech separation},
author={Luo, Y. and Chen, Z. and Yoshioka, T.},
booktitle={2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
pages={46--50},
year={2020},
organization={IEEE}
}

@article{Luo:2019b,
doi = {10.1109/taslp.2019.2915167},
title={{Conv-TasNet}: Surpassing ideal time--frequency magnitude masking for speech separation},
author={Luo, Y. and Mesgarani, N.},
journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing},
volume={27},
number={8},
pages={1256--1266},
year={2019},
publisher={IEEE}
}


@article{Taal:2011,
doi = {10.1109/tasl.2011.2114881},
title={An algorithm for intelligibility prediction of time--frequency weighted noisy speech},
@@ -295,16 +271,6 @@ @inproceedings{Rix:2001
organization={IEEE}
}

@inproceedings{Yu:2017,
doi = {10.1109/icassp.2017.7952154},
title={Permutation invariant training of deep models for speaker-independent multi-talker speech separation},
author={Yu, D. and Kolbæk, M. and Tan, Z. H. and Jensen, J.},
booktitle={2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
pages={241--245},
year={2017},
organization={IEEE}
}

@article{Towns:2014,
doi = {10.1109/mcse.2014.80},
title={{XSEDE}: accelerating scientific discovery},
6 changes: 3 additions & 3 deletions doc/paper/espnet-se++/paper.md
@@ -71,7 +71,7 @@ bibliography: paper.bib

# Summary
This paper presents the software design and user interface of ESPnet-SE++, a new speech separation and enhancement (SSE) module of the ESPnet toolkit.
ESPnet-SE++ significantly expands the functionality of ESPnet-SE [@Li:2021] with several new models[@Hershey:2016; @Chen:2017; @Hu:2020; @Tan:2021; @Li:2022; @Dang:2022; @Takahashi:2019; @Luo:2019a; @Lu:2022a], loss functions [@Luo:2018; @Le:2019; @Boeddeker:2021; @Scheibler:2022], and training recipes as shown in [@Lu:2022b]. Crucially, it features a new, redesigned interface, which allows for a flexible combination of SSE front-ends with many downstream tasks, including automatic speech recognition (ASR), speaker diarization (SD), speech translation (ST), and spoken language understanding (SLU).
ESPnet-SE++ significantly expands the functionality of ESPnet-SE [@Li:2021] with several new models[@Hershey:2016; @Chen:2017; @Hu:2020; @Tan:2021; @Li:2022; @Dang:2022; @Takahashi:2019; @Luo:2019; @Lu:2022a], loss functions [@Luo:2018; @Le:2019; @Boeddeker:2021; @Scheibler:2022], and training recipes as shown in [@Lu:2022b]. Crucially, it features a new, redesigned interface, which allows for a flexible combination of SSE front-ends with many downstream tasks, including automatic speech recognition (ASR), speaker diarization (SD), speech translation (ST), and spoken language understanding (SLU).

# Statement of need

@@ -127,7 +127,7 @@ object with the data-iterator for testing and validation. During its initializat

#### bin/enh_scoring.py
def scoring(..., ref_scp, inf_scp, ...)
The SSE scoring functions calculates several popular objective scores such as SI-SDR [@le:2019], STOI [@Taal:2011], SDR and PESQ [@Rix:2001], based on the reference signal and processed speech pairs.
The SSE scoring functions calculates several popular objective scores such as SI-SDR [@Le:2019], STOI [@Taal:2011], SDR and PESQ [@Rix:2001], based on the reference signal and processed speech pairs.
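
For intuition, here is a minimal NumPy sketch of the SI-SDR metric named above. It is a conceptual illustration under the usual zero-mean definition, not the ESPnet-SE++ scoring code, and the `si_sdr` helper name is invented here.

```python
# Minimal SI-SDR sketch; illustrative only, not the ESPnet-SE++ implementation.
import numpy as np


def si_sdr(estimate: np.ndarray, reference: np.ndarray) -> float:
    """Scale-invariant SDR in dB for one pair of 1-D signals."""
    estimate = estimate - estimate.mean()  # SI-SDR is defined on zero-mean signals
    reference = reference - reference.mean()
    # Project the estimate onto the reference to obtain the scaled target part.
    alpha = np.dot(estimate, reference) / np.dot(reference, reference)
    target = alpha * reference
    noise = estimate - target
    return float(10 * np.log10(np.sum(target**2) / np.sum(noise**2)))
```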

### SSE Control Class `tasks/enh.py`

@@ -200,7 +200,7 @@ which processes speech and only returns losses for [Trainer](https://github.com/
# ESPnet-SE++ User Interface

## Building a New Recipe from Scratch
Since ESPnet2 provides common scripts such as `enh.sh` and `enh_asr.sh` for each task, users only need to create `local/data.sh` for the data preparation of a new corpus. The generated data follows the Kaldi-style structure:
Since ESPnet2 provides common scripts such as `enh.sh` and `enh_asr.sh` for each task, users only need to create `local/data.sh` for the data preparation of a new corpus. The generated data follows the Kaldi-style structure [@Povey:2011]:


![](graphics/data_structure.png)
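
For concreteness, a Kaldi-style data directory of the kind `local/data.sh` must generate is a set of plain-text maps keyed by utterance ID. The sketch below shows the standard file names with invented example entries; the figure above gives the full layout.

```text
data/train/
  wav.scp   # <utt-id> <audio path or command pipe>, e.g. utt001 /data/utt001.wav
  text      # <utt-id> <transcription>,              e.g. utt001 HELLO WORLD
  utt2spk   # <utt-id> <speaker-id>,                 e.g. utt001 spk01
  spk2utt   # <speaker-id> <utt-id> <utt-id> ...,    e.g. spk01 utt001 utt002
```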
37 changes: 3 additions & 34 deletions egs/must_c/st1/local/download_and_untar.sh
@@ -34,35 +34,9 @@ if [ ! "$(echo ${langs} | grep ${lang})" ]; then
fi

if [ ${version} = "v1" ]; then
if [ ${lang} = "de" ]; then
url=https://drive.google.com/open?id=1Mf2il_VelDIJMSio0bq7I8M9fSs-X4Ie
elif [ ${lang} = "es" ]; then
url=https://drive.google.com/open?id=14d2ttsuEUFXsxx-KRWJMsFhQGrYOJcpH
elif [ ${lang} = "fr" ]; then
url=https://drive.google.com/open?id=1acIBqcPVX5QXXXV9u8_yDPtCgfsdEJDV
elif [ ${lang} = "it" ]; then
url=https://drive.google.com/open?id=1qbK88SAKxqjMUybkMeIjrJWnNAZyE8V0
elif [ ${lang} = "nl" ]; then
url=https://drive.google.com/open?id=11fNraDQs-LiODDxyV5ZW0Slf3XuDq5Cf
elif [ ${lang} = "pt" ]; then
url=https://drive.google.com/open?id=1C5qK1FckA702nsYcXwmGdzlMmHg1F_ot
elif [ ${lang} = "ro" ]; then
url=https://drive.google.com/open?id=1nbdYR5VqcTbLpOB-9cICKCgsLAs7fVzd
elif [ ${lang} = "ru" ]; then
url=https://drive.google.com/open?id=1Z3hSiP7fsR3kf8fjQYzIa07jmw4KXNnw
else
echo "${lang} is not supported now."
exit 1;
fi
instructions="Please download the archives from https://mt.fbk.eu/must-c-release-v1-0/ and place them inside ${data}."
elif [ ${version} = "v2" ]; then
if [ ${lang} = "de" ]; then
url=https://drive.google.com/u/0/uc?id=1UBPNwFEVhIZCOEpu4hTqPji57XRg85UO
elif [ ${lang} = "zh" ]; then
url=https://drive.google.com/u/0/uc?id=1iz2Yl1avlzF79_77iKK7kPlcmbZhk3o6
else
echo "${lang} is not supported now."
exit 1;
fi
instructions="Please download the archives from https://mt.fbk.eu/must-c-release-v2-0/ and place them inside ${data}. For en-ja and en-zh, you may just download the archive without H5 files."
else
echo "${version} is not supported now."
exit 1;
@@ -85,12 +59,7 @@ if [ -f ${tar_path} ]; then
fi

if [ ! -f ${tar_path} ]; then
if ! which wget >/dev/null; then
echo "$0: wget is not installed."
exit 1;
fi
echo "$0: downloading data from ${url}. This may take some time, please be patient."
download_from_google_drive.sh ${url} ${data} tar.gz || exit 1
echo ${instructions}
fi

if ! tar -zxvf ${tar_path} -d -C ${data}; then
5 changes: 4 additions & 1 deletion egs2/README.md
@@ -90,7 +90,9 @@ See: https://espnet.github.io/espnet/espnet2_tutorial.html#recipes-using-espnet2
| kss | Korean single speaker corpus | TTS | KOR | https://www.kaggle.com/bryanpark/korean-single-speaker-speech-dataset | |
| l3das22 | L3DAS22: Machine Learning for 3D Audio Signal Processing - ICASSP 2022 | SE | ENG | https://www.l3das.com/icassp2022/ | |
| laborotv | LaboroTVSpeech (A large-scale Japanese speech corpus on TV recordings) | ASR | JPN | https://laboro.ai/column/eg-laboro-tv-corpus-jp | |
| librispeech | Librilight-limited subset | ASR | ENG | https://dl.fbaipublicfiles.com/librilight/data/librispeech_finetuning.tgz | |
| libriheavy_medium | Libriheavy medium subset | ASR | ENG | https://github.com/k2-fsa/libriheavy | |
| libriheavy_small | Libriheavy small subset | ASR | ENG | https://github.com/k2-fsa/libriheavy | |
| librilight_limited | Librilight-limited subset | ASR | ENG | https://dl.fbaipublicfiles.com/librilight/data/librispeech_finetuning.tgz | |
| librimix | LibriMix: An Open-Source Dataset for Generalizable Speech Separation | SE/DIAR | ENG | https://github.com/JorisCos/LibriMix | |
| librispeech | LibriSpeech ASR corpus | ASR | ENG | http://www.openslr.org/12 | |
| librispeech_100 | LibriSpeech ASR corpus 100h subset | ASR | ENG | http://www.openslr.org/12 | |
@@ -152,6 +154,7 @@ See: https://espnet.github.io/espnet/espnet2_tutorial.html#recipes-using-espnet2
| snips | SNIPS: A dataset for spoken language understanding | SLU | ENG | https://github.com/sonos/spoken-language-understanding-research-datasets | |
| speechcommands | Speech Commands: A Dataset for Limited-Vocabulary Speech Recognition | SLU | ENG | https://www.tensorflow.org/datasets/catalog/speech_commands | |
| spgispeech | SPGISpeech 5k corpus | ASR | ENG | https://datasets.kensho.com/datasets/scribe | |
| spring_speech | SPRING-INX: Data for Indian Languages | ASR | ENG | https://asr.iitm.ac.in/dataset | |
| stop | STOP: Spoken Task Oriented Parsing | SLU | ENG | https://facebookresearch.github.io/spoken_task_oriented_parsing/ | |
| su_openslr36 | Sundanese | ASR | SUN | http://www.openslr.org/36 | |
| swbd | Switchboard Corpus for 2-channel Conversational Telephone Speech (300h) | ASR | ENG | https://catalog.ldc.upenn.edu/LDC97S62 | |
2 changes: 2 additions & 0 deletions egs2/TEMPLATE/asr1/db.sh
@@ -60,6 +60,7 @@ LABOROTV=
TEDXJP=
LIBRISPEECH=downloads
LIBRILIGHT_LIMITED=
LIBRILIGHT=downloads
FSC=
FREESOUND=
MELD=downloads
@@ -206,6 +207,7 @@ KIRITAN=
NAMINE=
KATHBATH=downloads
GRAMVAANI=downloads
SPRING_INX=downloads
VOXCELEB=
KSPONSPEECH=

60 changes: 60 additions & 0 deletions egs2/TEMPLATE/asr1/pyscripts/utils/calculate_eer_mindcf.py
@@ -0,0 +1,60 @@
import sys
from typing import List, Tuple

import numpy as np

from espnet2.utils.eer import ComputeErrorRates, ComputeMinDcf, tuneThresholdfromScore


def load_scorefile(scorefile: str) -> Tuple[List[float], List[int]]:
    with open(scorefile, "r") as f:
        lines = f.readlines()
    scores, labels = [], []
    for line in lines:
        # each line: <trial-id> <score> <label>
        _, score, label = line.strip().split(" ")
        scores.append(float(score))
        labels.append(int(label))

    return scores, labels


def main(args):
    scorefile = args[0]
    out_dir = args[1]

    # get scores and labels
    scores, labels = load_scorefile(scorefile)

    # calculate statistics in target and nontarget classes.
    n_trials = len(scores)
    scores_trg = []
    scores_nontrg = []
    for _s, _l in zip(scores, labels):
        if _l == 1:
            scores_trg.append(_s)
        elif _l == 0:
            scores_nontrg.append(_s)
        else:
            raise ValueError(f"{_l}, {type(_l)}")
    trg_mean = float(np.mean(scores_trg))
    trg_std = float(np.std(scores_trg))
    nontrg_mean = float(np.mean(scores_nontrg))
    nontrg_std = float(np.std(scores_nontrg))

    # predictions, ground truth, and the false acceptance rates to calculate
    results = tuneThresholdfromScore(scores, labels, [1, 0.1])
    eer = results[1]
    fnrs, fprs, thresholds = ComputeErrorRates(scores, labels)

    # p_target, c_miss, and c_falsealarm in NIST minDCF calculation
    p_trg, c_miss, c_fa = 0.05, 1, 1
    mindcf, _ = ComputeMinDcf(fnrs, fprs, thresholds, p_trg, c_miss, c_fa)

    with open(out_dir, "w") as f:
        f.write(f"trg_mean: {trg_mean}, trg_std: {trg_std}\n")
        f.write(f"nontrg_mean: {nontrg_mean}, nontrg_std: {nontrg_std}\n")
        f.write(f"eer: {eer}, mindcf: {mindcf}\n")


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
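
The score file read above is three space-separated fields per line, `<trial-id> <score> <label>`, with label 1 for target and 0 for non-target trials. For intuition about the two reported numbers (the script itself delegates the real computation to `espnet2.utils.eer`): EER is the error rate at the threshold where the miss rate FNR equals the false-alarm rate FPR, and minDCF is the NIST detection cost `c_miss * p_trg * FNR(t) + c_fa * (1 - p_trg) * FPR(t)`, minimized over thresholds `t` and normalized by the cost of the best trivial accept-all/reject-all system. A hedged NumPy sketch, assuming higher scores mean "target" and both classes are present:

```python
# Conceptual EER / minDCF sketch; not the espnet2.utils.eer implementation.
import numpy as np


def eer_and_mindcf(scores, labels, p_trg=0.05, c_miss=1.0, c_fa=1.0):
    scores, labels = np.asarray(scores, float), np.asarray(labels, int)
    thresholds = np.sort(scores)
    # Miss rate and false-alarm rate at each candidate threshold ("accept if >= t").
    fnr = np.array([np.mean(scores[labels == 1] < t) for t in thresholds])
    fpr = np.array([np.mean(scores[labels == 0] >= t) for t in thresholds])
    i = np.argmin(np.abs(fnr - fpr))
    eer = (fnr[i] + fpr[i]) / 2  # where misses and false alarms cross
    c_det = c_miss * p_trg * fnr + c_fa * (1 - p_trg) * fpr
    c_def = min(c_miss * p_trg, c_fa * (1 - p_trg))  # best trivial system
    return eer, float(np.min(c_det) / c_def)
```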
67 changes: 67 additions & 0 deletions egs2/TEMPLATE/asr1/pyscripts/utils/generate_cohort_list.py
@@ -0,0 +1,67 @@
import sys

import numpy as np
import soundfile as sf
import yaml

np.random.seed(0)


def load_yaml(yamlfile):
    with open(yamlfile, "r") as stream:
        data = yaml.safe_load(stream)
    return data


def main(args):
    spk2utt = args[0]
    wav_scp = args[1]
    out_dir = args[2]
    cfg = load_yaml(args[3])
    samp_rate = args[4][:-1]
    print(cfg)
    with open(wav_scp) as f:
        lines = f.readlines()
    wav2dir_dic = {
        line.strip().split(" ")[0]: line.strip().split(" ")[1] for line in lines
    }

    with open(spk2utt, "r") as f:
        spk2utt = f.readlines()[: cfg["num_cohort_spk"]]

    utt_list = []
    trg_samp = int(cfg["target_duration"] * int(samp_rate) * 1000)

    for spk in spk2utt:
        chunk = spk.strip().split(" ")
        spk = chunk[0]
        utts = sorted(chunk[1:])
        np.random.shuffle(utts)
        n_selected = 0
        for utt in utts:
            utt_file = wav2dir_dic[utt]
            dur = sf.info(utt_file).duration
            if dur >= cfg["utt_select_sec"]:
                utt_list.append(utt)
                n_selected += 1
            if n_selected == cfg["num_utt_per_spk"]:
                break
    print(f"Cohort utterances selected, {len(utt_list)} utts, {len(spk2utt)} spks")

    # generate output adequate to ESPnet-SPK inference template
    utt_list1 = utt_list[: len(utt_list) // 2]
    utt_list2 = utt_list[len(utt_list) // 2 :]
    with open(out_dir + "/cohort.scp", "w") as f_coh, open(
        out_dir + "/cohort2.scp", "w"
    ) as f_coh2, open(out_dir + "/cohort_speech_shape", "w") as f_shape, open(
        out_dir + "/cohort_label", "w"
    ) as f_lbl:
        for utt1, utt2 in zip(utt_list1, utt_list2):
            f_coh.write(f"{utt1}*{utt2} {wav2dir_dic[utt1]}\n")
            f_coh2.write(f"{utt1}*{utt2} {wav2dir_dic[utt2]}\n")
            f_shape.write(f"{utt1}*{utt2} {trg_samp}\n")
            f_lbl.write(f"{utt1}*{utt2} 0\n")


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
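
The YAML config passed as the fourth argument drives the cohort selection. The four keys below are the ones the script actually reads; the values shown are invented placeholders, not recipe defaults. A sketch of the equivalent settings and the sample-count arithmetic (a sample-rate argument such as `16k` has its trailing `k` stripped before the conversion):

```python
# The four keys read by generate_cohort_list.py; VALUES here are invented
# placeholders, not recipe defaults.
cohort_cfg = {
    "num_cohort_spk": 400,   # speakers taken from the top of spk2utt
    "num_utt_per_spk": 2,    # utterances kept per cohort speaker
    "utt_select_sec": 3.0,   # minimum duration (s) for a candidate utterance
    "target_duration": 3.0,  # target length in seconds
}
# "16k" -> 16, so the length written to cohort_speech_shape is
# 3.0 * 16 * 1000 = 48000 samples.
trg_samp = int(cohort_cfg["target_duration"] * int("16k"[:-1]) * 1000)
```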
You are viewing a condensed version of this merge commit.