add centralized data preparation for OWSM #5478

Merged (31 commits, Dec 5, 2023)

Changes from 1 commit

Commits (31)
3be1f40
add whisper data.sh for v1 and v2
jctian98 Oct 17, 2023
37ab173
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 17, 2023
92bf631
add OWSM v3 data recipe
jctian98 Oct 17, 2023
ac8e423
Merge commit 'FETCH_HEAD' into owsm_data
jctian98 Oct 17, 2023
a3c24bd
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 17, 2023
063dc3f
fix ci issues
jctian98 Oct 18, 2023
5e14a62
update with ci issues
jctian98 Oct 18, 2023
7b707cd
change egs name from mixed_v* to owsm_v*
jctian98 Oct 23, 2023
14204e2
v3 should be ready except wsj
jctian98 Oct 30, 2023
ae05a6c
add wsj
jctian98 Oct 30, 2023
c515f76
update db.sh
jctian98 Oct 30, 2023
b53ce47
Merge branch 'master' into owsm_data
jctian98 Oct 30, 2023
ec109e2
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 30, 2023
31ad173
almost finish all scripts
jctian98 Nov 10, 2023
8a09625
fix small problems
jctian98 Nov 10, 2023
952acf6
Merge commit 'FETCH_HEAD' into owsm_data
jctian98 Nov 10, 2023
2fd2668
merge master
jctian98 Nov 10, 2023
c53afd0
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 10, 2023
bdaf344
update the language mapping
jctian98 Nov 11, 2023
d379fd0
Merge commit 'FETCH_HEAD' into owsm_data
jctian98 Nov 11, 2023
b2cb427
Merge branch 'master' into owsm_data
jctian98 Nov 11, 2023
f5e5414
Merge commit 'FETCH_HEAD' into owsm_data
jctian98 Nov 11, 2023
51e3691
fix CI issue
jctian98 Nov 11, 2023
7f75d15
Merge commit 'FETCH_HEAD' into owsm_data
jctian98 Nov 11, 2023
66176bc
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 11, 2023
3d89d78
update wsj and commonvoice
jctian98 Nov 26, 2023
8f1e0fa
Merge commit 'FETCH_HEAD' into owsm_data
jctian98 Nov 26, 2023
77fe14b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 26, 2023
c391765
update wsj text norm script
jctian98 Nov 26, 2023
642fd22
update wsj text norm 2
jctian98 Nov 26, 2023
ee00c6c
revise voxpopuli
jctian98 Nov 29, 2023
merge master
jctian98 committed Nov 10, 2023
commit 2fd2668b81729e3d994e5200d85718b65aa0a4ab
16 changes: 14 additions & 2 deletions ci/test_integration_espnet2.sh
@@ -285,8 +285,10 @@ cd ./egs2/mini_an4/spk1
gen_dummy_coverage
echo "==== [ESPnet2] SPK ==="
./run.sh --ngpu 0 --stage 0 --stop-stage 4 --feats-type "raw" --python "${python}" --spk-args "--num_workers 0"
./run.sh --ngpu 0 --stage 4 --stop-stage 4 --feats-type "raw" --python "${python}" --spk_config conf/train_rawnet3_dataaug_debug.yaml --spk-args "--num_workers 0"
./run.sh --ngpu 0 --stage 4 --stop-stage 4 --feats-type "raw" --python "${python}" --spk_config conf/train_rawnet3_sampler.yaml --spk-args "--num_workers 0"
./run.sh --ngpu 0 --stage 5 --stop-stage 5 --feats-type "raw" --python "${python}" --spk_config conf/train_rawnet3_dataaug_debug.yaml --spk-args "--num_workers 0"
./run.sh --ngpu 0 --stage 5 --stop-stage 5 --feats-type "raw" --python "${python}" --spk_config conf/train_rawnet3_sampler.yaml --spk-args "--num_workers 0"
./run.sh --ngpu 0 --stage 5 --stop-stage 5 --feats-type "raw" --python "${python}" --spk_config conf/train_ecapa.yaml --spk-args "--num_workers 0"
./run.sh --ngpu 0 --stage 6 --stop-stage 7 --feats-type "raw" --python "${python}" --spk_config conf/train_rawnet3_sampler.yaml --spk-args "--num_workers 0" --inference_model "valid.eer.ave.pth"
# Remove generated files in order to reduce the disk usage
rm -rf exp dump data
cd "${cwd}"
@@ -300,6 +302,16 @@ echo "==== [ESPnet2] S2T ==="
rm -rf exp dump data
cd "${cwd}"

# [ESPnet2] test s2st1 recipe
cd ./egs2/mini_an4/s2st1
gen_dummy_coverage
echo "==== [ESPnet2] S2ST ==="
./run.sh --ngpu 0 --stage 1 --stop_stage 8 --use_discrete_unit false --s2st_config conf/s2st_spec_debug.yaml --python "${python}"
./run.sh --ngpu 0 --stage 1 --stop_stage 8 --python "${python}" --use_discrete_unit true --s2st_config conf/train_s2st_discrete_unit_debug.yaml --clustering_num_threads 2 --feature_num_clusters 5
# Remove generated files in order to reduce the disk usage
rm -rf exp dump data ckpt .cache
cd "${cwd}"

echo "=== report ==="
coverage combine egs2/*/*/.coverage
coverage report
36 changes: 1 addition & 35 deletions doc/paper/espnet-se++/paper.bib
@@ -83,7 +83,7 @@ @inproceedings{Takahashi:2019
booktitle = {Interspeech 2019}
}

@inproceedings{Luo:2019a,
@inproceedings{Luo:2019,
doi = {10.1109/asru46091.2019.9003849},
url = {https://doi.org/10.1109%2Fasru46091.2019.9003849},
pages={260--267},
@@ -248,30 +248,6 @@ @inproceedings{Povey:2011
organization={IEEE Signal Processing Society}
}


@inproceedings{Luo:2020,
doi = {10.1109/icassp40776.2020.9054266},
title={Dual-path {RNN}: efficient long sequence modeling for time-domain single-channel speech separation},
author={Luo, Y. and Chen, Z. and Yoshioka, T.},
booktitle={2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
pages={46--50},
year={2020},
organization={IEEE}
}

@article{Luo:2019b,
doi = {10.1109/taslp.2019.2915167},
title={{Conv-TasNet}: Surpassing ideal time--frequency magnitude masking for speech separation},
author={Luo, Y. and Mesgarani, N.},
journal={IEEE/ACM Transactions on Audio, Speech, and Language Processing},
volume={27},
number={8},
pages={1256--1266},
year={2019},
publisher={IEEE}
}


@article{Taal:2011,
doi = {10.1109/tasl.2011.2114881},
title={An algorithm for intelligibility prediction of time--frequency weighted noisy speech},
@@ -295,16 +271,6 @@ @inproceedings{Rix:2001
organization={IEEE}
}

@inproceedings{Yu:2017,
doi = {10.1109/icassp.2017.7952154},
title={Permutation invariant training of deep models for speaker-independent multi-talker speech separation},
author={Yu, D. and Kolbæk, M. and Tan, Z. H. and Jensen, J.},
booktitle={2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
pages={241--245},
year={2017},
organization={IEEE}
}

@article{Towns:2014,
doi = {10.1109/mcse.2014.80},
title={{XSEDE}: accelerating scientific discovery},
6 changes: 3 additions & 3 deletions doc/paper/espnet-se++/paper.md
@@ -71,7 +71,7 @@ bibliography: paper.bib

# Summary
This paper presents the software design and user interface of ESPnet-SE++, a new speech separation and enhancement (SSE) module of the ESPnet toolkit.
ESPnet-SE++ significantly expands the functionality of ESPnet-SE [@Li:2021] with several new models[@Hershey:2016; @Chen:2017; @Hu:2020; @Tan:2021; @Li:2022; @Dang:2022; @Takahashi:2019; @Luo:2019a; @Lu:2022a], loss functions [@Luo:2018; @Le:2019; @Boeddeker:2021; @Scheibler:2022], and training recipes as shown in [@Lu:2022b]. Crucially, it features a new, redesigned interface, which allows for a flexible combination of SSE front-ends with many downstream tasks, including automatic speech recognition (ASR), speaker diarization (SD), speech translation (ST), and spoken language understanding (SLU).
ESPnet-SE++ significantly expands the functionality of ESPnet-SE [@Li:2021] with several new models[@Hershey:2016; @Chen:2017; @Hu:2020; @Tan:2021; @Li:2022; @Dang:2022; @Takahashi:2019; @Luo:2019; @Lu:2022a], loss functions [@Luo:2018; @Le:2019; @Boeddeker:2021; @Scheibler:2022], and training recipes as shown in [@Lu:2022b]. Crucially, it features a new, redesigned interface, which allows for a flexible combination of SSE front-ends with many downstream tasks, including automatic speech recognition (ASR), speaker diarization (SD), speech translation (ST), and spoken language understanding (SLU).

# Statement of need

@@ -127,7 +127,7 @@ object with the data-iterator for testing and validation. During its initializat

#### bin/enh_scoring.py
def scoring(..., ref_scp, inf_scp, ...)
The SSE scoring functions calculates several popular objective scores such as SI-SDR [@le:2019], STOI [@Taal:2011], SDR and PESQ [@Rix:2001], based on the reference signal and processed speech pairs.
The SSE scoring functions calculates several popular objective scores such as SI-SDR [@Le:2019], STOI [@Taal:2011], SDR and PESQ [@Rix:2001], based on the reference signal and processed speech pairs.
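
For intuition, here is a minimal NumPy sketch of the SI-SDR metric named above. It is a conceptual illustration under the usual zero-mean definition, not the ESPnet-SE++ scoring code, and the `si_sdr` helper name is invented here.

```python
# Minimal SI-SDR sketch; illustrative only, not the ESPnet-SE++ implementation.
import numpy as np


def si_sdr(estimate: np.ndarray, reference: np.ndarray) -> float:
    """Scale-invariant SDR in dB for one pair of 1-D signals."""
    estimate = estimate - estimate.mean()  # SI-SDR is defined on zero-mean signals
    reference = reference - reference.mean()
    # Project the estimate onto the reference to obtain the scaled target part.
    alpha = np.dot(estimate, reference) / np.dot(reference, reference)
    target = alpha * reference
    noise = estimate - target
    return float(10 * np.log10(np.sum(target**2) / np.sum(noise**2)))
```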

### SSE Control Class `tasks/enh.py`

@@ -200,7 +200,7 @@ which processes speech and only returns losses for [Trainer](https://github.com/
# ESPnet-SE++ User Interface

## Building a New Recipe from Scratch
Since ESPnet2 provides common scripts such as `enh.sh` and `enh_asr.sh` for each task, users only need to create `local/data.sh` for the data preparation of a new corpus. The generated data follows the Kaldi-style structure:
Since ESPnet2 provides common scripts such as `enh.sh` and `enh_asr.sh` for each task, users only need to create `local/data.sh` for the data preparation of a new corpus. The generated data follows the Kaldi-style structure [@Povey:2011]:


![](graphics/data_structure.png)
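
For concreteness, a Kaldi-style data directory of the kind `local/data.sh` must generate is a set of plain-text maps keyed by utterance ID. The sketch below shows the standard file names with invented example entries; the figure above gives the full layout.

```text
data/train/
  wav.scp   # <utt-id> <audio path or command pipe>, e.g. utt001 /data/utt001.wav
  text      # <utt-id> <transcription>,              e.g. utt001 HELLO WORLD
  utt2spk   # <utt-id> <speaker-id>,                 e.g. utt001 spk01
  spk2utt   # <speaker-id> <utt-id> <utt-id> ...,    e.g. spk01 utt001 utt002
```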
37 changes: 3 additions & 34 deletions egs/must_c/st1/local/download_and_untar.sh
@@ -34,35 +34,9 @@ if [ ! "$(echo ${langs} | grep ${lang})" ]; then
fi

if [ ${version} = "v1" ]; then
if [ ${lang} = "de" ]; then
url=https://drive.google.com/open?id=1Mf2il_VelDIJMSio0bq7I8M9fSs-X4Ie
elif [ ${lang} = "es" ]; then
url=https://drive.google.com/open?id=14d2ttsuEUFXsxx-KRWJMsFhQGrYOJcpH
elif [ ${lang} = "fr" ]; then
url=https://drive.google.com/open?id=1acIBqcPVX5QXXXV9u8_yDPtCgfsdEJDV
elif [ ${lang} = "it" ]; then
url=https://drive.google.com/open?id=1qbK88SAKxqjMUybkMeIjrJWnNAZyE8V0
elif [ ${lang} = "nl" ]; then
url=https://drive.google.com/open?id=11fNraDQs-LiODDxyV5ZW0Slf3XuDq5Cf
elif [ ${lang} = "pt" ]; then
url=https://drive.google.com/open?id=1C5qK1FckA702nsYcXwmGdzlMmHg1F_ot
elif [ ${lang} = "ro" ]; then
url=https://drive.google.com/open?id=1nbdYR5VqcTbLpOB-9cICKCgsLAs7fVzd
elif [ ${lang} = "ru" ]; then
url=https://drive.google.com/open?id=1Z3hSiP7fsR3kf8fjQYzIa07jmw4KXNnw
else
echo "${lang} is not supported now."
exit 1;
fi
instructions="Please download the archives from https://mt.fbk.eu/must-c-release-v1-0/ and place them inside ${data}."
elif [ ${version} = "v2" ]; then
if [ ${lang} = "de" ]; then
url=https://drive.google.com/u/0/uc?id=1UBPNwFEVhIZCOEpu4hTqPji57XRg85UO
elif [ ${lang} = "zh" ]; then
url=https://drive.google.com/u/0/uc?id=1iz2Yl1avlzF79_77iKK7kPlcmbZhk3o6
else
echo "${lang} is not supported now."
exit 1;
fi
instructions="Please download the archives from https://mt.fbk.eu/must-c-release-v2-0/ and place them inside ${data}. For en-ja and en-zh, you may just download the archive without H5 files."
else
echo "${version} is not supported now."
exit 1;
@@ -85,12 +59,7 @@ if [ -f ${tar_path} ]; then
fi

if [ ! -f ${tar_path} ]; then
if ! which wget >/dev/null; then
echo "$0: wget is not installed."
exit 1;
fi
echo "$0: downloading data from ${url}. This may take some time, please be patient."
download_from_google_drive.sh ${url} ${data} tar.gz || exit 1
echo ${instructions}
fi

if ! tar -zxvf ${tar_path} -d -C ${data}; then
5 changes: 4 additions & 1 deletion egs2/README.md
@@ -90,7 +90,9 @@ See: https://espnet.github.io/espnet/espnet2_tutorial.html#recipes-using-espnet2
| kss | Korean single speaker corpus | TTS | KOR | https://www.kaggle.com/bryanpark/korean-single-speaker-speech-dataset | |
| l3das22 | L3DAS22: Machine Learning for 3D Audio Signal Processing - ICASSP 2022 | SE | ENG | https://www.l3das.com/icassp2022/ | |
| laborotv | LaboroTVSpeech (A large-scale Japanese speech corpus on TV recordings) | ASR | JPN | https://laboro.ai/column/eg-laboro-tv-corpus-jp | |
| librispeech | Librilight-limited subset | ASR | ENG | https://dl.fbaipublicfiles.com/librilight/data/librispeech_finetuning.tgz | |
| libriheavy_medium | Libriheavy medium subset | ASR | ENG | https://github.com/k2-fsa/libriheavy | |
| libriheavy_small | Libriheavy small subset | ASR | ENG | https://github.com/k2-fsa/libriheavy | |
| librilight_limited | Librilight-limited subset | ASR | ENG | https://dl.fbaipublicfiles.com/librilight/data/librispeech_finetuning.tgz | |
| librimix | LibriMix: An Open-Source Dataset for Generalizable Speech Separation | SE/DIAR | ENG | https://github.com/JorisCos/LibriMix | |
| librispeech | LibriSpeech ASR corpus | ASR | ENG | http://www.openslr.org/12 | |
| librispeech_100 | LibriSpeech ASR corpus 100h subset | ASR | ENG | http://www.openslr.org/12 | |
@@ -152,6 +154,7 @@ See: https://espnet.github.io/espnet/espnet2_tutorial.html#recipes-using-espnet2
| snips | SNIPS: A dataset for spoken language understanding | SLU | ENG | https://github.com/sonos/spoken-language-understanding-research-datasets | |
| speechcommands | Speech Commands: A Dataset for Limited-Vocabulary Speech Recognition | SLU | ENG | https://www.tensorflow.org/datasets/catalog/speech_commands | |
| spgispeech | SPGISpeech 5k corpus | ASR | ENG | https://datasets.kensho.com/datasets/scribe | |
| spring_speech | SPRING-INX: Data for Indian Languages | ASR | ENG | https://asr.iitm.ac.in/dataset | |
| stop | STOP: Spoken Task Oriented Parsing | SLU | ENG | https://facebookresearch.github.io/spoken_task_oriented_parsing/ | |
| su_openslr36 | Sundanese | ASR | SUN | http://www.openslr.org/36 | |
| swbd | Switchboard Corpus for 2-channel Conversational Telephone Speech (300h) | ASR | ENG | https://catalog.ldc.upenn.edu/LDC97S62 | |
2 changes: 2 additions & 0 deletions egs2/TEMPLATE/asr1/db.sh
@@ -60,6 +60,7 @@ LABOROTV=
TEDXJP=
LIBRISPEECH=downloads
LIBRILIGHT_LIMITED=
LIBRILIGHT=downloads
FSC=
FREESOUND=
MELD=downloads
@@ -206,6 +207,7 @@ KIRITAN=
NAMINE=
KATHBATH=downloads
GRAMVAANI=downloads
SPRING_INX=downloads
VOXCELEB=
KSPONSPEECH=

60 changes: 60 additions & 0 deletions egs2/TEMPLATE/asr1/pyscripts/utils/calculate_eer_mindcf.py
@@ -0,0 +1,60 @@
import sys
from typing import List, Tuple

import numpy as np

from espnet2.utils.eer import ComputeErrorRates, ComputeMinDcf, tuneThresholdfromScore


def load_scorefile(scorefile: str) -> Tuple[List[float], List[int]]:
    with open(scorefile, "r") as f:
        lines = f.readlines()
    scores, labels = [], []
    for line in lines:
        # each line: <trial-id> <score> <label>
        _, score, label = line.strip().split(" ")
        scores.append(float(score))
        labels.append(int(label))

    return scores, labels


def main(args):
    scorefile = args[0]
    out_dir = args[1]

    # get scores and labels
    scores, labels = load_scorefile(scorefile)

    # calculate statistics in target and nontarget classes.
    n_trials = len(scores)
    scores_trg = []
    scores_nontrg = []
    for _s, _l in zip(scores, labels):
        if _l == 1:
            scores_trg.append(_s)
        elif _l == 0:
            scores_nontrg.append(_s)
        else:
            raise ValueError(f"{_l}, {type(_l)}")
    trg_mean = float(np.mean(scores_trg))
    trg_std = float(np.std(scores_trg))
    nontrg_mean = float(np.mean(scores_nontrg))
    nontrg_std = float(np.std(scores_nontrg))

    # predictions, ground truth, and the false acceptance rates to calculate
    results = tuneThresholdfromScore(scores, labels, [1, 0.1])
    eer = results[1]
    fnrs, fprs, thresholds = ComputeErrorRates(scores, labels)

    # p_target, c_miss, and c_falsealarm in NIST minDCF calculation
    p_trg, c_miss, c_fa = 0.05, 1, 1
    mindcf, _ = ComputeMinDcf(fnrs, fprs, thresholds, p_trg, c_miss, c_fa)

    with open(out_dir, "w") as f:
        f.write(f"trg_mean: {trg_mean}, trg_std: {trg_std}\n")
        f.write(f"nontrg_mean: {nontrg_mean}, nontrg_std: {nontrg_std}\n")
        f.write(f"eer: {eer}, mindcf: {mindcf}\n")


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
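
The score file read above is three space-separated fields per line, `<trial-id> <score> <label>`, with label 1 for target and 0 for non-target trials. For intuition about the two reported numbers (the script itself delegates the real computation to `espnet2.utils.eer`): EER is the error rate at the threshold where the miss rate FNR equals the false-alarm rate FPR, and minDCF is the NIST detection cost `c_miss * p_trg * FNR(t) + c_fa * (1 - p_trg) * FPR(t)`, minimized over thresholds `t` and normalized by the cost of the best trivial accept-all/reject-all system. A hedged NumPy sketch, assuming higher scores mean "target" and both classes are present:

```python
# Conceptual EER / minDCF sketch; not the espnet2.utils.eer implementation.
import numpy as np


def eer_and_mindcf(scores, labels, p_trg=0.05, c_miss=1.0, c_fa=1.0):
    scores, labels = np.asarray(scores, float), np.asarray(labels, int)
    thresholds = np.sort(scores)
    # Miss rate and false-alarm rate at each candidate threshold ("accept if >= t").
    fnr = np.array([np.mean(scores[labels == 1] < t) for t in thresholds])
    fpr = np.array([np.mean(scores[labels == 0] >= t) for t in thresholds])
    i = np.argmin(np.abs(fnr - fpr))
    eer = (fnr[i] + fpr[i]) / 2  # where misses and false alarms cross
    c_det = c_miss * p_trg * fnr + c_fa * (1 - p_trg) * fpr
    c_def = min(c_miss * p_trg, c_fa * (1 - p_trg))  # best trivial system
    return eer, float(np.min(c_det) / c_def)
```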
67 changes: 67 additions & 0 deletions egs2/TEMPLATE/asr1/pyscripts/utils/generate_cohort_list.py
@@ -0,0 +1,67 @@
import sys

import numpy as np
import soundfile as sf
import yaml

np.random.seed(0)


def load_yaml(yamlfile):
    with open(yamlfile, "r") as stream:
        data = yaml.safe_load(stream)
    return data


def main(args):
    spk2utt = args[0]
    wav_scp = args[1]
    out_dir = args[2]
    cfg = load_yaml(args[3])
    samp_rate = args[4][:-1]
    print(cfg)
    with open(wav_scp) as f:
        lines = f.readlines()
    wav2dir_dic = {
        line.strip().split(" ")[0]: line.strip().split(" ")[1] for line in lines
    }

    with open(spk2utt, "r") as f:
        spk2utt = f.readlines()[: cfg["num_cohort_spk"]]

    utt_list = []
    trg_samp = int(cfg["target_duration"] * int(samp_rate) * 1000)

    for spk in spk2utt:
        chunk = spk.strip().split(" ")
        spk = chunk[0]
        utts = sorted(chunk[1:])
        np.random.shuffle(utts)
        n_selected = 0
        for utt in utts:
            utt_file = wav2dir_dic[utt]
            dur = sf.info(utt_file).duration
            if dur >= cfg["utt_select_sec"]:
                utt_list.append(utt)
                n_selected += 1
            if n_selected == cfg["num_utt_per_spk"]:
                break
    print(f"Cohort utterances selected, {len(utt_list)} utts, {len(spk2utt)} spks")

    # generate output adequate to ESPnet-SPK inference template
    utt_list1 = utt_list[: len(utt_list) // 2]
    utt_list2 = utt_list[len(utt_list) // 2 :]
    with open(out_dir + "/cohort.scp", "w") as f_coh, open(
        out_dir + "/cohort2.scp", "w"
    ) as f_coh2, open(out_dir + "/cohort_speech_shape", "w") as f_shape, open(
        out_dir + "/cohort_label", "w"
    ) as f_lbl:
        for utt1, utt2 in zip(utt_list1, utt_list2):
            f_coh.write(f"{utt1}*{utt2} {wav2dir_dic[utt1]}\n")
            f_coh2.write(f"{utt1}*{utt2} {wav2dir_dic[utt2]}\n")
            f_shape.write(f"{utt1}*{utt2} {trg_samp}\n")
            f_lbl.write(f"{utt1}*{utt2} 0\n")


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
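
The YAML config passed as the fourth argument drives the cohort selection. The four keys below are the ones the script actually reads; the values shown are invented placeholders, not recipe defaults. A sketch of the equivalent settings and the sample-count arithmetic (a sample-rate argument such as `16k` has its trailing `k` stripped before the conversion):

```python
# The four keys read by generate_cohort_list.py; VALUES here are invented
# placeholders, not recipe defaults.
cohort_cfg = {
    "num_cohort_spk": 400,   # speakers taken from the top of spk2utt
    "num_utt_per_spk": 2,    # utterances kept per cohort speaker
    "utt_select_sec": 3.0,   # minimum duration (s) for a candidate utterance
    "target_duration": 3.0,  # target length in seconds
}
# "16k" -> 16, so the length written to cohort_speech_shape is
# 3.0 * 16 * 1000 = 48000 samples.
trg_samp = int(cohort_cfg["target_duration"] * int("16k"[:-1]) * 1000)
```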
You are viewing a condensed version of this merge commit.