Merge pull request #5478 from jctian98/owsm_data
add centralized data preparation for OWSM
sw005320 authored Dec 5, 2023
2 parents 35b8f01 + ee00c6c commit a45a53c
Showing 128 changed files with 3,354 additions and 94 deletions.
7 changes: 6 additions & 1 deletion egs/wsj/asr1/local/wsj_format_data.sh
@@ -14,6 +14,7 @@
# We'll create train_si84 after doing the feature extraction.

lang_suffix=
whisper_text_norm=false

echo "$0 $@" # Print the command line for logging
. utils/parse_options.sh || exit 1;
@@ -26,7 +27,11 @@ srcdir=data/local/data
for x in train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
mkdir -p data/$x
cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1;
cp $srcdir/$x.txt data/$x/text || exit 1;
if $whisper_text_norm; then
python3 local/prepare_transcription_whisper.py -i $srcdir/$x.trans1 -o data/$x/text
else
cp $srcdir/$x.txt data/$x/text || exit 1;
fi
cp $srcdir/$x.spk2utt data/$x/spk2utt || exit 1;
cp $srcdir/$x.utt2spk data/$x/utt2spk || exit 1;
utils/filter_scp.pl data/$x/spk2utt $srcdir/spk2gender > data/$x/spk2gender || exit 1;
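With this change, the WSJ transcripts can optionally be passed through Whisper-style text normalization. A minimal usage sketch (the new flag is read by `utils/parse_options.sh`; surrounding recipe stages are omitted):

```bash
# Default: copy the Kaldi transcripts unchanged
local/wsj_format_data.sh

# Apply Whisper-style text normalization via the new Python helper
local/wsj_format_data.sh --whisper_text_norm true
```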
6 changes: 5 additions & 1 deletion egs2/TEMPLATE/asr1/db.sh
@@ -18,6 +18,7 @@ AR_SC=
AUDIOSET=
ASVSpoof_CMD=
BIBLETTS=downloads
COVOST2=
DIRHA_ENGLISH_PHDEV=
DIRHA_WSJ=
DIRHA_WSJ_PROCESSED="${PWD}/data/local/dirha_wsj_processed" # Output file path
@@ -67,7 +68,6 @@ SLURP=
SLURP_S= # Output file path
LIBRITRANS_S= # Output file path
VOICES=downloads
VOXCELEB=
MAGICDATA=downloads
MEDIASPEECH=downloads
MINI_LIBRISPEECH=downloads
@@ -99,6 +99,7 @@ SPGISPEECH=
SPEECH_PROMPT_v2=
STOP=
SWBD=
FISHER_CALLHOME_SPANISH=
SWBD_NXT=
THCHS30=downloads
TIMIT=
@@ -150,6 +151,7 @@ RU_OPEN_STT=downloads
RUSLAN=downloads
SIWIS=downloads
GIGASPEECH=
GIGAST=
GOOGLEI18N=downloads
NOISY_SPEECH=
NOISY_REVERBERANT_SPEECH=
@@ -207,6 +209,8 @@ KATHBATH=downloads
GRAMVAANI=downloads
SPRING_INX=downloads
VOXCELEB=
KSPONSPEECH=

# For only CMU TIR environment
if [[ "$(hostname)" == tir* ]]; then
BABEL_101=/projects/tir5/data/speech_corpora/babel/IARPA_BABEL_BP_101/
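The entries added here (`COVOST2`, `FISHER_CALLHOME_SPANISH`, `GIGAST`, `KSPONSPEECH`; `VOXCELEB` is only moved) default to empty and must point at local corpus copies before the OWSM preparation can use them. An illustrative sketch with placeholder paths:

```bash
# In egs2/TEMPLATE/asr1/db.sh -- set these to wherever the corpora live:
COVOST2=/path/to/covost2
FISHER_CALLHOME_SPANISH=/path/to/fisher_callhome_spanish
GIGAST=/path/to/GigaST
KSPONSPEECH=/path/to/KsponSpeech
```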
6 changes: 5 additions & 1 deletion egs2/ksponspeech/asr1/local/data.sh
@@ -5,6 +5,10 @@ set -e
set -u
set -o pipefail

. ./path.sh || exit 1;
. ./cmd.sh || exit 1;
. ./db.sh || exit 1;

log() {
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
@@ -15,7 +19,7 @@ stage=1
stop_stage=100

#data
datadir=/ocean/projects/cis210027p/shared/corpora/KsponSpeech/KsponSpeech/
datadir=${KSPONSPEECH}
# KsponSpeech
# |_ KsponSpeech_01/
# |_ KsponSpeech_02/
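With `datadir` now resolved from `db.sh` instead of a hard-coded cluster path, the recipe is configured once and run as usual. A minimal sketch (placeholder path):

```bash
# In db.sh:
KSPONSPEECH=/path/to/KsponSpeech

# Then, from egs2/ksponspeech/asr1:
./local/data.sh
```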
3 changes: 0 additions & 3 deletions egs2/mixed_v3/s2t1/README.md

This file was deleted.

271 changes: 195 additions & 76 deletions egs2/open_li110/asr1/local/data.sh

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions egs2/owsm_v1/s2t1/README.md
@@ -0,0 +1,8 @@
### Guidance for data preparation
(1) Please work progressively from v1 to v3: this means you need to prepare data for v1, v2 and v3 in order to obtain the full v3 data. To start the data preparation, run `bash local/data.sh --VERSION v1 # or v2, v3`
(2) Please revise `db.sh` for all datasets before running `local/data.sh`. Some datasets cannot be downloaded and untarred automatically due to license issues; users should handle those datasets manually.
(3) Due to the large volume of data, we cannot guarantee that the scripts will run smoothly for every dataset. Please raise an issue if you believe there is a bug.
(4) This script only prepares data for the train and valid subsets. Test data should be prepared separately following the conventional ESPnet2 format.
(5) Even though we provide this centralized data preparation script and combine all datasets in it, we strongly recommend that users NOT use the merged train_v* and valid_v* sets for feature extraction. Instead, run stages 2-4 for each dataset separately and then combine all datasets under the `dump/raw` directory (see the sketch after this list). You can still handle all datasets simultaneously, and inspection and debugging become easier. This is exactly what we did in our experiments.
(6) Users can also refer to this PR to check more details: https://github.com/espnet/espnet/pull/5478
(7) The detailed data list is in `local/data.sh`. Also see: https://arxiv.org/pdf/2309.13876.pdf
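Below is a sketch of the per-dataset workflow recommended in (5). The option names follow the usual ESPnet2 recipe interface, but the set names are illustrative (e.g., GigaSpeech uses `XL`/`DEV`), so adapt them to the actual directories produced under `data/`:

```bash
# Dump each dataset separately (stages 2-4), then combine under dump/raw.
for dset in AISHELL-1 CoVoST2 GigaSpeech; do   # ...and so on for the full list
    ./run.sh --stage 2 --stop_stage 4 \
        --train_set "${dset}/train" --valid_set "${dset}/dev"
done
```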
10 files renamed without changes.
220 changes: 220 additions & 0 deletions egs2/owsm_v1/s2t1/local/data.sh
@@ -0,0 +1,220 @@
#!/usr/bin/env bash
# Copyright 2023 Carnegie Mellon University (Jinchuan Tian)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

# Centralized data preparation for OWSM (https://arxiv.org/abs/2309.13876)
# Details of this script are also in: https://github.com/espnet/espnet/pull/5478/

# Guidance for data preparation
# (1) Please work progressively from v1 to v3: this means you need to prepare
#     data for v1, v2 and v3 in order to obtain the full v3 data. To start the
#     data preparation, run `bash local/data.sh --VERSION v1 # or v2, v3`
# (2) Please revise `db.sh` for all datasets before running `local/data.sh`.
#     Some datasets cannot be downloaded and untarred automatically due to
#     license issues; users should handle those datasets manually.
# (3) Due to the large volume of data, we cannot guarantee that the scripts
#     will run smoothly for every dataset. Please raise an issue if you
#     believe there is a bug.
# (4) This script only prepares data for the train and valid subsets.
#     Test data should be prepared separately following the conventional
#     ESPnet2 format.
# (5) Even though we provide this centralized data preparation script and
#     combine all datasets in it, we strongly recommend that users NOT use
#     the merged train_v* and valid_v* sets for feature extraction. Instead,
#     run stages 2-4 for each dataset separately and combine all datasets
#     under the `dump/raw` directory. You can still handle all datasets
#     simultaneously, and inspection and debugging become easier. This is
#     exactly what we did in our experiments.

set -e
set -u
set -o pipefail

. ./path.sh || exit 1;
. ./db.sh || exit 1;

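# Copied from utils/fix_data_dir.sh: keep extra text files sorted and unique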
function check_sorted {
    file=$1
    sort -k1,1 -u <$file >$file.tmp
    if ! cmp -s $file $file.tmp; then
        echo "$0: file $1 is not in sorted order or not unique, sorting it"
        mv $file.tmp $file
    else
        rm $file.tmp
    fi
}

log() {
    local fname=${BASH_SOURCE[1]##*/}
    echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
SECONDS=0

VERSION=v1 # specify v1, v2 or v3
stage=1
stop_stage=2

. utils/parse_options.sh

# Change the lists below accordingly if you only want to prepare a subset of the data
if [ ${VERSION} = "v1" ]; then
    datasets="aishell covost2 gigaspeech librispeech must-c spgispeech"
    train_sets="data/AISHELL-1/train \
        data/CoVoST2/train \
        data/GigaSpeech/XL \
        data/LibriSpeech/train-clean-100 \
        data/LibriSpeech/train-clean-360 \
        data/LibriSpeech/train-other-500 \
        data/MuST-C_v1.2/train \
        data/MuST-C_v2/train \
        data/MuST-C_v3/train \
        data/SPGISpeech/train \
        data/TEDLIUM3/train"
    valid_sets="data/AISHELL-1/dev \
        data/CoVoST2/dev \
        data/GigaSpeech/DEV \
        data/LibriSpeech/dev-clean \
        data/LibriSpeech/dev-other \
        data/MuST-C_v1.2/dev \
        data/MuST-C_v2/dev \
        data/MuST-C_v3/dev \
        data/SPGISpeech/val \
        data/TEDLIUM3/dev"

elif [ ${VERSION} = "v2" ]; then
    datasets="gigast multilingual_librispeech wenetspeech"
    train_sets="data/GigaST/XL.en-* \
        data/MLS/train.* \
        data/WenetSpeech/L"
    valid_sets="data/MLS/dev.* \
        data/WenetSpeech/DEV"

# Note(jinchuan):
# AMI & SWBD & BABEL & OpenSLR: the original egs2/ami/asr1/local/data.sh does not run smoothly.
# AMI & VoxForge: the original transcripts are all upper-case; we convert them to lower case.
# ReazonSpeech: no official train/valid/test splits, so we generate our own.
elif [ ${VERSION} = "v3" ]; then
    datasets="aidatatang ami commonvoice swbd fisher_callhome \
        fleurs ksponspeech magicdata reazonspeech ru_open_stt \
        vctk voxpopuli wsj voxforge babel openslr"
    train_sets="data/aidatatang/train_whisper \
        data/ami/ihm_train_whisper \
        data/CommonVoice/train \
        data/swbd/train_nodup_whisper \
        data/swbd/train_fisher_whisper \
        data/fisher_callhome/train_whisper \
        data/FLEURS/train \
        data/ksponspeech/train_whisper \
        data/magicdata/train_whisper \
        data/ReazonSpeech/train \
        data/ru_open_stt/train_whisper \
        data/vctk/tr_no_dev_whisper \
        data/VoxPopuli/train \
        data/wsj/train_si284_whisper \
        data/voxforge/tr \
        data/babel/train \
        data/openslr/train"
    valid_sets="data/aidatatang/dev_whisper \
        data/ami/ihm_dev_whisper \
        data/CommonVoice/dev \
        data/swbd/train_dev_whisper \
        data/fisher_callhome/dev_whisper \
        data/FLEURS/valid \
        data/ksponspeech/dev_whisper \
        data/magicdata/dev_whisper \
        data/ReazonSpeech/valid \
        data/ru_open_stt/dev_whisper \
        data/vctk/dev_whisper \
        data/VoxPopuli/dev \
        data/wsj/test_dev93_whisper \
        data/voxforge/dt \
        data/babel/dev \
        data/openslr/dev"
else
    echo "Invalid version argument ${VERSION}." && exit 1;
fi
echo "Preparing data for OWSM with version ${VERSION}"
echo "Datasets to prepare: ${datasets}"

utt_extra_files="text.prev text.ctc"
train_out=data/train_${VERSION}
valid_out=data/valid_${VERSION}

# v3 data adopts ISO-639-3 language IDs
if [ ! -d ./iso639 ] && [ ${VERSION} = "v3" ]; then
    echo "installing ISO-639 dependency"
    git clone https://github.com/noumar/iso639
    cd iso639; python3 setup.py install || exit 1;
    cd ..
fi
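# Hypothetical sanity check for the dependency above; the exact calls used
# by local/filter_lang_id.py may differ:
#   python3 -c 'from iso639 import languages; print(languages.get(part1="en").part3)'  # -> eng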

# call data preparation script for each dataset
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    for dataset in ${datasets}; do
        if [ -f data/.${dataset}.done ]; then
            echo "${dataset} has been processed. Skip!"
        else
            if [ ! -f ./local/prepare_${dataset}.sh ]; then
                echo "script for ${dataset} is not found." && exit 1;
            fi
            echo "preparing ${dataset} dataset ..."
            ./local/prepare_${dataset}.sh || \
                { echo "preparing ${dataset} failed"; exit 1; }
            touch data/.${dataset}.done
        fi
    done
fi
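# Note: the data/.<dataset>.done marker files make stage 1 resumable.
# To force re-preparation of a single dataset, remove its marker and
# re-run stage 1, e.g.:
#   rm data/.covost2.done
#   bash local/data.sh --VERSION v1 --stage 1 --stop_stage 1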

# combine all datasets.
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then

    if [ ${VERSION} = "v2" ]; then
        if [ ! -d data/train_v1 ] || [ ! -d data/valid_v1 ]; then
            echo "Cannot find v1 data. Copying it ..."
            cp -r ../../owsm_v1/s2t1/data/{train,valid}_v1/ ./data || exit 1;
        fi
        train_sets="${train_sets} data/train_v1"
        valid_sets="${valid_sets} data/valid_v1"
    fi

    if [ ${VERSION} = "v3" ]; then
        if [ ! -d data/train_v2 ] || [ ! -d data/valid_v2 ]; then
            echo "Cannot find v2 data. Copying it ..."
            cp -r ../../owsm_v2/s2t1/data/{train,valid}_v2/ ./data || exit 1;
        fi
        train_sets="${train_sets} data/train_v2"
        valid_sets="${valid_sets} data/valid_v2"

        # v3 adopts ISO-639-3 language IDs,
        # so change all language IDs in v2 to ISO-639-3 before merging
        for part in train valid; do
            if [ ! -f data/${part}_v2/text_raw ]; then
                mv data/${part}_v2/text data/${part}_v2/text_raw || exit 1;
                python3 local/filter_lang_id.py \
                    -i data/${part}_v2/text_raw -o data/${part}_v2/text || exit 1;
            fi
        done
    fi

    # Combine valid
    utils/combine_data.sh --skip_fix true --extra-files "${utt_extra_files}" \
        ${valid_out} ${valid_sets} || exit 1;
    # NOTE(yifan): extra text files must be sorted and unique
    for f in ${utt_extra_files}; do
        check_sorted ${valid_out}/${f}
    done
    utils/fix_data_dir.sh --utt_extra_files "${utt_extra_files}" ${valid_out} || exit 1;
    utils/validate_data_dir.sh --no-feats --non-print ${valid_out} || exit 1;

    # Combine train
    utils/combine_data.sh --skip_fix true --extra-files "${utt_extra_files}" \
        ${train_out} ${train_sets} || exit 1;
    # NOTE(yifan): extra text files must be sorted and unique
    for f in ${utt_extra_files}; do
        check_sorted ${train_out}/${f}
    done
    utils/fix_data_dir.sh --utt_extra_files "${utt_extra_files}" ${train_out} || exit 1;
    utils/validate_data_dir.sh --no-feats --non-print ${train_out} || exit 1;
fi

log "Successfully finished. [elapsed=${SECONDS}s]"
5 files renamed without changes.
@@ -6,6 +6,7 @@ set -u
set -o pipefail

. ./path.sh || exit 1;
. ./db.sh || exit 1;

# Copied from utils/fix_data_dir.sh
function check_sorted {
@@ -25,7 +26,7 @@ log() {
}
SECONDS=0

data_dir=/scratch/bbjs/peng6/corpora/AISHELL-1
data_dir=${AISHELL}
prefix=AISHELL-1
output_dir=data/AISHELL-1
splits="dev train"
File renamed without changes.
@@ -6,6 +6,7 @@ set -u
set -o pipefail

. ./path.sh || exit 1;
. ./db.sh || exit 1;

# Copied from utils/fix_data_dir.sh
function check_sorted {
@@ -25,7 +26,7 @@ log() {
}
SECONDS=0

data_dir=/scratch/bbjs/peng6/espnet-whisper-public/egs2/covost2/st1/data
data_dir=${COVOST2}
prefix=CoVoST2
output_dir=data/CoVoST2
splits="dev train"
@@ -6,6 +6,7 @@ set -u
set -o pipefail

. ./path.sh || exit 1;
. ./db.sh || exit 1;

# Copied from utils/fix_data_dir.sh
function check_sorted {
@@ -25,7 +26,7 @@ log() {
}
SECONDS=0

data_dir=/scratch/bbjs/peng6/corpora/GigaSpeech
data_dir=${GIGASPEECH}
prefix=GigaSpeech
output_dir=data/GigaSpeech
splits="DEV XL"
@@ -6,6 +6,7 @@ set -u
set -o pipefail

. ./path.sh || exit 1;
. ./db.sh || exit 1;

# Copied from utils/fix_data_dir.sh
function check_sorted {
@@ -25,7 +26,7 @@ log() {
}
SECONDS=0

data_dir=/scratch/bbjs/peng6/corpora/librispeech_full/LibriSpeech
data_dir=${LIBRISPEECH}
prefix=LibriSpeech
output_dir=data/LibriSpeech
splits="dev-clean dev-other train-clean-100 train-clean-360 train-other-500"
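The four preparation scripts above (AISHELL-1, CoVoST2, GigaSpeech, LibriSpeech) all receive the same change: the corpus root is now resolved from `db.sh` rather than a hard-coded cluster path. A minimal sketch of the shared pattern (variable names as in `db.sh`; the path is a placeholder):

```bash
#!/usr/bin/env bash
. ./path.sh || exit 1;
. ./db.sh || exit 1;   # defines AISHELL, COVOST2, GIGASPEECH, LIBRISPEECH, ...

data_dir=${LIBRISPEECH}      # e.g. /path/to/LibriSpeech, set in db.sh
prefix=LibriSpeech
output_dir=data/LibriSpeech
```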
File renamed without changes.