add centralized data preparation for OWSM #5478

Merged · 31 commits · Dec 5, 2023
Changes from 2 commits
Commits
3be1f40
add whisper data.sh for v1 and v2
jctian98 Oct 17, 2023
37ab173
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 17, 2023
92bf631
add OWSM v3 data recipe
jctian98 Oct 17, 2023
ac8e423
Merge commit 'FETCH_HEAD' into owsm_data
jctian98 Oct 17, 2023
a3c24bd
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 17, 2023
063dc3f
fix ci issues
jctian98 Oct 18, 2023
5e14a62
update with ci issues
jctian98 Oct 18, 2023
7b707cd
change egs name from mixed_v* to owsm_v*
jctian98 Oct 23, 2023
14204e2
v3 should be ready except wsj
jctian98 Oct 30, 2023
ae05a6c
add wsj
jctian98 Oct 30, 2023
c515f76
update db.sh
jctian98 Oct 30, 2023
b53ce47
Merge branch 'master' into owsm_data
jctian98 Oct 30, 2023
ec109e2
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 30, 2023
31ad173
almost finish all scripts
jctian98 Nov 10, 2023
8a09625
fix small problems
jctian98 Nov 10, 2023
952acf6
Merge commit 'FETCH_HEAD' into owsm_data
jctian98 Nov 10, 2023
2fd2668
merge master
jctian98 Nov 10, 2023
c53afd0
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 10, 2023
bdaf344
update the language mapping
jctian98 Nov 11, 2023
d379fd0
Merge commit 'FETCH_HEAD' into owsm_data
jctian98 Nov 11, 2023
b2cb427
Merge branch 'master' into owsm_data
jctian98 Nov 11, 2023
f5e5414
Merge commit 'FETCH_HEAD' into owsm_data
jctian98 Nov 11, 2023
51e3691
fix CI issue
jctian98 Nov 11, 2023
7f75d15
Merge commit 'FETCH_HEAD' into owsm_data
jctian98 Nov 11, 2023
66176bc
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 11, 2023
3d89d78
update wsj and commonvoice
jctian98 Nov 26, 2023
8f1e0fa
Merge commit 'FETCH_HEAD' into owsm_data
jctian98 Nov 26, 2023
77fe14b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 26, 2023
c391765
update wsj text norm script
jctian98 Nov 26, 2023
642fd22
update wsj text norm 2
jctian98 Nov 26, 2023
ee00c6c
revise voxpopuli
jctian98 Nov 29, 2023
157 changes: 157 additions & 0 deletions egs2/mixed_v1/s2t1/local/data.sh
@@ -0,0 +1,157 @@
#!/usr/bin/env bash
# Set bash to 'debug' mode; it will exit on:
# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline', -x 'print commands'.

# Centralized data preparation for OWSM (https://arxiv.org/abs/2309.13876)

# Note (jinchuan):
# (1) please work progressively from v1 to v3: you need to
# prepare data for v1, v2, and v3 in order to obtain the full v3 data.
# (2) please revise db.sh for all datasets before running this script.
# Some datasets cannot be downloaded and untarred automatically due to
# license issues. Please take care of them in advance.
# (3) Due to the large volume of data, we cannot ensure the scripts
# will run smoothly for every dataset. Please raise an issue if you
# believe there is a bug.
# (4) This script only prepares data for train and valid. Test data
# should be prepared separately following the standard ESPnet2 format.
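#
# Example usage (illustrative only; options are parsed by utils/parse_options.sh below,
# so the flag names follow the variable names defined in this script):
# ./local/data.sh --VERSION v1 --stage 1 --stop_stage 2
# ./local/data.sh --VERSION v2 --stage 1 --stop_stage 2  # expects data/train_v1 and data/valid_v1
# ./local/data.sh --VERSION v3 --stage 1 --stop_stage 2  # expects data/train_v2 and data/valid_v2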

set -e
set -u
set -o pipefail

. ./path.sh || exit 1;
. ./db.sh || exit 1;

function check_sorted {
file=$1
sort -k1,1 -u <$file >$file.tmp
if ! cmp -s $file $file.tmp; then
echo "$0: file $1 is not in sorted order or not unique, sorting it"
mv $file.tmp $file
else
rm $file.tmp
fi
}

log() {
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
SECONDS=0

VERSION=v1 # specify v1, v2 or v3
stage=1
stop_stage=2

. utils/parse_options.sh

# Change the lists below accordingly if you only want to prepare a subset of the data
if [ ${VERSION} = "v1" ]; then
datasets="aishell covost2 gigaspeech librispeech must-c spgispeech"
train_sets="data/AISHELL-1/train \
data/CoVoST2/train \
data/GigaSpeech/XL \
data/LibriSpeech/train-clean-100 \
data/LibriSpeech/train-clean-360 \
data/LibriSpeech/train-other-500 \
data/MuST-C_v1.2/train \
data/MuST-C_v2/train \
data/MuST-C_v3/train \
data/SPGISpeech/train \
data/TEDLIUM3/train"
valid_sets="data/AISHELL-1/dev \
data/CoVoST2/dev \
data/GigaSpeech/DEV \
data/LibriSpeech/dev-clean \
data/LibriSpeech/dev-other \
data/MuST-C_v1.2/dev \
data/MuST-C_v2/dev \
data/MuST-C_v3/dev \
data/SPGISpeech/val \
data/TEDLIUM3/dev"

elif [ ${VERSION} = "v2" ]; then
datasets="gigast multilingual_librispeech wenetspeech"
train_sets="data/GigaST/XL.en-* \
data/MLS/train.* \
data/WenetSpeech/L"
# question (jinchuan): why not include GigaST-dev?
Collaborator:
I remember GigaST does not have DEV

Contributor Author:
Great. I'll remove this question

valid_sets="data/MLS/dev.* \
data/WenetSpeech/DEV"

elif [ ${VERSION} = "v3" ]; then
datasets="aidatatang ami babel commonvoice swbd fisher_callhome \
fleurs ksponspeech magicdata reazonspeech ru_open_stt \
vctk voxpopuli wsj"
# still working on it
train_sets="data/aidatatang/train \
"
valid_sets="data/aidatatang/dev \
"
else
echo "Invalid version argument ${VERSION}." && exit 1;
fi
echo "Preparing data for OSWM with version ${VERSION}"
echo "Datasets to prepare: ${datasets}"

utt_extra_files="text.prev text.ctc"
train_out=data/train_${VERSION}
valid_out=data/valid_${VERSION}

# call data preparation script for each dataset
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
for dataset in ${datasets}; do
if [ -f data/.${dataset}.done ]; then
echo ${dataset} has been processed. Skip!
else
echo preparing ${dataset} dataset ...
./local/prepare_${dataset}.sh && touch data/.${dataset}.done
fi
done
fi
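# Note (illustrative): stage 1 writes a marker file data/.<dataset>.done for each finished
# dataset; removing a marker (e.g. rm data/.librispeech.done) forces that dataset to be re-prepared.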

# combine all datasets.
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then

if [ ${VERSION} = "v2" ]; then
if [ ! -d data/train_v1 ] || [ ! -d data/valid_v1 ]; then
echo "Cannot find v1 data. Please link it here. Exit!" && exit 1;
fi
train_sets="${train_sets} data/train_v1"
valid_sets="${valid_sets} data/valid_v1"
fi
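# Illustrative only: if the earlier-version data was prepared in another recipe directory,
# symlinks such as the following (paths are placeholders) satisfy the checks above and below:
# ln -s /path/to/egs2/mixed_v1/s2t1/data/train_v1 data/train_v1
# ln -s /path/to/egs2/mixed_v1/s2t1/data/valid_v1 data/valid_v1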

if [ ${VERSION} = "v3" ]; then
if [ ! -d data/train_v2 ] || [ ! -d data/valid_v2 ]; then
echo "Cannot find v2 data. Please link it here. Exit!" && exit 1;
fi
train_sets="${train_sets} data/train_v2"
valid_sets="${valid_sets} data/valid_v2"
fi

# Combine valid
utils/combine_data.sh --skip_fix true --extra-files "${utt_extra_files}" \
${valid_out} ${valid_sets} || exit 1;
# NOTE(yifan): extra text files must be sorted and unique
for f in ${utt_extra_files}; do
check_sorted ${valid_out}/${f}
done
utils/fix_data_dir.sh --utt_extra_files "${utt_extra_files}" ${valid_out} || exit 1;
utils/validate_data_dir.sh --no-feats --non-print ${valid_out} || exit 1;

# Combine train
utils/combine_data.sh --skip_fix true --extra-files "${utt_extra_files}" \
${train_out} ${train_sets} || exit 1;
# NOTE(yifan): extra text files must be sorted and unique
for f in ${utt_extra_files}; do
check_sorted ${train_out}/${f}
done
utils/fix_data_dir.sh --utt_extra_files "${utt_extra_files}" ${train_out} || exit 1;
utils/validate_data_dir.sh --no-feats --non-print ${train_out} || exit 1;
fi

# todo: some v3-specific operations


log "Successfully finished. [elapsed=${SECONDS}s]"
3 changes: 2 additions & 1 deletion egs2/mixed_v1/s2t1/local/prepare_aishell.sh
@@ -6,6 +6,7 @@ set -u
set -o pipefail

. ./path.sh || exit 1;
. ./db.sh || exit 1;

# Copied from utils/fix_data_dir.sh
function check_sorted {
@@ -25,7 +26,7 @@ log() {
}
SECONDS=0

data_dir=/scratch/bbjs/peng6/corpora/AISHELL-1
data_dir=${AISHELL}
prefix=AISHELL-1
output_dir=data/AISHELL-1
splits="dev train"
3 changes: 2 additions & 1 deletion egs2/mixed_v1/s2t1/local/prepare_covost2.sh
@@ -6,6 +6,7 @@ set -u
set -o pipefail

. ./path.sh || exit 1;
. ./db.sh || exit 1;

# Copied from utils/fix_data_dir.sh
function check_sorted {
@@ -25,7 +26,7 @@ log() {
}
SECONDS=0

data_dir=/scratch/bbjs/peng6/espnet-whisper-public/egs2/covost2/st1/data
data_dir=${COVOST2}
prefix=CoVoST2
output_dir=data/CoVoST2
splits="dev train"
3 changes: 2 additions & 1 deletion egs2/mixed_v1/s2t1/local/prepare_gigaspeech.sh
@@ -6,6 +6,7 @@ set -u
set -o pipefail

. ./path.sh || exit 1;
. ./db.sh || exit 1;

# Copied from utils/fix_data_dir.sh
function check_sorted {
@@ -25,7 +26,7 @@ log() {
}
SECONDS=0

data_dir=/scratch/bbjs/peng6/corpora/GigaSpeech
data_dir=${GIGASPEECH}
prefix=GigaSpeech
output_dir=data/GigaSpeech
splits="DEV XL"
3 changes: 2 additions & 1 deletion egs2/mixed_v1/s2t1/local/prepare_librispeech.sh
@@ -6,6 +6,7 @@ set -u
set -o pipefail

. ./path.sh || exit 1;
. ./db.sh || exit 1;

# Copied from utils/fix_data_dir.sh
function check_sorted {
@@ -25,7 +26,7 @@ log() {
}
SECONDS=0

data_dir=/scratch/bbjs/peng6/corpora/librispeech_full/LibriSpeech
data_dir=${LIBRISPEECH}
prefix=LibriSpeech
output_dir=data/LibriSpeech
splits="dev-clean dev-other train-clean-100 train-clean-360 train-other-500"
3 changes: 2 additions & 1 deletion egs2/mixed_v1/s2t1/local/prepare_must-c.sh
@@ -6,6 +6,7 @@ set -u
set -o pipefail

. ./path.sh || exit 1;
. ./db.sh || exit 1;

# Copied from utils/fix_data_dir.sh
function check_sorted {
@@ -25,7 +26,7 @@ log() {
}
SECONDS=0

data_dir=/scratch/bbjs/peng6/corpora/MuST-C_v1.2
data_dir=${MUST_C}
prefix=$(basename ${data_dir})
output_dir=data/${prefix}
splits="dev train"
3 changes: 2 additions & 1 deletion egs2/mixed_v1/s2t1/local/prepare_spgispeech.sh
@@ -6,6 +6,7 @@ set -u
set -o pipefail

. ./path.sh || exit 1;
. ./db.sh || exit 1;

# Copied from utils/fix_data_dir.sh
function check_sorted {
@@ -25,7 +26,7 @@ log() {
}
SECONDS=0

data_dir=/scratch/bbjs/peng6/corpora/SPGISpeech
data_dir=${SPGISPEECH}
prefix=SPGISpeech
output_dir=data/SPGISpeech
splits="val train"
3 changes: 2 additions & 1 deletion egs2/mixed_v1/s2t1/local/prepare_tedlium.sh
@@ -6,6 +6,7 @@ set -u
set -o pipefail

. ./path.sh || exit 1;
. ./db.sh || exit 1;

# Copied from utils/fix_data_dir.sh
function check_sorted {
@@ -25,7 +26,7 @@ log() {
}
SECONDS=0

data_dir=/scratch/bbjs/peng6/corpora/TEDLIUM/TEDLIUM_release-3/legacy/
data_dir=${TEDLIUM3}
prefix=TEDLIUM3
output_dir=data/TEDLIUM3
splits="dev train"
1 change: 1 addition & 0 deletions egs2/mixed_v2/s2t1/local/data.sh
5 changes: 3 additions & 2 deletions egs2/mixed_v2/s2t1/local/prepare_gigast.sh
@@ -6,6 +6,7 @@ set -u
set -o pipefail

. ./path.sh || exit 1;
. ./db.sh || exit 1;

# Copied from utils/fix_data_dir.sh
function check_sorted {
@@ -25,8 +26,8 @@ log() {
}
SECONDS=0

gigaspeech_dir=/scratch/bbjs/peng6/corpora/GigaSpeech
gigast_dir=/scratch/bbjs/peng6/corpora/GigaST
gigaspeech_dir=${GIGASPEECH}
gigast_dir=${GIGAST}
prefix=GigaST
output_dir=data/GigaST
languages="de zh"
3 changes: 2 additions & 1 deletion egs2/mixed_v2/s2t1/local/prepare_multilingual_librispeech.sh
@@ -6,6 +6,7 @@ set -u
set -o pipefail

. ./path.sh || exit 1;
. ./db.sh || exit 1;

# Copied from utils/fix_data_dir.sh
function check_sorted {
@@ -25,7 +26,7 @@ log() {
}
SECONDS=0

mls_dir=/scratch/bbjs/peng6/corpora/multilingual_librispeech
mls_dir=${MLS}
prefix=MLS
output_dir=data/${prefix}
# languages="nl fr de it pl pt es en"
3 changes: 2 additions & 1 deletion egs2/mixed_v2/s2t1/local/prepare_wenetspeech.sh
@@ -6,6 +6,7 @@ set -u
set -o pipefail

. ./path.sh || exit 1;
. ./db.sh || exit 1;

# Copied from utils/fix_data_dir.sh
function check_sorted {
@@ -25,7 +26,7 @@ log() {
}
SECONDS=0

wenetspeech_dir=/scratch/bbjs/peng6/corpora_shared/WenetSpeech/untar
wenetspeech_dir=${WENETSPEECH}
prefix=WenetSpeech
output_dir=data/WenetSpeech
splits="DEV L"