Merge pull request #5478 from jctian98/owsm_data
add centralized data preparation for OWSM
sw005320 authored Dec 5, 2023
2 parents 35b8f01 + ee00c6c commit a45a53c
Showing 128 changed files with 3,354 additions and 94 deletions.
7 changes: 6 additions & 1 deletion egs/wsj/asr1/local/wsj_format_data.sh
@@ -14,6 +14,7 @@
# We'll create train_si84 after doing the feature extraction.

lang_suffix=
whisper_text_norm=false

echo "$0 $@" # Print the command line for logging
. utils/parse_options.sh || exit 1;
@@ -26,7 +27,11 @@ srcdir=data/local/data
for x in train_si284 test_eval92 test_eval93 test_dev93 test_eval92_5k test_eval93_5k test_dev93_5k dev_dt_05 dev_dt_20; do
mkdir -p data/$x
cp $srcdir/${x}_wav.scp data/$x/wav.scp || exit 1;
cp $srcdir/$x.txt data/$x/text || exit 1;
if $whisper_text_norm; then
python3 local/prepare_transcription_whisper.py -i $srcdir/$x.trans1 -o data/$x/text
else
cp $srcdir/$x.txt data/$x/text || exit 1;
fi
cp $srcdir/$x.spk2utt data/$x/spk2utt || exit 1;
cp $srcdir/$x.utt2spk data/$x/utt2spk || exit 1;
utils/filter_scp.pl data/$x/spk2utt $srcdir/spk2gender > data/$x/spk2gender || exit 1;
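With this change, the WSJ transcripts can optionally be passed through Whisper-style text normalization. A minimal usage sketch (the new flag is read by `utils/parse_options.sh`; surrounding recipe stages are omitted):

```bash
# Default: copy the Kaldi transcripts unchanged
local/wsj_format_data.sh

# Apply Whisper-style text normalization via the new Python helper
local/wsj_format_data.sh --whisper_text_norm true
```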
6 changes: 5 additions & 1 deletion egs2/TEMPLATE/asr1/db.sh
@@ -18,6 +18,7 @@ AR_SC=
AUDIOSET=
ASVSpoof_CMD=
BIBLETTS=downloads
COVOST2=
DIRHA_ENGLISH_PHDEV=
DIRHA_WSJ=
DIRHA_WSJ_PROCESSED="${PWD}/data/local/dirha_wsj_processed" # Output file path
@@ -67,7 +68,6 @@ SLURP=
SLURP_S= # Output file path
LIBRITRANS_S= # Output file path
VOICES=downloads
VOXCELEB=
MAGICDATA=downloads
MEDIASPEECH=downloads
MINI_LIBRISPEECH=downloads
@@ -99,6 +99,7 @@ SPGISPEECH=
SPEECH_PROMPT_v2=
STOP=
SWBD=
FISHER_CALLHOME_SPANISH=
SWBD_NXT=
THCHS30=downloads
TIMIT=
@@ -150,6 +151,7 @@ RU_OPEN_STT=downloads
RUSLAN=downloads
SIWIS=downloads
GIGASPEECH=
GIGAST=
GOOGLEI18N=downloads
NOISY_SPEECH=
NOISY_REVERBERANT_SPEECH=
@@ -207,6 +209,8 @@ KATHBATH=downloads
GRAMVAANI=downloads
SPRING_INX=downloads
VOXCELEB=
KSPONSPEECH=

# For only CMU TIR environment
if [[ "$(hostname)" == tir* ]]; then
BABEL_101=/projects/tir5/data/speech_corpora/babel/IARPA_BABEL_BP_101/
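The entries added here (`COVOST2`, `FISHER_CALLHOME_SPANISH`, `GIGAST`, `KSPONSPEECH`; `VOXCELEB` is only moved) default to empty and must point at local corpus copies before the OWSM preparation can use them. An illustrative sketch with placeholder paths:

```bash
# In egs2/TEMPLATE/asr1/db.sh -- set these to wherever the corpora live:
COVOST2=/path/to/covost2
FISHER_CALLHOME_SPANISH=/path/to/fisher_callhome_spanish
GIGAST=/path/to/GigaST
KSPONSPEECH=/path/to/KsponSpeech
```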
6 changes: 5 additions & 1 deletion egs2/ksponspeech/asr1/local/data.sh
@@ -5,6 +5,10 @@ set -e
set -u
set -o pipefail

. ./path.sh || exit 1;
. ./cmd.sh || exit 1;
. ./db.sh || exit 1;

log() {
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
@@ -15,7 +19,7 @@ stage=1
stop_stage=100

#data
datadir=/ocean/projects/cis210027p/shared/corpora/KsponSpeech/KsponSpeech/
datadir=${KSPONSPEECH}
# KsponSpeech
# |_ KsponSpeech_01/
# |_ KsponSpeech_02/
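With `datadir` now resolved from `db.sh` instead of a hard-coded cluster path, the recipe is configured once and run as usual. A minimal sketch (placeholder path):

```bash
# In db.sh:
KSPONSPEECH=/path/to/KsponSpeech

# Then, from egs2/ksponspeech/asr1:
./local/data.sh
```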
3 changes: 0 additions & 3 deletions egs2/mixed_v3/s2t1/README.md

This file was deleted.

271 changes: 195 additions & 76 deletions egs2/open_li110/asr1/local/data.sh

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions egs2/owsm_v1/s2t1/README.md
@@ -0,0 +1,8 @@
### Guidance for data preparation
(1) Please work progressively from v1 to v3: this means you need to prepare data for v1, v2 and v3 in order to obtain the full v3 data. To start the data preparation, run `bash local/data.sh --VERSION v1 # or v2, v3`
(2) Please revise `db.sh` for all datasets before running `local/data.sh`. Some datasets cannot be downloaded and untarred automatically due to license issues; users should handle those datasets manually.
(3) Due to the large volume of data, we cannot guarantee that the scripts will run smoothly for every dataset. Please raise an issue if you believe there is a bug.
(4) This script only prepares data for the train and valid subsets. Test data should be prepared separately following the conventional ESPnet2 format.
(5) Even though we provide this centralized data preparation script and combine all datasets in it, we strongly recommend that users NOT use the merged train_v* and valid_v* sets for feature extraction. Instead, run stages 2-4 for each dataset separately and then combine all datasets under the `dump/raw` directory (see the sketch after this list). You can still handle all datasets simultaneously, and inspection and debugging become easier. This is exactly what we did in our experiments.
(6) Users can also refer to this PR to check more details: https://github.com/espnet/espnet/pull/5478
(7) The detailed data list is in `local/data.sh`. Also see: https://arxiv.org/pdf/2309.13876.pdf
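Below is a sketch of the per-dataset workflow recommended in (5). The option names follow the usual ESPnet2 recipe interface, but the set names are illustrative (e.g., GigaSpeech uses `XL`/`DEV`), so adapt them to the actual directories produced under `data/`:

```bash
# Dump each dataset separately (stages 2-4), then combine under dump/raw.
for dset in AISHELL-1 CoVoST2 GigaSpeech; do   # ...and so on for the full list
    ./run.sh --stage 2 --stop_stage 4 \
        --train_set "${dset}/train" --valid_set "${dset}/dev"
done
```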
10 files renamed without changes.
220 changes: 220 additions & 0 deletions egs2/owsm_v1/s2t1/local/data.sh
@@ -0,0 +1,220 @@
#!/usr/bin/env bash
# Copyright 2023 Carnegie Mellon University (Jinchuan Tian)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

# Centralized data preparation for OWSM (https://arxiv.org/abs/2309.13876)
# Details of this script are also in: https://github.com/espnet/espnet/pull/5478/

# Guidance for data preparation
# (1) Please work progressively from v1 to v3: this means you need to prepare
#     data for v1, v2 and v3 in order to obtain the full v3 data. To start the
#     data preparation, run `bash local/data.sh --VERSION v1 # or v2, v3`
# (2) Please revise `db.sh` for all datasets before running `local/data.sh`.
#     Some datasets cannot be downloaded and untarred automatically due to
#     license issues; users should handle those datasets manually.
# (3) Due to the large volume of data, we cannot guarantee that the scripts
#     will run smoothly for every dataset. Please raise an issue if you
#     believe there is a bug.
# (4) This script only prepares data for the train and valid subsets.
#     Test data should be prepared separately following the conventional
#     ESPnet2 format.
# (5) Even though we provide this centralized data preparation script and
#     combine all datasets in it, we strongly recommend that users NOT use
#     the merged train_v* and valid_v* sets for feature extraction. Instead,
#     run stages 2-4 for each dataset separately and combine all datasets
#     under the `dump/raw` directory. You can still handle all datasets
#     simultaneously, and inspection and debugging become easier. This is
#     exactly what we did in our experiments.

set -e
set -u
set -o pipefail

. ./path.sh || exit 1;
. ./db.sh || exit 1;

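# Copied from utils/fix_data_dir.sh: keep extra text files sorted and unique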
function check_sorted {
    file=$1
    sort -k1,1 -u <$file >$file.tmp
    if ! cmp -s $file $file.tmp; then
        echo "$0: file $1 is not in sorted order or not unique, sorting it"
        mv $file.tmp $file
    else
        rm $file.tmp
    fi
}

log() {
    local fname=${BASH_SOURCE[1]##*/}
    echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
}
SECONDS=0

VERSION=v1 # specify v1, v2 or v3
stage=1
stop_stage=2

. utils/parse_options.sh

# Change the lists below accordingly if you only want to prepare a subset of the data
if [ ${VERSION} = "v1" ]; then
    datasets="aishell covost2 gigaspeech librispeech must-c spgispeech"
    train_sets="data/AISHELL-1/train \
        data/CoVoST2/train \
        data/GigaSpeech/XL \
        data/LibriSpeech/train-clean-100 \
        data/LibriSpeech/train-clean-360 \
        data/LibriSpeech/train-other-500 \
        data/MuST-C_v1.2/train \
        data/MuST-C_v2/train \
        data/MuST-C_v3/train \
        data/SPGISpeech/train \
        data/TEDLIUM3/train"
    valid_sets="data/AISHELL-1/dev \
        data/CoVoST2/dev \
        data/GigaSpeech/DEV \
        data/LibriSpeech/dev-clean \
        data/LibriSpeech/dev-other \
        data/MuST-C_v1.2/dev \
        data/MuST-C_v2/dev \
        data/MuST-C_v3/dev \
        data/SPGISpeech/val \
        data/TEDLIUM3/dev"

elif [ ${VERSION} = "v2" ]; then
    datasets="gigast multilingual_librispeech wenetspeech"
    train_sets="data/GigaST/XL.en-* \
        data/MLS/train.* \
        data/WenetSpeech/L"
    valid_sets="data/MLS/dev.* \
        data/WenetSpeech/DEV"

# Note(jinchuan):
# AMI & SWBD & BABEL & OpenSLR: the original egs2/ami/asr1/local/data.sh does not run smoothly.
# AMI & VoxForge: the original transcripts are all upper-case; we convert them to lower case.
# ReazonSpeech: no official train/valid/test splits, so we generate our own.
elif [ ${VERSION} = "v3" ]; then
    datasets="aidatatang ami commonvoice swbd fisher_callhome \
        fleurs ksponspeech magicdata reazonspeech ru_open_stt \
        vctk voxpopuli wsj voxforge babel openslr"
    train_sets="data/aidatatang/train_whisper \
        data/ami/ihm_train_whisper \
        data/CommonVoice/train \
        data/swbd/train_nodup_whisper \
        data/swbd/train_fisher_whisper \
        data/fisher_callhome/train_whisper \
        data/FLEURS/train \
        data/ksponspeech/train_whisper \
        data/magicdata/train_whisper \
        data/ReazonSpeech/train \
        data/ru_open_stt/train_whisper \
        data/vctk/tr_no_dev_whisper \
        data/VoxPopuli/train \
        data/wsj/train_si284_whisper \
        data/voxforge/tr \
        data/babel/train \
        data/openslr/train"
    valid_sets="data/aidatatang/dev_whisper \
        data/ami/ihm_dev_whisper \
        data/CommonVoice/dev \
        data/swbd/train_dev_whisper \
        data/fisher_callhome/dev_whisper \
        data/FLEURS/valid \
        data/ksponspeech/dev_whisper \
        data/magicdata/dev_whisper \
        data/ReazonSpeech/valid \
        data/ru_open_stt/dev_whisper \
        data/vctk/dev_whisper \
        data/VoxPopuli/dev \
        data/wsj/test_dev93_whisper \
        data/voxforge/dt \
        data/babel/dev \
        data/openslr/dev"
else
    echo "Invalid version argument ${VERSION}." && exit 1;
fi
echo "Preparing data for OWSM with version ${VERSION}"
echo "Datasets to prepare: ${datasets}"

utt_extra_files="text.prev text.ctc"
train_out=data/train_${VERSION}
valid_out=data/valid_${VERSION}

# v3 data adopts ISO-639-3 language IDs
if [ ! -d ./iso639 ] && [ ${VERSION} = "v3" ]; then
    echo "installing ISO-639 dependency"
    git clone https://github.com/noumar/iso639
    cd iso639; python3 setup.py install || exit 1;
    cd ..
fi
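# Hypothetical sanity check for the dependency above; the exact calls used
# by local/filter_lang_id.py may differ:
#   python3 -c 'from iso639 import languages; print(languages.get(part1="en").part3)'  # -> eng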

# call data preparation script for each dataset
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    for dataset in ${datasets}; do
        if [ -f data/.${dataset}.done ]; then
            echo "${dataset} has been processed. Skip!"
        else
            if [ ! -f ./local/prepare_${dataset}.sh ]; then
                echo "script for ${dataset} is not found." && exit 1;
            fi
            echo "preparing ${dataset} dataset ..."
            ./local/prepare_${dataset}.sh || \
                { echo "preparing ${dataset} failed"; exit 1; }
            touch data/.${dataset}.done
        fi
    done
fi
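# Note: the data/.<dataset>.done marker files make stage 1 resumable.
# To force re-preparation of a single dataset, remove its marker and
# re-run stage 1, e.g.:
#   rm data/.covost2.done
#   bash local/data.sh --VERSION v1 --stage 1 --stop_stage 1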

# combine all datasets.
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then

    if [ ${VERSION} = "v2" ]; then
        if [ ! -d data/train_v1 ] || [ ! -d data/valid_v1 ]; then
            echo "Cannot find v1 data. Copying it ..."
            cp -r ../../owsm_v1/s2t1/data/{train,valid}_v1/ ./data || exit 1;
        fi
        train_sets="${train_sets} data/train_v1"
        valid_sets="${valid_sets} data/valid_v1"
    fi

    if [ ${VERSION} = "v3" ]; then
        if [ ! -d data/train_v2 ] || [ ! -d data/valid_v2 ]; then
            echo "Cannot find v2 data. Copying it ..."
            cp -r ../../owsm_v2/s2t1/data/{train,valid}_v2/ ./data || exit 1;
        fi
        train_sets="${train_sets} data/train_v2"
        valid_sets="${valid_sets} data/valid_v2"

        # v3 adopts ISO-639-3 language IDs,
        # so change all language IDs in v2 to ISO-639-3 before merging
        for part in train valid; do
            if [ ! -f data/${part}_v2/text_raw ]; then
                mv data/${part}_v2/text data/${part}_v2/text_raw || exit 1;
                python3 local/filter_lang_id.py \
                    -i data/${part}_v2/text_raw -o data/${part}_v2/text || exit 1;
            fi
        done
    fi

    # Combine valid
    utils/combine_data.sh --skip_fix true --extra-files "${utt_extra_files}" \
        ${valid_out} ${valid_sets} || exit 1;
    # NOTE(yifan): extra text files must be sorted and unique
    for f in ${utt_extra_files}; do
        check_sorted ${valid_out}/${f}
    done
    utils/fix_data_dir.sh --utt_extra_files "${utt_extra_files}" ${valid_out} || exit 1;
    utils/validate_data_dir.sh --no-feats --non-print ${valid_out} || exit 1;

    # Combine train
    utils/combine_data.sh --skip_fix true --extra-files "${utt_extra_files}" \
        ${train_out} ${train_sets} || exit 1;
    # NOTE(yifan): extra text files must be sorted and unique
    for f in ${utt_extra_files}; do
        check_sorted ${train_out}/${f}
    done
    utils/fix_data_dir.sh --utt_extra_files "${utt_extra_files}" ${train_out} || exit 1;
    utils/validate_data_dir.sh --no-feats --non-print ${train_out} || exit 1;
fi

log "Successfully finished. [elapsed=${SECONDS}s]"
5 files renamed without changes.
@@ -6,6 +6,7 @@ set -u
set -o pipefail

. ./path.sh || exit 1;
. ./db.sh || exit 1;

# Copied from utils/fix_data_dir.sh
function check_sorted {
@@ -25,7 +26,7 @@ log() {
}
SECONDS=0

data_dir=/scratch/bbjs/peng6/corpora/AISHELL-1
data_dir=${AISHELL}
prefix=AISHELL-1
output_dir=data/AISHELL-1
splits="dev train"
File renamed without changes.
@@ -6,6 +6,7 @@ set -u
set -o pipefail

. ./path.sh || exit 1;
. ./db.sh || exit 1;

# Copied from utils/fix_data_dir.sh
function check_sorted {
@@ -25,7 +26,7 @@ log() {
}
SECONDS=0

data_dir=/scratch/bbjs/peng6/espnet-whisper-public/egs2/covost2/st1/data
data_dir=${COVOST2}
prefix=CoVoST2
output_dir=data/CoVoST2
splits="dev train"
@@ -6,6 +6,7 @@ set -u
set -o pipefail

. ./path.sh || exit 1;
. ./db.sh || exit 1;

# Copied from utils/fix_data_dir.sh
function check_sorted {
@@ -25,7 +26,7 @@ log() {
}
SECONDS=0

data_dir=/scratch/bbjs/peng6/corpora/GigaSpeech
data_dir=${GIGASPEECH}
prefix=GigaSpeech
output_dir=data/GigaSpeech
splits="DEV XL"
@@ -6,6 +6,7 @@ set -u
set -o pipefail

. ./path.sh || exit 1;
. ./db.sh || exit 1;

# Copied from utils/fix_data_dir.sh
function check_sorted {
@@ -25,7 +26,7 @@ log() {
}
SECONDS=0

data_dir=/scratch/bbjs/peng6/corpora/librispeech_full/LibriSpeech
data_dir=${LIBRISPEECH}
prefix=LibriSpeech
output_dir=data/LibriSpeech
splits="dev-clean dev-other train-clean-100 train-clean-360 train-other-500"
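The four preparation scripts above (AISHELL-1, CoVoST2, GigaSpeech, LibriSpeech) all receive the same change: the corpus root is now resolved from `db.sh` rather than a hard-coded cluster path. A minimal sketch of the shared pattern (variable names as in `db.sh`; the path is a placeholder):

```bash
#!/usr/bin/env bash
. ./path.sh || exit 1;
. ./db.sh || exit 1;   # defines AISHELL, COVOST2, GIGASPEECH, LIBRISPEECH, ...

data_dir=${LIBRISPEECH}      # e.g. /path/to/LibriSpeech, set in db.sh
prefix=LibriSpeech
output_dir=data/LibriSpeech
```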
File renamed without changes.