
add centralized data preparation for OWSM #5478

Merged: 31 commits into master from owsm_data on Dec 5, 2023

Changes from 1 commit

Commits (31)
3be1f40
add whisper data.sh for v1 and v2
jctian98 Oct 17, 2023
37ab173
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 17, 2023
92bf631
add OWSM v3 data recipe
jctian98 Oct 17, 2023
ac8e423
Merge commit 'FETCH_HEAD' into owsm_data
jctian98 Oct 17, 2023
a3c24bd
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 17, 2023
063dc3f
fix ci issues
jctian98 Oct 18, 2023
5e14a62
update with ci issues
jctian98 Oct 18, 2023
7b707cd
change egs name from mixed_v* to owsm_v*
jctian98 Oct 23, 2023
14204e2
v3 should be ready except wsj
jctian98 Oct 30, 2023
ae05a6c
add wsj
jctian98 Oct 30, 2023
c515f76
update db.sh
jctian98 Oct 30, 2023
b53ce47
Merge branch 'master' into owsm_data
jctian98 Oct 30, 2023
ec109e2
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Oct 30, 2023
31ad173
almost finish all scripts
jctian98 Nov 10, 2023
8a09625
fix small problems
jctian98 Nov 10, 2023
952acf6
Merge commit 'FETCH_HEAD' into owsm_data
jctian98 Nov 10, 2023
2fd2668
merge master
jctian98 Nov 10, 2023
c53afd0
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 10, 2023
bdaf344
update the language mapping
jctian98 Nov 11, 2023
d379fd0
Merge commit 'FETCH_HEAD' into owsm_data
jctian98 Nov 11, 2023
b2cb427
Merge branch 'master' into owsm_data
jctian98 Nov 11, 2023
f5e5414
Merge commit 'FETCH_HEAD' into owsm_data
jctian98 Nov 11, 2023
51e3691
fix CI issue
jctian98 Nov 11, 2023
7f75d15
Merge commit 'FETCH_HEAD' into owsm_data
jctian98 Nov 11, 2023
66176bc
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 11, 2023
3d89d78
update wsj and commonvoice
jctian98 Nov 26, 2023
8f1e0fa
Merge commit 'FETCH_HEAD' into owsm_data
jctian98 Nov 26, 2023
77fe14b
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 26, 2023
c391765
update wsj text norm script
jctian98 Nov 26, 2023
642fd22
update wsj text norm 2
jctian98 Nov 26, 2023
ee00c6c
revise voxpopuli
jctian98 Nov 29, 2023
add OWSM v3 data recipe
jctian98 committed Oct 17, 2023
commit 92bf6313421001c78a400194b309248241024e99
4 changes: 4 additions & 0 deletions egs2/TEMPLATE/asr1/db.sh
@@ -15,6 +15,7 @@ ASVTutorial=espnet_tutorial_asvspoof
APHASIABANK=
AUDIOSET=
BIBLETTS=downloads
+COVOST2=
DIRHA_ENGLISH_PHDEV=
DIRHA_WSJ=
DIRHA_WSJ_PROCESSED="${PWD}/data/local/dirha_wsj_processed" # Output file path
@@ -88,6 +89,7 @@ QASR_TTS=downloads
SNIPS= # smart-light-en-closed-field data path
SPGISPEECH=
SWBD=
+FISHER_CALLHOME_SPANISH=
SWBD_NXT=
THCHS30=downloads
TIMIT=
@@ -139,6 +141,7 @@ RU_OPEN_STT=downloads
RUSLAN=downloads
SIWIS=downloads
GIGASPEECH=
+GIGAST=
GOOGLEI18N=downloads
NOISY_SPEECH=
NOISY_REVERBERANT_SPEECH=
@@ -194,6 +197,7 @@ KIRITAN=
NAMINE=
KATHBATH=downloads
GRAMVAANI=downloads
+KSPONSPEECH=

# For only CMU TIR environment
if [[ "$(hostname)" == tir* ]]; then
6 changes: 5 additions & 1 deletion egs2/ksponspeech/asr1/local/data.sh
@@ -5,6 +5,10 @@ set -e
set -u
set -o pipefail

+. ./path.sh || exit 1;
+. ./cmd.sh || exit 1;
+. ./db.sh || exit 1;

log() {
local fname=${BASH_SOURCE[1]##*/}
echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
@@ -15,7 +19,7 @@ stage=1
stop_stage=100

#data
-datadir=/ocean/projects/cis210027p/shared/corpora/KsponSpeech/KsponSpeech/
+datadir=${KSPONSPEECH}
# KsponSpeech
# |_ KsponSpeech_01/
# |_ KsponSpeech_02/
74 changes: 59 additions & 15 deletions egs2/mixed_v1/s2t1/local/data.sh
@@ -3,6 +3,7 @@
# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',

# Centralized data preparation for OWSM (https://arxiv.org/abs/2309.13876)
+# Details of this script are also in: https://github.com/espnet/espnet/pull/5478/

# Note (jinchuan):
# (1) please work progressively from v1 to v3: you need to
@@ -76,19 +77,40 @@ elif [ ${VERSION} = "v2" ]; then
train_sets="data/GigaST/XL.en-* \
data/MLS/train.* \
data/WenetSpeech/L"
+# question (jinchuan): why not include GigaST-dev?
valid_sets="data/MLS/dev.* \
data/WenetSpeech/DEV"

elif [ ${VERSION} = "v3" ]; then
datasets="aidatatang ami babel commonvoice swbd fisher_callhome \
datasets="aidatatang ami commonvoice swbd fisher_callhome \
fleurs ksponspeech magicdata reazonspeech ru_open_stt \
vctk voxpopuli wsj" \
# still working on it
train_sets="data/aidatatang/train \
"
valid_sets="data/aidatatang/dev \
"
train_sets="data/aidatatang/train_whisper \
+data/ami/ihm_train_whisper \
+data/CommonVoice/train \
+data/swbd/train_nodup_whisper \
+data/swbd/train_fisher_whisper \
+data/fisher_callhome/train_whisper \
+data/FLEURS/train \
+data/ksponspeech/train_whisper \
+data/magicdata/train_whisper \
+data/ReazonSpeech/train \
+data/ru_open_stt/train_whisper \
+data/vctk/tr_no_dev_whisper \
+data/VoxPopuli/train \
+data/wsj/train_si284_whisper"
+valid_sets="data/aidatatang/dev_whisper \
+data/ami/ihm_dev_whisper \
+data/CommonVoice/dev \
+data/swbd/train_dev_whisper \
+data/fisher_callhome/dev_whisper \
+data/FLEURS/valid \
+data/ksponspeech/dev_whisper \
+data/magicdata/dev_whisper \
+data/ReazonSpeech/valid \
+data/ru_open_stt/dev_whisper \
+data/vctk/dev_whisper \
+data/VoxPopuli/dev \
+data/wsj/test_dev93_whisper"
else
echo "Invalid version argument ${VERSION}." && exit 1;
fi
@@ -99,14 +121,27 @@ utt_extra_files="text.prev text.ctc"
train_out=data/train_${VERSION}
valid_out=data/valid_${VERSION}

+# v3 data adopts ISO-639-3 language-IDs
+if [ ! -d ./iso639 ] && [ ${VERSION} = "v3" ]; then
+echo "installing ISO-639 dependency"
+git clone https://github.com/noumar/iso639
+cd iso639; python3 setup.py install || exit 1;
+cd ..
+fi

# call data preparation script for each dataset
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
for dataset in ${datasets}; do
if [ -f data/.${dataset}.done ]; then
-echo ${dataset} has been processed. Skip!
+echo "${dataset} has been processed. Skip!"
else
-echo preparing ${dataset} dataset ...
-./local/prepare_${dataset}.sh && touch data/.${dataset}.done
+if [ ! -f ./local/prepare_${dataset}.sh ]; then
+echo "script for ${dataset} not found." && exit 1;
+fi
+echo "preparing ${dataset} dataset ..."
+./local/prepare_${dataset}.sh || \
+{ echo "preparing ${dataset} failed"; exit 1; }
+touch data/.${dataset}.done
fi
done
fi
@@ -116,18 +151,30 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then

if [ ${VERSION} = "v2" ]; then
if [ ! -d data/train_v1 ] || [ ! -d data/valid_v1 ]; then
echo "Cannot find v1 data. Please link it here. Exit!" && exit 1;
echo "Cannot find v1 data. copy it ..."
cp -r ../../mixed_v1/s2t1/data/{train,valid}_v1/ ./data || exit 1;
fi
train_sets="${train_sets} data/train_v1"
valid_sets="${valid_sets} data/valid_v1"
fi

if [ ${VERSION} = "v3" ]; then
if [ ! -d data/train_v2 ] || [ ! -d data/valid_v2 ]; then
echo "Cannot find v2 data. Please link it here. Exit!" && exit 1;
echo "Cannot find v2 data. copy it ..."
cp -r ../../mixed_v2/s2t1/data/{train,valid}_v2/ ./data || exit 1;
fi
train_sets="${train_sets} data/train_v2"
valid_sets="${valid_sets} data/valid_v2"

+# v3 adopts ISO-639-3 language-IDs,
+# so change all language-IDs in v2 to ISO-639-3 before merging
+for part in train valid; do
+if [ ! -f data/${part}_v2/text_raw ]; then
+mv data/${part}_v2/text data/${part}_v2/text_raw || exit 1;
+python3 local/filter_lang_id.py \
+-i data/${part}_v2/text_raw -o data/${part}_v2/text || exit 1;
+fi
+done
fi

# Combine valid
@@ -151,7 +198,4 @@
utils/validate_data_dir.sh --no-feats --non-print ${train_out} || exit 1;
fi

-# todo: some v3-specific operations
log "Successfully finished. [elapsed=${SECONDS}s]"
108 changes: 108 additions & 0 deletions egs2/mixed_v3/s2t1/local/cv-iso-693-3.txt
@@ -0,0 +1,108 @@
ab abk
bn ben
de deu
fi fin
hu hun
kk kaz
ml mal
nn-NO nno
rw kin
sw swa
ug uig
zh-TW cmn
ar ara
br bre
dv div
fr fra
hy-AM hye
or ory
sah sah
as asm
sat sat
ast ast
az aze
ba bak
bas bas
hsb hsb
kab kab
mk mkd
nl nld
ru rus
sv-SE swe
tw twi
zh-HK cmn
zh-CN cmn
ne-NP nep
bg bul
da dan
be bel
kmr kmr
mn mon
ta tam
uk ukr
ca cat
el ell
fy-NL frr
ia ina
ky kir
mr mar
pa-IN pan
th tha
ur urd
ckb ckb
en eng
ga-IE gle
id ind
lg lga
mrj mrj
pl pol
sc srd
ti tir
uz uzb
cnh cnh
eo epo
gl glg
ig ibo
lt lit
mt mlt
pt por
sk slk
tig tig
vi vie
cs ces
es spa
gn grn
it ita
lv lav
myv myv
rm-sursilv roh
skr skr
tok tok
vot vot
cv chv
et est
ha hau
ja jpn
mdf mdf
nan-tw nan
rm-vallader roh
sl slv
tr tur
yue yue
cy cym
eu eus
hi hin
ka kat
mhr mhr
ro ron
sr srp
tt tat
fa fas
tk tuk
is isl
ko kor
quy quy
yo yor
lo lao
dyu dgd
oc oci
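Each line of the file above maps a Common Voice locale to an ISO 639-3 code. A minimal sketch of loading this two-column table into a dict, purely for illustration (the loader itself is not part of the PR):

def load_lang_map(path="local/cv-iso-693-3.txt"):
    # Parse "<commonvoice-locale> <iso639-3-code>" pairs, one per line
    lang_map = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            locale, iso3 = line.split()
            lang_map[locale] = iso3
    return lang_map

lang_map = load_lang_map()
print(lang_map["zh-TW"])  # cmn
print(lang_map["sv-SE"])  # swe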
1 change: 1 addition & 0 deletions egs2/mixed_v3/s2t1/local/data.sh
77 changes: 77 additions & 0 deletions egs2/mixed_v3/s2t1/local/filter_lang_id.py
@@ -0,0 +1,77 @@
from argparse import ArgumentParser
from pathlib import Path

from utils import (
    SYMBOL_NA,
    SYMBOL_NOSPEECH,
    SYMBOLS_TIME,
    TO_ISO_LANGUAGE_CODE,
)


def filter_lang_id(reader, writer):
    """Rewrite <lang> and <st_lang> tokens in an OWSM text file to ISO 639-3."""
    langs, st_langs = set(), set()
    for line in reader:
        # Each line looks like: "<utt-id> <src><task><...rest of the entry"
        utt, ctx = line.strip().split(maxsplit=1)
        src, tgt, other = ctx.split("><", maxsplit=2)

        # src still carries its leading "<"; map the bare code to ISO 639-3
        src = TO_ISO_LANGUAGE_CODE[src[1:]]
        langs.add(src)

        # The task token is either "asr" or "st_<lang>"; remap the ST target
        tgt = (
            tgt
            if tgt == "asr"
            else "st_" + TO_ISO_LANGUAGE_CODE[tgt.replace("st_", "")]
        )
        if tgt.startswith("st_"):
            st_langs.add(tgt.replace("st_", ""))

        line = f"{utt} <{src}><{tgt}><{other}\n"
        writer.write(line)

    return sorted(langs), sorted(st_langs)


def get_parser():
    parser = ArgumentParser(description="Map language-IDs in a text file to ISO 639-3.")
    parser.add_argument(
        "-i", "--input", type=Path, required=True, help="input text file"
    )
    parser.add_argument(
        "-o", "--output", type=Path, required=True, help="output text file"
    )
    parser.add_argument(
        "--nlsyms", type=Path, default=None, help="output path of nlsyms"
    )
    return parser


if __name__ == "__main__":
    parser = get_parser()
    args = parser.parse_args()

    with open(args.input, "r") as reader, open(args.output, "w") as writer:
        langs, st_langs = filter_lang_id(reader, writer)

    if args.nlsyms is not None:
        # Collect all special tokens so they can be declared as
        # non-linguistic symbols (nlsyms) during tokenization
        special_tokens = [
            SYMBOL_NA,
            SYMBOL_NOSPEECH,
            *[f"<{lang}>" for lang in langs],
            *[f"<st_{lang}>" for lang in st_langs],
            *SYMBOLS_TIME,
        ]

        with open(args.nlsyms, "w") as fp:
            for tok in special_tokens:
                fp.write(f"{tok}\n")
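For clarity, filter_lang_id.py expects OWSM-style text lines of the form "<utt-id> <src-lang><task><rest>". A self-contained illustration of the rewrite it performs; the two-entry mapping is a stand-in for utils.TO_ISO_LANGUAGE_CODE (assumption: the real table covers all OWSM languages):

# Stand-in for utils.TO_ISO_LANGUAGE_CODE, for illustration only
TO_ISO_LANGUAGE_CODE = {"zh": "cmn", "en": "eng"}

line = "utt001 <zh><st_en><notimestamps> some text"
utt, ctx = line.strip().split(maxsplit=1)
src, tgt, other = ctx.split("><", maxsplit=2)

src = TO_ISO_LANGUAGE_CODE[src[1:]]  # "<zh" -> "cmn"
if tgt != "asr":
    # "st_en" -> "st_eng": remap the translation target language
    tgt = "st_" + TO_ISO_LANGUAGE_CODE[tgt.replace("st_", "")]

print(f"{utt} <{src}><{tgt}><{other}")
# -> utt001 <cmn><st_eng><notimestamps> some text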