-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* init chime4 recipe * update decoding recipe in run,sh * fix tokenizer issue for char unit * add initial results * finalize recipe * fix tab issues in run.sh * fix trailing whitespace * fix comments * commit missing changes
- Loading branch information
Showing
22 changed files
with
1,226 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
# Performance Record | ||
|
||
## Conformer Result | ||
|
||
* Feature info: dither + specaug + speed perturb | ||
* Training info: lr 0.0005, batch size 8, 1 gpu, acc_grad 4, 80 epochs | ||
* Decoding info: average_num 10 | ||
|
||
| decoding mode | dt05_real_1ch | dt05_simu_1ch | et05_real_1ch | et05_simu_1ch | | ||
|:----------------------:|:-------------:|:-------------:|:-------------:|:-------------:| | ||
| ctc_prefix_beam_search | 19.06% | 21.17% | 28.39% | 29.16% | | ||
| attention_rescoring | 17.92% | 20.22% | 27.40% | 28.25% | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
# network architecture
# encoder related
encoder: conformer
encoder_conf:
    output_size: 512    # dimension of attention
    attention_heads: 8
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
    normalize_before: true
    cnn_module_kernel: 15
    use_cnn_module: True
    activation_type: 'swish'
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false

# data loading / feature pipeline
# NOTE(review): nesting below was reconstructed from the key names
# (each `*_conf` key owns the keys that follow it) -- confirm against the
# upstream file.
dataset_conf:
    split_with_space: true
    filter_conf:
        max_length: 40960
        min_length: 0
        token_max_length: 200
        token_min_length: 1
    resample_conf:
        resample_rate: 16000
    speed_perturb: true
    fbank_conf:
        num_mel_bins: 80
        frame_shift: 10
        frame_length: 25
        dither: 0.1
    spec_aug: true
    spec_aug_conf:
        num_t_mask: 2
        num_f_mask: 2
        max_t: 40
        max_f: 10
    shuffle: true
    shuffle_conf:
        shuffle_size: 1500
    sort: true
    sort_conf:
        sort_size: 500  # sort_size should be less than shuffle_size
    batch_conf:
        batch_type: 'static' # static or dynamic
        batch_size: 8

grad_clip: 10
accum_grad: 4
max_epoch: 80
log_interval: 200

optim: adam
optim_conf:
    lr: 0.0005
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 20000
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
#!/usr/bin/env bash

# wujian@2020

# Build the combined training and development lists of the CHiME-4 recipe.
#
# Reads   data/chime4/<subset>/{wav.scp,text} for the tr05 simu/real noisy,
#         tr05 original clean, WSJ1 SI-200 clean and dt05 real/simu
#         (isolated 1ch track) subsets.
# Writes  data/chime4/train/{wav.scp,text} and data/chime4/dev/{wav.scp,text},
#         each sorted on the first field onwards (sort -k1).
# Takes no arguments; must be run from the recipe root.

set -eu

echo "$0: Formating chime4 data dir..."

track=isolated_1ch_track
data_dir=data/chime4

mkdir -p "$data_dir"/{train,dev}

# sort reads the input lists itself (instead of `cat ... | sort`) so that a
# missing or unreadable list makes the command fail and `set -e` stops the
# script, rather than silently producing a truncated output list.
for list in wav.scp text; do
  sort -k1 \
    "$data_dir"/tr05_{simu,real}_noisy/"$list" \
    "$data_dir/tr05_orig_clean/$list" \
    "$data_dir/train_si200_wsj1_clean/$list" > "$data_dir/train/$list"

  sort -k1 "$data_dir"/dt05_{real,simu}_"${track}/$list" > "$data_dir/dev/$list"
done

echo "$0: Format $data_dir done"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,28 @@ | ||
#!/usr/bin/env bash

# wujian@2020

# Usage: <this-script> <data-dir> <dump-dir>
#
# Rewrites <data-dir>/wav.scp so that it is made of
#   * the existing entries that do not mention "sph2pipe"
#     (saved as <data-dir>/raw_wav.scp), and
#   * one "<name> <path>" entry per *.wav file found under <dump-dir>, where
#     <name> is the file name without ".wav" (saved as <data-dir>/sph_wav.scp).
# Exits with status 1 when the argument count is wrong.

set -eu

if [ $# -ne 2 ]; then
  # A usage error is a failure: report on stderr and return non-zero so that
  # calling scripts running under `set -e` stop here.
  echo "Script format error: $0 <data-dir> <dump-dir>" >&2
  exit 1
fi

data_dir=$1
dump_dir=$2

mkdir -p "$dump_dir"

num_utts=$(wc -l < "$data_dir/wav.scp")
echo "Orginal utterances (.wav + .wv1): $num_utts"

# NOTE(review): the sph2pipe -> .wav conversion below is disabled, so the
# .wav files under <dump-dir> are presumably produced elsewhere -- confirm.
# cat $data_dir/wav.scp | grep "sph2pipe" | \
#   awk -v dir=$dump_dir '{printf("%s -f wav %s %s/%s.wav\n", $2, $5, dir, $1)}' | bash

# awk is used instead of `grep -v`: grep returns 1 when it selects no line
# (every entry is a sph2pipe entry), which would abort the script under
# `set -e` although an empty raw_wav.scp is a valid result.
awk '!/sph2pipe/' "$data_dir/wav.scp" > "$data_dir/raw_wav.scp"

# Key each dumped file by its base name; sed removes the first ".wav" on the
# line, i.e. the one in the key column.
find "$dump_dir" -name "*.wav" | awk -F '/' '{printf("%s %s\n", $NF, $0)}' |
  sed 's:\.wav::' > "$data_dir/sph_wav.scp"

sort -k1 "$data_dir"/{raw_wav,sph_wav}.scp > "$data_dir/wav.scp"
num_utts=$(wc -l < "$data_dir/wav.scp")
echo "Wave utterances (.wav): $num_utts"

echo "$0: Generate wav => $dump_dir done"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
#!/usr/bin/env bash

# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.

# Modified from Kaldi's chime4 recipe

# Usage: <this-script> [--dataset <name>] <original WSJ0 corpus-directory>
#
# Prepares the clean WSJ0 subsets used by CHiME-4 (tr05, dt05, et05):
#   * intermediate file lists, transcripts and scp files are written to
#     $PWD/data/chime4/local,
#   * the final `text` and `wav.scp` of each subset are copied to
#     $PWD/data/<dataset>/<subset>_orig_clean.
# wav.scp entries are sph2pipe command pipes. If sph2pipe is not on PATH it
# is downloaded and compiled under ./exp.

set -eu

dataset=chime4

. ./tools/parse_options.sh || exit 1;

if [ $# -ne 1 ]; then
  printf "\nUSAGE: %s <original WSJ0 corpus-directory>\n\n" "$(basename "$0")"
  echo "The argument should be a the top-level WSJ corpus directory."
  echo "It is assumed that there will be a 'wsj0' and a 'wsj1' subdirectory"
  echo "within the top-level corpus directory."
  exit 1;
fi

wsj0=$1

srcdir=$PWD/data/chime4/local
dstdir=$PWD/data/$dataset
local=$PWD/local
sph2pipe=sph2pipe

if ! command -v sph2pipe > /dev/null 2>&1; then
  echo "Could not find sph2pipe, install it first..."
  # One command per line inside a subshell: `set -e` ignores a failure of any
  # non-final command in an `a && b && c` chain, so the previous chained form
  # could carry on from the wrong directory. The subshell also leaves the
  # working directory of this script untouched.
  (
    mkdir -p exp
    cd exp
    wget https://www.openslr.org/resources/3/sph2pipe_v2.5.tar.gz
    tar -zxf sph2pipe_v2.5.tar.gz
    cd sph2pipe_v2.5
    gcc -o sph2pipe *.c -lm
    cd ..
    rm -rf sph2pipe_v2.5.tar.gz
  )
  sph2pipe=$PWD/exp/sph2pipe_v2.5/sph2pipe
fi

mkdir -p "$srcdir"
cd "$srcdir"

# This version for SI-84
"$local/cstr_ndx2flist.pl" "$wsj0" \
  < "$wsj0/wsj0/doc/indices/train/tr_s_wv1.ndx" \
  | sort -u > tr05.flist

# Now for the test sets.
# $wsj0/wsj1/doc/indices/readme.doc
# describes all the different test sets.
# Note: each test-set seems to come in multiple versions depending
# on different vocabulary sizes, verbalized vs. non-verbalized
# pronunciations, etc.  We use the largest vocab and non-verbalized
# pronunciations.
# The most normal one seems to be the "baseline 60k test set", which
# is h1_p0.

# Nov'92 (330 utts, 5k vocab)
"$local/cstr_ndx2flist.pl" "$wsj0" \
  < "$wsj0/wsj0/doc/indices/test/nvp/si_et_05.ndx" \
  | sort > et05.flist

# Development set: list everything under si_dt_05 and keep the .wv1 files.
# The match is case-insensitive because the data sometimes gets copied from
# the CD's with upcasing (could be older versions of the disks).
find "$wsj0/wsj0/si_dt_05" -print | grep -i ".wv1" | sort > dt05.flist

# Finding the transcript files:
find -L "$wsj0" -iname '*.dot' > dot_files.flist

# Convert the transcripts into our format (no normalization yet).
# The awk steps below copy the utterance id and the remaining fields through
# unchanged; they only re-emit them separated by single spaces.
for x in tr05 et05 dt05; do
  "$local/flist2scp.pl" "$x.flist" | sort > "${x}_sph_tmp.scp"
  awk '{print $1}' "${x}_sph_tmp.scp" \
    | "$local/find_transcripts.pl" dot_files.flist > "${x}_tmp.trans1"
  awk '{printf("%s %s\n", $1, $2);}' "${x}_sph_tmp.scp" > "${x}_sph.scp"
  awk '{printf("%s ", $1); for(i=2;i<=NF;i++) printf("%s ", $i); printf("\n");}' \
    "${x}_tmp.trans1" > "${x}.trans1"
done

# Do some basic normalization steps.  At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>"
for x in tr05 et05 dt05; do
  "$local/normalize_transcript.pl" "$noiseword" < "$x.trans1" \
    | sort > "$x.txt" || exit 1;
done

# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.)
for x in tr05 et05 dt05; do
  awk -v cmd="$sph2pipe" '{printf("%s %s -f wav %s |\n", $1, cmd, $2);}' \
    "${x}_sph.scp" > "${x}_wav.scp"
done

if [ ! -f wsj0-train-spkrinfo.txt ] \
  || [ "$(wc -l < wsj0-train-spkrinfo.txt)" -ne 134 ]; then
  rm -f wsj0-train-spkrinfo.txt
  # The trailing `|| rm -f` keeps `set -e` from killing the script when both
  # downloads fail (and discards any partial file), so that the explanatory
  # message in the check below is actually printed.
  wget http://www.ldc.upenn.edu/Catalog/docs/LDC93S6A/wsj0-train-spkrinfo.txt \
    || ( echo "Getting wsj0-train-spkrinfo.txt from backup location" \
      && wget --no-check-certificate https://sourceforge.net/projects/kaldi/files/wsj0-train-spkrinfo.txt ) \
    || rm -f wsj0-train-spkrinfo.txt
fi

if [ ! -f wsj0-train-spkrinfo.txt ]; then
  echo "Could not get the spkrinfo.txt file from LDC website (moved)?"
  echo "This is possibly omitted from the training disks; couldn't find it."
  echo "Everything else may have worked; we just may be missing gender info"
  echo "which is only needed for VTLN-related diagnostics anyway."
  exit 1
fi
# Note: wsj0-train-spkrinfo.txt doesn't seem to be on the disks but the
# LDC put it on the web.  Perhaps it was accidentally omitted from the
# disks.

# Lower-case both speaker tables, drop ';' comment lines and lines containing
# "--", and keep the first two columns.
cat "$wsj0/wsj0/doc/spkrinfo.txt" \
  ./wsj0-train-spkrinfo.txt \
  | perl -ane 'tr/A-Z/a-z/; m/^;/ || print;' \
  | awk '{print $1, $2}' | grep -v -- -- | sort | uniq > spk2gender

# return back
cd -

for x in et05 dt05 tr05; do
  mkdir -p "$dstdir/${x}_orig_clean"
  cp "$srcdir/$x.txt" "$dstdir/${x}_orig_clean/text" || exit 1
  cp "$srcdir/${x}_wav.scp" "$dstdir/${x}_orig_clean/wav.scp" || exit 1
done

echo "Data preparation succeeded"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
#!/usr/bin/env bash

# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.

# Usage: <this-script> <WSJ1 directory>
#
# Prepares the clean WSJ1 subsets train_si200 and test_dev93:
#   * intermediate file lists, transcripts and scp files are written to
#     $PWD/data/chime4/local,
#   * the final `text` and `wav.scp` of each subset are copied to
#     $PWD/data/chime4/<subset>_wsj1_clean.
# wav.scp entries are sph2pipe command pipes. If sph2pipe is not on PATH it
# is downloaded and compiled under ./exp.

set -eu

if [ $# -ne 1 ]; then
  echo "Arguments should be WSJ1 directory"
  exit 1;
fi

wsj1=$1
dir=$PWD/data/chime4/local
odir=$PWD/data/chime4
mkdir -p "$dir"
local=$PWD/local
sph2pipe=sph2pipe

if ! command -v sph2pipe > /dev/null 2>&1; then
  echo "Could not find sph2pipe, install it first..."
  # One command per line inside a subshell: `set -e` ignores a failure of any
  # non-final command in an `a && b && c` chain, so the previous chained form
  # could carry on from the wrong directory. The subshell also leaves the
  # working directory of this script untouched.
  (
    mkdir -p exp
    cd exp
    wget https://www.openslr.org/resources/3/sph2pipe_v2.5.tar.gz
    tar -zxf sph2pipe_v2.5.tar.gz
    cd sph2pipe_v2.5
    gcc -o sph2pipe *.c -lm
    cd ..
    rm -rf sph2pipe_v2.5.tar.gz
  )
  sph2pipe=$PWD/exp/sph2pipe_v2.5/sph2pipe
fi

cd "$dir"

# The disk directories are selected with the glob ??-{?,??}.? below, which
# must stay unquoted so that the shell expands it.

# This version for SI-200
"$local/ndx2flist.pl" "$wsj1"/??-{?,??}.? \
  < "$wsj1/13-34.1/wsj1/doc/indices/si_tr_s.ndx" \
  | sort > train_si200.flist

nl=$(wc -l < train_si200.flist)
[ "$nl" -eq 30278 ] || echo "Warning: expected 30278 lines in train_si200.flist, got $nl"

# Dev-set for Nov'93 (503 utts)
"$local/ndx2flist.pl" "$wsj1"/??-{?,??}.? \
  < "$wsj1/13-34.1/wsj1/doc/indices/h1_p0.ndx" \
  | sort > test_dev93.flist

# Finding the transcript files:
for disk in "$wsj1"/??-{?,??}.?; do
  find -L "$disk" -iname '*.dot'
done > dot_files.flist

# Convert the transcripts into our format (no normalization yet)
for x in train_si200 test_dev93; do
  "$local/flist2scp.pl" "$x.flist" | sort > "${x}_sph.scp"
  awk '{print $1}' "${x}_sph.scp" \
    | "$local/find_transcripts.pl" dot_files.flist > "$x.trans1"
done

# Do some basic normalization steps.  At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>"
for x in train_si200 test_dev93; do
  "$local/normalize_transcript.pl" "$noiseword" < "$x.trans1" \
    | sort > "$x.txt" || exit 1;
done

# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.)
for x in train_si200 test_dev93; do
  awk -v cmd="$sph2pipe" '{printf("%s %s -f wav %s |\n", $1, cmd, $2);}' \
    "${x}_sph.scp" > "${x}_wav.scp"
done

# return back
cd -

for x in train_si200 test_dev93; do
  mkdir -p "$odir/${x}_wsj1_clean"
  cp "$dir/$x.txt" "$odir/${x}_wsj1_clean/text" || exit 1
  cp "$dir/${x}_wav.scp" "$odir/${x}_wsj1_clean/wav.scp" || exit 1
done

echo "Data preparation WSJ1 succeeded"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
#!/usr/bin/env perl

# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This is modified from the script in standard Kaldi recipe to account
# for the way the WSJ data is structured on the Edinburgh systems.
# - Arnab Ghoshal, 12/1/12

# This program takes as its standard input an .ndx file from the WSJ corpus that looks
# like this:
#;; File: tr_s_wv1.ndx, updated 04/26/94
#;;
#;; Index for WSJ0 SI-short Sennheiser training data
#;; Data is read WSJ sentences, Sennheiser mic.
#;; Contains 84 speakers X (~100 utts per speaker MIT/SRI and ~50 utts
#;; per speaker TI) = 7236 utts
#;;
#11_1_1:wsj0/si_tr_s/01i/01ic0201.wv1
#11_1_1:wsj0/si_tr_s/01i/01ic0202.wv1
#11_1_1:wsj0/si_tr_s/01i/01ic0203.wv1

# and as command-line argument it takes the names of the WSJ disk locations, e.g.:
# /group/corpora/public/wsjcam0/data on DICE machines.
# It outputs a list of absolute pathnames.
#
# Index entries whose file does not exist on disk are reported on stderr and
# left out of the output. An unparsable index line is fatal.

use strict;
use warnings;

# Without the corpus directory every path would silently be built relative
# to "/", so a missing argument is refused up front.
@ARGV >= 1 or die "Usage: $0 <wsj-dir> < index.ndx > file-list\n";

my $wsj_dir = $ARGV[0];

while (my $line = <STDIN>) {
  next if $line =~ m/^;/;  # Comment.  Ignore it.

  $line =~ m/^([0-9_]+):\s*(\S+)$/ or die "Could not parse line $line";
  my $filename = $2;  # as a subdirectory of the distributed disk.

  # Append the extension when the index entry does not already end in it.
  $filename .= ".wv1" unless $filename =~ m/\.wv1$/;
  $filename = "$wsj_dir/$filename";

  if (-e $filename) {
    print "$filename\n";
  } else {
    print STDERR "File $filename found in the index but not on disk\n";
  }
}
Oops, something went wrong.