[examples] add CHiME4 recipe (#746)
* init chime4 recipe

* update decoding recipe in run.sh

* fix tokenizer issue for char unit

* add initial results

* finalize recipe

* fix tab issues in run.sh

* fix trailing whitespace

* fix comments

* commit missing changes
funcwj authored Nov 19, 2021
1 parent 33eeff6 commit d01276a
Showing 22 changed files with 1,226 additions and 2 deletions.
12 changes: 12 additions & 0 deletions examples/chime4/s0/README.md
@@ -0,0 +1,12 @@
# Performance Record

## Conformer Result

* Feature info: dither + specaug + speed perturb
* Training info: lr 0.0005, batch size 8, 1 gpu, acc_grad 4, 80 epochs
* Decoding info: average_num 10

| decoding mode | dt05_real_1ch | dt05_simu_1ch | et05_real_1ch | et05_simu_1ch |
|:----------------------:|:-------------:|:-------------:|:-------------:|:-------------:|
| ctc_prefix_beam_search | 19.06% | 21.17% | 28.39% | 29.16% |
| attention_rescoring | 17.92% | 20.22% | 27.40% | 28.25% |
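
These WERs come from the usual WeNet flow of checkpoint averaging followed by decoding. A minimal sketch with the stock wenet/bin entry points; the paths and flag values below are illustrative assumptions, see run.sh for the exact invocation:

# average the last 10 checkpoints (average_num 10); exp/conformer is a hypothetical model dir
python wenet/bin/average_model.py --dst_model exp/conformer/avg_10.pt \
  --src_path exp/conformer --num 10
# decode one test set in one mode; repeat per set and per mode
python wenet/bin/recognize.py --mode attention_rescoring \
  --config exp/conformer/train.yaml \
  --checkpoint exp/conformer/avg_10.pt \
  --data_type raw \
  --test_data data/dt05_real_1ch/data.list \
  --dict data/dict/lang_char.txt \
  --result_file exp/conformer/attention_rescoring/dt05_real_1ch.text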
78 changes: 78 additions & 0 deletions examples/chime4/s0/conf/train_conformer.yaml
@@ -0,0 +1,78 @@
# network architecture
# encoder related
encoder: conformer
encoder_conf:
    output_size: 512    # dimension of attention
    attention_heads: 8
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: true
    cnn_module_kernel: 15
    use_cnn_module: True
    activation_type: 'swish'
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false

dataset_conf:
    split_with_space: true
    filter_conf:
        max_length: 40960
        min_length: 0
        token_max_length: 200
        token_min_length: 1
    resample_conf:
        resample_rate: 16000
    speed_perturb: true
    fbank_conf:
        num_mel_bins: 80
        frame_shift: 10
        frame_length: 25
        dither: 0.1
    spec_aug: true
    spec_aug_conf:
        num_t_mask: 2
        num_f_mask: 2
        max_t: 40
        max_f: 10
    shuffle: true
    shuffle_conf:
        shuffle_size: 1500
    sort: true
    sort_conf:
        sort_size: 500  # sort_size should be less than shuffle_size
    batch_conf:
        batch_type: 'static' # static or dynamic
        batch_size: 8

grad_clip: 10
accum_grad: 4
max_epoch: 80
log_interval: 200

optim: adam
optim_conf:
    lr: 0.0005
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 20000
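
A config like this is consumed by WeNet's trainer. A minimal single-GPU launch sketch, assuming the stock wenet/bin/train.py and illustrative data paths (run.sh drives the real training stage); note that batch_size 8 with accum_grad 4 gives an effective batch of 32 utterances:

python wenet/bin/train.py --gpu 0 \
  --config conf/train_conformer.yaml \
  --data_type raw \
  --symbol_table data/dict/lang_char.txt \
  --train_data data/train/data.list \
  --cv_data data/dev/data.list \
  --model_dir exp/conformer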
22 changes: 22 additions & 0 deletions examples/chime4/s0/local/chime4_format_dir.sh
@@ -0,0 +1,22 @@
#!/usr/bin/env bash

# wujian@2020

set -eu

echo "$0: Formating chime4 data dir..."

track=isolated_1ch_track
data_dir=data/chime4

mkdir -p $data_dir/{train,dev}

cat $data_dir/tr05_{simu,real}_noisy/wav.scp $data_dir/tr05_orig_clean/wav.scp \
  $data_dir/train_si200_wsj1_clean/wav.scp | sort -k1 > $data_dir/train/wav.scp
cat $data_dir/tr05_{simu,real}_noisy/text $data_dir/tr05_orig_clean/text \
  $data_dir/train_si200_wsj1_clean/text | sort -k1 > $data_dir/train/text

cat $data_dir/dt05_{real,simu}_${track}/wav.scp | sort -k1 > $data_dir/dev/wav.scp
cat $data_dir/dt05_{real,simu}_${track}/text | sort -k1 > $data_dir/dev/text

echo "$0: Format $data_dir done"
28 changes: 28 additions & 0 deletions examples/chime4/s0/local/chime4_gen_wav.sh
@@ -0,0 +1,28 @@
#!/usr/bin/env bash

# wujian@2020

set -eu

[ $# -ne 2 ] && echo "Usage: $0 <data-dir> <dump-dir>" && exit 1

data_dir=$1
dump_dir=$2

mkdir -p $dump_dir

num_utts=$(cat $data_dir/wav.scp | wc -l)
echo "Orginal utterances (.wav + .wv1): $num_utts"

# cat $data_dir/wav.scp | grep "sph2pipe" | \
#   awk -v dir=$dump_dir '{printf("%s -f wav %s %s/%s.wav\n", $2, $5, dir, $1)}' | bash

cat $data_dir/wav.scp | grep -v "sph2pipe" > $data_dir/raw_wav.scp
find $dump_dir -name "*.wav" | awk -F '/' '{printf("%s %s\n", $NF, $0)}' | \
  sed 's:\.wav::' > $data_dir/sph_wav.scp

cat $data_dir/{raw_wav,sph_wav}.scp | sort -k1 > $data_dir/wav.scp
num_utts=$(cat $data_dir/wav.scp | wc -l)
echo "Wave utterances (.wav): $num_utts"

echo "$0: Generate wav => $dump_dir done"
124 changes: 124 additions & 0 deletions examples/chime4/s0/local/clean_wsj0_data_prep.sh
@@ -0,0 +1,124 @@
#!/usr/bin/env bash

# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.

# Modified from Kaldi's chime4 recipe

set -eu

dataset=chime4

. ./tools/parse_options.sh || exit 1;

if [ $# -ne 1 ]; then
  printf "\nUSAGE: %s <original WSJ0 corpus-directory>\n\n" `basename $0`
  echo "The argument should be the top-level WSJ corpus directory."
  echo "It is assumed that there will be a 'wsj0' and a 'wsj1' subdirectory"
  echo "within the top-level corpus directory."
  exit 1;
fi

wsj0=$1

srcdir=$PWD/data/chime4/local
dstdir=$PWD/data/$dataset
local=$PWD/local
utils=$PWD/utils
sph2pipe=sph2pipe

if [ ! `which sph2pipe` ]; then
  echo "Could not find sph2pipe, install it first..."
  mkdir -p exp && cd exp && wget https://www.openslr.org/resources/3/sph2pipe_v2.5.tar.gz
  tar -zxf sph2pipe_v2.5.tar.gz && cd sph2pipe_v2.5
  gcc -o sph2pipe *.c -lm && cd .. && rm -rf sph2pipe_v2.5.tar.gz
  sph2pipe=$PWD/sph2pipe_v2.5/sph2pipe
  cd ..
fi

mkdir -p $srcdir && cd $srcdir

# This version for SI-84
cat $wsj0/wsj0/doc/indices/train/tr_s_wv1.ndx \
  | $local/cstr_ndx2flist.pl $wsj0 | sort -u > tr05.flist

# Now for the test sets.
# $wsj0/wsj1/doc/indices/readme.doc
# describes all the different test sets.
# Note: each test-set seems to come in multiple versions depending
# on different vocabulary sizes, verbalized vs. non-verbalized
# pronunciations, etc. We use the largest vocab and non-verbalized
# pronunciations.
# The most normal one seems to be the "baseline 60k test set", which
# is h1_p0.

# Nov'92 (330 utts, 5k vocab)
cat $wsj0/wsj0/doc/indices/test/nvp/si_et_05.ndx | \
  $local/cstr_ndx2flist.pl $wsj0 | sort > et05.flist

# Note: we match ".wv1" case-insensitively, since the files sometimes get
# copied from the CDs with upcased names (possibly on older versions of
# the disks).
find $wsj0/wsj0/si_dt_05 -print | grep -i ".wv1" | sort > dt05.flist

# Finding the transcript files:
find -L $wsj0 -iname '*.dot' > dot_files.flist

# Convert the transcripts into our format (no normalization yet)
# adding suffix to utt_id
# 0 for clean condition
for x in tr05 et05 dt05; do
  $local/flist2scp.pl $x.flist | sort > ${x}_sph_tmp.scp
  cat ${x}_sph_tmp.scp | awk '{print $1}' \
    | $local/find_transcripts.pl dot_files.flist > ${x}_tmp.trans1
  cat ${x}_sph_tmp.scp | awk '{printf("%s %s\n", $1, $2);}' > ${x}_sph.scp
  cat ${x}_tmp.trans1 | awk '{printf("%s ", $1); for(i=2;i<=NF;i++) printf("%s ", $i); printf("\n");}' > ${x}.trans1
done

# Do some basic normalization steps. At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>";
for x in tr05 et05 dt05; do
  cat $x.trans1 | $local/normalize_transcript.pl $noiseword \
    | sort > $x.txt || exit 1;
done

# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.)
for x in tr05 et05 dt05; do
  awk -v cmd=$sph2pipe '{printf("%s %s -f wav %s |\n", $1, cmd, $2);}' ${x}_sph.scp > ${x}_wav.scp
done

if [ ! -f wsj0-train-spkrinfo.txt ] || [ `cat wsj0-train-spkrinfo.txt | wc -l` -ne 134 ]; then
  rm -f wsj0-train-spkrinfo.txt
  wget http://www.ldc.upenn.edu/Catalog/docs/LDC93S6A/wsj0-train-spkrinfo.txt \
    || ( echo "Getting wsj0-train-spkrinfo.txt from backup location" && \
      wget --no-check-certificate https://sourceforge.net/projects/kaldi/files/wsj0-train-spkrinfo.txt );
fi

if [ ! -f wsj0-train-spkrinfo.txt ]; then
  echo "Could not get the spkrinfo.txt file from the LDC website (it may have moved),"
  echo "and it seems to be missing from the training disks as well."
  echo "Everything else may have worked; we would only be missing gender info,"
  echo "which is needed solely for VTLN-related diagnostics anyway."
  exit 1
fi
# Note: wsj0-train-spkrinfo.txt doesn't seem to be on the disks but the
# LDC put it on the web. Perhaps it was accidentally omitted from the
# disks.

cat $wsj0/wsj0/doc/spkrinfo.txt \
    ./wsj0-train-spkrinfo.txt | \
  perl -ane 'tr/A-Z/a-z/; m/^;/ || print;' | \
  awk '{print $1, $2}' | grep -v -- -- | sort | uniq > spk2gender

# return to the recipe root directory
cd -

for x in et05 dt05 tr05; do
  mkdir -p $dstdir/${x}_orig_clean
  cp $srcdir/$x.txt $dstdir/${x}_orig_clean/text || exit 1
  cp $srcdir/${x}_wav.scp $dstdir/${x}_orig_clean/wav.scp || exit 1
done

echo "Data preparation succeeded"
73 changes: 73 additions & 0 deletions examples/chime4/s0/local/clean_wsj1_data_prep.sh
@@ -0,0 +1,73 @@
#!/usr/bin/env bash

# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.

set -eu


if [ $# -ne 1 ]; then
  echo "The argument should be the WSJ1 directory"
  exit 1;
fi

wsj1=$1
dir=$PWD/data/chime4/local
odir=$PWD/data/chime4
mkdir -p $dir
local=$PWD/local
sph2pipe=sph2pipe

if [ ! `which sph2pipe` ]; then
  echo "Could not find sph2pipe, install it first..."
  mkdir -p exp && cd exp && wget https://www.openslr.org/resources/3/sph2pipe_v2.5.tar.gz
  tar -zxf sph2pipe_v2.5.tar.gz && cd sph2pipe_v2.5
  gcc -o sph2pipe *.c -lm && cd .. && rm -rf sph2pipe_v2.5.tar.gz
  sph2pipe=$PWD/sph2pipe_v2.5/sph2pipe
  cd ..
fi

cd $dir
# This version for SI-200
cat $wsj1/13-34.1/wsj1/doc/indices/si_tr_s.ndx | \
  $local/ndx2flist.pl $wsj1/??-{?,??}.? | sort > train_si200.flist

nl=`cat train_si200.flist | wc -l`
[ "$nl" -eq 30278 ] || echo "Warning: expected 30278 lines in train_si200.flist, got $nl"

# Dev-set for Nov'93 (503 utts)
cat $wsj1/13-34.1/wsj1/doc/indices/h1_p0.ndx | \
  $local/ndx2flist.pl $wsj1/??-{?,??}.? | sort > test_dev93.flist

# Finding the transcript files:
for x in $wsj1/??-{?,??}.?; do find -L $x -iname '*.dot'; done > dot_files.flist

# Convert the transcripts into our format (no normalization yet)
for x in train_si200 test_dev93; do
  $local/flist2scp.pl $x.flist | sort > ${x}_sph.scp
  cat ${x}_sph.scp | awk '{print $1}' | $local/find_transcripts.pl dot_files.flist > $x.trans1
done

# Do some basic normalization steps. At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>";
for x in train_si200 test_dev93; do
  cat $x.trans1 | $local/normalize_transcript.pl $noiseword | sort > $x.txt || exit 1;
done

# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.)
for x in train_si200 test_dev93; do
  awk -v cmd=$sph2pipe '{printf("%s %s -f wav %s |\n", $1, cmd, $2);}' ${x}_sph.scp > ${x}_wav.scp
done

# return to the recipe root directory
cd -

for x in train_si200 test_dev93; do
  mkdir -p $odir/${x}_wsj1_clean
  cp $dir/$x.txt $odir/${x}_wsj1_clean/text || exit 1
  cp $dir/${x}_wav.scp $odir/${x}_wsj1_clean/wav.scp || exit 1
done

echo "Data preparation WSJ1 succeeded"
54 changes: 54 additions & 0 deletions examples/chime4/s0/local/cstr_ndx2flist.pl
@@ -0,0 +1,54 @@
#!/usr/bin/env perl

# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This is modified from the script in standard Kaldi recipe to account
# for the way the WSJ data is structured on the Edinburgh systems.
# - Arnab Ghoshal, 12/1/12

# This program takes as its standard input an .ndx file from the WSJ corpus that looks
# like this:
#;; File: tr_s_wv1.ndx, updated 04/26/94
#;;
#;; Index for WSJ0 SI-short Sennheiser training data
#;; Data is read WSJ sentences, Sennheiser mic.
#;; Contains 84 speakers X (~100 utts per speaker MIT/SRI and ~50 utts
#;; per speaker TI) = 7236 utts
#;;
#11_1_1:wsj0/si_tr_s/01i/01ic0201.wv1
#11_1_1:wsj0/si_tr_s/01i/01ic0202.wv1
#11_1_1:wsj0/si_tr_s/01i/01ic0203.wv1

# and as command-line argument it takes the names of the WSJ disk locations, e.g.:
# /group/corpora/public/wsjcam0/data on DICE machines.
# It outputs a list of absolute pathnames.

$wsj_dir = $ARGV[0];

while(<STDIN>){
  if(m/^;/){ next; } # Comment. Ignore it.
  else {
    m/^([0-9_]+):\s*(\S+)$/ || die "Could not parse line $_";
    $filename = $2; # as a subdirectory of the distributed disk.
    if ($filename !~ m/\.wv1$/) { $filename .= ".wv1"; }
    $filename = "$wsj_dir/$filename";
    if (-e $filename) {
      print "$filename\n";
    } else {
      print STDERR "File $filename found in the index but not on disk\n";
    }
  }
}
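
For illustration, with $wsj_dir set to a hypothetical /path/to/wsj, each non-comment index line maps to an absolute .wv1 path on stdout, while entries missing on disk are reported on stderr:

# stdin:  11_1_1:wsj0/si_tr_s/01i/01ic0201.wv1
# stdout: /path/to/wsj/wsj0/si_tr_s/01i/01ic0201.wv1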