[examples] add CHiME4 recipe (#746)
* init chime4 recipe

* update decoding recipe in run.sh

* fix tokenizer issue for char unit

* add initial results

* finalize recipe

* fix tab issues in run.sh

* fix trailing whitespace

* fix comments

* commit missing changes
funcwj authored Nov 19, 2021
1 parent 33eeff6 commit d01276a
Showing 22 changed files with 1,226 additions and 2 deletions.
12 changes: 12 additions & 0 deletions examples/chime4/s0/README.md
@@ -0,0 +1,12 @@
# Performance Record

## Conformer Result

* Feature info: dither + specaug + speed perturb
* Training info: lr 0.0005, batch size 8, 1 gpu, acc_grad 4, 80 epochs
* Decoding info: average_num 10

| decoding mode | dt05_real_1ch | dt05_simu_1ch | et05_real_1ch | et05_simu_1ch |
|:----------------------:|:-------------:|:-------------:|:-------------:|:-------------:|
| ctc_prefix_beam_search | 19.06% | 21.17% | 28.39% | 29.16% |
| attention_rescoring | 17.92% | 20.22% | 27.40% | 28.25% |
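
These WERs come from the usual WeNet flow of checkpoint averaging followed by decoding. A minimal sketch with the stock wenet/bin entry points; the paths and flag values below are illustrative assumptions, see run.sh for the exact invocation:

# average the last 10 checkpoints (average_num 10); exp/conformer is a hypothetical model dir
python wenet/bin/average_model.py --dst_model exp/conformer/avg_10.pt \
  --src_path exp/conformer --num 10
# decode one test set in one mode; repeat per set and per mode
python wenet/bin/recognize.py --mode attention_rescoring \
  --config exp/conformer/train.yaml \
  --checkpoint exp/conformer/avg_10.pt \
  --data_type raw \
  --test_data data/dt05_real_1ch/data.list \
  --dict data/dict/lang_char.txt \
  --result_file exp/conformer/attention_rescoring/dt05_real_1ch.text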
78 changes: 78 additions & 0 deletions examples/chime4/s0/conf/train_conformer.yaml
@@ -0,0 +1,78 @@
# network architecture
# encoder related
encoder: conformer
encoder_conf:
    output_size: 512    # dimension of attention
    attention_heads: 8
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.0
    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
    normalize_before: true
    cnn_module_kernel: 15
    use_cnn_module: True
    activation_type: 'swish'
    pos_enc_layer_type: 'rel_pos'
    selfattention_layer_type: 'rel_selfattn'

# decoder related
decoder: transformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.0
    src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false

dataset_conf:
    split_with_space: true
    filter_conf:
        max_length: 40960
        min_length: 0
        token_max_length: 200
        token_min_length: 1
    resample_conf:
        resample_rate: 16000
    speed_perturb: true
    fbank_conf:
        num_mel_bins: 80
        frame_shift: 10
        frame_length: 25
        dither: 0.1
    spec_aug: true
    spec_aug_conf:
        num_t_mask: 2
        num_f_mask: 2
        max_t: 40
        max_f: 10
    shuffle: true
    shuffle_conf:
        shuffle_size: 1500
    sort: true
    sort_conf:
        sort_size: 500  # sort_size should be less than shuffle_size
    batch_conf:
        batch_type: 'static' # static or dynamic
        batch_size: 8

grad_clip: 10
accum_grad: 4
max_epoch: 80
log_interval: 200

optim: adam
optim_conf:
    lr: 0.0005
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 20000
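
A config like this is consumed by WeNet's trainer. A minimal single-GPU launch sketch, assuming the stock wenet/bin/train.py and illustrative data paths (run.sh drives the real training stage); note that batch_size 8 with accum_grad 4 gives an effective batch of 32 utterances:

python wenet/bin/train.py --gpu 0 \
  --config conf/train_conformer.yaml \
  --data_type raw \
  --symbol_table data/dict/lang_char.txt \
  --train_data data/train/data.list \
  --cv_data data/dev/data.list \
  --model_dir exp/conformer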
22 changes: 22 additions & 0 deletions examples/chime4/s0/local/chime4_format_dir.sh
@@ -0,0 +1,22 @@
#!/usr/bin/env bash

# wujian@2020

set -eu

echo "$0: Formating chime4 data dir..."

track=isolated_1ch_track
data_dir=data/chime4

mkdir -p $data_dir/{train,dev}

cat $data_dir/tr05_{simu,real}_noisy/wav.scp $data_dir/tr05_orig_clean/wav.scp \
  $data_dir/train_si200_wsj1_clean/wav.scp | sort -k1 > $data_dir/train/wav.scp
cat $data_dir/tr05_{simu,real}_noisy/text $data_dir/tr05_orig_clean/text \
  $data_dir/train_si200_wsj1_clean/text | sort -k1 > $data_dir/train/text

cat $data_dir/dt05_{real,simu}_${track}/wav.scp | sort -k1 > $data_dir/dev/wav.scp
cat $data_dir/dt05_{real,simu}_${track}/text | sort -k1 > $data_dir/dev/text

echo "$0: Format $data_dir done"
28 changes: 28 additions & 0 deletions examples/chime4/s0/local/chime4_gen_wav.sh
@@ -0,0 +1,28 @@
#!/usr/bin/env bash

# wujian@2020

set -eu

[ $# -ne 2 ] && echo "Usage: $0 <data-dir> <dump-dir>" && exit 1

data_dir=$1
dump_dir=$2

mkdir -p $dump_dir

num_utts=$(cat $data_dir/wav.scp | wc -l)
echo "Orginal utterances (.wav + .wv1): $num_utts"

# cat $data_dir/wav.scp | grep "sph2pipe" | \
#   awk -v dir=$dump_dir '{printf("%s -f wav %s %s/%s.wav\n", $2, $5, dir, $1)}' | bash

cat $data_dir/wav.scp | grep -v "sph2pipe" > $data_dir/raw_wav.scp
find $dump_dir -name "*.wav" | awk -F '/' '{printf("%s %s\n", $NF, $0)}' | \
  sed 's:\.wav::' > $data_dir/sph_wav.scp

cat $data_dir/{raw_wav,sph_wav}.scp | sort -k1 > $data_dir/wav.scp
num_utts=$(cat $data_dir/wav.scp | wc -l)
echo "Wave utterances (.wav): $num_utts"

echo "$0: Generate wav => $dump_dir done"
124 changes: 124 additions & 0 deletions examples/chime4/s0/local/clean_wsj0_data_prep.sh
@@ -0,0 +1,124 @@
#!/usr/bin/env bash

# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.

# Modified from Kaldi's chime4 recipe

set -eu

dataset=chime4

. ./tools/parse_options.sh || exit 1;

if [ $# -ne 1 ]; then
  printf "\nUSAGE: %s <original WSJ0 corpus-directory>\n\n" `basename $0`
  echo "The argument should be the top-level WSJ corpus directory."
  echo "It is assumed that there will be a 'wsj0' and a 'wsj1' subdirectory"
  echo "within the top-level corpus directory."
  exit 1;
fi

wsj0=$1

srcdir=$PWD/data/chime4/local
dstdir=$PWD/data/$dataset
local=$PWD/local
utils=$PWD/utils
sph2pipe=sph2pipe

if [ ! `which sph2pipe` ]; then
  echo "Could not find sph2pipe, install it first..."
  mkdir -p exp && cd exp && wget https://www.openslr.org/resources/3/sph2pipe_v2.5.tar.gz
  tar -zxf sph2pipe_v2.5.tar.gz && cd sph2pipe_v2.5
  gcc -o sph2pipe *.c -lm && cd .. && rm -rf sph2pipe_v2.5.tar.gz
  sph2pipe=$PWD/sph2pipe_v2.5/sph2pipe
  cd ..
fi

mkdir -p $srcdir && cd $srcdir

# This version for SI-84
cat $wsj0/wsj0/doc/indices/train/tr_s_wv1.ndx \
  | $local/cstr_ndx2flist.pl $wsj0 | sort -u > tr05.flist

# Now for the test sets.
# $wsj0/wsj1/doc/indices/readme.doc
# describes all the different test sets.
# Note: each test-set seems to come in multiple versions depending
# on different vocabulary sizes, verbalized vs. non-verbalized
# pronunciations, etc. We use the largest vocab and non-verbalized
# pronunciations.
# The most normal one seems to be the "baseline 60k test set", which
# is h1_p0.

# Nov'92 (330 utts, 5k vocab)
cat $wsj0/wsj0/doc/indices/test/nvp/si_et_05.ndx | \
  $local/cstr_ndx2flist.pl $wsj0 | sort > et05.flist

# Note: we match ".wv1" case-insensitively, since the files sometimes get
# copied from the CDs with upcased names (possibly on older versions of
# the disks).
find $wsj0/wsj0/si_dt_05 -print | grep -i ".wv1" | sort > dt05.flist

# Finding the transcript files:
find -L $wsj0 -iname '*.dot' > dot_files.flist

# Convert the transcripts into our format (no normalization yet)
# adding suffix to utt_id
# 0 for clean condition
for x in tr05 et05 dt05; do
  $local/flist2scp.pl $x.flist | sort > ${x}_sph_tmp.scp
  cat ${x}_sph_tmp.scp | awk '{print $1}' \
    | $local/find_transcripts.pl dot_files.flist > ${x}_tmp.trans1
  cat ${x}_sph_tmp.scp | awk '{printf("%s %s\n", $1, $2);}' > ${x}_sph.scp
  cat ${x}_tmp.trans1 | awk '{printf("%s ", $1); for(i=2;i<=NF;i++) printf("%s ", $i); printf("\n");}' > ${x}.trans1
done

# Do some basic normalization steps. At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>";
for x in tr05 et05 dt05; do
  cat $x.trans1 | $local/normalize_transcript.pl $noiseword \
    | sort > $x.txt || exit 1;
done

# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.)
for x in tr05 et05 dt05; do
  awk -v cmd=$sph2pipe '{printf("%s %s -f wav %s |\n", $1, cmd, $2);}' ${x}_sph.scp > ${x}_wav.scp
done

if [ ! -f wsj0-train-spkrinfo.txt ] || [ `cat wsj0-train-spkrinfo.txt | wc -l` -ne 134 ]; then
  rm -f wsj0-train-spkrinfo.txt
  wget http://www.ldc.upenn.edu/Catalog/docs/LDC93S6A/wsj0-train-spkrinfo.txt \
    || ( echo "Getting wsj0-train-spkrinfo.txt from backup location" && \
      wget --no-check-certificate https://sourceforge.net/projects/kaldi/files/wsj0-train-spkrinfo.txt );
fi

if [ ! -f wsj0-train-spkrinfo.txt ]; then
  echo "Could not get the spkrinfo.txt file from the LDC website (it may have moved),"
  echo "and it seems to be missing from the training disks as well."
  echo "Everything else may have worked; we would only be missing gender info,"
  echo "which is needed solely for VTLN-related diagnostics anyway."
  exit 1
fi
# Note: wsj0-train-spkrinfo.txt doesn't seem to be on the disks but the
# LDC put it on the web. Perhaps it was accidentally omitted from the
# disks.

cat $wsj0/wsj0/doc/spkrinfo.txt \
    ./wsj0-train-spkrinfo.txt | \
  perl -ane 'tr/A-Z/a-z/; m/^;/ || print;' | \
  awk '{print $1, $2}' | grep -v -- -- | sort | uniq > spk2gender

# return to the recipe root directory
cd -

for x in et05 dt05 tr05; do
  mkdir -p $dstdir/${x}_orig_clean
  cp $srcdir/$x.txt $dstdir/${x}_orig_clean/text || exit 1
  cp $srcdir/${x}_wav.scp $dstdir/${x}_orig_clean/wav.scp || exit 1
done

echo "Data preparation succeeded"
73 changes: 73 additions & 0 deletions examples/chime4/s0/local/clean_wsj1_data_prep.sh
@@ -0,0 +1,73 @@
#!/usr/bin/env bash

# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.

set -eu


if [ $# -ne 1 ]; then
  echo "The argument should be the WSJ1 directory"
  exit 1;
fi

wsj1=$1
dir=$PWD/data/chime4/local
odir=$PWD/data/chime4
mkdir -p $dir
local=$PWD/local
sph2pipe=sph2pipe

if [ ! `which sph2pipe` ]; then
  echo "Could not find sph2pipe, install it first..."
  mkdir -p exp && cd exp && wget https://www.openslr.org/resources/3/sph2pipe_v2.5.tar.gz
  tar -zxf sph2pipe_v2.5.tar.gz && cd sph2pipe_v2.5
  gcc -o sph2pipe *.c -lm && cd .. && rm -rf sph2pipe_v2.5.tar.gz
  sph2pipe=$PWD/sph2pipe_v2.5/sph2pipe
  cd ..
fi

cd $dir
# This version for SI-200
cat $wsj1/13-34.1/wsj1/doc/indices/si_tr_s.ndx | \
  $local/ndx2flist.pl $wsj1/??-{?,??}.? | sort > train_si200.flist

nl=`cat train_si200.flist | wc -l`
[ "$nl" -eq 30278 ] || echo "Warning: expected 30278 lines in train_si200.flist, got $nl"

# Dev-set for Nov'93 (503 utts)
cat $wsj1/13-34.1/wsj1/doc/indices/h1_p0.ndx | \
  $local/ndx2flist.pl $wsj1/??-{?,??}.? | sort > test_dev93.flist

# Finding the transcript files:
for x in $wsj1/??-{?,??}.?; do find -L $x -iname '*.dot'; done > dot_files.flist

# Convert the transcripts into our format (no normalization yet)
for x in train_si200 test_dev93; do
  $local/flist2scp.pl $x.flist | sort > ${x}_sph.scp
  cat ${x}_sph.scp | awk '{print $1}' | $local/find_transcripts.pl dot_files.flist > $x.trans1
done

# Do some basic normalization steps. At this point we don't remove OOVs--
# that will be done inside the training scripts, as we'd like to make the
# data-preparation stage independent of the specific lexicon used.
noiseword="<NOISE>";
for x in train_si200 test_dev93; do
  cat $x.trans1 | $local/normalize_transcript.pl $noiseword | sort > $x.txt || exit 1;
done

# Create scp's with wav's. (the wv1 in the distribution is not really wav, it is sph.)
for x in train_si200 test_dev93; do
  awk -v cmd=$sph2pipe '{printf("%s %s -f wav %s |\n", $1, cmd, $2);}' ${x}_sph.scp > ${x}_wav.scp
done

# return to the recipe root directory
cd -

for x in train_si200 test_dev93; do
  mkdir -p $odir/${x}_wsj1_clean
  cp $dir/$x.txt $odir/${x}_wsj1_clean/text || exit 1
  cp $dir/${x}_wav.scp $odir/${x}_wsj1_clean/wav.scp || exit 1
done

echo "Data preparation WSJ1 succeeded"
54 changes: 54 additions & 0 deletions examples/chime4/s0/local/cstr_ndx2flist.pl
@@ -0,0 +1,54 @@
#!/usr/bin/env perl

# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.

# This is modified from the script in standard Kaldi recipe to account
# for the way the WSJ data is structured on the Edinburgh systems.
# - Arnab Ghoshal, 12/1/12

# This program takes as its standard input an .ndx file from the WSJ corpus that looks
# like this:
#;; File: tr_s_wv1.ndx, updated 04/26/94
#;;
#;; Index for WSJ0 SI-short Sennheiser training data
#;; Data is read WSJ sentences, Sennheiser mic.
#;; Contains 84 speakers X (~100 utts per speaker MIT/SRI and ~50 utts
#;; per speaker TI) = 7236 utts
#;;
#11_1_1:wsj0/si_tr_s/01i/01ic0201.wv1
#11_1_1:wsj0/si_tr_s/01i/01ic0202.wv1
#11_1_1:wsj0/si_tr_s/01i/01ic0203.wv1

# and as command-line argument it takes the names of the WSJ disk locations, e.g.:
# /group/corpora/public/wsjcam0/data on DICE machines.
# It outputs a list of absolute pathnames.

$wsj_dir = $ARGV[0];

while(<STDIN>){
  if(m/^;/){ next; } # Comment. Ignore it.
  else {
    m/^([0-9_]+):\s*(\S+)$/ || die "Could not parse line $_";
    $filename = $2; # as a subdirectory of the distributed disk.
    if ($filename !~ m/\.wv1$/) { $filename .= ".wv1"; }
    $filename = "$wsj_dir/$filename";
    if (-e $filename) {
      print "$filename\n";
    } else {
      print STDERR "File $filename found in the index but not on disk\n";
    }
  }
}
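
For illustration, with $wsj_dir set to a hypothetical /path/to/wsj, each non-comment index line maps to an absolute .wv1 path on stdout, while entries missing on disk are reported on stderr:

# stdin:  11_1_1:wsj0/si_tr_s/01i/01ic0201.wv1
# stdout: /path/to/wsj/wsj0/si_tr_s/01i/01ic0201.wv1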