[examples] add common voice french recipe (wenet-e2e#1069)

* France dataset * add readme.md * delete space * add ENDL * Update tmp.py * test_1 * debug_1 * debug_2 * debug_3 * debug_4 * no tmp.py * add simple tmp.py * only tmp.py * add chmod tmp.py * delete zhushi tmp.py * debug_5 * debug_6 * debug_7 * debug_8 * debug_9 * debug_10 * debug_11 * debug_12 * debug_13 * debug_14 * debug_15 * debug_16 * debug_17 * debug_18 * debug_19 * test_1 * test_2 * test_3 * test_4 * t1 * t2 * t3 * t4 * t5 * t6 * t7 * fixed bug * fixed print bug * fixed no_import error * fixed last bug * add soft link * change dir name * fix_1
Mddct · Apr 30, 2022 · 282e5f7 · 282e5f7
1 parent ddd8ed6
commit 282e5f7
Show file tree

Hide file tree

Showing 9 changed files with 426 additions and 0 deletions.
diff --git a/examples/commonvoice/fr/README.md b/examples/commonvoice/fr/README.md
@@ -0,0 +1,16 @@
+# Performance Record
+# Requirements: ffmpeg and pandas must be installed
+## Conformer Result
+
+* Feature info: dither + specaug + speed perturb
+* Training info: lr 0.0005, warmup_steps 20000, batch size 8, 3 GPUs, 30 epochs
+* Decoding info: average_num 20
+
+
+
+|     decoding mode      | test (wer) |
+| :--------------------: | :---------: |
+|   ctc_greedy_search    |   16.12%    |
+| ctc_prefix_beam_search |   16.07%    |
+|       attention        |   13.56%    |
+|  attention_rescoring   |   14.01%    |
diff --git a/examples/commonvoice/fr/conf/train_conformer.yaml b/examples/commonvoice/fr/conf/train_conformer.yaml
@@ -0,0 +1,78 @@
+# network architecture
+# encoder related
+encoder: conformer
+encoder_conf:
+    output_size: 512    # dimension of attention
+    attention_heads: 8
+    linear_units: 2048  # the number of units of position-wise feed forward
+    num_blocks: 12      # the number of encoder blocks
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
+    normalize_before: true
+    cnn_module_kernel: 15
+    use_cnn_module: True
+    activation_type: 'swish'
+    pos_enc_layer_type: 'rel_pos'
+    selfattention_layer_type: 'rel_selfattn'
+
+# decoder related
+decoder: transformer
+decoder_conf:
+    attention_heads: 8
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0
+
+# hybrid CTC/attention
+model_conf:
+    ctc_weight: 0.3
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false
+
+dataset_conf:
+    split_with_space: true
+    filter_conf:
+        max_length: 40960
+        min_length: 0
+        token_max_length: 200
+        token_min_length: 1
+    resample_conf:
+        resample_rate: 16000
+    speed_perturb: true
+    fbank_conf:
+        num_mel_bins: 80
+        frame_shift: 10
+        frame_length: 25
+        dither: 0.1
+    spec_aug: true
+    spec_aug_conf:
+        num_t_mask: 2
+        num_f_mask: 2
+        max_t: 40
+        max_f: 10
+    shuffle: true
+    shuffle_conf:
+        shuffle_size: 1500
+    sort: true
+    sort_conf:
+        sort_size: 500  # sort_size should be less than shuffle_size
+    batch_conf:
+        batch_type: 'dynamic' # static or dynamic
+        batch_size: 8
+
+grad_clip: 10
+accum_grad: 4
+max_epoch: 30
+log_interval: 200
+
+optim: adam
+optim_conf:
+    lr: 0.0005
+scheduler: warmuplr     # pytorch v1.1.0+ required
+scheduler_conf:
+    warmup_steps: 20000
diff --git a/examples/commonvoice/fr/local/create_scp_text.py b/examples/commonvoice/fr/local/create_scp_text.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import sys
+import os
+import re
+def process(src_str):
+    punc = '~`!#$%^&*()_+-=|\';":/.,?><~·！@#￥%……&*（）——+-=“：’；、。，？》《{}'
+    return re.sub(r"[{0}]+".format(punc), "", src_str).upper()
+
+if __name__ == '__main__':
+    src_dir = sys.argv[1]
+    tsv_file = src_dir + "/" + sys.argv[2] + ".tsv"
+    output_dir = sys.argv[3]
+    for file_path in os.listdir(src_dir + "/clips"):
+        if(os.path.exists(src_dir + "/wavs/" + file_path.split('.')[0] + ".wav")):
+            continue
+        t_str = src_dir + "/clips/" + file_path
+        tt_str = src_dir + "/wavs/" + file_path.split('.')[0] + ".wav"
+        os.system("ffmpeg -i {0} -ac 1 -ar 16000 -f wav {1}".format(t_str, tt_str))
+    import pandas
+    tsv_content = pandas.read_csv(tsv_file, sep="\t")
+    path_list = tsv_content["path"]
+    sentence = tsv_content["sentence"]
+    client_list = tsv_content["client_id"]
+    scp_file = open(output_dir + "/wav.scp", "w")
+    text_file = open(output_dir + "/text", "w")
+    utt2spk = open(output_dir + "/utt2spk", "w")
+    for i in range(len(path_list)):
+        temple_str = path_list[i].split(".")[0]
+        now_sentence = process(sentence[i])
+        wav_file = src_dir + "/wavs/" + temple_str + ".wav"
+        scp_file.writelines(temple_str + " " + wav_file + "\n")
+        text_file.writelines(temple_str + " " + now_sentence + "\n")
+        utt2spk.writelines(temple_str + " " + client_list[i] + "\n")
+    scp_file.close()
+    text_file.close()
+    utt2spk.close()
diff --git a/examples/commonvoice/fr/local/download_data.sh b/examples/commonvoice/fr/local/download_data.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+if [ $# -le 1 ]; then
+    echo "Args_Error:Two parameters are required."
+    exit 1;
+fi
+download_path=$1
+data_France=$2
+wget -O ${download_path}/tmp.zip https://mozilla-common-voice-datasets.s3.dualstack.us-west-2.amazonaws.com/cv-corpus-8.0-2022-01-19/cv-corpus-8.0-2022-01-19-fr.tar.gz
+tar -xvf ${download_path}/tmp.zip  -C ${data_France}
+rm -rf ${download_path}/tmp.zip
diff --git a/examples/commonvoice/fr/local/prepare_data.sh b/examples/commonvoice/fr/local/prepare_data.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+if [ $# -le 0 ]; then
+    echo "Argument should be France src directory, see ../run.sh for example."
+    exit 1;
+fi
+dir=`pwd`/data
+local=`pwd`/local
+src_path=$1
+if [ ! -d ${dir} ]; then
+    mkdir ${dir}
+  else
+    rm -rf ${dir}
+    mkdir ${dir}
+fi
+
+for x in train dev test; do
+    if [ ! ${dir}/${x} ]; then
+        mkdir ${dir}/${x}
+    else
+        rm -rf ${dir}/${x}
+        mkdir ${dir}/${x}
+    fi
+done
+
+if [ ! -d ${src_path}/wavs ]; then
+    mkdir ${src_path}/wavs
+fi
+for x in train dev test; do
+    python3 ${local}/create_scp_text.py  ${src_path} ${x} ${dir}/${x}
+done
diff --git a/examples/commonvoice/fr/path.sh b/examples/commonvoice/fr/path.sh
@@ -0,0 +1,8 @@
+# Environment setup for the recipe.  Every path is derived from $PWD, so the
+# values are only correct when this file is evaluated from the recipe
+# directory itself.
+export WENET_DIR="$PWD/../../.."
+export BUILD_DIR="${WENET_DIR}/runtime/server/x86/build"
+export OPENFST_PREFIX_DIR="${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix"
+export PATH="$PWD:${BUILD_DIR}:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH"
+
+# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+# WENET_DIR (three levels up) is added in front of the original "../../"
+# entry, which is kept for backward compatibility.
+# NOTE(review): assumes the Python package lives at the repository root -- confirm.
+# "${PYTHONPATH:-}" keeps this line working under "set -u" when PYTHONPATH is unset.
+export PYTHONPATH="${WENET_DIR}:../../:${PYTHONPATH:-}"