From 282e5f7f795b9272cbd04bc96cf1b5959d90bde7 Mon Sep 17 00:00:00 2001
From: wrz1999 <59393703+wrz1999@users.noreply.github.com>
Date: Sat, 30 Apr 2022 22:33:38 +0800
Subject: [PATCH] [examples] add common voice french recipe (#1069)

* France dataset

* add readme.md

* delete space

* add ENDL

* Update tmp.py

* test_1

* debug_1

* debug_2

* debug_3

* debug_4

* no tmp.py

* add simple tmp.py

* only tmp.py

* add chmod tmp.py

* delete zhushi  tmp.py

* debug_5

* debug_6

* debug_7

* debug_8

* debug_9

* debug_10

* debug_11

* debug_12

* debug_13

* debug_14

* debug_15

* debug_16

* debug_17

* debug_18

* debug_19

* test_1

* test_2

* test_3

* test_4

* t1

* t2

* t3

* t4

* t5

* t6

* t7

* fixed bug

* fixed print bug

* fixed no_import error

* fixed last bug

* add soft link

* change dir name

* fix_1
---
 examples/commonvoice/fr/README.md             |  16 ++
 .../commonvoice/fr/conf/train_conformer.yaml  |  78 ++++++
 .../commonvoice/fr/local/create_scp_text.py   |  38 +++
 .../commonvoice/fr/local/download_data.sh     |  10 +
 examples/commonvoice/fr/local/prepare_data.sh |  30 +++
 examples/commonvoice/fr/path.sh               |   8 +
 examples/commonvoice/fr/run.sh                | 244 ++++++++++++++++++
 examples/commonvoice/fr/tools                 |   1 +
 examples/commonvoice/fr/wenet                 |   1 +
 9 files changed, 426 insertions(+)
 create mode 100644 examples/commonvoice/fr/README.md
 create mode 100644 examples/commonvoice/fr/conf/train_conformer.yaml
 create mode 100755 examples/commonvoice/fr/local/create_scp_text.py
 create mode 100755 examples/commonvoice/fr/local/download_data.sh
 create mode 100755 examples/commonvoice/fr/local/prepare_data.sh
 create mode 100644 examples/commonvoice/fr/path.sh
 create mode 100644 examples/commonvoice/fr/run.sh
 create mode 120000 examples/commonvoice/fr/tools
 create mode 120000 examples/commonvoice/fr/wenet

diff --git a/examples/commonvoice/fr/README.md b/examples/commonvoice/fr/README.md
new file mode 100644
index 000000000..853415bf3
--- /dev/null
+++ b/examples/commonvoice/fr/README.md
@@ -0,0 +1,16 @@
+# Performance Record
+# Requirements: ffmpeg and pandas must be installed!
+## Conformer Result
+
+* Feature info: dither + specaug + speed perturb
+* Training info: lr 0.0005, warmup_steps 20000 batch size 8, 3 gpu, 30 epochs
+* Decoding info: average_num 20
+
+
+
+|     decoding mode      | test (wer) |
+| :--------------------: | :---------: |
+|   ctc_greedy_search    |   16.12%    |
+| ctc_prefix_beam_search |   16.07%    |
+|       attention        |   13.56%    |
+|  attention_rescoring   |   14.01%    |
\ No newline at end of file
diff --git a/examples/commonvoice/fr/conf/train_conformer.yaml b/examples/commonvoice/fr/conf/train_conformer.yaml
new file mode 100644
index 000000000..1e20f5822
--- /dev/null
+++ b/examples/commonvoice/fr/conf/train_conformer.yaml
@@ -0,0 +1,78 @@
+# network architecture
+# encoder related
+encoder: conformer
+encoder_conf:
+    output_size: 512    # dimension of attention
+    attention_heads: 8
+    linear_units: 2048  # the number of units of position-wise feed forward
+    num_blocks: 12      # the number of encoder blocks
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
+    normalize_before: true
+    cnn_module_kernel: 15
+    use_cnn_module: True
+    activation_type: 'swish'
+    pos_enc_layer_type: 'rel_pos'
+    selfattention_layer_type: 'rel_selfattn'
+
+# decoder related
+decoder: transformer
+decoder_conf:
+    attention_heads: 8
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0
+
+# hybrid CTC/attention
+model_conf:
+    ctc_weight: 0.3
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false
+
+dataset_conf:
+    split_with_space: true
+    filter_conf:
+        max_length: 40960
+        min_length: 0
+        token_max_length: 200
+        token_min_length: 1
+    resample_conf:
+        resample_rate: 16000
+    speed_perturb: true
+    fbank_conf:
+        num_mel_bins: 80
+        frame_shift: 10
+        frame_length: 25
+        dither: 0.1
+    spec_aug: true
+    spec_aug_conf:
+        num_t_mask: 2
+        num_f_mask: 2
+        max_t: 40
+        max_f: 10
+    shuffle: true
+    shuffle_conf:
+        shuffle_size: 1500
+    sort: true
+    sort_conf:
+        sort_size: 500  # sort_size should be less than shuffle_size
+    batch_conf:
+        batch_type: 'dynamic' # static or dynamic
+        batch_size: 8
+
+grad_clip: 10
+accum_grad: 4
+max_epoch: 30
+log_interval: 200
+
+optim: adam
+optim_conf:
+    lr: 0.0005
+scheduler: warmuplr     # pytorch v1.1.0+ required
+scheduler_conf:
+    warmup_steps: 20000
diff --git a/examples/commonvoice/fr/local/create_scp_text.py b/examples/commonvoice/fr/local/create_scp_text.py
new file mode 100755
index 000000000..b3d94276e
--- /dev/null
+++ b/examples/commonvoice/fr/local/create_scp_text.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""Create the wav.scp, text and utt2spk files for one Common Voice split.
+
+Usage: create_scp_text.py <src_dir> <split> <output_dir>
+
+Every file in <src_dir>/clips is converted with ffmpeg to a 16 kHz mono wav
+in <src_dir>/wavs (already converted clips are skipped). <src_dir>/<split>.tsv
+is then read with pandas and the three files are written to <output_dir>.
+"""
+
+import sys
+import os
+import re
+import subprocess
+
+# Characters stripped from every transcript before it is upper-cased.
+# NOTE(review): inside the character class built below, each "+-=" run is
+# parsed by `re` as the range '+'..'=', which also covers the ASCII digits,
+# so digits are removed as well. Kept unchanged so generated transcripts stay
+# the same; confirm whether dropping digits is intended.
+_PUNC = '~`!#$%^&*()_+-=|\';":/.,?><~·！@#￥%……&*（）——+-=“：’；、。，？》《{}'
+_PUNC_RE = re.compile(r"[{0}]+".format(_PUNC))
+
+
+def process(src_str):
+    """Return src_str upper-cased, with every match of _PUNC_RE removed."""
+    return _PUNC_RE.sub("", src_str).upper()
+
+
+def _convert_clips(src_dir):
+    """Convert the files in <src_dir>/clips to 16 kHz mono wavs in <src_dir>/wavs.
+
+    Clips whose wav already exists are skipped, so calling this once per split
+    does not repeat the conversion.
+    """
+    clips_dir = src_dir + "/clips"
+    for file_name in os.listdir(clips_dir):
+        wav_path = src_dir + "/wavs/" + file_name.split('.')[0] + ".wav"
+        if os.path.exists(wav_path):
+            continue
+        # An argument list (no shell) keeps paths that contain spaces or shell
+        # metacharacters intact.
+        status = subprocess.call(["ffmpeg", "-i", clips_dir + "/" + file_name,
+                                  "-ac", "1", "-ar", "16000", "-f", "wav",
+                                  wav_path])
+        if status != 0:
+            sys.stderr.write("ffmpeg failed for {0}\n".format(file_name))
+            # Remove a partial output so that a rerun converts this clip again
+            # instead of skipping it.
+            if os.path.exists(wav_path):
+                os.remove(wav_path)
+
+
+def _write_split_files(src_dir, split, output_dir):
+    """Write wav.scp, text and utt2spk to output_dir from <src_dir>/<split>.tsv."""
+    import pandas  # third-party dependency, only needed for this step
+    table = pandas.read_csv(src_dir + "/" + split + ".tsv", sep="\t")
+    rows = zip(table["path"], table["sentence"], table["client_id"])
+    # Transcripts may contain non-ASCII characters: write UTF-8 explicitly
+    # rather than relying on the locale's default encoding.
+    with open(output_dir + "/wav.scp", "w", encoding="utf-8") as scp_file, \
+            open(output_dir + "/text", "w", encoding="utf-8") as text_file, \
+            open(output_dir + "/utt2spk", "w", encoding="utf-8") as utt2spk:
+        for path, sentence, client_id in rows:
+            # pandas yields a float NaN for an empty cell; such a row cannot be
+            # normalised or concatenated, so the whole utterance is skipped.
+            if not all(isinstance(v, str) for v in (path, sentence, client_id)):
+                continue
+            utt_id = path.split(".")[0]
+            wav_file = src_dir + "/wavs/" + utt_id + ".wav"
+            scp_file.write(utt_id + " " + wav_file + "\n")
+            text_file.write(utt_id + " " + process(sentence) + "\n")
+            utt2spk.write(utt_id + " " + client_id + "\n")
+
+
+if __name__ == '__main__':
+    if len(sys.argv) < 4:
+        sys.exit("Usage: create_scp_text.py <src_dir> <split> <output_dir>")
+    _convert_clips(sys.argv[1])
+    _write_split_files(sys.argv[1], sys.argv[2], sys.argv[3])
diff --git a/examples/commonvoice/fr/local/download_data.sh b/examples/commonvoice/fr/local/download_data.sh
new file mode 100755
index 000000000..1dc1914a5
--- /dev/null
+++ b/examples/commonvoice/fr/local/download_data.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+# Download the Common Voice 8.0 French archive and unpack it.
+# Usage: download_data.sh <download_path> <extract_dir>
+#   <download_path>: directory that temporarily holds the downloaded archive
+#   <extract_dir>:   directory the archive is unpacked into (tar -C)
+# NOTE(review): run.sh passes ${french_data} as <extract_dir> and later reads
+# ${french_data}/fr; confirm this matches the archive's top-level layout.
+if [ $# -le 1 ]; then
+    echo "Args_Error:Two parameters are required."
+    exit 1;
+fi
+download_path=$1
+data_France=$2
+# The download is a gzip-compressed tar archive despite the historical .zip name.
+archive=${download_path}/tmp.zip
+# wget -O and tar -C both require their target directories to exist.
+mkdir -p "${download_path}" "${data_France}" || exit 1
+# Abort on a failed download instead of handing a broken archive to tar.
+wget -O "${archive}" https://mozilla-common-voice-datasets.s3.dualstack.us-west-2.amazonaws.com/cv-corpus-8.0-2022-01-19/cv-corpus-8.0-2022-01-19-fr.tar.gz || exit 1
+tar -xvf "${archive}" -C "${data_France}" || exit 1
+rm -rf "${archive}"
\ No newline at end of file
diff --git a/examples/commonvoice/fr/local/prepare_data.sh b/examples/commonvoice/fr/local/prepare_data.sh
new file mode 100755
index 000000000..5e561a556
--- /dev/null
+++ b/examples/commonvoice/fr/local/prepare_data.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+# Build data/{train,dev,test} with create_scp_text.py from a corpus directory.
+# Usage: local/prepare_data.sh <src_path>   (run from the recipe root, see run.sh)
+# WARNING: the existing `pwd`/data directory is deleted and recreated.
+if [ $# -le 0 ]; then
+    echo "Argument should be France src directory, see ../run.sh for example."
+    exit 1;
+fi
+dir=`pwd`/data
+local=`pwd`/local
+src_path=$1
+
+# Start from an empty data directory with one sub-directory per split.
+rm -rf "${dir}"
+for x in train dev test; do
+    mkdir -p "${dir}/${x}" || exit 1
+done
+
+# Converted wavs live under the corpus directory, so they are not removed with data/.
+mkdir -p "${src_path}/wavs" || exit 1
+for x in train dev test; do
+    python3 "${local}/create_scp_text.py" "${src_path}" "${x}" "${dir}/${x}" || exit 1
+done
diff --git a/examples/commonvoice/fr/path.sh b/examples/commonvoice/fr/path.sh
new file mode 100644
index 000000000..3fee16c72
--- /dev/null
+++ b/examples/commonvoice/fr/path.sh
@@ -0,0 +1,9 @@
+export WENET_DIR=$PWD/../../..
+export BUILD_DIR=${WENET_DIR}/runtime/server/x86/build
+export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix
+export PATH=$PWD:${BUILD_DIR}:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH
+
+# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+# Put the repository root (the directory that contains the wenet package) on the module path.
+export PYTHONPATH=${WENET_DIR}:$PYTHONPATH
diff --git a/examples/commonvoice/fr/run.sh b/examples/commonvoice/fr/run.sh
new file mode 100644
index 000000000..788a3015e
--- /dev/null
+++ b/examples/commonvoice/fr/run.sh
@@ -0,0 +1,244 @@
+#!/bin/bash
+
+# Copyright 2019 Mobvoi Inc. All Rights Reserved.
+. ./path.sh || exit 1;
+
+# Use this to control how many GPUs you use. It's 1-GPU training if you specify
+# just 1 GPU, otherwise it's multi-GPU training based on DDP in PyTorch
+export CUDA_VISIBLE_DEVICES="0,1,2"
+# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl
+# communication. More details can be found in
+# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html
+# export NCCL_SOCKET_IFNAME=ens4f1
+export NCCL_DEBUG=INFO
+stage=0     # start from 0 if you need to start from data download
+stop_stage=2
+# The num of nodes or machines used for multi-machine training
+# Default 1 for single machine/node
+# NFS will be needed if you want run multi-machine training
+num_nodes=1
+# The rank of each node or machine, range from 0 to num_nodes -1
+# The first node/machine sets node_rank 0, the second one sets node_rank 1
+# the third one set node_rank 2, and so on. Default 0
+node_rank=0
+# data
+download_path=/root/autodl-tmp
+french_data=/root/autodl-tmp/cv-corpus-8.0-2022-01-19
+# path to save preprocessed data
+# export data=data
+. ./path.sh
+. ./tools/parse_options.sh || exit 1
+
+nj=16
+
+# data_type can be `raw` or `shard`. Typically, raw is used for small dataset,
+# `shard` is used for large dataset which is over 1k hours, and `shard` is
+# faster on reading data and training.
+data_type=raw
+num_utts_per_shard=1000
+
+train_set=train
+# Optional train_config
+# 1. conf/train_transformer.yaml: Standard transformer
+# 2. conf/train_conformer.yaml: Standard conformer
+# 3. conf/train_unified_conformer.yaml: Unified dynamic chunk causal conformer
+# 4. conf/train_unified_transformer.yaml: Unified dynamic chunk transformer
+# 5. conf/train_conformer_no_pos.yaml: Conformer without relative positional encoding
+# 6. conf/train_u2++_conformer.yaml: U2++ conformer
+# 7. conf/train_u2++_transformer.yaml: U2++ transformer
+train_config=conf/train_conformer.yaml
+cmvn=true
+dir=exp/conformer
+checkpoint=
+nbpe=5000
+
+# use average_checkpoint will get better result
+average_checkpoint=true
+decode_checkpoint=$dir/final.pt
+average_num=20
+#decode_modes="ctc_greedy_search ctc_prefix_beam_search attention attention_rescoring"
+decode_modes="attention attention_rescoring"
+
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+    # Optional: only runs when stage is set to -1 or lower (the default above is 0).
+    echo "stage -1: Data download"
+    echo "download Dataset!"
+    local/download_data.sh ${download_path} ${french_data}
+    echo "Finish stage -1"
+fi
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+
+    echo "stage 0: Data preparation"
+    local/prepare_data.sh ${french_data}/fr
+    echo "Finish stage 0"
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    echo "stage 1: compute global cmvn"
+    # compute cmvn
+    python tools/compute_cmvn_stats.py --num_workers 1 --train_config $train_config \
+        --in_scp data/${train_set}/wav.scp \
+        --out_cmvn data/${train_set}/global_cmvn
+    echo "Finish stage 1"
+fi
+
+
+bpemode=unigram
+dict=data/lang_char_/${train_set}_${bpemode}${nbpe}_units.txt
+bpemodel=data/lang_char_/${train_set}_${bpemode}${nbpe}
+echo "dictionary: ${dict}"
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+  ### Task dependent. You have to check non-linguistic symbols used in the corpus.
+  echo "stage 2: Dictionary and Json Data Preparation"
+  mkdir -p data/lang_char_/
+  echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC
+  echo "<unk> 1" >> ${dict} # <unk> must be 1
+
+  # we borrowed these code and scripts which are related bpe from ESPnet.
+  cut -f 2- -d" " data/${train_set}/text > data/lang_char_/input.txt
+  tools/spm_train --input=data/lang_char_/input.txt --vocab_size=${nbpe} \
+    --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000
+  tools/spm_encode --model=${bpemodel}.model --output_format=piece \
+    < data/lang_char_/input.txt | \
+    tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+1}' >> ${dict}
+  num_token=$(cat $dict | wc -l)
+  echo "<sos/eos> $num_token" >> $dict # <eos>
+  wc -l ${dict}
+fi
+
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+  echo "stage 3: Prepare data, prepare requried format"
+  for x in dev test ${train_set}; do
+    if [ $data_type == "shard" ]; then
+      python tools/make_shard_list.py --num_utts_per_shard $num_utts_per_shard \
+        --num_threads 16 data/$x/wav.scp data/$x/text \
+        $(realpath data/$x/shards) data/$x/data.list
+    else
+      python tools/make_raw_list.py data/$x/wav.scp data/$x/text \
+        data/$x/data.list
+    fi
+  done
+  echo "Finish stage 3"
+fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+  mkdir -p $dir
+  # You have to rm `INIT_FILE` manually when you resume or restart a
+  # multi-machine training.
+  INIT_FILE=$dir/ddp_init
+  init_method=file://$(readlink -f $INIT_FILE)
+  echo "$0: init method is $init_method"
+  num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+  # Use "nccl" if it works, otherwise use "gloo"
+  dist_backend="gloo"
+  world_size=`expr $num_gpus \* $num_nodes`
+  echo "total gpus is: $world_size"
+  cmvn_opts=
+  $cmvn && cp data/${train_set}/global_cmvn $dir
+  $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn"
+
+  # train.py rewrite $train_config to $dir/train.yaml with model input
+  # and output dimension, and $dir/train.yaml will be used for inference
+  # and export.
+  for ((i = 0; i < $num_gpus; ++i)); do
+  {
+    gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
+    # Rank of each gpu/process used for knowing whether it is
+    # the master of a worker.
+    rank=`expr $node_rank \* $num_gpus + $i`
+    python wenet/bin/train.py --gpu $gpu_id \
+      --config $train_config \
+      --data_type $data_type \
+      --symbol_table $dict \
+      --bpe_model $bpemodel.model \
+      --train_data data/$train_set/data.list \
+      --cv_data data/dev/data.list \
+      ${checkpoint:+--checkpoint $checkpoint} \
+      --model_dir $dir \
+      --ddp.init_method $init_method \
+      --ddp.world_size $world_size \
+      --ddp.rank $rank \
+      --ddp.dist_backend $dist_backend \
+      --num_workers 1 \
+      $cmvn_opts \
+      --pin_memory
+  } &
+  done
+  wait
+fi
+
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+  # Test model, please specify the model you want to test by --checkpoint
+  cmvn_opts=
+  $cmvn && cmvn_opts="--cmvn data/${train_set}/global_cmvn"
+  # Optionally average ${average_num} checkpoints (selected with --val_best) into one model
+  mkdir -p $dir/test
+  if [ ${average_checkpoint} == true ]; then
+      decode_checkpoint=$dir/avg_${average_num}.pt
+      echo "do model average and final checkpoint is $decode_checkpoint"
+      python wenet/bin/average_model.py \
+          --dst_model $decode_checkpoint \
+          --src_path $dir  \
+          --num ${average_num} \
+          --val_best
+  fi
+  # Specify decoding_chunk_size if it's a unified dynamic chunk trained model
+  # -1 for full chunk
+  decoding_chunk_size=
+  ctc_weight=0.5
+  # Assign the GPUs listed in CUDA_VISIBLE_DEVICES round-robin, one per decode mode
+  num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+  idx=0
+  for mode in ${decode_modes}; do
+    {
+      {
+        test_dir=$dir/test_${mode}
+        mkdir -p $test_dir
+        gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$idx+1])
+        python wenet/bin/recognize.py --gpu $gpu_id \
+          --mode $mode \
+          --config $dir/train.yaml \
+          --data_type "raw" \
+          --bpe_model $bpemodel.model \
+          --test_data data/test/data.list \
+          --checkpoint $decode_checkpoint \
+          --beam_size 20 \
+          --batch_size 1 \
+          --penalty 0.0 \
+          --dict $dict \
+          --result_file $test_dir/text_bpe \
+          --ctc_weight $ctc_weight \
+          ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size}
+
+        cut -f2- -d " " $test_dir/text_bpe > $test_dir/text_bpe_value_tmp
+        cut -f1 -d " " $test_dir/text_bpe > $test_dir/text_bpe_key_tmp
+
+         tools/spm_decode --model=${bpemodel}.model --input_format=piece \
+           < $test_dir/text_bpe_value_tmp | sed -e "s/▁/ /g" > $test_dir/text_value
+        #sed -e "s/▁/ /g" $test_dir/text_bpe_value_tmp > $test_dir/text_value
+        paste -d " " $test_dir/text_bpe_key_tmp $test_dir/text_value > $test_dir/text
+        # a raw version wer without refining process
+        python tools/compute-wer.py --char=1 --v=1 \
+          data/test/text $test_dir/text > $test_dir/wer
+      } &
+
+      ((idx+=1))
+      if [ $idx -eq $num_gpus ]; then
+        idx=0
+      fi
+    }
+    done
+
+  wait
+fi
+
+if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
+  # Export the best model you want
+  python wenet/bin/export_jit.py \
+    --config $dir/train.yaml \
+    --checkpoint $dir/avg_${average_num}.pt \
+    --output_file $dir/final.zip
+fi
+
diff --git a/examples/commonvoice/fr/tools b/examples/commonvoice/fr/tools
new file mode 120000
index 000000000..570c2efd6
--- /dev/null
+++ b/examples/commonvoice/fr/tools
@@ -0,0 +1 @@
+../../../tools/
\ No newline at end of file
diff --git a/examples/commonvoice/fr/wenet b/examples/commonvoice/fr/wenet
new file mode 120000
index 000000000..5f46eee4d
--- /dev/null
+++ b/examples/commonvoice/fr/wenet
@@ -0,0 +1 @@
+../../../wenet/
\ No newline at end of file