From 282e5f7f795b9272cbd04bc96cf1b5959d90bde7 Mon Sep 17 00:00:00 2001 From: wrz1999 <59393703+wrz1999@users.noreply.github.com> Date: Sat, 30 Apr 2022 22:33:38 +0800 Subject: [PATCH] [examples] add common voice french recipe (#1069) * France dataset * add readme.md * delete space * add ENDL * Update tmp.py * test_1 * debug_1 * debug_2 * debug_3 * debug_4 * no tmp.py * add simple tmp.py * only tmp.py * add chmod tmp.py * delete zhushi tmp.py * debug_5 * debug_6 * debug_7 * debug_8 * debug_9 * debug_10 * debug_11 * debug_12 * debug_13 * debug_14 * debug_15 * debug_16 * debug_17 * debug_18 * debug_19 * test_1 * test_2 * test_3 * test_4 * t1 * t2 * t3 * t4 * t5 * t6 * t7 * fixed bug * fixed print bug * fixed no_import error * fixed last bug * add soft link * change dir name * fix_1 --- examples/commonvoice/fr/README.md | 16 ++ .../commonvoice/fr/conf/train_conformer.yaml | 78 ++++++ .../commonvoice/fr/local/create_scp_text.py | 38 +++ .../commonvoice/fr/local/download_data.sh | 10 + examples/commonvoice/fr/local/prepare_data.sh | 30 +++ examples/commonvoice/fr/path.sh | 8 + examples/commonvoice/fr/run.sh | 244 ++++++++++++++++++ examples/commonvoice/fr/tools | 1 + examples/commonvoice/fr/wenet | 1 + 9 files changed, 426 insertions(+) create mode 100644 examples/commonvoice/fr/README.md create mode 100644 examples/commonvoice/fr/conf/train_conformer.yaml create mode 100755 examples/commonvoice/fr/local/create_scp_text.py create mode 100755 examples/commonvoice/fr/local/download_data.sh create mode 100755 examples/commonvoice/fr/local/prepare_data.sh create mode 100644 examples/commonvoice/fr/path.sh create mode 100644 examples/commonvoice/fr/run.sh create mode 120000 examples/commonvoice/fr/tools create mode 120000 examples/commonvoice/fr/wenet diff --git a/examples/commonvoice/fr/README.md b/examples/commonvoice/fr/README.md new file mode 100644 index 000000000..853415bf3 --- /dev/null +++ b/examples/commonvoice/fr/README.md @@ -0,0 +1,16 @@ +# Performance Record +# 
Requirements: ffmpeg and pandas must be installed. +## Conformer Result + +* Feature info: dither + specaug + speed perturb +* Training info: lr 0.0005, warmup_steps 20000, batch size 8, 3 gpu, 30 epochs +* Decoding info: average_num 20 + + + +| decoding mode | test (wer) | +| :--------------------: | :---------: | +| ctc_greedy_search | 16.12% | +| ctc_prefix_beam_search | 16.07% | +| attention | 13.56% | +| attention_rescoring | 14.01% | \ No newline at end of file diff --git a/examples/commonvoice/fr/conf/train_conformer.yaml b/examples/commonvoice/fr/conf/train_conformer.yaml new file mode 100644 index 000000000..1e20f5822 --- /dev/null +++ b/examples/commonvoice/fr/conf/train_conformer.yaml @@ -0,0 +1,78 @@ +# network architecture +# encoder related +encoder: conformer +encoder_conf: + output_size: 512 # dimension of attention + attention_heads: 8 + linear_units: 2048 # the number of units of position-wise feed forward + num_blocks: 12 # the number of encoder blocks + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.0 + input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8 + normalize_before: true + cnn_module_kernel: 15 + use_cnn_module: True + activation_type: 'swish' + pos_enc_layer_type: 'rel_pos' + selfattention_layer_type: 'rel_selfattn' + +# decoder related +decoder: transformer +decoder_conf: + attention_heads: 8 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + self_attention_dropout_rate: 0.0 + src_attention_dropout_rate: 0.0 + +# hybrid CTC/attention +model_conf: + ctc_weight: 0.3 + lsm_weight: 0.1 # label smoothing option + length_normalized_loss: false + +dataset_conf: + split_with_space: true + filter_conf: + max_length: 40960 + min_length: 0 + token_max_length: 200 + token_min_length: 1 + resample_conf: + resample_rate: 16000 + speed_perturb: true + fbank_conf: + num_mel_bins: 80 + frame_shift: 10 + frame_length: 25 + dither: 0.1 + spec_aug: true + 
spec_aug_conf: + num_t_mask: 2 + num_f_mask: 2 + max_t: 40 + max_f: 10 + shuffle: true + shuffle_conf: + shuffle_size: 1500 + sort: true + sort_conf: + sort_size: 500 # sort_size should be less than shuffle_size + batch_conf: + batch_type: 'dynamic' # static or dynamic + batch_size: 8 + +grad_clip: 10 +accum_grad: 4 +max_epoch: 30 +log_interval: 200 + +optim: adam +optim_conf: + lr: 0.0005 +scheduler: warmuplr # pytorch v1.1.0+ required +scheduler_conf: + warmup_steps: 20000 diff --git a/examples/commonvoice/fr/local/create_scp_text.py b/examples/commonvoice/fr/local/create_scp_text.py new file mode 100755 index 000000000..b3d94276e --- /dev/null +++ b/examples/commonvoice/fr/local/create_scp_text.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import sys +import os +import re +def process(src_str): + punc = '~`!#$%^&*()_+-=|\';":/.,?><~·!@#¥%……&*()——+-=“:’;、。,?》《{}' + return re.sub(r"[{0}]+".format(punc), "", src_str).upper() + +if __name__ == '__main__': + src_dir = sys.argv[1] + tsv_file = src_dir + "/" + sys.argv[2] + ".tsv" + output_dir = sys.argv[3] + for file_path in os.listdir(src_dir + "/clips"): + if(os.path.exists(src_dir + "/wavs/" + file_path.split('.')[0] + ".wav")): + continue + t_str = src_dir + "/clips/" + file_path + tt_str = src_dir + "/wavs/" + file_path.split('.')[0] + ".wav" + os.system("ffmpeg -i {0} -ac 1 -ar 16000 -f wav {1}".format(t_str, tt_str)) + import pandas + tsv_content = pandas.read_csv(tsv_file, sep="\t") + path_list = tsv_content["path"] + sentence = tsv_content["sentence"] + client_list = tsv_content["client_id"] + scp_file = open(output_dir + "/wav.scp", "w") + text_file = open(output_dir + "/text", "w") + utt2spk = open(output_dir + "/utt2spk", "w") + for i in range(len(path_list)): + temple_str = path_list[i].split(".")[0] + now_sentence = process(sentence[i]) + wav_file = src_dir + "/wavs/" + temple_str + ".wav" + scp_file.writelines(temple_str + " " + wav_file + "\n") + 
text_file.writelines(temple_str + " " + now_sentence + "\n") + utt2spk.writelines(temple_str + " " + client_list[i] + "\n") + scp_file.close() + text_file.close() + utt2spk.close() diff --git a/examples/commonvoice/fr/local/download_data.sh b/examples/commonvoice/fr/local/download_data.sh new file mode 100755 index 000000000..1dc1914a5 --- /dev/null +++ b/examples/commonvoice/fr/local/download_data.sh @@ -0,0 +1,10 @@ +#!/usr/bin/env bash +if [ $# -le 1 ]; then + echo "Args_Error:Two parameters are required." + exit 1; +fi +download_path=$1 +data_France=$2 +wget -O ${download_path}/tmp.zip https://mozilla-common-voice-datasets.s3.dualstack.us-west-2.amazonaws.com/cv-corpus-8.0-2022-01-19/cv-corpus-8.0-2022-01-19-fr.tar.gz +tar -xvf ${download_path}/tmp.zip -C ${data_France} +rm -rf ${download_path}/tmp.zip \ No newline at end of file diff --git a/examples/commonvoice/fr/local/prepare_data.sh b/examples/commonvoice/fr/local/prepare_data.sh new file mode 100755 index 000000000..5e561a556 --- /dev/null +++ b/examples/commonvoice/fr/local/prepare_data.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash +if [ $# -le 0 ]; then + echo "Argument should be France src directory, see ../run.sh for example." + exit 1; +fi +dir=`pwd`/data +local=`pwd`/local +src_path=$1 +if [ ! -d ${dir} ]; then + mkdir ${dir} + else + rm -rf ${dir} + mkdir ${dir} +fi + +for x in train dev test; do + if [ ! ${dir}/${x} ]; then + mkdir ${dir}/${x} + else + rm -rf ${dir}/${x} + mkdir ${dir}/${x} + fi +done + +if [ ! -d ${src_path}/wavs ]; then + mkdir ${src_path}/wavs +fi +for x in train dev test; do + python3 ${local}/create_scp_text.py ${src_path} ${x} ${dir}/${x} +done diff --git a/examples/commonvoice/fr/path.sh b/examples/commonvoice/fr/path.sh new file mode 100644 index 000000000..3fee16c72 --- /dev/null +++ b/examples/commonvoice/fr/path.sh @@ -0,0 +1,8 @@ +export WENET_DIR=$PWD/../../.. 
+export BUILD_DIR=${WENET_DIR}/runtime/server/x86/build +export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix +export PATH=$PWD:${BUILD_DIR}:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH + +# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C +export PYTHONIOENCODING=UTF-8 +export PYTHONPATH=../../:$PYTHONPATH diff --git a/examples/commonvoice/fr/run.sh b/examples/commonvoice/fr/run.sh new file mode 100644 index 000000000..788a3015e --- /dev/null +++ b/examples/commonvoice/fr/run.sh @@ -0,0 +1,244 @@ +#!/bin/bash + +# Copyright 2019 Mobvoi Inc. All Rights Reserved. +. ./path.sh || exit 1; + +# Use this to control how many GPUs you use. It is 1-GPU training if you specify +# just 1 GPU; otherwise it is multi-GPU training based on DDP in PyTorch. +export CUDA_VISIBLE_DEVICES="0,1,2" +# The NCCL_SOCKET_IFNAME variable specifies which IP interface to use for nccl +# communication. More details can be found in +# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html +# export NCCL_SOCKET_IFNAME=ens4f1 +export NCCL_DEBUG=INFO +stage=0 # start from -1 if you need to start from data download +stop_stage=2 +# The num of nodes or machines used for multi-machine training +# Default 1 for single machine/node +# NFS will be needed if you want to run multi-machine training +num_nodes=1 +# The rank of each node or machine, range from 0 to num_nodes -1 +# The first node/machine sets node_rank 0, the second one sets node_rank 1 +# the third one sets node_rank 2, and so on. Default 0 +node_rank=0 +# data +download_path=/root/autodl-tmp +french_data=/root/autodl-tmp/cv-corpus-8.0-2022-01-19 +# path to save preprocessed data +# export data=data +. ./path.sh +. ./tools/parse_options.sh || exit 1 + +nj=16 + +# data_type can be `raw` or `shard`. Typically, raw is used for small dataset, +# `shard` is used for large dataset which is over 1k hours, and `shard` is +# faster on reading data and training. 
+data_type=raw +num_utts_per_shard=1000 + +train_set=train +# Optional train_config +# 1. conf/train_transformer.yaml: Standard transformer +# 2. conf/train_conformer.yaml: Standard conformer +# 3. conf/train_unified_conformer.yaml: Unified dynamic chunk causal conformer +# 4. conf/train_unified_transformer.yaml: Unified dynamic chunk transformer +# 5. conf/train_conformer_no_pos.yaml: Conformer without relative positional encoding +# 6. conf/train_u2++_conformer.yaml: U2++ conformer +# 7. conf/train_u2++_transformer.yaml: U2++ transformer +train_config=conf/train_conformer.yaml +cmvn=true +dir=exp/conformer +checkpoint= +nbpe=5000 + +# use average_checkpoint will get better result +average_checkpoint=true +decode_checkpoint=$dir/final.pt +average_num=20 +#decode_modes="ctc_greedy_search ctc_prefix_beam_search attention attention_rescoring" +decode_modes="attention attention_rescoring" + +if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then + + echo "stage -1: Data download" + echo "download Dataset!" + local/download_data.sh ${download_path} ${french_data} + echo "Finish stage 0" +fi + +if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then + + echo "stage 0: Data preparation" + local/prepare_data.sh ${french_data}/fr + echo "Finish stage 0" +fi + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + echo "stage 1: compute global cmvn" + # compute cmvn + python tools/compute_cmvn_stats.py --num_workers 1 --train_config $train_config \ + --in_scp data/${train_set}/wav.scp \ + --out_cmvn data/${train_set}/global_cmvn + echo "Finish stage 1" +fi + + +bpemode=unigram +dict=data/lang_char_/${train_set}_${bpemode}${nbpe}_units.txt +bpemodel=data/lang_char_/${train_set}_${bpemode}${nbpe} +echo "dictionary: ${dict}" +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + ### Task dependent. You have to check non-linguistic symbols used in the corpus. 
+ echo "stage 2: Dictionary and Json Data Preparation" + mkdir -p data/lang_char_/ + echo "<blank> 0" > ${dict} # 0 will be used for "blank" in CTC + echo "<unk> 1" >> ${dict} # <unk> must be 1 + + # We borrowed this BPE-related code and these scripts from ESPnet. + cut -f 2- -d" " data/${train_set}/text > data/lang_char_/input.txt + tools/spm_train --input=data/lang_char_/input.txt --vocab_size=${nbpe} \ + --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000 + tools/spm_encode --model=${bpemodel}.model --output_format=piece \ + < data/lang_char_/input.txt | \ + tr ' ' '\n' | sort | uniq | awk '{print $0 " " NR+1}' >> ${dict} + num_token=$(cat $dict | wc -l) + echo "<sos/eos> $num_token" >> $dict # <eos> + wc -l ${dict} +fi + + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + echo "stage 3: Prepare data, prepare requried format" + for x in dev test ${train_set}; do + if [ $data_type == "shard" ]; then + python tools/make_shard_list.py --num_utts_per_shard $num_utts_per_shard \ + --num_threads 16 data/$x/wav.scp data/$x/text \ + $(realpath data/$x/shards) data/$x/data.list + else + python tools/make_raw_list.py data/$x/wav.scp data/$x/text \ + data/$x/data.list + fi + done + echo "Finish stage 3" +fi + +if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + mkdir -p $dir + # You have to rm `INIT_FILE` manually when you resume or restart a + # multi-machine training. 
+ INIT_FILE=$dir/ddp_init + init_method=file://$(readlink -f $INIT_FILE) + echo "$0: init method is $init_method" + num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') + # Use "nccl" if it works, otherwise use "gloo" + dist_backend="gloo" + world_size=`expr $num_gpus \* $num_nodes` + echo "total gpus is: $world_size" + cmvn_opts= + $cmvn && cp data/${train_set}/global_cmvn $dir + $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" + + # train.py rewrites $train_config to $dir/train.yaml with model input + # and output dimension, and $dir/train.yaml will be used for inference + # and export. + for ((i = 0; i < $num_gpus; ++i)); do + { + gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1]) + # Rank of each gpu/process used for knowing whether it is + # the master or a worker. + rank=`expr $node_rank \* $num_gpus + $i` + python wenet/bin/train.py --gpu $gpu_id \ + --config $train_config \ + --data_type $data_type \ + --symbol_table $dict \ + --bpe_model $bpemodel.model \ + --train_data data/$train_set/data.list \ + --cv_data data/dev/data.list \ + ${checkpoint:+--checkpoint $checkpoint} \ + --model_dir $dir \ + --ddp.init_method $init_method \ + --ddp.world_size $world_size \ + --ddp.rank $rank \ + --ddp.dist_backend $dist_backend \ + --num_workers 1 \ + $cmvn_opts \ + --pin_memory + } & + done + wait +fi + +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + # Test model, please specify the model you want to test by --checkpoint + cmvn_opts= + $cmvn && cmvn_opts="--cmvn data/${train_set}/global_cmvn" + # TODO, Add model average here + mkdir -p $dir/test + if [ ${average_checkpoint} == true ]; then + decode_checkpoint=$dir/avg_${average_num}.pt + echo "do model average and final checkpoint is $decode_checkpoint" + python wenet/bin/average_model.py \ + --dst_model $decode_checkpoint \ + --src_path $dir \ + --num ${average_num} \ + --val_best + fi + # Specify decoding_chunk_size if it's a unified dynamic chunk trained model + # -1 for full chunk + 
decoding_chunk_size= + ctc_weight=0.5 + # Polling GPU id begin with index 0 + num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') + idx=0 + for mode in ${decode_modes}; do + { + { + test_dir=$dir/test_${mode} + mkdir -p $test_dir + gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$idx+1]) + python wenet/bin/recognize.py --gpu 0 \ + --mode $mode \ + --config $dir/train.yaml \ + --data_type "raw" \ + --bpe_model $bpemodel.model \ + --test_data data/test/data.list \ + --checkpoint $decode_checkpoint \ + --beam_size 20 \ + --batch_size 1 \ + --penalty 0.0 \ + --dict $dict \ + --result_file $test_dir/text_bpe \ + --ctc_weight $ctc_weight \ + ${decoding_chunk_size:+--decoding_chunk_size $decoding_chunk_size} + + cut -f2- -d " " $test_dir/text_bpe > $test_dir/text_bpe_value_tmp + cut -f1 -d " " $test_dir/text_bpe > $test_dir/text_bpe_key_tmp + + tools/spm_decode --model=${bpemodel}.model --input_format=piece \ + < $test_dir/text_bpe_value_tmp | sed -e "s/▁/ /g" > $test_dir/text_value + #sed -e "s/▁/ /g" $test_dir/text_bpe_value_tmp > $test_dir/text_value + paste -d " " $test_dir/text_bpe_key_tmp $test_dir/text_value > $test_dir/text + # a raw version wer without refining processs + python tools/compute-wer.py --char=1 --v=1 \ + data/test/text $test_dir/text > $test_dir/wer + } & + + ((idx+=1)) + if [ $idx -eq $num_gpus ]; then + idx=0 + fi + } + done + + wait +fi + +if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then + # Export the best model you want + python wenet/bin/export_jit.py \ + --config $dir/train.yaml \ + --checkpoint $dir/avg_${average_num}.pt \ + --output_file $dir/final.zip +fi + diff --git a/examples/commonvoice/fr/tools b/examples/commonvoice/fr/tools new file mode 120000 index 000000000..570c2efd6 --- /dev/null +++ b/examples/commonvoice/fr/tools @@ -0,0 +1 @@ +../../../tools/ \ No newline at end of file diff --git a/examples/commonvoice/fr/wenet b/examples/commonvoice/fr/wenet new file mode 120000 index 000000000..5f46eee4d --- 
/dev/null +++ b/examples/commonvoice/fr/wenet @@ -0,0 +1 @@ +../../../wenet/ \ No newline at end of file