Skip to content

Commit

Permalink
[examples] add common voice french recipe (wenet-e2e#1069)
Browse files Browse the repository at this point in the history
* France dataset

* add readme.md

* delete space

* add ENDL

* Update tmp.py

* test_1

* debug_1

* debug_2

* debug_3

* debug_4

* no tmp.py

* add simple tmp.py

* only tmp.py

* add chmod tmp.py

* delete zhushi  tmp.py

* debug_5

* debug_6

* debug_7

* debug_8

* debug_9

* debug_10

* debug_11

* debug_12

* debug_13

* debug_14

* debug_15

* debug_16

* debug_17

* debug_18

* debug_19

* test_1

* test_2

* test_3

* test_4

* t1

* t2

* t3

* t4

* t5

* t6

* t7

* fixed bug

* fixed print bug

* fixed no_import error

* fixed last bug

* add soft link

* change dir name

* fix_1
  • Loading branch information
wrz1999 authored Apr 30, 2022
1 parent ddd8ed6 commit 282e5f7
Show file tree
Hide file tree
Showing 9 changed files with 426 additions and 0 deletions.
16 changes: 16 additions & 0 deletions examples/commonvoice/fr/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Performance Record
# Requirements: ffmpeg and pandas must be installed
## Conformer Result

* Feature info: dither + specaug + speed perturb
* Training info: lr 0.0005, warmup_steps 20000, batch size 8, 3 GPUs, 30 epochs
* Decoding info: average_num 20



| decoding mode | test (wer) |
| :--------------------: | :---------: |
| ctc_greedy_search | 16.12% |
| ctc_prefix_beam_search | 16.07% |
| attention | 13.56% |
| attention_rescoring | 14.01% |
78 changes: 78 additions & 0 deletions examples/commonvoice/fr/conf/train_conformer.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# network architecture
# encoder related
encoder: conformer
encoder_conf:
output_size: 512 # dimension of attention
attention_heads: 8
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: true
cnn_module_kernel: 15
use_cnn_module: True
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'

# decoder related
decoder: transformer
decoder_conf:
attention_heads: 8
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0

# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false

dataset_conf:
split_with_space: true
filter_conf:
max_length: 40960
min_length: 0
token_max_length: 200
token_min_length: 1
resample_conf:
resample_rate: 16000
speed_perturb: true
fbank_conf:
num_mel_bins: 80
frame_shift: 10
frame_length: 25
dither: 0.1
spec_aug: true
spec_aug_conf:
num_t_mask: 2
num_f_mask: 2
max_t: 40
max_f: 10
shuffle: true
shuffle_conf:
shuffle_size: 1500
sort: true
sort_conf:
sort_size: 500 # sort_size should be less than shuffle_size
batch_conf:
batch_type: 'dynamic' # static or dynamic
batch_size: 8

grad_clip: 10
accum_grad: 4
max_epoch: 30
log_interval: 200

optim: adam
optim_conf:
lr: 0.0005
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 20000
38 changes: 38 additions & 0 deletions examples/commonvoice/fr/local/create_scp_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import sys
import os
import re
def process(src_str):
    """Strip punctuation from a transcript and upper-case the result.

    Args:
        src_str: the raw sentence text.

    Returns:
        ``src_str`` with every character listed in ``punc`` removed and the
        remaining text converted to upper case.
    """
    punc = '~`!#$%^&*()_+-=|\';":/.,?><~·!@#¥%……&*()——+-=“:’;、。,?》《{}'
    # The set is escaped before being placed inside "[...]" so that every
    # character is matched literally. Unescaped, each "+-=" run is parsed as
    # the range '+'..'=' which silently deletes the digits 0-9 as well.
    return re.sub(r"[{0}]+".format(re.escape(punc)), "", src_str).upper()

if __name__ == '__main__':
    # Usage: create_scp_text.py <src_dir> <split> <output_dir>
    #   src_dir:    directory holding "clips/" and "<split>.tsv"; converted
    #               audio is written to "<src_dir>/wavs/"
    #   split:      basename of the TSV file to read (without ".tsv")
    #   output_dir: directory receiving "wav.scp", "text" and "utt2spk"
    import subprocess

    import pandas

    src_dir = sys.argv[1]
    tsv_file = src_dir + "/" + sys.argv[2] + ".tsv"
    output_dir = sys.argv[3]
    clips_dir = src_dir + "/clips"
    wavs_dir = src_dir + "/wavs"
    os.makedirs(wavs_dir, exist_ok=True)

    # Convert every clip that has no wav yet to 16 kHz mono wav.
    for file_name in os.listdir(clips_dir):
        clip_path = clips_dir + "/" + file_name
        wav_path = wavs_dir + "/" + file_name.split('.')[0] + ".wav"
        if os.path.exists(wav_path):
            continue
        # An argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact.
        status = subprocess.call(
            ["ffmpeg", "-i", clip_path, "-ac", "1", "-ar", "16000",
             "-f", "wav", wav_path])
        if status != 0:
            sys.stderr.write(
                "Warning: ffmpeg failed on {0}\n".format(clip_path))
            # Remove a partial output so the next run retries this clip
            # instead of skipping it because the wav "exists".
            if os.path.exists(wav_path):
                os.remove(wav_path)

    tsv_content = pandas.read_csv(tsv_file, sep="\t")
    # Explicit UTF-8: the text is written through open(), which does not
    # honour PYTHONIOENCODING. The with-block closes all three files even if
    # a row raises.
    with open(output_dir + "/wav.scp", "w", encoding="utf-8") as scp_file, \
            open(output_dir + "/text", "w", encoding="utf-8") as text_file, \
            open(output_dir + "/utt2spk", "w", encoding="utf-8") as utt2spk:
        for clip_name, raw_sentence, client_id in zip(
                tsv_content["path"],
                tsv_content["sentence"],
                tsv_content["client_id"]):
            # The utterance id is the clip file name up to its first dot.
            utt_id = clip_name.split(".")[0]
            now_sentence = process(raw_sentence)
            wav_file = wavs_dir + "/" + utt_id + ".wav"
            scp_file.write(utt_id + " " + wav_file + "\n")
            text_file.write(utt_id + " " + now_sentence + "\n")
            utt2spk.write(utt_id + " " + client_id + "\n")
10 changes: 10 additions & 0 deletions examples/commonvoice/fr/local/download_data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/usr/bin/env bash
# Download the Common Voice 8.0 French archive and unpack it.
# Usage: download_data.sh <download_path> <data_France>
#   download_path: directory in which the archive is stored temporarily
#   data_France:   directory into which the archive is extracted
if [ $# -le 1 ]; then
  echo "Args_Error:Two parameters are required."
  exit 1;
fi
download_path=$1
data_France=$2
# The download is a .tar.gz; the temporary file keeps its original
# "tmp.zip" name and is read with tar below.
archive="${download_path}/tmp.zip"

# Make sure both target directories exist before writing into them.
mkdir -p "${download_path}" "${data_France}" || exit 1

if ! wget -O "${archive}" https://mozilla-common-voice-datasets.s3.dualstack.us-west-2.amazonaws.com/cv-corpus-8.0-2022-01-19/cv-corpus-8.0-2022-01-19-fr.tar.gz; then
  echo "Download_Error: wget failed." >&2
  rm -rf "${archive}"
  exit 1
fi

tar -xvf "${archive}" -C "${data_France}"
status=$?
# The archive is always removed; the exit status reports whether tar succeeded.
rm -rf "${archive}"
exit ${status}
30 changes: 30 additions & 0 deletions examples/commonvoice/fr/local/prepare_data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/usr/bin/env bash
# Recreate ./data/{train,dev,test} and fill each one by running
# local/create_scp_text.py on the matching split.
# Usage: prepare_data.sh <src_path>
#   src_path: source directory of the French dataset; a "wavs" directory is
#             created inside it if missing.
if [ $# -le 0 ]; then
  echo "Argument should be France src directory, see ../run.sh for example."
  exit 1;
fi
dir=$(pwd)/data
local_dir=$(pwd)/local
src_path=$1

# Always start from an empty data directory. The previous per-split test
# "[ ! ${dir}/${x} ]" lacked -d, so it was always false; since ${dir} is
# wiped here anyway, a plain mkdir -p gives the same end state.
rm -rf "${dir}"
for x in train dev test; do
  mkdir -p "${dir}/${x}" || exit 1
done

mkdir -p "${src_path}/wavs" || exit 1

for x in train dev test; do
  # Stop at the first failing split instead of leaving partial data behind.
  python3 "${local_dir}/create_scp_text.py" "${src_path}" "${x}" "${dir}/${x}" || exit 1
done
8 changes: 8 additions & 0 deletions examples/commonvoice/fr/path.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Environment setup for this recipe; paths are relative to the directory the
# script is sourced from ($PWD).
export WENET_DIR=$PWD/../../..
export BUILD_DIR=${WENET_DIR}/runtime/server/x86/build
export OPENFST_PREFIX_DIR=${BUILD_DIR}/../fc_base/openfst-subbuild/openfst-populate-prefix
export PATH=$PWD:${BUILD_DIR}:${BUILD_DIR}/kaldi:${OPENFST_PREFIX_DIR}/bin:$PATH

# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
# WENET_DIR is three levels up, while the previous entry "../../" is only two
# levels up. Put WENET_DIR first so both variables agree; the old relative
# entry is kept for backward compatibility.
export PYTHONPATH=${WENET_DIR}:../../:$PYTHONPATH
Loading

0 comments on commit 282e5f7

Please sign in to comment.