From f4b172fd9090a24b4d47d3ba56fd32176dec4add Mon Sep 17 00:00:00 2001
From: ftshijt
Date: Tue, 5 Dec 2023 04:33:01 -0500
Subject: [PATCH 01/40] fix setup

---
 egs2/TEMPLATE/spk1/setup.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/egs2/TEMPLATE/spk1/setup.sh b/egs2/TEMPLATE/spk1/setup.sh
index ee6e1ce2f02..5db1e4416cc 100755
--- a/egs2/TEMPLATE/spk1/setup.sh
+++ b/egs2/TEMPLATE/spk1/setup.sh
@@ -41,7 +41,7 @@ done

# Symlinks to TEMPLATE/spk1
-for f in sv.sh path.sh scripts pyscripts; do
+for f in spk.sh path.sh scripts pyscripts; do
    target=../../TEMPLATE/spk1/"${f}"
    ln -sf "${target}" "${dir}"
    targets+="${dir}/${target} "

From 9dc73d7d29f96c495183562e49f43051024f24c0 Mon Sep 17 00:00:00 2001
From: ftshijt
Date: Tue, 5 Dec 2023 04:38:33 -0500
Subject: [PATCH 02/40] fix readme with new stage

---
 egs2/TEMPLATE/tts1/README.md | 41 ++++++++++++++++++------------------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/egs2/TEMPLATE/tts1/README.md b/egs2/TEMPLATE/tts1/README.md
index b8e3e46b00e..e8928907c23 100644
--- a/egs2/TEMPLATE/tts1/README.md
+++ b/egs2/TEMPLATE/tts1/README.md
@@ -244,10 +244,10 @@ If you want to combine with neural vocoders, you can combine with [kan-bayashi/P
$ . ./path.sh && pip install -U parallel_wavegan

# Use the pretrained ljspeech style melgan provided by parallel_wavegan as the vocoder
-$ ./run.sh --stage 7 --inference_args "--vocoder_tag parallel_wavegan/ljspeech_style_melgan.v1" --inference_tag decode_with_ljspeech_style_melgan.v1
+$ ./run.sh --stage 8 --inference_args "--vocoder_tag parallel_wavegan/ljspeech_style_melgan.v1" --inference_tag decode_with_ljspeech_style_melgan.v1

# Use the vocoder trained by `parallel_wavegan` repo manually
-$ ./run.sh --stage 7 --vocoder_file /path/to/checkpoint-xxxxxxsteps.pkl --inference_tag decode_with_my_vocoder
+$ ./run.sh --stage 8 --vocoder_file /path/to/checkpoint-xxxxxxsteps.pkl --inference_tag decode_with_my_vocoder
```

If you want to generate waveforms from dumped features, please check [decoding with ESPnet-TTS model's features](https://github.com/kan-bayashi/ParallelWaveGAN#decoding-with-espnet-tts-models-features).
@@ -257,7 +257,7 @@ For the first time, we recommend performing each stage step-by-step via `--stage
$ ./run.sh --stage 1 --stop-stage 1
$ ./run.sh --stage 2 --stop-stage 2
...
-$ ./run.sh --stage 7 --stop-stage 7
+$ ./run.sh --stage 8 --stop-stage 8
```
This might help you to understand each stage's processing and directory structure.
@@ -269,7 +269,7 @@ Please make sure you already finished the training of the teacher model (Tacotro
First, decode all of the data, including the training, validation, and evaluation sets.
```sh
# specify teacher model directory via --tts_exp option
-$ ./run.sh --stage 7 \
+$ ./run.sh --stage 8 \
    --tts_exp exp/tts_train_raw_phn_tacotron_g2p_en_no_space \
    --test_sets "tr_no_dev dev eval1"
```
This will generate `durations` for the training, validation, and evaluation sets in `exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_train.loss.ave`.

Then, you can train FastSpeech by specifying the directory that includes `durations` via the `--teacher_dumpdir` option.
```sh
-$ ./run.sh --stage 6 \
+$ ./run.sh --stage 7 \
    --train_config conf/tuning/train_fastspeech.yaml \
    --teacher_dumpdir exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_train.loss.ave
```

In the above example, we use the generated mel-spectrogram as the target, which is known as knowledge distillation training.
If you want to use the groundtruth mel-spectrogram as the target, you need to use teacher forcing in decoding.
```sh
-$ ./run.sh --stage 7 \
+$ ./run.sh --stage 8 \
    --tts_exp exp/tts_train_raw_phn_tacotron_g2p_en_no_space \
    --inference_args "--use_teacher_forcing true" \
    --test_sets "tr_no_dev dev eval1"
```
You can get the groundtruth-aligned durations in `exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.ave`.

Then, you can train FastSpeech without knowledge distillation.
```sh
-$ ./run.sh --stage 6 \
+$ ./run.sh --stage 7 \
    --train_config conf/tuning/train_fastspeech.yaml \
    --teacher_dumpdir exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.ave
```

The procedure is almost the same as for FastSpeech, but we **MUST** use teacher forcing in decoding.
```sh
-$ ./run.sh --stage 7 \
+$ ./run.sh --stage 8 \
    --tts_exp exp/tts_train_raw_phn_tacotron_g2p_en_no_space \
    --inference_args "--use_teacher_forcing true" \
    --test_sets "tr_no_dev dev eval1"
```

To train FastSpeech2, we use additional features (F0 and energy). Therefore, we need to start from `stage 6` to calculate the additional statistics.
```sh
-$ ./run.sh --stage 5 \
+$ ./run.sh --stage 6 \
    --train_config conf/tuning/train_fastspeech2.yaml \
    --teacher_dumpdir exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.ave \
    --tts_stats_dir exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.ave/stats \
    --write_collected_feats true
```
The use of `--write_collected_feats` is optional, but it helps to accelerate the training.

First, you need to run stages 3 and 4 with `--use_xvector true` to extract X-vectors.
```sh
-$ ./run.sh --stage 2 --stop-stage 3 --use_xvector true
+$ ./run.sh --stage 3 --stop-stage 4 --use_xvector true
```
You can find the extracted X-vectors in `dump/xvector/*/xvector.{ark,scp}`.
Then, you can run the training with a config that has `spk_embed_dim: 512` in `tts_conf`.
@@ -355,7 +355,7 @@ The original xvector.scp files are renamed to xvector.scp.bak in case you wish t
Once you've performed extraction and, optionally, the speaker-averaged replacement step, please run the training from stage 7.
```sh
-$ ./run.sh --stage 6 --use_xvector true --train_config /path/to/your_xvector_config.yaml
+$ ./run.sh --stage 7 --use_xvector true --train_config /path/to/your_xvector_config.yaml
```
You can find the example config in [`egs2/vctk/tts1/conf/tuning`](../../vctk/tts1/conf/tuning).

First, you need to run stages 3 and 4 with `--use_sid true` to extract speaker IDs.
```sh
-$ ./run.sh --stage 2 --stop-stage 3 --use_sid true
+$ ./run.sh --stage 3 --stop-stage 4 --use_sid true
```
You can find the speaker ID file in `dump/raw/*/utt2sid`.
Note that you need to correctly create `utt2spk` in the data prep stage to generate `utt2sid`.

tts_conf:
    spks: 128
```
Please run the training from stage 7.
```sh
-$ ./run.sh --stage 6 --use_sid true --train_config /path/to/your_multi_spk_config.yaml
+$ ./run.sh --stage 7 --use_sid true --train_config /path/to/your_multi_spk_config.yaml
```

### Multi-language model with language ID embedding training

First, you need to run stages 3 and 4 with `--use_lid true` to extract language IDs.
```sh
-$ ./run.sh --stage 2 --stop-stage 3 --use_lid true
+$ ./run.sh --stage 3 --stop-stage 4 --use_lid true
```
You can find the language ID file in `dump/raw/*/utt2lid`.
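As the note below explains, `utt2lid` is generated from a `utt2lang` file prepared during data prep. For illustration only, a hypothetical `utt2lang` follows the same Kaldi-style `<utterance-id> <value>` mapping format as `utt2spk`; the utterance IDs and language codes here are made up:
```
utt0001_spk1 en
utt0002_spk1 en
utt0003_spk2 zh
```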
**Note that you need to additionally create a `utt2lang` file in the data prep stage to generate `utt2lid`.**
@@ -395,13 +395,13 @@
tts_conf:
    langs: 4
```
Please run the training from stage 7.
```sh
-$ ./run.sh --stage 6 --use_lid true --train_config /path/to/your_multi_lang_config.yaml
+$ ./run.sh --stage 7 --use_lid true --train_config /path/to/your_multi_lang_config.yaml
```

Of course, you can further combine this with X-vectors or speaker ID embeddings.
If you want to use both sid and lid, the process should be like this:
```sh
-$ ./run.sh --stage 2 --stop-stage 3 --use_lid true --use_sid true
+$ ./run.sh --stage 3 --stop-stage 4 --use_lid true --use_sid true
```
Make your config.
```yaml
tts_conf:
    spks: 128
    langs: 4
```
Please run the training from stage 7.
```sh
-$ ./run.sh --stage 6 --use_lid true --use_sid true --train_config /path/to/your_multi_spk_multi_lang_config.yaml
+$ ./run.sh --stage 7 --use_lid true --use_sid true --train_config /path/to/your_multi_spk_multi_lang_config.yaml
```

### VITS training
@@ -550,7 +550,7 @@ $ ...

# Case 1: Train conformer fastspeech2 + hifigan G + hifigan D from scratch
$ ./run.sh \
-    --stage 6 \
+    --stage 7 \
    --tts_task gan_tts \
    --train_config ./conf/tuning/train_joint_conformer_fastspeech2_hifigan.yaml

@@ -595,7 +595,7 @@ $ vim conf/tuning/finetune_joint_conformer_fastspeech2_hifigan.yaml

# (d) Run training
$ ./run.sh \
-    --stage 6 \
+    --stage 7 \
    --tts_task gan_tts \
    --train_config ./conf/tuning/finetune_joint_conformer_fastspeech2_hifigan.yaml
```
@@ -980,7 +980,7 @@ The trained vocoder can be used as follows:

- With TTS recipe
  ```sh
-  $ ./run.sh --stage 7 --vocoder_file /path/to/your_trained_vocoder_checkpoint.pkl --inference_tag decode_with_my_vocoder
+  $ ./run.sh --stage 8 --vocoder_file /path/to/your_trained_vocoder_checkpoint.pkl --inference_tag decode_with_my_vocoder
  ```
- [With command line](https://github.com/kan-bayashi/ParallelWaveGAN#decoding-with-espnet-tts-models-features)
@@ -1046,3 +1046,4 @@ This is because we use prenet in the decoder, which always applies dropout.
See more info in [Tacotron2 paper](https://arxiv.org/abs/1712.05884).
If you want to fix the results, you can use [`--always_fix_seed` option](https://github.com/espnet/espnet/blob/f03101557753517ebac8c432f0793d97d68fa5f0/espnet2/bin/tts_inference.py#L601-L606).
+

From 824121ce345dbe1289ccff4fc4a557f1c824681e Mon Sep 17 00:00:00 2001
From: ftshijt
Date: Tue, 5 Dec 2023 04:40:01 -0500
Subject: [PATCH 03/40] add espnet spk in tts, add spk embedding as a separate
 stage

---
 egs2/TEMPLATE/tts1/tts.sh | 92 ++++++++++++++++++++++++---------------
 1 file changed, 57 insertions(+), 35 deletions(-)

diff --git a/egs2/TEMPLATE/tts1/tts.sh b/egs2/TEMPLATE/tts1/tts.sh
index 94cf7299155..0cbb1da0983 100755
--- a/egs2/TEMPLATE/tts1/tts.sh
+++ b/egs2/TEMPLATE/tts1/tts.sh
@@ -67,8 +67,10 @@ f0max=400 # Minimum f0 for pitch extraction.

# X-Vector related
use_xvector=false # Whether to use x-vector.
-xvector_tool=kaldi # Toolkit for extracting x-vector (speechbrain, rawnet, espnet, kaldi)
-xvector_model=speechbrain/spkrec-ecapa-voxceleb # For only espnet, speechbrain, or rawnet
+xvector_tag= # The tag of xvector folder.
+xvector_gpu_inference=false # Whether to use gpu to inference xvector.
+xvector_tool=kaldi # Toolkit for extracting x-vector (speechbrain, rawnet, espnet, kaldi).
+xvector_model=speechbrain/spkrec-ecapa-voxceleb # For only espnet, speechbrain, or rawnet.

# Vocabulary related
oov="" # Out of vocabrary symbol.
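With this hunk applied, the new `--xvector_tag` and `--xvector_gpu_inference` options can be passed through the recipe script alongside the existing X-vector flags. A hypothetical invocation (the tag value `_ecapa` is illustrative, not from this patch) might look like:
```sh
$ ./run.sh --stage 3 --stop-stage 4 --use_xvector true \
    --xvector_tool speechbrain --xvector_gpu_inference true --xvector_tag _ecapa
```
With a tag set, the extracted embeddings would land in `dump/xvector_ecapa/*` instead of `dump/xvector/*`, which makes it possible to keep embeddings from several extractors side by side.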
@@ -146,6 +148,8 @@ Options: --min_wav_duration # Minimum duration in second (default="${min_wav_duration}"). --max_wav_duration # Maximum duration in second (default="${max_wav_duration}"). --use_xvector # Whether to use X-vector (default="${use_xvector}"). + --xvector_tag # The tag of xvector folder (default="${xvector_tag}"). + --xvector_gpu_inference # Whether to use gpu to inference xvector (default="${xvector_gpu_inference}"). --xvector_tool # Toolkit for generating the X-vectors (default="${xvector_tool}"). --xvector_model # Pretrained model to generate the X-vectors (default="${xvector_model}"). --use_sid # Whether to use speaker id as the inputs (default="${use_sid}"). @@ -322,6 +326,7 @@ if ! "${skip_data_prep}"; then log "Stage 2: Format wav.scp: data/ -> ${data_feats}/" for dset in "${train_set}" "${valid_set}" ${test_sets}; do + continue if [ "${dset}" = "${train_set}" ] || [ "${dset}" = "${valid_set}" ]; then _suf="/org" else @@ -333,17 +338,20 @@ if ! "${skip_data_prep}"; then if [ -e data/"${dset}"/segments ]; then _opts+="--segments data/${dset}/segments " fi + # shellcheck disable=SC2086 scripts/audio/format_wav_scp.sh --nj "${nj}" --cmd "${train_cmd}" \ --audio-format "${audio_format}" --fs "${fs}" ${_opts} \ "data/${dset}/wav.scp" "${data_feats}${_suf}/${dset}" echo "${feats_type}" > "${data_feats}${_suf}/${dset}/feats_type" done + fi + if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then # Extract X-vector if "${use_xvector}"; then if [ "${xvector_tool}" = "kaldi" ]; then - log "Stage 2+: Extract X-vector: data/ -> ${dumpdir}/xvector (Require Kaldi)" + log "Stage 3.1: Extract X-vector: data/ -> ${dumpdir}/xvector${xvector_tag} (Require Kaldi)" # Download X-vector pretrained model xvector_exp=${expdir}/xvector_nnet_1a if [ ! -e "${xvector_exp}" ]; then @@ -385,7 +393,7 @@ if ! "${skip_data_prep}"; then sid/nnet3/xvector/extract_xvectors.sh --nj "${_nj}" --cmd "${train_cmd}" \ "${xvector_exp}" \ "${dumpdir}/mfcc/${dset}" \ - "${dumpdir}/xvector/${dset}" + "${dumpdir}/xvector${xvector_tag}/${dset}" # 5. Filter scp # NOTE(kan-bayashi): Since sometimes mfcc or x-vector extraction is failed, @@ -393,13 +401,22 @@ if ! "${skip_data_prep}"; then # To avoid this mismatch, perform filtering of the original feature scp here. cp "${data_feats}${_suf}/${dset}"/wav.{scp,scp.bak} <"${data_feats}${_suf}/${dset}/wav.scp.bak" \ - utils/filter_scp.pl "${dumpdir}/xvector/${dset}/xvector.scp" \ + utils/filter_scp.pl "${dumpdir}/xvector${xvector_tag}/${dset}/xvector.scp" \ >"${data_feats}${_suf}/${dset}/wav.scp" utils/fix_data_dir.sh "${data_feats}${_suf}/${dset}" done else # Assume that others toolkits are python-based - log "Stage 2+: Extract X-vector: data/ -> ${dumpdir}/xvector using python toolkits" + log "Stage 3.1: Extract X-vector: data/ -> ${dumpdir}/xvector${xvector_tag} using python toolkits" + + if ${xvector_gpu_inference}; then + _cmd="${cuda_cmd}" + _ngpu=1 + else + _cmd="${decode_cmd}" + _ngpu=0 + fi + for dset in "${train_set}" "${valid_set}" ${test_sets}; do if [ "${dset}" = "${train_set}" ] || [ "${dset}" = "${valid_set}" ]; then _suf="/org" @@ -409,18 +426,22 @@ if ! 
"${skip_data_prep}"; then if [ "${xvector_tool}" = "rawnet" ]; then xvector_model="RawNet" fi + + ${_cmd} --gpu "${_ngpu}" ${dumpdir}/xvector/${dset}/xvector_extract.log \ pyscripts/utils/extract_xvectors.py \ --pretrained_model ${xvector_model} \ --toolkit ${xvector_tool} \ ${data_feats}${_suf}/${dset} \ - ${dumpdir}/xvector/${dset} + ${dumpdir}/xvector${xvector_tag}/${dset} done fi + else + log "Skip Stage 3.1, no xvector extraction set" fi # Prepare spk id input if "${use_sid}"; then - log "Stage 2+: Prepare speaker id: data/ -> ${data_feats}/" + log "Stage 3.2: Prepare speaker id: data/ -> ${data_feats}/" for dset in "${train_set}" "${valid_set}" ${test_sets}; do if [ "${dset}" = "${train_set}" ] || [ "${dset}" = "${valid_set}" ]; then _suf="/org" @@ -443,7 +464,7 @@ if ! "${skip_data_prep}"; then # Prepare lang id input if "${use_lid}"; then - log "Stage 2+: Prepare lang id: data/ -> ${data_feats}/" + log "Stage 3.3: Prepare lang id: data/ -> ${data_feats}/" for dset in "${train_set}" "${valid_set}" ${test_sets}; do if [ "${dset}" = "${train_set}" ] || [ "${dset}" = "${valid_set}" ]; then _suf="/org" @@ -467,8 +488,8 @@ if ! "${skip_data_prep}"; then fi - if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - log "Stage 3: Remove long/short data: ${data_feats}/org -> ${data_feats}" + if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then + log "Stage 4: Remove long/short data: ${data_feats}/org -> ${data_feats}" # NOTE(kamo): Not applying to test_sets to keep original data for dset in "${train_set}" "${valid_set}"; do @@ -513,17 +534,17 @@ if ! "${skip_data_prep}"; then # Filter x-vector if "${use_xvector}"; then - cp "${dumpdir}/xvector/${dset}"/xvector.{scp,scp.bak} - <"${dumpdir}/xvector/${dset}/xvector.scp.bak" \ + cp "${dumpdir}/xvector${xvector_tag}/${dset}"/xvector.{scp,scp.bak} + <"${dumpdir}/xvector${xvector_tag}/${dset}/xvector.scp.bak" \ utils/filter_scp.pl "${data_feats}/${dset}/wav.scp" \ - >"${dumpdir}/xvector/${dset}/xvector.scp" + >"${dumpdir}/xvector${xvector_tag}/${dset}/xvector.scp" fi done fi - if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then - log "Stage 4: Generate token_list from ${srctexts}" + if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + log "Stage 5: Generate token_list from ${srctexts}" # "nlsyms_txt" should be generated by local/data.sh if need # The first symbol in token_list must be "" and the last must be also sos/eos: @@ -552,10 +573,10 @@ fi if ! "${skip_train}"; then - if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then _train_dir="${data_feats}/${train_set}" _valid_dir="${data_feats}/${valid_set}" - log "Stage 5: TTS collect stats: train_set=${_train_dir}, valid_set=${_valid_dir}" + log "Stage 6: TTS collect stats: train_set=${_train_dir}, valid_set=${_valid_dir}" _opts= if [ -n "${train_config}" ]; then @@ -602,8 +623,8 @@ if ! "${skip_train}"; then fi if "${use_xvector}"; then - _xvector_train_dir="${dumpdir}/xvector/${train_set}" - _xvector_valid_dir="${dumpdir}/xvector/${valid_set}" + _xvector_train_dir="${dumpdir}/xvector${xvector_tag}/${train_set}" + _xvector_valid_dir="${dumpdir}/xvector${xvector_tag}/${valid_set}" _opts+="--train_data_path_and_name_and_type ${_xvector_train_dir}/xvector.scp,spembs,kaldi_ark " _opts+="--valid_data_path_and_name_and_type ${_xvector_valid_dir}/xvector.scp,spembs,kaldi_ark " fi @@ -692,10 +713,10 @@ if ! 
"${skip_train}"; then fi - if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then + if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then _train_dir="${data_feats}/${train_set}" _valid_dir="${data_feats}/${valid_set}" - log "Stage 6: TTS Training: train_set=${_train_dir}, valid_set=${_valid_dir}" + log "Stage 7: TTS Training: train_set=${_train_dir}, valid_set=${_valid_dir}" _opts= if [ -n "${train_config}" ]; then @@ -853,8 +874,8 @@ if ! "${skip_train}"; then # Add X-vector to the inputs if needed if "${use_xvector}"; then - _xvector_train_dir="${dumpdir}/xvector/${train_set}" - _xvector_valid_dir="${dumpdir}/xvector/${valid_set}" + _xvector_train_dir="${dumpdir}/xvector${xvector_tag}/${train_set}" + _xvector_valid_dir="${dumpdir}/xvector${xvector_tag}/${valid_set}" _opts+="--train_data_path_and_name_and_type ${_xvector_train_dir}/xvector.scp,spembs,kaldi_ark " _opts+="--valid_data_path_and_name_and_type ${_xvector_valid_dir}/xvector.scp,spembs,kaldi_ark " fi @@ -936,8 +957,8 @@ fi if ! "${skip_eval}"; then - if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then - log "Stage 7: Decoding: training_dir=${tts_exp}" + if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then + log "Stage 8: Decoding: training_dir=${tts_exp}" if ${gpu_inference}; then _cmd="${cuda_cmd}" @@ -986,7 +1007,7 @@ if ! "${skip_eval}"; then # Add X-vector to the inputs if needed if "${use_xvector}"; then - _xvector_dir="${dumpdir}/xvector/${dset}" + _xvector_dir="${dumpdir}/xvector${xvector_tag}/${dset}" _ex_opts+="--data_path_and_name_and_type ${_xvector_dir}/xvector.scp,spembs,kaldi_ark " fi @@ -1090,8 +1111,8 @@ fi packed_model="${tts_exp}/${tts_exp##*/}_${inference_model%.*}.zip" if [ -z "${download_model}" ]; then # Skip pack preparation if using a downloaded model - if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then - log "Stage 8: Pack model: ${packed_model}" + if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then + log "Stage 9: Pack model: ${packed_model}" log "Warning: Upload model to Zenodo will be deprecated. We encourage to use Hugging Face" _opts="" @@ -1106,8 +1127,8 @@ if [ -z "${download_model}" ]; then fi if "${use_xvector}"; then for dset in "${train_set}" ${test_sets}; do - _opts+=" --option ${dumpdir}/xvector/${dset}/spk_xvector.scp" - _opts+=" --option ${dumpdir}/xvector/${dset}/spk_xvector.ark" + _opts+=" --option ${dumpdir}/xvector${xvector_tag}/${dset}/spk_xvector.scp" + _opts+=" --option ${dumpdir}/xvector${xvector_tag}/${dset}/spk_xvector.ark" done fi if "${use_sid}"; then @@ -1130,8 +1151,8 @@ if [ -z "${download_model}" ]; then fi if ! "${skip_upload}"; then - if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then - log "Stage 9: Upload model to Zenodo: ${packed_model}" + if [ ${stage} -le 10 ] && [ ${stop_stage} -ge 10 ]; then + log "Stage 10: Upload model to Zenodo: ${packed_model}" # To upload your model, you need to do: # 1. Signup to Zenodo: https://zenodo.org/ @@ -1188,11 +1209,11 @@ else fi if ! "${skip_upload_hf}"; then - if [ ${stage} -le 10 ] && [ ${stop_stage} -ge 10 ]; then + if [ ${stage} -le 11 ] && [ ${stop_stage} -ge 11 ]; then [ -z "${hf_repo}" ] && \ log "ERROR: You need to setup the variable hf_repo with the name of the repository located at HuggingFace" && \ exit 1 - log "Stage 10: Upload model to HuggingFace: ${hf_repo}" + log "Stage 11: Upload model to HuggingFace: ${hf_repo}" gitlfs=$(git lfs --version 2> /dev/null || true) [ -z "${gitlfs}" ] && \ @@ -1240,3 +1261,4 @@ else fi log "Successfully finished. 
[elapsed=${SECONDS}s]" + From 17a033b4a2b448c6b7599396f39a39bd6f9526be Mon Sep 17 00:00:00 2001 From: ftshijt Date: Tue, 5 Dec 2023 04:41:38 -0500 Subject: [PATCH 04/40] fix espnet model --- espnet2/spk/espnet_model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/espnet2/spk/espnet_model.py b/espnet2/spk/espnet_model.py index 4d638521af3..321229c092e 100644 --- a/espnet2/spk/espnet_model.py +++ b/espnet2/spk/espnet_model.py @@ -64,7 +64,7 @@ def __init__( def forward( self, speech: torch.Tensor, - spk_labels: torch.Tensor, + spk_labels: torch.Tensor = None, task_tokens: torch.Tensor = None, extract_embd: bool = False, **kwargs, @@ -111,6 +111,7 @@ def forward( return spk_embd # 4. calculate loss + assert spk_labels is not None, "spk_labels is None, cannot compute loss" loss = self.loss(spk_embd, spk_labels.squeeze()) stats = dict(loss=loss.detach()) From 626ae9e5783e9b0285b88f725a16775257bd2988 Mon Sep 17 00:00:00 2001 From: ftshijt Date: Tue, 5 Dec 2023 04:47:21 -0500 Subject: [PATCH 05/40] minor fix --- espnet2/bin/spk_inference.py | 612 ++++++++++------------------------- 1 file changed, 166 insertions(+), 446 deletions(-) mode change 100755 => 100644 espnet2/bin/spk_inference.py diff --git a/espnet2/bin/spk_inference.py b/espnet2/bin/spk_inference.py old mode 100755 new mode 100644 index b4bcf00557b..ade85343b43 --- a/espnet2/bin/spk_inference.py +++ b/espnet2/bin/spk_inference.py @@ -2,106 +2,197 @@ import argparse import logging import sys +from distutils.version import LooseVersion +from itertools import groupby +from pathlib import Path +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union -import humanfriendly import numpy as np import torch from typeguard import check_argument_types, check_return_type -from espnet2.samplers.build_batch_sampler import BATCH_TYPES +from espnet2.fileio.npy_scp import NpyScpWriter from espnet2.tasks.spk import SpeakerTask from espnet2.torch_utils.set_all_random_seed import set_all_random_seed -from espnet2.train.distributed_utils import ( - DistributedOption, - free_port, - get_master_port, - get_node_rank, - get_num_nodes, - resolve_distributed_mode, -) -from espnet2.train.reporter import Reporter, SubReporter -from espnet2.utils import config_argparse -from espnet2.utils.build_dataclass import build_dataclass -from espnet2.utils.types import ( - humanfriendly_parse_size_or_none, - int_or_none, - str2bool, - str2triple_str, - str_or_none, -) from espnet.utils.cli_utils import get_commandline_args -def inference(args): +class Speech2Embedding: + """Speech2Embedding class + + Examples: + >>> import soundfile + >>> speech2embed = Speech2Embedding("spk_config.yml", "spk.pth") + >>> audio, rate = soundfile.read("speech.wav") + >>> speech2embed(audio) + + """ + + def __init__( + self, + spk_train_config: Uhion[Path, str] = None, + spk_model_file: Union[Path, str] = None, + device: str = "cpu", + dtype: str = "float32", + batch_size: int = 1, + ): + assert check_argument_types() + + spk_model, spk_train_args = SpeakerTask.build_model_from_file(spk_train_config, spk_model_file, device) + self.spk_model = spk_model + self.spk_train_args = spk_train_args + self.dtype = dtype + self.batch_size = batch_size + + @torch.no_grad() + def __call__( + self, seech: Union[torch.Tensor, np.ndarray] + ) -> torch.Tensor: + """Inference + + Args: + speech: Input speech data + + Returns: + spk_embedding + + """ + + assert check_argument_types() + + # Input as audio signal + if isinstance(speech, np.ndarray): + speech = 
torch.tensor(speech) + + # data: (Nsamples,) -> (1, Nsamples) + speech = speech.unsqueeze(0).to(getattr(torch, self.dtype)) + logging.info("speech length: " + str(speech.size(1))) + batch = {"speech": speech} + + # a. To device + batch = to_device(batch, device=self.device) + + # b. Forward the model embedding extraction + output = self.spk_model(**batch) + + return output + + @staticmethod + def from_pretrained( + model_tag: Optional[str] = None, + **kwargs: Optional[Any], + ): + """Build Speech2Embedding instance from the pretrained model. + + Args: + model_tag (Optional[str]): Model tag of the pretrained models. + Currently, the tags of espnet_model_zoo are supported. + + Returns: + Speech2Text: Speech2Embedding instance. + + """ + if model_tag is not None: + try: + from espnet_model_zoo.downloader import ModelDownloader + + except ImportError: + logging.error( + "`espnet_model_zoo` is not installed. " + "Please install via `pip install -U espnet_model_zoo`." + ) + raise + d = ModelDownloader() + kwargs.update(**d.download_and_unpack(model_tag)) + + return Speech2Embedding(**kwargs) + + +def inference( + output_dir: str, + batch_size: int, + dtype: str, + ngpu: int, + seed: int, + num_workers: int, + log_level: Union[int, str], + data_path_and_name_and_type: Sequence[Tuple[str, str, str]], + key_file: Optional[str], + spk_train_config: Optional[str], + spk_model_file: Optional[str], + model_tag: Optional[str], +): assert check_argument_types() + if batch_size > 1: + raise NotImplementedError("batch decoding is not implemented") + if ngpu > 1: + raise NotImplementedError("only single GPU decoding is supported") logging.basicConfig( - level=args.log_level, + level=log_level, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", ) - if args.ngpu >= 1: + if ngpu >= 1: device = "cuda" else: device = "cpu" # 1. Set random-seed - set_all_random_seed(args.seed) + set_all_random_seed(seed) - # 2. define train args - spk_model, spk_train_args = SpeakerTask.build_model_from_file( - args.spk_train_config, args.spk_model_file, device + # 2. Build speech2embedding + speech2embedding_kwargs = dict( + batch_size=batch_size, + dtype=dtype, + spk_train_config=spk_train_config, + spk_model_file=spk_model_file, ) - # 3. Overwrite args with inference args - args = vars(args) - args["valid_data_path_and_name_and_type"] = args["data_path_and_name_and_type"] - args["valid_shape_file"] = args["shape_file"] - args["preprocessor_conf"] = { - "target_duration": args["target_duration"], - "num_eval": args["num_eval"], - "noise_apply_prob": 0.0, - "rir_apply_prob": 0.0, - } - - merged_args = vars(spk_train_args) - merged_args.update(args) - args = argparse.Namespace(**merged_args) - - # 4. Build data-iterator - resolve_distributed_mode(args) - distributed_option = build_dataclass(DistributedOption, args) - distributed_option.init_options() - - iterator = SpeakerTask.build_iter_factory( - args=args, - distributed_option=distributed_option, - mode="valid", + speech2embedding = Speech2Embedding.from_pretrained( + model_tag=model_tag, + **speech2text_kwargs, ) - loader = iterator.build_iter(0) - trainer_options = SpeakerTask.trainer.build_options(args) - reporter = Reporter() + # 3. 
Build data-iterator + loader = SpeakerTask.build_streaming_iterator( + data_path_and_name_and_type, + dtype=dtype, + batch_size=batch_size, + key_file=key_file, + num_workers=num_workers, + preprocess_fn=SpeakerTask.build_preprocess_fn(speech2embedding.spk_train_args, False), + collate_fn=SpeakerTask.build_colate_fn(speech2embedding.spk_train_args, False), + inference=True, + ) + + # 4. Start for-loop + with NpyScpWriter( + output_dir / "embed", + output_dir / "embed.scp" + ) as writer: + for keys, batch in loader: + assert isinstance(batch, dict), type(batch) + assert all(isinstance(s, str) for s in keys), keys + _bs = len(next(iter(batch.values()))) + assert len(keys) == _bs, f"{len(keys)} != {_bs}" + batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")} + result = speech2embedding(**batch) + + # Only supporting batch_size==1 + key = keys[0] - # 5. Run inference for EER and minDCF calculation - with reporter.observe("valid") as sub_reporter: - SpeakerTask.trainer.validate_one_epoch( - model=spk_model, - iterator=loader, - reporter=sub_reporter, - options=trainer_options, - distributed_option=distributed_option, - ) - if not distributed_option.distributed or distributed_option.dist_rank == 0: - logging.info(reporter.log_message()) + writer[key] = result.cpu().numpy() def get_parser(): parser = config_argparse.ArgumentParser( - description="SPK Decoding", + description="Speaker Embedding Extraction", formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) + # Note(kamo): Use '_' instead of '-' as separator. + # '-' is confusing if written in yaml. parser.add_argument( "--log_level", type=lambda x: x.upper(), @@ -109,6 +200,7 @@ def get_parser(): choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"), help="The verbose level of logging", ) + parser.add_argument("--output_dir", type=str, required=True) parser.add_argument( "--ngpu", @@ -137,152 +229,24 @@ def get_parser(): required=True, action="append", ) - _batch_type_help = "" - for key, value in BATCH_TYPES.items(): - _batch_type_help += f'"{key}":\n{value}\n' - group.add_argument( - "--batch_type", - type=str, - default="folded", - choices=list(BATCH_TYPES), - help=_batch_type_help, - ) - group.add_argument( - "--batch_bins", - type=int, - default=1000000, - help="The number of batch bins. Used if batch_type='length' or 'numel'", - ) - group.add_argument( - "--valid_batch_bins", - type=int_or_none, - default=None, - help="If not given, the value of --batch_bins is used", - ) - group.add_argument( - "--valid_batch_type", - type=str_or_none, - default=None, - choices=list(BATCH_TYPES) + [None], - help="If not given, the value of --batch_type is used", - ) - group.add_argument( - "--max_cache_size", - type=humanfriendly.parse_size, - default=0.0, - help="The maximum cache size for data loader. e.g. 10MB, 20GB.", - ) - group.add_argument( - "--max_cache_fd", - type=int, - default=32, - help="The maximum number of file descriptors to be kept " - "as opened for ark files. " - "This feature is only valid when data type is 'kaldi_ark'.", - ) - group.add_argument( - "--valid_max_cache_size", - type=humanfriendly_parse_size_or_none, - default=None, - help="The maximum cache size for validation data loader. e.g. 10MB, 20GB. 
" - "If None, the 5 percent size of --max_cache_size", - ) - group.add_argument("--shape_file", type=str, action="append", default=[]) - group.add_argument( - "--input_size", - type=int_or_none, - default=None, - help="The number of input dimension of the feature", - ) - group.add_argument("--allow_variable_data_keys", type=str2bool, default=False) + group.add_argument("--key_file", type=str_or_none) group.add_argument( - "--spk2utt", - type=str, - default="", - help="Directory of spk2utt file to be used in label mapping", - ) - group.add_argument( - "--train_dtype", - default="float32", - choices=["float16", "float32", "float64"], - help="Data type for training.", - ) - group.add_argument( - "--use_amp", - type=str2bool, - default=False, - help="Enable Automatic Mixed Precision. This feature requires pytorch>=1.6", - ) - group.add_argument( - "--grad_clip", - type=float, - default=5.0, - help="Gradient norm threshold to clip", - ) - group.add_argument( - "--accum_grad", + "--batch_size", type=int, default=1, - help="The number of gradient accumulation", - ) - group.add_argument( - "--no_forward_run", - type=str2bool, - default=False, - help="Just only iterating data loading without " - "model forwarding and training", - ) - group.add_argument( - "--grad_clip_type", - type=float, - default=2.0, - help="The type of the used p-norm for gradient clip. Can be inf", - ) - group.add_argument( - "--grad_noise", - type=str2bool, - default=False, - help="The flag to switch to use noise injection to " - "gradients during training", - ) - group.add_argument( - "--resume", - type=str2bool, - default=False, - help="Enable resuming if checkpoint is existing", - ) - group.add_argument( - "--sort_in_batch", - type=str, - default="descending", - choices=["descending", "ascending"], - help="Sort the samples in each mini-batches by the sample " - 'lengths. To enable this, "shape_file" must have the length information.', - ) - group.add_argument( - "--sort_batch", - type=str, - default="descending", - choices=["descending", "ascending"], - help="Sort mini-batches by the sample lengths", - ) - group.add_argument( - "--drop_last_iter", - type=str2bool, - default=False, - help="Exclude the minibatch with leftovers.", + help="The batch size for inference", ) group = parser.add_argument_group("The model configuration related") group.add_argument( "--spk_train_config", type=str, - help="SPK training configuration", + help="Speaker model training configuration", ) group.add_argument( "--spk_model_file", type=str, - help="SPK model parameter file", + help="Speaker model parameter file", ) group.add_argument( "--model_tag", @@ -291,252 +255,6 @@ def get_parser(): "*_file will be overwritten", ) - group = parser.add_argument_group("distributed training related") - group.add_argument( - "--dist_backend", - default="nccl", - type=str, - help="distributed backend", - ) - group.add_argument( - "--dist_init_method", - type=str, - default="env://", - help='if init_method="env://", env values of "MASTER_PORT", "MASTER_ADDR", ' - '"WORLD_SIZE", and "RANK" are referred.', - ) - group.add_argument( - "--dist_world_size", - default=None, - type=int_or_none, - help="number of nodes for distributed training", - ) - group.add_argument( - "--dist_rank", - type=int_or_none, - default=None, - help="node rank for distributed training", - ) - group.add_argument( - # Not starting with "dist_" for compatibility to launch.py - "--local_rank", - type=int_or_none, - default=None, - help="local rank for distributed training. 
This option is used if " - "--multiprocessing_distributed=false", - ) - group.add_argument( - "--dist_master_addr", - default=None, - type=str_or_none, - help="The master address for distributed training. " - "This value is used when dist_init_method == 'env://'", - ) - group.add_argument( - "--dist_master_port", - default=None, - type=int_or_none, - help="The master port for distributed training" - "This value is used when dist_init_method == 'env://'", - ) - group.add_argument( - "--dist_launcher", - default=None, - type=str_or_none, - choices=["slurm", "mpi", None], - help="The launcher type for distributed training", - ) - group.add_argument( - "--multiprocessing_distributed", - default=False, - type=str2bool, - help="Use multi-processing distributed training to launch " - "N processes per node, which has N GPUs. This is the " - "fastest way to use PyTorch for either single node or " - "multi node data parallel training", - ) - group.add_argument( - "--unused_parameters", - type=str2bool, - default=False, - help="Whether to use the find_unused_parameters in " - "torch.nn.parallel.DistributedDataParallel ", - ) - group.add_argument( - "--sharded_ddp", - default=False, - type=str2bool, - help="Enable sharded training provided by fairscale", - ) - - group = parser.add_argument_group("trainer initialization related") - group.add_argument( - "--max_epoch", - type=int, - default=40, - help="The maximum number epoch to train", - ) - group.add_argument( - "--patience", - type=int_or_none, - default=None, - help="Number of epochs to wait without improvement " - "before stopping the training", - ) - group.add_argument( - "--val_scheduler_criterion", - type=str, - nargs=2, - default=("valid", "loss"), - help="The criterion used for the value given to the lr scheduler. " - 'Give a pair referring the phase, "train" or "valid",' - 'and the criterion name. The mode specifying "min" or "max" can ' - "be changed by --scheduler_conf", - ) - group.add_argument( - "--early_stopping_criterion", - type=str, - nargs=3, - default=("valid", "loss", "min"), - help="The criterion used for judging of early stopping. " - 'Give a pair referring the phase, "train" or "valid",' - 'the criterion name and the mode, "min" or "max", e.g. "acc,max".', - ) - group.add_argument( - "--best_model_criterion", - type=str2triple_str, - nargs="+", - default=[ - ("train", "loss", "min"), - ("valid", "loss", "min"), - ("train", "acc", "max"), - ("valid", "acc", "max"), - ], - help="The criterion used for judging of the best model. " - 'Give a pair referring the phase, "train" or "valid",' - 'the criterion name, and the mode, "min" or "max", e.g. 
"acc,max".', - ) - group.add_argument( - "--keep_nbest_models", - type=int, - nargs="+", - default=[10], - help="Remove previous snapshots excluding the n-best scored epochs", - ) - group.add_argument( - "--nbest_averaging_interval", - type=int, - default=0, - help="The epoch interval to apply model averaging and save nbest models", - ) - group.add_argument( - "--use_matplotlib", - type=str2bool, - default=True, - help="Enable matplotlib logging", - ) - group.add_argument( - "--use_tensorboard", - type=str2bool, - default=True, - help="Enable tensorboard logging", - ) - group.add_argument( - "--create_graph_in_tensorboard", - type=str2bool, - default=False, - help="Whether to create graph in tensorboard", - ) - group.add_argument( - "--use_wandb", - type=str2bool, - default=False, - help="Enable wandb logging", - ) - group.add_argument( - "--wandb_project", - type=str, - default=None, - help="Specify wandb project", - ) - group.add_argument( - "--wandb_id", - type=str, - default=None, - help="Specify wandb id", - ) - group.add_argument( - "--wandb_entity", - type=str, - default=None, - help="Specify wandb entity", - ) - group.add_argument( - "--wandb_name", - type=str, - default=None, - help="Specify wandb run name", - ) - group.add_argument( - "--wandb_model_log_interval", - type=int, - default=-1, - help="Set the model log period", - ) - group.add_argument( - "--detect_anomaly", - type=str2bool, - default=False, - help="Set torch.autograd.set_detect_anomaly", - ) - - group = parser.add_argument_group("cudnn mode related") - group.add_argument( - "--cudnn_enabled", - type=str2bool, - default=torch.backends.cudnn.enabled, - help="Enable CUDNN", - ) - group.add_argument( - "--cudnn_benchmark", - type=str2bool, - default=torch.backends.cudnn.benchmark, - help="Enable cudnn-benchmark mode", - ) - group.add_argument( - "--cudnn_deterministic", - type=str2bool, - default=True, - help="Enable cudnn-deterministic mode", - ) - - group = parser.add_argument_group("The inference hyperparameter related") - group.add_argument( - "--valid_batch_size", - type=int, - default=1, - help="The batch size for inference", - ) - group.add_argument( - "--target_duration", - type=float, - default=3.0, - help="Duration (in seconds) of samples in a minibatch", - ) - group.add_argument( - "--num_eval", - type=int, - default=10, - help="Number of segments to make from one utterance in the inference phase", - ) - group.add_argument("--fold_length", type=int, action="append", default=[]) - group.add_argument( - "--use_preprocessor", - type=str2bool, - default=True, - help="Apply preprocessing to data or not", - ) - return parser @@ -544,7 +262,9 @@ def main(cmd=None): print(get_commandline_args(), file=sys.stderr) parser = get_parser() args = parser.parse_args(cmd) - inference(args) + kwargs = vars(args) + kwargs.pop("config", None) + inference(**kwargs) if __name__ == "__main__": From 2b1c8ace0ec24570eb3cd93a5c150762dddfe65e Mon Sep 17 00:00:00 2001 From: ftshijt Date: Tue, 5 Dec 2023 04:47:45 -0500 Subject: [PATCH 06/40] change type --- espnet2/bin/spk_inference.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 espnet2/bin/spk_inference.py diff --git a/espnet2/bin/spk_inference.py b/espnet2/bin/spk_inference.py old mode 100644 new mode 100755 From 849021c451c05bafcf01bc1be8becd50bb5c6fc1 Mon Sep 17 00:00:00 2001 From: ftshijt Date: Tue, 5 Dec 2023 05:07:29 -0500 Subject: [PATCH 07/40] apply black --- .../asr1/pyscripts/utils/extract_xvectors.py | 41 ++++++++++++++++++- 
espnet2/bin/spk_inference.py | 29 +++++++------ 2 files changed, 54 insertions(+), 16 deletions(-) diff --git a/egs2/TEMPLATE/asr1/pyscripts/utils/extract_xvectors.py b/egs2/TEMPLATE/asr1/pyscripts/utils/extract_xvectors.py index 01cd0e85a52..c2ecf63b0e6 100755 --- a/egs2/TEMPLATE/asr1/pyscripts/utils/extract_xvectors.py +++ b/egs2/TEMPLATE/asr1/pyscripts/utils/extract_xvectors.py @@ -81,8 +81,37 @@ def __init__(self, args, device): )["model"] ) self.model.to(device).eval() + elif self.toolkit == "espnet": + from espnet2.bin.spk_inference import Speech2Embedding + + # NOTE(jiatong): set default config file as None + # assume config is the same path as the model file + speech2embedding_kwargs = dict( + batch_size=1, + dtype="float32", + spk_train_config=None, + spk_model_file=args.pretrained_model, + ) + + if args.pretrained_model.endwith("pth"): + logging.info( + "the provided model path is end with pth," + "assume it not a huggingface model" + ) + model_tag = None + else: + logging.info( + "the provided model path is not end with pth," + "assume use huggingface model" + ) + model_tag = args.pretrained_model + + self.speech2embedding = Speech2Embedding.from_pretrained( + model_tag=model, + **speech2embedding_kwargs, + ) - def rawnet_extract_embd(self, audio, n_samples=48000, n_segments=10): + def _rawnet_extract_embd(self, audio, n_samples=48000, n_segments=10): if len(audio.shape) > 1: raise ValueError( "RawNet3 supports mono input only." @@ -102,6 +131,16 @@ def rawnet_extract_embd(self, audio, n_samples=48000, n_segments=10): output = self.model(audios) return output.mean(0).detach().cpu().numpy() + def _espnet_extract_embd(self, audio): + if len(audio.shape) > 1: + raise ValueError( + "Not support multi-channel input for ESPnet pre-trained model" + f"Input data has a shape of {audio.shape}." 
+ ) + audio = torch.from_numpy(audio.astype(np.float32)).to(self.device) + output = self.self.speech2embedding(audio) + return output.cpu().numpy() + def __call__(self, wav, in_sr): if self.toolkit == "speechbrain": wav = self.audio_norm(torch.from_numpy(wav), in_sr).to(self.device) diff --git a/espnet2/bin/spk_inference.py b/espnet2/bin/spk_inference.py index ade85343b43..1e345f11ab9 100755 --- a/espnet2/bin/spk_inference.py +++ b/espnet2/bin/spk_inference.py @@ -25,7 +25,7 @@ class Speech2Embedding: >>> speech2embed = Speech2Embedding("spk_config.yml", "spk.pth") >>> audio, rate = soundfile.read("speech.wav") >>> speech2embed(audio) - + """ def __init__( @@ -38,21 +38,21 @@ def __init__( ): assert check_argument_types() - spk_model, spk_train_args = SpeakerTask.build_model_from_file(spk_train_config, spk_model_file, device) + spk_model, spk_train_args = SpeakerTask.build_model_from_file( + spk_train_config, spk_model_file, device + ) self.spk_model = spk_model self.spk_train_args = spk_train_args self.dtype = dtype self.batch_size = batch_size - + @torch.no_grad() - def __call__( - self, seech: Union[torch.Tensor, np.ndarray] - ) -> torch.Tensor: + def __call__(self, seech: Union[torch.Tensor, np.ndarray]) -> torch.Tensor: """Inference Args: speech: Input speech data - + Returns: spk_embedding @@ -76,7 +76,7 @@ def __call__( output = self.spk_model(**batch) return output - + @staticmethod def from_pretrained( model_tag: Optional[str] = None, @@ -106,7 +106,7 @@ def from_pretrained( kwargs.update(**d.download_and_unpack(model_tag)) return Speech2Embedding(**kwargs) - + def inference( output_dir: str, @@ -151,7 +151,7 @@ def inference( speech2embedding = Speech2Embedding.from_pretrained( model_tag=model_tag, - **speech2text_kwargs, + **speech2embedding_kwargs, ) # 3. Build data-iterator @@ -161,16 +161,15 @@ def inference( batch_size=batch_size, key_file=key_file, num_workers=num_workers, - preprocess_fn=SpeakerTask.build_preprocess_fn(speech2embedding.spk_train_args, False), + preprocess_fn=SpeakerTask.build_preprocess_fn( + speech2embedding.spk_train_args, False + ), collate_fn=SpeakerTask.build_colate_fn(speech2embedding.spk_train_args, False), inference=True, ) # 4. Start for-loop - with NpyScpWriter( - output_dir / "embed", - output_dir / "embed.scp" - ) as writer: + with NpyScpWriter(output_dir / "embed", output_dir / "embed.scp") as writer: for keys, batch in loader: assert isinstance(batch, dict), type(batch) assert all(isinstance(s, str) for s in keys), keys From 0dfefaba831e77c8112eefde564b83d21d536802 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 5 Dec 2023 10:14:33 +0000 Subject: [PATCH 08/40] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- egs2/TEMPLATE/tts1/README.md | 1 - egs2/TEMPLATE/tts1/tts.sh | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/egs2/TEMPLATE/tts1/README.md b/egs2/TEMPLATE/tts1/README.md index e8928907c23..9d6cbbe77f4 100644 --- a/egs2/TEMPLATE/tts1/README.md +++ b/egs2/TEMPLATE/tts1/README.md @@ -1046,4 +1046,3 @@ This is because we use prenet in the decoder, which always applies dropout. See more info in [Tacotron2 paper](https://arxiv.org/abs/1712.05884). If you want to fix the results, you can use [`--always_fix_seed` option](https://github.com/espnet/espnet/blob/f03101557753517ebac8c432f0793d97d68fa5f0/espnet2/bin/tts_inference.py#L601-L606). 
- diff --git a/egs2/TEMPLATE/tts1/tts.sh b/egs2/TEMPLATE/tts1/tts.sh index 0cbb1da0983..0b6323220e5 100755 --- a/egs2/TEMPLATE/tts1/tts.sh +++ b/egs2/TEMPLATE/tts1/tts.sh @@ -338,7 +338,7 @@ if ! "${skip_data_prep}"; then if [ -e data/"${dset}"/segments ]; then _opts+="--segments data/${dset}/segments " fi - + # shellcheck disable=SC2086 scripts/audio/format_wav_scp.sh --nj "${nj}" --cmd "${train_cmd}" \ --audio-format "${audio_format}" --fs "${fs}" ${_opts} \ @@ -1261,4 +1261,3 @@ else fi log "Successfully finished. [elapsed=${SECONDS}s]" - From 4760f0e34e2e7b6dc24df4c5fc16ed6468d248ef Mon Sep 17 00:00:00 2001 From: ftshijt Date: Tue, 5 Dec 2023 05:17:16 -0500 Subject: [PATCH 09/40] update missing code for espnet type --- .../TEMPLATE/asr1/pyscripts/utils/extract_xvectors.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/egs2/TEMPLATE/asr1/pyscripts/utils/extract_xvectors.py b/egs2/TEMPLATE/asr1/pyscripts/utils/extract_xvectors.py index c2ecf63b0e6..6daf9801790 100755 --- a/egs2/TEMPLATE/asr1/pyscripts/utils/extract_xvectors.py +++ b/egs2/TEMPLATE/asr1/pyscripts/utils/extract_xvectors.py @@ -147,7 +147,10 @@ def __call__(self, wav, in_sr): embeds = self.model.encode_batch(wav).detach().cpu().numpy()[0] elif self.toolkit == "rawnet": wav = librosa.resample(wav, orig_sr=in_sr, target_sr=16000) - embeds = self.rawnet_extract_embd(wav) + embeds = self._rawnet_extract_embd(wav) + elif self.toolkit == "espnet": + wav = librosa.resample(wav, orig_sr=in_sr, target_sr=16000) + embeds = self._espnet_extract_embd(wav) return embeds @@ -173,7 +176,7 @@ def main(argv): else: device = "cpu" - if args.toolkit in ("speechbrain", "rawnet"): + if args.toolkit in ("speechbrain", "rawnet", "espnet"): # Prepare spk2utt for mean x-vector spk2utt = dict() with open(os.path.join(args.in_folder, "spk2utt"), "r") as reader: @@ -207,10 +210,6 @@ def main(argv): writer_utt.close() writer_spk.close() - elif args.toolkit == "espnet": - raise NotImplementedError( - "Follow details at: https://github.com/espnet/espnet/issues/3040" - ) else: raise ValueError( "Unkown type of toolkit. 
Only supported: speechbrain, rawnet, espnet, kaldi" From bf84668accbdb082206266737e06990a4028c6c7 Mon Sep 17 00:00:00 2001 From: ftshijt Date: Tue, 5 Dec 2023 05:23:44 -0500 Subject: [PATCH 10/40] wrong name update --- egs2/TEMPLATE/asr1/pyscripts/utils/extract_xvectors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/egs2/TEMPLATE/asr1/pyscripts/utils/extract_xvectors.py b/egs2/TEMPLATE/asr1/pyscripts/utils/extract_xvectors.py index 6daf9801790..18d271efdb3 100755 --- a/egs2/TEMPLATE/asr1/pyscripts/utils/extract_xvectors.py +++ b/egs2/TEMPLATE/asr1/pyscripts/utils/extract_xvectors.py @@ -93,7 +93,7 @@ def __init__(self, args, device): spk_model_file=args.pretrained_model, ) - if args.pretrained_model.endwith("pth"): + if args.pretrained_model.endswith("pth"): logging.info( "the provided model path is end with pth," "assume it not a huggingface model" From a911df4fdf76549b01167270ed96b6ad6db34c91 Mon Sep 17 00:00:00 2001 From: ftshijt Date: Tue, 5 Dec 2023 05:48:34 -0500 Subject: [PATCH 11/40] switch to train_config and model_file to align with huggingface pre-trained models --- .../asr1/pyscripts/utils/extract_xvectors.py | 4 ++-- espnet2/bin/spk_inference.py | 18 +++++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/egs2/TEMPLATE/asr1/pyscripts/utils/extract_xvectors.py b/egs2/TEMPLATE/asr1/pyscripts/utils/extract_xvectors.py index 18d271efdb3..a0f688be7b7 100755 --- a/egs2/TEMPLATE/asr1/pyscripts/utils/extract_xvectors.py +++ b/egs2/TEMPLATE/asr1/pyscripts/utils/extract_xvectors.py @@ -89,8 +89,8 @@ def __init__(self, args, device): speech2embedding_kwargs = dict( batch_size=1, dtype="float32", - spk_train_config=None, - spk_model_file=args.pretrained_model, + train_config=None, + model_file=args.pretrained_model, ) if args.pretrained_model.endswith("pth"): diff --git a/espnet2/bin/spk_inference.py b/espnet2/bin/spk_inference.py index 1e345f11ab9..6787f2c0012 100755 --- a/espnet2/bin/spk_inference.py +++ b/espnet2/bin/spk_inference.py @@ -30,8 +30,8 @@ class Speech2Embedding: def __init__( self, - spk_train_config: Uhion[Path, str] = None, - spk_model_file: Union[Path, str] = None, + train_config: Uhion[Path, str] = None, + model_file: Union[Path, str] = None, device: str = "cpu", dtype: str = "float32", batch_size: int = 1, @@ -39,7 +39,7 @@ def __init__( assert check_argument_types() spk_model, spk_train_args = SpeakerTask.build_model_from_file( - spk_train_config, spk_model_file, device + train_config, model_file, device ) self.spk_model = spk_model self.spk_train_args = spk_train_args @@ -118,8 +118,8 @@ def inference( log_level: Union[int, str], data_path_and_name_and_type: Sequence[Tuple[str, str, str]], key_file: Optional[str], - spk_train_config: Optional[str], - spk_model_file: Optional[str], + train_config: Optional[str], + model_file: Optional[str], model_tag: Optional[str], ): assert check_argument_types() @@ -145,8 +145,8 @@ def inference( speech2embedding_kwargs = dict( batch_size=batch_size, dtype=dtype, - spk_train_config=spk_train_config, - spk_model_file=spk_model_file, + train_config=train_config, + model_file=model_file, ) speech2embedding = Speech2Embedding.from_pretrained( @@ -238,12 +238,12 @@ def get_parser(): group = parser.add_argument_group("The model configuration related") group.add_argument( - "--spk_train_config", + "--train_config", type=str, help="Speaker model training configuration", ) group.add_argument( - "--spk_model_file", + "--model_file", type=str, help="Speaker model parameter file", ) From 
3ace83cf1c2d6d234819f73c9aa8001c23dfd343 Mon Sep 17 00:00:00 2001 From: ftshijt Date: Tue, 5 Dec 2023 05:59:43 -0500 Subject: [PATCH 12/40] fix debug info and update huggingface compatibale inference setting --- egs2/TEMPLATE/tts1/tts.sh | 1 - espnet2/bin/spk_inference.py | 8 +++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/egs2/TEMPLATE/tts1/tts.sh b/egs2/TEMPLATE/tts1/tts.sh index 0b6323220e5..eef5b9b5904 100755 --- a/egs2/TEMPLATE/tts1/tts.sh +++ b/egs2/TEMPLATE/tts1/tts.sh @@ -326,7 +326,6 @@ if ! "${skip_data_prep}"; then log "Stage 2: Format wav.scp: data/ -> ${data_feats}/" for dset in "${train_set}" "${valid_set}" ${test_sets}; do - continue if [ "${dset}" = "${train_set}" ] || [ "${dset}" = "${valid_set}" ]; then _suf="/org" else diff --git a/espnet2/bin/spk_inference.py b/espnet2/bin/spk_inference.py index 6787f2c0012..88db285d4de 100755 --- a/espnet2/bin/spk_inference.py +++ b/espnet2/bin/spk_inference.py @@ -13,6 +13,7 @@ from espnet2.fileio.npy_scp import NpyScpWriter from espnet2.tasks.spk import SpeakerTask +from espnet2.torch_utils.device_funcs import to_device from espnet2.torch_utils.set_all_random_seed import set_all_random_seed from espnet.utils.cli_utils import get_commandline_args @@ -41,13 +42,14 @@ def __init__( spk_model, spk_train_args = SpeakerTask.build_model_from_file( train_config, model_file, device ) - self.spk_model = spk_model + self.spk_model = spk_model.eval() self.spk_train_args = spk_train_args + self.device = device self.dtype = dtype self.batch_size = batch_size @torch.no_grad() - def __call__(self, seech: Union[torch.Tensor, np.ndarray]) -> torch.Tensor: + def __call__(self, speech: Union[torch.Tensor, np.ndarray]) -> torch.Tensor: """Inference Args: @@ -67,7 +69,7 @@ def __call__(self, seech: Union[torch.Tensor, np.ndarray]) -> torch.Tensor: # data: (Nsamples,) -> (1, Nsamples) speech = speech.unsqueeze(0).to(getattr(torch, self.dtype)) logging.info("speech length: " + str(speech.size(1))) - batch = {"speech": speech} + batch = {"speech": speech, "extract_embd": True} # a. 
To device batch = to_device(batch, device=self.device) From 2b59b27fb1fdf9bdd4c07bc63a1dcdbfe8285524 Mon Sep 17 00:00:00 2001 From: ftshijt Date: Tue, 5 Dec 2023 06:01:38 -0500 Subject: [PATCH 13/40] fix Uhion typo --- espnet2/bin/spk_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/espnet2/bin/spk_inference.py b/espnet2/bin/spk_inference.py index 88db285d4de..70e49d92c43 100755 --- a/espnet2/bin/spk_inference.py +++ b/espnet2/bin/spk_inference.py @@ -31,7 +31,7 @@ class Speech2Embedding: def __init__( self, - train_config: Uhion[Path, str] = None, + train_config: Union[Path, str] = None, model_file: Union[Path, str] = None, device: str = "cpu", dtype: str = "float32", From 57cbf08ffd21653a9d7d1551d76223196e9ac112 Mon Sep 17 00:00:00 2001 From: ftshijt Date: Wed, 6 Dec 2023 05:10:27 -0500 Subject: [PATCH 14/40] reset to spk_embed instead of xvector --- ...extract_xvectors.py => extract_spk_embed.py} | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) rename egs2/TEMPLATE/asr1/pyscripts/utils/{extract_xvectors.py => extract_spk_embed.py} (93%) diff --git a/egs2/TEMPLATE/asr1/pyscripts/utils/extract_xvectors.py b/egs2/TEMPLATE/asr1/pyscripts/utils/extract_spk_embed.py similarity index 93% rename from egs2/TEMPLATE/asr1/pyscripts/utils/extract_xvectors.py rename to egs2/TEMPLATE/asr1/pyscripts/utils/extract_spk_embed.py index a0f688be7b7..c40b6c323d6 100755 --- a/egs2/TEMPLATE/asr1/pyscripts/utils/extract_xvectors.py +++ b/egs2/TEMPLATE/asr1/pyscripts/utils/extract_spk_embed.py @@ -82,11 +82,11 @@ def __init__(self, args, device): ) self.model.to(device).eval() elif self.toolkit == "espnet": - from espnet2.bin.spk_inference import Speech2Embedding + from espnet2.bin.spk_inference import Speech2SpkEmbedding # NOTE(jiatong): set default config file as None # assume config is the same path as the model file - speech2embedding_kwargs = dict( + speech2spkembedding_kwargs = dict( batch_size=1, dtype="float32", train_config=None, @@ -106,9 +106,9 @@ def __init__(self, args, device): ) model_tag = args.pretrained_model - self.speech2embedding = Speech2Embedding.from_pretrained( + self.speech2spkembedding = Speech2SpkEmbedding.from_pretrained( model_tag=model, - **speech2embedding_kwargs, + **speech2spkembedding_kwargs, ) def _rawnet_extract_embd(self, audio, n_samples=48000, n_segments=10): @@ -132,11 +132,14 @@ def _rawnet_extract_embd(self, audio, n_samples=48000, n_segments=10): return output.mean(0).detach().cpu().numpy() def _espnet_extract_embd(self, audio): - if len(audio.shape) > 1: - raise ValueError( + if len(audio.shape) == 2: + logging.info( "Not support multi-channel input for ESPnet pre-trained model" - f"Input data has a shape of {audio.shape}." 
+ f"Input data has shape {audio.shape}, default set avg across channel" ) + audio = np.mean(audio, axis=0) + elif len(audio.shape) > 1: + raise ValueError(f"Input data has shape {audio.shape} thatis not support") audio = torch.from_numpy(audio.astype(np.float32)).to(self.device) output = self.self.speech2embedding(audio) return output.cpu().numpy() From 82acb628e8e685523db9bd09a12fcf6a1a17312d Mon Sep 17 00:00:00 2001 From: ftshijt Date: Wed, 6 Dec 2023 05:53:04 -0500 Subject: [PATCH 15/40] change default to spk embedding instead of xvector --- .../asr1/pyscripts/utils/extract_spk_embed.py | 32 ++++-- egs2/TEMPLATE/tts1/tts.sh | 101 +++++++++--------- 2 files changed, 72 insertions(+), 61 deletions(-) diff --git a/egs2/TEMPLATE/asr1/pyscripts/utils/extract_spk_embed.py b/egs2/TEMPLATE/asr1/pyscripts/utils/extract_spk_embed.py index c40b6c323d6..e4ee827edce 100755 --- a/egs2/TEMPLATE/asr1/pyscripts/utils/extract_spk_embed.py +++ b/egs2/TEMPLATE/asr1/pyscripts/utils/extract_spk_embed.py @@ -27,9 +27,15 @@ def get_parser(): parser.add_argument( "--toolkit", type=str, - help="Toolkit for Extracting X-vectors.", + help="Toolkit for Extracting speaker speaker embeddingss.", choices=["espnet", "speechbrain", "rawnet"], ) + parser.add_argument( + "--spk_embed_tag", + type=str, + help="the target data name (e.g., xvector for xvector.scp)", + default="spk_embed", + ) parser.add_argument("--verbose", type=int, default=1, help="Verbosity level.") parser.add_argument("--device", type=str, default="cuda:0", help="Inference device") parser.add_argument( @@ -38,12 +44,12 @@ def get_parser(): parser.add_argument( "out_folder", type=Path, - help="Output folder to save the xvectors.", + help="Output folder to save the speaker embeddings.", ) return parser -class XVExtractor: +class SpkEmbedExtractor: def __init__(self, args, device): self.toolkit = args.toolkit self.device = device @@ -190,25 +196,29 @@ def main(argv): wav_scp = SoundScpReader(os.path.join(args.in_folder, "wav.scp"), np.float32) os.makedirs(args.out_folder, exist_ok=True) writer_utt = kaldiio.WriteHelper( - "ark,scp:{0}/xvector.ark,{0}/xvector.scp".format(args.out_folder) + "ark,scp:{0}/{1}.ark,{0}/{1}.scp".format( + args.out_folder, args.spk_embed_tag + ) ) writer_spk = kaldiio.WriteHelper( - "ark,scp:{0}/spk_xvector.ark,{0}/spk_xvector.scp".format(args.out_folder) + "ark,scp:{0}/spk_{1}.ark,{0}/spk_{1}.scp".format( + args.out_folder, args.spk_embed_tag + ) ) - xv_extractor = XVExtractor(args, device) + spk_embed_extractor = SpkEmbedExtractor(args, device) for speaker in tqdm(spk2utt): - xvectors = list() + spk_embeddings = list() for utt in spk2utt[speaker]: in_sr, wav = wav_scp[utt] - # X-vector Embedding - embeds = xv_extractor(wav, in_sr) + # Speaker Embedding + embeds = spk_embed_extractor(wav, in_sr) writer_utt[utt] = np.squeeze(embeds) - xvectors.append(embeds) + spk_embeddings.append(embeds) # Speaker Normalization - embeds = np.mean(np.stack(xvectors, 0), 0) + embeds = np.mean(np.stack(spk_embeddings, 0), 0) writer_spk[speaker] = embeds writer_utt.close() writer_spk.close() diff --git a/egs2/TEMPLATE/tts1/tts.sh b/egs2/TEMPLATE/tts1/tts.sh index eef5b9b5904..ef078500fe8 100755 --- a/egs2/TEMPLATE/tts1/tts.sh +++ b/egs2/TEMPLATE/tts1/tts.sh @@ -65,12 +65,12 @@ n_mels=80 # The number of mel basis. f0min=80 # Maximum f0 for pitch extraction. f0max=400 # Minimum f0 for pitch extraction. -# X-Vector related -use_xvector=false # Whether to use x-vector. -xvector_tag= # The tag of xvector folder. 
-xvector_gpu_inference=false # Whether to use gpu to inference xvector. -xvector_tool=kaldi # Toolkit for extracting x-vector (speechbrain, rawnet, espnet, kaldi). -xvector_model=speechbrain/spkrec-ecapa-voxceleb # For only espnet, speechbrain, or rawnet. +# Speaker embedding related +use_spk_embed=false # Whether to use speaker embedding. +spk_embed_tag=espnet_spk # The additional tag of speaker embedding folder, use "xvector" for compatibility. +spk_embed_gpu_inference=false # Whether to use gpu to inference speaker embedding. +spk_embed_tool=espnet # Toolkit for extracting x-vector (speechbrain, rawnet, espnet, kaldi). +spk_embed_model=espnet/voxcelebs12_rawnet3 # For only espnet, speechbrain, or rawnet. # Vocabulary related oov="" # Out of vocabrary symbol. @@ -147,11 +147,11 @@ Options: --audio_format # Audio format: wav, flac, wav.ark, flac.ark (only in feats_type=raw, default="${audio_format}"). --min_wav_duration # Minimum duration in second (default="${min_wav_duration}"). --max_wav_duration # Maximum duration in second (default="${max_wav_duration}"). - --use_xvector # Whether to use X-vector (default="${use_xvector}"). - --xvector_tag # The tag of xvector folder (default="${xvector_tag}"). - --xvector_gpu_inference # Whether to use gpu to inference xvector (default="${xvector_gpu_inference}"). - --xvector_tool # Toolkit for generating the X-vectors (default="${xvector_tool}"). - --xvector_model # Pretrained model to generate the X-vectors (default="${xvector_model}"). + --use_spk_embed # Whether to use speaker_embedding (default="${use_spk_embed}"). + --spk_embed_tag # The tag of speaker embedding folder, use "xvector" for compatibility (default="${spk_embed_tag}"). + --spk_embed_gpu_inference # Whether to use gpu to inference speaker embedding (default="${spk_embed_gpu_inference}"). + --spk_embed_tool # Toolkit for generating the speaker embedding (default="${spk_embed_tool}"). + --spk_embed_model # Pretrained model to generate the speaker embedding (default="${spk_embed_model}"). --use_sid # Whether to use speaker id as the inputs (default="${use_sid}"). --use_lid # Whether to use language id as the inputs (default="${use_lid}"). --feats_extract # On the fly feature extractor (default="${feats_extract}"). @@ -347,10 +347,10 @@ if ! "${skip_data_prep}"; then fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then - # Extract X-vector - if "${use_xvector}"; then - if [ "${xvector_tool}" = "kaldi" ]; then - log "Stage 3.1: Extract X-vector: data/ -> ${dumpdir}/xvector${xvector_tag} (Require Kaldi)" + # Extract speaker embedding + if "${use_spk_embed}"; then + if [ "${spk_embed_tool}" = "kaldi" ]; then + log "Stage 3.1: Extract X-vector with Kaldi: data/ -> ${dumpdir}/${spk_embed_tag} (Require Kaldi)" # Download X-vector pretrained model xvector_exp=${expdir}/xvector_nnet_1a if [ ! -e "${xvector_exp}" ]; then @@ -392,7 +392,7 @@ if ! "${skip_data_prep}"; then sid/nnet3/xvector/extract_xvectors.sh --nj "${_nj}" --cmd "${train_cmd}" \ "${xvector_exp}" \ "${dumpdir}/mfcc/${dset}" \ - "${dumpdir}/xvector${xvector_tag}/${dset}" + "${dumpdir}/${spk_embed_tag}/${dset}" # 5. Filter scp # NOTE(kan-bayashi): Since sometimes mfcc or x-vector extraction is failed, @@ -400,15 +400,15 @@ if ! "${skip_data_prep}"; then # To avoid this mismatch, perform filtering of the original feature scp here. 
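The NOTE above matters because utterances whose MFCC or embedding extraction fails must also be dropped from wav.scp before training. The following is a rough Python equivalent of the utils/filter_scp.pl step used in the commands that follow; the paths are illustrative, not fixed by the recipe.

```python
def filter_scp(keep_scp: str, in_scp: str, out_scp: str) -> None:
    """Keep only lines of in_scp whose first field (the utterance id)
    also appears as the first field of some line in keep_scp."""
    with open(keep_scp) as f:
        keep = {line.split(maxsplit=1)[0] for line in f if line.strip()}
    with open(in_scp) as fin, open(out_scp, "w") as fout:
        for line in fin:
            if line.strip() and line.split(maxsplit=1)[0] in keep:
                fout.write(line)


# e.g., mirroring the pipeline above:
# filter_scp("dump/spk_embed/train/spk_embed.scp",
#            "data/train/wav.scp.bak", "data/train/wav.scp")
```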
cp "${data_feats}${_suf}/${dset}"/wav.{scp,scp.bak} <"${data_feats}${_suf}/${dset}/wav.scp.bak" \ - utils/filter_scp.pl "${dumpdir}/xvector${xvector_tag}/${dset}/xvector.scp" \ + utils/filter_scp.pl "${dumpdir}/${spk_embed_tag}/${dset}/${spk_embed_tag}.scp" \ >"${data_feats}${_suf}/${dset}/wav.scp" utils/fix_data_dir.sh "${data_feats}${_suf}/${dset}" done else # Assume that others toolkits are python-based - log "Stage 3.1: Extract X-vector: data/ -> ${dumpdir}/xvector${xvector_tag} using python toolkits" + log "Stage 3.1: Extract speaker embedding: data/ -> ${dumpdir}/${spk_embed_tag} using python toolkits" - if ${xvector_gpu_inference}; then + if ${spk_embed_gpu_inference}; then _cmd="${cuda_cmd}" _ngpu=1 else @@ -422,20 +422,21 @@ if ! "${skip_data_prep}"; then else _suf="" fi - if [ "${xvector_tool}" = "rawnet" ]; then - xvector_model="RawNet" + if [ "${spk_embed_tool}" = "rawnet" ]; then + spk_embed_model="RawNet" fi - ${_cmd} --gpu "${_ngpu}" ${dumpdir}/xvector/${dset}/xvector_extract.log \ - pyscripts/utils/extract_xvectors.py \ - --pretrained_model ${xvector_model} \ - --toolkit ${xvector_tool} \ + ${_cmd} --gpu "${_ngpu}" ${dumpdir}/${spk_embed_tag}/${dset}/spk_embed_extract.log \ + pyscripts/utils/extract_spk_embed.py \ + --pretrained_model ${spk_embed_model} \ + --toolkit ${spk_embed_tool} \ + --spk_embed_tag ${spk_embed_tag} \ ${data_feats}${_suf}/${dset} \ - ${dumpdir}/xvector${xvector_tag}/${dset} + ${dumpdir}/${spk_embed_tag}/${dset} done fi else - log "Skip Stage 3.1, no xvector extraction set" + log "Skip Stage 3.1, no speaker embedding extraction set" fi # Prepare spk id input @@ -531,12 +532,12 @@ if ! "${skip_data_prep}"; then # shellcheck disable=SC2086 utils/fix_data_dir.sh --utt_extra_files "${_utt_extra_files}" "${data_feats}/${dset}" - # Filter x-vector - if "${use_xvector}"; then - cp "${dumpdir}/xvector${xvector_tag}/${dset}"/xvector.{scp,scp.bak} - <"${dumpdir}/xvector${xvector_tag}/${dset}/xvector.scp.bak" \ + # Filter spk_embedding + if "${use_spk_embed}"; then + cp "${dumpdir}/${spk_embed_tag}/${dset}"/${spk_embed_tag}.{scp,scp.bak} + <"${dumpdir}/${spk_embed_tag}/${dset}/${spk_embed_tag}.scp.bak" \ utils/filter_scp.pl "${data_feats}/${dset}/wav.scp" \ - >"${dumpdir}/xvector${xvector_tag}/${dset}/xvector.scp" + >"${dumpdir}/${spk_embed_tag}/${dset}/${spk_embed_tag}.scp" fi done fi @@ -621,11 +622,11 @@ if ! "${skip_train}"; then _opts+="--valid_data_path_and_name_and_type ${_teacher_valid_dir}/durations,durations,text_int " fi - if "${use_xvector}"; then - _xvector_train_dir="${dumpdir}/xvector${xvector_tag}/${train_set}" - _xvector_valid_dir="${dumpdir}/xvector${xvector_tag}/${valid_set}" - _opts+="--train_data_path_and_name_and_type ${_xvector_train_dir}/xvector.scp,spembs,kaldi_ark " - _opts+="--valid_data_path_and_name_and_type ${_xvector_valid_dir}/xvector.scp,spembs,kaldi_ark " + if "${use_spk_embed}"; then + _spk_embed_train_dir="${dumpdir}/${spk_embed_tag}/${train_set}" + _spk_embed_valid_dir="${dumpdir}/${spk_embed_tag}/${valid_set}" + _opts+="--train_data_path_and_name_and_type ${_spk_embed_train_dir}/${spk_embed_tag}.scp,spembs,kaldi_ark " + _opts+="--valid_data_path_and_name_and_type ${_spk_embed_valid_dir}/${spk_embed_tag}.scp,spembs,kaldi_ark " fi if "${use_sid}"; then @@ -871,12 +872,12 @@ if ! 
"${skip_train}"; then _opts+="--energy_normalize_conf stats_file=${tts_stats_dir}/train/energy_stats.npz " fi - # Add X-vector to the inputs if needed - if "${use_xvector}"; then - _xvector_train_dir="${dumpdir}/xvector${xvector_tag}/${train_set}" - _xvector_valid_dir="${dumpdir}/xvector${xvector_tag}/${valid_set}" - _opts+="--train_data_path_and_name_and_type ${_xvector_train_dir}/xvector.scp,spembs,kaldi_ark " - _opts+="--valid_data_path_and_name_and_type ${_xvector_valid_dir}/xvector.scp,spembs,kaldi_ark " + # Add speaker embedding to the inputs if needed + if "${use_spk_embed}"; then + _spk_embed_train_dir="${dumpdir}/${spk_embed_tag}/${train_set}" + _spk_embed_valid_dir="${dumpdir}/${spk_embed_tag}/${valid_set}" + _opts+="--train_data_path_and_name_and_type ${_spk_embed_train_dir}/${spk_embed_tag}.scp,spembs,kaldi_ark " + _opts+="--valid_data_path_and_name_and_type ${_spk_embed_valid_dir}/${spk_embed_tag}.scp,spembs,kaldi_ark " fi # Add spekaer ID to the inputs if needed @@ -1004,10 +1005,10 @@ if ! "${skip_eval}"; then fi fi - # Add X-vector to the inputs if needed - if "${use_xvector}"; then - _xvector_dir="${dumpdir}/xvector${xvector_tag}/${dset}" - _ex_opts+="--data_path_and_name_and_type ${_xvector_dir}/xvector.scp,spembs,kaldi_ark " + # Add speaker embedding to the inputs if needed + if "${use_spk_embed}"; then + _spk_embed_dir="${dumpdir}/${spk_embed_tag}/${dset}" + _ex_opts+="--data_path_and_name_and_type ${_spk_embed_dir}/${spk_embed_tag}.scp,spembs,kaldi_ark " fi # Add spekaer ID to the inputs if needed @@ -1124,10 +1125,10 @@ if [ -z "${download_model}" ]; then if [ -e "${tts_stats_dir}/train/energy_stats.npz" ]; then _opts+=" --option ${tts_stats_dir}/train/energy_stats.npz" fi - if "${use_xvector}"; then + if "${use_spk_embed}"; then for dset in "${train_set}" ${test_sets}; do - _opts+=" --option ${dumpdir}/xvector${xvector_tag}/${dset}/spk_xvector.scp" - _opts+=" --option ${dumpdir}/xvector${xvector_tag}/${dset}/spk_xvector.ark" + _opts+=" --option ${dumpdir}/${spk_embed_tag}/${dset}/${spk_embed_tag}.scp" + _opts+=" --option ${dumpdir}/${spk_embed_tag}/${dset}/${spk_embed_tag}.ark" done fi if "${use_sid}"; then From 3d9aee5d9995a58dabfe44cb006a10099171fa7b Mon Sep 17 00:00:00 2001 From: ftshijt Date: Wed, 6 Dec 2023 06:20:58 -0500 Subject: [PATCH 16/40] fix spk embedding naming --- ...vectors.py => convert_to_avg_spk_embed.py} | 72 +- .../steps/data/data_dir_manipulation_lib.py | 21 +- .../asr1/steps/data/reverberate_data_dir.py | 1021 +++-- egs2/TEMPLATE/asr1/steps/libs/__init__.py | 2 - egs2/TEMPLATE/asr1/steps/libs/common.py | 265 +- .../asr1/steps/libs/nnet3/__init__.py | 2 - .../asr1/steps/libs/nnet3/report/__init__.py | 2 - .../asr1/steps/libs/nnet3/report/log_parse.py | 466 ++- .../asr1/steps/libs/nnet3/train/__init__.py | 1 - .../libs/nnet3/train/chain_objf/__init__.py | 2 - .../nnet3/train/chain_objf/acoustic_model.py | 703 ++-- .../asr1/steps/libs/nnet3/train/common.py | 1184 +++--- .../libs/nnet3/train/dropout_schedule.py | 222 +- .../nnet3/train/frame_level_objf/__init__.py | 2 - .../train/frame_level_objf/acoustic_model.py | 97 +- .../nnet3/train/frame_level_objf/common.py | 698 ++-- .../nnet3/train/frame_level_objf/raw_model.py | 72 +- .../steps/libs/nnet3/xconfig/attention.py | 360 +- .../steps/libs/nnet3/xconfig/basic_layers.py | 1126 +++--- .../libs/nnet3/xconfig/composite_layers.py | 339 +- .../steps/libs/nnet3/xconfig/convolution.py | 1304 ++++--- .../asr1/steps/libs/nnet3/xconfig/gru.py | 3363 ++++++++++++----- 
.../asr1/steps/libs/nnet3/xconfig/lstm.py | 1760 ++++++--- .../asr1/steps/libs/nnet3/xconfig/parser.py | 202 +- .../steps/libs/nnet3/xconfig/stats_layer.py | 143 +- .../libs/nnet3/xconfig/trivial_layers.py | 497 ++- .../asr1/steps/libs/nnet3/xconfig/utils.py | 479 ++- .../convert_utt2spk_and_segments_to_rttm.py | 49 +- .../asr1/utils/data/extend_segment_times.py | 94 +- egs2/TEMPLATE/tts1/README.md | 40 +- egs2/TEMPLATE/tts1/tts.sh | 3 + 31 files changed, 9135 insertions(+), 5456 deletions(-) rename egs2/TEMPLATE/asr1/pyscripts/utils/{convert_to_avg_xvectors.py => convert_to_avg_spk_embed.py} (51%) diff --git a/egs2/TEMPLATE/asr1/pyscripts/utils/convert_to_avg_xvectors.py b/egs2/TEMPLATE/asr1/pyscripts/utils/convert_to_avg_spk_embed.py similarity index 51% rename from egs2/TEMPLATE/asr1/pyscripts/utils/convert_to_avg_xvectors.py rename to egs2/TEMPLATE/asr1/pyscripts/utils/convert_to_avg_spk_embed.py index 23d5ac81888..6cad05d1c0e 100755 --- a/egs2/TEMPLATE/asr1/pyscripts/utils/convert_to_avg_xvectors.py +++ b/egs2/TEMPLATE/asr1/pyscripts/utils/convert_to_avg_spk_embed.py @@ -9,31 +9,31 @@ def get_parser(): parser = argparse.ArgumentParser( description=""" - Replaces xvectors in a specified xvector directory with the average xvector + Replaces spk_embeds in a specified spk_embed directory with the average spk_embed for a given speaker. - The xvectors generally reside in dump/xvector//xvector.scp, whereas - speaker-averaged xvectors reside in dump/xvector//spk_xvector.scp. + The spk_embeds generally reside in dump/${spk_embed_tag}//${spk_embed_tag}.scp, whereas + speaker-averaged spk_embeds reside in dump/${spk_embed_tag}//spk_${spk_embed_tag}.scp. - The old xvector.scp file will be renamed to xvector.scp.bak and + The old spk_embed.scp file will be renamed to spk_embed.scp.bak and the corresponding .ark files are left unchanged. - If no speaker id is provided, the average xvector for the speaker who + If no speaker id is provided, the average spk_embed for the speaker who the utterance belongs to will be used in each case. - At inference time in a TTS task, you are unlikely to have the xvector - for that sentence in particular. Thus, using average xvectors + At inference time in a TTS task, you are unlikely to have the spk_embed + for that sentence in particular. Thus, using average spk_embeds during training may yield better performance at inference time. This is also useful for conditioning inference on a particular speaker. To transform the training data, this script should be run after - xvectors are extracted (stage 2), but before training commences (stage 6). + spk_embeds are extracted (stage 3), but before training commences (stage 7). 
""" ) parser.add_argument( - "--xvector-path", + "--utt-embed-path", type=str, required=True, - help="Path to the xvector file to be modified.", + help="Path to the spk_embed file to be modified.", ) group = parser.add_mutually_exclusive_group(required=True) group.add_argument( @@ -47,30 +47,30 @@ def get_parser(): help="Path to the relevant utt2spk file, if the source speakers are used", ) parser.add_argument( - "--spk-xvector-path", + "--spk-embed-path", type=str, required=True, - help="The path to the spk_xvector.scp file for the speakers being used.", + help="The path to the spk_{spk_embed_tag}.scp file for the speakers being used.", ) return parser def check_args(args): - xvector_path = args.xvector_path - spk_xvector_path = args.spk_xvector_path + spk_embed_path = args.spk_embed_path + spk_spk_embed_path = args.spk_spk_embed_path utt2spk = args.utt2spk - if not os.path.exists(xvector_path): + if not os.path.exists(spk_embed_path): sys.stderr.write( - f"Error: provided --xvector-path ({xvector_path}) does not exist. " + f"Error: provided --utt-embed-path ({utt_embed_path}) does not exist. " ) sys.stderr.write("Exiting...\n") sys.stderr.flush() exit(1) - if not os.path.exists(spk_xvector_path): + if not os.path.exists(spk_spk_embed_path): sys.stderr.write( - f"Error: provided --spk-xvector-path ({spk_xvector_path}) does not exist. " + f"Error: provided --spk-embed-path ({spk_embed_path}) does not exist. " ) sys.stderr.write("Exiting...\n") sys.stderr.flush() @@ -88,34 +88,34 @@ def check_args(args): check_args(args) spk_id = args.spk_id utt2spk = args.utt2spk - xvector_path = args.xvector_path - spk_xvector_path = args.spk_xvector_path + utt_embed_path = args.utt_embed_path + spk_embed_path = args.spk_embed_path - print(f"Loading {spk_xvector_path}...") - spk_xvector_paths = {} - with open(spk_xvector_path) as spembfile: + print(f"Loading {spk_embed_path}...") + spk_embed_paths = {} + with open(spk_embed_path) as spembfile: for line in spembfile.readlines(): spkid, spembpath = line.split() - spk_xvector_paths[spkid] = spembpath + spk_embed_paths[spkid] = spembpath - if spk_id and (spk_id not in spk_xvector_paths): + if spk_id and (spk_id not in spk_embed_paths): sys.stderr.write( - f"Error: provided --spk-id: {spk_id} not present in --spk-xvector-path." + f"Error: provided --spk-id: {spk_id} not present in --spk-embed-path." 
) sys.stderr.write("Exiting...\n") sys.stderr.flush() exit(1) - print("Backing up xvector file...") - print(os.path.dirname(xvector_path)) - shutil.copy(xvector_path, f"{os.path.dirname(xvector_path)}/xvector.scp.bak") + print("Backing up utt_embed file...") + print(os.path.dirname(utt_embed_path)) + shutil.copy(utt_embed_path, f"{os.path.dirname(utt_embed_path)}/${spk_embed_tag}.scp.bak") - utt2xvector = [] - with open(args.xvector_path) as f: + utt2spk_embed = [] + with open(args.utt_embed_path) as f: for line in f.readlines(): - utt, xvector = line.split() - utt2xvector.append((utt, spk_xvector_paths[spk_id])) + utt, spk_embed = line.split() + utt2spk_embed.append((utt, spk_embed_paths[spk_id])) - with open(args.xvector_path, "w") as f: - for utt, xvector in utt2xvector: - f.write(f"{utt} {xvector}\n") + with open(args.utt_embed_path, "w") as f: + for utt, spk_embed in utt2spk_embed: + f.write(f"{utt} {spk_embed}\n") diff --git a/egs2/TEMPLATE/asr1/steps/data/data_dir_manipulation_lib.py b/egs2/TEMPLATE/asr1/steps/data/data_dir_manipulation_lib.py index 0092424ac5f..3e6361eee4a 100644 --- a/egs2/TEMPLATE/asr1/steps/data/data_dir_manipulation_lib.py +++ b/egs2/TEMPLATE/asr1/steps/data/data_dir_manipulation_lib.py @@ -1,17 +1,22 @@ import subprocess -def RunKaldiCommand(command, wait = True): - """ Runs commands frequently seen in Kaldi scripts. These are usually a - sequence of commands connected by pipes, so we use shell=True """ - #logger.info("Running the command\n{0}".format(command)) - p = subprocess.Popen(command, shell = True, - stdout = subprocess.PIPE, - stderr = subprocess.PIPE) + +def RunKaldiCommand(command, wait=True): + """Runs commands frequently seen in Kaldi scripts. These are usually a + sequence of commands connected by pipes, so we use shell=True""" + # logger.info("Running the command\n{0}".format(command)) + p = subprocess.Popen( + command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) if wait: [stdout, stderr] = p.communicate() if p.returncode is not 0: - raise Exception("There was an error while running the command {0}\n------------\n{1}".format(command, stderr)) + raise Exception( + "There was an error while running the command {0}\n------------\n{1}".format( + command, stderr + ) + ) return stdout, stderr else: return p diff --git a/egs2/TEMPLATE/asr1/steps/data/reverberate_data_dir.py b/egs2/TEMPLATE/asr1/steps/data/reverberate_data_dir.py index ea504244d38..fb9a9624467 100755 --- a/egs2/TEMPLATE/asr1/steps/data/reverberate_data_dir.py +++ b/egs2/TEMPLATE/asr1/steps/data/reverberate_data_dir.py @@ -7,87 +7,168 @@ import argparse, shlex, glob, math, os, random, sys, warnings, copy, imp, ast -data_lib = imp.load_source('dml', 'steps/data/data_dir_manipulation_lib.py') +data_lib = imp.load_source("dml", "steps/data/data_dir_manipulation_lib.py") + def get_args(): # we add required arguments as named arguments for readability - parser = argparse.ArgumentParser(description="Reverberate the data directory with an option " - "to add isotropic and point source noises. " - "Usage: reverberate_data_dir.py [options...] " - "E.g. 
reverberate_data_dir.py --rir-set-parameters rir_list " - "--foreground-snrs 20:10:15:5:0 --background-snrs 20:10:15:5:0 " - "--noise-list-file noise_list --speech-rvb-probability 1 --num-replications 2 " - "--random-seed 1 data/train data/train_rvb", - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - - parser.add_argument("--rir-set-parameters", type=str, action='append', required = True, dest = "rir_set_para_array", - help="Specifies the parameters of an RIR set. " - "Supports the specification of mixture_weight and rir_list_file_name. The mixture weight is optional. " - "The default mixture weight is the probability mass remaining after adding the mixture weights " - "of all the RIR lists, uniformly divided among the RIR lists without mixture weights. " - "E.g. --rir-set-parameters '0.3, rir_list' or 'rir_list' " - "the format of the RIR list file is " - "--rir-id --room-id " - "--receiver-position-id --source-position-id " - "--rt-60 --drr location " - "E.g. --rir-id 00001 --room-id 001 --receiver-position-id 001 --source-position-id 00001 " - "--rt60 0.58 --drr -4.885 data/impulses/Room001-00001.wav") - parser.add_argument("--noise-set-parameters", type=str, action='append', default = None, dest = "noise_set_para_array", - help="Specifies the parameters of an noise set. " - "Supports the specification of mixture_weight and noise_list_file_name. The mixture weight is optional. " - "The default mixture weight is the probability mass remaining after adding the mixture weights " - "of all the noise lists, uniformly divided among the noise lists without mixture weights. " - "E.g. --noise-set-parameters '0.3, noise_list' or 'noise_list' " - "the format of the noise list file is " - "--noise-id --noise-type " - "--bg-fg-type " - "--room-linkage " - "location " - "E.g. --noise-id 001 --noise-type isotropic --rir-id 00019 iso_noise.wav") - parser.add_argument("--num-replications", type=int, dest = "num_replicas", default = 1, - help="Number of replicate to generated for the data") - parser.add_argument('--foreground-snrs', type=str, dest = "foreground_snr_string", default = '20:10:0', help='When foreground noises are being added the script will iterate through these SNRs.') - parser.add_argument('--background-snrs', type=str, dest = "background_snr_string", default = '20:10:0', help='When background noises are being added the script will iterate through these SNRs.') - parser.add_argument('--prefix', type=str, default = None, help='This prefix will modified for each reverberated copy, by adding additional affixes.') - parser.add_argument("--speech-rvb-probability", type=float, default = 1.0, - help="Probability of reverberating a speech signal, e.g. 0 <= p <= 1") - parser.add_argument("--pointsource-noise-addition-probability", type=float, default = 1.0, - help="Probability of adding point-source noises, e.g. 0 <= p <= 1") - parser.add_argument("--isotropic-noise-addition-probability", type=float, default = 1.0, - help="Probability of adding isotropic noises, e.g. 0 <= p <= 1") - parser.add_argument("--rir-smoothing-weight", type=float, default = 0.3, - help="Smoothing weight for the RIR probabilties, e.g. 0 <= p <= 1. If p = 0, no smoothing will be done. " - "The RIR distribution will be mixed with a uniform distribution according to the smoothing weight") - parser.add_argument("--noise-smoothing-weight", type=float, default = 0.3, - help="Smoothing weight for the noise probabilties, e.g. 0 <= p <= 1. If p = 0, no smoothing will be done. 
" - "The noise distribution will be mixed with a uniform distribution according to the smoothing weight") - parser.add_argument("--max-noises-per-minute", type=int, default = 2, - help="This controls the maximum number of point-source noises that could be added to a recording according to its duration") - parser.add_argument('--random-seed', type=int, default=0, help='seed to be used in the randomization of impulses and noises') - parser.add_argument("--shift-output", type=str, help="If true, the reverberated waveform will be shifted by the amount of the peak position of the RIR", - choices=['true', 'false'], default = "true") - parser.add_argument('--source-sampling-rate', type=int, default=None, - help="Sampling rate of the source data. If a positive integer is specified with this option, " - "the RIRs/noises will be resampled to the rate of the source data.") - parser.add_argument("--include-original-data", type=str, help="If true, the output data includes one copy of the original data", - choices=['true', 'false'], default = "false") - parser.add_argument("input_dir", - help="Input data directory") - parser.add_argument("output_dir", - help="Output data directory") - - print(' '.join(sys.argv)) + parser = argparse.ArgumentParser( + description="Reverberate the data directory with an option " + "to add isotropic and point source noises. " + "Usage: reverberate_data_dir.py [options...] " + "E.g. reverberate_data_dir.py --rir-set-parameters rir_list " + "--foreground-snrs 20:10:15:5:0 --background-snrs 20:10:15:5:0 " + "--noise-list-file noise_list --speech-rvb-probability 1 --num-replications 2 " + "--random-seed 1 data/train data/train_rvb", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + + parser.add_argument( + "--rir-set-parameters", + type=str, + action="append", + required=True, + dest="rir_set_para_array", + help="Specifies the parameters of an RIR set. " + "Supports the specification of mixture_weight and rir_list_file_name. The mixture weight is optional. " + "The default mixture weight is the probability mass remaining after adding the mixture weights " + "of all the RIR lists, uniformly divided among the RIR lists without mixture weights. " + "E.g. --rir-set-parameters '0.3, rir_list' or 'rir_list' " + "the format of the RIR list file is " + "--rir-id --room-id " + "--receiver-position-id --source-position-id " + "--rt-60 --drr location " + "E.g. --rir-id 00001 --room-id 001 --receiver-position-id 001 --source-position-id 00001 " + "--rt60 0.58 --drr -4.885 data/impulses/Room001-00001.wav", + ) + parser.add_argument( + "--noise-set-parameters", + type=str, + action="append", + default=None, + dest="noise_set_para_array", + help="Specifies the parameters of an noise set. " + "Supports the specification of mixture_weight and noise_list_file_name. The mixture weight is optional. " + "The default mixture weight is the probability mass remaining after adding the mixture weights " + "of all the noise lists, uniformly divided among the noise lists without mixture weights. " + "E.g. --noise-set-parameters '0.3, noise_list' or 'noise_list' " + "the format of the noise list file is " + "--noise-id --noise-type " + "--bg-fg-type " + "--room-linkage " + "location " + "E.g. 
--noise-id 001 --noise-type isotropic --rir-id 00019 iso_noise.wav",
    )
    parser.add_argument(
        "--num-replications",
        type=int,
        dest="num_replicas",
        default=1,
        help="Number of replicas to generate for the data",
    )
    parser.add_argument(
        "--foreground-snrs",
        type=str,
        dest="foreground_snr_string",
        default="20:10:0",
        help="When foreground noises are being added the script will iterate through these SNRs.",
    )
    parser.add_argument(
        "--background-snrs",
        type=str,
        dest="background_snr_string",
        default="20:10:0",
        help="When background noises are being added the script will iterate through these SNRs.",
    )
    parser.add_argument(
        "--prefix",
        type=str,
        default=None,
        help="This prefix will be modified for each reverberated copy, by adding additional affixes.",
    )
    parser.add_argument(
        "--speech-rvb-probability",
        type=float,
        default=1.0,
        help="Probability of reverberating a speech signal, e.g. 0 <= p <= 1",
    )
    parser.add_argument(
        "--pointsource-noise-addition-probability",
        type=float,
        default=1.0,
        help="Probability of adding point-source noises, e.g. 0 <= p <= 1",
    )
    parser.add_argument(
        "--isotropic-noise-addition-probability",
        type=float,
        default=1.0,
        help="Probability of adding isotropic noises, e.g. 0 <= p <= 1",
    )
    parser.add_argument(
        "--rir-smoothing-weight",
        type=float,
        default=0.3,
        help="Smoothing weight for the RIR probabilities, e.g. 0 <= p <= 1. If p = 0, no smoothing will be done. "
        "The RIR distribution will be mixed with a uniform distribution according to the smoothing weight",
    )
    parser.add_argument(
        "--noise-smoothing-weight",
        type=float,
        default=0.3,
        help="Smoothing weight for the noise probabilities, e.g. 0 <= p <= 1. If p = 0, no smoothing will be done. "
        "The noise distribution will be mixed with a uniform distribution according to the smoothing weight",
    )
    parser.add_argument(
        "--max-noises-per-minute",
        type=int,
        default=2,
        help="This controls the maximum number of point-source noises that could be added to a recording according to its duration",
    )
    parser.add_argument(
        "--random-seed",
        type=int,
        default=0,
        help="Seed to be used in the randomization of impulses and noises",
    )
    parser.add_argument(
        "--shift-output",
        type=str,
        help="If true, the reverberated waveform will be shifted by the amount of the peak position of the RIR",
        choices=["true", "false"],
        default="true",
    )
    parser.add_argument(
        "--source-sampling-rate",
        type=int,
        default=None,
        help="Sampling rate of the source data. 
If a positive integer is specified with this option, " + "the RIRs/noises will be resampled to the rate of the source data.", + ) + parser.add_argument( + "--include-original-data", + type=str, + help="If true, the output data includes one copy of the original data", + choices=["true", "false"], + default="false", + ) + parser.add_argument("input_dir", help="Input data directory") + parser.add_argument("output_dir", help="Output data directory") + + print(" ".join(sys.argv)) args = parser.parse_args() args = check_args(args) return args + def check_args(args): if args.prefix is None: if args.num_replicas > 1 or args.include_original_data == "true": args.prefix = "rvb" - warnings.warn("--prefix is set to 'rvb' as more than one copy of data is generated") + warnings.warn( + "--prefix is set to 'rvb' as more than one copy of data is generated" + ) if not args.num_replicas > 0: raise Exception("--num-replications cannot be non-positive") @@ -95,11 +176,21 @@ def check_args(args): if args.speech_rvb_probability < 0 or args.speech_rvb_probability > 1: raise Exception("--speech-rvb-probability must be between 0 and 1") - if args.pointsource_noise_addition_probability < 0 or args.pointsource_noise_addition_probability > 1: - raise Exception("--pointsource-noise-addition-probability must be between 0 and 1") - - if args.isotropic_noise_addition_probability < 0 or args.isotropic_noise_addition_probability > 1: - raise Exception("--isotropic-noise-addition-probability must be between 0 and 1") + if ( + args.pointsource_noise_addition_probability < 0 + or args.pointsource_noise_addition_probability > 1 + ): + raise Exception( + "--pointsource-noise-addition-probability must be between 0 and 1" + ) + + if ( + args.isotropic_noise_addition_probability < 0 + or args.isotropic_noise_addition_probability > 1 + ): + raise Exception( + "--isotropic-noise-addition-probability must be between 0 and 1" + ) if args.rir_smoothing_weight < 0 or args.rir_smoothing_weight > 1: raise Exception("--rir-smoothing-weight must be between 0 and 1") @@ -129,11 +220,12 @@ def __next__(self): next = __next__ # for Python 2 + def pick_item_with_probability(x): - """ This functions picks an item from the collection according to the associated - probability distribution. The probability estimate of each item in the collection - is stored in the "probability" field of the particular item. x : a - collection (list or dictionary) where the values contain a field called probability + """This functions picks an item from the collection according to the associated + probability distribution. The probability estimate of each item in the collection + is stored in the "probability" field of the particular item. 
x : a + collection (list or dictionary) where the values contain a field called probability """ if isinstance(x, dict): keylist = list(x.keys()) @@ -149,53 +241,58 @@ def pick_item_with_probability(x): if accumulate_p + item.probability >= p: return item accumulate_p += item.probability - assert False, "Shouldn't get here as the accumulated probability should always equal to 1" + assert ( + False + ), "Shouldn't get here as the accumulated probability should always equal to 1" -def parse_file_to_dict(file, assert2fields = False, value_processor = None): - """ This function parses a file and pack the data into a dictionary - It is useful for parsing file like wav.scp, utt2spk, text...etc +def parse_file_to_dict(file, assert2fields=False, value_processor=None): + """This function parses a file and pack the data into a dictionary + It is useful for parsing file like wav.scp, utt2spk, text...etc """ if value_processor is None: value_processor = lambda x: x[0] dict = {} - for line in open(file, 'r', encoding='utf-8'): + for line in open(file, "r", encoding="utf-8"): parts = line.split() if assert2fields: - assert(len(parts) == 2) + assert len(parts) == 2 dict[parts[0]] = value_processor(parts[1:]) return dict + def write_dict_to_file(dict, file_name): - """ This function creates a file and write the content of a dictionary into it - """ - file = open(file_name, 'w', encoding='utf-8') + """This function creates a file and write the content of a dictionary into it""" + file = open(file_name, "w", encoding="utf-8") keys = sorted(dict.keys()) for key in keys: value = dict[key] - if type(value) in [list, tuple] : + if type(value) in [list, tuple]: if type(value) is tuple: value = list(value) value = sorted(value) - value = ' '.join(str(value)) - file.write('{0} {1}\n'.format(key, value)) + value = " ".join(str(value)) + file.write("{0} {1}\n".format(key, value)) file.close() -def create_corrupted_utt2uniq(input_dir, output_dir, num_replicas, include_original, prefix): - """This function creates the utt2uniq file from the utterance id in utt2spk file - """ +def create_corrupted_utt2uniq( + input_dir, output_dir, num_replicas, include_original, prefix +): + """This function creates the utt2uniq file from the utterance id in utt2spk file""" corrupted_utt2uniq = {} # Parse the utt2spk to get the utterance id - utt2spk = parse_file_to_dict(input_dir + "/utt2spk", value_processor = lambda x: " ".join(x)) + utt2spk = parse_file_to_dict( + input_dir + "/utt2spk", value_processor=lambda x: " ".join(x) + ) keys = sorted(utt2spk.keys()) if include_original: start_index = 0 else: start_index = 1 - for i in range(start_index, num_replicas+1): + for i in range(start_index, num_replicas + 1): for utt_id in keys: new_utt_id = get_new_id(utt_id, prefix, i) corrupted_utt2uniq[new_utt_id] = utt_id @@ -203,16 +300,21 @@ def create_corrupted_utt2uniq(input_dir, output_dir, num_replicas, include_origi write_dict_to_file(corrupted_utt2uniq, output_dir + "/utt2uniq") -def add_point_source_noise(noise_addition_descriptor, # descriptor to store the information of the noise added - room, # the room selected - pointsource_noise_list, # the point source noise list - pointsource_noise_addition_probability, # Probability of adding point-source noises - foreground_snrs, # the SNR for adding the foreground noises - background_snrs, # the SNR for adding the background noises - speech_dur, # duration of the recording - max_noises_recording # Maximum number of point-source noises that can be added - ): - if len(pointsource_noise_list) 
> 0 and random.random() < pointsource_noise_addition_probability and max_noises_recording >= 1: +def add_point_source_noise( + noise_addition_descriptor, # descriptor to store the information of the noise added + room, # the room selected + pointsource_noise_list, # the point source noise list + pointsource_noise_addition_probability, # Probability of adding point-source noises + foreground_snrs, # the SNR for adding the foreground noises + background_snrs, # the SNR for adding the background noises + speech_dur, # duration of the recording + max_noises_recording, # Maximum number of point-source noises that can be added +): + if ( + len(pointsource_noise_list) > 0 + and random.random() < pointsource_noise_addition_probability + and max_noises_recording >= 1 + ): for k in range(random.randint(1, max_noises_recording)): # pick the RIR to reverberate the point-source noise noise = pick_item_with_probability(pointsource_noise_list) @@ -220,42 +322,53 @@ def add_point_source_noise(noise_addition_descriptor, # descriptor to store the # If it is a background noise, the noise will be extended and be added to the whole speech # if it is a foreground noise, the noise will not extended and be added at a random time of the speech if noise.bg_fg_type == "background": - noise_rvb_command = """wav-reverberate --impulse-response="{0}" --duration={1}""".format(noise_rir.rir_rspecifier, speech_dur) - noise_addition_descriptor['start_times'].append(0) - noise_addition_descriptor['snrs'].append(next(background_snrs)) + noise_rvb_command = """wav-reverberate --impulse-response="{0}" --duration={1}""".format( + noise_rir.rir_rspecifier, speech_dur + ) + noise_addition_descriptor["start_times"].append(0) + noise_addition_descriptor["snrs"].append(next(background_snrs)) else: - noise_rvb_command = """wav-reverberate --impulse-response="{0}" """.format(noise_rir.rir_rspecifier) - noise_addition_descriptor['start_times'].append(round(random.random() * speech_dur, 2)) - noise_addition_descriptor['snrs'].append(next(foreground_snrs)) + noise_rvb_command = ( + """wav-reverberate --impulse-response="{0}" """.format( + noise_rir.rir_rspecifier + ) + ) + noise_addition_descriptor["start_times"].append( + round(random.random() * speech_dur, 2) + ) + noise_addition_descriptor["snrs"].append(next(foreground_snrs)) # check if the rspecifier is a pipe or not if len(noise.noise_rspecifier.split()) == 1: - noise_addition_descriptor['noise_io'].append("{1} {0} - |".format(noise.noise_rspecifier, noise_rvb_command)) + noise_addition_descriptor["noise_io"].append( + "{1} {0} - |".format(noise.noise_rspecifier, noise_rvb_command) + ) else: - noise_addition_descriptor['noise_io'].append("{0} {1} - - |".format(noise.noise_rspecifier, noise_rvb_command)) + noise_addition_descriptor["noise_io"].append( + "{0} {1} - - |".format(noise.noise_rspecifier, noise_rvb_command) + ) return noise_addition_descriptor -def generate_reverberation_opts(room_dict, # the room dictionary, please refer to make_room_dict() for the format - pointsource_noise_list, # the point source noise list - iso_noise_dict, # the isotropic noise dictionary - foreground_snrs, # the SNR for adding the foreground noises - background_snrs, # the SNR for adding the background noises - speech_rvb_probability, # Probability of reverberating a speech signal - isotropic_noise_addition_probability, # Probability of adding isotropic noises - pointsource_noise_addition_probability, # Probability of adding point-source noises - speech_dur, # duration of the recording - 
max_noises_recording # Maximum number of point-source noises that can be added - ): - """ This function randomly decides whether to reverberate, and sample a RIR if it does - It also decides whether to add the appropriate noises - This function return the string of options to the binary wav-reverberate +def generate_reverberation_opts( + room_dict, # the room dictionary, please refer to make_room_dict() for the format + pointsource_noise_list, # the point source noise list + iso_noise_dict, # the isotropic noise dictionary + foreground_snrs, # the SNR for adding the foreground noises + background_snrs, # the SNR for adding the background noises + speech_rvb_probability, # Probability of reverberating a speech signal + isotropic_noise_addition_probability, # Probability of adding isotropic noises + pointsource_noise_addition_probability, # Probability of adding point-source noises + speech_dur, # duration of the recording + max_noises_recording, # Maximum number of point-source noises that can be added +): + """This function randomly decides whether to reverberate, and sample a RIR if it does + It also decides whether to add the appropriate noises + This function return the string of options to the binary wav-reverberate """ reverberate_opts = "" - noise_addition_descriptor = {'noise_io': [], - 'start_times': [], - 'snrs': []} + noise_addition_descriptor = {"noise_io": [], "start_times": [], "snrs": []} # Randomly select the room # Here the room probability is a sum of the probabilities of the RIRs recorded in the room. room = pick_item_with_probability(room_dict) @@ -263,46 +376,71 @@ def generate_reverberation_opts(room_dict, # the room dictionary, please refer speech_rir = pick_item_with_probability(room.rir_list) if random.random() < speech_rvb_probability: # pick the RIR to reverberate the speech - reverberate_opts += """--impulse-response="{0}" """.format(speech_rir.rir_rspecifier) + reverberate_opts += """--impulse-response="{0}" """.format( + speech_rir.rir_rspecifier + ) rir_iso_noise_list = [] if speech_rir.room_id in iso_noise_dict: rir_iso_noise_list = iso_noise_dict[speech_rir.room_id] # Add the corresponding isotropic noise associated with the selected RIR - if len(rir_iso_noise_list) > 0 and random.random() < isotropic_noise_addition_probability: + if ( + len(rir_iso_noise_list) > 0 + and random.random() < isotropic_noise_addition_probability + ): isotropic_noise = pick_item_with_probability(rir_iso_noise_list) # extend the isotropic noise to the length of the speech waveform # check if the rspecifier is a pipe or not if len(isotropic_noise.noise_rspecifier.split()) == 1: - noise_addition_descriptor['noise_io'].append("wav-reverberate --duration={1} {0} - |".format(isotropic_noise.noise_rspecifier, speech_dur)) + noise_addition_descriptor["noise_io"].append( + "wav-reverberate --duration={1} {0} - |".format( + isotropic_noise.noise_rspecifier, speech_dur + ) + ) else: - noise_addition_descriptor['noise_io'].append("{0} wav-reverberate --duration={1} - - |".format(isotropic_noise.noise_rspecifier, speech_dur)) - noise_addition_descriptor['start_times'].append(0) - noise_addition_descriptor['snrs'].append(next(background_snrs)) - - noise_addition_descriptor = add_point_source_noise(noise_addition_descriptor, # descriptor to store the information of the noise added - room, # the room selected - pointsource_noise_list, # the point source noise list - pointsource_noise_addition_probability, # Probability of adding point-source noises - foreground_snrs, # the SNR for adding the 
foreground noises - background_snrs, # the SNR for adding the background noises - speech_dur, # duration of the recording - max_noises_recording # Maximum number of point-source noises that can be added - ) - - assert len(noise_addition_descriptor['noise_io']) == len(noise_addition_descriptor['start_times']) - assert len(noise_addition_descriptor['noise_io']) == len(noise_addition_descriptor['snrs']) - if len(noise_addition_descriptor['noise_io']) > 0: - reverberate_opts += "--additive-signals='{0}' ".format(','.join(noise_addition_descriptor['noise_io'])) - reverberate_opts += "--start-times='{0}' ".format(','.join([str(x) for x in noise_addition_descriptor['start_times']])) - reverberate_opts += "--snrs='{0}' ".format(','.join([str(x) for x in noise_addition_descriptor['snrs']])) + noise_addition_descriptor["noise_io"].append( + "{0} wav-reverberate --duration={1} - - |".format( + isotropic_noise.noise_rspecifier, speech_dur + ) + ) + noise_addition_descriptor["start_times"].append(0) + noise_addition_descriptor["snrs"].append(next(background_snrs)) + + noise_addition_descriptor = add_point_source_noise( + noise_addition_descriptor, # descriptor to store the information of the noise added + room, # the room selected + pointsource_noise_list, # the point source noise list + pointsource_noise_addition_probability, # Probability of adding point-source noises + foreground_snrs, # the SNR for adding the foreground noises + background_snrs, # the SNR for adding the background noises + speech_dur, # duration of the recording + max_noises_recording, # Maximum number of point-source noises that can be added + ) + + assert len(noise_addition_descriptor["noise_io"]) == len( + noise_addition_descriptor["start_times"] + ) + assert len(noise_addition_descriptor["noise_io"]) == len( + noise_addition_descriptor["snrs"] + ) + if len(noise_addition_descriptor["noise_io"]) > 0: + reverberate_opts += "--additive-signals='{0}' ".format( + ",".join(noise_addition_descriptor["noise_io"]) + ) + reverberate_opts += "--start-times='{0}' ".format( + ",".join([str(x) for x in noise_addition_descriptor["start_times"]]) + ) + reverberate_opts += "--snrs='{0}' ".format( + ",".join([str(x) for x in noise_addition_descriptor["snrs"]]) + ) return reverberate_opts + def get_new_id(id, prefix=None, copy=0): - """ This function generates a new id from the input id - This is needed when we have to create multiple copies of the original data - E.g. get_new_id("swb0035", prefix="rvb", copy=1) returns a string "rvb1-swb0035" + """This function generates a new id from the input id + This is needed when we have to create multiple copies of the original data + E.g. 
get_new_id("swb0035", prefix="rvb", copy=1) returns a string "rvb1-swb0035" """ if prefix is not None: new_id = prefix + str(copy) + "-" + id @@ -312,27 +450,28 @@ def get_new_id(id, prefix=None, copy=0): return new_id -def generate_reverberated_wav_scp(wav_scp, # a dictionary whose values are the Kaldi-IO strings of the speech recordings - durations, # a dictionary whose values are the duration (in sec) of the speech recordings - output_dir, # output directory to write the corrupted wav.scp - room_dict, # the room dictionary, please refer to make_room_dict() for the format - pointsource_noise_list, # the point source noise list - iso_noise_dict, # the isotropic noise dictionary - foreground_snr_array, # the SNR for adding the foreground noises - background_snr_array, # the SNR for adding the background noises - num_replicas, # Number of replicate to generated for the data - include_original, # include a copy of the original data - prefix, # prefix for the id of the corrupted utterances - speech_rvb_probability, # Probability of reverberating a speech signal - shift_output, # option whether to shift the output waveform - isotropic_noise_addition_probability, # Probability of adding isotropic noises - pointsource_noise_addition_probability, # Probability of adding point-source noises - max_noises_per_minute # maximum number of point-source noises that can be added to a recording according to its duration - ): - """ This is the main function to generate pipeline command for the corruption - The generic command of wav-reverberate will be like: - wav-reverberate --duration=t --impulse-response=rir.wav - --additive-signals='noise1.wav,noise2.wav' --snrs='snr1,snr2' --start-times='s1,s2' input.wav output.wav +def generate_reverberated_wav_scp( + wav_scp, # a dictionary whose values are the Kaldi-IO strings of the speech recordings + durations, # a dictionary whose values are the duration (in sec) of the speech recordings + output_dir, # output directory to write the corrupted wav.scp + room_dict, # the room dictionary, please refer to make_room_dict() for the format + pointsource_noise_list, # the point source noise list + iso_noise_dict, # the isotropic noise dictionary + foreground_snr_array, # the SNR for adding the foreground noises + background_snr_array, # the SNR for adding the background noises + num_replicas, # Number of replicate to generated for the data + include_original, # include a copy of the original data + prefix, # prefix for the id of the corrupted utterances + speech_rvb_probability, # Probability of reverberating a speech signal + shift_output, # option whether to shift the output waveform + isotropic_noise_addition_probability, # Probability of adding isotropic noises + pointsource_noise_addition_probability, # Probability of adding point-source noises + max_noises_per_minute, # maximum number of point-source noises that can be added to a recording according to its duration +): + """This is the main function to generate pipeline command for the corruption + The generic command of wav-reverberate will be like: + wav-reverberate --duration=t --impulse-response=rir.wav + --additive-signals='noise1.wav,noise2.wav' --snrs='snr1,snr2' --start-times='s1,s2' input.wav output.wav """ foreground_snrs = list_cyclic_iterator(foreground_snr_array) background_snrs = list_cyclic_iterator(background_snr_array) @@ -343,7 +482,7 @@ def generate_reverberated_wav_scp(wav_scp, # a dictionary whose values are the else: start_index = 1 - for i in range(start_index, num_replicas+1): + for i in 
range(start_index, num_replicas + 1): for recording_id in keys: wav_original_pipe = wav_scp[recording_id] # check if it is really a pipe @@ -352,23 +491,28 @@ def generate_reverberated_wav_scp(wav_scp, # a dictionary whose values are the speech_dur = durations[recording_id] max_noises_recording = math.floor(max_noises_per_minute * speech_dur / 60) - reverberate_opts = generate_reverberation_opts(room_dict, # the room dictionary, please refer to make_room_dict() for the format - pointsource_noise_list, # the point source noise list - iso_noise_dict, # the isotropic noise dictionary - foreground_snrs, # the SNR for adding the foreground noises - background_snrs, # the SNR for adding the background noises - speech_rvb_probability, # Probability of reverberating a speech signal - isotropic_noise_addition_probability, # Probability of adding isotropic noises - pointsource_noise_addition_probability, # Probability of adding point-source noises - speech_dur, # duration of the recording - max_noises_recording # Maximum number of point-source noises that can be added - ) + reverberate_opts = generate_reverberation_opts( + room_dict, # the room dictionary, please refer to make_room_dict() for the format + pointsource_noise_list, # the point source noise list + iso_noise_dict, # the isotropic noise dictionary + foreground_snrs, # the SNR for adding the foreground noises + background_snrs, # the SNR for adding the background noises + speech_rvb_probability, # Probability of reverberating a speech signal + isotropic_noise_addition_probability, # Probability of adding isotropic noises + pointsource_noise_addition_probability, # Probability of adding point-source noises + speech_dur, # duration of the recording + max_noises_recording, # Maximum number of point-source noises that can be added + ) # prefix using index 0 is reserved for original data e.g. 
rvb0_swb0035 corresponds to the swb0035 recording in original data if reverberate_opts == "" or i == 0: wav_corrupted_pipe = "{0}".format(wav_original_pipe) else: - wav_corrupted_pipe = "{0} wav-reverberate --shift-output={1} {2} - - |".format(wav_original_pipe, shift_output, reverberate_opts) + wav_corrupted_pipe = ( + "{0} wav-reverberate --shift-output={1} {2} - - |".format( + wav_original_pipe, shift_output, reverberate_opts + ) + ) new_recording_id = get_new_id(recording_id, prefix, i) corrupted_wav_scp[new_recording_id] = wav_corrupted_pipe @@ -376,19 +520,20 @@ def generate_reverberated_wav_scp(wav_scp, # a dictionary whose values are the write_dict_to_file(corrupted_wav_scp, output_dir + "/wav.scp") -def add_prefix_to_fields(input_file, output_file, num_replicas, include_original, prefix, field = [0]): - """ This function replicate the entries in files like segments, utt2spk, text - """ - list = [x.strip() for x in open(input_file, encoding='utf-8')] - f = open(output_file, "w", encoding='utf-8') +def add_prefix_to_fields( + input_file, output_file, num_replicas, include_original, prefix, field=[0] +): + """This function replicate the entries in files like segments, utt2spk, text""" + list = [x.strip() for x in open(input_file, encoding="utf-8")] + f = open(output_file, "w", encoding="utf-8") if include_original: start_index = 0 else: start_index = 1 - for i in range(start_index, num_replicas+1): + for i in range(start_index, num_replicas + 1): for line in list: - if len(line) > 0 and line[0] != ';': + if len(line) > 0 and line[0] != ";": split1 = line.split() for j in field: split1[j] = get_new_id(split1[j], prefix, i) @@ -398,109 +543,181 @@ def add_prefix_to_fields(input_file, output_file, num_replicas, include_original f.close() -def create_reverberated_copy(input_dir, - output_dir, - room_dict, # the room dictionary, please refer to make_room_dict() for the format - pointsource_noise_list, # the point source noise list - iso_noise_dict, # the isotropic noise dictionary - foreground_snr_string, # the SNR for adding the foreground noises - background_snr_string, # the SNR for adding the background noises - num_replicas, # Number of replicate to generated for the data - include_original, # include a copy of the original data - prefix, # prefix for the id of the corrupted utterances - speech_rvb_probability, # Probability of reverberating a speech signal - shift_output, # option whether to shift the output waveform - isotropic_noise_addition_probability, # Probability of adding isotropic noises - pointsource_noise_addition_probability, # Probability of adding point-source noises - max_noises_per_minute # maximum number of point-source noises that can be added to a recording according to its duration - ): - """ This function creates multiple copies of the necessary files, - e.g. utt2spk, wav.scp ... 
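Before the reformatted create_reverberated_copy below, it is worth seeing what generate_reverberated_wav_scp above actually emits: each corrupted wav.scp entry is a shell pipeline ending in Kaldi's wav-reverberate binary. A small sketch of that string composition follows; the input pipe, options, and recording ids are illustrative values only, not fixed by the script.

```python
def corrupted_pipe(wav_original_pipe: str, shift_output: str, reverberate_opts: str) -> str:
    # copy index 0 (original data) or an empty option set keeps the entry as-is
    if not reverberate_opts:
        return wav_original_pipe
    return "{0} wav-reverberate --shift-output={1} {2} - - |".format(
        wav_original_pipe, shift_output, reverberate_opts
    )


entry = corrupted_pipe(
    "sox data/swb0035.sph -t wav - |",  # hypothetical input pipe
    "true",
    "--impulse-response=\"rir.wav\" --snrs='20' --start-times='0'",
)
# key renamed as in get_new_id("swb0035", prefix="rvb", copy=1)
print("rvb1-swb0035", entry)
```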
+def create_reverberated_copy( + input_dir, + output_dir, + room_dict, # the room dictionary, please refer to make_room_dict() for the format + pointsource_noise_list, # the point source noise list + iso_noise_dict, # the isotropic noise dictionary + foreground_snr_string, # the SNR for adding the foreground noises + background_snr_string, # the SNR for adding the background noises + num_replicas, # Number of replicate to generated for the data + include_original, # include a copy of the original data + prefix, # prefix for the id of the corrupted utterances + speech_rvb_probability, # Probability of reverberating a speech signal + shift_output, # option whether to shift the output waveform + isotropic_noise_addition_probability, # Probability of adding isotropic noises + pointsource_noise_addition_probability, # Probability of adding point-source noises + max_noises_per_minute, # maximum number of point-source noises that can be added to a recording according to its duration +): + """This function creates multiple copies of the necessary files, + e.g. utt2spk, wav.scp ... """ if not os.path.exists(output_dir): os.makedirs(output_dir) - wav_scp = parse_file_to_dict(input_dir + "/wav.scp", value_processor = lambda x: " ".join(x)) + wav_scp = parse_file_to_dict( + input_dir + "/wav.scp", value_processor=lambda x: " ".join(x) + ) if not os.path.isfile(input_dir + "/reco2dur"): - print("Getting the duration of the recordings..."); + print("Getting the duration of the recordings...") data_lib.RunKaldiCommand("utils/data/get_reco2dur.sh {}".format(input_dir)) - durations = parse_file_to_dict(input_dir + "/reco2dur", value_processor = lambda x: float(x[0])) - foreground_snr_array = [float(x) for x in foreground_snr_string.split(':')] - background_snr_array = [float(x) for x in background_snr_string.split(':')] - - generate_reverberated_wav_scp(wav_scp, durations, output_dir, room_dict, pointsource_noise_list, iso_noise_dict, - foreground_snr_array, background_snr_array, num_replicas, include_original, prefix, - speech_rvb_probability, shift_output, isotropic_noise_addition_probability, - pointsource_noise_addition_probability, max_noises_per_minute) - - add_prefix_to_fields(input_dir + "/utt2spk", output_dir + "/utt2spk", num_replicas, include_original, prefix, field = [0,1]) - data_lib.RunKaldiCommand("utils/utt2spk_to_spk2utt.pl <{output_dir}/utt2spk >{output_dir}/spk2utt" - .format(output_dir = output_dir)) + durations = parse_file_to_dict( + input_dir + "/reco2dur", value_processor=lambda x: float(x[0]) + ) + foreground_snr_array = [float(x) for x in foreground_snr_string.split(":")] + background_snr_array = [float(x) for x in background_snr_string.split(":")] + + generate_reverberated_wav_scp( + wav_scp, + durations, + output_dir, + room_dict, + pointsource_noise_list, + iso_noise_dict, + foreground_snr_array, + background_snr_array, + num_replicas, + include_original, + prefix, + speech_rvb_probability, + shift_output, + isotropic_noise_addition_probability, + pointsource_noise_addition_probability, + max_noises_per_minute, + ) + + add_prefix_to_fields( + input_dir + "/utt2spk", + output_dir + "/utt2spk", + num_replicas, + include_original, + prefix, + field=[0, 1], + ) + data_lib.RunKaldiCommand( + "utils/utt2spk_to_spk2utt.pl <{output_dir}/utt2spk >{output_dir}/spk2utt".format( + output_dir=output_dir + ) + ) if os.path.isfile(input_dir + "/utt2uniq"): - add_prefix_to_fields(input_dir + "/utt2uniq", output_dir + "/utt2uniq", num_replicas, include_original, prefix, field =[0]) + 
add_prefix_to_fields( + input_dir + "/utt2uniq", + output_dir + "/utt2uniq", + num_replicas, + include_original, + prefix, + field=[0], + ) else: # Create the utt2uniq file - create_corrupted_utt2uniq(input_dir, output_dir, num_replicas, include_original, prefix) + create_corrupted_utt2uniq( + input_dir, output_dir, num_replicas, include_original, prefix + ) if os.path.isfile(input_dir + "/text"): - add_prefix_to_fields(input_dir + "/text", output_dir + "/text", num_replicas, include_original, prefix, field =[0]) + add_prefix_to_fields( + input_dir + "/text", + output_dir + "/text", + num_replicas, + include_original, + prefix, + field=[0], + ) if os.path.isfile(input_dir + "/segments"): - add_prefix_to_fields(input_dir + "/segments", output_dir + "/segments", num_replicas, include_original, prefix, field = [0,1]) + add_prefix_to_fields( + input_dir + "/segments", + output_dir + "/segments", + num_replicas, + include_original, + prefix, + field=[0, 1], + ) if os.path.isfile(input_dir + "/reco2file_and_channel"): - add_prefix_to_fields(input_dir + "/reco2file_and_channel", output_dir + "/reco2file_and_channel", num_replicas, include_original, prefix, field = [0,1]) + add_prefix_to_fields( + input_dir + "/reco2file_and_channel", + output_dir + "/reco2file_and_channel", + num_replicas, + include_original, + prefix, + field=[0, 1], + ) if os.path.isfile(input_dir + "/vad.scp"): - add_prefix_to_fields(input_dir + "/vad.scp", output_dir + "/vad.scp", num_replicas, include_original, prefix, field=[0]) - - data_lib.RunKaldiCommand("utils/validate_data_dir.sh --no-feats --no-text {output_dir}" - .format(output_dir = output_dir)) + add_prefix_to_fields( + input_dir + "/vad.scp", + output_dir + "/vad.scp", + num_replicas, + include_original, + prefix, + field=[0], + ) + + data_lib.RunKaldiCommand( + "utils/validate_data_dir.sh --no-feats --no-text {output_dir}".format( + output_dir=output_dir + ) + ) def smooth_probability_distribution(set_list, smoothing_weight=0.0, target_sum=1.0): - """ This function smooths the probability distribution in the list - """ + """This function smooths the probability distribution in the list""" if len(list(set_list)) > 0: - num_unspecified = 0 - accumulated_prob = 0 - for item in set_list: - if item.probability is None: - num_unspecified += 1 - else: - accumulated_prob += item.probability - - # Compute the probability for the items without specifying their probability - uniform_probability = 0 - if num_unspecified > 0 and accumulated_prob < 1: - uniform_probability = (1 - accumulated_prob) / float(num_unspecified) - elif num_unspecified > 0 and accumulated_prob >= 1: - warnings.warn("The sum of probabilities specified by user is larger than or equal to 1. 
" - "The items without probabilities specified will be given zero to their probabilities.") - - for item in set_list: - if item.probability is None: - item.probability = uniform_probability - else: - # smooth the probability - item.probability = (1 - smoothing_weight) * item.probability + smoothing_weight * uniform_probability - - # Normalize the probability - sum_p = sum(item.probability for item in set_list) - for item in set_list: - item.probability = item.probability / sum_p * target_sum + num_unspecified = 0 + accumulated_prob = 0 + for item in set_list: + if item.probability is None: + num_unspecified += 1 + else: + accumulated_prob += item.probability + + # Compute the probability for the items without specifying their probability + uniform_probability = 0 + if num_unspecified > 0 and accumulated_prob < 1: + uniform_probability = (1 - accumulated_prob) / float(num_unspecified) + elif num_unspecified > 0 and accumulated_prob >= 1: + warnings.warn( + "The sum of probabilities specified by user is larger than or equal to 1. " + "The items without probabilities specified will be given zero to their probabilities." + ) + + for item in set_list: + if item.probability is None: + item.probability = uniform_probability + else: + # smooth the probability + item.probability = ( + 1 - smoothing_weight + ) * item.probability + smoothing_weight * uniform_probability + + # Normalize the probability + sum_p = sum(item.probability for item in set_list) + for item in set_list: + item.probability = item.probability / sum_p * target_sum return set_list def parse_set_parameter_strings(set_para_array): - """ This function parse the array of rir set parameter strings. - It will assign probabilities to those rir sets which don't have a probability - It will also check the existence of the rir list files. + """This function parse the array of rir set parameter strings. + It will assign probabilities to those rir sets which don't have a probability + It will also check the existence of the rir list files. 
""" set_list = [] for set_para in set_para_array: set = lambda: None setattr(set, "filename", None) setattr(set, "probability", None) - parts = set_para.split(',') + parts = set_para.split(",") if len(parts) == 2: set.probability = float(parts[0]) set.filename = parts[1].strip() @@ -513,53 +730,99 @@ def parse_set_parameter_strings(set_para_array): return smooth_probability_distribution(set_list) -def parse_rir_list(rir_set_para_array, smoothing_weight, sampling_rate = None): - """ This function creates the RIR list - Each rir object in the list contains the following attributes: - rir_id, room_id, receiver_position_id, source_position_id, rt60, drr, probability - Please refer to the help messages in the parser for the meaning of these attributes +def parse_rir_list(rir_set_para_array, smoothing_weight, sampling_rate=None): + """This function creates the RIR list + Each rir object in the list contains the following attributes: + rir_id, room_id, receiver_position_id, source_position_id, rt60, drr, probability + Please refer to the help messages in the parser for the meaning of these attributes """ rir_parser = argparse.ArgumentParser() - rir_parser.add_argument('--rir-id', type=str, required=True, help='This id is unique for each RIR and the noise may associate with a particular RIR by refering to this id') - rir_parser.add_argument('--room-id', type=str, required=True, help='This is the room that where the RIR is generated') - rir_parser.add_argument('--receiver-position-id', type=str, default=None, help='receiver position id') - rir_parser.add_argument('--source-position-id', type=str, default=None, help='source position id') - rir_parser.add_argument('--rt60', type=float, default=None, help='RT60 is the time required for reflections of a direct sound to decay 60 dB.') - rir_parser.add_argument('--drr', type=float, default=None, help='Direct-to-reverberant-ratio of the impulse response.') - rir_parser.add_argument('--cte', type=float, default=None, help='Early-to-late index of the impulse response.') - rir_parser.add_argument('--probability', type=float, default=None, help='probability of the impulse response.') - rir_parser.add_argument('rir_rspecifier', type=str, help="""rir rspecifier, it can be either a filename or a piped command. - E.g. 
data/impulses/Room001-00001.wav or "sox data/impulses/Room001-00001.wav -t wav - |" """) + rir_parser.add_argument( + "--rir-id", + type=str, + required=True, + help="This id is unique for each RIR and the noise may associate with a particular RIR by refering to this id", + ) + rir_parser.add_argument( + "--room-id", + type=str, + required=True, + help="This is the room that where the RIR is generated", + ) + rir_parser.add_argument( + "--receiver-position-id", type=str, default=None, help="receiver position id" + ) + rir_parser.add_argument( + "--source-position-id", type=str, default=None, help="source position id" + ) + rir_parser.add_argument( + "--rt60", + type=float, + default=None, + help="RT60 is the time required for reflections of a direct sound to decay 60 dB.", + ) + rir_parser.add_argument( + "--drr", + type=float, + default=None, + help="Direct-to-reverberant-ratio of the impulse response.", + ) + rir_parser.add_argument( + "--cte", + type=float, + default=None, + help="Early-to-late index of the impulse response.", + ) + rir_parser.add_argument( + "--probability", + type=float, + default=None, + help="probability of the impulse response.", + ) + rir_parser.add_argument( + "rir_rspecifier", + type=str, + help="""rir rspecifier, it can be either a filename or a piped command. + E.g. data/impulses/Room001-00001.wav or "sox data/impulses/Room001-00001.wav -t wav - |" """, + ) set_list = parse_set_parameter_strings(rir_set_para_array) rir_list = [] for rir_set in set_list: - current_rir_list = [rir_parser.parse_args(shlex.split(x.strip())) for x in open(rir_set.filename)] + current_rir_list = [ + rir_parser.parse_args(shlex.split(x.strip())) + for x in open(rir_set.filename) + ] for rir in current_rir_list: if sampling_rate is not None: # check if the rspecifier is a pipe or not if len(rir.rir_rspecifier.split()) == 1: - rir.rir_rspecifier = "sox {0} -r {1} -t wav - |".format(rir.rir_rspecifier, sampling_rate) + rir.rir_rspecifier = "sox {0} -r {1} -t wav - |".format( + rir.rir_rspecifier, sampling_rate + ) else: - rir.rir_rspecifier = "{0} sox -t wav - -r {1} -t wav - |".format(rir.rir_rspecifier, sampling_rate) + rir.rir_rspecifier = "{0} sox -t wav - -r {1} -t wav - |".format( + rir.rir_rspecifier, sampling_rate + ) - rir_list += smooth_probability_distribution(current_rir_list, smoothing_weight, rir_set.probability) + rir_list += smooth_probability_distribution( + current_rir_list, smoothing_weight, rir_set.probability + ) return rir_list -def almost_equal(value_1, value_2, accuracy = 10**-8): - """ This function checks if the inputs are approximately equal assuming they are floats. - """ +def almost_equal(value_1, value_2, accuracy=10**-8): + """This function checks if the inputs are approximately equal assuming they are floats.""" return abs(value_1 - value_2) < accuracy def make_room_dict(rir_list): - """ This function converts a list of RIRs into a dictionary of RIRs indexed by the room-id. - Its values are objects with two attributes: a local RIR list - and the probability of the corresponding room - Please look at the comments at parse_rir_list() for the attributes that a RIR object contains + """This function converts a list of RIRs into a dictionary of RIRs indexed by the room-id. 
+    Its values are objects with two attributes: a local RIR list
+    and the probability of the corresponding room
+    Please look at the comments at parse_rir_list() for the attributes that a RIR object contains
     """
     room_dict = {}
     for rir in rir_list:
@@ -572,50 +835,89 @@ def make_room_dict(rir_list):

     # the probability of the room is the sum of probabilities of its RIR
     for key in room_dict.keys():
-        room_dict[key].probability = sum(rir.probability for rir in room_dict[key].rir_list)
+        room_dict[key].probability = sum(
+            rir.probability for rir in room_dict[key].rir_list
+        )

-    assert almost_equal(sum(room_dict[key].probability for key in room_dict.keys()), 1.0)
+    assert almost_equal(
+        sum(room_dict[key].probability for key in room_dict.keys()), 1.0
+    )

     return room_dict


-def parse_noise_list(noise_set_para_array, smoothing_weight, sampling_rate = None):
-    """ This function creates the point-source noise list
-    and the isotropic noise dictionary from the noise information file
-    The isotropic noise dictionary is indexed by the room
-    and its value is the corrresponding isotropic noise list
-    Each noise object in the list contains the following attributes:
-    noise_id, noise_type, bg_fg_type, room_linkage, probability, noise_rspecifier
-    Please refer to the help messages in the parser for the meaning of these attributes
+
+def parse_noise_list(noise_set_para_array, smoothing_weight, sampling_rate=None):
+    """This function creates the point-source noise list
+    and the isotropic noise dictionary from the noise information file
+    The isotropic noise dictionary is indexed by the room
+    and its value is the corresponding isotropic noise list
+    Each noise object in the list contains the following attributes:
+    noise_id, noise_type, bg_fg_type, room_linkage, probability, noise_rspecifier
+    Please refer to the help messages in the parser for the meaning of these attributes
     """
     noise_parser = argparse.ArgumentParser()
-    noise_parser.add_argument('--noise-id', type=str, required=True, help='noise id')
-    noise_parser.add_argument('--noise-type', type=str, required=True, help='the type of noise; i.e. isotropic or point-source', choices = ["isotropic", "point-source"])
-    noise_parser.add_argument('--bg-fg-type', type=str, default="background", help='background or foreground noise, for background noises, '
-                              'they will be extended before addition to cover the whole speech; for foreground noise, they will be kept '
-                              'to their original duration and added at a random point of the speech.', choices = ["background", "foreground"])
-    noise_parser.add_argument('--room-linkage', type=str, default=None, help='required if isotropic, should not be specified if point-source.')
-    noise_parser.add_argument('--probability', type=float, default=None, help='probability of the noise.')
-    noise_parser.add_argument('noise_rspecifier', type=str, help="""noise rspecifier, it can be either a filename or a piped command.
-                              E.g. type5_noise_cirline_ofc_ambient1.wav or "sox type5_noise_cirline_ofc_ambient1.wav -t wav - |" """)
+    noise_parser.add_argument("--noise-id", type=str, required=True, help="noise id")
+    noise_parser.add_argument(
+        "--noise-type",
+        type=str,
+        required=True,
+        help="the type of noise; i.e. 
isotropic or point-source", + choices=["isotropic", "point-source"], + ) + noise_parser.add_argument( + "--bg-fg-type", + type=str, + default="background", + help="background or foreground noise, for background noises, " + "they will be extended before addition to cover the whole speech; for foreground noise, they will be kept " + "to their original duration and added at a random point of the speech.", + choices=["background", "foreground"], + ) + noise_parser.add_argument( + "--room-linkage", + type=str, + default=None, + help="required if isotropic, should not be specified if point-source.", + ) + noise_parser.add_argument( + "--probability", type=float, default=None, help="probability of the noise." + ) + noise_parser.add_argument( + "noise_rspecifier", + type=str, + help="""noise rspecifier, it can be either a filename or a piped command. + E.g. type5_noise_cirline_ofc_ambient1.wav or "sox type5_noise_cirline_ofc_ambient1.wav -t wav - |" """, + ) set_list = parse_set_parameter_strings(noise_set_para_array) pointsource_noise_list = [] iso_noise_dict = {} for noise_set in set_list: - current_noise_list = [noise_parser.parse_args(shlex.split(x.strip())) for x in open(noise_set.filename)] + current_noise_list = [ + noise_parser.parse_args(shlex.split(x.strip())) + for x in open(noise_set.filename) + ] current_pointsource_noise_list = [] for noise in current_noise_list: if sampling_rate is not None: # check if the rspecifier is a pipe or not if len(noise.noise_rspecifier.split()) == 1: - noise.noise_rspecifier = "sox {0} -r {1} -t wav - |".format(noise.noise_rspecifier, sampling_rate) + noise.noise_rspecifier = "sox {0} -r {1} -t wav - |".format( + noise.noise_rspecifier, sampling_rate + ) else: - noise.noise_rspecifier = "{0} sox -t wav - -r {1} -t wav - |".format(noise.noise_rspecifier, sampling_rate) + noise.noise_rspecifier = ( + "{0} sox -t wav - -r {1} -t wav - |".format( + noise.noise_rspecifier, sampling_rate + ) + ) if noise.noise_type == "isotropic": if noise.room_linkage is None: - raise Exception("--room-linkage must be specified if --noise-type is isotropic") + raise Exception( + "--room-linkage must be specified if --noise-type is isotropic" + ) else: if noise.room_linkage not in iso_noise_dict: iso_noise_dict[noise.room_linkage] = [] @@ -623,17 +925,25 @@ def parse_noise_list(noise_set_para_array, smoothing_weight, sampling_rate = Non else: current_pointsource_noise_list.append(noise) - pointsource_noise_list += smooth_probability_distribution(current_pointsource_noise_list, smoothing_weight, noise_set.probability) + pointsource_noise_list += smooth_probability_distribution( + current_pointsource_noise_list, smoothing_weight, noise_set.probability + ) # ensure the point-source noise probabilities sum to 1 - pointsource_noise_list = smooth_probability_distribution(pointsource_noise_list, smoothing_weight, 1.0) + pointsource_noise_list = smooth_probability_distribution( + pointsource_noise_list, smoothing_weight, 1.0 + ) if len(pointsource_noise_list) > 0: - assert almost_equal(sum(noise.probability for noise in pointsource_noise_list), 1.0) + assert almost_equal( + sum(noise.probability for noise in pointsource_noise_list), 1.0 + ) # ensure the isotropic noise source probabilities for a given room sum to 1 for key in iso_noise_dict.keys(): iso_noise_dict[key] = smooth_probability_distribution(iso_noise_dict[key]) - assert almost_equal(sum(noise.probability for noise in iso_noise_dict[key]), 1.0) + assert almost_equal( + sum(noise.probability for noise in iso_noise_dict[key]), 
1.0 + ) return (pointsource_noise_list, iso_noise_dict) @@ -642,41 +952,56 @@ def main(): args = get_args() random.seed(args.random_seed) - rir_list = parse_rir_list(args.rir_set_para_array, args.rir_smoothing_weight, args.source_sampling_rate) + rir_list = parse_rir_list( + args.rir_set_para_array, args.rir_smoothing_weight, args.source_sampling_rate + ) print("Number of RIRs is {0}".format(len(rir_list))) pointsource_noise_list = [] iso_noise_dict = {} if args.noise_set_para_array is not None: - pointsource_noise_list, iso_noise_dict = parse_noise_list(args.noise_set_para_array, - args.noise_smoothing_weight, - args.source_sampling_rate) - print("Number of point-source noises is {0}".format(len(pointsource_noise_list))) - print("Number of isotropic noises is {0}".format(sum(len(iso_noise_dict[key]) for key in iso_noise_dict.keys()))) + pointsource_noise_list, iso_noise_dict = parse_noise_list( + args.noise_set_para_array, + args.noise_smoothing_weight, + args.source_sampling_rate, + ) + print( + "Number of point-source noises is {0}".format(len(pointsource_noise_list)) + ) + print( + "Number of isotropic noises is {0}".format( + sum(len(iso_noise_dict[key]) for key in iso_noise_dict.keys()) + ) + ) room_dict = make_room_dict(rir_list) if args.include_original_data == "true": include_original = True else: include_original = False - create_reverberated_copy(input_dir = args.input_dir, - output_dir = args.output_dir, - room_dict = room_dict, - pointsource_noise_list = pointsource_noise_list, - iso_noise_dict = iso_noise_dict, - foreground_snr_string = args.foreground_snr_string, - background_snr_string = args.background_snr_string, - num_replicas = args.num_replicas, - include_original = include_original, - prefix = args.prefix, - speech_rvb_probability = args.speech_rvb_probability, - shift_output = args.shift_output, - isotropic_noise_addition_probability = args.isotropic_noise_addition_probability, - pointsource_noise_addition_probability = args.pointsource_noise_addition_probability, - max_noises_per_minute = args.max_noises_per_minute) - - - data_lib.RunKaldiCommand("utils/validate_data_dir.sh --no-feats --no-text {output_dir}" - .format(output_dir = args.output_dir)) + create_reverberated_copy( + input_dir=args.input_dir, + output_dir=args.output_dir, + room_dict=room_dict, + pointsource_noise_list=pointsource_noise_list, + iso_noise_dict=iso_noise_dict, + foreground_snr_string=args.foreground_snr_string, + background_snr_string=args.background_snr_string, + num_replicas=args.num_replicas, + include_original=include_original, + prefix=args.prefix, + speech_rvb_probability=args.speech_rvb_probability, + shift_output=args.shift_output, + isotropic_noise_addition_probability=args.isotropic_noise_addition_probability, + pointsource_noise_addition_probability=args.pointsource_noise_addition_probability, + max_noises_per_minute=args.max_noises_per_minute, + ) + + data_lib.RunKaldiCommand( + "utils/validate_data_dir.sh --no-feats --no-text {output_dir}".format( + output_dir=args.output_dir + ) + ) + if __name__ == "__main__": main() diff --git a/egs2/TEMPLATE/asr1/steps/libs/__init__.py b/egs2/TEMPLATE/asr1/steps/libs/__init__.py index b78141be659..bbe4c258eb6 100644 --- a/egs2/TEMPLATE/asr1/steps/libs/__init__.py +++ b/egs2/TEMPLATE/asr1/steps/libs/__init__.py @@ -1,5 +1,3 @@ - - # Copyright 2016 Vimal Manohar # Apache 2.0. 
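For intuition, the smoothing rule applied by smooth_probability_distribution() above can be sketched on plain floats. The snippet below is illustrative only; the names and numbers are made up and it is not taken from the patch:

```python
# Unspecified entries (None) evenly share the leftover probability mass,
# specified entries are interpolated toward that uniform value, and the
# result is renormalized so it sums to target_sum.
def smooth(probs, smoothing_weight=0.3, target_sum=1.0):
    unspecified = sum(1 for p in probs if p is None)
    accumulated = sum(p for p in probs if p is not None)
    uniform = (
        (1.0 - accumulated) / unspecified
        if unspecified and accumulated < 1
        else 0.0
    )
    smoothed = [
        uniform if p is None else (1 - smoothing_weight) * p + smoothing_weight * uniform
        for p in probs
    ]
    total = sum(smoothed)
    return [p / total * target_sum for p in smoothed]

print(smooth([0.5, None, None]))  # -> approximately [0.459, 0.270, 0.270]
```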
diff --git a/egs2/TEMPLATE/asr1/steps/libs/common.py b/egs2/TEMPLATE/asr1/steps/libs/common.py index a5e294575bf..cf312c7a3b1 100644 --- a/egs2/TEMPLATE/asr1/steps/libs/common.py +++ b/egs2/TEMPLATE/asr1/steps/libs/common.py @@ -1,5 +1,3 @@ - - # Copyright 2016 Vijayaditya Peddinti. # 2016 Vimal Manohar # 2017 Johns Hopkins University (author: Daniel Povey) @@ -32,12 +30,12 @@ def send_mail(message, subject, email_id): try: subprocess.Popen( 'echo "{message}" | mail -s "{subject}" {email}'.format( - message=message, - subject=subject, - email=email_id), shell=True) + message=message, subject=subject, email=email_id + ), + shell=True, + ) except Exception as e: - logger.info("Unable to send mail due to error:\n {error}".format( - error=str(e))) + logger.info("Unable to send mail due to error:\n {error}".format(error=str(e))) pass @@ -51,21 +49,20 @@ def str_to_bool(value): class StrToBoolAction(argparse.Action): - """ A custom action to convert bools from shell format i.e., true/false - to python format i.e., True/False """ + """A custom action to convert bools from shell format i.e., true/false + to python format i.e., True/False""" def __call__(self, parser, namespace, values, option_string=None): try: setattr(namespace, self.dest, str_to_bool(values)) except ValueError: - raise Exception( - "Unknown value {0} for --{1}".format(values, self.dest)) + raise Exception("Unknown value {0} for --{1}".format(values, self.dest)) class NullstrToNoneAction(argparse.Action): - """ A custom action to convert empty strings passed by shell to None in + """A custom action to convert empty strings passed by shell to None in python. This is necessary as shell scripts print null strings when a - variable is not specified. We could use the more apt None in python. """ + variable is not specified. We could use the more apt None in python.""" def __call__(self, parser, namespace, values, option_string=None): if values.strip() == "": @@ -84,6 +81,7 @@ class smart_open(object): e.g.: with smart_open(filename, 'w') as fh: print ("foo", file=fh) """ + def __init__(self, filename, mode="r"): self.filename = filename self.mode = mode @@ -113,40 +111,39 @@ def check_if_cuda_compiled(): def execute_command(command): - """ Runs a kaldi job in the foreground and waits for it to complete; raises an - exception if its return status is nonzero. The command is executed in - 'shell' mode so 'command' can involve things like pipes. Often, - 'command' will start with 'run.pl' or 'queue.pl'. The stdout and stderr - are merged with the calling process's stdout and stderr so they will - appear on the screen. - - See also: get_command_stdout, background_command + """Runs a kaldi job in the foreground and waits for it to complete; raises an + exception if its return status is nonzero. The command is executed in + 'shell' mode so 'command' can involve things like pipes. Often, + 'command' will start with 'run.pl' or 'queue.pl'. The stdout and stderr + are merged with the calling process's stdout and stderr so they will + appear on the screen. + + See also: get_command_stdout, background_command """ p = subprocess.Popen(command, shell=True) p.communicate() if p.returncode is not 0: - raise Exception("Command exited with status {0}: {1}".format( - p.returncode, command)) + raise Exception( + "Command exited with status {0}: {1}".format(p.returncode, command) + ) -def get_command_stdout(command, require_zero_status = True): - """ Executes a command and returns its stdout output as a string. 
The
-    command is executed with shell=True, so it may contain pipes and
-    other shell constructs.
+def get_command_stdout(command, require_zero_status=True):
+    """Executes a command and returns its stdout output as a string.  The
+    command is executed with shell=True, so it may contain pipes and
+    other shell constructs.

-    If require_zero_stats is True, this function will raise an exception if
-    the command has nonzero exit status.  If False, it just prints a warning
-    if the exit status is nonzero.
+    If require_zero_status is True, this function will raise an exception if
+    the command has nonzero exit status.  If False, it just prints a warning
+    if the exit status is nonzero.

-    See also: execute_command, background_command
+    See also: execute_command, background_command
     """
-    p = subprocess.Popen(command, shell=True,
-                         stdout=subprocess.PIPE)
+    p = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
     stdout = p.communicate()[0]
     if p.returncode is not 0:
-        output = "Command exited with status {0}: {1}".format(
-            p.returncode, command)
+        output = "Command exited with status {0}: {1}".format(p.returncode, command)
         if require_zero_status:
             raise Exception(output)
         else:
@@ -154,55 +151,56 @@ def get_command_stdout(command, require_zero_status = True):
     return stdout if type(stdout) is str else stdout.decode()

-
-
 def wait_for_background_commands():
-    """ This waits for all threads to exit.  You will often want to
-    run this at the end of programs that have launched background
-    threads, so that the program will wait for its child processes
-    to terminate before it dies."""
+    """This waits for all threads to exit.  You will often want to
+    run this at the end of programs that have launched background
+    threads, so that the program will wait for its child processes
+    to terminate before it dies."""
     for t in threading.enumerate():
         if not t == threading.current_thread():
             t.join()

-def background_command(command, require_zero_status = False):
+
+def background_command(command, require_zero_status=False):
     """Executes a command in a separate thread, like running with '&' in the shell.
-    If you want the program to die if the command eventually returns with
-    nonzero status, then set require_zero_status to True.  'command' will be
-    executed in 'shell' mode, so it's OK for it to contain pipes and other
-    shell constructs.
-
-    This function returns the Thread object created, just in case you want
-    to wait for that specific command to finish.  For example, you could do:
-          thread = background_command('foo | bar')
-          # do something else while waiting for it to finish
-          thread.join()
-
-    See also:
-      - wait_for_background_commands(), which can be used
-        at the end of the program to wait for all these commands to terminate.
-      - execute_command() and get_command_stdout(), which allow you to
-        execute commands in the foreground.
+    If you want the program to die if the command eventually returns with
+    nonzero status, then set require_zero_status to True.  'command' will be
+    executed in 'shell' mode, so it's OK for it to contain pipes and other
+    shell constructs.
+
+    This function returns the Thread object created, just in case you want
+    to wait for that specific command to finish.  For example, you could do:
+          thread = background_command('foo | bar')
+          # do something else while waiting for it to finish
+          thread.join()
+
+    See also:
+      - wait_for_background_commands(), which can be used
+        at the end of the program to wait for all these commands to terminate.
+ - execute_command() and get_command_stdout(), which allow you to + execute commands in the foreground. """ p = subprocess.Popen(command, shell=True) - thread = threading.Thread(target=background_command_waiter, - args=(command, p, require_zero_status)) - thread.daemon=True # make sure it exits if main thread is terminated - # abnormally. + thread = threading.Thread( + target=background_command_waiter, args=(command, p, require_zero_status) + ) + thread.daemon = True # make sure it exits if main thread is terminated + # abnormally. thread.start() return thread def background_command_waiter(command, popen_object, require_zero_status): - """ This is the function that is called from background_command, in - a separate thread.""" + """This is the function that is called from background_command, in + a separate thread.""" popen_object.communicate() if popen_object.returncode is not 0: str = "Command exited with status {0}: {1}".format( - popen_object.returncode, command) + popen_object.returncode, command + ) if require_zero_status: logger.error(str) # thread.interrupt_main() sends a KeyboardInterrupt to the main @@ -214,9 +212,10 @@ def background_command_waiter(command, popen_object, require_zero_status): def get_number_of_leaves_from_tree(alidir): stdout = get_command_stdout( - "tree-info {0}/tree 2>/dev/null | grep num-pdfs".format(alidir)) + "tree-info {0}/tree 2>/dev/null | grep num-pdfs".format(alidir) + ) parts = stdout.split() - assert(parts[0] == "num-pdfs") + assert parts[0] == "num-pdfs" num_leaves = int(parts[1]) if num_leaves == 0: raise Exception("Number of leaves is 0") @@ -225,10 +224,11 @@ def get_number_of_leaves_from_tree(alidir): def get_number_of_leaves_from_model(dir): stdout = get_command_stdout( - "am-info {0}/final.mdl 2>/dev/null | grep -w pdfs".format(dir)) + "am-info {0}/final.mdl 2>/dev/null | grep -w pdfs".format(dir) + ) parts = stdout.split() # number of pdfs 7115 - assert(' '.join(parts[0:3]) == "number of pdfs") + assert " ".join(parts[0:3]) == "number of pdfs" num_leaves = int(parts[3]) if num_leaves == 0: raise Exception("Number of leaves is 0") @@ -237,10 +237,11 @@ def get_number_of_leaves_from_model(dir): def get_number_of_jobs(alidir): try: - num_jobs = int(open('{0}/num_jobs'.format(alidir)).readline().strip()) + num_jobs = int(open("{0}/num_jobs".format(alidir)).readline().strip()) except (IOError, ValueError) as e: - logger.error("Exception while reading the " - "number of alignment jobs: ", exc_info=True) + logger.error( + "Exception while reading the " "number of alignment jobs: ", exc_info=True + ) raise SystemExit(1) return num_jobs @@ -250,35 +251,39 @@ def get_ivector_dim(ivector_dir=None): return 0 stdout_val = get_command_stdout( "feat-to-dim --print-args=false " - "scp:{dir}/ivector_online.scp -".format(dir=ivector_dir)) + "scp:{dir}/ivector_online.scp -".format(dir=ivector_dir) + ) ivector_dim = int(stdout_val) return ivector_dim + def get_ivector_extractor_id(ivector_dir=None): if ivector_dir is None: return None stdout_val = get_command_stdout( - "steps/nnet2/get_ivector_id.sh {dir}".format(dir=ivector_dir)) + "steps/nnet2/get_ivector_id.sh {dir}".format(dir=ivector_dir) + ) if (stdout_val.strip() == "") or (stdout_val is None): return None return stdout_val.strip() + def get_feat_dim(feat_dir): if feat_dir is None: return 0 stdout_val = get_command_stdout( - "feat-to-dim --print-args=false " - "scp:{data}/feats.scp -".format(data=feat_dir)) + "feat-to-dim --print-args=false " "scp:{data}/feats.scp -".format(data=feat_dir) + ) feat_dim = 
int(stdout_val) return feat_dim def get_feat_dim_from_scp(feat_scp): stdout_val = get_command_stdout( - "feat-to-dim --print-args=false " - "scp:{feat_scp} -".format(feat_scp=feat_scp)) + "feat-to-dim --print-args=false " "scp:{feat_scp} -".format(feat_scp=feat_scp) + ) feat_dim = int(stdout_val) return feat_dim @@ -296,20 +301,22 @@ def read_kaldi_matrix(matrix_file): if not (first_field == "[" and last_field == "]"): raise Exception( "Kaldi matrix file has incorrect format, " - "only text format matrix files can be read by this script") + "only text format matrix files can be read by this script" + ) for i in range(len(lines)): lines[i] = [int(float(x)) for x in lines[i]] return lines except IOError: - raise Exception("Error while reading the kaldi matrix file " - "{0}".format(matrix_file)) + raise Exception( + "Error while reading the kaldi matrix file " "{0}".format(matrix_file) + ) def write_kaldi_matrix(output_file, matrix): """This function writes the matrix stored as a list of lists into 'output_file' in kaldi matrix text format. """ - with open(output_file, 'w') as f: + with open(output_file, "w") as f: f.write("[ ") num_rows = len(matrix) if num_rows == 0: @@ -318,8 +325,9 @@ def write_kaldi_matrix(output_file, matrix): for row_index in range(len(matrix)): if num_cols != len(matrix[row_index]): - raise Exception("All the rows of a matrix are expected to " - "have the same length") + raise Exception( + "All the rows of a matrix are expected to " "have the same length" + ) f.write(" ".join([str(x) for x in matrix[row_index]])) if row_index != num_rows - 1: f.write("\n") @@ -334,32 +342,33 @@ def write_matrix_ascii(file_or_fd, mat, key=None): as the index field. """ try: - fd = open(file_or_fd, 'w') + fd = open(file_or_fd, "w") except TypeError: # 'file_or_fd' is opened file descriptor, fd = file_or_fd try: if key is not None: - print ("{0} [".format(key), - file=fd) # ark-files have keys (utterance-id) + print("{0} [".format(key), file=fd) # ark-files have keys (utterance-id) else: - print (" [", file=fd) + print(" [", file=fd) num_cols = 0 for i, row in enumerate(mat): - line = ' '.join(["{0:f}".format(x) for x in row]) + line = " ".join(["{0:f}".format(x) for x in row]) if i == 0: num_cols = len(row) elif len(row) != num_cols: - raise Exception("All the rows of a matrix are expected to " - "have the same length") + raise Exception( + "All the rows of a matrix are expected to " "have the same length" + ) if i == len(mat) - 1: line += " ]" - print (line, file=fd) + print(line, file=fd) finally: - if fd is not file_or_fd : fd.close() + if fd is not file_or_fd: + fd.close() def read_matrix_ascii(file_or_fd): @@ -368,7 +377,7 @@ def read_matrix_ascii(file_or_fd): The input can be a file or an opened file descriptor. 
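+    For illustration (made-up numbers), the expected text looks like
+        " [
+          1.0 2.0
+          3.0 4.0 ]"
+    and would be returned as [[1.0, 2.0], [3.0, 4.0]].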
""" try: - fd = open(file_or_fd, 'r') + fd = open(file_or_fd, "r") fname = file_or_fd except TypeError: # 'file_or_fd' is opened file descriptor, @@ -376,49 +385,57 @@ def read_matrix_ascii(file_or_fd): fname = file_or_fd.name first = fd.read(2) - if first != ' [' and first != b' [': + if first != " [" and first != b" [": logger.error( "Kaldi matrix file %s has incorrect format, " "only text format matrix files can be read by this script", - fname) - if fd is not file_or_fd: fd.close() + fname, + ) + if fd is not file_or_fd: + fd.close() raise RuntimeError rows = [] while True: line = fd.readline() if not line: - logger.error("Kaldi matrix file %s has incorrect format; " - "got EOF before end of matrix", fname) - if fd is not file_or_fd: fd.close() + logger.error( + "Kaldi matrix file %s has incorrect format; " + "got EOF before end of matrix", + fname, + ) + if fd is not file_or_fd: + fd.close() raise RuntimeError line = line.strip() - if len(line) == 0 : continue # skip empty line + if len(line) == 0: + continue # skip empty line arr = line.split() - if arr[-1] != b']' and arr[-1] != ']': + if arr[-1] != b"]" and arr[-1] != "]": rows.append([float(x) for x in arr]) # not last line else: rows.append([float(x) for x in arr[:-1]]) # lastline - if fd is not file_or_fd: fd.close() + if fd is not file_or_fd: + fd.close() return rows def read_key(fd): - """ [str] = read_key(fd) - Read the utterance-key from the opened ark/stream descriptor 'fd'. - """ - str_ = '' - while True: - char = fd.read(1) - if char == '': - break - if char == ' ': - break - str_ += char - str_ = str_.strip() - if str_ == '': - return None # end of file, - return str_ + """[str] = read_key(fd) + Read the utterance-key from the opened ark/stream descriptor 'fd'. + """ + str_ = "" + while True: + char = fd.read(1) + if char == "": + break + if char == " ": + break + str_ += char + str_ = str_.strip() + if str_ == "": + return None # end of file, + return str_ def read_mat_ark(file_or_fd): @@ -430,7 +447,7 @@ def read_mat_ark(file_or_fd): mat_dict = { key: mat for key, mat in read_mat_ark(file) } """ try: - fd = open(file_or_fd, 'r') + fd = open(file_or_fd, "r") fname = file_or_fd except TypeError: # 'file_or_fd' is opened file descriptor, @@ -440,9 +457,9 @@ def read_mat_ark(file_or_fd): try: key = read_key(fd) while key: - mat = read_matrix_ascii(fd) - yield key, mat - key = read_key(fd) + mat = read_matrix_ascii(fd) + yield key, mat + key = read_key(fd) finally: if fd is not file_or_fd: fd.close() @@ -450,6 +467,7 @@ def read_mat_ark(file_or_fd): def force_symlink(file1, file2): import errno + try: os.symlink(file1, file2) except OSError as e: @@ -476,8 +494,7 @@ def compute_idct_matrix(K, N, cepstral_lifter=0): normalizer = math.sqrt(2.0 / float(N)) for k in range(1, K): for n in range(0, N): - matrix[n][ - k] = normalizer * math.cos(math.pi / float(N) * (n + 0.5) * k) + matrix[n][k] = normalizer * math.cos(math.pi / float(N) * (n + 0.5) * k) if cepstral_lifter != 0: lifter_coeffs = compute_lifter_coeffs(cepstral_lifter, K) diff --git a/egs2/TEMPLATE/asr1/steps/libs/nnet3/__init__.py b/egs2/TEMPLATE/asr1/steps/libs/nnet3/__init__.py index 94b7c52002a..2025d3cfe5a 100644 --- a/egs2/TEMPLATE/asr1/steps/libs/nnet3/__init__.py +++ b/egs2/TEMPLATE/asr1/steps/libs/nnet3/__init__.py @@ -1,5 +1,3 @@ - - # Copyright 2016 Johns Hopkins University (Dan Povey) # 2016 Vimal Manohar # 2016 Vijayaditya Peddinti diff --git a/egs2/TEMPLATE/asr1/steps/libs/nnet3/report/__init__.py b/egs2/TEMPLATE/asr1/steps/libs/nnet3/report/__init__.py 
index 620c4238a22..46362261a78 100644 --- a/egs2/TEMPLATE/asr1/steps/libs/nnet3/report/__init__.py +++ b/egs2/TEMPLATE/asr1/steps/libs/nnet3/report/__init__.py @@ -1,5 +1,3 @@ - - # Copyright 2016 Vimal Manohar # Apache 2.0. diff --git a/egs2/TEMPLATE/asr1/steps/libs/nnet3/report/log_parse.py b/egs2/TEMPLATE/asr1/steps/libs/nnet3/report/log_parse.py index 97da5e04962..c9c6d76db39 100755 --- a/egs2/TEMPLATE/asr1/steps/libs/nnet3/report/log_parse.py +++ b/egs2/TEMPLATE/asr1/steps/libs/nnet3/report/log_parse.py @@ -1,5 +1,3 @@ - - # Copyright 2016 Vijayaditya Peddinti # Vimal Manohar # Apache 2.0. @@ -16,68 +14,86 @@ logger = logging.getLogger(__name__) logger.addHandler(logging.NullHandler()) -g_lstmp_nonlin_regex_pattern = ''.join([".*progress.([0-9]+).log:component name=(.+) ", - "type=(.*)Component,.*", - "i_t_sigmoid.*", - "value-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*", - "deriv-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*", - "f_t_sigmoid.*", - "value-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*", - "deriv-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*", - "c_t_tanh.*", - "value-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*", - "deriv-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*", - "o_t_sigmoid.*", - "value-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*", - "deriv-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*", - "m_t_tanh.*", - "value-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*", - "deriv-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\]"]) - -g_normal_nonlin_regex_pattern = ''.join([".*progress.([0-9]+).log:component name=(.+) ", - "type=(.*)Component,.*", - "value-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*", - "deriv-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\]"]) - -g_normal_nonlin_regex_pattern_with_oderiv = ''.join([".*progress.([0-9]+).log:component name=(.+) ", - "type=(.*)Component,.*", - "value-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*", - "deriv-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*", - "oderiv-rms=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\]"]) +g_lstmp_nonlin_regex_pattern = "".join( + [ + ".*progress.([0-9]+).log:component name=(.+) ", + "type=(.*)Component,.*", + "i_t_sigmoid.*", + "value-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*", + "deriv-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*", + "f_t_sigmoid.*", + "value-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*", + "deriv-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*", + "c_t_tanh.*", + "value-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*", + "deriv-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*", + "o_t_sigmoid.*", + "value-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*", + "deriv-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*", + "m_t_tanh.*", + "value-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*", + "deriv-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\]", + ] +) + +g_normal_nonlin_regex_pattern = "".join( + [ + ".*progress.([0-9]+).log:component name=(.+) ", + "type=(.*)Component,.*", + "value-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*", + "deriv-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\]", + ] +) + 
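+# For illustration only (hypothetical log text), the pattern above is meant to
+# match progress-log lines of the form
+#   exp/foo/log/progress.3.log:component name=relu1 type=RectifiedLinearComponent,
+#   ... value-avg=[percentiles(0,1,...)=(...), mean=0.45, stddev=0.12],
+#   deriv-avg=[percentiles(0,1,...)=(...), mean=0.33, stddev=0.08]
+# capturing the iteration, the component name and type, and the value/deriv stats.
+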
+g_normal_nonlin_regex_pattern_with_oderiv = "".join( + [ + ".*progress.([0-9]+).log:component name=(.+) ", + "type=(.*)Component,.*", + "value-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*", + "deriv-avg=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\].*", + "oderiv-rms=\[.*=\((.+)\), mean=([0-9\.\-e]+), stddev=([0-9\.e\-]+)\]", + ] +) + class KaldiLogParseException(Exception): - """ An Exception class that throws an error when there is an issue in + """An Exception class that throws an error when there is an issue in parsing the log files. Extend this class if more granularity is needed. """ - def __init__(self, message = None): + + def __init__(self, message=None): if message is not None and message.strip() == "": message = None - Exception.__init__(self, - "There was an error while trying to parse the logs." - " Details : \n{0}\n".format(message)) + Exception.__init__( + self, + "There was an error while trying to parse the logs." + " Details : \n{0}\n".format(message), + ) + # This function is used to fill stats_per_component_per_iter table with the # results of regular expression. + def fill_nonlin_stats_table_with_regex_result(groups, gate_index, stats_table): iteration = int(groups[0]) component_name = groups[1] component_type = groups[2] # for value-avg - value_percentiles = groups[3+gate_index*6] - value_mean = float(groups[4+gate_index*6]) - value_stddev = float(groups[5+gate_index*6]) - value_percentiles_split = re.split(',| ',value_percentiles) + value_percentiles = groups[3 + gate_index * 6] + value_mean = float(groups[4 + gate_index * 6]) + value_stddev = float(groups[5 + gate_index * 6]) + value_percentiles_split = re.split(",| ", value_percentiles) assert len(value_percentiles_split) == 13 value_5th = float(value_percentiles_split[4]) value_50th = float(value_percentiles_split[6]) value_95th = float(value_percentiles_split[9]) # for deriv-avg - deriv_percentiles = groups[6+gate_index*6] - deriv_mean = float(groups[7+gate_index*6]) - deriv_stddev = float(groups[8+gate_index*6]) - deriv_percentiles_split = re.split(',| ',deriv_percentiles) + deriv_percentiles = groups[6 + gate_index * 6] + deriv_mean = float(groups[7 + gate_index * 6]) + deriv_stddev = float(groups[8 + gate_index * 6]) + deriv_percentiles_split = re.split(",| ", deriv_percentiles) assert len(deriv_percentiles_split) == 13 deriv_5th = float(deriv_percentiles_split[4]) deriv_50th = float(deriv_percentiles_split[6]) @@ -85,69 +101,124 @@ def fill_nonlin_stats_table_with_regex_result(groups, gate_index, stats_table): if len(groups) <= 9: try: - if iteration in stats_table[component_name]['stats']: - stats_table[component_name]['stats'][iteration].extend( - [value_mean, value_stddev, - deriv_mean, deriv_stddev, - value_5th, value_50th, value_95th, - deriv_5th, deriv_50th, deriv_95th]) + if iteration in stats_table[component_name]["stats"]: + stats_table[component_name]["stats"][iteration].extend( + [ + value_mean, + value_stddev, + deriv_mean, + deriv_stddev, + value_5th, + value_50th, + value_95th, + deriv_5th, + deriv_50th, + deriv_95th, + ] + ) else: - stats_table[component_name]['stats'][iteration] = [ - value_mean, value_stddev, - deriv_mean, deriv_stddev, - value_5th, value_50th, value_95th, - deriv_5th, deriv_50th, deriv_95th] + stats_table[component_name]["stats"][iteration] = [ + value_mean, + value_stddev, + deriv_mean, + deriv_stddev, + value_5th, + value_50th, + value_95th, + deriv_5th, + deriv_50th, + deriv_95th, + ] except KeyError: stats_table[component_name] = {} - 
stats_table[component_name]['type'] = component_type - stats_table[component_name]['stats'] = {} - stats_table[component_name][ - 'stats'][iteration] = [value_mean, value_stddev, - deriv_mean, deriv_stddev, - value_5th, value_50th, value_95th, - deriv_5th, deriv_50th, deriv_95th] + stats_table[component_name]["type"] = component_type + stats_table[component_name]["stats"] = {} + stats_table[component_name]["stats"][iteration] = [ + value_mean, + value_stddev, + deriv_mean, + deriv_stddev, + value_5th, + value_50th, + value_95th, + deriv_5th, + deriv_50th, + deriv_95th, + ] else: - #for oderiv-rms - oderiv_percentiles = groups[9+gate_index*6] - oderiv_mean = float(groups[10+gate_index*6]) - oderiv_stddev = float(groups[11+gate_index*6]) - oderiv_percentiles_split = re.split(',| ',oderiv_percentiles) + # for oderiv-rms + oderiv_percentiles = groups[9 + gate_index * 6] + oderiv_mean = float(groups[10 + gate_index * 6]) + oderiv_stddev = float(groups[11 + gate_index * 6]) + oderiv_percentiles_split = re.split(",| ", oderiv_percentiles) assert len(oderiv_percentiles_split) == 13 oderiv_5th = float(oderiv_percentiles_split[4]) oderiv_50th = float(oderiv_percentiles_split[6]) oderiv_95th = float(oderiv_percentiles_split[9]) try: - if iteration in stats_table[component_name]['stats']: - stats_table[component_name]['stats'][iteration].extend( - [value_mean, value_stddev, - deriv_mean, deriv_stddev, - oderiv_mean, oderiv_stddev, - value_5th, value_50th, value_95th, - deriv_5th, deriv_50th, deriv_95th, - oderiv_5th, oderiv_50th, oderiv_95th]) + if iteration in stats_table[component_name]["stats"]: + stats_table[component_name]["stats"][iteration].extend( + [ + value_mean, + value_stddev, + deriv_mean, + deriv_stddev, + oderiv_mean, + oderiv_stddev, + value_5th, + value_50th, + value_95th, + deriv_5th, + deriv_50th, + deriv_95th, + oderiv_5th, + oderiv_50th, + oderiv_95th, + ] + ) else: - stats_table[component_name]['stats'][iteration] = [ - value_mean, value_stddev, - deriv_mean, deriv_stddev, - oderiv_mean, oderiv_stddev, - value_5th, value_50th, value_95th, - deriv_5th, deriv_50th, deriv_95th, - oderiv_5th, oderiv_50th, oderiv_95th] + stats_table[component_name]["stats"][iteration] = [ + value_mean, + value_stddev, + deriv_mean, + deriv_stddev, + oderiv_mean, + oderiv_stddev, + value_5th, + value_50th, + value_95th, + deriv_5th, + deriv_50th, + deriv_95th, + oderiv_5th, + oderiv_50th, + oderiv_95th, + ] except KeyError: stats_table[component_name] = {} - stats_table[component_name]['type'] = component_type - stats_table[component_name]['stats'] = {} - stats_table[component_name][ - 'stats'][iteration] = [value_mean, value_stddev, - deriv_mean, deriv_stddev, - oderiv_mean, oderiv_stddev, - value_5th, value_50th, value_95th, - deriv_5th, deriv_50th, deriv_95th, - oderiv_5th, oderiv_50th, oderiv_95th] + stats_table[component_name]["type"] = component_type + stats_table[component_name]["stats"] = {} + stats_table[component_name]["stats"][iteration] = [ + value_mean, + value_stddev, + deriv_mean, + deriv_stddev, + oderiv_mean, + oderiv_stddev, + value_5th, + value_50th, + value_95th, + deriv_5th, + deriv_50th, + deriv_95th, + oderiv_5th, + oderiv_50th, + oderiv_95th, + ] -def parse_progress_logs_for_nonlinearity_stats(exp_dir): - """ Parse progress logs for mean and std stats for non-linearities. +def parse_progress_logs_for_nonlinearity_stats(exp_dir): + """Parse progress logs for mean and std stats for non-linearities. e.g. 
for a line that is parsed from progress.*.log: exp/nnet3/lstm_self_repair_ld5_sp/log/progress.9.log:component name=Lstm3_i type=SigmoidComponent, dim=1280, self-repair-scale=1e-05, count=1.96e+05, @@ -164,7 +235,8 @@ def parse_progress_logs_for_nonlinearity_stats(exp_dir): progress_log_lines = common_lib.get_command_stdout( 'grep -e "value-avg.*deriv-avg.*oderiv" {0}'.format(progress_log_files), - require_zero_status = False) + require_zero_status=False, + ) if progress_log_lines: # cases with oderiv-rms @@ -172,8 +244,9 @@ def parse_progress_logs_for_nonlinearity_stats(exp_dir): else: # cases with only value-avg and deriv-avg progress_log_lines = common_lib.get_command_stdout( - 'grep -e "value-avg.*deriv-avg" {0}'.format(progress_log_files), - require_zero_status = False) + 'grep -e "value-avg.*deriv-avg" {0}'.format(progress_log_files), + require_zero_status=False, + ) parse_regex = re.compile(g_normal_nonlin_regex_pattern) for line in progress_log_lines.split("\n"): @@ -184,17 +257,19 @@ def parse_progress_logs_for_nonlinearity_stats(exp_dir): # '0.009...0.21', '0.134', '0.0397') groups = mat_obj.groups() component_type = groups[2] - if component_type == 'LstmNonlinearity': + if component_type == "LstmNonlinearity": parse_regex_lstmp = re.compile(g_lstmp_nonlin_regex_pattern) mat_obj = parse_regex_lstmp.search(line) groups = mat_obj.groups() assert len(groups) == 33 - for i in list(range(0,5)): - fill_nonlin_stats_table_with_regex_result(groups, i, - stats_per_component_per_iter) + for i in list(range(0, 5)): + fill_nonlin_stats_table_with_regex_result( + groups, i, stats_per_component_per_iter + ) else: - fill_nonlin_stats_table_with_regex_result(groups, 0, - stats_per_component_per_iter) + fill_nonlin_stats_table_with_regex_result( + groups, 0, stats_per_component_per_iter + ) return stats_per_component_per_iter @@ -208,13 +283,15 @@ def parse_difference_string(string): class MalformedClippedProportionLineException(Exception): def __init__(self, line): - Exception.__init__(self, - "Malformed line encountered while trying to " - "extract clipped-proportions.\n{0}".format(line)) + Exception.__init__( + self, + "Malformed line encountered while trying to " + "extract clipped-proportions.\n{0}".format(line), + ) def parse_progress_logs_for_clipped_proportion(exp_dir): - """ Parse progress logs for clipped proportion stats. + """Parse progress logs for clipped proportion stats. e.g. 
for a line that is parsed from progress.*.log: exp/chain/cwrnn_trial2_ld5_sp/log/progress.245.log:component @@ -228,12 +305,14 @@ def parse_progress_logs_for_clipped_proportion(exp_dir): progress_log_files = "%s/log/progress.*.log" % (exp_dir) component_names = set([]) progress_log_lines = common_lib.get_command_stdout( - 'grep -e "{0}" {1}'.format( - "clipped-proportion", progress_log_files), - require_zero_status=False) - parse_regex = re.compile(".*progress\.([0-9]+)\.log:component " - "name=(.*) type=.* " - "clipped-proportion=([0-9\.e\-]+)") + 'grep -e "{0}" {1}'.format("clipped-proportion", progress_log_files), + require_zero_status=False, + ) + parse_regex = re.compile( + ".*progress\.([0-9]+)\.log:component " + "name=(.*) type=.* " + "clipped-proportion=([0-9\.e\-]+)" + ) cp_per_component_per_iter = {} @@ -265,8 +344,8 @@ def parse_progress_logs_for_clipped_proportion(exp_dir): for component_name in component_names: cp_per_iter_per_component[component_name] = [] data = [] - data.append(["iteration"]+component_names) - for iter in range(max_iteration+1): + data.append(["iteration"] + component_names) + for iter in range(max_iteration + 1): if iter not in cp_per_component_per_iter: continue comp_dict = cp_per_component_per_iter[iter] @@ -275,7 +354,8 @@ def parse_progress_logs_for_clipped_proportion(exp_dir): try: row.append(comp_dict[component]) cp_per_iter_per_component[component].append( - [iter, comp_dict[component]]) + [iter, comp_dict[component]] + ) except KeyError: # if clipped proportion is not available for a particular # component it is set to None @@ -284,13 +364,15 @@ def parse_progress_logs_for_clipped_proportion(exp_dir): row.append(None) data.append(row) - return {'table': data, - 'cp_per_component_per_iter': cp_per_component_per_iter, - 'cp_per_iter_per_component': cp_per_iter_per_component} + return { + "table": data, + "cp_per_component_per_iter": cp_per_component_per_iter, + "cp_per_iter_per_component": cp_per_iter_per_component, + } def parse_progress_logs_for_param_diff(exp_dir, pattern): - """ Parse progress logs for per-component parameter differences. + """Parse progress logs for per-component parameter differences. e.g. 
for a line that is parsed from progress.*.log: exp/chain/cwrnn_trial2_ld5_sp/log/progress.245.log:LOG @@ -304,17 +386,18 @@ def parse_progress_logs_for_param_diff(exp_dir, pattern): Cwrnn3_T2_W_x:1.3296e-07 Cwrnn3_T1_W_r:0.0194809 Cwrnn3_T1_W_x:0.0271934 ] """ - if pattern not in set(["Relative parameter differences", - "Parameter differences"]): + if pattern not in set(["Relative parameter differences", "Parameter differences"]): raise Exception("Unknown value for pattern : {0}".format(pattern)) progress_log_files = "%s/log/progress.*.log" % (exp_dir) progress_per_iter = {} component_names = set([]) progress_log_lines = common_lib.get_command_stdout( - 'grep -e "{0}" {1}'.format(pattern, progress_log_files)) - parse_regex = re.compile(".*progress\.([0-9]+)\.log:" - "LOG.*{0}.*\[(.*)\]".format(pattern)) + 'grep -e "{0}" {1}'.format(pattern, progress_log_files) + ) + parse_regex = re.compile( + ".*progress\.([0-9]+)\.log:" "LOG.*{0}.*\[(.*)\]".format(pattern) + ) for line in progress_log_lines.split("\n"): mat_obj = parse_regex.search(line) if mat_obj is None: @@ -345,34 +428,47 @@ def parse_progress_logs_for_param_diff(exp_dir, pattern): for component_name in component_names: try: progress_per_component[component_name][iter] = component_dict[ - component_name] + component_name + ] except KeyError: total_missing_iterations += 1 # the component was not found this iteration, may be because of # layerwise discriminative training pass - if (total_missing_iterations/len(component_names) > 20 - and not gave_user_warning and logger is not None): - logger.warning("There are more than {0} missing iterations per " - "component. Something might be wrong.".format( - total_missing_iterations/len(component_names))) + if ( + total_missing_iterations / len(component_names) > 20 + and not gave_user_warning + and logger is not None + ): + logger.warning( + "There are more than {0} missing iterations per " + "component. 
Something might be wrong.".format( + total_missing_iterations / len(component_names) + ) + ) gave_user_warning = True - return {'progress_per_component': progress_per_component, - 'component_names': component_names, - 'max_iter': max_iter} + return { + "progress_per_component": progress_per_component, + "component_names": component_names, + "max_iter": max_iter, + } def get_train_times(exp_dir): train_log_files = "%s/log/" % (exp_dir) train_log_names = "train.*.log" train_log_lines = common_lib.get_command_stdout( - 'find {0} -name "{1}" | xargs grep -H -e Accounting'.format(train_log_files,train_log_names)) - parse_regex = re.compile(".*train\.([0-9]+)\.([0-9]+)\.log:# " - "Accounting: time=([0-9]+) thread.*") + 'find {0} -name "{1}" | xargs grep -H -e Accounting'.format( + train_log_files, train_log_names + ) + ) + parse_regex = re.compile( + ".*train\.([0-9]+)\.([0-9]+)\.log:# " "Accounting: time=([0-9]+) thread.*" + ) train_times = {} - for line in train_log_lines.split('\n'): + for line in train_log_lines.split("\n"): mat_obj = parse_regex.search(line) if mat_obj is not None: groups = mat_obj.groups() @@ -387,13 +483,16 @@ def get_train_times(exp_dir): train_times[iter] = max(values) return train_times -def parse_prob_logs(exp_dir, key='accuracy', output="output"): + +def parse_prob_logs(exp_dir, key="accuracy", output="output"): train_prob_files = "%s/log/compute_prob_train.*.log" % (exp_dir) valid_prob_files = "%s/log/compute_prob_valid.*.log" % (exp_dir) train_prob_strings = common_lib.get_command_stdout( - 'grep -e {0} {1}'.format(key, train_prob_files)) + "grep -e {0} {1}".format(key, train_prob_files) + ) valid_prob_strings = common_lib.get_command_stdout( - 'grep -e {0} {1}'.format(key, valid_prob_files)) + "grep -e {0} {1}".format(key, valid_prob_files) + ) # LOG # (nnet3-chain-compute-prob:PrintTotalStats():nnet-chain-diagnostics.cc:149) @@ -409,22 +508,25 @@ def parse_prob_logs(exp_dir, key='accuracy', output="output"): ".*compute_prob_.*\.([0-9]+).log:LOG " ".nnet3.*compute-prob.*:PrintTotalStats..:" "nnet.*diagnostics.cc:[0-9]+. 
Overall ([a-zA-Z\-]+) for " - "'{output}'.*is ([0-9.\-e]+) .*per frame".format(output=output)) + "'{output}'.*is ([0-9.\-e]+) .*per frame".format(output=output) + ) train_objf = {} valid_objf = {} - for line in train_prob_strings.split('\n'): + for line in train_prob_strings.split("\n"): mat_obj = parse_regex.search(line) if mat_obj is not None: groups = mat_obj.groups() if groups[1] == key: train_objf[int(groups[0])] = groups[2] if not train_objf: - raise KaldiLogParseException("Could not find any lines with {k} in " - " {l}".format(k=key, l=train_prob_files)) + raise KaldiLogParseException( + "Could not find any lines with {k} in " + " {l}".format(k=key, l=train_prob_files) + ) - for line in valid_prob_strings.split('\n'): + for line in valid_prob_strings.split("\n"): mat_obj = parse_regex.search(line) if mat_obj is not None: groups = mat_obj.groups() @@ -432,25 +534,32 @@ def parse_prob_logs(exp_dir, key='accuracy', output="output"): valid_objf[int(groups[0])] = groups[2] if not valid_objf: - raise KaldiLogParseException("Could not find any lines with {k} in " - " {l}".format(k=key, l=valid_prob_files)) + raise KaldiLogParseException( + "Could not find any lines with {k} in " + " {l}".format(k=key, l=valid_prob_files) + ) iters = list(set(valid_objf.keys()).intersection(list(train_objf.keys()))) if not iters: - raise KaldiLogParseException("Could not any common iterations with" - " key {k} in both {tl} and {vl}".format( - k=key, tl=train_prob_files, vl=valid_prob_files)) + raise KaldiLogParseException( + "Could not any common iterations with" + " key {k} in both {tl} and {vl}".format( + k=key, tl=train_prob_files, vl=valid_prob_files + ) + ) iters.sort() - return list([(int(x), float(train_objf[x]), - float(valid_objf[x])) for x in iters]) + return list([(int(x), float(train_objf[x]), float(valid_objf[x])) for x in iters]) + -def parse_rnnlm_prob_logs(exp_dir, key='objf'): +def parse_rnnlm_prob_logs(exp_dir, key="objf"): train_prob_files = "%s/log/train.*.*.log" % (exp_dir) valid_prob_files = "%s/log/compute_prob.*.log" % (exp_dir) train_prob_strings = common_lib.get_command_stdout( - 'grep -e {0} {1}'.format(key, train_prob_files)) + "grep -e {0} {1}".format(key, train_prob_files) + ) valid_prob_strings = common_lib.get_command_stdout( - 'grep -e {0} {1}'.format(key, valid_prob_files)) + "grep -e {0} {1}".format(key, valid_prob_files) + ) # LOG # (rnnlm-train[5.3.36~8-2ec51]:PrintStatsOverall():rnnlm-core-training.cc:118) @@ -466,28 +575,32 @@ def parse_rnnlm_prob_logs(exp_dir, key='objf'): ".*train\.([0-9]+).1.log:LOG " ".rnnlm-train.*:PrintStatsOverall..:" "rnnlm.*training.cc:[0-9]+. Overall ([a-zA-Z\-]+) is " - ".*exact = \(.+\) = ([0-9.\-\+e]+)") + ".*exact = \(.+\) = ([0-9.\-\+e]+)" + ) parse_regex_valid = re.compile( ".*compute_prob\.([0-9]+).log:LOG " ".rnnlm.*compute-prob.*:PrintStatsOverall..:" "rnnlm.*training.cc:[0-9]+. 
Overall ([a-zA-Z\-]+) is " - ".*exact = \(.+\) = ([0-9.\-\+e]+)") + ".*exact = \(.+\) = ([0-9.\-\+e]+)" + ) train_objf = {} valid_objf = {} - for line in train_prob_strings.split('\n'): + for line in train_prob_strings.split("\n"): mat_obj = parse_regex_train.search(line) if mat_obj is not None: groups = mat_obj.groups() if groups[1] == key: train_objf[int(groups[0])] = groups[2] if not train_objf: - raise KaldiLogParseException("Could not find any lines with {k} in " - " {l}".format(k=key, l=train_prob_files)) + raise KaldiLogParseException( + "Could not find any lines with {k} in " + " {l}".format(k=key, l=train_prob_files) + ) - for line in valid_prob_strings.split('\n'): + for line in valid_prob_strings.split("\n"): mat_obj = parse_regex_valid.search(line) if mat_obj is not None: groups = mat_obj.groups() @@ -495,18 +608,21 @@ def parse_rnnlm_prob_logs(exp_dir, key='objf'): valid_objf[int(groups[0])] = groups[2] if not valid_objf: - raise KaldiLogParseException("Could not find any lines with {k} in " - " {l}".format(k=key, l=valid_prob_files)) + raise KaldiLogParseException( + "Could not find any lines with {k} in " + " {l}".format(k=key, l=valid_prob_files) + ) iters = list(set(valid_objf.keys()).intersection(list(train_objf.keys()))) if not iters: - raise KaldiLogParseException("Could not any common iterations with" - " key {k} in both {tl} and {vl}".format( - k=key, tl=train_prob_files, vl=valid_prob_files)) + raise KaldiLogParseException( + "Could not any common iterations with" + " key {k} in both {tl} and {vl}".format( + k=key, tl=train_prob_files, vl=valid_prob_files + ) + ) iters.sort() - return [(int(x), float(train_objf[x]), - float(valid_objf[x])) for x in iters] - + return [(int(x), float(train_objf[x]), float(valid_objf[x])) for x in iters] def generate_acc_logprob_report(exp_dir, key="accuracy", output="output"): @@ -521,7 +637,7 @@ def generate_acc_logprob_report(exp_dir, key="accuracy", output="output"): report.append("%Iter\tduration\ttrain_objective\tvalid_objective\tdifference") try: if key == "rnnlm_objective": - data = list(parse_rnnlm_prob_logs(exp_dir, 'objf')) + data = list(parse_rnnlm_prob_logs(exp_dir, "objf")) else: data = list(parse_prob_logs(exp_dir, key, output)) except: @@ -530,14 +646,18 @@ def generate_acc_logprob_report(exp_dir, key="accuracy", output="output"): data = [] for x in data: try: - report.append("%d\t%s\t%g\t%g\t%g" % (x[0], str(times[x[0]]), - x[1], x[2], x[2]-x[1])) + report.append( + "%d\t%s\t%g\t%g\t%g" % (x[0], str(times[x[0]]), x[1], x[2], x[2] - x[1]) + ) except (KeyError, IndexError): continue total_time = 0 for iter in times.keys(): total_time += times[iter] - report.append("Total training time is {0}\n".format( - str(datetime.timedelta(seconds=total_time)))) + report.append( + "Total training time is {0}\n".format( + str(datetime.timedelta(seconds=total_time)) + ) + ) return ["\n".join(report), times, data] diff --git a/egs2/TEMPLATE/asr1/steps/libs/nnet3/train/__init__.py b/egs2/TEMPLATE/asr1/steps/libs/nnet3/train/__init__.py index 0503c0135cd..0346762ba1b 100644 --- a/egs2/TEMPLATE/asr1/steps/libs/nnet3/train/__init__.py +++ b/egs2/TEMPLATE/asr1/steps/libs/nnet3/train/__init__.py @@ -1,4 +1,3 @@ - # Copyright 2016 Vimal Manohar # Apache 2.0 diff --git a/egs2/TEMPLATE/asr1/steps/libs/nnet3/train/chain_objf/__init__.py b/egs2/TEMPLATE/asr1/steps/libs/nnet3/train/chain_objf/__init__.py index d7fccd8bbe6..403840810e5 100644 --- a/egs2/TEMPLATE/asr1/steps/libs/nnet3/train/chain_objf/__init__.py +++ 
b/egs2/TEMPLATE/asr1/steps/libs/nnet3/train/chain_objf/__init__.py @@ -1,5 +1,3 @@ - - # Copyright 2016 Vimal Manohar # Apache 2.0. diff --git a/egs2/TEMPLATE/asr1/steps/libs/nnet3/train/chain_objf/acoustic_model.py b/egs2/TEMPLATE/asr1/steps/libs/nnet3/train/chain_objf/acoustic_model.py index 15807228fb3..847cb586d42 100644 --- a/egs2/TEMPLATE/asr1/steps/libs/nnet3/train/chain_objf/acoustic_model.py +++ b/egs2/TEMPLATE/asr1/steps/libs/nnet3/train/chain_objf/acoustic_model.py @@ -1,5 +1,3 @@ - - # Copyright 2016 Vijayaditya Peddinti. # 2016 Vimal Manohar # Apache 2.0. @@ -29,49 +27,74 @@ def create_phone_lm(dir, tree_dir, run_opts, lm_opts=None): in "tree_dir" """ try: - f = open(tree_dir + "/num_jobs", 'r') + f = open(tree_dir + "/num_jobs", "r") num_ali_jobs = int(f.readline()) assert num_ali_jobs > 0 except: - raise Exception("""There was an error getting the number of alignment - jobs from {0}/num_jobs""".format(tree_dir)) + raise Exception( + """There was an error getting the number of alignment + jobs from {0}/num_jobs""".format( + tree_dir + ) + ) - alignments=' '.join(['{0}/ali.{1}.gz'.format(tree_dir, job) - for job in range(1, num_ali_jobs + 1)]) + alignments = " ".join( + ["{0}/ali.{1}.gz".format(tree_dir, job) for job in range(1, num_ali_jobs + 1)] + ) common_lib.execute_command( """{command} {dir}/log/make_phone_lm.log \ gunzip -c {alignments} \| \ ali-to-phones {tree_dir}/final.mdl ark:- ark:- \| \ chain-est-phone-lm {lm_opts} ark:- {dir}/phone_lm.fst""".format( - command=run_opts.command, dir=dir, - alignments=alignments, - lm_opts=lm_opts if lm_opts is not None else '', - tree_dir=tree_dir)) + command=run_opts.command, + dir=dir, + alignments=alignments, + lm_opts=lm_opts if lm_opts is not None else "", + tree_dir=tree_dir, + ) + ) def create_denominator_fst(dir, tree_dir, run_opts): common_lib.execute_command( """copy-transition-model {tree_dir}/final.mdl \ - {dir}/0.trans_mdl""".format(dir=dir, tree_dir=tree_dir)) + {dir}/0.trans_mdl""".format( + dir=dir, tree_dir=tree_dir + ) + ) common_lib.execute_command( """{command} {dir}/log/make_den_fst.log \ chain-make-den-fst {dir}/tree {dir}/0.trans_mdl \ {dir}/phone_lm.fst \ {dir}/den.fst {dir}/normalization.fst""".format( - dir=dir, command=run_opts.command)) - - -def generate_chain_egs(dir, data, lat_dir, egs_dir, - left_context, right_context, - run_opts, stage=0, - left_tolerance=None, right_tolerance=None, - left_context_initial=-1, right_context_final=-1, - frame_subsampling_factor=3, - alignment_subsampling_factor=3, - online_ivector_dir=None, - frames_per_iter=20000, frames_per_eg_str="20", srand=0, - egs_opts=None, cmvn_opts=None): + dir=dir, command=run_opts.command + ) + ) + + +def generate_chain_egs( + dir, + data, + lat_dir, + egs_dir, + left_context, + right_context, + run_opts, + stage=0, + left_tolerance=None, + right_tolerance=None, + left_context_initial=-1, + right_context_final=-1, + frame_subsampling_factor=3, + alignment_subsampling_factor=3, + online_ivector_dir=None, + frames_per_iter=20000, + frames_per_eg_str="20", + srand=0, + egs_opts=None, + cmvn_opts=None, +): """Wrapper for steps/nnet3/chain/get_egs.sh See options in that script. 
@@ -95,41 +118,57 @@ def generate_chain_egs(dir, data, lat_dir, egs_dir, --frames-per-eg {frames_per_eg_str} \ --srand {srand} \ {data} {dir} {lat_dir} {egs_dir}""".format( - command=run_opts.egs_command, - cmvn_opts=cmvn_opts if cmvn_opts is not None else '', - ivector_dir=(online_ivector_dir - if online_ivector_dir is not None - else ''), - left_context=left_context, - right_context=right_context, - left_context_initial=left_context_initial, - right_context_final=right_context_final, - left_tolerance=(left_tolerance - if left_tolerance is not None - else ''), - right_tolerance=(right_tolerance - if right_tolerance is not None - else ''), - frame_subsampling_factor=frame_subsampling_factor, - alignment_subsampling_factor=alignment_subsampling_factor, - stage=stage, frames_per_iter=frames_per_iter, - frames_per_eg_str=frames_per_eg_str, srand=srand, - data=data, lat_dir=lat_dir, dir=dir, egs_dir=egs_dir, - egs_opts=egs_opts if egs_opts is not None else '')) - - -def train_new_models(dir, iter, srand, num_jobs, - num_archives_processed, num_archives, - raw_model_string, egs_dir, - apply_deriv_weights, - min_deriv_time, max_deriv_time_relative, - l2_regularize, xent_regularize, leaky_hmm_coefficient, - momentum, max_param_change, - shuffle_buffer_size, num_chunk_per_minibatch_str, - frame_subsampling_factor, run_opts, train_opts, - backstitch_training_scale=0.0, backstitch_training_interval=1, - use_multitask_egs=False, - chain_opts=''): + command=run_opts.egs_command, + cmvn_opts=cmvn_opts if cmvn_opts is not None else "", + ivector_dir=(online_ivector_dir if online_ivector_dir is not None else ""), + left_context=left_context, + right_context=right_context, + left_context_initial=left_context_initial, + right_context_final=right_context_final, + left_tolerance=(left_tolerance if left_tolerance is not None else ""), + right_tolerance=(right_tolerance if right_tolerance is not None else ""), + frame_subsampling_factor=frame_subsampling_factor, + alignment_subsampling_factor=alignment_subsampling_factor, + stage=stage, + frames_per_iter=frames_per_iter, + frames_per_eg_str=frames_per_eg_str, + srand=srand, + data=data, + lat_dir=lat_dir, + dir=dir, + egs_dir=egs_dir, + egs_opts=egs_opts if egs_opts is not None else "", + ) + ) + + +def train_new_models( + dir, + iter, + srand, + num_jobs, + num_archives_processed, + num_archives, + raw_model_string, + egs_dir, + apply_deriv_weights, + min_deriv_time, + max_deriv_time_relative, + l2_regularize, + xent_regularize, + leaky_hmm_coefficient, + momentum, + max_param_change, + shuffle_buffer_size, + num_chunk_per_minibatch_str, + frame_subsampling_factor, + run_opts, + train_opts, + backstitch_training_scale=0.0, + backstitch_training_interval=1, + use_multitask_egs=False, + chain_opts="", +): """ Called from train_one_iteration(), this method trains new models with 'num_jobs' jobs, and @@ -150,11 +189,15 @@ def train_new_models(dir, iter, srand, num_jobs, deriv_time_opts = [] if min_deriv_time is not None: - deriv_time_opts.append("--optimization.min-deriv-time={0}".format( - min_deriv_time)) + deriv_time_opts.append( + "--optimization.min-deriv-time={0}".format(min_deriv_time) + ) if max_deriv_time_relative is not None: - deriv_time_opts.append("--optimization.max-deriv-time-relative={0}".format( - int(max_deriv_time_relative))) + deriv_time_opts.append( + "--optimization.max-deriv-time-relative={0}".format( + int(max_deriv_time_relative) + ) + ) threads = [] # the GPU timing info is only printed if we use the --verbose=1 flag; this @@ -162,28 
+205,27 @@ def train_new_models(dir, iter, srand, num_jobs, # iteration. Don't do it on iteration 0 either, because we use a smaller # than normal minibatch size, and people may get confused thinking it's # slower for iteration 0 because of the verbose option. - verbose_opt = ("--verbose=1" if iter % 20 == 0 and iter > 0 else "") - for job in range(1, num_jobs+1): - + verbose_opt = "--verbose=1" if iter % 20 == 0 and iter > 0 else "" + for job in range(1, num_jobs + 1): # k is a zero-based index that we will derive the other indexes from. k = num_archives_processed + job - 1 # work out the 1-based archive index. archive_index = (k % num_archives) + 1 # previous : frame_shift = (k/num_archives) % frame_subsampling_factor - frame_shift = ((archive_index + k//num_archives) - % frame_subsampling_factor) + frame_shift = (archive_index + k // num_archives) % frame_subsampling_factor multitask_egs_opts = common_train_lib.get_multitask_egs_opts( egs_dir, egs_prefix="cegs.", archive_index=archive_index, - use_multitask_egs=use_multitask_egs) + use_multitask_egs=use_multitask_egs, + ) scp_or_ark = "scp" if use_multitask_egs else "ark" - cache_io_opts = (("--read-cache={dir}/cache.{iter}".format(dir=dir, - iter=iter) - if iter > 0 else "") + - (" --write-cache={0}/cache.{1}".format(dir, iter + 1) - if job == 1 else "")) + cache_io_opts = ( + "--read-cache={dir}/cache.{iter}".format(dir=dir, iter=iter) + if iter > 0 + else "" + ) + (" --write-cache={0}/cache.{1}".format(dir, iter + 1) if job == 1 else "") thread = common_lib.background_command( """{command} {train_queue_opt} {dir}/log/train.{iter}.{job}.log \ nnet3-chain-train {parallel_train_opts} {verbose_opt} \ @@ -205,29 +247,39 @@ def train_new_models(dir, iter, srand, num_jobs, --srand={srand} ark:- ark:- | nnet3-chain-merge-egs \ --minibatch-size={num_chunk_per_mb} ark:- ark:- |" \ {dir}/{next_iter}.{job}.raw""".format( - command=run_opts.command, - train_queue_opt=run_opts.train_queue_opt, - dir=dir, iter=iter, srand=iter + srand, - next_iter=iter + 1, job=job, - deriv_time_opts=" ".join(deriv_time_opts), - app_deriv_wts=apply_deriv_weights, - fr_shft=frame_shift, l2=l2_regularize, - train_opts=train_opts,chain_opts=chain_opts, - xent_reg=xent_regularize, leaky=leaky_hmm_coefficient, - cache_io_opts=cache_io_opts, - parallel_train_opts=run_opts.parallel_train_opts, - verbose_opt=verbose_opt, - momentum=momentum, max_param_change=max_param_change, - backstitch_training_scale=backstitch_training_scale, - backstitch_training_interval=backstitch_training_interval, - l2_regularize_factor=1.0/num_jobs, - raw_model=raw_model_string, - egs_dir=egs_dir, archive_index=archive_index, - buf_size=shuffle_buffer_size, - num_chunk_per_mb=num_chunk_per_minibatch_str, - multitask_egs_opts=multitask_egs_opts, - scp_or_ark=scp_or_ark), - require_zero_status=True) + command=run_opts.command, + train_queue_opt=run_opts.train_queue_opt, + dir=dir, + iter=iter, + srand=iter + srand, + next_iter=iter + 1, + job=job, + deriv_time_opts=" ".join(deriv_time_opts), + app_deriv_wts=apply_deriv_weights, + fr_shft=frame_shift, + l2=l2_regularize, + train_opts=train_opts, + chain_opts=chain_opts, + xent_reg=xent_regularize, + leaky=leaky_hmm_coefficient, + cache_io_opts=cache_io_opts, + parallel_train_opts=run_opts.parallel_train_opts, + verbose_opt=verbose_opt, + momentum=momentum, + max_param_change=max_param_change, + backstitch_training_scale=backstitch_training_scale, + backstitch_training_interval=backstitch_training_interval, + l2_regularize_factor=1.0 / num_jobs, + 
raw_model=raw_model_string, + egs_dir=egs_dir, + archive_index=archive_index, + buf_size=shuffle_buffer_size, + num_chunk_per_mb=num_chunk_per_minibatch_str, + multitask_egs_opts=multitask_egs_opts, + scp_or_ark=scp_or_ark, + ), + require_zero_status=True, + ) threads.append(thread) @@ -235,20 +287,36 @@ def train_new_models(dir, iter, srand, num_jobs, thread.join() -def train_one_iteration(dir, iter, srand, egs_dir, - num_jobs, num_archives_processed, num_archives, - learning_rate, shrinkage_value, - num_chunk_per_minibatch_str, - apply_deriv_weights, min_deriv_time, - max_deriv_time_relative, - l2_regularize, xent_regularize, - leaky_hmm_coefficient, - momentum, max_param_change, shuffle_buffer_size, - frame_subsampling_factor, - run_opts, dropout_edit_string="", train_opts="", chain_opts="", - backstitch_training_scale=0.0, backstitch_training_interval=1, - use_multitask_egs=False): - """ Called from steps/nnet3/chain/train.py for one iteration for +def train_one_iteration( + dir, + iter, + srand, + egs_dir, + num_jobs, + num_archives_processed, + num_archives, + learning_rate, + shrinkage_value, + num_chunk_per_minibatch_str, + apply_deriv_weights, + min_deriv_time, + max_deriv_time_relative, + l2_regularize, + xent_regularize, + leaky_hmm_coefficient, + momentum, + max_param_change, + shuffle_buffer_size, + frame_subsampling_factor, + run_opts, + dropout_edit_string="", + train_opts="", + chain_opts="", + backstitch_training_scale=0.0, + backstitch_training_interval=1, + use_multitask_egs=False, +): + """Called from steps/nnet3/chain/train.py for one iteration for neural network training with LF-MMI objective """ @@ -256,39 +324,46 @@ def train_one_iteration(dir, iter, srand, egs_dir, # Set off jobs doing some diagnostics, in the background. # Use the egs dir from the previous iteration for the diagnostics # check if different iterations use the same random seed - if os.path.exists('{0}/srand'.format(dir)): + if os.path.exists("{0}/srand".format(dir)): try: - saved_srand = int(open('{0}/srand'.format(dir)).readline().strip()) + saved_srand = int(open("{0}/srand".format(dir)).readline().strip()) except (IOError, ValueError): - logger.error("Exception while reading the random seed " - "for training") + logger.error("Exception while reading the random seed " "for training") raise if srand != saved_srand: - logger.warning("The random seed provided to this iteration " - "(srand={0}) is different from the one saved last " - "time (srand={1}). Using srand={0}.".format( - srand, saved_srand)) + logger.warning( + "The random seed provided to this iteration " + "(srand={0}) is different from the one saved last " + "time (srand={1}). 
Using srand={0}.".format(srand, saved_srand) + ) else: - with open('{0}/srand'.format(dir), 'w') as f: + with open("{0}/srand".format(dir), "w") as f: f.write(str(srand)) # Sets off some background jobs to compute train and # validation set objectives compute_train_cv_probabilities( - dir=dir, iter=iter, egs_dir=egs_dir, - l2_regularize=l2_regularize, xent_regularize=xent_regularize, - leaky_hmm_coefficient=leaky_hmm_coefficient, run_opts=run_opts, - use_multitask_egs=use_multitask_egs, chain_opts=chain_opts) + dir=dir, + iter=iter, + egs_dir=egs_dir, + l2_regularize=l2_regularize, + xent_regularize=xent_regularize, + leaky_hmm_coefficient=leaky_hmm_coefficient, + run_opts=run_opts, + use_multitask_egs=use_multitask_egs, + chain_opts=chain_opts, + ) if iter > 0: # Runs in the background compute_progress(dir, iter, run_opts) - do_average = (iter > 0) + do_average = iter > 0 - raw_model_string = ("nnet3-am-copy --raw=true --learning-rate={0} " - "--scale={1} {2}/{3}.mdl - |".format( - learning_rate, shrinkage_value, dir, iter)) + raw_model_string = ( + "nnet3-am-copy --raw=true --learning-rate={0} " + "--scale={1} {2}/{3}.mdl - |".format(learning_rate, shrinkage_value, dir, iter) + ) if do_average: cur_num_chunk_per_minibatch_str = num_chunk_per_minibatch_str @@ -300,36 +375,48 @@ def train_one_iteration(dir, iter, srand, egs_dir, # the objective function), and the smaller minibatch size will help to # keep the update stable. cur_num_chunk_per_minibatch_str = common_train_lib.halve_minibatch_size_str( - num_chunk_per_minibatch_str) + num_chunk_per_minibatch_str + ) cur_max_param_change = float(max_param_change) / math.sqrt(2) raw_model_string = raw_model_string + dropout_edit_string - train_new_models(dir=dir, iter=iter, srand=srand, num_jobs=num_jobs, - num_archives_processed=num_archives_processed, - num_archives=num_archives, - raw_model_string=raw_model_string, - egs_dir=egs_dir, - apply_deriv_weights=apply_deriv_weights, - min_deriv_time=min_deriv_time, - max_deriv_time_relative=max_deriv_time_relative, - l2_regularize=l2_regularize, - xent_regularize=xent_regularize, - leaky_hmm_coefficient=leaky_hmm_coefficient, - momentum=momentum, - max_param_change=cur_max_param_change, - shuffle_buffer_size=shuffle_buffer_size, - num_chunk_per_minibatch_str=cur_num_chunk_per_minibatch_str, - frame_subsampling_factor=frame_subsampling_factor, - run_opts=run_opts, train_opts=train_opts,chain_opts=chain_opts, - # linearly increase backstitch_training_scale during the - # first few iterations (hard-coded as 15) - backstitch_training_scale=(backstitch_training_scale * - iter / 15 if iter < 15 else backstitch_training_scale), - backstitch_training_interval=backstitch_training_interval, - use_multitask_egs=use_multitask_egs) + train_new_models( + dir=dir, + iter=iter, + srand=srand, + num_jobs=num_jobs, + num_archives_processed=num_archives_processed, + num_archives=num_archives, + raw_model_string=raw_model_string, + egs_dir=egs_dir, + apply_deriv_weights=apply_deriv_weights, + min_deriv_time=min_deriv_time, + max_deriv_time_relative=max_deriv_time_relative, + l2_regularize=l2_regularize, + xent_regularize=xent_regularize, + leaky_hmm_coefficient=leaky_hmm_coefficient, + momentum=momentum, + max_param_change=cur_max_param_change, + shuffle_buffer_size=shuffle_buffer_size, + num_chunk_per_minibatch_str=cur_num_chunk_per_minibatch_str, + frame_subsampling_factor=frame_subsampling_factor, + run_opts=run_opts, + train_opts=train_opts, + chain_opts=chain_opts, + # linearly increase backstitch_training_scale 
during the + # first few iterations (hard-coded as 15) + backstitch_training_scale=( + backstitch_training_scale * iter / 15 + if iter < 15 + else backstitch_training_scale + ), + backstitch_training_interval=backstitch_training_interval, + use_multitask_egs=use_multitask_egs, + ) [models_to_average, best_model] = common_train_lib.get_successful_models( - num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter)) + num_jobs, "{0}/log/train.{1}.%.log".format(dir, iter) + ) nnets_list = [] for n in models_to_average: nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) @@ -337,16 +424,14 @@ def train_one_iteration(dir, iter, srand, egs_dir, if do_average: # average the output of the different jobs. common_train_lib.get_average_nnet_model( - dir=dir, iter=iter, - nnets_list=" ".join(nnets_list), - run_opts=run_opts) + dir=dir, iter=iter, nnets_list=" ".join(nnets_list), run_opts=run_opts + ) else: # choose the best model from different jobs common_train_lib.get_best_nnet_model( - dir=dir, iter=iter, - best_model_index=best_model, - run_opts=run_opts) + dir=dir, iter=iter, best_model_index=best_model, run_opts=run_opts + ) try: for i in range(1, num_jobs + 1): @@ -357,31 +442,47 @@ def train_one_iteration(dir, iter, srand, egs_dir, new_model = "{0}/{1}.mdl".format(dir, iter + 1) if not os.path.isfile(new_model): - raise Exception("Could not find {0}, at the end of " - "iteration {1}".format(new_model, iter)) + raise Exception( + "Could not find {0}, at the end of " "iteration {1}".format(new_model, iter) + ) elif os.stat(new_model).st_size == 0: - raise Exception("{0} has size 0. Something went wrong in " - "iteration {1}".format(new_model, iter)) + raise Exception( + "{0} has size 0. Something went wrong in " + "iteration {1}".format(new_model, iter) + ) if os.path.exists("{0}/cache.{1}".format(dir, iter)): os.remove("{0}/cache.{1}".format(dir, iter)) def check_for_required_files(feat_dir, tree_dir, lat_dir=None): - files = ['{0}/feats.scp'.format(feat_dir), '{0}/ali.1.gz'.format(tree_dir), - '{0}/final.mdl'.format(tree_dir), '{0}/tree'.format(tree_dir)] + files = [ + "{0}/feats.scp".format(feat_dir), + "{0}/ali.1.gz".format(tree_dir), + "{0}/final.mdl".format(tree_dir), + "{0}/tree".format(tree_dir), + ] if lat_dir is not None: files += [ - '{0}/lat.1.gz'.format(lat_dir), '{0}/final.mdl'.format(lat_dir), - '{0}/num_jobs'.format(lat_dir)] + "{0}/lat.1.gz".format(lat_dir), + "{0}/final.mdl".format(lat_dir), + "{0}/num_jobs".format(lat_dir), + ] for file in files: if not os.path.isfile(file): - raise Exception('Expected {0} to exist.'.format(file)) + raise Exception("Expected {0} to exist.".format(file)) -def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, - max_lda_jobs=None, rand_prune=4.0, - lda_opts=None, use_multitask_egs=False): - """ Function to estimate and write LDA matrix from cegs +def compute_preconditioning_matrix( + dir, + egs_dir, + num_lda_jobs, + run_opts, + max_lda_jobs=None, + rand_prune=4.0, + lda_opts=None, + use_multitask_egs=False, +): + """Function to estimate and write LDA matrix from cegs This function is exactly similar to the version in module libs.nnet3.train.frame_level_objf.common except this uses cegs instead of @@ -394,13 +495,18 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, egs_dir, egs_prefix="cegs.", archive_index="JOB", - use_multitask_egs=use_multitask_egs) + use_multitask_egs=use_multitask_egs, + ) scp_or_ark = "scp" if use_multitask_egs else "ark" egs_rspecifier = ( "ark:nnet3-chain-copy-egs 
{multitask_egs_opts} "
        "{scp_or_ark}:{egs_dir}/cegs.JOB.{scp_or_ark} ark:- |"
-        "".format(egs_dir=egs_dir, scp_or_ark=scp_or_ark,
-                  multitask_egs_opts=multitask_egs_opts))
+        "".format(
+            egs_dir=egs_dir,
+            scp_or_ark=scp_or_ark,
+            multitask_egs_opts=multitask_egs_opts,
+        )
+    )
 
     # Write stats with the same format as stats for LDA.
     common_lib.execute_command(
@@ -408,27 +514,31 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts,
             nnet3-chain-acc-lda-stats --rand-prune={rand_prune} \
             {dir}/init.raw "{egs_rspecifier}" \
             {dir}/JOB.lda_stats""".format(
-            command=run_opts.command,
-            num_lda_jobs=num_lda_jobs,
-            dir=dir,
-            egs_rspecifier=egs_rspecifier,
-            rand_prune=rand_prune))
+            command=run_opts.command,
+            num_lda_jobs=num_lda_jobs,
+            dir=dir,
+            egs_rspecifier=egs_rspecifier,
+            rand_prune=rand_prune,
+        )
+    )
 
     # the above command would have generated dir/{1..num_lda_jobs}.lda_stats
-    lda_stat_files = ['{0}/{1}.lda_stats'.format(dir, x) for x in range(1, num_lda_jobs + 1)]
+    lda_stat_files = [
+        "{0}/{1}.lda_stats".format(dir, x) for x in range(1, num_lda_jobs + 1)
+    ]
 
     common_lib.execute_command(
         """{command} {dir}/log/sum_transform_stats.log \
                 sum-lda-accs {dir}/lda_stats {lda_stat_files}""".format(
-            command=run_opts.command,
-            dir=dir, lda_stat_files=" ".join(lda_stat_files)))
+            command=run_opts.command, dir=dir, lda_stat_files=" ".join(lda_stat_files)
+        )
+    )
 
     for file in lda_stat_files:
         try:
             os.remove(file)
         except OSError:
-            raise Exception("There was error while trying to remove "
-                            "lda stat files.")
+            raise Exception("There was an error while trying to remove " "lda stat files.")
 
     # this computes a fixed affine transform computed in the way we described
     # in Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled
     # variant of an LDA transform but without dimensionality reduction.
@@ -437,23 +547,25 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts,
         """{command} {dir}/log/get_transform.log \
                 nnet-get-feature-transform {lda_opts} {dir}/lda.mat \
                 {dir}/lda_stats""".format(
-            command=run_opts.command, dir=dir,
-            lda_opts=lda_opts if lda_opts is not None else ""))
+            command=run_opts.command,
+            dir=dir,
+            lda_opts=lda_opts if lda_opts is not None else "",
+        )
+    )
 
     common_lib.force_symlink("../lda.mat", "{0}/configs/lda.mat".format(dir))
 
 
 def prepare_initial_acoustic_model(dir, run_opts, srand=-1, input_model=None):
-    """ This function adds the first layer; It will also prepare the acoustic
-        model with the transition model.
-        If 'input_model' is specified, no initial network preparation(adding
-        the first layer) is done and this model is used as initial 'raw' model
-        instead of '0.raw' model to prepare '0.mdl' as acoustic model by adding the
-        transition model.
+    """This function adds the first layer; it will also prepare the acoustic
+    model with the transition model.
+    If 'input_model' is specified, no initial network preparation (adding
+    the first layer) is done and this model is used as initial 'raw' model
+    instead of '0.raw' model to prepare '0.mdl' as acoustic model by adding the
+    transition model.
""" if input_model is None: - common_train_lib.prepare_initial_network(dir, run_opts, - srand=srand) + common_train_lib.prepare_initial_network(dir, run_opts, srand=srand) # The model-format for a 'chain' acoustic model is just the transition # model and then the raw nnet, so we can use 'cat' to create this, as @@ -464,24 +576,34 @@ def prepare_initial_acoustic_model(dir, run_opts, srand=-1, input_model=None): common_lib.execute_command( """{command} {dir}/log/init_mdl.log \ nnet3-am-init {dir}/0.trans_mdl {raw_mdl} \ - {dir}/0.mdl""".format(command=run_opts.command, dir=dir, - raw_mdl=(input_model if input_model is not None - else '{0}/0.raw'.format(dir)))) - - -def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, - xent_regularize, leaky_hmm_coefficient, - run_opts, - use_multitask_egs=False, - chain_opts=""): - model = '{0}/{1}.mdl'.format(dir, iter) + {dir}/0.mdl""".format( + command=run_opts.command, + dir=dir, + raw_mdl=( + input_model if input_model is not None else "{0}/0.raw".format(dir) + ), + ) + ) + + +def compute_train_cv_probabilities( + dir, + iter, + egs_dir, + l2_regularize, + xent_regularize, + leaky_hmm_coefficient, + run_opts, + use_multitask_egs=False, + chain_opts="", +): + model = "{0}/{1}.mdl".format(dir, iter) scp_or_ark = "scp" if use_multitask_egs else "ark" egs_suffix = ".scp" if use_multitask_egs else ".cegs" multitask_egs_opts = common_train_lib.get_multitask_egs_opts( - egs_dir, - egs_prefix="valid_diagnostic.", - use_multitask_egs=use_multitask_egs) + egs_dir, egs_prefix="valid_diagnostic.", use_multitask_egs=use_multitask_egs + ) common_lib.background_command( """{command} {dir}/log/compute_prob_valid.{iter}.log \ @@ -490,17 +612,25 @@ def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, {model} {dir}/den.fst \ "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} {scp_or_ark}:{egs_dir}/valid_diagnostic{egs_suffix} \ ark:- | nnet3-chain-merge-egs --minibatch-size=1:64 ark:- ark:- |" \ - """.format(command=run_opts.command, dir=dir, iter=iter, model=model, - l2=l2_regularize, leaky=leaky_hmm_coefficient, - xent_reg=xent_regularize, - egs_dir=egs_dir, - multitask_egs_opts=multitask_egs_opts, - scp_or_ark=scp_or_ark, egs_suffix=egs_suffix, chain_opts=chain_opts)) + """.format( + command=run_opts.command, + dir=dir, + iter=iter, + model=model, + l2=l2_regularize, + leaky=leaky_hmm_coefficient, + xent_reg=xent_regularize, + egs_dir=egs_dir, + multitask_egs_opts=multitask_egs_opts, + scp_or_ark=scp_or_ark, + egs_suffix=egs_suffix, + chain_opts=chain_opts, + ) + ) multitask_egs_opts = common_train_lib.get_multitask_egs_opts( - egs_dir, - egs_prefix="train_diagnostic.", - use_multitask_egs=use_multitask_egs) + egs_dir, egs_prefix="train_diagnostic.", use_multitask_egs=use_multitask_egs + ) common_lib.background_command( """{command} {dir}/log/compute_prob_train.{iter}.log \ @@ -509,28 +639,39 @@ def compute_train_cv_probabilities(dir, iter, egs_dir, l2_regularize, {model} {dir}/den.fst \ "ark,bg:nnet3-chain-copy-egs {multitask_egs_opts} {scp_or_ark}:{egs_dir}/train_diagnostic{egs_suffix} \ ark:- | nnet3-chain-merge-egs --minibatch-size=1:64 ark:- ark:- |" \ - """.format(command=run_opts.command, dir=dir, iter=iter, model=model, - l2=l2_regularize, leaky=leaky_hmm_coefficient, - xent_reg=xent_regularize, - egs_dir=egs_dir, - multitask_egs_opts=multitask_egs_opts, - scp_or_ark=scp_or_ark, egs_suffix=egs_suffix, chain_opts=chain_opts)) + """.format( + command=run_opts.command, + dir=dir, + iter=iter, + model=model, + l2=l2_regularize, + 
leaky=leaky_hmm_coefficient, + xent_reg=xent_regularize, + egs_dir=egs_dir, + multitask_egs_opts=multitask_egs_opts, + scp_or_ark=scp_or_ark, + egs_suffix=egs_suffix, + chain_opts=chain_opts, + ) + ) def compute_progress(dir, iter, run_opts): - - prev_model = '{0}/{1}.mdl'.format(dir, iter - 1) - model = '{0}/{1}.mdl'.format(dir, iter) + prev_model = "{0}/{1}.mdl".format(dir, iter - 1) + model = "{0}/{1}.mdl".format(dir, iter) common_lib.background_command( """{command} {dir}/log/progress.{iter}.log \ nnet3-am-info {model} '&&' \ nnet3-show-progress --use-gpu=no {prev_model} {model} - """.format(command=run_opts.command, - dir=dir, - iter=iter, - model=model, - prev_model=prev_model)) + """.format( + command=run_opts.command, + dir=dir, + iter=iter, + model=model, + prev_model=prev_model, + ) + ) if iter % 10 == 0 and iter > 0: # Every 10 iters, print some more detailed information. # full_progress.X.log contains some diagnostics of the difference in @@ -538,29 +679,39 @@ def compute_progress(dir, iter, run_opts): common_lib.background_command( """{command} {dir}/log/full_progress.{iter}.log \ nnet3-show-progress --use-gpu=no --verbose=2 {prev_model} {model} - """.format(command=run_opts.command, - dir=dir, - iter=iter, - model=model, - prev_model=prev_model)) + """.format( + command=run_opts.command, + dir=dir, + iter=iter, + model=model, + prev_model=prev_model, + ) + ) # full_info.X.log is just the nnet3-info of the model, with the --verbose=2 # option which includes stats on the singular values of the parameter matrices. common_lib.background_command( """{command} {dir}/log/full_info.{iter}.log \ nnet3-info --verbose=2 {model} - """.format(command=run_opts.command, - dir=dir, - iter=iter, - model=model)) - - - -def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_str, - egs_dir, leaky_hmm_coefficient, l2_regularize, - xent_regularize, run_opts, - max_objective_evaluations=30, - use_multitask_egs=False): - """ Function to do model combination + """.format( + command=run_opts.command, dir=dir, iter=iter, model=model + ) + ) + + +def combine_models( + dir, + num_iters, + models_to_combine, + num_chunk_per_minibatch_str, + egs_dir, + leaky_hmm_coefficient, + l2_regularize, + xent_regularize, + run_opts, + max_objective_evaluations=30, + use_multitask_egs=False, +): + """Function to do model combination In the nnet3 setup, the logic for doing averaging of subsets of the models in the case where @@ -573,22 +724,23 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st models_to_combine.add(num_iters) for iter in sorted(models_to_combine): - model_file = '{0}/{1}.mdl'.format(dir, iter) + model_file = "{0}/{1}.mdl".format(dir, iter) if os.path.exists(model_file): # we used to copy them with nnet3-am-copy --raw=true, but now # the raw-model-reading code discards the other stuff itself. 
raw_model_strings.append(model_file) else: - print("{0}: warning: model file {1} does not exist " - "(final combination)".format(sys.argv[0], model_file)) + print( + "{0}: warning: model file {1} does not exist " + "(final combination)".format(sys.argv[0], model_file) + ) scp_or_ark = "scp" if use_multitask_egs else "ark" egs_suffix = ".scp" if use_multitask_egs else ".cegs" multitask_egs_opts = common_train_lib.get_multitask_egs_opts( - egs_dir, - egs_prefix="combine.", - use_multitask_egs=use_multitask_egs) + egs_dir, egs_prefix="combine.", use_multitask_egs=use_multitask_egs + ) # We reverse the order of the raw model strings so that the freshest one # goes first. This is important for systems that include batch @@ -609,24 +761,33 @@ def combine_models(dir, num_iters, models_to_combine, num_chunk_per_minibatch_st ark:- ark:- |" - \| \ nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl \ {dir}/final.mdl""".format( - command=run_opts.command, - combine_queue_opt=run_opts.combine_queue_opt, - combine_gpu_opt=run_opts.combine_gpu_opt, - max_objective_evaluations=max_objective_evaluations, - l2=l2_regularize, leaky=leaky_hmm_coefficient, - dir=dir, raw_models=" ".join(raw_model_strings), - num_chunk_per_mb=num_chunk_per_minibatch_str, - num_iters=num_iters, - egs_dir=egs_dir, - multitask_egs_opts=multitask_egs_opts, - scp_or_ark=scp_or_ark, egs_suffix=egs_suffix)) + command=run_opts.command, + combine_queue_opt=run_opts.combine_queue_opt, + combine_gpu_opt=run_opts.combine_gpu_opt, + max_objective_evaluations=max_objective_evaluations, + l2=l2_regularize, + leaky=leaky_hmm_coefficient, + dir=dir, + raw_models=" ".join(raw_model_strings), + num_chunk_per_mb=num_chunk_per_minibatch_str, + num_iters=num_iters, + egs_dir=egs_dir, + multitask_egs_opts=multitask_egs_opts, + scp_or_ark=scp_or_ark, + egs_suffix=egs_suffix, + ) + ) # Compute the probability of the final, combined model with # the same subset we used for the previous compute_probs, as the # different subsets will lead to different probs. compute_train_cv_probabilities( - dir=dir, iter='final', egs_dir=egs_dir, - l2_regularize=l2_regularize, xent_regularize=xent_regularize, + dir=dir, + iter="final", + egs_dir=egs_dir, + l2_regularize=l2_regularize, + xent_regularize=xent_regularize, leaky_hmm_coefficient=leaky_hmm_coefficient, run_opts=run_opts, - use_multitask_egs=use_multitask_egs) + use_multitask_egs=use_multitask_egs, + ) diff --git a/egs2/TEMPLATE/asr1/steps/libs/nnet3/train/common.py b/egs2/TEMPLATE/asr1/steps/libs/nnet3/train/common.py index eac376f165d..ea29cd17a3f 100644 --- a/egs2/TEMPLATE/asr1/steps/libs/nnet3/train/common.py +++ b/egs2/TEMPLATE/asr1/steps/libs/nnet3/train/common.py @@ -1,5 +1,3 @@ - - # Copyright 2016 Vijayaditya Peddinti. # 2016 Vimal Manohar # Apache 2.0 @@ -41,81 +39,83 @@ def __init__(self): self.prior_queue_opt = None self.parallel_train_opts = None + def get_outputs_list(model_file, get_raw_nnet_from_am=True): - """ Generates list of output-node-names used in nnet3 model configuration. - It will normally return 'output'. + """Generates list of output-node-names used in nnet3 model configuration. + It will normally return 'output'. 
""" if get_raw_nnet_from_am: outputs_list = common_lib.get_command_stdout( "nnet3-am-info --print-args=false {0} | " - "grep -e 'output-node' | cut -f2 -d' ' | cut -f2 -d'=' ".format(model_file)) + "grep -e 'output-node' | cut -f2 -d' ' | cut -f2 -d'=' ".format(model_file) + ) else: outputs_list = common_lib.get_command_stdout( "nnet3-info --print-args=false {0} | " - "grep -e 'output-node' | cut -f2 -d' ' | cut -f2 -d'=' ".format(model_file)) + "grep -e 'output-node' | cut -f2 -d' ' | cut -f2 -d'=' ".format(model_file) + ) return outputs_list.split() -def get_multitask_egs_opts(egs_dir, egs_prefix="", - archive_index=-1, - use_multitask_egs=False): - """ Generates egs option for multitask(or multilingual) training setup, - if {egs_prefix}output.*.ark or {egs_prefix}weight.*.ark files exists in egs_dir. - Each line in {egs_prefix}*.scp has a corresponding line containing - name of the output-node in the network and language-dependent weight in - {egs_prefix}output.*.ark or {egs_prefix}weight.*.ark respectively. - e.g. Returns the empty string ('') if use_multitask_egs == False, - otherwise something like: - '--output=ark:foo/egs/output.3.ark --weight=ark:foo/egs/weights.3.ark' - i.e. egs_prefix is "" for train and - "valid_diagnostic." for validation. - - Caution: archive_index is usually an integer, but may be a string ("JOB") - in some cases. +def get_multitask_egs_opts( + egs_dir, egs_prefix="", archive_index=-1, use_multitask_egs=False +): + """Generates egs option for multitask(or multilingual) training setup, + if {egs_prefix}output.*.ark or {egs_prefix}weight.*.ark files exists in egs_dir. + Each line in {egs_prefix}*.scp has a corresponding line containing + name of the output-node in the network and language-dependent weight in + {egs_prefix}output.*.ark or {egs_prefix}weight.*.ark respectively. + e.g. Returns the empty string ('') if use_multitask_egs == False, + otherwise something like: + '--output=ark:foo/egs/output.3.ark --weight=ark:foo/egs/weights.3.ark' + i.e. egs_prefix is "" for train and + "valid_diagnostic." for validation. + + Caution: archive_index is usually an integer, but may be a string ("JOB") + in some cases. 
""" multitask_egs_opts = "" egs_suffix = ".{0}".format(archive_index) if archive_index != -1 else "" if use_multitask_egs: - output_file_name = ("{egs_dir}/{egs_prefix}output{egs_suffix}.ark" - "".format(egs_dir=egs_dir, - egs_prefix=egs_prefix, - egs_suffix=egs_suffix)) + output_file_name = "{egs_dir}/{egs_prefix}output{egs_suffix}.ark" "".format( + egs_dir=egs_dir, egs_prefix=egs_prefix, egs_suffix=egs_suffix + ) output_rename_opt = "" if os.path.isfile(output_file_name): - output_rename_opt = ("--outputs=ark:{output_file_name}".format( - output_file_name=output_file_name)) + output_rename_opt = "--outputs=ark:{output_file_name}".format( + output_file_name=output_file_name + ) - weight_file_name = ("{egs_dir}/{egs_prefix}weight{egs_suffix}.ark" - "".format(egs_dir=egs_dir, - egs_prefix=egs_prefix, - egs_suffix=egs_suffix)) + weight_file_name = "{egs_dir}/{egs_prefix}weight{egs_suffix}.ark" "".format( + egs_dir=egs_dir, egs_prefix=egs_prefix, egs_suffix=egs_suffix + ) weight_opt = "" if os.path.isfile(weight_file_name): - weight_opt = ("--weights=ark:{weight_file_name}" - "".format(weight_file_name=weight_file_name)) + weight_opt = "--weights=ark:{weight_file_name}" "".format( + weight_file_name=weight_file_name + ) - multitask_egs_opts = ( - "{output_rename_opt} {weight_opt}".format( - output_rename_opt=output_rename_opt, - weight_opt=weight_opt)) + multitask_egs_opts = "{output_rename_opt} {weight_opt}".format( + output_rename_opt=output_rename_opt, weight_opt=weight_opt + ) return multitask_egs_opts -def get_successful_models(num_models, log_file_pattern, - difference_threshold=1.0): +def get_successful_models(num_models, log_file_pattern, difference_threshold=1.0): assert num_models > 0 parse_regex = re.compile( "LOG .* Overall average objective function for " - "'output' is ([0-9e.\-+= ]+) over ([0-9e.\-+]+) frames") + "'output' is ([0-9e.\-+= ]+) over ([0-9e.\-+]+) frames" + ) objf = [] for i in range(num_models): model_num = i + 1 - logfile = re.sub('%', str(model_num), log_file_pattern) - lines = open(logfile, 'r').readlines() + logfile = re.sub("%", str(model_num), log_file_pattern) + lines = open(logfile, "r").readlines() this_objf = -100000.0 for line_num in range(1, len(lines) + 1): # we search from the end as this would result in @@ -132,60 +132,65 @@ def get_successful_models(num_models, log_file_pattern, accepted_models.append(i + 1) if len(accepted_models) != num_models: - logger.warn("Only {0}/{1} of the models have been accepted " - "for averaging, based on log files {2}.".format( - len(accepted_models), - num_models, log_file_pattern)) + logger.warn( + "Only {0}/{1} of the models have been accepted " + "for averaging, based on log files {2}.".format( + len(accepted_models), num_models, log_file_pattern + ) + ) return [accepted_models, max_index + 1] -def get_average_nnet_model(dir, iter, nnets_list, run_opts, - get_raw_nnet_from_am=True): - +def get_average_nnet_model(dir, iter, nnets_list, run_opts, get_raw_nnet_from_am=True): next_iter = iter + 1 if get_raw_nnet_from_am: - out_model = ("""- \| nnet3-am-copy --set-raw-nnet=- \ + out_model = """- \| nnet3-am-copy --set-raw-nnet=- \ {dir}/{iter}.mdl {dir}/{next_iter}.mdl""".format( - dir=dir, iter=iter, - next_iter=next_iter)) + dir=dir, iter=iter, next_iter=next_iter + ) else: - out_model = "{dir}/{next_iter}.raw".format( - dir=dir, next_iter=next_iter) + out_model = "{dir}/{next_iter}.raw".format(dir=dir, next_iter=next_iter) common_lib.execute_command( """{command} {dir}/log/average.{iter}.log \ nnet3-average 
{nnets_list} \ - {out_model}""".format(command=run_opts.command, - dir=dir, - iter=iter, - nnets_list=nnets_list, - out_model=out_model)) - - -def get_best_nnet_model(dir, iter, best_model_index, run_opts, - get_raw_nnet_from_am=True): - + {out_model}""".format( + command=run_opts.command, + dir=dir, + iter=iter, + nnets_list=nnets_list, + out_model=out_model, + ) + ) + + +def get_best_nnet_model( + dir, iter, best_model_index, run_opts, get_raw_nnet_from_am=True +): best_model = "{dir}/{next_iter}.{best_model_index}.raw".format( - dir=dir, - next_iter=iter + 1, - best_model_index=best_model_index) + dir=dir, next_iter=iter + 1, best_model_index=best_model_index + ) if get_raw_nnet_from_am: - out_model = ("""- \| nnet3-am-copy --set-raw-nnet=- \ + out_model = """- \| nnet3-am-copy --set-raw-nnet=- \ {dir}/{iter}.mdl {dir}/{next_iter}.mdl""".format( - dir=dir, iter=iter, next_iter=iter + 1)) + dir=dir, iter=iter, next_iter=iter + 1 + ) else: - out_model = "{dir}/{next_iter}.raw".format(dir=dir, - next_iter=iter + 1) + out_model = "{dir}/{next_iter}.raw".format(dir=dir, next_iter=iter + 1) common_lib.execute_command( """{command} {dir}/log/select.{iter}.log \ nnet3-copy {best_model} \ - {out_model}""".format(command=run_opts.command, - dir=dir, iter=iter, - best_model=best_model, - out_model=out_model)) + {out_model}""".format( + command=run_opts.command, + dir=dir, + iter=iter, + best_model=best_model, + out_model=out_model, + ) + ) def validate_chunk_width(chunk_width): @@ -257,7 +262,7 @@ def validate_minibatch_size_str(minibatch_size_str): assert len(a) != 0 # would be code error for elem in a: - b = elem.split('=') + b = elem.split("=") # We expect b to have length 2 in the normal case. if len(b) != 2: # one-element 'b' is OK if len(a) is 1 (so there is only @@ -291,9 +296,9 @@ def halve_range_str(range_str): halved_ranges = [] for r in ranges: # a range may be either e.g. '64', or '128:256' - c = [str(max(1, int(x)//2)) for x in r.split(":")] + c = [str(max(1, int(x) // 2)) for x in r.split(":")] halved_ranges.append(":".join(c)) - return ','.join(halved_ranges) + return ",".join(halved_ranges) def halve_minibatch_size_str(minibatch_size_str): @@ -303,53 +308,63 @@ def halve_minibatch_size_str(minibatch_size_str): sizes (as opposed to chunk-lengths) and that are >1.""" if not validate_minibatch_size_str(minibatch_size_str): - raise Exception("Invalid minibatch-size string '{0}'".format(minibatch_size_str)) + raise Exception( + "Invalid minibatch-size string '{0}'".format(minibatch_size_str) + ) a = minibatch_size_str.split("/") ans = [] for elem in a: - b = elem.split('=') + b = elem.split("=") # We expect b to have length 2 in the normal case. 
if len(b) == 1: return halve_range_str(elem) else: assert len(b) == 2 - ans.append('{0}={1}'.format(b[0], halve_range_str(b[1]))) - return '/'.join(ans) + ans.append("{0}={1}".format(b[0], halve_range_str(b[1]))) + return "/".join(ans) def copy_egs_properties_to_exp_dir(egs_dir, dir): try: - for file in ['cmvn_opts', 'splice_opts', 'info/final.ie.id', 'final.mat', - 'global_cmvn.stats', 'online_cmvn']: - file_name = '{dir}/{file}'.format(dir=egs_dir, file=file) + for file in [ + "cmvn_opts", + "splice_opts", + "info/final.ie.id", + "final.mat", + "global_cmvn.stats", + "online_cmvn", + ]: + file_name = "{dir}/{file}".format(dir=egs_dir, file=file) if os.path.isfile(file_name): shutil.copy(file_name, dir) except IOError: - logger.error("Error while trying to copy egs " - "property files to {dir}".format(dir=dir)) + logger.error( + "Error while trying to copy egs " "property files to {dir}".format(dir=dir) + ) raise def parse_generic_config_vars_file(var_file): variables = {} try: - var_file_handle = open(var_file, 'r') + var_file_handle = open(var_file, "r") for line in var_file_handle: - parts = line.split('=') + parts = line.split("=") field_name = parts[0].strip() field_value = parts[1].strip() - if field_name in ['model_left_context', 'left_context']: - variables['model_left_context'] = int(field_value) - elif field_name in ['model_right_context', 'right_context']: - variables['model_right_context'] = int(field_value) - elif field_name == 'num_hidden_layers': + if field_name in ["model_left_context", "left_context"]: + variables["model_left_context"] = int(field_value) + elif field_name in ["model_right_context", "right_context"]: + variables["model_right_context"] = int(field_value) + elif field_name == "num_hidden_layers": if int(field_value) > 1: raise Exception( "You have num_hidden_layers={0} (real meaning: your config files " "are intended to do discriminative pretraining). Since Kaldi 5.2, " "this is no longer supported --> use newer config-creation scripts, " - "i.e. xconfig_to_configs.py.".format(field_value)) + "i.e. xconfig_to_configs.py.".format(field_value) + ) else: variables[field_name] = field_value @@ -358,20 +373,21 @@ def parse_generic_config_vars_file(var_file): # we will throw an error at the end of the function so I will just pass pass - raise Exception('Error while parsing the file {0}'.format(var_file)) + raise Exception("Error while parsing the file {0}".format(var_file)) def get_input_model_info(input_model): - """ This function returns a dictionary with keys "model_left_context" and - "model_right_context" and values equal to the left/right model contexts - for input_model. - This function is useful when using the --trainer.input-model option - instead of initializing the model using configs. + """This function returns a dictionary with keys "model_left_context" and + "model_right_context" and values equal to the left/right model contexts + for input_model. + This function is useful when using the --trainer.input-model option + instead of initializing the model using configs. 
""" variables = {} try: - out = common_lib.get_command_stdout("""nnet3-info {0} | """ - """head -4 """.format(input_model)) + out = common_lib.get_command_stdout( + """nnet3-info {0} | """ """head -4 """.format(input_model) + ) # out looks like this # left-context: 7 # right-context: 0 @@ -381,29 +397,36 @@ def get_input_model_info(input_model): parts = line.split(":") if len(parts) != 2: continue - if parts[0].strip() == 'left-context': - variables['model_left_context'] = int(parts[1].strip()) - elif parts[0].strip() == 'right-context': - variables['model_right_context'] = int(parts[1].strip()) + if parts[0].strip() == "left-context": + variables["model_left_context"] = int(parts[1].strip()) + elif parts[0].strip() == "right-context": + variables["model_right_context"] = int(parts[1].strip()) except ValueError: pass return variables -def verify_egs_dir(egs_dir, feat_dim, ivector_dim, ivector_extractor_id, - left_context, right_context, - left_context_initial=-1, right_context_final=-1): +def verify_egs_dir( + egs_dir, + feat_dim, + ivector_dim, + ivector_extractor_id, + left_context, + right_context, + left_context_initial=-1, + right_context_final=-1, +): try: - egs_feat_dim = int(open('{0}/info/feat_dim'.format( - egs_dir)).readline()) + egs_feat_dim = int(open("{0}/info/feat_dim".format(egs_dir)).readline()) egs_ivector_id = None try: - egs_ivector_id = open('{0}/info/final.ie.id'.format( - egs_dir)).readline().strip() - if (egs_ivector_id == ""): - egs_ivector_id = None; + egs_ivector_id = ( + open("{0}/info/final.ie.id".format(egs_dir)).readline().strip() + ) + if egs_ivector_id == "": + egs_ivector_id = None except: # it could actually happen that the file is not there # for example in cases where the egs were dumped by @@ -411,54 +434,76 @@ def verify_egs_dir(egs_dir, feat_dim, ivector_dim, ivector_extractor_id, pass try: - egs_ivector_dim = int(open('{0}/info/ivector_dim'.format( - egs_dir)).readline()) + egs_ivector_dim = int( + open("{0}/info/ivector_dim".format(egs_dir)).readline() + ) except: egs_ivector_dim = 0 - egs_left_context = int(open('{0}/info/left_context'.format( - egs_dir)).readline()) - egs_right_context = int(open('{0}/info/right_context'.format( - egs_dir)).readline()) + egs_left_context = int(open("{0}/info/left_context".format(egs_dir)).readline()) + egs_right_context = int( + open("{0}/info/right_context".format(egs_dir)).readline() + ) try: - egs_left_context_initial = int(open('{0}/info/left_context_initial'.format( - egs_dir)).readline()) + egs_left_context_initial = int( + open("{0}/info/left_context_initial".format(egs_dir)).readline() + ) except: # older scripts didn't write this, treat it as -1 in that case. egs_left_context_initial = -1 try: - egs_right_context_final = int(open('{0}/info/right_context_final'.format( - egs_dir)).readline()) + egs_right_context_final = int( + open("{0}/info/right_context_final".format(egs_dir)).readline() + ) except: # older scripts didn't write this, treat it as -1 in that case. egs_right_context_final = -1 # if feat_dim was supplied as 0, it means the --feat-dir option was not # supplied to the script, so we simply don't know what the feature dim is. 
-    if (feat_dim != 0 and feat_dim != egs_feat_dim) or (ivector_dim != egs_ivector_dim):
-        raise Exception("There is mismatch between featdim/ivector_dim of "
-                        "the current experiment and the provided "
-                        "egs directory")
-
-    if (((egs_ivector_id is None) and (ivector_extractor_id is not None)) or
-        ((egs_ivector_id is not None) and (ivector_extractor_id is None))):
-        logger.warning("The ivector ids are used inconsistently. It's your "
-                       "responsibility to make sure the ivector extractor "
-                       "has been used consistently")
-        logger.warning("ivector id for egs: {0} in dir {1}".format(egs_ivector_id, egs_dir))
+    if (feat_dim != 0 and feat_dim != egs_feat_dim) or (
+        ivector_dim != egs_ivector_dim
+    ):
+        raise Exception(
+            "There is a mismatch between featdim/ivector_dim of "
+            "the current experiment and the provided "
+            "egs directory"
+        )
+
+    if ((egs_ivector_id is None) and (ivector_extractor_id is not None)) or (
+        (egs_ivector_id is not None) and (ivector_extractor_id is None)
+    ):
+        logger.warning(
+            "The ivector ids are used inconsistently. It's your "
+            "responsibility to make sure the ivector extractor "
+            "has been used consistently"
+        )
+        logger.warning(
+            "ivector id for egs: {0} in dir {1}".format(egs_ivector_id, egs_dir)
+        )
         logger.warning("ivector id for extractor: {0}".format(ivector_extractor_id))
-    elif ((egs_ivector_dim > 0) and (egs_ivector_id is None) and (ivector_extractor_id is None)):
-        logger.warning("The ivector ids are not used. It's your "
-                       "responsibility to make sure the ivector extractor "
-                       "has been used consistently")
+    elif (
+        (egs_ivector_dim > 0)
+        and (egs_ivector_id is None)
+        and (ivector_extractor_id is None)
+    ):
+        logger.warning(
+            "The ivector ids are not used. It's your "
+            "responsibility to make sure the ivector extractor "
+            "has been used consistently"
+        )
     elif ivector_extractor_id != egs_ivector_id:
-        raise Exception("The egs were generated using a different ivector "
-                        "extractor. id1 = {0}, id2={1}".format(
-                            ivector_extractor_id, egs_ivector_id));
-
-    if (egs_left_context < left_context or
-        egs_right_context < right_context):
-        raise Exception('The egs have insufficient (l,r) context ({0},{1}) '
-                        'versus expected ({2},{3})'.format(
-                            egs_left_context, egs_right_context,
-                            left_context, right_context))
+        raise Exception(
+            "The egs were generated using a different ivector "
+            "extractor. id1 = {0}, id2={1}".format(
+                ivector_extractor_id, egs_ivector_id
+            )
+        )
+
+    if egs_left_context < left_context or egs_right_context < right_context:
+        raise Exception(
+            "The egs have insufficient (l,r) context ({0},{1}) "
+            "versus expected ({2},{3})".format(
+                egs_left_context, egs_right_context, left_context, right_context
+            )
+        )
 
     # the condition on the initial/final context is an equality condition,
     # not an inequality condition, as there is no mechanism to 'correct' the
@@ -468,72 +513,88 @@ def verify_egs_dir(egs_dir, feat_dim, ivector_dim, ivector_extractor_id,
     # --egs.chunk-left-context-initial and --egs.chunk-right-context-final
     # options to make things matched up. [note: the model l/r context gets
     # added in, so you have to correct for changes in that.]
-    if (egs_left_context_initial != left_context_initial or
-        egs_right_context_final != right_context_final):
-        raise Exception('The egs have incorrect initial/final (l,r) context '
-                        '({0},{1}) versus expected ({2},{3}). 
See code from ' - 'where this exception was raised for more info'.format( - egs_left_context_initial, egs_right_context_final, - left_context_initial, right_context_final)) - - frames_per_eg_str = open('{0}/info/frames_per_eg'.format( - egs_dir)).readline().rstrip() + if ( + egs_left_context_initial != left_context_initial + or egs_right_context_final != right_context_final + ): + raise Exception( + "The egs have incorrect initial/final (l,r) context " + "({0},{1}) versus expected ({2},{3}). See code from " + "where this exception was raised for more info".format( + egs_left_context_initial, + egs_right_context_final, + left_context_initial, + right_context_final, + ) + ) + + frames_per_eg_str = ( + open("{0}/info/frames_per_eg".format(egs_dir)).readline().rstrip() + ) if not validate_chunk_width(frames_per_eg_str): - raise Exception("Invalid frames_per_eg in directory {0}/info".format( - egs_dir)) - num_archives = int(open('{0}/info/num_archives'.format( - egs_dir)).readline()) + raise Exception( + "Invalid frames_per_eg in directory {0}/info".format(egs_dir) + ) + num_archives = int(open("{0}/info/num_archives".format(egs_dir)).readline()) - return [egs_left_context, egs_right_context, - frames_per_eg_str, num_archives] + return [egs_left_context, egs_right_context, frames_per_eg_str, num_archives] except (IOError, ValueError): - logger.error("The egs dir {0} has missing or " - "malformed files.".format(egs_dir)) + logger.error( + "The egs dir {0} has missing or " "malformed files.".format(egs_dir) + ) raise -def compute_presoftmax_prior_scale(dir, alidir, num_jobs, run_opts, - presoftmax_prior_scale_power=-0.25): - +def compute_presoftmax_prior_scale( + dir, alidir, num_jobs, run_opts, presoftmax_prior_scale_power=-0.25 +): # getting the raw pdf count common_lib.execute_command( """{command} JOB=1:{num_jobs} {dir}/log/acc_pdf.JOB.log \ ali-to-post "ark:gunzip -c {alidir}/ali.JOB.gz|" ark:- \| \ post-to-tacc --per-pdf=true {alidir}/final.mdl ark:- \ - {dir}/pdf_counts.JOB""".format(command=run_opts.command, - num_jobs=num_jobs, - dir=dir, - alidir=alidir)) + {dir}/pdf_counts.JOB""".format( + command=run_opts.command, num_jobs=num_jobs, dir=dir, alidir=alidir + ) + ) common_lib.execute_command( """{command} {dir}/log/sum_pdf_counts.log \ vector-sum --binary=false {dir}/pdf_counts.* {dir}/pdf_counts \ - """.format(command=run_opts.command, dir=dir)) + """.format( + command=run_opts.command, dir=dir + ) + ) - for file in glob.glob('{0}/pdf_counts.*'.format(dir)): + for file in glob.glob("{0}/pdf_counts.*".format(dir)): os.remove(file) - pdf_counts = common_lib.read_kaldi_matrix('{0}/pdf_counts'.format(dir))[0] + pdf_counts = common_lib.read_kaldi_matrix("{0}/pdf_counts".format(dir))[0] scaled_counts = smooth_presoftmax_prior_scale_vector( pdf_counts, presoftmax_prior_scale_power=presoftmax_prior_scale_power, - smooth=0.01) + smooth=0.01, + ) output_file = "{0}/presoftmax_prior_scale.vec".format(dir) common_lib.write_kaldi_matrix(output_file, [scaled_counts]) - common_lib.force_symlink("../presoftmax_prior_scale.vec", - "{0}/configs/presoftmax_prior_scale.vec".format( - dir)) + common_lib.force_symlink( + "../presoftmax_prior_scale.vec", + "{0}/configs/presoftmax_prior_scale.vec".format(dir), + ) -def smooth_presoftmax_prior_scale_vector(pdf_counts, - presoftmax_prior_scale_power=-0.25, - smooth=0.01): +def smooth_presoftmax_prior_scale_vector( + pdf_counts, presoftmax_prior_scale_power=-0.25, smooth=0.01 +): total = sum(pdf_counts) average_count = float(total) / len(pdf_counts) scales = [] 
for i in range(len(pdf_counts)): - scales.append(math.pow(pdf_counts[i] + smooth * average_count, - presoftmax_prior_scale_power)) + scales.append( + math.pow( + pdf_counts[i] + smooth * average_count, presoftmax_prior_scale_power + ) + ) num_pdfs = len(pdf_counts) scaled_counts = [x * float(num_pdfs) / sum(scales) for x in scales] return scaled_counts @@ -543,27 +604,29 @@ def prepare_initial_network(dir, run_opts, srand=-3, input_model=None): if input_model is not None: shutil.copy(input_model, "{0}/0.raw".format(dir)) return - if os.path.exists(dir+"/configs/init.config"): + if os.path.exists(dir + "/configs/init.config"): common_lib.execute_command( """{command} {dir}/log/add_first_layer.log \ nnet3-init --srand={srand} {dir}/init.raw \ {dir}/configs/final.config {dir}/0.raw""".format( - command=run_opts.command, srand=srand, - dir=dir)) + command=run_opts.command, srand=srand, dir=dir + ) + ) else: common_lib.execute_command( """{command} {dir}/log/init_model.log \ nnet3-init --srand={srand} {dir}/configs/final.config {dir}/0.raw""".format( - command=run_opts.command, srand=srand, - dir=dir)) + command=run_opts.command, srand=srand, dir=dir + ) + ) -def get_model_combine_iters(num_iters, num_epochs, - num_archives, max_models_combine, - num_jobs_final): - """ Figures out the list of iterations for which we'll use those models - in the final model-averaging phase. (note: it's a weighted average - where the weights are worked out from a subset of training data.)""" +def get_model_combine_iters( + num_iters, num_epochs, num_archives, max_models_combine, num_jobs_final +): + """Figures out the list of iterations for which we'll use those models + in the final model-averaging phase. (note: it's a weighted average + where the weights are worked out from a subset of training data.)""" approx_iters_per_epoch_final = float(num_archives) / num_jobs_final # Note: it used to be that we would combine over an entire epoch, @@ -582,22 +645,27 @@ def get_model_combine_iters(num_iters, num_epochs, # But if this value is > max_models_combine, then the models # are subsampled to get these many models to combine. 
- num_iters_combine_initial = min(int(approx_iters_per_epoch_final/2) + 1, - int(num_iters/2)) + num_iters_combine_initial = min( + int(approx_iters_per_epoch_final / 2) + 1, int(num_iters / 2) + ) if num_iters_combine_initial > max_models_combine: subsample_model_factor = int( - float(num_iters_combine_initial) / max_models_combine) + float(num_iters_combine_initial) / max_models_combine + ) num_iters_combine = num_iters_combine_initial - models_to_combine = set(range( - num_iters - num_iters_combine_initial + 1, - num_iters + 1, subsample_model_factor)) + models_to_combine = set( + range( + num_iters - num_iters_combine_initial + 1, + num_iters + 1, + subsample_model_factor, + ) + ) models_to_combine.add(num_iters) else: subsample_model_factor = 1 - num_iters_combine = min(max_models_combine, num_iters//2) - models_to_combine = set(range(num_iters - num_iters_combine + 1, - num_iters + 1)) + num_iters_combine = min(max_models_combine, num_iters // 2) + models_to_combine = set(range(num_iters - num_iters_combine + 1, num_iters + 1)) return models_to_combine @@ -612,91 +680,116 @@ def get_current_num_jobs(it, num_it, start, step, end): return int(0.5 + ideal / step) * step -def get_learning_rate(iter, num_jobs, num_iters, num_archives_processed, - num_archives_to_process, - initial_effective_lrate, final_effective_lrate): +def get_learning_rate( + iter, + num_jobs, + num_iters, + num_archives_processed, + num_archives_to_process, + initial_effective_lrate, + final_effective_lrate, +): if iter + 1 >= num_iters: effective_learning_rate = final_effective_lrate else: - effective_learning_rate = ( - initial_effective_lrate - * math.exp(num_archives_processed - * math.log(float(final_effective_lrate) / initial_effective_lrate) - / num_archives_to_process)) + effective_learning_rate = initial_effective_lrate * math.exp( + num_archives_processed + * math.log(float(final_effective_lrate) / initial_effective_lrate) + / num_archives_to_process + ) return num_jobs * effective_learning_rate -def should_do_shrinkage(iter, model_file, shrink_saturation_threshold, - get_raw_nnet_from_am=True): - +def should_do_shrinkage( + iter, model_file, shrink_saturation_threshold, get_raw_nnet_from_am=True +): if iter == 0: return True if get_raw_nnet_from_am: output = common_lib.get_command_stdout( "nnet3-am-info {0} 2>/dev/null | " - "steps/nnet3/get_saturation.pl".format(model_file)) + "steps/nnet3/get_saturation.pl".format(model_file) + ) else: output = common_lib.get_command_stdout( "nnet3-info 2>/dev/null {0} | " - "steps/nnet3/get_saturation.pl".format(model_file)) + "steps/nnet3/get_saturation.pl".format(model_file) + ) output = output.strip().split("\n") try: assert len(output) == 1 saturation = float(output[0]) assert saturation >= 0 and saturation <= 1 except: - raise Exception("Something went wrong, could not get " - "saturation from the output '{0}' of " - "get_saturation.pl on the info of " - "model {1}".format(output, model_file)) + raise Exception( + "Something went wrong, could not get " + "saturation from the output '{0}' of " + "get_saturation.pl on the info of " + "model {1}".format(output, model_file) + ) return saturation > shrink_saturation_threshold def remove_nnet_egs(egs_dir): - common_lib.execute_command("steps/nnet2/remove_egs.sh {egs_dir}".format( - egs_dir=egs_dir)) - - -def clean_nnet_dir(nnet_dir, num_iters, egs_dir, - preserve_model_interval=100, - remove_egs=True, - get_raw_nnet_from_am=True): + common_lib.execute_command( + "steps/nnet2/remove_egs.sh 
{egs_dir}".format(egs_dir=egs_dir) + ) + + +def clean_nnet_dir( + nnet_dir, + num_iters, + egs_dir, + preserve_model_interval=100, + remove_egs=True, + get_raw_nnet_from_am=True, +): try: if remove_egs: remove_nnet_egs(egs_dir) for iter in range(num_iters): - remove_model(nnet_dir, iter, num_iters, None, - preserve_model_interval, - get_raw_nnet_from_am=get_raw_nnet_from_am) + remove_model( + nnet_dir, + iter, + num_iters, + None, + preserve_model_interval, + get_raw_nnet_from_am=get_raw_nnet_from_am, + ) except (IOError, OSError): logger.error("Error while cleaning up the nnet directory") raise -def remove_model(nnet_dir, iter, num_iters, models_to_combine=None, - preserve_model_interval=100, - get_raw_nnet_from_am=True): +def remove_model( + nnet_dir, + iter, + num_iters, + models_to_combine=None, + preserve_model_interval=100, + get_raw_nnet_from_am=True, +): if iter % preserve_model_interval == 0: return if models_to_combine is not None and iter in models_to_combine: return if get_raw_nnet_from_am: - file_name = '{0}/{1}.mdl'.format(nnet_dir, iter) + file_name = "{0}/{1}.mdl".format(nnet_dir, iter) else: - file_name = '{0}/{1}.raw'.format(nnet_dir, iter) + file_name = "{0}/{1}.raw".format(nnet_dir, iter) if os.path.isfile(file_name): os.remove(file_name) def positive_int(arg): - val = int(arg) - if (val <= 0): - raise argparse.ArgumentTypeError("must be positive int: '%s'" % arg) - return val + val = int(arg) + if val <= 0: + raise argparse.ArgumentTypeError("must be positive int: '%s'" % arg) + return val class CommonParser(object): @@ -710,86 +803,128 @@ class CommonParser(object): parser = argparse.ArgumentParser(add_help=False) - def __init__(self, - include_chunk_context=True, - default_chunk_left_context=0): + def __init__(self, include_chunk_context=True, default_chunk_left_context=0): # feat options - self.parser.add_argument("--feat.online-ivector-dir", type=str, - dest='online_ivector_dir', default=None, - action=common_lib.NullstrToNoneAction, - help="""directory with the ivectors extracted - in an online fashion.""") - self.parser.add_argument("--feat.cmvn-opts", type=str, - dest='cmvn_opts', default=None, - action=common_lib.NullstrToNoneAction, - help="A string specifying '--norm-means' " - "and '--norm-vars' values") + self.parser.add_argument( + "--feat.online-ivector-dir", + type=str, + dest="online_ivector_dir", + default=None, + action=common_lib.NullstrToNoneAction, + help="""directory with the ivectors extracted + in an online fashion.""", + ) + self.parser.add_argument( + "--feat.cmvn-opts", + type=str, + dest="cmvn_opts", + default=None, + action=common_lib.NullstrToNoneAction, + help="A string specifying '--norm-means' " "and '--norm-vars' values", + ) # egs extraction options. there is no point adding the chunk context # option for non-RNNs (by which we mean basic TDNN-type topologies), as # it wouldn't affect anything, so we disable them if we know in advance # that we're not supporting RNN-type topologies (as in train_dnn.py). if include_chunk_context: - self.parser.add_argument("--egs.chunk-left-context", type=int, - dest='chunk_left_context', - default=default_chunk_left_context, - help="""Number of additional frames of input + self.parser.add_argument( + "--egs.chunk-left-context", + type=int, + dest="chunk_left_context", + default=default_chunk_left_context, + help="""Number of additional frames of input to the left of the input chunk. This extra context will be used in the estimation of RNN state before prediction of the first label. 
In the case of FF-DNN this extra context will be - used to allow for frame-shifts""") - self.parser.add_argument("--egs.chunk-right-context", type=int, - dest='chunk_right_context', default=0, - help="""Number of additional frames of input + used to allow for frame-shifts""", + ) + self.parser.add_argument( + "--egs.chunk-right-context", + type=int, + dest="chunk_right_context", + default=0, + help="""Number of additional frames of input to the right of the input chunk. This extra context will be used in the estimation of bidirectional RNN state before prediction of - the first label.""") - self.parser.add_argument("--egs.chunk-left-context-initial", type=int, - dest='chunk_left_context_initial', default=-1, - help="""Number of additional frames of input + the first label.""", + ) + self.parser.add_argument( + "--egs.chunk-left-context-initial", + type=int, + dest="chunk_left_context_initial", + default=-1, + help="""Number of additional frames of input to the left of the *first* input chunk extracted from an utterance. If negative, defaults to - the same as --egs.chunk-left-context""") - self.parser.add_argument("--egs.chunk-right-context-final", type=int, - dest='chunk_right_context_final', default=-1, - help="""Number of additional frames of input + the same as --egs.chunk-left-context""", + ) + self.parser.add_argument( + "--egs.chunk-right-context-final", + type=int, + dest="chunk_right_context_final", + default=-1, + help="""Number of additional frames of input to the right of the *last* input chunk extracted from an utterance. If negative, defaults to the - same as --egs.chunk-right-context""") - self.parser.add_argument("--egs.dir", type=str, dest='egs_dir', - default=None, - action=common_lib.NullstrToNoneAction, - help="""Directory with egs. If specified this + same as --egs.chunk-right-context""", + ) + self.parser.add_argument( + "--egs.dir", + type=str, + dest="egs_dir", + default=None, + action=common_lib.NullstrToNoneAction, + help="""Directory with egs. If specified this directory will be used rather than extracting - egs""") - self.parser.add_argument("--egs.stage", type=int, dest='egs_stage', - default=0, - help="Stage at which get_egs.sh should be " - "restarted") - self.parser.add_argument("--egs.opts", type=str, dest='egs_opts', - default=None, - action=common_lib.NullstrToNoneAction, - help="""String to provide options directly - to steps/nnet3/get_egs.sh script""") + egs""", + ) + self.parser.add_argument( + "--egs.stage", + type=int, + dest="egs_stage", + default=0, + help="Stage at which get_egs.sh should be " "restarted", + ) + self.parser.add_argument( + "--egs.opts", + type=str, + dest="egs_opts", + default=None, + action=common_lib.NullstrToNoneAction, + help="""String to provide options directly + to steps/nnet3/get_egs.sh script""", + ) # trainer options - self.parser.add_argument("--trainer.srand", type=int, dest='srand', - default=0, - help="""Sets the random seed for model + self.parser.add_argument( + "--trainer.srand", + type=int, + dest="srand", + default=0, + help="""Sets the random seed for model initialization and egs shuffling. Warning: This random seed does not control all aspects of this experiment. There might be other random seeds used in other stages of the experiment like data preparation (e.g. 
volume - perturbation).""") - self.parser.add_argument("--trainer.num-epochs", type=float, - dest='num_epochs', default=8.0, - help="Number of epochs to train the model") - self.parser.add_argument("--trainer.shuffle-buffer-size", type=int, - dest='shuffle_buffer_size', default=5000, - help=""" Controls randomization of the samples + perturbation).""", + ) + self.parser.add_argument( + "--trainer.num-epochs", + type=float, + dest="num_epochs", + default=8.0, + help="Number of epochs to train the model", + ) + self.parser.add_argument( + "--trainer.shuffle-buffer-size", + type=int, + dest="shuffle_buffer_size", + default=5000, + help=""" Controls randomization of the samples on each iteration. If 0 or a large value the randomization is complete, but this will consume memory and cause spikes in disk I/O. @@ -800,32 +935,51 @@ def __init__(self, different minibatches on different iterations, since in the preconditioning method, 2 samples in the same minibatch can affect each others' - gradients.""") - self.parser.add_argument("--trainer.max-param-change", type=float, - dest='max_param_change', default=2.0, - help="""The maximum change in parameters + gradients.""", + ) + self.parser.add_argument( + "--trainer.max-param-change", + type=float, + dest="max_param_change", + default=2.0, + help="""The maximum change in parameters allowed per minibatch, measured in Frobenius - norm over the entire model""") - self.parser.add_argument("--trainer.samples-per-iter", type=int, - dest='samples_per_iter', default=400000, - help="This is really the number of egs in " - "each archive.") - self.parser.add_argument("--trainer.lda.rand-prune", type=float, - dest='rand_prune', default=4.0, - help="Value used in preconditioning " - "matrix estimation") - self.parser.add_argument("--trainer.lda.max-lda-jobs", type=int, - dest='max_lda_jobs', default=10, - help="Max number of jobs used for " - "LDA stats accumulation") - self.parser.add_argument("--trainer.presoftmax-prior-scale-power", - type=float, - dest='presoftmax_prior_scale_power', - default=-0.25, - help="Scale on presofmax prior") - self.parser.add_argument("--trainer.optimization.proportional-shrink", type=float, - dest='proportional_shrink', default=0.0, - help="""If nonzero, this will set a shrinkage (scaling) + norm over the entire model""", + ) + self.parser.add_argument( + "--trainer.samples-per-iter", + type=int, + dest="samples_per_iter", + default=400000, + help="This is really the number of egs in " "each archive.", + ) + self.parser.add_argument( + "--trainer.lda.rand-prune", + type=float, + dest="rand_prune", + default=4.0, + help="Value used in preconditioning " "matrix estimation", + ) + self.parser.add_argument( + "--trainer.lda.max-lda-jobs", + type=int, + dest="max_lda_jobs", + default=10, + help="Max number of jobs used for " "LDA stats accumulation", + ) + self.parser.add_argument( + "--trainer.presoftmax-prior-scale-power", + type=float, + dest="presoftmax_prior_scale_power", + default=-0.25, + help="Scale on presofmax prior", + ) + self.parser.add_argument( + "--trainer.optimization.proportional-shrink", + type=float, + dest="proportional_shrink", + default=0.0, + help="""If nonzero, this will set a shrinkage (scaling) factor for the parameters, whose value is set as: shrink-value=(1.0 - proportional-shrink * learning-rate), where 'learning-rate' is the learning rate being applied @@ -834,67 +988,108 @@ def __init__(self, final-effective-lrate*num-jobs-final. 
Unlike for train_rnn.py, this is applied unconditionally, it does not depend on saturation of nonlinearities. - Can be used to roughly approximate l2 regularization.""") + Can be used to roughly approximate l2 regularization.""", + ) # Parameters for the optimization self.parser.add_argument( - "--trainer.optimization.initial-effective-lrate", type=float, - dest='initial_effective_lrate', default=0.0003, - help="Learning rate used during the initial iteration") + "--trainer.optimization.initial-effective-lrate", + type=float, + dest="initial_effective_lrate", + default=0.0003, + help="Learning rate used during the initial iteration", + ) + self.parser.add_argument( + "--trainer.optimization.final-effective-lrate", + type=float, + dest="final_effective_lrate", + default=0.00003, + help="Learning rate used during the final iteration", + ) self.parser.add_argument( - "--trainer.optimization.final-effective-lrate", type=float, - dest='final_effective_lrate', default=0.00003, - help="Learning rate used during the final iteration") - self.parser.add_argument("--trainer.optimization.num-jobs-initial", - type=int, dest='num_jobs_initial', default=1, - help="Number of neural net jobs to run in " - "parallel at the start of training") - self.parser.add_argument("--trainer.optimization.num-jobs-final", - type=int, dest='num_jobs_final', default=8, - help="Number of neural net jobs to run in " - "parallel at the end of training") - self.parser.add_argument("--trainer.optimization.num-jobs-step", - type=positive_int, metavar='N', dest='num_jobs_step', default=1, + "--trainer.optimization.num-jobs-initial", + type=int, + dest="num_jobs_initial", + default=1, + help="Number of neural net jobs to run in " + "parallel at the start of training", + ) + self.parser.add_argument( + "--trainer.optimization.num-jobs-final", + type=int, + dest="num_jobs_final", + default=8, + help="Number of neural net jobs to run in " + "parallel at the end of training", + ) + self.parser.add_argument( + "--trainer.optimization.num-jobs-step", + type=positive_int, + metavar="N", + dest="num_jobs_step", + default=1, help="""Number of jobs increment, when exceeds this number. For - example, if N=3, the number of jobs may progress as 1, 2, 3, 6, 9...""") - self.parser.add_argument("--trainer.optimization.max-models-combine", - "--trainer.max-models-combine", - type=int, dest='max_models_combine', - default=20, - help="""The maximum number of models used in + example, if N=3, the number of jobs may progress as 1, 2, 3, 6, 9...""", + ) + self.parser.add_argument( + "--trainer.optimization.max-models-combine", + "--trainer.max-models-combine", + type=int, + dest="max_models_combine", + default=20, + help="""The maximum number of models used in the final model combination stage. These models will themselves be averages of - iteration-number ranges""") - self.parser.add_argument("--trainer.optimization.max-objective-evaluations", - "--trainer.max-objective-evaluations", - type=int, dest='max_objective_evaluations', - default=30, - help="""The maximum number of objective + iteration-number ranges""", + ) + self.parser.add_argument( + "--trainer.optimization.max-objective-evaluations", + "--trainer.max-objective-evaluations", + type=int, + dest="max_objective_evaluations", + default=30, + help="""The maximum number of objective evaluations in order to figure out the best number of models to combine. It helps to speedup if the number of models provided to the model combination binary is quite large (e.g. 
- several hundred).""") - self.parser.add_argument("--trainer.optimization.do-final-combination", - dest='do_final_combination', type=str, - action=common_lib.StrToBoolAction, - choices=["true", "false"], default=True, - help="""Set this to false to disable the final + several hundred).""", + ) + self.parser.add_argument( + "--trainer.optimization.do-final-combination", + dest="do_final_combination", + type=str, + action=common_lib.StrToBoolAction, + choices=["true", "false"], + default=True, + help="""Set this to false to disable the final 'combine' stage (in this case we just use the - last-numbered model as the final.mdl).""") - self.parser.add_argument("--trainer.optimization.combine-sum-to-one-penalty", - type=float, dest='combine_sum_to_one_penalty', default=0.0, - help="""This option is deprecated and does nothing.""") - self.parser.add_argument("--trainer.optimization.momentum", type=float, - dest='momentum', default=0.0, - help="""Momentum used in update computation. + last-numbered model as the final.mdl).""", + ) + self.parser.add_argument( + "--trainer.optimization.combine-sum-to-one-penalty", + type=float, + dest="combine_sum_to_one_penalty", + default=0.0, + help="""This option is deprecated and does nothing.""", + ) + self.parser.add_argument( + "--trainer.optimization.momentum", + type=float, + dest="momentum", + default=0.0, + help="""Momentum used in update computation. Note: we implemented it in such a way that it doesn't increase the effective learning - rate.""") - self.parser.add_argument("--trainer.dropout-schedule", type=str, - action=common_lib.NullstrToNoneAction, - dest='dropout_schedule', default=None, - help="""Use this to specify the dropout + rate.""", + ) + self.parser.add_argument( + "--trainer.dropout-schedule", + type=str, + action=common_lib.NullstrToNoneAction, + dest="dropout_schedule", + default=None, + help="""Use this to specify the dropout schedule. You specify a piecewise linear function on the domain [0,1], where 0 is the start and 1 is the end of training; the @@ -915,123 +1110,202 @@ def __init__(self, pattern2=func2', e.g. 'relu*=0,0.1,0 lstm*=0,0.2,0'. 
More general should precede less general patterns, as they are applied - sequentially.""") - self.parser.add_argument("--trainer.add-option", type=str, - dest='train_opts', action='append', default=[], - help="""You can use this to add arbitrary options that + sequentially.""", + ) + self.parser.add_argument( + "--trainer.add-option", + type=str, + dest="train_opts", + action="append", + default=[], + help="""You can use this to add arbitrary options that will be passed through to the core training code (nnet3-train - or nnet3-chain-train)""") - self.parser.add_argument("--trainer.optimization.backstitch-training-scale", - type=float, dest='backstitch_training_scale', - default=0.0, help="""scale of parameters changes - used in backstitch training step.""") - self.parser.add_argument("--trainer.optimization.backstitch-training-interval", - type=int, dest='backstitch_training_interval', - default=1, help="""the interval of minibatches - that backstitch training is applied on.""") - self.parser.add_argument("--trainer.compute-per-dim-accuracy", - dest='compute_per_dim_accuracy', - type=str, choices=['true', 'false'], - default=False, - action=common_lib.StrToBoolAction, - help="Compute train and validation " - "accuracy per-dim") + or nnet3-chain-train)""", + ) + self.parser.add_argument( + "--trainer.optimization.backstitch-training-scale", + type=float, + dest="backstitch_training_scale", + default=0.0, + help="""scale of parameters changes + used in backstitch training step.""", + ) + self.parser.add_argument( + "--trainer.optimization.backstitch-training-interval", + type=int, + dest="backstitch_training_interval", + default=1, + help="""the interval of minibatches + that backstitch training is applied on.""", + ) + self.parser.add_argument( + "--trainer.compute-per-dim-accuracy", + dest="compute_per_dim_accuracy", + type=str, + choices=["true", "false"], + default=False, + action=common_lib.StrToBoolAction, + help="Compute train and validation " "accuracy per-dim", + ) # General options - self.parser.add_argument("--stage", type=int, default=-4, - help="Specifies the stage of the experiment " - "to execution from") - self.parser.add_argument("--exit-stage", type=int, default=None, - help="If specified, training exits before " - "running this stage") - self.parser.add_argument("--cmd", type=str, dest="command", - action=common_lib.NullstrToNoneAction, - help="""Specifies the script to launch jobs. + self.parser.add_argument( + "--stage", + type=int, + default=-4, + help="Specifies the stage of the experiment " "to execution from", + ) + self.parser.add_argument( + "--exit-stage", + type=int, + default=None, + help="If specified, training exits before " "running this stage", + ) + self.parser.add_argument( + "--cmd", + type=str, + dest="command", + action=common_lib.NullstrToNoneAction, + help="""Specifies the script to launch jobs. e.g. queue.pl for launching on SGE cluster run.pl for launching on local machine - """, default="queue.pl") - self.parser.add_argument("--egs.cmd", type=str, dest="egs_command", - action=common_lib.NullstrToNoneAction, - help="Script to launch egs jobs") - self.parser.add_argument("--use-gpu", type=str, - choices=["true", "false", "yes", "no", "wait"], - help="Use GPU for training. 
" - "Note 'true' and 'false' are deprecated.", - default="yes") - self.parser.add_argument("--cleanup", type=str, - action=common_lib.StrToBoolAction, - choices=["true", "false"], default=True, - help="Clean up models after training") - self.parser.add_argument("--cleanup.remove-egs", type=str, - dest='remove_egs', default=True, - action=common_lib.StrToBoolAction, - choices=["true", "false"], - help="If true, remove egs after experiment") - self.parser.add_argument("--cleanup.preserve-model-interval", - dest="preserve_model_interval", - type=int, default=100, - help="""Determines iterations for which models + """, + default="queue.pl", + ) + self.parser.add_argument( + "--egs.cmd", + type=str, + dest="egs_command", + action=common_lib.NullstrToNoneAction, + help="Script to launch egs jobs", + ) + self.parser.add_argument( + "--use-gpu", + type=str, + choices=["true", "false", "yes", "no", "wait"], + help="Use GPU for training. " "Note 'true' and 'false' are deprecated.", + default="yes", + ) + self.parser.add_argument( + "--cleanup", + type=str, + action=common_lib.StrToBoolAction, + choices=["true", "false"], + default=True, + help="Clean up models after training", + ) + self.parser.add_argument( + "--cleanup.remove-egs", + type=str, + dest="remove_egs", + default=True, + action=common_lib.StrToBoolAction, + choices=["true", "false"], + help="If true, remove egs after experiment", + ) + self.parser.add_argument( + "--cleanup.preserve-model-interval", + dest="preserve_model_interval", + type=int, + default=100, + help="""Determines iterations for which models will be preserved during cleanup. If mod(iter,preserve_model_interval) == 0 - model will be preserved.""") + model will be preserved.""", + ) - self.parser.add_argument("--reporting.email", dest="email", - type=str, default=None, - action=common_lib.NullstrToNoneAction, - help=""" Email-id to report about the progress + self.parser.add_argument( + "--reporting.email", + dest="email", + type=str, + default=None, + action=common_lib.NullstrToNoneAction, + help=""" Email-id to report about the progress of the experiment. NOTE: It assumes the machine on which the script is being run can send emails from command line via. mail program. The Kaldi mailing list will not support this feature. It might require local - expertise to setup. """) - self.parser.add_argument("--reporting.interval", - dest="reporting_interval", - type=float, default=0.1, - help="""Frequency with which reports have to + expertise to setup. """, + ) + self.parser.add_argument( + "--reporting.interval", + dest="reporting_interval", + type=float, + default=0.1, + help="""Frequency with which reports have to be sent, measured in terms of fraction of iterations. 
If 0 and reporting mail has been specified - then only failure notifications are sent""") + then only failure notifications are sent""", + ) import unittest -class SelfTest(unittest.TestCase): +class SelfTest(unittest.TestCase): def test_halve_minibatch_size_str(self): - self.assertEqual('32', halve_minibatch_size_str('64')) - self.assertEqual('32,8:16', halve_minibatch_size_str('64,16:32')) - self.assertEqual('1', halve_minibatch_size_str('1')) - self.assertEqual('128=32/256=20,40:50', halve_minibatch_size_str('128=64/256=40,80:100')) - + self.assertEqual("32", halve_minibatch_size_str("64")) + self.assertEqual("32,8:16", halve_minibatch_size_str("64,16:32")) + self.assertEqual("1", halve_minibatch_size_str("1")) + self.assertEqual( + "128=32/256=20,40:50", halve_minibatch_size_str("128=64/256=40,80:100") + ) def test_validate_chunk_width(self): - for s in [ '64', '64,25,128' ]: + for s in ["64", "64,25,128"]: self.assertTrue(validate_chunk_width(s), s) - def test_validate_minibatch_size_str(self): # Good descriptors. - for s in [ '32', '32,64', '1:32', '1:32,64', '64,1:32', '1:5,10:15', - '128=64:128/256=32,64', '1=2/3=4', '1=1/2=2/3=3/4=4' ]: + for s in [ + "32", + "32,64", + "1:32", + "1:32,64", + "64,1:32", + "1:5,10:15", + "128=64:128/256=32,64", + "1=2/3=4", + "1=1/2=2/3=3/4=4", + ]: self.assertTrue(validate_minibatch_size_str(s), s) # Bad descriptors. - for s in [ None, 42, (43,), '', '1:', ':2', '3,', ',4', '5:6,', ',7:8', - '9=', '10=10/', '11=11/11', '12=1:2//13=1:3' '14=/15=15', - '16/17=17', '/18=18', '/18', '//19', '/' ]: + for s in [ + None, + 42, + (43,), + "", + "1:", + ":2", + "3,", + ",4", + "5:6,", + ",7:8", + "9=", + "10=10/", + "11=11/11", + "12=1:2//13=1:3" "14=/15=15", + "16/17=17", + "/18=18", + "/18", + "//19", + "/", + ]: self.assertFalse(validate_minibatch_size_str(s), s) - def test_get_current_num_jobs(self): niters = 12 - self.assertEqual([2, 3, 3, 4, 4, 5, 6, 6, 7, 7, 8, 8], - [get_current_num_jobs(i, niters, 2, 1, 9) - for i in range(niters)]) - self.assertEqual([2, 3, 3, 3, 3, 6, 6, 6, 6, 6, 9, 9], - [get_current_num_jobs(i, niters, 2, 3, 9) - for i in range(niters)]) + self.assertEqual( + [2, 3, 3, 4, 4, 5, 6, 6, 7, 7, 8, 8], + [get_current_num_jobs(i, niters, 2, 1, 9) for i in range(niters)], + ) + self.assertEqual( + [2, 3, 3, 3, 3, 6, 6, 6, 6, 6, 9, 9], + [get_current_num_jobs(i, niters, 2, 3, 9) for i in range(niters)], + ) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/egs2/TEMPLATE/asr1/steps/libs/nnet3/train/dropout_schedule.py b/egs2/TEMPLATE/asr1/steps/libs/nnet3/train/dropout_schedule.py index fa3991a2774..5d556930c58 100644 --- a/egs2/TEMPLATE/asr1/steps/libs/nnet3/train/dropout_schedule.py +++ b/egs2/TEMPLATE/asr1/steps/libs/nnet3/train/dropout_schedule.py @@ -15,6 +15,7 @@ _debug_dropout = False + def _parse_dropout_option(dropout_option): """Parses the string option to --trainer.dropout-schedule and returns a list of dropout schedules for different component name patterns. @@ -34,31 +35,35 @@ def _parse_dropout_option(dropout_option): A data fraction of 0 corresponds to beginning of training and 1 corresponds to all data. 
""" - components = dropout_option.strip().split(' ') + components = dropout_option.strip().split(" ") dropout_schedule = [] for component in components: - parts = component.split('=') + parts = component.split("=") if len(parts) == 2: component_name = parts[0] this_dropout_str = parts[1] elif len(parts) == 1: - component_name = '*' + component_name = "*" this_dropout_str = parts[0] else: - raise Exception("The dropout schedule must be specified in the " - "format 'pattern1=func1 patter2=func2' where " - "the pattern can be omitted for a global function " - "for all components.\n" - "Got {0} in {1}".format(component, dropout_option)) + raise Exception( + "The dropout schedule must be specified in the " + "format 'pattern1=func1 patter2=func2' where " + "the pattern can be omitted for a global function " + "for all components.\n" + "Got {0} in {1}".format(component, dropout_option) + ) this_dropout_values = _parse_dropout_string(this_dropout_str) dropout_schedule.append((component_name, this_dropout_values)) if _debug_dropout: logger.info("Dropout schedules for component names is as follows:") - logger.info(": [(num_archives_processed), " - "(dropout_proportion) ...]") + logger.info( + ": [(num_archives_processed), " + "(dropout_proportion) ...]" + ) for name, schedule in dropout_schedule: logger.info("{0}: {1}".format(name, schedule)) @@ -80,17 +85,19 @@ def _parse_dropout_string(dropout_str): A data fraction of 1 corresponds to all data. """ dropout_values = [] - parts = dropout_str.strip().split(',') + parts = dropout_str.strip().split(",") try: if len(parts) < 2: - raise Exception("dropout proportion string must specify " - "at least the start and end dropouts") + raise Exception( + "dropout proportion string must specify " + "at least the start and end dropouts" + ) # Starting dropout proportion dropout_values.append((0, float(parts[0]))) for i in range(1, len(parts) - 1): - value_x_pair = parts[i].split('@') + value_x_pair = parts[i].split("@") if len(value_x_pair) == 1: # Dropout proportion at half of training dropout_proportion = float(value_x_pair[0]) @@ -101,21 +108,25 @@ def _parse_dropout_string(dropout_str): dropout_proportion = float(value_x_pair[0]) data_fraction = float(value_x_pair[1]) - if (data_fraction < dropout_values[-1][0] - or data_fraction > 1.0): + if data_fraction < dropout_values[-1][0] or data_fraction > 1.0: logger.error( "Failed while parsing value %s in dropout-schedule. " "dropout-schedule must be in incresing " - "order of data fractions.", value_x_pair) + "order of data fractions.", + value_x_pair, + ) raise ValueError dropout_values.append((data_fraction, float(dropout_proportion))) dropout_values.append((1.0, float(parts[-1]))) except Exception: - logger.error("Unable to parse dropout proportion string %s. " - "See help for option " - "--trainer.dropout-schedule.", dropout_str) + logger.error( + "Unable to parse dropout proportion string %s. " + "See help for option " + "--trainer.dropout-schedule.", + dropout_str, + ) raise # reverse sort so that its easy to retrieve the dropout proportion @@ -151,16 +162,18 @@ def _get_component_dropout(dropout_schedule, data_fraction): try: # Find lower bound of the data_fraction. This is the # lower end of the piecewise linear function. 
- (dropout_schedule_index, initial_data_fraction, - initial_dropout) = next((i, tup[0], tup[1]) - for i, tup in enumerate(dropout_schedule) - if tup[0] <= data_fraction) + (dropout_schedule_index, initial_data_fraction, initial_dropout) = next( + (i, tup[0], tup[1]) + for i, tup in enumerate(dropout_schedule) + if tup[0] <= data_fraction + ) except StopIteration: raise RuntimeError( "Could not find data_fraction in dropout schedule " "corresponding to data_fraction {0}.\n" "Maybe something wrong with the parsed " - "dropout schedule {1}.".format(data_fraction, dropout_schedule)) + "dropout schedule {1}.".format(data_fraction, dropout_schedule) + ) if dropout_schedule_index == 0: assert dropout_schedule[0][0] == 1 and data_fraction == 1 @@ -168,20 +181,19 @@ def _get_component_dropout(dropout_schedule, data_fraction): # The upper bound of data_fraction is at the index before the # lower bound. - final_data_fraction, final_dropout = dropout_schedule[ - dropout_schedule_index - 1] + final_data_fraction, final_dropout = dropout_schedule[dropout_schedule_index - 1] if final_data_fraction == initial_data_fraction: assert data_fraction == initial_data_fraction return initial_dropout - assert (data_fraction >= initial_data_fraction - and data_fraction < final_data_fraction) + assert ( + data_fraction >= initial_data_fraction and data_fraction < final_data_fraction + ) - return ((data_fraction - initial_data_fraction) - * (final_dropout - initial_dropout) - / (final_data_fraction - initial_data_fraction) - + initial_dropout) + return (data_fraction - initial_data_fraction) * ( + final_dropout - initial_dropout + ) / (final_data_fraction - initial_data_fraction) + initial_dropout def _get_dropout_proportions(dropout_schedule, data_fraction): @@ -219,10 +231,14 @@ def _get_dropout_proportions(dropout_schedule, data_fraction): dropout_proportions = [] for component_name, component_dropout_schedule in dropout_schedule: dropout_proportions.append( - (component_name, _get_component_dropout( - component_dropout_schedule, data_fraction))) + ( + component_name, + _get_component_dropout(component_dropout_schedule, data_fraction), + ) + ) return dropout_proportions + def get_dropout_edit_option(dropout_schedule, data_fraction, iter_): """Return an option to be passed to nnet3-copy (or nnet3-am-copy) that will set the appropriate dropout proportion. 
If no dropout @@ -246,8 +262,7 @@ def get_dropout_edit_option(dropout_schedule, data_fraction, iter_): if dropout_schedule is None: return "" - dropout_proportions = _get_dropout_proportions( - dropout_schedule, data_fraction) + dropout_proportions = _get_dropout_proportions(dropout_schedule, data_fraction) edit_config_lines = [] dropout_info = [] @@ -255,17 +270,21 @@ def get_dropout_edit_option(dropout_schedule, data_fraction, iter_): for component_name, dropout_proportion in dropout_proportions: edit_config_lines.append( "set-dropout-proportion name={0} proportion={1}".format( - component_name, dropout_proportion)) - dropout_info.append("pattern/dropout-proportion={0}/{1}".format( - component_name, dropout_proportion)) + component_name, dropout_proportion + ) + ) + dropout_info.append( + "pattern/dropout-proportion={0}/{1}".format( + component_name, dropout_proportion + ) + ) if _debug_dropout: - logger.info("On iteration %d, %s", iter_, ', '.join(dropout_info)) + logger.info("On iteration %d, %s", iter_, ", ".join(dropout_info)) return "--edits='{0}'".format(";".join(edit_config_lines)) - def get_dropout_edit_string(dropout_schedule, data_fraction, iter_): """Return an nnet3-copy --edits line to modify raw_model_string to set dropout proportions according to dropout_proportions. @@ -286,8 +305,7 @@ def get_dropout_edit_string(dropout_schedule, data_fraction, iter_): if dropout_schedule is None: return "" - dropout_proportions = _get_dropout_proportions( - dropout_schedule, data_fraction) + dropout_proportions = _get_dropout_proportions(dropout_schedule, data_fraction) edit_config_lines = [] dropout_info = [] @@ -295,14 +313,20 @@ def get_dropout_edit_string(dropout_schedule, data_fraction, iter_): for component_name, dropout_proportion in dropout_proportions: edit_config_lines.append( "set-dropout-proportion name={0} proportion={1}".format( - component_name, dropout_proportion)) - dropout_info.append("pattern/dropout-proportion={0}/{1}".format( - component_name, dropout_proportion)) + component_name, dropout_proportion + ) + ) + dropout_info.append( + "pattern/dropout-proportion={0}/{1}".format( + component_name, dropout_proportion + ) + ) if _debug_dropout: - logger.info("On iteration %d, %s", iter_, ', '.join(dropout_info)) - return ("""nnet3-copy --edits='{edits}' - - |""".format( - edits=";".join(edit_config_lines))) + logger.info("On iteration %d, %s", iter_, ", ".join(dropout_info)) + return """nnet3-copy --edits='{edits}' - - |""".format( + edits=";".join(edit_config_lines) + ) def _self_test(): @@ -319,53 +343,57 @@ def assert_approx_equal(list1, list2): assert list1[i][0] == list2[i][0] assert abs(list1[i][1] - list2[i][1]) < 1e-8 - assert (_parse_dropout_option('*=0.0,0.5,0.0 lstm.*=0.0,0.3@0.75,0.0') - == [ ('*', [ (1.0, 0.0), (0.5, 0.5), (0.0, 0.0) ]), - ('lstm.*', [ (1.0, 0.0), (0.75, 0.3), (0.0, 0.0) ]) ]) - assert_approx_equal(_get_dropout_proportions( - '*=0.0,0.5,0.0 lstm.*=0.0,0.3@0.75,0.0', 0.75), - [ ('*', 0.25), ('lstm.*', 0.3) ]) - assert_approx_equal(_get_dropout_proportions( - '*=0.0,0.5,0.0 lstm.*=0.0,0.3@0.75,0.0', 0.5), - [ ('*', 0.5), ('lstm.*', 0.2) ]) - assert_approx_equal(_get_dropout_proportions( - '*=0.0,0.5,0.0 lstm.*=0.0,0.3@0.75,0.0', 0.25), - [ ('*', 0.25), ('lstm.*', 0.1) ]) - - assert (_parse_dropout_option('0.0,0.3,0.0') - == [ ('*', [ (1.0, 0.0), (0.5, 0.3), (0.0, 0.0) ]) ]) - assert_approx_equal(_get_dropout_proportions('0.0,0.3,0.0', 0.5), - [ ('*', 0.3) ]) - assert_approx_equal(_get_dropout_proportions('0.0,0.3,0.0', 0.0), - [ ('*', 0.0) ]) - 
assert_approx_equal(_get_dropout_proportions('0.0,0.3,0.0', 1.0), - [ ('*', 0.0) ]) - assert_approx_equal(_get_dropout_proportions('0.0,0.3,0.0', 0.25), - [ ('*', 0.15) ]) - - assert (_parse_dropout_option('0.0,0.5@0.25,0.0,0.6@0.75,0.0') - == [ ('*', [ (1.0, 0.0), (0.75, 0.6), (0.5, 0.0), (0.25, 0.5), (0.0, 0.0) ]) ]) - assert_approx_equal(_get_dropout_proportions( - '0.0,0.5@0.25,0.0,0.6@0.75,0.0', 0.25), - [ ('*', 0.5) ]) - assert_approx_equal(_get_dropout_proportions( - '0.0,0.5@0.25,0.0,0.6@0.75,0.0', 0.1), - [ ('*', 0.2) ]) - - assert (_parse_dropout_option('lstm.*=0.0,0.3,0.0@0.75,1.0') - == [ ('lstm.*', [ (1.0, 1.0), (0.75, 0.0), (0.5, 0.3), (0.0, 0.0) ]) ]) - assert_approx_equal(_get_dropout_proportions( - 'lstm.*=0.0,0.3,0.0@0.75,1.0', 0.25), - [ ('lstm.*', 0.15) ]) - assert_approx_equal(_get_dropout_proportions( - 'lstm.*=0.0,0.3,0.0@0.75,1.0', 0.5), - [ ('lstm.*', 0.3) ]) - assert_approx_equal(_get_dropout_proportions( - 'lstm.*=0.0,0.3,0.0@0.75,1.0', 0.9), - [ ('lstm.*', 0.6) ]) - - -if __name__ == '__main__': + assert _parse_dropout_option("*=0.0,0.5,0.0 lstm.*=0.0,0.3@0.75,0.0") == [ + ("*", [(1.0, 0.0), (0.5, 0.5), (0.0, 0.0)]), + ("lstm.*", [(1.0, 0.0), (0.75, 0.3), (0.0, 0.0)]), + ] + assert_approx_equal( + _get_dropout_proportions("*=0.0,0.5,0.0 lstm.*=0.0,0.3@0.75,0.0", 0.75), + [("*", 0.25), ("lstm.*", 0.3)], + ) + assert_approx_equal( + _get_dropout_proportions("*=0.0,0.5,0.0 lstm.*=0.0,0.3@0.75,0.0", 0.5), + [("*", 0.5), ("lstm.*", 0.2)], + ) + assert_approx_equal( + _get_dropout_proportions("*=0.0,0.5,0.0 lstm.*=0.0,0.3@0.75,0.0", 0.25), + [("*", 0.25), ("lstm.*", 0.1)], + ) + + assert _parse_dropout_option("0.0,0.3,0.0") == [ + ("*", [(1.0, 0.0), (0.5, 0.3), (0.0, 0.0)]) + ] + assert_approx_equal(_get_dropout_proportions("0.0,0.3,0.0", 0.5), [("*", 0.3)]) + assert_approx_equal(_get_dropout_proportions("0.0,0.3,0.0", 0.0), [("*", 0.0)]) + assert_approx_equal(_get_dropout_proportions("0.0,0.3,0.0", 1.0), [("*", 0.0)]) + assert_approx_equal(_get_dropout_proportions("0.0,0.3,0.0", 0.25), [("*", 0.15)]) + + assert _parse_dropout_option("0.0,0.5@0.25,0.0,0.6@0.75,0.0") == [ + ("*", [(1.0, 0.0), (0.75, 0.6), (0.5, 0.0), (0.25, 0.5), (0.0, 0.0)]) + ] + assert_approx_equal( + _get_dropout_proportions("0.0,0.5@0.25,0.0,0.6@0.75,0.0", 0.25), [("*", 0.5)] + ) + assert_approx_equal( + _get_dropout_proportions("0.0,0.5@0.25,0.0,0.6@0.75,0.0", 0.1), [("*", 0.2)] + ) + + assert _parse_dropout_option("lstm.*=0.0,0.3,0.0@0.75,1.0") == [ + ("lstm.*", [(1.0, 1.0), (0.75, 0.0), (0.5, 0.3), (0.0, 0.0)]) + ] + assert_approx_equal( + _get_dropout_proportions("lstm.*=0.0,0.3,0.0@0.75,1.0", 0.25), + [("lstm.*", 0.15)], + ) + assert_approx_equal( + _get_dropout_proportions("lstm.*=0.0,0.3,0.0@0.75,1.0", 0.5), [("lstm.*", 0.3)] + ) + assert_approx_equal( + _get_dropout_proportions("lstm.*=0.0,0.3,0.0@0.75,1.0", 0.9), [("lstm.*", 0.6)] + ) + + +if __name__ == "__main__": try: _self_test() except Exception: diff --git a/egs2/TEMPLATE/asr1/steps/libs/nnet3/train/frame_level_objf/__init__.py b/egs2/TEMPLATE/asr1/steps/libs/nnet3/train/frame_level_objf/__init__.py index 46fd0f36553..e447125654a 100644 --- a/egs2/TEMPLATE/asr1/steps/libs/nnet3/train/frame_level_objf/__init__.py +++ b/egs2/TEMPLATE/asr1/steps/libs/nnet3/train/frame_level_objf/__init__.py @@ -1,5 +1,3 @@ - - # Copyright 2016 Vimal Manohar # Apache 2.0 diff --git a/egs2/TEMPLATE/asr1/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py b/egs2/TEMPLATE/asr1/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py index 
4a39ed9dae6..1f994466f57 100644 --- a/egs2/TEMPLATE/asr1/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py +++ b/egs2/TEMPLATE/asr1/steps/libs/nnet3/train/frame_level_objf/acoustic_model.py @@ -1,5 +1,3 @@ - - # Copyright 2016 Vijayaditya Peddinti. # 2016 Vimal Manohar # Apache 2.0. @@ -18,15 +16,24 @@ logger.addHandler(logging.NullHandler()) -def generate_egs(data, alidir, egs_dir, - left_context, right_context, - run_opts, stage=0, - left_context_initial=-1, right_context_final=-1, - online_ivector_dir=None, - samples_per_iter=20000, frames_per_eg_str="20", srand=0, - egs_opts=None, cmvn_opts=None): - - """ Wrapper for calling steps/nnet3/get_egs.sh +def generate_egs( + data, + alidir, + egs_dir, + left_context, + right_context, + run_opts, + stage=0, + left_context_initial=-1, + right_context_final=-1, + online_ivector_dir=None, + samples_per_iter=20000, + frames_per_eg_str="20", + srand=0, + egs_opts=None, + cmvn_opts=None, +): + """Wrapper for calling steps/nnet3/get_egs.sh Generates targets from alignment directory 'alidir', which contains the model final.mdl and alignments. @@ -46,35 +53,38 @@ def generate_egs(data, alidir, egs_dir, --frames-per-eg {frames_per_eg_str} \ --srand {srand} \ {data} {alidir} {egs_dir} - """.format(command=run_opts.egs_command, - cmvn_opts=cmvn_opts if cmvn_opts is not None else '', - ivector_dir=(online_ivector_dir - if online_ivector_dir is not None - else ''), - left_context=left_context, - right_context=right_context, - left_context_initial=left_context_initial, - right_context_final=right_context_final, - stage=stage, samples_per_iter=samples_per_iter, - frames_per_eg_str=frames_per_eg_str, srand=srand, data=data, - alidir=alidir, egs_dir=egs_dir, - egs_opts=egs_opts if egs_opts is not None else '')) - - -def prepare_initial_acoustic_model(dir, alidir, run_opts, - srand=-3, input_model=None): - """ Adds the first layer; this will also add in the lda.mat and - presoftmax_prior_scale.vec. It will also prepare the acoustic model - with the transition model. - If 'input_model' is specified, no initial network preparation(adding - the first layer) is done and this model is used as initial 'raw' model - instead of '0.raw' model to prepare '0.mdl' as acoustic model by adding the - transition model. + """.format( + command=run_opts.egs_command, + cmvn_opts=cmvn_opts if cmvn_opts is not None else "", + ivector_dir=(online_ivector_dir if online_ivector_dir is not None else ""), + left_context=left_context, + right_context=right_context, + left_context_initial=left_context_initial, + right_context_final=right_context_final, + stage=stage, + samples_per_iter=samples_per_iter, + frames_per_eg_str=frames_per_eg_str, + srand=srand, + data=data, + alidir=alidir, + egs_dir=egs_dir, + egs_opts=egs_opts if egs_opts is not None else "", + ) + ) + + +def prepare_initial_acoustic_model(dir, alidir, run_opts, srand=-3, input_model=None): + """Adds the first layer; this will also add in the lda.mat and + presoftmax_prior_scale.vec. It will also prepare the acoustic model + with the transition model. + If 'input_model' is specified, no initial network preparation(adding + the first layer) is done and this model is used as initial 'raw' model + instead of '0.raw' model to prepare '0.mdl' as acoustic model by adding the + transition model. """ if input_model is None: - common_train_lib.prepare_initial_network(dir, run_opts, - srand=srand) + common_train_lib.prepare_initial_network(dir, run_opts, srand=srand) # Convert to .mdl, train the transitions, set the priors. 
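The command assembled in the next step expands roughly as follows; the paths and the log-file name here are hypothetical, chosen only to show the shape of the `nnet3-am-init | nnet3-am-train-transitions` pipeline:

```python
# Hypothetical paths and log-file name, to show the command this step emits.
command, dir_, alidir, input_model = "run.pl", "exp/tdnn_a", "exp/tri3_ali", None
raw_mdl = input_model if input_model is not None else "{0}/0.raw".format(dir_)
cmd = """{command} {dir}/log/init_mdl.log \\
    nnet3-am-init {alidir}/final.mdl {raw_mdl} - \\| \\
    nnet3-am-train-transitions - \\
    "ark:gunzip -c {alidir}/ali.*.gz|" {dir}/0.mdl""".format(
    command=command, dir=dir_, alidir=alidir, raw_mdl=raw_mdl
)
print(cmd)
```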
common_lib.execute_command( @@ -82,7 +92,12 @@ def prepare_initial_acoustic_model(dir, alidir, run_opts, nnet3-am-init {alidir}/final.mdl {raw_mdl} - \| \ nnet3-am-train-transitions - \ "ark:gunzip -c {alidir}/ali.*.gz|" {dir}/0.mdl - """.format(command=run_opts.command, - dir=dir, alidir=alidir, - raw_mdl=(input_model if input_model is not None - else '{0}/0.raw'.format(dir)))) + """.format( + command=run_opts.command, + dir=dir, + alidir=alidir, + raw_mdl=( + input_model if input_model is not None else "{0}/0.raw".format(dir) + ), + ) + ) diff --git a/egs2/TEMPLATE/asr1/steps/libs/nnet3/train/frame_level_objf/common.py b/egs2/TEMPLATE/asr1/steps/libs/nnet3/train/frame_level_objf/common.py index f2722350e41..39c8665b92f 100644 --- a/egs2/TEMPLATE/asr1/steps/libs/nnet3/train/frame_level_objf/common.py +++ b/egs2/TEMPLATE/asr1/steps/libs/nnet3/train/frame_level_objf/common.py @@ -1,4 +1,3 @@ - # Copyright 2016 Vijayaditya Peddinti. # 2016 Vimal Manohar # 2017 Johns Hopkins University (author: Daniel Povey) @@ -25,17 +24,30 @@ logger.addHandler(logging.NullHandler()) -def train_new_models(dir, iter, srand, num_jobs, - num_archives_processed, num_archives, - raw_model_string, egs_dir, - momentum, max_param_change, - shuffle_buffer_size, minibatch_size_str, - image_augmentation_opts, - run_opts, frames_per_eg=-1, - min_deriv_time=None, max_deriv_time_relative=None, - use_multitask_egs=False, train_opts="", - backstitch_training_scale=0.0, backstitch_training_interval=1): - """ Called from train_one_iteration(), this model does one iteration of +def train_new_models( + dir, + iter, + srand, + num_jobs, + num_archives_processed, + num_archives, + raw_model_string, + egs_dir, + momentum, + max_param_change, + shuffle_buffer_size, + minibatch_size_str, + image_augmentation_opts, + run_opts, + frames_per_eg=-1, + min_deriv_time=None, + max_deriv_time_relative=None, + use_multitask_egs=False, + train_opts="", + backstitch_training_scale=0.0, + backstitch_training_interval=1, +): + """Called from train_one_iteration(), this model does one iteration of training with 'num_jobs' jobs, and writes files like exp/tdnn_a/24.{1,2,3,..}.raw @@ -70,11 +82,13 @@ def train_new_models(dir, iter, srand, num_jobs, deriv_time_opts = [] if min_deriv_time is not None: - deriv_time_opts.append("--optimization.min-deriv-time={0}".format( - min_deriv_time)) + deriv_time_opts.append( + "--optimization.min-deriv-time={0}".format(min_deriv_time) + ) if max_deriv_time_relative is not None: - deriv_time_opts.append("--optimization.max-deriv-time-relative={0}".format( - max_deriv_time_relative)) + deriv_time_opts.append( + "--optimization.max-deriv-time-relative={0}".format(max_deriv_time_relative) + ) threads = [] @@ -83,9 +97,9 @@ def train_new_models(dir, iter, srand, num_jobs, # iteration. Don't do it on iteration 0 either, because we use a smaller # than normal minibatch size, and people may get confused thinking it's # slower for iteration 0 because of the verbose option. - verbose_opt = ("--verbose=1" if iter % 20 == 0 and iter > 0 else "") + verbose_opt = "--verbose=1" if iter % 20 == 0 and iter > 0 else "" - for job in range(1, num_jobs+1): + for job in range(1, num_jobs + 1): # k is a zero-based index that we will derive the other indexes from. 
k = num_archives_processed + job - 1 @@ -95,44 +109,43 @@ def train_new_models(dir, iter, srand, num_jobs, if not chunk_level_training: frame = (k // num_archives + archive_index) % frames_per_eg - cache_io_opts = (("--read-cache={dir}/cache.{iter}".format(dir=dir, - iter=iter) - if iter > 0 else "") + - (" --write-cache={0}/cache.{1}".format(dir, iter + 1) - if job == 1 else "")) + cache_io_opts = ( + "--read-cache={dir}/cache.{iter}".format(dir=dir, iter=iter) + if iter > 0 + else "" + ) + (" --write-cache={0}/cache.{1}".format(dir, iter + 1) if job == 1 else "") if image_augmentation_opts: - image_augmentation_cmd = ( - 'nnet3-egs-augment-image --srand={srand} {aug_opts} ark:- ark:- |'.format( - srand=k+srand, - aug_opts=image_augmentation_opts)) + image_augmentation_cmd = "nnet3-egs-augment-image --srand={srand} {aug_opts} ark:- ark:- |".format( + srand=k + srand, aug_opts=image_augmentation_opts + ) else: - image_augmentation_cmd = '' - + image_augmentation_cmd = "" multitask_egs_opts = common_train_lib.get_multitask_egs_opts( egs_dir, egs_prefix="egs.", archive_index=archive_index, - use_multitask_egs=use_multitask_egs) + use_multitask_egs=use_multitask_egs, + ) scp_or_ark = "scp" if use_multitask_egs else "ark" - egs_rspecifier = ( - """ark,bg:nnet3-copy-egs {frame_opts} {multitask_egs_opts} \ + egs_rspecifier = """ark,bg:nnet3-copy-egs {frame_opts} {multitask_egs_opts} \ {scp_or_ark}:{egs_dir}/egs.{archive_index}.{scp_or_ark} ark:- | \ nnet3-shuffle-egs --buffer-size={shuffle_buffer_size} \ --srand={srand} ark:- ark:- | {aug_cmd} \ nnet3-merge-egs --minibatch-size={minibatch_size} ark:- ark:- |""".format( - frame_opts=("" if chunk_level_training - else "--frame={0}".format(frame)), - egs_dir=egs_dir, archive_index=archive_index, - shuffle_buffer_size=shuffle_buffer_size, - minibatch_size=minibatch_size_str, - aug_cmd=image_augmentation_cmd, - srand=iter+srand, - scp_or_ark=scp_or_ark, - multitask_egs_opts=multitask_egs_opts)) + frame_opts=("" if chunk_level_training else "--frame={0}".format(frame)), + egs_dir=egs_dir, + archive_index=archive_index, + shuffle_buffer_size=shuffle_buffer_size, + minibatch_size=minibatch_size_str, + aug_cmd=image_augmentation_cmd, + srand=iter + srand, + scp_or_ark=scp_or_ark, + multitask_egs_opts=multitask_egs_opts, + ) # note: the thread waits on that process's completion. 
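A toy walk-through of the indexing above; `archive_index` is assumed to be `k % num_archives + 1`, from the elided surrounding context:

```python
# How jobs cycle through archives, and how the frame offset rotates per pass.
num_archives, frames_per_eg = 4, 8
for num_archives_processed in (0, 4):
    for job in (1, 2, 3):
        k = num_archives_processed + job - 1
        archive_index = (k % num_archives) + 1
        frame = (k // num_archives + archive_index) % frames_per_eg
        print(k, archive_index, frame)
```

Each pass over the archives bumps `k // num_archives`, so the same archive contributes a different frame offset on the next pass.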
thread = common_lib.background_command( @@ -149,21 +162,26 @@ def train_new_models(dir, iter, srand, num_jobs, {dir}/{next_iter}.{job}.raw""".format( command=run_opts.command, train_queue_opt=run_opts.train_queue_opt, - dir=dir, iter=iter, - next_iter=iter + 1, srand=iter + srand, + dir=dir, + iter=iter, + next_iter=iter + 1, + srand=iter + srand, job=job, parallel_train_opts=run_opts.parallel_train_opts, cache_io_opts=cache_io_opts, verbose_opt=verbose_opt, - momentum=momentum, max_param_change=max_param_change, - l2_regularize_factor=1.0/num_jobs, + momentum=momentum, + max_param_change=max_param_change, + l2_regularize_factor=1.0 / num_jobs, backstitch_training_scale=backstitch_training_scale, backstitch_training_interval=backstitch_training_interval, train_opts=train_opts, deriv_time_opts=" ".join(deriv_time_opts), raw_model=raw_model_string, - egs_rspecifier=egs_rspecifier), - require_zero_status=True) + egs_rspecifier=egs_rspecifier, + ), + require_zero_status=True, + ) threads.append(thread) @@ -171,18 +189,34 @@ def train_new_models(dir, iter, srand, num_jobs, thread.join() -def train_one_iteration(dir, iter, srand, egs_dir, - num_jobs, num_archives_processed, num_archives, - learning_rate, minibatch_size_str, - momentum, max_param_change, shuffle_buffer_size, - run_opts, image_augmentation_opts=None, - frames_per_eg=-1, - min_deriv_time=None, max_deriv_time_relative=None, - shrinkage_value=1.0, dropout_edit_string="", train_opts="", - get_raw_nnet_from_am=True, use_multitask_egs=False, - backstitch_training_scale=0.0, backstitch_training_interval=1, - compute_per_dim_accuracy=False): - """ Called from steps/nnet3/train_*.py scripts for one iteration of neural +def train_one_iteration( + dir, + iter, + srand, + egs_dir, + num_jobs, + num_archives_processed, + num_archives, + learning_rate, + minibatch_size_str, + momentum, + max_param_change, + shuffle_buffer_size, + run_opts, + image_augmentation_opts=None, + frames_per_eg=-1, + min_deriv_time=None, + max_deriv_time_relative=None, + shrinkage_value=1.0, + dropout_edit_string="", + train_opts="", + get_raw_nnet_from_am=True, + use_multitask_egs=False, + backstitch_training_scale=0.0, + backstitch_training_interval=1, + compute_per_dim_accuracy=False, +): + """Called from steps/nnet3/train_*.py scripts for one iteration of neural network training Selected args: @@ -202,45 +236,56 @@ def train_one_iteration(dir, iter, srand, egs_dir, # Use the egs dir from the previous iteration for the diagnostics # check if different iterations use the same random seed - if os.path.exists('{0}/srand'.format(dir)): + if os.path.exists("{0}/srand".format(dir)): try: - saved_srand = int(open('{0}/srand'.format(dir)).readline().strip()) + saved_srand = int(open("{0}/srand".format(dir)).readline().strip()) except (IOError, ValueError): - logger.error("Exception while reading the random seed " - "for training") + logger.error("Exception while reading the random seed " "for training") raise if srand != saved_srand: - logger.warning("The random seed provided to this iteration " - "(srand={0}) is different from the one saved last " - "time (srand={1}). Using srand={0}.".format( - srand, saved_srand)) + logger.warning( + "The random seed provided to this iteration " + "(srand={0}) is different from the one saved last " + "time (srand={1}). 
Using srand={0}.".format(srand, saved_srand) + ) else: - with open('{0}/srand'.format(dir), 'w') as f: + with open("{0}/srand".format(dir), "w") as f: f.write(str(srand)) # Sets off some background jobs to compute train and # validation set objectives compute_train_cv_probabilities( - dir=dir, iter=iter, egs_dir=egs_dir, + dir=dir, + iter=iter, + egs_dir=egs_dir, run_opts=run_opts, get_raw_nnet_from_am=get_raw_nnet_from_am, use_multitask_egs=use_multitask_egs, - compute_per_dim_accuracy=compute_per_dim_accuracy) + compute_per_dim_accuracy=compute_per_dim_accuracy, + ) if iter > 0: # Runs in the background - compute_progress(dir=dir, iter=iter, egs_dir=egs_dir, - run_opts=run_opts, - get_raw_nnet_from_am=get_raw_nnet_from_am) - - do_average = (iter > 0) + compute_progress( + dir=dir, + iter=iter, + egs_dir=egs_dir, + run_opts=run_opts, + get_raw_nnet_from_am=get_raw_nnet_from_am, + ) + do_average = iter > 0 - raw_model_string = ("nnet3-copy --learning-rate={lr} --scale={s} " - "{dir}/{iter}.{suf} - |".format( - lr=learning_rate, s=shrinkage_value, - suf="mdl" if get_raw_nnet_from_am else "raw", - dir=dir, iter=iter)) + raw_model_string = ( + "nnet3-copy --learning-rate={lr} --scale={s} " + "{dir}/{iter}.{suf} - |".format( + lr=learning_rate, + s=shrinkage_value, + suf="mdl" if get_raw_nnet_from_am else "raw", + dir=dir, + iter=iter, + ) + ) raw_model_string = raw_model_string + dropout_edit_string @@ -253,28 +298,38 @@ def train_one_iteration(dir, iter, srand, egs_dir, # always helpful when the model is changing too fast (i.e. it can worsen # the objective function), and the smaller minibatch size will help to # keep the update stable. - cur_minibatch_size_str = common_train_lib.halve_minibatch_size_str(minibatch_size_str) + cur_minibatch_size_str = common_train_lib.halve_minibatch_size_str( + minibatch_size_str + ) cur_max_param_change = float(max_param_change) / math.sqrt(2) - train_new_models(dir=dir, iter=iter, srand=srand, num_jobs=num_jobs, - num_archives_processed=num_archives_processed, - num_archives=num_archives, - raw_model_string=raw_model_string, egs_dir=egs_dir, - momentum=momentum, max_param_change=cur_max_param_change, - shuffle_buffer_size=shuffle_buffer_size, - minibatch_size_str=cur_minibatch_size_str, - run_opts=run_opts, - frames_per_eg=frames_per_eg, - min_deriv_time=min_deriv_time, - max_deriv_time_relative=max_deriv_time_relative, - image_augmentation_opts=image_augmentation_opts, - use_multitask_egs=use_multitask_egs, - train_opts=train_opts, - backstitch_training_scale=backstitch_training_scale, - backstitch_training_interval=backstitch_training_interval) + train_new_models( + dir=dir, + iter=iter, + srand=srand, + num_jobs=num_jobs, + num_archives_processed=num_archives_processed, + num_archives=num_archives, + raw_model_string=raw_model_string, + egs_dir=egs_dir, + momentum=momentum, + max_param_change=cur_max_param_change, + shuffle_buffer_size=shuffle_buffer_size, + minibatch_size_str=cur_minibatch_size_str, + run_opts=run_opts, + frames_per_eg=frames_per_eg, + min_deriv_time=min_deriv_time, + max_deriv_time_relative=max_deriv_time_relative, + image_augmentation_opts=image_augmentation_opts, + use_multitask_egs=use_multitask_egs, + train_opts=train_opts, + backstitch_training_scale=backstitch_training_scale, + backstitch_training_interval=backstitch_training_interval, + ) [models_to_average, best_model] = common_train_lib.get_successful_models( - num_jobs, '{0}/log/train.{1}.%.log'.format(dir, iter)) + num_jobs, "{0}/log/train.{1}.%.log".format(dir, iter) + ) 
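The minibatch halving applied above behaves as the `SelfTest` expectations earlier suggest; here is a minimal re-implementation for illustration, not the library function:

```python
def sketch_halve_minibatch_size_str(s):
    # Plain sizes are halved (floored at 1); "cond=sizes" rules keep the
    # condition and halve the sizes; "lo:hi" ranges halve both ends.
    def halve_int(tok):
        return str(max(1, int(tok) // 2))

    def halve_sizes(sizes):
        out = []
        for item in sizes.split(","):
            if ":" in item:
                lo, hi = item.split(":")
                out.append(halve_int(lo) + ":" + halve_int(hi))
            else:
                out.append(halve_int(item))
        return ",".join(out)

    parts = []
    for rule in s.split("/"):
        if "=" in rule:
            cond, sizes = rule.split("=")
            parts.append(cond + "=" + halve_sizes(sizes))
        else:
            parts.append(halve_sizes(rule))
    return "/".join(parts)

assert sketch_halve_minibatch_size_str("64,16:32") == "32,8:16"
assert sketch_halve_minibatch_size_str("128=64/256=40,80:100") == "128=32/256=20,40:50"
assert sketch_halve_minibatch_size_str("1") == "1"
```

Alongside the halved minibatch, `max_param_change` is divided by `sqrt(2)` rather than 2, keeping the per-minibatch parameter-change budget consistent with the smaller batches.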
nnets_list = [] for n in models_to_average: nnets_list.append("{0}/{1}.{2}.raw".format(dir, iter + 1, n)) @@ -282,18 +337,22 @@ def train_one_iteration(dir, iter, srand, egs_dir, if do_average: # average the output of the different jobs. common_train_lib.get_average_nnet_model( - dir=dir, iter=iter, + dir=dir, + iter=iter, nnets_list=" ".join(nnets_list), run_opts=run_opts, - get_raw_nnet_from_am=get_raw_nnet_from_am) + get_raw_nnet_from_am=get_raw_nnet_from_am, + ) else: # choose the best model from different jobs common_train_lib.get_best_nnet_model( - dir=dir, iter=iter, + dir=dir, + iter=iter, best_model_index=best_model, run_opts=run_opts, - get_raw_nnet_from_am=get_raw_nnet_from_am) + get_raw_nnet_from_am=get_raw_nnet_from_am, + ) try: for i in range(1, num_jobs + 1): @@ -308,18 +367,28 @@ def train_one_iteration(dir, iter, srand, egs_dir, new_model = "{0}/{1}.raw".format(dir, iter + 1) if not os.path.isfile(new_model): - raise Exception("Could not find {0}, at the end of " - "iteration {1}".format(new_model, iter)) + raise Exception( + "Could not find {0}, at the end of " "iteration {1}".format(new_model, iter) + ) elif os.stat(new_model).st_size == 0: - raise Exception("{0} has size 0. Something went wrong in " - "iteration {1}".format(new_model, iter)) + raise Exception( + "{0} has size 0. Something went wrong in " + "iteration {1}".format(new_model, iter) + ) if os.path.exists("{0}/cache.{1}".format(dir, iter)): os.remove("{0}/cache.{1}".format(dir, iter)) -def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, - max_lda_jobs=None, rand_prune=4.0, - lda_opts=None, use_multitask_egs=False): +def compute_preconditioning_matrix( + dir, + egs_dir, + num_lda_jobs, + run_opts, + max_lda_jobs=None, + rand_prune=4.0, + lda_opts=None, + use_multitask_egs=False, +): if max_lda_jobs is not None: if num_lda_jobs > max_lda_jobs: num_lda_jobs = max_lda_jobs @@ -327,13 +396,18 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, egs_dir, egs_prefix="egs.", archive_index="JOB", - use_multitask_egs=use_multitask_egs) + use_multitask_egs=use_multitask_egs, + ) scp_or_ark = "scp" if use_multitask_egs else "ark" egs_rspecifier = ( "ark:nnet3-copy-egs {multitask_egs_opts} " "{scp_or_ark}:{egs_dir}/egs.JOB.{scp_or_ark} ark:- |" - "".format(egs_dir=egs_dir, scp_or_ark=scp_or_ark, - multitask_egs_opts=multitask_egs_opts)) + "".format( + egs_dir=egs_dir, + scp_or_ark=scp_or_ark, + multitask_egs_opts=multitask_egs_opts, + ) + ) # Write stats with the same format as stats for LDA. 
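The per-job stats files summed in the next step are laid out as follows (hypothetical directory and job count):

```python
# Each job writes dir/JOB.lda_stats; they are then summed into dir/lda_stats.
dir_, num_lda_jobs = "exp/tdnn_a", 3
lda_stat_files = [
    "{0}/{1}.lda_stats".format(dir_, x) for x in range(1, num_lda_jobs + 1)
]
print(" ".join(lda_stat_files))
# exp/tdnn_a/1.lda_stats exp/tdnn_a/2.lda_stats exp/tdnn_a/3.lda_stats
```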
common_lib.execute_command( @@ -341,27 +415,31 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, nnet3-acc-lda-stats --rand-prune={rand_prune} \ {dir}/init.raw "{egs_rspecifier}" \ {dir}/JOB.lda_stats""".format( - command=run_opts.command, - num_lda_jobs=num_lda_jobs, - dir=dir, - egs_rspecifier=egs_rspecifier, - rand_prune=rand_prune)) + command=run_opts.command, + num_lda_jobs=num_lda_jobs, + dir=dir, + egs_rspecifier=egs_rspecifier, + rand_prune=rand_prune, + ) + ) # the above command would have generated dir/{1..num_lda_jobs}.lda_stats - lda_stat_files = ['{0}/{1}.lda_stats'.format(dir, x) for x in range(1, num_lda_jobs + 1)] + lda_stat_files = [ + "{0}/{1}.lda_stats".format(dir, x) for x in range(1, num_lda_jobs + 1) + ] common_lib.execute_command( """{command} {dir}/log/sum_transform_stats.log \ sum-lda-accs {dir}/lda_stats {lda_stat_files}""".format( - command=run_opts.command, - dir=dir, lda_stat_files=" ".join(lda_stat_files))) + command=run_opts.command, dir=dir, lda_stat_files=" ".join(lda_stat_files) + ) + ) for file in lda_stat_files: try: os.remove(file) except OSError: - logger.error("There was error while trying to remove " - "lda stat files.") + logger.error("There was error while trying to remove " "lda stat files.") raise # this computes a fixed affine transform computed in the way we described # in Appendix C.6 of http://arxiv.org/pdf/1410.7455v6.pdf; it's a scaled @@ -371,16 +449,24 @@ def compute_preconditioning_matrix(dir, egs_dir, num_lda_jobs, run_opts, """{command} {dir}/log/get_transform.log \ nnet-get-feature-transform {lda_opts} {dir}/lda.mat \ {dir}/lda_stats""".format( - command=run_opts.command, dir=dir, - lda_opts=lda_opts if lda_opts is not None else "")) + command=run_opts.command, + dir=dir, + lda_opts=lda_opts if lda_opts is not None else "", + ) + ) common_lib.force_symlink("../lda.mat", "{0}/configs/lda.mat".format(dir)) -def compute_train_cv_probabilities(dir, iter, egs_dir, run_opts, - get_raw_nnet_from_am=True, - use_multitask_egs=False, - compute_per_dim_accuracy=False): +def compute_train_cv_probabilities( + dir, + iter, + egs_dir, + run_opts, + get_raw_nnet_from_am=True, + use_multitask_egs=False, + compute_per_dim_accuracy=False, +): if get_raw_nnet_from_am: model = "{dir}/{iter}.mdl".format(dir=dir, iter=iter) else: @@ -388,17 +474,17 @@ def compute_train_cv_probabilities(dir, iter, egs_dir, run_opts, scp_or_ark = "scp" if use_multitask_egs else "ark" egs_suffix = ".scp" if use_multitask_egs else ".egs" - egs_rspecifier = ("{0}:{1}/valid_diagnostic{2}".format( - scp_or_ark, egs_dir, egs_suffix)) + egs_rspecifier = "{0}:{1}/valid_diagnostic{2}".format( + scp_or_ark, egs_dir, egs_suffix + ) opts = [] if compute_per_dim_accuracy: opts.append("--compute-per-dim-accuracy") multitask_egs_opts = common_train_lib.get_multitask_egs_opts( - egs_dir, - egs_prefix="valid_diagnostic.", - use_multitask_egs=use_multitask_egs) + egs_dir, egs_prefix="valid_diagnostic.", use_multitask_egs=use_multitask_egs + ) common_lib.background_command( """ {command} {dir}/log/compute_prob_valid.{iter}.log \ @@ -406,20 +492,24 @@ def compute_train_cv_probabilities(dir, iter, egs_dir, run_opts, "ark,bg:nnet3-copy-egs {multitask_egs_opts} \ {egs_rspecifier} ark:- | \ nnet3-merge-egs --minibatch-size=1:64 ark:- \ - ark:- |" """.format(command=run_opts.command, - dir=dir, - iter=iter, - egs_rspecifier=egs_rspecifier, - opts=' '.join(opts), model=model, - multitask_egs_opts=multitask_egs_opts)) - - egs_rspecifier = ("{0}:{1}/train_diagnostic{2}".format( 
- scp_or_ark, egs_dir, egs_suffix)) + ark:- |" """.format( + command=run_opts.command, + dir=dir, + iter=iter, + egs_rspecifier=egs_rspecifier, + opts=" ".join(opts), + model=model, + multitask_egs_opts=multitask_egs_opts, + ) + ) + + egs_rspecifier = "{0}:{1}/train_diagnostic{2}".format( + scp_or_ark, egs_dir, egs_suffix + ) multitask_egs_opts = common_train_lib.get_multitask_egs_opts( - egs_dir, - egs_prefix="train_diagnostic.", - use_multitask_egs=use_multitask_egs) + egs_dir, egs_prefix="train_diagnostic.", use_multitask_egs=use_multitask_egs + ) common_lib.background_command( """{command} {dir}/log/compute_prob_train.{iter}.log \ @@ -427,27 +517,35 @@ def compute_train_cv_probabilities(dir, iter, egs_dir, run_opts, "ark,bg:nnet3-copy-egs {multitask_egs_opts} \ {egs_rspecifier} ark:- | \ nnet3-merge-egs --minibatch-size=1:64 ark:- \ - ark:- |" """.format(command=run_opts.command, - dir=dir, - iter=iter, - egs_rspecifier=egs_rspecifier, - opts=' '.join(opts), model=model, - multitask_egs_opts=multitask_egs_opts)) + ark:- |" """.format( + command=run_opts.command, + dir=dir, + iter=iter, + egs_rspecifier=egs_rspecifier, + opts=" ".join(opts), + model=model, + multitask_egs_opts=multitask_egs_opts, + ) + ) -def compute_progress(dir, iter, egs_dir, - run_opts, - get_raw_nnet_from_am=True): +def compute_progress(dir, iter, egs_dir, run_opts, get_raw_nnet_from_am=True): suffix = "mdl" if get_raw_nnet_from_am else "raw" - prev_model = '{0}/{1}.{2}'.format(dir, iter - 1, suffix) - model = '{0}/{1}.{2}'.format(dir, iter, suffix) + prev_model = "{0}/{1}.{2}".format(dir, iter - 1, suffix) + model = "{0}/{1}.{2}".format(dir, iter, suffix) common_lib.background_command( - """{command} {dir}/log/progress.{iter}.log \ + """{command} {dir}/log/progress.{iter}.log \ nnet3-info {model} '&&' \ nnet3-show-progress --use-gpu=no {prev_model} {model} """ - ''.format(command=run_opts.command, dir=dir, - iter=iter, model=model, prev_model=prev_model)) + "".format( + command=run_opts.command, + dir=dir, + iter=iter, + model=model, + prev_model=prev_model, + ) + ) if iter % 10 == 0 and iter > 0: # Every 10 iters, print some more detailed information. @@ -456,31 +554,39 @@ def compute_progress(dir, iter, egs_dir, common_lib.background_command( """{command} {dir}/log/full_progress.{iter}.log \ nnet3-show-progress --use-gpu=no --verbose=2 {prev_model} {model} - """.format(command=run_opts.command, - dir=dir, - iter=iter, - model=model, - prev_model=prev_model)) + """.format( + command=run_opts.command, + dir=dir, + iter=iter, + model=model, + prev_model=prev_model, + ) + ) # full_info.X.log is just the nnet3-info of the model, with the --verbose=2 # option which includes stats on the singular values of the parameter matrices. 
common_lib.background_command( """{command} {dir}/log/full_info.{iter}.log \ nnet3-info --verbose=2 {model} - """.format(command=run_opts.command, - dir=dir, - iter=iter, - model=model)) - - - -def combine_models(dir, num_iters, models_to_combine, egs_dir, - minibatch_size_str, - run_opts, - chunk_width=None, get_raw_nnet_from_am=True, - max_objective_evaluations=30, - use_multitask_egs=False, - compute_per_dim_accuracy=False): - """ Function to do model combination + """.format( + command=run_opts.command, dir=dir, iter=iter, model=model + ) + ) + + +def combine_models( + dir, + num_iters, + models_to_combine, + egs_dir, + minibatch_size_str, + run_opts, + chunk_width=None, + get_raw_nnet_from_am=True, + max_objective_evaluations=30, + use_multitask_egs=False, + compute_per_dim_accuracy=False, +): + """Function to do model combination In the nnet3 setup, the logic for doing averaging of subsets of the models in the case where @@ -494,17 +600,18 @@ def combine_models(dir, num_iters, models_to_combine, egs_dir, for iter in sorted(models_to_combine): suffix = "mdl" if get_raw_nnet_from_am else "raw" - model_file = '{0}/{1}.{2}'.format(dir, iter, suffix) + model_file = "{0}/{1}.{2}".format(dir, iter, suffix) if not os.path.exists(model_file): - raise Exception('Model file {0} missing'.format(model_file)) + raise Exception("Model file {0} missing".format(model_file)) raw_model_strings.append(model_file) if get_raw_nnet_from_am: - out_model = ("| nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl " - "{dir}/combined.mdl".format(dir=dir, num_iters=num_iters)) + out_model = ( + "| nnet3-am-copy --set-raw-nnet=- {dir}/{num_iters}.mdl " + "{dir}/combined.mdl".format(dir=dir, num_iters=num_iters) + ) else: - out_model = '{dir}/final.raw'.format(dir=dir) - + out_model = "{dir}/final.raw".format(dir=dir) # We reverse the order of the raw model strings so that the freshest one # goes first. 
This is important for systems that include batch @@ -517,13 +624,11 @@ def combine_models(dir, num_iters, models_to_combine, egs_dir, scp_or_ark = "scp" if use_multitask_egs else "ark" egs_suffix = ".scp" if use_multitask_egs else ".egs" - egs_rspecifier = "{0}:{1}/combine{2}".format(scp_or_ark, - egs_dir, egs_suffix) + egs_rspecifier = "{0}:{1}/combine{2}".format(scp_or_ark, egs_dir, egs_suffix) multitask_egs_opts = common_train_lib.get_multitask_egs_opts( - egs_dir, - egs_prefix="combine.", - use_multitask_egs=use_multitask_egs) + egs_dir, egs_prefix="combine.", use_multitask_egs=use_multitask_egs + ) common_lib.execute_command( """{command} {combine_queue_opt} {dir}/log/combine.log \ nnet3-combine {combine_gpu_opt} \ @@ -533,35 +638,46 @@ def combine_models(dir, num_iters, models_to_combine, egs_dir, {egs_rspecifier} ark:- | \ nnet3-merge-egs --minibatch-size=1:{mbsize} ark:- ark:- |" \ "{out_model}" - """.format(command=run_opts.command, - combine_queue_opt=run_opts.combine_queue_opt, - combine_gpu_opt=run_opts.combine_gpu_opt, - dir=dir, raw_models=" ".join(raw_model_strings), - max_objective_evaluations=max_objective_evaluations, - egs_rspecifier=egs_rspecifier, - mbsize=minibatch_size_str, - out_model=out_model, - multitask_egs_opts=multitask_egs_opts)) + """.format( + command=run_opts.command, + combine_queue_opt=run_opts.combine_queue_opt, + combine_gpu_opt=run_opts.combine_gpu_opt, + dir=dir, + raw_models=" ".join(raw_model_strings), + max_objective_evaluations=max_objective_evaluations, + egs_rspecifier=egs_rspecifier, + mbsize=minibatch_size_str, + out_model=out_model, + multitask_egs_opts=multitask_egs_opts, + ) + ) # Compute the probability of the final, combined model with # the same subset we used for the previous compute_probs, as the # different subsets will lead to different probs. if get_raw_nnet_from_am: compute_train_cv_probabilities( - dir=dir, iter='combined', egs_dir=egs_dir, - run_opts=run_opts, use_multitask_egs=use_multitask_egs, - compute_per_dim_accuracy=compute_per_dim_accuracy) + dir=dir, + iter="combined", + egs_dir=egs_dir, + run_opts=run_opts, + use_multitask_egs=use_multitask_egs, + compute_per_dim_accuracy=compute_per_dim_accuracy, + ) else: compute_train_cv_probabilities( - dir=dir, iter='final', egs_dir=egs_dir, - run_opts=run_opts, get_raw_nnet_from_am=False, + dir=dir, + iter="final", + egs_dir=egs_dir, + run_opts=run_opts, + get_raw_nnet_from_am=False, use_multitask_egs=use_multitask_egs, - compute_per_dim_accuracy=compute_per_dim_accuracy) + compute_per_dim_accuracy=compute_per_dim_accuracy, + ) -def get_realign_iters(realign_times, num_iters, - num_jobs_initial, num_jobs_final): - """ Takes the realign_times string and identifies the approximate +def get_realign_iters(realign_times, num_iters, num_jobs_initial, num_jobs_final): + """Takes the realign_times string and identifies the approximate iterations at which realignments have to be done. 
realign_times is a space separated string of values between 0 and 1
@@ -570,14 +686,14 @@ def get_realign_iters(realign_times, num_iters,
     realign_iters = []
     for realign_time in realign_times.split():
         realign_time = float(realign_time)
-        assert(realign_time > 0 and realign_time < 1)
+        assert realign_time > 0 and realign_time < 1
         if num_jobs_initial == num_jobs_final:
             realign_iter = int(0.5 + num_iters * realign_time)
         else:
-            realign_iter = math.sqrt((1 - realign_time)
-                                     * math.pow(num_jobs_initial, 2)
-                                     + realign_time * math.pow(num_jobs_final,
-                                                               2))
+            realign_iter = math.sqrt(
+                (1 - realign_time) * math.pow(num_jobs_initial, 2)
+                + realign_time * math.pow(num_jobs_final, 2)
+            )
             realign_iter = realign_iter - num_jobs_initial
             realign_iter = realign_iter // (num_jobs_final - num_jobs_initial)
             realign_iter = realign_iter * num_iters
@@ -586,94 +702,117 @@ def get_realign_iters(realign_times, num_iters,
     return realign_iters


-def align(dir, data, lang, run_opts, iter=None,
-          online_ivector_dir=None):
+def align(dir, data, lang, run_opts, iter=None, online_ivector_dir=None):
+    alidir = "{dir}/ali{ali_suffix}".format(
+        dir=dir, ali_suffix="_iter_{0}".format(iter) if iter is not None else ""
+    )

-    alidir = '{dir}/ali{ali_suffix}'.format(
-        dir=dir,
-        ali_suffix="_iter_{0}".format(iter) if iter is not None else "")
-
-    logger.info("Aligning the data{gpu}with {num_jobs} jobs.".format(
-        gpu=" using gpu " if run_opts.realign_use_gpu else " ",
-        num_jobs=run_opts.realign_num_jobs))
+    logger.info(
+        "Aligning the data{gpu}with {num_jobs} jobs.".format(
+            gpu=" using gpu " if run_opts.realign_use_gpu else " ",
+            num_jobs=run_opts.realign_num_jobs,
+        )
+    )
     common_lib.execute_command(
         """steps/nnet3/align.sh --nj {num_jobs_align} \
                 --cmd "{align_cmd} {align_queue_opt}" \
                 --use-gpu {align_use_gpu} \
                 --online-ivector-dir "{online_ivector_dir}" \
                 --iter "{iter}" {data} {lang} {dir} {alidir}""".format(
-            dir=dir, align_use_gpu=("yes"
-                                    if run_opts.realign_use_gpu
-                                    else "no"),
-            align_cmd=run_opts.realign_command,
-            align_queue_opt=run_opts.realign_queue_opt,
-            num_jobs_align=run_opts.realign_num_jobs,
-            online_ivector_dir=(online_ivector_dir
-                                if online_ivector_dir is not None
-                                else ""),
-            iter=iter if iter is not None else "",
-            alidir=alidir,
-            lang=lang, data=data))
+            dir=dir,
+            align_use_gpu=("yes" if run_opts.realign_use_gpu else "no"),
+            align_cmd=run_opts.realign_command,
+            align_queue_opt=run_opts.realign_queue_opt,
+            num_jobs_align=run_opts.realign_num_jobs,
+            online_ivector_dir=(
+                online_ivector_dir if online_ivector_dir is not None else ""
+            ),
+            iter=iter if iter is not None else "",
+            alidir=alidir,
+            lang=lang,
+            data=data,
+        )
+    )

     return alidir


-def realign(dir, iter, feat_dir, lang, prev_egs_dir, cur_egs_dir,
-            prior_subset_size, num_archives,
-            run_opts, online_ivector_dir=None):
+def realign(
+    dir,
+    iter,
+    feat_dir,
+    lang,
+    prev_egs_dir,
+    cur_egs_dir,
+    prior_subset_size,
+    num_archives,
+    run_opts,
+    online_ivector_dir=None,
+):
     raise Exception("Realignment stage has not been implemented in nnet3")
-    logger.info("Getting average posterior for purposes of adjusting "
-                "the priors.")
+    logger.info("Getting average posterior for purposes of adjusting " "the priors.")
     # Note: this just uses CPUs, using a smallish subset of data.
     # always use the first egs archive, which makes the script simpler;
     # we're using different random subsets of it.
avg_post_vec_file = compute_average_posterior( - dir=dir, iter=iter, egs_dir=prev_egs_dir, - num_archives=num_archives, prior_subset_size=prior_subset_size, - run_opts=run_opts) + dir=dir, + iter=iter, + egs_dir=prev_egs_dir, + num_archives=num_archives, + prior_subset_size=prior_subset_size, + run_opts=run_opts, + ) avg_post_vec_file = "{dir}/post.{iter}.vec".format(dir=dir, iter=iter) logger.info("Re-adjusting priors based on computed posteriors") - model = '{0}/{1}.mdl'.format(dir, iter) + model = "{0}/{1}.mdl".format(dir, iter) adjust_am_priors(dir, model, avg_post_vec_file, model, run_opts) - alidir = align(dir, feat_dir, lang, run_opts, iter, - online_ivector_dir) + alidir = align(dir, feat_dir, lang, run_opts, iter, online_ivector_dir) common_lib.execute_command( """steps/nnet3/relabel_egs.sh --cmd "{command}" --iter {iter} \ {alidir} {prev_egs_dir} {cur_egs_dir}""".format( - command=run_opts.command, - iter=iter, - dir=dir, - alidir=alidir, - prev_egs_dir=prev_egs_dir, - cur_egs_dir=cur_egs_dir)) + command=run_opts.command, + iter=iter, + dir=dir, + alidir=alidir, + prev_egs_dir=prev_egs_dir, + cur_egs_dir=cur_egs_dir, + ) + ) -def adjust_am_priors(dir, input_model, avg_posterior_vector, output_model, - run_opts): +def adjust_am_priors(dir, input_model, avg_posterior_vector, output_model, run_opts): common_lib.execute_command( """{command} {dir}/log/adjust_priors.final.log \ nnet3-am-adjust-priors "{input_model}" {avg_posterior_vector} \ "{output_model}" """.format( - command=run_opts.command, - dir=dir, input_model=input_model, - avg_posterior_vector=avg_posterior_vector, - output_model=output_model)) - - -def compute_average_posterior(dir, iter, egs_dir, num_archives, - prior_subset_size, - run_opts, get_raw_nnet_from_am=True): - """ Computes the average posterior of the network - """ - for file in glob.glob('{0}/post.{1}.*.vec'.format(dir, iter)): + command=run_opts.command, + dir=dir, + input_model=input_model, + avg_posterior_vector=avg_posterior_vector, + output_model=output_model, + ) + ) + + +def compute_average_posterior( + dir, + iter, + egs_dir, + num_archives, + prior_subset_size, + run_opts, + get_raw_nnet_from_am=True, +): + """Computes the average posterior of the network""" + for file in glob.glob("{0}/post.{1}.*.vec".format(dir, iter)): os.remove(file) if run_opts.num_jobs_compute_prior > num_archives: egs_part = 1 else: - egs_part = 'JOB' + egs_part = "JOB" suffix = "mdl" if get_raw_nnet_from_am else "raw" model = "{0}/{1}.{2}".format(dir, iter, suffix) @@ -690,13 +829,18 @@ def compute_average_posterior(dir, iter, egs_dir, num_archives, "{model}" ark:- ark:- \| \ matrix-sum-rows ark:- ark:- \| vector-sum ark:- \ {dir}/post.{iter}.JOB.vec""".format( - command=run_opts.command, - dir=dir, model=model, - num_jobs_compute_prior=run_opts.num_jobs_compute_prior, - prior_queue_opt=run_opts.prior_queue_opt, - iter=iter, prior_subset_size=prior_subset_size, - egs_dir=egs_dir, egs_part=egs_part, - prior_gpu_opt=run_opts.prior_gpu_opt)) + command=run_opts.command, + dir=dir, + model=model, + num_jobs_compute_prior=run_opts.num_jobs_compute_prior, + prior_queue_opt=run_opts.prior_queue_opt, + iter=iter, + prior_subset_size=prior_subset_size, + egs_dir=egs_dir, + egs_part=egs_part, + prior_gpu_opt=run_opts.prior_gpu_opt, + ) + ) # make sure there is time for $dir/post.{iter}.*.vec to appear. 
time.sleep(5) @@ -704,9 +848,11 @@ def compute_average_posterior(dir, iter, egs_dir, num_archives, common_lib.execute_command( """{command} {dir}/log/vector_sum.{iter}.log \ vector-sum {dir}/post.{iter}.*.vec {output_file} - """.format(command=run_opts.command, - dir=dir, iter=iter, output_file=avg_post_vec_file)) + """.format( + command=run_opts.command, dir=dir, iter=iter, output_file=avg_post_vec_file + ) + ) - for file in glob.glob('{0}/post.{1}.*.vec'.format(dir, iter)): + for file in glob.glob("{0}/post.{1}.*.vec".format(dir, iter)): os.remove(file) return avg_post_vec_file diff --git a/egs2/TEMPLATE/asr1/steps/libs/nnet3/train/frame_level_objf/raw_model.py b/egs2/TEMPLATE/asr1/steps/libs/nnet3/train/frame_level_objf/raw_model.py index 7f21af06a16..0344290031a 100644 --- a/egs2/TEMPLATE/asr1/steps/libs/nnet3/train/frame_level_objf/raw_model.py +++ b/egs2/TEMPLATE/asr1/steps/libs/nnet3/train/frame_level_objf/raw_model.py @@ -1,5 +1,3 @@ - - # Copyright 2016 Vijayaditya Peddinti. # 2016 Vimal Manohar # Apache 2.0. @@ -17,15 +15,26 @@ logger.addHandler(logging.NullHandler()) -def generate_egs_using_targets(data, targets_scp, egs_dir, - left_context, right_context, - run_opts, stage=0, - left_context_initial=-1, right_context_final=-1, - online_ivector_dir=None, - target_type='dense', num_targets=-1, - samples_per_iter=20000, frames_per_eg_str="20", - srand=0, egs_opts=None, cmvn_opts=None): - """ Wrapper for calling steps/nnet3/get_egs_targets.sh +def generate_egs_using_targets( + data, + targets_scp, + egs_dir, + left_context, + right_context, + run_opts, + stage=0, + left_context_initial=-1, + right_context_final=-1, + online_ivector_dir=None, + target_type="dense", + num_targets=-1, + samples_per_iter=20000, + frames_per_eg_str="20", + srand=0, + egs_opts=None, + cmvn_opts=None, +): + """Wrapper for calling steps/nnet3/get_egs_targets.sh This method generates egs directly from an scp file of targets, instead of getting them from the alignments (as with the method generate_egs() in @@ -40,12 +49,11 @@ def generate_egs_using_targets(data, targets_scp, egs_dir, For other options, see the file steps/nnet3/get_egs_targets.sh """ - if target_type == 'dense': + if target_type == "dense": num_targets = common_lib.get_feat_dim_from_scp(targets_scp) else: if num_targets == -1: - raise Exception("--num-targets is required if " - "target-type is sparse") + raise Exception("--num-targets is required if " "target-type is sparse") common_lib.execute_command( """steps/nnet3/get_egs_targets.sh {egs_opts} \ @@ -63,19 +71,23 @@ def generate_egs_using_targets(data, targets_scp, egs_dir, --target-type {target_type} \ --num-targets {num_targets} \ {data} {targets_scp} {egs_dir} - """.format(command=run_opts.egs_command, - cmvn_opts=cmvn_opts if cmvn_opts is not None else '', - ivector_dir=(online_ivector_dir - if online_ivector_dir is not None - else ''), - left_context=left_context, - right_context=right_context, - left_context_initial=left_context_initial, - right_context_final=right_context_final, - stage=stage, samples_per_iter=samples_per_iter, - frames_per_eg_str=frames_per_eg_str, srand=srand, - num_targets=num_targets, - data=data, - targets_scp=targets_scp, target_type=target_type, - egs_dir=egs_dir, - egs_opts=egs_opts if egs_opts is not None else '')) + """.format( + command=run_opts.egs_command, + cmvn_opts=cmvn_opts if cmvn_opts is not None else "", + ivector_dir=(online_ivector_dir if online_ivector_dir is not None else ""), + left_context=left_context, + right_context=right_context, + 
left_context_initial=left_context_initial, + right_context_final=right_context_final, + stage=stage, + samples_per_iter=samples_per_iter, + frames_per_eg_str=frames_per_eg_str, + srand=srand, + num_targets=num_targets, + data=data, + targets_scp=targets_scp, + target_type=target_type, + egs_dir=egs_dir, + egs_opts=egs_opts if egs_opts is not None else "", + ) + ) diff --git a/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/attention.py b/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/attention.py index db4cb392f10..d2c47a40a17 100644 --- a/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/attention.py +++ b/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/attention.py @@ -25,111 +25,126 @@ class XconfigAttentionLayer(XconfigLayerBase): - def __init__(self, first_token, key_to_value, prev_names = None): + def __init__(self, first_token, key_to_value, prev_names=None): # Here we just list some likely combinations.. you can just add any # combinations you want to use, to this list. - assert first_token in ['attention-renorm-layer', - 'attention-relu-renorm-layer', - 'attention-relu-batchnorm-layer', - 'relu-renorm-attention-layer'] + assert first_token in [ + "attention-renorm-layer", + "attention-relu-renorm-layer", + "attention-relu-batchnorm-layer", + "relu-renorm-attention-layer", + ] XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): # note: self.config['input'] is a descriptor, '[-1]' means output # the most recent layer. - self.config = { 'input':'[-1]', - 'dim': -1, - 'max-change' : 0.75, - 'self-repair-scale' : 1.0e-05, - 'target-rms' : 1.0, - 'learning-rate-factor' : 1.0, - 'ng-affine-options' : '', - 'l2-regularize': 0.0, - 'num-left-inputs-required': -1, - 'num-right-inputs-required': -1, - 'output-context': True, - 'time-stride': 1, - 'num-heads': 1, - 'key-dim': -1, - 'key-scale': 0.0, - 'value-dim': -1, - 'num-left-inputs': -1, - 'num-right-inputs': -1, - 'dropout-proportion': 0.5} # dropout-proportion only - # affects layers with - # 'dropout' in the name. + self.config = { + "input": "[-1]", + "dim": -1, + "max-change": 0.75, + "self-repair-scale": 1.0e-05, + "target-rms": 1.0, + "learning-rate-factor": 1.0, + "ng-affine-options": "", + "l2-regularize": 0.0, + "num-left-inputs-required": -1, + "num-right-inputs-required": -1, + "output-context": True, + "time-stride": 1, + "num-heads": 1, + "key-dim": -1, + "key-scale": 0.0, + "value-dim": -1, + "num-left-inputs": -1, + "num-right-inputs": -1, + "dropout-proportion": 0.5, + } # dropout-proportion only + # affects layers with + # 'dropout' in the name. 
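As a reading aid (not part of the patch itself): the defaults above feed the dimension bookkeeping done by `attention_input_dim()` and `attention_output_dim()` just below. A minimal sketch in plain Python, with hypothetical key/value dims filled in, since the real defaults are `-1` and must be supplied by the user:

```python
import math

# Hypothetical example values; in the layer, key-dim/value-dim default to -1.
num_heads, key_dim, value_dim = 1, 40, 80
num_left_inputs, num_right_inputs = 5, 2
output_context = True

context_dim = num_left_inputs + num_right_inputs + 1  # one weight per context frame
query_dim = key_dim + context_dim
# check_configs() replaces key-scale=0.0 with 1/sqrt(key-dim), the usual
# scaled-dot-product attention scaling.
key_scale = 1.0 / math.sqrt(key_dim)

input_dim = num_heads * (key_dim + value_dim + query_dim)  # 168 here
output_dim = num_heads * (value_dim + (context_dim if output_context else 0))  # 88 here
print(input_dim, output_dim, round(key_scale, 4))
```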
def check_configs(self): - if self.config['self-repair-scale'] < 0.0 or self.config['self-repair-scale'] > 1.0: - raise RuntimeError("self-repair-scale has invalid value {0}" - .format(self.config['self-repair-scale'])) - if self.config['target-rms'] < 0.0: - raise RuntimeError("target-rms has invalid value {0}" - .format(self.config['target-rms'])) - if self.config['learning-rate-factor'] <= 0.0: - raise RuntimeError("learning-rate-factor has invalid value {0}" - .format(self.config['learning-rate-factor'])) - for conf in ['value-dim', 'key-dim', - 'num-left-inputs', 'num-right-inputs']: + if ( + self.config["self-repair-scale"] < 0.0 + or self.config["self-repair-scale"] > 1.0 + ): + raise RuntimeError( + "self-repair-scale has invalid value {0}".format( + self.config["self-repair-scale"] + ) + ) + if self.config["target-rms"] < 0.0: + raise RuntimeError( + "target-rms has invalid value {0}".format(self.config["target-rms"]) + ) + if self.config["learning-rate-factor"] <= 0.0: + raise RuntimeError( + "learning-rate-factor has invalid value {0}".format( + self.config["learning-rate-factor"] + ) + ) + for conf in ["value-dim", "key-dim", "num-left-inputs", "num-right-inputs"]: if self.config[conf] < 0: - raise RuntimeError("{0} has invalid value {1}" - .format(conf, self.config[conf])) - if self.config['key-scale'] == 0.0: - self.config['key-scale'] = 1.0 / math.sqrt(self.config['key-dim']) + raise RuntimeError( + "{0} has invalid value {1}".format(conf, self.config[conf]) + ) + if self.config["key-scale"] == 0.0: + self.config["key-scale"] = 1.0 / math.sqrt(self.config["key-dim"]) def output_name(self, auxiliary_output=None): # at a later stage we might want to expose even the pre-nonlinearity # vectors assert auxiliary_output == None - split_layer_name = self.layer_type.split('-') - assert split_layer_name[-1] == 'layer' + split_layer_name = self.layer_type.split("-") + assert split_layer_name[-1] == "layer" last_nonlinearity = split_layer_name[-2] # return something like: layer3.renorm - return '{0}.{1}'.format(self.name, last_nonlinearity) + return "{0}.{1}".format(self.name, last_nonlinearity) def attention_input_dim(self): - context_dim = (self.config['num-left-inputs'] + - self.config['num-right-inputs'] + 1) - num_heads = self.config['num-heads'] - key_dim = self.config['key-dim'] - value_dim = self.config['value-dim'] - query_dim = key_dim + context_dim; + context_dim = ( + self.config["num-left-inputs"] + self.config["num-right-inputs"] + 1 + ) + num_heads = self.config["num-heads"] + key_dim = self.config["key-dim"] + value_dim = self.config["value-dim"] + query_dim = key_dim + context_dim return num_heads * (key_dim + value_dim + query_dim) def attention_output_dim(self): - context_dim = (self.config['num-left-inputs'] + - self.config['num-right-inputs'] + 1) - num_heads = self.config['num-heads'] - value_dim = self.config['value-dim'] - return (num_heads * - (value_dim + - (context_dim if self.config['output-context'] else 0))) + context_dim = ( + self.config["num-left-inputs"] + self.config["num-right-inputs"] + 1 + ) + num_heads = self.config["num-heads"] + value_dim = self.config["value-dim"] + return num_heads * ( + value_dim + (context_dim if self.config["output-context"] else 0) + ) - def output_dim(self, auxiliary_output = None): - return self.attention_output_dim() + def output_dim(self, auxiliary_output=None): + return self.attention_output_dim() def get_full_config(self): ans = [] config_lines = self._generate_config() for line in config_lines: - for config_name in 
['ref', 'final']: + for config_name in ["ref", "final"]: # we do not support user specified matrices in this layer # so 'ref' and 'final' configs are the same. ans.append((config_name, line)) return ans - def _generate_config(self): - split_layer_name = self.layer_type.split('-') - assert split_layer_name[-1] == 'layer' + split_layer_name = self.layer_type.split("-") + assert split_layer_name[-1] == "layer" nonlinearities = split_layer_name[:-1] # by 'descriptor_final_string' we mean a string that can appear in # config-files, i.e. it contains the 'final' names of nodes. - input_desc = self.descriptors['input']['final-string'] - input_dim = self.descriptors['input']['dim'] + input_desc = self.descriptors["input"]["final-string"] + input_dim = self.descriptors["input"]["dim"] # the child classes e.g. tdnn might want to process the input # before adding the other components @@ -138,112 +153,141 @@ def _generate_config(self): def _add_components(self, input_desc, input_dim, nonlinearities): dim = self.attention_input_dim() - self_repair_scale = self.config['self-repair-scale'] - target_rms = self.config['target-rms'] - max_change = self.config['max-change'] - ng_affine_options = self.config['ng-affine-options'] - l2_regularize = self.config['l2-regularize'] - learning_rate_factor=self.config['learning-rate-factor'] - learning_rate_option=('learning-rate-factor={0}'.format(learning_rate_factor) - if learning_rate_factor != 1.0 else '') - l2_regularize_option = ('l2-regularize={0} '.format(l2_regularize) - if l2_regularize != 0.0 else '') + self_repair_scale = self.config["self-repair-scale"] + target_rms = self.config["target-rms"] + max_change = self.config["max-change"] + ng_affine_options = self.config["ng-affine-options"] + l2_regularize = self.config["l2-regularize"] + learning_rate_factor = self.config["learning-rate-factor"] + learning_rate_option = ( + "learning-rate-factor={0}".format(learning_rate_factor) + if learning_rate_factor != 1.0 + else "" + ) + l2_regularize_option = ( + "l2-regularize={0} ".format(l2_regularize) if l2_regularize != 0.0 else "" + ) configs = [] # First the affine node. 
- line = ('component name={0}.affine' - ' type=NaturalGradientAffineComponent' - ' input-dim={1}' - ' output-dim={2}' - ' max-change={3}' - ' {4} {5} {6}' - ''.format(self.name, input_dim, dim, - max_change, ng_affine_options, - learning_rate_option, l2_regularize_option)) + line = ( + "component name={0}.affine" + " type=NaturalGradientAffineComponent" + " input-dim={1}" + " output-dim={2}" + " max-change={3}" + " {4} {5} {6}" + "".format( + self.name, + input_dim, + dim, + max_change, + ng_affine_options, + learning_rate_option, + l2_regularize_option, + ) + ) configs.append(line) - line = ('component-node name={0}.affine' - ' component={0}.affine input={1}' - ''.format(self.name, input_desc)) + line = ( + "component-node name={0}.affine" + " component={0}.affine input={1}" + "".format(self.name, input_desc) + ) configs.append(line) - cur_node = '{0}.affine'.format(self.name) + cur_node = "{0}.affine".format(self.name) for nonlinearity in nonlinearities: - if nonlinearity == 'relu': - line = ('component name={0}.{1}' - ' type=RectifiedLinearComponent dim={2}' - ' self-repair-scale={3}' - ''.format(self.name, nonlinearity, dim, - self_repair_scale)) - - elif nonlinearity == 'attention': - line = ('component name={0}.{1}' - ' type=RestrictedAttentionComponent' - ' value-dim={2}' - ' key-dim={3}' - ' num-left-inputs={4}' - ' num-right-inputs={5}' - ' num-left-inputs-required={6}' - ' num-right-inputs-required={7}' - ' output-context={8}' - ' time-stride={9}' - ' num-heads={10}' - ' key-scale={11}' - ''.format(self.name, nonlinearity, - self.config['value-dim'], - self.config['key-dim'], - self.config['num-left-inputs'], - self.config['num-right-inputs'], - self.config['num-left-inputs-required'], - self.config['num-right-inputs-required'], - self.config['output-context'], - self.config['time-stride'], - self.config['num-heads'], - self.config['key-scale'])) + if nonlinearity == "relu": + line = ( + "component name={0}.{1}" + " type=RectifiedLinearComponent dim={2}" + " self-repair-scale={3}" + "".format(self.name, nonlinearity, dim, self_repair_scale) + ) + + elif nonlinearity == "attention": + line = ( + "component name={0}.{1}" + " type=RestrictedAttentionComponent" + " value-dim={2}" + " key-dim={3}" + " num-left-inputs={4}" + " num-right-inputs={5}" + " num-left-inputs-required={6}" + " num-right-inputs-required={7}" + " output-context={8}" + " time-stride={9}" + " num-heads={10}" + " key-scale={11}" + "".format( + self.name, + nonlinearity, + self.config["value-dim"], + self.config["key-dim"], + self.config["num-left-inputs"], + self.config["num-right-inputs"], + self.config["num-left-inputs-required"], + self.config["num-right-inputs-required"], + self.config["output-context"], + self.config["time-stride"], + self.config["num-heads"], + self.config["key-scale"], + ) + ) dim = self.attention_output_dim() - elif nonlinearity == 'sigmoid': - line = ('component name={0}.{1}' - ' type=SigmoidComponent dim={2}' - ' self-repair-scale={3}' - ''.format(self.name, nonlinearity, dim, - self_repair_scale)) - - elif nonlinearity == 'tanh': - line = ('component name={0}.{1}' - ' type=TanhComponent dim={2}' - ' self-repair-scale={3}' - ''.format(self.name, nonlinearity, dim, - self_repair_scale)) - - elif nonlinearity == 'renorm': - line = ('component name={0}.{1}' - ' type=NormalizeComponent dim={2}' - ' target-rms={3}' - ''.format(self.name, nonlinearity, dim, - target_rms)) - - elif nonlinearity == 'batchnorm': - line = ('component name={0}.{1}' - ' type=BatchNormComponent dim={2}' - ' 
target-rms={3}'
-                        ''.format(self.name, nonlinearity, dim,
-                                  target_rms))
-
-            elif nonlinearity == 'dropout':
-                line = ('component name={0}.{1} type=DropoutComponent '
-                        'dim={2} dropout-proportion={3}'.format(
-                            self.name, nonlinearity, dim,
-                            self.config['dropout-proportion']))
+            elif nonlinearity == "sigmoid":
+                line = (
+                    "component name={0}.{1}"
+                    " type=SigmoidComponent dim={2}"
+                    " self-repair-scale={3}"
+                    "".format(self.name, nonlinearity, dim, self_repair_scale)
+                )
+
+            elif nonlinearity == "tanh":
+                line = (
+                    "component name={0}.{1}"
+                    " type=TanhComponent dim={2}"
+                    " self-repair-scale={3}"
+                    "".format(self.name, nonlinearity, dim, self_repair_scale)
+                )
+
+            elif nonlinearity == "renorm":
+                line = (
+                    "component name={0}.{1}"
+                    " type=NormalizeComponent dim={2}"
+                    " target-rms={3}"
+                    "".format(self.name, nonlinearity, dim, target_rms)
+                )
+
+            elif nonlinearity == "batchnorm":
+                line = (
+                    "component name={0}.{1}"
+                    " type=BatchNormComponent dim={2}"
+                    " target-rms={3}"
+                    "".format(self.name, nonlinearity, dim, target_rms)
+                )
+
+            elif nonlinearity == "dropout":
+                line = (
+                    "component name={0}.{1} type=DropoutComponent "
+                    "dim={2} dropout-proportion={3}".format(
+                        self.name, nonlinearity, dim, self.config["dropout-proportion"]
+                    )
+                )

             else:
-                raise RuntimeError("Unknown nonlinearity type: {0}"
-                                   .format(nonlinearity))
+                raise RuntimeError(
+                    "Unknown nonlinearity type: {0}".format(nonlinearity)
+                )

             configs.append(line)
-            line = ('component-node name={0}.{1}'
-                    ' component={0}.{1} input={2}'
-                    ''.format(self.name, nonlinearity, cur_node))
+            line = (
+                "component-node name={0}.{1}"
+                " component={0}.{1} input={2}"
+                "".format(self.name, nonlinearity, cur_node)
+            )

             configs.append(line)
-            cur_node = '{0}.{1}'.format(self.name, nonlinearity)
+            cur_node = "{0}.{1}".format(self.name, nonlinearity)

         return configs
diff --git a/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/basic_layers.py b/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/basic_layers.py
index e18c1359b61..c12773e94cf 100644
--- a/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/basic_layers.py
+++ b/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/basic_layers.py
@@ -18,30 +18,28 @@


 class XconfigLayerBase(object):
-    """ A base-class for classes representing layers of xconfig files.
-    """
+    """A base-class for classes representing layers of xconfig files."""

     def __init__(self, first_token, key_to_value, all_layers):
         """
-         first_token: first token on the xconfig line, e.g. 'affine-layer'.f
-         key_to_value: dictionary with parameter values
-             { 'name':'affine1',
-               'input':'Append(0, 1, 2, ReplaceIndex(ivector, t, 0))',
-               'dim=1024' }.
-             The only required and 'special' values that are dealt with directly
-             at this level, are 'name' and 'input'. The rest are put in
-             self.config and are dealt with by the child classes' init functions.
-         all_layers: An array of objects inheriting XconfigLayerBase for all
-              previously parsed layers.
+        first_token: first token on the xconfig line, e.g. 'affine-layer'.
+        key_to_value: dictionary with parameter values
+            { 'name':'affine1',
+              'input':'Append(0, 1, 2, ReplaceIndex(ivector, t, 0))',
+              'dim=1024' }.
+        The only required and 'special' values that are dealt with directly
+        at this level, are 'name' and 'input'. The rest are put in
+        self.config and are dealt with by the child classes' init functions.
+        all_layers: An array of objects inheriting XconfigLayerBase for all
+            previously parsed layers.
""" self.layer_type = first_token - if 'name' not in key_to_value: + if "name" not in key_to_value: raise RuntimeError("Expected 'name' to be specified.") - self.name = key_to_value['name'] + self.name = key_to_value["name"] if not xutils.is_valid_line_name(self.name): - raise RuntimeError("Invalid value: name={0}".format( - key_to_value['name'])) + raise RuntimeError("Invalid value: name={0}".format(key_to_value["name"])) # It is possible to have two layers with a same name in 'all_layer', if # the layer type for one of them is 'existing'. @@ -54,10 +52,10 @@ def __init__(self, first_token, key_to_value, all_layers): # and 'output-node' of type 'output-layer' with the same name 'output' in # 'all_layers'. for prev_layer in all_layers: - if (self.name == prev_layer.name and - prev_layer.layer_type is not 'existing'): - raise RuntimeError("Name '{0}' is used for more than one " - "layer.".format(self.name)) + if self.name == prev_layer.name and prev_layer.layer_type is not "existing": + raise RuntimeError( + "Name '{0}' is used for more than one " "layer.".format(self.name) + ) self.config = {} # the following, which should be overridden in the child class, sets @@ -75,43 +73,53 @@ def __init__(self, first_token, key_to_value, all_layers): # that the config parameters that have been set are reasonable. self.check_configs() - def set_configs(self, key_to_value, all_layers): - """ Sets the config variables. - We broke this code out of __init__ for clarity. - the child-class constructor will deal with the configuration values - in a more specific way. + """Sets the config variables. + We broke this code out of __init__ for clarity. + the child-class constructor will deal with the configuration values + in a more specific way. """ # First check that there are no keys that don't correspond to any config # parameter of this layer, and if so, raise an exception with an # informative message saying what configs are allowed. for key, value in key_to_value.items(): - if key != 'name': + if key != "name": if key not in self.config: - configs = ' '.join([('{0}->"{1}"'.format(x, y) if isinstance(y, str) - else '{0}->{1}'.format(x, y)) - for x, y in self.config.items()]) - raise RuntimeError("Configuration value {0}={1} was not " - "expected in layer of type {2}; allowed " - "configs with their defaults: {3}" - "" .format(key, value, self.layer_type, configs)) + configs = " ".join( + [ + ( + '{0}->"{1}"'.format(x, y) + if isinstance(y, str) + else "{0}->{1}".format(x, y) + ) + for x, y in self.config.items() + ] + ) + raise RuntimeError( + "Configuration value {0}={1} was not " + "expected in layer of type {2}; allowed " + "configs with their defaults: {3}" + "".format(key, value, self.layer_type, configs) + ) for key, value in key_to_value.items(): - if key != 'name': + if key != "name": assert key in self.config # we checked above. - self.config[key] = xutils.convert_value_to_type(key, - type(self.config[key]), - value) + self.config[key] = xutils.convert_value_to_type( + key, type(self.config[key]), value + ) self.descriptors = dict() self.descriptor_dims = dict() # Parse Descriptors and get their dims and their 'final' string form. # in self.descriptors[key] for key in self.get_input_descriptor_names(): if key not in self.config: - raise RuntimeError("{0}: object of type {1} needs to override" - " get_input_descriptor_names()." - "".format(sys.argv[0], str(type(self)))) + raise RuntimeError( + "{0}: object of type {1} needs to override" + " get_input_descriptor_names()." 
+                    "".format(sys.argv[0], str(type(self)))
+                )

             descriptor_string = self.config[key]  # input string.
             assert isinstance(descriptor_string, str)
@@ -129,18 +137,22 @@ def set_configs(self, key_to_value, all_layers):
             # when auxiliary_output is not None.
             # That's up to the designer of the layer type.
             desc_output_str = self.get_string_for_descriptor(desc, all_layers)
-            self.descriptors[key] = {'string': desc,
-                                     'normalized-string': desc_norm_str,
-                                     'final-string': desc_output_str,
-                                     'dim': desc_dim}
+            self.descriptors[key] = {
+                "string": desc,
+                "normalized-string": desc_norm_str,
+                "final-string": desc_output_str,
+                "dim": desc_dim,
+            }

             # the following helps to check the code by parsing it again.
             desc2 = self.convert_to_descriptor(desc_norm_str, all_layers)
             desc_norm_str2 = desc2.str()
             # if the following ever fails we'll have to do some debugging.
             if desc_norm_str != desc_norm_str2:
-                raise RuntimeError("Likely code error: '{0}' != '{1}'"
-                                   "".format(desc_norm_str, desc_norm_str2))
+                raise RuntimeError(
+                    "Likely code error: '{0}' != '{1}'"
+                    "".format(desc_norm_str, desc_norm_str2)
+                )

     def str(self):
         """Converts 'this' to a string which could be printed to
@@ -149,21 +161,24 @@ def str(self):
         (so users can see any defaults).
         """

-        list_of_entries = ['{0} name={1}'.format(self.layer_type, self.name)]
+        list_of_entries = ["{0} name={1}".format(self.layer_type, self.name)]
         for key, value in sorted(self.config.items()):
-            if isinstance(value, str) and re.search('=', value):
+            if isinstance(value, str) and re.search("=", value):
                 # the value is a string that contains an '=' sign, so we need to
                 # enclose it in double-quotes, otherwise we wouldn't be able to
                 # parse from that output.
                 if re.search('"', value):
-                    print("Warning: config '{0}={1}' contains both double-quotes "
-                          "and equals sign; it will not be possible to parse it "
-                          "from the file.".format(key, value), file=sys.stderr)
+                    print(
+                        "Warning: config '{0}={1}' contains both double-quotes "
+                        "and equals sign; it will not be possible to parse it "
+                        "from the file.".format(key, value),
+                        file=sys.stderr,
+                    )
                 list_of_entries.append('{0}="{1}"'.format(key, value))
             else:
-                list_of_entries.append('{0}={1}'.format(key, value))
+                list_of_entries.append("{0}={1}".format(key, value))

-        return ' '.join(list_of_entries)
+        return " ".join(list_of_entries)

     def __str__(self):
         return self.str()
@@ -178,7 +193,7 @@ def normalize_descriptors(self):
         """

         for key, desc_str_dict in self.descriptors.items():
-            self.config[key] = desc_str_dict['normalized-string']
+            self.config[key] = desc_str_dict["normalized-string"]

     def convert_to_descriptor(self, descriptor_string, all_layers):
         """Convenience function intended to be called from child classes,
@@ -196,8 +211,10 @@ def convert_to_descriptor(self, descriptor_string, all_layers):
         # note: 'pos' should point to the 'end of string' marker
         # that terminates 'tokens'.
         if pos != len(tokens) - 1:
-            raise RuntimeError("Parsing Descriptor, saw junk at end: {0}"
-                               "".format(' '.join(tokens[pos:-1])))
+            raise RuntimeError(
+                "Parsing Descriptor, saw junk at end: {0}"
+                "".format(" ".join(tokens[pos:-1]))
+            )
         return descriptor

     def get_dim_for_descriptor(self, descriptor, all_layers):
         """Convenience function provided for use in child classes;
         function used in set_configs.
""" - layer_to_dim_func = \ - lambda name: xutils.get_dim_from_layer_name(all_layers, self, - name) + layer_to_dim_func = lambda name: xutils.get_dim_from_layer_name( + all_layers, self, name + ) return descriptor.dim(layer_to_dim_func) def get_string_for_descriptor(self, descriptor, all_layers): @@ -216,9 +233,9 @@ def get_string_for_descriptor(self, descriptor, all_layers): provided for use in child classes; """ - layer_to_string_func = \ - lambda name: xutils.get_string_from_layer_name(all_layers, - self, name) + layer_to_string_func = lambda name: xutils.get_string_from_layer_name( + all_layers, self, name + ) return descriptor.config_string(layer_to_string_func) def get_name(self): @@ -231,8 +248,7 @@ def get_name(self): ###### Functions that might be overridden by the child class: ##### def set_default_configs(self): - """Child classes should override this. - """ + """Child classes should override this.""" raise Exception("Child classes must override set_default_configs().") @@ -240,12 +256,11 @@ def set_derived_configs(self): """This is expected to be called after set_configs and before check_configs(). """ - if 'dim' in self.config and self.config['dim'] <= 0: - self.config['dim'] = self.descriptors['input']['dim'] + if "dim" in self.config and self.config["dim"] <= 0: + self.config["dim"] = self.descriptors["input"]["dim"] def check_configs(self): - """child classes should override this. - """ + """child classes should override this.""" pass @@ -263,7 +278,7 @@ def get_input_descriptor_names(self): implementation to something like: `return ['input', 'input2']` """ - return ['input'] + return ["input"] def auxiliary_outputs(self): """Returns a list of all auxiliary outputs that this layer supports. @@ -318,46 +333,44 @@ class XconfigInputLayer(XconfigLayerBase): 'input name=ivector dim=100' in the config file. """ - def __init__(self, first_token, key_to_value, prev_names=None): - assert first_token == 'input' + def __init__(self, first_token, key_to_value, prev_names=None): + assert first_token == "input" XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): - - self.config = {'dim': -1} + self.config = {"dim": -1} def check_configs(self): - - if self.config['dim'] <= 0: - raise RuntimeError("Dimension of input-layer '{0}'" - "should be positive.".format(self.name)) + if self.config["dim"] <= 0: + raise RuntimeError( + "Dimension of input-layer '{0}'" "should be positive.".format(self.name) + ) def get_input_descriptor_names(self): - return [] # there is no 'input' field in self.config. 
def output_name(self, auxiliary_outputs=None): - # there are no auxiliary outputs as this layer will just pass the input assert auxiliary_outputs is None return self.name def output_dim(self, auxiliary_outputs=None): - # there are no auxiliary outputs as this layer will just pass the input assert auxiliary_outputs is None - return self.config['dim'] + return self.config["dim"] def get_full_config(self): - # unlike other layers the input layers need to be printed in # 'init.config' (which initializes the neural network prior to the LDA) ans = [] - for config_name in ['init', 'ref', 'final']: - ans.append((config_name, - 'input-node name={0} dim={1}'.format(self.name, - self.config['dim']))) + for config_name in ["init", "ref", "final"]: + ans.append( + ( + config_name, + "input-node name={0} dim={1}".format(self.name, self.config["dim"]), + ) + ) return ans @@ -379,41 +392,42 @@ class XconfigTrivialOutputLayer(XconfigLayerBase): """ def __init__(self, first_token, key_to_value, prev_names=None): - - assert first_token == 'output' + assert first_token == "output" XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): - # note: self.config['input'] is a descriptor, '[-1]' means output # the most recent layer. - self.config = {'input': '[-1]', 'dim': -1, - 'objective-type': 'linear', - 'output-delay': 0} + self.config = { + "input": "[-1]", + "dim": -1, + "objective-type": "linear", + "output-delay": 0, + } def check_configs(self): - - if self.config['objective-type'] != 'linear' and \ - self.config['objective-type'] != 'quadratic': - raise RuntimeError("In output, objective-type has" - " invalid value {0}" - "".format(self.config['objective-type'])) + if ( + self.config["objective-type"] != "linear" + and self.config["objective-type"] != "quadratic" + ): + raise RuntimeError( + "In output, objective-type has" + " invalid value {0}" + "".format(self.config["objective-type"]) + ) def output_name(self, auxiliary_outputs=None): - # there are no auxiliary outputs as this layer will just pass the output # of the previous layer assert auxiliary_outputs is None return self.name def output_dim(self, auxiliary_outputs=None): - assert auxiliary_outputs is None # note: each value of self.descriptors is (descriptor, dim, normalized-string, output-string). - return self.descriptors['input']['dim'] + return self.descriptors["input"]["dim"] def get_full_config(self): - # the input layers need to be printed in 'init.config' (which # initializes the neural network prior to the LDA), in 'ref.config', # which is a version of the config file used for getting left and right @@ -426,20 +440,25 @@ def get_full_config(self): # normalized-string, output-string). # by 'output-string' we mean a string that can appear in # config-files, i.e. it contains the 'final' names of nodes. 
- descriptor_final_str = self.descriptors['input']['final-string'] - objective_type = self.config['objective-type'] - output_delay = self.config['output-delay'] + descriptor_final_str = self.descriptors["input"]["final-string"] + objective_type = self.config["objective-type"] + output_delay = self.config["output-delay"] if output_delay != 0: - descriptor_final_str = ( - 'Offset({0}, {1})'.format(descriptor_final_str, output_delay)) - - for config_name in ['ref', 'final']: - ans.append((config_name, - 'output-node name={0} input={1} ' - 'objective={2}'.format( - self.name, descriptor_final_str, - objective_type))) + descriptor_final_str = "Offset({0}, {1})".format( + descriptor_final_str, output_delay + ) + + for config_name in ["ref", "final"]: + ans.append( + ( + config_name, + "output-node name={0} input={1} " + "objective={2}".format( + self.name, descriptor_final_str, objective_type + ), + ) + ) return ans @@ -483,188 +502,217 @@ class XconfigOutputLayer(XconfigLayerBase): """ def __init__(self, first_token, key_to_value, prev_names=None): - - assert first_token == 'output-layer' + assert first_token == "output-layer" XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): - # note: self.config['input'] is a descriptor, '[-1]' means output # the most recent layer. - self.config = {'input': '[-1]', - 'dim': -1, - 'bottleneck-dim': -1, - 'orthonormal-constraint': 1.0, - # orthonormal-constraint only matters if bottleneck-dim is set. - 'include-log-softmax': True, - # this would be false for chain models - 'objective-type': 'linear', - # see Nnet::ProcessOutputNodeConfigLine in - # nnet-nnet.cc for other options - 'output-delay': 0, - 'ng-affine-options': '', - 'ng-linear-options': '', # only affects bottleneck output layers. - - # The following are just passed through to the affine - # component, and (in the bottleneck case) the linear - # component. - 'learning-rate-factor': '', # effective default: 1.0 - 'l2-regularize': '', # effective default: 0.0 - 'max-change': 1.5, - - # The following are passed through to the affine component only. - # It tends to be beneficial to initialize the output layer with - # zero values, unlike the hidden layers. - 'param-stddev': 0.0, - 'bias-stddev': 0.0, - } + self.config = { + "input": "[-1]", + "dim": -1, + "bottleneck-dim": -1, + "orthonormal-constraint": 1.0, + # orthonormal-constraint only matters if bottleneck-dim is set. + "include-log-softmax": True, + # this would be false for chain models + "objective-type": "linear", + # see Nnet::ProcessOutputNodeConfigLine in + # nnet-nnet.cc for other options + "output-delay": 0, + "ng-affine-options": "", + "ng-linear-options": "", # only affects bottleneck output layers. + # The following are just passed through to the affine + # component, and (in the bottleneck case) the linear + # component. + "learning-rate-factor": "", # effective default: 1.0 + "l2-regularize": "", # effective default: 0.0 + "max-change": 1.5, + # The following are passed through to the affine component only. + # It tends to be beneficial to initialize the output layer with + # zero values, unlike the hidden layers. 
+ "param-stddev": 0.0, + "bias-stddev": 0.0, + } def check_configs(self): - - if self.config['dim'] <= -1: - raise RuntimeError("In output-layer, dim has invalid value {0}" - "".format(self.config['dim'])) - - if self.config['objective-type'] != 'linear' and \ - self.config['objective-type'] != 'quadratic': - raise RuntimeError("In output-layer, objective-type has" - " invalid value {0}" - "".format(self.config['objective-type'])) - - if self.config['orthonormal-constraint'] <= 0.0: - raise RuntimeError("output-layer does not support negative (floating) " - "orthonormal constraint; use a separate linear-component " - "followed by batchnorm-component.") + if self.config["dim"] <= -1: + raise RuntimeError( + "In output-layer, dim has invalid value {0}" + "".format(self.config["dim"]) + ) + + if ( + self.config["objective-type"] != "linear" + and self.config["objective-type"] != "quadratic" + ): + raise RuntimeError( + "In output-layer, objective-type has" + " invalid value {0}" + "".format(self.config["objective-type"]) + ) + + if self.config["orthonormal-constraint"] <= 0.0: + raise RuntimeError( + "output-layer does not support negative (floating) " + "orthonormal constraint; use a separate linear-component " + "followed by batchnorm-component." + ) def auxiliary_outputs(self): - - auxiliary_outputs = ['affine'] - if self.config['include-log-softmax']: - auxiliary_outputs.append('log-softmax') + auxiliary_outputs = ["affine"] + if self.config["include-log-softmax"]: + auxiliary_outputs.append("log-softmax") return auxiliary_outputs def output_name(self, auxiliary_output=None): - if auxiliary_output is None: # Note: nodes of type output-node in nnet3 may not be accessed in # Descriptors, so calling this with auxiliary_outputs=None doesn't # make sense. - raise RuntimeError("Outputs of output-layer may not be used by other" - " layers") + raise RuntimeError( + "Outputs of output-layer may not be used by other" " layers" + ) if auxiliary_output in self.auxiliary_outputs(): - return '{0}.{1}'.format(self.name, auxiliary_output) + return "{0}.{1}".format(self.name, auxiliary_output) else: - raise RuntimeError("Unknown auxiliary output name {0}" - "".format(auxiliary_output)) + raise RuntimeError( + "Unknown auxiliary output name {0}" "".format(auxiliary_output) + ) def output_dim(self, auxiliary_output=None): - if auxiliary_output is None: # Note: nodes of type output-node in nnet3 may not be accessed in # Descriptors, so calling this with auxiliary_outputs=None doesn't # make sense. - raise RuntimeError("Outputs of output-layer may not be used by other" - " layers") - return self.config['dim'] + raise RuntimeError( + "Outputs of output-layer may not be used by other" " layers" + ) + return self.config["dim"] def get_full_config(self): ans = [] config_lines = self._generate_config() for line in config_lines: - for config_name in ['ref', 'final']: + for config_name in ["ref", "final"]: # we do not support user specified matrices in LSTM initialization # so 'ref' and 'final' configs are the same. ans.append((config_name, line)) return ans - def _generate_config(self): - configs = [] # note: each value of self.descriptors is (descriptor, dim, # normalized-string, output-string). # by 'descriptor_final_string' we mean a string that can appear in # config-files, i.e. it contains the 'final' names of nodes. 
- descriptor_final_string = self.descriptors['input']['final-string'] - input_dim = self.descriptors['input']['dim'] - output_dim = self.config['dim'] - bottleneck_dim = self.config['bottleneck-dim'] - objective_type = self.config['objective-type'] - include_log_softmax = self.config['include-log-softmax'] - output_delay = self.config['output-delay'] - - affine_options = self.config['ng-affine-options'] - for opt in [ 'learning-rate-factor', 'l2-regularize', 'max-change', - 'param-stddev', 'bias-stddev' ]: - if self.config[opt] != '': - affine_options += ' {0}={1}'.format(opt, self.config[opt]) + descriptor_final_string = self.descriptors["input"]["final-string"] + input_dim = self.descriptors["input"]["dim"] + output_dim = self.config["dim"] + bottleneck_dim = self.config["bottleneck-dim"] + objective_type = self.config["objective-type"] + include_log_softmax = self.config["include-log-softmax"] + output_delay = self.config["output-delay"] + + affine_options = self.config["ng-affine-options"] + for opt in [ + "learning-rate-factor", + "l2-regularize", + "max-change", + "param-stddev", + "bias-stddev", + ]: + if self.config[opt] != "": + affine_options += " {0}={1}".format(opt, self.config[opt]) cur_node = descriptor_final_string cur_dim = input_dim if bottleneck_dim >= 0: - if bottleneck_dim == 0 or bottleneck_dim >= input_dim or bottleneck_dim >= output_dim: - raise RuntimeError("Bottleneck dim has value that does not make sense: {0}".format( - bottleneck_dim)) + if ( + bottleneck_dim == 0 + or bottleneck_dim >= input_dim + or bottleneck_dim >= output_dim + ): + raise RuntimeError( + "Bottleneck dim has value that does not make sense: {0}".format( + bottleneck_dim + ) + ) # This is the bottleneck case (it doesn't necessarily imply we # will be using the features from the bottleneck; it's just a factorization # of the matrix into two pieces without a nonlinearity in between). # We don't include the l2-regularize option because it's useless # given the orthonormality constraint. - linear_options = self.config['ng-linear-options'] - for opt in [ 'learning-rate-factor', 'l2-regularize', 'max-change' ]: - if self.config[opt] != '': - linear_options += ' {0}={1}'.format(opt, self.config[opt]) - + linear_options = self.config["ng-linear-options"] + for opt in ["learning-rate-factor", "l2-regularize", "max-change"]: + if self.config[opt] != "": + linear_options += " {0}={1}".format(opt, self.config[opt]) # note: by default the LinearComponent uses natural gradient. 
- line = ('component name={0}.linear type=LinearComponent ' - 'orthonormal-constraint={1} param-stddev={2} ' - 'input-dim={3} output-dim={4} max-change=0.75 {5}' - ''.format(self.name, self.config['orthonormal-constraint'], - self.config['orthonormal-constraint'] / math.sqrt(input_dim), - input_dim, bottleneck_dim, linear_options)) + line = ( + "component name={0}.linear type=LinearComponent " + "orthonormal-constraint={1} param-stddev={2} " + "input-dim={3} output-dim={4} max-change=0.75 {5}" + "".format( + self.name, + self.config["orthonormal-constraint"], + self.config["orthonormal-constraint"] / math.sqrt(input_dim), + input_dim, + bottleneck_dim, + linear_options, + ) + ) configs.append(line) - line = ('component-node name={0}.linear component={0}.linear input={1}' - ''.format(self.name, cur_node)) + line = ( + "component-node name={0}.linear component={0}.linear input={1}" + "".format(self.name, cur_node) + ) configs.append(line) - cur_node = '{0}.linear'.format(self.name) + cur_node = "{0}.linear".format(self.name) cur_dim = bottleneck_dim - - line = ('component name={0}.affine' - ' type=NaturalGradientAffineComponent' - ' input-dim={1} output-dim={2} {3}' - ''.format(self.name, cur_dim, output_dim, affine_options)) + line = ( + "component name={0}.affine" + " type=NaturalGradientAffineComponent" + " input-dim={1} output-dim={2} {3}" + "".format(self.name, cur_dim, output_dim, affine_options) + ) configs.append(line) - line = ('component-node name={0}.affine' - ' component={0}.affine input={1}' - ''.format(self.name, cur_node)) + line = ( + "component-node name={0}.affine" + " component={0}.affine input={1}" + "".format(self.name, cur_node) + ) configs.append(line) - cur_node = '{0}.affine'.format(self.name) + cur_node = "{0}.affine".format(self.name) if include_log_softmax: - line = ('component name={0}.log-softmax' - ' type=LogSoftmaxComponent dim={1}' - ''.format(self.name, output_dim)) + line = ( + "component name={0}.log-softmax" + " type=LogSoftmaxComponent dim={1}" + "".format(self.name, output_dim) + ) configs.append(line) - line = ('component-node name={0}.log-softmax' - ' component={0}.log-softmax input={1}' - ''.format(self.name, cur_node)) + line = ( + "component-node name={0}.log-softmax" + " component={0}.log-softmax input={1}" + "".format(self.name, cur_node) + ) configs.append(line) - cur_node = '{0}.log-softmax'.format(self.name) + cur_node = "{0}.log-softmax".format(self.name) if output_delay != 0: - cur_node = 'Offset({0}, {1})'.format(cur_node, output_delay) + cur_node = "Offset({0}, {1})".format(cur_node, output_delay) - line = ('output-node name={0} input={1} ' - 'objective={2}'.format( - self.name, cur_node, objective_type)) + line = "output-node name={0} input={1} " "objective={2}".format( + self.name, cur_node, objective_type + ) configs.append(line) return configs @@ -703,73 +751,87 @@ class XconfigBasicLayer(XconfigLayerBase): add l2 regularization on the parameter norm for this component. """ + def __init__(self, first_token, key_to_value, prev_names=None): XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): - # note: self.config['input'] is a descriptor, '[-1]' means output # the most recent layer. - self.config = {'input': '[-1]', - 'dim': -1, - 'bottleneck-dim': -1, # Deprecated! Use tdnnf-layer for - # factorized TDNNs, or prefinal-layer - # for bottlenecks just before the output. 
- 'self-repair-scale': 1.0e-05, - 'target-rms': 1.0, - 'ng-affine-options': '', - 'ng-linear-options': '', # only affects bottleneck layers. - 'dropout-proportion': 0.5, # dropout-proportion only - # affects layers with - # 'dropout' in the name - 'dropout-per-dim': False, # if dropout-per-dim=true, the dropout - # mask is shared across time. - 'dropout-per-dim-continuous': False, # if you set this, it's - # like dropout-per-dim but with a - # continuous-valued (not zero-one) mask. - 'add-log-stddev': False, - # the following are not really inspected by this level of - # code, just passed through to the affine component if - # their value is not ''. - 'bias-stddev': '', - 'l2-regularize': '', - 'learning-rate-factor': '', - 'max-change': 0.75 } + self.config = { + "input": "[-1]", + "dim": -1, + "bottleneck-dim": -1, # Deprecated! Use tdnnf-layer for + # factorized TDNNs, or prefinal-layer + # for bottlenecks just before the output. + "self-repair-scale": 1.0e-05, + "target-rms": 1.0, + "ng-affine-options": "", + "ng-linear-options": "", # only affects bottleneck layers. + "dropout-proportion": 0.5, # dropout-proportion only + # affects layers with + # 'dropout' in the name + "dropout-per-dim": False, # if dropout-per-dim=true, the dropout + # mask is shared across time. + "dropout-per-dim-continuous": False, # if you set this, it's + # like dropout-per-dim but with a + # continuous-valued (not zero-one) mask. + "add-log-stddev": False, + # the following are not really inspected by this level of + # code, just passed through to the affine component if + # their value is not ''. + "bias-stddev": "", + "l2-regularize": "", + "learning-rate-factor": "", + "max-change": 0.75, + } def check_configs(self): - if self.config['dim'] < 0: - raise RuntimeError("dim has invalid value {0}".format(self.config['dim'])) - b = self.config['bottleneck-dim'] - if b >= 0 and (b >= self.config['dim'] or b == 0): + if self.config["dim"] < 0: + raise RuntimeError("dim has invalid value {0}".format(self.config["dim"])) + b = self.config["bottleneck-dim"] + if b >= 0 and (b >= self.config["dim"] or b == 0): raise RuntimeError("bottleneck-dim has an invalid value {0}".format(b)) - if self.config['self-repair-scale'] < 0.0 or self.config['self-repair-scale'] > 1.0: - raise RuntimeError("self-repair-scale has invalid value {0}" - .format(self.config['self-repair-scale'])) - if self.config['target-rms'] < 0.0: - raise RuntimeError("target-rms has invalid value {0}" - .format(self.config['target-rms'])) - if (self.config['learning-rate-factor'] != '' and - self.config['learning-rate-factor'] <= 0.0): - raise RuntimeError("learning-rate-factor has invalid value {0}" - .format(self.config['learning-rate-factor'])) + if ( + self.config["self-repair-scale"] < 0.0 + or self.config["self-repair-scale"] > 1.0 + ): + raise RuntimeError( + "self-repair-scale has invalid value {0}".format( + self.config["self-repair-scale"] + ) + ) + if self.config["target-rms"] < 0.0: + raise RuntimeError( + "target-rms has invalid value {0}".format(self.config["target-rms"]) + ) + if ( + self.config["learning-rate-factor"] != "" + and self.config["learning-rate-factor"] <= 0.0 + ): + raise RuntimeError( + "learning-rate-factor has invalid value {0}".format( + self.config["learning-rate-factor"] + ) + ) def output_name(self, auxiliary_output=None): # at a later stage we might want to expose even the pre-nonlinearity # vectors assert auxiliary_output is None - split_layer_name = self.layer_type.split('-') - assert split_layer_name[-1] == 'layer' 
+ split_layer_name = self.layer_type.split("-") + assert split_layer_name[-1] == "layer" last_nonlinearity = split_layer_name[-2] # return something like: layer3.renorm - return '{0}.{1}'.format(self.name, last_nonlinearity) + return "{0}.{1}".format(self.name, last_nonlinearity) def output_dim(self, auxiliary_output=None): - output_dim = self.config['dim'] + output_dim = self.config["dim"] # If not set, the output-dim defaults to the input-dim. if output_dim <= 0: - self.config['dim'] = self.descriptors['input']['dim'] + self.config["dim"] = self.descriptors["input"]["dim"] return output_dim @@ -778,21 +840,21 @@ def get_full_config(self): config_lines = self._generate_config() for line in config_lines: - for config_name in ['ref', 'final']: + for config_name in ["ref", "final"]: # we do not support user specified matrices in this layer # so 'ref' and 'final' configs are the same. ans.append((config_name, line)) return ans def _generate_config(self): - split_layer_name = self.layer_type.split('-') - assert split_layer_name[-1] == 'layer' + split_layer_name = self.layer_type.split("-") + assert split_layer_name[-1] == "layer" nonlinearities = split_layer_name[:-1] # by 'descriptor_final_string' we mean a string that can appear in # config-files, i.e. it contains the 'final' names of nodes. - input_desc = self.descriptors['input']['final-string'] - input_dim = self.descriptors['input']['dim'] + input_desc = self.descriptors["input"]["final-string"] + input_dim = self.descriptors["input"]["dim"] # the child classes e.g. tdnn might want to process the input # before adding the other components @@ -801,31 +863,37 @@ def _generate_config(self): def _add_components(self, input_desc, input_dim, nonlinearities): output_dim = self.output_dim() - self_repair_scale = self.config['self-repair-scale'] - target_rms = self.config['target-rms'] - - affine_options = self.config['ng-affine-options'] - for opt_name in [ 'max-change', 'learning-rate-factor', - 'bias-stddev', 'l2-regularize' ]: + self_repair_scale = self.config["self-repair-scale"] + target_rms = self.config["target-rms"] + + affine_options = self.config["ng-affine-options"] + for opt_name in [ + "max-change", + "learning-rate-factor", + "bias-stddev", + "l2-regularize", + ]: value = self.config[opt_name] - if value != '': - affine_options += ' {0}={1}'.format(opt_name, value) + if value != "": + affine_options += " {0}={1}".format(opt_name, value) # The output of the affine component needs to have one dimension fewer in order to # get the required output dim, if the final 'renorm' component has 'add-log-stddev' set # (since in that case it increases the dimension by one). - if self.config['add-log-stddev']: + if self.config["add-log-stddev"]: output_dim -= 1 - if not self.layer_type.split('-')[-2] == "renorm": - raise RuntimeError("add-log-stddev cannot be true unless " - "there is a final 'renorm' component.") + if not self.layer_type.split("-")[-2] == "renorm": + raise RuntimeError( + "add-log-stddev cannot be true unless " + "there is a final 'renorm' component." + ) configs = [] cur_dim = input_dim cur_node = input_desc # First the affine node (or linear then affine, if bottleneck). - if self.config['bottleneck-dim'] > 0: + if self.config["bottleneck-dim"] > 0: # The 'bottleneck-dim' option is deprecated and may eventually be # removed. Best to use tdnnf-layer if you want factorized TDNNs. 
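
The bottleneck path referenced above factorizes one large affine matrix into a linear piece (kept approximately orthonormal) followed by an affine piece, with no nonlinearity in between. A minimal sketch of the parameter arithmetic this buys, assuming purely hypothetical dimensions (not values taken from this patch):

```python
# Parameter count of one affine matrix vs. the linear+affine factorization
# described in the comments above. Dimensions are hypothetical.
input_dim, output_dim, bottleneck_dim = 1536, 1536, 160

full_affine = input_dim * output_dim
factorized = input_dim * bottleneck_dim + bottleneck_dim * output_dim

print(full_affine)  # 2359296
print(factorized)   # 491520, roughly a 4.8x reduction
```

The same factorization idea, plus a bypass connection, is what `tdnnf-layer` in `composite_layers.py` further down this patch packages up, which is why `bottleneck-dim` is marked deprecated here.
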
@@ -834,103 +902,139 @@ def _add_components(self, input_desc, input_dim, nonlinearities): # of the matrix into two pieces without a nonlinearity in between). # We don't include the l2-regularize option because it's useless # given the orthonormality constraint. - linear_options = self.config['ng-linear-options'] - for opt_name in [ 'max-change', 'learning-rate-factor' ]: + linear_options = self.config["ng-linear-options"] + for opt_name in ["max-change", "learning-rate-factor"]: value = self.config[opt_name] - if value != '': - linear_options += ' {0}={1}'.format(opt_name, value) + if value != "": + linear_options += " {0}={1}".format(opt_name, value) - bottleneck_dim = self.config['bottleneck-dim'] + bottleneck_dim = self.config["bottleneck-dim"] # note: by default the LinearComponent uses natural gradient. - line = ('component name={0}.linear type=LinearComponent ' - 'input-dim={1} orthonormal-constraint=1.0 output-dim={2} {3}' - ''.format(self.name, input_dim, bottleneck_dim, linear_options)) + line = ( + "component name={0}.linear type=LinearComponent " + "input-dim={1} orthonormal-constraint=1.0 output-dim={2} {3}" + "".format(self.name, input_dim, bottleneck_dim, linear_options) + ) configs.append(line) - line = ('component-node name={0}.linear component={0}.linear input={1}' - ''.format(self.name, cur_node)) + line = ( + "component-node name={0}.linear component={0}.linear input={1}" + "".format(self.name, cur_node) + ) configs.append(line) - cur_node = '{0}.linear'.format(self.name) + cur_node = "{0}.linear".format(self.name) cur_dim = bottleneck_dim - - line = ('component name={0}.affine type=NaturalGradientAffineComponent' - ' input-dim={1} output-dim={2} {3}' - ''.format(self.name, cur_dim, output_dim, affine_options)) + line = ( + "component name={0}.affine type=NaturalGradientAffineComponent" + " input-dim={1} output-dim={2} {3}" + "".format(self.name, cur_dim, output_dim, affine_options) + ) configs.append(line) - line = ('component-node name={0}.affine component={0}.affine input={1}' - ''.format(self.name, cur_node)) + line = ( + "component-node name={0}.affine component={0}.affine input={1}" + "".format(self.name, cur_node) + ) configs.append(line) - cur_node = '{0}.affine'.format(self.name) + cur_node = "{0}.affine".format(self.name) for i, nonlinearity in enumerate(nonlinearities): - if nonlinearity == 'relu': - line = ('component name={0}.{1} type=RectifiedLinearComponent dim={2}' - ' self-repair-scale={3}' - ''.format(self.name, nonlinearity, output_dim, - self_repair_scale)) - - elif nonlinearity == 'sigmoid': - line = ('component name={0}.{1}' - ' type=SigmoidComponent dim={2}' - ' self-repair-scale={3}' - ''.format(self.name, nonlinearity, output_dim, - self_repair_scale)) - - elif nonlinearity == 'tanh': - line = ('component name={0}.{1}' - ' type=TanhComponent dim={2}' - ' self-repair-scale={3}' - ''.format(self.name, nonlinearity, output_dim, - self_repair_scale)) - - elif nonlinearity == 'renorm': + if nonlinearity == "relu": + line = ( + "component name={0}.{1} type=RectifiedLinearComponent dim={2}" + " self-repair-scale={3}" + "".format(self.name, nonlinearity, output_dim, self_repair_scale) + ) + + elif nonlinearity == "sigmoid": + line = ( + "component name={0}.{1}" + " type=SigmoidComponent dim={2}" + " self-repair-scale={3}" + "".format(self.name, nonlinearity, output_dim, self_repair_scale) + ) + + elif nonlinearity == "tanh": + line = ( + "component name={0}.{1}" + " type=TanhComponent dim={2}" + " self-repair-scale={3}" + "".format(self.name, 
nonlinearity, output_dim, self_repair_scale) + ) + + elif nonlinearity == "renorm": add_log_stddev = "false" if i == len(nonlinearities) - 1: - add_log_stddev = ("true" if self.config['add-log-stddev'] - else "false") - line = ('component name={0}.{1}' - ' type=NormalizeComponent dim={2}' - ' target-rms={3}' - ' add-log-stddev={4}' - ''.format(self.name, nonlinearity, output_dim, - target_rms, add_log_stddev)) - - elif nonlinearity == 'batchnorm': - line = ('component name={0}.{1}' - ' type=BatchNormComponent dim={2} target-rms={3}' - ''.format(self.name, nonlinearity, output_dim, - target_rms)) - - elif nonlinearity == 'so': - line = ('component name={0}.{1}' - ' type=ScaleAndOffsetComponent dim={2} max-change=0.5 ' - ''.format(self.name, nonlinearity, output_dim)) - - elif nonlinearity == 'dropout': - if not (self.config['dropout-per-dim'] or - self.config['dropout-per-dim-continuous']): - line = ('component name={0}.{1} type=DropoutComponent ' - 'dim={2} dropout-proportion={3}'.format( - self.name, nonlinearity, output_dim, - self.config['dropout-proportion'])) + add_log_stddev = ( + "true" if self.config["add-log-stddev"] else "false" + ) + line = ( + "component name={0}.{1}" + " type=NormalizeComponent dim={2}" + " target-rms={3}" + " add-log-stddev={4}" + "".format( + self.name, nonlinearity, output_dim, target_rms, add_log_stddev + ) + ) + + elif nonlinearity == "batchnorm": + line = ( + "component name={0}.{1}" + " type=BatchNormComponent dim={2} target-rms={3}" + "".format(self.name, nonlinearity, output_dim, target_rms) + ) + + elif nonlinearity == "so": + line = ( + "component name={0}.{1}" + " type=ScaleAndOffsetComponent dim={2} max-change=0.5 " + "".format(self.name, nonlinearity, output_dim) + ) + + elif nonlinearity == "dropout": + if not ( + self.config["dropout-per-dim"] + or self.config["dropout-per-dim-continuous"] + ): + line = ( + "component name={0}.{1} type=DropoutComponent " + "dim={2} dropout-proportion={3}".format( + self.name, + nonlinearity, + output_dim, + self.config["dropout-proportion"], + ) + ) else: - continuous_opt='continuous=true' if self.config['dropout-per-dim-continuous'] else '' - - line = ('component name={0}.dropout type=GeneralDropoutComponent ' - 'dim={1} dropout-proportion={2} {3}'.format( - self.name, output_dim, self.config['dropout-proportion'], - continuous_opt)) + continuous_opt = ( + "continuous=true" + if self.config["dropout-per-dim-continuous"] + else "" + ) + + line = ( + "component name={0}.dropout type=GeneralDropoutComponent " + "dim={1} dropout-proportion={2} {3}".format( + self.name, + output_dim, + self.config["dropout-proportion"], + continuous_opt, + ) + ) else: - raise RuntimeError("Unknown nonlinearity type: {0}" - .format(nonlinearity)) + raise RuntimeError( + "Unknown nonlinearity type: {0}".format(nonlinearity) + ) configs.append(line) - line = ('component-node name={0}.{1}' - ' component={0}.{1} input={2}' - ''.format(self.name, nonlinearity, cur_node)) + line = ( + "component-node name={0}.{1}" + " component={0}.{1} input={2}" + "".format(self.name, nonlinearity, cur_node) + ) configs.append(line) - cur_node = '{0}.{1}'.format(self.name, nonlinearity) + cur_node = "{0}.{1}".format(self.name, nonlinearity) return configs @@ -952,21 +1056,24 @@ class XconfigFixedAffineLayer(XconfigLayerBase): affine-transform-file='' [Must be specified.] 
        delay=0 [Optional delay for the output-node in init.config]
    """
+
    def __init__(self, first_token, key_to_value, prev_names=None):
-        assert first_token == 'fixed-affine-layer'
+        assert first_token == "fixed-affine-layer"
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
        # note: self.config['input'] is a descriptor, '[-1]' means output
        # the most recent layer.
-        self.config = {'input': '[-1]',
-                       'dim': -1,
-                       'affine-transform-file': '',
-                       'delay': 0,
-                       'write-init-config': True}
+        self.config = {
+            "input": "[-1]",
+            "dim": -1,
+            "affine-transform-file": "",
+            "delay": 0,
+            "write-init-config": True,
+        }

    def check_configs(self):
-        if self.config['affine-transform-file'] is None:
+        # note: the default above is the empty string, so compare with '';
+        # an 'is None' test can never be true here and the check would never fire.
+        if self.config["affine-transform-file"] == "":
            raise RuntimeError("affine-transform-file must be set.")

    def output_name(self, auxiliary_output=None):
@@ -976,10 +1083,10 @@ def output_name(self, auxiliary_output=None):
        return self.name

    def output_dim(self, auxiliary_output=None):
-        output_dim = self.config['dim']
+        output_dim = self.config["dim"]
        # If not set, the output-dim defaults to the input-dim.
        if output_dim <= 0:
-            output_dim = self.descriptors['input']['dim']
+            output_dim = self.descriptors["input"]["dim"]
        return output_dim

    def get_full_config(self):
@@ -989,39 +1096,50 @@ def get_full_config(self):
        # normalized-string, output-string).
        # by 'descriptor_final_string' we mean a string that can appear in
        # config-files, i.e. it contains the 'final' names of nodes.
-        descriptor_final_string = self.descriptors['input']['final-string']
-        input_dim = self.descriptors['input']['dim']
+        descriptor_final_string = self.descriptors["input"]["final-string"]
+        input_dim = self.descriptors["input"]["dim"]
        output_dim = self.output_dim()
-        transform_file = self.config['affine-transform-file']
-
-        if self.config['write-init-config']:
-            if self.config['delay'] != 0:
-                line = 'component name={0}.delayed type=NoOpComponent dim={1}'.format(self.name, input_dim)
-                ans.append(('init', line))
-                line = 'component-node name={0}.delayed component={0}.delayed input={1}'.format(self.name, descriptor_final_string)
-                ans.append(('init', line))
-                line = 'output-node name=output input=Offset({0}.delayed, {1})'.format(self.name, self.config['delay'])
-                ans.append(('init', line))
+        transform_file = self.config["affine-transform-file"]
+
+        if self.config["write-init-config"]:
+            if self.config["delay"] != 0:
+                line = "component name={0}.delayed type=NoOpComponent dim={1}".format(
+                    self.name, input_dim
+                )
+                ans.append(("init", line))
+                line = "component-node name={0}.delayed component={0}.delayed input={1}".format(
+                    self.name, descriptor_final_string
+                )
+                ans.append(("init", line))
+                line = "output-node name=output input=Offset({0}.delayed, {1})".format(
+                    self.name, self.config["delay"]
+                )
+                ans.append(("init", line))
            else:
                # to init.config we write an output-node with the name 'output' and
                # with a Descriptor equal to the descriptor that's the input to this
                # layer. This will be used to accumulate stats to learn the LDA transform.
- line = 'output-node name=output input={0}'.format(descriptor_final_string) - ans.append(('init', line)) + line = "output-node name=output input={0}".format( + descriptor_final_string + ) + ans.append(("init", line)) # write the 'real' component to final.config - line = 'component name={0} type=FixedAffineComponent matrix={1}'.format( - self.name, transform_file) - ans.append(('final', line)) + line = "component name={0} type=FixedAffineComponent matrix={1}".format( + self.name, transform_file + ) + ans.append(("final", line)) # write a random version of the component, with the same dims, to ref.config - line = 'component name={0} type=FixedAffineComponent input-dim={1} output-dim={2}'.format( - self.name, input_dim, output_dim) - ans.append(('ref', line)) + line = "component name={0} type=FixedAffineComponent input-dim={1} output-dim={2}".format( + self.name, input_dim, output_dim + ) + ans.append(("ref", line)) # the component-node gets written to final.config and ref.config. - line = 'component-node name={0} component={0} input={1}'.format( - self.name, descriptor_final_string) - ans.append(('final', line)) - ans.append(('ref', line)) + line = "component-node name={0} component={0} input={1}".format( + self.name, descriptor_final_string + ) + ans.append(("final", line)) + ans.append(("ref", line)) return ans @@ -1047,7 +1165,7 @@ class XconfigAffineLayer(XconfigLayerBase): """ def __init__(self, first_token, key_to_value, prev_names=None): - assert first_token == 'affine-layer' + assert first_token == "affine-layer" XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): @@ -1057,23 +1175,27 @@ def set_default_configs(self): # C++ component provides more options but I will just expose these for now # Note : The type of the parameter is determined based on the value assigned # so please use decimal point if your parameter is a float - self.config = {'input': '[-1]', - 'dim': -1, - 'param-stddev': -1.0, # this has to be initialized to 1/sqrt(input_dim) - 'bias-stddev': 1.0, - 'bias-mean': 0.0, - 'max-change': 0.75, - 'l2-regularize': 0.0, - 'learning-rate-factor': 1.0, - 'ng-affine-options': ''} + self.config = { + "input": "[-1]", + "dim": -1, + "param-stddev": -1.0, # this has to be initialized to 1/sqrt(input_dim) + "bias-stddev": 1.0, + "bias-mean": 0.0, + "max-change": 0.75, + "l2-regularize": 0.0, + "learning-rate-factor": 1.0, + "ng-affine-options": "", + } def set_derived_configs(self): super(XconfigAffineLayer, self).set_derived_configs() - if self.config['param-stddev'] < 0: - self.config['param-stddev'] = 1.0 / math.sqrt(self.descriptors['input']['dim']) + if self.config["param-stddev"] < 0: + self.config["param-stddev"] = 1.0 / math.sqrt( + self.descriptors["input"]["dim"] + ) def check_configs(self): - if self.config['dim'] <= 0: + if self.config["dim"] <= 0: raise RuntimeError("dim specified is invalid") def output_name(self, auxiliary_output=None): @@ -1083,10 +1205,10 @@ def output_name(self, auxiliary_output=None): return self.name def output_dim(self, auxiliary_output=None): - output_dim = self.config['dim'] + output_dim = self.config["dim"] # If not set, the output-dim defaults to the input-dim. if output_dim <= 0: - output_dim = self.descriptors['input']['dim'] + output_dim = self.descriptors["input"]["dim"] return output_dim @@ -1097,29 +1219,38 @@ def get_full_config(self): # normalized-string, output-string). # by 'descriptor_final_string' we mean a string that can appear in # config-files, i.e. 
it contains the 'final' names of nodes. - descriptor_final_string = self.descriptors['input']['final-string'] - input_dim = self.descriptors['input']['dim'] + descriptor_final_string = self.descriptors["input"]["final-string"] + input_dim = self.descriptors["input"]["dim"] output_dim = self.output_dim() - option_string = '' - for key in ['param-stddev', 'bias-stddev', 'bias-mean', 'max-change', - 'l2-regularize']: - option_string += ' {0}={1}'.format(key, self.config[key]) - option_string += self.config['ng-affine-options'] + option_string = "" + for key in [ + "param-stddev", + "bias-stddev", + "bias-mean", + "max-change", + "l2-regularize", + ]: + option_string += " {0}={1}".format(key, self.config[key]) + option_string += self.config["ng-affine-options"] conf_lines = [] # write the 'real' component to final.config - conf_lines.append('component name={n} type=NaturalGradientAffineComponent ' - 'input-dim={i} output-dim={o} {opts}'.format(n=self.name, - i=input_dim, - o=output_dim, - opts=option_string)) + conf_lines.append( + "component name={n} type=NaturalGradientAffineComponent " + "input-dim={i} output-dim={o} {opts}".format( + n=self.name, i=input_dim, o=output_dim, opts=option_string + ) + ) # the component-node gets written to final.config and ref.config. - conf_lines.append('component-node name={0} component={0} input={1}'.format(self.name, - descriptor_final_string)) + conf_lines.append( + "component-node name={0} component={0} input={1}".format( + self.name, descriptor_final_string + ) + ) # the config is same for both final and ref configs - for conf_name in ['final', 'ref']: + for conf_name in ["final", "ref"]: for line in conf_lines: ans.append((conf_name, line)) return ans @@ -1147,21 +1278,24 @@ class XconfigIdctLayer(XconfigLayerBase): `fixed-affine-layer` that is to be initialized via LDA] """ + def __init__(self, first_token, key_to_value, prev_names=None): - assert first_token == 'idct-layer' + assert first_token == "idct-layer" XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): # note: self.config['input'] is a descriptor, '[-1]' means output # the most recent layer. - self.config = {'input': '[-1]', - 'dim': -1, - 'cepstral-lifter': 22.0, - 'affine-transform-file': '', - 'include-in-init': False} + self.config = { + "input": "[-1]", + "dim": -1, + "cepstral-lifter": 22.0, + "affine-transform-file": "", + "include-in-init": False, + } def check_configs(self): - if self.config['affine-transform-file'] is None: + if self.config["affine-transform-file"] is None: raise RuntimeError("affine-transform-file must be set.") def output_name(self, auxiliary_output=None): @@ -1171,38 +1305,37 @@ def output_name(self, auxiliary_output=None): return self.name def output_dim(self, auxiliary_output=None): - output_dim = self.config['dim'] + output_dim = self.config["dim"] # If not set, the output-dim defaults to the input-dim. if output_dim <= 0: - output_dim = self.descriptors['input']['dim'] + output_dim = self.descriptors["input"]["dim"] return output_dim def get_full_config(self): ans = [] config_lines = self._generate_config() for line in config_lines: - for config_name in ['ref', 'final']: + for config_name in ["ref", "final"]: # we do not support user specified matrices in this layer # so 'ref' and 'final' configs are the same. 
ans.append((config_name, line)) - if self.config['include-in-init']: - ans.append(('init', line)) + if self.config["include-in-init"]: + ans.append(("init", line)) return ans - def _generate_config(self): - # note: each value of self.descriptors is (descriptor, dim, # normalized-string, output-string). # by 'descriptor_final_string' we mean a string that can appear in # config-files, i.e. it contains the 'final' names of nodes. - descriptor_final_string = self.descriptors['input']['final-string'] - input_dim = self.descriptors['input']['dim'] + descriptor_final_string = self.descriptors["input"]["final-string"] + input_dim = self.descriptors["input"]["dim"] output_dim = self.output_dim() - transform_file = self.config['affine-transform-file'] + transform_file = self.config["affine-transform-file"] idct_mat = common_lib.compute_idct_matrix( - input_dim, output_dim, self.config['cepstral-lifter']) + input_dim, output_dim, self.config["cepstral-lifter"] + ) # append a zero column to the matrix, this is the bias of the fixed # affine component for n in range(0, output_dim): @@ -1212,11 +1345,13 @@ def _generate_config(self): configs = [] # write the 'real' component to final.config - line = 'component name={0} type=FixedAffineComponent matrix={1}'.format( - self.name, transform_file) + line = "component name={0} type=FixedAffineComponent matrix={1}".format( + self.name, transform_file + ) configs.append(line) - line = 'component-node name={0} component={0} input={1}'.format( - self.name, descriptor_final_string) + line = "component-node name={0} component={0} input={1}".format( + self.name, descriptor_final_string + ) configs.append(line) return configs @@ -1241,18 +1376,18 @@ class XconfigExistingLayer(XconfigLayerBase): """ def __init__(self, first_token, key_to_value, prev_names=None): - - assert first_token == 'existing' + assert first_token == "existing" XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) - def set_default_configs(self): - self.config = { 'dim': -1} + self.config = {"dim": -1} def check_configs(self): - if self.config['dim'] <= 0: - raise RuntimeError("Dimension of existing-layer '{0}'" - "should be positive.".format(self.name)) + if self.config["dim"] <= 0: + raise RuntimeError( + "Dimension of existing-layer '{0}'" + "should be positive.".format(self.name) + ) def get_input_descriptor_names(self): return [] # there is no 'input' field in self.config. 
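
The `existing` layer handled above names a node that is already present in a previously trained model: it carries only a positive `dim`, declares no `input` descriptor, and is never printed into the generated configs. A minimal plain-Python sketch of that contract (hypothetical node name and dim, not the real xconfig parser):

```python
# Sketch of the contract enforced by XconfigExistingLayer:
# a positive dim and no 'input' field.
def check_existing_layer(name, config):
    if config.get("dim", -1) <= 0:
        raise RuntimeError(
            "Dimension of existing-layer '{0}' should be positive.".format(name)
        )
    assert "input" not in config  # there is no 'input' field in self.config


check_existing_layer("tdnn5.batchnorm", {"dim": 512})  # hypothetical values
```
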
@@ -1265,7 +1400,7 @@ def output_name(self, auxiliary_outputs=None): def output_dim(self, auxiliary_outputs=None): # there are no auxiliary outputs as this layer will just pass the input assert auxiliary_outputs is None - return self.config['dim'] + return self.config["dim"] def get_full_config(self): # unlike other layers the existing layers should not to be printed in @@ -1294,30 +1429,35 @@ class XconfigSpecAugmentLayer(XconfigLayerBase): `fixed-affine-layer` that is to be initialized via LDA] """ + def __init__(self, first_token, key_to_value, prev_names=None): XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): - self.config = {'input': '[-1]', - 'freq-max-proportion': 0.5, - 'time-zeroed-proportion': 0.2, - 'time-mask-max-frames': 20, - 'include-in-init': False} - + self.config = { + "input": "[-1]", + "freq-max-proportion": 0.5, + "time-zeroed-proportion": 0.2, + "time-mask-max-frames": 20, + "include-in-init": False, + } def check_configs(self): - assert (self.config['freq-max-proportion'] > 0.0 and self.config['freq-max-proportion'] < 1.0 - and self.config['time-zeroed-proportion'] > 0.0 and self.config['time-zeroed-proportion'] < 1.0 - and self.config['time-mask-max-frames'] >= 1) - + assert ( + self.config["freq-max-proportion"] > 0.0 + and self.config["freq-max-proportion"] < 1.0 + and self.config["time-zeroed-proportion"] > 0.0 + and self.config["time-zeroed-proportion"] < 1.0 + and self.config["time-mask-max-frames"] >= 1 + ) def output_name(self, auxiliary_output=None): assert auxiliary_output is None - return '{0}.time-mask'.format(self.name) + return "{0}.time-mask".format(self.name) def output_dim(self, auxiliary_output=None): assert auxiliary_output is None - input_dim = self.descriptors['input']['dim'] + input_dim = self.descriptors["input"]["dim"] return input_dim def get_full_config(self): @@ -1325,36 +1465,42 @@ def get_full_config(self): config_lines = self._generate_config() for line in config_lines: - for config_name in ['ref', 'final']: + for config_name in ["ref", "final"]: # we do not support user specified matrices in this layer # so 'ref' and 'final' configs are the same. ans.append((config_name, line)) - if self.config['include-in-init']: - ans.append(('init', line)) + if self.config["include-in-init"]: + ans.append(("init", line)) return ans def _generate_config(self): # by 'descriptor_final_string' we mean a string that can appear in # config-files, i.e. it contains the 'final' names of nodes. 
- input_desc = self.descriptors['input']['final-string'] - input_dim = self.descriptors['input']['dim'] - freq_max_proportion = self.config['freq-max-proportion'] - time_zeroed_proportion = self.config['time-zeroed-proportion'] - time_mask_max_frames = self.config['time-mask-max-frames'] + input_desc = self.descriptors["input"]["final-string"] + input_dim = self.descriptors["input"]["dim"] + freq_max_proportion = self.config["freq-max-proportion"] + time_zeroed_proportion = self.config["time-zeroed-proportion"] + time_mask_max_frames = self.config["time-mask-max-frames"] configs = [] - line = ('component name={0}.freq-mask type=GeneralDropoutComponent dim={1} specaugment-max-proportion={2}'.format( - self.name, input_dim, freq_max_proportion)) + line = "component name={0}.freq-mask type=GeneralDropoutComponent dim={1} specaugment-max-proportion={2}".format( + self.name, input_dim, freq_max_proportion + ) configs.append(line) - line = ('component-node name={0}.freq-mask component={0}.freq-mask input={1}'.format( - self.name, input_desc)) + line = "component-node name={0}.freq-mask component={0}.freq-mask input={1}".format( + self.name, input_desc + ) configs.append(line) - line = ('component name={0}.time-mask type=SpecAugmentTimeMaskComponent dim={1} ' - 'zeroed-proportion={2} time-mask-max-frames={3}'.format( - self.name, input_dim, time_zeroed_proportion, time_mask_max_frames)) + line = ( + "component name={0}.time-mask type=SpecAugmentTimeMaskComponent dim={1} " + "zeroed-proportion={2} time-mask-max-frames={3}".format( + self.name, input_dim, time_zeroed_proportion, time_mask_max_frames + ) + ) configs.append(line) - line = ('component-node name={0}.time-mask component={0}.time-mask input={0}.freq-mask'.format( - self.name)) + line = "component-node name={0}.time-mask component={0}.time-mask input={0}.freq-mask".format( + self.name + ) configs.append(line) return configs @@ -1362,5 +1508,5 @@ def _generate_config(self): def test_layers(): # for some config lines that should be printed the same way as they # are read, check that this is the case. - for x in ['input name=input dim=30']: + for x in ["input name=input dim=30"]: assert str(config_line_to_object(x, [])) == x diff --git a/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/composite_layers.py b/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/composite_layers.py index 928ca445ccc..b4abdfc71ee 100644 --- a/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/composite_layers.py +++ b/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/composite_layers.py @@ -65,135 +65,164 @@ # This is passed through to the linear and affine components. You'll normally # want this to be set to a nonzero value, e.g. 0.004. 
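
To make the options above concrete, here is an illustrative `tdnnf-layer` xconfig line built only from keys that `set_default_configs` below declares, with a toy key=value split; the dimensions are hypothetical, 0.004 echoes the suggestion above, and the real parsing lives in the xconfig utilities, not in this sketch:

```python
# Illustrative tdnnf-layer xconfig line; every key below appears in
# set_default_configs() of the class that follows.
line = (
    "tdnnf-layer name=tdnnf2 l2-regularize=0.004 dropout-proportion=0.0 "
    "bypass-scale=0.66 dim=1024 bottleneck-dim=128 time-stride=1"
)
first_token, rest = line.split()[0], line.split()[1:]
key_to_value = dict(kv.split("=", 1) for kv in rest)
print(first_token, key_to_value["bottleneck-dim"])  # tdnnf-layer 128
```
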
-class XconfigTdnnfLayer(XconfigLayerBase):
-    def __init__(self, first_token, key_to_value, prev_names = None):
+class XconfigTdnnfLayer(XconfigLayerBase):
+    def __init__(self, first_token, key_to_value, prev_names=None):
        assert first_token == "tdnnf-layer"
        XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

    def set_default_configs(self):
-        self.config = {'input':'[-1]',
-                       'dim':-1,
-                       'bottleneck-dim':-1,
-                       'bypass-scale':0.66,
-                       'dropout-proportion':-1.0,
-                       'time-stride':1,
-                       'l2-regularize':0.0,
-                       'max-change': 0.75,
-                       'self-repair-scale': 1.0e-05,
-                       'context': 'default'}
+        self.config = {
+            "input": "[-1]",
+            "dim": -1,
+            "bottleneck-dim": -1,
+            "bypass-scale": 0.66,
+            "dropout-proportion": -1.0,
+            "time-stride": 1,
+            "l2-regularize": 0.0,
+            "max-change": 0.75,
+            "self-repair-scale": 1.0e-05,
+            "context": "default",
+        }

    def set_derived_configs(self):
        pass

    def check_configs(self):
-        if self.config['bottleneck-dim'] <= 0:
+        if self.config["bottleneck-dim"] <= 0:
            raise RuntimeError("bottleneck-dim must be set and >0.")
-        if self.config['dim'] <= self.config['bottleneck-dim']:
+        if self.config["dim"] <= self.config["bottleneck-dim"]:
            raise RuntimeError("dim must be greater than bottleneck-dim")

-        dropout = self.config['dropout-proportion']
+        dropout = self.config["dropout-proportion"]
        if dropout != -1.0 and not (dropout >= 0.0 and dropout < 1.0):
            raise RuntimeError("invalid value for dropout-proportion")

-        if abs(self.config['bypass-scale']) > 1.0:
+        if abs(self.config["bypass-scale"]) > 1.0:
            raise RuntimeError("bypass-scale has invalid value")

-        input_dim = self.descriptors['input']['dim']
-        output_dim = self.config['dim']
-        if output_dim != input_dim and self.config['bypass-scale'] != 0.0:
-            raise RuntimeError('bypass-scale is nonzero but output-dim != input-dim: {0} != {1}'
-                               ''.format(output_dim, input_dim))
-
-        if not self.config['context'] in ['default', 'left-only', 'shift-left', 'none']:
-            raise RuntimeError('context must be default, left-only shift-left or none, got {}'.format(
-                self.config['context']))
-
+        input_dim = self.descriptors["input"]["dim"]
+        output_dim = self.config["dim"]
+        if output_dim != input_dim and self.config["bypass-scale"] != 0.0:
+            raise RuntimeError(
+                "bypass-scale is nonzero but output-dim != input-dim: {0} != {1}"
+                "".format(output_dim, input_dim)
+            )
+
+        if not self.config["context"] in ["default", "left-only", "shift-left", "none"]:
+            raise RuntimeError(
+                "context must be 'default', 'left-only', 'shift-left' or 'none'; "
+                "got {}".format(self.config["context"])
+            )

    def output_name(self, auxiliary_output=None):
        assert auxiliary_output is None
-        output_component = ''
-        if self.config['bypass-scale'] != 0.0:
+        output_component = ""
+        if self.config["bypass-scale"] != 0.0:
            # the no-op component is used to cache something that we don't want
            # to have to recompute.
- output_component = 'noop' - elif self.config['dropout-proportion'] != -1.0: - output_component = 'dropout' + output_component = "noop" + elif self.config["dropout-proportion"] != -1.0: + output_component = "dropout" else: - output_component = 'batchnorm' - return '{0}.{1}'.format(self.name, output_component) - + output_component = "batchnorm" + return "{0}.{1}".format(self.name, output_component) def output_dim(self, auxiliary_output=None): - return self.config['dim'] + return self.config["dim"] def get_full_config(self): ans = [] config_lines = self._generate_config() for line in config_lines: - for config_name in ['ref', 'final']: + for config_name in ["ref", "final"]: ans.append((config_name, line)) return ans - def _generate_config(self): configs = [] name = self.name - input_dim = self.descriptors['input']['dim'] - input_descriptor = self.descriptors['input']['final-string'] - output_dim = self.config['dim'] - bottleneck_dim = self.config['bottleneck-dim'] - bypass_scale = self.config['bypass-scale'] - dropout_proportion = self.config['dropout-proportion'] - time_stride = self.config['time-stride'] - context = self.config['context'] - if time_stride != 0 and context != 'none': - time_offsets1 = '{0},0'.format(-time_stride) - if context == 'default': - time_offsets2 = '0,{0}'.format(time_stride) - elif context == 'shift-left': - time_offsets2 = '{0},0'.format(-time_stride) + input_dim = self.descriptors["input"]["dim"] + input_descriptor = self.descriptors["input"]["final-string"] + output_dim = self.config["dim"] + bottleneck_dim = self.config["bottleneck-dim"] + bypass_scale = self.config["bypass-scale"] + dropout_proportion = self.config["dropout-proportion"] + time_stride = self.config["time-stride"] + context = self.config["context"] + if time_stride != 0 and context != "none": + time_offsets1 = "{0},0".format(-time_stride) + if context == "default": + time_offsets2 = "0,{0}".format(time_stride) + elif context == "shift-left": + time_offsets2 = "{0},0".format(-time_stride) else: - assert context == 'left-only' - time_offsets2 = '0' + assert context == "left-only" + time_offsets2 = "0" else: - time_offsets1 = '0' - time_offsets2 = '0' - l2_regularize = self.config['l2-regularize'] - max_change = self.config['max-change'] - self_repair_scale = self.config['self-repair-scale'] + time_offsets1 = "0" + time_offsets2 = "0" + l2_regularize = self.config["l2-regularize"] + max_change = self.config["max-change"] + self_repair_scale = self.config["self-repair-scale"] # The first linear layer, from input-dim (spliced x2) to bottleneck-dim - configs.append('component name={0}.linear type=TdnnComponent input-dim={1} ' - 'output-dim={2} l2-regularize={3} max-change={4} use-bias=false ' - 'time-offsets={5} orthonormal-constraint=-1.0'.format( - name, input_dim, bottleneck_dim, l2_regularize, - max_change, time_offsets1)) - configs.append('component-node name={0}.linear component={0}.linear ' - 'input={1}'.format(name, input_descriptor)) + configs.append( + "component name={0}.linear type=TdnnComponent input-dim={1} " + "output-dim={2} l2-regularize={3} max-change={4} use-bias=false " + "time-offsets={5} orthonormal-constraint=-1.0".format( + name, + input_dim, + bottleneck_dim, + l2_regularize, + max_change, + time_offsets1, + ) + ) + configs.append( + "component-node name={0}.linear component={0}.linear " + "input={1}".format(name, input_descriptor) + ) # The affine layer, from bottleneck-dim (spliced x2) to output-dim - configs.append('component name={0}.affine type=TdnnComponent ' - 
'input-dim={1} output-dim={2} l2-regularize={3} max-change={4} ' - 'time-offsets={5}'.format( - name, bottleneck_dim, output_dim, l2_regularize, - max_change, time_offsets2)) - configs.append('component-node name={0}.affine component={0}.affine ' - 'input={0}.linear'.format(name)) + configs.append( + "component name={0}.affine type=TdnnComponent " + "input-dim={1} output-dim={2} l2-regularize={3} max-change={4} " + "time-offsets={5}".format( + name, + bottleneck_dim, + output_dim, + l2_regularize, + max_change, + time_offsets2, + ) + ) + configs.append( + "component-node name={0}.affine component={0}.affine " + "input={0}.linear".format(name) + ) # The ReLU layer - configs.append('component name={0}.relu type=RectifiedLinearComponent dim={1} ' - 'self-repair-scale={2}'.format( - name, output_dim, self_repair_scale)) - configs.append('component-node name={0}.relu component={0}.relu ' - 'input={0}.affine'.format(name)) + configs.append( + "component name={0}.relu type=RectifiedLinearComponent dim={1} " + "self-repair-scale={2}".format(name, output_dim, self_repair_scale) + ) + configs.append( + "component-node name={0}.relu component={0}.relu " + "input={0}.affine".format(name) + ) # The BatchNorm layer - configs.append('component name={0}.batchnorm type=BatchNormComponent ' - 'dim={1}'.format(name, output_dim)) - configs.append('component-node name={0}.batchnorm component={0}.batchnorm ' - 'input={0}.relu'.format(name)) + configs.append( + "component name={0}.batchnorm type=BatchNormComponent " + "dim={1}".format(name, output_dim) + ) + configs.append( + "component-node name={0}.batchnorm component={0}.batchnorm " + "input={0}.relu".format(name) + ) if dropout_proportion != -1: # This is not normal dropout. It's dropout where the mask is shared @@ -201,14 +230,19 @@ def _generate_config(self): # zero-or-one scale, it's a continuously varying scale whose # expected value is 1, drawn from a uniform distribution over an # interval of a size that varies with dropout-proportion. - configs.append('component name={0}.dropout type=GeneralDropoutComponent ' - 'dim={1} dropout-proportion={2} continuous=true'.format( - name, output_dim, dropout_proportion)) - configs.append('component-node name={0}.dropout component={0}.dropout ' - 'input={0}.batchnorm'.format(name)) - cur_component_type = 'dropout' + configs.append( + "component name={0}.dropout type=GeneralDropoutComponent " + "dim={1} dropout-proportion={2} continuous=true".format( + name, output_dim, dropout_proportion + ) + ) + configs.append( + "component-node name={0}.dropout component={0}.dropout " + "input={0}.batchnorm".format(name) + ) + cur_component_type = "dropout" else: - cur_component_type = 'batchnorm' + cur_component_type = "batchnorm" if bypass_scale != 0.0: # Add a NoOpComponent to cache the weighted sum of the input and the @@ -217,15 +251,20 @@ def _generate_config(self): # but if we did that and you used many of this component in sequence, # the weighted sums would have more and more terms as you went deeper # in the network. 
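            # With bypass-scale = s, the node added below computes
            #     y = s * x + f(x)
            # where f(x) is the linear -> affine -> relu -> batchnorm (-> dropout)
            # stack assembled above; e.g. (illustrative node names only)
            #     input=Sum(Scale(0.66, tdnnf1.noop), tdnnf2.dropout)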
- configs.append('component name={0}.noop type=NoOpComponent ' - 'dim={1}'.format(name, output_dim)) - configs.append('component-node name={0}.noop component={0}.noop ' - 'input=Sum(Scale({1}, {2}), {0}.{3})'.format( - name, bypass_scale, input_descriptor, - cur_component_type)) + configs.append( + "component name={0}.noop type=NoOpComponent " + "dim={1}".format(name, output_dim) + ) + configs.append( + "component-node name={0}.noop component={0}.noop " + "input=Sum(Scale({1}, {2}), {0}.{3})".format( + name, bypass_scale, input_descriptor, cur_component_type + ) + ) return configs + # This is for lines like the following: # prefinal-layer name=prefinal-chain input=prefinal-l l2-regularize=0.02 big-dim=1024 small-dim=256 # @@ -239,91 +278,111 @@ def _generate_config(self): # do anything that's particular hard or unusual, but it encapsulates a commonly # repeated pattern. class XconfigPrefinalLayer(XconfigLayerBase): - - def __init__(self, first_token, key_to_value, prev_names = None): + def __init__(self, first_token, key_to_value, prev_names=None): assert first_token == "prefinal-layer" XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): - self.config = {'input':'[-1]', - 'big-dim':-1, - 'small-dim':-1, - 'l2-regularize':0.0, - 'max-change': 0.75, - 'self-repair-scale': 1.0e-05} + self.config = { + "input": "[-1]", + "big-dim": -1, + "small-dim": -1, + "l2-regularize": 0.0, + "max-change": 0.75, + "self-repair-scale": 1.0e-05, + } def set_derived_configs(self): pass def check_configs(self): - if self.config['small-dim'] <= 0: + if self.config["small-dim"] <= 0: raise RuntimeError("small-dim must be set and >0.") - if self.config['big-dim'] <= self.config['small-dim']: + if self.config["big-dim"] <= self.config["small-dim"]: raise RuntimeError("big-dim must be greater than small-dim") def output_name(self, auxiliary_output=None): assert auxiliary_output is None - return '{0}.batchnorm2'.format(self.name) + return "{0}.batchnorm2".format(self.name) def output_dim(self, auxiliary_output=None): - return self.config['small-dim'] + return self.config["small-dim"] def get_full_config(self): ans = [] config_lines = self._generate_config() for line in config_lines: - for config_name in ['ref', 'final']: + for config_name in ["ref", "final"]: ans.append((config_name, line)) return ans - def _generate_config(self): configs = [] name = self.name - input_dim = self.descriptors['input']['dim'] - input_descriptor = self.descriptors['input']['final-string'] - small_dim = self.config['small-dim'] - big_dim = self.config['big-dim'] - l2_regularize = self.config['l2-regularize'] - max_change = self.config['max-change'] - self_repair_scale = self.config['self-repair-scale'] + input_dim = self.descriptors["input"]["dim"] + input_descriptor = self.descriptors["input"]["final-string"] + small_dim = self.config["small-dim"] + big_dim = self.config["big-dim"] + l2_regularize = self.config["l2-regularize"] + max_change = self.config["max-change"] + self_repair_scale = self.config["self-repair-scale"] # The affine layer, from input-dim to big-dim. 
- configs.append('component name={0}.affine type=NaturalGradientAffineComponent ' - 'input-dim={1} output-dim={2} l2-regularize={3} max-change={4}'.format( - name, input_dim, big_dim, l2_regularize, max_change)) - configs.append('component-node name={0}.affine component={0}.affine ' - 'input={1}'.format(name, input_descriptor)) + configs.append( + "component name={0}.affine type=NaturalGradientAffineComponent " + "input-dim={1} output-dim={2} l2-regularize={3} max-change={4}".format( + name, input_dim, big_dim, l2_regularize, max_change + ) + ) + configs.append( + "component-node name={0}.affine component={0}.affine " + "input={1}".format(name, input_descriptor) + ) # The ReLU layer - configs.append('component name={0}.relu type=RectifiedLinearComponent dim={1} ' - 'self-repair-scale={2}'.format( - name, big_dim, self_repair_scale)) - configs.append('component-node name={0}.relu component={0}.relu ' - 'input={0}.affine'.format(name)) + configs.append( + "component name={0}.relu type=RectifiedLinearComponent dim={1} " + "self-repair-scale={2}".format(name, big_dim, self_repair_scale) + ) + configs.append( + "component-node name={0}.relu component={0}.relu " + "input={0}.affine".format(name) + ) # The first BatchNorm layer - configs.append('component name={0}.batchnorm1 type=BatchNormComponent ' - 'dim={1}'.format(name, big_dim)) - configs.append('component-node name={0}.batchnorm1 component={0}.batchnorm1 ' - 'input={0}.relu'.format(name)) + configs.append( + "component name={0}.batchnorm1 type=BatchNormComponent " + "dim={1}".format(name, big_dim) + ) + configs.append( + "component-node name={0}.batchnorm1 component={0}.batchnorm1 " + "input={0}.relu".format(name) + ) # The linear layer, from big-dim to small-dim, with orthonormal-constraint=-1 # ("floating" orthonormal constraint). - configs.append('component name={0}.linear type=LinearComponent ' - 'input-dim={1} output-dim={2} l2-regularize={3} max-change={4} ' - 'orthonormal-constraint=-1 '.format( - name, big_dim, small_dim, - l2_regularize, max_change)) - configs.append('component-node name={0}.linear component={0}.linear ' - 'input={0}.batchnorm1'.format(name)) + configs.append( + "component name={0}.linear type=LinearComponent " + "input-dim={1} output-dim={2} l2-regularize={3} max-change={4} " + "orthonormal-constraint=-1 ".format( + name, big_dim, small_dim, l2_regularize, max_change + ) + ) + configs.append( + "component-node name={0}.linear component={0}.linear " + "input={0}.batchnorm1".format(name) + ) # The second BatchNorm layer - configs.append('component name={0}.batchnorm2 type=BatchNormComponent ' - 'dim={1}'.format(name, small_dim)) - configs.append('component-node name={0}.batchnorm2 component={0}.batchnorm2 ' - 'input={0}.linear'.format(name)) + configs.append( + "component name={0}.batchnorm2 type=BatchNormComponent " + "dim={1}".format(name, small_dim) + ) + configs.append( + "component-node name={0}.batchnorm2 component={0}.batchnorm2 " + "input={0}.linear".format(name) + ) return configs diff --git a/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/convolution.py b/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/convolution.py index 1628a5e314f..2401c8a7512 100644 --- a/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/convolution.py +++ b/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/convolution.py @@ -3,7 +3,6 @@ # Apache 2.0. - """ This module has the implementation of convolutional layers. 
""" from __future__ import print_function @@ -112,55 +111,73 @@ # the following is also passed into the convolution components, if specified: # l2-regularize (float) + class XconfigConvLayer(XconfigLayerBase): - def __init__(self, first_token, key_to_value, prev_names = None): - for operation in first_token.split('-')[:-1]: - assert operation in ['conv', 'renorm', 'batchnorm', 'relu', - 'noconv', 'dropout', 'so'] + def __init__(self, first_token, key_to_value, prev_names=None): + for operation in first_token.split("-")[:-1]: + assert operation in [ + "conv", + "renorm", + "batchnorm", + "relu", + "noconv", + "dropout", + "so", + ] XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): - self.config = {'input':'[-1]', - 'height-in':-1, - 'height-subsample-out':1, - 'height-out':-1, - 'height-offsets':'', - 'num-filters-out':-1, - 'time-offsets':'', - 'required-time-offsets':'', - 'target-rms':1.0, - 'self-repair-scale': 2.0e-05, - 'self-repair-lower-threshold': 0.05, - # the following are not really inspected by this level of - # code, just passed through (but not if left at ''). - 'param-stddev':'', 'bias-stddev':'', - 'max-change': 0.75, 'learning-rate-factor':'', - 'use-natural-gradient':'', - 'rank-in':'', 'rank-out':'', 'num-minibatches-history':'', - 'alpha-in':'', 'alpha-out':'', 'l2-regularize':'', - 'dropout-proportion': 0.5} + self.config = { + "input": "[-1]", + "height-in": -1, + "height-subsample-out": 1, + "height-out": -1, + "height-offsets": "", + "num-filters-out": -1, + "time-offsets": "", + "required-time-offsets": "", + "target-rms": 1.0, + "self-repair-scale": 2.0e-05, + "self-repair-lower-threshold": 0.05, + # the following are not really inspected by this level of + # code, just passed through (but not if left at ''). + "param-stddev": "", + "bias-stddev": "", + "max-change": 0.75, + "learning-rate-factor": "", + "use-natural-gradient": "", + "rank-in": "", + "rank-out": "", + "num-minibatches-history": "", + "alpha-in": "", + "alpha-out": "", + "l2-regularize": "", + "dropout-proportion": 0.5, + } def set_derived_configs(self): # sets 'num-filters-in'. - input_dim = self.descriptors['input']['dim'] - height_in = self.config['height-in'] + input_dim = self.descriptors["input"]["dim"] + height_in = self.config["height-in"] if height_in <= 0: - raise RuntimeError("height-in must be specified"); + raise RuntimeError("height-in must be specified") if input_dim % height_in != 0: - raise RuntimeError("Input dimension {0} is not a multiple of height-in={1}".format( - input_dim, height_in)) - self.config['num-filters-in'] = input_dim // height_in - + raise RuntimeError( + "Input dimension {0} is not a multiple of height-in={1}".format( + input_dim, height_in + ) + ) + self.config["num-filters-in"] = input_dim // height_in # Check whether 'str' is a sorted, unique, nonempty list of integers, like -1,0,1., # returns true if so. def check_offsets_var(self, str): try: - a = [ int(x) for x in str.split(",") ] + a = [int(x) for x in str.split(",")] if len(a) == 0: return False for i in range(len(a) - 1): - if a[i] >= a[i+1]: + if a[i] >= a[i + 1]: return False return True except: @@ -171,71 +188,96 @@ def check_configs(self): # some more thorough checking, but if you set the height-out too small it # prints it as a warning, which the user may not see, so at a minimum we # want to check for that here. 
-        height_subsample_out = self.config['height-subsample-out']
-        height_in = self.config['height-in']
-        height_out = self.config['height-out']
+        height_subsample_out = self.config["height-subsample-out"]
+        height_in = self.config["height-in"]
+        height_out = self.config["height-out"]
        if height_subsample_out <= 0:
-            raise RuntimeError("height-subsample-out has invalid value {0}.".format(
-                height_subsample_out))
+            raise RuntimeError(
+                "height-subsample-out has invalid value {0}.".format(
+                    height_subsample_out
+                )
+            )
        # we already checked height-in in set_derived_configs.
        if height_out <= 0:
-            raise RuntimeError("height-out has invalid value {0}.".format(
-                height_out))
+            raise RuntimeError("height-out has invalid value {0}.".format(height_out))
        if height_out * height_subsample_out > height_in:
-            raise RuntimeError("The combination height-in={0}, height-out={1} and "
-                               "height-subsample-out={2} does not look right "
-                               "(height-out too large).".format(
-                                   height_in, height_out, height_subsample_out))
-        height_offsets = self.config['height-offsets']
-        time_offsets = self.config['time-offsets']
-        required_time_offsets = self.config['required-time-offsets']
-
-        if not 'noconv' in self.layer_type.split('-'):
+            raise RuntimeError(
+                "The combination height-in={0}, height-out={1} and "
+                "height-subsample-out={2} does not look right "
+                "(height-out too large).".format(
+                    height_in, height_out, height_subsample_out
+                )
+            )
+        height_offsets = self.config["height-offsets"]
+        time_offsets = self.config["time-offsets"]
+        required_time_offsets = self.config["required-time-offsets"]
+
+        if not "noconv" in self.layer_type.split("-"):
            # only check height-offsets, time-offsets and required-time-offsets if there
            # is actually a convolution in this layer.
            if not self.check_offsets_var(height_offsets):
-                raise RuntimeError("height-offsets={0} is not valid".format(height_offsets))
+                raise RuntimeError(
+                    "height-offsets={0} is not valid".format(height_offsets)
+                )
            if not self.check_offsets_var(time_offsets):
                raise RuntimeError("time-offsets={0} is not valid".format(time_offsets))
-            if required_time_offsets != "" and not self.check_offsets_var(required_time_offsets):
-                raise RuntimeError("required-time-offsets={0} is not valid".format(
-                    required_time_offsets))
-
-            if height_out * height_subsample_out < \
-               height_in - len(height_offsets.split(',')):
-                raise RuntimeError("The combination height-in={0}, height-out={1} and "
-                                   "height-subsample-out={2} and height-offsets={3} "
-                                   "does not look right (height-out too small).")
-
-        if self.config['target-rms'] <= 0.0:
-            raise RuntimeError("Config value target-rms={0} is not valid".format(
-                self.config['target_rms']))
+            if required_time_offsets != "" and not self.check_offsets_var(
+                required_time_offsets
+            ):
+                raise RuntimeError(
+                    "required-time-offsets={0} is not valid".format(
+                        required_time_offsets
+                    )
+                )
+
+            if height_out * height_subsample_out < height_in - len(
+                height_offsets.split(",")
+            ):
+                # note: the original message had {0}..{3} placeholders but no
+                # .format() call; supply the arguments so the error is readable.
+                raise RuntimeError(
+                    "The combination height-in={0}, height-out={1} and "
+                    "height-subsample-out={2} and height-offsets={3} "
+                    "does not look right (height-out too small).".format(
+                        height_in, height_out, height_subsample_out, height_offsets
+                    )
+                )
+
+        if self.config["target-rms"] <= 0.0:
+            raise RuntimeError(
+                # note: use the 'target-rms' key here; 'target_rms' would raise
+                # a KeyError while formatting the message.
+                "Config value target-rms={0} is not valid".format(
+                    self.config["target-rms"]
+                )
+            )

    def auxiliary_outputs(self):
        return []

-    def output_name(self, auxiliary_output = None):
+    def output_name(self, auxiliary_output=None):
        assert auxiliary_output is None
        # note: the [:-1] is to remove the '-layer'.
- operations = self.layer_type.split('-')[:-1] - if operations[-1] == 'noconv': + operations = self.layer_type.split("-")[:-1] + if operations[-1] == "noconv": operations = operations[:-1] assert len(operations) >= 1 last_operation = operations[-1] - assert last_operation in ['relu', 'conv', 'renorm', 'batchnorm', 'dropout', 'so'] + assert last_operation in [ + "relu", + "conv", + "renorm", + "batchnorm", + "dropout", + "so", + ] # we'll return something like 'layer1.batchnorm'. - return '{0}.{1}'.format(self.name, last_operation) + return "{0}.{1}".format(self.name, last_operation) - def output_dim(self, auxiliary_output = None): + def output_dim(self, auxiliary_output=None): assert auxiliary_output is None - return self.config['num-filters-out'] * self.config['height-out'] + return self.config["num-filters-out"] * self.config["height-out"] def get_full_config(self): ans = [] config_lines = self._generate_cnn_config() for line in config_lines: - for config_name in ['ref', 'final']: + for config_name in ["ref", "final"]: # we do not support user specified matrices in CNN initialization # so 'ref' and 'final' configs are the same. ans.append((config_name, line)) @@ -248,13 +290,13 @@ def _generate_cnn_config(self): name = self.name # These 3 variables will be updated as we add components. - cur_num_filters = self.config['num-filters-in'] - cur_height = self.config['height-in'] - cur_descriptor = self.descriptors['input']['final-string'] + cur_num_filters = self.config["num-filters-in"] + cur_height = self.config["height-in"] + cur_descriptor = self.descriptors["input"]["final-string"] # note: the [:-1] is to remove the '-layer'. - operations = self.layer_type.split('-')[:-1] - if operations[-1] == 'noconv': + operations = self.layer_type.split("-")[:-1] + if operations[-1] == "noconv": operations = operations[:-1] # e.g.: # operations = [ 'conv', 'relu', 'batchnorm' ] @@ -262,66 +304,113 @@ def _generate_cnn_config(self): # operations = [ 'relu', 'conv', 'renorm' ] for operation in operations: - if operation == 'conv': + if operation == "conv": a = [] for opt_name in [ - 'param-stddev', 'bias-stddev', 'use-natural-gradient', - 'max-change', 'rank-in', 'rank-out', 'num-minibatches-history', - 'alpha-in', 'alpha-out', 'num-filters-in', 'num-filters-out', - 'height-in','height-out', 'height-subsample-out', - 'height-offsets', 'time-offsets', 'required-time-offsets', - 'learning-rate-factor', 'l2-regularize' ]: + "param-stddev", + "bias-stddev", + "use-natural-gradient", + "max-change", + "rank-in", + "rank-out", + "num-minibatches-history", + "alpha-in", + "alpha-out", + "num-filters-in", + "num-filters-out", + "height-in", + "height-out", + "height-subsample-out", + "height-offsets", + "time-offsets", + "required-time-offsets", + "learning-rate-factor", + "l2-regularize", + ]: value = self.config[opt_name] - if value != '': - a.append('{0}={1}'.format(opt_name, value)) - conv_opts = ' '.join(a) - - configs.append('component name={0}.conv type=TimeHeightConvolutionComponent ' - '{1}'.format(name, conv_opts)) - configs.append('component-node name={0}.conv component={0}.conv ' - 'input={1}'.format(name, cur_descriptor)) - cur_num_filters = self.config['num-filters-out'] - cur_height = self.config['height-out'] - elif operation == 'batchnorm': - configs.append('component name={0}.batchnorm type=BatchNormComponent dim={1} ' - 'block-dim={2} target-rms={3}'.format( - name, cur_num_filters * cur_height, cur_num_filters, - self.config['target-rms'])) - configs.append('component-node 
name={0}.batchnorm component={0}.batchnorm ' - 'input={1}'.format(name, cur_descriptor)) - elif operation == 'renorm': - configs.append('component name={0}.renorm type=NormalizeComponent ' - 'dim={1} target-rms={2}'.format( - name, cur_num_filters * cur_height, - self.config['target-rms'])) - configs.append('component-node name={0}.renorm component={0}.renorm ' - 'input={1}'.format(name, cur_descriptor)) - elif operation == 'relu': - configs.append('component name={0}.relu type=RectifiedLinearComponent ' - 'dim={1} block-dim={2} self-repair-scale={3} ' - 'self-repair-lower-threshold={4}'.format( - name, cur_num_filters * cur_height, cur_num_filters, - self.config['self-repair-scale'], - self.config['self-repair-lower-threshold'])) - configs.append('component-node name={0}.relu component={0}.relu ' - 'input={1}'.format(name, cur_descriptor)) - elif operation == 'dropout': - configs.append('component name={0}.dropout type=DropoutComponent ' - 'dim={1} dropout-proportion={2}'.format( - name, cur_num_filters * cur_height, - self.config['dropout-proportion'])) - configs.append('component-node name={0}.dropout component={0}.dropout ' - 'input={1}'.format(name, cur_descriptor)) - elif operation == 'so': - configs.append('component name={0}.so type=ScaleAndOffsetComponent ' - 'dim={1} block-dim={2}'.format( - name, cur_num_filters * cur_height, cur_num_filters)) - configs.append('component-node name={0}.so component={0}.so ' - 'input={1}'.format(name, cur_descriptor)) + if value != "": + a.append("{0}={1}".format(opt_name, value)) + conv_opts = " ".join(a) + + configs.append( + "component name={0}.conv type=TimeHeightConvolutionComponent " + "{1}".format(name, conv_opts) + ) + configs.append( + "component-node name={0}.conv component={0}.conv " + "input={1}".format(name, cur_descriptor) + ) + cur_num_filters = self.config["num-filters-out"] + cur_height = self.config["height-out"] + elif operation == "batchnorm": + configs.append( + "component name={0}.batchnorm type=BatchNormComponent dim={1} " + "block-dim={2} target-rms={3}".format( + name, + cur_num_filters * cur_height, + cur_num_filters, + self.config["target-rms"], + ) + ) + configs.append( + "component-node name={0}.batchnorm component={0}.batchnorm " + "input={1}".format(name, cur_descriptor) + ) + elif operation == "renorm": + configs.append( + "component name={0}.renorm type=NormalizeComponent " + "dim={1} target-rms={2}".format( + name, cur_num_filters * cur_height, self.config["target-rms"] + ) + ) + configs.append( + "component-node name={0}.renorm component={0}.renorm " + "input={1}".format(name, cur_descriptor) + ) + elif operation == "relu": + configs.append( + "component name={0}.relu type=RectifiedLinearComponent " + "dim={1} block-dim={2} self-repair-scale={3} " + "self-repair-lower-threshold={4}".format( + name, + cur_num_filters * cur_height, + cur_num_filters, + self.config["self-repair-scale"], + self.config["self-repair-lower-threshold"], + ) + ) + configs.append( + "component-node name={0}.relu component={0}.relu " + "input={1}".format(name, cur_descriptor) + ) + elif operation == "dropout": + configs.append( + "component name={0}.dropout type=DropoutComponent " + "dim={1} dropout-proportion={2}".format( + name, + cur_num_filters * cur_height, + self.config["dropout-proportion"], + ) + ) + configs.append( + "component-node name={0}.dropout component={0}.dropout " + "input={1}".format(name, cur_descriptor) + ) + elif operation == "so": + configs.append( + "component name={0}.so type=ScaleAndOffsetComponent " + "dim={1} 
block-dim={2}".format( + name, cur_num_filters * cur_height, cur_num_filters + ) + ) + configs.append( + "component-node name={0}.so component={0}.so " + "input={1}".format(name, cur_descriptor) + ) else: raise RuntimeError("Un-handled operation type: " + operation) - cur_descriptor = '{0}.{1}'.format(name, operation) + cur_descriptor = "{0}.{1}".format(name, operation) return configs @@ -413,97 +502,108 @@ def _generate_cnn_config(self): # l2-regularize (float) # + class XconfigResBlock(XconfigLayerBase): - def __init__(self, first_token, key_to_value, prev_names = None): - assert first_token == 'res-block' + def __init__(self, first_token, key_to_value, prev_names=None): + assert first_token == "res-block" XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): - self.config = {'input':'[-1]', - 'height':-1, - 'num-filters':-1, - 'num-bottleneck-filters':-1, - 'time-period':1, - 'height-period':1, - 'self-repair-scale': 2.0e-05, - 'self-repair-lower-threshold1': 0.05, - 'self-repair-lower-threshold2': 0.05, - 'self-repair-lower-threshold3': 0.05, - 'max-change': 0.75, - 'allow-zero-padding': True, - 'bypass-source' : 'noop', - # the following are not really inspected by this level of - # code, just passed through (but not if left at ''). - 'param-stddev':'', 'bias-stddev':'', - 'use-natural-gradient':'', - 'rank-in':'', 'rank-out':'', - 'num-minibatches-history':'', - 'alpha-in':'', 'alpha-out':'', 'l2-regularize':'' } + self.config = { + "input": "[-1]", + "height": -1, + "num-filters": -1, + "num-bottleneck-filters": -1, + "time-period": 1, + "height-period": 1, + "self-repair-scale": 2.0e-05, + "self-repair-lower-threshold1": 0.05, + "self-repair-lower-threshold2": 0.05, + "self-repair-lower-threshold3": 0.05, + "max-change": 0.75, + "allow-zero-padding": True, + "bypass-source": "noop", + # the following are not really inspected by this level of + # code, just passed through (but not if left at ''). + "param-stddev": "", + "bias-stddev": "", + "use-natural-gradient": "", + "rank-in": "", + "rank-out": "", + "num-minibatches-history": "", + "alpha-in": "", + "alpha-out": "", + "l2-regularize": "", + } def set_derived_configs(self): # set 'num-filters' or check it.. - input_dim = self.descriptors['input']['dim'] - height = self.config['height'] + input_dim = self.descriptors["input"]["dim"] + height = self.config["height"] - cur_num_filters = self.config['num-filters'] + cur_num_filters = self.config["num-filters"] if cur_num_filters == -1: if input_dim % height != 0: - raise RuntimeError("Specified image height {0} does not " - "divide the input dim {1}".format( - height, input_dim)) - self.config['num-filters'] = input_dim / height + raise RuntimeError( + "Specified image height {0} does not " + "divide the input dim {1}".format(height, input_dim) + ) + self.config["num-filters"] = input_dim / height elif input_dim != cur_num_filters * height: - raise RuntimeError("Expected the input-dim to equal " - "height={0} * num-filters={1} = {2}, but " - "it is {3}".format( - height, cur_num_filters, - height * cur_num_filters, - input_dim)); + raise RuntimeError( + "Expected the input-dim to equal " + "height={0} * num-filters={1} = {2}, but " + "it is {3}".format( + height, cur_num_filters, height * cur_num_filters, input_dim + ) + ) def check_configs(self): # we checked the dimensions in set_derived_configs. 
-        if not self.config['bypass-source'] in [
-                'input', 'noop', 'relu', 'batchnorm' ]:
-            raise RuntimeError("Expected direct-convolution-source to "
-                               "be input, relu or batchnorm, got: {1}".format(
-                                   self.config['direct-convolution-source']))
+        if self.config["bypass-source"] not in ["input", "noop", "relu", "batchnorm"]:
+            raise RuntimeError(
+                "Expected bypass-source to be input, noop, relu or batchnorm, "
+                "got: {0}".format(self.config["bypass-source"])
+            )

     def auxiliary_outputs(self):
         return []

-    def output_name(self, auxiliary_output = None):
-        bypass_source = self.config['bypass-source']
-        b = self.config['num-bottleneck-filters']
-        conv = ('{0}.conv2' if b <= 0 else '{0}.conv3').format(self.name)
-        if bypass_source == 'input':
-            residual = self.descriptors['input']['final-string']
-        elif bypass_source == 'noop':
+    def output_name(self, auxiliary_output=None):
+        bypass_source = self.config["bypass-source"]
+        b = self.config["num-bottleneck-filters"]
+        conv = ("{0}.conv2" if b <= 0 else "{0}.conv3").format(self.name)
+        if bypass_source == "input":
+            residual = self.descriptors["input"]["final-string"]
+        elif bypass_source == "noop":
             # we let the noop be the sum of the convolutional part and the
             # input, so just return the output of the no-op component.
-            return '{0}.noop'.format(self.name)
-        elif bypass_source == 'relu':
-            residual = '{0}.relu1'.format(self.name)
+            return "{0}.noop".format(self.name)
+        elif bypass_source == "relu":
+            residual = "{0}.relu1".format(self.name)
         else:
-            assert bypass_source == 'batchnorm'
-            residual = '{0}.batchnorm1'.format(self.name)
+            assert bypass_source == "batchnorm"
+            residual = "{0}.batchnorm1".format(self.name)

-        return 'Sum({0}, {1})'.format(conv, residual)
+        return "Sum({0}, {1})".format(conv, residual)

-    def output_dim(self, auxiliary_output = None):
+    def output_dim(self, auxiliary_output=None):
         assert auxiliary_output is None
-        input_dim = self.descriptors['input']['dim']
+        input_dim = self.descriptors["input"]["dim"]
         return input_dim

     def get_full_config(self):
         ans = []
-        b = self.config['num-bottleneck-filters']
+        b = self.config["num-bottleneck-filters"]
         if b <= 0:
             config_lines = self._generate_normal_resblock_config()
         else:
             config_lines = self._generate_bottleneck_resblock_config()

         for line in config_lines:
-            for config_name in ['ref', 'final']:
+            for config_name in ["ref", "final"]:
                 # we do not support user specified matrices in CNN initialization
                 # so 'ref' and 'final' configs are the same.
ans.append((config_name, line)) @@ -536,77 +636,102 @@ def _generate_normal_resblock_config(self): configs = [] name = self.name - num_filters = self.config['num-filters'] - assert self.config['num-bottleneck-filters'] == -1 - height = self.config['height'] - input_descriptor = self.descriptors['input']['final-string'] - allow_zero_padding = self.config['allow-zero-padding'] - height_period = self.config['height-period'] - time_period = self.config['time-period'] + num_filters = self.config["num-filters"] + assert self.config["num-bottleneck-filters"] == -1 + height = self.config["height"] + input_descriptor = self.descriptors["input"]["final-string"] + allow_zero_padding = self.config["allow-zero-padding"] + height_period = self.config["height-period"] + time_period = self.config["time-period"] # input -> relu1 -> batchnorm1 -> conv1 -> relu2 -> batchnorm2 -> conv2 cur_descriptor = input_descriptor for n in [1, 2]: # the ReLU - configs.append('component name={0}.relu{1} type=RectifiedLinearComponent ' - 'dim={2} block-dim={3} self-repair-scale={4} ' - 'self-repair-lower-threshold={5}'.format( - name, n, num_filters * height, num_filters, - self.config['self-repair-scale'], - self.config['self-repair-lower-threshold{0}'.format(n)])) - configs.append('component-node name={0}.relu{1} component={0}.relu{1} ' - 'input={2}'.format(name, n, cur_descriptor)) - - cur_descriptor = '{0}.relu{1}'.format(name, n) + configs.append( + "component name={0}.relu{1} type=RectifiedLinearComponent " + "dim={2} block-dim={3} self-repair-scale={4} " + "self-repair-lower-threshold={5}".format( + name, + n, + num_filters * height, + num_filters, + self.config["self-repair-scale"], + self.config["self-repair-lower-threshold{0}".format(n)], + ) + ) + configs.append( + "component-node name={0}.relu{1} component={0}.relu{1} " + "input={2}".format(name, n, cur_descriptor) + ) + + cur_descriptor = "{0}.relu{1}".format(name, n) # the batch-norm - configs.append('component name={0}.batchnorm{1} type=BatchNormComponent dim={2} ' - 'block-dim={3}'.format( - name, n, num_filters * height, - num_filters)) - configs.append('component-node name={0}.batchnorm{1} component={0}.batchnorm{1} ' - 'input={2}'.format(name, n, cur_descriptor)) - cur_descriptor = '{0}.batchnorm{1}'.format(name, n) - + configs.append( + "component name={0}.batchnorm{1} type=BatchNormComponent dim={2} " + "block-dim={3}".format(name, n, num_filters * height, num_filters) + ) + configs.append( + "component-node name={0}.batchnorm{1} component={0}.batchnorm{1} " + "input={2}".format(name, n, cur_descriptor) + ) + cur_descriptor = "{0}.batchnorm{1}".format(name, n) # the convolution. 
a = [] for opt_name in [ - 'param-stddev', 'bias-stddev', 'use-natural-gradient', - 'max-change', 'rank-in', 'rank-out', 'num-minibatches-history', - 'alpha-in', 'alpha-out', 'l2-regularize' ]: + "param-stddev", + "bias-stddev", + "use-natural-gradient", + "max-change", + "rank-in", + "rank-out", + "num-minibatches-history", + "alpha-in", + "alpha-out", + "l2-regularize", + ]: value = self.config[opt_name] - if value != '': - a.append('{0}={1}'.format(opt_name, value)) - conv_opts = ('height-in={h} height-out={h} height-offsets=-{hp},0,{hp} ' - 'time-offsets=-{p},0,{p} ' - 'num-filters-in={f} num-filters-out={f} {r} {o}'.format( - h=height, hp=height_period, p=time_period, f=num_filters, - r=('required-time-offsets=0' if allow_zero_padding else ''), - o=' '.join(a))) - - configs.append('component name={0}.conv{1} type=TimeHeightConvolutionComponent ' - '{2}'.format(name, n, conv_opts)) - configs.append('component-node name={0}.conv{1} component={0}.conv{1} ' - 'input={2}'.format(name, n, cur_descriptor)) - cur_descriptor = '{0}.conv{1}'.format(name, n) - - - - if self.config['bypass-source'] == 'noop': - dim = self.descriptors['input']['dim'] - configs.append('component name={0}.noop dim={1} type=NoOpComponent'.format( - name, dim)) - configs.append('component-node name={0}.noop component={0}.noop ' - 'input=Sum({1}, {0}.conv2)'.format(name, - input_descriptor)) + if value != "": + a.append("{0}={1}".format(opt_name, value)) + conv_opts = ( + "height-in={h} height-out={h} height-offsets=-{hp},0,{hp} " + "time-offsets=-{p},0,{p} " + "num-filters-in={f} num-filters-out={f} {r} {o}".format( + h=height, + hp=height_period, + p=time_period, + f=num_filters, + r=("required-time-offsets=0" if allow_zero_padding else ""), + o=" ".join(a), + ) + ) + + configs.append( + "component name={0}.conv{1} type=TimeHeightConvolutionComponent " + "{2}".format(name, n, conv_opts) + ) + configs.append( + "component-node name={0}.conv{1} component={0}.conv{1} " + "input={2}".format(name, n, cur_descriptor) + ) + cur_descriptor = "{0}.conv{1}".format(name, n) + + if self.config["bypass-source"] == "noop": + dim = self.descriptors["input"]["dim"] + configs.append( + "component name={0}.noop dim={1} type=NoOpComponent".format(name, dim) + ) + configs.append( + "component-node name={0}.noop component={0}.noop " + "input=Sum({1}, {0}.conv2)".format(name, input_descriptor) + ) # Note: the function 'output_name' is responsible for returning the # descriptor corresponding to the output of the network. 
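        # For illustration (hypothetical values, not part of this patch): a line
        # like 'res-block name=res1 num-filters=64 height=20' would make the loop
        # above emit, for n=1 and n=2, component/component-node pairs such as
        #   component name=res1.relu1 type=RectifiedLinearComponent dim=1280 block-dim=64 ...
        #   component-node name=res1.relu1 component=res1.relu1 input=<input-descriptor>
        # followed by the matching batchnorm and conv entries.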
return configs - - # _generate_bottleneck_resblock_config is a convenience function to generate the # res-block config (this is the bottleneck version, where there is # a 3x3 kernel with a smaller number of filters than at the input and output, @@ -630,14 +755,14 @@ def _generate_bottleneck_resblock_config(self): configs = [] name = self.name - num_filters = self.config['num-filters'] - num_bottleneck_filters = self.config['num-bottleneck-filters'] + num_filters = self.config["num-filters"] + num_bottleneck_filters = self.config["num-bottleneck-filters"] assert num_bottleneck_filters > 0 - height = self.config['height'] - input_descriptor = self.descriptors['input']['final-string'] - allow_zero_padding = self.config['allow-zero-padding'] - height_period = self.config['height-period'] - time_period = self.config['time-period'] + height = self.config["height"] + input_descriptor = self.descriptors["input"]["final-string"] + allow_zero_padding = self.config["allow-zero-padding"] + height_period = self.config["height-period"] + time_period = self.config["time-period"] # input -> relu1 -> batchnorm1 -> conv1 -> relu2 -> batchnorm2 -> conv2 cur_descriptor = input_descriptor @@ -645,62 +770,92 @@ def _generate_bottleneck_resblock_config(self): for n in [1, 2, 3]: # the ReLU - configs.append('component name={0}.relu{1} type=RectifiedLinearComponent ' - 'dim={2} block-dim={3} self-repair-scale={4} ' - 'self-repair-lower-threshold={5}'.format( - name, n, cur_num_filters * height, cur_num_filters, - self.config['self-repair-scale'], - self.config['self-repair-lower-threshold{0}'.format(n)])) - configs.append('component-node name={0}.relu{1} component={0}.relu{1} ' - 'input={2}'.format(name, n, cur_descriptor)) - - cur_descriptor = '{0}.relu{1}'.format(name, n) + configs.append( + "component name={0}.relu{1} type=RectifiedLinearComponent " + "dim={2} block-dim={3} self-repair-scale={4} " + "self-repair-lower-threshold={5}".format( + name, + n, + cur_num_filters * height, + cur_num_filters, + self.config["self-repair-scale"], + self.config["self-repair-lower-threshold{0}".format(n)], + ) + ) + configs.append( + "component-node name={0}.relu{1} component={0}.relu{1} " + "input={2}".format(name, n, cur_descriptor) + ) + + cur_descriptor = "{0}.relu{1}".format(name, n) # the batch-norm - configs.append('component name={0}.batchnorm{1} type=BatchNormComponent dim={2} ' - 'block-dim={3}'.format( - name, n, cur_num_filters * height, - cur_num_filters)) - configs.append('component-node name={0}.batchnorm{1} component={0}.batchnorm{1} ' - 'input={2}'.format(name, n, cur_descriptor)) - cur_descriptor = '{0}.batchnorm{1}'.format(name, n) - + configs.append( + "component name={0}.batchnorm{1} type=BatchNormComponent dim={2} " + "block-dim={3}".format( + name, n, cur_num_filters * height, cur_num_filters + ) + ) + configs.append( + "component-node name={0}.batchnorm{1} component={0}.batchnorm{1} " + "input={2}".format(name, n, cur_descriptor) + ) + cur_descriptor = "{0}.batchnorm{1}".format(name, n) # the convolution. 
a = [] for opt_name in [ - 'param-stddev', 'bias-stddev', 'use-natural-gradient', - 'max-change', 'rank-in', 'rank-out', 'num-minibatches-history', - 'alpha-in', 'alpha-out', 'l2-regularize' ]: + "param-stddev", + "bias-stddev", + "use-natural-gradient", + "max-change", + "rank-in", + "rank-out", + "num-minibatches-history", + "alpha-in", + "alpha-out", + "l2-regularize", + ]: value = self.config[opt_name] - if value != '': - a.append('{0}={1}'.format(opt_name, value)) - - height_offsets = ('-{hp},0,{hp}'.format(hp=height_period) if n == 2 else '0') - time_offsets = ('-{t},0,{t}'.format(t=time_period) if n == 2 else '0') - next_num_filters = (num_filters if n == 3 else num_bottleneck_filters) - conv_opts = ('height-in={h} height-out={h} height-offsets={ho} time-offsets={to} ' - 'num-filters-in={fi} num-filters-out={fo} {r} {o}'.format( - h=height, ho=height_offsets, to=time_offsets, - fi=cur_num_filters, fo=next_num_filters, - r=('required-time-offsets=0' if allow_zero_padding else ''), - o=' '.join(a))) - - configs.append('component name={0}.conv{1} type=TimeHeightConvolutionComponent ' - '{2}'.format(name, n, conv_opts)) - configs.append('component-node name={0}.conv{1} component={0}.conv{1} ' - 'input={2}'.format(name, n, cur_descriptor)) - cur_descriptor = '{0}.conv{1}'.format(name, n) + if value != "": + a.append("{0}={1}".format(opt_name, value)) + + height_offsets = "-{hp},0,{hp}".format(hp=height_period) if n == 2 else "0" + time_offsets = "-{t},0,{t}".format(t=time_period) if n == 2 else "0" + next_num_filters = num_filters if n == 3 else num_bottleneck_filters + conv_opts = ( + "height-in={h} height-out={h} height-offsets={ho} time-offsets={to} " + "num-filters-in={fi} num-filters-out={fo} {r} {o}".format( + h=height, + ho=height_offsets, + to=time_offsets, + fi=cur_num_filters, + fo=next_num_filters, + r=("required-time-offsets=0" if allow_zero_padding else ""), + o=" ".join(a), + ) + ) + + configs.append( + "component name={0}.conv{1} type=TimeHeightConvolutionComponent " + "{2}".format(name, n, conv_opts) + ) + configs.append( + "component-node name={0}.conv{1} component={0}.conv{1} " + "input={2}".format(name, n, cur_descriptor) + ) + cur_descriptor = "{0}.conv{1}".format(name, n) cur_num_filters = next_num_filters - - if self.config['bypass-source'] == 'noop': - dim = self.descriptors['input']['dim'] - configs.append('component name={0}.noop dim={1} type=NoOpComponent'.format( - name, dim)) - configs.append('component-node name={0}.noop component={0}.noop ' - 'input=Sum({1}, {0}.conv3)'.format(name, - input_descriptor)) + if self.config["bypass-source"] == "noop": + dim = self.descriptors["input"]["dim"] + configs.append( + "component name={0}.noop dim={1} type=NoOpComponent".format(name, dim) + ) + configs.append( + "component-node name={0}.noop component={0}.noop " + "input=Sum({1}, {0}.conv3)".format(name, input_descriptor) + ) # Note: the function 'output_name' is responsible for returning the # descriptor corresponding to the output of the network. 
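As a side note between these two hunks: the bypass wiring above reduces to plain descriptor strings. Below is a minimal sketch, assuming a hypothetical layer name `res1` with input descriptor `prev.layer` (neither name appears in the patch), of the strings produced by the `noop` and `input` bypass modes:

```python
# Hypothetical names for illustration only; the real values come from
# self.config and self.descriptors in the classes above.
name = "res1"
input_descriptor = "prev.layer"

# bypass-source=noop: a NoOpComponent performs the residual sum itself.
noop_node = (
    "component-node name={0}.noop component={0}.noop "
    "input=Sum({1}, {0}.conv3)".format(name, input_descriptor)
)

# bypass-source=input: output_name() returns the Sum descriptor directly.
output_descriptor = "Sum({0}.conv3, {1})".format(name, input_descriptor)

print(noop_node)          # ... input=Sum(prev.layer, res1.conv3)
print(output_descriptor)  # Sum(res1.conv3, prev.layer)
```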
@@ -772,82 +927,98 @@ def _generate_bottleneck_resblock_config(self): # the following is also passed into the convolution components, if specified: # l2-regularize (float) + class XconfigRes2Block(XconfigLayerBase): - def __init__(self, first_token, key_to_value, prev_names = None): - assert first_token == 'res2-block' + def __init__(self, first_token, key_to_value, prev_names=None): + assert first_token == "res2-block" XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): - self.config = {'input':'[-1]', - 'height':-1, # sets height-in and height-out - 'height-in':-1, - 'height-out':-1, - 'num-filters':-1, # interpreted as num-filters-out. - 'num-bottleneck-filters':-1, - 'time-period':1, - 'self-repair-scale': 2.0e-05, - 'self-repair-lower-threshold1': 0.05, - 'self-repair-lower-threshold2': 0.05, - 'self-repair-lower-threshold3': 0.05, - 'max-change': 0.75, - 'allow-zero-padding': True, - # the following are not really inspected by this level of - # code, just passed through (but not if left at ''). - 'param-stddev':'', 'bias-stddev':'', - 'use-natural-gradient':'', - 'rank-in':'', 'rank-out':'', - 'num-minibatches-history':'', - 'alpha-in':'', 'alpha-out':'', - 'l2-regularize':'' } + self.config = { + "input": "[-1]", + "height": -1, # sets height-in and height-out + "height-in": -1, + "height-out": -1, + "num-filters": -1, # interpreted as num-filters-out. + "num-bottleneck-filters": -1, + "time-period": 1, + "self-repair-scale": 2.0e-05, + "self-repair-lower-threshold1": 0.05, + "self-repair-lower-threshold2": 0.05, + "self-repair-lower-threshold3": 0.05, + "max-change": 0.75, + "allow-zero-padding": True, + # the following are not really inspected by this level of + # code, just passed through (but not if left at ''). + "param-stddev": "", + "bias-stddev": "", + "use-natural-gradient": "", + "rank-in": "", + "rank-out": "", + "num-minibatches-history": "", + "alpha-in": "", + "alpha-out": "", + "l2-regularize": "", + } def set_derived_configs(self): - input_dim = self.descriptors['input']['dim'] - - if not ((self.config['height'] > 0 and self.config['height-in'] == -1 and - self.config['height-out'] == -1) or - (self.config['height-out'] > 0 and self.config['height-in'] > 0)): - raise RuntimeError("You must specify height, or height-in and height-out, for res2-block.") - - if not (self.config['height-in'] > 0 and self.config['height-out'] > 0): - height = self.config['height'] + input_dim = self.descriptors["input"]["dim"] + + if not ( + ( + self.config["height"] > 0 + and self.config["height-in"] == -1 + and self.config["height-out"] == -1 + ) + or (self.config["height-out"] > 0 and self.config["height-in"] > 0) + ): + raise RuntimeError( + "You must specify height, or height-in and height-out, for res2-block." + ) + + if not (self.config["height-in"] > 0 and self.config["height-out"] > 0): + height = self.config["height"] if not height > 0: - raise RuntimeError("You must specify either height, or height-in and height-out, for " - "res2-block.") - self.config['height-in'] = height - self.config['height-out'] = height - - height_in = self.config['height-in'] + raise RuntimeError( + "You must specify either height, or height-in and height-out, for " + "res2-block." 
+                )
+            self.config["height-in"] = height
+            self.config["height-out"] = height
+
+        height_in = self.config["height-in"]
         if input_dim % height_in != 0:
-            raise RuntimeError("Specified input image height {0} does not "
-                               "divide the input dim {1}".format(
-                                   height_in, input_dim))
-        self.config['num-filters'] = input_dim / height
+            raise RuntimeError(
+                "Specified input image height {0} does not "
+                "divide the input dim {1}".format(height_in, input_dim)
+            )
+        self.config["num-filters"] = input_dim / height_in

     def check_configs(self):
-        if self.config['num-filters'] == -1:
+        if self.config["num-filters"] == -1:
             raise RuntimeError("You must specify num-filters for res2-block.")

     def auxiliary_outputs(self):
         return []

-    def output_name(self, auxiliary_output = None):
-        b = self.config['num-bottleneck-filters']
-        return ('{0}.relu2' if b <= 0 else '{0}.relu3').format(self.name)
+    def output_name(self, auxiliary_output=None):
+        b = self.config["num-bottleneck-filters"]
+        return ("{0}.relu2" if b <= 0 else "{0}.relu3").format(self.name)

-    def output_dim(self, auxiliary_output = None):
+    def output_dim(self, auxiliary_output=None):
         assert auxiliary_output is None
-        return self.config['height-out'] * self.config['num-filters']
+        return self.config["height-out"] * self.config["num-filters"]

     def get_full_config(self):
         ans = []
-        b = self.config['num-bottleneck-filters']
+        b = self.config["num-bottleneck-filters"]
         if b <= 0:
             config_lines = self._generate_normal_resblock_config()
         else:
             config_lines = self._generate_bottleneck_resblock_config()

         for line in config_lines:
-            for config_name in ['ref', 'final']:
+            for config_name in ["ref", "final"]:
                 # we do not support user specified matrices in CNN initialization
                 # so 'ref' and 'final' configs are the same.
                 ans.append((config_name, line))
@@ -869,136 +1040,182 @@ def get_full_config(self):
     def _generate_normal_resblock_config(self):
         configs = []
         name = self.name
-        assert self.config['num-bottleneck-filters'] == -1
-        input_dim = self.descriptors['input']['dim']
-        height_in = self.config['height-in']
-        height_out = self.config['height-out']
-        time_period_out = self.config['time-period']
+        assert self.config["num-bottleneck-filters"] == -1
+        input_dim = self.descriptors["input"]["dim"]
+        height_in = self.config["height-in"]
+        height_out = self.config["height-out"]
+        time_period_out = self.config["time-period"]
         if not input_dim % height_in == 0:
-            raise RuntimeError("input-dim {0} does not divide height-in {1}".format(
-                input_dim, height_in))
+            raise RuntimeError(
+                "input-dim {0} does not divide height-in {1}".format(
+                    input_dim, height_in
+                )
+            )
         num_filters_in = input_dim / height_in
-        num_filters_out = self.config['num-filters']
+        num_filters_out = self.config["num-filters"]
         if height_out != height_in:
             if height_out < height_in / 2 - 1 or height_out > height_in / 2 + 1:
-                raise RuntimeError("Expected height-out to be about half height-in, or the same: "
-                                   "height-in={0} height-out={1}".format(height_in, height_out))
+                raise RuntimeError(
+                    "Expected height-out to be about half height-in, or the same: "
+                    "height-in={0} height-out={1}".format(height_in, height_out)
+                )
             if not time_period_out % 2 == 0:
-                raise RuntimeError("Expected time-period to be a multiple of 2 if you are subsampling "
-                                   "on height.")
+                raise RuntimeError(
+                    "Expected time-period to be a multiple of 2 if you are subsampling "
+                    "on height."
+ ) time_period_in = time_period_out / 2 height_subsample = 2 else: time_period_in = time_period_out height_subsample = 1 - cur_time_period = time_period_in cur_num_filters = num_filters_in cur_height = height_in - input_descriptor = self.descriptors['input']['final-string'] - allow_zero_padding = self.config['allow-zero-padding'] + input_descriptor = self.descriptors["input"]["final-string"] + allow_zero_padding = self.config["allow-zero-padding"] if height_subsample == 1 and num_filters_in == num_filters_out: bypass_descriptor = input_descriptor else: - bypass_descriptor = '{0}.conv_bypass'.format(name) + bypass_descriptor = "{0}.conv_bypass".format(name) cur_descriptor = input_descriptor # get miscellaneous convolution options passed in from the xconfig line a = [] for opt_name in [ - 'param-stddev', 'bias-stddev', 'use-natural-gradient', - 'max-change', 'rank-in', 'rank-out', 'num-minibatches-history', - 'alpha-in', 'alpha-out', 'l2-regularize' ]: + "param-stddev", + "bias-stddev", + "use-natural-gradient", + "max-change", + "rank-in", + "rank-out", + "num-minibatches-history", + "alpha-in", + "alpha-out", + "l2-regularize", + ]: value = self.config[opt_name] - if value != '': - a.append('{0}={1}'.format(opt_name, value)) - misc_conv_opts = ' '.join(a) + if value != "": + a.append("{0}={1}".format(opt_name, value)) + misc_conv_opts = " ".join(a) for n in [1, 2]: # the convolution. - conv_opts = ('height-in={hi} height-out={ho} height-offsets=-1,0,1 ' - 'height-subsample-out={hs} ' - 'time-offsets=-{p},0,{p} ' - 'num-filters-in={fi} num-filters-out={fo} {r} {o}'.format( - hi=cur_height, ho=height_out, - p=cur_time_period, - hs=(height_subsample if n == 1 else 1), - fi=cur_num_filters, - fo=num_filters_out, - r=('required-time-offsets=0' if allow_zero_padding else ''), - o=misc_conv_opts)) - - configs.append('component name={0}.conv{1} type=TimeHeightConvolutionComponent ' - '{2}'.format(name, n, conv_opts)) - configs.append('component-node name={0}.conv{1} component={0}.conv{1} ' - 'input={2}'.format(name, n, cur_descriptor)) - cur_descriptor = '{0}.conv{1}'.format(name, n) + conv_opts = ( + "height-in={hi} height-out={ho} height-offsets=-1,0,1 " + "height-subsample-out={hs} " + "time-offsets=-{p},0,{p} " + "num-filters-in={fi} num-filters-out={fo} {r} {o}".format( + hi=cur_height, + ho=height_out, + p=cur_time_period, + hs=(height_subsample if n == 1 else 1), + fi=cur_num_filters, + fo=num_filters_out, + r=("required-time-offsets=0" if allow_zero_padding else ""), + o=misc_conv_opts, + ) + ) + + configs.append( + "component name={0}.conv{1} type=TimeHeightConvolutionComponent " + "{2}".format(name, n, conv_opts) + ) + configs.append( + "component-node name={0}.conv{1} component={0}.conv{1} " + "input={2}".format(name, n, cur_descriptor) + ) + cur_descriptor = "{0}.conv{1}".format(name, n) cur_num_filters = num_filters_out cur_height = height_out cur_time_period = time_period_out # the batch-norm - configs.append('component name={0}.batchnorm{1} type=BatchNormComponent dim={2} ' - 'block-dim={3}'.format( - name, n, cur_num_filters * cur_height, - cur_num_filters)) - configs.append('component-node name={0}.batchnorm{1} component={0}.batchnorm{1} ' - 'input={2}'.format(name, n, cur_descriptor)) - cur_descriptor = '{0}.batchnorm{1}'.format(name, n) + configs.append( + "component name={0}.batchnorm{1} type=BatchNormComponent dim={2} " + "block-dim={3}".format( + name, n, cur_num_filters * cur_height, cur_num_filters + ) + ) + configs.append( + "component-node name={0}.batchnorm{1} 
component={0}.batchnorm{1} " + "input={2}".format(name, n, cur_descriptor) + ) + cur_descriptor = "{0}.batchnorm{1}".format(name, n) # the scale-and-offset - configs.append('component name={0}.scaleoffset{1} type=ScaleAndOffsetComponent dim={2} ' - 'block-dim={3}'.format( - name, n, cur_num_filters * cur_height, - cur_num_filters)) - configs.append('component-node name={0}.scaleoffset{1} component={0}.scaleoffset{1} ' - 'input={2}'.format(name, n, cur_descriptor)) - cur_descriptor = '{0}.scaleoffset{1}'.format(name, n) - + configs.append( + "component name={0}.scaleoffset{1} type=ScaleAndOffsetComponent dim={2} " + "block-dim={3}".format( + name, n, cur_num_filters * cur_height, cur_num_filters + ) + ) + configs.append( + "component-node name={0}.scaleoffset{1} component={0}.scaleoffset{1} " + "input={2}".format(name, n, cur_descriptor) + ) + cur_descriptor = "{0}.scaleoffset{1}".format(name, n) if n == 2: # the bypass connection - cur_descriptor = 'Sum({0}, {1})'.format(cur_descriptor, bypass_descriptor) - + cur_descriptor = "Sum({0}, {1})".format( + cur_descriptor, bypass_descriptor + ) # the ReLU - configs.append('component name={0}.relu{1} type=RectifiedLinearComponent ' - 'dim={2} block-dim={3} self-repair-scale={4} ' - 'self-repair-lower-threshold={5}'.format( - name, n, cur_num_filters * cur_height, cur_num_filters, - self.config['self-repair-scale'], - self.config['self-repair-lower-threshold{0}'.format(n)])) - configs.append('component-node name={0}.relu{1} component={0}.relu{1} ' - 'input={2}'.format(name, n, cur_descriptor)) - - cur_descriptor = '{0}.relu{1}'.format(name, n) + configs.append( + "component name={0}.relu{1} type=RectifiedLinearComponent " + "dim={2} block-dim={3} self-repair-scale={4} " + "self-repair-lower-threshold={5}".format( + name, + n, + cur_num_filters * cur_height, + cur_num_filters, + self.config["self-repair-scale"], + self.config["self-repair-lower-threshold{0}".format(n)], + ) + ) + configs.append( + "component-node name={0}.relu{1} component={0}.relu{1} " + "input={2}".format(name, n, cur_descriptor) + ) + + cur_descriptor = "{0}.relu{1}".format(name, n) if bypass_descriptor != input_descriptor: # We need to add the 1x1 bypass convolution because we're either doing height # subsampling or changing the number of filters. 
- conv_opts = ('height-in={hi} height-out={ho} height-offsets=0 ' - 'time-offsets=0 height-subsample-out={hs} ' - 'num-filters-in={fi} num-filters-out={fo} {o}'.format( - hi=height_in, ho=height_out, hs=height_subsample, - fi=num_filters_in, fo=num_filters_out, o=misc_conv_opts)) - configs.append('component name={0}.conv_bypass type=TimeHeightConvolutionComponent ' - '{1}'.format(name, conv_opts)) - configs.append('component-node name={0}.conv_bypass component={0}.conv_bypass ' - 'input={1}'.format(name, input_descriptor)) - - + conv_opts = ( + "height-in={hi} height-out={ho} height-offsets=0 " + "time-offsets=0 height-subsample-out={hs} " + "num-filters-in={fi} num-filters-out={fo} {o}".format( + hi=height_in, + ho=height_out, + hs=height_subsample, + fi=num_filters_in, + fo=num_filters_out, + o=misc_conv_opts, + ) + ) + configs.append( + "component name={0}.conv_bypass type=TimeHeightConvolutionComponent " + "{1}".format(name, conv_opts) + ) + configs.append( + "component-node name={0}.conv_bypass component={0}.conv_bypass " + "input={1}".format(name, input_descriptor) + ) # Note: the function 'output_name' is responsible for returning the # descriptor corresponding to the output of the network, which in # this case would be '{0}.relu2'.format(name). return configs - # _generate_bottleneck_resblock_config is a convenience function to generate the # res-block config (this is the bottleneck version, where there is # a 3x3 kernel with a smaller number of filters than at the input and output, @@ -1016,24 +1233,29 @@ def _generate_bottleneck_resblock_config(self): configs = [] name = self.name - num_bottleneck_filters = self.config['num-bottleneck-filters'] + num_bottleneck_filters = self.config["num-bottleneck-filters"] assert num_bottleneck_filters > 0 - input_dim = self.descriptors['input']['dim'] - height_in = self.config['height-in'] - height_out = self.config['height-out'] - input_descriptor = self.descriptors['input']['final-string'] - allow_zero_padding = self.config['allow-zero-padding'] - time_period_out = self.config['time-period'] + input_dim = self.descriptors["input"]["dim"] + height_in = self.config["height-in"] + height_out = self.config["height-out"] + input_descriptor = self.descriptors["input"]["final-string"] + allow_zero_padding = self.config["allow-zero-padding"] + time_period_out = self.config["time-period"] if not input_dim % height_in == 0: - raise RuntimeError("input-dim={0} does not divide height-in={1}".format( - input_dim, height_in)) + raise RuntimeError( + "input-dim={0} does not divide height-in={1}".format( + input_dim, height_in + ) + ) num_filters_in = input_dim / height_in - num_filters_out = self.config['num-filters'] + num_filters_out = self.config["num-filters"] if height_out != height_in: if height_out < height_in / 2 - 1 or height_out > height_in / 2 + 1: - raise RuntimeError("Expected height-out to be about half height-in, or the same: " - "height-in={0} height-out={1}".format(height_in, height_out)) + raise RuntimeError( + "Expected height-out to be about half height-in, or the same: " + "height-in={0} height-out={1}".format(height_in, height_out) + ) height_subsample = 2 else: height_subsample = 1 @@ -1044,92 +1266,138 @@ def _generate_bottleneck_resblock_config(self): if height_subsample == 1 and num_filters_in == num_filters_out: bypass_descriptor = input_descriptor else: - bypass_descriptor = '{0}.conv_bypass'.format(name) + bypass_descriptor = "{0}.conv_bypass".format(name) # get miscellaneous convolution options passed in from the xconfig line 
a = [] for opt_name in [ - 'param-stddev', 'bias-stddev', 'use-natural-gradient', - 'max-change', 'rank-in', 'rank-out', 'num-minibatches-history', - 'alpha-in', 'alpha-out', 'l2-regularize' ]: + "param-stddev", + "bias-stddev", + "use-natural-gradient", + "max-change", + "rank-in", + "rank-out", + "num-minibatches-history", + "alpha-in", + "alpha-out", + "l2-regularize", + ]: value = self.config[opt_name] - if value != '': - a.append('{0}={1}'.format(opt_name, value)) - misc_conv_opts = ' '.join(a) - + if value != "": + a.append("{0}={1}".format(opt_name, value)) + misc_conv_opts = " ".join(a) for n in [1, 2, 3]: # the convolution. - height_offsets = ('-1,0,1' if n == 2 else '0') + height_offsets = "-1,0,1" if n == 2 else "0" this_height_subsample = height_subsample if n == 1 else 1 - time_offsets = ('-{t},0,{t}'.format(t=time_period_out) if n == 2 else '0') - next_num_filters = (num_filters_out if n == 3 else num_bottleneck_filters) - - conv_opts = ('height-in={h_in} height-out={h_out} height-offsets={ho} time-offsets={to} ' - 'num-filters-in={fi} num-filters-out={fo} height-subsample-out={hs} ' - '{r} {o}'.format( - h_in=cur_height, h_out=height_out, - to=time_offsets, ho=height_offsets, - hs=this_height_subsample, - fi=cur_num_filters, fo=next_num_filters, - r=('required-time-offsets=0' if allow_zero_padding else ''), - o=misc_conv_opts)) - - configs.append('component name={0}.conv{1} type=TimeHeightConvolutionComponent ' - '{2}'.format(name, n, conv_opts)) - configs.append('component-node name={0}.conv{1} component={0}.conv{1} ' - 'input={2}'.format(name, n, cur_descriptor)) + time_offsets = "-{t},0,{t}".format(t=time_period_out) if n == 2 else "0" + next_num_filters = num_filters_out if n == 3 else num_bottleneck_filters + + conv_opts = ( + "height-in={h_in} height-out={h_out} height-offsets={ho} time-offsets={to} " + "num-filters-in={fi} num-filters-out={fo} height-subsample-out={hs} " + "{r} {o}".format( + h_in=cur_height, + h_out=height_out, + to=time_offsets, + ho=height_offsets, + hs=this_height_subsample, + fi=cur_num_filters, + fo=next_num_filters, + r=("required-time-offsets=0" if allow_zero_padding else ""), + o=misc_conv_opts, + ) + ) + + configs.append( + "component name={0}.conv{1} type=TimeHeightConvolutionComponent " + "{2}".format(name, n, conv_opts) + ) + configs.append( + "component-node name={0}.conv{1} component={0}.conv{1} " + "input={2}".format(name, n, cur_descriptor) + ) cur_num_filters = next_num_filters cur_height = height_out - cur_descriptor = '{0}.conv{1}'.format(name, n) + cur_descriptor = "{0}.conv{1}".format(name, n) # the batch-norm - configs.append('component name={0}.batchnorm{1} type=BatchNormComponent dim={2} ' - 'block-dim={3}'.format( - name, n, cur_num_filters * cur_height, - cur_num_filters)) - configs.append('component-node name={0}.batchnorm{1} component={0}.batchnorm{1} ' - 'input={2}'.format(name, n, cur_descriptor)) - cur_descriptor = '{0}.batchnorm{1}'.format(name, n) + configs.append( + "component name={0}.batchnorm{1} type=BatchNormComponent dim={2} " + "block-dim={3}".format( + name, n, cur_num_filters * cur_height, cur_num_filters + ) + ) + configs.append( + "component-node name={0}.batchnorm{1} component={0}.batchnorm{1} " + "input={2}".format(name, n, cur_descriptor) + ) + cur_descriptor = "{0}.batchnorm{1}".format(name, n) # the scale and offset - configs.append('component name={0}.scaleoffset{1} type=ScaleAndOffsetComponent dim={2} ' - 'block-dim={3}'.format( - name, n, cur_num_filters * cur_height, - cur_num_filters)) - 
configs.append('component-node name={0}.scaleoffset{1} component={0}.scaleoffset{1} ' - 'input={2}'.format(name, n, cur_descriptor)) - cur_descriptor = '{0}.scaleoffset{1}'.format(name, n) + configs.append( + "component name={0}.scaleoffset{1} type=ScaleAndOffsetComponent dim={2} " + "block-dim={3}".format( + name, n, cur_num_filters * cur_height, cur_num_filters + ) + ) + configs.append( + "component-node name={0}.scaleoffset{1} component={0}.scaleoffset{1} " + "input={2}".format(name, n, cur_descriptor) + ) + cur_descriptor = "{0}.scaleoffset{1}".format(name, n) if n == 3: # the bypass connection - cur_descriptor = 'Sum({0}, {1})'.format(cur_descriptor, bypass_descriptor) + cur_descriptor = "Sum({0}, {1})".format( + cur_descriptor, bypass_descriptor + ) # the ReLU - configs.append('component name={0}.relu{1} type=RectifiedLinearComponent ' - 'dim={2} block-dim={3} self-repair-scale={4} ' - 'self-repair-lower-threshold={5}'.format( - name, n, cur_num_filters * cur_height, cur_num_filters, - self.config['self-repair-scale'], - self.config['self-repair-lower-threshold{0}'.format(n)])) - configs.append('component-node name={0}.relu{1} component={0}.relu{1} ' - 'input={2}'.format(name, n, cur_descriptor)) - - cur_descriptor = '{0}.relu{1}'.format(name, n) + configs.append( + "component name={0}.relu{1} type=RectifiedLinearComponent " + "dim={2} block-dim={3} self-repair-scale={4} " + "self-repair-lower-threshold={5}".format( + name, + n, + cur_num_filters * cur_height, + cur_num_filters, + self.config["self-repair-scale"], + self.config["self-repair-lower-threshold{0}".format(n)], + ) + ) + configs.append( + "component-node name={0}.relu{1} component={0}.relu{1} " + "input={2}".format(name, n, cur_descriptor) + ) + + cur_descriptor = "{0}.relu{1}".format(name, n) if bypass_descriptor != input_descriptor: # We need to add the 1x1 bypass convolution because we're either doing height # subsampling or changing the number of filters. - conv_opts = ('height-in={hi} height-out={ho} height-offsets=0 ' - 'time-offsets=0 height-subsample-out={hs} ' - 'num-filters-in={fi} num-filters-out={fo} {o}'.format( - hi=height_in, ho=height_out, hs=height_subsample, - fi=num_filters_in, fo=num_filters_out, o=misc_conv_opts)) - configs.append('component name={0}.conv_bypass type=TimeHeightConvolutionComponent ' - '{1}'.format(name, conv_opts)) - configs.append('component-node name={0}.conv_bypass component={0}.conv_bypass ' - 'input={1}'.format(name, input_descriptor)) + conv_opts = ( + "height-in={hi} height-out={ho} height-offsets=0 " + "time-offsets=0 height-subsample-out={hs} " + "num-filters-in={fi} num-filters-out={fo} {o}".format( + hi=height_in, + ho=height_out, + hs=height_subsample, + fi=num_filters_in, + fo=num_filters_out, + o=misc_conv_opts, + ) + ) + configs.append( + "component name={0}.conv_bypass type=TimeHeightConvolutionComponent " + "{1}".format(name, conv_opts) + ) + configs.append( + "component-node name={0}.conv_bypass component={0}.conv_bypass " + "input={1}".format(name, input_descriptor) + ) # Note: the function 'output_name' is responsible for returning the # descriptor corresponding to the output of the network, which @@ -1143,61 +1411,65 @@ def _generate_bottleneck_resblock_config(self): # An example line using this layer is: # channel-average-layer name=channel-average input=Append(2, 4, 6, 8) dim=64 + # the configuration value 'dim' is the output dimension of this layer. # The input dimension is expected to be a multiple of 'dim'. 
The output # will be the average of 'dim'-sized blocks of the input. class ChannelAverageLayer(XconfigLayerBase): - def __init__(self, first_token, key_to_value, prev_names = None): + def __init__(self, first_token, key_to_value, prev_names=None): assert first_token == "channel-average-layer" XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): - self.config = {'input':'[-1]', - 'dim': -1 } + self.config = {"input": "[-1]", "dim": -1} def set_derived_configs(self): pass def check_configs(self): - input_dim = self.descriptors['input']['dim'] - dim = self.config['dim'] + input_dim = self.descriptors["input"]["dim"] + dim = self.config["dim"] if dim <= 0: raise RuntimeError("dim must be specified and > 0.") if input_dim % dim != 0: - raise RuntimeError("input-dim={0} is not a multiple of dim={1}".format( - input_dim, dim)) + raise RuntimeError( + "input-dim={0} is not a multiple of dim={1}".format(input_dim, dim) + ) def auxiliary_outputs(self): return [] - def output_name(self, auxiliary_output = None): + def output_name(self, auxiliary_output=None): assert auxiliary_output is None return self.name - def output_dim(self, auxiliary_output = None): + def output_dim(self, auxiliary_output=None): assert auxiliary_output is None - return self.config['dim'] - + return self.config["dim"] def get_full_config(self): ans = [] config_lines = self._generate_channel_average_config() for line in config_lines: - for config_name in ['ref', 'final']: + for config_name in ["ref", "final"]: ans.append((config_name, line)) return ans def _generate_channel_average_config(self): configs = [] name = self.name - input_dim = self.descriptors['input']['dim'] - input_descriptor = self.descriptors['input']['final-string'] - dim = self.config['dim'] + input_dim = self.descriptors["input"]["dim"] + input_descriptor = self.descriptors["input"]["final-string"] + dim = self.config["dim"] # choose the scale that makes it an average rather than a sum. scale = dim * 1.0 / input_dim - configs.append('component name={0} type=SumBlockComponent input-dim={1} ' - 'output-dim={2} scale={3}'.format(name, input_dim, - dim, scale)) - configs.append('component-node name={0} component={0} input={1}'.format( - name, input_descriptor)) + configs.append( + "component name={0} type=SumBlockComponent input-dim={1} " + "output-dim={2} scale={3}".format(name, input_dim, dim, scale) + ) + configs.append( + "component-node name={0} component={0} input={1}".format( + name, input_descriptor + ) + ) return configs diff --git a/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/gru.py b/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/gru.py index 2f387a6a1e5..bb5b2ce5742 100644 --- a/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/gru.py +++ b/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/gru.py @@ -13,6 +13,7 @@ import sys from libs.nnet3.xconfig.basic_layers import XconfigLayerBase + # This class is for lines like # 'gru-layer name=gru1 input=[-1] delay=-3' # It generates an GRU sub-graph without output projections. 
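For orientation before the reformatted `gru.py` hunks: the config lines below wire up a projection-free GRU. Here is a minimal NumPy sketch of one step of that recurrence, a simplification under stated assumptions: biases and the BackpropTruncationComponent on the recurrence are omitted, and `s_prev` stands for the delayed state `IfDefined(Offset(s_t, delay))`.

```python
import numpy as np


def sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))


def gru_step(x, s_prev, W_z, W_r, W_h):
    """One GRU step as wired below: gates z_t/r_t, candidate h_t, and
    y_t = (1 - z_t) * h_t + z_t * s_prev, which becomes the new state s_t."""
    xs = np.concatenate([x, s_prev])
    z = sigmoid(W_z @ xs)                                # {name}.z_t
    r = sigmoid(W_r @ xs)                                # {name}.r_t
    h = np.tanh(W_h @ np.concatenate([x, r * s_prev]))   # {name}.h_t via h1_t
    return (1.0 - z) * h + z * s_prev                    # y1_t + y2_t -> s_t
```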
@@ -34,51 +35,56 @@ # ng-per-element-scale-options='' [Additional options used for the diagonal matrices in the GRU/LSTM ] # ng-affine-options='' [Additional options used for the full matrices in the GRU/LSTM, can be used to do things like set biases to initialize to 1] class XconfigGruLayer(XconfigLayerBase): - def __init__(self, first_token, key_to_value, prev_names = None): + def __init__(self, first_token, key_to_value, prev_names=None): assert first_token == "gru-layer" XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): - self.config = {'input':'[-1]', - 'cell-dim' : -1, # this is a compulsory argument - 'clipping-threshold' : 30.0, - 'delay' : -1, - 'ng-per-element-scale-options' : ' max-change=0.75', - 'ng-affine-options' : ' max-change=0.75 ', - 'self-repair-scale-nonlinearity' : 0.00001, - 'zeroing-interval' : 20, - 'zeroing-threshold' : 15.0 - } + self.config = { + "input": "[-1]", + "cell-dim": -1, # this is a compulsory argument + "clipping-threshold": 30.0, + "delay": -1, + "ng-per-element-scale-options": " max-change=0.75", + "ng-affine-options": " max-change=0.75 ", + "self-repair-scale-nonlinearity": 0.00001, + "zeroing-interval": 20, + "zeroing-threshold": 15.0, + } def set_derived_configs(self): - if self.config['cell-dim'] <= 0: - self.config['cell-dim'] = self.descriptors['input']['dim'] + if self.config["cell-dim"] <= 0: + self.config["cell-dim"] = self.descriptors["input"]["dim"] def check_configs(self): - key = 'cell-dim' - if self.config['cell-dim'] <= 0: - raise RuntimeError("cell-dim has invalid value {0}.".format(self.config[key])) + key = "cell-dim" + if self.config["cell-dim"] <= 0: + raise RuntimeError( + "cell-dim has invalid value {0}.".format(self.config[key]) + ) - if self.config['delay'] == 0: + if self.config["delay"] == 0: raise RuntimeError("delay cannot be zero") - for key in ['self-repair-scale-nonlinearity']: + for key in ["self-repair-scale-nonlinearity"]: if self.config[key] < 0.0 or self.config[key] > 1.0: - raise RuntimeError("{0} has invalid value {1}.".format(key, self.config[key])) + raise RuntimeError( + "{0} has invalid value {1}.".format(key, self.config[key]) + ) - def output_name(self, auxiliary_output = None): - node_name = 's_t' - return '{0}.{1}'.format(self.name, node_name) + def output_name(self, auxiliary_output=None): + node_name = "s_t" + return "{0}.{1}".format(self.name, node_name) - def output_dim(self, auxiliary_output = None): - return self.config['cell-dim'] + def output_dim(self, auxiliary_output=None): + return self.config["cell-dim"] def get_full_config(self): ans = [] config_lines = self.generate_gru_config() for line in config_lines: - for config_name in ['ref', 'final']: + for config_name in ["ref", "final"]: # we do not support user specified matrices in LSTM initialization # so 'ref' and 'final' configs are the same. 
ans.append((config_name, line)) @@ -86,30 +92,40 @@ def get_full_config(self): # convenience function to generate the GRU config def generate_gru_config(self): - # assign some variables to reduce verbosity name = self.name # in the below code we will just call descriptor_strings as descriptors for conciseness - input_dim = self.descriptors['input']['dim'] - input_descriptor = self.descriptors['input']['final-string'] - cell_dim = self.config['cell-dim'] - delay = self.config['delay'] - bptrunc_str = ("clipping-threshold={0}" - " zeroing-threshold={1}" - " zeroing-interval={2}" - " recurrence-interval={3}" - "".format(self.config['clipping-threshold'], - self.config['zeroing-threshold'], - self.config['zeroing-interval'], abs(delay))) - repair_nonlin = self.config['self-repair-scale-nonlinearity'] - repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else '' - affine_str = self.config['ng-affine-options'] + input_dim = self.descriptors["input"]["dim"] + input_descriptor = self.descriptors["input"]["final-string"] + cell_dim = self.config["cell-dim"] + delay = self.config["delay"] + bptrunc_str = ( + "clipping-threshold={0}" + " zeroing-threshold={1}" + " zeroing-interval={2}" + " recurrence-interval={3}" + "".format( + self.config["clipping-threshold"], + self.config["zeroing-threshold"], + self.config["zeroing-interval"], + abs(delay), + ) + ) + repair_nonlin = self.config["self-repair-scale-nonlinearity"] + repair_nonlin_str = ( + "self-repair-scale={0:.10f}".format(repair_nonlin) + if repair_nonlin is not None + else "" + ) + affine_str = self.config["ng-affine-options"] # Natural gradient per element scale parameters # TODO: decide if we want to keep exposing these options - ng_per_element_scale_options = self.config['ng-per-element-scale-options'] - if re.search('param-mean', ng_per_element_scale_options) is None and \ - re.search('param-stddev', ng_per_element_scale_options) is None: - ng_per_element_scale_options += " param-mean=0.0 param-stddev=1.0 " + ng_per_element_scale_options = self.config["ng-per-element-scale-options"] + if ( + re.search("param-mean", ng_per_element_scale_options) is None + and re.search("param-stddev", ng_per_element_scale_options) is None + ): + ng_per_element_scale_options += " param-mean=0.0 param-stddev=1.0 " pes_str = ng_per_element_scale_options # formulation like: @@ -121,51 +137,129 @@ def generate_gru_config(self): configs = [] configs.append("# Update gate control : W_z* matrics") - configs.append("component name={0}.W_z.xs_z type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + cell_dim, cell_dim, affine_str)) - + configs.append( + "component name={0}.W_z.xs_z type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format( + name, input_dim + cell_dim, cell_dim, affine_str + ) + ) + configs.append("# Reset gate control : W_r* matrics") - configs.append("component name={0}.W_z.xs_r type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + cell_dim, cell_dim, affine_str)) + configs.append( + "component name={0}.W_z.xs_r type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format( + name, input_dim + cell_dim, cell_dim, affine_str + ) + ) configs.append("# h related matrix : W_h* matrics") - configs.append("component name={0}.W_h.UW type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + cell_dim, cell_dim , affine_str)) - + configs.append( + "component 
name={0}.W_h.UW type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format( + name, input_dim + cell_dim, cell_dim, affine_str + ) + ) + configs.append("# Defining the non-linearities") - configs.append("component name={0}.z type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) - configs.append("component name={0}.r type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) - configs.append("component name={0}.h type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append( + "component name={0}.z type=SigmoidComponent dim={1} {2}".format( + name, cell_dim, repair_nonlin_str + ) + ) + configs.append( + "component name={0}.r type=SigmoidComponent dim={1} {2}".format( + name, cell_dim, repair_nonlin_str + ) + ) + configs.append( + "component name={0}.h type=TanhComponent dim={1} {2}".format( + name, cell_dim, repair_nonlin_str + ) + ) configs.append("# Defining the components for other cell computations") - configs.append("component name={0}.h1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) - configs.append("component name={0}.y1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) - configs.append("component name={0}.y2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) - configs.append("component name={0}.y type=NoOpComponent dim={1}".format(name, cell_dim)) - - recurrent_connection = '{0}.s_t'.format(name) + configs.append( + "component name={0}.h1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format( + name, 2 * cell_dim, cell_dim + ) + ) + configs.append( + "component name={0}.y1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format( + name, 2 * cell_dim, cell_dim + ) + ) + configs.append( + "component name={0}.y2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format( + name, 2 * cell_dim, cell_dim + ) + ) + configs.append( + "component name={0}.y type=NoOpComponent dim={1}".format(name, cell_dim) + ) + + recurrent_connection = "{0}.s_t".format(name) configs.append("# z_t") - configs.append("component-node name={0}.z_t_pre component={0}.W_z.xs_z input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) - configs.append("component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format(name)) + configs.append( + "component-node name={0}.z_t_pre component={0}.W_z.xs_z input=Append({1}, IfDefined(Offset({2}, {3})))".format( + name, input_descriptor, recurrent_connection, delay + ) + ) + configs.append( + "component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format(name) + ) configs.append("# r_t") - configs.append("component-node name={0}.r_t_pre component={0}.W_z.xs_r input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) - configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_pre".format(name)) - + configs.append( + "component-node name={0}.r_t_pre component={0}.W_z.xs_r input=Append({1}, IfDefined(Offset({2}, {3})))".format( + name, input_descriptor, recurrent_connection, delay + ) + ) + configs.append( + "component-node name={0}.r_t component={0}.r input={0}.r_t_pre".format(name) + ) + configs.append("# h_t") - configs.append("component-node name={0}.h1_t component={0}.h1 input=Append({0}.r_t, IfDefined(Offset({1}, {2})))".format(name, recurrent_connection, 
delay)) - configs.append("component-node name={0}.h_t_pre component={0}.W_h.UW input=Append({1}, {0}.h1_t)".format(name, input_descriptor)) - configs.append("component-node name={0}.h_t component={0}.h input={0}.h_t_pre".format(name)) - + configs.append( + "component-node name={0}.h1_t component={0}.h1 input=Append({0}.r_t, IfDefined(Offset({1}, {2})))".format( + name, recurrent_connection, delay + ) + ) + configs.append( + "component-node name={0}.h_t_pre component={0}.W_h.UW input=Append({1}, {0}.h1_t)".format( + name, input_descriptor + ) + ) + configs.append( + "component-node name={0}.h_t component={0}.h input={0}.h_t_pre".format(name) + ) + configs.append("# y_t") configs.append("# The following two lines are to implement (1 - z_t)") - configs.append("component-node name={0}.y1_t component={0}.y1 input=Append({0}.h_t, Sum(Scale(-1.0,{0}.z_t), Const(1.0, {1})))".format(name, cell_dim)) - configs.append("component-node name={0}.y2_t component={0}.y2 input=Append(IfDefined(Offset({1}, {2})), {0}.z_t)".format(name, recurrent_connection, delay)) - configs.append("component-node name={0}.y_t component={0}.y input=Sum({0}.y1_t, {0}.y2_t)".format(name)) + configs.append( + "component-node name={0}.y1_t component={0}.y1 input=Append({0}.h_t, Sum(Scale(-1.0,{0}.z_t), Const(1.0, {1})))".format( + name, cell_dim + ) + ) + configs.append( + "component-node name={0}.y2_t component={0}.y2 input=Append(IfDefined(Offset({1}, {2})), {0}.z_t)".format( + name, recurrent_connection, delay + ) + ) + configs.append( + "component-node name={0}.y_t component={0}.y input=Sum({0}.y1_t, {0}.y2_t)".format( + name + ) + ) configs.append("# s_t : recurrence") - configs.append("component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format(name, cell_dim, bptrunc_str)) + configs.append( + "component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format( + name, cell_dim, bptrunc_str + ) + ) configs.append("# s_t will be output and recurrence") - configs.append("component-node name={0}.s_t component={0}.s_r input={0}.y_t".format(name)) + configs.append( + "component-node name={0}.s_t component={0}.s_r input={0}.y_t".format(name) + ) return configs @@ -194,85 +288,108 @@ def generate_gru_config(self): # ng-per-element-scale-options='' [Additional options used for the diagonal matrices in the GRU ] # ng-affine-options='' [Additional options used for the full matrices in the GRU, can be used to do things like set biases to initialize to 1] + class XconfigPgruLayer(XconfigLayerBase): - def __init__(self, first_token, key_to_value, prev_names = None): + def __init__(self, first_token, key_to_value, prev_names=None): assert first_token == "pgru-layer" XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): - self.config = {'input' : '[-1]', - 'cell-dim' : -1, # this is a compulsory argument - 'recurrent-projection-dim' : -1, # defaults to cell-dim / 4 - 'non-recurrent-projection-dim' : -1, # defaults to - # recurrent-projection-dim - 'clipping-threshold' : 30.0, - 'delay' : -1, - 'ng-per-element-scale-options' : ' max-change=0.75 ', - 'ng-affine-options' : ' max-change=0.75 ', - 'self-repair-scale-nonlinearity' : 0.00001, - 'zeroing-interval' : 20, - 'zeroing-threshold' : 15.0 - } + self.config = { + "input": "[-1]", + "cell-dim": -1, # this is a compulsory argument + "recurrent-projection-dim": -1, # defaults to cell-dim / 4 + "non-recurrent-projection-dim": -1, # defaults to + # recurrent-projection-dim + "clipping-threshold": 30.0, + "delay": -1, + 
"ng-per-element-scale-options": " max-change=0.75 ", + "ng-affine-options": " max-change=0.75 ", + "self-repair-scale-nonlinearity": 0.00001, + "zeroing-interval": 20, + "zeroing-threshold": 15.0, + } def set_derived_configs(self): - if self.config['recurrent-projection-dim'] <= 0: - self.config['recurrent-projection-dim'] = self.config['cell-dim'] / 4 + if self.config["recurrent-projection-dim"] <= 0: + self.config["recurrent-projection-dim"] = self.config["cell-dim"] / 4 - if self.config['non-recurrent-projection-dim'] <= 0: - self.config['non-recurrent-projection-dim'] = \ - self.config['recurrent-projection-dim'] + if self.config["non-recurrent-projection-dim"] <= 0: + self.config["non-recurrent-projection-dim"] = self.config[ + "recurrent-projection-dim" + ] def check_configs(self): - for key in ['cell-dim', 'recurrent-projection-dim', - 'non-recurrent-projection-dim']: + for key in [ + "cell-dim", + "recurrent-projection-dim", + "non-recurrent-projection-dim", + ]: if self.config[key] <= 0: - raise RuntimeError("{0} has invalid value {1}.".format( - key, self.config[key])) + raise RuntimeError( + "{0} has invalid value {1}.".format(key, self.config[key]) + ) - if self.config['delay'] == 0: + if self.config["delay"] == 0: raise RuntimeError("delay cannot be zero") - if (self.config['recurrent-projection-dim'] + - self.config['non-recurrent-projection-dim'] > - self.config['cell-dim']): - raise RuntimeError("recurrent+non-recurrent projection dim exceeds " - "cell dim.") - for key in ['self-repair-scale-nonlinearity']: + if ( + self.config["recurrent-projection-dim"] + + self.config["non-recurrent-projection-dim"] + > self.config["cell-dim"] + ): + raise RuntimeError( + "recurrent+non-recurrent projection dim exceeds " "cell dim." + ) + for key in ["self-repair-scale-nonlinearity"]: if self.config[key] < 0.0 or self.config[key] > 1.0: - raise RuntimeError("{0} has invalid value {2}." 
- .format(self.layer_type, key, - self.config[key])) + raise RuntimeError( + "{0} has invalid value {2}.".format( + self.layer_type, key, self.config[key] + ) + ) def auxiliary_outputs(self): - return ['h_t'] + return ["h_t"] - def output_name(self, auxiliary_output = None): - node_name = 'sn_t' + def output_name(self, auxiliary_output=None): + node_name = "sn_t" if auxiliary_output is not None: if auxiliary_output in self.auxiliary_outputs(): node_name = auxiliary_output else: - raise Exception("In {0} of type {1}, unknown auxiliary output name {1}".format(self.layer_type, auxiliary_output)) + raise Exception( + "In {0} of type {1}, unknown auxiliary output name {1}".format( + self.layer_type, auxiliary_output + ) + ) - return '{0}.{1}'.format(self.name, node_name) + return "{0}.{1}".format(self.name, node_name) - def output_dim(self, auxiliary_output = None): + def output_dim(self, auxiliary_output=None): if auxiliary_output is not None: if auxiliary_output in self.auxiliary_outputs(): - if node_name == 'c_t': - return self.config['cell-dim'] + if node_name == "c_t": + return self.config["cell-dim"] # add code for other auxiliary_outputs here when we decide to expose them else: - raise Exception("In {0} of type {1}, unknown auxiliary output name {1}".format(self.layer_type, auxiliary_output)) + raise Exception( + "In {0} of type {1}, unknown auxiliary output name {1}".format( + self.layer_type, auxiliary_output + ) + ) - return self.config['recurrent-projection-dim'] + self.config['non-recurrent-projection-dim'] + return ( + self.config["recurrent-projection-dim"] + + self.config["non-recurrent-projection-dim"] + ) def get_full_config(self): ans = [] config_lines = self.generate_pgru_config() for line in config_lines: - for config_name in ['ref', 'final']: + for config_name in ["ref", "final"]: # we do not support user specified matrices in LSTM initialization # so 'ref' and 'final' configs are the same. 
ans.append((config_name, line)) @@ -280,34 +397,43 @@ def get_full_config(self): # convenience function to generate the PGRU config def generate_pgru_config(self): - # assign some variables to reduce verbosity name = self.name # in the below code we will just call descriptor_strings as descriptors for conciseness - input_dim = self.descriptors['input']['dim'] - input_descriptor = self.descriptors['input']['final-string'] - cell_dim = self.config['cell-dim'] - rec_proj_dim = self.config['recurrent-projection-dim'] - nonrec_proj_dim = self.config['non-recurrent-projection-dim'] - delay = self.config['delay'] - repair_nonlin = self.config['self-repair-scale-nonlinearity'] - repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else '' - bptrunc_str = ("clipping-threshold={0}" - " zeroing-threshold={1}" - " zeroing-interval={2}" - " recurrence-interval={3}" - "".format(self.config['clipping-threshold'], - self.config['zeroing-threshold'], - self.config['zeroing-interval'], - abs(delay))) - affine_str = self.config['ng-affine-options'] - pes_str = self.config['ng-per-element-scale-options'] + input_dim = self.descriptors["input"]["dim"] + input_descriptor = self.descriptors["input"]["final-string"] + cell_dim = self.config["cell-dim"] + rec_proj_dim = self.config["recurrent-projection-dim"] + nonrec_proj_dim = self.config["non-recurrent-projection-dim"] + delay = self.config["delay"] + repair_nonlin = self.config["self-repair-scale-nonlinearity"] + repair_nonlin_str = ( + "self-repair-scale={0:.10f}".format(repair_nonlin) + if repair_nonlin is not None + else "" + ) + bptrunc_str = ( + "clipping-threshold={0}" + " zeroing-threshold={1}" + " zeroing-interval={2}" + " recurrence-interval={3}" + "".format( + self.config["clipping-threshold"], + self.config["zeroing-threshold"], + self.config["zeroing-interval"], + abs(delay), + ) + ) + affine_str = self.config["ng-affine-options"] + pes_str = self.config["ng-per-element-scale-options"] # Natural gradient per element scale parameters # TODO: decide if we want to keep exposing these options - if re.search('param-mean', pes_str) is None and \ - re.search('param-stddev', pes_str) is None: - pes_str += " param-mean=0.0 param-stddev=1.0 " + if ( + re.search("param-mean", pes_str) is None + and re.search("param-stddev", pes_str) is None + ): + pes_str += " param-mean=0.0 param-stddev=1.0 " # formulation like: # z_t = \sigmoid ( x_t * U^z + s_{t-1} * W^z ) // update gate @@ -316,57 +442,149 @@ def generate_pgru_config(self): # h_t = ( 1 - z_t ) \dot \tilde{h}_t + z_t \dot h_{t-1} # y_t = h_t * W^y # s_t = y_t (0:rec_proj_dim-1) - + configs = [] configs.append("# Update gate control : W_z* matrics") - configs.append("component name={0}.W_z.xs_z type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str)) - + configs.append( + "component name={0}.W_z.xs_z type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format( + name, input_dim + rec_proj_dim, cell_dim, affine_str + ) + ) + configs.append("# Reset gate control : W_r* matrics") - configs.append("component name={0}.W_z.xs_r type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, rec_proj_dim, affine_str)) + configs.append( + "component name={0}.W_z.xs_r type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format( + name, input_dim + rec_proj_dim, rec_proj_dim, affine_str + ) + ) configs.append("# h 
related matrix : W_h* matrics") - configs.append("component name={0}.W_h.UW type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim , affine_str)) - + configs.append( + "component name={0}.W_h.UW type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format( + name, input_dim + rec_proj_dim, cell_dim, affine_str + ) + ) + configs.append("# Defining the non-linearities") - configs.append("component name={0}.z type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) - configs.append("component name={0}.r type=SigmoidComponent dim={1} {2}".format(name, rec_proj_dim, repair_nonlin_str)) - configs.append("component name={0}.h type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append( + "component name={0}.z type=SigmoidComponent dim={1} {2}".format( + name, cell_dim, repair_nonlin_str + ) + ) + configs.append( + "component name={0}.r type=SigmoidComponent dim={1} {2}".format( + name, rec_proj_dim, repair_nonlin_str + ) + ) + configs.append( + "component name={0}.h type=TanhComponent dim={1} {2}".format( + name, cell_dim, repair_nonlin_str + ) + ) configs.append("# Defining the components for other cell computations") - configs.append("component name={0}.h1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * rec_proj_dim, rec_proj_dim)) - configs.append("component name={0}.y1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) - configs.append("component name={0}.y2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) - configs.append("component name={0}.y type=NoOpComponent dim={1}".format(name, cell_dim)) - - recurrent_connection = '{0}.s_t'.format(name) - recurrent_connection_y = '{0}.y_t'.format(name) + configs.append( + "component name={0}.h1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format( + name, 2 * rec_proj_dim, rec_proj_dim + ) + ) + configs.append( + "component name={0}.y1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format( + name, 2 * cell_dim, cell_dim + ) + ) + configs.append( + "component name={0}.y2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format( + name, 2 * cell_dim, cell_dim + ) + ) + configs.append( + "component name={0}.y type=NoOpComponent dim={1}".format(name, cell_dim) + ) + + recurrent_connection = "{0}.s_t".format(name) + recurrent_connection_y = "{0}.y_t".format(name) configs.append("# z_t") - configs.append("component-node name={0}.z_t_pre component={0}.W_z.xs_z input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) - configs.append("component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format(name)) + configs.append( + "component-node name={0}.z_t_pre component={0}.W_z.xs_z input=Append({1}, IfDefined(Offset({2}, {3})))".format( + name, input_descriptor, recurrent_connection, delay + ) + ) + configs.append( + "component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format(name) + ) configs.append("# r_t") - configs.append("component-node name={0}.r_t_pre component={0}.W_z.xs_r input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) - configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_pre".format(name)) + configs.append( + "component-node name={0}.r_t_pre component={0}.W_z.xs_r input=Append({1}, 
IfDefined(Offset({2}, {3})))".format( + name, input_descriptor, recurrent_connection, delay + ) + ) + configs.append( + "component-node name={0}.r_t component={0}.r input={0}.r_t_pre".format(name) + ) configs.append("# h_t") - configs.append("component-node name={0}.h1_t component={0}.h1 input=Append({0}.r_t, IfDefined(Offset({1}, {2})))".format(name, recurrent_connection, delay)) - configs.append("component-node name={0}.h_t_pre component={0}.W_h.UW input=Append({1}, {0}.h1_t)".format(name, input_descriptor)) - configs.append("component-node name={0}.h_t component={0}.h input={0}.h_t_pre".format(name)) - - configs.append("component-node name={0}.y1_t component={0}.y1 input=Append({0}.h_t, Sum(Scale(-1.0,{0}.z_t), Const(1.0, {1})))".format(name, cell_dim)) - configs.append("component-node name={0}.y2_t component={0}.y2 input=Append(IfDefined(Offset({1}, {2})), {0}.z_t)".format(name, recurrent_connection_y, delay)) - - configs.append("component-node name={0}.y_t component={0}.y input=Sum({0}.y1_t, {0}.y2_t)".format(name)) + configs.append( + "component-node name={0}.h1_t component={0}.h1 input=Append({0}.r_t, IfDefined(Offset({1}, {2})))".format( + name, recurrent_connection, delay + ) + ) + configs.append( + "component-node name={0}.h_t_pre component={0}.W_h.UW input=Append({1}, {0}.h1_t)".format( + name, input_descriptor + ) + ) + configs.append( + "component-node name={0}.h_t component={0}.h input={0}.h_t_pre".format(name) + ) + + configs.append( + "component-node name={0}.y1_t component={0}.y1 input=Append({0}.h_t, Sum(Scale(-1.0,{0}.z_t), Const(1.0, {1})))".format( + name, cell_dim + ) + ) + configs.append( + "component-node name={0}.y2_t component={0}.y2 input=Append(IfDefined(Offset({1}, {2})), {0}.z_t)".format( + name, recurrent_connection_y, delay + ) + ) + + configs.append( + "component-node name={0}.y_t component={0}.y input=Sum({0}.y1_t, {0}.y2_t)".format( + name + ) + ) configs.append("# s_t recurrent") - configs.append("component name={0}.W_s.ys type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str)) - configs.append("component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format(name, rec_proj_dim, bptrunc_str)) + configs.append( + "component name={0}.W_s.ys type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format( + name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str + ) + ) + configs.append( + "component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format( + name, rec_proj_dim, bptrunc_str + ) + ) configs.append("# s_t and n_t : sn_t will be the output") - configs.append("component-node name={0}.sn_t component={0}.W_s.ys input={0}.y_t".format(name)) - configs.append("dim-range-node name={0}.s_t_preclip input-node={0}.sn_t dim-offset=0 dim={1}".format(name, rec_proj_dim)) - configs.append("component-node name={0}.s_t component={0}.s_r input={0}.s_t_preclip".format(name)) + configs.append( + "component-node name={0}.sn_t component={0}.W_s.ys input={0}.y_t".format( + name + ) + ) + configs.append( + "dim-range-node name={0}.s_t_preclip input-node={0}.sn_t dim-offset=0 dim={1}".format( + name, rec_proj_dim + ) + ) + configs.append( + "component-node name={0}.s_t component={0}.s_r input={0}.s_t_preclip".format( + name + ) + ) return configs @@ -397,92 +615,119 @@ def generate_pgru_config(self): # ng-per-element-scale-options='' [Additional options used for the diagonal matrices in the GRU ] # ng-affine-options='' [Additional options used for the 
full matrices in the GRU, can be used to do things like set biases to initialize to 1] + class XconfigNormPgruLayer(XconfigLayerBase): - def __init__(self, first_token, key_to_value, prev_names = None): + def __init__(self, first_token, key_to_value, prev_names=None): assert first_token == "norm-pgru-layer" XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): - self.config = {'input' : '[-1]', - 'cell-dim' : -1, # this is a compulsory argument - 'recurrent-projection-dim' : -1, # defaults to cell-dim / 4 - 'non-recurrent-projection-dim' : -1, # defaults to - # recurrent-projection-dim - 'clipping-threshold' : 30.0, - 'delay' : -1, - 'ng-per-element-scale-options' : ' max-change=0.75 ', - 'ng-affine-options' : ' max-change=0.75 ', - 'self-repair-scale-nonlinearity' : 0.00001, - 'zeroing-interval' : 20, - 'zeroing-threshold' : 15.0, - 'dropout-proportion' : -1.0, # If -1.0, no dropout components will be added - 'dropout-per-frame' : True # If False, regular dropout, not per frame. - } + self.config = { + "input": "[-1]", + "cell-dim": -1, # this is a compulsory argument + "recurrent-projection-dim": -1, # defaults to cell-dim / 4 + "non-recurrent-projection-dim": -1, # defaults to + # recurrent-projection-dim + "clipping-threshold": 30.0, + "delay": -1, + "ng-per-element-scale-options": " max-change=0.75 ", + "ng-affine-options": " max-change=0.75 ", + "self-repair-scale-nonlinearity": 0.00001, + "zeroing-interval": 20, + "zeroing-threshold": 15.0, + "dropout-proportion": -1.0, # If -1.0, no dropout components will be added + "dropout-per-frame": True, # If False, regular dropout, not per frame. + } def set_derived_configs(self): - if self.config['recurrent-projection-dim'] <= 0: - self.config['recurrent-projection-dim'] = self.config['cell-dim'] / 4 + if self.config["recurrent-projection-dim"] <= 0: + self.config["recurrent-projection-dim"] = self.config["cell-dim"] / 4 - if self.config['non-recurrent-projection-dim'] <= 0: - self.config['non-recurrent-projection-dim'] = \ - self.config['recurrent-projection-dim'] + if self.config["non-recurrent-projection-dim"] <= 0: + self.config["non-recurrent-projection-dim"] = self.config[ + "recurrent-projection-dim" + ] def check_configs(self): - for key in ['cell-dim', 'recurrent-projection-dim', - 'non-recurrent-projection-dim']: + for key in [ + "cell-dim", + "recurrent-projection-dim", + "non-recurrent-projection-dim", + ]: if self.config[key] <= 0: - raise RuntimeError("{0} has invalid value {1}.".format( - key, self.config[key])) + raise RuntimeError( + "{0} has invalid value {1}.".format(key, self.config[key]) + ) - if self.config['delay'] == 0: + if self.config["delay"] == 0: raise RuntimeError("delay cannot be zero") - if (self.config['recurrent-projection-dim'] + - self.config['non-recurrent-projection-dim'] > - self.config['cell-dim']): - raise RuntimeError("recurrent+non-recurrent projection dim exceeds " - "cell dim.") - for key in ['self-repair-scale-nonlinearity']: + if ( + self.config["recurrent-projection-dim"] + + self.config["non-recurrent-projection-dim"] + > self.config["cell-dim"] + ): + raise RuntimeError( + "recurrent+non-recurrent projection dim exceeds " "cell dim." + ) + for key in ["self-repair-scale-nonlinearity"]: if self.config[key] < 0.0 or self.config[key] > 1.0: - raise RuntimeError("{0} has invalid value {2}." 
- .format(self.layer_type, key, - self.config[key])) - if ((self.config['dropout-proportion'] > 1.0 or - self.config['dropout-proportion'] < 0.0) and - self.config['dropout-proportion'] != -1.0 ): - raise RuntimeError("dropout-proportion has invalid value {0}." - .format(self.config['dropout-proportion'])) + raise RuntimeError( + "{0} has invalid value {2}.".format( + self.layer_type, key, self.config[key] + ) + ) + if ( + self.config["dropout-proportion"] > 1.0 + or self.config["dropout-proportion"] < 0.0 + ) and self.config["dropout-proportion"] != -1.0: + raise RuntimeError( + "dropout-proportion has invalid value {0}.".format( + self.config["dropout-proportion"] + ) + ) def auxiliary_outputs(self): - return ['h_t'] + return ["h_t"] - def output_name(self, auxiliary_output = None): - node_name = 'sn_t' + def output_name(self, auxiliary_output=None): + node_name = "sn_t" if auxiliary_output is not None: if auxiliary_output in self.auxiliary_outputs(): node_name = auxiliary_output else: - raise Exception("In {0} of type {1}, unknown auxiliary output name {1}".format(self.layer_type, auxiliary_output)) + raise Exception( + "In {0} of type {1}, unknown auxiliary output name {1}".format( + self.layer_type, auxiliary_output + ) + ) - return '{0}.{1}'.format(self.name, node_name) + return "{0}.{1}".format(self.name, node_name) - def output_dim(self, auxiliary_output = None): + def output_dim(self, auxiliary_output=None): if auxiliary_output is not None: if auxiliary_output in self.auxiliary_outputs(): - if node_name == 'h_t': - return self.config['cell-dim'] + if node_name == "h_t": + return self.config["cell-dim"] # add code for other auxiliary_outputs here when we decide to expose them else: - raise Exception("In {0} of type {1}, unknown auxiliary output name {1}".format(self.layer_type, auxiliary_output)) + raise Exception( + "In {0} of type {1}, unknown auxiliary output name {1}".format( + self.layer_type, auxiliary_output + ) + ) - return self.config['recurrent-projection-dim'] + self.config['non-recurrent-projection-dim'] + return ( + self.config["recurrent-projection-dim"] + + self.config["non-recurrent-projection-dim"] + ) def get_full_config(self): ans = [] config_lines = self.generate_pgru_config() for line in config_lines: - for config_name in ['ref', 'final']: + for config_name in ["ref", "final"]: # we do not support user specified matrices in LSTM initialization # so 'ref' and 'final' configs are the same. 
ans.append((config_name, line)) @@ -490,36 +735,45 @@ def get_full_config(self): # convenience function to generate the Norm-PGRU config def generate_pgru_config(self): - # assign some variables to reduce verbosity name = self.name # in the below code we will just call descriptor_strings as descriptors for conciseness - input_dim = self.descriptors['input']['dim'] - input_descriptor = self.descriptors['input']['final-string'] - cell_dim = self.config['cell-dim'] - rec_proj_dim = self.config['recurrent-projection-dim'] - nonrec_proj_dim = self.config['non-recurrent-projection-dim'] - delay = self.config['delay'] - repair_nonlin = self.config['self-repair-scale-nonlinearity'] - repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else '' - bptrunc_str = ("clipping-threshold={0}" - " zeroing-threshold={1}" - " zeroing-interval={2}" - " recurrence-interval={3}" - "".format(self.config['clipping-threshold'], - self.config['zeroing-threshold'], - self.config['zeroing-interval'], - abs(delay))) - affine_str = self.config['ng-affine-options'] - pes_str = self.config['ng-per-element-scale-options'] - dropout_proportion = self.config['dropout-proportion'] - dropout_per_frame = 'true' if self.config['dropout-per-frame'] else 'false' + input_dim = self.descriptors["input"]["dim"] + input_descriptor = self.descriptors["input"]["final-string"] + cell_dim = self.config["cell-dim"] + rec_proj_dim = self.config["recurrent-projection-dim"] + nonrec_proj_dim = self.config["non-recurrent-projection-dim"] + delay = self.config["delay"] + repair_nonlin = self.config["self-repair-scale-nonlinearity"] + repair_nonlin_str = ( + "self-repair-scale={0:.10f}".format(repair_nonlin) + if repair_nonlin is not None + else "" + ) + bptrunc_str = ( + "clipping-threshold={0}" + " zeroing-threshold={1}" + " zeroing-interval={2}" + " recurrence-interval={3}" + "".format( + self.config["clipping-threshold"], + self.config["zeroing-threshold"], + self.config["zeroing-interval"], + abs(delay), + ) + ) + affine_str = self.config["ng-affine-options"] + pes_str = self.config["ng-per-element-scale-options"] + dropout_proportion = self.config["dropout-proportion"] + dropout_per_frame = "true" if self.config["dropout-per-frame"] else "false" # Natural gradient per element scale parameters # TODO: decide if we want to keep exposing these options - if re.search('param-mean', pes_str) is None and \ - re.search('param-stddev', pes_str) is None: - pes_str += " param-mean=0.0 param-stddev=1.0 " + if ( + re.search("param-mean", pes_str) is None + and re.search("param-stddev", pes_str) is None + ): + pes_str += " param-mean=0.0 param-stddev=1.0 " # formulation like: # z_t = \sigmoid ( x_t * U^z + s_{t-1} * W^z ) // update gate @@ -529,83 +783,216 @@ def generate_pgru_config(self): # y_t_tmp = h_t * W^y # s_t = renorm ( y_t_tmp (0:rec_proj_dim-1) ) # y_t = batchnorm ( y_t_tmp ) - + configs = [] configs.append("# Update gate control : W_z* matrics") - configs.append("component name={0}.W_z.xs_z type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str)) - + configs.append( + "component name={0}.W_z.xs_z type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format( + name, input_dim + rec_proj_dim, cell_dim, affine_str + ) + ) + configs.append("# Reset gate control : W_r* matrics") - configs.append("component name={0}.W_z.xs_r type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, 
input_dim + rec_proj_dim, rec_proj_dim, affine_str)) + configs.append( + "component name={0}.W_z.xs_r type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format( + name, input_dim + rec_proj_dim, rec_proj_dim, affine_str + ) + ) configs.append("# h related matrix : W_h* matrics") - configs.append("component name={0}.W_h.UW type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim , affine_str)) - + configs.append( + "component name={0}.W_h.UW type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format( + name, input_dim + rec_proj_dim, cell_dim, affine_str + ) + ) + if dropout_proportion != -1.0: - configs.append("component name={0}.dropout_z type=DropoutComponent dim={1} " - "dropout-proportion={2} dropout-per-frame={3}" - .format(name, cell_dim, dropout_proportion, dropout_per_frame)) - configs.append("component name={0}.dropout_r type=DropoutComponent dim={1} " - "dropout-proportion={2} dropout-per-frame={3}" - .format(name, rec_proj_dim, dropout_proportion, dropout_per_frame)) - + configs.append( + "component name={0}.dropout_z type=DropoutComponent dim={1} " + "dropout-proportion={2} dropout-per-frame={3}".format( + name, cell_dim, dropout_proportion, dropout_per_frame + ) + ) + configs.append( + "component name={0}.dropout_r type=DropoutComponent dim={1} " + "dropout-proportion={2} dropout-per-frame={3}".format( + name, rec_proj_dim, dropout_proportion, dropout_per_frame + ) + ) + configs.append("# Defining the non-linearities") - configs.append("component name={0}.z type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) - configs.append("component name={0}.r type=SigmoidComponent dim={1} {2}".format(name, rec_proj_dim, repair_nonlin_str)) - configs.append("component name={0}.h type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append( + "component name={0}.z type=SigmoidComponent dim={1} {2}".format( + name, cell_dim, repair_nonlin_str + ) + ) + configs.append( + "component name={0}.r type=SigmoidComponent dim={1} {2}".format( + name, rec_proj_dim, repair_nonlin_str + ) + ) + configs.append( + "component name={0}.h type=TanhComponent dim={1} {2}".format( + name, cell_dim, repair_nonlin_str + ) + ) configs.append("# Defining the components for other cell computations") - configs.append("component name={0}.h1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * rec_proj_dim, rec_proj_dim)) - configs.append("component name={0}.y1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) - configs.append("component name={0}.y2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) - configs.append("component name={0}.y type=NoOpComponent dim={1}".format(name, cell_dim)) - - recurrent_connection = '{0}.s_t'.format(name) - recurrent_connection_y = '{0}.y_t'.format(name) + configs.append( + "component name={0}.h1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format( + name, 2 * rec_proj_dim, rec_proj_dim + ) + ) + configs.append( + "component name={0}.y1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format( + name, 2 * cell_dim, cell_dim + ) + ) + configs.append( + "component name={0}.y2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format( + name, 2 * cell_dim, cell_dim + ) + ) + configs.append( + "component name={0}.y type=NoOpComponent dim={1}".format(name, cell_dim) + ) + 
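(Editor's note: the `ElementwiseProductComponent`s defined just above are how the xconfig layer realizes the gated interpolation without any dedicated GRU component, and the `DropoutComponent`s earlier in this hunk implement the optional per-frame dropout on the gates. A minimal NumPy sketch of the same arithmetic follows; the variable names are ours, not Kaldi component names, and the dropout helper is a conceptual approximation rather than the exact `DropoutComponent` semantics.)

```python
import numpy as np

def gated_interpolation(h_t, z_t, y_prev):
    # y1_t = h_t * (1 - z_t): the config builds (1 - z_t) as
    # Sum(Scale(-1.0, z_t), Const(1.0, dim)) and multiplies via {name}.y1.
    one_minus_z = -1.0 * z_t + 1.0
    y1_t = h_t * one_minus_z
    # y2_t = y_{t-delay} * z_t, computed by {name}.y2 on the delayed output.
    y2_t = y_prev * z_t
    # {name}.y is a NoOpComponent over Sum(y1_t, y2_t).
    return y1_t + y2_t

def per_frame_dropout(gate, proportion, rng):
    # dropout-per-frame=true: a single draw per frame, so the whole gate
    # vector for that frame is kept or zeroed together (sketch only; no
    # claim about rescaling or test-mode behavior of DropoutComponent).
    return gate * float(rng.random() >= proportion)
```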
+ recurrent_connection = "{0}.s_t".format(name) + recurrent_connection_y = "{0}.y_t".format(name) configs.append("# z_t") - configs.append("component-node name={0}.z_t_pre component={0}.W_z.xs_z input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + configs.append( + "component-node name={0}.z_t_pre component={0}.W_z.xs_z input=Append({1}, IfDefined(Offset({2}, {3})))".format( + name, input_descriptor, recurrent_connection, delay + ) + ) if dropout_proportion != -1.0: - configs.append("component-node name={0}.z_predrop_t component={0}.z input={0}.z_t_pre".format(name)) - configs.append("component-node name={0}.z_t component={0}.dropout_z input={0}.z_predrop_t".format(name)) + configs.append( + "component-node name={0}.z_predrop_t component={0}.z input={0}.z_t_pre".format( + name + ) + ) + configs.append( + "component-node name={0}.z_t component={0}.dropout_z input={0}.z_predrop_t".format( + name + ) + ) else: - configs.append("component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format(name, input_descriptor, recurrent_connection, delay)) + configs.append( + "component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format( + name, input_descriptor, recurrent_connection, delay + ) + ) configs.append("# r_t") - configs.append("component-node name={0}.r_t_pre component={0}.W_z.xs_r input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + configs.append( + "component-node name={0}.r_t_pre component={0}.W_z.xs_r input=Append({1}, IfDefined(Offset({2}, {3})))".format( + name, input_descriptor, recurrent_connection, delay + ) + ) if dropout_proportion != -1.0: - configs.append("component-node name={0}.r_predrop_t component={0}.r input={0}.r_t_pre".format(name)) - configs.append("component-node name={0}.r_t component={0}.dropout_r input={0}.r_predrop_t".format(name)) + configs.append( + "component-node name={0}.r_predrop_t component={0}.r input={0}.r_t_pre".format( + name + ) + ) + configs.append( + "component-node name={0}.r_t component={0}.dropout_r input={0}.r_predrop_t".format( + name + ) + ) else: - configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_pre".format(name)) + configs.append( + "component-node name={0}.r_t component={0}.r input={0}.r_t_pre".format( + name + ) + ) configs.append("# h_t") - configs.append("component-node name={0}.h1_t component={0}.h1 input=Append({0}.r_t, IfDefined(Offset({1}, {2})))".format(name, recurrent_connection, delay)) - configs.append("component-node name={0}.h_t_pre component={0}.W_h.UW input=Append({1}, {0}.h1_t)".format(name, input_descriptor)) - configs.append("component-node name={0}.h_t component={0}.h input={0}.h_t_pre".format(name)) - - configs.append("component-node name={0}.y1_t component={0}.y1 input=Append({0}.h_t, Sum(Scale(-1.0,{0}.z_t), Const(1.0, {1})))".format(name, cell_dim)) - configs.append("component-node name={0}.y2_t component={0}.y2 input=Append(IfDefined(Offset({1}, {2})), {0}.z_t)".format(name, recurrent_connection_y, delay)) - configs.append("component-node name={0}.y_t component={0}.y input=Sum({0}.y1_t, {0}.y2_t)".format(name)) + configs.append( + "component-node name={0}.h1_t component={0}.h1 input=Append({0}.r_t, IfDefined(Offset({1}, {2})))".format( + name, recurrent_connection, delay + ) + ) + configs.append( + "component-node name={0}.h_t_pre component={0}.W_h.UW input=Append({1}, {0}.h1_t)".format( + name, input_descriptor + ) + ) + configs.append( + "component-node 
name={0}.h_t component={0}.h input={0}.h_t_pre".format(name) + ) + + configs.append( + "component-node name={0}.y1_t component={0}.y1 input=Append({0}.h_t, Sum(Scale(-1.0,{0}.z_t), Const(1.0, {1})))".format( + name, cell_dim + ) + ) + configs.append( + "component-node name={0}.y2_t component={0}.y2 input=Append(IfDefined(Offset({1}, {2})), {0}.z_t)".format( + name, recurrent_connection_y, delay + ) + ) + configs.append( + "component-node name={0}.y_t component={0}.y input=Sum({0}.y1_t, {0}.y2_t)".format( + name + ) + ) configs.append("# s_t recurrent") - configs.append("component name={0}.W_s.ys type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str)) - configs.append("component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format(name, rec_proj_dim, bptrunc_str)) - - configs.append("component name={0}.batchnorm type=BatchNormComponent dim={1} target-rms=1.0".format(name, rec_proj_dim + nonrec_proj_dim)) - configs.append("component name={0}.renorm type=NormalizeComponent dim={1} target-rms=1.0".format(name, rec_proj_dim)) + configs.append( + "component name={0}.W_s.ys type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format( + name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str + ) + ) + configs.append( + "component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format( + name, rec_proj_dim, bptrunc_str + ) + ) + + configs.append( + "component name={0}.batchnorm type=BatchNormComponent dim={1} target-rms=1.0".format( + name, rec_proj_dim + nonrec_proj_dim + ) + ) + configs.append( + "component name={0}.renorm type=NormalizeComponent dim={1} target-rms=1.0".format( + name, rec_proj_dim + ) + ) configs.append("# s_t and n_t : sn_t will be the output") - configs.append("component-node name={0}.sn_nobatchnorm_t component={0}.W_s.ys input={0}.y_t".format(name)) - configs.append("dim-range-node name={0}.s_t_preclip input-node={0}.sn_nobatchnorm_t dim-offset=0 dim={1}".format(name, rec_proj_dim)) - configs.append("component-node name={0}.sn_t component={0}.batchnorm input={0}.sn_nobatchnorm_t".format(name)) - - configs.append("component-node name={0}.s_renorm_t component={0}.renorm input={0}.s_t_preclip".format(name)) - configs.append("component-node name={0}.s_t component={0}.s_r input={0}.s_renorm_t".format(name)) + configs.append( + "component-node name={0}.sn_nobatchnorm_t component={0}.W_s.ys input={0}.y_t".format( + name + ) + ) + configs.append( + "dim-range-node name={0}.s_t_preclip input-node={0}.sn_nobatchnorm_t dim-offset=0 dim={1}".format( + name, rec_proj_dim + ) + ) + configs.append( + "component-node name={0}.sn_t component={0}.batchnorm input={0}.sn_nobatchnorm_t".format( + name + ) + ) + + configs.append( + "component-node name={0}.s_renorm_t component={0}.renorm input={0}.s_t_preclip".format( + name + ) + ) + configs.append( + "component-node name={0}.s_t component={0}.s_r input={0}.s_renorm_t".format( + name + ) + ) return configs - # This class is for lines like # 'opgru-layer name=opgru1 input=[-1] delay=-3' # It generates an OPGRU sub-graph with output projections. 
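(Editor's note: taken together, the Norm-PGRU hunk above implements the recurrence stated in its formulation comment. The following self-contained NumPy sketch restates one time step end to end. It is illustrative only: the parameter names are ours, biases are omitted, the delay is treated as one frame, batchnorm is shown as identity since a single frame carries no batch statistics, and the BackpropTruncation and self-repair machinery, which only matters during training, is left out.)

```python
import numpy as np

def sigmoid(v):
    return 1.0 / (1.0 + np.exp(-v))

def norm_pgru_step(x_t, y_prev, s_prev, p):
    """One Norm-PGRU step; p holds the matrices learned by the
    NaturalGradientAffineComponents in the hunk above (names ours)."""
    z_t = sigmoid(x_t @ p["U_z"] + s_prev @ p["W_z"])       # update gate, cell_dim
    r_t = sigmoid(x_t @ p["U_r"] + s_prev @ p["W_r"])       # reset gate, rec_proj_dim
    h1_t = r_t * s_prev                                     # {name}.h1 elementwise product
    h_tilde = np.tanh(x_t @ p["U_h"] + h1_t @ p["W_h"])     # candidate, cell_dim
    y_t = (1.0 - z_t) * h_tilde + z_t * y_prev              # gated interpolation
    sn_raw = y_t @ p["W_y"]                                 # W_s.ys projection
    s_head = sn_raw[: p["rec"]]                             # dim-range-node slice
    s_t = s_head / (np.sqrt(np.mean(s_head ** 2)) + 1e-20)  # renorm, target-rms=1.0
    sn_t = sn_raw                                           # batchnorm placeholder
    return sn_t, y_t, s_t                                   # output, cell state, recurrence

# Toy shapes: input 40, cell 512, recurrent/non-recurrent projections 128 + 128.
rng = np.random.default_rng(0)
d = dict(x=40, cell=512, rec=128, nonrec=128)
p = {
    "U_z": 0.01 * rng.standard_normal((d["x"], d["cell"])),
    "W_z": 0.01 * rng.standard_normal((d["rec"], d["cell"])),
    "U_r": 0.01 * rng.standard_normal((d["x"], d["rec"])),
    "W_r": 0.01 * rng.standard_normal((d["rec"], d["rec"])),
    "U_h": 0.01 * rng.standard_normal((d["x"], d["cell"])),
    "W_h": 0.01 * rng.standard_normal((d["rec"], d["cell"])),
    "W_y": 0.01 * rng.standard_normal((d["cell"], d["rec"] + d["nonrec"])),
    "rec": d["rec"],
}
sn, y, s = norm_pgru_step(
    rng.standard_normal(d["x"]), np.zeros(d["cell"]), np.zeros(d["rec"]), p
)
```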
@@ -629,84 +1016,106 @@ def generate_pgru_config(self): # ng-per-element-scale-options='' [Additional options used for the diagonal matrices in the GRU ] # ng-affine-options='' [Additional options used for the full matrices in the GRU, can be used to do things like set biases to initialize to 1] class XconfigOpgruLayer(XconfigLayerBase): - def __init__(self, first_token, key_to_value, prev_names = None): + def __init__(self, first_token, key_to_value, prev_names=None): assert first_token == "opgru-layer" XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): - self.config = {'input' : '[-1]', - 'cell-dim' : -1, # this is a compulsory argument - 'recurrent-projection-dim' : -1, # defaults to cell-dim / 4 - 'non-recurrent-projection-dim' : -1, # defaults to - # recurrent-projection-dim - 'clipping-threshold' : 30.0, - 'delay' : -1, - 'ng-per-element-scale-options' : ' max-change=0.75 ', - 'ng-affine-options' : ' max-change=0.75 ', - 'self-repair-scale-nonlinearity' : 0.00001, - 'zeroing-interval' : 20, - 'zeroing-threshold' : 15.0 - } + self.config = { + "input": "[-1]", + "cell-dim": -1, # this is a compulsory argument + "recurrent-projection-dim": -1, # defaults to cell-dim / 4 + "non-recurrent-projection-dim": -1, # defaults to + # recurrent-projection-dim + "clipping-threshold": 30.0, + "delay": -1, + "ng-per-element-scale-options": " max-change=0.75 ", + "ng-affine-options": " max-change=0.75 ", + "self-repair-scale-nonlinearity": 0.00001, + "zeroing-interval": 20, + "zeroing-threshold": 15.0, + } def set_derived_configs(self): - if self.config['recurrent-projection-dim'] <= 0: - self.config['recurrent-projection-dim'] = self.config['cell-dim'] / 4 + if self.config["recurrent-projection-dim"] <= 0: + self.config["recurrent-projection-dim"] = self.config["cell-dim"] / 4 - if self.config['non-recurrent-projection-dim'] <= 0: - self.config['non-recurrent-projection-dim'] = \ - self.config['recurrent-projection-dim'] + if self.config["non-recurrent-projection-dim"] <= 0: + self.config["non-recurrent-projection-dim"] = self.config[ + "recurrent-projection-dim" + ] def check_configs(self): - for key in ['cell-dim', 'recurrent-projection-dim', - 'non-recurrent-projection-dim']: + for key in [ + "cell-dim", + "recurrent-projection-dim", + "non-recurrent-projection-dim", + ]: if self.config[key] <= 0: - raise RuntimeError("{0} has invalid value {1}.".format( - key, self.config[key])) + raise RuntimeError( + "{0} has invalid value {1}.".format(key, self.config[key]) + ) - if self.config['delay'] == 0: + if self.config["delay"] == 0: raise RuntimeError("delay cannot be zero") - if (self.config['recurrent-projection-dim'] + - self.config['non-recurrent-projection-dim'] > - self.config['cell-dim']): - raise RuntimeError("recurrent+non-recurrent projection dim exceeds " - "cell dim.") - for key in ['self-repair-scale-nonlinearity']: + if ( + self.config["recurrent-projection-dim"] + + self.config["non-recurrent-projection-dim"] + > self.config["cell-dim"] + ): + raise RuntimeError( + "recurrent+non-recurrent projection dim exceeds " "cell dim." + ) + for key in ["self-repair-scale-nonlinearity"]: if self.config[key] < 0.0 or self.config[key] > 1.0: - raise RuntimeError("{0} has invalid value {2}." 
- .format(self.layer_type, key, - self.config[key])) + raise RuntimeError( + "{0} has invalid value {2}.".format( + self.layer_type, key, self.config[key] + ) + ) def auxiliary_outputs(self): - return ['h_t'] + return ["h_t"] - def output_name(self, auxiliary_output = None): - node_name = 'sn_t' + def output_name(self, auxiliary_output=None): + node_name = "sn_t" if auxiliary_output is not None: if auxiliary_output in self.auxiliary_outputs(): node_name = auxiliary_output else: - raise Exception("In {0} of type {1}, unknown auxiliary output name {1}".format(self.layer_type, auxiliary_output)) + raise Exception( + "In {0} of type {1}, unknown auxiliary output name {1}".format( + self.layer_type, auxiliary_output + ) + ) - return '{0}.{1}'.format(self.name, node_name) + return "{0}.{1}".format(self.name, node_name) - def output_dim(self, auxiliary_output = None): + def output_dim(self, auxiliary_output=None): if auxiliary_output is not None: if auxiliary_output in self.auxiliary_outputs(): - if node_name == 'c_t': - return self.config['cell-dim'] + if node_name == "c_t": + return self.config["cell-dim"] # add code for other auxiliary_outputs here when we decide to expose them else: - raise Exception("In {0} of type {1}, unknown auxiliary output name {1}".format(self.layer_type, auxiliary_output)) + raise Exception( + "In {0} of type {1}, unknown auxiliary output name {1}".format( + self.layer_type, auxiliary_output + ) + ) - return self.config['recurrent-projection-dim'] + self.config['non-recurrent-projection-dim'] + return ( + self.config["recurrent-projection-dim"] + + self.config["non-recurrent-projection-dim"] + ) def get_full_config(self): ans = [] config_lines = self.generate_pgru_config() for line in config_lines: - for config_name in ['ref', 'final']: + for config_name in ["ref", "final"]: # we do not support user specified matrices in LSTM initialization # so 'ref' and 'final' configs are the same. 
ans.append((config_name, line)) @@ -714,34 +1123,43 @@ def get_full_config(self): # convenience function to generate the OPGRU config def generate_pgru_config(self): - # assign some variables to reduce verbosity name = self.name # in the below code we will just call descriptor_strings as descriptors for conciseness - input_dim = self.descriptors['input']['dim'] - input_descriptor = self.descriptors['input']['final-string'] - cell_dim = self.config['cell-dim'] - rec_proj_dim = self.config['recurrent-projection-dim'] - nonrec_proj_dim = self.config['non-recurrent-projection-dim'] - delay = self.config['delay'] - repair_nonlin = self.config['self-repair-scale-nonlinearity'] - repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else '' - bptrunc_str = ("clipping-threshold={0}" - " zeroing-threshold={1}" - " zeroing-interval={2}" - " recurrence-interval={3}" - "".format(self.config['clipping-threshold'], - self.config['zeroing-threshold'], - self.config['zeroing-interval'], - abs(delay))) - affine_str = self.config['ng-affine-options'] - pes_str = self.config['ng-per-element-scale-options'] + input_dim = self.descriptors["input"]["dim"] + input_descriptor = self.descriptors["input"]["final-string"] + cell_dim = self.config["cell-dim"] + rec_proj_dim = self.config["recurrent-projection-dim"] + nonrec_proj_dim = self.config["non-recurrent-projection-dim"] + delay = self.config["delay"] + repair_nonlin = self.config["self-repair-scale-nonlinearity"] + repair_nonlin_str = ( + "self-repair-scale={0:.10f}".format(repair_nonlin) + if repair_nonlin is not None + else "" + ) + bptrunc_str = ( + "clipping-threshold={0}" + " zeroing-threshold={1}" + " zeroing-interval={2}" + " recurrence-interval={3}" + "".format( + self.config["clipping-threshold"], + self.config["zeroing-threshold"], + self.config["zeroing-interval"], + abs(delay), + ) + ) + affine_str = self.config["ng-affine-options"] + pes_str = self.config["ng-per-element-scale-options"] # Natural gradient per element scale parameters # TODO: decide if we want to keep exposing these options - if re.search('param-mean', pes_str) is None and \ - re.search('param-stddev', pes_str) is None: - pes_str += " param-mean=0.0 param-stddev=1.0 " + if ( + re.search("param-mean", pes_str) is None + and re.search("param-stddev", pes_str) is None + ): + pes_str += " param-mean=0.0 param-stddev=1.0 " # formulation for OPGRU like: # z_t = \sigmoid ( x_t * U^z + s_{t-1} * W^z ) // update gate @@ -750,61 +1168,164 @@ def generate_pgru_config(self): # h_t = ( 1 - z_t ) \dot \tilde{h}_t + z_t \dot h_{t-1} # y_t = (y_t \dot o_t) * W^y # s_t = y_t(0:rec_proj_dim-1) - + configs = [] configs.append("# Update gate control : W_z* matrics") - configs.append("component name={0}.W_z.xs_z type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str)) - + configs.append( + "component name={0}.W_z.xs_z type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format( + name, input_dim + rec_proj_dim, cell_dim, affine_str + ) + ) + configs.append("# Output gate control : W_r* matrics") - configs.append("component name={0}.W_z.xs_o type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str)) + configs.append( + "component name={0}.W_z.xs_o type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format( + name, input_dim + rec_proj_dim, cell_dim, affine_str + ) + ) 
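(Editor's note: for contrast with the PGRU above, here is the corresponding one-step sketch of the plain OPGRU that the rest of this hunk wires up. The reset gate is gone; the candidate instead mixes the input with a trainable per-element scale on the delayed output, the `NaturalGradientPerElementScaleComponent` defined below, and an output gate o_t masks the cell output before the W^y projection. Same caveats as before: our names, biases omitted, one-frame delay, truncation omitted.)

```python
import numpy as np

def sigmoid(v):
    return 1.0 / (1.0 + np.exp(-v))

def opgru_step(x_t, y_prev, s_prev, p):
    z_t = sigmoid(x_t @ p["U_z"] + s_prev @ p["W_z"])      # update gate
    o_t = sigmoid(x_t @ p["U_o"] + s_prev @ p["W_o"])      # output gate (replaces r_t)
    h_tilde = np.tanh(x_t @ p["U_h"] + p["w_h"] * y_prev)  # per-element recurrence w_h
    y_t = (1.0 - z_t) * h_tilde + z_t * y_prev             # cell state, cell_dim
    sn_t = (y_t * o_t) @ p["W_y"]                          # output-gated W^y projection
    s_t = sn_t[: p["rec"]]                                 # recurrent slice, before truncation
    return sn_t, y_t, s_t
```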
configs.append("# h related matrix : W_h* matrics") - configs.append("component name={0}.W_h.UW type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim , cell_dim , affine_str)) - configs.append("component name={0}.W_h.UW_elementwise type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim , pes_str)) - + configs.append( + "component name={0}.W_h.UW type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format( + name, input_dim, cell_dim, affine_str + ) + ) + configs.append( + "component name={0}.W_h.UW_elementwise type=NaturalGradientPerElementScaleComponent dim={1} {2}".format( + name, cell_dim, pes_str + ) + ) + configs.append("# Defining the non-linearities") - configs.append("component name={0}.z type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) - configs.append("component name={0}.o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) - configs.append("component name={0}.h type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append( + "component name={0}.z type=SigmoidComponent dim={1} {2}".format( + name, cell_dim, repair_nonlin_str + ) + ) + configs.append( + "component name={0}.o type=SigmoidComponent dim={1} {2}".format( + name, cell_dim, repair_nonlin_str + ) + ) + configs.append( + "component name={0}.h type=TanhComponent dim={1} {2}".format( + name, cell_dim, repair_nonlin_str + ) + ) configs.append("# Defining the components for other cell computations") - configs.append("component name={0}.o1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) - configs.append("component name={0}.y1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) - configs.append("component name={0}.y2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) - configs.append("component name={0}.y type=NoOpComponent dim={1}".format(name, cell_dim)) - - recurrent_connection = '{0}.s_t'.format(name) - recurrent_connection_y = '{0}.y_t'.format(name) + configs.append( + "component name={0}.o1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format( + name, 2 * cell_dim, cell_dim + ) + ) + configs.append( + "component name={0}.y1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format( + name, 2 * cell_dim, cell_dim + ) + ) + configs.append( + "component name={0}.y2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format( + name, 2 * cell_dim, cell_dim + ) + ) + configs.append( + "component name={0}.y type=NoOpComponent dim={1}".format(name, cell_dim) + ) + + recurrent_connection = "{0}.s_t".format(name) + recurrent_connection_y = "{0}.y_t".format(name) configs.append("# z_t") - configs.append("component-node name={0}.z_t_pre component={0}.W_z.xs_z input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) - configs.append("component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format(name)) + configs.append( + "component-node name={0}.z_t_pre component={0}.W_z.xs_z input=Append({1}, IfDefined(Offset({2}, {3})))".format( + name, input_descriptor, recurrent_connection, delay + ) + ) + configs.append( + "component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format(name) + ) configs.append("# o_t") - configs.append("component-node name={0}.o_t_pre component={0}.W_z.xs_o input=Append({1}, IfDefined(Offset({2}, 
{3})))".format(name, input_descriptor, recurrent_connection, delay)) - configs.append("component-node name={0}.o_t component={0}.o input={0}.o_t_pre".format(name)) - - configs.append("# h_t") - configs.append("component-node name={0}.h_t_pre component={0}.W_h.UW input={1}".format(name, input_descriptor)) - configs.append("component-node name={0}.h_t_pre2 component={0}.W_h.UW_elementwise input=IfDefined(Offset({1}, {2}))".format(name, recurrent_connection_y, delay)) - configs.append("component-node name={0}.h_t component={0}.h input=Sum({0}.h_t_pre, {0}.h_t_pre2)".format(name)) + configs.append( + "component-node name={0}.o_t_pre component={0}.W_z.xs_o input=Append({1}, IfDefined(Offset({2}, {3})))".format( + name, input_descriptor, recurrent_connection, delay + ) + ) + configs.append( + "component-node name={0}.o_t component={0}.o input={0}.o_t_pre".format(name) + ) - configs.append("component-node name={0}.y1_t component={0}.y1 input=Append({0}.h_t, Sum(Scale(-1.0,{0}.z_t), Const(1.0, {1})))".format(name, cell_dim)) - configs.append("component-node name={0}.y2_t component={0}.y2 input=Append(IfDefined(Offset({1}, {2})), {0}.z_t)".format(name, recurrent_connection_y, delay)) - configs.append("component-node name={0}.y_t component={0}.y input=Sum({0}.y1_t, {0}.y2_t)".format(name)) - configs.append("component-node name={0}.y_o_t component={0}.o1 input=Append({0}.o_t, {0}.y_t)".format(name)) + configs.append("# h_t") + configs.append( + "component-node name={0}.h_t_pre component={0}.W_h.UW input={1}".format( + name, input_descriptor + ) + ) + configs.append( + "component-node name={0}.h_t_pre2 component={0}.W_h.UW_elementwise input=IfDefined(Offset({1}, {2}))".format( + name, recurrent_connection_y, delay + ) + ) + configs.append( + "component-node name={0}.h_t component={0}.h input=Sum({0}.h_t_pre, {0}.h_t_pre2)".format( + name + ) + ) + + configs.append( + "component-node name={0}.y1_t component={0}.y1 input=Append({0}.h_t, Sum(Scale(-1.0,{0}.z_t), Const(1.0, {1})))".format( + name, cell_dim + ) + ) + configs.append( + "component-node name={0}.y2_t component={0}.y2 input=Append(IfDefined(Offset({1}, {2})), {0}.z_t)".format( + name, recurrent_connection_y, delay + ) + ) + configs.append( + "component-node name={0}.y_t component={0}.y input=Sum({0}.y1_t, {0}.y2_t)".format( + name + ) + ) + configs.append( + "component-node name={0}.y_o_t component={0}.o1 input=Append({0}.o_t, {0}.y_t)".format( + name + ) + ) configs.append("# s_t recurrent") - configs.append("component name={0}.W_s.ys type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str)) - configs.append("component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format(name, rec_proj_dim, bptrunc_str)) + configs.append( + "component name={0}.W_s.ys type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format( + name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str + ) + ) + configs.append( + "component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format( + name, rec_proj_dim, bptrunc_str + ) + ) configs.append("# s_t and n_t : sn_t will be the output") - configs.append("component-node name={0}.sn_t component={0}.W_s.ys input={0}.y_o_t".format(name)) - configs.append("dim-range-node name={0}.s_t_preclip input-node={0}.sn_t dim-offset=0 dim={1}".format(name, rec_proj_dim)) - configs.append("component-node name={0}.s_t component={0}.s_r input={0}.s_t_preclip".format(name)) + configs.append( + "component-node 
name={0}.sn_t component={0}.W_s.ys input={0}.y_o_t".format( + name + ) + ) + configs.append( + "dim-range-node name={0}.s_t_preclip input-node={0}.sn_t dim-offset=0 dim={1}".format( + name, rec_proj_dim + ) + ) + configs.append( + "component-node name={0}.s_t component={0}.s_r input={0}.s_t_preclip".format( + name + ) + ) return configs + # This class is for lines like # 'norm-opgru-layer name=norm-opgru1 input=[-1] delay=-3' # It generates a norm-OPGRU sub-graph with output projections. @@ -812,6 +1333,7 @@ def generate_pgru_config(self): # Different from the vanilla OPGRU, the NormOPGRU uses batchnorm in the forward direction # and renorm in the recurrence. + # The output dimension of the layer may be specified via 'cell-dim=xxx', but if not specified, # the dimension defaults to the same as the input. # See other configuration values below. @@ -832,92 +1354,118 @@ def generate_pgru_config(self): # ng-per-element-scale-options='' [Additional options used for the diagonal matrices in the GRU ] # ng-affine-options='' [Additional options used for the full matrices in the GRU, can be used to do things like set biases to initialize to 1] class XconfigNormOpgruLayer(XconfigLayerBase): - def __init__(self, first_token, key_to_value, prev_names = None): + def __init__(self, first_token, key_to_value, prev_names=None): assert first_token == "norm-opgru-layer" XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): - self.config = {'input' : '[-1]', - 'cell-dim' : -1, # this is a compulsory argument - 'recurrent-projection-dim' : -1, # defaults to cell-dim / 4 - 'non-recurrent-projection-dim' : -1, # defaults to - # recurrent-projection-dim - 'clipping-threshold' : 30.0, - 'delay' : -1, - 'ng-per-element-scale-options' : ' max-change=0.75 ', - 'ng-affine-options' : ' max-change=0.75 ', - 'self-repair-scale-nonlinearity' : 0.00001, - 'zeroing-interval' : 20, - 'zeroing-threshold' : 15.0, - 'dropout-proportion' : -1.0, # If -1.0, no dropout components will be added - 'l2-regularize': 0.0, - 'dropout-per-frame' : True # If false, regular dropout, not per frame. - } + self.config = { + "input": "[-1]", + "cell-dim": -1, # this is a compulsory argument + "recurrent-projection-dim": -1, # defaults to cell-dim / 4 + "non-recurrent-projection-dim": -1, # defaults to + # recurrent-projection-dim + "clipping-threshold": 30.0, + "delay": -1, + "ng-per-element-scale-options": " max-change=0.75 ", + "ng-affine-options": " max-change=0.75 ", + "self-repair-scale-nonlinearity": 0.00001, + "zeroing-interval": 20, + "zeroing-threshold": 15.0, + "dropout-proportion": -1.0, # If -1.0, no dropout components will be added + "l2-regularize": 0.0, + "dropout-per-frame": True, # If false, regular dropout, not per frame. 
+ } def set_derived_configs(self): - if self.config['recurrent-projection-dim'] <= 0: - self.config['recurrent-projection-dim'] = self.config['cell-dim'] / 4 + if self.config["recurrent-projection-dim"] <= 0: + self.config["recurrent-projection-dim"] = self.config["cell-dim"] / 4 - if self.config['non-recurrent-projection-dim'] <= 0: - self.config['non-recurrent-projection-dim'] = \ - self.config['recurrent-projection-dim'] + if self.config["non-recurrent-projection-dim"] <= 0: + self.config["non-recurrent-projection-dim"] = self.config[ + "recurrent-projection-dim" + ] def check_configs(self): - for key in ['cell-dim', 'recurrent-projection-dim', - 'non-recurrent-projection-dim']: + for key in [ + "cell-dim", + "recurrent-projection-dim", + "non-recurrent-projection-dim", + ]: if self.config[key] <= 0: - raise RuntimeError("{0} has invalid value {1}.".format( - key, self.config[key])) + raise RuntimeError( + "{0} has invalid value {1}.".format(key, self.config[key]) + ) - if self.config['delay'] == 0: + if self.config["delay"] == 0: raise RuntimeError("delay cannot be zero") - if (self.config['recurrent-projection-dim'] + - self.config['non-recurrent-projection-dim'] > - self.config['cell-dim']): - raise RuntimeError("recurrent+non-recurrent projection dim exceeds " - "cell dim.") - for key in ['self-repair-scale-nonlinearity']: + if ( + self.config["recurrent-projection-dim"] + + self.config["non-recurrent-projection-dim"] + > self.config["cell-dim"] + ): + raise RuntimeError( + "recurrent+non-recurrent projection dim exceeds " "cell dim." + ) + for key in ["self-repair-scale-nonlinearity"]: if self.config[key] < 0.0 or self.config[key] > 1.0: - raise RuntimeError("{0} has invalid value {2}." - .format(self.layer_type, key, - self.config[key])) - if ((self.config['dropout-proportion'] > 1.0 or - self.config['dropout-proportion'] < 0.0) and - self.config['dropout-proportion'] != -1.0 ): - raise RuntimeError("dropout-proportion has invalid value {0}." 
-                               .format(self.config['dropout-proportion']))
+                raise RuntimeError(
+                    "{0}: {1} has invalid value {2}.".format(
+                        self.layer_type, key, self.config[key]
+                    )
+                )
+        if (
+            self.config["dropout-proportion"] > 1.0
+            or self.config["dropout-proportion"] < 0.0
+        ) and self.config["dropout-proportion"] != -1.0:
+            raise RuntimeError(
+                "dropout-proportion has invalid value {0}.".format(
+                    self.config["dropout-proportion"]
+                )
+            )

     def auxiliary_outputs(self):
-        return ['h_t']
+        return ["h_t"]

-    def output_name(self, auxiliary_output = None):
-        node_name = 'sn_t'
+    def output_name(self, auxiliary_output=None):
+        node_name = "sn_t"
         if auxiliary_output is not None:
             if auxiliary_output in self.auxiliary_outputs():
                 node_name = auxiliary_output
             else:
-                raise Exception("In {0} of type {1}, unknown auxiliary output name {1}".format(self.layer_type, auxiliary_output))
+                raise Exception(
+                    "In {0} of type {1}, unknown auxiliary output name {2}".format(
+                        self.name, self.layer_type, auxiliary_output
+                    )
+                )

-        return '{0}.{1}'.format(self.name, node_name)
+        return "{0}.{1}".format(self.name, node_name)

-    def output_dim(self, auxiliary_output = None):
+    def output_dim(self, auxiliary_output=None):
         if auxiliary_output is not None:
             if auxiliary_output in self.auxiliary_outputs():
-                if node_name == 'c_t':
-                    return self.config['cell-dim']
+                if auxiliary_output == "h_t":
+                    return self.config["cell-dim"]
                 # add code for other auxiliary_outputs here when we decide to expose them
             else:
-                raise Exception("In {0} of type {1}, unknown auxiliary output name {1}".format(self.layer_type, auxiliary_output))
+                raise Exception(
+                    "In {0} of type {1}, unknown auxiliary output name {2}".format(
+                        self.name, self.layer_type, auxiliary_output
+                    )
+                )

-        return self.config['recurrent-projection-dim'] + self.config['non-recurrent-projection-dim']
+        return (
+            self.config["recurrent-projection-dim"]
+            + self.config["non-recurrent-projection-dim"]
+        )

     def get_full_config(self):
         ans = []
         config_lines = self.generate_pgru_config()
         for line in config_lines:
-            for config_name in ['ref', 'final']:
+            for config_name in ["ref", "final"]:
                 # we do not support user specified matrices in LSTM initialization
                 # so 'ref' and 'final' configs are the same.
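                # ('ref' is typically used only to work out dims and context;
                # 'final' is the config the network is actually initialized
                # from, so emitting identical lines for both is safe here.)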
ans.append((config_name, line)) @@ -925,40 +1473,50 @@ def get_full_config(self): # convenience function to generate the Norm-OPGRU config def generate_pgru_config(self): - # assign some variables to reduce verbosity name = self.name # in the below code we will just call descriptor_strings as descriptors for conciseness - input_dim = self.descriptors['input']['dim'] - input_descriptor = self.descriptors['input']['final-string'] - cell_dim = self.config['cell-dim'] - rec_proj_dim = self.config['recurrent-projection-dim'] - nonrec_proj_dim = self.config['non-recurrent-projection-dim'] - delay = self.config['delay'] - repair_nonlin = self.config['self-repair-scale-nonlinearity'] - repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else '' - bptrunc_str = ("clipping-threshold={0}" - " zeroing-threshold={1}" - " zeroing-interval={2}" - " recurrence-interval={3}" - "".format(self.config['clipping-threshold'], - self.config['zeroing-threshold'], - self.config['zeroing-interval'], - abs(delay))) - affine_str = self.config['ng-affine-options'] - pes_str = self.config['ng-per-element-scale-options'] - dropout_proportion = self.config['dropout-proportion'] - dropout_per_frame = 'true' if self.config['dropout-per-frame'] else 'false' - - l2_regularize = self.config['l2-regularize'] - l2_regularize_option = ('l2-regularize={0} '.format(l2_regularize) - if l2_regularize != 0.0 else '') + input_dim = self.descriptors["input"]["dim"] + input_descriptor = self.descriptors["input"]["final-string"] + cell_dim = self.config["cell-dim"] + rec_proj_dim = self.config["recurrent-projection-dim"] + nonrec_proj_dim = self.config["non-recurrent-projection-dim"] + delay = self.config["delay"] + repair_nonlin = self.config["self-repair-scale-nonlinearity"] + repair_nonlin_str = ( + "self-repair-scale={0:.10f}".format(repair_nonlin) + if repair_nonlin is not None + else "" + ) + bptrunc_str = ( + "clipping-threshold={0}" + " zeroing-threshold={1}" + " zeroing-interval={2}" + " recurrence-interval={3}" + "".format( + self.config["clipping-threshold"], + self.config["zeroing-threshold"], + self.config["zeroing-interval"], + abs(delay), + ) + ) + affine_str = self.config["ng-affine-options"] + pes_str = self.config["ng-per-element-scale-options"] + dropout_proportion = self.config["dropout-proportion"] + dropout_per_frame = "true" if self.config["dropout-per-frame"] else "false" + + l2_regularize = self.config["l2-regularize"] + l2_regularize_option = ( + "l2-regularize={0} ".format(l2_regularize) if l2_regularize != 0.0 else "" + ) # Natural gradient per element scale parameters # TODO: decide if we want to keep exposing these options - if re.search('param-mean', pes_str) is None and \ - re.search('param-stddev', pes_str) is None: - pes_str += " param-mean=0.0 param-stddev=1.0 " + if ( + re.search("param-mean", pes_str) is None + and re.search("param-stddev", pes_str) is None + ): + pes_str += " param-mean=0.0 param-stddev=1.0 " # formulation for OPGRU like: # z_t = \sigmoid ( x_t * U^z + s_{t-1} * W^z ) // update gate @@ -968,79 +1526,233 @@ def generate_pgru_config(self): # y_t_tmp = ( h_t \dot o_t) * W^y # s_t = renorm ( y_t_tmp(0:rec_proj_dim-1) ) # y_t = batchnorm ( y_t_tmp ) - + configs = [] configs.append("# Update gate control : W_z* matrics") - configs.append("component name={0}.W_z.xs_z type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str, l2_regularize_option)) - + configs.append( + 
"component name={0}.W_z.xs_z type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format( + name, + input_dim + rec_proj_dim, + cell_dim, + affine_str, + l2_regularize_option, + ) + ) + configs.append("# Output gate control : W_r* matrics") - configs.append("component name={0}.W_z.xs_o type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str, l2_regularize_option)) + configs.append( + "component name={0}.W_z.xs_o type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format( + name, + input_dim + rec_proj_dim, + cell_dim, + affine_str, + l2_regularize_option, + ) + ) configs.append("# h related matrix : W_h* matrics") - configs.append("component name={0}.W_h.UW type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format(name, input_dim , cell_dim , affine_str, l2_regularize_option)) - configs.append("component name={0}.W_h.UW_elementwise type=NaturalGradientPerElementScaleComponent dim={1} {2}".format(name, cell_dim , pes_str)) - + configs.append( + "component name={0}.W_h.UW type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format( + name, input_dim, cell_dim, affine_str, l2_regularize_option + ) + ) + configs.append( + "component name={0}.W_h.UW_elementwise type=NaturalGradientPerElementScaleComponent dim={1} {2}".format( + name, cell_dim, pes_str + ) + ) + configs.append("# Defining the non-linearities") - configs.append("component name={0}.z type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) - configs.append("component name={0}.o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) - configs.append("component name={0}.h type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append( + "component name={0}.z type=SigmoidComponent dim={1} {2}".format( + name, cell_dim, repair_nonlin_str + ) + ) + configs.append( + "component name={0}.o type=SigmoidComponent dim={1} {2}".format( + name, cell_dim, repair_nonlin_str + ) + ) + configs.append( + "component name={0}.h type=TanhComponent dim={1} {2}".format( + name, cell_dim, repair_nonlin_str + ) + ) configs.append("# Defining the components for other cell computations") - configs.append("component name={0}.o1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) - configs.append("component name={0}.y1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) - configs.append("component name={0}.y2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) - configs.append("component name={0}.y type=NoOpComponent dim={1}".format(name, cell_dim)) + configs.append( + "component name={0}.o1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format( + name, 2 * cell_dim, cell_dim + ) + ) + configs.append( + "component name={0}.y1 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format( + name, 2 * cell_dim, cell_dim + ) + ) + configs.append( + "component name={0}.y2 type=ElementwiseProductComponent input-dim={1} output-dim={2}".format( + name, 2 * cell_dim, cell_dim + ) + ) + configs.append( + "component name={0}.y type=NoOpComponent dim={1}".format(name, cell_dim) + ) if dropout_proportion != -1.0: - configs.append("component name={0}.dropout type=DropoutComponent dim={1} " - "dropout-proportion={2} dropout-per-frame={3}" - .format(name, cell_dim, 
dropout_proportion, dropout_per_frame)) + configs.append( + "component name={0}.dropout type=DropoutComponent dim={1} " + "dropout-proportion={2} dropout-per-frame={3}".format( + name, cell_dim, dropout_proportion, dropout_per_frame + ) + ) - recurrent_connection = '{0}.s_t'.format(name) - recurrent_connection_y = '{0}.y_t'.format(name) + recurrent_connection = "{0}.s_t".format(name) + recurrent_connection_y = "{0}.y_t".format(name) configs.append("# z_t") - configs.append("component-node name={0}.z_t_pre component={0}.W_z.xs_z input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + configs.append( + "component-node name={0}.z_t_pre component={0}.W_z.xs_z input=Append({1}, IfDefined(Offset({2}, {3})))".format( + name, input_descriptor, recurrent_connection, delay + ) + ) if dropout_proportion != -1.0: - configs.append("component-node name={0}.z_predrop_t component={0}.z input={0}.z_t_pre".format(name)) - configs.append("component-node name={0}.z_t component={0}.dropout input={0}.z_predrop_t".format(name)) + configs.append( + "component-node name={0}.z_predrop_t component={0}.z input={0}.z_t_pre".format( + name + ) + ) + configs.append( + "component-node name={0}.z_t component={0}.dropout input={0}.z_predrop_t".format( + name + ) + ) else: - configs.append("component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format(name)) + configs.append( + "component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format( + name + ) + ) configs.append("# o_t") - configs.append("component-node name={0}.o_t_pre component={0}.W_z.xs_o input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + configs.append( + "component-node name={0}.o_t_pre component={0}.W_z.xs_o input=Append({1}, IfDefined(Offset({2}, {3})))".format( + name, input_descriptor, recurrent_connection, delay + ) + ) if dropout_proportion != -1.0: - configs.append("component-node name={0}.o_predrop_t component={0}.o input={0}.o_t_pre".format(name)) - configs.append("component-node name={0}.o_t component={0}.dropout input={0}.o_predrop_t".format(name)) + configs.append( + "component-node name={0}.o_predrop_t component={0}.o input={0}.o_t_pre".format( + name + ) + ) + configs.append( + "component-node name={0}.o_t component={0}.dropout input={0}.o_predrop_t".format( + name + ) + ) else: - configs.append("component-node name={0}.o_t component={0}.o input={0}.o_t_pre".format(name)) - + configs.append( + "component-node name={0}.o_t component={0}.o input={0}.o_t_pre".format( + name + ) + ) + configs.append("# h_t") - configs.append("component-node name={0}.h_t_pre component={0}.W_h.UW input={1}".format(name, input_descriptor)) - configs.append("component-node name={0}.h_t_pre2 component={0}.W_h.UW_elementwise input=IfDefined(Offset({1}, {2}))".format(name, recurrent_connection_y, delay)) - configs.append("component-node name={0}.h_t component={0}.h input=Sum({0}.h_t_pre, {0}.h_t_pre2)".format(name)) + configs.append( + "component-node name={0}.h_t_pre component={0}.W_h.UW input={1}".format( + name, input_descriptor + ) + ) + configs.append( + "component-node name={0}.h_t_pre2 component={0}.W_h.UW_elementwise input=IfDefined(Offset({1}, {2}))".format( + name, recurrent_connection_y, delay + ) + ) + configs.append( + "component-node name={0}.h_t component={0}.h input=Sum({0}.h_t_pre, {0}.h_t_pre2)".format( + name + ) + ) configs.append("# The following two lines are to implement (1 - z_t)") - configs.append("component-node 
name={0}.y1_t component={0}.y1 input=Append({0}.h_t, Sum(Scale(-1.0,{0}.z_t), Const(1.0, {1})))".format(name, cell_dim)) - configs.append("component-node name={0}.y2_t component={0}.y2 input=Append(IfDefined(Offset({1}, {2})), {0}.z_t)".format(name, recurrent_connection_y, delay)) - configs.append("component-node name={0}.y_t component={0}.y input=Sum({0}.y1_t, {0}.y2_t)".format(name)) - configs.append("component-node name={0}.y_o_t component={0}.o1 input=Append({0}.o_t, {0}.y_t)".format(name)) + configs.append( + "component-node name={0}.y1_t component={0}.y1 input=Append({0}.h_t, Sum(Scale(-1.0,{0}.z_t), Const(1.0, {1})))".format( + name, cell_dim + ) + ) + configs.append( + "component-node name={0}.y2_t component={0}.y2 input=Append(IfDefined(Offset({1}, {2})), {0}.z_t)".format( + name, recurrent_connection_y, delay + ) + ) + configs.append( + "component-node name={0}.y_t component={0}.y input=Sum({0}.y1_t, {0}.y2_t)".format( + name + ) + ) + configs.append( + "component-node name={0}.y_o_t component={0}.o1 input=Append({0}.o_t, {0}.y_t)".format( + name + ) + ) configs.append("# s_t recurrent") - configs.append("component name={0}.W_s.ys type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format(name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str, l2_regularize_option)) - configs.append("component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format(name, rec_proj_dim, bptrunc_str)) - configs.append("component name={0}.batchnorm type=BatchNormComponent dim={1} target-rms=1.0".format(name, rec_proj_dim + nonrec_proj_dim)) - configs.append("component name={0}.renorm type=NormalizeComponent dim={1} target-rms=1.0".format(name, rec_proj_dim)) + configs.append( + "component name={0}.W_s.ys type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3} {4}".format( + name, + cell_dim, + rec_proj_dim + nonrec_proj_dim, + affine_str, + l2_regularize_option, + ) + ) + configs.append( + "component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format( + name, rec_proj_dim, bptrunc_str + ) + ) + configs.append( + "component name={0}.batchnorm type=BatchNormComponent dim={1} target-rms=1.0".format( + name, rec_proj_dim + nonrec_proj_dim + ) + ) + configs.append( + "component name={0}.renorm type=NormalizeComponent dim={1} target-rms=1.0".format( + name, rec_proj_dim + ) + ) configs.append("# s_t and n_t : sn_t will be the output") - configs.append("component-node name={0}.sn_nobatchnorm_t component={0}.W_s.ys input={0}.y_o_t".format(name)) - configs.append("component-node name={0}.sn_t component={0}.batchnorm input={0}.sn_nobatchnorm_t".format(name)) - configs.append("dim-range-node name={0}.s_t_preclip input-node={0}.sn_nobatchnorm_t dim-offset=0 dim={1}".format(name, rec_proj_dim)) - configs.append("component-node name={0}.s_t_preclip_renorm component={0}.renorm input={0}.s_t_preclip".format(name)) - configs.append("component-node name={0}.s_t component={0}.s_r input={0}.s_t_preclip_renorm".format(name)) + configs.append( + "component-node name={0}.sn_nobatchnorm_t component={0}.W_s.ys input={0}.y_o_t".format( + name + ) + ) + configs.append( + "component-node name={0}.sn_t component={0}.batchnorm input={0}.sn_nobatchnorm_t".format( + name + ) + ) + configs.append( + "dim-range-node name={0}.s_t_preclip input-node={0}.sn_nobatchnorm_t dim-offset=0 dim={1}".format( + name, rec_proj_dim + ) + ) + configs.append( + "component-node name={0}.s_t_preclip_renorm component={0}.renorm input={0}.s_t_preclip".format( + name + ) + ) + 
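+        # (the BackpropTruncationComponent application below produces s_t, the
+        # clipped copy of the renormed recurrent slice; this is the value the
+        # gate inputs above read back through IfDefined(Offset(..., delay)))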
configs.append(
+            "component-node name={0}.s_t component={0}.s_r input={0}.s_t_preclip_renorm".format(
+                name
+            )
+        )
         return configs

+
 # This class is for lines like
 # 'fast-gru-layer name=gru1 input=[-1] delay=-3'
 # It generates an GRU sub-graph without output projections.
@@ -1063,57 +1775,62 @@ def generate_pgru_config(self):
 # gru-nonlinearity-options=' max-change=0.75' [options for GruNonlinearityComponent, see below for detail]
 # ng-affine-options='' [Additional options used for the full matrices in the GRU/LSTM, can be used to do things like set biases to initialize to 1]
 class XconfigFastGruLayer(XconfigLayerBase):
-    def __init__(self, first_token, key_to_value, prev_names = None):
+    def __init__(self, first_token, key_to_value, prev_names=None):
         assert first_token == "fast-gru-layer"
         XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

     def set_default_configs(self):
-        self.config = {'input':'[-1]',
-                       'cell-dim' : -1, # this is a compulsory argument
-                       'clipping-threshold' : 30.0,
-                       'delay' : -1,
-                       'ng-per-element-scale-options' : ' max-change=0.75',
-                       'ng-affine-options' : ' max-change=0.75 ',
-                       'self-repair-scale-nonlinearity' : 0.00001,
-                       'zeroing-interval' : 20,
-                       'zeroing-threshold' : 15.0,
-                       # if you want to set 'self-repair-scale', ' self-repair-threshold'
-                       # or 'param-stddev' for GruNonlinearityComponent
-                       # For default, they are 1.0e-05, 0.2 and 1.0 / sqrt(d) where d is cell-dim.
-                       # you can add somethig like 'self-repair-scale=xxx' to gru-nonlinearity-options.
-                       # you can also see src/nnet3/nnet-combined-component.h for detail
-                       'gru-nonlinearity-options' : ' max-change=0.75'
-                       }
+        self.config = {
+            "input": "[-1]",
+            "cell-dim": -1,  # this is a compulsory argument
+            "clipping-threshold": 30.0,
+            "delay": -1,
+            "ng-per-element-scale-options": " max-change=0.75",
+            "ng-affine-options": " max-change=0.75 ",
+            "self-repair-scale-nonlinearity": 0.00001,
+            "zeroing-interval": 20,
+            "zeroing-threshold": 15.0,
+            # if you want to set 'self-repair-scale', 'self-repair-threshold'
+            # or 'param-stddev' for GruNonlinearityComponent:
+            # by default, they are 1.0e-05, 0.2 and 1.0 / sqrt(d) where d is cell-dim.
+            # you can add something like 'self-repair-scale=xxx' to gru-nonlinearity-options.
+ # you can also see src/nnet3/nnet-combined-component.h for detail + "gru-nonlinearity-options": " max-change=0.75", + } def set_derived_configs(self): - if self.config['cell-dim'] <= 0: - self.config['cell-dim'] = self.descriptors['input']['dim'] + if self.config["cell-dim"] <= 0: + self.config["cell-dim"] = self.descriptors["input"]["dim"] def check_configs(self): - key = 'cell-dim' - if self.config['cell-dim'] <= 0: - raise RuntimeError("cell-dim has invalid value {0}.".format(self.config[key])) + key = "cell-dim" + if self.config["cell-dim"] <= 0: + raise RuntimeError( + "cell-dim has invalid value {0}.".format(self.config[key]) + ) - if self.config['delay'] == 0: + if self.config["delay"] == 0: raise RuntimeError("delay cannot be zero") - for key in ['self-repair-scale-nonlinearity']: + for key in ["self-repair-scale-nonlinearity"]: if self.config[key] < 0.0 or self.config[key] > 1.0: - raise RuntimeError("{0} has invalid value {1}.".format(key, self.config[key])) + raise RuntimeError( + "{0} has invalid value {1}.".format(key, self.config[key]) + ) - def output_name(self, auxiliary_output = None): - node_name = 'y_t' - return '{0}.{1}'.format(self.name, node_name) + def output_name(self, auxiliary_output=None): + node_name = "y_t" + return "{0}.{1}".format(self.name, node_name) - def output_dim(self, auxiliary_output = None): - return self.config['cell-dim'] + def output_dim(self, auxiliary_output=None): + return self.config["cell-dim"] def get_full_config(self): ans = [] config_lines = self.generate_gru_config() for line in config_lines: - for config_name in ['ref', 'final']: + for config_name in ["ref", "final"]: # we do not support user specified matrices in LSTM initialization # so 'ref' and 'final' configs are the same. ans.append((config_name, line)) @@ -1121,28 +1838,36 @@ def get_full_config(self): # convenience function to generate the GRU config def generate_gru_config(self): - # assign some variables to reduce verbosity name = self.name # in the below code we will just call descriptor_strings as descriptors for conciseness - input_dim = self.descriptors['input']['dim'] - input_descriptor = self.descriptors['input']['final-string'] - cell_dim = self.config['cell-dim'] - delay = self.config['delay'] - bptrunc_str = ("clipping-threshold={0}" - " zeroing-threshold={1}" - " zeroing-interval={2}" - " recurrence-interval={3}" - "".format(self.config['clipping-threshold'], - self.config['zeroing-threshold'], - self.config['zeroing-interval'], abs(delay))) - repair_nonlin = self.config['self-repair-scale-nonlinearity'] - repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else '' - affine_str = self.config['ng-affine-options'] + input_dim = self.descriptors["input"]["dim"] + input_descriptor = self.descriptors["input"]["final-string"] + cell_dim = self.config["cell-dim"] + delay = self.config["delay"] + bptrunc_str = ( + "clipping-threshold={0}" + " zeroing-threshold={1}" + " zeroing-interval={2}" + " recurrence-interval={3}" + "".format( + self.config["clipping-threshold"], + self.config["zeroing-threshold"], + self.config["zeroing-interval"], + abs(delay), + ) + ) + repair_nonlin = self.config["self-repair-scale-nonlinearity"] + repair_nonlin_str = ( + "self-repair-scale={0:.10f}".format(repair_nonlin) + if repair_nonlin is not None + else "" + ) + affine_str = self.config["ng-affine-options"] # string for GruNonlinearityComponent - gru_nonlin_str = self.config['gru-nonlinearity-options'] - + gru_nonlin_str = 
self.config["gru-nonlinearity-options"] + # formulation like: # z_t = \sigmoid ( U^z x_t + W^z y_{t-1} ) # update gate # r_t = \sigmoid ( U^r x_t + W^r y_{t-1} ) # reset gate @@ -1161,39 +1886,97 @@ def generate_gru_config(self): configs.append("### Begin Gru layer '{0}'".format(name)) configs.append("# Update gate control : W_z* matrices") - configs.append("component name={0}.W_z.xh type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + cell_dim, cell_dim, affine_str)) + configs.append( + "component name={0}.W_z.xh type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format( + name, input_dim + cell_dim, cell_dim, affine_str + ) + ) configs.append("# Reset gate control : W_r* matrices") - configs.append("component name={0}.W_r.xh type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + cell_dim, cell_dim, affine_str)) + configs.append( + "component name={0}.W_r.xh type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format( + name, input_dim + cell_dim, cell_dim, affine_str + ) + ) configs.append("# hpart_t related matrix : W_hpart matrice") - configs.append("component name={0}.W_hpart.x type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim, cell_dim , affine_str)) - + configs.append( + "component name={0}.W_hpart.x type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format( + name, input_dim, cell_dim, affine_str + ) + ) + configs.append("# Defining the non-linearities for z_t and r_t") - configs.append("component name={0}.z type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) - configs.append("component name={0}.r type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) - - recurrent_connection = '{0}.s_t'.format(name) + configs.append( + "component name={0}.z type=SigmoidComponent dim={1} {2}".format( + name, cell_dim, repair_nonlin_str + ) + ) + configs.append( + "component name={0}.r type=SigmoidComponent dim={1} {2}".format( + name, cell_dim, repair_nonlin_str + ) + ) + + recurrent_connection = "{0}.s_t".format(name) configs.append("# z_t") - configs.append("component-node name={0}.z_t_pre component={0}.W_z.xh input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) - configs.append("component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format(name)) + configs.append( + "component-node name={0}.z_t_pre component={0}.W_z.xh input=Append({1}, IfDefined(Offset({2}, {3})))".format( + name, input_descriptor, recurrent_connection, delay + ) + ) + configs.append( + "component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format(name) + ) configs.append("# r_t") - configs.append("component-node name={0}.r_t_pre component={0}.W_r.xh input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) - configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_pre".format(name)) + configs.append( + "component-node name={0}.r_t_pre component={0}.W_r.xh input=Append({1}, IfDefined(Offset({2}, {3})))".format( + name, input_descriptor, recurrent_connection, delay + ) + ) + configs.append( + "component-node name={0}.r_t component={0}.r input={0}.r_t_pre".format(name) + ) configs.append("# hpart_t") - configs.append("component-node name={0}.hpart_t component={0}.W_hpart.x input={1}".format(name, input_descriptor)) - + configs.append( + "component-node 
name={0}.hpart_t component={0}.W_hpart.x input={1}".format( + name, input_descriptor + ) + ) + configs.append("# y_t") - configs.append("# Note: the output of GruNonlinearityComponent is (h_t, c_t), we just get the second half. Otherwise, in non-projection gru layer, y_t = c_t") - configs.append("component name={0}.gru_nonlin type=GruNonlinearityComponent cell-dim={1} {2}".format(name, cell_dim, gru_nonlin_str)) - configs.append("component-node name={0}.gru_nonlin_t component={0}.gru_nonlin input=Append({0}.z_t, {0}.r_t, {0}.hpart_t, IfDefined(Offset({1}, {2})))".format(name, recurrent_connection, delay)) - configs.append("dim-range-node name={0}.y_t input-node={0}.gru_nonlin_t dim-offset={1} dim={1}".format(name, cell_dim)) + configs.append( + "# Note: the output of GruNonlinearityComponent is (h_t, c_t), we just get the second half. Otherwise, in non-projection gru layer, y_t = c_t" + ) + configs.append( + "component name={0}.gru_nonlin type=GruNonlinearityComponent cell-dim={1} {2}".format( + name, cell_dim, gru_nonlin_str + ) + ) + configs.append( + "component-node name={0}.gru_nonlin_t component={0}.gru_nonlin input=Append({0}.z_t, {0}.r_t, {0}.hpart_t, IfDefined(Offset({1}, {2})))".format( + name, recurrent_connection, delay + ) + ) + configs.append( + "dim-range-node name={0}.y_t input-node={0}.gru_nonlin_t dim-offset={1} dim={1}".format( + name, cell_dim + ) + ) configs.append("# s_t : recurrence") - configs.append("# Note: in non-projection gru layer, the recurrent part equals the output, namely y_t.") - configs.append("component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format(name, cell_dim, bptrunc_str)) - configs.append("component-node name={0}.s_t component={0}.s_r input={0}.y_t".format(name)) + configs.append( + "# Note: in non-projection gru layer, the recurrent part equals the output, namely y_t." + ) + configs.append( + "component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format( + name, cell_dim, bptrunc_str + ) + ) + configs.append( + "component-node name={0}.s_t component={0}.s_r input={0}.y_t".format(name) + ) return configs @@ -1223,90 +2006,112 @@ def generate_gru_config(self): # gru-nonlinearity-options=' max-change=0.75' [options for GruNonlinearityComponent, see below for detail] # ng-affine-options='' [Additional options used for the full matrices in the GRU, can be used to do things like set biases to initialize to 1] class XconfigFastPgruLayer(XconfigLayerBase): - def __init__(self, first_token, key_to_value, prev_names = None): + def __init__(self, first_token, key_to_value, prev_names=None): assert first_token == "fast-pgru-layer" XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): - self.config = {'input' : '[-1]', - 'cell-dim' : -1, # this is a compulsory argument - 'recurrent-projection-dim' : -1, # defaults to cell-dim / 4 - 'non-recurrent-projection-dim' : -1, # defaults to - # recurrent-projection-dim - 'clipping-threshold' : 30.0, - 'delay' : -1, - 'ng-per-element-scale-options' : ' max-change=0.75 ', - 'ng-affine-options' : ' max-change=0.75 ', - 'self-repair-scale-nonlinearity' : 0.00001, - 'zeroing-interval' : 20, - 'zeroing-threshold' : 15.0, - # if you want to set 'self-repair-scale', ' self-repair-threshold' - # or 'param-stddev' for GruNonlinearityComponent - # For default, they are 1.0e-05, 0.2 and 1.0 / sqrt(d) where d is cell-dim. - # you can add somethig like 'self-repair-scale=xxx' to gru-nonlinearity-options. 
-                       # you can also see src/nnet3/nnet-combined-component.h for detail
-                       'gru-nonlinearity-options' : ' max-change=0.75'
-                       }
+        self.config = {
+            "input": "[-1]",
+            "cell-dim": -1,  # this is a compulsory argument
+            "recurrent-projection-dim": -1,  # defaults to cell-dim / 4
+            "non-recurrent-projection-dim": -1,  # defaults to
+            # recurrent-projection-dim
+            "clipping-threshold": 30.0,
+            "delay": -1,
+            "ng-per-element-scale-options": " max-change=0.75 ",
+            "ng-affine-options": " max-change=0.75 ",
+            "self-repair-scale-nonlinearity": 0.00001,
+            "zeroing-interval": 20,
+            "zeroing-threshold": 15.0,
+            # if you want to set 'self-repair-scale', 'self-repair-threshold'
+            # or 'param-stddev' for GruNonlinearityComponent:
+            # by default, they are 1.0e-05, 0.2 and 1.0 / sqrt(d) where d is cell-dim.
+            # you can add something like 'self-repair-scale=xxx' to gru-nonlinearity-options.
+            # you can also see src/nnet3/nnet-combined-component.h for detail
+            "gru-nonlinearity-options": " max-change=0.75",
+        }

     def set_derived_configs(self):
-        if self.config['recurrent-projection-dim'] <= 0:
-            self.config['recurrent-projection-dim'] = self.config['cell-dim'] / 4
+        if self.config["recurrent-projection-dim"] <= 0:
+            self.config["recurrent-projection-dim"] = self.config["cell-dim"] // 4

-        if self.config['non-recurrent-projection-dim'] <= 0:
-            self.config['non-recurrent-projection-dim'] = \
-                self.config['recurrent-projection-dim']
+        if self.config["non-recurrent-projection-dim"] <= 0:
+            self.config["non-recurrent-projection-dim"] = self.config[
+                "recurrent-projection-dim"
+            ]

     def check_configs(self):
-        for key in ['cell-dim', 'recurrent-projection-dim',
-                    'non-recurrent-projection-dim']:
+        for key in [
+            "cell-dim",
+            "recurrent-projection-dim",
+            "non-recurrent-projection-dim",
+        ]:
             if self.config[key] <= 0:
-                raise RuntimeError("{0} has invalid value {1}.".format(
-                    key, self.config[key]))
+                raise RuntimeError(
+                    "{0} has invalid value {1}.".format(key, self.config[key])
+                )

-        if self.config['delay'] == 0:
+        if self.config["delay"] == 0:
             raise RuntimeError("delay cannot be zero")

-        if (self.config['recurrent-projection-dim'] +
-            self.config['non-recurrent-projection-dim'] >
-            self.config['cell-dim']):
-            raise RuntimeError("recurrent+non-recurrent projection dim exceeds "
-                               "cell dim.")
-        for key in ['self-repair-scale-nonlinearity']:
+        if (
+            self.config["recurrent-projection-dim"]
+            + self.config["non-recurrent-projection-dim"]
+            > self.config["cell-dim"]
+        ):
+            raise RuntimeError(
+                "recurrent+non-recurrent projection dim exceeds cell dim."
+            )
+        for key in ["self-repair-scale-nonlinearity"]:
             if self.config[key] < 0.0 or self.config[key] > 1.0:
-                raise RuntimeError("{0} has invalid value {2}."
-                                   .format(self.layer_type, key,
-                                           self.config[key]))
+                raise RuntimeError(
+                    "{0}: {1} has invalid value {2}.".format(
+                        self.layer_type, key, self.config[key]
+                    )
+                )

     def auxiliary_outputs(self):
-        return ['c_t']
+        return ["c_t"]

-    def output_name(self, auxiliary_output = None):
-        node_name = 'y_t'
+    def output_name(self, auxiliary_output=None):
+        node_name = "y_t"
         if auxiliary_output is not None:
             if auxiliary_output in self.auxiliary_outputs():
                 node_name = auxiliary_output
             else:
-                raise Exception("In {0} of type {1}, unknown auxiliary output name {1}".format(self.layer_type, auxiliary_output))
+                raise Exception(
+                    "In {0} of type {1}, unknown auxiliary output name {2}".format(
+                        self.name, self.layer_type, auxiliary_output
+                    )
+                )

-        return '{0}.{1}'.format(self.name, node_name)
+        return "{0}.{1}".format(self.name, node_name)

-    def output_dim(self, auxiliary_output = None):
+    def output_dim(self, auxiliary_output=None):
         if auxiliary_output is not None:
             if auxiliary_output in self.auxiliary_outputs():
-                if node_name == 'c_t':
-                    return self.config['cell-dim']
+                if auxiliary_output == "c_t":
+                    return self.config["cell-dim"]
                 # add code for other auxiliary_outputs here when we decide to expose them
             else:
-                raise Exception("In {0} of type {1}, unknown auxiliary output name {1}".format(self.layer_type, auxiliary_output))
+                raise Exception(
+                    "In {0} of type {1}, unknown auxiliary output name {2}".format(
+                        self.name, self.layer_type, auxiliary_output
+                    )
+                )

-        return self.config['recurrent-projection-dim'] + self.config['non-recurrent-projection-dim']
+        return (
+            self.config["recurrent-projection-dim"]
+            + self.config["non-recurrent-projection-dim"]
+        )

     def get_full_config(self):
         ans = []
         config_lines = self.generate_pgru_config()
         for line in config_lines:
-            for config_name in ['ref', 'final']:
+            for config_name in ["ref", "final"]:
                 # we do not support user specified matrices in LSTM initialization
                 # so 'ref' and 'final' configs are the same.
ans.append((config_name, line))
@@ -1314,47 +2119,56 @@ def get_full_config(self):

 # convenience function to generate the PGRU config
 def generate_pgru_config(self):
-        # assign some variables to reduce verbosity
         name = self.name
         # in the below code we will just call descriptor_strings as descriptors for conciseness
-        input_dim = self.descriptors['input']['dim']
-        input_descriptor = self.descriptors['input']['final-string']
-        cell_dim = self.config['cell-dim']
-        rec_proj_dim = self.config['recurrent-projection-dim']
-        nonrec_proj_dim = self.config['non-recurrent-projection-dim']
-        delay = self.config['delay']
-        repair_nonlin = self.config['self-repair-scale-nonlinearity']
-        repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else ''
-        bptrunc_str = ("clipping-threshold={0}"
-                       " zeroing-threshold={1}"
-                       " zeroing-interval={2}"
-                       " recurrence-interval={3}"
-                       "".format(self.config['clipping-threshold'],
-                                 self.config['zeroing-threshold'],
-                                 self.config['zeroing-interval'],
-                                 abs(delay)))
-        affine_str = self.config['ng-affine-options']
-        pes_str = self.config['ng-per-element-scale-options']
+        input_dim = self.descriptors["input"]["dim"]
+        input_descriptor = self.descriptors["input"]["final-string"]
+        cell_dim = self.config["cell-dim"]
+        rec_proj_dim = self.config["recurrent-projection-dim"]
+        nonrec_proj_dim = self.config["non-recurrent-projection-dim"]
+        delay = self.config["delay"]
+        repair_nonlin = self.config["self-repair-scale-nonlinearity"]
+        repair_nonlin_str = (
+            "self-repair-scale={0:.10f}".format(repair_nonlin)
+            if repair_nonlin is not None
+            else ""
+        )
+        bptrunc_str = (
+            "clipping-threshold={0}"
+            " zeroing-threshold={1}"
+            " zeroing-interval={2}"
+            " recurrence-interval={3}"
+            "".format(
+                self.config["clipping-threshold"],
+                self.config["zeroing-threshold"],
+                self.config["zeroing-interval"],
+                abs(delay),
+            )
+        )
+        affine_str = self.config["ng-affine-options"]
+        pes_str = self.config["ng-per-element-scale-options"]
         # Natural gradient per element scale parameters
         # TODO: decide if we want to keep exposing these options
-        if re.search('param-mean', pes_str) is None and \
-           re.search('param-stddev', pes_str) is None:
-            pes_str += " param-mean=0.0 param-stddev=1.0 "
+        if (
+            re.search("param-mean", pes_str) is None
+            and re.search("param-stddev", pes_str) is None
+        ):
+            pes_str += " param-mean=0.0 param-stddev=1.0 "
         # string for GruNonlinearityComponent
-        gru_nonlin_str = self.config['gru-nonlinearity-options']
-
+        gru_nonlin_str = self.config["gru-nonlinearity-options"]
+
         # formulation like:
         # z_t = \sigmoid ( U^z x_t + W^z s_{t-1} ) # update gate
         # r_t = \sigmoid ( U^r x_t + W^r s_{t-1} ) # reset gate
         # h_t = \tanh ( U^h x_t + W^h ( s_{t-1} \dot r_t ) )
        # c_t = ( 1 - z_t ) \dot h_t + z_t \dot c_{t-1}
         # y_t = W^y c_t # dim(y_t) = recurrent_dim + non_recurrent_dim.
-        # This is the output of the GRU.
-        # s_t = y_t[0:recurrent_dim-1] # dimension range of y_t
-        # dim(s_t) = recurrent_dim.
+        # This is the output of the GRU.
+        # s_t = y_t[0:recurrent_dim-1] # dimension range of y_t
+        # dim(s_t) = recurrent_dim.
         # Note:
         # naming convention:
         # <layer-name>.W_<outputname>.<inputname> e.g.
Gru1.W_i.xr for matrix @@ -1367,43 +2181,110 @@ def generate_pgru_config(self): configs = [] configs.append("### Begin Gru layer '{0}'".format(name)) configs.append("# Update gate control : W_z* matrices") - configs.append("component name={0}.W_z.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str)) + configs.append( + "component name={0}.W_z.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format( + name, input_dim + rec_proj_dim, cell_dim, affine_str + ) + ) configs.append("# Reset gate control : W_r* matrices") - configs.append("component name={0}.W_r.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, rec_proj_dim, affine_str)) - + configs.append( + "component name={0}.W_r.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format( + name, input_dim + rec_proj_dim, rec_proj_dim, affine_str + ) + ) configs.append("# hpart_t related matrix : W_hpart matric") - configs.append("component name={0}.W_hpart.x type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim, cell_dim , affine_str)) - + configs.append( + "component name={0}.W_hpart.x type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format( + name, input_dim, cell_dim, affine_str + ) + ) + configs.append("# Defining the non-linearities") - configs.append("component name={0}.z type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) - configs.append("component name={0}.r type=SigmoidComponent dim={1} {2}".format(name, rec_proj_dim, repair_nonlin_str)) - - recurrent_connection = '{0}.s_t'.format(name) + configs.append( + "component name={0}.z type=SigmoidComponent dim={1} {2}".format( + name, cell_dim, repair_nonlin_str + ) + ) + configs.append( + "component name={0}.r type=SigmoidComponent dim={1} {2}".format( + name, rec_proj_dim, repair_nonlin_str + ) + ) + + recurrent_connection = "{0}.s_t".format(name) configs.append("# z_t and r_t") - configs.append("component-node name={0}.z_t_pre component={0}.W_z.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) - configs.append("component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format(name)) - configs.append("component-node name={0}.r_t_pre component={0}.W_r.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) - configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_pre".format(name)) + configs.append( + "component-node name={0}.z_t_pre component={0}.W_z.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format( + name, input_descriptor, recurrent_connection, delay + ) + ) + configs.append( + "component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format(name) + ) + configs.append( + "component-node name={0}.r_t_pre component={0}.W_r.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format( + name, input_descriptor, recurrent_connection, delay + ) + ) + configs.append( + "component-node name={0}.r_t component={0}.r input={0}.r_t_pre".format(name) + ) configs.append("# hpart_t") - configs.append("component-node name={0}.hpart_t component={0}.W_hpart.x input={1}".format(name, input_descriptor)) - + configs.append( + "component-node name={0}.hpart_t component={0}.W_hpart.x input={1}".format( + name, input_descriptor + ) + ) + configs.append("# c_t") - configs.append("# Note: the 
output of GruNonlinearityComponent is (h_t, c_t), we use the second half.") - configs.append("component name={0}.gru_nonlin type=GruNonlinearityComponent cell-dim={1} recurrent-dim={2} {3}".format(name, cell_dim, rec_proj_dim, gru_nonlin_str)) - configs.append("component-node name={0}.gru_nonlin_t component={0}.gru_nonlin input=Append({0}.z_t, {0}.r_t, {0}.hpart_t, IfDefined(Offset({0}.c_t, {2})), IfDefined(Offset({1}, {2})))".format(name, recurrent_connection, delay)) - configs.append("dim-range-node name={0}.c_t input-node={0}.gru_nonlin_t dim-offset={1} dim={1}".format(name, cell_dim)) + configs.append( + "# Note: the output of GruNonlinearityComponent is (h_t, c_t), we use the second half." + ) + configs.append( + "component name={0}.gru_nonlin type=GruNonlinearityComponent cell-dim={1} recurrent-dim={2} {3}".format( + name, cell_dim, rec_proj_dim, gru_nonlin_str + ) + ) + configs.append( + "component-node name={0}.gru_nonlin_t component={0}.gru_nonlin input=Append({0}.z_t, {0}.r_t, {0}.hpart_t, IfDefined(Offset({0}.c_t, {2})), IfDefined(Offset({1}, {2})))".format( + name, recurrent_connection, delay + ) + ) + configs.append( + "dim-range-node name={0}.c_t input-node={0}.gru_nonlin_t dim-offset={1} dim={1}".format( + name, cell_dim + ) + ) configs.append("# the projected matrix W_y.c and y_t") - configs.append("component name={0}.W_y.c type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str)) - configs.append("component-node name={0}.y_t component={0}.W_y.c input={0}.c_t".format(name)) + configs.append( + "component name={0}.W_y.c type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format( + name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str + ) + ) + configs.append( + "component-node name={0}.y_t component={0}.W_y.c input={0}.c_t".format(name) + ) configs.append("# s_t : recurrence") - configs.append("component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format(name, rec_proj_dim, bptrunc_str)) - configs.append("dim-range-node name={0}.s_t_pre input-node={0}.y_t dim-offset=0 dim={1}".format(name, rec_proj_dim)) - configs.append("component-node name={0}.s_t component={0}.s_r input={0}.s_t_pre".format(name)) + configs.append( + "component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format( + name, rec_proj_dim, bptrunc_str + ) + ) + configs.append( + "dim-range-node name={0}.s_t_pre input-node={0}.y_t dim-offset=0 dim={1}".format( + name, rec_proj_dim + ) + ) + configs.append( + "component-node name={0}.s_t component={0}.s_r input={0}.s_t_pre".format( + name + ) + ) return configs @@ -1413,6 +2294,7 @@ def generate_pgru_config(self): # Different from the vanilla PGRU, the NormPGRU uses batchnorm in the forward direction # and renorm in the recurrence. + # The output dimension of the layer may be specified via 'cell-dim=xxx', but if not specified, # the dimension defaults to the same as the input. # See other configuration values below. 
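 # As a concrete illustration (the parameter values below are hypothetical, not
 # defaults), an xconfig line using this layer might look like:
 #   fast-norm-pgru-layer name=pgru1 input=[-1] delay=-3 cell-dim=1024 recurrent-projection-dim=256 dropout-proportion=0.2
 # Note that cell-dim is compulsory for this layer (check_configs rejects
 # cell-dim <= 0); it does not default to the input dimension.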
@@ -1434,97 +2316,123 @@ def generate_pgru_config(self):
 # gru-nonlinearity-options=' max-change=0.75' [options for GruNonlinearityComponent, see below for detail]
 # ng-affine-options='' [Additional options used for the full matrices in the GRU, can be used to do things like set biases to initialize to 1]
 class XconfigFastNormPgruLayer(XconfigLayerBase):
-    def __init__(self, first_token, key_to_value, prev_names = None):
+    def __init__(self, first_token, key_to_value, prev_names=None):
         assert first_token == "fast-norm-pgru-layer"
         XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)

     def set_default_configs(self):
-        self.config = {'input' : '[-1]',
-                       'cell-dim' : -1, # this is a compulsory argument
-                       'recurrent-projection-dim' : -1, # defaults to cell-dim / 4
-                       'non-recurrent-projection-dim' : -1, # defaults to
-                                                            # recurrent-projection-dim
-                       'clipping-threshold' : 30.0,
-                       'delay' : -1,
-                       'ng-per-element-scale-options' : ' max-change=0.75 ',
-                       'ng-affine-options' : ' max-change=0.75 ',
-                       'self-repair-scale-nonlinearity' : 0.00001,
-                       'zeroing-interval' : 20,
-                       'zeroing-threshold' : 15.0,
-                       # if you want to set 'self-repair-scale', ' self-repair-threshold'
-                       # or 'param-stddev' for GruNonlinearityComponent
-                       # For default, they are 1.0e-05, 0.2 and 1.0 / sqrt(d) where d is cell-dim.
-                       # you can add somethig like 'self-repair-scale=xxx' to gru-nonlinearity-options.
-                       # you can also see src/nnet3/nnet-combined-component.h for detail
-                       'gru-nonlinearity-options' : ' max-change=0.75',
-                       'dropout-proportion' : -1.0, # If -1.0, no dropout components will be added
-                       'dropout-per-frame' : True # If False, regular dropout, not per frame
-                       }
+        self.config = {
+            "input": "[-1]",
+            "cell-dim": -1,  # this is a compulsory argument
+            "recurrent-projection-dim": -1,  # defaults to cell-dim / 4
+            "non-recurrent-projection-dim": -1,  # defaults to
+            # recurrent-projection-dim
+            "clipping-threshold": 30.0,
+            "delay": -1,
+            "ng-per-element-scale-options": " max-change=0.75 ",
+            "ng-affine-options": " max-change=0.75 ",
+            "self-repair-scale-nonlinearity": 0.00001,
+            "zeroing-interval": 20,
+            "zeroing-threshold": 15.0,
+            # if you want to set 'self-repair-scale', 'self-repair-threshold'
+            # or 'param-stddev' for GruNonlinearityComponent:
+            # by default, they are 1.0e-05, 0.2 and 1.0 / sqrt(d) where d is cell-dim.
+            # you can add something like 'self-repair-scale=xxx' to gru-nonlinearity-options.
+            # you can also see src/nnet3/nnet-combined-component.h for detail
+            "gru-nonlinearity-options": " max-change=0.75",
+            "dropout-proportion": -1.0,  # If -1.0, no dropout components will be added
+            "dropout-per-frame": True,  # If False, regular dropout, not per frame
+        }

     def set_derived_configs(self):
-        if self.config['recurrent-projection-dim'] <= 0:
-            self.config['recurrent-projection-dim'] = self.config['cell-dim'] / 4
+        if self.config["recurrent-projection-dim"] <= 0:
+            self.config["recurrent-projection-dim"] = self.config["cell-dim"] // 4

-        if self.config['non-recurrent-projection-dim'] <= 0:
-            self.config['non-recurrent-projection-dim'] = \
-                self.config['recurrent-projection-dim']
+        if self.config["non-recurrent-projection-dim"] <= 0:
+            self.config["non-recurrent-projection-dim"] = self.config[
+                "recurrent-projection-dim"
+            ]

     def check_configs(self):
-        for key in ['cell-dim', 'recurrent-projection-dim',
-                    'non-recurrent-projection-dim']:
+        for key in [
+            "cell-dim",
+            "recurrent-projection-dim",
+            "non-recurrent-projection-dim",
+        ]:
             if self.config[key] <= 0:
-                raise RuntimeError("{0} has invalid value {1}.".format(
-                    key, self.config[key]))
+                raise RuntimeError(
+                    "{0} has invalid value {1}.".format(key, self.config[key])
+                )

-        if self.config['delay'] == 0:
+        if self.config["delay"] == 0:
             raise RuntimeError("delay cannot be zero")

-        if (self.config['recurrent-projection-dim'] +
-            self.config['non-recurrent-projection-dim'] >
-            self.config['cell-dim']):
-            raise RuntimeError("recurrent+non-recurrent projection dim exceeds "
-                               "cell dim.")
-        for key in ['self-repair-scale-nonlinearity']:
+        if (
+            self.config["recurrent-projection-dim"]
+            + self.config["non-recurrent-projection-dim"]
+            > self.config["cell-dim"]
+        ):
+            raise RuntimeError(
+                "recurrent+non-recurrent projection dim exceeds cell dim."
+            )
+        for key in ["self-repair-scale-nonlinearity"]:
             if self.config[key] < 0.0 or self.config[key] > 1.0:
-                raise RuntimeError("{0} has invalid value {2}."
-                                   .format(self.layer_type, key,
-                                           self.config[key]))
-        if ((self.config['dropout-proportion'] > 1.0 or
-             self.config['dropout-proportion'] < 0.0) and
-            self.config['dropout-proportion'] != -1.0 ):
-            raise RuntimeError("dropout-proportion has invalid value {0}."
-                               .format(self.config['dropout-proportion']))
+                raise RuntimeError(
+                    "{0}: {1} has invalid value {2}.".format(
+                        self.layer_type, key, self.config[key]
+                    )
+                )
+        if (
+            self.config["dropout-proportion"] > 1.0
+            or self.config["dropout-proportion"] < 0.0
+        ) and self.config["dropout-proportion"] != -1.0:
+            raise RuntimeError(
+                "dropout-proportion has invalid value {0}.".format(
+                    self.config["dropout-proportion"]
+                )
+            )

     def auxiliary_outputs(self):
-        return ['c_t']
+        return ["c_t"]

-    def output_name(self, auxiliary_output = None):
-        node_name = 'y_t'
+    def output_name(self, auxiliary_output=None):
+        node_name = "y_t"
         if auxiliary_output is not None:
             if auxiliary_output in self.auxiliary_outputs():
                 node_name = auxiliary_output
             else:
-                raise Exception("In {0} of type {1}, unknown auxiliary output name {1}".format(self.layer_type, auxiliary_output))
+                raise Exception(
+                    "In {0} of type {1}, unknown auxiliary output name {2}".format(
+                        self.name, self.layer_type, auxiliary_output
+                    )
+                )

-        return '{0}.{1}'.format(self.name, node_name)
+        return "{0}.{1}".format(self.name, node_name)

-    def output_dim(self, auxiliary_output = None):
+    def output_dim(self, auxiliary_output=None):
         if auxiliary_output is not None:
             if auxiliary_output in self.auxiliary_outputs():
-                if node_name == 'c_t':
-                    return self.config['cell-dim']
+                if auxiliary_output == "c_t":
+                    return self.config["cell-dim"]
                 # add code for other auxiliary_outputs here when we decide to expose them
             else:
-                raise Exception("In {0} of type {1}, unknown auxiliary output name {1}".format(self.layer_type, auxiliary_output))
+                raise Exception(
+                    "In {0} of type {1}, unknown auxiliary output name {2}".format(
+                        self.name, self.layer_type, auxiliary_output
+                    )
+                )

-        return self.config['recurrent-projection-dim'] + self.config['non-recurrent-projection-dim']
+        return (
+            self.config["recurrent-projection-dim"]
+            + self.config["non-recurrent-projection-dim"]
+        )

     def get_full_config(self):
         ans = []
         config_lines = self.generate_pgru_config()
         for line in config_lines:
-            for config_name in ['ref', 'final']:
+            for config_name in ["ref", "final"]:
                 # we do not support user specified matrices in LSTM initialization
                 # so 'ref' and 'final' configs are the same.
ans.append((config_name, line))
@@ -1532,40 +2440,49 @@ def get_full_config(self):

 # convenience function to generate the Norm-PGRU config
 def generate_pgru_config(self):
-        # assign some variables to reduce verbosity
         name = self.name
         # in the below code we will just call descriptor_strings as descriptors for conciseness
-        input_dim = self.descriptors['input']['dim']
-        input_descriptor = self.descriptors['input']['final-string']
-        cell_dim = self.config['cell-dim']
-        rec_proj_dim = self.config['recurrent-projection-dim']
-        nonrec_proj_dim = self.config['non-recurrent-projection-dim']
-        delay = self.config['delay']
-        repair_nonlin = self.config['self-repair-scale-nonlinearity']
-        repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else ''
-        bptrunc_str = ("clipping-threshold={0}"
-                       " zeroing-threshold={1}"
-                       " zeroing-interval={2}"
-                       " recurrence-interval={3}"
-                       "".format(self.config['clipping-threshold'],
-                                 self.config['zeroing-threshold'],
-                                 self.config['zeroing-interval'],
-                                 abs(delay)))
-        affine_str = self.config['ng-affine-options']
-        pes_str = self.config['ng-per-element-scale-options']
-        dropout_proportion = self.config['dropout-proportion']
-        dropout_per_frame = 'true' if self.config['dropout-per-frame'] else 'false'
+        input_dim = self.descriptors["input"]["dim"]
+        input_descriptor = self.descriptors["input"]["final-string"]
+        cell_dim = self.config["cell-dim"]
+        rec_proj_dim = self.config["recurrent-projection-dim"]
+        nonrec_proj_dim = self.config["non-recurrent-projection-dim"]
+        delay = self.config["delay"]
+        repair_nonlin = self.config["self-repair-scale-nonlinearity"]
+        repair_nonlin_str = (
+            "self-repair-scale={0:.10f}".format(repair_nonlin)
+            if repair_nonlin is not None
+            else ""
+        )
+        bptrunc_str = (
+            "clipping-threshold={0}"
+            " zeroing-threshold={1}"
+            " zeroing-interval={2}"
+            " recurrence-interval={3}"
+            "".format(
+                self.config["clipping-threshold"],
+                self.config["zeroing-threshold"],
+                self.config["zeroing-interval"],
+                abs(delay),
+            )
+        )
+        affine_str = self.config["ng-affine-options"]
+        pes_str = self.config["ng-per-element-scale-options"]
+        dropout_proportion = self.config["dropout-proportion"]
+        dropout_per_frame = "true" if self.config["dropout-per-frame"] else "false"

         # Natural gradient per element scale parameters
         # TODO: decide if we want to keep exposing these options
-        if re.search('param-mean', pes_str) is None and \
-           re.search('param-stddev', pes_str) is None:
-            pes_str += " param-mean=0.0 param-stddev=1.0 "
+        if (
+            re.search("param-mean", pes_str) is None
+            and re.search("param-stddev", pes_str) is None
+        ):
+            pes_str += " param-mean=0.0 param-stddev=1.0 "
         # string for GruNonlinearityComponent
-        gru_nonlin_str = self.config['gru-nonlinearity-options']
-
+        gru_nonlin_str = self.config["gru-nonlinearity-options"]
+
         # formulation like:
         # z_t = \sigmoid ( U^z x_t + W^z s_{t-1} ) # update gate
         # r_t = \sigmoid ( U^r x_t + W^r s_{t-1} ) # reset gate
         # h_t = \tanh ( U^h x_t + W^h ( s_{t-1} \dot r_t ) )
         # c_t = ( 1 - z_t ) \dot h_t + z_t \dot c_{t-1}
         # y_t_tmp = W^y c_t
         # s_t = renorm ( y_t_tmp[0:rec_proj_dim-1] ) # dim(s_t) = recurrent_dim.
         # y_t = batchnorm ( y_t_tmp ) # dim(y_t) = recurrent_dim + non_recurrent_dim.
-        # This is the output of the GRU.
+        # This is the output of the GRU.
         # Note:
         # naming convention:
         # <layer-name>.W_<outputname>.<inputname> e.g.
Gru1.W_i.xr for matrix @@ -1587,69 +2504,179 @@ def generate_pgru_config(self): configs = [] configs.append("### Begin Gru layer '{0}'".format(name)) configs.append("# Update gate control : W_z* matrices") - configs.append("component name={0}.W_z.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str)) + configs.append( + "component name={0}.W_z.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format( + name, input_dim + rec_proj_dim, cell_dim, affine_str + ) + ) configs.append("# Reset gate control : W_r* matrices") - configs.append("component name={0}.W_r.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, rec_proj_dim, affine_str)) - + configs.append( + "component name={0}.W_r.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format( + name, input_dim + rec_proj_dim, rec_proj_dim, affine_str + ) + ) configs.append("# hpart_t related matrix : W_hpart matric") - configs.append("component name={0}.W_hpart.x type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim, cell_dim , affine_str)) - + configs.append( + "component name={0}.W_hpart.x type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format( + name, input_dim, cell_dim, affine_str + ) + ) + configs.append("# Defining the non-linearities") - configs.append("component name={0}.z type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) - configs.append("component name={0}.r type=SigmoidComponent dim={1} {2}".format(name, rec_proj_dim, repair_nonlin_str)) + configs.append( + "component name={0}.z type=SigmoidComponent dim={1} {2}".format( + name, cell_dim, repair_nonlin_str + ) + ) + configs.append( + "component name={0}.r type=SigmoidComponent dim={1} {2}".format( + name, rec_proj_dim, repair_nonlin_str + ) + ) if dropout_proportion != -1.0: configs.append("# Defining the dropout component") - configs.append("component name={0}.dropout_z type=DropoutComponent dim={1} " - "dropout-proportion={2} dropout-per-frame={3}" - .format(name, cell_dim, dropout_proportion, dropout_per_frame)) - configs.append("component name={0}.dropout_r type=DropoutComponent dim={1} " - "dropout-proportion={2} dropout-per-frame={3}" - .format(name, rec_proj_dim, dropout_proportion, dropout_per_frame)) - - - recurrent_connection = '{0}.s_t'.format(name) + configs.append( + "component name={0}.dropout_z type=DropoutComponent dim={1} " + "dropout-proportion={2} dropout-per-frame={3}".format( + name, cell_dim, dropout_proportion, dropout_per_frame + ) + ) + configs.append( + "component name={0}.dropout_r type=DropoutComponent dim={1} " + "dropout-proportion={2} dropout-per-frame={3}".format( + name, rec_proj_dim, dropout_proportion, dropout_per_frame + ) + ) + + recurrent_connection = "{0}.s_t".format(name) configs.append("# z_t") - configs.append("component-node name={0}.z_t_pre component={0}.W_z.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + configs.append( + "component-node name={0}.z_t_pre component={0}.W_z.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format( + name, input_descriptor, recurrent_connection, delay + ) + ) if dropout_proportion != -1.0: - configs.append("component-node name={0}.z_t_predrop component={0}.z input={0}.z_t_pre".format(name)) - configs.append("component-node name={0}.z_t component={0}.dropout_z 
input={0}.z_t_predrop".format(name)) + configs.append( + "component-node name={0}.z_t_predrop component={0}.z input={0}.z_t_pre".format( + name + ) + ) + configs.append( + "component-node name={0}.z_t component={0}.dropout_z input={0}.z_t_predrop".format( + name + ) + ) else: - configs.append("component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format(name)) + configs.append( + "component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format( + name + ) + ) configs.append("# r_t") - configs.append("component-node name={0}.r_t_pre component={0}.W_r.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + configs.append( + "component-node name={0}.r_t_pre component={0}.W_r.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format( + name, input_descriptor, recurrent_connection, delay + ) + ) if dropout_proportion != -1.0: - configs.append("component-node name={0}.r_t_predrop component={0}.r input={0}.r_t_pre".format(name)) - configs.append("component-node name={0}.r_t component={0}.dropout_r input={0}.r_t_predrop".format(name)) + configs.append( + "component-node name={0}.r_t_predrop component={0}.r input={0}.r_t_pre".format( + name + ) + ) + configs.append( + "component-node name={0}.r_t component={0}.dropout_r input={0}.r_t_predrop".format( + name + ) + ) else: - configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_pre".format(name)) + configs.append( + "component-node name={0}.r_t component={0}.r input={0}.r_t_pre".format( + name + ) + ) configs.append("# hpart_t") - configs.append("component-node name={0}.hpart_t component={0}.W_hpart.x input={1}".format(name, input_descriptor)) - + configs.append( + "component-node name={0}.hpart_t component={0}.W_hpart.x input={1}".format( + name, input_descriptor + ) + ) + configs.append("# c_t") - configs.append("# Note: the output of GruNonlinearityComponent is (h_t, c_t), we use the second half.") - configs.append("component name={0}.gru_nonlin type=GruNonlinearityComponent cell-dim={1} recurrent-dim={2} {3}".format(name, cell_dim, rec_proj_dim, gru_nonlin_str)) - configs.append("component-node name={0}.gru_nonlin_t component={0}.gru_nonlin input=Append({0}.z_t, {0}.r_t, {0}.hpart_t, IfDefined(Offset({0}.c_t, {2})), IfDefined(Offset({1}, {2})))".format(name, recurrent_connection, delay)) - configs.append("dim-range-node name={0}.c_t input-node={0}.gru_nonlin_t dim-offset={1} dim={1}".format(name, cell_dim)) + configs.append( + "# Note: the output of GruNonlinearityComponent is (h_t, c_t), we use the second half." 
+ ) + configs.append( + "component name={0}.gru_nonlin type=GruNonlinearityComponent cell-dim={1} recurrent-dim={2} {3}".format( + name, cell_dim, rec_proj_dim, gru_nonlin_str + ) + ) + configs.append( + "component-node name={0}.gru_nonlin_t component={0}.gru_nonlin input=Append({0}.z_t, {0}.r_t, {0}.hpart_t, IfDefined(Offset({0}.c_t, {2})), IfDefined(Offset({1}, {2})))".format( + name, recurrent_connection, delay + ) + ) + configs.append( + "dim-range-node name={0}.c_t input-node={0}.gru_nonlin_t dim-offset={1} dim={1}".format( + name, cell_dim + ) + ) configs.append("# the projected matrix W_y.c and y_t_tmp") - configs.append("component name={0}.W_y.c type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str)) - configs.append("component-node name={0}.y_t_tmp component={0}.W_y.c input={0}.c_t".format(name)) + configs.append( + "component name={0}.W_y.c type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format( + name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str + ) + ) + configs.append( + "component-node name={0}.y_t_tmp component={0}.W_y.c input={0}.c_t".format( + name + ) + ) configs.append("# s_t : recurrence") - configs.append("component name={0}.renorm type=NormalizeComponent dim={1} target-rms=1.0".format(name, rec_proj_dim)) - configs.append("component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format(name, rec_proj_dim, bptrunc_str)) - configs.append("dim-range-node name={0}.s_t_pre input-node={0}.y_t_tmp dim-offset=0 dim={1}".format(name, rec_proj_dim)) - configs.append("component-node name={0}.s_t_renorm component={0}.renorm input={0}.s_t_pre".format(name)) - configs.append("component-node name={0}.s_t component={0}.s_r input={0}.s_t_renorm".format(name)) + configs.append( + "component name={0}.renorm type=NormalizeComponent dim={1} target-rms=1.0".format( + name, rec_proj_dim + ) + ) + configs.append( + "component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format( + name, rec_proj_dim, bptrunc_str + ) + ) + configs.append( + "dim-range-node name={0}.s_t_pre input-node={0}.y_t_tmp dim-offset=0 dim={1}".format( + name, rec_proj_dim + ) + ) + configs.append( + "component-node name={0}.s_t_renorm component={0}.renorm input={0}.s_t_pre".format( + name + ) + ) + configs.append( + "component-node name={0}.s_t component={0}.s_r input={0}.s_t_renorm".format( + name + ) + ) configs.append("# y_t : output") - configs.append("component name={0}.batchnorm type=BatchNormComponent dim={1} target-rms=1.0".format(name, rec_proj_dim + nonrec_proj_dim)) - configs.append("component-node name={0}.y_t component={0}.batchnorm input={0}.y_t_tmp".format(name)) + configs.append( + "component name={0}.batchnorm type=BatchNormComponent dim={1} target-rms=1.0".format( + name, rec_proj_dim + nonrec_proj_dim + ) + ) + configs.append( + "component-node name={0}.y_t component={0}.batchnorm input={0}.y_t_tmp".format( + name + ) + ) return configs @@ -1679,90 +2706,112 @@ def generate_pgru_config(self): # gru-nonlinearity-options=' max-change=0.75' [options for GruNonlinearityComponent, see below for detail] # ng-affine-options='' [Additional options used for the full matrices in the GRU, can be used to do things like set biases to initialize to 1] class XconfigFastOpgruLayer(XconfigLayerBase): - def __init__(self, first_token, key_to_value, prev_names = None): + def __init__(self, first_token, key_to_value, prev_names=None): assert first_token == "fast-opgru-layer" 
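+        # illustrative usage (layer name hypothetical): an xconfig line such as
+        #   fast-opgru-layer name=opgru1 cell-dim=1024 recurrent-projection-dim=256 delay=-3
+        # is routed here, and 'opgru1' then prefixes every component this
+        # layer generates.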
XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)
 
     def set_default_configs(self):
-        self.config = {'input' : '[-1]',
-                       'cell-dim' : -1, # this is a compulsory argument
-                       'recurrent-projection-dim' : -1, # defaults to cell-dim / 4
-                       'non-recurrent-projection-dim' : -1, # defaults to
-                                                            # recurrent-projection-dim
-                       'clipping-threshold' : 30.0,
-                       'delay' : -1,
-                       'ng-per-element-scale-options' : ' max-change=0.75 ',
-                       'ng-affine-options' : ' max-change=0.75 ',
-                       'self-repair-scale-nonlinearity' : 0.00001,
-                       'zeroing-interval' : 20,
-                       'zeroing-threshold' : 15.0,
-                       # if you want to set 'self-repair-scale', ' self-repair-threshold'
-                       # or 'param-stddev' for GruNonlinearityComponent
-                       # For default, they are 1.0e-05, 0.2 and 1.0 / sqrt(d) where d is cell-dim.
-                       # you can add somethig like 'self-repair-scale=xxx' to gru-nonlinearity-options.
-                       # you can also see src/nnet3/nnet-combined-component.h for detail
-                       'gru-nonlinearity-options' : ' max-change=0.75'
-                       }
+        self.config = {
+            "input": "[-1]",
+            "cell-dim": -1,  # this is a compulsory argument
+            "recurrent-projection-dim": -1,  # defaults to cell-dim / 4
+            "non-recurrent-projection-dim": -1,  # defaults to
+            # recurrent-projection-dim
+            "clipping-threshold": 30.0,
+            "delay": -1,
+            "ng-per-element-scale-options": " max-change=0.75 ",
+            "ng-affine-options": " max-change=0.75 ",
+            "self-repair-scale-nonlinearity": 0.00001,
+            "zeroing-interval": 20,
+            "zeroing-threshold": 15.0,
+            # if you want to set 'self-repair-scale', 'self-repair-threshold'
+            # or 'param-stddev' for GruNonlinearityComponent:
+            # by default they are 1.0e-05, 0.2 and 1.0 / sqrt(d) where d is cell-dim.
+            # you can add something like 'self-repair-scale=xxx' to gru-nonlinearity-options.
+            # see src/nnet3/nnet-combined-component.h for details.
+            "gru-nonlinearity-options": " max-change=0.75",
+        }
 
     def set_derived_configs(self):
-        if self.config['recurrent-projection-dim'] <= 0:
-            self.config['recurrent-projection-dim'] = self.config['cell-dim'] / 4
+        if self.config["recurrent-projection-dim"] <= 0:
+            self.config["recurrent-projection-dim"] = self.config["cell-dim"] / 4
 
-        if self.config['non-recurrent-projection-dim'] <= 0:
-            self.config['non-recurrent-projection-dim'] = \
-                self.config['recurrent-projection-dim']
+        if self.config["non-recurrent-projection-dim"] <= 0:
+            self.config["non-recurrent-projection-dim"] = self.config[
+                "recurrent-projection-dim"
+            ]
 
     def check_configs(self):
-        for key in ['cell-dim', 'recurrent-projection-dim',
-                    'non-recurrent-projection-dim']:
+        for key in [
+            "cell-dim",
+            "recurrent-projection-dim",
+            "non-recurrent-projection-dim",
+        ]:
             if self.config[key] <= 0:
-                raise RuntimeError("{0} has invalid value {1}.".format(
-                    key, self.config[key]))
+                raise RuntimeError(
+                    "{0} has invalid value {1}.".format(key, self.config[key])
+                )
 
-        if self.config['delay'] == 0:
+        if self.config["delay"] == 0:
             raise RuntimeError("delay cannot be zero")
 
-        if (self.config['recurrent-projection-dim'] +
-            self.config['non-recurrent-projection-dim'] >
-            self.config['cell-dim']):
-            raise RuntimeError("recurrent+non-recurrent projection dim exceeds "
-                               "cell dim.")
-        for key in ['self-repair-scale-nonlinearity']:
+        if (
+            self.config["recurrent-projection-dim"]
+            + self.config["non-recurrent-projection-dim"]
+            > self.config["cell-dim"]
+        ):
+            raise RuntimeError(
+                "recurrent+non-recurrent projection dim exceeds " "cell dim."
+            )
+        for key in ["self-repair-scale-nonlinearity"]:
             if self.config[key] < 0.0 or self.config[key] > 1.0:
-                raise RuntimeError("{0} has invalid value {2}."
-                                   .format(self.layer_type, key,
-                                           self.config[key]))
+                raise RuntimeError(
+                    "{0}: {1} has invalid value {2}.".format(
+                        self.layer_type, key, self.config[key]
+                    )
+                )
 
     def auxiliary_outputs(self):
-        return ['c_t']
+        return ["c_t"]
 
-    def output_name(self, auxiliary_output = None):
-        node_name = 'y_t'
+    def output_name(self, auxiliary_output=None):
+        node_name = "y_t"
         if auxiliary_output is not None:
             if auxiliary_output in self.auxiliary_outputs():
                 node_name = auxiliary_output
             else:
-                raise Exception("In {0} of type {1}, unknown auxiliary output name {1}".format(self.layer_type, auxiliary_output))
+                raise Exception(
+                    "In {0} of type {1}, unknown auxiliary output name {2}".format(
+                        self.name, self.layer_type, auxiliary_output
+                    )
+                )
 
-        return '{0}.{1}'.format(self.name, node_name)
+        return "{0}.{1}".format(self.name, node_name)
 
-    def output_dim(self, auxiliary_output = None):
+    def output_dim(self, auxiliary_output=None):
         if auxiliary_output is not None:
             if auxiliary_output in self.auxiliary_outputs():
-                if node_name == 'c_t':
-                    return self.config['cell-dim']
+                # note: 'node_name' was undefined in this scope; test auxiliary_output
+                if auxiliary_output == "c_t":
+                    return self.config["cell-dim"]
                 # add code for other auxiliary_outputs here when we decide to expose them
             else:
-                raise Exception("In {0} of type {1}, unknown auxiliary output name {1}".format(self.layer_type, auxiliary_output))
+                raise Exception(
+                    "In {0} of type {1}, unknown auxiliary output name {2}".format(
+                        self.name, self.layer_type, auxiliary_output
+                    )
+                )
 
-        return self.config['recurrent-projection-dim'] + self.config['non-recurrent-projection-dim']
+        return (
+            self.config["recurrent-projection-dim"]
+            + self.config["non-recurrent-projection-dim"]
+        )
 
     def get_full_config(self):
         ans = []
         config_lines = self.generate_pgru_config()
         for line in config_lines:
-            for config_name in ['ref', 'final']:
+            for config_name in ["ref", "final"]:
                 # we do not support user specified matrices in LSTM initialization
                 # so 'ref' and 'final' configs are the same.
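+                # e.g. one generated config line becomes two identical entries
+                # (layer name hypothetical):
+                #   ("ref",   "component name=opgru1.W_z.xs type=...")
+                #   ("final", "component name=opgru1.W_z.xs type=...")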
ans.append((config_name, line)) @@ -1770,47 +2819,56 @@ def get_full_config(self): # convenience function to generate the OPGRU config def generate_pgru_config(self): - # assign some variables to reduce verbosity name = self.name # in the below code we will just call descriptor_strings as descriptors for conciseness - input_dim = self.descriptors['input']['dim'] - input_descriptor = self.descriptors['input']['final-string'] - cell_dim = self.config['cell-dim'] - rec_proj_dim = self.config['recurrent-projection-dim'] - nonrec_proj_dim = self.config['non-recurrent-projection-dim'] - delay = self.config['delay'] - repair_nonlin = self.config['self-repair-scale-nonlinearity'] - repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else '' - bptrunc_str = ("clipping-threshold={0}" - " zeroing-threshold={1}" - " zeroing-interval={2}" - " recurrence-interval={3}" - "".format(self.config['clipping-threshold'], - self.config['zeroing-threshold'], - self.config['zeroing-interval'], - abs(delay))) - affine_str = self.config['ng-affine-options'] - pes_str = self.config['ng-per-element-scale-options'] + input_dim = self.descriptors["input"]["dim"] + input_descriptor = self.descriptors["input"]["final-string"] + cell_dim = self.config["cell-dim"] + rec_proj_dim = self.config["recurrent-projection-dim"] + nonrec_proj_dim = self.config["non-recurrent-projection-dim"] + delay = self.config["delay"] + repair_nonlin = self.config["self-repair-scale-nonlinearity"] + repair_nonlin_str = ( + "self-repair-scale={0:.10f}".format(repair_nonlin) + if repair_nonlin is not None + else "" + ) + bptrunc_str = ( + "clipping-threshold={0}" + " zeroing-threshold={1}" + " zeroing-interval={2}" + " recurrence-interval={3}" + "".format( + self.config["clipping-threshold"], + self.config["zeroing-threshold"], + self.config["zeroing-interval"], + abs(delay), + ) + ) + affine_str = self.config["ng-affine-options"] + pes_str = self.config["ng-per-element-scale-options"] # Natural gradient per element scale parameters # TODO: decide if we want to keep exposing these options - if re.search('param-mean', pes_str) is None and \ - re.search('param-stddev', pes_str) is None: - pes_str += " param-mean=0.0 param-stddev=1.0 " + if ( + re.search("param-mean", pes_str) is None + and re.search("param-stddev", pes_str) is None + ): + pes_str += " param-mean=0.0 param-stddev=1.0 " # string for GruNonlinearityComponent - gru_nonlin_str = self.config['gru-nonlinearity-options'] - + gru_nonlin_str = self.config["gru-nonlinearity-options"] + # formulation like: # z_t = \sigmoid ( U^z x_t + W^z s_{t-1} ) # update gate # o_t = \sigmoid ( U^o x_t + W^o s_{t-1} ) # reset gate # h_t = \tanh ( U^h x_t + W^h \dot c_{t-1} ) # c_t = ( 1 - z_t ) \dot h_t + z_t \dot c_{t-1} # y_t = ( c_t \dot o_t ) W^y # dim(y_t) = recurrent_dim + non_recurrent_dim. - # This is the output of the GRU. - # s_t = y_t[0:recurrent_dim-1] # dimension range of y_t - # dim(s_t) = recurrent_dim. + # This is the output of the GRU. + # s_t = y_t[0:recurrent_dim-1] # dimension range of y_t + # dim(s_t) = recurrent_dim. # Note: # naming convention: # .W_. e.g. 
Gru1.W_i.xr for matrix @@ -1823,45 +2881,122 @@ def generate_pgru_config(self): configs = [] configs.append("### Begin Gru layer '{0}'".format(name)) configs.append("# Update gate control : W_z* matrices") - configs.append("component name={0}.W_z.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str)) + configs.append( + "component name={0}.W_z.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format( + name, input_dim + rec_proj_dim, cell_dim, affine_str + ) + ) configs.append("# Reset gate control : W_o* matrices") - configs.append("component name={0}.W_o.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str)) - + configs.append( + "component name={0}.W_o.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format( + name, input_dim + rec_proj_dim, cell_dim, affine_str + ) + ) configs.append("# hpart_t related matrix : W_hpart matric") - configs.append("component name={0}.W_hpart.x type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim, cell_dim , affine_str)) - + configs.append( + "component name={0}.W_hpart.x type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format( + name, input_dim, cell_dim, affine_str + ) + ) + configs.append("# Defining the non-linearities") - configs.append("component name={0}.z type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) - configs.append("component name={0}.o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) - - recurrent_connection = '{0}.s_t'.format(name) + configs.append( + "component name={0}.z type=SigmoidComponent dim={1} {2}".format( + name, cell_dim, repair_nonlin_str + ) + ) + configs.append( + "component name={0}.o type=SigmoidComponent dim={1} {2}".format( + name, cell_dim, repair_nonlin_str + ) + ) + + recurrent_connection = "{0}.s_t".format(name) configs.append("# z_t and o_t") - configs.append("component-node name={0}.z_t_pre component={0}.W_z.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) - configs.append("component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format(name)) - configs.append("component-node name={0}.o_t_pre component={0}.W_o.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) - configs.append("component-node name={0}.o_t component={0}.o input={0}.o_t_pre".format(name)) + configs.append( + "component-node name={0}.z_t_pre component={0}.W_z.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format( + name, input_descriptor, recurrent_connection, delay + ) + ) + configs.append( + "component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format(name) + ) + configs.append( + "component-node name={0}.o_t_pre component={0}.W_o.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format( + name, input_descriptor, recurrent_connection, delay + ) + ) + configs.append( + "component-node name={0}.o_t component={0}.o input={0}.o_t_pre".format(name) + ) configs.append("# hpart_t") - configs.append("component-node name={0}.hpart_t component={0}.W_hpart.x input={1}".format(name, input_descriptor)) - + configs.append( + "component-node name={0}.hpart_t component={0}.W_hpart.x input={1}".format( + name, input_descriptor + ) + ) + configs.append("# c_t") - configs.append("# Note: the output of 
OutputGruNonlinearityComponent is (h_t, c_t), we use the second half.") - configs.append("component name={0}.gru_nonlin type=OutputGruNonlinearityComponent cell-dim={1} {2}".format(name, cell_dim, gru_nonlin_str)) - configs.append("component-node name={0}.gru_nonlin_t component={0}.gru_nonlin input=Append({0}.z_t, {0}.hpart_t, IfDefined(Offset({0}.c_t, {1})))".format(name, delay)) - configs.append("dim-range-node name={0}.c_t input-node={0}.gru_nonlin_t dim-offset={1} dim={1}".format(name, cell_dim)) + configs.append( + "# Note: the output of OutputGruNonlinearityComponent is (h_t, c_t), we use the second half." + ) + configs.append( + "component name={0}.gru_nonlin type=OutputGruNonlinearityComponent cell-dim={1} {2}".format( + name, cell_dim, gru_nonlin_str + ) + ) + configs.append( + "component-node name={0}.gru_nonlin_t component={0}.gru_nonlin input=Append({0}.z_t, {0}.hpart_t, IfDefined(Offset({0}.c_t, {1})))".format( + name, delay + ) + ) + configs.append( + "dim-range-node name={0}.c_t input-node={0}.gru_nonlin_t dim-offset={1} dim={1}".format( + name, cell_dim + ) + ) configs.append("# the projected matrix W_y.cdoto and y_t") - configs.append("component name={0}.cdoto type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) - configs.append("component-node name={0}.cdoto component={0}.cdoto input=Append({0}.c_t, {0}.o_t)".format(name)) - configs.append("component name={0}.W_y.cdoto type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str)) - configs.append("component-node name={0}.y_t component={0}.W_y.cdoto input={0}.cdoto".format(name)) + configs.append( + "component name={0}.cdoto type=ElementwiseProductComponent input-dim={1} output-dim={2}".format( + name, 2 * cell_dim, cell_dim + ) + ) + configs.append( + "component-node name={0}.cdoto component={0}.cdoto input=Append({0}.c_t, {0}.o_t)".format( + name + ) + ) + configs.append( + "component name={0}.W_y.cdoto type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format( + name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str + ) + ) + configs.append( + "component-node name={0}.y_t component={0}.W_y.cdoto input={0}.cdoto".format( + name + ) + ) configs.append("# s_t recurrence") - configs.append("component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format(name, rec_proj_dim, bptrunc_str)) - configs.append("dim-range-node name={0}.s_t_preclip input-node={0}.y_t dim-offset=0 dim={1}".format(name, rec_proj_dim)) - configs.append("component-node name={0}.s_t component={0}.s_r input={0}.s_t_preclip".format(name)) + configs.append( + "component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format( + name, rec_proj_dim, bptrunc_str + ) + ) + configs.append( + "dim-range-node name={0}.s_t_preclip input-node={0}.y_t dim-offset=0 dim={1}".format( + name, rec_proj_dim + ) + ) + configs.append( + "component-node name={0}.s_t component={0}.s_r input={0}.s_t_preclip".format( + name + ) + ) return configs @@ -1872,6 +3007,7 @@ def generate_pgru_config(self): # Different from the vanilla OPGRU, the NormOPGRU uses batchnorm in the forward direction # and renorm in the recurrence. + # The output dimension of the layer may be specified via 'cell-dim=xxx', but if not specified, # the dimension defaults to the same as the input. # See other configuration values below. 
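As a reference for the formulation comments above, the recurrence that these generated components wire up can be written out directly. The following is a minimal NumPy sketch of one OPGRU step; it is illustrative only: the function and parameter names (opgru_step, W_z, w_h, ...) are assumptions rather than anything from this patch or Kaldi's API, the matrices are random instead of trained, and the batchnorm on y_t is omitted. The renorm on s_t follows the Norm-OPGRU variant described above.

import numpy as np


def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))


def opgru_step(x_t, s_prev, c_prev, p):
    """One OPGRU step mirroring the generated components above (illustrative)."""
    xs = np.concatenate([x_t, s_prev])  # Append(input, Offset(s_t, delay))
    z_t = sigmoid(p["W_z"] @ xs)        # update gate, cf. W_z.xs -> z
    o_t = sigmoid(p["W_o"] @ xs)        # output gate, cf. W_o.xs -> o
    hpart_t = p["W_hpart"] @ x_t        # cf. W_hpart.x
    # OutputGruNonlinearityComponent applies a per-element weight w_h to c_{t-1}
    h_t = np.tanh(hpart_t + p["w_h"] * c_prev)
    c_t = (1.0 - z_t) * h_t + z_t * c_prev
    y_t = p["W_y"] @ (c_t * o_t)        # cdoto, then the W_y.cdoto projection
    s_t = y_t[: p["rec_proj_dim"]]      # cf. dim-range-node s_t_pre
    s_t = s_t / max(np.sqrt(np.mean(s_t**2)), 1e-8)  # renorm, target-rms=1.0
    return y_t, s_t, c_t


# toy shapes: input_dim=8, cell_dim=16, rec_proj_dim=4, non_rec_proj_dim=4
rng = np.random.default_rng(0)
p = {
    "W_z": 0.1 * rng.standard_normal((16, 12)),
    "W_o": 0.1 * rng.standard_normal((16, 12)),
    "W_hpart": 0.1 * rng.standard_normal((16, 8)),
    "w_h": 0.1 * rng.standard_normal(16),
    "W_y": 0.1 * rng.standard_normal((8, 16)),
    "rec_proj_dim": 4,
}
y_t, s_t, c_t = opgru_step(rng.standard_normal(8), np.zeros(4), np.zeros(16), p)

In the actual nnet3 graph the same computation is spread across component-nodes, with BackpropTruncationComponent (the bptrunc_str options above) guarding the s_t recurrence during training.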
@@ -1893,97 +3029,123 @@ def generate_pgru_config(self):
 #   gru-nonlinearity-options=' max-change=0.75'    [options for GruNonlinearityComponent, see below for detail]
 #   ng-affine-options=''                  [Additional options used for the full matrices in the GRU, can be used to do things like set biases to initialize to 1]
 class XconfigFastNormOpgruLayer(XconfigLayerBase):
-    def __init__(self, first_token, key_to_value, prev_names = None):
+    def __init__(self, first_token, key_to_value, prev_names=None):
         assert first_token == "fast-norm-opgru-layer"
         XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)
 
     def set_default_configs(self):
-        self.config = {'input' : '[-1]',
-                       'cell-dim' : -1, # this is a compulsory argument
-                       'recurrent-projection-dim' : -1, # defaults to cell-dim / 4
-                       'non-recurrent-projection-dim' : -1, # defaults to
-                                                            # recurrent-projection-dim
-                       'clipping-threshold' : 30.0,
-                       'delay' : -1,
-                       'ng-per-element-scale-options' : ' max-change=0.75 ',
-                       'ng-affine-options' : ' max-change=0.75 ',
-                       'self-repair-scale-nonlinearity' : 0.00001,
-                       'zeroing-interval' : 20,
-                       'zeroing-threshold' : 15.0,
-                       # if you want to set 'self-repair-scale', ' self-repair-threshold'
-                       # or 'param-stddev' for GruNonlinearityComponent
-                       # For default, they are 1.0e-05, 0.2 and 1.0 / sqrt(d) where d is cell-dim.
-                       # you can add somethig like 'self-repair-scale=xxx' to gru-nonlinearity-options.
-                       # you can also see src/nnet3/nnet-combined-component.h for detail
-                       'gru-nonlinearity-options' : ' max-change=0.75',
-                       'dropout-proportion' : -1.0, # If -1.0, no dropout components will be added
-                       'dropout-per-frame' : True # If False, regular dropout, not per frame
-                       }
+        self.config = {
+            "input": "[-1]",
+            "cell-dim": -1,  # this is a compulsory argument
+            "recurrent-projection-dim": -1,  # defaults to cell-dim / 4
+            "non-recurrent-projection-dim": -1,  # defaults to
+            # recurrent-projection-dim
+            "clipping-threshold": 30.0,
+            "delay": -1,
+            "ng-per-element-scale-options": " max-change=0.75 ",
+            "ng-affine-options": " max-change=0.75 ",
+            "self-repair-scale-nonlinearity": 0.00001,
+            "zeroing-interval": 20,
+            "zeroing-threshold": 15.0,
+            # if you want to set 'self-repair-scale', 'self-repair-threshold'
+            # or 'param-stddev' for GruNonlinearityComponent:
+            # by default they are 1.0e-05, 0.2 and 1.0 / sqrt(d) where d is cell-dim.
+            # you can add something like 'self-repair-scale=xxx' to gru-nonlinearity-options.
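+            # e.g. (illustrative):
+            #   gru-nonlinearity-options=' max-change=0.75 self-repair-scale=2e-05'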
+ # you can also see src/nnet3/nnet-combined-component.h for detail + "gru-nonlinearity-options": " max-change=0.75", + "dropout-proportion": -1.0, # If -1.0, no dropout components will be added + "dropout-per-frame": True, # If False, regular dropout, not per frame + } def set_derived_configs(self): - if self.config['recurrent-projection-dim'] <= 0: - self.config['recurrent-projection-dim'] = self.config['cell-dim'] / 4 + if self.config["recurrent-projection-dim"] <= 0: + self.config["recurrent-projection-dim"] = self.config["cell-dim"] / 4 - if self.config['non-recurrent-projection-dim'] <= 0: - self.config['non-recurrent-projection-dim'] = \ - self.config['recurrent-projection-dim'] + if self.config["non-recurrent-projection-dim"] <= 0: + self.config["non-recurrent-projection-dim"] = self.config[ + "recurrent-projection-dim" + ] def check_configs(self): - for key in ['cell-dim', 'recurrent-projection-dim', - 'non-recurrent-projection-dim']: + for key in [ + "cell-dim", + "recurrent-projection-dim", + "non-recurrent-projection-dim", + ]: if self.config[key] <= 0: - raise RuntimeError("{0} has invalid value {1}.".format( - key, self.config[key])) + raise RuntimeError( + "{0} has invalid value {1}.".format(key, self.config[key]) + ) - if self.config['delay'] == 0: + if self.config["delay"] == 0: raise RuntimeError("delay cannot be zero") - if (self.config['recurrent-projection-dim'] + - self.config['non-recurrent-projection-dim'] > - self.config['cell-dim']): - raise RuntimeError("recurrent+non-recurrent projection dim exceeds " - "cell dim.") - for key in ['self-repair-scale-nonlinearity']: + if ( + self.config["recurrent-projection-dim"] + + self.config["non-recurrent-projection-dim"] + > self.config["cell-dim"] + ): + raise RuntimeError( + "recurrent+non-recurrent projection dim exceeds " "cell dim." + ) + for key in ["self-repair-scale-nonlinearity"]: if self.config[key] < 0.0 or self.config[key] > 1.0: - raise RuntimeError("{0} has invalid value {2}." - .format(self.layer_type, key, - self.config[key])) - if ((self.config['dropout-proportion'] > 1.0 or - self.config['dropout-proportion'] < 0.0) and - self.config['dropout-proportion'] != -1.0 ): - raise RuntimeError("dropout-proportion has invalid value {0}." 
-                               .format(self.config['dropout-proportion']))
+                raise RuntimeError(
+                    "{0}: {1} has invalid value {2}.".format(
+                        self.layer_type, key, self.config[key]
+                    )
+                )
+        if (
+            self.config["dropout-proportion"] > 1.0
+            or self.config["dropout-proportion"] < 0.0
+        ) and self.config["dropout-proportion"] != -1.0:
+            raise RuntimeError(
+                "dropout-proportion has invalid value {0}.".format(
+                    self.config["dropout-proportion"]
+                )
+            )
 
     def auxiliary_outputs(self):
-        return ['c_t']
+        return ["c_t"]
 
-    def output_name(self, auxiliary_output = None):
-        node_name = 'y_t'
+    def output_name(self, auxiliary_output=None):
+        node_name = "y_t"
         if auxiliary_output is not None:
             if auxiliary_output in self.auxiliary_outputs():
                 node_name = auxiliary_output
             else:
-                raise Exception("In {0} of type {1}, unknown auxiliary output name {1}".format(self.layer_type, auxiliary_output))
+                raise Exception(
+                    "In {0} of type {1}, unknown auxiliary output name {2}".format(
+                        self.name, self.layer_type, auxiliary_output
+                    )
+                )
 
-        return '{0}.{1}'.format(self.name, node_name)
+        return "{0}.{1}".format(self.name, node_name)
 
-    def output_dim(self, auxiliary_output = None):
+    def output_dim(self, auxiliary_output=None):
         if auxiliary_output is not None:
             if auxiliary_output in self.auxiliary_outputs():
-                if node_name == 'c_t':
-                    return self.config['cell-dim']
+                # note: 'node_name' was undefined in this scope; test auxiliary_output
+                if auxiliary_output == "c_t":
+                    return self.config["cell-dim"]
                 # add code for other auxiliary_outputs here when we decide to expose them
             else:
-                raise Exception("In {0} of type {1}, unknown auxiliary output name {1}".format(self.layer_type, auxiliary_output))
+                raise Exception(
+                    "In {0} of type {1}, unknown auxiliary output name {2}".format(
+                        self.name, self.layer_type, auxiliary_output
+                    )
+                )
 
-        return self.config['recurrent-projection-dim'] + self.config['non-recurrent-projection-dim']
+        return (
+            self.config["recurrent-projection-dim"]
+            + self.config["non-recurrent-projection-dim"]
+        )
 
     def get_full_config(self):
         ans = []
         config_lines = self.generate_pgru_config()
         for line in config_lines:
-            for config_name in ['ref', 'final']:
+            for config_name in ["ref", "final"]:
                 # we do not support user specified matrices in LSTM initialization
                 # so 'ref' and 'final' configs are the same.
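+                # ('ref' is the config variant used to work out dims and context
+                # before training, 'final' is the one actually trained; they can
+                # only differ for layers that accept user-supplied matrices,
+                # which these layers do not.)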
ans.append((config_name, line)) @@ -1991,40 +3153,49 @@ def get_full_config(self): # convenience function to generate the Norm-OPGRU config def generate_pgru_config(self): - # assign some variables to reduce verbosity name = self.name # in the below code we will just call descriptor_strings as descriptors for conciseness - input_dim = self.descriptors['input']['dim'] - input_descriptor = self.descriptors['input']['final-string'] - cell_dim = self.config['cell-dim'] - rec_proj_dim = self.config['recurrent-projection-dim'] - nonrec_proj_dim = self.config['non-recurrent-projection-dim'] - delay = self.config['delay'] - repair_nonlin = self.config['self-repair-scale-nonlinearity'] - repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else '' - bptrunc_str = ("clipping-threshold={0}" - " zeroing-threshold={1}" - " zeroing-interval={2}" - " recurrence-interval={3}" - "".format(self.config['clipping-threshold'], - self.config['zeroing-threshold'], - self.config['zeroing-interval'], - abs(delay))) - affine_str = self.config['ng-affine-options'] - pes_str = self.config['ng-per-element-scale-options'] - dropout_proportion = self.config['dropout-proportion'] - dropout_per_frame = 'true' if self.config['dropout-per-frame'] else 'false' + input_dim = self.descriptors["input"]["dim"] + input_descriptor = self.descriptors["input"]["final-string"] + cell_dim = self.config["cell-dim"] + rec_proj_dim = self.config["recurrent-projection-dim"] + nonrec_proj_dim = self.config["non-recurrent-projection-dim"] + delay = self.config["delay"] + repair_nonlin = self.config["self-repair-scale-nonlinearity"] + repair_nonlin_str = ( + "self-repair-scale={0:.10f}".format(repair_nonlin) + if repair_nonlin is not None + else "" + ) + bptrunc_str = ( + "clipping-threshold={0}" + " zeroing-threshold={1}" + " zeroing-interval={2}" + " recurrence-interval={3}" + "".format( + self.config["clipping-threshold"], + self.config["zeroing-threshold"], + self.config["zeroing-interval"], + abs(delay), + ) + ) + affine_str = self.config["ng-affine-options"] + pes_str = self.config["ng-per-element-scale-options"] + dropout_proportion = self.config["dropout-proportion"] + dropout_per_frame = "true" if self.config["dropout-per-frame"] else "false" # Natural gradient per element scale parameters # TODO: decide if we want to keep exposing these options - if re.search('param-mean', pes_str) is None and \ - re.search('param-stddev', pes_str) is None: - pes_str += " param-mean=0.0 param-stddev=1.0 " + if ( + re.search("param-mean", pes_str) is None + and re.search("param-stddev", pes_str) is None + ): + pes_str += " param-mean=0.0 param-stddev=1.0 " # string for GruNonlinearityComponent - gru_nonlin_str = self.config['gru-nonlinearity-options'] - + gru_nonlin_str = self.config["gru-nonlinearity-options"] + # formulation like: # z_t = \sigmoid ( U^z x_t + W^z s_{t-1} ) # update gate # o_t = \sigmoid ( U^o x_t + W^o s_{t-1} ) # output gate @@ -2033,7 +3204,7 @@ def generate_pgru_config(self): # y_t_tmp = ( c_t \dot o_t ) W^y # s_t = renorm ( y_t_tmp[0:rec_proj_dim-1] ) # dim(s_t) = recurrent_dim. # y_t = batchnorm ( y_t_tmp ) # dim(y_t) = recurrent_dim + non_recurrent_dim. - # This is the output of the GRU. + # This is the output of the GRU. # Note: # naming convention: # .W_. e.g. 
Gru1.W_i.xr for matrix @@ -2046,66 +3217,182 @@ def generate_pgru_config(self): configs = [] configs.append("### Begin Gru layer '{0}'".format(name)) configs.append("# Update gate control : W_z* matrices") - configs.append("component name={0}.W_z.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str)) + configs.append( + "component name={0}.W_z.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format( + name, input_dim + rec_proj_dim, cell_dim, affine_str + ) + ) configs.append("# Reset gate control : W_o* matrices") - configs.append("component name={0}.W_o.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim + rec_proj_dim, cell_dim, affine_str)) - + configs.append( + "component name={0}.W_o.xs type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format( + name, input_dim + rec_proj_dim, cell_dim, affine_str + ) + ) configs.append("# hpart_t related matrix : W_hpart matric") - configs.append("component name={0}.W_hpart.x type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, input_dim, cell_dim , affine_str)) - + configs.append( + "component name={0}.W_hpart.x type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format( + name, input_dim, cell_dim, affine_str + ) + ) + configs.append("# Defining the non-linearities") - configs.append("component name={0}.z type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) - configs.append("component name={0}.o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append( + "component name={0}.z type=SigmoidComponent dim={1} {2}".format( + name, cell_dim, repair_nonlin_str + ) + ) + configs.append( + "component name={0}.o type=SigmoidComponent dim={1} {2}".format( + name, cell_dim, repair_nonlin_str + ) + ) if dropout_proportion != -1.0: configs.append("# Defining the dropout component") - configs.append("component name={0}.dropout type=DropoutComponent dim={1} " - "dropout-proportion={2} dropout-per-frame={3}" - .format(name, cell_dim, dropout_proportion, dropout_per_frame)) + configs.append( + "component name={0}.dropout type=DropoutComponent dim={1} " + "dropout-proportion={2} dropout-per-frame={3}".format( + name, cell_dim, dropout_proportion, dropout_per_frame + ) + ) - recurrent_connection = '{0}.s_t'.format(name) + recurrent_connection = "{0}.s_t".format(name) configs.append("# z_t") - configs.append("component-node name={0}.z_t_pre component={0}.W_z.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + configs.append( + "component-node name={0}.z_t_pre component={0}.W_z.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format( + name, input_descriptor, recurrent_connection, delay + ) + ) if dropout_proportion != -1.0: - configs.append("component-node name={0}.z_t_predrop component={0}.z input={0}.z_t_pre".format(name)) - configs.append("component-node name={0}.z_t component={0}.dropout input={0}.z_t_predrop".format(name)) + configs.append( + "component-node name={0}.z_t_predrop component={0}.z input={0}.z_t_pre".format( + name + ) + ) + configs.append( + "component-node name={0}.z_t component={0}.dropout input={0}.z_t_predrop".format( + name + ) + ) else: - configs.append("component-node name={0}.z_t component={0}.z input={0}.z_t_pre".format(name)) + configs.append( + "component-node name={0}.z_t component={0}.z 
input={0}.z_t_pre".format( + name + ) + ) configs.append("# o_t") - configs.append("component-node name={0}.o_t_pre component={0}.W_o.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) + configs.append( + "component-node name={0}.o_t_pre component={0}.W_o.xs input=Append({1}, IfDefined(Offset({2}, {3})))".format( + name, input_descriptor, recurrent_connection, delay + ) + ) if dropout_proportion != -1.0: - configs.append("component-node name={0}.o_t_predrop component={0}.o input={0}.o_t_pre".format(name)) - configs.append("component-node name={0}.o_t component={0}.dropout input={0}.o_t_predrop".format(name)) + configs.append( + "component-node name={0}.o_t_predrop component={0}.o input={0}.o_t_pre".format( + name + ) + ) + configs.append( + "component-node name={0}.o_t component={0}.dropout input={0}.o_t_predrop".format( + name + ) + ) else: - configs.append("component-node name={0}.o_t component={0}.o input={0}.o_t_pre".format(name)) + configs.append( + "component-node name={0}.o_t component={0}.o input={0}.o_t_pre".format( + name + ) + ) configs.append("# hpart_t") - configs.append("component-node name={0}.hpart_t component={0}.W_hpart.x input={1}".format(name, input_descriptor)) - + configs.append( + "component-node name={0}.hpart_t component={0}.W_hpart.x input={1}".format( + name, input_descriptor + ) + ) + configs.append("# c_t") - configs.append("# Note: the output of OutputGruNonlinearityComponent is (h_t, c_t), we use the second half.") - configs.append("component name={0}.gru_nonlin type=OutputGruNonlinearityComponent cell-dim={1} {2}".format(name, cell_dim, gru_nonlin_str)) - configs.append("component-node name={0}.gru_nonlin_t component={0}.gru_nonlin input=Append({0}.z_t, {0}.hpart_t, IfDefined(Offset({0}.c_t, {1})))".format(name, delay)) - configs.append("dim-range-node name={0}.c_t input-node={0}.gru_nonlin_t dim-offset={1} dim={1}".format(name, cell_dim)) + configs.append( + "# Note: the output of OutputGruNonlinearityComponent is (h_t, c_t), we use the second half." 
+ ) + configs.append( + "component name={0}.gru_nonlin type=OutputGruNonlinearityComponent cell-dim={1} {2}".format( + name, cell_dim, gru_nonlin_str + ) + ) + configs.append( + "component-node name={0}.gru_nonlin_t component={0}.gru_nonlin input=Append({0}.z_t, {0}.hpart_t, IfDefined(Offset({0}.c_t, {1})))".format( + name, delay + ) + ) + configs.append( + "dim-range-node name={0}.c_t input-node={0}.gru_nonlin_t dim-offset={1} dim={1}".format( + name, cell_dim + ) + ) configs.append("# the projected matrix W_y.cdoto and y_t_tmp") - configs.append("component name={0}.cdoto type=ElementwiseProductComponent input-dim={1} output-dim={2}".format(name, 2 * cell_dim, cell_dim)) - configs.append("component-node name={0}.cdoto component={0}.cdoto input=Append({0}.c_t, {0}.o_t)".format(name)) - configs.append("component name={0}.W_y.cdoto type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format(name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str)) - configs.append("component-node name={0}.y_t_tmp component={0}.W_y.cdoto input={0}.cdoto".format(name)) + configs.append( + "component name={0}.cdoto type=ElementwiseProductComponent input-dim={1} output-dim={2}".format( + name, 2 * cell_dim, cell_dim + ) + ) + configs.append( + "component-node name={0}.cdoto component={0}.cdoto input=Append({0}.c_t, {0}.o_t)".format( + name + ) + ) + configs.append( + "component name={0}.W_y.cdoto type=NaturalGradientAffineComponent input-dim={1} output-dim={2} {3}".format( + name, cell_dim, rec_proj_dim + nonrec_proj_dim, affine_str + ) + ) + configs.append( + "component-node name={0}.y_t_tmp component={0}.W_y.cdoto input={0}.cdoto".format( + name + ) + ) configs.append("# s_t : recurrence") - configs.append("component name={0}.renorm type=NormalizeComponent dim={1} target-rms=1.0".format(name, rec_proj_dim)) - configs.append("component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format(name, rec_proj_dim, bptrunc_str)) - configs.append("dim-range-node name={0}.s_t_pre input-node={0}.y_t_tmp dim-offset=0 dim={1}".format(name, rec_proj_dim)) - configs.append("component-node name={0}.s_t_renorm component={0}.renorm input={0}.s_t_pre".format(name)) - configs.append("component-node name={0}.s_t component={0}.s_r input={0}.s_t_renorm".format(name)) + configs.append( + "component name={0}.renorm type=NormalizeComponent dim={1} target-rms=1.0".format( + name, rec_proj_dim + ) + ) + configs.append( + "component name={0}.s_r type=BackpropTruncationComponent dim={1} {2}".format( + name, rec_proj_dim, bptrunc_str + ) + ) + configs.append( + "dim-range-node name={0}.s_t_pre input-node={0}.y_t_tmp dim-offset=0 dim={1}".format( + name, rec_proj_dim + ) + ) + configs.append( + "component-node name={0}.s_t_renorm component={0}.renorm input={0}.s_t_pre".format( + name + ) + ) + configs.append( + "component-node name={0}.s_t component={0}.s_r input={0}.s_t_renorm".format( + name + ) + ) configs.append("# y_t : output") - configs.append("component name={0}.batchnorm type=BatchNormComponent dim={1} target-rms=1.0".format(name, rec_proj_dim + nonrec_proj_dim)) - configs.append("component-node name={0}.y_t component={0}.batchnorm input={0}.y_t_tmp".format(name)) - + configs.append( + "component name={0}.batchnorm type=BatchNormComponent dim={1} target-rms=1.0".format( + name, rec_proj_dim + nonrec_proj_dim + ) + ) + configs.append( + "component-node name={0}.y_t component={0}.batchnorm input={0}.y_t_tmp".format( + name + ) + ) + return configs diff --git 
a/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/lstm.py b/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/lstm.py
index 4910a4a585b..85e0ebbdda8 100644
--- a/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/lstm.py
+++ b/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/lstm.py
@@ -43,70 +43,79 @@
 #   accumulated in c_t.]
 #   l2-regularize=0.0        Constant controlling l2 regularization for this layer
 class XconfigLstmLayer(XconfigLayerBase):
-    def __init__(self, first_token, key_to_value, prev_names = None):
+    def __init__(self, first_token, key_to_value, prev_names=None):
         assert first_token == "lstm-layer"
         XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names)
 
     def set_default_configs(self):
-        self.config = {'input':'[-1]',
-                       'cell-dim' : -1, # this is a compulsory argument
-                       'clipping-threshold' : 30.0,
-                       'delay' : -1,
-                       'ng-per-element-scale-options' : ' max-change=0.75',
-                       'ng-affine-options' : ' max-change=0.75 ',
-                       'self-repair-scale-nonlinearity' : 0.00001,
-                       'zeroing-interval' : 20,
-                       'zeroing-threshold' : 15.0,
-                       'l2-regularize': 0.0,
-                       'decay-time': -1.0
-                       }
+        self.config = {
+            "input": "[-1]",
+            "cell-dim": -1,  # this is a compulsory argument
+            "clipping-threshold": 30.0,
+            "delay": -1,
+            "ng-per-element-scale-options": " max-change=0.75",
+            "ng-affine-options": " max-change=0.75 ",
+            "self-repair-scale-nonlinearity": 0.00001,
+            "zeroing-interval": 20,
+            "zeroing-threshold": 15.0,
+            "l2-regularize": 0.0,
+            "decay-time": -1.0,
+        }
 
     def set_derived_configs(self):
-        if self.config['cell-dim'] <= 0:
-            self.config['cell-dim'] = self.descriptors['input']['dim']
+        if self.config["cell-dim"] <= 0:
+            self.config["cell-dim"] = self.descriptors["input"]["dim"]
 
     def check_configs(self):
-        key = 'cell-dim'
-        if self.config['cell-dim'] <= 0:
-            raise RuntimeError("cell-dim has invalid value {0}.".format(self.config[key]))
+        key = "cell-dim"
+        if self.config["cell-dim"] <= 0:
+            raise RuntimeError(
+                "cell-dim has invalid value {0}.".format(self.config[key])
+            )
 
-        if self.config['delay'] == 0:
+        if self.config["delay"] == 0:
             raise RuntimeError("delay cannot be zero")
 
-        for key in ['self-repair-scale-nonlinearity']:
+        for key in ["self-repair-scale-nonlinearity"]:
             if self.config[key] < 0.0 or self.config[key] > 1.0:
-                raise RuntimeError("{0} has invalid value {1}.".format(key, self.config[key]))
+                raise RuntimeError(
+                    "{0} has invalid value {1}.".format(key, self.config[key])
+                )
 
     def auxiliary_outputs(self):
-        return ['c_t']
+        return ["c_t"]
 
-    def output_name(self, auxiliary_output = None):
-        node_name = 'm_t'
+    def output_name(self, auxiliary_output=None):
+        node_name = "m_t"
         if auxiliary_output is not None:
             if auxiliary_output in self.auxiliary_outputs():
                 node_name = auxiliary_output
             else:
-                raise RuntimeError("Unknown auxiliary output name {0}".format(auxiliary_output))
+                raise RuntimeError(
+                    "Unknown auxiliary output name {0}".format(auxiliary_output)
+                )
 
-        return '{0}.{1}'.format(self.name, node_name)
+        return "{0}.{1}".format(self.name, node_name)
 
-    def output_dim(self, auxiliary_output = None):
+    def output_dim(self, auxiliary_output=None):
         if auxiliary_output is not None:
             if auxiliary_output in self.auxiliary_outputs():
-                if node_name == 'c_t':
-                    return self.config['cell-dim']
+                # note: 'node_name' was undefined in this scope; test auxiliary_output
+                if auxiliary_output == "c_t":
+                    return self.config["cell-dim"]
                 # add code for other auxiliary_outputs here when we decide to expose them
             else:
-                raise RuntimeError("Unknown auxiliary output name {0}".format(auxiliary_output))
+                raise RuntimeError(
+                    "Unknown auxiliary output name {0}".format(auxiliary_output)
+                )
 
-        return
self.config['cell-dim'] + return self.config["cell-dim"] def get_full_config(self): ans = [] config_lines = self._generate_lstm_config() for line in config_lines: - for config_name in ['ref', 'final']: + for config_name in ["ref", "final"]: # we do not support user specified matrices in LSTM initialization # so 'ref' and 'final' configs are the same. ans.append((config_name, line)) @@ -114,42 +123,51 @@ def get_full_config(self): # convenience function to generate the LSTM config def _generate_lstm_config(self): - # assign some variables to reduce verbosity name = self.name # in the below code we will just call descriptor_strings as descriptors for conciseness - input_dim = self.descriptors['input']['dim'] - input_descriptor = self.descriptors['input']['final-string'] - cell_dim = self.config['cell-dim'] - delay = self.config['delay'] - decay_time = self.config['decay-time'] + input_dim = self.descriptors["input"]["dim"] + input_descriptor = self.descriptors["input"]["final-string"] + cell_dim = self.config["cell-dim"] + delay = self.config["delay"] + decay_time = self.config["decay-time"] # we expect decay_time to be either -1, or large, like 10 or 50. - recurrence_scale = (1.0 if decay_time < 0 else - 1.0 - (abs(delay) / decay_time)) - assert recurrence_scale > 0 # or user may have set decay-time much - # too small. - bptrunc_str = ("clipping-threshold={0}" - " zeroing-threshold={1}" - " zeroing-interval={2}" - " recurrence-interval={3}" - " scale={4}" - "".format(self.config['clipping-threshold'], - self.config['zeroing-threshold'], - self.config['zeroing-interval'], - abs(delay), recurrence_scale)) - repair_nonlin = self.config['self-repair-scale-nonlinearity'] - repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else '' - affine_str = self.config['ng-affine-options'] + recurrence_scale = 1.0 if decay_time < 0 else 1.0 - (abs(delay) / decay_time) + assert recurrence_scale > 0 # or user may have set decay-time much + # too small. 
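+        # worked example (illustrative): delay=-3 with decay-time=20 gives
+        # recurrence_scale = 1 - 3/20 = 0.85, so the recurrent state is damped
+        # by 15% per step and the cell's effective memory is limited to roughly
+        # decay-time frames; decay-time=-1.0 (the default) disables the damping.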
+ bptrunc_str = ( + "clipping-threshold={0}" + " zeroing-threshold={1}" + " zeroing-interval={2}" + " recurrence-interval={3}" + " scale={4}" + "".format( + self.config["clipping-threshold"], + self.config["zeroing-threshold"], + self.config["zeroing-interval"], + abs(delay), + recurrence_scale, + ) + ) + repair_nonlin = self.config["self-repair-scale-nonlinearity"] + repair_nonlin_str = ( + "self-repair-scale={0:.10f}".format(repair_nonlin) + if repair_nonlin is not None + else "" + ) + affine_str = self.config["ng-affine-options"] # Natural gradient per element scale parameters - ng_per_element_scale_options = self.config['ng-per-element-scale-options'] - if re.search('param-mean', ng_per_element_scale_options) is None and \ - re.search('param-stddev', ng_per_element_scale_options) is None: - ng_per_element_scale_options += " param-mean=0.0 param-stddev=1.0 " + ng_per_element_scale_options = self.config["ng-per-element-scale-options"] + if ( + re.search("param-mean", ng_per_element_scale_options) is None + and re.search("param-stddev", ng_per_element_scale_options) is None + ): + ng_per_element_scale_options += " param-mean=0.0 param-stddev=1.0 " pes_str = ng_per_element_scale_options - l2_regularize = self.config['l2-regularize'] - l2_regularize_option = ('l2-regularize={0} '.format(l2_regularize) - if l2_regularize != 0.0 else '') - + l2_regularize = self.config["l2-regularize"] + l2_regularize_option = ( + "l2-regularize={0} ".format(l2_regularize) if l2_regularize != 0.0 else "" + ) configs = [] @@ -161,97 +179,190 @@ def _generate_lstm_config(self): configs.append("### Begin LSTM layer '{0}'".format(name)) configs.append("# Input gate control : W_i* matrices") - configs.append("component name={0}.W_i.xr type=NaturalGradientAffineComponent input-dim={1} " - "output-dim={2} {3} {4}".format(name, input_dim + cell_dim, cell_dim, - affine_str, l2_regularize_option)) + configs.append( + "component name={0}.W_i.xr type=NaturalGradientAffineComponent input-dim={1} " + "output-dim={2} {3} {4}".format( + name, input_dim + cell_dim, cell_dim, affine_str, l2_regularize_option + ) + ) configs.append("# note : the cell outputs pass through a diagonal matrix") - configs.append("component name={0}.w_i.c type=NaturalGradientPerElementScaleComponent " - "dim={1} {2} {3} ".format(name, cell_dim, pes_str, - l2_regularize_option)) + configs.append( + "component name={0}.w_i.c type=NaturalGradientPerElementScaleComponent " + "dim={1} {2} {3} ".format(name, cell_dim, pes_str, l2_regularize_option) + ) configs.append("# Forget gate control : W_f* matrices") - configs.append("component name={0}.W_f.xr type=NaturalGradientAffineComponent input-dim={1} " - "output-dim={2} {3} {4}".format(name, input_dim + cell_dim, cell_dim, - affine_str, l2_regularize_option)) + configs.append( + "component name={0}.W_f.xr type=NaturalGradientAffineComponent input-dim={1} " + "output-dim={2} {3} {4}".format( + name, input_dim + cell_dim, cell_dim, affine_str, l2_regularize_option + ) + ) configs.append("# note : the cell outputs pass through a diagonal matrix") - configs.append("component name={0}.w_f.c type=NaturalGradientPerElementScaleComponent " - "dim={1} {2} {3}".format(name, cell_dim, pes_str, l2_regularize_option)) + configs.append( + "component name={0}.w_f.c type=NaturalGradientPerElementScaleComponent " + "dim={1} {2} {3}".format(name, cell_dim, pes_str, l2_regularize_option) + ) configs.append("# Output gate control : W_o* matrices") - configs.append("component name={0}.W_o.xr type=NaturalGradientAffineComponent 
input-dim={1} " - "output-dim={2} {3} {4}".format(name, input_dim + cell_dim, cell_dim, - affine_str, l2_regularize_option)) + configs.append( + "component name={0}.W_o.xr type=NaturalGradientAffineComponent input-dim={1} " + "output-dim={2} {3} {4}".format( + name, input_dim + cell_dim, cell_dim, affine_str, l2_regularize_option + ) + ) configs.append("# note : the cell outputs pass through a diagonal matrix") - configs.append("component name={0}.w_o.c type=NaturalGradientPerElementScaleComponent " - " dim={1} {2} {3}".format(name, cell_dim, pes_str, - l2_regularize_option)) + configs.append( + "component name={0}.w_o.c type=NaturalGradientPerElementScaleComponent " + " dim={1} {2} {3}".format(name, cell_dim, pes_str, l2_regularize_option) + ) configs.append("# Cell input matrices : W_c* matrices") - configs.append("component name={0}.W_c.xr type=NaturalGradientAffineComponent input-dim={1} " - "output-dim={2} {3} {4}".format(name, input_dim + cell_dim, cell_dim, - affine_str, l2_regularize_option)) - + configs.append( + "component name={0}.W_c.xr type=NaturalGradientAffineComponent input-dim={1} " + "output-dim={2} {3} {4}".format( + name, input_dim + cell_dim, cell_dim, affine_str, l2_regularize_option + ) + ) configs.append("# Defining the non-linearities") - configs.append("component name={0}.i type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) - configs.append("component name={0}.f type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) - configs.append("component name={0}.o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) - configs.append("component name={0}.g type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) - configs.append("component name={0}.h type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append( + "component name={0}.i type=SigmoidComponent dim={1} {2}".format( + name, cell_dim, repair_nonlin_str + ) + ) + configs.append( + "component name={0}.f type=SigmoidComponent dim={1} {2}".format( + name, cell_dim, repair_nonlin_str + ) + ) + configs.append( + "component name={0}.o type=SigmoidComponent dim={1} {2}".format( + name, cell_dim, repair_nonlin_str + ) + ) + configs.append( + "component name={0}.g type=TanhComponent dim={1} {2}".format( + name, cell_dim, repair_nonlin_str + ) + ) + configs.append( + "component name={0}.h type=TanhComponent dim={1} {2}".format( + name, cell_dim, repair_nonlin_str + ) + ) configs.append("# Defining the components for other cell computations") - configs.append("component name={0}.c1 type=ElementwiseProductComponent input-dim={1} output-dim={2}" - "".format(name, 2 * cell_dim, cell_dim)) - configs.append("component name={0}.c2 type=ElementwiseProductComponent input-dim={1} output-dim={2}" - "".format(name, 2 * cell_dim, cell_dim)) - configs.append("component name={0}.m type=ElementwiseProductComponent input-dim={1} output-dim={2}" - "".format(name, 2 * cell_dim, cell_dim)) - configs.append("component name={0}.c type=BackpropTruncationComponent dim={1} {2}" - "".format(name, cell_dim, bptrunc_str)) + configs.append( + "component name={0}.c1 type=ElementwiseProductComponent input-dim={1} output-dim={2}" + "".format(name, 2 * cell_dim, cell_dim) + ) + configs.append( + "component name={0}.c2 type=ElementwiseProductComponent input-dim={1} output-dim={2}" + "".format(name, 2 * cell_dim, cell_dim) + ) + configs.append( + "component name={0}.m type=ElementwiseProductComponent input-dim={1} output-dim={2}" + 
"".format(name, 2 * cell_dim, cell_dim) + ) + configs.append( + "component name={0}.c type=BackpropTruncationComponent dim={1} {2}" + "".format(name, cell_dim, bptrunc_str) + ) # c1_t and c2_t defined below - configs.append("component-node name={0}.c_t component={0}.c input=Sum({0}.c1_t, {0}.c2_t)".format(name)) + configs.append( + "component-node name={0}.c_t component={0}.c input=Sum({0}.c1_t, {0}.c2_t)".format( + name + ) + ) delayed_c_t_descriptor = "IfDefined(Offset({0}.c_t, {1}))".format(name, delay) configs.append("# i_t") - configs.append("component-node name={0}.i1_t component={0}.W_i.xr input=Append({1}, IfDefined(Offset({0}.r_t, {2})))" - "".format(name, input_descriptor, delay)) - configs.append("component-node name={0}.i2_t component={0}.w_i.c input={1}".format(name, delayed_c_t_descriptor)) - configs.append("component-node name={0}.i_t component={0}.i input=Sum({0}.i1_t, {0}.i2_t)".format(name)) + configs.append( + "component-node name={0}.i1_t component={0}.W_i.xr input=Append({1}, IfDefined(Offset({0}.r_t, {2})))" + "".format(name, input_descriptor, delay) + ) + configs.append( + "component-node name={0}.i2_t component={0}.w_i.c input={1}".format( + name, delayed_c_t_descriptor + ) + ) + configs.append( + "component-node name={0}.i_t component={0}.i input=Sum({0}.i1_t, {0}.i2_t)".format( + name + ) + ) configs.append("# f_t") - configs.append("component-node name={0}.f1_t component={0}.W_f.xr input=Append({1}, IfDefined(Offset({0}.r_t, {2})))" - "".format(name, input_descriptor, delay)) - configs.append("component-node name={0}.f2_t component={0}.w_f.c input={1}".format(name, delayed_c_t_descriptor)) - configs.append("component-node name={0}.f_t component={0}.f input=Sum({0}.f1_t, {0}.f2_t)".format(name)) + configs.append( + "component-node name={0}.f1_t component={0}.W_f.xr input=Append({1}, IfDefined(Offset({0}.r_t, {2})))" + "".format(name, input_descriptor, delay) + ) + configs.append( + "component-node name={0}.f2_t component={0}.w_f.c input={1}".format( + name, delayed_c_t_descriptor + ) + ) + configs.append( + "component-node name={0}.f_t component={0}.f input=Sum({0}.f1_t, {0}.f2_t)".format( + name + ) + ) configs.append("# o_t") - configs.append("component-node name={0}.o1_t component={0}.W_o.xr input=Append({1}, IfDefined(Offset({0}.r_t, {2})))" - "".format(name, input_descriptor, delay)) - configs.append("component-node name={0}.o2_t component={0}.w_o.c input={0}.c_t".format(name)) - configs.append("component-node name={0}.o_t component={0}.o input=Sum({0}.o1_t, {0}.o2_t)".format(name)) + configs.append( + "component-node name={0}.o1_t component={0}.W_o.xr input=Append({1}, IfDefined(Offset({0}.r_t, {2})))" + "".format(name, input_descriptor, delay) + ) + configs.append( + "component-node name={0}.o2_t component={0}.w_o.c input={0}.c_t".format( + name + ) + ) + configs.append( + "component-node name={0}.o_t component={0}.o input=Sum({0}.o1_t, {0}.o2_t)".format( + name + ) + ) configs.append("# h_t") - configs.append("component-node name={0}.h_t component={0}.h input={0}.c_t".format(name)) + configs.append( + "component-node name={0}.h_t component={0}.h input={0}.c_t".format(name) + ) configs.append("# g_t") - configs.append("component-node name={0}.g1_t component={0}.W_c.xr input=Append({1}, IfDefined(Offset({0}.r_t, {2})))" - "".format(name, input_descriptor, delay)) - configs.append("component-node name={0}.g_t component={0}.g input={0}.g1_t".format(name)) + configs.append( + "component-node name={0}.g1_t component={0}.W_c.xr input=Append({1}, 
IfDefined(Offset({0}.r_t, {2})))" + "".format(name, input_descriptor, delay) + ) + configs.append( + "component-node name={0}.g_t component={0}.g input={0}.g1_t".format(name) + ) configs.append("# parts of c_t") - configs.append("component-node name={0}.c1_t component={0}.c1 input=Append({0}.f_t, {1})" - "".format(name, delayed_c_t_descriptor)) - configs.append("component-node name={0}.c2_t component={0}.c2 input=Append({0}.i_t, {0}.g_t)" - "".format(name)) + configs.append( + "component-node name={0}.c1_t component={0}.c1 input=Append({0}.f_t, {1})" + "".format(name, delayed_c_t_descriptor) + ) + configs.append( + "component-node name={0}.c2_t component={0}.c2 input=Append({0}.i_t, {0}.g_t)" + "".format(name) + ) configs.append("# m_t") - configs.append("component-node name={0}.m_t component={0}.m input=Append({0}.o_t, {0}.h_t)" - "".format(name)) + configs.append( + "component-node name={0}.m_t component={0}.m input=Append({0}.o_t, {0}.h_t)" + "".format(name) + ) # add the recurrent connections - configs.append("component name={0}.r type=BackpropTruncationComponent dim={1} {2}" - "".format(name, cell_dim, bptrunc_str)) - configs.append("component-node name={0}.r_t component={0}.r input={0}.m_t".format(name)) + configs.append( + "component name={0}.r type=BackpropTruncationComponent dim={1} {2}" + "".format(name, cell_dim, bptrunc_str) + ) + configs.append( + "component-node name={0}.r_t component={0}.r input={0}.m_t".format(name) + ) configs.append("### End LSTM layer '{0}'".format(name)) return configs @@ -293,97 +404,124 @@ def _generate_lstm_config(self): # accumulated in c_t.] # l2-regularize=0.0 Constant controlling l2 regularization for this layer class XconfigLstmpLayer(XconfigLayerBase): - def __init__(self, first_token, key_to_value, prev_names = None): + def __init__(self, first_token, key_to_value, prev_names=None): # lstmp-batchnorm-layer is like lstmp-layer but followed by a batchnorm # component. assert first_token in ["lstmp-layer", "lstmp-batchnorm-layer"] XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): - self.config = {'input' : '[-1]', - 'cell-dim' : -1, # this is a compulsory argument - 'recurrent-projection-dim' : -1, # defaults to cell-dim / 4 - 'non-recurrent-projection-dim' : -1, # defaults to - # recurrent-projection-dim - 'clipping-threshold' : 30.0, - 'delay' : -1, - 'ng-per-element-scale-options' : ' max-change=0.75 ', - 'ng-affine-options' : ' max-change=0.75 ', - 'self-repair-scale-nonlinearity' : 0.00001, - 'zeroing-interval' : 20, - 'zeroing-threshold' : 15.0, - 'dropout-proportion' : -1.0, # If -1.0, no dropout components will be added - 'dropout-per-frame' : False, # If false, regular dropout, not per frame. - 'decay-time': -1.0, - 'l2-regularize': 0.0, - } + self.config = { + "input": "[-1]", + "cell-dim": -1, # this is a compulsory argument + "recurrent-projection-dim": -1, # defaults to cell-dim / 4 + "non-recurrent-projection-dim": -1, # defaults to + # recurrent-projection-dim + "clipping-threshold": 30.0, + "delay": -1, + "ng-per-element-scale-options": " max-change=0.75 ", + "ng-affine-options": " max-change=0.75 ", + "self-repair-scale-nonlinearity": 0.00001, + "zeroing-interval": 20, + "zeroing-threshold": 15.0, + "dropout-proportion": -1.0, # If -1.0, no dropout components will be added + "dropout-per-frame": False, # If false, regular dropout, not per frame. 
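+            # decay-time is expected to be either -1 (disabled) or fairly
+            # large, e.g. 10 or 50; see _generate_lstm_config below, where it
+            # sets the scale on the recurrence.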
+            "decay-time": -1.0,
+            "l2-regularize": 0.0,
+        }

     def set_derived_configs(self):
-        if self.config['recurrent-projection-dim'] <= 0:
-            self.config['recurrent-projection-dim'] = self.config['cell-dim'] / 4
+        if self.config["recurrent-projection-dim"] <= 0:
+            # use integer division so the derived dim stays an int under python 3
+            self.config["recurrent-projection-dim"] = self.config["cell-dim"] // 4

-        if self.config['non-recurrent-projection-dim'] <= 0:
-            self.config['non-recurrent-projection-dim'] = \
-                self.config['recurrent-projection-dim']
+        if self.config["non-recurrent-projection-dim"] <= 0:
+            self.config["non-recurrent-projection-dim"] = self.config[
+                "recurrent-projection-dim"
+            ]

     def check_configs(self):
-        for key in ['cell-dim', 'recurrent-projection-dim',
-                    'non-recurrent-projection-dim']:
+        for key in [
+            "cell-dim",
+            "recurrent-projection-dim",
+            "non-recurrent-projection-dim",
+        ]:
             if self.config[key] <= 0:
-                raise RuntimeError("{0} has invalid value {1}.".format(
-                    key, self.config[key]))
+                raise RuntimeError(
+                    "{0} has invalid value {1}.".format(key, self.config[key])
+                )

-        if self.config['delay'] == 0:
+        if self.config["delay"] == 0:
             raise RuntimeError("delay cannot be zero")

-        if (self.config['recurrent-projection-dim'] +
-            self.config['non-recurrent-projection-dim'] >
-            self.config['cell-dim']):
-            raise RuntimeError("recurrent+non-recurrent projection dim exceeds "
-                               "cell dim.")
-        for key in ['self-repair-scale-nonlinearity']:
+        if (
+            self.config["recurrent-projection-dim"]
+            + self.config["non-recurrent-projection-dim"]
+            > self.config["cell-dim"]
+        ):
+            raise RuntimeError(
+                "recurrent+non-recurrent projection dim exceeds cell dim."
+            )
+        for key in ["self-repair-scale-nonlinearity"]:
             if self.config[key] < 0.0 or self.config[key] > 1.0:
-                raise RuntimeError("{0} has invalid value {2}."
-                                   .format(self.layer_type, key,
-                                           self.config[key]))
-
-        if ((self.config['dropout-proportion'] > 1.0 or
-             self.config['dropout-proportion'] < 0.0) and
-            self.config['dropout-proportion'] != -1.0 ):
-            raise RuntimeError("dropout-proportion has invalid value {0}."
-                                   .format(self.config['dropout-proportion']))
+                raise RuntimeError(
+                    "In {0}, {1} has invalid value {2}.".format(
+                        self.layer_type, key, self.config[key]
+                    )
+                )
+
+        if (
+            self.config["dropout-proportion"] > 1.0
+            or self.config["dropout-proportion"] < 0.0
+        ) and self.config["dropout-proportion"] != -1.0:
+            raise RuntimeError(
+                "dropout-proportion has invalid value {0}.".format(
+                    self.config["dropout-proportion"]
+                )
+            )

     def auxiliary_outputs(self):
-        return ['c_t']
+        return ["c_t"]

-    def output_name(self, auxiliary_output = None):
-        node_name = ( 'rp_t_batchnorm' if self.layer_type == 'lstmp-batchnorm-layer'
-                      else 'rp_t' )
+    def output_name(self, auxiliary_output=None):
+        node_name = (
+            "rp_t_batchnorm" if self.layer_type == "lstmp-batchnorm-layer" else "rp_t"
+        )
         if auxiliary_output is not None:
             if auxiliary_output in self.auxiliary_outputs():
                 node_name = auxiliary_output
             else:
-                raise Exception("In {0} of type {1}, unknown auxiliary output name {1}".format(self.layer_type, auxiliary_output))
+                raise Exception(
+                    "In {0} of type {1}, unknown auxiliary output name {2}".format(
+                        self.name, self.layer_type, auxiliary_output
+                    )
+                )

-        return '{0}.{1}'.format(self.name, node_name)
+        return "{0}.{1}".format(self.name, node_name)

-    def output_dim(self, auxiliary_output = None):
+    def output_dim(self, auxiliary_output=None):
         if auxiliary_output is not None:
             if auxiliary_output in self.auxiliary_outputs():
-                if node_name == 'c_t':
-                    return self.config['cell-dim']
+                # compare the requested name itself ('node_name' is not
+                # defined in this method)
+                if auxiliary_output == "c_t":
+                    return self.config["cell-dim"]
                 # add code for other auxiliary_outputs here when we decide to expose them
             else:
-                raise Exception("In {0} of type {1}, unknown auxiliary output name {1}".format(self.layer_type, auxiliary_output))
+                raise Exception(
+                    "In {0} of type {1}, unknown auxiliary output name {2}".format(
+                        self.name, self.layer_type, auxiliary_output
+                    )
+                )

-        return self.config['recurrent-projection-dim'] + self.config['non-recurrent-projection-dim']
+        return (
+            self.config["recurrent-projection-dim"]
+            + self.config["non-recurrent-projection-dim"]
+        )

     def get_full_config(self):
         ans = []
         config_lines = self._generate_lstm_config()
         for line in config_lines:
-            for config_name in ['ref', 'final']:
+            for config_name in ["ref", "final"]:
                 # we do not support user specified matrices in LSTM initialization
                 # so 'ref' and 'final' configs are the same.
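+                # (Editorial note: in xconfig, the 'ref' config is only used to
+                # work out the model's left/right context; 'final' is the one
+                # the network is actually initialized from.)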
ans.append((config_name, line)) @@ -391,45 +529,55 @@ def get_full_config(self): # convenience function to generate the LSTM config def _generate_lstm_config(self): - # assign some variables to reduce verbosity name = self.name # in the below code we will just call descriptor_strings as descriptors for conciseness - input_dim = self.descriptors['input']['dim'] - input_descriptor = self.descriptors['input']['final-string'] - cell_dim = self.config['cell-dim'] - rec_proj_dim = self.config['recurrent-projection-dim'] - nonrec_proj_dim = self.config['non-recurrent-projection-dim'] - delay = self.config['delay'] - repair_nonlin = self.config['self-repair-scale-nonlinearity'] - repair_nonlin_str = "self-repair-scale={0:.10f}".format(repair_nonlin) if repair_nonlin is not None else '' - decay_time = self.config['decay-time'] + input_dim = self.descriptors["input"]["dim"] + input_descriptor = self.descriptors["input"]["final-string"] + cell_dim = self.config["cell-dim"] + rec_proj_dim = self.config["recurrent-projection-dim"] + nonrec_proj_dim = self.config["non-recurrent-projection-dim"] + delay = self.config["delay"] + repair_nonlin = self.config["self-repair-scale-nonlinearity"] + repair_nonlin_str = ( + "self-repair-scale={0:.10f}".format(repair_nonlin) + if repair_nonlin is not None + else "" + ) + decay_time = self.config["decay-time"] # we expect decay_time to be either -1, or large, like 10 or 50. - recurrence_scale = (1.0 if decay_time < 0 else - 1.0 - (abs(delay) / decay_time)) - assert recurrence_scale > 0 # or user may have set decay-time much - # too small. - bptrunc_str = ("clipping-threshold={0}" - " zeroing-threshold={1}" - " zeroing-interval={2}" - " recurrence-interval={3}" - " scale={4}" - "".format(self.config['clipping-threshold'], - self.config['zeroing-threshold'], - self.config['zeroing-interval'], - abs(delay), recurrence_scale)) - affine_str = self.config['ng-affine-options'] - pes_str = self.config['ng-per-element-scale-options'] - dropout_proportion = self.config['dropout-proportion'] - dropout_per_frame = 'true' if self.config['dropout-per-frame'] else 'false' + recurrence_scale = 1.0 if decay_time < 0 else 1.0 - (abs(delay) / decay_time) + assert recurrence_scale > 0 # or user may have set decay-time much + # too small. 
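+        # Editorial note, a worked example with hypothetical values: delay=-3
+        # and decay-time=20 give recurrence_scale = 1 - 3/20 = 0.85; any
+        # decay-time <= 3 would make the scale non-positive and trip the
+        # assert above.  With the default thresholds, bptrunc_str below then
+        # reads: clipping-threshold=30.0 zeroing-threshold=15.0
+        # zeroing-interval=20 recurrence-interval=3 scale=0.85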
+ bptrunc_str = ( + "clipping-threshold={0}" + " zeroing-threshold={1}" + " zeroing-interval={2}" + " recurrence-interval={3}" + " scale={4}" + "".format( + self.config["clipping-threshold"], + self.config["zeroing-threshold"], + self.config["zeroing-interval"], + abs(delay), + recurrence_scale, + ) + ) + affine_str = self.config["ng-affine-options"] + pes_str = self.config["ng-per-element-scale-options"] + dropout_proportion = self.config["dropout-proportion"] + dropout_per_frame = "true" if self.config["dropout-per-frame"] else "false" # Natural gradient per element scale parameters - if re.search('param-mean', pes_str) is None and \ - re.search('param-stddev', pes_str) is None: - pes_str += " param-mean=0.0 param-stddev=1.0 " - l2_regularize = self.config['l2-regularize'] - l2_regularize_option = ('l2-regularize={0} '.format(l2_regularize) - if l2_regularize != 0.0 else '') + if ( + re.search("param-mean", pes_str) is None + and re.search("param-stddev", pes_str) is None + ): + pes_str += " param-mean=0.0 param-stddev=1.0 " + l2_regularize = self.config["l2-regularize"] + l2_regularize_option = ( + "l2-regularize={0} ".format(l2_regularize) if l2_regularize != 0.0 else "" + ) configs = [] @@ -442,124 +590,290 @@ def _generate_lstm_config(self): # e.g. Lstm1.W_i.xr for matrix providing output to gate i and operating # on an appended vector [x,r] configs.append("# Input gate control : W_i* matrices") - configs.append("component name={0}.W_i.xr type=NaturalGradientAffineComponent input-dim={1} " - "output-dim={2} {3} {4}".format(name, input_dim + rec_proj_dim, - cell_dim, affine_str, l2_regularize_option)) + configs.append( + "component name={0}.W_i.xr type=NaturalGradientAffineComponent input-dim={1} " + "output-dim={2} {3} {4}".format( + name, + input_dim + rec_proj_dim, + cell_dim, + affine_str, + l2_regularize_option, + ) + ) configs.append("# note : the cell outputs pass through a diagonal matrix") - configs.append("component name={0}.w_i.c type=NaturalGradientPerElementScaleComponent " - "dim={1} {2} {3}".format(name, cell_dim, pes_str, - l2_regularize_option)) + configs.append( + "component name={0}.w_i.c type=NaturalGradientPerElementScaleComponent " + "dim={1} {2} {3}".format(name, cell_dim, pes_str, l2_regularize_option) + ) configs.append("# Forget gate control : W_f* matrices") - configs.append("component name={0}.W_f.xr type=NaturalGradientAffineComponent input-dim={1} " - "output-dim={2} {3} {4}".format(name, input_dim + rec_proj_dim, cell_dim, - affine_str, l2_regularize_option)) + configs.append( + "component name={0}.W_f.xr type=NaturalGradientAffineComponent input-dim={1} " + "output-dim={2} {3} {4}".format( + name, + input_dim + rec_proj_dim, + cell_dim, + affine_str, + l2_regularize_option, + ) + ) configs.append("# note : the cell outputs pass through a diagonal matrix") - configs.append("component name={0}.w_f.c type=NaturalGradientPerElementScaleComponent " - "dim={1} {2} {3}".format(name, cell_dim, pes_str, l2_regularize_option)) + configs.append( + "component name={0}.w_f.c type=NaturalGradientPerElementScaleComponent " + "dim={1} {2} {3}".format(name, cell_dim, pes_str, l2_regularize_option) + ) configs.append("# Output gate control : W_o* matrices") - configs.append("component name={0}.W_o.xr type=NaturalGradientAffineComponent input-dim={1} " - "output-dim={2} {3} {4}".format(name, input_dim + rec_proj_dim, cell_dim, - affine_str, l2_regularize_option)) + configs.append( + "component name={0}.W_o.xr type=NaturalGradientAffineComponent input-dim={1} " + 
"output-dim={2} {3} {4}".format( + name, + input_dim + rec_proj_dim, + cell_dim, + affine_str, + l2_regularize_option, + ) + ) configs.append("# note : the cell outputs pass through a diagonal matrix") - configs.append("component name={0}.w_o.c type=NaturalGradientPerElementScaleComponent " - "dim={1} {2} {3}".format(name, cell_dim, pes_str, l2_regularize_option)) + configs.append( + "component name={0}.w_o.c type=NaturalGradientPerElementScaleComponent " + "dim={1} {2} {3}".format(name, cell_dim, pes_str, l2_regularize_option) + ) configs.append("# Cell input matrices : W_c* matrices") - configs.append("component name={0}.W_c.xr type=NaturalGradientAffineComponent input-dim={1} " - "output-dim={2} {3} {4}".format(name, input_dim + rec_proj_dim, cell_dim, - affine_str, l2_regularize_option)) + configs.append( + "component name={0}.W_c.xr type=NaturalGradientAffineComponent input-dim={1} " + "output-dim={2} {3} {4}".format( + name, + input_dim + rec_proj_dim, + cell_dim, + affine_str, + l2_regularize_option, + ) + ) configs.append("# Defining the non-linearities") - configs.append("component name={0}.i type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) - configs.append("component name={0}.f type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) - configs.append("component name={0}.o type=SigmoidComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) - configs.append("component name={0}.g type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) - configs.append("component name={0}.h type=TanhComponent dim={1} {2}".format(name, cell_dim, repair_nonlin_str)) + configs.append( + "component name={0}.i type=SigmoidComponent dim={1} {2}".format( + name, cell_dim, repair_nonlin_str + ) + ) + configs.append( + "component name={0}.f type=SigmoidComponent dim={1} {2}".format( + name, cell_dim, repair_nonlin_str + ) + ) + configs.append( + "component name={0}.o type=SigmoidComponent dim={1} {2}".format( + name, cell_dim, repair_nonlin_str + ) + ) + configs.append( + "component name={0}.g type=TanhComponent dim={1} {2}".format( + name, cell_dim, repair_nonlin_str + ) + ) + configs.append( + "component name={0}.h type=TanhComponent dim={1} {2}".format( + name, cell_dim, repair_nonlin_str + ) + ) if dropout_proportion != -1.0: - configs.append("component name={0}.dropout type=DropoutComponent dim={1} " - "dropout-proportion={2} dropout-per-frame={3}" - .format(name, cell_dim, dropout_proportion, dropout_per_frame)) + configs.append( + "component name={0}.dropout type=DropoutComponent dim={1} " + "dropout-proportion={2} dropout-per-frame={3}".format( + name, cell_dim, dropout_proportion, dropout_per_frame + ) + ) configs.append("# Defining the components for other cell computations") - configs.append("component name={0}.c1 type=ElementwiseProductComponent input-dim={1} output-dim={2}" - "".format(name, 2 * cell_dim, cell_dim)) - configs.append("component name={0}.c2 type=ElementwiseProductComponent input-dim={1} output-dim={2}" - "".format(name, 2 * cell_dim, cell_dim)) - configs.append("component name={0}.m type=ElementwiseProductComponent input-dim={1} output-dim={2}" - "".format(name, 2 * cell_dim, cell_dim)) - configs.append("component name={0}.c type=BackpropTruncationComponent dim={1} {2}" - "".format(name, cell_dim, bptrunc_str)) + configs.append( + "component name={0}.c1 type=ElementwiseProductComponent input-dim={1} output-dim={2}" + "".format(name, 2 * cell_dim, cell_dim) + ) + configs.append( + "component name={0}.c2 
type=ElementwiseProductComponent input-dim={1} output-dim={2}" + "".format(name, 2 * cell_dim, cell_dim) + ) + configs.append( + "component name={0}.m type=ElementwiseProductComponent input-dim={1} output-dim={2}" + "".format(name, 2 * cell_dim, cell_dim) + ) + configs.append( + "component name={0}.c type=BackpropTruncationComponent dim={1} {2}" + "".format(name, cell_dim, bptrunc_str) + ) # c1_t and c2_t defined below - configs.append("component-node name={0}.c_t component={0}.c input=Sum({0}.c1_t, {0}.c2_t)".format(name)) + configs.append( + "component-node name={0}.c_t component={0}.c input=Sum({0}.c1_t, {0}.c2_t)".format( + name + ) + ) delayed_c_t_descriptor = "IfDefined(Offset({0}.c_t, {1}))".format(name, delay) - recurrent_connection = '{0}.r_t'.format(name) + recurrent_connection = "{0}.r_t".format(name) configs.append("# i_t") - configs.append("component-node name={0}.i1_t component={0}.W_i.xr input=Append({1}, IfDefined(Offset({2}, {3})))" - "".format(name, input_descriptor, recurrent_connection, delay)) - configs.append("component-node name={0}.i2_t component={0}.w_i.c input={1}".format(name, delayed_c_t_descriptor)) + configs.append( + "component-node name={0}.i1_t component={0}.W_i.xr input=Append({1}, IfDefined(Offset({2}, {3})))" + "".format(name, input_descriptor, recurrent_connection, delay) + ) + configs.append( + "component-node name={0}.i2_t component={0}.w_i.c input={1}".format( + name, delayed_c_t_descriptor + ) + ) if dropout_proportion != -1.0: - configs.append("component-node name={0}.i_t_predrop component={0}.i input=Sum({0}.i1_t, {0}.i2_t)".format(name)) - configs.append("component-node name={0}.i_t component={0}.dropout input={0}.i_t_predrop".format(name)) + configs.append( + "component-node name={0}.i_t_predrop component={0}.i input=Sum({0}.i1_t, {0}.i2_t)".format( + name + ) + ) + configs.append( + "component-node name={0}.i_t component={0}.dropout input={0}.i_t_predrop".format( + name + ) + ) else: - configs.append("component-node name={0}.i_t component={0}.i input=Sum({0}.i1_t, {0}.i2_t)".format(name)) + configs.append( + "component-node name={0}.i_t component={0}.i input=Sum({0}.i1_t, {0}.i2_t)".format( + name + ) + ) configs.append("# f_t") - configs.append("component-node name={0}.f1_t component={0}.W_f.xr input=Append({1}, IfDefined(Offset({2}, {3})))" - "".format(name, input_descriptor, recurrent_connection, delay)) - configs.append("component-node name={0}.f2_t component={0}.w_f.c input={1}".format(name, delayed_c_t_descriptor)) + configs.append( + "component-node name={0}.f1_t component={0}.W_f.xr input=Append({1}, IfDefined(Offset({2}, {3})))" + "".format(name, input_descriptor, recurrent_connection, delay) + ) + configs.append( + "component-node name={0}.f2_t component={0}.w_f.c input={1}".format( + name, delayed_c_t_descriptor + ) + ) if dropout_proportion != -1.0: - configs.append("component-node name={0}.f_t_predrop component={0}.f input=Sum({0}.f1_t, {0}.f2_t)".format(name)) - configs.append("component-node name={0}.f_t component={0}.dropout input={0}.f_t_predrop".format(name)) + configs.append( + "component-node name={0}.f_t_predrop component={0}.f input=Sum({0}.f1_t, {0}.f2_t)".format( + name + ) + ) + configs.append( + "component-node name={0}.f_t component={0}.dropout input={0}.f_t_predrop".format( + name + ) + ) else: - configs.append("component-node name={0}.f_t component={0}.f input=Sum({0}.f1_t, {0}.f2_t)".format(name)) + configs.append( + "component-node name={0}.f_t component={0}.f input=Sum({0}.f1_t, {0}.f2_t)".format( + name + ) + ) 
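+        # Editorial note: when dropout-proportion != -1.0, the DropoutComponent
+        # defined above is applied to the gate activations (i_t and f_t above,
+        # o_t below); the cell value c_t itself is never masked.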
configs.append("# o_t") - configs.append("component-node name={0}.o1_t component={0}.W_o.xr input=Append({1}, IfDefined(Offset({2}, {3})))".format(name, input_descriptor, recurrent_connection, delay)) - configs.append("component-node name={0}.o2_t component={0}.w_o.c input={0}.c_t".format(name)) + configs.append( + "component-node name={0}.o1_t component={0}.W_o.xr input=Append({1}, IfDefined(Offset({2}, {3})))".format( + name, input_descriptor, recurrent_connection, delay + ) + ) + configs.append( + "component-node name={0}.o2_t component={0}.w_o.c input={0}.c_t".format( + name + ) + ) if dropout_proportion != -1.0: - configs.append("component-node name={0}.o_t_predrop component={0}.o input=Sum({0}.o1_t, {0}.o2_t)".format(name)) - configs.append("component-node name={0}.o_t component={0}.dropout input={0}.o_t_predrop".format(name)) + configs.append( + "component-node name={0}.o_t_predrop component={0}.o input=Sum({0}.o1_t, {0}.o2_t)".format( + name + ) + ) + configs.append( + "component-node name={0}.o_t component={0}.dropout input={0}.o_t_predrop".format( + name + ) + ) else: - configs.append("component-node name={0}.o_t component={0}.o input=Sum({0}.o1_t, {0}.o2_t)".format(name)) + configs.append( + "component-node name={0}.o_t component={0}.o input=Sum({0}.o1_t, {0}.o2_t)".format( + name + ) + ) configs.append("# h_t") - configs.append("component-node name={0}.h_t component={0}.h input={0}.c_t".format(name)) + configs.append( + "component-node name={0}.h_t component={0}.h input={0}.c_t".format(name) + ) configs.append("# g_t") - configs.append("component-node name={0}.g1_t component={0}.W_c.xr input=Append({1}, IfDefined(Offset({2}, {3})))" - "".format(name, input_descriptor, recurrent_connection, delay)) - configs.append("component-node name={0}.g_t component={0}.g input={0}.g1_t".format(name)) + configs.append( + "component-node name={0}.g1_t component={0}.W_c.xr input=Append({1}, IfDefined(Offset({2}, {3})))" + "".format(name, input_descriptor, recurrent_connection, delay) + ) + configs.append( + "component-node name={0}.g_t component={0}.g input={0}.g1_t".format(name) + ) configs.append("# parts of c_t") - configs.append("component-node name={0}.c1_t component={0}.c1 input=Append({0}.f_t, {1})".format(name, delayed_c_t_descriptor)) - configs.append("component-node name={0}.c2_t component={0}.c2 input=Append({0}.i_t, {0}.g_t)".format(name)) + configs.append( + "component-node name={0}.c1_t component={0}.c1 input=Append({0}.f_t, {1})".format( + name, delayed_c_t_descriptor + ) + ) + configs.append( + "component-node name={0}.c2_t component={0}.c2 input=Append({0}.i_t, {0}.g_t)".format( + name + ) + ) configs.append("# m_t") - configs.append("component-node name={0}.m_t component={0}.m input=Append({0}.o_t, {0}.h_t)".format(name)) + configs.append( + "component-node name={0}.m_t component={0}.m input=Append({0}.o_t, {0}.h_t)".format( + name + ) + ) # add the recurrent connections configs.append("# projection matrices : Wrm and Wpm") - configs.append("component name={0}.W_rp.m type=NaturalGradientAffineComponent input-dim={1} " - "output-dim={2} {3} {4}".format(name, cell_dim, rec_proj_dim + nonrec_proj_dim, - affine_str, l2_regularize_option)) - configs.append("component name={0}.r type=BackpropTruncationComponent dim={1} {2}" - "".format(name, rec_proj_dim, bptrunc_str)) - - configs.append("# r_t and p_t : rp_t will be the output (if we're not doing batchnorm)") - configs.append("component-node name={0}.rp_t component={0}.W_rp.m input={0}.m_t" - "".format(name)) - 
configs.append("dim-range-node name={0}.r_t_preclip input-node={0}.rp_t dim-offset=0 " - "dim={1}".format(name, rec_proj_dim)) - configs.append("component-node name={0}.r_t component={0}.r input={0}.r_t_preclip".format(name)) + configs.append( + "component name={0}.W_rp.m type=NaturalGradientAffineComponent input-dim={1} " + "output-dim={2} {3} {4}".format( + name, + cell_dim, + rec_proj_dim + nonrec_proj_dim, + affine_str, + l2_regularize_option, + ) + ) + configs.append( + "component name={0}.r type=BackpropTruncationComponent dim={1} {2}" + "".format(name, rec_proj_dim, bptrunc_str) + ) + + configs.append( + "# r_t and p_t : rp_t will be the output (if we're not doing batchnorm)" + ) + configs.append( + "component-node name={0}.rp_t component={0}.W_rp.m input={0}.m_t" + "".format(name) + ) + configs.append( + "dim-range-node name={0}.r_t_preclip input-node={0}.rp_t dim-offset=0 " + "dim={1}".format(name, rec_proj_dim) + ) + configs.append( + "component-node name={0}.r_t component={0}.r input={0}.r_t_preclip".format( + name + ) + ) if self.layer_type == "lstmp-batchnorm-layer": # Add the batchnorm component, if requested to include batchnorm. - configs.append("component name={0}.rp_t_batchnorm type=BatchNormComponent dim={1} ".format( - name, rec_proj_dim + nonrec_proj_dim)) - configs.append("component-node name={0}.rp_t_batchnorm component={0}.rp_t_batchnorm " - "input={0}.rp_t".format(name)) + configs.append( + "component name={0}.rp_t_batchnorm type=BatchNormComponent dim={1} ".format( + name, rec_proj_dim + nonrec_proj_dim + ) + ) + configs.append( + "component-node name={0}.rp_t_batchnorm component={0}.rp_t_batchnorm " + "input={0}.rp_t".format(name) + ) return configs @@ -599,73 +913,79 @@ def _generate_lstm_config(self): # accumulated in c_t.] # l2-regularize=0.0 Constant controlling l2 regularization for this layer class XconfigFastLstmLayer(XconfigLayerBase): - def __init__(self, first_token, key_to_value, prev_names = None): + def __init__(self, first_token, key_to_value, prev_names=None): assert first_token in ["fast-lstm-layer", "fast-lstm-batchnorm-layer"] XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): - self.config = {'input':'[-1]', - 'cell-dim' : -1, # this is a compulsory argument - 'clipping-threshold' : 30.0, - 'zeroing-interval' : 20, - 'zeroing-threshold' : 15.0, - 'delay' : -1, - # if you want to set 'self-repair-scale' (c.f. the - # self-repair-scale-nonlinearity config value in older LSTM layers), you can - # add 'self-repair-scale=xxx' to - # lstm-nonlinearity-options. - 'lstm-nonlinearity-options' : ' max-change=0.75', - # the affine layer contains 4 of our old layers -> use a - # larger max-change than the normal value of 0.75. - 'ng-affine-options' : ' max-change=1.5', - 'l2-regularize': 0.0, - 'decay-time': -1.0 - } + self.config = { + "input": "[-1]", + "cell-dim": -1, # this is a compulsory argument + "clipping-threshold": 30.0, + "zeroing-interval": 20, + "zeroing-threshold": 15.0, + "delay": -1, + # if you want to set 'self-repair-scale' (c.f. the + # self-repair-scale-nonlinearity config value in older LSTM layers), you can + # add 'self-repair-scale=xxx' to + # lstm-nonlinearity-options. + "lstm-nonlinearity-options": " max-change=0.75", + # the affine layer contains 4 of our old layers -> use a + # larger max-change than the normal value of 0.75. 
+ "ng-affine-options": " max-change=1.5", + "l2-regularize": 0.0, + "decay-time": -1.0, + } self.c_needed = False # keep track of whether the 'c' output is needed. def set_derived_configs(self): - if self.config['cell-dim'] <= 0: - self.config['cell-dim'] = self.descriptors['input']['dim'] + if self.config["cell-dim"] <= 0: + self.config["cell-dim"] = self.descriptors["input"]["dim"] def check_configs(self): - key = 'cell-dim' - if self.config['cell-dim'] <= 0: - raise RuntimeError("cell-dim has invalid value {0}.".format(self.config[key])) - if self.config['delay'] == 0: + key = "cell-dim" + if self.config["cell-dim"] <= 0: + raise RuntimeError( + "cell-dim has invalid value {0}.".format(self.config[key]) + ) + if self.config["delay"] == 0: raise RuntimeError("delay cannot be zero") - - def auxiliary_outputs(self): - return ['c'] + return ["c"] - def output_name(self, auxiliary_output = None): - node_name = ('m_batchnorm' if self.layer_type == 'fast-lstm-batchnorm-layer' - else 'm') + def output_name(self, auxiliary_output=None): + node_name = ( + "m_batchnorm" if self.layer_type == "fast-lstm-batchnorm-layer" else "m" + ) if auxiliary_output is not None: - if auxiliary_output == 'c': - node_name = 'c' + if auxiliary_output == "c": + node_name = "c" self.c_needed = True else: - raise RuntimeError("Unknown auxiliary output name {0}".format(auxiliary_output)) - return '{0}.{1}'.format(self.name, node_name) + raise RuntimeError( + "Unknown auxiliary output name {0}".format(auxiliary_output) + ) + return "{0}.{1}".format(self.name, node_name) - def output_dim(self, auxiliary_output = None): + def output_dim(self, auxiliary_output=None): if auxiliary_output is not None: - if auxiliary_output == 'c': + if auxiliary_output == "c": self.c_needed = True - return self.config['cell-dim'] + return self.config["cell-dim"] # add code for other auxiliary_outputs here when we decide to expose them else: - raise RuntimeError("Unknown auxiliary output name {0}".format(auxiliary_output)) - return self.config['cell-dim'] + raise RuntimeError( + "Unknown auxiliary output name {0}".format(auxiliary_output) + ) + return self.config["cell-dim"] def get_full_config(self): ans = [] config_lines = self._generate_lstm_config() for line in config_lines: - for config_name in ['ref', 'final']: + for config_name in ["ref", "final"]: # we do not support user specified matrices in LSTM initialization # so 'ref' and 'final' configs are the same. 
ans.append((config_name, line)) @@ -673,35 +993,38 @@ def get_full_config(self): # convenience function to generate the LSTM config def _generate_lstm_config(self): - # assign some variables to reduce verbosity name = self.name # in the below code we will just call descriptor_strings as descriptors for conciseness - input_dim = self.descriptors['input']['dim'] - input_descriptor = self.descriptors['input']['final-string'] - cell_dim = self.config['cell-dim'] - delay = self.config['delay'] - affine_str = self.config['ng-affine-options'] - l2_regularize = self.config['l2-regularize'] - l2_regularize_option = ('l2-regularize={0} '.format(l2_regularize) - if l2_regularize != 0.0 else '') - decay_time = self.config['decay-time'] + input_dim = self.descriptors["input"]["dim"] + input_descriptor = self.descriptors["input"]["final-string"] + cell_dim = self.config["cell-dim"] + delay = self.config["delay"] + affine_str = self.config["ng-affine-options"] + l2_regularize = self.config["l2-regularize"] + l2_regularize_option = ( + "l2-regularize={0} ".format(l2_regularize) if l2_regularize != 0.0 else "" + ) + decay_time = self.config["decay-time"] # we expect decay_time to be either -1, or large, like 10 or 50. - recurrence_scale = (1.0 if decay_time < 0 else - 1.0 - (abs(delay) / decay_time)) - assert recurrence_scale > 0 # or user may have set decay-time much - # too small. - bptrunc_str = ("clipping-threshold={0}" - " zeroing-threshold={1}" - " zeroing-interval={2}" - " recurrence-interval={3}" - " scale={4}" - "".format(self.config['clipping-threshold'], - self.config['zeroing-threshold'], - self.config['zeroing-interval'], - abs(delay), recurrence_scale)) - lstm_str = self.config['lstm-nonlinearity-options'] - + recurrence_scale = 1.0 if decay_time < 0 else 1.0 - (abs(delay) / decay_time) + assert recurrence_scale > 0 # or user may have set decay-time much + # too small. + bptrunc_str = ( + "clipping-threshold={0}" + " zeroing-threshold={1}" + " zeroing-interval={2}" + " recurrence-interval={3}" + " scale={4}" + "".format( + self.config["clipping-threshold"], + self.config["zeroing-threshold"], + self.config["zeroing-interval"], + abs(delay), + recurrence_scale, + ) + ) + lstm_str = self.config["lstm-nonlinearity-options"] configs = [] @@ -711,49 +1034,93 @@ def _generate_lstm_config(self): # .W_. e.g. Lstm1.W_i.xr for matrix # providing output to gate i and operating on an appended vector [x,r] configs.append("### Begin LSTM layer '{0}'".format(name)) - configs.append("# Gate control: contains W_i, W_f, W_c and W_o matrices as blocks.") - - configs.append("component name={0}.W_all type=NaturalGradientAffineComponent input-dim={1} " - "output-dim={2} {3} {4}".format(name, input_dim + cell_dim, cell_dim * 4, - affine_str, l2_regularize_option)) - - configs.append("# The core LSTM nonlinearity, implemented as a single component.") - configs.append("# Input = (i_part, f_part, c_part, o_part, c_{t-1}), output = (c_t, m_t)") + configs.append( + "# Gate control: contains W_i, W_f, W_c and W_o matrices as blocks." + ) + + configs.append( + "component name={0}.W_all type=NaturalGradientAffineComponent input-dim={1} " + "output-dim={2} {3} {4}".format( + name, + input_dim + cell_dim, + cell_dim * 4, + affine_str, + l2_regularize_option, + ) + ) + + configs.append( + "# The core LSTM nonlinearity, implemented as a single component." 
+ ) + configs.append( + "# Input = (i_part, f_part, c_part, o_part, c_{t-1}), output = (c_t, m_t)" + ) configs.append("# See cu-math.h:ComputeLstmNonlinearity() for details.") - configs.append("component name={0}.lstm_nonlin type=LstmNonlinearityComponent " - "cell-dim={1} {2} {3}".format(name, cell_dim, lstm_str, - l2_regularize_option)) - - configs.append("# Component for backprop truncation, to avoid gradient blowup in long training examples.") - configs.append("component name={0}.cm_trunc type=BackpropTruncationComponent dim={1} " - "{2}".format(name, 2 * cell_dim, bptrunc_str)) + configs.append( + "component name={0}.lstm_nonlin type=LstmNonlinearityComponent " + "cell-dim={1} {2} {3}".format( + name, cell_dim, lstm_str, l2_regularize_option + ) + ) + + configs.append( + "# Component for backprop truncation, to avoid gradient blowup in long training examples." + ) + configs.append( + "component name={0}.cm_trunc type=BackpropTruncationComponent dim={1} " + "{2}".format(name, 2 * cell_dim, bptrunc_str) + ) configs.append("### Nodes for the components above.") - configs.append("component-node name={0}.W_all component={0}.W_all input=Append({1}, " - "IfDefined(Offset({0}.m_trunc, {2})))".format( - name, input_descriptor, delay)) - - configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " - "input=Append({0}.W_all, IfDefined(Offset({0}.c_trunc, {1})))".format( - name, delay)) + configs.append( + "component-node name={0}.W_all component={0}.W_all input=Append({1}, " + "IfDefined(Offset({0}.m_trunc, {2})))".format(name, input_descriptor, delay) + ) + + configs.append( + "component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " + "input=Append({0}.W_all, IfDefined(Offset({0}.c_trunc, {1})))".format( + name, delay + ) + ) # we can print .c later if needed, but it generates a warning since it's not used. could use c_trunc instead - #configs.append("dim-range-node name={0}.c input-node={0}.lstm_nonlin dim-offset=0 dim={1}".format(name, cell_dim)) - configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} dim={1}".format(name, cell_dim)) - configs.append("component-node name={0}.cm_trunc component={0}.cm_trunc input={0}.lstm_nonlin".format(name)) - configs.append("dim-range-node name={0}.c_trunc input-node={0}.cm_trunc dim-offset=0 dim={1}".format(name, cell_dim)) - configs.append("dim-range-node name={0}.m_trunc input-node={0}.cm_trunc dim-offset={1} dim={1}".format(name, cell_dim)) + # configs.append("dim-range-node name={0}.c input-node={0}.lstm_nonlin dim-offset=0 dim={1}".format(name, cell_dim)) + configs.append( + "dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} dim={1}".format( + name, cell_dim + ) + ) + configs.append( + "component-node name={0}.cm_trunc component={0}.cm_trunc input={0}.lstm_nonlin".format( + name + ) + ) + configs.append( + "dim-range-node name={0}.c_trunc input-node={0}.cm_trunc dim-offset=0 dim={1}".format( + name, cell_dim + ) + ) + configs.append( + "dim-range-node name={0}.m_trunc input-node={0}.cm_trunc dim-offset={1} dim={1}".format( + name, cell_dim + ) + ) if self.layer_type == "fast-lstm-batchnorm-layer": # Add the batchnorm component, if requested to include batchnorm. 
- configs.append("component name={0}.m_batchnorm type=BatchNormComponent dim={1} ".format( - name, cell_dim)) - configs.append("component-node name={0}.m_batchnorm component={0}.m_batchnorm " - "input={0}.m".format(name)) + configs.append( + "component name={0}.m_batchnorm type=BatchNormComponent dim={1} ".format( + name, cell_dim + ) + ) + configs.append( + "component-node name={0}.m_batchnorm component={0}.m_batchnorm " + "input={0}.m".format(name) + ) configs.append("### End LSTM layer '{0}'".format(name)) return configs - # This class is for lines like # 'lstmb-layer name=lstm1 input=[-1] delay=-3' # @@ -768,6 +1135,7 @@ def _generate_lstm_config(self): # And the LSTM is followed by a batchnorm component (this is by default; it's not # part of the layer name, like lstmb-batchnorm-layer). + # # The output dimension of the layer may be specified via 'cell-dim=xxx', but if not specified, # the dimension defaults to the same as the input. @@ -796,61 +1164,66 @@ def _generate_lstm_config(self): # accumulated in c_t.] # l2-regularize=0.0 Constant controlling l2 regularization for this layer class XconfigLstmbLayer(XconfigLayerBase): - def __init__(self, first_token, key_to_value, prev_names = None): - assert first_token == 'lstmb-layer' + def __init__(self, first_token, key_to_value, prev_names=None): + assert first_token == "lstmb-layer" XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): - self.config = { 'input':'[-1]', - 'cell-dim' : -1, # this is a required argument - 'bottleneck-dim': -1, # this is a required argument - 'clipping-threshold': 30.0, - 'zeroing-interval': 20, - 'zeroing-threshold': 15.0, - 'orthonormal-constraint': 1.0, - 'delay' : -1, - 'lstm-nonlinearity-options' : ' max-change=0.75', - # the recurrence scale is the scale on m_trunc, used in the - # recurrence (to balance its size with the input). - 'self-scale' : 1.0, - # the affine layer contains 4 of our old layers -> use a - # larger max-change than the normal value of 0.75. - 'ng-affine-options' : ' max-change=1.5', - 'l2-regularize': 0.0, - 'decay-time': -1.0 - } + self.config = { + "input": "[-1]", + "cell-dim": -1, # this is a required argument + "bottleneck-dim": -1, # this is a required argument + "clipping-threshold": 30.0, + "zeroing-interval": 20, + "zeroing-threshold": 15.0, + "orthonormal-constraint": 1.0, + "delay": -1, + "lstm-nonlinearity-options": " max-change=0.75", + # the recurrence scale is the scale on m_trunc, used in the + # recurrence (to balance its size with the input). + "self-scale": 1.0, + # the affine layer contains 4 of our old layers -> use a + # larger max-change than the normal value of 0.75. 
+ "ng-affine-options": " max-change=1.5", + "l2-regularize": 0.0, + "decay-time": -1.0, + } def set_derived_configs(self): - if self.config['cell-dim'] <= 0: - self.config['cell-dim'] = self.descriptors['input']['dim'] + if self.config["cell-dim"] <= 0: + self.config["cell-dim"] = self.descriptors["input"]["dim"] def check_configs(self): - if self.config['cell-dim'] <= 0: - raise RuntimeError("cell-dim has invalid value {0}.".format( - self.config['cell-dim'])) - if self.config['bottleneck-dim'] <= 0: - raise RuntimeError("bottleneck-dim has invalid value {0}.".format( - self.config['bottleneck-dim'])) - if self.config['delay'] == 0: + if self.config["cell-dim"] <= 0: + raise RuntimeError( + "cell-dim has invalid value {0}.".format(self.config["cell-dim"]) + ) + if self.config["bottleneck-dim"] <= 0: + raise RuntimeError( + "bottleneck-dim has invalid value {0}.".format( + self.config["bottleneck-dim"] + ) + ) + if self.config["delay"] == 0: raise RuntimeError("delay cannot be zero") def auxiliary_outputs(self): return [] - def output_name(self, auxiliary_output = None): + def output_name(self, auxiliary_output=None): assert auxiliary_output is None - return '{0}.m_batchnorm'.format(self.name) + return "{0}.m_batchnorm".format(self.name) - def output_dim(self, auxiliary_output = None): + def output_dim(self, auxiliary_output=None): assert auxiliary_output is None - return self.config['cell-dim'] + return self.config["cell-dim"] def get_full_config(self): ans = [] config_lines = self._generate_lstm_config() for line in config_lines: - for config_name in ['ref', 'final']: + for config_name in ["ref", "final"]: # we do not support user specified matrices in LSTM initialization # so 'ref' and 'final' configs are the same. ans.append((config_name, line)) @@ -858,37 +1231,40 @@ def get_full_config(self): # convenience function to generate the LSTM config def _generate_lstm_config(self): - # assign some variables to reduce verbosity name = self.name # in the below code we will just call descriptor_strings as descriptors for conciseness - input_dim = self.descriptors['input']['dim'] - input_descriptor = self.descriptors['input']['final-string'] - cell_dim = self.config['cell-dim'] - bottleneck_dim = self.config['bottleneck-dim'] - self_scale = self.config['self-scale'] - delay = self.config['delay'] - affine_str = self.config['ng-affine-options'] - l2_regularize = self.config['l2-regularize'] - l2_regularize_option = ('l2-regularize={0} '.format(l2_regularize) - if l2_regularize != 0.0 else '') - decay_time = self.config['decay-time'] + input_dim = self.descriptors["input"]["dim"] + input_descriptor = self.descriptors["input"]["final-string"] + cell_dim = self.config["cell-dim"] + bottleneck_dim = self.config["bottleneck-dim"] + self_scale = self.config["self-scale"] + delay = self.config["delay"] + affine_str = self.config["ng-affine-options"] + l2_regularize = self.config["l2-regularize"] + l2_regularize_option = ( + "l2-regularize={0} ".format(l2_regularize) if l2_regularize != 0.0 else "" + ) + decay_time = self.config["decay-time"] # we expect decay_time to be either -1, or large, like 10 or 50. - recurrence_scale = (1.0 if decay_time < 0 else - 1.0 - (abs(delay) / decay_time)) - assert recurrence_scale > 0 # or user may have set decay-time much - # too small. 
- bptrunc_str = ("clipping-threshold={0}" - " zeroing-threshold={1}" - " zeroing-interval={2}" - " recurrence-interval={3}" - " scale={4}" - "".format(self.config['clipping-threshold'], - self.config['zeroing-threshold'], - self.config['zeroing-interval'], - abs(delay), recurrence_scale)) - lstm_str = self.config['lstm-nonlinearity-options'] - + recurrence_scale = 1.0 if decay_time < 0 else 1.0 - (abs(delay) / decay_time) + assert recurrence_scale > 0 # or user may have set decay-time much + # too small. + bptrunc_str = ( + "clipping-threshold={0}" + " zeroing-threshold={1}" + " zeroing-interval={2}" + " recurrence-interval={3}" + " scale={4}" + "".format( + self.config["clipping-threshold"], + self.config["zeroing-threshold"], + self.config["zeroing-interval"], + abs(delay), + recurrence_scale, + ) + ) + lstm_str = self.config["lstm-nonlinearity-options"] configs = [] @@ -899,59 +1275,103 @@ def _generate_lstm_config(self): # regularization to this layer, since, with the orthonormality # constraint, it's meaningless. configs.append("### Begin LSTM layer '{0}'".format(name)) - configs.append("component name={0}.W_all_a type=LinearComponent input-dim={1} " - "orthonormal-constraint={2} output-dim={3} {4}".format( - name, input_dim + cell_dim, - self.config['orthonormal-constraint'], - bottleneck_dim, affine_str)) - - configs.append("component name={0}.W_all_b type=LinearComponent input-dim={1} " - "output-dim={2} {3} {4}".format(name, bottleneck_dim, cell_dim * 4, - affine_str, l2_regularize_option)) - configs.append("component name={0}.W_all_b_so type=ScaleAndOffsetComponent dim={1} " - "max-change=0.75".format(name, cell_dim * 4)) - - - configs.append("# The core LSTM nonlinearity, implemented as a single component.") - configs.append("# Input = (i_part, f_part, c_part, o_part, c_{t-1}), output = (c_t, m_t)") + configs.append( + "component name={0}.W_all_a type=LinearComponent input-dim={1} " + "orthonormal-constraint={2} output-dim={3} {4}".format( + name, + input_dim + cell_dim, + self.config["orthonormal-constraint"], + bottleneck_dim, + affine_str, + ) + ) + + configs.append( + "component name={0}.W_all_b type=LinearComponent input-dim={1} " + "output-dim={2} {3} {4}".format( + name, bottleneck_dim, cell_dim * 4, affine_str, l2_regularize_option + ) + ) + configs.append( + "component name={0}.W_all_b_so type=ScaleAndOffsetComponent dim={1} " + "max-change=0.75".format(name, cell_dim * 4) + ) + + configs.append( + "# The core LSTM nonlinearity, implemented as a single component." + ) + configs.append( + "# Input = (i_part, f_part, c_part, o_part, c_{t-1}), output = (c_t, m_t)" + ) configs.append("# See cu-math.h:ComputeLstmNonlinearity() for details.") - configs.append("component name={0}.lstm_nonlin type=LstmNonlinearityComponent " - "cell-dim={1} {2} {3}".format(name, cell_dim, lstm_str, - l2_regularize_option)) - configs.append("# Component for backprop truncation, to avoid gradient blowup in long training examples.") - - configs.append("component name={0}.cm_trunc type=BackpropTruncationComponent dim={1} {2}".format( - name, 2 * cell_dim, bptrunc_str)) - configs.append("component name={0}.m_batchnorm type=BatchNormComponent dim={1} ".format( - name, cell_dim)) + configs.append( + "component name={0}.lstm_nonlin type=LstmNonlinearityComponent " + "cell-dim={1} {2} {3}".format( + name, cell_dim, lstm_str, l2_regularize_option + ) + ) + configs.append( + "# Component for backprop truncation, to avoid gradient blowup in long training examples." 
+ ) + + configs.append( + "component name={0}.cm_trunc type=BackpropTruncationComponent dim={1} {2}".format( + name, 2 * cell_dim, bptrunc_str + ) + ) + configs.append( + "component name={0}.m_batchnorm type=BatchNormComponent dim={1} ".format( + name, cell_dim + ) + ) configs.append("### Nodes for the components above.") - configs.append("component-node name={0}.W_all_a component={0}.W_all_a input=Append({1}, " - "IfDefined(Offset(Scale({2}, {0}.m_trunc), {3})))".format( - name, input_descriptor, self_scale, delay)) - configs.append("component-node name={0}.W_all_b component={0}.W_all_b " - "input={0}.W_all_a".format(name)) - configs.append("component-node name={0}.W_all_b_so component={0}.W_all_b_so " - "input={0}.W_all_b".format(name)) - - configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " - "input=Append({0}.W_all_b_so, IfDefined(Offset({0}.c_trunc, {1})))".format( - name, delay)) - configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} " - "dim={1}".format(name, cell_dim)) - configs.append("component-node name={0}.cm_trunc component={0}.cm_trunc input={0}.lstm_nonlin".format(name)) - configs.append("dim-range-node name={0}.c_trunc input-node={0}.cm_trunc dim-offset=0 " - "dim={1}".format(name, cell_dim)) - configs.append("dim-range-node name={0}.m_trunc input-node={0}.cm_trunc dim-offset={1} " - "dim={1}".format(name, cell_dim)) - configs.append("component-node name={0}.m_batchnorm component={0}.m_batchnorm " - "input={0}.m".format(name)) + configs.append( + "component-node name={0}.W_all_a component={0}.W_all_a input=Append({1}, " + "IfDefined(Offset(Scale({2}, {0}.m_trunc), {3})))".format( + name, input_descriptor, self_scale, delay + ) + ) + configs.append( + "component-node name={0}.W_all_b component={0}.W_all_b " + "input={0}.W_all_a".format(name) + ) + configs.append( + "component-node name={0}.W_all_b_so component={0}.W_all_b_so " + "input={0}.W_all_b".format(name) + ) + + configs.append( + "component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " + "input=Append({0}.W_all_b_so, IfDefined(Offset({0}.c_trunc, {1})))".format( + name, delay + ) + ) + configs.append( + "dim-range-node name={0}.m input-node={0}.lstm_nonlin dim-offset={1} " + "dim={1}".format(name, cell_dim) + ) + configs.append( + "component-node name={0}.cm_trunc component={0}.cm_trunc input={0}.lstm_nonlin".format( + name + ) + ) + configs.append( + "dim-range-node name={0}.c_trunc input-node={0}.cm_trunc dim-offset=0 " + "dim={1}".format(name, cell_dim) + ) + configs.append( + "dim-range-node name={0}.m_trunc input-node={0}.cm_trunc dim-offset={1} " + "dim={1}".format(name, cell_dim) + ) + configs.append( + "component-node name={0}.m_batchnorm component={0}.m_batchnorm " + "input={0}.m".format(name) + ) configs.append("### End LSTM layer '{0}'".format(name)) return configs - - # This class is for lines like # 'fast-lstmp-layer name=lstm1 input=[-1] delay=-3' # or: @@ -992,92 +1412,111 @@ def _generate_lstm_config(self): # accumulated in c_t.] 
# l2-regularize=0.0 Constant controlling l2 regularization for this layer class XconfigFastLstmpLayer(XconfigLayerBase): - def __init__(self, first_token, key_to_value, prev_names = None): - assert first_token in ['fast-lstmp-layer', 'fast-lstmp-batchnorm-layer'] + def __init__(self, first_token, key_to_value, prev_names=None): + assert first_token in ["fast-lstmp-layer", "fast-lstmp-batchnorm-layer"] XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): - self.config = {'input':'[-1]', - 'cell-dim' : -1, # this is a compulsory argument - 'recurrent-projection-dim' : -1, - 'non-recurrent-projection-dim' : -1, - 'clipping-threshold' : 30.0, - 'delay' : -1, - # if you want to set 'self-repair-scale' (c.f. the - # self-repair-scale-nonlinearity config value in older LSTM layers), you can - # add 'self-repair-scale=xxx' to - # lstm-nonlinearity-options. - 'lstm-nonlinearity-options' : ' max-change=0.75', - # the affine layer contains 4 of our old layers -> use a - # larger max-change than the normal value of 0.75. - 'ng-affine-options' : ' max-change=1.5', - 'l2-regularize': 0.0, - 'decay-time': -1.0, - 'zeroing-interval' : 20, - 'zeroing-threshold' : 15.0, - 'dropout-proportion' : -1.0, # If -1.0, no dropout will - # be used) - } + self.config = { + "input": "[-1]", + "cell-dim": -1, # this is a compulsory argument + "recurrent-projection-dim": -1, + "non-recurrent-projection-dim": -1, + "clipping-threshold": 30.0, + "delay": -1, + # if you want to set 'self-repair-scale' (c.f. the + # self-repair-scale-nonlinearity config value in older LSTM layers), you can + # add 'self-repair-scale=xxx' to + # lstm-nonlinearity-options. + "lstm-nonlinearity-options": " max-change=0.75", + # the affine layer contains 4 of our old layers -> use a + # larger max-change than the normal value of 0.75. 
+            "ng-affine-options": " max-change=1.5",
+            "l2-regularize": 0.0,
+            "decay-time": -1.0,
+            "zeroing-interval": 20,
+            "zeroing-threshold": 15.0,
+            "dropout-proportion": -1.0,  # If -1.0, no dropout will
+            # be used.
+        }

     def set_derived_configs(self):
-        if self.config['recurrent-projection-dim'] <= 0:
-            self.config['recurrent-projection-dim'] = self.config['cell-dim'] / 4
-
-        if self.config['non-recurrent-projection-dim'] <= 0:
-            self.config['non-recurrent-projection-dim'] = \
-                self.config['recurrent-projection-dim']
+        if self.config["recurrent-projection-dim"] <= 0:
+            # use integer division so the derived dim stays an int under python 3
+            self.config["recurrent-projection-dim"] = self.config["cell-dim"] // 4
+        if self.config["non-recurrent-projection-dim"] <= 0:
+            self.config["non-recurrent-projection-dim"] = self.config[
+                "recurrent-projection-dim"
+            ]

     def check_configs(self):
-        for key in ['cell-dim', 'recurrent-projection-dim',
-                    'non-recurrent-projection-dim']:
+        for key in [
+            "cell-dim",
+            "recurrent-projection-dim",
+            "non-recurrent-projection-dim",
+        ]:
             if self.config[key] <= 0:
-                raise RuntimeError("{0} has invalid value {1}.".format(
-                    key, self.config[key]))
-        if self.config['delay'] == 0:
+                raise RuntimeError(
+                    "{0} has invalid value {1}.".format(key, self.config[key])
+                )
+        if self.config["delay"] == 0:
             raise RuntimeError("delay cannot be zero")
-        if (self.config['recurrent-projection-dim'] +
-            self.config['non-recurrent-projection-dim'] >
-            self.config['cell-dim']):
-            raise RuntimeError("recurrent+non-recurrent projection dim exceeds "
-                               "cell dim")
-        if ((self.config['dropout-proportion'] > 1.0 or
-             self.config['dropout-proportion'] < 0.0) and
-            self.config['dropout-proportion'] != -1.0 ):
-            raise RuntimeError("dropout-proportion has invalid value {0}.".format(self.config['dropout-proportion']))
-
+        if (
+            self.config["recurrent-projection-dim"]
+            + self.config["non-recurrent-projection-dim"]
+            > self.config["cell-dim"]
+        ):
+            raise RuntimeError(
+                "recurrent+non-recurrent projection dim exceeds cell dim"
+            )
+        if (
+            self.config["dropout-proportion"] > 1.0
+            or self.config["dropout-proportion"] < 0.0
+        ) and self.config["dropout-proportion"] != -1.0:
+            raise RuntimeError(
+                "dropout-proportion has invalid value {0}.".format(
+                    self.config["dropout-proportion"]
+                )
+            )

     def auxiliary_outputs(self):
-        return ['c_t']
+        return ["c_t"]

-    def output_name(self, auxiliary_output = None):
-        node_name = ('rp_batchnorm' if self.layer_type == 'fast-lstmp-batchnorm-layer'
-                     else 'rp')
+    def output_name(self, auxiliary_output=None):
+        node_name = (
+            "rp_batchnorm" if self.layer_type == "fast-lstmp-batchnorm-layer" else "rp"
+        )
         if auxiliary_output is not None:
             if auxiliary_output in self.auxiliary_outputs():
                 node_name = auxiliary_output
             else:
-                raise RuntimeError("Unknown auxiliary output name {0}".format(auxiliary_output))
+                raise RuntimeError(
+                    "Unknown auxiliary output name {0}".format(auxiliary_output)
+                )

-        return '{0}.{1}'.format(self.name, node_name)
+        return "{0}.{1}".format(self.name, node_name)

-    def output_dim(self, auxiliary_output = None):
+    def output_dim(self, auxiliary_output=None):
         if auxiliary_output is not None:
             if auxiliary_output in self.auxiliary_outputs():
-                if node_name == 'c':
-                    return self.config['cell-dim']
+                # compare against the name actually advertised by
+                # auxiliary_outputs() ('node_name' is not defined here)
+                if auxiliary_output == "c_t":
+                    return self.config["cell-dim"]
                 # add code for other auxiliary_outputs here when we decide to expose them
             else:
-                raise RuntimeError("Unknown auxiliary output name {0}".format(auxiliary_output))
-        return self.config['recurrent-projection-dim'] + \
-            self.config['non-recurrent-projection-dim']
+                raise 
RuntimeError( + "Unknown auxiliary output name {0}".format(auxiliary_output) + ) + return ( + self.config["recurrent-projection-dim"] + + self.config["non-recurrent-projection-dim"] + ) def get_full_config(self): ans = [] config_lines = self._generate_lstm_config() for line in config_lines: - for config_name in ['ref', 'final']: + for config_name in ["ref", "final"]: # we do not support user specified matrices in LSTM initialization # so 'ref' and 'final' configs are the same. ans.append((config_name, line)) @@ -1088,34 +1527,39 @@ def _generate_lstm_config(self): # assign some variables to reduce verbosity name = self.name # in the below code we will just call descriptor_strings as descriptors for conciseness - input_dim = self.descriptors['input']['dim'] - input_descriptor = self.descriptors['input']['final-string'] - cell_dim = self.config['cell-dim'] - delay = self.config['delay'] - rec_proj_dim = self.config['recurrent-projection-dim'] - nonrec_proj_dim = self.config['non-recurrent-projection-dim'] - affine_str = self.config['ng-affine-options'] - decay_time = self.config['decay-time'] + input_dim = self.descriptors["input"]["dim"] + input_descriptor = self.descriptors["input"]["final-string"] + cell_dim = self.config["cell-dim"] + delay = self.config["delay"] + rec_proj_dim = self.config["recurrent-projection-dim"] + nonrec_proj_dim = self.config["non-recurrent-projection-dim"] + affine_str = self.config["ng-affine-options"] + decay_time = self.config["decay-time"] # we expect decay_time to be either -1, or large, like 10 or 50. - recurrence_scale = (1.0 if decay_time < 0 else - 1.0 - (abs(delay) / decay_time)) - assert recurrence_scale > 0 # or user may have set decay-time much - # too small. - bptrunc_str = ("clipping-threshold={0}" - " zeroing-threshold={1}" - " zeroing-interval={2}" - " recurrence-interval={3}" - " scale={4}" - "".format(self.config['clipping-threshold'], - self.config['zeroing-threshold'], - self.config['zeroing-interval'], - abs(delay), recurrence_scale)) - - lstm_str = self.config['lstm-nonlinearity-options'] - dropout_proportion = self.config['dropout-proportion'] - l2_regularize = self.config['l2-regularize'] - l2_regularize_option = ('l2-regularize={0} '.format(l2_regularize) - if l2_regularize != 0.0 else '') + recurrence_scale = 1.0 if decay_time < 0 else 1.0 - (abs(delay) / decay_time) + assert recurrence_scale > 0 # or user may have set decay-time much + # too small. + bptrunc_str = ( + "clipping-threshold={0}" + " zeroing-threshold={1}" + " zeroing-interval={2}" + " recurrence-interval={3}" + " scale={4}" + "".format( + self.config["clipping-threshold"], + self.config["zeroing-threshold"], + self.config["zeroing-interval"], + abs(delay), + recurrence_scale, + ) + ) + + lstm_str = self.config["lstm-nonlinearity-options"] + dropout_proportion = self.config["dropout-proportion"] + l2_regularize = self.config["l2-regularize"] + l2_regularize_option = ( + "l2-regularize={0} ".format(l2_regularize) if l2_regularize != 0.0 else "" + ) configs = [] @@ -1127,72 +1571,132 @@ def _generate_lstm_config(self): # naming convention # .W_. e.g. 
Lstm1.W_i.xr for matrix providing output to gate i and operating on an appended vector [x,r] configs.append("## Begin LSTM layer '{0}'".format(name)) - configs.append("# Gate control: contains W_i, W_f, W_c and W_o matrices as blocks.") - configs.append("component name={0}.W_all type=NaturalGradientAffineComponent input-dim={1} " - "output-dim={2} {3} {4}".format( - name, input_dim + rec_proj_dim, cell_dim * 4, - affine_str, l2_regularize_option)) - configs.append("# The core LSTM nonlinearity, implemented as a single component.") - configs.append("# Input = (i_part, f_part, c_part, o_part, c_{t-1}), output = (c_t, m_t)") + configs.append( + "# Gate control: contains W_i, W_f, W_c and W_o matrices as blocks." + ) + configs.append( + "component name={0}.W_all type=NaturalGradientAffineComponent input-dim={1} " + "output-dim={2} {3} {4}".format( + name, + input_dim + rec_proj_dim, + cell_dim * 4, + affine_str, + l2_regularize_option, + ) + ) + configs.append( + "# The core LSTM nonlinearity, implemented as a single component." + ) + configs.append( + "# Input = (i_part, f_part, c_part, o_part, c_{t-1}), output = (c_t, m_t)" + ) configs.append("# See cu-math.h:ComputeLstmNonlinearity() for details.") - configs.append("component name={0}.lstm_nonlin type=LstmNonlinearityComponent cell-dim={1} " - "use-dropout={2} {3} {4}" - .format(name, cell_dim, - "true" if dropout_proportion != -1.0 else "false", - lstm_str, l2_regularize_option)) - configs.append("# Component for backprop truncation, to avoid gradient blowup in long training examples.") - configs.append("component name={0}.cr_trunc type=BackpropTruncationComponent " - "dim={1} {2}".format(name, cell_dim + rec_proj_dim, bptrunc_str)) + configs.append( + "component name={0}.lstm_nonlin type=LstmNonlinearityComponent cell-dim={1} " + "use-dropout={2} {3} {4}".format( + name, + cell_dim, + "true" if dropout_proportion != -1.0 else "false", + lstm_str, + l2_regularize_option, + ) + ) + configs.append( + "# Component for backprop truncation, to avoid gradient blowup in long training examples." 
+ ) + configs.append( + "component name={0}.cr_trunc type=BackpropTruncationComponent " + "dim={1} {2}".format(name, cell_dim + rec_proj_dim, bptrunc_str) + ) if dropout_proportion != -1.0: - configs.append("component name={0}.dropout_mask type=DropoutMaskComponent output-dim=3 " - "dropout-proportion={1} " - .format(name, dropout_proportion)) - configs.append("# Component specific to 'projected' LSTM (LSTMP), contains both recurrent"); + configs.append( + "component name={0}.dropout_mask type=DropoutMaskComponent output-dim=3 " + "dropout-proportion={1} ".format(name, dropout_proportion) + ) + configs.append( + "# Component specific to 'projected' LSTM (LSTMP), contains both recurrent" + ) configs.append("# and non-recurrent projections") - configs.append("component name={0}.W_rp type=NaturalGradientAffineComponent " - "input-dim={1} output-dim={2} {3} {4}".format( - name, cell_dim, rec_proj_dim + nonrec_proj_dim, - affine_str, l2_regularize_option)) + configs.append( + "component name={0}.W_rp type=NaturalGradientAffineComponent " + "input-dim={1} output-dim={2} {3} {4}".format( + name, + cell_dim, + rec_proj_dim + nonrec_proj_dim, + affine_str, + l2_regularize_option, + ) + ) configs.append("### Nodes for the components above.") - configs.append("component-node name={0}.W_all component={0}.W_all input=Append({1}, " - "IfDefined(Offset({0}.r_trunc, {2})))".format(name, input_descriptor, delay)) + configs.append( + "component-node name={0}.W_all component={0}.W_all input=Append({1}, " + "IfDefined(Offset({0}.r_trunc, {2})))".format(name, input_descriptor, delay) + ) if dropout_proportion != -1.0: # note: the 'input' is a don't-care as the component never uses it; it's required # in component-node lines. - configs.append("component-node name={0}.dropout_mask component={0}.dropout_mask " - "input={0}.dropout_mask".format(name)) - configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " - "input=Append({0}.W_all, IfDefined(Offset({0}.c_trunc, {1})), " - "{0}.dropout_mask)".format(name, delay)) + configs.append( + "component-node name={0}.dropout_mask component={0}.dropout_mask " + "input={0}.dropout_mask".format(name) + ) + configs.append( + "component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " + "input=Append({0}.W_all, IfDefined(Offset({0}.c_trunc, {1})), " + "{0}.dropout_mask)".format(name, delay) + ) else: - configs.append("component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " - "input=Append({0}.W_all, IfDefined(Offset({0}.c_trunc, {1})))".format( - name, delay)) - configs.append("dim-range-node name={0}.c input-node={0}.lstm_nonlin " - "dim-offset=0 dim={1}".format(name, cell_dim)) - configs.append("dim-range-node name={0}.m input-node={0}.lstm_nonlin " - "dim-offset={1} dim={1}".format(name, cell_dim)) - configs.append("# {0}.rp is the output node of this layer (if we're not " - "including batchnorm)".format(name)) - configs.append("component-node name={0}.rp component={0}.W_rp input={0}.m".format(name)) - configs.append("dim-range-node name={0}.r input-node={0}.rp dim-offset=0 " - "dim={1}".format(name, rec_proj_dim)) + configs.append( + "component-node name={0}.lstm_nonlin component={0}.lstm_nonlin " + "input=Append({0}.W_all, IfDefined(Offset({0}.c_trunc, {1})))".format( + name, delay + ) + ) + configs.append( + "dim-range-node name={0}.c input-node={0}.lstm_nonlin " + "dim-offset=0 dim={1}".format(name, cell_dim) + ) + configs.append( + "dim-range-node name={0}.m input-node={0}.lstm_nonlin " + "dim-offset={1} dim={1}".format(name, 
cell_dim) + ) + configs.append( + "# {0}.rp is the output node of this layer (if we're not " + "including batchnorm)".format(name) + ) + configs.append( + "component-node name={0}.rp component={0}.W_rp input={0}.m".format(name) + ) + configs.append( + "dim-range-node name={0}.r input-node={0}.rp dim-offset=0 " + "dim={1}".format(name, rec_proj_dim) + ) configs.append("# Note: it's not 100% efficient that we have to stitch the c") - configs.append("# and r back together to truncate them but it probably"); + configs.append("# and r back together to truncate them but it probably") configs.append("# makes the deriv truncation more accurate .") - configs.append("component-node name={0}.cr_trunc component={0}.cr_trunc " - "input=Append({0}.c, {0}.r)".format(name)) - configs.append("dim-range-node name={0}.c_trunc input-node={0}.cr_trunc " - "dim-offset=0 dim={1}".format(name, cell_dim)) - configs.append("dim-range-node name={0}.r_trunc input-node={0}.cr_trunc " - "dim-offset={1} dim={2}".format(name, cell_dim, rec_proj_dim)) + configs.append( + "component-node name={0}.cr_trunc component={0}.cr_trunc " + "input=Append({0}.c, {0}.r)".format(name) + ) + configs.append( + "dim-range-node name={0}.c_trunc input-node={0}.cr_trunc " + "dim-offset=0 dim={1}".format(name, cell_dim) + ) + configs.append( + "dim-range-node name={0}.r_trunc input-node={0}.cr_trunc " + "dim-offset={1} dim={2}".format(name, cell_dim, rec_proj_dim) + ) if self.layer_type == "fast-lstmp-batchnorm-layer": # Add the batchnorm component, if requested to include batchnorm. - configs.append("component name={0}.rp_batchnorm type=BatchNormComponent dim={1} ".format( - name, rec_proj_dim + nonrec_proj_dim)) - configs.append("component-node name={0}.rp_batchnorm component={0}.rp_batchnorm " - "input={0}.rp".format(name)) + configs.append( + "component name={0}.rp_batchnorm type=BatchNormComponent dim={1} ".format( + name, rec_proj_dim + nonrec_proj_dim + ) + ) + configs.append( + "component-node name={0}.rp_batchnorm component={0}.rp_batchnorm " + "input={0}.rp".format(name) + ) configs.append("### End LSTM Layer '{0}'".format(name)) return configs diff --git a/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/parser.py b/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/parser.py index 5e21c4c0274..d91ac8a1eed 100644 --- a/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/parser.py +++ b/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/parser.py @@ -17,86 +17,87 @@ # We have to modify this dictionary when adding new layers config_to_layer = { - 'input' : xlayers.XconfigInputLayer, - 'output' : xlayers.XconfigTrivialOutputLayer, - 'output-layer' : xlayers.XconfigOutputLayer, - 'relu-layer' : xlayers.XconfigBasicLayer, - 'relu-renorm-layer' : xlayers.XconfigBasicLayer, - 'relu-batchnorm-dropout-layer' : xlayers.XconfigBasicLayer, - 'relu-dropout-layer': xlayers.XconfigBasicLayer, - 'relu-batchnorm-layer' : xlayers.XconfigBasicLayer, - 'relu-batchnorm-so-layer' : xlayers.XconfigBasicLayer, - 'batchnorm-so-relu-layer' : xlayers.XconfigBasicLayer, - 'batchnorm-layer' : xlayers.XconfigBasicLayer, - 'sigmoid-layer' : xlayers.XconfigBasicLayer, - 'tanh-layer' : xlayers.XconfigBasicLayer, - 'fixed-affine-layer' : xlayers.XconfigFixedAffineLayer, - 'idct-layer' : xlayers.XconfigIdctLayer, - 'affine-layer' : xlayers.XconfigAffineLayer, - 'lstm-layer' : xlayers.XconfigLstmLayer, - 'lstmp-layer' : xlayers.XconfigLstmpLayer, - 'lstmp-batchnorm-layer' : xlayers.XconfigLstmpLayer, - 'fast-lstm-layer' : xlayers.XconfigFastLstmLayer, - 'fast-lstm-batchnorm-layer' : 
xlayers.XconfigFastLstmLayer, - 'fast-lstmp-layer' : xlayers.XconfigFastLstmpLayer, - 'fast-lstmp-batchnorm-layer' : xlayers.XconfigFastLstmpLayer, - 'lstmb-layer' : xlayers.XconfigLstmbLayer, - 'stats-layer': xlayers.XconfigStatsLayer, - 'relu-conv-layer': xlayers.XconfigConvLayer, - 'conv-layer': xlayers.XconfigConvLayer, - 'conv-relu-layer': xlayers.XconfigConvLayer, - 'conv-renorm-layer': xlayers.XconfigConvLayer, - 'relu-conv-renorm-layer': xlayers.XconfigConvLayer, - 'batchnorm-conv-layer': xlayers.XconfigConvLayer, - 'conv-relu-renorm-layer': xlayers.XconfigConvLayer, - 'batchnorm-conv-relu-layer': xlayers.XconfigConvLayer, - 'relu-batchnorm-conv-layer': xlayers.XconfigConvLayer, - 'relu-batchnorm-noconv-layer': xlayers.XconfigConvLayer, - 'relu-noconv-layer': xlayers.XconfigConvLayer, - 'conv-relu-batchnorm-layer': xlayers.XconfigConvLayer, - 'conv-relu-batchnorm-so-layer': xlayers.XconfigConvLayer, - 'conv-relu-batchnorm-dropout-layer': xlayers.XconfigConvLayer, - 'conv-relu-dropout-layer': xlayers.XconfigConvLayer, - 'res-block': xlayers.XconfigResBlock, - 'res2-block': xlayers.XconfigRes2Block, - 'channel-average-layer': xlayers.ChannelAverageLayer, - 'attention-renorm-layer': xlayers.XconfigAttentionLayer, - 'attention-relu-renorm-layer': xlayers.XconfigAttentionLayer, - 'attention-relu-batchnorm-layer': xlayers.XconfigAttentionLayer, - 'relu-renorm-attention-layer': xlayers.XconfigAttentionLayer, - 'gru-layer' : xlayers.XconfigGruLayer, - 'pgru-layer' : xlayers.XconfigPgruLayer, - 'opgru-layer' : xlayers.XconfigOpgruLayer, - 'norm-pgru-layer' : xlayers.XconfigNormPgruLayer, - 'norm-opgru-layer' : xlayers.XconfigNormOpgruLayer, - 'fast-gru-layer' : xlayers.XconfigFastGruLayer, - 'fast-pgru-layer' : xlayers.XconfigFastPgruLayer, - 'fast-norm-pgru-layer' : xlayers.XconfigFastNormPgruLayer, - 'fast-opgru-layer' : xlayers.XconfigFastOpgruLayer, - 'fast-norm-opgru-layer' : xlayers.XconfigFastNormOpgruLayer, - 'tdnnf-layer': xlayers.XconfigTdnnfLayer, - 'prefinal-layer': xlayers.XconfigPrefinalLayer, - 'spec-augment-layer': xlayers.XconfigSpecAugmentLayer, - 'renorm-component': xlayers.XconfigRenormComponent, - 'batchnorm-component': xlayers.XconfigBatchnormComponent, - 'no-op-component': xlayers.XconfigNoOpComponent, - 'linear-component': xlayers.XconfigLinearComponent, - 'affine-component': xlayers.XconfigAffineComponent, - 'scale-component': xlayers.XconfigPerElementScaleComponent, - 'dim-range-component': xlayers.XconfigDimRangeComponent, - 'offset-component': xlayers.XconfigPerElementOffsetComponent, - 'combine-feature-maps-layer': xlayers.XconfigCombineFeatureMapsLayer, - 'delta-layer': xlayers.XconfigDeltaLayer + "input": xlayers.XconfigInputLayer, + "output": xlayers.XconfigTrivialOutputLayer, + "output-layer": xlayers.XconfigOutputLayer, + "relu-layer": xlayers.XconfigBasicLayer, + "relu-renorm-layer": xlayers.XconfigBasicLayer, + "relu-batchnorm-dropout-layer": xlayers.XconfigBasicLayer, + "relu-dropout-layer": xlayers.XconfigBasicLayer, + "relu-batchnorm-layer": xlayers.XconfigBasicLayer, + "relu-batchnorm-so-layer": xlayers.XconfigBasicLayer, + "batchnorm-so-relu-layer": xlayers.XconfigBasicLayer, + "batchnorm-layer": xlayers.XconfigBasicLayer, + "sigmoid-layer": xlayers.XconfigBasicLayer, + "tanh-layer": xlayers.XconfigBasicLayer, + "fixed-affine-layer": xlayers.XconfigFixedAffineLayer, + "idct-layer": xlayers.XconfigIdctLayer, + "affine-layer": xlayers.XconfigAffineLayer, + "lstm-layer": xlayers.XconfigLstmLayer, + "lstmp-layer": xlayers.XconfigLstmpLayer, + 
"lstmp-batchnorm-layer": xlayers.XconfigLstmpLayer, + "fast-lstm-layer": xlayers.XconfigFastLstmLayer, + "fast-lstm-batchnorm-layer": xlayers.XconfigFastLstmLayer, + "fast-lstmp-layer": xlayers.XconfigFastLstmpLayer, + "fast-lstmp-batchnorm-layer": xlayers.XconfigFastLstmpLayer, + "lstmb-layer": xlayers.XconfigLstmbLayer, + "stats-layer": xlayers.XconfigStatsLayer, + "relu-conv-layer": xlayers.XconfigConvLayer, + "conv-layer": xlayers.XconfigConvLayer, + "conv-relu-layer": xlayers.XconfigConvLayer, + "conv-renorm-layer": xlayers.XconfigConvLayer, + "relu-conv-renorm-layer": xlayers.XconfigConvLayer, + "batchnorm-conv-layer": xlayers.XconfigConvLayer, + "conv-relu-renorm-layer": xlayers.XconfigConvLayer, + "batchnorm-conv-relu-layer": xlayers.XconfigConvLayer, + "relu-batchnorm-conv-layer": xlayers.XconfigConvLayer, + "relu-batchnorm-noconv-layer": xlayers.XconfigConvLayer, + "relu-noconv-layer": xlayers.XconfigConvLayer, + "conv-relu-batchnorm-layer": xlayers.XconfigConvLayer, + "conv-relu-batchnorm-so-layer": xlayers.XconfigConvLayer, + "conv-relu-batchnorm-dropout-layer": xlayers.XconfigConvLayer, + "conv-relu-dropout-layer": xlayers.XconfigConvLayer, + "res-block": xlayers.XconfigResBlock, + "res2-block": xlayers.XconfigRes2Block, + "channel-average-layer": xlayers.ChannelAverageLayer, + "attention-renorm-layer": xlayers.XconfigAttentionLayer, + "attention-relu-renorm-layer": xlayers.XconfigAttentionLayer, + "attention-relu-batchnorm-layer": xlayers.XconfigAttentionLayer, + "relu-renorm-attention-layer": xlayers.XconfigAttentionLayer, + "gru-layer": xlayers.XconfigGruLayer, + "pgru-layer": xlayers.XconfigPgruLayer, + "opgru-layer": xlayers.XconfigOpgruLayer, + "norm-pgru-layer": xlayers.XconfigNormPgruLayer, + "norm-opgru-layer": xlayers.XconfigNormOpgruLayer, + "fast-gru-layer": xlayers.XconfigFastGruLayer, + "fast-pgru-layer": xlayers.XconfigFastPgruLayer, + "fast-norm-pgru-layer": xlayers.XconfigFastNormPgruLayer, + "fast-opgru-layer": xlayers.XconfigFastOpgruLayer, + "fast-norm-opgru-layer": xlayers.XconfigFastNormOpgruLayer, + "tdnnf-layer": xlayers.XconfigTdnnfLayer, + "prefinal-layer": xlayers.XconfigPrefinalLayer, + "spec-augment-layer": xlayers.XconfigSpecAugmentLayer, + "renorm-component": xlayers.XconfigRenormComponent, + "batchnorm-component": xlayers.XconfigBatchnormComponent, + "no-op-component": xlayers.XconfigNoOpComponent, + "linear-component": xlayers.XconfigLinearComponent, + "affine-component": xlayers.XconfigAffineComponent, + "scale-component": xlayers.XconfigPerElementScaleComponent, + "dim-range-component": xlayers.XconfigDimRangeComponent, + "offset-component": xlayers.XconfigPerElementOffsetComponent, + "combine-feature-maps-layer": xlayers.XconfigCombineFeatureMapsLayer, + "delta-layer": xlayers.XconfigDeltaLayer, } + # Turn a config line and a list of previous layers into # either an object representing that line of the config file; or None # if the line was empty after removing comments. # 'prev_layers' is a list of objects corresponding to preceding layers of the # config file. 
-def xconfig_line_to_object(config_line, prev_layers = None):
+def xconfig_line_to_object(config_line, prev_layers=None):
     try:
-        x = xutils.parse_config_line(config_line)
+        x = xutils.parse_config_line(config_line)
         if x is None:
             return None
         (first_token, key_to_value) = x
@@ -106,7 +107,8 @@ def xconfig_line_to_object(config_line, prev_layers = None):
     except Exception:
         logging.error(
             "***Exception caught while parsing the following xconfig line:\n"
-            "*** {0}".format(config_line))
+            "*** {0}".format(config_line)
+        )
         raise
@@ -127,15 +129,18 @@ def get_model_component_info(model_filename):

     all_layers = []
     try:
-        f = open(model_filename, 'r')
+        f = open(model_filename, "r")
     except Exception as e:
-        sys.exit("{0}: error reading model file '{1}'".format(sys.argv[0],
-                                                              model_filename,
-                                                              repr(e)))
+        sys.exit(
+            "{0}: error reading model file '{1}'; error was {2}".format(
+                sys.argv[0], model_filename, repr(e)
+            )
+        )

     # use nnet3-info to get component names in the model.
-    out = common_lib.get_command_stdout("""nnet3-info {0} | grep '\-node' """
-                                        """ """.format(model_filename))
+    out = common_lib.get_command_stdout(
+        r"""nnet3-info {0} | grep '\-node' """.format(model_filename)
+    )

     # out contains all {output, input, component}-nodes used in model_filename
     # It can parse lines in out like:
@@ -147,27 +152,30 @@ def get_model_component_info(model_filename):
     for line in out.split("\n"):
         parts = line.split(" ")
         dim = -1
-        for field in parts:
+        for field in parts:
             key_value = field.split("=")
             if len(key_value) == 2:
                 key = key_value[0]
                 value = key_value[1]
-                if key == "name": # name=**
+                if key == "name":  # name=**
                     layer_name = value
-                elif key == "dim": # for input-node
+                elif key == "dim":  # for input-node
                     dim = int(value)
-                elif key == "output-dim": # for component-node
+                elif key == "output-dim":  # for component-node
                     dim = int(value)

         if layer_name is not None and layer_name not in layer_names:
             layer_names.append(layer_name)
-            key_to_value['name'] = layer_name
-            assert(dim != -1)
-            key_to_value['dim'] = dim
-            all_layers.append(xlayers.XconfigExistingLayer('existing', key_to_value, all_layers))
+            key_to_value["name"] = layer_name
+            assert dim != -1
+            key_to_value["dim"] = dim
+            all_layers.append(
+                xlayers.XconfigExistingLayer("existing", key_to_value, all_layers)
+            )
     if len(all_layers) == 0:
-        raise RuntimeError("{0}: model filename '{1}' is empty.".format(
-            sys.argv[0], model_filename))
+        raise RuntimeError(
+            "{0}: model filename '{1}' is empty.".format(sys.argv[0], model_filename)
+        )
     f.close()
     return all_layers
@@ -184,14 +192,17 @@ def read_xconfig_file(xconfig_filename, existing_layers=None):
     if existing_layers is None:
         existing_layers = []
     try:
-        f = open(xconfig_filename, 'r')
+        f = open(xconfig_filename, "r")
     except Exception as e:
-        sys.exit("{0}: error reading xconfig file '{1}'; error was {2}".format(
-            sys.argv[0], xconfig_filename, repr(e)))
+        sys.exit(
+            "{0}: error reading xconfig file '{1}'; error was {2}".format(
+                sys.argv[0], xconfig_filename, repr(e)
+            )
+        )
     all_layers = []
     while True:
         line = f.readline()
-        if line == '':
+        if line == "":
             break
        # the next call will raise an easy-to-understand exception if
        # it fails.
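
For orientation, a minimal sketch of how these helpers compose when building a network (layer names are illustrative; `read_xconfig_file` below performs essentially this loop internally):

    # Sketch only: parse a few xconfig lines into layer objects.
    lines = [
        "input name=input dim=40",
        "relu-renorm-layer name=tdnn1 dim=512 input=Append(-1,0,1)",
        "output-layer name=output dim=3000 input=tdnn1",
    ]
    layers = []
    for line in lines:
        layer = xconfig_line_to_object(line, layers)  # None for blank/comment lines
        if layer is not None:
            layers.append(layer)
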
@@ -201,7 +212,8 @@ def read_xconfig_file(xconfig_filename, existing_layers=None): all_layers.append(this_layer) existing_layers.append(this_layer) if len(all_layers) == 0: - raise RuntimeError("{0}: xconfig file '{1}' is empty".format( - sys.argv[0], xconfig_filename)) + raise RuntimeError( + "{0}: xconfig file '{1}' is empty".format(sys.argv[0], xconfig_filename) + ) f.close() return all_layers diff --git a/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/stats_layer.py b/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/stats_layer.py index 77e7bbb33bf..656377ffc7e 100644 --- a/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/stats_layer.py +++ b/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/stats_layer.py @@ -34,112 +34,137 @@ class XconfigStatsLayer(XconfigLayerBase): dimension computed from input] config='' [Required. Defines what stats must be computed.] """ + def __init__(self, first_token, key_to_value, prev_names=None): - assert first_token in ['stats-layer'] + assert first_token in ["stats-layer"] XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): - self.config = {'input': '[-1]', - 'dim': -1, - 'config': ''} + self.config = {"input": "[-1]", "dim": -1, "config": ""} def set_derived_configs(self): - config_string = self.config['config'] - if config_string == '': - raise RuntimeError("config has to be non-empty", - self.str()) - m = re.search("(mean|mean\+stddev|mean\+count|mean\+stddev\+count)" - "\((-?\d+):(-?\d+):(-?\d+):(-?\d+)\)", - config_string) + config_string = self.config["config"] + if config_string == "": + raise RuntimeError("config has to be non-empty", self.str()) + m = re.search( + "(mean|mean\+stddev|mean\+count|mean\+stddev\+count)" + "\((-?\d+):(-?\d+):(-?\d+):(-?\d+)\)", + config_string, + ) if m is None: - raise RuntimeError("Invalid statistic-config string: {0}".format( - config_string), self) + raise RuntimeError( + "Invalid statistic-config string: {0}".format(config_string), self + ) - self._output_stddev = (m.group(1) in ['mean+stddev', - 'mean+stddev+count']) - self._output_log_counts = (m.group(1) in ['mean+count', - 'mean+stddev+count']) + self._output_stddev = m.group(1) in ["mean+stddev", "mean+stddev+count"] + self._output_log_counts = m.group(1) in ["mean+count", "mean+stddev+count"] self._left_context = -int(m.group(2)) self._input_period = int(m.group(3)) self._stats_period = int(m.group(4)) self._right_context = int(m.group(5)) if self._output_stddev: - output_dim = 2 * self.descriptors['input']['dim'] + output_dim = 2 * self.descriptors["input"]["dim"] else: - output_dim = self.descriptors['input']['dim'] + output_dim = self.descriptors["input"]["dim"] if self._output_log_counts: - output_dim = output_dim + 1 + output_dim = output_dim + 1 - if self.config['dim'] > 0 and self.config['dim'] != output_dim: + if self.config["dim"] > 0 and self.config["dim"] != output_dim: raise RuntimeError( "Invalid dim supplied {0:d} != " - "actual output dim {1:d}".format( - self.config['dim'], output_dim)) - self.config['dim'] = output_dim + "actual output dim {1:d}".format(self.config["dim"], output_dim) + ) + self.config["dim"] = output_dim def check_configs(self): - if not (self._left_context >= 0 and self._right_context >= 0 - and self._input_period > 0 and self._stats_period > 0 - and self._left_context % self._stats_period == 0 - and self._right_context % self._stats_period == 0 - and self._stats_period % self._input_period == 0): + if not ( + self._left_context >= 0 + and self._right_context >= 0 + and self._input_period 
> 0 + and self._stats_period > 0 + and self._left_context % self._stats_period == 0 + and self._right_context % self._stats_period == 0 + and self._stats_period % self._input_period == 0 + ): raise RuntimeError( "Invalid configuration of statistics-extraction: {0}".format( - self.config['config']), self) + self.config["config"] + ), + self, + ) super(XconfigStatsLayer, self).check_configs() def _generate_config(self): - input_desc = self.descriptors['input']['final-string'] - input_dim = self.descriptors['input']['dim'] + input_desc = self.descriptors["input"]["final-string"] + input_dim = self.descriptors["input"]["dim"] configs = [] configs.append( - 'component name={name}-extraction-{lc}-{rc} ' - 'type=StatisticsExtractionComponent input-dim={dim} ' - 'input-period={input_period} output-period={output_period} ' - 'include-variance={var} '.format( - name=self.name, lc=self._left_context, rc=self._right_context, - dim=input_dim, input_period=self._input_period, + "component name={name}-extraction-{lc}-{rc} " + "type=StatisticsExtractionComponent input-dim={dim} " + "input-period={input_period} output-period={output_period} " + "include-variance={var} ".format( + name=self.name, + lc=self._left_context, + rc=self._right_context, + dim=input_dim, + input_period=self._input_period, output_period=self._stats_period, - var='true' if self._output_stddev else 'false')) + var="true" if self._output_stddev else "false", + ) + ) configs.append( - 'component-node name={name}-extraction-{lc}-{rc} ' - 'component={name}-extraction-{lc}-{rc} input={input} '.format( - name=self.name, lc=self._left_context, rc=self._right_context, - input=input_desc)) + "component-node name={name}-extraction-{lc}-{rc} " + "component={name}-extraction-{lc}-{rc} input={input} ".format( + name=self.name, + lc=self._left_context, + rc=self._right_context, + input=input_desc, + ) + ) stats_dim = 1 + input_dim * (2 if self._output_stddev else 1) configs.append( - 'component name={name}-pooling-{lc}-{rc} ' - 'type=StatisticsPoolingComponent input-dim={dim} ' - 'input-period={input_period} left-context={lc} right-context={rc} ' - 'num-log-count-features={count} output-stddevs={var} '.format( - name=self.name, lc=self._left_context, rc=self._right_context, - dim=stats_dim, input_period=self._stats_period, + "component name={name}-pooling-{lc}-{rc} " + "type=StatisticsPoolingComponent input-dim={dim} " + "input-period={input_period} left-context={lc} right-context={rc} " + "num-log-count-features={count} output-stddevs={var} ".format( + name=self.name, + lc=self._left_context, + rc=self._right_context, + dim=stats_dim, + input_period=self._stats_period, count=1 if self._output_log_counts else 0, - var='true' if self._output_stddev else 'false')) + var="true" if self._output_stddev else "false", + ) + ) configs.append( - 'component-node name={name}-pooling-{lc}-{rc} ' - 'component={name}-pooling-{lc}-{rc} ' - 'input={name}-extraction-{lc}-{rc} '.format( - name=self.name, lc=self._left_context, rc=self._right_context)) + "component-node name={name}-pooling-{lc}-{rc} " + "component={name}-pooling-{lc}-{rc} " + "input={name}-extraction-{lc}-{rc} ".format( + name=self.name, lc=self._left_context, rc=self._right_context + ) + ) return configs def output_name(self, auxiliary_output=None): - return 'Round({name}-pooling-{lc}-{rc}, {period})'.format( - name=self.name, lc=self._left_context, - rc=self._right_context, period=self._stats_period) + return "Round({name}-pooling-{lc}-{rc}, {period})".format( + name=self.name, + 
lc=self._left_context, + rc=self._right_context, + period=self._stats_period, + ) def output_dim(self, auxiliary_outputs=None): - return self.config['dim'] + return self.config["dim"] def get_full_config(self): ans = [] config_lines = self._generate_config() for line in config_lines: - for config_name in ['ref', 'final']: + for config_name in ["ref", "final"]: ans.append((config_name, line)) return ans diff --git a/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/trivial_layers.py b/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/trivial_layers.py index 4afea78ad3f..dbf6af206e6 100644 --- a/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/trivial_layers.py +++ b/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/trivial_layers.py @@ -23,15 +23,15 @@ class XconfigRenormComponent(XconfigLayerBase): input='[-1]' [Descriptor giving the input of the layer.] target-rms=1.0 [The target RMS of the NormalizeComponent] """ + def __init__(self, first_token, key_to_value, prev_names=None): XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): - self.config = {'input': '[-1]', - 'target-rms': 1.0 } + self.config = {"input": "[-1]", "target-rms": 1.0} def check_configs(self): - assert self.config['target-rms'] > 0.0 + assert self.config["target-rms"] > 0.0 def output_name(self, auxiliary_output=None): assert auxiliary_output is None @@ -39,7 +39,7 @@ def output_name(self, auxiliary_output=None): def output_dim(self, auxiliary_output=None): assert auxiliary_output is None - input_dim = self.descriptors['input']['dim'] + input_dim = self.descriptors["input"]["dim"] return input_dim def get_full_config(self): @@ -47,7 +47,7 @@ def get_full_config(self): config_lines = self._generate_config() for line in config_lines: - for config_name in ['ref', 'final']: + for config_name in ["ref", "final"]: # we do not support user specified matrices in this layer # so 'ref' and 'final' configs are the same. ans.append((config_name, line)) @@ -56,21 +56,24 @@ def get_full_config(self): def _generate_config(self): # by 'descriptor_final_string' we mean a string that can appear in # config-files, i.e. it contains the 'final' names of nodes. 
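
As a concrete illustration of what `_generate_config` below emits (assuming, for the example, a preceding layer named `tdnn1` of dimension 512), a line `renorm-component name=renorm1 input=tdnn1` expands to roughly:

    component name=renorm1 type=NormalizeComponent dim=512 target-rms=1.0
    component-node name=renorm1 component=renorm1 input=tdnn1
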
- input_desc = self.descriptors['input']['final-string'] - input_dim = self.descriptors['input']['dim'] - target_rms = self.config['target-rms'] + input_desc = self.descriptors["input"]["final-string"] + input_dim = self.descriptors["input"]["dim"] + target_rms = self.config["target-rms"] configs = [] - line = ('component name={0} type=NormalizeComponent dim={1} target-rms={2}'.format( - self.name, input_dim, target_rms)) + line = ( + "component name={0} type=NormalizeComponent dim={1} target-rms={2}".format( + self.name, input_dim, target_rms + ) + ) configs.append(line) - line = ('component-node name={0} component={0} input={1}'.format( - self.name, input_desc)) + line = "component-node name={0} component={0} input={1}".format( + self.name, input_desc + ) configs.append(line) return configs - class XconfigBatchnormComponent(XconfigLayerBase): """This class is for parsing lines like 'batchnorm-component name=batchnorm input=Append(-3,0,3)' @@ -83,16 +86,15 @@ class XconfigBatchnormComponent(XconfigLayerBase): `fixed-affine-layer` that is to be initialized via LDA] """ + def __init__(self, first_token, key_to_value, prev_names=None): XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): - self.config = {'input': '[-1]', - 'target-rms': 1.0, - 'include-in-init': False} + self.config = {"input": "[-1]", "target-rms": 1.0, "include-in-init": False} def check_configs(self): - assert self.config['target-rms'] > 0.0 + assert self.config["target-rms"] > 0.0 def output_name(self, auxiliary_output=None): assert auxiliary_output is None @@ -100,7 +102,7 @@ def output_name(self, auxiliary_output=None): def output_dim(self, auxiliary_output=None): assert auxiliary_output is None - input_dim = self.descriptors['input']['dim'] + input_dim = self.descriptors["input"]["dim"] return input_dim def get_full_config(self): @@ -108,27 +110,31 @@ def get_full_config(self): config_lines = self._generate_config() for line in config_lines: - for config_name in ['ref', 'final']: + for config_name in ["ref", "final"]: # we do not support user specified matrices in this layer # so 'ref' and 'final' configs are the same. ans.append((config_name, line)) - if self.config['include-in-init']: - ans.append(('init', line)) + if self.config["include-in-init"]: + ans.append(("init", line)) return ans def _generate_config(self): # by 'descriptor_final_string' we mean a string that can appear in # config-files, i.e. it contains the 'final' names of nodes. - input_desc = self.descriptors['input']['final-string'] - input_dim = self.descriptors['input']['dim'] - target_rms = self.config['target-rms'] + input_desc = self.descriptors["input"]["final-string"] + input_dim = self.descriptors["input"]["dim"] + target_rms = self.config["target-rms"] configs = [] - line = ('component name={0} type=BatchNormComponent dim={1} target-rms={2}'.format( - self.name, input_dim, target_rms)) + line = ( + "component name={0} type=BatchNormComponent dim={1} target-rms={2}".format( + self.name, input_dim, target_rms + ) + ) configs.append(line) - line = ('component-node name={0} component={0} input={1}'.format( - self.name, input_desc)) + line = "component-node name={0} component={0} input={1}".format( + self.name, input_desc + ) configs.append(line) return configs @@ -141,11 +147,12 @@ class XconfigNoOpComponent(XconfigLayerBase): Parameters of the class, and their defaults: input='[-1]' [Descriptor giving the input of the layer.] 
""" + def __init__(self, first_token, key_to_value, prev_names=None): XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): - self.config = {'input': '[-1]' } + self.config = {"input": "[-1]"} def check_configs(self): pass @@ -156,7 +163,7 @@ def output_name(self, auxiliary_output=None): def output_dim(self, auxiliary_output=None): assert auxiliary_output is None - input_dim = self.descriptors['input']['dim'] + input_dim = self.descriptors["input"]["dim"] return input_dim def get_full_config(self): @@ -164,7 +171,7 @@ def get_full_config(self): config_lines = self._generate_config() for line in config_lines: - for config_name in ['ref', 'final']: + for config_name in ["ref", "final"]: # we do not support user specified matrices in this layer # so 'ref' and 'final' configs are the same. ans.append((config_name, line)) @@ -173,15 +180,17 @@ def get_full_config(self): def _generate_config(self): # by 'descriptor_final_string' we mean a string that can appear in # config-files, i.e. it contains the 'final' names of nodes. - input_desc = self.descriptors['input']['final-string'] - input_dim = self.descriptors['input']['dim'] + input_desc = self.descriptors["input"]["final-string"] + input_dim = self.descriptors["input"]["dim"] configs = [] - line = ('component name={0} type=NoOpComponent dim={1}'.format( - self.name, input_dim)) + line = "component name={0} type=NoOpComponent dim={1}".format( + self.name, input_dim + ) configs.append(line) - line = ('component-node name={0} component={0} input={1}'.format( - self.name, input_desc)) + line = "component-node name={0} component={0} input={1}".format( + self.name, input_desc + ) configs.append(line) return configs @@ -190,17 +199,18 @@ class XconfigDeltaLayer(XconfigLayerBase): """This class is for parsing lines like 'delta-layer name=delta input=idct' which appends the central frame with the delta features - (i.e. -1,0,1 since scale equals 1) and delta-delta features + (i.e. -1,0,1 since scale equals 1) and delta-delta features (i.e. 1,0,-2,0,1), and then applies batchnorm to it. Parameters of the class, and their defaults: input='[-1]' [Descriptor giving the input of the layer] """ + def __init__(self, first_token, key_to_value, prev_names=None): XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): - self.config = {'input': '[-1]'} + self.config = {"input": "[-1]"} def check_configs(self): pass @@ -211,15 +221,15 @@ def output_name(self, auxiliary_output=None): def output_dim(self, auxiliary_output=None): assert auxiliary_output is None - input_dim = self.descriptors['input']['dim'] - return (3*input_dim) + input_dim = self.descriptors["input"]["dim"] + return 3 * input_dim def get_full_config(self): ans = [] config_lines = self._generate_config() for line in config_lines: - for config_name in ['ref', 'final']: + for config_name in ["ref", "final"]: # we do not support user specified matrices in this layer # so 'ref' and 'final' configs are the same. ans.append((config_name, line)) @@ -228,31 +238,42 @@ def get_full_config(self): def _generate_config(self): # by 'descriptor_final_string' we mean a string that can appear in # config-files, i.e. it contains the 'final' names of nodes. 
- input_desc = self.descriptors['input']['final-string'] - input_dim = self.descriptors['input']['dim'] + input_desc = self.descriptors["input"]["final-string"] + input_dim = self.descriptors["input"]["dim"] output_dim = self.output_dim() configs = [] - line = ('dim-range-node name={0}_copy1 input-node={0} dim={1} dim-offset=0'.format( - input_desc, input_dim)) + line = ( + "dim-range-node name={0}_copy1 input-node={0} dim={1} dim-offset=0".format( + input_desc, input_dim + ) + ) configs.append(line) - line = ('dim-range-node name={0}_copy2 input-node={0} dim={1} dim-offset=0'.format( - input_desc, input_dim)) + line = ( + "dim-range-node name={0}_copy2 input-node={0} dim={1} dim-offset=0".format( + input_desc, input_dim + ) + ) configs.append(line) - line = ('component name={0}_2 type=NoOpComponent dim={1}'.format( - input_desc, output_dim)) + line = "component name={0}_2 type=NoOpComponent dim={1}".format( + input_desc, output_dim + ) configs.append(line) - line = ('component-node name={0}_2 component={0}_2 input=Append(Offset({0},0),' - ' Sum(Offset(Scale(-1.0,{0}_copy1),-1), Offset({0},1)), Sum(Offset({0},-2), Offset({0},2),' - ' Offset(Scale(-2.0,{0}_copy2),0)))'.format(input_desc)) + line = ( + "component-node name={0}_2 component={0}_2 input=Append(Offset({0},0)," + " Sum(Offset(Scale(-1.0,{0}_copy1),-1), Offset({0},1)), Sum(Offset({0},-2), Offset({0},2)," + " Offset(Scale(-2.0,{0}_copy2),0)))".format(input_desc) + ) configs.append(line) - - line = ('component name={0} type=BatchNormComponent dim={1}'.format( - self.name, output_dim)) + + line = "component name={0} type=BatchNormComponent dim={1}".format( + self.name, output_dim + ) configs.append(line) - line = ('component-node name={0} component={0} input={1}_2'.format( - self.name, input_desc)) + line = "component-node name={0} component={0} input={1}_2".format( + self.name, input_desc + ) configs.append(line) return configs @@ -276,20 +297,23 @@ class XconfigLinearComponent(XconfigLayerBase): l2-regularize=0.0 """ + def __init__(self, first_token, key_to_value, prev_names=None): XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): - self.config = {'input': '[-1]', - 'dim': -1, - 'orthonormal-constraint': '', - 'max-change': 0.75, - 'l2-regularize': '', - 'param-stddev': '', - 'learning-rate-factor': '' } + self.config = { + "input": "[-1]", + "dim": -1, + "orthonormal-constraint": "", + "max-change": 0.75, + "l2-regularize": "", + "param-stddev": "", + "learning-rate-factor": "", + } def check_configs(self): - if self.config['dim'] <= 0: + if self.config["dim"] <= 0: raise RuntimeError("'dim' must be specified and > 0.") def output_name(self, auxiliary_output=None): @@ -298,15 +322,15 @@ def output_name(self, auxiliary_output=None): def output_dim(self, auxiliary_output=None): assert auxiliary_output is None - assert self.config['dim'] > 0 - return self.config['dim'] + assert self.config["dim"] > 0 + return self.config["dim"] def get_full_config(self): ans = [] config_lines = self._generate_config() for line in config_lines: - for config_name in ['ref', 'final']: + for config_name in ["ref", "final"]: # we do not support user specified matrices in this layer # so 'ref' and 'final' configs are the same. ans.append((config_name, line)) @@ -315,69 +339,86 @@ def get_full_config(self): def _generate_config(self): # by 'descriptor_final_string' we mean a string that can appear in # config-files, i.e. it contains the 'final' names of nodes. 
-        input_desc = self.descriptors['input']['final-string']
-        input_dim = self.descriptors['input']['dim']
-        output_dim = self.config['dim']
-
-        opts = ''
-        for opt_name in ['orthonormal-constraint', 'max-change', 'l2-regularize',
-                         'param-stddev', 'learning-rate-factor' ]:
+        input_desc = self.descriptors["input"]["final-string"]
+        input_dim = self.descriptors["input"]["dim"]
+        output_dim = self.config["dim"]
+
+        opts = ""
+        for opt_name in [
+            "orthonormal-constraint",
+            "max-change",
+            "l2-regularize",
+            "param-stddev",
+            "learning-rate-factor",
+        ]:
             value = self.config[opt_name]
-            if value != '':
-                opts += ' {0}={1}'.format(opt_name, value)
+            if value != "":
+                opts += " {0}={1}".format(opt_name, value)

         configs = []
-        line = ('component name={0} type=LinearComponent input-dim={1} output-dim={2} '
-                '{3}'.format(self.name, input_dim, output_dim, opts))
+        line = (
+            "component name={0} type=LinearComponent input-dim={1} output-dim={2} "
+            "{3}".format(self.name, input_dim, output_dim, opts)
+        )
         configs.append(line)
-        line = ('component-node name={0} component={0} input={1}'.format(
-            self.name, input_desc))
+        line = "component-node name={0} component={0} input={1}".format(
+            self.name, input_desc
+        )
         configs.append(line)
         return configs


 class XconfigCombineFeatureMapsLayer(XconfigLayerBase):
     """This class is for parsing lines like
-    'combine-feature-maps-layer name=combine_features1 height=40 num-filters1=1 num-filters2=4'
-    or
-    'combine-feature-maps-layer name=combine_features1 height=40 num-filters1=1 num-filters2=4 num-filters3=2'
-
-    It produces a PermuteComponent.  It expects its input to be two or three things
-    appended together, where the first is of dimension height * num-filters1 and
-    the second is of dimension height * num-filters2 (and the third, if present is
-    of dimension height * num-filters2; it interpolates the filters
-    so the output can be interpreted as a single feature map with the same height
-    as the input and the sum of the num-filters.
-
-    This is to be used in convolutional setups as part of how we combine the
-    filterbank inputs with ivectors.
+    'combine-feature-maps-layer name=combine_features1 height=40 num-filters1=1 num-filters2=4'
+    or
+    'combine-feature-maps-layer name=combine_features1 height=40 num-filters1=1 num-filters2=4 num-filters3=2'
+
+    It produces a PermuteComponent.  It expects its input to be two or three things
+    appended together, where the first is of dimension height * num-filters1 and
+    the second is of dimension height * num-filters2 (and the third, if present, is
+    of dimension height * num-filters3); it interpolates the filters
+    so the output can be interpreted as a single feature map with the same height
+    as the input and the sum of the num-filters.
+
+    This is to be used in convolutional setups as part of how we combine the
+    filterbank inputs with ivectors.
""" def __init__(self, first_token, key_to_value, prev_names=None): XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): - self.config = { 'input': '[-1]', - 'num-filters1': -1, - 'num-filters2': -1, - 'num-filters3': 0, - 'height': -1 } + self.config = { + "input": "[-1]", + "num-filters1": -1, + "num-filters2": -1, + "num-filters3": 0, + "height": -1, + } def check_configs(self): - input_dim = self.descriptors['input']['dim'] - if (self.config['num-filters1'] <= 0 or - self.config['num-filters2'] <= 0 or - self.config['num-filters3'] < 0 or - self.config['height'] <= 0): - raise RuntimeError("invalid values of num-filters1, num-filters2 and/or height") - f1 = self.config['num-filters1'] - f2 = self.config['num-filters2'] - f3 = self.config['num-filters3'] - h = self.config['height'] + input_dim = self.descriptors["input"]["dim"] + if ( + self.config["num-filters1"] <= 0 + or self.config["num-filters2"] <= 0 + or self.config["num-filters3"] < 0 + or self.config["height"] <= 0 + ): + raise RuntimeError( + "invalid values of num-filters1, num-filters2 and/or height" + ) + f1 = self.config["num-filters1"] + f2 = self.config["num-filters2"] + f3 = self.config["num-filters3"] + h = self.config["height"] if input_dim != (f1 + f2 + f3) * h: - raise RuntimeError("Expected input-dim={0} based on num-filters1={1}, num-filters2={2}, " - "num-filters3={3} and height={4}, but got input-dim={5}".format( - (f1 + f2 + f3) * h, f1, f2, f3, h, input_dim)) + raise RuntimeError( + "Expected input-dim={0} based on num-filters1={1}, num-filters2={2}, " + "num-filters3={3} and height={4}, but got input-dim={5}".format( + (f1 + f2 + f3) * h, f1, f2, f3, h, input_dim + ) + ) def output_name(self, auxiliary_output=None): assert auxiliary_output is None @@ -385,7 +426,7 @@ def output_name(self, auxiliary_output=None): def output_dim(self, auxiliary_output=None): assert auxiliary_output is None - input_dim = self.descriptors['input']['dim'] + input_dim = self.descriptors["input"]["dim"] return input_dim def get_full_config(self): @@ -393,7 +434,7 @@ def get_full_config(self): config_lines = self._generate_config() for line in config_lines: - for config_name in ['ref', 'final']: + for config_name in ["ref", "final"]: # we do not support user specified matrices in this layer # so 'ref' and 'final' configs are the same. ans.append((config_name, line)) @@ -402,12 +443,12 @@ def get_full_config(self): def _generate_config(self): # by 'descriptor_final_string' we mean a string that can appear in # config-files, i.e. it contains the 'final' names of nodes. - input_desc = self.descriptors['input']['final-string'] - dim = self.descriptors['input']['dim'] - num_filters1 = self.config['num-filters1'] - num_filters2 = self.config['num-filters2'] - num_filters3 = self.config['num-filters3'] # normally 0. - height = self.config['height'] + input_desc = self.descriptors["input"]["final-string"] + dim = self.descriptors["input"]["dim"] + num_filters1 = self.config["num-filters1"] + num_filters2 = self.config["num-filters2"] + num_filters3 = self.config["num-filters3"] # normally 0. 
+ height = self.config["height"] assert dim == (num_filters1 + num_filters2 + num_filters3) * height column_map = [] @@ -417,21 +458,23 @@ def _generate_config(self): for f in range(num_filters2): column_map.append(height * num_filters1 + h * num_filters2 + f) for f in range(num_filters3): - column_map.append(height * (num_filters1 + num_filters2) + h * num_filters3 + f) + column_map.append( + height * (num_filters1 + num_filters2) + h * num_filters3 + f + ) configs = [] - line = ('component name={0} type=PermuteComponent column-map={1} '.format( - self.name, ','.join([str(x) for x in column_map]))) + line = "component name={0} type=PermuteComponent column-map={1} ".format( + self.name, ",".join([str(x) for x in column_map]) + ) configs.append(line) - line = ('component-node name={0} component={0} input={1}'.format( - self.name, input_desc)) + line = "component-node name={0} component={0} input={1}".format( + self.name, input_desc + ) configs.append(line) return configs - - class XconfigAffineComponent(XconfigLayerBase): """This class is for parsing lines like 'affine-component name=linear1 dim=1024 input=Append(-3,0,3)' @@ -451,20 +494,23 @@ class XconfigAffineComponent(XconfigLayerBase): l2-regularize=0.0 """ + def __init__(self, first_token, key_to_value, prev_names=None): XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): - self.config = {'input': '[-1]', - 'dim': -1, - 'orthonormal-constraint': '', - 'max-change': 0.75, - 'param-stddev': '', - 'bias-stddev': '', - 'l2-regularize': '' } + self.config = { + "input": "[-1]", + "dim": -1, + "orthonormal-constraint": "", + "max-change": 0.75, + "param-stddev": "", + "bias-stddev": "", + "l2-regularize": "", + } def check_configs(self): - if self.config['dim'] <= 0: + if self.config["dim"] <= 0: raise RuntimeError("'dim' must be specified and > 0.") def output_name(self, auxiliary_output=None): @@ -473,15 +519,15 @@ def output_name(self, auxiliary_output=None): def output_dim(self, auxiliary_output=None): assert auxiliary_output is None - assert self.config['dim'] > 0 - return self.config['dim'] + assert self.config["dim"] > 0 + return self.config["dim"] def get_full_config(self): ans = [] config_lines = self._generate_config() for line in config_lines: - for config_name in ['ref', 'final']: + for config_name in ["ref", "final"]: # we do not support user specified matrices in this layer # so 'ref' and 'final' configs are the same. ans.append((config_name, line)) @@ -490,23 +536,31 @@ def get_full_config(self): def _generate_config(self): # by 'descriptor_final_string' we mean a string that can appear in # config-files, i.e. it contains the 'final' names of nodes. 
- input_desc = self.descriptors['input']['final-string'] - input_dim = self.descriptors['input']['dim'] - output_dim = self.config['dim'] - - opts = '' - for opt_name in ['orthonormal-constraint', 'max-change', 'l2-regularize', - 'param-stddev', 'bias-stddev']: + input_desc = self.descriptors["input"]["final-string"] + input_dim = self.descriptors["input"]["dim"] + output_dim = self.config["dim"] + + opts = "" + for opt_name in [ + "orthonormal-constraint", + "max-change", + "l2-regularize", + "param-stddev", + "bias-stddev", + ]: value = self.config[opt_name] - if value != '': - opts += ' {0}={1}'.format(opt_name, value) + if value != "": + opts += " {0}={1}".format(opt_name, value) configs = [] - line = ('component name={0} type=NaturalGradientAffineComponent input-dim={1} output-dim={2} ' - '{3}'.format(self.name, input_dim, output_dim, opts)) + line = ( + "component name={0} type=NaturalGradientAffineComponent input-dim={1} output-dim={2} " + "{3}".format(self.name, input_dim, output_dim, opts) + ) configs.append(line) - line = ('component-node name={0} component={0} input={1}'.format( - self.name, input_desc)) + line = "component-node name={0} component={0} input={1}".format( + self.name, input_desc + ) configs.append(line) return configs @@ -530,16 +584,19 @@ class XconfigPerElementScaleComponent(XconfigLayerBase): param-stddev=0.0 # affects initialization learning-rate-factor=1.0 """ + def __init__(self, first_token, key_to_value, prev_names=None): XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): - self.config = {'input': '[-1]', - 'l2-regularize': '', - 'max-change': 0.75, - 'param-mean': '', - 'param-stddev': '', - 'learning-rate-factor': '' } + self.config = { + "input": "[-1]", + "l2-regularize": "", + "max-change": 0.75, + "param-mean": "", + "param-stddev": "", + "learning-rate-factor": "", + } def check_configs(self): pass @@ -550,14 +607,14 @@ def output_name(self, auxiliary_output=None): def output_dim(self, auxiliary_output=None): assert auxiliary_output is None - return self.descriptors['input']['dim'] + return self.descriptors["input"]["dim"] def get_full_config(self): ans = [] config_lines = self._generate_config() for line in config_lines: - for config_name in ['ref', 'final']: + for config_name in ["ref", "final"]: # we do not support user specified matrices in this layer # so 'ref' and 'final' configs are the same. ans.append((config_name, line)) @@ -566,25 +623,34 @@ def get_full_config(self): def _generate_config(self): # by 'descriptor_final_string' we mean a string that can appear in # config-files, i.e. it contains the 'final' names of nodes. 
- input_desc = self.descriptors['input']['final-string'] - dim = self.descriptors['input']['dim'] - - opts = '' - for opt_name in ['learning-rate-factor', 'max-change', 'l2-regularize', 'param-mean', - 'param-stddev' ]: + input_desc = self.descriptors["input"]["final-string"] + dim = self.descriptors["input"]["dim"] + + opts = "" + for opt_name in [ + "learning-rate-factor", + "max-change", + "l2-regularize", + "param-mean", + "param-stddev", + ]: value = self.config[opt_name] - if value != '': - opts += ' {0}={1}'.format(opt_name, value) + if value != "": + opts += " {0}={1}".format(opt_name, value) configs = [] - line = ('component name={0} type=NaturalGradientPerElementScaleComponent dim={1} {2} ' - ''.format(self.name, dim, opts)) + line = ( + "component name={0} type=NaturalGradientPerElementScaleComponent dim={1} {2} " + "".format(self.name, dim, opts) + ) configs.append(line) - line = ('component-node name={0} component={0} input={1}'.format( - self.name, input_desc)) + line = "component-node name={0} component={0} input={1}".format( + self.name, input_desc + ) configs.append(line) return configs + class XconfigPerElementOffsetComponent(XconfigLayerBase): """This class is for parsing lines like 'offset-component name=offset1 input=Append(-3,0,3)' @@ -604,16 +670,19 @@ class XconfigPerElementOffsetComponent(XconfigLayerBase): param-stddev=0.0 # affects initialization learning-rate-factor=1.0 """ + def __init__(self, first_token, key_to_value, prev_names=None): XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): - self.config = {'input': '[-1]', - 'l2-regularize': '', - 'max-change': 0.75, - 'param-mean': '', - 'param-stddev': '', - 'learning-rate-factor': '' } + self.config = { + "input": "[-1]", + "l2-regularize": "", + "max-change": 0.75, + "param-mean": "", + "param-stddev": "", + "learning-rate-factor": "", + } def check_configs(self): pass @@ -624,14 +693,14 @@ def output_name(self, auxiliary_output=None): def output_dim(self, auxiliary_output=None): assert auxiliary_output is None - return self.descriptors['input']['dim'] + return self.descriptors["input"]["dim"] def get_full_config(self): ans = [] config_lines = self._generate_config() for line in config_lines: - for config_name in ['ref', 'final']: + for config_name in ["ref", "final"]: # we do not support user specified matrices in this layer # so 'ref' and 'final' configs are the same. ans.append((config_name, line)) @@ -640,22 +709,30 @@ def get_full_config(self): def _generate_config(self): # by 'descriptor_final_string' we mean a string that can appear in # config-files, i.e. it contains the 'final' names of nodes. 
- input_desc = self.descriptors['input']['final-string'] - dim = self.descriptors['input']['dim'] - - opts = '' - for opt_name in ['learning-rate-factor', 'max-change', 'l2-regularize', 'param-mean', - 'param-stddev' ]: + input_desc = self.descriptors["input"]["final-string"] + dim = self.descriptors["input"]["dim"] + + opts = "" + for opt_name in [ + "learning-rate-factor", + "max-change", + "l2-regularize", + "param-mean", + "param-stddev", + ]: value = self.config[opt_name] - if value != '': - opts += ' {0}={1}'.format(opt_name, value) + if value != "": + opts += " {0}={1}".format(opt_name, value) configs = [] - line = ('component name={0} type=PerElementOffsetComponent dim={1} {2} ' - ''.format(self.name, dim, opts)) + line = ( + "component name={0} type=PerElementOffsetComponent dim={1} {2} " + "".format(self.name, dim, opts) + ) configs.append(line) - line = ('component-node name={0} component={0} input={1}'.format( - self.name, input_desc)) + line = "component-node name={0} component={0} input={1}".format( + self.name, input_desc + ) configs.append(line) return configs @@ -669,24 +746,25 @@ class XconfigDimRangeComponent(XconfigLayerBase): dim=-1 [Dimension of the output.] dim-offset=0 [Dimension offset of the input.] """ + def __init__(self, first_token, key_to_value, prev_names=None): XconfigLayerBase.__init__(self, first_token, key_to_value, prev_names) def set_default_configs(self): - self.config = {'input': '[-1]', - 'dim': -1, - 'dim-offset': 0 } + self.config = {"input": "[-1]", "dim": -1, "dim-offset": 0} def check_configs(self): - input_dim = self.descriptors['input']['dim'] - if self.config['dim'] <= 0: + input_dim = self.descriptors["input"]["dim"] + if self.config["dim"] <= 0: raise RuntimeError("'dim' must be specified and > 0.") - elif self.config['dim'] > input_dim: + elif self.config["dim"] > input_dim: raise RuntimeError("'dim' must be specified and lower than the input dim.") - if self.config['dim-offset'] < 0 : + if self.config["dim-offset"] < 0: raise RuntimeError("'dim-offset' must be specified and >= 0.") - elif self.config['dim-offset'] + self.config['dim'] > input_dim: - raise RuntimeError("'dim-offset' plus output dim must be lower than the input dim.") + elif self.config["dim-offset"] + self.config["dim"] > input_dim: + raise RuntimeError( + "'dim-offset' plus output dim must be lower than the input dim." + ) def output_name(self, auxiliary_output=None): assert auxiliary_output is None @@ -694,9 +772,9 @@ def output_name(self, auxiliary_output=None): def output_dim(self, auxiliary_output=None): assert auxiliary_output is None - output_dim = self.config['dim'] + output_dim = self.config["dim"] if output_dim <= 0: - self.config['dim'] = self.descriptors['input']['dim'] + self.config["dim"] = self.descriptors["input"]["dim"] return output_dim def get_full_config(self): @@ -704,7 +782,7 @@ def get_full_config(self): config_lines = self._generate_config() for line in config_lines: - for config_name in ['ref', 'final']: + for config_name in ["ref", "final"]: # we do not support user specified matrices in this layer # so 'ref' and 'final' configs are the same. ans.append((config_name, line)) @@ -713,12 +791,13 @@ def get_full_config(self): def _generate_config(self): # by 'descriptor_final_string' we mean a string that can appear in # config-files, i.e. it contains the 'final' names of nodes. 
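
Concretely (illustrative names): a line `dim-range-component name=slice1 input=tdnn1 dim=256 dim-offset=0` makes `_generate_config` below emit a single node of the form:

    dim-range-node name=slice1 input-node=tdnn1 dim=256 dim-offset=0
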
-        input_node = self.descriptors['input']['final-string']
-        output_dim = self.config['dim']
-        dim_offset = self.config['dim-offset']
+        input_node = self.descriptors["input"]["final-string"]
+        output_dim = self.config["dim"]
+        dim_offset = self.config["dim-offset"]

         configs = []
-        line = ('dim-range-node name={0} input-node={1} dim={2} dim-offset={3}'.format(
-            self.name, input_node, output_dim, dim_offset))
+        line = "dim-range-node name={0} input-node={1} dim={2} dim-offset={3}".format(
+            self.name, input_node, output_dim, dim_offset
+        )
         configs.append(line)
         return configs
diff --git a/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/utils.py b/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/utils.py
index 0188248d694..455a51aeb13 100644
--- a/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/utils.py
+++ b/egs2/TEMPLATE/asr1/steps/libs/nnet3/xconfig/utils.py
@@ -30,13 +30,14 @@ def get_prev_names(all_layers, current_layer):
         # neural network supplied via the existing-model option, that we are
         # adding layers to.  In this case, these layers are not considered as
         # layers preceding 'current_layer'.
-        if layer.layer_type is not 'existing':
+        if layer.layer_type != "existing":
             prev_names.append(layer.get_name())
     prev_names_set = set()
     for name in prev_names:
         if name in prev_names_set:
-            raise RuntimeError("{0}: Layer name {1} is used more than once.".format(
-                sys.argv[0], name))
+            raise RuntimeError(
+                "{0}: Layer name {1} is used more than once.".format(sys.argv[0], name)
+            )
         prev_names_set.add(name)
     return prev_names

@@ -45,7 +46,7 @@ def get_prev_names(all_layers, current_layer):
 # full layer name
 def split_layer_name(full_layer_name):
     assert isinstance(full_layer_name, str)
-    split_name = full_layer_name.split('.')
+    split_name = full_layer_name.split(".")
     if len(split_name) == 0:
         raise RuntimeError("Bad layer name: " + full_layer_name)
     layer_name = split_name[0]
@@ -54,10 +55,11 @@ def split_layer_name(full_layer_name):
     else:
         # we probably expect len(split_name) == 2 in this case,
         # but no harm in allowing dots in the auxiliary_output.
-        auxiliary_output = '.'.join(split_name[1:])
+        auxiliary_output = ".".join(split_name[1:])

     return [layer_name, auxiliary_output]

+
 # [utility function used in xconfig_layers.py]
 # this converts a layer-name like 'ivector' or 'input', or a sub-layer name like
 # 'lstm2.memory_cell', into a dimension.  'all_layers' is a vector of objects
@@ -82,20 +84,25 @@ def get_dim_from_layer_name(all_layers, current_layer, full_layer_name):
         # auxiliary_output, will only be used in the (rare) case when we are
         # using auxiliary outputs, e.g. 'lstm1.c'.
         if layer.get_name() == full_layer_name:
-                return layer.output_dim()
+            return layer.output_dim()

         if layer.get_name() == layer_name:
-            if (not auxiliary_output in layer.auxiliary_outputs()
-                and auxiliary_output is not None):
-                raise RuntimeError("Layer '{0}' has no such auxiliary output:"
-                                   "'{1}' ({0}.{1})".format(layer_name,
-                                                            auxiliary_output))
+            if (
+                auxiliary_output not in layer.auxiliary_outputs()
+                and auxiliary_output is not None
+            ):
+                raise RuntimeError(
+                    "Layer '{0}' has no such auxiliary output: "
+                    "'{1}' ({0}.{1})".format(layer_name, auxiliary_output)
+                )
             return layer.output_dim(auxiliary_output)

     # No such layer was found.
-    if layer_name in [ layer.get_name() for layer in all_layers ]:
-        raise RuntimeError("Layer '{0}' was requested before it appeared in "
-                           "the xconfig file (circular dependencies or out-of-order "
-                           "layers".format(layer_name))
+    if layer_name in [layer.get_name() for layer in all_layers]:
+        raise RuntimeError(
+            "Layer '{0}' was requested before it appeared in "
+            "the xconfig file (circular dependencies or out-of-order "
+            "layers)".format(layer_name)
+        )
     else:
         raise RuntimeError("No such layer: '{0}'".format(layer_name))

@@ -126,17 +133,22 @@ def get_string_from_layer_name(all_layers, current_layer, full_layer_name):
             return layer.output_name()

         if layer.get_name() == layer_name:
-            if (not auxiliary_output in layer.auxiliary_outputs() and
-                auxiliary_output is not None):
-                raise RuntimeError("Layer '{0}' has no such auxiliary output: "
-                                   "'{1}' ({0}.{1})".format(
-                                       layer_name, auxiliary_output))
+            if (
+                not auxiliary_output in layer.auxiliary_outputs()
+                and auxiliary_output is not None
+            ):
+                raise RuntimeError(
+                    "Layer '{0}' has no such auxiliary output: "
+                    "'{1}' ({0}.{1})".format(layer_name, auxiliary_output)
+                )
             return layer.output_name(auxiliary_output)

     # No such layer was found.
-    if layer_name in [ layer.get_name() for layer in all_layers ]:
-        raise RuntimeError("Layer '{0}' was requested before it appeared in "
-                           "the xconfig file (circular dependencies or out-of-order "
-                           "layers".format(layer_name))
+    if layer_name in [layer.get_name() for layer in all_layers]:
+        raise RuntimeError(
+            "Layer '{0}' was requested before it appeared in "
+            "the xconfig file (circular dependencies or out-of-order "
+            "layers)".format(layer_name)
+        )
     else:
         raise RuntimeError("No such layer: '{0}'".format(layer_name))

@@ -152,25 +164,33 @@ def convert_value_to_type(key, dest_type, string_value):
         elif string_value == "False" or string_value == "false":
             return False
         else:
-            raise RuntimeError("Invalid configuration value {0}={1} (expected bool)".format(
-                key, string_value))
+            raise RuntimeError(
+                "Invalid configuration value {0}={1} (expected bool)".format(
+                    key, string_value
+                )
+            )
     elif dest_type == type(int()):
         try:
             return int(string_value)
         except:
-            raise RuntimeError("Invalid configuration value {0}={1} (expected int)".format(
-                key, string_value))
+            raise RuntimeError(
+                "Invalid configuration value {0}={1} (expected int)".format(
+                    key, string_value
+                )
+            )
     elif dest_type == type(float()):
         try:
             return float(string_value)
         except:
-            raise RuntimeError("Invalid configuration value {0}={1} (expected int)".format(
-                key, string_value))
+            raise RuntimeError(
+                "Invalid configuration value {0}={1} (expected float)".format(
+                    key, string_value
+                )
+            )
     elif dest_type == type(str()):
         return string_value

-
 # This class parses and stores a Descriptor-- expression
 # like Append(Offset(input, -3), input) and so on.
 # For the full range of possible expressions, see the comment at the
@@ -181,13 +201,12 @@ def convert_value_to_type(key, dest_type, string_value):
 # they are interpreted as Offset(prev_layer, -3) where 'prev_layer'
 # is the previous layer in the config file.

+
 # Also, in any place a raw input/layer/output name can appear, we accept things
 # like [-1] meaning the previous input/layer/output's name, or [-2] meaning the
 # last-but-one input/layer/output, and so on.
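Before the `Descriptor` class itself, a quick concrete example of this shorthand may help. The snippet below is a minimal sketch rather than part of the patch: it assumes `steps` is on `sys.path` (the same convention the scripts in this patch series use), so that the module imports as `libs.nnet3.xconfig.utils`; the expected strings mirror cases exercised by `test_library()` further down.

```python
import sys

sys.path.insert(0, "steps")  # assumption: run from a recipe dir containing steps/
from libs.nnet3.xconfig.utils import Descriptor

# 'name@offset' is sugar for Offset(name, offset):
assert Descriptor("input@-3").str() == "Offset(input, -3)"

# '[-1]' refers to the most recent previous layer, '[-2]' to the one before it:
prev_names = ["last_but_one_layer", "prev_layer"]
assert Descriptor("[-1]", prev_names).str() == "prev_layer"

# a bare integer inside Append() is an offset of the previous layer:
assert (
    Descriptor("Append(-3,0,3)", prev_names).str()
    == "Append(Offset(prev_layer, -3), prev_layer, Offset(prev_layer, 3))"
)
```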
class Descriptor(object): - def __init__(self, - descriptor_string = None, - prev_names = None): + def __init__(self, descriptor_string=None, prev_names=None): # self.operator is a string that may be 'Offset', 'Append', # 'Sum', 'Failover', 'IfDefined', 'Offset', 'Switch', 'Round', # 'ReplaceIndex'; it also may be None, representing the base-case @@ -214,15 +233,20 @@ def __init__(self, # note: 'pos' should point to the 'end of string' marker # that terminates 'tokens'. if pos != len(tokens) - 1: - raise RuntimeError("Parsing Descriptor, saw junk at end: " + - ' '.join(tokens[pos:-1])) + raise RuntimeError( + "Parsing Descriptor, saw junk at end: " + + " ".join(tokens[pos:-1]) + ) # copy members from d. self.operator = d.operator self.items = d.items except RuntimeError as e: traceback.print_tb(sys.exc_info()[2]) - raise RuntimeError("Error parsing Descriptor '{0}', specific error was: {1}".format( - descriptor_string, repr(e))) + raise RuntimeError( + "Error parsing Descriptor '{0}', specific error was: {1}".format( + descriptor_string, repr(e) + ) + ) # This is like the str() function, but it uses the layer_to_string function # (which is a function from strings to strings) to convert layer names (or @@ -236,9 +260,19 @@ def config_string(self, layer_to_string): return layer_to_string(self.items[0]) else: assert isinstance(self.operator, str) - return self.operator + '(' + ', '.join( - [ item.config_string(layer_to_string) if isinstance(item, Descriptor) else str(item) - for item in self.items]) + ')' + return ( + self.operator + + "(" + + ", ".join( + [ + item.config_string(layer_to_string) + if isinstance(item, Descriptor) + else str(item) + for item in self.items + ] + ) + + ")" + ) def str(self): if self.operator is None: @@ -246,7 +280,12 @@ def str(self): return self.items[0] else: assert isinstance(self.operator, str) - return self.operator + '(' + ', '.join([str(item) for item in self.items]) + ')' + return ( + self.operator + + "(" + + ", ".join([str(item) for item in self.items]) + + ")" + ) def __str__(self): return self.str() @@ -263,43 +302,51 @@ def dim(self, layer_to_dim): # base-case: self.items = [ layer_name ] (or sub-layer name, like # 'lstm.memory_cell'). return layer_to_dim(self.items[0]) - elif self.operator in [ 'Sum', 'Failover', 'IfDefined', 'Switch' ]: + elif self.operator in ["Sum", "Failover", "IfDefined", "Switch"]: # these are all operators for which all args are descriptors # and must have the same dim. dim = self.items[0].dim(layer_to_dim) for desc in self.items[1:]: next_dim = desc.dim(layer_to_dim) if next_dim != dim: - raise RuntimeError("In descriptor {0}, different fields have different " - "dimensions: {1} != {2}".format(self.str(), dim, next_dim)) + raise RuntimeError( + "In descriptor {0}, different fields have different " + "dimensions: {1} != {2}".format(self.str(), dim, next_dim) + ) return dim - elif self.operator in [ 'Offset', 'Round', 'ReplaceIndex' ]: + elif self.operator in ["Offset", "Round", "ReplaceIndex"]: # for these operators, only the 1st arg is relevant. return self.items[0].dim(layer_to_dim) - elif self.operator == 'Append': - return sum([ x.dim(layer_to_dim) for x in self.items]) - elif self.operator == 'Scale': + elif self.operator == "Append": + return sum([x.dim(layer_to_dim) for x in self.items]) + elif self.operator == "Scale": # e.g. Scale(2.0, lstm1). Return dim of 2nd arg. return self.items[1].dim(layer_to_dim) - elif self.operator == 'Const': + elif self.operator == "Const": # e.g. Const(0.5, 512). 
Return 2nd arg, which is an int. return self.items[1] else: raise RuntimeError("Unknown operator {0}".format(self.operator)) - # This just checks that seen_item == expected_item, and raises an # exception if not. def expect_token(expected_item, seen_item, what_parsing): if seen_item != expected_item: - raise RuntimeError("parsing {0}, expected '{1}' but got '{2}'".format( - what_parsing, expected_item, seen_item)) + raise RuntimeError( + "parsing {0}, expected '{1}' but got '{2}'".format( + what_parsing, expected_item, seen_item + ) + ) + # returns true if 'name' is valid as the name of a line (input, layer or output); # this is the same as IsValidname() in the nnet3 code. def is_valid_line_name(name): - return isinstance(name, str) and re.match(r'^[a-zA-Z_][-a-zA-Z_0-9.]*', name) != None + return ( + isinstance(name, str) and re.match(r"^[a-zA-Z_][-a-zA-Z_0-9.]*", name) != None + ) + # This function for parsing Descriptors takes an array of tokens as produced # by tokenize_descriptor. It parses a descriptor @@ -319,56 +366,80 @@ def parse_new_descriptor(tokens, pos, prev_names): # when reading this function, be careful to note the indent level, # there is an if-statement within an if-statement. - if first_token in [ 'Offset', 'Round', 'ReplaceIndex', 'Append', 'Sum', - 'Switch', 'Failover', 'IfDefined' ]: - expect_token('(', tokens[pos], first_token + '()') + if first_token in [ + "Offset", + "Round", + "ReplaceIndex", + "Append", + "Sum", + "Switch", + "Failover", + "IfDefined", + ]: + expect_token("(", tokens[pos], first_token + "()") pos += 1 d.operator = first_token # the 1st argument of all these operators is a Descriptor. (desc, pos) = parse_new_descriptor(tokens, pos, prev_names) d.items = [desc] - if first_token == 'Offset': - expect_token(',', tokens[pos], 'Offset()') + if first_token == "Offset": + expect_token(",", tokens[pos], "Offset()") pos += 1 try: t_offset = int(tokens[pos]) pos += 1 d.items.append(t_offset) except: - raise RuntimeError("Parsing Offset(), expected integer, got " + tokens[pos]) - if tokens[pos] == ')': + raise RuntimeError( + "Parsing Offset(), expected integer, got " + tokens[pos] + ) + if tokens[pos] == ")": return (d, pos + 1) - elif tokens[pos] != ',': - raise RuntimeError("Parsing Offset(), expected ')' or ',', got " + tokens[pos]) + elif tokens[pos] != ",": + raise RuntimeError( + "Parsing Offset(), expected ')' or ',', got " + tokens[pos] + ) pos += 1 try: x_offset = int(tokens[pos]) pos += 1 d.items.append(x_offset) except: - raise RuntimeError("Parsing Offset(), expected integer, got " + tokens[pos]) - expect_token(')', tokens[pos], 'Offset()') + raise RuntimeError( + "Parsing Offset(), expected integer, got " + tokens[pos] + ) + expect_token(")", tokens[pos], "Offset()") pos += 1 - elif first_token in [ 'Append', 'Sum', 'Switch', 'Failover', 'IfDefined' ]: + elif first_token in ["Append", "Sum", "Switch", "Failover", "IfDefined"]: while True: - if tokens[pos] == ')': + if tokens[pos] == ")": # check num-items is correct for some special cases. 
-                    if first_token == 'Failover' and len(d.items) != 2:
-                        raise RuntimeError("Parsing Failover(), expected 2 items but got {0}".format(len(d.items)))
-                    if first_token == 'IfDefined' and len(d.items) != 1:
-                        raise RuntimeError("Parsing IfDefined(), expected 1 item but got {0}".format(len(d.items)))
+                    if first_token == "Failover" and len(d.items) != 2:
+                        raise RuntimeError(
+                            "Parsing Failover(), expected 2 items but got {0}".format(
+                                len(d.items)
+                            )
+                        )
+                    if first_token == "IfDefined" and len(d.items) != 1:
+                        raise RuntimeError(
+                            "Parsing IfDefined(), expected 1 item but got {0}".format(
+                                len(d.items)
+                            )
+                        )
                     pos += 1
                     break
-                elif tokens[pos] == ',':
+                elif tokens[pos] == ",":
                     pos += 1  # consume the comma.
                 else:
-                    raise RuntimeError("Parsing Append(), expected ')' or ',', got " + tokens[pos])
+                    raise RuntimeError(
+                        "Parsing "
+                        + first_token
+                        + "(), expected ')' or ',', got "
+                        + tokens[pos]
+                    )
                 (desc, pos) = parse_new_descriptor(tokens, pos, prev_names)
                 d.items.append(desc)
-        elif first_token == 'Round':
-            expect_token(',', tokens[pos], 'Round()')
+        elif first_token == "Round":
+            expect_token(",", tokens[pos], "Round()")
             pos += 1
             try:
                 t_modulus = int(tokens[pos])
@@ -376,33 +447,38 @@ def parse_new_descriptor(tokens, pos, prev_names):
                 pos += 1
                 d.items.append(t_modulus)
             except:
-                raise RuntimeError("Parsing Offset(), expected integer, got " + tokens[pos])
-            expect_token(')', tokens[pos], 'Round()')
+                raise RuntimeError(
+                    "Parsing Round(), expected integer, got " + tokens[pos]
+                )
+            expect_token(")", tokens[pos], "Round()")
             pos += 1
-        elif first_token == 'ReplaceIndex':
-            expect_token(',', tokens[pos], 'ReplaceIndex()')
+        elif first_token == "ReplaceIndex":
+            expect_token(",", tokens[pos], "ReplaceIndex()")
             pos += 1
-            if tokens[pos] in [ 'x', 't' ]:
+            if tokens[pos] in ["x", "t"]:
                 d.items.append(tokens[pos])
                 pos += 1
             else:
-                raise RuntimeError("Parsing ReplaceIndex(), expected 'x' or 't', got " +
-                                   tokens[pos])
-            expect_token(',', tokens[pos], 'ReplaceIndex()')
+                raise RuntimeError(
+                    "Parsing ReplaceIndex(), expected 'x' or 't', got " + tokens[pos]
+                )
+            expect_token(",", tokens[pos], "ReplaceIndex()")
             pos += 1
             try:
                 new_value = int(tokens[pos])
                 pos += 1
                 d.items.append(new_value)
             except:
-                raise RuntimeError("Parsing Offset(), expected integer, got " + tokens[pos])
-            expect_token(')', tokens[pos], 'ReplaceIndex()')
+                raise RuntimeError(
+                    "Parsing ReplaceIndex(), expected integer, got " + tokens[pos]
+                )
+            expect_token(")", tokens[pos], "ReplaceIndex()")
             pos += 1
         else:
             raise RuntimeError("code error")
-    elif first_token in ['Scale', 'Const' ]:
+    elif first_token in ["Scale", "Const"]:
         # Parsing something like 'Scale(2.0, lstm1)' or 'Const(1.0, 512)'
-        expect_token('(', tokens[pos], first_token + '()')
+        expect_token("(", tokens[pos], first_token + "()")
         pos += 1
         d.operator = first_token
         # First arg of Scale() and Const() is a float: the scale or value,
         try:
             value = float(tokens[pos])
             pos += 1
             d.items = [value]
         except:
-            raise RuntimeError("Parsing {0}, expected float, got {1}".format(
-                first_token, tokens[pos]))
+            raise RuntimeError(
+                "Parsing {0}, expected float, got {1}".format(first_token, tokens[pos])
+            )
         # Consume the comma.
-        expect_token(',', tokens[pos], first_token + '()')
+        expect_token(",", tokens[pos], first_token + "()")
         pos += 1
-        if first_token == 'Scale':
+        if first_token == "Scale":
             # Second arg of Scale() is a Descriptor.
(desc, pos) = parse_new_descriptor(tokens, pos, prev_names)
             d.items.append(desc)
         else:
-            assert first_token == 'Const'
+            assert first_token == "Const"
             try:
                 dim = int(tokens[pos])
                 pos += 1
                 d.items.append(dim)
             except:
-                raise RuntimeError("Parsing Const() expression, expected int, got {0}".format(
-                    tokens[pos]))
-        expect_token(')', tokens[pos], first_token)
+                raise RuntimeError(
+                    "Parsing Const() expression, expected int, got {0}".format(
+                        tokens[pos]
+                    )
+                )
+        expect_token(")", tokens[pos], first_token)
         pos += 1
-    elif first_token in [ 'end of string', '(', ')', ',', '@' ]:
+    elif first_token in ["end of string", "(", ")", ",", "@"]:
         raise RuntimeError("Expected descriptor, got " + first_token)
-    elif is_valid_line_name(first_token) or first_token == '[':
+    elif is_valid_line_name(first_token) or first_token == "[":
         # This section parses a raw input/layer/output name, e.g. "affine2"
         # (which must start with an alphabetic character or underscore),
         # optionally followed by an offset like '@-3'.
@@ -445,20 +525,21 @@ def parse_new_descriptor(tokens, pos, prev_names):
         # If the layer-name is followed by '@', then
         # we're parsing something like 'affine1@-3' which
         # is syntactic sugar for 'Offset(affine1, -3)'.
-        if tokens[pos] == '@':
+        if tokens[pos] == "@":
             pos += 1
             try:
                 offset_t = int(tokens[pos])
                 pos += 1
             except:
-                raise RuntimeError("Parse error parsing {0}@{1}".format(
-                    first_token, tokens[pos]))
+                raise RuntimeError(
+                    "Parse error parsing {0}@{1}".format(first_token, tokens[pos])
+                )
             if offset_t != 0:
                 inner_d = d
                 d = Descriptor()
                 # e.g. foo@3 is equivalent to 'Offset(foo, 3)'.
-                d.operator = 'Offset'
-                d.items = [ inner_d, offset_t ]
+                d.operator = "Offset"
+                d.items = [inner_d, offset_t]
     else:
         # the last possible case is that 'first_token' is just an integer i,
         # which can appear in things like Append(-3, 0, 3).
         d.operator = None
         try:
             offset_t = int(first_token)
         except:
-            raise RuntimeError("Parsing descriptor, expected descriptor but got " +
-                               first_token)
+            raise RuntimeError(
+                "Parsing descriptor, expected descriptor but got " + first_token
+            )
         assert isinstance(prev_names, list)
         if len(prev_names) < 1:
-            raise RuntimeError("Parsing descriptor, could not interpret '{0}' because "
-                               "there is no previous layer".format(first_token))
+            raise RuntimeError(
+                "Parsing descriptor, could not interpret '{0}' because "
+                "there is no previous layer".format(first_token)
+            )
         d.operator = None
         # the layer name is the name of the most recent layer.
         d.items = [prev_names[-1]]
         if offset_t != 0:
             inner_d = d
             d = Descriptor()
-            d.operator = 'Offset'
-            d.items = [ inner_d, offset_t ]
+            d.operator = "Offset"
+            d.items = [inner_d, offset_t]

     return (d, pos)

@@ -494,33 +578,39 @@ def parse_new_descriptor(tokens, pos, prev_names):
 # It will throw an exception if the number is out of range.
 # If there are no such expressions in the string, it's OK if
 # prev_names == None (this is useful for testing).
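To make the bracket-replacement behaviour concrete before the function itself, here is a small usage sketch (same assumed `libs.nnet3.xconfig.utils` import path as above; the expected output follows directly from the implementation below).

```python
from libs.nnet3.xconfig.utils import replace_bracket_expressions_in_descriptor

# '[-1]' becomes the last name in prev_names, '[-2]' the one before it:
prev_names = ["a", "b"]
assert (
    replace_bracket_expressions_in_descriptor("Append([-1], [-2])", prev_names)
    == "Append(b, a)"
)
```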
-def replace_bracket_expressions_in_descriptor(descriptor_string, - prev_names = None): - fields = re.split(r'(\[|\])\s*', descriptor_string) +def replace_bracket_expressions_in_descriptor(descriptor_string, prev_names=None): + fields = re.split(r"(\[|\])\s*", descriptor_string) out_fields = [] i = 0 while i < len(fields): f = fields[i] i += 1 - if f == ']': + if f == "]": raise RuntimeError("Unmatched ']' in descriptor") - elif f == '[': + elif f == "[": if i + 2 >= len(fields): - raise RuntimeError("Error tokenizing string '{0}': '[' found too close " - "to the end of the descriptor.".format(descriptor_string)) + raise RuntimeError( + "Error tokenizing string '{0}': '[' found too close " + "to the end of the descriptor.".format(descriptor_string) + ) assert isinstance(prev_names, list) try: offset = int(fields[i]) assert offset < 0 and -offset <= len(prev_names) i += 2 # consume the int and the ']'. except: - raise RuntimeError("Error tokenizing string '{0}': expression [{1}] has an " - "invalid or out of range offset.".format(descriptor_string, fields[i])) + raise RuntimeError( + "Error tokenizing string '{0}': expression [{1}] has an " + "invalid or out of range offset.".format( + descriptor_string, fields[i] + ) + ) this_field = prev_names[offset] out_fields.append(this_field) else: out_fields.append(f) - return ''.join(out_fields) + return "".join(out_fields) + # tokenizes 'descriptor_string' into the tokens that may be part of Descriptors. # Note: for convenience in parsing, we add the token 'end-of-string' to this @@ -535,22 +625,22 @@ def replace_bracket_expressions_in_descriptor(descriptor_string, # tokenize_descriptor('Append(-1, 0, 1, [-2]@0)', prev_names = ['a', 'b', 'c', 'd']) # the [-2] would get replaced with prev_names[-2] = 'c', returning: # [ 'Append', '(', '-1', ',', '0', ',', '1', ',', 'c', '@', '0', ')' ] -def tokenize_descriptor(descriptor_string, - prev_names = None): +def tokenize_descriptor(descriptor_string, prev_names=None): # split on '(', ')', ',', '@', and space. Note: the parenthesis () in the # regexp causes it to output the stuff inside the () as if it were a field, # which is how the call to re.split() keeps characters like '(' and ')' as # tokens. - fields = re.split(r'(\(|\)|@|,|\s)\s*', - replace_bracket_expressions_in_descriptor(descriptor_string, - prev_names)) + fields = re.split( + r"(\(|\)|@|,|\s)\s*", + replace_bracket_expressions_in_descriptor(descriptor_string, prev_names), + ) ans = [] for f in fields: # don't include fields that are space, or are empty. - if re.match(r'^\s*$', f) is None: + if re.match(r"^\s*$", f) is None: ans.append(f) - ans.append('end of string') + ans.append("end of string") return ans @@ -569,31 +659,36 @@ def parse_config_line(orig_config_line): # Remove comments. # note: splitting on '#' will always give at least one field... python # treats splitting on space as a special case that may give zero fields. - config_line = orig_config_line.split('#')[0] + config_line = orig_config_line.split("#")[0] # Note: this set of allowed characters may have to be expanded in future. 
x = re.search('[^a-zA-Z0-9\.\-\(\)@_=,/+:\s"]', config_line) if x is not None: bad_char = x.group(0) if bad_char == "'": - raise RuntimeError("Xconfig line has disallowed character ' (use " - "double quotes for strings containing = signs)") + raise RuntimeError( + "Xconfig line has disallowed character ' (use " + "double quotes for strings containing = signs)" + ) else: - raise RuntimeError("Xconfig line has disallowed character: {0}" - .format(bad_char)) + raise RuntimeError( + "Xconfig line has disallowed character: {0}".format(bad_char) + ) # Now split on space; later we may splice things back together. - fields=config_line.split() + fields = config_line.split() if len(fields) == 0: - return None # Line was only whitespace after removing comments. + return None # Line was only whitespace after removing comments. first_token = fields[0] # if first_token does not look like 'foo-bar' or 'foo-bar2', then die. - if re.match('^[a-z][-a-z0-9]+$', first_token) is None: - raise RuntimeError("Error parsing config line (first field doesn't look right).") + if re.match("^[a-z][-a-z0-9]+$", first_token) is None: + raise RuntimeError( + "Error parsing config line (first field doesn't look right)." + ) # get rid of the first field which we put in 'first_token'. fields = fields[1:] - rest_of_line = ' '.join(fields) + rest_of_line = " ".join(fields) # rest of the line can be of the form 'a=1 b=" x=1 y=2 " c=Append( i1, i2)' positions = [x.start() for x in re.finditer('"', rest_of_line)] if not len(positions) % 2 == 0: @@ -612,11 +707,13 @@ def parse_config_line(orig_config_line): end = positions[i * 2 + 1] line_before_start = rest_of_line[:start] - inside_quotes=rest_of_line[start+1:end].replace('=', '?') - line_after_end = rest_of_line[end + 1:] + inside_quotes = rest_of_line[start + 1 : end].replace("=", "?") + line_after_end = rest_of_line[end + 1 :] # the reason why we include the spaces here, is to keep the length of # rest_of_line the same, and the positions in 'positions' valid. 
-        new_rest_of_line = line_before_start + ' ' + inside_quotes + ' ' + line_after_end
+        new_rest_of_line = (
+            line_before_start + " " + inside_quotes + " " + line_after_end
+        )
         assert len(new_rest_of_line) == len(rest_of_line)
         rest_of_line = new_rest_of_line

@@ -624,69 +721,95 @@
     # then after the below we'll get
     # fields = ['', 'input', 'Append(foo, bar)', 'foo', 'bar']
     ans_dict = dict()
-    other_fields = re.split(r'\s*([-a-zA-Z0-9_]*)=', rest_of_line)
-    if not (other_fields[0] == '' and len(other_fields) % 2 == 1):
-        raise RuntimeError("Could not parse config line.");
+    other_fields = re.split(r"\s*([-a-zA-Z0-9_]*)=", rest_of_line)
+    if not (other_fields[0] == "" and len(other_fields) % 2 == 1):
+        raise RuntimeError("Could not parse config line.")
     fields += other_fields[1:]
     num_variables = len(fields) // 2
     for i in range(num_variables):
         var_name = fields[i * 2]
         var_value = fields[i * 2 + 1]
-        if re.match(r'[a-zA-Z_]', var_name) is None:
-            raise RuntimeError("Expected variable name '{0}' to start with alphabetic character or _, "
-                               "in config line {1}".format(var_name, orig_config_line))
+        if re.match(r"[a-zA-Z_]", var_name) is None:
+            raise RuntimeError(
+                "Expected variable name '{0}' to start with alphabetic character or _, "
+                "in config line {1}".format(var_name, orig_config_line)
+            )
         if var_name in ans_dict:
-            raise RuntimeError("Config line has multiply defined variable {0}: {1}".format(
-                var_name, orig_config_line))
+            raise RuntimeError(
+                "Config line has multiply defined variable {0}: {1}".format(
+                    var_name, orig_config_line
+                )
+            )
         # Replace any '?' characters that we inserted above, with the original
         # '=' characters.
         # The 'strip()' is to remove initial and final spaces that we might
         # have inserted while processing double-quotes above (search above
         # for the string 'inside_quotes' to see what is meant by this).
- ans_dict[var_name] = var_value.replace('?', '=').strip() + ans_dict[var_name] = var_value.replace("?", "=").strip() return (first_token, ans_dict) def test_library(): tokenize_test = lambda x: tokenize_descriptor(x)[:-1] # remove 'end of string' - assert tokenize_test("hi") == ['hi'] - assert tokenize_test("hi there") == ['hi', 'there'] - assert tokenize_test("hi,there") == ['hi', ',', 'there'] - assert tokenize_test("hi@-1,there") == ['hi', '@', '-1', ',', 'there'] - assert tokenize_test("hi(there)") == ['hi', '(', 'there', ')'] - assert tokenize_descriptor("[-1]@2", ['foo', 'bar'])[:-1] == ['bar', '@', '2' ] - assert tokenize_descriptor("[-2].special@2", ['foo', 'bar'])[:-1] == ['foo.special', '@', '2' ] - - assert Descriptor('foo').str() == 'foo' - assert Descriptor('Sum(foo,bar)').str() == 'Sum(foo, bar)' - assert Descriptor('Sum(Offset(foo,1),Offset(foo,0))').str() == 'Sum(Offset(foo, 1), Offset(foo, 0))' - for x in [ 'Append(foo, Sum(bar, Offset(baz, 1)))', 'Failover(foo, Offset(bar, -1))', - 'IfDefined(Round(baz, 3))', 'Switch(foo1, Offset(foo2, 2), Offset(foo3, 3))', - 'IfDefined(ReplaceIndex(ivector, t, 0))', 'ReplaceIndex(foo, x, 0)' ]: + assert tokenize_test("hi") == ["hi"] + assert tokenize_test("hi there") == ["hi", "there"] + assert tokenize_test("hi,there") == ["hi", ",", "there"] + assert tokenize_test("hi@-1,there") == ["hi", "@", "-1", ",", "there"] + assert tokenize_test("hi(there)") == ["hi", "(", "there", ")"] + assert tokenize_descriptor("[-1]@2", ["foo", "bar"])[:-1] == ["bar", "@", "2"] + assert tokenize_descriptor("[-2].special@2", ["foo", "bar"])[:-1] == [ + "foo.special", + "@", + "2", + ] + + assert Descriptor("foo").str() == "foo" + assert Descriptor("Sum(foo,bar)").str() == "Sum(foo, bar)" + assert ( + Descriptor("Sum(Offset(foo,1),Offset(foo,0))").str() + == "Sum(Offset(foo, 1), Offset(foo, 0))" + ) + for x in [ + "Append(foo, Sum(bar, Offset(baz, 1)))", + "Failover(foo, Offset(bar, -1))", + "IfDefined(Round(baz, 3))", + "Switch(foo1, Offset(foo2, 2), Offset(foo3, 3))", + "IfDefined(ReplaceIndex(ivector, t, 0))", + "ReplaceIndex(foo, x, 0)", + ]: if not Descriptor(x).str() == x: print("Error: '{0}' != '{1}'".format(Descriptor(x).str(), x)) - prev_names = ['last_but_one_layer', 'prev_layer'] - for x, y in [ ('Sum(foo,bar)', 'Sum(foo, bar)'), - ('Sum(foo1,bar-3_4)', 'Sum(foo1, bar-3_4)'), - ('Append(input@-3, input@0, input@3)', - 'Append(Offset(input, -3), input, Offset(input, 3))'), - ('Append(-3,0,3)', - 'Append(Offset(prev_layer, -3), prev_layer, Offset(prev_layer, 3))'), - ('[-1]', 'prev_layer'), - ('Scale(2.0,foo)', 'Scale(2.0, foo)'), - ('Const(0.5,500)', 'Const(0.5, 500)'), - ('[-2]', 'last_but_one_layer'), - ('[-2]@3', - 'Offset(last_but_one_layer, 3)') ]: + prev_names = ["last_but_one_layer", "prev_layer"] + for x, y in [ + ("Sum(foo,bar)", "Sum(foo, bar)"), + ("Sum(foo1,bar-3_4)", "Sum(foo1, bar-3_4)"), + ( + "Append(input@-3, input@0, input@3)", + "Append(Offset(input, -3), input, Offset(input, 3))", + ), + ( + "Append(-3,0,3)", + "Append(Offset(prev_layer, -3), prev_layer, Offset(prev_layer, 3))", + ), + ("[-1]", "prev_layer"), + ("Scale(2.0,foo)", "Scale(2.0, foo)"), + ("Const(0.5,500)", "Const(0.5, 500)"), + ("[-2]", "last_but_one_layer"), + ("[-2]@3", "Offset(last_but_one_layer, 3)"), + ]: if not Descriptor(x, prev_names).str() == y: print("Error: '{0}' != '{1}'".format(Descriptor(x).str(), y)) + print(parse_config_line("affine-layer input=Append(foo, bar) foo=bar")) + print( + parse_config_line( + 'affine-layer x="y z" input=Append(foo, bar) 
foo=bar opt2="a=1 b=2"'
+        )
+    )
+    print(parse_config_line("affine-layer1 input=Append(foo, bar) foo=bar"))
+    print(parse_config_line("affine-layer"))

-    print(parse_config_line('affine-layer input=Append(foo, bar) foo=bar'))
-    print(parse_config_line('affine-layer x="y z" input=Append(foo, bar) foo=bar opt2="a=1 b=2"'))
-    print(parse_config_line('affine-layer1 input=Append(foo, bar) foo=bar'))
-    print(parse_config_line('affine-layer'))

 if __name__ == "__main__":
     test_library()
diff --git a/egs2/TEMPLATE/asr1/steps/segmentation/convert_utt2spk_and_segments_to_rttm.py b/egs2/TEMPLATE/asr1/steps/segmentation/convert_utt2spk_and_segments_to_rttm.py
index e2a76d1a830..81c9b68cbfe 100755
--- a/egs2/TEMPLATE/asr1/steps/segmentation/convert_utt2spk_and_segments_to_rttm.py
+++ b/egs2/TEMPLATE/asr1/steps/segmentation/convert_utt2spk_and_segments_to_rttm.py
@@ -26,27 +26,28 @@
 import argparse
 import sys

-sys.path.insert(0, 'steps')
+sys.path.insert(0, "steps")
 import libs.common as common_lib


 def get_args():
     parser = argparse.ArgumentParser(
         description="""This script converts kaldi-style utt2spk and
-        segments to a NIST RTTM file""")
-
-    parser.add_argument("--reco2file-and-channel", type=str,
-                        action=common_lib.NullstrToNoneAction,
-                        help="""Input reco2file_and_channel.
+        segments to a NIST RTTM file"""
+    )
+
+    parser.add_argument(
+        "--reco2file-and-channel",
+        type=str,
+        action=common_lib.NullstrToNoneAction,
+        help="""Input reco2file_and_channel.
         The format is <recording-id> <file-id> <channel-id>.
         If not provided, then <recording-id> is taken as the
-        <file-id> with <channel-id> = 1.""")
-    parser.add_argument("utt2spk", type=str,
-                        help="Input utt2spk file")
-    parser.add_argument("segments", type=str,
-                        help="Input segments file")
-    parser.add_argument("rttm_file", type=str,
-                        help="Output RTTM file")
+        <file-id> with <channel-id> = 1.""",
+    )
+    parser.add_argument("utt2spk", type=str, help="Input utt2spk file")
+    parser.add_argument("segments", type=str, help="Input segments file")
+    parser.add_argument("rttm_file", type=str, help="Output RTTM file")

     args = parser.parse_args()
     return args
@@ -68,8 +69,9 @@ def main():
             parts = line.strip().split()
             utt2spk[parts[0]] = parts[1]

-    with common_lib.smart_open(args.segments) as segments_reader, \
-            common_lib.smart_open(args.rttm_file, 'w') as rttm_writer:
+    with common_lib.smart_open(args.segments) as segments_reader, common_lib.smart_open(
+        args.rttm_file, "w"
+    ) as rttm_writer:
         for line in segments_reader:
             parts = line.strip().split()

@@ -86,16 +88,21 @@ def main():
             except KeyError:
                 raise RuntimeError(
                     "Could not find recording {0} in {1}".format(
-                        reco, args.reco2file_and_channel))
+                        reco, args.reco2file_and_channel
+                    )
+                )

             start_time = float(parts[2])
             duration = float(parts[3]) - start_time
-            print("SPEAKER {0} {1} {2:7.2f} {3:7.2f} "
-                  "<NA> <NA> {4} <NA> <NA>".format(
-                      file_id, channel, start_time,
-                      duration, spkr), file=rttm_writer)
+            print(
+                "SPEAKER {0} {1} {2:7.2f} {3:7.2f} "
+                "<NA> <NA> {4} <NA> <NA>".format(
+                    file_id, channel, start_time, duration, spkr
+                ),
+                file=rttm_writer,
+            )

-if __name__ == '__main__':
+if __name__ == "__main__":
     main()
diff --git a/egs2/TEMPLATE/asr1/utils/data/extend_segment_times.py b/egs2/TEMPLATE/asr1/utils/data/extend_segment_times.py
index 0b498b5ebe8..55acaf68f8e 100755
--- a/egs2/TEMPLATE/asr1/utils/data/extend_segment_times.py
+++ b/egs2/TEMPLATE/asr1/utils/data/extend_segment_times.py
@@ -6,25 +6,42 @@

 from collections import defaultdict

-parser = argparse.ArgumentParser(description="""
+parser = argparse.ArgumentParser(
+    description="""
 Usage: extend_segment_times.py [options] <input-segments >output-segments

 This program pads the times in
a 'segments' file (e.g. data/train/segments) with specified left and right context (for cases where there was no - silence padding in the original segments file)""") - -parser.add_argument("--start-padding", type = float, default = 0.1, - help="Amount of padding, in seconds, for the start time of " - "each segment (start times <0 will be set to zero).") -parser.add_argument("--end-padding", type = float, default = 0.1, - help="Amount of padding, in seconds, for the end time of " - "each segment.") -parser.add_argument("--last-segment-end-padding", type = float, default = 0.1, - help="Amount of padding, in seconds, for the end time of " - "the last segment of each file (maximum allowed).") -parser.add_argument("--fix-overlapping-segments", type = str, - default = 'true', choices=['true', 'false'], - help="If true, prevent segments from overlapping as a result " - "of the padding (or that were already overlapping)") + silence padding in the original segments file)""" +) + +parser.add_argument( + "--start-padding", + type=float, + default=0.1, + help="Amount of padding, in seconds, for the start time of " + "each segment (start times <0 will be set to zero).", +) +parser.add_argument( + "--end-padding", + type=float, + default=0.1, + help="Amount of padding, in seconds, for the end time of " "each segment.", +) +parser.add_argument( + "--last-segment-end-padding", + type=float, + default=0.1, + help="Amount of padding, in seconds, for the end time of " + "the last segment of each file (maximum allowed).", +) +parser.add_argument( + "--fix-overlapping-segments", + type=str, + default="true", + choices=["true", "false"], + help="If true, prevent segments from overlapping as a result " + "of the padding (or that were already overlapping)", +) args = parser.parse_args() @@ -48,17 +65,18 @@ while True: line = sys.stdin.readline() - if line == '': + if line == "": break try: - [ utt_id, recording_id, start_time, end_time ] = line.split() + [utt_id, recording_id, start_time, end_time] = line.split() start_time = float(start_time) end_time = float(end_time) except: sys.exit("extend_segment_times.py: could not interpret line: " + line) if not end_time > start_time: - print("extend_segment_times.py: bad segment (ignoring): " + line, - file = sys.stderr) + print( + "extend_segment_times.py: bad segment (ignoring): " + line, file=sys.stderr + ) recording_to_utt_indexes[recording_id].append(len(entries)) entries.append([utt_id, recording_id, start_time, end_time]) @@ -68,10 +86,11 @@ # this_entries is a list of lists, sorted on mid-time. # Notice: because lists are objects, when we change 'this_entries' # we change the underlying entries. 
- this_entries = sorted([ entries[x] for x in utt_indexes ], - key = lambda x : 0.5 * (x[2] + x[3])) + this_entries = sorted( + [entries[x] for x in utt_indexes], key=lambda x: 0.5 * (x[2] + x[3]) + ) min_time = 0 - max_time = max([ x[3] for x in this_entries ]) + args.last_segment_end_padding + max_time = max([x[3] for x in this_entries]) + args.last_segment_end_padding start_padding = args.start_padding end_padding = args.end_padding for n in range(len(this_entries)): @@ -80,39 +99,44 @@ for n in range(len(this_entries) - 1): this_end_time = this_entries[n][3] - next_start_time = this_entries[n+1][2] - if this_end_time > next_start_time and args.fix_overlapping_segments == 'true': + next_start_time = this_entries[n + 1][2] + if this_end_time > next_start_time and args.fix_overlapping_segments == "true": midpoint = 0.5 * (this_end_time + next_start_time) this_entries[n][3] = midpoint - this_entries[n+1][2] = midpoint + this_entries[n + 1][2] = midpoint num_times_fixed += 1 # this prints a number with a certain number of digits after # the point, while removing trailing zeros. def FloatToString(f): - num_digits = 6 # we want to print 6 digits after the zero + num_digits = 6 # we want to print 6 digits after the zero g = f while abs(g) > 1.0: g *= 0.1 num_digits += 1 - format_str = '%.{0}g'.format(num_digits) + format_str = "%.{0}g".format(num_digits) return format_str % f + for entry in entries: - [ utt_id, recording_id, start_time, end_time ] = entry + [utt_id, recording_id, start_time, end_time] = entry if not start_time < end_time: - print("extend_segment_times.py: bad segment after processing (ignoring): " + - ' '.join(entry), file = sys.stderr) + print( + "extend_segment_times.py: bad segment after processing (ignoring): " + + " ".join(entry), + file=sys.stderr, + ) continue print(utt_id, recording_id, FloatToString(start_time), FloatToString(end_time)) -print("extend_segment_times.py: extended {0} segments; fixed {1} " - "overlapping segments".format(len(entries), num_times_fixed), - file = sys.stderr) +print( + "extend_segment_times.py: extended {0} segments; fixed {1} " + "overlapping segments".format(len(entries), num_times_fixed), + file=sys.stderr, +) ## test: # (echo utt1 reco1 0.2 6.2; echo utt2 reco1 6.3 9.8 )| extend_segment_times.py # and also try the above with the options --last-segment-end-padding=0.0 --fix-overlapping-segments=false - diff --git a/egs2/TEMPLATE/tts1/README.md b/egs2/TEMPLATE/tts1/README.md index 9d6cbbe77f4..412545b0551 100644 --- a/egs2/TEMPLATE/tts1/README.md +++ b/egs2/TEMPLATE/tts1/README.md @@ -18,7 +18,7 @@ This is a template of TTS recipe for ESPnet2. * [How to run](#how-to-run) * [FastSpeech training](#fastspeech-training) * [FastSpeech2 training](#fastspeech2-training) - * [Multi speaker model with X-vector training](#multi-speaker-model-with-x-vector-training) + * [Multi speaker model with speaker embedding training](#multi-speaker-model-with-speaker-embedding-training) * [Multi speaker model with speaker ID embedding training](#multi-speaker-model-with-speaker-id-embedding-training) * [Multi language model with language ID embedding training](#multi-language-model-with-language-id-embedding-training) * [VITS training](#vits-training) @@ -103,11 +103,12 @@ Then, you can continue the training on the main script: Wav dumping stage. This stage reformats `wav.scp` in data directories. -Additionally, We support X-vector extraction in this stage as you can use in ESPnet1. 
-If you specify `--use_xvector true` (Default: `use_xvector=false`), we extract X-vectors. -You can select the type of toolkit to use (kaldi, speechbrain, or espnet) when you specify `--xvector_tool