From e3ea940654b51aa7dc1d44ebb9766749ea33d794 Mon Sep 17 00:00:00 2001 From: Qiantong Xu Date: Fri, 11 Dec 2020 14:55:41 -0800 Subject: [PATCH] MLS release Summary: title Reviewed By: vineelpratap Differential Revision: D25506325 fbshipit-source-id: 2b788c95c7cfa75fb3b45d447b575085ec2088b7 --- recipes/mls/README.md | 81 +++++++++++++++++++++++++++++++ recipes/mls/decode/Dutch.cfg | 11 +++++ recipes/mls/decode/English.cfg | 11 +++++ recipes/mls/decode/French.cfg | 11 +++++ recipes/mls/decode/German.cfg | 11 +++++ recipes/mls/decode/Italian.cfg | 11 +++++ recipes/mls/decode/Polish.cfg | 11 +++++ recipes/mls/decode/Portuguese.cfg | 11 +++++ recipes/mls/decode/Spanish.cfg | 11 +++++ recipes/mls/train/Dutch.cfg | 39 +++++++++++++++ recipes/mls/train/English.cfg | 40 +++++++++++++++ recipes/mls/train/French.cfg | 39 +++++++++++++++ recipes/mls/train/German.cfg | 39 +++++++++++++++ recipes/mls/train/Italian.cfg | 39 +++++++++++++++ recipes/mls/train/Polish.cfg | 39 +++++++++++++++ recipes/mls/train/Portuguese.cfg | 39 +++++++++++++++ recipes/mls/train/Spanish.cfg | 39 +++++++++++++++ 17 files changed, 482 insertions(+) create mode 100644 recipes/mls/README.md create mode 100644 recipes/mls/decode/Dutch.cfg create mode 100644 recipes/mls/decode/English.cfg create mode 100644 recipes/mls/decode/French.cfg create mode 100644 recipes/mls/decode/German.cfg create mode 100644 recipes/mls/decode/Italian.cfg create mode 100644 recipes/mls/decode/Polish.cfg create mode 100644 recipes/mls/decode/Portuguese.cfg create mode 100644 recipes/mls/decode/Spanish.cfg create mode 100644 recipes/mls/train/Dutch.cfg create mode 100644 recipes/mls/train/English.cfg create mode 100644 recipes/mls/train/French.cfg create mode 100644 recipes/mls/train/German.cfg create mode 100644 recipes/mls/train/Italian.cfg create mode 100644 recipes/mls/train/Polish.cfg create mode 100644 recipes/mls/train/Portuguese.cfg create mode 100644 recipes/mls/train/Spanish.cfg diff --git a/recipes/mls/README.md 
b/recipes/mls/README.md new file mode 100644 index 00000000..8fb34705 --- /dev/null +++ b/recipes/mls/README.md @@ -0,0 +1,81 @@ +# Multilingual LibriSpeech (MLS) + +The Multilingual LibriSpeech (MLS) dataset is a large multilingual corpus suitable for speech research. The dataset is derived from read audiobooks from LibriVox and consists of 8 languages - English, German, Dutch, Spanish, French, Italian, Portuguese, Polish. It is released on [OpenSLR](http://openslr.org/). + +This directory contains the released pretrained monolingual models and the steps to reproduce the results. + + +## Dependencies + +- [flashlight](https://github.com/facebookresearch/flashlight) + + +## Tokens and Lexicons + +| Language | Token Set | Train Lexicon | Joint Lexicon (Train + GB) | +|:----------:|:-------------------------------------------------------------------------:|:----------------------------------------------------------------------------------:|:----------------------------------------------------------------------------------:| +| English | [TOKEN](s3://dl.fbaipublicfiles.com/wav2letter/mls/english/tokens.txt) | [Lexicon](s3://dl.fbaipublicfiles.com/wav2letter/mls/english/train_lexicon.txt) | [Lexicon](s3://dl.fbaipublicfiles.com/wav2letter/mls/english/joint_lexicon.txt) | +| German | [TOKEN](s3://dl.fbaipublicfiles.com/wav2letter/mls/german/tokens.txt) | [Lexicon](s3://dl.fbaipublicfiles.com/wav2letter/mls/german/train_lexicon.txt) | [Lexicon](s3://dl.fbaipublicfiles.com/wav2letter/mls/german/joint_lexicon.txt) | +| Dutch | [TOKEN](s3://dl.fbaipublicfiles.com/wav2letter/mls/dutch/tokens.txt) | [Lexicon](s3://dl.fbaipublicfiles.com/wav2letter/mls/dutch/train_lexicon.txt) | [Lexicon](s3://dl.fbaipublicfiles.com/wav2letter/mls/dutch/joint_lexicon.txt) | +| French | [TOKEN](s3://dl.fbaipublicfiles.com/wav2letter/mls/french/tokens.txt) | [Lexicon](s3://dl.fbaipublicfiles.com/wav2letter/mls/french/train_lexicon.txt) | 
[Lexicon](s3://dl.fbaipublicfiles.com/wav2letter/mls/french/joint_lexicon.txt) | +| Spanish | [TOKEN](s3://dl.fbaipublicfiles.com/wav2letter/mls/spanish/tokens.txt) | [Lexicon](s3://dl.fbaipublicfiles.com/wav2letter/mls/spanish/train_lexicon.txt) | [Lexicon](s3://dl.fbaipublicfiles.com/wav2letter/mls/spanish/joint_lexicon.txt) | +| Italian | [TOKEN](s3://dl.fbaipublicfiles.com/wav2letter/mls/italian/tokens.txt) | [Lexicon](s3://dl.fbaipublicfiles.com/wav2letter/mls/italian/train_lexicon.txt) | [Lexicon](s3://dl.fbaipublicfiles.com/wav2letter/mls/italian/joint_lexicon.txt) | +| Portuguese | [TOKEN](s3://dl.fbaipublicfiles.com/wav2letter/mls/portuguese/tokens.txt) | [Lexicon](s3://dl.fbaipublicfiles.com/wav2letter/mls/portuguese/train_lexicon.txt) | [Lexicon](s3://dl.fbaipublicfiles.com/wav2letter/mls/portuguese/joint_lexicon.txt) | +| Polish | [TOKEN](s3://dl.fbaipublicfiles.com/wav2letter/mls/polish/tokens.txt) | [Lexicon](s3://dl.fbaipublicfiles.com/wav2letter/mls/polish/train_lexicon.txt) | [Lexicon](s3://dl.fbaipublicfiles.com/wav2letter/mls/polish/joint_lexicon.txt) | + + +## Pre-trained acoustic models + +| Language | Architecture | Acoustic Model | +|:----------:|:----------------------------------------------------------------------:|:---------------------------------------------------------------------:| +| English | [Arch](s3://dl.fbaipublicfiles.com/wav2letter/mls/english/arch.txt) | [Model](s3://dl.fbaipublicfiles.com/wav2letter/mls/english/am.bin) | +| German | [Arch](s3://dl.fbaipublicfiles.com/wav2letter/mls/german/arch.txt) | [Model](s3://dl.fbaipublicfiles.com/wav2letter/mls/german/am.bin) | +| Dutch | [Arch](s3://dl.fbaipublicfiles.com/wav2letter/mls/dutch/arch.txt) | [Model](s3://dl.fbaipublicfiles.com/wav2letter/mls/dutch/am.bin) | +| French | [Arch](s3://dl.fbaipublicfiles.com/wav2letter/mls/french/arch.txt) | [Model](s3://dl.fbaipublicfiles.com/wav2letter/mls/french/am.bin) | +| Spanish | 
[Arch](s3://dl.fbaipublicfiles.com/wav2letter/mls/spanish/arch.txt) | [Model](s3://dl.fbaipublicfiles.com/wav2letter/mls/spanish/am.bin) | +| Italian | [Arch](s3://dl.fbaipublicfiles.com/wav2letter/mls/italian/arch.txt) | [Model](s3://dl.fbaipublicfiles.com/wav2letter/mls/italian/am.bin) | +| Portuguese | [Arch](s3://dl.fbaipublicfiles.com/wav2letter/mls/portuguese/arch.txt) | [Model](s3://dl.fbaipublicfiles.com/wav2letter/mls/portuguese/am.bin) | +| Polish | [Arch](s3://dl.fbaipublicfiles.com/wav2letter/mls/polish/arch.txt) | [Model](s3://dl.fbaipublicfiles.com/wav2letter/mls/polish/am.bin) | + + +## Pre-trained language models + +The `5-gram_lm.arpa` from the tarball should be used to decode each acoustic model. For faster loading, you can convert the ARPA files to the KenLM binary format by following the steps [here](https://kheafield.com/code/kenlm/estimation/). + +| Language | Language Model | +|:----------:|:--------------------------------------------------------------------:| +| English | [Model](https://dl.fbaipublicfiles.com/mls/mls_lm_english.tar.gz) | +| German | [Model](https://dl.fbaipublicfiles.com/mls/mls_lm_german.tar.gz) | +| Dutch | [Model](https://dl.fbaipublicfiles.com/mls/mls_lm_dutch.tar.gz) | +| French | [Model](https://dl.fbaipublicfiles.com/mls/mls_lm_french.tar.gz) | +| Spanish | [Model](https://dl.fbaipublicfiles.com/mls/mls_lm_spanish.tar.gz) | +| Italian | [Model](https://dl.fbaipublicfiles.com/mls/mls_lm_italian.tar.gz) | +| Portuguese | [Model](https://dl.fbaipublicfiles.com/mls/mls_lm_portuguese.tar.gz) | +| Polish | [Model](https://dl.fbaipublicfiles.com/mls/mls_lm_polish.tar.gz) | + + +## Usage + +### Training +``` +[...]/flashlight/build/bin/asr/fl_asr_train train --flagsfile=train/[Language].cfg --minloglevel=0 --logtostderr=1 +``` + +### Decoding +``` +[...]/flashlight/build/bin/asr/fl_asr_decode --flagsfile=decode/[Language].cfg +``` + +## Citation + +``` +@article{Pratap2020MLSAL, + title={MLS: A Large-Scale Multilingual Dataset for Speech Research}, + 
author={Vineel Pratap and Qiantong Xu and Anuroop Sriram and Gabriel Synnaeve and Ronan Collobert}, + journal={ArXiv}, + year={2020}, + volume={abs/2012.03411} +} +``` + +NOTE: We made a few updates to the MLS dataset after our INTERSPEECH paper was submitted, to include more hours of audio and to improve the quality of the transcripts. To avoid confusion (by having multiple versions), we are making **ONLY** one release with all the improvements included. For accurate dataset statistics and baselines, please refer to the arXiv paper above. diff --git a/recipes/mls/decode/Dutch.cfg b/recipes/mls/decode/Dutch.cfg new file mode 100644 index 00000000..0d265065 --- /dev/null +++ b/recipes/mls/decode/Dutch.cfg @@ -0,0 +1,11 @@ +--am=[...]/am.bin +--lm=[...]/5-gram_lm.arpa +--lexicon=[...]/joint_lexicon.txt +--datadir=[...] +--test=test.lst +--emission_dir='' +--lmweight=1.37 +--wordscore=-0.72 +--beamsize=1500 +--beamthreshold=100 +--beamsizetoken=50 diff --git a/recipes/mls/decode/English.cfg b/recipes/mls/decode/English.cfg new file mode 100644 index 00000000..148444cf --- /dev/null +++ b/recipes/mls/decode/English.cfg @@ -0,0 +1,11 @@ +--am=[...]/am.bin +--lm=[...]/5-gram_lm.arpa +--lexicon=[...]/joint_lexicon.txt +--datadir=[...] +--test=test.lst +--emission_dir='' +--lmweight=0.92 +--wordscore=1.04 +--beamsize=1500 +--beamthreshold=100 +--beamsizetoken=50 diff --git a/recipes/mls/decode/French.cfg b/recipes/mls/decode/French.cfg new file mode 100644 index 00000000..a2d984ee --- /dev/null +++ b/recipes/mls/decode/French.cfg @@ -0,0 +1,11 @@ +--am=[...]/am.bin +--lm=[...]/5-gram_lm.arpa +--lexicon=[...]/joint_lexicon.txt +--datadir=[...] 
+--test=test.lst +--emission_dir='' +--lmweight=1.58 +--wordscore=1.36 +--beamsize=1500 +--beamthreshold=100 +--beamsizetoken=50 diff --git a/recipes/mls/decode/German.cfg b/recipes/mls/decode/German.cfg new file mode 100644 index 00000000..6916eb97 --- /dev/null +++ b/recipes/mls/decode/German.cfg @@ -0,0 +1,11 @@ +--am=[...]/am.bin +--lm=[...]/5-gram_lm.arpa +--lexicon=[...]/joint_lexicon.txt +--datadir=[...] +--test=test.lst +--emission_dir='' +--lmweight=1.03 +--wordscore=-0.19 +--beamsize=1500 +--beamthreshold=100 +--beamsizetoken=50 diff --git a/recipes/mls/decode/Italian.cfg b/recipes/mls/decode/Italian.cfg new file mode 100644 index 00000000..cc845ef7 --- /dev/null +++ b/recipes/mls/decode/Italian.cfg @@ -0,0 +1,11 @@ +--am=[...]/am.bin +--lm=[...]/5-gram_lm.arpa +--lexicon=[...]/joint_lexicon.txt +--datadir=[...] +--test=test.lst +--emission_dir='' +--lmweight=2.82 +--wordscore=-1.28 +--beamsize=1500 +--beamthreshold=100 +--beamsizetoken=50 diff --git a/recipes/mls/decode/Polish.cfg b/recipes/mls/decode/Polish.cfg new file mode 100644 index 00000000..f338be6e --- /dev/null +++ b/recipes/mls/decode/Polish.cfg @@ -0,0 +1,11 @@ +--am=[...]/am.bin +--lm=[...]/5-gram_lm.arpa +--lexicon=[...]/joint_lexicon.txt +--datadir=[...] +--test=test.lst +--emission_dir='' +--lmweight=2.40 +--wordscore=-0.82 +--beamsize=1500 +--beamthreshold=100 +--beamsizetoken=50 diff --git a/recipes/mls/decode/Portuguese.cfg b/recipes/mls/decode/Portuguese.cfg new file mode 100644 index 00000000..9808ea58 --- /dev/null +++ b/recipes/mls/decode/Portuguese.cfg @@ -0,0 +1,11 @@ +--am=[...]/am.bin +--lm=[...]/5-gram_lm.arpa +--lexicon=[...]/joint_lexicon.txt +--datadir=[...] 
+--test=test.lst +--emission_dir='' +--lmweight=1.34 +--wordscore=-0.81 +--beamsize=1500 +--beamthreshold=100 +--beamsizetoken=50 diff --git a/recipes/mls/decode/Spanish.cfg b/recipes/mls/decode/Spanish.cfg new file mode 100644 index 00000000..10564008 --- /dev/null +++ b/recipes/mls/decode/Spanish.cfg @@ -0,0 +1,11 @@ +--am=[...]/am.bin +--lm=[...]/5-gram_lm.arpa +--lexicon=[...]/joint_lexicon.txt +--datadir=[...] +--test=test.lst +--emission_dir='' +--lmweight=0.91 +--wordscore=1.12 +--beamsize=1500 +--beamthreshold=100 +--beamsizetoken=50 diff --git a/recipes/mls/train/Dutch.cfg b/recipes/mls/train/Dutch.cfg new file mode 100644 index 00000000..ae4c9772 --- /dev/null +++ b/recipes/mls/train/Dutch.cfg @@ -0,0 +1,39 @@ +--target=ltr +--mfsc +--surround=| +--wordseparator=| +--criterion=ctc +--labelsmooth=0.05 +--dataorder=input +--memstepsize=5000000 +--pcttraineval=1 +--pctteacherforcing=99 +--sampletarget=0.01 +--netoptim=adagrad +--lr=0.02 +--lr_decay=200 +--lr_decay_step=100 +--adambeta1=0.95 +--adambeta2=0.99 +--momentum=0.95 +--warmup=64001 +--maxgradnorm=1 +--onorm=target +--sqnorm +--nthread=6 +--batchsize=8 +--filterbanks=80 +--saug_start_update=10000 +--saug_fmaskf=30 +--saug_fmaskn=2 +--saug_tmaskt=50 +--saug_tmaskp=0.1 +--saug_tmaskn=10 +--datadir=[...] +--archdir=[...] +--arch=arch.txt +--tokensdir=[...] 
+--tokens=tokens.txt +--lexicon=[...]/train_lexicon.txt +--train=train.lst +--valid=dev.lst diff --git a/recipes/mls/train/English.cfg b/recipes/mls/train/English.cfg new file mode 100644 index 00000000..cf5018d7 --- /dev/null +++ b/recipes/mls/train/English.cfg @@ -0,0 +1,40 @@ +--target=ltr +--mfsc +--surround=| +--wordseparator=| +--criterion=ctc +--labelsmooth=0.05 +--dataorder=input +--memstepsize=5000000 +--pcttraineval=1 +--pctteacherforcing=99 +--sampletarget=0.01 +--netoptim=adagrad +--lr=0.02 +--lr_decay=30 +--lr_decay_step=20 +--adambeta1=0.95 +--adambeta2=0.99 +--momentum=0.95 +--warmup=64001 +--maxgradnorm=1 +--onorm=target +--sqnorm +--nthread=6 +--batchsize=8 +--filterbanks=80 +--saug_start_update=10000 +--saug_fmaskf=30 +--saug_fmaskn=2 +--saug_tmaskt=50 +--saug_tmaskp=0.1 +--saug_tmaskn=10 +--reportiters=5000 +--datadir=[...] +--archdir=[...] +--arch=arch.txt +--tokensdir=[...] +--tokens=tokens.txt +--lexicon=[...]/train_lexicon.txt +--train=train.lst +--valid=dev.lst diff --git a/recipes/mls/train/French.cfg b/recipes/mls/train/French.cfg new file mode 100644 index 00000000..ae4c9772 --- /dev/null +++ b/recipes/mls/train/French.cfg @@ -0,0 +1,39 @@ +--target=ltr +--mfsc +--surround=| +--wordseparator=| +--criterion=ctc +--labelsmooth=0.05 +--dataorder=input +--memstepsize=5000000 +--pcttraineval=1 +--pctteacherforcing=99 +--sampletarget=0.01 +--netoptim=adagrad +--lr=0.02 +--lr_decay=200 +--lr_decay_step=100 +--adambeta1=0.95 +--adambeta2=0.99 +--momentum=0.95 +--warmup=64001 +--maxgradnorm=1 +--onorm=target +--sqnorm +--nthread=6 +--batchsize=8 +--filterbanks=80 +--saug_start_update=10000 +--saug_fmaskf=30 +--saug_fmaskn=2 +--saug_tmaskt=50 +--saug_tmaskp=0.1 +--saug_tmaskn=10 +--datadir=[...] +--archdir=[...] +--arch=arch.txt +--tokensdir=[...] 
+--tokens=tokens.txt +--lexicon=[...]/train_lexicon.txt +--train=train.lst +--valid=dev.lst diff --git a/recipes/mls/train/German.cfg b/recipes/mls/train/German.cfg new file mode 100644 index 00000000..ae4c9772 --- /dev/null +++ b/recipes/mls/train/German.cfg @@ -0,0 +1,39 @@ +--target=ltr +--mfsc +--surround=| +--wordseparator=| +--criterion=ctc +--labelsmooth=0.05 +--dataorder=input +--memstepsize=5000000 +--pcttraineval=1 +--pctteacherforcing=99 +--sampletarget=0.01 +--netoptim=adagrad +--lr=0.02 +--lr_decay=200 +--lr_decay_step=100 +--adambeta1=0.95 +--adambeta2=0.99 +--momentum=0.95 +--warmup=64001 +--maxgradnorm=1 +--onorm=target +--sqnorm +--nthread=6 +--batchsize=8 +--filterbanks=80 +--saug_start_update=10000 +--saug_fmaskf=30 +--saug_fmaskn=2 +--saug_tmaskt=50 +--saug_tmaskp=0.1 +--saug_tmaskn=10 +--datadir=[...] +--archdir=[...] +--arch=arch.txt +--tokensdir=[...] +--tokens=tokens.txt +--lexicon=[...]/train_lexicon.txt +--train=train.lst +--valid=dev.lst diff --git a/recipes/mls/train/Italian.cfg b/recipes/mls/train/Italian.cfg new file mode 100644 index 00000000..c759bd2f --- /dev/null +++ b/recipes/mls/train/Italian.cfg @@ -0,0 +1,39 @@ +--target=ltr +--mfsc +--surround=| +--wordseparator=| +--criterion=ctc +--labelsmooth=0.05 +--dataorder=input +--memstepsize=5000000 +--pcttraineval=1 +--pctteacherforcing=99 +--sampletarget=0.01 +--netoptim=adagrad +--lr=0.02 +--lr_decay=400 +--lr_decay_step=200 +--adambeta1=0.95 +--adambeta2=0.99 +--momentum=0.95 +--warmup=64001 +--maxgradnorm=1 +--onorm=target +--sqnorm +--nthread=6 +--batchsize=8 +--filterbanks=80 +--saug_start_update=10000 +--saug_fmaskf=30 +--saug_fmaskn=2 +--saug_tmaskt=50 +--saug_tmaskp=0.1 +--saug_tmaskn=10 +--datadir=[...] +--archdir=[...] +--arch=arch.txt +--tokensdir=[...] 
+--tokens=tokens.txt +--lexicon=[...]/train_lexicon.txt +--train=train.lst +--valid=dev.lst diff --git a/recipes/mls/train/Polish.cfg b/recipes/mls/train/Polish.cfg new file mode 100644 index 00000000..c759bd2f --- /dev/null +++ b/recipes/mls/train/Polish.cfg @@ -0,0 +1,39 @@ +--target=ltr +--mfsc +--surround=| +--wordseparator=| +--criterion=ctc +--labelsmooth=0.05 +--dataorder=input +--memstepsize=5000000 +--pcttraineval=1 +--pctteacherforcing=99 +--sampletarget=0.01 +--netoptim=adagrad +--lr=0.02 +--lr_decay=400 +--lr_decay_step=200 +--adambeta1=0.95 +--adambeta2=0.99 +--momentum=0.95 +--warmup=64001 +--maxgradnorm=1 +--onorm=target +--sqnorm +--nthread=6 +--batchsize=8 +--filterbanks=80 +--saug_start_update=10000 +--saug_fmaskf=30 +--saug_fmaskn=2 +--saug_tmaskt=50 +--saug_tmaskp=0.1 +--saug_tmaskn=10 +--datadir=[...] +--archdir=[...] +--arch=arch.txt +--tokensdir=[...] +--tokens=tokens.txt +--lexicon=[...]/train_lexicon.txt +--train=train.lst +--valid=dev.lst diff --git a/recipes/mls/train/Portuguese.cfg b/recipes/mls/train/Portuguese.cfg new file mode 100644 index 00000000..c759bd2f --- /dev/null +++ b/recipes/mls/train/Portuguese.cfg @@ -0,0 +1,39 @@ +--target=ltr +--mfsc +--surround=| +--wordseparator=| +--criterion=ctc +--labelsmooth=0.05 +--dataorder=input +--memstepsize=5000000 +--pcttraineval=1 +--pctteacherforcing=99 +--sampletarget=0.01 +--netoptim=adagrad +--lr=0.02 +--lr_decay=400 +--lr_decay_step=200 +--adambeta1=0.95 +--adambeta2=0.99 +--momentum=0.95 +--warmup=64001 +--maxgradnorm=1 +--onorm=target +--sqnorm +--nthread=6 +--batchsize=8 +--filterbanks=80 +--saug_start_update=10000 +--saug_fmaskf=30 +--saug_fmaskn=2 +--saug_tmaskt=50 +--saug_tmaskp=0.1 +--saug_tmaskn=10 +--datadir=[...] +--archdir=[...] +--arch=arch.txt +--tokensdir=[...] 
+--tokens=tokens.txt +--lexicon=[...]/train_lexicon.txt +--train=train.lst +--valid=dev.lst diff --git a/recipes/mls/train/Spanish.cfg b/recipes/mls/train/Spanish.cfg new file mode 100644 index 00000000..ae4c9772 --- /dev/null +++ b/recipes/mls/train/Spanish.cfg @@ -0,0 +1,39 @@ +--target=ltr +--mfsc +--surround=| +--wordseparator=| +--criterion=ctc +--labelsmooth=0.05 +--dataorder=input +--memstepsize=5000000 +--pcttraineval=1 +--pctteacherforcing=99 +--sampletarget=0.01 +--netoptim=adagrad +--lr=0.02 +--lr_decay=200 +--lr_decay_step=100 +--adambeta1=0.95 +--adambeta2=0.99 +--momentum=0.95 +--warmup=64001 +--maxgradnorm=1 +--onorm=target +--sqnorm +--nthread=6 +--batchsize=8 +--filterbanks=80 +--saug_start_update=10000 +--saug_fmaskf=30 +--saug_fmaskn=2 +--saug_tmaskt=50 +--saug_tmaskp=0.1 +--saug_tmaskn=10 +--datadir=[...] +--archdir=[...] +--arch=arch.txt +--tokensdir=[...] +--tokens=tokens.txt +--lexicon=[...]/train_lexicon.txt +--train=train.lst +--valid=dev.lst