From 8d0f59d09d8fc0d610226d8017b8407c653bcc9f Mon Sep 17 00:00:00 2001
From: Nick White
Date: Wed, 26 Aug 2015 18:14:30 +0100
Subject: [PATCH 1/6] tesstrain.sh: Only fall back to default Latin fonts if none were provided

The --fontlist argument to tesstrain.sh was always ignored, even if the
language had no specific fonts specified in language-specific.sh. Change
this behaviour so the --fontlist argument is used if no specific fonts
are selected by language-specific.sh.
---
 training/language-specific.sh | 2 +-
 training/tesstrain_utils.sh   | 4 ----
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/training/language-specific.sh b/training/language-specific.sh
index bc64f67c88..c7be0daf6d 100755
--- a/training/language-specific.sh
+++ b/training/language-specific.sh
@@ -794,7 +794,7 @@ set_lang_specific_parameters() {
   local lang=$1
   # The default text location is now given directly from the language code.
   TEXT_CORPUS="${FLAGS_webtext_prefix}/${lang}.corpus.txt"
-  FONTS=( "${LATIN_FONTS[@]}" )
+  test -z "$FONTS" && FONTS=( "${LATIN_FONTS[@]}" )
   FILTER_ARGUMENTS=""
   WORDLIST2DAWG_ARGUMENTS=""
   # These dawg factors represent the fraction of the corpus not covered by the
diff --git a/training/tesstrain_utils.sh b/training/tesstrain_utils.sh
index a3ad7f5142..c5768912f7 100755
--- a/training/tesstrain_utils.sh
+++ b/training/tesstrain_utils.sh
@@ -16,10 +16,6 @@
 #
 # USAGE: source tesstrain_utils.sh
 
-FONTS=(
-    "Arial" \
-    "Times New Roman," \
-)
 if [ "$(uname)" == "Darwin" ];then
     FONTS_DIR="/Library/Fonts/"
 else

From 422c424995b0cdf858751bf3d85a2ce21036b355 Mon Sep 17 00:00:00 2001
From: Nick White
Date: Wed, 26 Aug 2015 18:24:14 +0100
Subject: [PATCH 2/6] tesstrain.sh: Only set FONTS if they weren't set on the command line

Previously the fonts specified in language-specific.sh would override
any specified on the command line. This changes language-specific.sh
from overriding a user request to just setting the default fonts if
none are specified with --fontlist.
---
 training/language-specific.sh | 93 ++++++++++++++++++-----------------
 1 file changed, 47 insertions(+), 46 deletions(-)

diff --git a/training/language-specific.sh b/training/language-specific.sh
index c7be0daf6d..23dee3e1cd 100755
--- a/training/language-specific.sh
+++ b/training/language-specific.sh
@@ -780,7 +780,7 @@ VERTICAL_FONTS=( \
 # holds the text corpus file for the language, used in phase F
 # ${FONTS[@]}
 # holds a sequence of applicable fonts for the language, used in
-# phase F & I
+# phase F & I. only set if not already set, i.e. from command line
 # ${TRAINING_DATA_ARGUMENTS}
 # non-default arguments to the training_data program used in phase T
 # ${FILTER_ARGUMENTS} -
@@ -794,7 +794,6 @@ set_lang_specific_parameters() {
   local lang=$1
   # The default text location is now given directly from the language code.
   TEXT_CORPUS="${FLAGS_webtext_prefix}/${lang}.corpus.txt"
-  test -z "$FONTS" && FONTS=( "${LATIN_FONTS[@]}" )
   FILTER_ARGUMENTS=""
   WORDLIST2DAWG_ARGUMENTS=""
   # These dawg factors represent the fraction of the corpus not covered by the
@@ -816,30 +815,30 @@ set_lang_specific_parameters() {
   case ${lang} in
     # Latin languages.
enm ) TEXT2IMAGE_EXTRA_ARGS=" --ligatures" # Add ligatures when supported - FONTS=( "${EARLY_LATIN_FONTS[@]}" );; + test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );; frm ) TEXT_CORPUS="${FLAGS_webtext_prefix}/fra.corpus.txt" # Make long-s substitutions for Middle French text FILTER_ARGUMENTS="--make_early_language_variant=fra" TEXT2IMAGE_EXTRA_ARGS=" --ligatures" # Add ligatures when supported. - FONTS=( "${EARLY_LATIN_FONTS[@]}" );; + test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );; frk ) TEXT_CORPUS="${FLAGS_webtext_prefix}/deu.corpus.txt" - FONTS=( "${FRAKTUR_FONTS[@]}" );; + test -z "$FONTS" && FONTS=( "${FRAKTUR_FONTS[@]}" );; ita_old ) TEXT_CORPUS="${FLAGS_webtext_prefix}/ita.corpus.txt" # Make long-s substitutions for Early Italian text FILTER_ARGUMENTS="--make_early_language_variant=ita" TEXT2IMAGE_EXTRA_ARGS=" --ligatures" # Add ligatures when supported. - FONTS=( "${EARLY_LATIN_FONTS[@]}" );; + test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );; spa_old ) TEXT_CORPUS="${FLAGS_webtext_prefix}/spa.corpus.txt" # Make long-s substitutions for Early Spanish text FILTER_ARGUMENTS="--make_early_language_variant=spa" TEXT2IMAGE_EXTRA_ARGS=" --ligatures" # Add ligatures when supported. - FONTS=( "${EARLY_LATIN_FONTS[@]}" );; + test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );; srp_latn ) TEXT_CORPUS=${FLAGS_webtext_prefix}/srp.corpus.txt ;; vie ) TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" - FONTS=( "${VIETNAMESE_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${VIETNAMESE_FONTS[@]}" ) ;; # Highly inflective languages get a bigger dawg size. # TODO(rays) Add more here! hun ) WORD_DAWG_SIZE=1000000 ;; @@ -899,14 +898,14 @@ set_lang_specific_parameters() { # Strip unrenderable words as not all fonts will render the extended # latin symbols found in Vietnamese text. WORD_DAWG_SIZE=1000000 - FONTS=( "${EARLY_LATIN_FONTS[@]}" );; + test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );; # Cyrillic script-based languages. - rus ) FONTS=( "${RUSSIAN_FONTS[@]}" ) + rus ) test -z "$FONTS" && FONTS=( "${RUSSIAN_FONTS[@]}" ) NUMBER_DAWG_FACTOR=0.05 WORD_DAWG_SIZE=1000000 ;; aze_cyrl | bel | bul | kaz | mkd | srp | tgk | ukr | uzb_cyrl ) - FONTS=( "${RUSSIAN_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${RUSSIAN_FONTS[@]}" ) ;; # Special code for performing Cyrillic language-id that is trained on # Russian, Serbian, Ukranian, Belarusian, Macedonian, Tajik and Mongolian @@ -916,70 +915,70 @@ set_lang_specific_parameters() { TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" GENERATE_WORD_BIGRAMS=0 WORD_DAWG_SIZE=1000000 - FONTS=( "${RUSSIAN_FONTS[@]}" );; + test -z "$FONTS" && FONTS=( "${RUSSIAN_FONTS[@]}" );; # South Asian scripts mostly have a lot of different graphemes, so trim # down the MEAN_COUNT so as not to get a huge amount of text. 
asm | ben ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.15 - FONTS=( "${BENGALI_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${BENGALI_FONTS[@]}" ) ;; bih | hin | mar | nep | san ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.15 - FONTS=( "${DEVANAGARI_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${DEVANAGARI_FONTS[@]}" ) ;; bod ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.15 - FONTS=( "${TIBETAN_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${TIBETAN_FONTS[@]}" ) ;; dzo ) WORD_DAWG_FACTOR=0.01 - FONTS=( "${TIBETAN_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${TIBETAN_FONTS[@]}" ) ;; guj ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.15 - FONTS=( "${GUJARATI_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${GUJARATI_FONTS[@]}" ) ;; kan ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.15 TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output" TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5" - FONTS=( "${KANNADA_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${KANNADA_FONTS[@]}" ) ;; mal ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.15 TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output" TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5" - FONTS=( "${MALAYALAM_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${MALAYALAM_FONTS[@]}" ) ;; ori ) WORD_DAWG_FACTOR=0.01 - FONTS=( "${ORIYA_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${ORIYA_FONTS[@]}" ) ;; pan ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.01 - FONTS=( "${PUNJABI_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${PUNJABI_FONTS[@]}" ) ;; sin ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.01 - FONTS=( "${SINHALA_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${SINHALA_FONTS[@]}" ) ;; tam ) MEAN_COUNT="30" WORD_DAWG_FACTOR=0.15 TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output" TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5" - FONTS=( "${TAMIL_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${TAMIL_FONTS[@]}" ) ;; tel ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.15 TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output" TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5" - FONTS=( "${TELUGU_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${TELUGU_FONTS[@]}" ) ;; # SouthEast Asian scripts. 
khm ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.15 TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" - FONTS=( "${KHMER_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${KHMER_FONTS[@]}" ) ;; lao ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.15 TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" - FONTS=( "${LAOTHIAN_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${LAOTHIAN_FONTS[@]}" ) ;; mya ) MEAN_COUNT="12" WORD_DAWG_FACTOR=0.15 TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" - FONTS=( "${BURMESE_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${BURMESE_FONTS[@]}" ) ;; tha ) MEAN_COUNT="30" WORD_DAWG_FACTOR=0.01 TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" @@ -987,7 +986,7 @@ set_lang_specific_parameters() { TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams=" AMBIGS_FILTER_DENOMINATOR="1000" LEADING=48 - FONTS=( "${THAI_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${THAI_FONTS[@]}" ) ;; # CJK chi_sim ) @@ -998,7 +997,7 @@ set_lang_specific_parameters() { TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams=" FILTER_ARGUMENTS="--charset_filter=chi_sim --segmenter_lang=chi_sim" - FONTS=( "${CHI_SIM_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${CHI_SIM_FONTS[@]}" ) ;; chi_tra ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.015 @@ -1006,14 +1005,14 @@ set_lang_specific_parameters() { TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams=" FILTER_ARGUMENTS="--charset_filter=chi_tra --segmenter_lang=chi_tra" - FONTS=( "${CHI_TRA_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${CHI_TRA_FONTS[@]}" ) ;; jpn ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.015 GENERATE_WORD_BIGRAMS=0 TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams=" FILTER_ARGUMENTS="--charset_filter=jpn --segmenter_lang=jpn" - FONTS=( "${JPN_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${JPN_FONTS[@]}" ) ;; kor ) MEAN_COUNT="20" WORD_DAWG_FACTOR=0.015 NUMBER_DAWG_FACTOR=0.05 @@ -1021,38 +1020,38 @@ set_lang_specific_parameters() { TRAINING_DATA_ARGUMENTS+=" --desired_bigrams=" GENERATE_WORD_BIGRAMS=0 FILTER_ARGUMENTS="--charset_filter=kor --segmenter_lang=kor" - FONTS=( "${KOREAN_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${KOREAN_FONTS[@]}" ) ;; # Middle-Eastern scripts. - ara ) FONTS=( "${ARABIC_FONTS[@]}" ) ;; - div ) FONTS=( "${THAANA_FONTS[@]}" ) ;; + ara ) test -z "$FONTS" && FONTS=( "${ARABIC_FONTS[@]}" ) ;; + div ) test -z "$FONTS" && FONTS=( "${THAANA_FONTS[@]}" ) ;; fas | pus | snd | uig | urd ) - FONTS=( "${PERSIAN_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${PERSIAN_FONTS[@]}" ) ;; heb | yid ) NUMBER_DAWG_FACTOR=0.05 WORD_DAWG_FACTOR=0.08 - FONTS=( "${HEBREW_FONTS[@]}" ) ;; - syr ) FONTS=( "${SYRIAC_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${HEBREW_FONTS[@]}" ) ;; + syr ) test -z "$FONTS" && FONTS=( "${SYRIAC_FONTS[@]}" ) ;; # Other scripts. 
amh | tir) - FONTS=( "${AMHARIC_FONTS[@]}" ) ;; - chr ) FONTS=( "${NORTH_AMERICAN_ABORIGINAL_FONTS[@]}" \ + test -z "$FONTS" && FONTS=( "${AMHARIC_FONTS[@]}" ) ;; + chr ) test -z "$FONTS" && FONTS=( "${NORTH_AMERICAN_ABORIGINAL_FONTS[@]}" \ "Noto Sans Cherokee" \ ) ;; ell | grc ) NUMBER_DAWG_FACTOR=0.05 WORD_DAWG_FACTOR=0.08 - FONTS=( "${GREEK_FONTS[@]}" ) ;; - hye ) FONTS=( "${ARMENIAN_FONTS[@]}" ) ;; - iku ) FONTS=( "${NORTH_AMERICAN_ABORIGINAL_FONTS[@]}" ) ;; - kat) FONTS=( "${GEORGIAN_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${GREEK_FONTS[@]}" ) ;; + hye ) test -z "$FONTS" && FONTS=( "${ARMENIAN_FONTS[@]}" ) ;; + iku ) test -z "$FONTS" && FONTS=( "${NORTH_AMERICAN_ABORIGINAL_FONTS[@]}" ) ;; + kat) test -z "$FONTS" && FONTS=( "${GEORGIAN_FONTS[@]}" ) ;; kat_old) TEXT_CORPUS="${FLAGS_webtext_prefix}/kat.corpus.txt" - FONTS=( "${OLD_GEORGIAN_FONTS[@]}" ) ;; - kir ) FONTS=( "${KYRGYZ_FONTS[@]}" ) + test -z "$FONTS" && FONTS=( "${OLD_GEORGIAN_FONTS[@]}" ) ;; + kir ) test -z "$FONTS" && FONTS=( "${KYRGYZ_FONTS[@]}" ) TRAINING_DATA_ARGUMENTS=" --infrequent_ratio=100" ;; - kur ) FONTS=( "${KURDISH_FONTS[@]}" ) ;; + kur ) test -z "$FONTS" && FONTS=( "${KURDISH_FONTS[@]}" ) ;; *) err "Error: ${lang} is not a valid language code" esac @@ -1061,6 +1060,8 @@ set_lang_specific_parameters() { elif [[ ! -z ${MEAN_COUNT} ]]; then TRAINING_DATA_ARGUMENTS+=" --mean_count=${MEAN_COUNT}" fi + # Default to Latin fonts if none have been set + test -z "$FONTS" && test -z "$FONTS" && FONTS=( "${LATIN_FONTS[@]}" ) } #============================================================================= From e110b144656769ca76f1a23b65b5567ab6e059cb Mon Sep 17 00:00:00 2001 From: Nick White Date: Wed, 26 Aug 2015 18:32:44 +0100 Subject: [PATCH 3/6] tesstrain.sh: Initialise fontconfig even if Arial isn't available The fontconfig initialisation hardcodes using Arial. However it may not be available, whereas the fonts being used later will be, so use one of them for initialisation instead. --- training/tesstrain_utils.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/tesstrain_utils.sh b/training/tesstrain_utils.sh index c5768912f7..c6ff117efd 100755 --- a/training/tesstrain_utils.sh +++ b/training/tesstrain_utils.sh @@ -197,7 +197,7 @@ initialize_fontconfig() { local sample_path=${FONT_CONFIG_CACHE}/sample_text.txt echo "Text" >${sample_path} run_command ${TEXT2IMAGE_EXE} --fonts_dir=${FONTS_DIR} \ - --font="Arial" --outputbase=${sample_path} --text=${sample_path} \ + --font="${FONTS[0]}" --outputbase=${sample_path} --text=${sample_path} \ --fontconfig_tmpdir=${FONT_CONFIG_CACHE} } From 8e71c79dc20b77a0c08df004b9e01da18f4df620 Mon Sep 17 00:00:00 2001 From: Nick White Date: Wed, 26 Aug 2015 18:49:14 +0100 Subject: [PATCH 4/6] Remove --bin_dir option from tesstrain.sh (should use $PATH instead) The --bin_dir option to tesstrain.sh is not useful, as $PATH does the same job much better, so switch to relying on that instead. This also makes the code a bit more readable, as it removes the need to refer to binaries as COMMAND_NAME_EXE rather than just command_name. --- training/tesstrain.sh | 1 - training/tesstrain_utils.sh | 64 ++++++++++++------------------------- 2 files changed, 20 insertions(+), 45 deletions(-) diff --git a/training/tesstrain.sh b/training/tesstrain.sh index ecf2072083..7e292cc3f7 100755 --- a/training/tesstrain.sh +++ b/training/tesstrain.sh @@ -17,7 +17,6 @@ # USAGE: # # tesstrain.sh -# --bin_dir PATH # Location of training program. 
# --fontlist FONTS_STR # A plus-separated list of fontnames to train on. # --fonts_dir FONTS_PATH # Path to font files. # --lang LANG_CODE # ISO 639 code. diff --git a/training/tesstrain_utils.sh b/training/tesstrain_utils.sh index c6ff117efd..86e57b9a12 100755 --- a/training/tesstrain_utils.sh +++ b/training/tesstrain_utils.sh @@ -41,11 +41,11 @@ err_exit() { # if the program file is not found. # Usage: run_command CMD ARG1 ARG2... run_command() { - local cmd=$1 - shift - if [[ ! -x ${cmd} ]]; then - err_exit "File ${cmd} not found" + local cmd=`which $1` + if [[ -z ${cmd} ]]; then + err_exit "$1 not found" fi + shift tlog "[$(date)] ${cmd} $@" ${cmd} "$@" 2>&1 1>&2 | tee -a ${LOG_FILE} # check completion status @@ -65,22 +65,6 @@ check_file_readable() { done } -# Set global path variables that are based on parsed flags. -set_prog_paths() { - if [[ -z ${BINDIR} ]]; then - err_exit "Need to specify location of program files" - fi - CN_TRAINING_EXE=${BINDIR}/cntraining - COMBINE_TESSDATA_EXE=${BINDIR}/combine_tessdata - MF_TRAINING_EXE=${BINDIR}/mftraining - SET_UNICHARSET_PROPERTIES_EXE=${BINDIR}/set_unicharset_properties - SHAPE_TRAINING_EXE=${BINDIR}/shapeclustering - TESSERACT_EXE=${BINDIR}/tesseract - TEXT2IMAGE_EXE=${BINDIR}/text2image - UNICHARSET_EXTRACTOR_EXE=${BINDIR}/unicharset_extractor - WORDLIST2DAWG_EXE=${BINDIR}/wordlist2dawg -} - # Sets the named variable to given value. Aborts if the value is missing or # if it looks like a flag. # Usage: parse_value VAR_NAME VALUE @@ -105,9 +89,6 @@ parse_flags() { case ${ARGV[$i]} in --) break;; - --bin_dir) - parse_value "BINDIR" ${ARGV[$j]} - i=$j ;; --fontlist) # Expect a plus-separated list of names if [[ -z ${ARGV[$j]} ]] || [[ ${ARGV[$j]:0:2} == "--" ]]; then err_exit "Invalid value passed to --fontlist" @@ -152,9 +133,6 @@ parse_flags() { if [[ -z ${LANG_CODE} ]]; then err_exit "Need to specify a language --lang" fi - if [[ -z ${BINDIR} ]]; then - err_exit "Need to specify path to built binaries --bin_dir" - fi if [[ -z ${LANGDATA_ROOT} ]]; then err_exit "Need to specify path to language files --langdata_dir" fi @@ -167,8 +145,6 @@ parse_flags() { fi fi - set_prog_paths - # Location where intermediate files will be created. TRAINING_DIR=${WORKSPACE_DIR}/${LANG_CODE} # Location of log file for the whole run. 
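With --bin_dir gone, the only requirement is that the training tools can be found through $PATH before tesstrain.sh runs. A minimal sketch of how a caller might arrange and sanity-check that, assuming a hypothetical in-tree build under ~/tesseract (adjust the paths to wherever the tools were actually built or installed):

    # Hypothetical build directories; substitute the real build/install location.
    export PATH="$HOME/tesseract/training:$HOME/tesseract/api:$PATH"

    # run_command resolves each tool with `which`, so a simple pre-flight check is
    # to confirm that everything it will call is actually reachable:
    for tool in text2image tesseract unicharset_extractor set_unicharset_properties \
                wordlist2dawg cntraining shapeclustering mftraining combine_tessdata; do
        which "$tool" >/dev/null || echo "warning: $tool not found on PATH"
    done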
@@ -196,7 +172,7 @@ initialize_fontconfig() { export FONT_CONFIG_CACHE=$(mktemp -d --tmpdir font_tmp.XXXXXXXXXX) local sample_path=${FONT_CONFIG_CACHE}/sample_text.txt echo "Text" >${sample_path} - run_command ${TEXT2IMAGE_EXE} --fonts_dir=${FONTS_DIR} \ + run_command text2image --fonts_dir=${FONTS_DIR} \ --font="${FONTS[0]}" --outputbase=${sample_path} --text=${sample_path} \ --fontconfig_tmpdir=${FONT_CONFIG_CACHE} } @@ -224,14 +200,14 @@ generate_font_image() { fi done - run_command ${TEXT2IMAGE_EXE} ${common_args} --font="${font}" \ + run_command text2image ${common_args} --font="${font}" \ --text=${TRAINING_TEXT} ${TEXT2IMAGE_EXTRA_ARGS} check_file_readable ${outbase}.box ${outbase}.tif if (( ${EXTRACT_FONT_PROPERTIES} )) && [[ -r ${TRAIN_NGRAMS_FILE} ]]; then tlog "Extracting font properties of ${font}" - run_command ${TEXT2IMAGE_EXE} ${common_args} --font="${font}" \ + run_command text2image ${common_args} --font="${font}" \ --ligatures=false --text=${TRAIN_NGRAMS_FILE} \ --only_extract_font_properties --ptsize=32 check_file_readable ${outbase}.fontinfo @@ -287,7 +263,7 @@ phase_UP_generate_unicharset() { tlog "\n=== Phase UP: Generating unicharset and unichar properties files ===" local box_files=$(ls ${TRAINING_DIR}/*.box) - run_command ${UNICHARSET_EXTRACTOR_EXE} -D "${TRAINING_DIR}/" ${box_files} + run_command unicharset_extractor -D "${TRAINING_DIR}/" ${box_files} local outfile=${TRAINING_DIR}/unicharset UNICHARSET_FILE="${TRAINING_DIR}/${LANG_CODE}.unicharset" check_file_readable ${outfile} @@ -295,7 +271,7 @@ phase_UP_generate_unicharset() { XHEIGHTS_FILE="${TRAINING_DIR}/${LANG_CODE}.xheights" check_file_readable ${UNICHARSET_FILE} - run_command ${SET_UNICHARSET_PROPERTIES_EXE} \ + run_command set_unicharset_properties \ -U ${UNICHARSET_FILE} -O ${UNICHARSET_FILE} -X ${XHEIGHTS_FILE} \ --script_dir=${LANGDATA_ROOT} check_file_readable ${XHEIGHTS_FILE} @@ -323,7 +299,7 @@ phase_D_generate_dawg() { if [[ -s ${WORDLIST_FILE} ]]; then tlog "Generating word Dawg" check_file_readable ${UNICHARSET_FILE} - run_command ${WORDLIST2DAWG_EXE} -r 1 ${WORDLIST_FILE} ${WORD_DAWG} \ + run_command wordlist2dawg -r 1 ${WORDLIST_FILE} ${WORD_DAWG} \ ${UNICHARSET_FILE} check_file_readable ${WORD_DAWG} @@ -335,13 +311,13 @@ phase_D_generate_dawg() { if [[ -s ${freq_wordlist_file} ]]; then check_file_readable ${UNICHARSET_FILE} tlog "Generating frequent-word Dawg" - run_command ${WORDLIST2DAWG_EXE} -r 1 ${freq_wordlist_file} \ + run_command wordlist2dawg -r 1 ${freq_wordlist_file} \ ${FREQ_DAWG} ${UNICHARSET_FILE} check_file_readable ${FREQ_DAWG} fi # Punctuation DAWG - # -r arguments to WORDLIST2DAWG_EXE denote RTL reverse policy + # -r arguments to wordlist2dawg denote RTL reverse policy # (see Trie::RTLReversePolicy enum in third_party/tesseract/dict/trie.h). 
# We specify 0/RRP_DO_NO_REVERSE when generating number DAWG, # 1/RRP_REVERSE_IF_HAS_RTL for freq and word DAWGS, @@ -356,20 +332,20 @@ phase_D_generate_dawg() { PUNC_FILE="${LANGDATA_ROOT}/common.punc" fi check_file_readable ${PUNC_FILE} - run_command ${WORDLIST2DAWG_EXE} -r ${punc_reverse_policy} \ + run_command wordlist2dawg -r ${punc_reverse_policy} \ ${PUNC_FILE} ${PUNC_DAWG} ${UNICHARSET_FILE} check_file_readable ${PUNC_DAWG} # Numbers DAWG if [[ -s ${NUMBERS_FILE} ]]; then - run_command ${WORDLIST2DAWG_EXE} -r 0 \ + run_command wordlist2dawg -r 0 \ ${NUMBERS_FILE} ${NUMBER_DAWG} ${UNICHARSET_FILE} check_file_readable ${NUMBER_DAWG} fi # Bigram dawg if [[ -s ${WORD_BIGRAMS_FILE} ]]; then - run_command ${WORDLIST2DAWG_EXE} -r 1 \ + run_command wordlist2dawg -r 1 \ ${WORD_BIGRAMS_FILE} ${BIGRAM_DAWG} ${UNICHARSET_FILE} check_file_readable ${BIGRAM_DAWG} fi @@ -401,7 +377,7 @@ phase_E_extract_features() { tlog "Using TESSDATA_PREFIX=${TESSDATA_PREFIX}" local counter=0 for img_file in ${img_files}; do - run_command ${TESSERACT_EXE} ${img_file} ${img_file%.*} \ + run_command tesseract ${img_file} ${img_file%.*} \ ${box_config} ${config} & let counter=counter+1 let rem=counter%par_factor @@ -423,7 +399,7 @@ phase_C_cluster_prototypes() { tlog "\n=== Phase C: Clustering feature prototypes (cnTraining) ===" local out_normproto=$1 - run_command ${CN_TRAINING_EXE} -D "${TRAINING_DIR}/" \ + run_command cntraining -D "${TRAINING_DIR}/" \ $(ls ${TRAINING_DIR}/*.tr) check_file_readable ${TRAINING_DIR}/normproto @@ -443,7 +419,7 @@ phase_S_cluster_shapes() { font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights" fi - run_command ${SHAPE_TRAINING_EXE} \ + run_command shapeclustering \ -D "${TRAINING_DIR}/" \ -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \ -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \ @@ -464,7 +440,7 @@ phase_M_cluster_microfeatures() { font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights" fi - run_command ${MF_TRAINING_EXE} \ + run_command mftraining \ -D "${TRAINING_DIR}/" \ -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \ -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \ @@ -524,7 +500,7 @@ make__traineddata() { fi # Compose the traineddata file. - run_command ${COMBINE_TESSDATA_EXE} ${TRAINING_DIR}/${LANG_CODE}. + run_command combine_tessdata ${TRAINING_DIR}/${LANG_CODE}. # Copy it to the output dir, overwriting only if allowed by the cmdline flag. if [[ ! -d ${OUTPUT_DIR} ]]; then From c0133ecfa61855d307e0104e0aa3b28291c0567b Mon Sep 17 00:00:00 2001 From: Nick White Date: Thu, 10 Sep 2015 14:57:17 +0100 Subject: [PATCH 5/6] Add --exposures option to tesstrain.sh This flag can be used to specify multiple different exposure levels for a training. There was some code already in tesstrain_utils.sh to deal with multiple exposure levels, so it looks like this functionality was always intended. The default usage does not change, with exposure level 0 being the only one used if --exposures is not used. --- training/tesstrain.sh | 1 + training/tesstrain_utils.sh | 69 +++++++++++++++++++++---------------- 2 files changed, 41 insertions(+), 29 deletions(-) diff --git a/training/tesstrain.sh b/training/tesstrain.sh index 7e292cc3f7..37de22222d 100755 --- a/training/tesstrain.sh +++ b/training/tesstrain.sh @@ -24,6 +24,7 @@ # --output_dir OUTPUTDIR # Location of output traineddata file. # --overwrite # Safe to overwrite files in output_dir. # --run_shape_clustering # Run shape clustering (use for Indic langs). 
+# --exposures EXPOSURES # A list of exposure levels to use (e.g. "-1 0 1"). # # OPTIONAL flags for input data. If unspecified we will look for them in # the langdata_dir directory. diff --git a/training/tesstrain_utils.sh b/training/tesstrain_utils.sh index 86e57b9a12..2983764978 100755 --- a/training/tesstrain_utils.sh +++ b/training/tesstrain_utils.sh @@ -26,6 +26,7 @@ OVERWRITE=0 RUN_SHAPE_CLUSTERING=0 EXTRACT_FONT_PROPERTIES=1 WORKSPACE_DIR="/tmp/tesstrain" +EXPOSURES=0 # Logging helper functions. tlog() { @@ -98,6 +99,16 @@ parse_flags() { FONTS=( ${ARGV[$j]} ) IFS=$ofs i=$j ;; + --exposures) + exp="" + while test $j -lt ${#ARGV[@]}; do + test -z ${ARGV[$j]} && break + test `echo ${ARGV[$j]} | cut -c -2` = "--" && break + exp="$exp ${ARGV[$j]}" + j=$((j+1)) + done + parse_value "EXPOSURES" "$exp" + i=$((j-1)) ;; --fonts_dir) parse_value "FONTS_DIR" ${ARGV[$j]} i=$j ;; @@ -226,35 +237,36 @@ phase_I_generate_image() { err_exit "Could not find training text file ${TRAINING_TEXT}" fi CHAR_SPACING="0.0" - EXPOSURE="0" - - if (( ${EXTRACT_FONT_PROPERTIES} )) && [[ -r ${BIGRAM_FREQS_FILE} ]]; then - # Parse .bigram_freqs file and compose a .train_ngrams file with text - # for tesseract to recognize during training. Take only the ngrams whose - # combined weight accounts for 95% of all the bigrams in the language. - NGRAM_FRAC=$(cat ${BIGRAM_FREQS_FILE} \ - | awk '{s=s+$2}; END {print (s/100)*p}' p=99) - cat ${BIGRAM_FREQS_FILE} | sort -rnk2 \ - | awk '{s=s+$2; if (s <= x) {printf "%s ", $1; } }' \ - x=${NGRAM_FRAC} > ${TRAIN_NGRAMS_FILE} - check_file_readable ${TRAIN_NGRAMS_FILE} - fi - local counter=0 - for font in "${FONTS[@]}"; do - generate_font_image "${font}" & - let counter=counter+1 - let rem=counter%par_factor - if [[ "${rem}" -eq 0 ]]; then - wait + for EXPOSURE in $EXPOSURES; do + if (( ${EXTRACT_FONT_PROPERTIES} )) && [[ -r ${BIGRAM_FREQS_FILE} ]]; then + # Parse .bigram_freqs file and compose a .train_ngrams file with text + # for tesseract to recognize during training. Take only the ngrams whose + # combined weight accounts for 95% of all the bigrams in the language. + NGRAM_FRAC=$(cat ${BIGRAM_FREQS_FILE} \ + | awk '{s=s+$2}; END {print (s/100)*p}' p=99) + cat ${BIGRAM_FREQS_FILE} | sort -rnk2 \ + | awk '{s=s+$2; if (s <= x) {printf "%s ", $1; } }' \ + x=${NGRAM_FRAC} > ${TRAIN_NGRAMS_FILE} + check_file_readable ${TRAIN_NGRAMS_FILE} fi - done - wait - # Check that each process was successful. - for font in "${FONTS[@]}"; do - local fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g') - local outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE} - check_file_readable ${outbase}.box ${outbase}.tif + + local counter=0 + for font in "${FONTS[@]}"; do + generate_font_image "${font}" & + let counter=counter+1 + let rem=counter%par_factor + if [[ "${rem}" -eq 0 ]]; then + wait + fi + done + wait + # Check that each process was successful. 
+    for font in "${FONTS[@]}"; do
+      local fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g')
+      local outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE}
+      check_file_readable ${outbase}.box ${outbase}.tif
+    done
   done
 }
 
@@ -359,10 +371,9 @@ phase_E_extract_features() {
     par_factor=1
   fi
   tlog "\n=== Phase E: Extracting features ==="
-  TRAIN_EXPOSURES='0'
 
   local img_files=""
-  for exposure in ${TRAIN_EXPOSURES}; do
+  for exposure in ${EXPOSURES}; do
     img_files=${img_files}' '$(ls ${TRAINING_DIR}/*.exp${exposure}.tif)
   done
 

From de789ac8ea351d848e3a742ad038f9053f9cf1f4 Mon Sep 17 00:00:00 2001
From: Nick White
Date: Thu, 10 Sep 2015 15:05:07 +0100
Subject: [PATCH 6/6] Use mktemp to create workspace directory

mktemp is a better idea for security, and it enables users to specify a
different directory using the TMPDIR environment variable, which is
useful if /tmp is a small tmpfs.

Also fix a bug where the first few log messages were failing because
the workspace directory wasn't being created early enough.
---
 training/tesstrain.sh       | 5 +----
 training/tesstrain_utils.sh | 2 +-
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/training/tesstrain.sh b/training/tesstrain.sh
index 37de22222d..c1af1e86c1 100755
--- a/training/tesstrain.sh
+++ b/training/tesstrain.sh
@@ -49,11 +49,8 @@ source `dirname $0`/tesstrain_utils.sh
 ARGV=("$@")
 parse_flags
 
-tlog "\n=== Starting training for language '${LANG_CODE}'"
-
-tlog "Cleaning workspace directory ${TRAINING_DIR}..."
 mkdir -p ${TRAINING_DIR}
-rm -fr ${TRAINING_DIR}/*
+tlog "\n=== Starting training for language '${LANG_CODE}'"
 
 source `dirname $0`/language-specific.sh
 set_lang_specific_parameters ${LANG_CODE}
diff --git a/training/tesstrain_utils.sh b/training/tesstrain_utils.sh
index 2983764978..30006bc1f7 100755
--- a/training/tesstrain_utils.sh
+++ b/training/tesstrain_utils.sh
@@ -25,7 +25,7 @@ OUTPUT_DIR="/tmp/tesstrain/tessdata"
 OVERWRITE=0
 RUN_SHAPE_CLUSTERING=0
 EXTRACT_FONT_PROPERTIES=1
-WORKSPACE_DIR="/tmp/tesstrain"
+WORKSPACE_DIR=`mktemp -d`
 EXPOSURES=0
 
 # Logging helper functions.
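A note on the guard used throughout patches 1 and 2: for a bash array, test -z "$FONTS" only inspects the first element, which is sufficient here because FONTS is either left completely unset or populated from --fontlist. A small self-contained sketch of the resulting behaviour, with placeholder font names and a two-entry stand-in for the real LATIN_FONTS list in language-specific.sh:

    #!/bin/bash
    # Stand-in for the much longer LATIN_FONTS list in language-specific.sh.
    LATIN_FONTS=( "Arial" "Times New Roman," )

    FONTS=()                                    # --fontlist was not given
    test -z "$FONTS" && FONTS=( "${LATIN_FONTS[@]}" )
    echo "default: ${FONTS[@]}"                 # default: Arial Times New Roman,

    FONTS=( "GFS Didot" "Linux Libertine O" )   # hypothetical --fontlist selection
    test -z "$FONTS" && FONTS=( "${LATIN_FONTS[@]}" )
    echo "user:    ${FONTS[@]}"                 # user:    GFS Didot Linux Libertine O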
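Putting the series together, an invocation after these patches might look like the sketch below. The language, font name and directories are placeholders, and the training tools are assumed to be on $PATH already, since --bin_dir no longer exists:

    # Fraktur training with a user-chosen font and three exposure levels.
    training/tesstrain.sh \
        --lang frk \
        --langdata_dir ~/langdata \
        --fonts_dir /usr/share/fonts \
        --fontlist "UnifrakturMaguntia" \
        --exposures "-1 0 1" \
        --output_dir ~/tessdata \
        --overwrite

Because patch 6 creates WORKSPACE_DIR with mktemp, prefixing the call with a different TMPDIR (for example TMPDIR=/var/tmp) moves all of the intermediate files off a small /tmp.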