From baed0f07fa62477142c9b458401814d5eb98b716 Mon Sep 17 00:00:00 2001 From: Tom Morris Date: Tue, 23 Jun 2015 16:19:18 -0400 Subject: [PATCH 01/22] Simplify build and run of ScrollView --- .gitignore | 2 ++ java/Makefile.am | 8 ++++++-- java/Manifest.txt | 2 ++ viewer/svutil.cpp | 13 +++++-------- 4 files changed, 15 insertions(+), 10 deletions(-) create mode 100644 java/Manifest.txt diff --git a/.gitignore b/.gitignore index 91e3348402..188105bc67 100644 --- a/.gitignore +++ b/.gitignore @@ -58,6 +58,8 @@ training/wordlist2dawg *.o *.Plo *.a +*.class +*.jar # tessdata *.cube.* diff --git a/java/Makefile.am b/java/Makefile.am index 3ed962dfcc..43752b6f3b 100644 --- a/java/Makefile.am +++ b/java/Makefile.am @@ -42,18 +42,22 @@ SCROLLVIEW_LIBS = \ CLASSPATH = $(srcdir)/piccolo2d-core-3.0.jar:$(srcdir)/piccolo2d-extras-3.0.jar ScrollView.jar : $(SCROLLVIEW_CLASSES) - $(JAR) cf $@ com/google/scrollview/*.class \ + $(JAR) cfm $@ Manifest.txt com/google/scrollview/*.class \ com/google/scrollview/events/*.class com/google/scrollview/ui/*.class $(SCROLLVIEW_CLASSES) : $(SCROLLVIEW_FILES) $(JAVAC) -encoding UTF8 -sourcepath $(srcdir) -classpath $(CLASSPATH) $(SCROLLVIEW_FILES) -d $(builddir) +fetch-jars : + curl -L http://search.maven.org/remotecontent?filepath=org/piccolo2d/piccolo2d-core/3.0/piccolo2d-core-3.0.jar > piccolo2d-core-3.0.jar + curl -L http://search.maven.org/remotecontent?filepath=org/piccolo2d/piccolo2d-extras/3.0/piccolo2d-extras-3.0.jar > piccolo2d-extras-3.0.jar + .PHONY: install-jars install-jars : ScrollView.jar @if [ ! -d $(scrollview_path) ]; then mkdir -p $(scrollview_path); fi; $(INSTALL) -m 644 $(SCROLLVIEW_LIBS) $(scrollview_path); $(INSTALL) -m 644 ScrollView.jar $(scrollview_path); - @echo "Don't forget to set eviroment variable SCROLLVIEW_PATH to $(scrollview_path)"; + @echo "Don't forget to set enviroment variable SCROLLVIEW_PATH to $(scrollview_path)"; uninstall: rm -f $(scrollview_path)/*.jar diff --git a/java/Manifest.txt b/java/Manifest.txt new file mode 100644 index 0000000000..bc0b707bd8 --- /dev/null +++ b/java/Manifest.txt @@ -0,0 +1,2 @@ +Main-Class: com/google/scrollview/ScrollView +Class-Path: ScrollView.jar piccolo2d-core-3.0.jar piccolo2d-extras-3.0.jar diff --git a/viewer/svutil.cpp b/viewer/svutil.cpp index a820eafbc5..f94c1c86d5 100644 --- a/viewer/svutil.cpp +++ b/viewer/svutil.cpp @@ -127,7 +127,7 @@ SVSemaphore::SVSemaphore() { semaphore_ = CreateSemaphore(0, 0, 10, 0); #elif defined(__APPLE__) char name[50]; - snprintf(name, sizeof(name), "%d", random()); + snprintf(name, sizeof(name), "%ld", random()); sem_unlink(name); semaphore_ = sem_open(name, O_CREAT , S_IWUSR, 0); if (semaphore_ == SEM_FAILED) { @@ -296,14 +296,11 @@ static std::string ScrollViewCommand(std::string scrollview_path) { // this unnecessary. // Also the path has to be separated by ; on windows and : otherwise. 
#ifdef _WIN32 - const char* cmd_template = "-Djava.library.path=%s -cp %s/ScrollView.jar;" - "%s/piccolo2d-core-3.0.jar:%s/piccolo2d-extras-3.0.jar" - " com.google.scrollview.ScrollView"; + const char* cmd_template = "-Djava.library.path=%s -jar %s/ScrollView.jar"; + #else const char* cmd_template = "-c \"trap 'kill %%1' 0 1 2 ; java " - "-Xms1024m -Xmx2048m -Djava.library.path=%s -cp %s/ScrollView.jar:" - "%s/piccolo2d-core-3.0.jar:%s/piccolo2d-extras-3.0.jar" - " com.google.scrollview.ScrollView" + "-Xms1024m -Xmx2048m -jar %s/ScrollView.jar" " & wait\""; #endif int cmdlen = strlen(cmd_template) + 4*strlen(scrollview_path.c_str()) + 1; @@ -374,7 +371,7 @@ static int GetAddrInfo(const char* hostname, int port, struct addrinfo** address) { #if defined(__linux__) char port_str[40]; - snprintf(port_str, 40, "%d", port); + snprintf(port_str, 40, "%ld", port); return getaddrinfo(hostname, port_str, NULL, address); #else return GetAddrInfoNonLinux(hostname, port, address); From 8d0f59d09d8fc0d610226d8017b8407c653bcc9f Mon Sep 17 00:00:00 2001 From: Nick White Date: Wed, 26 Aug 2015 18:14:30 +0100 Subject: [PATCH 02/22] tesstrain.sh: Only fall back to default Latin fonts if none were provided The --fontlist argument to tesstrain.sh was always ignored, even if the language had no specific fonts specified in language-specific.sh. Change this behaviour so the --fontlist argument is used if no specifc fonts are selected by language-specific.sh. --- training/language-specific.sh | 2 +- training/tesstrain_utils.sh | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/training/language-specific.sh b/training/language-specific.sh index bc64f67c88..c7be0daf6d 100755 --- a/training/language-specific.sh +++ b/training/language-specific.sh @@ -794,7 +794,7 @@ set_lang_specific_parameters() { local lang=$1 # The default text location is now given directly from the language code. TEXT_CORPUS="${FLAGS_webtext_prefix}/${lang}.corpus.txt" - FONTS=( "${LATIN_FONTS[@]}" ) + test -z "$FONTS" && FONTS=( "${LATIN_FONTS[@]}" ) FILTER_ARGUMENTS="" WORDLIST2DAWG_ARGUMENTS="" # These dawg factors represent the fraction of the corpus not covered by the diff --git a/training/tesstrain_utils.sh b/training/tesstrain_utils.sh index a3ad7f5142..c5768912f7 100755 --- a/training/tesstrain_utils.sh +++ b/training/tesstrain_utils.sh @@ -16,10 +16,6 @@ # # USAGE: source tesstrain_utils.sh -FONTS=( - "Arial" \ - "Times New Roman," \ -) if [ "$(uname)" == "Darwin" ];then FONTS_DIR="/Library/Fonts/" else From 422c424995b0cdf858751bf3d85a2ce21036b355 Mon Sep 17 00:00:00 2001 From: Nick White Date: Wed, 26 Aug 2015 18:24:14 +0100 Subject: [PATCH 03/22] tesstrain.sh: Only set FONTS if they weren't set on the command line Previously the fonts specified in language-selection.sh would override any specified on the command line. This changes language-specific.sh from overriding a user request to just setting the default fonts if none are specified with --fontlist. --- training/language-specific.sh | 93 ++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 46 deletions(-) diff --git a/training/language-specific.sh b/training/language-specific.sh index c7be0daf6d..23dee3e1cd 100755 --- a/training/language-specific.sh +++ b/training/language-specific.sh @@ -780,7 +780,7 @@ VERTICAL_FONTS=( \ # holds the text corpus file for the language, used in phase F # ${FONTS[@]} # holds a sequence of applicable fonts for the language, used in -# phase F & I +# phase F & I. only set if not already set, i.e. 
from command line # ${TRAINING_DATA_ARGUMENTS} # non-default arguments to the training_data program used in phase T # ${FILTER_ARGUMENTS} - @@ -794,7 +794,6 @@ set_lang_specific_parameters() { local lang=$1 # The default text location is now given directly from the language code. TEXT_CORPUS="${FLAGS_webtext_prefix}/${lang}.corpus.txt" - test -z "$FONTS" && FONTS=( "${LATIN_FONTS[@]}" ) FILTER_ARGUMENTS="" WORDLIST2DAWG_ARGUMENTS="" # These dawg factors represent the fraction of the corpus not covered by the @@ -816,30 +815,30 @@ set_lang_specific_parameters() { case ${lang} in # Latin languages. enm ) TEXT2IMAGE_EXTRA_ARGS=" --ligatures" # Add ligatures when supported - FONTS=( "${EARLY_LATIN_FONTS[@]}" );; + test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );; frm ) TEXT_CORPUS="${FLAGS_webtext_prefix}/fra.corpus.txt" # Make long-s substitutions for Middle French text FILTER_ARGUMENTS="--make_early_language_variant=fra" TEXT2IMAGE_EXTRA_ARGS=" --ligatures" # Add ligatures when supported. - FONTS=( "${EARLY_LATIN_FONTS[@]}" );; + test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );; frk ) TEXT_CORPUS="${FLAGS_webtext_prefix}/deu.corpus.txt" - FONTS=( "${FRAKTUR_FONTS[@]}" );; + test -z "$FONTS" && FONTS=( "${FRAKTUR_FONTS[@]}" );; ita_old ) TEXT_CORPUS="${FLAGS_webtext_prefix}/ita.corpus.txt" # Make long-s substitutions for Early Italian text FILTER_ARGUMENTS="--make_early_language_variant=ita" TEXT2IMAGE_EXTRA_ARGS=" --ligatures" # Add ligatures when supported. - FONTS=( "${EARLY_LATIN_FONTS[@]}" );; + test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );; spa_old ) TEXT_CORPUS="${FLAGS_webtext_prefix}/spa.corpus.txt" # Make long-s substitutions for Early Spanish text FILTER_ARGUMENTS="--make_early_language_variant=spa" TEXT2IMAGE_EXTRA_ARGS=" --ligatures" # Add ligatures when supported. - FONTS=( "${EARLY_LATIN_FONTS[@]}" );; + test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );; srp_latn ) TEXT_CORPUS=${FLAGS_webtext_prefix}/srp.corpus.txt ;; vie ) TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" - FONTS=( "${VIETNAMESE_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${VIETNAMESE_FONTS[@]}" ) ;; # Highly inflective languages get a bigger dawg size. # TODO(rays) Add more here! hun ) WORD_DAWG_SIZE=1000000 ;; @@ -899,14 +898,14 @@ set_lang_specific_parameters() { # Strip unrenderable words as not all fonts will render the extended # latin symbols found in Vietnamese text. WORD_DAWG_SIZE=1000000 - FONTS=( "${EARLY_LATIN_FONTS[@]}" );; + test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );; # Cyrillic script-based languages. - rus ) FONTS=( "${RUSSIAN_FONTS[@]}" ) + rus ) test -z "$FONTS" && FONTS=( "${RUSSIAN_FONTS[@]}" ) NUMBER_DAWG_FACTOR=0.05 WORD_DAWG_SIZE=1000000 ;; aze_cyrl | bel | bul | kaz | mkd | srp | tgk | ukr | uzb_cyrl ) - FONTS=( "${RUSSIAN_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${RUSSIAN_FONTS[@]}" ) ;; # Special code for performing Cyrillic language-id that is trained on # Russian, Serbian, Ukranian, Belarusian, Macedonian, Tajik and Mongolian @@ -916,70 +915,70 @@ set_lang_specific_parameters() { TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" GENERATE_WORD_BIGRAMS=0 WORD_DAWG_SIZE=1000000 - FONTS=( "${RUSSIAN_FONTS[@]}" );; + test -z "$FONTS" && FONTS=( "${RUSSIAN_FONTS[@]}" );; # South Asian scripts mostly have a lot of different graphemes, so trim # down the MEAN_COUNT so as not to get a huge amount of text. 
asm | ben ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.15 - FONTS=( "${BENGALI_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${BENGALI_FONTS[@]}" ) ;; bih | hin | mar | nep | san ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.15 - FONTS=( "${DEVANAGARI_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${DEVANAGARI_FONTS[@]}" ) ;; bod ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.15 - FONTS=( "${TIBETAN_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${TIBETAN_FONTS[@]}" ) ;; dzo ) WORD_DAWG_FACTOR=0.01 - FONTS=( "${TIBETAN_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${TIBETAN_FONTS[@]}" ) ;; guj ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.15 - FONTS=( "${GUJARATI_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${GUJARATI_FONTS[@]}" ) ;; kan ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.15 TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output" TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5" - FONTS=( "${KANNADA_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${KANNADA_FONTS[@]}" ) ;; mal ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.15 TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output" TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5" - FONTS=( "${MALAYALAM_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${MALAYALAM_FONTS[@]}" ) ;; ori ) WORD_DAWG_FACTOR=0.01 - FONTS=( "${ORIYA_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${ORIYA_FONTS[@]}" ) ;; pan ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.01 - FONTS=( "${PUNJABI_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${PUNJABI_FONTS[@]}" ) ;; sin ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.01 - FONTS=( "${SINHALA_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${SINHALA_FONTS[@]}" ) ;; tam ) MEAN_COUNT="30" WORD_DAWG_FACTOR=0.15 TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output" TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5" - FONTS=( "${TAMIL_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${TAMIL_FONTS[@]}" ) ;; tel ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.15 TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output" TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5" - FONTS=( "${TELUGU_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${TELUGU_FONTS[@]}" ) ;; # SouthEast Asian scripts. 
khm ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.15 TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" - FONTS=( "${KHMER_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${KHMER_FONTS[@]}" ) ;; lao ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.15 TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" - FONTS=( "${LAOTHIAN_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${LAOTHIAN_FONTS[@]}" ) ;; mya ) MEAN_COUNT="12" WORD_DAWG_FACTOR=0.15 TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" - FONTS=( "${BURMESE_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${BURMESE_FONTS[@]}" ) ;; tha ) MEAN_COUNT="30" WORD_DAWG_FACTOR=0.01 TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" @@ -987,7 +986,7 @@ set_lang_specific_parameters() { TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams=" AMBIGS_FILTER_DENOMINATOR="1000" LEADING=48 - FONTS=( "${THAI_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${THAI_FONTS[@]}" ) ;; # CJK chi_sim ) @@ -998,7 +997,7 @@ set_lang_specific_parameters() { TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams=" FILTER_ARGUMENTS="--charset_filter=chi_sim --segmenter_lang=chi_sim" - FONTS=( "${CHI_SIM_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${CHI_SIM_FONTS[@]}" ) ;; chi_tra ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.015 @@ -1006,14 +1005,14 @@ set_lang_specific_parameters() { TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams=" FILTER_ARGUMENTS="--charset_filter=chi_tra --segmenter_lang=chi_tra" - FONTS=( "${CHI_TRA_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${CHI_TRA_FONTS[@]}" ) ;; jpn ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.015 GENERATE_WORD_BIGRAMS=0 TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams=" FILTER_ARGUMENTS="--charset_filter=jpn --segmenter_lang=jpn" - FONTS=( "${JPN_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${JPN_FONTS[@]}" ) ;; kor ) MEAN_COUNT="20" WORD_DAWG_FACTOR=0.015 NUMBER_DAWG_FACTOR=0.05 @@ -1021,38 +1020,38 @@ set_lang_specific_parameters() { TRAINING_DATA_ARGUMENTS+=" --desired_bigrams=" GENERATE_WORD_BIGRAMS=0 FILTER_ARGUMENTS="--charset_filter=kor --segmenter_lang=kor" - FONTS=( "${KOREAN_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${KOREAN_FONTS[@]}" ) ;; # Middle-Eastern scripts. - ara ) FONTS=( "${ARABIC_FONTS[@]}" ) ;; - div ) FONTS=( "${THAANA_FONTS[@]}" ) ;; + ara ) test -z "$FONTS" && FONTS=( "${ARABIC_FONTS[@]}" ) ;; + div ) test -z "$FONTS" && FONTS=( "${THAANA_FONTS[@]}" ) ;; fas | pus | snd | uig | urd ) - FONTS=( "${PERSIAN_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${PERSIAN_FONTS[@]}" ) ;; heb | yid ) NUMBER_DAWG_FACTOR=0.05 WORD_DAWG_FACTOR=0.08 - FONTS=( "${HEBREW_FONTS[@]}" ) ;; - syr ) FONTS=( "${SYRIAC_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${HEBREW_FONTS[@]}" ) ;; + syr ) test -z "$FONTS" && FONTS=( "${SYRIAC_FONTS[@]}" ) ;; # Other scripts. 
amh | tir) - FONTS=( "${AMHARIC_FONTS[@]}" ) ;; - chr ) FONTS=( "${NORTH_AMERICAN_ABORIGINAL_FONTS[@]}" \ + test -z "$FONTS" && FONTS=( "${AMHARIC_FONTS[@]}" ) ;; + chr ) test -z "$FONTS" && FONTS=( "${NORTH_AMERICAN_ABORIGINAL_FONTS[@]}" \ "Noto Sans Cherokee" \ ) ;; ell | grc ) NUMBER_DAWG_FACTOR=0.05 WORD_DAWG_FACTOR=0.08 - FONTS=( "${GREEK_FONTS[@]}" ) ;; - hye ) FONTS=( "${ARMENIAN_FONTS[@]}" ) ;; - iku ) FONTS=( "${NORTH_AMERICAN_ABORIGINAL_FONTS[@]}" ) ;; - kat) FONTS=( "${GEORGIAN_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${GREEK_FONTS[@]}" ) ;; + hye ) test -z "$FONTS" && FONTS=( "${ARMENIAN_FONTS[@]}" ) ;; + iku ) test -z "$FONTS" && FONTS=( "${NORTH_AMERICAN_ABORIGINAL_FONTS[@]}" ) ;; + kat) test -z "$FONTS" && FONTS=( "${GEORGIAN_FONTS[@]}" ) ;; kat_old) TEXT_CORPUS="${FLAGS_webtext_prefix}/kat.corpus.txt" - FONTS=( "${OLD_GEORGIAN_FONTS[@]}" ) ;; - kir ) FONTS=( "${KYRGYZ_FONTS[@]}" ) + test -z "$FONTS" && FONTS=( "${OLD_GEORGIAN_FONTS[@]}" ) ;; + kir ) test -z "$FONTS" && FONTS=( "${KYRGYZ_FONTS[@]}" ) TRAINING_DATA_ARGUMENTS=" --infrequent_ratio=100" ;; - kur ) FONTS=( "${KURDISH_FONTS[@]}" ) ;; + kur ) test -z "$FONTS" && FONTS=( "${KURDISH_FONTS[@]}" ) ;; *) err "Error: ${lang} is not a valid language code" esac @@ -1061,6 +1060,8 @@ set_lang_specific_parameters() { elif [[ ! -z ${MEAN_COUNT} ]]; then TRAINING_DATA_ARGUMENTS+=" --mean_count=${MEAN_COUNT}" fi + # Default to Latin fonts if none have been set + test -z "$FONTS" && test -z "$FONTS" && FONTS=( "${LATIN_FONTS[@]}" ) } #============================================================================= From e110b144656769ca76f1a23b65b5567ab6e059cb Mon Sep 17 00:00:00 2001 From: Nick White Date: Wed, 26 Aug 2015 18:32:44 +0100 Subject: [PATCH 04/22] tesstrain.sh: Initialise fontconfig even if Arial isn't available The fontconfig initialisation hardcodes using Arial. However it may not be available, whereas the fonts being used later will be, so use one of them for initialisation instead. --- training/tesstrain_utils.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/tesstrain_utils.sh b/training/tesstrain_utils.sh index c5768912f7..c6ff117efd 100755 --- a/training/tesstrain_utils.sh +++ b/training/tesstrain_utils.sh @@ -197,7 +197,7 @@ initialize_fontconfig() { local sample_path=${FONT_CONFIG_CACHE}/sample_text.txt echo "Text" >${sample_path} run_command ${TEXT2IMAGE_EXE} --fonts_dir=${FONTS_DIR} \ - --font="Arial" --outputbase=${sample_path} --text=${sample_path} \ + --font="${FONTS[0]}" --outputbase=${sample_path} --text=${sample_path} \ --fontconfig_tmpdir=${FONT_CONFIG_CACHE} } From 8e71c79dc20b77a0c08df004b9e01da18f4df620 Mon Sep 17 00:00:00 2001 From: Nick White Date: Wed, 26 Aug 2015 18:49:14 +0100 Subject: [PATCH 05/22] Remove --bin_dir option from tesstrain.sh (should use $PATH instead) The --bin_dir option to tesstrain.sh is not useful, as $PATH does the same job much better, so switch to relying on that instead. This also makes the code a bit more readable, as it removes the need to refer to binaries as COMMAND_NAME_EXE rather than just command_name. --- training/tesstrain.sh | 1 - training/tesstrain_utils.sh | 64 ++++++++++++------------------------- 2 files changed, 20 insertions(+), 45 deletions(-) diff --git a/training/tesstrain.sh b/training/tesstrain.sh index ecf2072083..7e292cc3f7 100755 --- a/training/tesstrain.sh +++ b/training/tesstrain.sh @@ -17,7 +17,6 @@ # USAGE: # # tesstrain.sh -# --bin_dir PATH # Location of training program. 
# --fontlist FONTS_STR # A plus-separated list of fontnames to train on. # --fonts_dir FONTS_PATH # Path to font files. # --lang LANG_CODE # ISO 639 code. diff --git a/training/tesstrain_utils.sh b/training/tesstrain_utils.sh index c6ff117efd..86e57b9a12 100755 --- a/training/tesstrain_utils.sh +++ b/training/tesstrain_utils.sh @@ -41,11 +41,11 @@ err_exit() { # if the program file is not found. # Usage: run_command CMD ARG1 ARG2... run_command() { - local cmd=$1 - shift - if [[ ! -x ${cmd} ]]; then - err_exit "File ${cmd} not found" + local cmd=`which $1` + if [[ -z ${cmd} ]]; then + err_exit "$1 not found" fi + shift tlog "[$(date)] ${cmd} $@" ${cmd} "$@" 2>&1 1>&2 | tee -a ${LOG_FILE} # check completion status @@ -65,22 +65,6 @@ check_file_readable() { done } -# Set global path variables that are based on parsed flags. -set_prog_paths() { - if [[ -z ${BINDIR} ]]; then - err_exit "Need to specify location of program files" - fi - CN_TRAINING_EXE=${BINDIR}/cntraining - COMBINE_TESSDATA_EXE=${BINDIR}/combine_tessdata - MF_TRAINING_EXE=${BINDIR}/mftraining - SET_UNICHARSET_PROPERTIES_EXE=${BINDIR}/set_unicharset_properties - SHAPE_TRAINING_EXE=${BINDIR}/shapeclustering - TESSERACT_EXE=${BINDIR}/tesseract - TEXT2IMAGE_EXE=${BINDIR}/text2image - UNICHARSET_EXTRACTOR_EXE=${BINDIR}/unicharset_extractor - WORDLIST2DAWG_EXE=${BINDIR}/wordlist2dawg -} - # Sets the named variable to given value. Aborts if the value is missing or # if it looks like a flag. # Usage: parse_value VAR_NAME VALUE @@ -105,9 +89,6 @@ parse_flags() { case ${ARGV[$i]} in --) break;; - --bin_dir) - parse_value "BINDIR" ${ARGV[$j]} - i=$j ;; --fontlist) # Expect a plus-separated list of names if [[ -z ${ARGV[$j]} ]] || [[ ${ARGV[$j]:0:2} == "--" ]]; then err_exit "Invalid value passed to --fontlist" @@ -152,9 +133,6 @@ parse_flags() { if [[ -z ${LANG_CODE} ]]; then err_exit "Need to specify a language --lang" fi - if [[ -z ${BINDIR} ]]; then - err_exit "Need to specify path to built binaries --bin_dir" - fi if [[ -z ${LANGDATA_ROOT} ]]; then err_exit "Need to specify path to language files --langdata_dir" fi @@ -167,8 +145,6 @@ parse_flags() { fi fi - set_prog_paths - # Location where intermediate files will be created. TRAINING_DIR=${WORKSPACE_DIR}/${LANG_CODE} # Location of log file for the whole run. 
@@ -196,7 +172,7 @@ initialize_fontconfig() { export FONT_CONFIG_CACHE=$(mktemp -d --tmpdir font_tmp.XXXXXXXXXX) local sample_path=${FONT_CONFIG_CACHE}/sample_text.txt echo "Text" >${sample_path} - run_command ${TEXT2IMAGE_EXE} --fonts_dir=${FONTS_DIR} \ + run_command text2image --fonts_dir=${FONTS_DIR} \ --font="${FONTS[0]}" --outputbase=${sample_path} --text=${sample_path} \ --fontconfig_tmpdir=${FONT_CONFIG_CACHE} } @@ -224,14 +200,14 @@ generate_font_image() { fi done - run_command ${TEXT2IMAGE_EXE} ${common_args} --font="${font}" \ + run_command text2image ${common_args} --font="${font}" \ --text=${TRAINING_TEXT} ${TEXT2IMAGE_EXTRA_ARGS} check_file_readable ${outbase}.box ${outbase}.tif if (( ${EXTRACT_FONT_PROPERTIES} )) && [[ -r ${TRAIN_NGRAMS_FILE} ]]; then tlog "Extracting font properties of ${font}" - run_command ${TEXT2IMAGE_EXE} ${common_args} --font="${font}" \ + run_command text2image ${common_args} --font="${font}" \ --ligatures=false --text=${TRAIN_NGRAMS_FILE} \ --only_extract_font_properties --ptsize=32 check_file_readable ${outbase}.fontinfo @@ -287,7 +263,7 @@ phase_UP_generate_unicharset() { tlog "\n=== Phase UP: Generating unicharset and unichar properties files ===" local box_files=$(ls ${TRAINING_DIR}/*.box) - run_command ${UNICHARSET_EXTRACTOR_EXE} -D "${TRAINING_DIR}/" ${box_files} + run_command unicharset_extractor -D "${TRAINING_DIR}/" ${box_files} local outfile=${TRAINING_DIR}/unicharset UNICHARSET_FILE="${TRAINING_DIR}/${LANG_CODE}.unicharset" check_file_readable ${outfile} @@ -295,7 +271,7 @@ phase_UP_generate_unicharset() { XHEIGHTS_FILE="${TRAINING_DIR}/${LANG_CODE}.xheights" check_file_readable ${UNICHARSET_FILE} - run_command ${SET_UNICHARSET_PROPERTIES_EXE} \ + run_command set_unicharset_properties \ -U ${UNICHARSET_FILE} -O ${UNICHARSET_FILE} -X ${XHEIGHTS_FILE} \ --script_dir=${LANGDATA_ROOT} check_file_readable ${XHEIGHTS_FILE} @@ -323,7 +299,7 @@ phase_D_generate_dawg() { if [[ -s ${WORDLIST_FILE} ]]; then tlog "Generating word Dawg" check_file_readable ${UNICHARSET_FILE} - run_command ${WORDLIST2DAWG_EXE} -r 1 ${WORDLIST_FILE} ${WORD_DAWG} \ + run_command wordlist2dawg -r 1 ${WORDLIST_FILE} ${WORD_DAWG} \ ${UNICHARSET_FILE} check_file_readable ${WORD_DAWG} @@ -335,13 +311,13 @@ phase_D_generate_dawg() { if [[ -s ${freq_wordlist_file} ]]; then check_file_readable ${UNICHARSET_FILE} tlog "Generating frequent-word Dawg" - run_command ${WORDLIST2DAWG_EXE} -r 1 ${freq_wordlist_file} \ + run_command wordlist2dawg -r 1 ${freq_wordlist_file} \ ${FREQ_DAWG} ${UNICHARSET_FILE} check_file_readable ${FREQ_DAWG} fi # Punctuation DAWG - # -r arguments to WORDLIST2DAWG_EXE denote RTL reverse policy + # -r arguments to wordlist2dawg denote RTL reverse policy # (see Trie::RTLReversePolicy enum in third_party/tesseract/dict/trie.h). 
# We specify 0/RRP_DO_NO_REVERSE when generating number DAWG, # 1/RRP_REVERSE_IF_HAS_RTL for freq and word DAWGS, @@ -356,20 +332,20 @@ phase_D_generate_dawg() { PUNC_FILE="${LANGDATA_ROOT}/common.punc" fi check_file_readable ${PUNC_FILE} - run_command ${WORDLIST2DAWG_EXE} -r ${punc_reverse_policy} \ + run_command wordlist2dawg -r ${punc_reverse_policy} \ ${PUNC_FILE} ${PUNC_DAWG} ${UNICHARSET_FILE} check_file_readable ${PUNC_DAWG} # Numbers DAWG if [[ -s ${NUMBERS_FILE} ]]; then - run_command ${WORDLIST2DAWG_EXE} -r 0 \ + run_command wordlist2dawg -r 0 \ ${NUMBERS_FILE} ${NUMBER_DAWG} ${UNICHARSET_FILE} check_file_readable ${NUMBER_DAWG} fi # Bigram dawg if [[ -s ${WORD_BIGRAMS_FILE} ]]; then - run_command ${WORDLIST2DAWG_EXE} -r 1 \ + run_command wordlist2dawg -r 1 \ ${WORD_BIGRAMS_FILE} ${BIGRAM_DAWG} ${UNICHARSET_FILE} check_file_readable ${BIGRAM_DAWG} fi @@ -401,7 +377,7 @@ phase_E_extract_features() { tlog "Using TESSDATA_PREFIX=${TESSDATA_PREFIX}" local counter=0 for img_file in ${img_files}; do - run_command ${TESSERACT_EXE} ${img_file} ${img_file%.*} \ + run_command tesseract ${img_file} ${img_file%.*} \ ${box_config} ${config} & let counter=counter+1 let rem=counter%par_factor @@ -423,7 +399,7 @@ phase_C_cluster_prototypes() { tlog "\n=== Phase C: Clustering feature prototypes (cnTraining) ===" local out_normproto=$1 - run_command ${CN_TRAINING_EXE} -D "${TRAINING_DIR}/" \ + run_command cntraining -D "${TRAINING_DIR}/" \ $(ls ${TRAINING_DIR}/*.tr) check_file_readable ${TRAINING_DIR}/normproto @@ -443,7 +419,7 @@ phase_S_cluster_shapes() { font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights" fi - run_command ${SHAPE_TRAINING_EXE} \ + run_command shapeclustering \ -D "${TRAINING_DIR}/" \ -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \ -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \ @@ -464,7 +440,7 @@ phase_M_cluster_microfeatures() { font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights" fi - run_command ${MF_TRAINING_EXE} \ + run_command mftraining \ -D "${TRAINING_DIR}/" \ -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \ -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \ @@ -524,7 +500,7 @@ make__traineddata() { fi # Compose the traineddata file. - run_command ${COMBINE_TESSDATA_EXE} ${TRAINING_DIR}/${LANG_CODE}. + run_command combine_tessdata ${TRAINING_DIR}/${LANG_CODE}. # Copy it to the output dir, overwriting only if allowed by the cmdline flag. if [[ ! -d ${OUTPUT_DIR} ]]; then From a1e14ea93c457023087e012357583000ccd53ac0 Mon Sep 17 00:00:00 2001 From: Tom Morris Date: Wed, 9 Sep 2015 16:51:45 -0400 Subject: [PATCH 06/22] Add ULL to constants which overflow 32 bits --- ccutil/helpers.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ccutil/helpers.h b/ccutil/helpers.h index 480929c955..022a2c3066 100644 --- a/ccutil/helpers.h +++ b/ccutil/helpers.h @@ -61,8 +61,8 @@ class TRand { private: // Steps the generator to the next value. void Iterate() { - seed_ *= 6364136223846793005; - seed_ += 1442695040888963407; + seed_ *= 6364136223846793005ULL; + seed_ += 1442695040888963407ULL; } // The current value of the seed. 
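Note on the ULL change above: 6364136223846793005 and 1442695040888963407 are 64-bit constants used here as a linear congruential generator step; written without a suffix they exceed the range of a 32-bit int/long and can produce warnings or errors on some compilers, which is what the ULL suffix avoids. A minimal standalone sketch (illustration only, not part of the patch):

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint64_t seed = 1;
      // Without the ULL suffix these literals overflow 32 bits on some compilers.
      seed *= 6364136223846793005ULL;  // LCG multiplier
      seed += 1442695040888963407ULL;  // LCG increment
      std::printf("%llu\n", (unsigned long long)seed);
      return 0;
    }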
From 4d92667e899844e09d782208b0bd04d95c8cff9d Mon Sep 17 00:00:00 2001 From: Tom Morris Date: Tue, 23 Jun 2015 16:19:18 -0400 Subject: [PATCH 07/22] Simplify build and run of ScrollView --- .gitignore | 2 ++ java/Makefile.am | 8 ++++++-- java/Manifest.txt | 2 ++ viewer/svutil.cpp | 13 +++++-------- 4 files changed, 15 insertions(+), 10 deletions(-) create mode 100644 java/Manifest.txt diff --git a/.gitignore b/.gitignore index e92fdeddbf..2ed887c5ad 100644 --- a/.gitignore +++ b/.gitignore @@ -59,6 +59,8 @@ training/wordlist2dawg *.o *.Plo *.a +*.class +*.jar # tessdata *.cube.* diff --git a/java/Makefile.am b/java/Makefile.am index 3ed962dfcc..43752b6f3b 100644 --- a/java/Makefile.am +++ b/java/Makefile.am @@ -42,18 +42,22 @@ SCROLLVIEW_LIBS = \ CLASSPATH = $(srcdir)/piccolo2d-core-3.0.jar:$(srcdir)/piccolo2d-extras-3.0.jar ScrollView.jar : $(SCROLLVIEW_CLASSES) - $(JAR) cf $@ com/google/scrollview/*.class \ + $(JAR) cfm $@ Manifest.txt com/google/scrollview/*.class \ com/google/scrollview/events/*.class com/google/scrollview/ui/*.class $(SCROLLVIEW_CLASSES) : $(SCROLLVIEW_FILES) $(JAVAC) -encoding UTF8 -sourcepath $(srcdir) -classpath $(CLASSPATH) $(SCROLLVIEW_FILES) -d $(builddir) +fetch-jars : + curl -L http://search.maven.org/remotecontent?filepath=org/piccolo2d/piccolo2d-core/3.0/piccolo2d-core-3.0.jar > piccolo2d-core-3.0.jar + curl -L http://search.maven.org/remotecontent?filepath=org/piccolo2d/piccolo2d-extras/3.0/piccolo2d-extras-3.0.jar > piccolo2d-extras-3.0.jar + .PHONY: install-jars install-jars : ScrollView.jar @if [ ! -d $(scrollview_path) ]; then mkdir -p $(scrollview_path); fi; $(INSTALL) -m 644 $(SCROLLVIEW_LIBS) $(scrollview_path); $(INSTALL) -m 644 ScrollView.jar $(scrollview_path); - @echo "Don't forget to set eviroment variable SCROLLVIEW_PATH to $(scrollview_path)"; + @echo "Don't forget to set enviroment variable SCROLLVIEW_PATH to $(scrollview_path)"; uninstall: rm -f $(scrollview_path)/*.jar diff --git a/java/Manifest.txt b/java/Manifest.txt new file mode 100644 index 0000000000..bc0b707bd8 --- /dev/null +++ b/java/Manifest.txt @@ -0,0 +1,2 @@ +Main-Class: com/google/scrollview/ScrollView +Class-Path: ScrollView.jar piccolo2d-core-3.0.jar piccolo2d-extras-3.0.jar diff --git a/viewer/svutil.cpp b/viewer/svutil.cpp index a820eafbc5..f94c1c86d5 100644 --- a/viewer/svutil.cpp +++ b/viewer/svutil.cpp @@ -127,7 +127,7 @@ SVSemaphore::SVSemaphore() { semaphore_ = CreateSemaphore(0, 0, 10, 0); #elif defined(__APPLE__) char name[50]; - snprintf(name, sizeof(name), "%d", random()); + snprintf(name, sizeof(name), "%ld", random()); sem_unlink(name); semaphore_ = sem_open(name, O_CREAT , S_IWUSR, 0); if (semaphore_ == SEM_FAILED) { @@ -296,14 +296,11 @@ static std::string ScrollViewCommand(std::string scrollview_path) { // this unnecessary. // Also the path has to be separated by ; on windows and : otherwise. 
#ifdef _WIN32 - const char* cmd_template = "-Djava.library.path=%s -cp %s/ScrollView.jar;" - "%s/piccolo2d-core-3.0.jar:%s/piccolo2d-extras-3.0.jar" - " com.google.scrollview.ScrollView"; + const char* cmd_template = "-Djava.library.path=%s -jar %s/ScrollView.jar"; + #else const char* cmd_template = "-c \"trap 'kill %%1' 0 1 2 ; java " - "-Xms1024m -Xmx2048m -Djava.library.path=%s -cp %s/ScrollView.jar:" - "%s/piccolo2d-core-3.0.jar:%s/piccolo2d-extras-3.0.jar" - " com.google.scrollview.ScrollView" + "-Xms1024m -Xmx2048m -jar %s/ScrollView.jar" " & wait\""; #endif int cmdlen = strlen(cmd_template) + 4*strlen(scrollview_path.c_str()) + 1; @@ -374,7 +371,7 @@ static int GetAddrInfo(const char* hostname, int port, struct addrinfo** address) { #if defined(__linux__) char port_str[40]; - snprintf(port_str, 40, "%d", port); + snprintf(port_str, 40, "%ld", port); return getaddrinfo(hostname, port_str, NULL, address); #else return GetAddrInfoNonLinux(hostname, port, address); From c0133ecfa61855d307e0104e0aa3b28291c0567b Mon Sep 17 00:00:00 2001 From: Nick White Date: Thu, 10 Sep 2015 14:57:17 +0100 Subject: [PATCH 08/22] Add --exposures option to tesstrain.sh This flag can be used to specify multiple different exposure levels for a training. There was some code already in tesstrain_utils.sh to deal with multiple exposure levels, so it looks like this functionality was always intended. The default usage does not change, with exposure level 0 being the only one used if --exposures is not used. --- training/tesstrain.sh | 1 + training/tesstrain_utils.sh | 69 +++++++++++++++++++++---------------- 2 files changed, 41 insertions(+), 29 deletions(-) diff --git a/training/tesstrain.sh b/training/tesstrain.sh index 7e292cc3f7..37de22222d 100755 --- a/training/tesstrain.sh +++ b/training/tesstrain.sh @@ -24,6 +24,7 @@ # --output_dir OUTPUTDIR # Location of output traineddata file. # --overwrite # Safe to overwrite files in output_dir. # --run_shape_clustering # Run shape clustering (use for Indic langs). +# --exposures EXPOSURES # A list of exposure levels to use (e.g. "-1 0 1"). # # OPTIONAL flags for input data. If unspecified we will look for them in # the langdata_dir directory. diff --git a/training/tesstrain_utils.sh b/training/tesstrain_utils.sh index 86e57b9a12..2983764978 100755 --- a/training/tesstrain_utils.sh +++ b/training/tesstrain_utils.sh @@ -26,6 +26,7 @@ OVERWRITE=0 RUN_SHAPE_CLUSTERING=0 EXTRACT_FONT_PROPERTIES=1 WORKSPACE_DIR="/tmp/tesstrain" +EXPOSURES=0 # Logging helper functions. tlog() { @@ -98,6 +99,16 @@ parse_flags() { FONTS=( ${ARGV[$j]} ) IFS=$ofs i=$j ;; + --exposures) + exp="" + while test $j -lt ${#ARGV[@]}; do + test -z ${ARGV[$j]} && break + test `echo ${ARGV[$j]} | cut -c -2` = "--" && break + exp="$exp ${ARGV[$j]}" + j=$((j+1)) + done + parse_value "EXPOSURES" "$exp" + i=$((j-1)) ;; --fonts_dir) parse_value "FONTS_DIR" ${ARGV[$j]} i=$j ;; @@ -226,35 +237,36 @@ phase_I_generate_image() { err_exit "Could not find training text file ${TRAINING_TEXT}" fi CHAR_SPACING="0.0" - EXPOSURE="0" - - if (( ${EXTRACT_FONT_PROPERTIES} )) && [[ -r ${BIGRAM_FREQS_FILE} ]]; then - # Parse .bigram_freqs file and compose a .train_ngrams file with text - # for tesseract to recognize during training. Take only the ngrams whose - # combined weight accounts for 95% of all the bigrams in the language. 
- NGRAM_FRAC=$(cat ${BIGRAM_FREQS_FILE} \ - | awk '{s=s+$2}; END {print (s/100)*p}' p=99) - cat ${BIGRAM_FREQS_FILE} | sort -rnk2 \ - | awk '{s=s+$2; if (s <= x) {printf "%s ", $1; } }' \ - x=${NGRAM_FRAC} > ${TRAIN_NGRAMS_FILE} - check_file_readable ${TRAIN_NGRAMS_FILE} - fi - local counter=0 - for font in "${FONTS[@]}"; do - generate_font_image "${font}" & - let counter=counter+1 - let rem=counter%par_factor - if [[ "${rem}" -eq 0 ]]; then - wait + for EXPOSURE in $EXPOSURES; do + if (( ${EXTRACT_FONT_PROPERTIES} )) && [[ -r ${BIGRAM_FREQS_FILE} ]]; then + # Parse .bigram_freqs file and compose a .train_ngrams file with text + # for tesseract to recognize during training. Take only the ngrams whose + # combined weight accounts for 95% of all the bigrams in the language. + NGRAM_FRAC=$(cat ${BIGRAM_FREQS_FILE} \ + | awk '{s=s+$2}; END {print (s/100)*p}' p=99) + cat ${BIGRAM_FREQS_FILE} | sort -rnk2 \ + | awk '{s=s+$2; if (s <= x) {printf "%s ", $1; } }' \ + x=${NGRAM_FRAC} > ${TRAIN_NGRAMS_FILE} + check_file_readable ${TRAIN_NGRAMS_FILE} fi - done - wait - # Check that each process was successful. - for font in "${FONTS[@]}"; do - local fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g') - local outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE} - check_file_readable ${outbase}.box ${outbase}.tif + + local counter=0 + for font in "${FONTS[@]}"; do + generate_font_image "${font}" & + let counter=counter+1 + let rem=counter%par_factor + if [[ "${rem}" -eq 0 ]]; then + wait + fi + done + wait + # Check that each process was successful. + for font in "${FONTS[@]}"; do + local fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g') + local outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE} + check_file_readable ${outbase}.box ${outbase}.tif + done done } @@ -359,10 +371,9 @@ phase_E_extract_features() { par_factor=1 fi tlog "\n=== Phase E: Extracting features ===" - TRAIN_EXPOSURES='0' local img_files="" - for exposure in ${TRAIN_EXPOSURES}; do + for exposure in ${EXPOSURES}; do img_files=${img_files}' '$(ls ${TRAINING_DIR}/*.exp${exposure}.tif) done From de789ac8ea351d848e3a742ad038f9053f9cf1f4 Mon Sep 17 00:00:00 2001 From: Nick White Date: Thu, 10 Sep 2015 15:05:07 +0100 Subject: [PATCH 09/22] Use mktemp to create workspace directory mktemp is a better idea for security, as well as enabling users to specify a different directory using the TMPDIR environment variable, which is useful if /tmp is a small tmpfs. Also fix a bug where the first few log messages were failing as the workspace directory wasn't been created early enough. --- training/tesstrain.sh | 5 +---- training/tesstrain_utils.sh | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/training/tesstrain.sh b/training/tesstrain.sh index 37de22222d..c1af1e86c1 100755 --- a/training/tesstrain.sh +++ b/training/tesstrain.sh @@ -49,11 +49,8 @@ source `dirname $0`/tesstrain_utils.sh ARGV=("$@") parse_flags -tlog "\n=== Starting training for language '${LANG_CODE}'" - -tlog "Cleaning workspace directory ${TRAINING_DIR}..." 
mkdir -p ${TRAINING_DIR} -rm -fr ${TRAINING_DIR}/* +tlog "\n=== Starting training for language '${LANG_CODE}'" source `dirname $0`/language-specific.sh set_lang_specific_parameters ${LANG_CODE} diff --git a/training/tesstrain_utils.sh b/training/tesstrain_utils.sh index 2983764978..30006bc1f7 100755 --- a/training/tesstrain_utils.sh +++ b/training/tesstrain_utils.sh @@ -25,7 +25,7 @@ OUTPUT_DIR="/tmp/tesstrain/tessdata" OVERWRITE=0 RUN_SHAPE_CLUSTERING=0 EXTRACT_FONT_PROPERTIES=1 -WORKSPACE_DIR="/tmp/tesstrain" +WORKSPACE_DIR=`mktemp -d` EXPOSURES=0 # Logging helper functions. From 48171dea5f29ab89f8dffe57406190a30561c4ff Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Mon, 14 Sep 2015 21:52:54 +0200 Subject: [PATCH 10/22] COPYING: Fix typo found by codespell Signed-off-by: Stefan Weil --- COPYING | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/COPYING b/COPYING index 096aaafb27..8d8d48cf91 100644 --- a/COPYING +++ b/COPYING @@ -1,5 +1,5 @@ This package contains the Tesseract Open Source OCR Engine. -Orignally developed at Hewlett Packard Laboratories Bristol and +Originally developed at Hewlett Packard Laboratories Bristol and at Hewlett Packard Co, Greeley Colorado, all the code in this distribution is now licensed under the Apache License: From 11b2a4d9af4277b452b0318f0f297f295f66b198 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Mon, 14 Sep 2015 21:54:27 +0200 Subject: [PATCH 11/22] api: Fix typos in comments (all found by codespell) Signed-off-by: Stefan Weil --- api/baseapi.cpp | 2 +- api/pdfrenderer.cpp | 2 +- api/tesseractmain.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/api/baseapi.cpp b/api/baseapi.cpp index bdc02bfe86..fa38d29001 100644 --- a/api/baseapi.cpp +++ b/api/baseapi.cpp @@ -1660,7 +1660,7 @@ char* TessBaseAPI::GetUNLVText() { word->word->space() > 0 && !word->word->flag(W_FUZZY_NON) && !word->word->flag(W_FUZZY_SP)) { - /* Write a space to separate from preceeding good text */ + /* Write a space to separate from preceding good text */ *ptr++ = ' '; last_char_was_tilde = false; } diff --git a/api/pdfrenderer.cpp b/api/pdfrenderer.cpp index e96f67c481..4f6afbe32e 100644 --- a/api/pdfrenderer.cpp +++ b/api/pdfrenderer.cpp @@ -178,7 +178,7 @@ void TessPDFRenderer::AppendPDFObject(const char *data) { AppendString((const char *)data); } -// Helper function to prevent us from accidentaly writing +// Helper function to prevent us from accidentally writing // scientific notation to an HOCR or PDF file. Besides, three // decimal points are all you really need. double prec(double x) { diff --git a/api/tesseractmain.cpp b/api/tesseractmain.cpp index e7abadf3d0..501b66c42c 100644 --- a/api/tesseractmain.cpp +++ b/api/tesseractmain.cpp @@ -227,7 +227,7 @@ int main(int argc, char **argv) { } // We have 2 possible sources of pagesegmode: a config file and - // the command line. For backwards compatability reasons, the + // the command line. For backwards compatibility reasons, the // default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the // default for this program is tesseract::PSM_AUTO. We will let // the config file take priority, so the command-line default From 318b88daa6b39c8c17bf38998e2ed3cfa25ab12a Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Mon, 14 Sep 2015 21:59:16 +0200 Subject: [PATCH 12/22] ccmain: Fix typos in comments and strings Most of them were found by codespell. 
Signed-off-by: Stefan Weil --- ccmain/control.cpp | 4 ++-- ccmain/docqual.cpp | 6 +++--- ccmain/fixspace.cpp | 10 +++++----- ccmain/output.cpp | 4 ++-- ccmain/paramsd.cpp | 4 ++-- ccmain/reject.cpp | 4 ++-- ccmain/tesseractclass.cpp | 20 ++++++++++---------- ccmain/tesseractclass.h | 22 +++++++++++----------- 8 files changed, 37 insertions(+), 37 deletions(-) diff --git a/ccmain/control.cpp b/ccmain/control.cpp index d40c26329b..66a2a8bb3e 100644 --- a/ccmain/control.cpp +++ b/ccmain/control.cpp @@ -1556,7 +1556,7 @@ void Tesseract::match_word_pass_n(int pass_n, WERD_RES *word, word->fix_quotes(); if (tessedit_fix_hyphens) word->fix_hyphens(); - /* Dont trust fix_quotes! - though I think I've fixed the bug */ + /* Don't trust fix_quotes! - though I think I've fixed the bug */ if (word->best_choice->length() != word->box_word->length()) { tprintf("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;" " #Blobs=%d\n", @@ -1694,7 +1694,7 @@ ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string( goto not_a_word; /* Allow a single hyphen in a lower case word - - dont trust upper case - I've seen several cases of "H" -> "I-I" + - don't trust upper case - I've seen several cases of "H" -> "I-I" */ if (lengths[i] == 1 && s[offset] == '-') { hyphen_pos = i; diff --git a/ccmain/docqual.cpp b/ccmain/docqual.cpp index 6a7e6e67ef..327d7cbc55 100644 --- a/ccmain/docqual.cpp +++ b/ccmain/docqual.cpp @@ -129,7 +129,7 @@ inT16 Tesseract::count_outline_errs(char c, inT16 outline_count) { int expected_outline_count; if (STRING (outlines_odd).contains (c)) - return 0; //Dont use this char + return 0; //Don't use this char else if (STRING (outlines_2).contains (c)) expected_outline_count = 2; else @@ -157,7 +157,7 @@ void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it, * - Word segmentation is the same as the original image * - All characters have the expected number of outlines * NOTE - the rejection counts are recalculated after unrejection - * - CANT do it in a single pass without a bit of fiddling + * - CAN'T do it in a single pass without a bit of fiddling * - keep it simple but inefficient *************************************************************************/ void Tesseract::unrej_good_quality_words( //unreject potential @@ -403,7 +403,7 @@ void Tesseract::doc_and_block_rejection( //reject big chunks /************************************************************************* * reject_whole_page() - * Dont believe any of it - set the reject map to 00..00 in all words + * Don't believe any of it - set the reject map to 00..00 in all words * *************************************************************************/ diff --git a/ccmain/fixspace.cpp b/ccmain/fixspace.cpp index 0a561ac9a0..e42617c053 100644 --- a/ccmain/fixspace.cpp +++ b/ccmain/fixspace.cpp @@ -55,7 +55,7 @@ void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor, WERD_RES *word_res; WERD_RES_LIST fuzzy_space_words; inT16 new_length; - BOOL8 prevent_null_wd_fixsp; // DONT process blobless wds + BOOL8 prevent_null_wd_fixsp; // DON'T process blobless wds inT32 word_index; // current word block_res_it.set_to_list(&page_res->block_res_list); @@ -222,7 +222,7 @@ void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row, * fuzzy spaces. The problem with the basic measure is that "561 63" would score * the same as "56163", though given our knowledge that the space is fuzzy, and * that there is a "1" next to the fuzzy space, we need to ensure that "56163" - * is prefered. + * is preferred. 
* * The solution is to NOT COUNT the score of any word which has a digit at one * end and a "1Il" as the character the other side of the space. @@ -272,8 +272,8 @@ inT16 Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) { } else { /* Can we add the prev word score and potentially count this word? - Yes IF it didnt end in a 1 when the first char of this word is a digit - AND it didnt end in a digit when the first char of this word is a 1 + Yes IF it didn't end in a 1 when the first char of this word is a digit + AND it didn't end in a digit when the first char of this word is a 1 */ word_len = word->reject_map.length(); current_word_ok_so_far = FALSE; @@ -507,7 +507,7 @@ BOOL8 Tesseract::fixspace_thinks_word_done(WERD_RES *word) { /* Use all the standard pass 2 conditions for mode 5 in set_done() in - reject.c BUT DONT REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DONT + reject.c BUT DON'T REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DON'T CARE WHETHER WE HAVE of/at on/an etc. */ if (fixsp_done_mode > 0 && diff --git a/ccmain/output.cpp b/ccmain/output.cpp index 42623b9ec8..ddfcfc54b6 100644 --- a/ccmain/output.cpp +++ b/ccmain/output.cpp @@ -297,7 +297,7 @@ UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated? /************************************************************************* * SUSPECT LEVELS * - * 0 - dont reject ANYTHING + * 0 - don't reject ANYTHING * 1,2 - partial rejection * 3 - BEST * @@ -337,7 +337,7 @@ void Tesseract::set_unlv_suspects(WERD_RES *word_res) { rating_per_ch = word.rating() / word_res->reject_map.length(); if (rating_per_ch >= suspect_rating_per_ch) - return; //Dont touch bad ratings + return; //Don't touch bad ratings if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) { /* Unreject any Tess Acceptable word - but NOT tess reject chs*/ diff --git a/ccmain/paramsd.cpp b/ccmain/paramsd.cpp index b141bede62..7784f85361 100644 --- a/ccmain/paramsd.cpp +++ b/ccmain/paramsd.cpp @@ -329,13 +329,13 @@ void ParamsEditor::WriteParams(char *filename, fclose(fp); sprintf (msg_str, "Overwrite file " "%s" "? (Y/N)", filename); int a = sv_window_->ShowYesNoDialog(msg_str); - if (a == 'n') { return; } // dont write + if (a == 'n') { return; } // don't write } fp = fopen (filename, "wb"); // can we write to it? if (fp == NULL) { - sv_window_->AddMessage("Cant write to file " "%s" "", filename); + sv_window_->AddMessage("Can't write to file " "%s" "", filename); return; } diff --git a/ccmain/reject.cpp b/ccmain/reject.cpp index 607b84179c..aacc80dd6e 100644 --- a/ccmain/reject.cpp +++ b/ccmain/reject.cpp @@ -521,7 +521,7 @@ BOOL8 Tesseract::word_contains_non_1_digit(const char *word, /************************************************************************* * dont_allow_1Il() - * Dont unreject LONE accepted 1Il conflict set chars + * Don't unreject LONE accepted 1Il conflict set chars *************************************************************************/ void Tesseract::dont_allow_1Il(WERD_RES *word) { int i = 0; @@ -633,7 +633,7 @@ void Tesseract::flip_hyphens(WERD_RES *word_res) { next_left = 9999; else next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left(); - // Dont touch small or touching blobs - it is too dangerous. + // Don't touch small or touching blobs - it is too dangerous. 
if ((out_box.width() > 8 * word_res->denorm.x_scale()) && (out_box.left() > prev_right) && (out_box.right() < next_left)) { aspect_ratio = out_box.width() / (float) out_box.height(); diff --git a/ccmain/tesseractclass.cpp b/ccmain/tesseractclass.cpp index 0c52f0efd9..e348c93f98 100644 --- a/ccmain/tesseractclass.cpp +++ b/ccmain/tesseractclass.cpp @@ -136,7 +136,7 @@ Tesseract::Tesseract() BOOL_MEMBER(tessedit_fix_fuzzy_spaces, true, "Try to improve fuzzy spaces", this->params()), BOOL_MEMBER(tessedit_unrej_any_wd, false, - "Dont bother with word plausibility", this->params()), + "Don't bother with word plausibility", this->params()), BOOL_MEMBER(tessedit_fix_hyphens, true, "Crunch double hyphens?", this->params()), BOOL_MEMBER(tessedit_redo_xheight, true, "Check/Correct x-height", @@ -310,19 +310,19 @@ Tesseract::Tesseract() this->params()), INT_MEMBER(crunch_pot_indicators, 1, "How many potential indicators needed", this->params()), - BOOL_MEMBER(crunch_leave_ok_strings, true, "Dont touch sensible strings", + BOOL_MEMBER(crunch_leave_ok_strings, true, "Don't touch sensible strings", this->params()), BOOL_MEMBER(crunch_accept_ok, true, "Use acceptability in okstring", this->params()), BOOL_MEMBER(crunch_leave_accept_strings, false, - "Dont pot crunch sensible strings", this->params()), + "Don't pot crunch sensible strings", this->params()), BOOL_MEMBER(crunch_include_numerals, false, "Fiddle alpha figures", this->params()), INT_MEMBER(crunch_leave_lc_strings, 4, - "Dont crunch words with long lower case strings", + "Don't crunch words with long lower case strings", this->params()), INT_MEMBER(crunch_leave_uc_strings, 4, - "Dont crunch words with long lower case strings", + "Don't crunch words with long lower case strings", this->params()), INT_MEMBER(crunch_long_repetitions, 3, "Crunch words with long repetitions", this->params()), @@ -393,21 +393,21 @@ Tesseract::Tesseract() INT_MEMBER(suspect_space_level, 100, "Min suspect level for rejecting spaces", this->params()), INT_MEMBER(suspect_short_words, 2, - "Dont Suspect dict wds longer than this", this->params()), + "Don't suspect dict wds longer than this", this->params()), BOOL_MEMBER(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected", this->params()), - double_MEMBER(suspect_rating_per_ch, 999.9, "Dont touch bad rating limit", + double_MEMBER(suspect_rating_per_ch, 999.9, "Don't touch bad rating limit", this->params()), double_MEMBER(suspect_accept_rating, -999.9, "Accept good rating limit", this->params()), BOOL_MEMBER(tessedit_minimal_rejection, false, "Only reject tess failures", this->params()), - BOOL_MEMBER(tessedit_zero_rejection, false, "Dont reject ANYTHING", + BOOL_MEMBER(tessedit_zero_rejection, false, "Don't reject ANYTHING", this->params()), BOOL_MEMBER(tessedit_word_for_word, false, "Make output have exactly one word per WERD", this->params()), BOOL_MEMBER(tessedit_zero_kelvin_rejection, false, - "Dont reject ANYTHING AT ALL", this->params()), + "Don't reject ANYTHING AT ALL", this->params()), BOOL_MEMBER(tessedit_consistent_reps, true, "Force all rep chars the same", this->params()), INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm", @@ -424,7 +424,7 @@ Tesseract::Tesseract() "Use DOC dawg in 11l conf. 
detector", this->params()), BOOL_MEMBER(rej_1Il_use_dict_word, false, "Use dictword test", this->params()), - BOOL_MEMBER(rej_1Il_trust_permuter_type, true, "Dont double check", + BOOL_MEMBER(rej_1Il_trust_permuter_type, true, "Don't double check", this->params()), BOOL_MEMBER(rej_use_tess_accepted, true, "Individual rejection control", this->params()), diff --git a/ccmain/tesseractclass.h b/ccmain/tesseractclass.h index 50141bf942..6666dec36b 100644 --- a/ccmain/tesseractclass.h +++ b/ccmain/tesseractclass.h @@ -733,7 +733,7 @@ class Tesseract : public Wordrec { GenericVector* class_ids); // Resegments the word to achieve the target_text from the classifier. // Returns false if the re-segmentation fails. - // Uses brute-force combination of upto kMaxGroupSize adjacent blobs, and + // Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and // applies a full search on the classifier results to find the best classified // segmentation. As a compromise to obtain better recall, 1-1 ambigiguity // substitutions ARE used. @@ -833,7 +833,7 @@ class Tesseract : public Wordrec { BOOL_VAR_H(tessedit_fix_fuzzy_spaces, true, "Try to improve fuzzy spaces"); BOOL_VAR_H(tessedit_unrej_any_wd, false, - "Dont bother with word plausibility"); + "Don't bother with word plausibility"); BOOL_VAR_H(tessedit_fix_hyphens, true, "Crunch double hyphens?"); BOOL_VAR_H(tessedit_redo_xheight, true, "Check/Correct x-height"); BOOL_VAR_H(tessedit_enable_doc_dict, true, @@ -954,15 +954,15 @@ class Tesseract : public Wordrec { double_VAR_H(crunch_small_outlines_size, 0.6, "Small if lt xht x this"); INT_VAR_H(crunch_rating_max, 10, "For adj length in rating per ch"); INT_VAR_H(crunch_pot_indicators, 1, "How many potential indicators needed"); - BOOL_VAR_H(crunch_leave_ok_strings, true, "Dont touch sensible strings"); + BOOL_VAR_H(crunch_leave_ok_strings, true, "Don't touch sensible strings"); BOOL_VAR_H(crunch_accept_ok, true, "Use acceptability in okstring"); BOOL_VAR_H(crunch_leave_accept_strings, false, - "Dont pot crunch sensible strings"); + "Don't pot crunch sensible strings"); BOOL_VAR_H(crunch_include_numerals, false, "Fiddle alpha figures"); INT_VAR_H(crunch_leave_lc_strings, 4, - "Dont crunch words with long lower case strings"); + "Don't crunch words with long lower case strings"); INT_VAR_H(crunch_leave_uc_strings, 4, - "Dont crunch words with long lower case strings"); + "Don't crunch words with long lower case strings"); INT_VAR_H(crunch_long_repetitions, 3, "Crunch words with long repetitions"); INT_VAR_H(crunch_debug, 0, "As it says"); INT_VAR_H(fixsp_non_noise_limit, 1, @@ -1010,16 +1010,16 @@ class Tesseract : public Wordrec { INT_VAR_H(suspect_space_level, 100, "Min suspect level for rejecting spaces"); INT_VAR_H(suspect_short_words, 2, - "Dont Suspect dict wds longer than this"); + "Don't Suspect dict wds longer than this"); BOOL_VAR_H(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected"); - double_VAR_H(suspect_rating_per_ch, 999.9, "Dont touch bad rating limit"); + double_VAR_H(suspect_rating_per_ch, 999.9, "Don't touch bad rating limit"); double_VAR_H(suspect_accept_rating, -999.9, "Accept good rating limit"); BOOL_VAR_H(tessedit_minimal_rejection, false, "Only reject tess failures"); - BOOL_VAR_H(tessedit_zero_rejection, false, "Dont reject ANYTHING"); + BOOL_VAR_H(tessedit_zero_rejection, false, "Don't reject ANYTHING"); BOOL_VAR_H(tessedit_word_for_word, false, "Make output have exactly one word per WERD"); BOOL_VAR_H(tessedit_zero_kelvin_rejection, false, - "Dont reject 
ANYTHING AT ALL"); + "Don't reject ANYTHING AT ALL"); BOOL_VAR_H(tessedit_consistent_reps, true, "Force all rep chars the same"); INT_VAR_H(tessedit_reject_mode, 0, "Rejection algorithm"); BOOL_VAR_H(tessedit_rejection_debug, false, "Adaption debug"); @@ -1030,7 +1030,7 @@ class Tesseract : public Wordrec { "Aspect ratio dot/hyphen test"); BOOL_VAR_H(rej_trust_doc_dawg, false, "Use DOC dawg in 11l conf. detector"); BOOL_VAR_H(rej_1Il_use_dict_word, false, "Use dictword test"); - BOOL_VAR_H(rej_1Il_trust_permuter_type, true, "Dont double check"); + BOOL_VAR_H(rej_1Il_trust_permuter_type, true, "Don't double check"); BOOL_VAR_H(rej_use_tess_accepted, true, "Individual rejection control"); BOOL_VAR_H(rej_use_tess_blanks, true, "Individual rejection control"); BOOL_VAR_H(rej_use_good_perm, true, "Individual rejection control"); From bef8cad38d377966697f2d2380c1dd927b33376f Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Mon, 14 Sep 2015 22:02:00 +0200 Subject: [PATCH 13/22] ccstruct: Fix typos in comments and strings Most of them were found by codespell. Signed-off-by: Stefan Weil --- ccstruct/blobbox.cpp | 2 +- ccstruct/boxread.cpp | 2 +- ccstruct/normalis.cpp | 2 +- ccstruct/normalis.h | 2 +- ccstruct/pdblock.h | 2 +- ccstruct/rejctmap.h | 8 ++++---- ccstruct/statistc.cpp | 6 +++--- ccstruct/vecfuncs.cpp | 2 +- 8 files changed, 13 insertions(+), 13 deletions(-) diff --git a/ccstruct/blobbox.cpp b/ccstruct/blobbox.cpp index 322171f0dc..280096b5d3 100644 --- a/ccstruct/blobbox.cpp +++ b/ccstruct/blobbox.cpp @@ -33,7 +33,7 @@ ELISTIZE (BLOBNBOX) ELIST2IZE (TO_ROW) ELISTIZE (TO_BLOCK) -// Upto 30 degrees is allowed for rotations of diacritic blobs. +// Up to 30 degrees is allowed for rotations of diacritic blobs. const double kCosSmallAngle = 0.866; // Min aspect ratio for a joined word to indicate an obvious flow direction. const double kDefiniteAspectRatio = 2.0; diff --git a/ccstruct/boxread.cpp b/ccstruct/boxread.cpp index 947fcc02fe..f4aedca5b3 100644 --- a/ccstruct/boxread.cpp +++ b/ccstruct/boxread.cpp @@ -35,7 +35,7 @@ FILE* OpenBoxFile(const STRING& fname) { FILE* box_file = NULL; if (!(box_file = fopen(filename.string(), "rb"))) { CANTOPENFILE.error("read_next_box", TESSEXIT, - "Cant open box file %s", + "Can't open box file %s", filename.string()); } return box_file; diff --git a/ccstruct/normalis.cpp b/ccstruct/normalis.cpp index d43a1459cb..ddf6dbf3b1 100644 --- a/ccstruct/normalis.cpp +++ b/ccstruct/normalis.cpp @@ -382,7 +382,7 @@ void DENORM::LocalDenormTransform(const FCOORD& pt, FCOORD* original) const { } // Transforms the given coords all the way back to source image space using -// the full transformation sequence defined by this and its predecesors +// the full transformation sequence defined by this and its predecessors // recursively, shallowest first, and finally any block re_rotation. // If last_denorm is not NULL, then the last transformation used will // be last_denorm, and the block re_rotation will never be executed. 
diff --git a/ccstruct/normalis.h b/ccstruct/normalis.h index c8ce7cd28b..2d75412078 100644 --- a/ccstruct/normalis.h +++ b/ccstruct/normalis.h @@ -218,7 +218,7 @@ class DENORM { void LocalDenormTransform(const TPOINT& pt, TPOINT* original) const; void LocalDenormTransform(const FCOORD& pt, FCOORD* original) const; // Transforms the given coords all the way back to source image space using - // the full transformation sequence defined by this and its predecesors + // the full transformation sequence defined by this and its predecessors // recursively, shallowest first, and finally any block re_rotation. // If last_denorm is not NULL, then the last transformation used will // be last_denorm, and the block re_rotation will never be executed. diff --git a/ccstruct/pdblock.h b/ccstruct/pdblock.h index 0dd0bf2ef8..b64eff36d0 100644 --- a/ccstruct/pdblock.h +++ b/ccstruct/pdblock.h @@ -108,7 +108,7 @@ class PDBLK PDBLK & operator= (const PDBLK & source); protected: - POLY_BLOCK *hand_poly; //< wierd as well + POLY_BLOCK *hand_poly; //< weird as well ICOORDELT_LIST leftside; //< left side vertices ICOORDELT_LIST rightside; //< right side vertices TBOX box; //< bounding box diff --git a/ccstruct/rejctmap.h b/ccstruct/rejctmap.h index 4b27bab49b..d945dda1fa 100644 --- a/ccstruct/rejctmap.h +++ b/ccstruct/rejctmap.h @@ -16,7 +16,7 @@ ** limitations under the License. * -This module may look unneccessarily verbose, but here's the philosophy... +This module may look unnecessarily verbose, but here's the philosophy... ALL processing of the reject map is done in this module. There are lots of separate calls to set reject/accept flags. These have DELIBERATELY been kept @@ -51,7 +51,7 @@ OF THIS IMPLIED TEMPORAL ORDERING OF THE FLAGS!!!! enum REJ_FLAGS { /* Reject modes which are NEVER overridden */ - R_TESS_FAILURE, // PERM Tess didnt classify + R_TESS_FAILURE, // PERM Tess didn't classify R_SMALL_XHT, // PERM Xht too small R_EDGE_CHAR, // PERM Too close to edge of image R_1IL_CONFLICT, // PERM 1Il confusion @@ -62,7 +62,7 @@ enum REJ_FLAGS /* Initial reject modes (pre NN_ACCEPT) */ R_POOR_MATCH, // TEMP Ray's original heuristic (Not used) - R_NOT_TESS_ACCEPTED, // TEMP Tess didnt accept WERD + R_NOT_TESS_ACCEPTED, // TEMP Tess didn't accept WERD R_CONTAINS_BLANKS, // TEMP Tess failed on other chs in WERD R_BAD_PERMUTER, // POTENTIAL Bad permuter for WERD @@ -82,7 +82,7 @@ enum REJ_FLAGS R_ROW_REJ, // TEMP Row rejection R_UNLV_REJ, // TEMP ~ turned to - or ^ turned to space - /* Accept modes which occur inbetween the above rejection groups */ + /* Accept modes which occur between the above rejection groups */ R_NN_ACCEPT, //NN acceptance R_HYPHEN_ACCEPT, //Hyphen acceptance R_MM_ACCEPT, //Matrix match acceptance diff --git a/ccstruct/statistc.cpp b/ccstruct/statistc.cpp index 63676c2fca..39d5edd180 100644 --- a/ccstruct/statistc.cpp +++ b/ccstruct/statistc.cpp @@ -204,7 +204,7 @@ double STATS::ile(double frac) const { /********************************************************************** * STATS::min_bucket * - * Find REAL minimum bucket - ile(0.0) isnt necessarily correct + * Find REAL minimum bucket - ile(0.0) isn't necessarily correct **********************************************************************/ inT32 STATS::min_bucket() const { // Find min if (buckets_ == NULL || total_count_ == 0) { @@ -219,7 +219,7 @@ inT32 STATS::min_bucket() const { // Find min /********************************************************************** * STATS::max_bucket * - * Find REAL maximum bucket - ile(1.0) isnt 
necessarily correct + * Find REAL maximum bucket - ile(1.0) isn't necessarily correct **********************************************************************/ inT32 STATS::max_bucket() const { // Find max @@ -249,7 +249,7 @@ double STATS::median() const { //get median if ((total_count_ > 1) && (pile_count(median_pile) == 0)) { inT32 min_pile; inT32 max_pile; - /* Find preceeding non zero pile */ + /* Find preceding non zero pile */ for (min_pile = median_pile; pile_count(min_pile) == 0; min_pile--); /* Find following non zero pile */ for (max_pile = median_pile; pile_count(max_pile) == 0; max_pile++); diff --git a/ccstruct/vecfuncs.cpp b/ccstruct/vecfuncs.cpp index 8357c9aabe..bafca55d60 100644 --- a/ccstruct/vecfuncs.cpp +++ b/ccstruct/vecfuncs.cpp @@ -23,7 +23,7 @@ * ******************************************************************************** * Revision 5.1 89/07/27 11:47:50 11:47:50 ray () - * Added ratings acces methods. + * Added ratings access methods. * This version ready for independent development. */ /*---------------------------------------------------------------------- From 539b7fbbabce3f51aeadc860ae98818f2d681b7c Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Mon, 14 Sep 2015 22:09:18 +0200 Subject: [PATCH 14/22] ccutil: Fix typos in comments and strings Most of them were found by codespell. Signed-off-by: Stefan Weil --- ccutil/clst.cpp | 4 ++-- ccutil/clst.h | 20 ++++++++++---------- ccutil/elst.cpp | 6 +++--- ccutil/elst.h | 16 ++++++++-------- ccutil/elst2.cpp | 6 +++--- ccutil/elst2.h | 18 +++++++++--------- ccutil/errcode.h | 2 +- ccutil/genericvector.h | 2 +- ccutil/lsterr.h | 2 +- ccutil/ocrclass.h | 2 +- ccutil/strngs.cpp | 8 ++++---- ccutil/tessdatamanager.cpp | 2 +- ccutil/tessdatamanager.h | 2 +- 13 files changed, 45 insertions(+), 45 deletions(-) diff --git a/ccutil/clst.cpp b/ccutil/clst.cpp index 60f88d3706..e71cc20100 100644 --- a/ccutil/clst.cpp +++ b/ccutil/clst.cpp @@ -190,7 +190,7 @@ const void *, const void *)) { // Assuming list has been sorted already, insert new_data to // keep the list sorted according to the same comparison function. -// Comparision function is the same as used by sort, i.e. uses double +// Comparison function is the same as used by sort, i.e. uses double // indirection. Time is O(1) to add to beginning or end. // Time is linear to add pre-sorted items to an empty list. // If unique, then don't add duplicate entries. @@ -513,7 +513,7 @@ CLIST_LINK *CLIST_ITERATOR::extract_sublist( //from temp_it.mark_cycle_pt (); do { //walk sublist - if (temp_it.cycled_list ()) //cant find end pt + if (temp_it.cycled_list ()) //can't find end pt BAD_SUBLIST.error ("CLIST_ITERATOR.extract_sublist", ABORT, NULL); if (temp_it.at_last ()) { diff --git a/ccutil/clst.h b/ccutil/clst.h index 89c4369949..a209ac11cc 100644 --- a/ccutil/clst.h +++ b/ccutil/clst.h @@ -51,11 +51,11 @@ class DLLSYM CLIST_LINK } CLIST_LINK( //copy constructor - const CLIST_LINK &) { //dont copy link + const CLIST_LINK &) { //don't copy link data = next = NULL; } - void operator= ( //dont copy links + void operator= ( //don't copy links const CLIST_LINK &) { data = next = NULL; } @@ -89,7 +89,7 @@ class DLLSYM CLIST void internal_deep_clear ( //destroy all links void (*zapper) (void *)); //ptr to zapper functn - void shallow_clear(); //clear list but dont + void shallow_clear(); //clear list but don't //delete data elements bool empty() const { //is list empty? 
@@ -117,7 +117,7 @@ class DLLSYM CLIST // Assuming list has been sorted already, insert new_data to // keep the list sorted according to the same comparison function. - // Comparision function is the same as used by sort, i.e. uses double + // Comparison function is the same as used by sort, i.e. uses double // indirection. Time is O(1) to add to beginning or end. // Time is linear to add pre-sorted items to an empty list. // If unique, then don't add duplicate entries. @@ -232,7 +232,7 @@ class DLLSYM CLIST_ITERATOR BOOL8 cycled_list(); //Completed a cycle? void add_to_end( //add at end & - void *new_data); //dont move + void *new_data); //don't move void exchange( //positions of 2 links CLIST_ITERATOR *other_it); //other iterator @@ -437,7 +437,7 @@ inline void CLIST_ITERATOR::add_before_then_move( // element to add /*********************************************************************** * CLIST_ITERATOR::add_before_stay_put * - * Add a new element to the list before the current element but dont move the + * Add a new element to the list before the current element but don't move the * iterator to the new element. **********************************************************************/ @@ -485,7 +485,7 @@ inline void CLIST_ITERATOR::add_before_stay_put( // element to add /*********************************************************************** * CLIST_ITERATOR::add_list_after * - * Insert another list to this list after the current element but dont move the + * Insert another list to this list after the current element but don't move the * iterator. **********************************************************************/ @@ -836,7 +836,7 @@ Replace with "". may be an arbitrary number of tokens CLASSNAME is assumed to be the name of a class to be used in a CONS list -NOTE: Because we dont use virtual functions in the list code, the list code +NOTE: Because we don't use virtual functions in the list code, the list code will NOT work correctly for classes derived from this. The macro generates: @@ -885,7 +885,7 @@ public: \ CLASSNAME##_CLIST():CLIST() {} \ /* constructor */ \ \ - CLASSNAME##_CLIST( /* dont construct */ \ + CLASSNAME##_CLIST( /* don't construct */ \ const CLASSNAME##_CLIST&) /*by initial assign*/ \ { DONT_CONSTRUCT_LIST_BY_COPY.error( QUOTE_IT( CLASSNAME##_CLIST ), \ ABORT, NULL ); } \ @@ -963,7 +963,7 @@ CLISTIZEH_C( CLASSNAME ) * A function which can delete a CLASSNAME element. This is passed to the \ * generic deep_clear list member function so that when a list is cleared the \ * elements on the list are properly destroyed from the base class, even \ -* though we dont use a virtual destructor function. \ +* though we don't use a virtual destructor function. \ **********************************************************************/ \ \ DLLSYM void CLASSNAME##_c1_zapper( /*delete a link*/ \ diff --git a/ccutil/elst.cpp b/ccutil/elst.cpp index 7762220d6e..67a8ab0cbe 100644 --- a/ccutil/elst.cpp +++ b/ccutil/elst.cpp @@ -117,7 +117,7 @@ inT32 ELIST::length() const { // count elements * ELIST::sort * * Sort elements on list - * NB If you dont like the const declarations in the comparator, coerce yours: + * NB If you don't like the const declarations in the comparator, coerce yours: * ( int (*)(const void *, const void *) **********************************************************************/ @@ -161,7 +161,7 @@ const void *, const void *)) { // Assuming list has been sorted already, insert new_link to // keep the list sorted according to the same comparison function. 
-// Comparision function is the same as used by sort, i.e. uses double +// Comparison function is the same as used by sort, i.e. uses double // indirection. Time is O(1) to add to beginning or end. // Time is linear to add pre-sorted items to an empty list. // If unique is set to true and comparator() returns 0 (an entry with the @@ -455,7 +455,7 @@ ELIST_LINK *ELIST_ITERATOR::extract_sublist( //from temp_it.mark_cycle_pt (); do { //walk sublist - if (temp_it.cycled_list ()) //cant find end pt + if (temp_it.cycled_list ()) //can't find end pt BAD_SUBLIST.error ("ELIST_ITERATOR.extract_sublist", ABORT, NULL); if (temp_it.at_last ()) { diff --git a/ccutil/elst.h b/ccutil/elst.h index dcc1552d16..492c03acb3 100644 --- a/ccutil/elst.h +++ b/ccutil/elst.h @@ -67,7 +67,7 @@ The implementation of lists is very careful about space and speed overheads. This is why many embedded lists are provided. The same concerns mean that in-line type coercion is done, rather than use virtual functions. This is cumbersome in that each data type to be listed requires its own iterator and -list class - though macros can gererate these. It also prevents heterogenous +list class - though macros can gererate these. It also prevents heterogeneous lists. **********************************************************************/ @@ -98,7 +98,7 @@ class DLLSYM ELIST_LINK next = NULL; } - void operator= ( //dont copy links + void operator= ( //don't copy links const ELIST_LINK &) { next = NULL; } @@ -158,7 +158,7 @@ class DLLSYM ELIST // Assuming list has been sorted already, insert new_link to // keep the list sorted according to the same comparison function. - // Comparision function is the same as used by sort, i.e. uses double + // Comparison function is the same as used by sort, i.e. uses double // indirection. Time is O(1) to add to beginning or end. // Time is linear to add pre-sorted items to an empty list. // If unique is set to true and comparator() returns 0 (an entry with the @@ -274,7 +274,7 @@ class DLLSYM ELIST_ITERATOR bool cycled_list(); //Completed a cycle? void add_to_end( //add at end & - ELIST_LINK *new_link); //dont move + ELIST_LINK *new_link); //don't move void exchange( //positions of 2 links ELIST_ITERATOR *other_it); //other iterator @@ -470,7 +470,7 @@ inline void ELIST_ITERATOR::add_before_then_move( // element to add /*********************************************************************** * ELIST_ITERATOR::add_before_stay_put * - * Add a new element to the list before the current element but dont move the + * Add a new element to the list before the current element but don't move the * iterator to the new element. **********************************************************************/ @@ -515,7 +515,7 @@ inline void ELIST_ITERATOR::add_before_stay_put( // element to add /*********************************************************************** * ELIST_ITERATOR::add_list_after * - * Insert another list to this list after the current element but dont move the + * Insert another list to this list after the current element but don't move the * iterator. **********************************************************************/ @@ -868,7 +868,7 @@ Replace with "". may be an arbitrary number of tokens CLASSNAME is assumed to be the name of a class which has a baseclass of ELIST_LINK. -NOTE: Because we dont use virtual functions in the list code, the list code +NOTE: Because we don't use virtual functions in the list code, the list code will NOT work correctly for classes derived from this. 
The macros generate: @@ -999,7 +999,7 @@ ELISTIZEH_C( CLASSNAME ) * A function which can delete a CLASSNAME element. This is passed to the \ * generic clear list member function so that when a list is cleared the \ * elements on the list are properly destroyed from the base class, even \ -* though we dont use a virtual destructor function. \ +* though we don't use a virtual destructor function. \ **********************************************************************/ \ \ DLLSYM void CLASSNAME##_zapper(ELIST_LINK* link) { \ diff --git a/ccutil/elst2.cpp b/ccutil/elst2.cpp index 7055686fb5..fe5b77e256 100644 --- a/ccutil/elst2.cpp +++ b/ccutil/elst2.cpp @@ -118,7 +118,7 @@ inT32 ELIST2::length() const { // count elements * ELIST2::sort * * Sort elements on list - * NB If you dont like the const declarations in the comparator, coerce yours: + * NB If you don't like the const declarations in the comparator, coerce yours: * ( int (*)(const void *, const void *) **********************************************************************/ @@ -162,7 +162,7 @@ const void *, const void *)) { // Assuming list has been sorted already, insert new_link to // keep the list sorted according to the same comparison function. -// Comparision function is the same as used by sort, i.e. uses double +// Comparison function is the same as used by sort, i.e. uses double // indirection. Time is O(1) to add to beginning or end. // Time is linear to add pre-sorted items to an empty list. void ELIST2::add_sorted(int comparator(const void*, const void*), @@ -475,7 +475,7 @@ ELIST2_LINK *ELIST2_ITERATOR::extract_sublist( //fr temp_it.mark_cycle_pt (); do { //walk sublist - if (temp_it.cycled_list ()) //cant find end pt + if (temp_it.cycled_list ()) //can't find end pt BAD_SUBLIST.error ("ELIST2_ITERATOR.extract_sublist", ABORT, NULL); if (temp_it.at_last ()) { diff --git a/ccutil/elst2.h b/ccutil/elst2.h index 7201750dcb..f7ea6ed07c 100644 --- a/ccutil/elst2.h +++ b/ccutil/elst2.h @@ -69,11 +69,11 @@ class DLLSYM ELIST2_LINK } ELIST2_LINK( //copy constructor - const ELIST2_LINK &) { //dont copy link + const ELIST2_LINK &) { //don't copy link prev = next = NULL; } - void operator= ( //dont copy links + void operator= ( //don't copy links const ELIST2_LINK &) { prev = next = NULL; } @@ -133,7 +133,7 @@ class DLLSYM ELIST2 // Assuming list has been sorted already, insert new_link to // keep the list sorted according to the same comparison function. - // Comparision function is the same as used by sort, i.e. uses double + // Comparison function is the same as used by sort, i.e. uses double // indirection. Time is O(1) to add to beginning or end. // Time is linear to add pre-sorted items to an empty list. void add_sorted(int comparator(const void*, const void*), @@ -241,7 +241,7 @@ class DLLSYM ELIST2_ITERATOR BOOL8 cycled_list(); //Completed a cycle? void add_to_end( //add at end & - ELIST2_LINK *new_link); //dont move + ELIST2_LINK *new_link); //don't move void exchange( //positions of 2 links ELIST2_ITERATOR *other_it); //other iterator @@ -450,7 +450,7 @@ inline void ELIST2_ITERATOR::add_before_then_move( // element to add /*********************************************************************** * ELIST2_ITERATOR::add_before_stay_put * - * Add a new element to the list before the current element but dont move the + * Add a new element to the list before the current element but don't move the * iterator to the new element. 
**********************************************************************/ @@ -500,7 +500,7 @@ inline void ELIST2_ITERATOR::add_before_stay_put( // element to add /*********************************************************************** * ELIST2_ITERATOR::add_list_after * - * Insert another list to this list after the current element but dont move the + * Insert another list to this list after the current element but don't move the * iterator. **********************************************************************/ @@ -883,7 +883,7 @@ Replace with "". may be an arbitrary number of tokens CLASSNAME is assumed to be the name of a class which has a baseclass of ELIST2_LINK. -NOTE: Because we dont use virtual functions in the list code, the list code +NOTE: Because we don't use virtual functions in the list code, the list code will NOT work correctly for classes derived from this. The macro generates: @@ -927,7 +927,7 @@ public: \ CLASSNAME##_LIST():ELIST2() {} \ /* constructor */ \ \ - CLASSNAME##_LIST( /* dont construct */ \ + CLASSNAME##_LIST( /* don't construct */ \ const CLASSNAME##_LIST&) /*by initial assign*/\ { DONT_CONSTRUCT_LIST_BY_COPY.error( QUOTE_IT( CLASSNAME##_LIST ), \ ABORT, NULL ); } \ @@ -1015,7 +1015,7 @@ ELIST2IZEH_C( CLASSNAME ) * A function which can delete a CLASSNAME element. This is passed to the \ * generic clear list member function so that when a list is cleared the \ * elements on the list are properly destroyed from the base class, even \ -* though we dont use a virtual destructor function. \ +* though we don't use a virtual destructor function. \ **********************************************************************/ \ \ DLLSYM void CLASSNAME##_zapper( /*delete a link*/ \ diff --git a/ccutil/errcode.h b/ccutil/errcode.h index 89385d2b93..69d4187a37 100644 --- a/ccutil/errcode.h +++ b/ccutil/errcode.h @@ -53,7 +53,7 @@ enum TessErrorLogCode { #define LOC_DOC_BLK_REJ 22 #define LOC_WRITE_RESULTS 23 #define LOC_ADAPTIVE 24 -/* DONT DEFINE ANY LOCATION > 31 !!! */ +/* DON'T DEFINE ANY LOCATION > 31 !!! */ /* Sub locatation determines whether pass2 was in normal mode or fix xht mode*/ #define SUBLOC_NORM 0 diff --git a/ccutil/genericvector.h b/ccutil/genericvector.h index 8433966bf9..a0ca9e2926 100644 --- a/ccutil/genericvector.h +++ b/ccutil/genericvector.h @@ -949,7 +949,7 @@ bool GenericVector::SerializeClasses(tesseract::TFile* fp) const { // Reads a vector of classes from the given file. Assumes the existence of // bool T::Deserialize(bool swap, FILE* fp) that returns false in case of -// error. Alse needs T::T() and T::T(constT&), as init_to_size is used in +// error. Also needs T::T() and T::T(constT&), as init_to_size is used in // this function. Returns false in case of error. // If swap is true, assumes a big/little-endian swap is needed. template diff --git a/ccutil/lsterr.h b/ccutil/lsterr.h index 6bcd7fead1..42ed07e326 100644 --- a/ccutil/lsterr.h +++ b/ccutil/lsterr.h @@ -38,6 +38,6 @@ const ERRCODE NULL_PREV = "Previous element on the list is NULL"; const ERRCODE EMPTY_LIST = "List is empty"; const ERRCODE BAD_PARAMETER = "List parameter error"; const ERRCODE STILL_LINKED = -"Attemting to add an element with non NULL links, to a list"; +"Attempting to add an element with non NULL links, to a list"; #endif #endif diff --git a/ccutil/ocrclass.h b/ccutil/ocrclass.h index 37556b30b2..9be184d591 100644 --- a/ccutil/ocrclass.h +++ b/ccutil/ocrclass.h @@ -21,7 +21,7 @@ * the HP OCR interface. * The code is designed to be used with either a C or C++ compiler. 
* The structures are designed to allow them to be used with any - * structure alignment upto 8. + * structure alignment up to 8. **********************************************************************/ #ifndef CCUTIL_OCRCLASS_H_ diff --git a/ccutil/strngs.cpp b/ccutil/strngs.cpp index 1c9769978a..b44c541246 100644 --- a/ccutil/strngs.cpp +++ b/ccutil/strngs.cpp @@ -45,7 +45,7 @@ const int kMaxDoubleSize = 15; * * The collection of MACROS provide different implementations depending * on whether the string keeps track of its strlen or not so that this - * feature can be added in later when consumers dont modifify the string + * feature can be added in later when consumers don't modify the string **********************************************************************/ // Smallest string to allocate by default @@ -339,7 +339,7 @@ STRING& STRING::operator=(const STRING& str) { const STRING_HEADER* str_header = str.GetHeader(); int str_used = str_header->used_; - GetHeader()->used_ = 0; // clear since ensure doesnt need to copy data + GetHeader()->used_ = 0; // clear since ensure doesn't need to copy data char* this_cstr = ensure_cstr(str_used); STRING_HEADER* this_header = GetHeader(); @@ -398,7 +398,7 @@ STRING & STRING::operator=(const char* cstr) { if (cstr) { int len = strlen(cstr) + 1; - this_header->used_ = 0; // dont bother copying data if need to realloc + this_header->used_ = 0; // don't bother copying data if need to realloc char* this_cstr = ensure_cstr(len); this_header = GetHeader(); // for realloc memcpy(this_cstr, cstr, len); @@ -416,7 +416,7 @@ STRING & STRING::operator=(const char* cstr) { void STRING::assign(const char *cstr, int len) { STRING_HEADER* this_header = GetHeader(); - this_header->used_ = 0; // dont bother copying data if need to realloc + this_header->used_ = 0; // don't bother copying data if need to realloc char* this_cstr = ensure_cstr(len + 1); // +1 for '\0' this_header = GetHeader(); // for realloc diff --git a/ccutil/tessdatamanager.cpp b/ccutil/tessdatamanager.cpp index 032d5fee61..23d029bb42 100644 --- a/ccutil/tessdatamanager.cpp +++ b/ccutil/tessdatamanager.cpp @@ -51,7 +51,7 @@ bool TessdataManager::Init(const char *data_file_name, int debug_level) { sizeof(actual_tessdata_num_entries_)); } if (actual_tessdata_num_entries_ > TESSDATA_NUM_ENTRIES) { - // For forward compatability, truncate to the number we can handle. + // For forward compatibility, truncate to the number we can handle. actual_tessdata_num_entries_ = TESSDATA_NUM_ENTRIES; } fread(offset_table_, sizeof(inT64), diff --git a/ccutil/tessdatamanager.h b/ccutil/tessdatamanager.h index de3e599025..fd2685a1d8 100644 --- a/ccutil/tessdatamanager.h +++ b/ccutil/tessdatamanager.h @@ -282,7 +282,7 @@ class TessdataManager { * same or smaller than TESSDATA_NUM_ENTRIES, but can never be larger, * since then it would be impossible to interpret the type of tessdata at * indices same and higher than TESSDATA_NUM_ENTRIES. - * This parameter is used to allow for backward compatiblity + * This parameter is used to allow for backward compatibility * when new tessdata types are introduced. */ inT32 actual_tessdata_num_entries_; From 55fde61a8f42d8486f5759e9e78182d1e58b4ec7 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Mon, 14 Sep 2015 22:12:06 +0200 Subject: [PATCH 15/22] classify: Fix typos in comments and strings All of them were found by codespell. 
Signed-off-by: Stefan Weil --- classify/adaptmatch.cpp | 6 +++--- classify/classify.cpp | 2 +- classify/classify.h | 2 +- classify/cluster.cpp | 2 +- classify/clusttool.h | 2 +- classify/featdefs.cpp | 2 +- classify/intfx.cpp | 2 +- classify/kdtree.cpp | 2 +- classify/mfoutline.cpp | 4 ++-- classify/picofeat.cpp | 2 +- 10 files changed, 13 insertions(+), 13 deletions(-) diff --git a/classify/adaptmatch.cpp b/classify/adaptmatch.cpp index 3a6ef1c498..b89f1cb7ae 100644 --- a/classify/adaptmatch.cpp +++ b/classify/adaptmatch.cpp @@ -515,7 +515,7 @@ void Classify::EndAdaptiveClassifier() { * load_pre_trained_templates Indicates whether the pre-trained * templates (inttemp, normproto and pffmtable components) * should be lodaded. Should only be set to true if the - * necesary classifier components are present in the + * necessary classifier components are present in the * [lang].traineddata file. * Globals: * BuiltInTemplatesFile file to get built-in temps from @@ -1720,7 +1720,7 @@ bool Classify::LooksLikeGarbage(TBLOB *blob) { * * Globals: * - * @return Number of features extracted or 0 if an error occured. + * @return Number of features extracted or 0 if an error occurred. * @note Exceptions: none * @note History: Tue May 28 10:40:52 1991, DSJ, Created. */ @@ -2082,7 +2082,7 @@ void Classify::PrintAdaptiveMatchResults(const ADAPT_RESULTS& results) { /*---------------------------------------------------------------------------*/ /** - * This routine steps thru each matching class in Results + * This routine steps through each matching class in Results * and removes it from the match list if its rating * is worse than the BestRating plus a pad. In other words, * all good matches get moved to the front of the classes diff --git a/classify/classify.cpp b/classify/classify.cpp index c68fc27643..436efd1f2d 100644 --- a/classify/classify.cpp +++ b/classify/classify.cpp @@ -151,7 +151,7 @@ Classify::Classify() INT_MEMBER(classify_integer_matcher_multiplier, 10, "Integer Matcher Multiplier 0-255: ", this->params()), EnableLearning(true), - INT_MEMBER(il1_adaption_test, 0, "Dont adapt to i/I at beginning of word", + INT_MEMBER(il1_adaption_test, 0, "Don't adapt to i/I at beginning of word", this->params()), BOOL_MEMBER(classify_bln_numeric_mode, 0, "Assume the input is numbers [0-9].", this->params()), diff --git a/classify/classify.h b/classify/classify.h index e952394630..0de8441527 100644 --- a/classify/classify.h +++ b/classify/classify.h @@ -495,7 +495,7 @@ class Classify : public CCStruct { // font combinations that the shape represents. 
UnicityTable fontset_table_; - INT_VAR_H(il1_adaption_test, 0, "Dont adapt to i/I at beginning of word"); + INT_VAR_H(il1_adaption_test, 0, "Don't adapt to i/I at beginning of word"); BOOL_VAR_H(classify_bln_numeric_mode, 0, "Assume the input is numbers [0-9]."); double_VAR_H(speckle_large_max_size, 0.30, "Max large speckle size"); diff --git a/classify/cluster.cpp b/classify/cluster.cpp index ef46f77c21..b723bfa82e 100644 --- a/classify/cluster.cpp +++ b/classify/cluster.cpp @@ -182,7 +182,7 @@ struct BUCKETS { FLOAT64 ChiSquared; // test threshold uinT16 NumberOfBuckets; // number of cells in histogram uinT16 Bucket[BUCKETTABLESIZE];// mapping to histogram buckets - uinT32 *Count; // frequency of occurence histogram + uinT32 *Count; // frequency of occurrence histogram FLOAT32 *ExpectedCount; // expected histogram }; diff --git a/classify/clusttool.h b/classify/clusttool.h index a4f3b8351d..e82fa1ef48 100644 --- a/classify/clusttool.h +++ b/classify/clusttool.h @@ -24,7 +24,7 @@ #include /*------------------------------------------------------------------------- - Public Funtion Prototype + Public Function Prototype --------------------------------------------------------------------------*/ uinT16 ReadSampleSize(FILE *File); diff --git a/classify/featdefs.cpp b/classify/featdefs.cpp index cf9e551509..ad7b799675 100644 --- a/classify/featdefs.cpp +++ b/classify/featdefs.cpp @@ -285,7 +285,7 @@ CHAR_DESC ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, /*---------------------------------------------------------------------------*/ /** - * Search thru all features currently defined and return + * Search through all features currently defined and return * the feature type for the feature with the specified short * name. Trap an error if the specified name is not found. * diff --git a/classify/intfx.cpp b/classify/intfx.cpp index 12966aa195..78aa59bbc9 100644 --- a/classify/intfx.cpp +++ b/classify/intfx.cpp @@ -44,7 +44,7 @@ using tesseract::TrainingSample; // The entries are in binary degrees where a full circle is 256 binary degrees. static float cos_table[INT_CHAR_NORM_RANGE]; static float sin_table[INT_CHAR_NORM_RANGE]; -// Guards write access to AtanTable so we dont create it more than once. +// Guards write access to AtanTable so we don't create it more than once. tesseract::CCUtilMutex atan_table_mutex; diff --git a/classify/kdtree.cpp b/classify/kdtree.cpp index 8d05149cc1..61a94f66cc 100644 --- a/classify/kdtree.cpp +++ b/classify/kdtree.cpp @@ -521,7 +521,7 @@ bool KDTreeSearch::BoxIntersectsSearch(FLOAT32 *lower, FLOAT32 *upper) { * Walk a tree, calling action once on each node. 
* * Operation: - * This routine walks thru the specified sub_tree and invokes action + * This routine walks through the specified sub_tree and invokes action * action at each node as follows: * action(context, data, level) * data the data contents of the node being visited, diff --git a/classify/mfoutline.cpp b/classify/mfoutline.cpp index 7f1b04ad44..511c34d41f 100644 --- a/classify/mfoutline.cpp +++ b/classify/mfoutline.cpp @@ -104,7 +104,7 @@ LIST ConvertOutlines(TESSLINE *outline, /*---------------------------------------------------------------------------*/ /** - * This routine searches thru the specified outline, computes + * This routine searches through the specified outline, computes * a slope for each vector in the outline, and marks each * vector as having one of the following directions: * N, S, E, W, NE, NW, SE, SW @@ -182,7 +182,7 @@ void FreeOutlines(LIST Outlines) { /*---------------------------------------------------------------------------*/ /** - * This routine searches thru the specified outline and finds + * This routine searches through the specified outline and finds * the points at which the outline changes direction. These * points are then marked as "extremities". This routine is * used as an alternative to FindExtremities(). It forces the diff --git a/classify/picofeat.cpp b/classify/picofeat.cpp index fea3b14121..74beb18f35 100644 --- a/classify/picofeat.cpp +++ b/classify/picofeat.cpp @@ -147,7 +147,7 @@ void ConvertSegmentToPicoFeat(FPOINT *Start, /*---------------------------------------------------------------------------*/ /** - * This routine steps thru the specified outline and cuts it + * This routine steps through the specified outline and cuts it * up into pieces of equal length. These pieces become the * desired pico-features. Each segment in the outline * is converted into an integral number of pico-features. From 5378679dce5b9604e56657a466800db5fb6a390d Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Mon, 14 Sep 2015 22:14:03 +0200 Subject: [PATCH 16/22] cube: Fix typos in comments All of them were found by codespell. Signed-off-by: Stefan Weil --- cube/beam_search.cpp | 2 +- cube/beam_search.h | 2 +- cube/conv_net_classifier.cpp | 2 +- cube/conv_net_classifier.h | 2 +- cube/cube_line_object.cpp | 2 +- cube/cube_line_segmenter.cpp | 4 ++-- cube/cube_search_object.cpp | 2 +- cube/cube_search_object.h | 2 +- cube/hybrid_neural_net_classifier.cpp | 4 ++-- cube/hybrid_neural_net_classifier.h | 2 +- cube/tess_lang_model.cpp | 2 +- 11 files changed, 13 insertions(+), 13 deletions(-) diff --git a/cube/beam_search.cpp b/cube/beam_search.cpp index a89b15d8a9..fd17a1d59f 100644 --- a/cube/beam_search.cpp +++ b/cube/beam_search.cpp @@ -93,7 +93,7 @@ void BeamSearch::CreateChildren(SearchColumn *out_col, LangModel *lang_mod, } // lm_edges } -// Performs a beam seach in the specified search using the specified +// Performs a beam search in the specified search using the specified // language model; returns an alternate list of possible words as a result. 
WordAltList * BeamSearch::Search(SearchObject *srch_obj, LangModel *lang_mod) { // verifications diff --git a/cube/beam_search.h b/cube/beam_search.h index a39f5b1349..cd8fc0110d 100644 --- a/cube/beam_search.h +++ b/cube/beam_search.h @@ -45,7 +45,7 @@ class BeamSearch { public: explicit BeamSearch(CubeRecoContext *cntxt, bool word_mode = true); ~BeamSearch(); - // Performs a beam seach in the specified search using the specified + // Performs a beam search in the specified search using the specified // language model; returns an alternate list of possible words as a result. WordAltList *Search(SearchObject *srch_obj, LangModel *lang_mod = NULL); // Returns the best node in the last column of last performed search. diff --git a/cube/conv_net_classifier.cpp b/cube/conv_net_classifier.cpp index d6ae692e7b..ac33cd33b1 100644 --- a/cube/conv_net_classifier.cpp +++ b/cube/conv_net_classifier.cpp @@ -72,7 +72,7 @@ bool ConvNetCharClassifier::Train(CharSamp *char_samp, int ClassID) { /** * A secondary function needed for training. Allows the trainer to set the - * value of any train-time paramter. This function is currently not + * value of any train-time parameter. This function is currently not * implemented. TODO(ahmadab): implement end-2-end training */ bool ConvNetCharClassifier::SetLearnParam(char *var_name, float val) { diff --git a/cube/conv_net_classifier.h b/cube/conv_net_classifier.h index e9bcd8c2cc..b9e7692c28 100644 --- a/cube/conv_net_classifier.h +++ b/cube/conv_net_classifier.h @@ -55,7 +55,7 @@ class ConvNetCharClassifier : public CharClassifier { // is currently not implemented. TODO(ahmadab): implement end-2-end training virtual bool Train(CharSamp *char_samp, int ClassID); // A secondary function needed for training. Allows the trainer to set the - // value of any train-time paramter. This function is currently not + // value of any train-time parameter. This function is currently not // implemented. TODO(ahmadab): implement end-2-end training virtual bool SetLearnParam(char *var_name, float val); // Externally sets the Neural Net used by the classifier. Used for training diff --git a/cube/cube_line_object.cpp b/cube/cube_line_object.cpp index 64b90cadff..0325453740 100644 --- a/cube/cube_line_object.cpp +++ b/cube/cube_line_object.cpp @@ -247,7 +247,7 @@ int CubeLineObject::ComputeWordBreakThreshold(int con_comp_cnt, word_break_threshold--; } while (!valid && word_break_threshold > 0); - // failed to find a threshold that acheives the target aspect ratio. + // failed to find a threshold that achieves the target aspect ratio. 
// Just use the default threshold return static_cast(line_pix_->h * cntxt_->Params()->MaxSpaceHeightRatio()); diff --git a/cube/cube_line_segmenter.cpp b/cube/cube_line_segmenter.cpp index 82f8c8ede4..278011f090 100644 --- a/cube/cube_line_segmenter.cpp +++ b/cube/cube_line_segmenter.cpp @@ -237,7 +237,7 @@ Pixa *CubeLineSegmenter::CrackLine(Pix *cracked_line_pix, return NULL; } -// split a line continously until valid or fail +// split a line continuously until valid or fail Pixa *CubeLineSegmenter::SplitLine(Pix *line_mask_pix, Box *line_box) { // clone the line mask Pix *line_pix = pixClone(line_mask_pix); @@ -739,7 +739,7 @@ bool CubeLineSegmenter::LineSegment() { return true; } -// Estimate the paramters of the font(s) used in the page +// Estimate the parameters of the font(s) used in the page bool CubeLineSegmenter::EstimateFontParams() { int hgt_hist[kHgtBins]; int max_hgt; diff --git a/cube/cube_search_object.cpp b/cube/cube_search_object.cpp index 0cf54e31a9..61294f26b6 100644 --- a/cube/cube_search_object.cpp +++ b/cube/cube_search_object.cpp @@ -212,7 +212,7 @@ CharSamp *CubeSearchObject::CharSample(int start_pt, int end_pt) { samp->SetLastChar(last_char ? 255 : 0); } else { // for non cursive languages, these features correspond - // to whether the charsamp is at the begining or end of the word + // to whether the charsamp is at the beginning or end of the word samp->SetFirstChar((start_pt == -1) ? 255 : 0); samp->SetLastChar((end_pt == (segment_cnt_ - 1)) ? 255 : 0); } diff --git a/cube/cube_search_object.h b/cube/cube_search_object.h index 8452417a69..0a6c3ce20b 100644 --- a/cube/cube_search_object.h +++ b/cube/cube_search_object.h @@ -114,7 +114,7 @@ class CubeSearchObject : public SearchObject { end_pt <= (start_pt + max_seg_per_char_)); } // computes the space and no space costs at gaps between segments - // return true on sucess + // return true on success bool ComputeSpaceCosts(); }; } diff --git a/cube/hybrid_neural_net_classifier.cpp b/cube/hybrid_neural_net_classifier.cpp index b5822f6f22..671a74acdf 100644 --- a/cube/hybrid_neural_net_classifier.cpp +++ b/cube/hybrid_neural_net_classifier.cpp @@ -72,7 +72,7 @@ bool HybridNeuralNetCharClassifier::Train(CharSamp *char_samp, int ClassID) { } // A secondary function needed for training. Allows the trainer to set the -// value of any train-time paramter. This function is currently not +// value of any train-time parameter. This function is currently not // implemented. TODO(ahmadab): implement end-2-end training bool HybridNeuralNetCharClassifier::SetLearnParam(char *var_name, float val) { // TODO(ahmadab): implementation of parameter initializing. @@ -151,7 +151,7 @@ bool HybridNeuralNetCharClassifier::RunNets(CharSamp *char_samp) { return false; } - // go thru all the nets + // go through all the nets memset(net_output_, 0, class_cnt * sizeof(*net_output_)); float *inputs = net_input_; for (int net_idx = 0; net_idx < nets_.size(); net_idx++) { diff --git a/cube/hybrid_neural_net_classifier.h b/cube/hybrid_neural_net_classifier.h index 0ab9ba1235..6ad6233f43 100644 --- a/cube/hybrid_neural_net_classifier.h +++ b/cube/hybrid_neural_net_classifier.h @@ -48,7 +48,7 @@ class HybridNeuralNetCharClassifier : public CharClassifier { // is currently not implemented. TODO(ahmadab): implement end-2-end training virtual bool Train(CharSamp *char_samp, int ClassID); // A secondary function needed for training. Allows the trainer to set the - // value of any train-time paramter. 
This function is currently not + // value of any train-time parameter. This function is currently not // implemented. TODO(ahmadab): implement end-2-end training virtual bool SetLearnParam(char *var_name, float val); // Externally sets the Neural Net used by the classifier. Used for training diff --git a/cube/tess_lang_model.cpp b/cube/tess_lang_model.cpp index 8b4ff68ee4..5113207260 100644 --- a/cube/tess_lang_model.cpp +++ b/cube/tess_lang_model.cpp @@ -397,7 +397,7 @@ int TessLangModel::NumberEdges(EDGE_REF edge_ref, LangModEdge **edge_array) { return 0; } - // go thru all valid transitions from the state + // go through all valid transitions from the state int edge_cnt = 0; EDGE_REF new_edge_ref; From 55c81cb19306874a455f821cb9f3cfd29ef5f470 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Mon, 14 Sep 2015 22:16:13 +0200 Subject: [PATCH 17/22] cutil: Fix typos in comments All of them were found by codespell. Signed-off-by: Stefan Weil --- cutil/listio.h | 2 +- cutil/oldlist.cpp | 2 +- cutil/oldlist.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cutil/listio.h b/cutil/listio.h index e758c9bcb2..7d9c19f777 100644 --- a/cutil/listio.h +++ b/cutil/listio.h @@ -37,7 +37,7 @@ #include "oldlist.h" /*---------------------------------------------------------------------------- - Public Funtion Prototypes + Public Function Prototypes --------------------------------------------------------------------------*/ LIST read_list(const char *filename); #endif diff --git a/cutil/oldlist.cpp b/cutil/oldlist.cpp index cf93ffb518..52c0d8680a 100644 --- a/cutil/oldlist.cpp +++ b/cutil/oldlist.cpp @@ -407,7 +407,7 @@ LIST s_adjoin(LIST var_list, void *variable, int_compare compare) { * * Search list, return NIL_LIST if not found. Return the list starting from * the item if found. The compare routine "is_equal" is passed in as - * the third paramter to this routine. If the value NULL is supplied + * the third parameter to this routine. If the value NULL is supplied * for is_equal, the is_key routine will be used. **********************************************************************/ LIST search(LIST list, void *key, int_compare is_equal) { diff --git a/cutil/oldlist.h b/cutil/oldlist.h index 103dd72592..a0130ae061 100644 --- a/cutil/oldlist.h +++ b/cutil/oldlist.h @@ -234,7 +234,7 @@ first_node (list_rest (l)) first_node (list_rest (list_rest (l))) /*---------------------------------------------------------------------- - Public Funtion Prototypes + Public Function Prototypes ----------------------------------------------------------------------*/ int count(LIST var_list); From 97d47a406df52fd3e55d4f92c13caa81defe8a8f Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Mon, 14 Sep 2015 22:16:42 +0200 Subject: [PATCH 18/22] dict: Fix typos in comments and strings All of them were found by codespell. Signed-off-by: Stefan Weil --- dict/context.cpp | 2 +- dict/dawg.h | 2 +- dict/dict.cpp | 2 +- dict/dict.h | 2 +- dict/permdawg.cpp | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/dict/context.cpp b/dict/context.cpp index 206447d98f..a9acb137c3 100644 --- a/dict/context.cpp +++ b/dict/context.cpp @@ -33,7 +33,7 @@ static const int kMinAbsoluteGarbageWordLength = 10; static const float kMinAbsoluteGarbageAlphanumFrac = 0.5f; const int case_state_table[6][4] = { { - /* 0. Begining of word */ + /* 0. Beginning of word */ /* P U L D */ /* -1. 
Error on case */ 0, 1, 5, 4 diff --git a/dict/dawg.h b/dict/dawg.h index a487d3fd1c..b37e771503 100644 --- a/dict/dawg.h +++ b/dict/dawg.h @@ -447,7 +447,7 @@ class SquishedDawg : public Dawg { EDGE_REF edge = node; if (!edge_occupied(edge) || edge == NO_EDGE) return; assert(forward_edge(edge)); // we don't expect any backward edges to - do { // be present when this funciton is called + do { // be present when this function is called if (!word_end || end_of_word_from_edge_rec(edges_[edge])) { vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge)); } diff --git a/dict/dict.cpp b/dict/dict.cpp index 8df5b63bb4..e59b00d58b 100644 --- a/dict/dict.cpp +++ b/dict/dict.cpp @@ -127,7 +127,7 @@ Dict::Dict(CCUtil* ccutil) " when there is a need to explore all segmentations", getCCUtil()->params()), BOOL_MEMBER(save_raw_choices, false, - "Deprecated- backward compatablity only", + "Deprecated- backward compatibility only", getCCUtil()->params()), INT_MEMBER(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list", diff --git a/dict/dict.h b/dict/dict.h index 7556bc5460..938ca3a332 100644 --- a/dict/dict.h +++ b/dict/dict.h @@ -614,7 +614,7 @@ class Dict { "Make AcceptableChoice() always return false. Useful" " when there is a need to explore all segmentations"); BOOL_VAR_H(save_raw_choices, false, - "Deprecated- backward compatability only"); + "Deprecated- backward compatibility only"); INT_VAR_H(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list"); STRING_VAR_H(word_to_debug, "", "Word for which stopper debug information" " should be printed to stdout"); diff --git a/dict/permdawg.cpp b/dict/permdawg.cpp index 7d60d395ff..71e2deca43 100644 --- a/dict/permdawg.cpp +++ b/dict/permdawg.cpp @@ -303,7 +303,7 @@ void Dict::append_choices( * * The given prev_char_frag_info contains: * - fragment: if not NULL contains information about immediately - * preceeding fragmented character choice + * preceding fragmented character choice * - num_fragments: number of fragments that have been used so far * to construct a character * - certainty: certainty of the current choice or minimum From 4d2fd0f8c9e9eb67769558735ffc17e5668b1bb1 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Mon, 14 Sep 2015 22:17:48 +0200 Subject: [PATCH 19/22] Doxyfile: Fix typo in comment (found by codespell) Signed-off-by: Stefan Weil --- doc/Doxyfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/Doxyfile b/doc/Doxyfile index 673defaf10..c4f496be39 100644 --- a/doc/Doxyfile +++ b/doc/Doxyfile @@ -1657,7 +1657,7 @@ EXTRA_PACKAGES = # following commands have a special meaning inside the header: $title, # $datetime, $date, $doxygenversion, $projectname, $projectnumber, # $projectbrief, $projectlogo. Doxygen will replace $title with the empy string, -# for the replacement values of the other commands the user is refered to +# for the replacement values of the other commands the user is referred to # HTML_HEADER. # This tag requires that the tag GENERATE_LATEX is set to YES. From 425c2391b202552d539b8674d2376f66124ccd1e Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Mon, 14 Sep 2015 22:18:44 +0200 Subject: [PATCH 20/22] Java: Fix typos in comments and strings All of them were found by codespell. 
Signed-off-by: Stefan Weil --- java/Makefile.am | 2 +- java/com/google/scrollview/ui/SVMenuBar.java | 4 ++-- java/com/google/scrollview/ui/SVPopupMenu.java | 2 +- java/com/google/scrollview/ui/SVWindow.java | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/java/Makefile.am b/java/Makefile.am index 43752b6f3b..fddbc6f9ec 100644 --- a/java/Makefile.am +++ b/java/Makefile.am @@ -57,7 +57,7 @@ install-jars : ScrollView.jar @if [ ! -d $(scrollview_path) ]; then mkdir -p $(scrollview_path); fi; $(INSTALL) -m 644 $(SCROLLVIEW_LIBS) $(scrollview_path); $(INSTALL) -m 644 ScrollView.jar $(scrollview_path); - @echo "Don't forget to set enviroment variable SCROLLVIEW_PATH to $(scrollview_path)"; + @echo "Don't forget to set environment variable SCROLLVIEW_PATH to $(scrollview_path)"; uninstall: rm -f $(scrollview_path)/*.jar diff --git a/java/com/google/scrollview/ui/SVMenuBar.java b/java/com/google/scrollview/ui/SVMenuBar.java index 7c2f5d9af8..9a87524ef0 100644 --- a/java/com/google/scrollview/ui/SVMenuBar.java +++ b/java/com/google/scrollview/ui/SVMenuBar.java @@ -50,7 +50,7 @@ public SVMenuBar(SVWindow scrollView) { /** - * A click on one of the items in our menubar has occured. Forward it + * A click on one of the items in our menubar has occurred. Forward it * to the item itself to let it decide what happens. */ public void actionPerformed(ActionEvent e) { @@ -111,7 +111,7 @@ else if (id == -1) { * @param name The caption of the new entry. * @param id The Id of the new entry. If it is -1, the entry will be treated * as a menu. - * @param b Whether the entry is initally flagged. + * @param b Whether the entry is initially flagged. * */ diff --git a/java/com/google/scrollview/ui/SVPopupMenu.java b/java/com/google/scrollview/ui/SVPopupMenu.java index 6427c0ef85..14c8b3acd3 100644 --- a/java/com/google/scrollview/ui/SVPopupMenu.java +++ b/java/com/google/scrollview/ui/SVPopupMenu.java @@ -123,7 +123,7 @@ public void add(String parent, String name, int id, String value, String desc) { /** - * A click on one of the items in our menubar has occured. Forward it + * A click on one of the items in our menubar has occurred. Forward it * to the item itself to let it decide what happens. */ public void actionPerformed(ActionEvent e) { diff --git a/java/com/google/scrollview/ui/SVWindow.java b/java/com/google/scrollview/ui/SVWindow.java index f4960276f6..267bfdda03 100644 --- a/java/com/google/scrollview/ui/SVWindow.java +++ b/java/com/google/scrollview/ui/SVWindow.java @@ -298,7 +298,7 @@ public void addMessageBox() { ta.setEditable(false); getContentPane().add(ta, BorderLayout.SOUTH); } - // We need to make the window bigger to accomodate the message box. + // We need to make the window bigger to accommodate the message box. winSizeY += DEF_MESSAGEBOX_HEIGHT; setSize(winSizeX, winSizeY); } From 00a4e06be94486843f81a9ee2298b8c98e2a1491 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Mon, 14 Sep 2015 22:23:48 +0200 Subject: [PATCH 21/22] wordrec: Fix typos in comments All of them were found by codespell. Signed-off-by: Stefan Weil --- wordrec/lm_state.h | 4 ++-- wordrec/pieces.cpp | 2 +- wordrec/wordrec.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/wordrec/lm_state.h b/wordrec/lm_state.h index c87745b75a..623bbb5e7f 100644 --- a/wordrec/lm_state.h +++ b/wordrec/lm_state.h @@ -177,11 +177,11 @@ struct ViterbiStateEntry : public ELIST_LINK { /// the smallest rating or lower/upper case letters). 
LanguageModelFlagsType top_choice_flags; - /// Extra information maintained by Dawg laguage model component + /// Extra information maintained by Dawg language model component /// (owned by ViterbiStateEntry). LanguageModelDawgInfo *dawg_info; - /// Extra information maintained by Ngram laguage model component + /// Extra information maintained by Ngram language model component /// (owned by ViterbiStateEntry). LanguageModelNgramInfo *ngram_info; diff --git a/wordrec/pieces.cpp b/wordrec/pieces.cpp index 1818478c66..04e340396e 100644 --- a/wordrec/pieces.cpp +++ b/wordrec/pieces.cpp @@ -273,7 +273,7 @@ void Wordrec::merge_and_put_fragment_lists(inT16 row, inT16 column, * * Recursively go through the ratings matrix to find lists of fragments * to be merged in the function merge_and_put_fragment_lists. - * current_frag is the postion of the piece we are looking for. + * current_frag is the position of the piece we are looking for. * current_row is the row in the rating matrix we are currently at. * start is the row we started initially, so that we can know where * to append the results to the matrix. num_frag_parts is the total diff --git a/wordrec/wordrec.h b/wordrec/wordrec.h index 38f09f23d2..fb54ccae08 100644 --- a/wordrec/wordrec.h +++ b/wordrec/wordrec.h @@ -375,7 +375,7 @@ class Wordrec : public Classify { inT16 num_blobs); // Recursively go through the ratings matrix to find lists of fragments // to be merged in the function merge_and_put_fragment_lists. - // current_frag is the postion of the piece we are looking for. + // current_frag is the position of the piece we are looking for. // current_row is the row in the rating matrix we are currently at. // start is the row we started initially, so that we can know where // to append the results to the matrix. num_frag_parts is the total From bd917b9feed14c50a02830ef5ca0a51da08f6612 Mon Sep 17 00:00:00 2001 From: Pepe Bawagan Date: Tue, 29 Sep 2015 00:24:05 +0800 Subject: [PATCH 22/22] adds sudo to "make install" command for consistency with instructions that show up while installing --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b932974397..5c2be10fe9 100644 --- a/README.md +++ b/README.md @@ -97,7 +97,7 @@ find its data directory. You must either: ./autogen.sh ./configure make - make install + sudo make install sudo ldconfig to move the data files to the standard place, or: