Fixed training leaks and randomness

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@653 d0cd1f9f-072b-0410-8dd7-cf729c803f20
jimregan · Feb 2, 2012 · e33ae59 · e33ae59
1 parent 01026af
commit e33ae59
Show file tree

Hide file tree

Showing 11 changed files with 1,090 additions and 715 deletions.
diff --git a/training/Makefile.am b/training/Makefile.am
@@ -1,5 +1,7 @@
 SUBDIRS =
 AM_CPPFLAGS = \
+    -DUSE_STD_NAMESPACE \
+    -I$(top_srcdir)/ccmain -I$(top_srcdir)/api \
     -I$(top_srcdir)/ccutil -I$(top_srcdir)/ccstruct \
     -I$(top_srcdir)/image -I$(top_srcdir)/viewer \
     -I$(top_srcdir)/textord -I$(top_srcdir)/dict \
@@ -18,7 +20,55 @@ libtesseract_tessopt_la_SOURCES = \
     tessopt.cpp
 libtesseract_tessopt_la_LDFLAGS = -static
 
-bin_PROGRAMS = cntraining combine_tessdata mftraining unicharset_extractor wordlist2dawg
+bin_PROGRAMS = ambiguous_words classifier_tester cntraining combine_tessdata dawg2wordlist mftraining shapeclustering unicharset_extractor wordlist2dawg
+
+ambiguous_words_SOURCES = ambiguous_words.cpp
+ambiguous_words_LDADD = \
+    libtesseract_training.la \
+    libtesseract_tessopt.la
+if USING_MULTIPLELIBS
+ambiguous_words_LDADD += \
+    $(top_srcdir)/api/libtesseract_api.la \
+    $(top_srcdir)/textord/libtesseract_textord.la \
+    $(top_srcdir)/classify/libtesseract_classify.la \
+    $(top_srcdir)/dict/libtesseract_dict.la \
+    $(top_srcdir)/ccstruct/libtesseract_ccstruct.la \
+    $(top_srcdir)/image/libtesseract_image.la \
+    $(top_srcdir)/cutil/libtesseract_cutil.la \
+    $(top_srcdir)/viewer/libtesseract_viewer.la \
+    $(top_srcdir)/ccmain/libtesseract_main.la \
+    $(top_srcdir)/cube/libtesseract_cube.la \
+    $(top_srcdir)/neural_networks/runtime/libtesseract_neural.la \
+    $(top_srcdir)/wordrec/libtesseract_wordrec.la \
+    $(top_srcdir)/ccutil/libtesseract_ccutil.la
+else
+ambiguous_words_LDADD += \
+    $(top_srcdir)/api/libtesseract.la
+endif
+
+classifier_tester_SOURCES = classifier_tester.cpp
+classifier_tester_LDADD = \
+    libtesseract_training.la \
+    libtesseract_tessopt.la
+# classifier_tester.cpp instantiates TessBaseAPI, so the multi-library build
+# must link the api library as well (as ambiguous_words does).
+if USING_MULTIPLELIBS
+classifier_tester_LDADD += \
+    $(top_srcdir)/api/libtesseract_api.la \
+    $(top_srcdir)/textord/libtesseract_textord.la \
+    $(top_srcdir)/classify/libtesseract_classify.la \
+    $(top_srcdir)/dict/libtesseract_dict.la \
+    $(top_srcdir)/ccstruct/libtesseract_ccstruct.la \
+    $(top_srcdir)/image/libtesseract_image.la \
+    $(top_srcdir)/cutil/libtesseract_cutil.la \
+    $(top_srcdir)/viewer/libtesseract_viewer.la \
+    $(top_srcdir)/ccmain/libtesseract_main.la \
+    $(top_srcdir)/cube/libtesseract_cube.la \
+    $(top_srcdir)/neural_networks/runtime/libtesseract_neural.la \
+    $(top_srcdir)/wordrec/libtesseract_wordrec.la \
+    $(top_srcdir)/ccutil/libtesseract_ccutil.la
+else
+classifier_tester_LDADD += \
+    $(top_srcdir)/api/libtesseract.la
+endif
+
 combine_tessdata_SOURCES = combine_tessdata.cpp
 if USING_MULTIPLELIBS
 combine_tessdata_LDADD = \
@@ -51,6 +101,28 @@ cntraining_LDADD += \
     $(top_srcdir)/api/libtesseract.la
 endif
 
+dawg2wordlist_SOURCES = dawg2wordlist.cpp
+dawg2wordlist_LDADD = \
+    libtesseract_tessopt.la
+if USING_MULTIPLELIBS
+dawg2wordlist_LDADD += \
+    $(top_srcdir)/classify/libtesseract_classify.la \
+    $(top_srcdir)/dict/libtesseract_dict.la \
+    $(top_srcdir)/ccstruct/libtesseract_ccstruct.la \
+    $(top_srcdir)/image/libtesseract_image.la \
+    $(top_srcdir)/cutil/libtesseract_cutil.la \
+    $(top_srcdir)/viewer/libtesseract_viewer.la \
+    $(top_srcdir)/ccmain/libtesseract_main.la \
+    $(top_srcdir)/cube/libtesseract_cube.la \
+    $(top_srcdir)/neural_networks/runtime/libtesseract_neural.la \
+    $(top_srcdir)/wordrec/libtesseract_wordrec.la \
+    $(top_srcdir)/textord/libtesseract_textord.la \
+    $(top_srcdir)/ccutil/libtesseract_ccutil.la
+else
+dawg2wordlist_LDADD += \
+    $(top_srcdir)/api/libtesseract.la
+endif
+
 mftraining_SOURCES = mftraining.cpp mergenf.cpp
 mftraining_LDADD = \
     libtesseract_training.la \
@@ -74,6 +146,29 @@ mftraining_LDADD += \
     $(top_srcdir)/api/libtesseract.la
 endif
 
+shapeclustering_SOURCES = shapeclustering.cpp
+shapeclustering_LDADD = \
+    libtesseract_training.la \
+    libtesseract_tessopt.la
+if USING_MULTIPLELIBS
+shapeclustering_LDADD += \
+    $(top_srcdir)/textord/libtesseract_textord.la \
+    $(top_srcdir)/classify/libtesseract_classify.la \
+    $(top_srcdir)/dict/libtesseract_dict.la \
+    $(top_srcdir)/ccstruct/libtesseract_ccstruct.la \
+    $(top_srcdir)/image/libtesseract_image.la \
+    $(top_srcdir)/cutil/libtesseract_cutil.la \
+    $(top_srcdir)/viewer/libtesseract_viewer.la \
+    $(top_srcdir)/ccmain/libtesseract_main.la \
+    $(top_srcdir)/cube/libtesseract_cube.la \
+    $(top_srcdir)/neural_networks/runtime/libtesseract_neural.la \
+    $(top_srcdir)/wordrec/libtesseract_wordrec.la \
+    $(top_srcdir)/ccutil/libtesseract_ccutil.la
+else
+shapeclustering_LDADD += \
+    $(top_srcdir)/api/libtesseract.la
+endif
+
 unicharset_extractor_SOURCES = unicharset_extractor.cpp
 unicharset_extractor_LDADD = \
     libtesseract_tessopt.la

diff --git a/training/ambiguous_words.cpp b/training/ambiguous_words.cpp
@@ -0,0 +1,77 @@
+///////////////////////////////////////////////////////////////////////
+// File:        ambiguous_words.cpp
+// Description: A program that takes a text file with a list of words as
+//              input (one per line) and outputs a file with the words
+//              that were found in the dictionary followed by the words
+//              that are ambiguous to them.
+// Author:      Rika Antonova
+// Created:     Fri Oct 21 11:26:43 PDT 2011
+//
+// (C) Copyright 2011, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+///////////////////////////////////////////////////////////////////////
+//
+
+#include <stdio.h>
+
+#include "baseapi.h"
+#include "helpers.h"
+#include "strngs.h"
+#include "dict.h"
+#include "tesseractclass.h"
+
+// Entry point. Reads the word list one line at a time and calls
+// Dict::NoDangerousAmbig() on each word, with output_ambig_words_file set
+// to the requested output path.
+// Returns 0 on success and 1 on a usage, initialization or file-open error.
+int main(int argc, char** argv) {
+  // Parse input arguments: either the 3 positional arguments alone, or
+  // "-l lang" followed by the same 3 positional arguments.
+  if (argc != 4 && (argc != 6 || strcmp(argv[1], "-l") != 0)) {
+    printf("Usage: %s [-l lang] tessdata_dir wordlist_file"
+           " output_ambiguous_wordlist_file\n", argv[0]);
+    return 1;
+  }
+  int argv_offset = 0;
+  STRING lang;
+  if (argc == 6) {
+    lang = argv[2];
+    argv_offset = 2;  // Skip over "-l lang".
+  } else {
+    lang = "eng";
+  }
+  const char *tessdata_dir = argv[++argv_offset];
+  const char *input_file_str = argv[++argv_offset];
+  const char *output_file_str = argv[++argv_offset];
+
+  // Initialize Tesseract.
+  tesseract::TessBaseAPI api;
+  GenericVector<STRING> vars_vec;
+  GenericVector<STRING> vars_values;
+  vars_vec.push_back("output_ambig_words_file");
+  vars_values.push_back(output_file_str);
+  // Do not touch the dictionary if initialization failed.
+  if (api.Init(tessdata_dir, lang.string(), tesseract::OEM_TESSERACT_ONLY,
+               NULL, NULL, &vars_vec, &vars_values, false) < 0) {
+    tprintf("Tesseract initialization failed!\n");
+    return 1;
+  }
+  tesseract::Dict &dict = api.tesseract()->getDict();
+  FILE *input_file = fopen(input_file_str, "r");
+  if (input_file == NULL) {
+    tprintf("Failed to open input wordlist file %s\n", input_file_str);
+    // Return rather than exit() so that api is destroyed normally.
+    return 1;
+  }
+  char str[CHARS_PER_LINE];
+
+  // Read word list and call Dict::NoDangerousAmbig() for each word
+  // to record ambiguities in the output file.
+  while (fgets(str, CHARS_PER_LINE, input_file) != NULL) {
+    chomp_string(str);  // remove newline
+    WERD_CHOICE word(str, dict.getUnicharset());
+    dict.NoDangerousAmbig(&word, NULL, false, NULL, NULL);
+  }
+  // Clean up.
+  fclose(input_file);
+  return 0;
+}
diff --git a/training/classifier_tester.cpp b/training/classifier_tester.cpp
@@ -0,0 +1,139 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+// Author: rays@google.com (Ray Smith)
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+//  Filename: classifier_tester.cpp
+//  Purpose:  Tests a character classifier on data as formatted for training,
+//            but doesn't have to be the same as the training data.
+//  Author:   Ray Smith
+
+#ifndef USE_STD_NAMESPACE
+#include "base/commandlineflags.h"
+#endif
+#include "baseapi.h"
+#include "commontraining.h"
+#include "cubeclassifier.h"
+#include "mastertrainer.h"
+#include "params.h"
+#include "strngs.h"
+#include "tessclassifier.h"
+
+STRING_PARAM_FLAG(classifier, "", "Classifier to test");
+STRING_PARAM_FLAG(lang, "eng", "Language to test");
+STRING_PARAM_FLAG(tessdata_dir, "", "Directory of traineddata files");
+
+enum ClassifierName {
+  CN_PRUNER,  // "pruner": Tesseract class pruner only.
+  CN_FULL,  // "full": Tesseract full classifier.
+  CN_CUBE,  // "cube": Cube classifier.
+  CN_CUBETESS,  // "cubetess": Tesseract class pruner with rescoring by Cube.
+  CN_COUNT  // Number of classifiers; main() also uses it to mean "no match".
+};
+// Values accepted by the classifier flag; names[c] pairs with ClassifierName c.
+const char* names[] = {"pruner", "full", "cube", "cubetess", NULL };
+
+// This program has complex setup requirements, so here is some help:
+// Two different modes, tr files and serialized mastertrainer.
+// From tr files:
+//   classifier_tester -U unicharset -F font_properties -X xheights
+//     -classifier x -lang lang [-output_trainer trainer] *.tr
+// From a serialized trainer:
+//  classifier_tester -input_trainer trainer [-lang lang] -classifier x
+//
+// In the first case, the unicharset must be the unicharset from within
+// the classifier under test, and the font_properties and xheights files must
+// match the files used during training.
+// In the second case, the trainer file must have been prepared from
+// some previous run of shapeclustering, mftraining, or classifier_tester
+// using the same conditions as above, ie matching unicharset/font_properties.
+//
+// Available values of classifier (x above) are:
+// pruner   : Tesseract class pruner only.
+// full     : Tesseract full classifier.
+// cube     : Cube classifier. (Not possible with an input trainer.)
+// cubetess : Tesseract class pruner with rescoring by Cube.  (Not possible
+//            with an input trainer.)
+// Returns 0 once the tests have run, or 1 if the classifier name is invalid,
+// the training data cannot be loaded, or Tesseract fails to initialize.
+int main(int argc, char **argv) {
+  ParseArguments(&argc, &argv);
+  // Decode the classifier string. CN_COUNT doubles as the "no match" value.
+  ClassifierName classifier = CN_COUNT;
+  for (int c = 0; c < CN_COUNT; ++c) {
+    if (strcmp(FLAGS_classifier.c_str(), names[c]) == 0) {
+      classifier = static_cast<ClassifierName>(c);
+      break;
+    }
+  }
+  if (classifier == CN_COUNT) {
+    fprintf(stderr, "Invalid classifier name:%s\n", FLAGS_classifier.c_str());
+    return 1;
+  }
+
+  STRING file_prefix;
+  tesseract::MasterTrainer* trainer = tesseract::LoadTrainingData(
+      argc, argv, true, NULL, &file_prefix);
+  // Guard against a failed load before dereferencing the trainer.
+  if (trainer == NULL) {
+    fprintf(stderr, "Failed to load training data!\n");
+    return 1;
+  }
+  // We want to test junk as well if it is available.
+  trainer->IncludeJunk();
+  // We want to test with replicated samples too.
+  trainer->ReplicateAndRandomizeSamplesIfRequired();
+
+  // We need to initialize tesseract to test.
+  tesseract::TessBaseAPI api;
+  tesseract::OcrEngineMode engine_mode = tesseract::OEM_TESSERACT_ONLY;
+  if (classifier == CN_CUBE || classifier == CN_CUBETESS)
+    engine_mode = tesseract::OEM_TESSERACT_CUBE_COMBINED;
+  if (api.Init(FLAGS_tessdata_dir.c_str(), FLAGS_lang.c_str(),
+               engine_mode) < 0) {
+    fprintf(stderr, "Tesseract initialization failed!\n");
+    delete trainer;  // Do not leak the trainer on the error path.
+    return 1;
+  }
+  tesseract::ShapeClassifier* shape_classifier = NULL;
+  tesseract::Tesseract* tesseract =
+      const_cast<tesseract::Tesseract*>(api.tesseract());
+  // static_cast lets the compiler verify the Tesseract -> Classify conversion.
+  tesseract::Classify* classify =
+      static_cast<tesseract::Classify*>(tesseract);
+  if (classifier == CN_PRUNER) {
+    shape_classifier = new tesseract::TessClassifier(true, classify);
+  } else if (classifier == CN_FULL) {
+    shape_classifier = new tesseract::TessClassifier(false, classify);
+  } else if (classifier == CN_CUBE) {
+    shape_classifier = new tesseract::CubeClassifier(tesseract);
+  } else if (classifier == CN_CUBETESS) {
+    shape_classifier = new tesseract::CubeTessClassifier(tesseract);
+  } else {
+    // Defensive: only reachable if ClassifierName gains a value that is not
+    // handled above.
+    fprintf(stderr, "%s tester not yet implemented\n",
+            FLAGS_classifier.c_str());
+    delete trainer;
+    return 1;
+  }
+  tprintf("Testing classifier %s:\n", FLAGS_classifier.c_str());
+  trainer->TestClassifierOnSamples(3, false, shape_classifier, NULL);
+  if (classifier != CN_CUBE && classifier != CN_CUBETESS) {
+    // Test with replicated samples as well.
+    trainer->TestClassifierOnSamples(3, true, shape_classifier, NULL);
+  }
+  delete shape_classifier;
+  delete trainer;
+
+  return 0;
+} /* main */
+
+
+
+
+
+