Skip to content

Commit

Permalink
Fixed training leaks and randomness
Browse files Browse the repository at this point in the history
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@653 d0cd1f9f-072b-0410-8dd7-cf729c803f20
  • Loading branch information
theraysmith@gmail.com committed Feb 2, 2012
1 parent 01026af commit e33ae59
Show file tree
Hide file tree
Showing 11 changed files with 1,090 additions and 715 deletions.
97 changes: 96 additions & 1 deletion training/Makefile.am
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
SUBDIRS =
AM_CPPFLAGS = \
-DUSE_STD_NAMESPACE \
-I$(top_srcdir)/ccmain -I$(top_srcdir)/api \
-I$(top_srcdir)/ccutil -I$(top_srcdir)/ccstruct \
-I$(top_srcdir)/image -I$(top_srcdir)/viewer \
-I$(top_srcdir)/textord -I$(top_srcdir)/dict \
Expand All @@ -18,7 +20,55 @@ libtesseract_tessopt_la_SOURCES = \
tessopt.cpp
libtesseract_tessopt_la_LDFLAGS = -static

bin_PROGRAMS = cntraining combine_tessdata mftraining unicharset_extractor wordlist2dawg
bin_PROGRAMS = ambiguous_words classifier_tester cntraining combine_tessdata dawg2wordlist mftraining shapeclustering unicharset_extractor wordlist2dawg

# ambiguous_words: built from ambiguous_words.cpp.
# Always links the local training and tessopt convenience libraries. When
# USING_MULTIPLELIBS is set it additionally links the individual component
# libraries listed below; otherwise it links the single libtesseract.la.
ambiguous_words_SOURCES = ambiguous_words.cpp
ambiguous_words_LDADD = \
libtesseract_training.la \
libtesseract_tessopt.la
if USING_MULTIPLELIBS
ambiguous_words_LDADD += \
$(top_srcdir)/api/libtesseract_api.la \
$(top_srcdir)/textord/libtesseract_textord.la \
$(top_srcdir)/classify/libtesseract_classify.la \
$(top_srcdir)/dict/libtesseract_dict.la \
$(top_srcdir)/ccstruct/libtesseract_ccstruct.la \
$(top_srcdir)/image/libtesseract_image.la \
$(top_srcdir)/cutil/libtesseract_cutil.la \
$(top_srcdir)/viewer/libtesseract_viewer.la \
$(top_srcdir)/ccmain/libtesseract_main.la \
$(top_srcdir)/cube/libtesseract_cube.la \
$(top_srcdir)/neural_networks/runtime/libtesseract_neural.la \
$(top_srcdir)/wordrec/libtesseract_wordrec.la \
$(top_srcdir)/ccutil/libtesseract_ccutil.la
else
ambiguous_words_LDADD += \
$(top_srcdir)/api/libtesseract.la
endif

# classifier_tester: built from classifier_tester.cpp.
# Always links the local training and tessopt convenience libraries. When
# USING_MULTIPLELIBS is set it additionally links the individual component
# libraries listed below; otherwise it links the single libtesseract.la.
# classifier_tester.cpp instantiates tesseract::TessBaseAPI, so the api
# library must be on the multiple-libs link line (as it is for
# ambiguous_words).
classifier_tester_SOURCES = classifier_tester.cpp
classifier_tester_LDADD = \
libtesseract_training.la \
libtesseract_tessopt.la
if USING_MULTIPLELIBS
classifier_tester_LDADD += \
$(top_srcdir)/api/libtesseract_api.la \
$(top_srcdir)/textord/libtesseract_textord.la \
$(top_srcdir)/classify/libtesseract_classify.la \
$(top_srcdir)/dict/libtesseract_dict.la \
$(top_srcdir)/ccstruct/libtesseract_ccstruct.la \
$(top_srcdir)/image/libtesseract_image.la \
$(top_srcdir)/cutil/libtesseract_cutil.la \
$(top_srcdir)/viewer/libtesseract_viewer.la \
$(top_srcdir)/ccmain/libtesseract_main.la \
$(top_srcdir)/cube/libtesseract_cube.la \
$(top_srcdir)/neural_networks/runtime/libtesseract_neural.la \
$(top_srcdir)/wordrec/libtesseract_wordrec.la \
$(top_srcdir)/ccutil/libtesseract_ccutil.la
else
classifier_tester_LDADD += \
$(top_srcdir)/api/libtesseract.la
endif

combine_tessdata_SOURCES = combine_tessdata.cpp
if USING_MULTIPLELIBS
combine_tessdata_LDADD = \
Expand Down Expand Up @@ -51,6 +101,28 @@ cntraining_LDADD += \
$(top_srcdir)/api/libtesseract.la
endif

# dawg2wordlist: built from dawg2wordlist.cpp.
# Links only the local tessopt convenience library (not the training
# library). When USING_MULTIPLELIBS is set it additionally links the
# individual component libraries listed below; otherwise it links the single
# libtesseract.la.
dawg2wordlist_SOURCES = dawg2wordlist.cpp
dawg2wordlist_LDADD = \
libtesseract_tessopt.la
if USING_MULTIPLELIBS
dawg2wordlist_LDADD += \
$(top_srcdir)/classify/libtesseract_classify.la \
$(top_srcdir)/dict/libtesseract_dict.la \
$(top_srcdir)/ccstruct/libtesseract_ccstruct.la \
$(top_srcdir)/image/libtesseract_image.la \
$(top_srcdir)/cutil/libtesseract_cutil.la \
$(top_srcdir)/viewer/libtesseract_viewer.la \
$(top_srcdir)/ccmain/libtesseract_main.la \
$(top_srcdir)/cube/libtesseract_cube.la \
$(top_srcdir)/neural_networks/runtime/libtesseract_neural.la \
$(top_srcdir)/wordrec/libtesseract_wordrec.la \
$(top_srcdir)/textord/libtesseract_textord.la \
$(top_srcdir)/ccutil/libtesseract_ccutil.la
else
dawg2wordlist_LDADD += \
$(top_srcdir)/api/libtesseract.la
endif

mftraining_SOURCES = mftraining.cpp mergenf.cpp
mftraining_LDADD = \
libtesseract_training.la \
Expand All @@ -74,6 +146,29 @@ mftraining_LDADD += \
$(top_srcdir)/api/libtesseract.la
endif

# shapeclustering: built from shapeclustering.cpp.
# Always links the local training and tessopt convenience libraries. When
# USING_MULTIPLELIBS is set it additionally links the individual component
# libraries listed below; otherwise it links the single libtesseract.la.
# NOTE(review): unlike ambiguous_words, the multiple-libs list has no
# libtesseract_api.la -- confirm shapeclustering.cpp needs nothing from api/.
shapeclustering_SOURCES = shapeclustering.cpp
shapeclustering_LDADD = \
libtesseract_training.la \
libtesseract_tessopt.la
if USING_MULTIPLELIBS
shapeclustering_LDADD += \
$(top_srcdir)/textord/libtesseract_textord.la \
$(top_srcdir)/classify/libtesseract_classify.la \
$(top_srcdir)/dict/libtesseract_dict.la \
$(top_srcdir)/ccstruct/libtesseract_ccstruct.la \
$(top_srcdir)/image/libtesseract_image.la \
$(top_srcdir)/cutil/libtesseract_cutil.la \
$(top_srcdir)/viewer/libtesseract_viewer.la \
$(top_srcdir)/ccmain/libtesseract_main.la \
$(top_srcdir)/cube/libtesseract_cube.la \
$(top_srcdir)/neural_networks/runtime/libtesseract_neural.la \
$(top_srcdir)/wordrec/libtesseract_wordrec.la \
$(top_srcdir)/ccutil/libtesseract_ccutil.la
else
shapeclustering_LDADD += \
$(top_srcdir)/api/libtesseract.la
endif

unicharset_extractor_SOURCES = unicharset_extractor.cpp
unicharset_extractor_LDADD = \
libtesseract_tessopt.la
Expand Down
77 changes: 77 additions & 0 deletions training/ambiguous_words.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
///////////////////////////////////////////////////////////////////////
// File: ambiguous_words.cpp
// Description: A program that takes a text file with a list of words as
// input (one per line) and outputs a file with the words
// that were found in the dictionary followed by the words
// that are ambiguous to them.
// Author: Rika Antonova
// Created: Fri Oct 21 11:26:43 PDT 2011
//
// (C) Copyright 2011, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
//

#include <stdio.h>

#include "baseapi.h"
#include "helpers.h"
#include "strngs.h"
#include "dict.h"
#include "tesseractclass.h"

// Entry point of ambiguous_words.
//
// Usage: ambiguous_words [-l lang] tessdata_dir wordlist_file output_file
//
// Initializes Tesseract with the output_ambig_words_file variable set to
// output_file, then reads wordlist_file one word per line and calls
// Dict::NoDangerousAmbig() on each word so that the ambiguities are recorded
// in the output file.
//
// Returns 0 on success, and 1 if the arguments are malformed, Tesseract
// cannot be initialized, or the word list cannot be opened.
int main(int argc, char** argv) {
  // Parse input arguments: either exactly three positional arguments, or
  // "-l lang" followed by the three positional arguments.
  if (argc != 4 && (argc != 6 || strcmp(argv[1], "-l") != 0)) {
    printf("Usage: %s [-l lang] tessdata_dir wordlist_file"
           " output_ambiguous_wordlist_file\n", argv[0]);
    return 1;
  }
  int argv_offset = 0;
  STRING lang;
  if (argc == 6) {
    lang = argv[2];
    argv_offset = 2;  // Skip over "-l lang".
  } else {
    lang = "eng";
  }
  const char *tessdata_dir = argv[++argv_offset];
  const char *input_file_str = argv[++argv_offset];
  const char *output_file_str = argv[++argv_offset];

  // Initialize Tesseract, passing the output file name in as a variable.
  tesseract::TessBaseAPI api;
  GenericVector<STRING> vars_vec;
  GenericVector<STRING> vars_values;
  vars_vec.push_back("output_ambig_words_file");
  vars_values.push_back(output_file_str);
  // Without a successfully initialized engine the dictionary below is not
  // usable, so bail out instead of carrying on.
  if (api.Init(tessdata_dir, lang.string(), tesseract::OEM_TESSERACT_ONLY,
               NULL, NULL, &vars_vec, &vars_values, false) < 0) {
    tprintf("Tesseract initialization failed!\n");
    return 1;
  }
  tesseract::Dict &dict = api.tesseract()->getDict();

  FILE *input_file = fopen(input_file_str, "r");
  if (input_file == NULL) {
    tprintf("Failed to open input wordlist file %s\n", input_file_str);
    // Return rather than exit() so that api is destroyed on the way out.
    return 1;
  }
  char str[CHARS_PER_LINE];

  // Read word list and call Dict::NoDangerousAmbig() for each word
  // to record ambiguities in the output file.
  while (fgets(str, CHARS_PER_LINE, input_file) != NULL) {
    chomp_string(str);  // remove newline
    WERD_CHOICE word(str, dict.getUnicharset());
    dict.NoDangerousAmbig(&word, NULL, false, NULL, NULL);
  }

  // Clean up.
  fclose(input_file);
  return 0;
}
139 changes: 139 additions & 0 deletions training/classifier_tester.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
// Copyright 2011 Google Inc. All Rights Reserved.
// Author: rays@google.com (Ray Smith)

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Filename: classifier_tester.cpp
// Purpose: Tests a character classifier on data as formatted for training,
// but doesn't have to be the same as the training data.
// Author: Ray Smith

#ifndef USE_STD_NAMESPACE
#include "base/commandlineflags.h"
#endif
#include "baseapi.h"
#include "commontraining.h"
#include "cubeclassifier.h"
#include "mastertrainer.h"
#include "params.h"
#include "strngs.h"
#include "tessclassifier.h"

// Command-line flags for this tool. Each one is read in main() through the
// corresponding FLAGS_<name> object (FLAGS_classifier, FLAGS_lang,
// FLAGS_tessdata_dir).
// NOTE(review): the macro arguments look like (flag name, default value,
// help text) -- confirm against the STRING_PARAM_FLAG definition.
STRING_PARAM_FLAG(classifier, "", "Classifier to test");
STRING_PARAM_FLAG(lang, "eng", "Language to test");
STRING_PARAM_FLAG(tessdata_dir, "", "Directory of traineddata files");

// Identifies the classifier under test. Each enumerator's value is also its
// index into names[], so the enum and the table must list the classifiers in
// the same order.
enum ClassifierName {
  CN_PRUNER = 0,
  CN_FULL = 1,
  CN_CUBE = 2,
  CN_CUBETESS = 3,
  CN_COUNT = 4  // Number of real classifiers; not a classifier itself.
};

// Command-line spelling of each ClassifierName, indexed by enumerator value.
// The final NULL sits in the CN_COUNT slot.
const char* names[] = {
  "pruner",
  "full",
  "cube",
  "cubetess",
  NULL
};

// This program has complex setup requirements, so here is some help:
// Two different modes, tr files and serialized mastertrainer.
// From tr files:
// classifier_tester -U unicharset -F font_properties -X xheights
// -classifier x -lang lang [-output_trainer trainer] *.tr
// From a serialized trainer:
// classifier_tester -input_trainer trainer [-lang lang] -classifier x
//
// In the first case, the unicharset must be the unicharset from within
// the classifier under test, and the font_properties and xheights files must
// match the files used during training.
// In the second case, the trainer file must have been prepared from
// some previous run of shapeclustering, mftraining, or classifier_tester
// using the same conditions as above, ie matching unicharset/font_properties.
//
// Available values of classifier (x above) are:
// pruner : Tesseract class pruner only.
// full : Tesseract full classifier.
// cube : Cube classifier. (Not possible with an input trainer.)
// cubetess : Tesseract class pruner with rescoring by Cube. (Not possible
// with an input trainer.)
int main(int argc, char **argv) {
ParseArguments(&argc, &argv);
// Decode the classifier string.
ClassifierName classifier = CN_COUNT;
for (int c = 0; c < CN_COUNT; ++c) {
if (strcmp(FLAGS_classifier.c_str(), names[c]) == 0) {
classifier = static_cast<ClassifierName>(c);
break;
}
}
if (classifier == CN_COUNT) {
fprintf(stderr, "Invalid classifier name:%s\n", FLAGS_classifier.c_str());
return 1;
}

STRING file_prefix;
tesseract::MasterTrainer* trainer = tesseract::LoadTrainingData(
argc, argv, true, NULL, &file_prefix);
// We want to test junk as well if it is available.
trainer->IncludeJunk();
// We want to test with replicated samples too.
trainer->ReplicateAndRandomizeSamplesIfRequired();

// We need to initialize tesseract to test.
tesseract::TessBaseAPI api;
tesseract::OcrEngineMode engine_mode = tesseract::OEM_TESSERACT_ONLY;
if (classifier == CN_CUBE || classifier == CN_CUBETESS)
engine_mode = tesseract::OEM_TESSERACT_CUBE_COMBINED;
if (api.Init(FLAGS_tessdata_dir.c_str(), FLAGS_lang.c_str(),
engine_mode) < 0) {
fprintf(stderr, "Tesseract initialization failed!\n");
return 1;
}
tesseract::ShapeClassifier* shape_classifier = NULL;
tesseract::Tesseract* tesseract =
const_cast<tesseract::Tesseract*>(api.tesseract());
tesseract::Classify* classify =
reinterpret_cast<tesseract::Classify*>(tesseract);
// Copy the shape_table from the classifier and add the space character if
// not already present to count junk.
tesseract::ShapeTable shape_table;
shape_table.set_unicharset(classify->shape_table()->unicharset());
shape_table.AppendMasterShapes(*classify->shape_table());
if (shape_table.FindShape(0, -1) < 0)
shape_table.AddShape(0, 0);
if (classifier == CN_PRUNER) {
shape_classifier = new tesseract::TessClassifier(true, classify);
} else if (classifier == CN_FULL) {
shape_classifier = new tesseract::TessClassifier(false, classify);
} else if (classifier == CN_CUBE) {
shape_classifier = new tesseract::CubeClassifier(tesseract);
} else if (classifier == CN_CUBETESS) {
shape_classifier = new tesseract::CubeTessClassifier(tesseract);
} else {
fprintf(stderr, "%s tester not yet implemented\n",
FLAGS_classifier.c_str());
return 1;
}
tprintf("Testing classifier %s:\n", FLAGS_classifier.c_str());
trainer->TestClassifierOnSamples(3, false, shape_classifier, NULL);
if (classifier != CN_CUBE && classifier != CN_CUBETESS) {
// Test with replicated samples as well.
trainer->TestClassifierOnSamples(3, true, shape_classifier, NULL);
}
delete shape_classifier;
delete trainer;

return 0;
} /* main */






Loading

0 comments on commit e33ae59

Please sign in to comment.