Skip to content

Commit

Permalink
Refactored classifier to make it easier to add new ones and generaliz…
Browse files Browse the repository at this point in the history
…ed feature extractor to allow fx from grey

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@873 d0cd1f9f-072b-0410-8dd7-cf729c803f20
  • Loading branch information
theraysmith@gmail.com committed Sep 23, 2013
1 parent 2aafc9d commit 99edf4c
Show file tree
Hide file tree
Showing 48 changed files with 2,187 additions and 1,792 deletions.
6 changes: 3 additions & 3 deletions classify/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ AM_CPPFLAGS += -DTESS_EXPORTS \
endif

noinst_HEADERS = \
adaptive.h baseline.h blobclass.h chartoname.h \
adaptive.h blobclass.h chartoname.h \
classify.h cluster.h clusttool.h cutoffs.h \
errorcounter.h extern.h extract.h \
featdefs.h flexfx.h float2int.h fpoint.h fxdefs.h \
Expand All @@ -19,7 +19,7 @@ noinst_HEADERS = \
normfeat.h normmatch.h \
ocrfeatures.h outfeat.h picofeat.h protos.h \
sampleiterator.h shapeclassifier.h shapetable.h \
speckle.h tessclassifier.h trainingsample.h trainingsampleset.h xform2d.h
tessclassifier.h trainingsample.h trainingsampleset.h xform2d.h

if !USING_MULTIPLELIBS
noinst_LTLIBRARIES = libtesseract_classify.la
Expand All @@ -45,7 +45,7 @@ libtesseract_classify_la_SOURCES = \
mastertrainer.cpp mf.cpp mfdefs.cpp mfoutline.cpp mfx.cpp \
normfeat.cpp normmatch.cpp \
ocrfeatures.cpp outfeat.cpp picofeat.cpp protos.cpp \
sampleiterator.cpp shapetable.cpp speckle.cpp \
sampleiterator.cpp shapeclassifier.cpp shapetable.cpp \
tessclassifier.cpp trainingsample.cpp trainingsampleset.cpp xform2d.cpp


620 changes: 189 additions & 431 deletions classify/adaptmatch.cpp

Large diffs are not rendered by default.

41 changes: 0 additions & 41 deletions classify/baseline.h

This file was deleted.

15 changes: 10 additions & 5 deletions classify/blobclass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,11 @@ extern char imagefile[];
----------------------------------------------------------------------------**/

/*---------------------------------------------------------------------------*/
// Like all TBLOBs, Blob is in baseline-normalized coordinates.
// See SetupBLCNDenorms in intfx.cpp for other args.
void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, const STRING& filename,
TBLOB * Blob, const DENORM& denorm, const char* BlobText) {
TBLOB * Blob, const DENORM& bl_denorm, const DENORM& cn_denorm,
const INT_FX_RESULT_STRUCT& fx_info, const char* BlobText) {
/*
** Parameters:
** Blob blob whose micro-features are to be learned
Expand Down Expand Up @@ -95,18 +98,20 @@ void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, const STRING& filename,
cprintf("TRAINING ... Font name = %s\n", CurrFontName.string());
}

LearnBlob(FeatureDefs, FeatureFile, Blob, denorm, BlobText,
CurrFontName.string());
LearnBlob(FeatureDefs, FeatureFile, Blob, bl_denorm, cn_denorm, fx_info,
BlobText, CurrFontName.string());
} // LearnBlob

void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE* FeatureFile,
TBLOB* Blob, const DENORM& denorm,
TBLOB* Blob, const DENORM& bl_denorm, const DENORM& cn_denorm,
const INT_FX_RESULT_STRUCT& fx_info,
const char* BlobText, const char* FontName) {
CHAR_DESC CharDesc;

ASSERT_HOST(FeatureFile != NULL);

CharDesc = ExtractBlobFeatures(FeatureDefs, denorm, Blob);
CharDesc = ExtractBlobFeatures(FeatureDefs, bl_denorm, cn_denorm, fx_info,
Blob);
if (CharDesc == NULL) {
cprintf("LearnBLob: CharDesc was NULL. Aborting.\n");
return;
Expand Down
9 changes: 6 additions & 3 deletions classify/blobclass.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,11 +40,14 @@
Public Function Prototypes
----------------------------------------------------------------------------**/
void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, const STRING& filename,
TBLOB * Blob, const DENORM& denorm, const char* BlobText);
TBLOB * Blob, const DENORM& bl_denorm, const DENORM& cn_denorm,
const INT_FX_RESULT_STRUCT& fx_info,
const char* BlobText);

void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE* File, TBLOB* Blob,
const DENORM& denorm, const char* BlobText,
const char* FontName);
const DENORM& bl_denorm, const DENORM& cn_denorm,
const INT_FX_RESULT_STRUCT& fx_info,
const char* BlobText, const char* FontName);

/**----------------------------------------------------------------------------
Global Data Definitions and Declarations
Expand Down
66 changes: 63 additions & 3 deletions classify/classify.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include "intproto.h"
#include "mfoutline.h"
#include "scrollview.h"
#include "shapeclassifier.h"
#include "shapetable.h"
#include "unicity_table.h"
#include <string.h>
Expand All @@ -52,6 +53,11 @@ Classify::Classify()
this->params()), /* PREV DEFAULT 0.1 */
double_MEMBER(classify_max_norm_scale_y, 0.325, "Max char y-norm scale ...",
this->params()), /* PREV DEFAULT 0.3 */
double_MEMBER(classify_max_rating_ratio, 1.5,
"Veto ratio between classifier ratings", this->params()),
double_MEMBER(classify_max_certainty_margin, 5.5,
"Veto difference between classifier certainties",
this->params()),
BOOL_MEMBER(tess_cn_matching, 0, "Character Normalized Matching",
this->params()),
BOOL_MEMBER(tess_bn_matching, 0, "Baseline Normalized Matching",
Expand All @@ -65,6 +71,8 @@ Classify::Classify()
"Save adapted templates to a file", this->params()),
BOOL_MEMBER(classify_enable_adaptive_debugger, 0, "Enable match debugger",
this->params()),
BOOL_MEMBER(classify_nonlinear_norm, 0,
"Non-linear stroke-density normalization", this->params()),
INT_MEMBER(matcher_debug_level, 0, "Matcher Debug Level", this->params()),
INT_MEMBER(matcher_debug_flags, 0, "Matcher Debug Flags", this->params()),
INT_MEMBER(classify_learning_debug_level, 0, "Learning Debug Level: ",
Expand Down Expand Up @@ -100,6 +108,12 @@ Classify::Classify()
this->params()),
double_MEMBER(tessedit_class_miss_scale, 0.00390625,
"Scale factor for features not used", this->params()),
double_MEMBER(classify_adapted_pruning_factor, 2.5,
"Prune poor adapted results this much worse than best result",
this->params()),
double_MEMBER(classify_adapted_pruning_threshold, -1.0,
"Threshold at which classify_adapted_pruning_factor starts",
this->params()),
INT_MEMBER(classify_adapt_proto_threshold, 230,
"Threshold for good protos during adaptive 0-255",
this->params()),
Expand All @@ -122,19 +136,24 @@ Classify::Classify()
this->params()),
INT_MEMBER(classify_class_pruner_threshold, 229,
"Class Pruner Threshold 0-255", this->params()),
INT_MEMBER(classify_class_pruner_multiplier, 30,
INT_MEMBER(classify_class_pruner_multiplier, 15,
"Class Pruner Multiplier 0-255: ", this->params()),
INT_MEMBER(classify_cp_cutoff_strength, 7,
"Class Pruner CutoffStrength: ", this->params()),
INT_MEMBER(classify_integer_matcher_multiplier, 14,
INT_MEMBER(classify_integer_matcher_multiplier, 10,
"Integer Matcher Multiplier 0-255: ", this->params()),
EnableLearning(true),
INT_MEMBER(il1_adaption_test, 0, "Dont adapt to i/I at beginning of word",
this->params()),
BOOL_MEMBER(classify_bln_numeric_mode, 0,
"Assume the input is numbers [0-9].", this->params()),
double_MEMBER(speckle_large_max_size, 0.30, "Max large speckle size",
this->params()),
double_MEMBER(speckle_rating_penalty, 10.0,
"Penalty to add to worst rating for noise", this->params()),
shape_table_(NULL),
dict_(&image_) {
dict_(&image_),
static_classifier_(NULL) {
fontinfo_table_.set_compare_callback(
NewPermanentTessCallback(CompareFontInfo));
fontinfo_table_.set_clear_callback(
Expand Down Expand Up @@ -184,4 +203,45 @@ Classify::~Classify() {
delete[] BaselineCutoffs;
}


// Takes ownership of the given classifier, and uses it for future calls
// to CharNormClassifier. Any previously installed classifier is deleted.
// Passing NULL simply releases the current classifier.
// Passing the classifier that is already installed is a no-op: deleting it
// first would leave static_classifier_ pointing at freed memory.
void Classify::SetStaticClassifier(ShapeClassifier* static_classifier) {
  if (static_classifier == static_classifier_) return;
  delete static_classifier_;
  static_classifier_ = static_classifier;
}

// Moved from speckle.cpp
// Adds a noise classification result that is a bit worse than the worst
// current result, or the worst possible result if no current results.
void Classify::AddLargeSpeckleTo(int blob_length, BLOB_CHOICE_LIST *choices) {
BLOB_CHOICE_IT bc_it(choices);
// If there is no classifier result, we will use the worst possible certainty
// and corresponding rating.
float certainty = -getDict().certainty_scale;
float rating = rating_scale * blob_length;
if (!choices->empty() && blob_length > 0) {
bc_it.move_to_last();
BLOB_CHOICE* worst_choice = bc_it.data();
// Add speckle_rating_penalty to worst rating, matching old value.
rating = worst_choice->rating() + speckle_rating_penalty;
// Compute the rating to correspond to the certainty. (Used to be kept
// the same, but that messes up the language model search.)
certainty = -rating * getDict().certainty_scale /
(rating_scale * blob_length);
}
BLOB_CHOICE* blob_choice = new BLOB_CHOICE(UNICHAR_SPACE, rating, certainty,
-1, -1, 0, 0, MAX_FLOAT32, 0,
BCC_SPECKLE_CLASSIFIER);
bc_it.add_to_end(blob_choice);
}

// Returns true if the blob is small enough to be a large speckle.
bool Classify::LargeSpeckle(const TBLOB &blob) {
double speckle_size = kBlnXHeight * speckle_large_max_size;
TBOX bbox = blob.bounding_box();
return bbox.width() < speckle_size && bbox.height() < speckle_size;
}


} // namespace tesseract
Loading

0 comments on commit 99edf4c

Please sign in to comment.