Skip to content

Commit

Permalink
Made some major classifier and clustering improvements
Browse files Browse the repository at this point in the history
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@130 d0cd1f9f-072b-0410-8dd7-cf729c803f20
  • Loading branch information
theraysmith committed Feb 1, 2008
1 parent 166c867 commit 6b5e0c4
Show file tree
Hide file tree
Showing 10 changed files with 1,139 additions and 1,977 deletions.
2 changes: 1 addition & 1 deletion ccmain/blobcmp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ float compare_tess_blobs(TBLOB *blob1,
SetBaseLineMatch();
IntegerMatcher (ClassForClassId (ad_templates->Templates, CMP_CLASS),
AllProtosOn, AllConfigsOn, fcount, fcount,
int_features, 0, 0, &int_result, testedit_match_debug);
int_features, 0, &int_result, testedit_match_debug);
FreeFeatureSet(float_features);
if (int_result.Rating < 0)
int_result.Rating = MAX_FLOAT32;
Expand Down
1,624 changes: 718 additions & 906 deletions classify/adaptmatch.cpp

Large diffs are not rendered by default.

74 changes: 62 additions & 12 deletions classify/cluster.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "const.h"
#include "cluster.h"
#include "emalloc.h"
#include "tprintf.h"
#include "danerror.h"
#include "freelist.h"
#include <math.h>
Expand Down Expand Up @@ -281,6 +282,7 @@ PROTOTYPE *MakeDegenerateProto(UINT16 N,
INT32 MinSamples);

PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer,
CLUSTERCONFIG *Config,
CLUSTER *Cluster,
STATISTICS *Statistics);

Expand Down Expand Up @@ -1037,7 +1039,7 @@ PROTOTYPE *MakePrototype(CLUSTERER *Clusterer,
}

if (HOTELLING && Config->ProtoStyle == elliptical) {
Proto = TestEllipticalProto(Clusterer, Cluster, Statistics);
Proto = TestEllipticalProto(Clusterer, Config, Cluster, Statistics);
if (Proto != NULL) {
FreeStatistics(Statistics);
return Proto;
Expand Down Expand Up @@ -1129,6 +1131,7 @@ PROTOTYPE *MakeDegenerateProto( //this was MinSample

/** TestEllipticalProto ****************************************************
Parameters: Clusterer data struct containing samples being clustered
Config provides the magic number of samples that make a good cluster
Cluster cluster to be made into an elliptical prototype
Statistics statistical info about cluster
Globals: None
Expand All @@ -1141,24 +1144,60 @@ Operation: This routine tests the specified cluster to see if **
Return: Pointer to new elliptical prototype or NULL.
****************************************************************************/
PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer,
CLUSTERCONFIG *Config,
CLUSTER *Cluster,
STATISTICS *Statistics) {
// Fraction of the number of samples used as a range around 1 within
// which a cluster has the magic size that allows a boost to the
// FTable by kFTableBoostMargin, thus allowing clusters near the
// magic size (equal to the number of sample characters) to be more
// likely to stay together.
const double kMagicSampleMargin = 0.0625;
const double kFTableBoostMargin = 2.0;

int N = Clusterer->SampleSize;
CLUSTER* Left = Cluster->Left;
CLUSTER* Right = Cluster->Right;
if (Left == NULL || Right == NULL)
return NULL;
int TotalDims = Left->SampleCount + Right->SampleCount;
if (TotalDims < N + 1)
if (TotalDims < N + 1 || TotalDims < 2)
return NULL;
FLOAT32* Inverse = (FLOAT32 *) Emalloc(N * N * sizeof(FLOAT32));
FLOAT32* Delta = (FLOAT32*) Emalloc(N * sizeof(FLOAT32));
double err = InvertMatrix(Statistics->CoVariance, N, Inverse);
const int kMatrixSize = N * N * sizeof(FLOAT32);
FLOAT32* Covariance = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));
FLOAT32* Inverse = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));
FLOAT32* Delta = reinterpret_cast<FLOAT32*>(Emalloc(N * sizeof(FLOAT32)));
// Compute a new covariance matrix that only uses essential features.
for (int i = 0; i < N; ++i) {
int row_offset = i * N;
if (!Clusterer->ParamDesc[i].NonEssential) {
for (int j = 0; j < N; ++j) {
if (!Clusterer->ParamDesc[j].NonEssential)
Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset];
else
Covariance[j + row_offset] = 0.0f;
}
} else {
for (int j = 0; j < N; ++j) {
if (i == j)
Covariance[j + row_offset] = 1.0f;
else
Covariance[j + row_offset] = 0.0f;
}
}
}
double err = InvertMatrix(Covariance, N, Inverse);
if (err > 1) {
cprintf("Clustering error: Matrix inverse failed with error %g\n", err);
tprintf("Clustering error: Matrix inverse failed with error %g\n", err);
}
int EssentialN = 0;
for (int dim = 0; dim < N; ++dim) {
Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
if (!Clusterer->ParamDesc[dim].NonEssential) {
Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
++EssentialN;
} else {
Delta[dim] = 0.0f;
}
}
// Compute Hotelling's T-squared.
double Tsq = 0.0;
Expand All @@ -1169,19 +1208,30 @@ PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer,
}
Tsq += Delta[x] * temp;
}
memfree(Covariance);
memfree(Inverse);
memfree(Delta);
Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;
double F = Tsq * (TotalDims - N - 1) / ((TotalDims - N) * 2);
int Fx = N;
// Changed this function to match the formula in
// Statistical Methods in Medical Research p 473
// By Peter Armitage, Geoffrey Berry, J. N. S. Matthews.
// Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;
double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2)*EssentialN);
int Fx = EssentialN;
if (Fx > FTABLE_X)
Fx = FTABLE_X;
--Fx;
int Fy = TotalDims - N - 1;
int Fy = TotalDims - EssentialN - 1;
if (Fy > FTABLE_Y)
Fy = FTABLE_Y;
--Fy;
if (F < FTable[Fy][Fx]) {
double FTarget = FTable[Fy][Fx];
if (Config->MagicSamples > 0 &&
TotalDims >= Config->MagicSamples * (1.0 - kMagicSampleMargin) &&
TotalDims <= Config->MagicSamples * (1.0 + kMagicSampleMargin)) {
// Give magic-sized clusters a magic FTable boost.
FTarget += kFTableBoostMargin;
}
if (F < FTarget) {
return NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
}
return NULL;
Expand Down
23 changes: 16 additions & 7 deletions classify/cluster.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ typedef struct // parameters to control clustering
// more than 1 feature in that cluster
FLOAT32 Independence; // desired independence between dimensions
FLOAT64 Confidence; // desired confidence in prototypes created
int MagicSamples; // Ideal number of samples in a cluster.
}


Expand All @@ -80,8 +81,13 @@ FLOATUNION;
typedef struct proto
{
unsigned Significant:1; // TRUE if prototype is significant
unsigned Merged:1; // Merged after clustering so do not output
// but kept for display purposes. If it has no
// samples then it was actually merged.
// Otherwise it matched an already significant
// cluster.
unsigned Style:2; // spherical, elliptical, or mixed
unsigned NumSamples:29; // number of samples in the cluster
unsigned NumSamples:28; // number of samples in the cluster
CLUSTER *Cluster; // ptr to cluster which made prototype
DISTRIBUTION *Distrib; // different distribution for each dimension
FLOAT32 *Mean; // prototype mean
Expand Down Expand Up @@ -129,19 +135,22 @@ CLUSTERER *MakeClusterer (INT16 SampleSize, PARAM_DESC ParamDesc[]);

SAMPLE *MakeSample (CLUSTERER * Clusterer, FLOAT32 Feature[], INT32 CharID);

LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config);
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config);

void FreeClusterer(CLUSTERER *Clusterer);
void FreeClusterer(CLUSTERER *Clusterer);

void FreeProtoList(LIST *ProtoList);
void FreeProtoList(LIST *ProtoList);

void FreePrototype(void *arg); //PROTOTYPE *Prototype);

CLUSTER *NextSample(LIST *SearchState);
CLUSTER *NextSample(LIST *SearchState);

FLOAT32 Mean(PROTOTYPE *Proto, UINT16 Dimension);
FLOAT32 Mean(PROTOTYPE *Proto, UINT16 Dimension);

FLOAT32 StandardDeviation(PROTOTYPE *Proto, UINT16 Dimension);
FLOAT32 StandardDeviation(PROTOTYPE *Proto, UINT16 Dimension);

INT32 MergeClusters(INT16 N, PARAM_DESC ParamDesc[], INT32 n1, INT32 n2,
FLOAT32 m[], FLOAT32 m1[], FLOAT32 m2[]);

//--------------Global Data Definitions and Declarations---------------------------
// define errors that can be trapped
Expand Down
8 changes: 4 additions & 4 deletions classify/featdefs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
StartParamDesc (MicroFeatureParams)
DefineParam (0, 0, -0.5, 0.5)
DefineParam (0, 0, -0.25, 0.75)
DefineParam (0, 0, 0.0, 1.0)
DefineParam (0, 1, 0.0, 1.0)
DefineParam (1, 0, 0.0, 1.0)
DefineParam (0, 1, -0.5, 0.5)
DefineParam (0, 1, -0.5, 0.5)
Expand All @@ -65,9 +65,9 @@ DefineFeature (PicoFeatDesc, 2, 1, 1, MAX_UINT8, "Pico", "pf", PicoFeatParams)
/* define all of the parameters for the NormFeat type*/
StartParamDesc (CharNormParams)
DefineParam (0, 0, -0.25, 0.75)
DefineParam (0, 0, 0.0, 1.0)
DefineParam (0, 0, 0.0, 1.0)
DefineParam (0, 0, 0.0, 1.0)
DefineParam (0, 1, 0.0, 1.0)
DefineParam (0, 1, 0.0, 1.0)
DefineParam (0, 1, 0.0, 1.0)
EndParamDesc
/* now define the feature type itself (see features.h for info about each
parameter).*/
Expand Down
Loading

0 comments on commit 6b5e0c4

Please sign in to comment.