Made some major classifier and clustering improvements

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@130 d0cd1f9f-072b-0410-8dd7-cf729c803f20
allen8807 · Feb 1, 2008 · 6b5e0c4 · 6b5e0c4
1 parent 166c867
commit 6b5e0c4
Show file tree

Hide file tree

Showing 10 changed files with 1,139 additions and 1,977 deletions.
diff --git a/ccmain/blobcmp.cpp b/ccmain/blobcmp.cpp
@@ -62,7 +62,7 @@ float compare_tess_blobs(TBLOB *blob1,
     SetBaseLineMatch();
     IntegerMatcher (ClassForClassId (ad_templates->Templates, CMP_CLASS),
       AllProtosOn, AllConfigsOn, fcount, fcount,
-      int_features, 0, 0, &int_result, testedit_match_debug);
+      int_features, 0, &int_result, testedit_match_debug);
     FreeFeatureSet(float_features);
     if (int_result.Rating < 0)
       int_result.Rating = MAX_FLOAT32;

diff --git a/classify/adaptmatch.cpp b/classify/adaptmatch.cpp
diff --git a/classify/cluster.cpp b/classify/cluster.cpp
@@ -19,6 +19,7 @@
 #include "const.h"
 #include "cluster.h"
 #include "emalloc.h"
+#include "tprintf.h"
 #include "danerror.h"
 #include "freelist.h"
 #include <math.h>
@@ -281,6 +282,7 @@ PROTOTYPE *MakeDegenerateProto(UINT16 N,
                                INT32 MinSamples);
 
 PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer,
+                               CLUSTERCONFIG *Config,
                                CLUSTER *Cluster,
                                STATISTICS *Statistics);
 
@@ -1037,7 +1039,7 @@ PROTOTYPE *MakePrototype(CLUSTERER *Clusterer,
   }
 
   if (HOTELLING && Config->ProtoStyle == elliptical) {
-    Proto = TestEllipticalProto(Clusterer, Cluster, Statistics);
+    Proto = TestEllipticalProto(Clusterer, Config, Cluster, Statistics);
     if (Proto != NULL) {
       FreeStatistics(Statistics);
       return Proto;
@@ -1129,6 +1131,7 @@ PROTOTYPE *MakeDegenerateProto(  //this was MinSample
 
 /** TestEllipticalProto ****************************************************
 Parameters:	Clusterer	data struct containing samples being clustered
+      Config provides the magic number of samples that make a good cluster
       Cluster		cluster to be made into an elliptical prototype
       Statistics	statistical info about cluster
 Globals:	None
@@ -1141,24 +1144,60 @@ Operation:	This routine tests the specified cluster to see if **
 Return:		Pointer to new elliptical prototype or NULL.
 ****************************************************************************/
 PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer,
+                               CLUSTERCONFIG *Config,
                                CLUSTER *Cluster,
                                STATISTICS *Statistics) {
+  // Fraction of the number of samples used as a range around 1 within
+  // which a cluster has the magic size that allows a boost to the
+  // FTable by kFTableBoostMargin, thus allowing clusters near the
+  // magic size (equal to the number of sample characters) to be more
+  // likely to stay together.
+  const double kMagicSampleMargin = 0.0625;
+  const double kFTableBoostMargin = 2.0;
+
   int N = Clusterer->SampleSize;
   CLUSTER* Left = Cluster->Left;
   CLUSTER* Right = Cluster->Right;
   if (Left == NULL || Right == NULL)
     return NULL;
   int TotalDims = Left->SampleCount + Right->SampleCount;
-  if (TotalDims < N + 1)
+  if (TotalDims < N + 1 || TotalDims < 2)
     return NULL;
-  FLOAT32* Inverse = (FLOAT32 *) Emalloc(N * N * sizeof(FLOAT32));
-  FLOAT32* Delta = (FLOAT32*) Emalloc(N * sizeof(FLOAT32));
-  double err = InvertMatrix(Statistics->CoVariance, N, Inverse);
+  const int kMatrixSize = N * N * sizeof(FLOAT32);
+  FLOAT32* Covariance = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));
+  FLOAT32* Inverse = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));
+  FLOAT32* Delta = reinterpret_cast<FLOAT32*>(Emalloc(N * sizeof(FLOAT32)));
+  // Compute a new covariance matrix that only uses essential features.
+  for (int i = 0; i < N; ++i) {
+    int row_offset = i * N;
+    if (!Clusterer->ParamDesc[i].NonEssential) {
+      for (int j = 0; j < N; ++j) {
+        if (!Clusterer->ParamDesc[j].NonEssential)
+          Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset];
+        else
+          Covariance[j + row_offset] = 0.0f;
+      }
+    } else {
+      for (int j = 0; j < N; ++j) {
+        if (i == j)
+          Covariance[j + row_offset] = 1.0f;
+        else
+          Covariance[j + row_offset] = 0.0f;
+      }
+    }
+  }
+  double err = InvertMatrix(Covariance, N, Inverse);
   if (err > 1) {
-    cprintf("Clustering error: Matrix inverse failed with error %g\n", err);
+    tprintf("Clustering error: Matrix inverse failed with error %g\n", err);
   }
+  int EssentialN = 0;
   for (int dim = 0; dim < N; ++dim) {
-    Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
+    if (!Clusterer->ParamDesc[dim].NonEssential) {
+      Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
+      ++EssentialN;
+    } else {
+      Delta[dim] = 0.0f;
+    }
   }
   // Compute Hotelling's T-squared.
   double Tsq = 0.0;
@@ -1169,19 +1208,30 @@ PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer,
     }
     Tsq += Delta[x] * temp;
   }
+  memfree(Covariance);
   memfree(Inverse);
   memfree(Delta);
-  Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;
-  double F = Tsq * (TotalDims - N - 1) / ((TotalDims - N) * 2);
-  int Fx = N;
+  // Changed this function to match the formula in
+  // Statistical Methods in Medical Research p 473
+  // By Peter Armitage, Geoffrey Berry, J. N. S. Matthews.
+  // Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;
+  double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2)*EssentialN);
+  int Fx = EssentialN;
   if (Fx > FTABLE_X)
     Fx = FTABLE_X;
   --Fx;
-  int Fy = TotalDims - N - 1;
+  int Fy = TotalDims - EssentialN - 1;
   if (Fy > FTABLE_Y)
     Fy = FTABLE_Y;
   --Fy;
-  if (F < FTable[Fy][Fx]) {
+  double FTarget = FTable[Fy][Fx];
+  if (Config->MagicSamples > 0 &&
+      TotalDims >= Config->MagicSamples * (1.0 - kMagicSampleMargin) &&
+      TotalDims <= Config->MagicSamples * (1.0 + kMagicSampleMargin)) {
+    // Give magic-sized clusters a magic FTable boost.
+    FTarget += kFTableBoostMargin;
+  }
+  if (F < FTarget) {
     return NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
   }
   return NULL;

diff --git a/classify/cluster.h b/classify/cluster.h
@@ -55,6 +55,7 @@ typedef struct                   // parameters to control clustering
   // more than 1 feature in that cluster
   FLOAT32 Independence;          // desired independence between dimensions
   FLOAT64 Confidence;            // desired confidence in prototypes created
+  int MagicSamples;              // Ideal number of samples in a cluster.
 }
 
 
@@ -80,8 +81,13 @@ FLOATUNION;
 typedef struct proto
 {
   unsigned Significant:1;        // TRUE if prototype is significant
+  unsigned Merged:1;             // Merged after clustering so do not output
+                                 // but kept for display purposes. If it has no
+                                 // samples then it was actually merged.
+                                 // Otherwise it matched an already significant
+                                 // cluster.
   unsigned Style:2;              // spherical, elliptical, or mixed
-  unsigned NumSamples:29;        // number of samples in the cluster
+  unsigned NumSamples:28;        // number of samples in the cluster
   CLUSTER *Cluster;              // ptr to cluster which made prototype
   DISTRIBUTION *Distrib;         // different distribution for each dimension
   FLOAT32 *Mean;                 // prototype mean
@@ -129,19 +135,22 @@ CLUSTERER *MakeClusterer (INT16 SampleSize, PARAM_DESC ParamDesc[]);
 
 SAMPLE *MakeSample (CLUSTERER * Clusterer, FLOAT32 Feature[], INT32 CharID);
 
-LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config); 
+LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config);
 
-void FreeClusterer(CLUSTERER *Clusterer); 
+void FreeClusterer(CLUSTERER *Clusterer);
 
-void FreeProtoList(LIST *ProtoList); 
+void FreeProtoList(LIST *ProtoList);
 
 void FreePrototype(void *arg);  //PROTOTYPE     *Prototype);
 
-CLUSTER *NextSample(LIST *SearchState); 
+CLUSTER *NextSample(LIST *SearchState);
 
-FLOAT32 Mean(PROTOTYPE *Proto, UINT16 Dimension); 
+FLOAT32 Mean(PROTOTYPE *Proto, UINT16 Dimension);
 
-FLOAT32 StandardDeviation(PROTOTYPE *Proto, UINT16 Dimension); 
+FLOAT32 StandardDeviation(PROTOTYPE *Proto, UINT16 Dimension);
+
+INT32 MergeClusters(INT16 N, PARAM_DESC ParamDesc[], INT32 n1, INT32 n2,
+                    FLOAT32 m[], FLOAT32 m1[], FLOAT32 m2[]);
 
 //--------------Global Data Definitions and Declarations---------------------------
 // define errors that can be trapped

diff --git a/classify/featdefs.cpp b/classify/featdefs.cpp
@@ -41,7 +41,7 @@
 StartParamDesc (MicroFeatureParams)
 DefineParam (0, 0, -0.5, 0.5)
 DefineParam (0, 0, -0.25, 0.75)
-DefineParam (0, 0, 0.0, 1.0)
+DefineParam (0, 1, 0.0, 1.0)
 DefineParam (1, 0, 0.0, 1.0)
 DefineParam (0, 1, -0.5, 0.5)
 DefineParam (0, 1, -0.5, 0.5)
@@ -65,9 +65,9 @@ DefineFeature (PicoFeatDesc, 2, 1, 1, MAX_UINT8, "Pico", "pf", PicoFeatParams)
 /* define all of the parameters for the NormFeat type*/
 StartParamDesc (CharNormParams)
 DefineParam (0, 0, -0.25, 0.75)
-DefineParam (0, 0, 0.0, 1.0)
-DefineParam (0, 0, 0.0, 1.0)
-DefineParam (0, 0, 0.0, 1.0)
+DefineParam (0, 1, 0.0, 1.0)
+DefineParam (0, 1, 0.0, 1.0)
+DefineParam (0, 1, 0.0, 1.0)
 EndParamDesc
 /* now define the feature type itself (see features.h for info about each
   parameter).*/