Major internationalization improvements

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@133 d0cd1f9f-072b-0410-8dd7-cf729c803f20
qcdcool · Feb 1, 2008 · 2a67830 · 2a67830
1 parent aa55810
commit 2a67830
Show file tree

Hide file tree

Showing 17 changed files with 616 additions and 44 deletions.
diff --git a/ccutil/unichar.h b/ccutil/unichar.h
@@ -24,7 +24,7 @@
 
 // Maximum number of characters that can be stored in a UNICHAR. Must be
 // at least 4. Must not exceed 31 without changing the coding of length.
-#define UNICHAR_LEN 8
+#define UNICHAR_LEN 24
 
 // A UNICHAR_ID is the unique id of a unichar.
 typedef int UNICHAR_ID;

diff --git a/ccutil/unicharmap.cpp b/ccutil/unicharmap.cpp
@@ -19,7 +19,7 @@
 
 #include <assert.h>
 #include "unichar.h"
-
+#include "host.h"
 #include "unicharmap.h"
 
 UNICHARMAP::UNICHARMAP() :
@@ -135,6 +135,22 @@ bool UNICHARMAP::contains(const char* const unichar_repr,
       current_nodes[static_cast<unsigned char>(*current_char)].id >= 0;
 }
 
+// Report the length of the shortest leading portion of unichar_repr that is
+// itself an entry in the UNICHARMAP. The node tables are descended one byte
+// at a time; the search stops at the first node carrying a valid id.
+// Returns 0 when no leading portion matches (this includes the empty string).
+int UNICHARMAP::minmatch(const char* const unichar_repr) const {
+  UNICHARMAP_NODE* level = nodes;
+
+  for (int used = 0; level != NULL && unichar_repr[used] != '\0'; ++used) {
+    const UNICHARMAP_NODE& entry =
+        level[static_cast<unsigned char>(unichar_repr[used])];
+    // A non-negative id marks the end of a stored unichar.
+    if (entry.id >= 0)
+      return used + 1;
+    // Otherwise keep descending; a NULL child table ends the loop.
+    level = entry.children;
+  }
+  return 0;
+}
+
 void UNICHARMAP::clear() {
   if (nodes != 0)
   {

diff --git a/ccutil/unicharmap.h b/ccutil/unicharmap.h
@@ -56,6 +56,10 @@ class UNICHARMAP {
   // used. The length MUST be non-zero.
   bool contains(const char* const unichar_repr, int length) const;
 
+  // Return the minimum number of characters that must be used from this string
+  // to obtain a match in the UNICHARMAP.
+  int minmatch(const char* const unichar_repr) const;
+
   // Clear the UNICHARMAP. All previous data is lost.
   void clear();
 

diff --git a/ccutil/unicharset.cpp b/ccutil/unicharset.cpp
@@ -44,11 +44,10 @@ UNICHARSET::~UNICHARSET() {
 }
 
 void UNICHARSET::reserve(int unichars_number) {
-  if (unichars_number > size_reserved)
-  {
+  if (unichars_number > size_reserved) {
     UNICHAR_SLOT* unichars_new = new UNICHAR_SLOT[unichars_number];
     for (int i = 0; i < size_used; ++i)
-      memcpy(&unichars_new[i], &unichars[i], sizeof (UNICHAR_SLOT));
+      memcpy(&unichars_new[i], &unichars[i], sizeof(UNICHAR_SLOT));
     delete[] unichars;
     unichars = unichars_new;
     size_reserved = unichars_number;
@@ -68,22 +67,50 @@ const UNICHAR_ID UNICHARSET::unichar_to_id(const char* const unichar_repr,
   return ids.unichar_to_id(unichar_repr, length);
 }
 
+// Return the minimum number of bytes that matches a legal UNICHAR_ID,
+// while leaving a legal UNICHAR_ID afterwards. In other words, if there
+// is both a short and a long match to the string, return the length that
+// ensures there is a legal match after it.
+// Returns 0 if str is empty or does not start with a legal unichar. If no
+// length leaves a legal continuation, the shortest match is returned, so a
+// non-zero result is always a length reported as a match by ids.
+int UNICHARSET::step(const char* str) const {
+  // Find the length of the first matching unicharset member.
+  int minlength = ids.minmatch(str);
+  if (minlength == 0)
+    return 0;  // Empty string or illegal char.
+
+  int goodlength = minlength;
+  while (goodlength <= UNICHAR_LEN) {
+    if (str[goodlength] == '\0') {
+      // The scan below also stops when it reaches the terminator, and in
+      // that case the bytes consumed so far need not form a unichar. Only
+      // accept the whole remainder if it really is a member; otherwise fall
+      // back to the shortest match.
+      return ids.contains(str, goodlength) ? goodlength : minlength;
+    }
+    if (ids.minmatch(str + goodlength) > 0)
+      return goodlength;  // This length works!
+    // The next char is illegal so find the next usable length.
+    do {
+      ++goodlength;
+    } while (str[goodlength] != '\0' && goodlength <= UNICHAR_LEN &&
+             !ids.contains(str, goodlength));
+  }
+  // Search to find a subsequent legal char failed so return the minlength.
+  return minlength;
+}
+
 const char* const UNICHARSET::id_to_unichar(UNICHAR_ID id) const {
   assert(id < this->size());
   return unichars[id].representation;
 }
 
 void UNICHARSET::unichar_insert(const char* const unichar_repr) {
   if (!ids.contains(unichar_repr)) {
-    if (size_used == size_reserved)
-    {
+    if (size_used == size_reserved) {
       if (size_used == 0)
         reserve(8);
       else
         reserve(2 * size_used);
     }
 
     strcpy(unichars[size_used].representation, unichar_repr);
+    this->set_isalpha(size_used, false);
+    this->set_islower(size_used, false);
+    this->set_isupper(size_used, false);
+    this->set_isdigit(size_used, false);
+    this->unichars[size_used].properties.enabled = true;
     ids.insert(unichar_repr, size_used);
     ++size_used;
   }
@@ -93,6 +120,10 @@ bool UNICHARSET::contains_unichar(const char* const unichar_repr) {
   return ids.contains(unichar_repr);
 }
 
+// Length-limited overload: forwards unichar_repr and length to the id map
+// and reports whether that representation exists within the set.
+bool UNICHARSET::contains_unichar(const char* const unichar_repr, int length) {
+  const bool is_member = ids.contains(unichar_repr, length);
+  return is_member;
+}
+
 bool UNICHARSET::eq(UNICHAR_ID unichar_id, const char* const unichar_repr) {
   return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0;
 }
@@ -135,8 +166,7 @@ bool UNICHARSET::load_from_file(const char* filename) {
 
   this->clear();
   if (fgets(buffer, sizeof (buffer), file) == NULL ||
-      sscanf(buffer, "%d", &unicharset_size) != 1)
-  {
+      sscanf(buffer, "%d", &unicharset_size) != 1) {
     fclose(file);
     return false;
   }
@@ -146,8 +176,7 @@ bool UNICHARSET::load_from_file(const char* filename) {
     unsigned int properties;
 
     if (fgets(buffer, sizeof (buffer), file) == NULL ||
-        sscanf(buffer, "%s %x", unichar, &properties) != 2)
-    {
+        sscanf(buffer, "%s %x", unichar, &properties) != 2) {
       fclose(file);
       return false;
     }
@@ -160,7 +189,45 @@ bool UNICHARSET::load_from_file(const char* filename) {
     this->set_islower(id, properties & ISLOWER_MASK);
     this->set_isupper(id, properties & ISUPPER_MASK);
     this->set_isdigit(id, properties & ISDIGIT_MASK);
+    this->unichars[id].properties.enabled = true;
   }
   fclose(file);
   return true;
 }
+
+// Set a whitelist and/or blacklist of characters to recognize.
+// An empty or NULL whitelist enables everything (minus any blacklist).
+// An empty or NULL blacklist disables nothing.
+// The blacklist is applied after the whitelist, so it wins when a unichar
+// appears in both. Each list is split into unichars with step(); bytes that
+// do not start a unichar in this set are skipped one at a time.
+void UNICHARSET::set_black_and_whitelist(const char* blacklist,
+                                         const char* whitelist) {
+  bool def_enabled = whitelist == NULL || whitelist[0] == '\0';
+  // Set everything to default
+  for (int ch = 0; ch < size_used; ++ch)
+    unichars[ch].properties.enabled = def_enabled;
+  int ch_step;
+  if (!def_enabled) {
+    // Enable the whitelist.
+    for (int w_ind = 0; whitelist[w_ind] != '\0'; w_ind += ch_step) {
+      ch_step = step(whitelist + w_ind);
+      // step() can report a length that runs to the end of the string without
+      // being a member (e.g. a known unichar followed by unknown bytes).
+      // Never look up such a length: fall back to the shortest real match.
+      if (ch_step > 0 && !ids.contains(whitelist + w_ind, ch_step))
+        ch_step = ids.minmatch(whitelist + w_ind);
+      if (ch_step > 0) {
+        UNICHAR_ID u_id = unichar_to_id(whitelist + w_ind, ch_step);
+        unichars[u_id].properties.enabled = true;
+      } else {
+        ch_step = 1;  // Not in the unicharset: skip one byte and retry.
+      }
+    }
+  }
+  if (blacklist != NULL && blacklist[0] != '\0') {
+    // Disable the blacklist.
+    for (int b_ind = 0; blacklist[b_ind] != '\0'; b_ind += ch_step) {
+      ch_step = step(blacklist + b_ind);
+      // Same guard as for the whitelist: only look up lengths that are
+      // actually members of the set.
+      if (ch_step > 0 && !ids.contains(blacklist + b_ind, ch_step))
+        ch_step = ids.minmatch(blacklist + b_ind);
+      if (ch_step > 0) {
+        UNICHAR_ID u_id = unichar_to_id(blacklist + b_ind, ch_step);
+        unichars[u_id].properties.enabled = false;
+      } else {
+        ch_step = 1;  // Not in the unicharset: skip one byte and retry.
+      }
+    }
+  }
+}
+
diff --git a/ccutil/unicharset.h b/ccutil/unicharset.h
@@ -43,6 +43,12 @@ class UNICHARSET {
   const UNICHAR_ID unichar_to_id(const char* const unichar_repr,
                                  int length) const;
 
+  // Return the minimum number of bytes that matches a legal UNICHAR_ID,
+  // while leaving a legal UNICHAR_ID afterwards. In other words, if there
+  // is both a short and a long match to the string, return the length that
+  // ensures there is a legal match after it.
+  int step(const char* str) const;
+
   // Return the unichar representation corresponding to the given UNICHAR_ID
   // within the UNICHARSET.
   const char* const id_to_unichar(UNICHAR_ID id) const;
@@ -52,6 +58,7 @@ class UNICHARSET {
 
   // Return true if the given unichar representation exists within the set.
   bool contains_unichar(const char* const unichar_repr);
+  bool contains_unichar(const char* const unichar_repr, int length);
 
   // Return true if the given unichar representation corresponds to the given
   // UNICHAR_ID within the set.
@@ -84,6 +91,15 @@ class UNICHARSET {
   // true if the operation is successful.
   bool load_from_file(const char* const filename);
 
+  // Set a whitelist and/or blacklist of characters to recognize.
+  // An empty or NULL whitelist enables everything (minus any blacklist).
+  // An empty or NULL blacklist disables nothing.
+  // The blacklist overrides the whitelist.
+  // Each list is a string of utf8 character strings. Boundaries between
+  // unicharset units are worked out automatically, and characters not in
+  // the unicharset are silently ignored.
+  void set_black_and_whitelist(const char* blacklist, const char* whitelist);
+
   // Set the isalpha property of the given unichar to the given value.
   void set_isalpha(UNICHAR_ID unichar_id, bool value) {
     unichars[unichar_id].properties.isalpha = value;
@@ -172,13 +188,19 @@ class UNICHARSET {
     return get_isdigit(unichar_to_id(unichar_repr, length));
   }
 
+  // Report the enabled flag stored for the given unichar. The flag is set
+  // to true on insertion and load, and rewritten by set_black_and_whitelist.
+  bool get_enabled(UNICHAR_ID unichar_id) const {
+    const UNICHAR_SLOT& slot = unichars[unichar_id];
+    return slot.properties.enabled;
+  }
+
  private:
 
   struct UNICHAR_PROPERTIES {
     bool isalpha;
     bool islower;
     bool isupper;
     bool isdigit;
+    bool enabled;
   };
 
   struct UNICHAR_SLOT {

diff --git a/classify/intproto.cpp b/classify/intproto.cpp
@@ -59,6 +59,10 @@
 /* define pad used to snap near horiz/vertical protos to horiz/vertical */
 #define HV_TOLERANCE  (0.0025)   /* approx 0.9 degrees */
 
+const int kInputSize = 16;
+//extern int input_unicode[kInputSize];
+int input_unicode[kInputSize];
+
 typedef enum
 { StartSwitch, EndSwitch, LastSwitch }
 SWITCH_TYPE;
@@ -872,6 +876,7 @@ INT_TEMPLATES ReadIntTemplates(FILE *File, BOOL8 swap) {
   int i, j, x, y, z;
   int nread;
   int unicharset_size;
+  int version_id = 0;
   INT_TEMPLATES Templates;
   CLASS_PRUNER Pruner;
   INT_CLASS Class;
@@ -900,6 +905,12 @@ INT_TEMPLATES ReadIntTemplates(FILE *File, BOOL8 swap) {
             unicharset_size, unicharset.size());
     exit(1);
   }
+  if (Templates->NumClasses < 0) {
+    // This file has a version id!
+    version_id = -Templates->NumClasses;
+    if (fread(&Templates->NumClasses, sizeof(int), 1, File) != 1)
+      cprintf ("Bad read of inttemp!\n");
+  }
   for (i = 0; i < unicharset_size; ++i) {
     if (fread(&Templates->IndexFor[i], sizeof(CLASS_INDEX), 1, File) != 1)
       cprintf("Bad read of inttemp!\n");
@@ -944,10 +955,13 @@ INT_TEMPLATES ReadIntTemplates(FILE *File, BOOL8 swap) {
         fread(&Class->NumProtoSets, sizeof(Class->NumProtoSets), 1, File) != 1 ||
         fread(&Class->NumConfigs, sizeof(Class->NumConfigs), 1, File) != 1)
       cprintf ("Bad read of inttemp!\n");
-    for (j = 0; j <= MAX_NUM_PROTO_SETS; ++j) {
-      int junk;
-      if (fread(&junk, sizeof(junk), 1, File) != 1)
-        cprintf ("Bad read of inttemp!\n");
+    if (version_id == 0) {
+      // Only version 0 writes 5 pointless pointers to the file.
+      for (j = 0; j < 5; ++j) {
+        int junk;
+        if (fread(&junk, sizeof(junk), 1, File) != 1)
+          cprintf ("Bad read of inttemp!\n");
+      }
     }
     for (j = 0; j < MAX_NUM_CONFIGS; ++j) {
       if (fread(&Class->ConfigLengths[j], sizeof(UINT16), 1, File) != 1)
@@ -1072,11 +1086,13 @@ void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates,
   int i, j;
   INT_CLASS Class;
   int unicharset_size = target_unicharset.size();
+  int version_id = -1;  // Turns positive on reading.
 
   /* first write the high level template struct */
   fwrite((char *) &unicharset_size, sizeof (int), 1, File);
-  fwrite((char *) &Templates->NumClasses, sizeof (int), 1, File);
+  fwrite((char *) &version_id, sizeof (int), 1, File);
   fwrite((char *) &Templates->NumClassPruners, sizeof (int), 1, File);
+  fwrite((char *) &Templates->NumClasses, sizeof (int), 1, File);
   fwrite((char *) &Templates->IndexFor[0], sizeof (CLASS_INDEX),
          unicharset_size, File);
   fwrite((char *) &Templates->ClassIdFor[0], sizeof (CLASS_ID),
@@ -1092,7 +1108,12 @@ void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates,
     Class = ClassForIndex (Templates, i);
 
     /* first write out the high level struct for the class */
-    fwrite ((char *) Class, sizeof (INT_CLASS_STRUCT), 1, File);
+    fwrite(&Class->NumProtos, sizeof(Class->NumProtos), 1, File);
+    fwrite(&Class->NumProtoSets, sizeof(Class->NumProtoSets), 1, File);
+    fwrite(&Class->NumConfigs, sizeof(Class->NumConfigs), 1, File);
+    for (j = 0; j < MAX_NUM_CONFIGS; ++j) {
+      fwrite(&Class->ConfigLengths[j], sizeof(UINT16), 1, File);
+    }
 
     /* then write out the proto lengths */
     fwrite ((char *) (Class->ProtoLengths), sizeof (UINT8),
@@ -1546,7 +1567,7 @@ FLOAT32 AnglePad, PROTO Proto, TABLE_FILLER * Filler)
   else {
     /* diagonal proto */
 
-    if (Angle > 0.0 && Angle < 0.25 || Angle > 0.5 && Angle < 0.75) {
+    if ((Angle > 0.0 && Angle < 0.25) || (Angle > 0.5 && Angle < 0.75)) {
       /* rising diagonal proto */
       Angle *= 2.0 * PI;
       Cos = fabs (cos (Angle));
@@ -1736,17 +1757,19 @@ void RenderIntProto(void *window,
   Xmin = Ymin = NUM_PP_BUCKETS;
   Xmax = Ymax = 0;
   for (Bucket = 0; Bucket < NUM_PP_BUCKETS; Bucket++) {
-    if (ProtoMask & ProtoSet->ProtoPruner[PRUNER_X][Bucket][ProtoWordIndex])
+    if (ProtoMask & ProtoSet->ProtoPruner[PRUNER_X][Bucket][ProtoWordIndex]) {
       if (Bucket < Xmin)
         Xmin = Bucket;
-    else if (Bucket > Xmax)
-      Xmax = Bucket;
+      else if (Bucket > Xmax)
+        Xmax = Bucket;
+    }
 
-    if (ProtoMask & ProtoSet->ProtoPruner[PRUNER_Y][Bucket][ProtoWordIndex])
+    if (ProtoMask & ProtoSet->ProtoPruner[PRUNER_Y][Bucket][ProtoWordIndex]) {
       if (Bucket < Ymin)
         Ymin = Bucket;
-    else if (Bucket > Ymax)
-      Ymax = Bucket;
+      else if (Bucket > Ymax)
+        Ymax = Bucket;
+    }
   }
   X = (Xmin + Xmax + 1) / 2.0 * PROTO_PRUNER_SCALE - DISPLAY_OFFSET;
   Y = (Ymin + Ymax + 1) / 2.0 * PROTO_PRUNER_SCALE - DISPLAY_OFFSET;

diff --git a/classify/intproto.h b/classify/intproto.h
@@ -39,7 +39,7 @@
 #define MAX_PROTO_INDEX   24
 #define BITS_PER_WERD   (8 * sizeof (UINT32))
 #define MAX_NUM_CONFIGS   32
-#define MAX_NUM_PROTOS    256
+#define MAX_NUM_PROTOS    512
 #define PROTOS_PER_PROTO_SET  64
 #define MAX_NUM_PROTO_SETS  (MAX_NUM_PROTOS / PROTOS_PER_PROTO_SET)
 #define NUM_PP_PARAMS   3

diff --git a/classify/protos.cpp b/classify/protos.cpp
@@ -31,6 +31,7 @@
 #include "emalloc.h"
 #include "freelist.h"
 #include "callcpp.h"
+#include "tprintf.h"
 #include "adaptmatch.h"
 #include "scanutils.h"
 #include "globals.h"
@@ -122,6 +123,10 @@ int AddProtoToClass(CLASS_TYPE Class) {
   }
   NewProto = NumProtosIn (Class);
   NumProtosIn (Class)++;
+  if (NumProtosIn(Class) > MAX_NUM_PROTOS) {
+    tprintf("Ouch! number of protos = %d, vs max of %d!",
+            NumProtosIn(Class), MAX_NUM_PROTOS);
+  }
   return (NewProto);
 }