Skip to content

Commit

Permalink
Major internationalization improvements
Browse files Browse the repository at this point in the history
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@133 d0cd1f9f-072b-0410-8dd7-cf729c803f20
  • Loading branch information
theraysmith committed Feb 1, 2008
1 parent aa55810 commit 2a67830
Show file tree
Hide file tree
Showing 17 changed files with 616 additions and 44 deletions.
2 changes: 1 addition & 1 deletion ccutil/unichar.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@

// Maximum number of characters that can be stored in a UNICHAR. Must be
// at least 4. Must not exceed 31 without changing the coding of length.
#define UNICHAR_LEN 8
#define UNICHAR_LEN 24

// A UNICHAR_ID is the unique id of a unichar.
typedef int UNICHAR_ID;
Expand Down
18 changes: 17 additions & 1 deletion ccutil/unicharmap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

#include <assert.h>
#include "unichar.h"

#include "host.h"
#include "unicharmap.h"

UNICHARMAP::UNICHARMAP() :
Expand Down Expand Up @@ -135,6 +135,22 @@ bool UNICHARMAP::contains(const char* const unichar_repr,
current_nodes[static_cast<unsigned char>(*current_char)].id >= 0;
}

// Return the minimum number of characters that must be used from this string
// to obtain a match in the UNICHARMAP.
int UNICHARMAP::minmatch(const char* const unichar_repr) const {
const char* current_char = unichar_repr;
UNICHARMAP_NODE* current_nodes = nodes;

while (current_nodes != NULL && *current_char != '\0') {
if (current_nodes[static_cast<unsigned char>(*current_char)].id >= 0)
return current_char + 1 - unichar_repr;
current_nodes =
current_nodes[static_cast<unsigned char>(*current_char)].children;
++current_char;
}
return 0;
}

void UNICHARMAP::clear() {
if (nodes != 0)
{
Expand Down
4 changes: 4 additions & 0 deletions ccutil/unicharmap.h
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@ class UNICHARMAP {
// used. The length MUST be non-zero.
bool contains(const char* const unichar_repr, int length) const;

// Return the minimum number of characters that must be used from this string
// to obtain a match in the UNICHARMAP.
int minmatch(const char* const unichar_repr) const;

// Clear the UNICHARMAP. All previous data is lost.
void clear();

Expand Down
85 changes: 76 additions & 9 deletions ccutil/unicharset.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,10 @@ UNICHARSET::~UNICHARSET() {
}

void UNICHARSET::reserve(int unichars_number) {
if (unichars_number > size_reserved)
{
if (unichars_number > size_reserved) {
UNICHAR_SLOT* unichars_new = new UNICHAR_SLOT[unichars_number];
for (int i = 0; i < size_used; ++i)
memcpy(&unichars_new[i], &unichars[i], sizeof (UNICHAR_SLOT));
memcpy(&unichars_new[i], &unichars[i], sizeof(UNICHAR_SLOT));
delete[] unichars;
unichars = unichars_new;
size_reserved = unichars_number;
Expand All @@ -68,22 +67,50 @@ const UNICHAR_ID UNICHARSET::unichar_to_id(const char* const unichar_repr,
return ids.unichar_to_id(unichar_repr, length);
}

// Return the number of bytes at the start of str that should be consumed as
// a single legal UNICHAR_ID, preferring a length that leaves either the end
// of the string or another legal UNICHAR_ID afterwards. In other words, if
// there is both a short and a long match to the string, return the length
// that ensures there is a legal match after it.
// Returns 0 if str is empty or does not start with a legal unichar.
// The returned length, when non-zero, always denotes a prefix that is
// itself a member of the unicharset, so it is safe to hand to
// unichar_to_id(str, length).
int UNICHARSET::step(const char* str) const {
  // Find the length of the first (shortest) matching unicharset member.
  const int minlength = ids.minmatch(str);
  if (minlength == 0)
    return 0;  // Empty string or illegal char.

  int goodlength = minlength;
  while (goodlength <= UNICHAR_LEN) {
    // At this point str[0, goodlength) is a member of the unicharset.
    if (str[goodlength] == '\0' || ids.minmatch(str + goodlength) > 0)
      return goodlength;  // This length works!

    // The remainder starts with an illegal char, so look for the next
    // longer prefix that is itself a member. A candidate is only accepted
    // after ids.contains() confirms it; merely reaching the end of the
    // string is not enough, otherwise a length that matches nothing
    // could be returned.
    bool found_longer = false;
    while (str[goodlength] != '\0' && ++goodlength <= UNICHAR_LEN) {
      if (ids.contains(str, goodlength)) {
        found_longer = true;
        break;
      }
    }
    if (!found_longer)
      break;
  }
  // Search to find a subsequent legal char failed so return the minlength.
  return minlength;
}

// Return the stored string representation of the unichar with the given id.
// The id is asserted to be below size().
const char* const UNICHARSET::id_to_unichar(UNICHAR_ID id) const {
  assert(id < size());
  const char* const repr = unichars[id].representation;
  return repr;
}

void UNICHARSET::unichar_insert(const char* const unichar_repr) {
if (!ids.contains(unichar_repr)) {
if (size_used == size_reserved)
{
if (size_used == size_reserved) {
if (size_used == 0)
reserve(8);
else
reserve(2 * size_used);
}

strcpy(unichars[size_used].representation, unichar_repr);
this->set_isalpha(size_used, false);
this->set_islower(size_used, false);
this->set_isupper(size_used, false);
this->set_isdigit(size_used, false);
this->unichars[size_used].properties.enabled = true;
ids.insert(unichar_repr, size_used);
++size_used;
}
Expand All @@ -93,6 +120,10 @@ bool UNICHARSET::contains_unichar(const char* const unichar_repr) {
return ids.contains(unichar_repr);
}

// Return true if the first length chars of unichar_repr form a unichar that
// exists within the set. The lookup is delegated to the id map.
bool UNICHARSET::contains_unichar(const char* const unichar_repr, int length) {
  const bool is_member = ids.contains(unichar_repr, length);
  return is_member;
}

// Return true if the representation stored for unichar_id is string-equal
// to unichar_repr.
bool UNICHARSET::eq(UNICHAR_ID unichar_id, const char* const unichar_repr) {
  const char* const stored_repr = id_to_unichar(unichar_id);
  return strcmp(stored_repr, unichar_repr) == 0;
}
Expand Down Expand Up @@ -135,8 +166,7 @@ bool UNICHARSET::load_from_file(const char* filename) {

this->clear();
if (fgets(buffer, sizeof (buffer), file) == NULL ||
sscanf(buffer, "%d", &unicharset_size) != 1)
{
sscanf(buffer, "%d", &unicharset_size) != 1) {
fclose(file);
return false;
}
Expand All @@ -146,8 +176,7 @@ bool UNICHARSET::load_from_file(const char* filename) {
unsigned int properties;

if (fgets(buffer, sizeof (buffer), file) == NULL ||
sscanf(buffer, "%s %x", unichar, &properties) != 2)
{
sscanf(buffer, "%s %x", unichar, &properties) != 2) {
fclose(file);
return false;
}
Expand All @@ -160,7 +189,45 @@ bool UNICHARSET::load_from_file(const char* filename) {
this->set_islower(id, properties & ISLOWER_MASK);
this->set_isupper(id, properties & ISUPPER_MASK);
this->set_isdigit(id, properties & ISDIGIT_MASK);
this->unichars[id].properties.enabled = true;
}
fclose(file);
return true;
}

// Set a whitelist and/or blacklist of characters to recognize.
// An empty or NULL whitelist enables everything (minus any blacklist).
// An empty or NULL blacklist disables nothing.
void UNICHARSET::set_black_and_whitelist(const char* blacklist,
const char* whitelist) {
bool def_enabled = whitelist == NULL || whitelist[0] == '\0';
// Set everything to default
for (int ch = 0; ch < size_used; ++ch)
unichars[ch].properties.enabled = def_enabled;
int ch_step;
if (!def_enabled) {
// Enable the whitelist.
for (int w_ind = 0; whitelist[w_ind] != '\0'; w_ind += ch_step) {
ch_step = step(whitelist + w_ind);
if (ch_step > 0) {
UNICHAR_ID u_id = unichar_to_id(whitelist + w_ind, ch_step);
unichars[u_id].properties.enabled = true;
} else {
ch_step = 1;
}
}
}
if (blacklist != NULL && blacklist[0] != '\0') {
// Disable the blacklist.
for (int b_ind = 0; blacklist[b_ind] != '\0'; b_ind += ch_step) {
ch_step = step(blacklist + b_ind);
if (ch_step > 0) {
UNICHAR_ID u_id = unichar_to_id(blacklist + b_ind, ch_step);
unichars[u_id].properties.enabled = false;
} else {
ch_step = 1;
}
}
}
}

22 changes: 22 additions & 0 deletions ccutil/unicharset.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,12 @@ class UNICHARSET {
const UNICHAR_ID unichar_to_id(const char* const unichar_repr,
int length) const;

// Return the minimum number of bytes that matches a legal UNICHAR_ID,
// while leaving a legal UNICHAR_ID afterwards. In other words, if there
// is both a short and a long match to the string, return the length that
// ensures there is a legal match after it.
int step(const char* str) const;

// Return the unichar representation corresponding to the given UNICHAR_ID
// within the UNICHARSET.
const char* const id_to_unichar(UNICHAR_ID id) const;
Expand All @@ -52,6 +58,7 @@ class UNICHARSET {

// Return true if the given unichar representation exists within the set.
bool contains_unichar(const char* const unichar_repr);
bool contains_unichar(const char* const unichar_repr, int length);

// Return true if the given unichar representation corresponds to the given
// UNICHAR_ID within the set.
Expand Down Expand Up @@ -84,6 +91,15 @@ class UNICHARSET {
// true if the operation is successful.
bool load_from_file(const char* const filename);

// Set a whitelist and/or blacklist of characters to recognize.
// An empty or NULL whitelist enables everything (minus any blacklist).
// An empty or NULL blacklist disables nothing.
// The blacklist overrides the whitelist.
// Each list is a string of utf8 character strings. Boundaries between
// unicharset units are worked out automatically, and characters not in
// the unicharset are silently ignored.
void set_black_and_whitelist(const char* blacklist, const char* whitelist);

// Set the isalpha property of the given unichar to the given value.
void set_isalpha(UNICHAR_ID unichar_id, bool value) {
unichars[unichar_id].properties.isalpha = value;
Expand Down Expand Up @@ -172,13 +188,19 @@ class UNICHARSET {
return get_isdigit(unichar_to_id(unichar_repr, length));
}

// Return the enabled flag stored for the given unichar. No bounds check is
// performed on unichar_id.
bool get_enabled(UNICHAR_ID unichar_id) const {
  const bool enabled = unichars[unichar_id].properties.enabled;
  return enabled;
}

private:

// Boolean flags kept for every unichar in the set.
struct UNICHAR_PROPERTIES {
// Character-class flags. They are cleared when a unichar is first inserted
// and filled in from the property bit masks when a set is loaded from file.
bool isalpha;
bool islower;
bool isupper;
bool isdigit;
// Whether the unichar is currently enabled. Set to true on insertion and on
// load, then rewritten for every unichar by set_black_and_whitelist().
bool enabled;
};

struct UNICHAR_SLOT {
Expand Down
49 changes: 36 additions & 13 deletions classify/intproto.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@
/* define pad used to snap near horiz/vertical protos to horiz/vertical */
#define HV_TOLERANCE (0.0025) /* approx 0.9 degrees */

const int kInputSize = 16;
//extern int input_unicode[kInputSize];
int input_unicode[kInputSize];

typedef enum
{ StartSwitch, EndSwitch, LastSwitch }
SWITCH_TYPE;
Expand Down Expand Up @@ -872,6 +876,7 @@ INT_TEMPLATES ReadIntTemplates(FILE *File, BOOL8 swap) {
int i, j, x, y, z;
int nread;
int unicharset_size;
int version_id = 0;
INT_TEMPLATES Templates;
CLASS_PRUNER Pruner;
INT_CLASS Class;
Expand Down Expand Up @@ -900,6 +905,12 @@ INT_TEMPLATES ReadIntTemplates(FILE *File, BOOL8 swap) {
unicharset_size, unicharset.size());
exit(1);
}
if (Templates->NumClasses < 0) {
// This file has a version id!
version_id = -Templates->NumClasses;
if (fread(&Templates->NumClasses, sizeof(int), 1, File) != 1)
cprintf ("Bad read of inttemp!\n");
}
for (i = 0; i < unicharset_size; ++i) {
if (fread(&Templates->IndexFor[i], sizeof(CLASS_INDEX), 1, File) != 1)
cprintf("Bad read of inttemp!\n");
Expand Down Expand Up @@ -944,10 +955,13 @@ INT_TEMPLATES ReadIntTemplates(FILE *File, BOOL8 swap) {
fread(&Class->NumProtoSets, sizeof(Class->NumProtoSets), 1, File) != 1 ||
fread(&Class->NumConfigs, sizeof(Class->NumConfigs), 1, File) != 1)
cprintf ("Bad read of inttemp!\n");
for (j = 0; j <= MAX_NUM_PROTO_SETS; ++j) {
int junk;
if (fread(&junk, sizeof(junk), 1, File) != 1)
cprintf ("Bad read of inttemp!\n");
if (version_id == 0) {
// Only version 0 writes 5 pointless pointers to the file.
for (j = 0; j < 5; ++j) {
int junk;
if (fread(&junk, sizeof(junk), 1, File) != 1)
cprintf ("Bad read of inttemp!\n");
}
}
for (j = 0; j < MAX_NUM_CONFIGS; ++j) {
if (fread(&Class->ConfigLengths[j], sizeof(UINT16), 1, File) != 1)
Expand Down Expand Up @@ -1072,11 +1086,13 @@ void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates,
int i, j;
INT_CLASS Class;
int unicharset_size = target_unicharset.size();
int version_id = -1; // Turns positive on reading.

/* first write the high level template struct */
fwrite((char *) &unicharset_size, sizeof (int), 1, File);
fwrite((char *) &Templates->NumClasses, sizeof (int), 1, File);
fwrite((char *) &version_id, sizeof (int), 1, File);
fwrite((char *) &Templates->NumClassPruners, sizeof (int), 1, File);
fwrite((char *) &Templates->NumClasses, sizeof (int), 1, File);
fwrite((char *) &Templates->IndexFor[0], sizeof (CLASS_INDEX),
unicharset_size, File);
fwrite((char *) &Templates->ClassIdFor[0], sizeof (CLASS_ID),
Expand All @@ -1092,7 +1108,12 @@ void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates,
Class = ClassForIndex (Templates, i);

/* first write out the high level struct for the class */
fwrite ((char *) Class, sizeof (INT_CLASS_STRUCT), 1, File);
fwrite(&Class->NumProtos, sizeof(Class->NumProtos), 1, File);
fwrite(&Class->NumProtoSets, sizeof(Class->NumProtoSets), 1, File);
fwrite(&Class->NumConfigs, sizeof(Class->NumConfigs), 1, File);
for (j = 0; j < MAX_NUM_CONFIGS; ++j) {
fwrite(&Class->ConfigLengths[j], sizeof(UINT16), 1, File);
}

/* then write out the proto lengths */
fwrite ((char *) (Class->ProtoLengths), sizeof (UINT8),
Expand Down Expand Up @@ -1546,7 +1567,7 @@ FLOAT32 AnglePad, PROTO Proto, TABLE_FILLER * Filler)
else {
/* diagonal proto */

if (Angle > 0.0 && Angle < 0.25 || Angle > 0.5 && Angle < 0.75) {
if ((Angle > 0.0 && Angle < 0.25) || (Angle > 0.5 && Angle < 0.75)) {
/* rising diagonal proto */
Angle *= 2.0 * PI;
Cos = fabs (cos (Angle));
Expand Down Expand Up @@ -1736,17 +1757,19 @@ void RenderIntProto(void *window,
Xmin = Ymin = NUM_PP_BUCKETS;
Xmax = Ymax = 0;
for (Bucket = 0; Bucket < NUM_PP_BUCKETS; Bucket++) {
if (ProtoMask & ProtoSet->ProtoPruner[PRUNER_X][Bucket][ProtoWordIndex])
if (ProtoMask & ProtoSet->ProtoPruner[PRUNER_X][Bucket][ProtoWordIndex]) {
if (Bucket < Xmin)
Xmin = Bucket;
else if (Bucket > Xmax)
Xmax = Bucket;
else if (Bucket > Xmax)
Xmax = Bucket;
}

if (ProtoMask & ProtoSet->ProtoPruner[PRUNER_Y][Bucket][ProtoWordIndex])
if (ProtoMask & ProtoSet->ProtoPruner[PRUNER_Y][Bucket][ProtoWordIndex]) {
if (Bucket < Ymin)
Ymin = Bucket;
else if (Bucket > Ymax)
Ymax = Bucket;
else if (Bucket > Ymax)
Ymax = Bucket;
}
}
X = (Xmin + Xmax + 1) / 2.0 * PROTO_PRUNER_SCALE - DISPLAY_OFFSET;
Y = (Ymin + Ymax + 1) / 2.0 * PROTO_PRUNER_SCALE - DISPLAY_OFFSET;
Expand Down
2 changes: 1 addition & 1 deletion classify/intproto.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@
#define MAX_PROTO_INDEX 24
#define BITS_PER_WERD (8 * sizeof (UINT32))
#define MAX_NUM_CONFIGS 32
#define MAX_NUM_PROTOS 256
#define MAX_NUM_PROTOS 512
#define PROTOS_PER_PROTO_SET 64
#define MAX_NUM_PROTO_SETS (MAX_NUM_PROTOS / PROTOS_PER_PROTO_SET)
#define NUM_PP_PARAMS 3
Expand Down
5 changes: 5 additions & 0 deletions classify/protos.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include "emalloc.h"
#include "freelist.h"
#include "callcpp.h"
#include "tprintf.h"
#include "adaptmatch.h"
#include "scanutils.h"
#include "globals.h"
Expand Down Expand Up @@ -122,6 +123,10 @@ int AddProtoToClass(CLASS_TYPE Class) {
}
NewProto = NumProtosIn (Class);
NumProtosIn (Class)++;
if (NumProtosIn(Class) > MAX_NUM_PROTOS) {
tprintf("Ouch! number of protos = %d, vs max of %d!",
NumProtosIn(Class), MAX_NUM_PROTOS);
}
return (NewProto);
}

Expand Down
Loading

0 comments on commit 2a67830

Please sign in to comment.