Skip to content

Commit

Permalink
Major internationalization improvements
Browse files Browse the repository at this point in the history
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@153 d0cd1f9f-072b-0410-8dd7-cf729c803f20
  • Loading branch information
theraysmith committed Feb 1, 2008
1 parent b88a694 commit 0b50f4f
Showing 1 changed file with 18 additions and 22 deletions.
40 changes: 18 additions & 22 deletions training/unicharset_extractor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,28 +56,24 @@ void set_properties(UNICHARSET *unicharset, const char* const c_string) {
// Convert the string to a unichar id.
id = unicharset->unichar_to_id(c_string);

int step = 0;
int len = strlen(c_string);
for (int offset = 0; offset < len; offset += step) {
step = UNICHAR::utf8_step(c_string + offset);
if (step == 0)
break; // Invalid utf-8.

// Get the next Unicode code point in the string.
UNICHAR ch(c_string + offset, step);
wc = ch.first_uni();

/* Copy the properties. */
if (iswalpha(wc)) {
unicharset->set_isalpha(id, 1);
if (iswlower(wc))
unicharset->set_islower(id, 1);
if (iswupper(wc))
unicharset->set_isupper(id, 1);
}
if (iswdigit(wc))
unicharset->set_isdigit(id, 1);
int step = UNICHAR::utf8_step(c_string);
if (step == 0)
return; // Invalid utf-8.

// Get the next Unicode code point in the string.
UNICHAR ch(c_string, step);
wc = ch.first_uni();

/* Copy the properties. */
if (iswalpha(wc)) {
unicharset->set_isalpha(id, 1);
if (iswlower(wc))
unicharset->set_islower(id, 1);
if (iswupper(wc))
unicharset->set_isupper(id, 1);
}
if (iswdigit(wc))
unicharset->set_isdigit(id, 1);
#endif
}

Expand Down Expand Up @@ -123,7 +119,7 @@ int main(int argc, char** argv) {
}

int x_min, y_min, x_max, y_max;
char c_string[kBufSize];
char c_string[kBoxReadBufSize];
while (read_next_box(box_file, c_string, &x_min, &y_min, &x_max, &y_max)) {
unicharset.unichar_insert(c_string);
set_properties(&unicharset, c_string);
Expand Down

0 comments on commit 0b50f4f

Please sign in to comment.