Skip to content

Commit

Permalink
Fixes to wordlist2dawg to create correct dawgs on windows
Browse files Browse the repository at this point in the history
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@179 d0cd1f9f-072b-0410-8dd7-cf729c803f20
  • Loading branch information
theraysmith committed Aug 14, 2008
1 parent 3f218cd commit b950752
Show file tree
Hide file tree
Showing 6 changed files with 33 additions and 137 deletions.
40 changes: 6 additions & 34 deletions dict/dawg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -138,40 +138,12 @@ inT32 def_letter_is_okay(EDGE_ARRAY dawg,
else
return (FALSE);
}
else {
/* Leading punctuation */
if (*node == 0 &&
char_index != 0 &&
// TODO(tkielbus) Replace isalpha by unicode versions.
// However the lengths information is not available at this point in the
// code. We will probably get rid of the dictionaries at some point anyway.
isalpha (dummy_word [char_index]) &&
! leading_punc (dummy_word [char_index-1]) &&
dummy_word [char_index-1] != '-') {
return (FALSE);
}
}
/* Handle compound words */
#if 0
if (dummy_word [char_index] == '-') {
if (char_index>0 && !word_end
&& word [char_index-1] == '-'
&& word [char_index+1] == '-')
return FALSE; /*not allowed*/
dummy_word [char_index] = (char) 0;
if (word_in_dawg (dawg, dummy_word.string())) {
dummy_word [char_index] = '-';
*node = 0;
return (TRUE);
}
else {
dummy_word [char_index] = '-';
return (FALSE);
}
}
#endif
// rays: removed incorrect code that attempted to enforce leading
// punctuation (or nothing) before an alpha character.
/* Check the DAWG */
edge = edge_char_of (dawg, *node, dummy_word [char_index], word_end);
edge = edge_char_of(dawg, *node,
static_cast<unsigned char>(dummy_word [char_index]),
word_end);

if (edge != NO_EDGE) { /* Normal edge in DAWG */
if (case_sensative || case_is_okay (dummy_word, char_index)) {
Expand Down Expand Up @@ -244,7 +216,7 @@ void print_dawg_node(EDGE_ARRAY dawg, NODE_REF node) {
const char *is_last;
const char *eow;

char ch;
int ch;

if (edge_occupied (dawg, edge)) {
do {
Expand Down
4 changes: 2 additions & 2 deletions dict/dawg.h
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ for (edge=0; edge<max_num_edges; edge++) \
**********************************************************************/

#define edge_letter(edges,e) \
((char)(((edges)[e] & LETTER_MASK) >> LETTER_START_BIT))
((int)(((edges)[e] & LETTER_MASK) >> LETTER_START_BIT))

/**********************************************************************
* letter_of_edge
Expand All @@ -201,7 +201,7 @@ for (edge=0; edge<max_num_edges; edge++) \
**********************************************************************/

#define letter_of_edge(edge) \
((char)((edge & LETTER_MASK) >> LETTER_START_BIT))
((int)((edge & LETTER_MASK) >> LETTER_START_BIT))

/**********************************************************************
* last_edge
Expand Down
3 changes: 2 additions & 1 deletion dict/lookdawg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,8 @@ void match_words (EDGE_ARRAY dawg,
}
else {
word_end = (string[index+1] == (char) 0);
edge = edge_char_of (dawg, node, string[index], word_end);
edge = edge_char_of(dawg, node,
static_cast<unsigned char>(string[index]), word_end);
if (edge != NO_EDGE) { /* Normal edge in DAWG */
node = next_node (dawg, edge);
if (word_end) {
Expand Down
4 changes: 4 additions & 0 deletions dict/makedawg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,11 @@ void write_squished_dawg (const char *filename,
node_map = build_node_map (dawg, &node_count, FALSE, max_num_edges,
reserved_edges);

#ifdef WIN32
file = open_file (filename, "wb");
#else
file = open_file (filename, "w");
#endif

num_edges = 0; /* Count number of edges */
for (edge=0; edge<max_num_edges; edge++)
Expand Down
29 changes: 14 additions & 15 deletions dict/trie.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,12 +55,12 @@ void add_edge_linkage(EDGE_ARRAY dawg,
NODE_REF node1,
NODE_REF node2,
EDGE_RECORD direction,
char character,
int character,
EDGE_RECORD word_end) {
EDGE_REF edge1 = node1;
EDGE_REF edge2;
inT32 num_edges = edges_in_node (dawg, node1);
inT32 last_one;
inT64 last_one;

word_end = (word_end ? WERD_END_FLAG : 0);

Expand Down Expand Up @@ -117,7 +117,7 @@ void add_edge_linkage(EDGE_ARRAY dawg,
bool add_new_edge(EDGE_ARRAY dawg,
NODE_REF *node1,
NODE_REF *node2,
char character,
int character,
EDGE_RECORD word_end,
inT32 max_num_edges,
inT32 reserved_edges) {
Expand Down Expand Up @@ -149,7 +149,7 @@ bool add_new_edge(EDGE_ARRAY dawg,
* Add in a word by creating the necessary nodes and edges.
**********************************************************************/
void add_word_to_dawg(EDGE_ARRAY dawg,
char *string,
const char *string,
inT32 max_num_edges,
inT32 reserved_edges) {
EDGE_REF edge;
Expand All @@ -162,17 +162,17 @@ void add_word_to_dawg(EDGE_ARRAY dawg,

if (debug) cprintf("Adding word %s\n", string);
for (i=0; i<strlen(string)-1; i++) {
unsigned char ch = case_sensative ? string[i] : tolower(string[i]);
if (still_finding_chars) {
edge = edge_char_of (dawg, last_node, string[i], word_end);
edge = edge_char_of(dawg, last_node, ch, word_end);
if (debug) cprintf ("exploring edge = " REFFORMAT "\n", edge);
if (edge == NO_EDGE)
still_finding_chars = FALSE;
else
if (next_node (dawg, edge) == 0) {
word_end = TRUE;
still_finding_chars = FALSE;
if (! case_sensative) string[i] = tolower (string[i]);
remove_edge (dawg, last_node, 0, string[i], word_end);
remove_edge (dawg, last_node, 0, ch, word_end);
}
else {
last_node = next_node (dawg, edge);
Expand All @@ -195,9 +195,8 @@ void add_word_to_dawg(EDGE_ARRAY dawg,
break;
}
}
if (! case_sensative) string[i] = tolower (string[i]);
if (!add_new_edge (dawg, &last_node, &the_next_node,
string[i], word_end, max_num_edges, reserved_edges)) {
if (!add_new_edge (dawg, &last_node, &the_next_node, ch,
word_end, max_num_edges, reserved_edges)) {
add_failed = true;
break;
}
Expand All @@ -209,10 +208,10 @@ void add_word_to_dawg(EDGE_ARRAY dawg,
}

the_next_node = 0;
if (! case_sensative) string[i] = tolower (string[i]);
unsigned char ch = case_sensative ? string[i] : tolower(string[i]);
if (!add_failed &&
!add_new_edge(dawg, &last_node, &the_next_node,
string[i], TRUE, max_num_edges, reserved_edges))
!add_new_edge(dawg, &last_node, &the_next_node, ch,
TRUE, max_num_edges, reserved_edges))
add_failed = true;

if (edges_in_node (dawg, 0) > reserved_edges) {
Expand Down Expand Up @@ -496,7 +495,7 @@ void relocate_edge(EDGE_ARRAY dawg,
void remove_edge(EDGE_ARRAY dawg,
NODE_REF node1,
NODE_REF node2,
char character,
int character,
EDGE_RECORD word_end) {
remove_edge_linkage(dawg, node1, node2, FORWARD_EDGE, character, word_end);

Expand All @@ -514,7 +513,7 @@ void remove_edge_linkage(EDGE_ARRAY dawg,
NODE_REF node,
NODE_REF next,
EDGE_RECORD direction,
char character,
int character,
EDGE_RECORD word_end) {
inT32 forward_edges;
inT32 num_edges;
Expand Down
90 changes: 5 additions & 85 deletions dict/trie.h
Original file line number Diff line number Diff line change
Expand Up @@ -123,19 +123,19 @@ void add_edge_linkage(EDGE_ARRAY dawg,
NODE_REF node1,
NODE_REF node2,
EDGE_RECORD direction,
char character,
int character,
EDGE_RECORD word_end);

bool add_new_edge(EDGE_ARRAY dawg,
NODE_REF *node1,
NODE_REF *node2,
char character,
int character,
EDGE_RECORD word_end,
inT32 max_num_edges,
inT32 reserved_edges);

void add_word_to_dawg(EDGE_ARRAY dawg,
char *string,
const char *string,
inT32 max_num_edges,
inT32 reserved_edges);

Expand Down Expand Up @@ -170,14 +170,14 @@ void relocate_edge(EDGE_ARRAY dawg,
void remove_edge(EDGE_ARRAY dawg,
NODE_REF node1,
NODE_REF node2,
char character,
int character,
EDGE_RECORD word_end);

void remove_edge_linkage(EDGE_ARRAY dawg,
NODE_REF node,
NODE_REF next,
EDGE_RECORD direction,
char character,
int character,
EDGE_RECORD word_end);

inT32 room_in_node(EDGE_ARRAY dawg, NODE_REF node);
Expand All @@ -187,84 +187,4 @@ void write_full_dawg (const char *filename,
inT32 max_num_edges);


/*
#if defined(__STDC__) || defined(__cplusplus)
# define _ARGS(s) s
#else
# define _ARGS(s) ()
#endif*/

/* trie.c *
void add_edge_linkage
_ARGS((EDGE_ARRAY dawg,
NODE_REF node1,
NODE_REF node2,
inT32 direction,
int character,
inT32 word_end));
void add_new_edge
_ARGS((EDGE_ARRAY dawg,
NODE_REF *node1,
NODE_REF *node2,
int character,
inT32 word_end,
inT32 max_num_edges,
inT32 reserved_edges));
void add_word_to_dawg
_ARGS((EDGE_ARRAY dawg,
char *string,
inT32 max_num_edges,
inT32 reserved_edges));
void initialize_dawg
_ARGS((EDGE_ARRAY dawg,
inT32 max_num_edges));
NODE_REF move_node
_ARGS((EDGE_ARRAY dawg,
NODE_REF node,
inT32 max_num_edges,
inT32 reserved_edges));
NODE_REF new_dawg_node
_ARGS((EDGE_ARRAY dawg,
inT32 num_edges,
inT32 max_num_edges,
inT32 reserved_edges));
void read_word_list
_ARGS((char *filename,
EDGE_ARRAY dawg,
inT32 max_num_edges,
inT32 reserved_edges));
void relocate_edge
_ARGS((EDGE_ARRAY dawg,
NODE_REF node,
NODE_REF old_node,
NODE_REF new_node));
void remove_edge
_ARGS((EDGE_ARRAY dawg,
NODE_REF node1,
NODE_REF node2,
int character,
inT32 word_end));
void remove_edge_linkage
_ARGS((EDGE_ARRAY dawg,
NODE_REF node,
NODE_REF next,
inT32 direction,
int character,
inT32 word_end));
inT32 room_in_node
_ARGS((EDGE_ARRAY dawg,
NODE_REF node));
#undef _ARGS
*/
#endif

0 comments on commit b950752

Please sign in to comment.