Skip to content

Commit

Permalink
Fixed endian bug in dawg reader, Added word bigram correction,
Browse files Browse the repository at this point in the history
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@649 d0cd1f9f-072b-0410-8dd7-cf729c803f20
  • Loading branch information
theraysmith@gmail.com committed Feb 2, 2012
1 parent 6e3d810 commit fdd4ffe
Show file tree
Hide file tree
Showing 15 changed files with 527 additions and 198 deletions.
36 changes: 32 additions & 4 deletions dict/dawg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,32 @@ int Dawg::check_for_words(const char *filename,
return misses;
}

void Dawg::iterate_words(const UNICHARSET &unicharset,
TessCallback1<const char *> *cb) const {
WERD_CHOICE word(&unicharset);
iterate_words_rec(word, 0, cb);
}

void Dawg::iterate_words_rec(const WERD_CHOICE &word_so_far,
NODE_REF to_explore,
TessCallback1<const char *> *cb) const {
NodeChildVector children;
this->unichar_ids_of(to_explore, &children);
for (int i = 0; i < children.size(); i++) {
WERD_CHOICE next_word(word_so_far);
next_word.append_unichar_id(children[i].unichar_id, 1, 0.0, 0.0);
if (this->end_of_word(children[i].edge_ref)) {
STRING s;
next_word.string_and_lengths(&s, NULL);
cb->Run(s.string());
}
NODE_REF next = next_node(children[i].edge_ref);
if (next != 0) {
iterate_words_rec(next_word, next, cb);
}
}
}

bool Dawg::match_words(WERD_CHOICE *word, inT32 index,
NODE_REF node, UNICHAR_ID wildcard) const {
EDGE_REF edge;
Expand Down Expand Up @@ -286,12 +312,12 @@ void SquishedDawg::read_squished_dawg(FILE *file,
int unicharset_size;
fread(&unicharset_size, sizeof(inT32), 1, file);
fread(&num_edges_, sizeof(inT32), 1, file);
ASSERT_HOST(num_edges_ > 0); // DAWG should not be empty

if (swap) {
unicharset_size = reverse32(unicharset_size);
num_edges_ = reverse32(num_edges_);
}
ASSERT_HOST(num_edges_ > 0); // DAWG should not be empty
Dawg::init(type, lang, perm, unicharset_size, debug_level);

edges_ = (EDGE_ARRAY) memalloc(sizeof(EDGE_RECORD) * num_edges_);
Expand All @@ -318,20 +344,21 @@ NODE_MAP SquishedDawg::build_node_map(inT32 *num_nodes) const {

node_map = (NODE_MAP) malloc(sizeof(EDGE_REF) * num_edges_);

for (edge=0; edge < num_edges_; edge++) // init all slots
for (edge = 0; edge < num_edges_; edge++) // init all slots
node_map [edge] = -1;

node_counter = num_forward_edges(0);

*num_nodes = 0;
for (edge=0; edge < num_edges_; edge++) { // search all slots
for (edge = 0; edge < num_edges_; edge++) { // search all slots

if (forward_edge(edge)) {
(*num_nodes)++; // count nodes links
node_map[edge] = (edge ? node_counter : 0);
num_edges = num_forward_edges(edge);
if (edge != 0) node_counter += num_edges;
edge += num_edges;
if (edge >= num_edges_) break;
if (backward_edge(edge)) while (!last_edge(edge++));
edge--;
}
Expand Down Expand Up @@ -369,7 +396,7 @@ void SquishedDawg::write_squished_dawg(FILE *file) {
tprintf("%d edges in DAWG\n", num_edges);
}

for (edge=0; edge<num_edges_; edge++) {
for (edge = 0; edge < num_edges_; edge++) {
if (forward_edge(edge)) { // write forward edges
do {
old_index = next_node_from_edge_rec(edges_[edge]);
Expand All @@ -379,6 +406,7 @@ void SquishedDawg::write_squished_dawg(FILE *file) {
set_next_node(edge, old_index);
} while (!last_edge(edge++));

if (edge >= num_edges_) break;
if (backward_edge(edge)) // skip back links
while (!last_edge(edge++));

Expand Down
11 changes: 11 additions & 0 deletions dict/dawg.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include "elst.h"
#include "ratngs.h"
#include "params.h"
#include "tesscallback.h"

#ifndef __GNUC__
#ifdef __MSW32__
Expand Down Expand Up @@ -142,6 +143,11 @@ class Dawg {
const UNICHARSET &unicharset,
bool enable_wildcard) const;

// For each word in the Dawg, call the given (permanent) callback with the
// text (UTF-8) version of the word.
// unicharset is used to construct the WERD_CHOICE in which each word is
// accumulated. cb is Run() once per word found, so it must remain valid
// across repeated calls; this function does not delete it.
void iterate_words(const UNICHARSET &unicharset,
TessCallback1<const char *> *cb) const;

// Pure virtual function that should be implemented by the derived classes.

/// Returns the edge that corresponds to the letter out of this node.
Expand Down Expand Up @@ -268,6 +274,11 @@ class Dawg {
bool match_words(WERD_CHOICE *word, inT32 index,
NODE_REF node, UNICHAR_ID wildcard) const;

// Recursively iterate over all words in a dawg (see public iterate_words).
// word_so_far holds the unichar ids gathered on the way to to_explore, the
// node whose children are visited by this call. cb is Run() with the string
// form of the word for every child edge that ends a word; the recursion then
// continues into each child whose next node is non-zero.
void iterate_words_rec(const WERD_CHOICE &word_so_far,
NODE_REF to_explore,
TessCallback1<const char *> *cb) const;

// Member Variables.
DawgType type_;
STRING lang_;
Expand Down
Loading

0 comments on commit fdd4ffe

Please sign in to comment.