Skip to content

Commit

Permalink
Major refactor of beam search, elimination of dead code, misc bug fixes, updates to Makefile.am, Changelog etc.
Browse files Browse the repository at this point in the history

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@878 d0cd1f9f-072b-0410-8dd7-cf729c803f20
  • Loading branch information
theraysmith@gmail.com committed Sep 23, 2013
1 parent 2c90970 commit 4d514d5
Show file tree
Hide file tree
Showing 187 changed files with 41,117 additions and 14,408 deletions.
12 changes: 12 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
@@ -1,3 +1,15 @@
2013-09-20 v3.03
* Added Renderer to API to allow document-level processing and output
of document formats, like hOCR, PDF.
* Major refactor of word-level recognition, beam search, eliminating dead code.
* Refactored classifier to make it easier to add new ones.
* Generalized feature extractor to allow feature extraction from greyscale.
* Improved sub/superscript treatment.
* Improved baseline fit.
* Added set_unicharset_properties to training tools.
* Many bug fixes.


2012-02-01 - v3.02
* Moved ResultIterator/PageIterator to ccmain.
* Added Right-to-left/Bidi capability in the output iterators for Hebrew/Arabic.
Expand Down
4 changes: 2 additions & 2 deletions api/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ if VISIBILITY
AM_CPPFLAGS += -fvisibility=hidden -fvisibility-inlines-hidden
endif

include_HEADERS = apitypes.h baseapi.h capi.h
include_HEADERS = apitypes.h baseapi.h capi.h renderer.h
lib_LTLIBRARIES =

if !USING_MULTIPLELIBS
Expand All @@ -35,7 +35,7 @@ libtesseract_api_la_CPPFLAGS = $(AM_CPPFLAGS)
if VISIBILITY
libtesseract_api_la_CPPFLAGS += -DTESS_EXPORTS
endif
libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp
libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp renderer.cpp

lib_LTLIBRARIES += libtesseract.la
libtesseract_la_LDFLAGS =
Expand Down
14 changes: 8 additions & 6 deletions api/capi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
# define TESS_CAPI_INCLUDE_BASEAPI
#endif
#include "capi.h"
#include "genericvector.h"
#include "strngs.h"

TESS_API const char* TESS_CALL TessVersion()
{
Expand Down Expand Up @@ -382,21 +384,21 @@ TESS_API BOOL TESS_CALL TessBaseAPIDetectOS(TessBaseAPI* handle, OSResults* resu
return handle->DetectOS(results) ? TRUE : FALSE;
}

TESS_API void TESS_CALL TessBaseAPIGetFeaturesForBlob(TessBaseAPI* handle, TBLOB* blob, const DENORM* denorm, INT_FEATURE_ARRAY int_features,
TESS_API void TESS_CALL TessBaseAPIGetFeaturesForBlob(TessBaseAPI* handle, TBLOB* blob, INT_FEATURE_STRUCT* int_features,
int* num_features, int* FeatureOutlineIndex)
{
handle->GetFeaturesForBlob(blob, *denorm, int_features, num_features, FeatureOutlineIndex);
handle->GetFeaturesForBlob(blob, int_features, num_features, FeatureOutlineIndex);
}

TESS_API ROW* TESS_CALL TessFindRowForBox(BLOCK_LIST* blocks, int left, int top, int right, int bottom)
{
return TessBaseAPI::FindRowForBox(blocks, left, top, right, bottom);
}

TESS_API void TESS_CALL TessBaseAPIRunAdaptiveClassifier(TessBaseAPI* handle, TBLOB* blob, const DENORM* denorm, int num_max_matches,
TESS_API void TESS_CALL TessBaseAPIRunAdaptiveClassifier(TessBaseAPI* handle, TBLOB* blob, int num_max_matches,
int* unichar_ids, float* ratings, int* num_matches_returned)
{
handle->RunAdaptiveClassifier(blob, *denorm, num_max_matches, unichar_ids, ratings, num_matches_returned);
handle->RunAdaptiveClassifier(blob, num_max_matches, unichar_ids, ratings, num_matches_returned);
}

TESS_API const char* TESS_CALL TessBaseAPIGetUnichar(TessBaseAPI* handle, int unichar_id)
Expand Down Expand Up @@ -424,9 +426,9 @@ TESS_API TBLOB* TESS_CALL TessMakeTBLOB(struct Pix *pix)
return TessBaseAPI::MakeTBLOB(pix);
}

TESS_API void TESS_CALL TessNormalizeTBLOB(TBLOB *tblob, ROW *row, BOOL numeric_mode, DENORM *denorm)
TESS_API void TESS_CALL TessNormalizeTBLOB(TBLOB *tblob, ROW *row, BOOL numeric_mode)
{
TessBaseAPI::NormalizeTBLOB(tblob, row, numeric_mode != FALSE, denorm);
TessBaseAPI::NormalizeTBLOB(tblob, row, numeric_mode != FALSE);
}

TESS_API TessOcrEngineMode TESS_CALL TessBaseAPIOem(const TessBaseAPI* handle)
Expand Down
6 changes: 3 additions & 3 deletions api/capi.h
Original file line number Diff line number Diff line change
Expand Up @@ -205,11 +205,11 @@ TESS_API void TESS_CALL TessBaseAPISetProbabilityInContextFunc(TessBaseAPI* han
TESS_API void TESS_CALL TessBaseAPISetFillLatticeFunc(TessBaseAPI* handle, TessFillLatticeFunc f);
TESS_API BOOL TESS_CALL TessBaseAPIDetectOS(TessBaseAPI* handle, OSResults* results);

TESS_API void TESS_CALL TessBaseAPIGetFeaturesForBlob(TessBaseAPI* handle, TBLOB* blob, const DENORM* denorm, INT_FEATURE_ARRAY int_features,
TESS_API void TESS_CALL TessBaseAPIGetFeaturesForBlob(TessBaseAPI* handle, TBLOB* blob, INT_FEATURE_STRUCT* int_features,
int* num_features, int* FeatureOutlineIndex);

TESS_API ROW* TESS_CALL TessFindRowForBox(BLOCK_LIST* blocks, int left, int top, int right, int bottom);
TESS_API void TESS_CALL TessBaseAPIRunAdaptiveClassifier(TessBaseAPI* handle, TBLOB* blob, const DENORM* denorm, int num_max_matches,
TESS_API void TESS_CALL TessBaseAPIRunAdaptiveClassifier(TessBaseAPI* handle, TBLOB* blob, int num_max_matches,
int* unichar_ids, float* ratings, int* num_matches_returned);
#endif

Expand All @@ -226,7 +226,7 @@ TESS_API int TESS_CALL TessBaseAPINumDawgs(const TessBaseAPI* handle);
TESS_API ROW* TESS_CALL TessMakeTessOCRRow(float baseline, float xheight, float descender, float ascender);
TESS_API TBLOB*
TESS_CALL TessMakeTBLOB(Pix *pix);
TESS_API void TESS_CALL TessNormalizeTBLOB(TBLOB *tblob, ROW *row, BOOL numeric_mode, DENORM *denorm);
TESS_API void TESS_CALL TessNormalizeTBLOB(TBLOB *tblob, ROW *row, BOOL numeric_mode);

TESS_API TessOcrEngineMode
TESS_CALL TessBaseAPIOem(const TessBaseAPI* handle);
Expand Down
4 changes: 2 additions & 2 deletions ccmain/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ noinst_HEADERS = \
equationdetect.h fixspace.h imgscale.h mutableiterator.h osdetect.h \
output.h paragraphs.h paragraphs_internal.h paramsd.h pgedit.h \
reject.h scaleimg.h tessbox.h tessedit.h tesseractclass.h \
tesseract_cube_combiner.h tessvars.h tfacep.h tfacepp.h werdit.h
tesseract_cube_combiner.h tessvars.h werdit.h

if !USING_MULTIPLELIBS
noinst_LTLIBRARIES = libtesseract_main.la
Expand All @@ -46,7 +46,7 @@ libtesseract_main_la_SOURCES = \
imgscale.cpp ltrresultiterator.cpp \
osdetect.cpp output.cpp pageiterator.cpp pagesegmain.cpp \
pagewalk.cpp paragraphs.cpp paramsd.cpp pgedit.cpp recogtraining.cpp \
reject.cpp resultiterator.cpp scaleimg.cpp \
reject.cpp resultiterator.cpp scaleimg.cpp superscript.cpp \
tesseract_cube_combiner.cpp \
tessbox.cpp tessedit.cpp tesseractclass.cpp tessvars.cpp \
tfacepp.cpp thresholder.cpp \
Expand Down
17 changes: 1 addition & 16 deletions ccmain/adaptions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,27 +114,12 @@ BOOL8 Tesseract::word_adaptable( //should we adapt?
return FALSE;
}

// if (flags.bit (CHECK_AMBIG_WERD) && test_ambig_word (word))
if (flags.bit (CHECK_AMBIG_WERD) &&
!getDict().NoDangerousAmbig(word->best_choice, NULL, false, NULL, NULL)) {
word->best_choice->dangerous_ambig_found()) {
if (tessedit_adaption_debug) tprintf("word is ambiguous\n");
return FALSE;
}

// Do not adapt to words that are composed from fragments if
// tessedit_adapt_to_char_fragments is false.
if (!tessedit_adapt_to_char_fragments) {
const char *fragment_lengths = word->best_choice->fragment_lengths();
if (fragment_lengths != NULL && *fragment_lengths != '\0') {
for (int i = 0; i < word->best_choice->length(); ++i) {
if (fragment_lengths[i] > 1) {
if (tessedit_adaption_debug) tprintf("won't adapt to fragments\n");
return false; // found a character composed from fragments
}
}
}
}

if (tessedit_adaption_debug) {
tprintf("returning status %d\n", status);
}
Expand Down
102 changes: 44 additions & 58 deletions ccmain/applybox.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -235,29 +235,15 @@ PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes,
return page_res;
}

// Helper to make a WERD_CHOICE from the BLOB_CHOICE_LIST_VECTOR using only
// the top choices. Avoids problems with very long words.
static void MakeWordChoice(const BLOB_CHOICE_LIST_VECTOR& char_choices,
const UNICHARSET& unicharset,
WERD_CHOICE* word_choice) {
*word_choice = WERD_CHOICE(&unicharset); // clear the word choice.
word_choice->make_bad();
for (int i = 0; i < char_choices.size(); ++i) {
BLOB_CHOICE_IT it(char_choices[i]);
BLOB_CHOICE* bc = it.data();
word_choice->append_unichar_id(bc->unichar_id(), 1,
bc->rating(), bc->certainty());
}
}

// Tests the chopper by exhaustively running chop_one_blob.
// The word_res will contain filled chopped_word, seam_array, denorm,
// box_word and best_state for the maximally chopped word.
void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
BLOCK* block, ROW* row,
WERD_RES* word_res) {
if (!word_res->SetupForTessRecognition(unicharset, this, BestPix(), false,
this->textord_use_cjk_fp_model,
textord_use_cjk_fp_model,
poly_allow_detailed_fx,
row, block)) {
word_res->CloneChoppedToRebuild();
return;
Expand All @@ -266,13 +252,10 @@ void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
tprintf("Maximally chopping word at:");
word_res->word->bounding_box().print();
}
blob_match_table.init_match_table();
BLOB_CHOICE_LIST *match_result;
BLOB_CHOICE_LIST_VECTOR *char_choices = new BLOB_CHOICE_LIST_VECTOR();
ASSERT_HOST(word_res->chopped_word->blobs != NULL);
GenericVector<BLOB_CHOICE*> blob_choices;
ASSERT_HOST(!word_res->chopped_word->blobs.empty());
float rating = static_cast<float>(MAX_INT8);
for (TBLOB* blob = word_res->chopped_word->blobs; blob != NULL;
blob = blob->next) {
for (int i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
// The rating and certainty are not quite arbitrary. Since
// select_blob_to_chop uses the worst certainty to choose, they all have
// to be different, so starting with MAX_INT8, subtract 1/8 for each blob
Expand All @@ -281,32 +264,33 @@ void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
// produced, however much chopping is required. The chops are thus only
// limited by the ability of the chopper to find suitable chop points,
// and not by the value of the certainties.
match_result = fake_classify_blob(0, rating, -rating);
modify_blob_choice(match_result, 0);
ASSERT_HOST(!match_result->empty());
*char_choices += match_result;
BLOB_CHOICE* choice =
new BLOB_CHOICE(0, rating, -rating, -1, -1, 0, 0, 0, 0, BCC_FAKE);
blob_choices.push_back(choice);
rating -= 0.125f;
}
inT32 blob_number;
const double e = exp(1.0); // The base of natural logs.
int blob_number;
int right_chop_index = 0;
if (!assume_fixed_pitch_char_segment) {
// We only chop if the language is not fixed pitch like CJK.
if (prioritize_division) {
while (chop_one_blob2(boxes, word_res, &word_res->seam_array));
} else {
while (chop_one_blob(word_res->chopped_word, char_choices,
&blob_number, &word_res->seam_array,
&right_chop_index));
SEAM* seam = NULL;
while ((seam = chop_one_blob(boxes, blob_choices, word_res,
&blob_number)) != NULL) {
word_res->InsertSeam(blob_number, seam);
BLOB_CHOICE* left_choice = blob_choices[blob_number];
rating = left_choice->rating() / e;
left_choice->set_rating(rating);
left_choice->set_certainty(-rating);
// combine confidence w/ serial #
BLOB_CHOICE* right_choice = new BLOB_CHOICE(++right_chop_index,
rating - 0.125f, -rating,
-1, -1, 0, 0, 0, 0, BCC_FAKE);
blob_choices.insert(right_choice, blob_number + 1);
}
}
MakeWordChoice(*char_choices, unicharset, word_res->best_choice);
MakeWordChoice(*char_choices, unicharset, word_res->raw_choice);
word_res->CloneChoppedToRebuild();
blob_match_table.end_match_table();
if (char_choices != NULL) {
char_choices->delete_data_pointers();
delete char_choices;
}
word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
}

// Helper to compute the dispute resolution metric.
Expand Down Expand Up @@ -558,16 +542,15 @@ bool Tesseract::ConvertStringToUnichars(const char* utf8,
// substitutions ARE used.
bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
WERD_RES* word_res) {
blob_match_table.init_match_table();
// Classify all required combinations of blobs and save results in choices.
int word_length = word_res->box_word->length();
GenericVector<BLOB_CHOICE_LIST*>* choices =
new GenericVector<BLOB_CHOICE_LIST*>[word_length];
for (int i = 0; i < word_length; ++i) {
for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
BLOB_CHOICE_LIST* match_result = classify_piece(
word_res->chopped_word->blobs, word_res->denorm, word_res->seam_array,
i, i + j - 1, word_res->blamer_bundle);
word_res->seam_array, i, i + j - 1, "Applybox",
word_res->chopped_word, word_res->blamer_bundle);
if (applybox_debug > 2) {
tprintf("%d+%d:", i, j);
print_ratings_list("Segment:", match_result, unicharset);
Expand All @@ -583,17 +566,15 @@ bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
float best_rating = 0.0f;
SearchForText(choices, 0, word_length, target_text, 0, 0.0f,
&search_segmentation, &best_rating, &word_res->best_state);
blob_match_table.end_match_table();
for (int i = 0; i < word_length; ++i)
choices[i].delete_data_pointers();
delete [] choices;
if (word_res->best_state.empty()) {
// Build the original segmentation and if it is the same length as the
// truth, assume it will do.
int blob_count = 1;
for (int s = 0; s < array_count(word_res->seam_array); ++s) {
SEAM* seam =
reinterpret_cast<SEAM*>(array_value(word_res->seam_array, s));
for (int s = 0; s < word_res->seam_array.size(); ++s) {
SEAM* seam = word_res->seam_array[s];
if (seam->split1 == NULL) {
word_res->best_state.push_back(blob_count);
blob_count = 1;
Expand Down Expand Up @@ -707,21 +688,25 @@ void Tesseract::TidyUp(PAGE_RES* page_res) {
WERD_RES* word_res;
for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
int ok_in_word = 0;
BLOB_CHOICE_LIST_VECTOR char_choices;
for (int i = word_res->correct_text.size() - 1; i >= 0; i--) {
if (word_res->correct_text[i].length() > 0) {
int blob_count = word_res->correct_text.size();
WERD_CHOICE* word_choice = new WERD_CHOICE(word_res->uch_set, blob_count);
word_choice->set_permuter(TOP_CHOICE_PERM);
for (int c = 0; c < blob_count; ++c) {
if (word_res->correct_text[c].length() > 0) {
++ok_in_word;
}
// Since we only need a fake word_res->best_choice, the actual
// unichar_ids do not matter. Which is fortunate, since TidyUp()
// can be called while training Tesseract, at the stage where
// unicharset is not meaningful yet.
char_choices += fake_classify_blob(INVALID_UNICHAR_ID, 1.0, -1.0);
word_choice->append_unichar_id_space_allocated(
INVALID_UNICHAR_ID, word_res->best_state[c], 1.0f, -1.0f);
}
if (ok_in_word > 0) {
ok_blob_count += ok_in_word;
bad_blob_count += word_res->correct_text.size() - ok_in_word;
MakeWordChoice(char_choices, unicharset, word_res->best_choice);
word_res->LogNewRawChoice(word_choice);
word_res->LogNewCookedChoice(1, false, word_choice);
} else {
++unlabelled_words;
if (applybox_debug > 0) {
Expand All @@ -730,7 +715,6 @@ void Tesseract::TidyUp(PAGE_RES* page_res) {
}
pr_it.DeleteCurrentWord();
}
char_choices.delete_data_pointers();
}
pr_it.restart_page();
for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
Expand Down Expand Up @@ -772,11 +756,13 @@ void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
GenericVector<STRING> tokens;
word_res->correct_text[i].split(' ', &tokens);
UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].string());
choice->append_unichar_id_space_allocated(char_id, 1, 0.0f, 0.0f);
choice->append_unichar_id_space_allocated(char_id,
word_res->best_state[i],
0.0f, 0.0f);
}
if (word_res->best_choice != NULL)
delete word_res->best_choice;
word_res->best_choice = choice;
word_res->ClearWordChoices();
word_res->LogNewRawChoice(choice);
word_res->LogNewCookedChoice(1, false, choice);
}
}

Expand All @@ -787,7 +773,7 @@ void Tesseract::ApplyBoxTraining(const STRING& filename, PAGE_RES* page_res) {
int word_count = 0;
for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
word_res = pr_it.forward()) {
LearnWord(filename.string(), NULL, word_res);
LearnWord(filename.string(), word_res);
++word_count;
}
tprintf("Generated training data for %d words\n", word_count);
Expand Down
Loading

0 comments on commit 4d514d5

Please sign in to comment.