more doxygen
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@450 d0cd1f9f-072b-0410-8dd7-cf729c803f20
joregan committed Aug 10, 2010
1 parent 75676cd commit 08defee
Showing 12 changed files with 836 additions and 850 deletions.
124 changes: 58 additions & 66 deletions ccmain/fixspace.cpp
@@ -57,17 +57,18 @@ EXTERN STRING_VAR (numeric_punctuation, ".,",
#define PERFECT_WERDS 999
#define MAXSPACING 128 /*max expected spacing in pix */

/*************************************************************************
* fix_fuzzy_spaces()
namespace tesseract {
/**
* @name fix_fuzzy_spaces()
* Walk over the page finding sequences of words joined by fuzzy spaces. Extract
* them as a sublist, process the sublist to find the optimal arrangement of
* spaces then replace the sublist in the ROW_RES.
*************************************************************************/
namespace tesseract {
void Tesseract::fix_fuzzy_spaces( //find fuzzy words
//progress monitor
volatile ETEXT_DESC *monitor,
//count of words in doc
*
* @param monitor progress monitor
* @param word_count count of words in doc
* @param[out] page_res
*/
void Tesseract::fix_fuzzy_spaces(volatile ETEXT_DESC *monitor,
inT32 word_count,
PAGE_RES *page_res) {
BLOCK_RES_IT block_res_it; //iterators
@@ -83,19 +84,17 @@ void Tesseract::fix_fuzzy_spaces( //find fuzzy words
block_res_it.set_to_list (&page_res->block_res_list);
word_index = 0;
for (block_res_it.mark_cycle_pt ();
!block_res_it.cycled_list (); block_res_it.forward ()) {
!block_res_it.cycled_list (); block_res_it.forward ()) {
row_res_it.set_to_list (&block_res_it.data ()->row_res_list);
for (row_res_it.mark_cycle_pt ();
!row_res_it.cycled_list (); row_res_it.forward ()) {
!row_res_it.cycled_list (); row_res_it.forward ()) {
word_res_it_from.set_to_list (&row_res_it.data ()->word_res_list);
while (!word_res_it_from.at_last ()) {
word_res = word_res_it_from.data ();
while (!word_res_it_from.at_last () &&
!(word_res->combination ||
word_res_it_from.data_relative (1)->
word->flag (W_FUZZY_NON) ||
word_res_it_from.data_relative (1)->
word->flag (W_FUZZY_SP))) {
!(word_res->combination ||
word_res_it_from.data_relative (1)->word->flag (W_FUZZY_NON) ||
word_res_it_from.data_relative (1)->word->flag (W_FUZZY_SP))) {
fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
block_res_it.data()->block);
word_res = word_res_it_from.forward ();
@@ -119,10 +118,8 @@ void Tesseract::fix_fuzzy_spaces( //find fuzzy words
monitor->progress = 90 + 5 * word_index / word_count;
}
while (!word_res_it_to.at_last () &&
(word_res_it_to.data_relative (1)->
word->flag (W_FUZZY_NON) ||
word_res_it_to.data_relative (1)->
word->flag (W_FUZZY_SP))) {
(word_res_it_to.data_relative (1)->word->flag (W_FUZZY_NON) ||
word_res_it_to.data_relative (1)->word->flag (W_FUZZY_SP))) {
if (check_debug_pt (word_res, 60))
debug_fix_space_level.set_value (10);
if (word_res->word->gblob_list ()->empty ())
@@ -143,9 +140,7 @@ void Tesseract::fix_fuzzy_spaces( //find fuzzy words
block_res_it.data()->block);
new_length = fuzzy_space_words.length ();
word_res_it_from.add_list_before (&fuzzy_space_words);
for (;
(!word_res_it_from.at_last () &&
(new_length > 0)); new_length--) {
for (; (!word_res_it_from.at_last () && (new_length > 0)); new_length--) {
word_res_it_from.forward ();
}
}
@@ -160,8 +155,7 @@ void Tesseract::fix_fuzzy_spaces( //find fuzzy words
}
}

void Tesseract::fix_fuzzy_space_list( //space explorer
WERD_RES_LIST &best_perm,
void Tesseract::fix_fuzzy_space_list(WERD_RES_LIST &best_perm,
ROW *row,
BLOCK* block) {
inT16 best_score;
@@ -225,8 +219,8 @@ void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row,
}


/*************************************************************************
* eval_word_spacing()
/**
* @name eval_word_spacing()
* The basic measure is the number of characters in contextually confirmed
* words. (I.e the word is done)
* If all words are contextually confirmed the evaluation is deemed perfect.
@@ -249,7 +243,7 @@ void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row,
* confirmation. Thus "PS7a71 3/7a" scores 1 (neither word is contexutally
* confirmed. The only score is from the joined 1. "PS7a713/7a" scores 2.
*
*************************************************************************/
*/
inT16 Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
WERD_RES_IT word_res_it(&word_res_list);
inT16 total_score = 0;
@@ -292,23 +286,22 @@ inT16 Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
word_len = word->reject_map.length ();
current_word_ok_so_far = FALSE;
if (!((prev_char_1 &&
digit_or_numeric_punct (word, 0)) ||
(prev_char_digit &&
((word_done &&
(word->best_choice->unichar_lengths().string()[0] == 1 &&
word->best_choice->unichar_string()[0] == '1')) ||
(!word_done &&
STRING(conflict_set_I_l_1).contains(
word->best_choice->unichar_string ()[0])))))) {
digit_or_numeric_punct (word, 0)) ||
(prev_char_digit &&
((word_done &&
(word->best_choice->unichar_lengths().string()[0] == 1 &&
word->best_choice->unichar_string()[0] == '1')) ||
(!word_done &&
STRING(conflict_set_I_l_1).contains(word->best_choice->unichar_string ()[0])))))) {
total_score += prev_word_score;
if (prev_word_done)
done_word_count++;
current_word_ok_so_far = word_done;
}

if ((current_word_ok_so_far) &&
(!tessedit_test_uniform_wd_spacing ||
((word->best_choice->permuter () == NUMBER_PERM) ||
(!tessedit_test_uniform_wd_spacing ||
((word->best_choice->permuter () == NUMBER_PERM) ||
uniformly_spaced (word)))) {
prev_word_done = TRUE;
prev_word_score = word_len;
@@ -354,7 +347,7 @@ inT16 Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
}
/* Find next word */
do
word_res_it.forward ();
word_res_it.forward ();
while (word_res_it.data ()->part_of_combo);
}
while (!word_res_it.at_first ());
@@ -384,8 +377,8 @@ BOOL8 Tesseract::digit_or_numeric_punct(WERD_RES *word, int char_position) {
} // namespace tesseract


/*************************************************************************
* transform_to_next_perm()
/**
* @name transform_to_next_perm()
* Examines the current word list to find the smallest word gap size. Then walks
* the word list closing any gaps of this size by either inserted new
* combination words, or extending existing ones.
@@ -394,7 +387,7 @@ BOOL8 Tesseract::digit_or_numeric_punct(WERD_RES *word, int char_position) {
*
* If there are no more gaps then it DELETES the entire list and returns the
* empty list to cause termination.
*************************************************************************/
*/
void transform_to_next_perm(WERD_RES_LIST &words) {
WERD_RES_IT word_it(&words);
WERD_RES_IT prev_word_it(&words);
@@ -422,8 +415,8 @@ void transform_to_next_perm(WERD_RES_LIST &words) {
if (min_gap < MAX_INT16) {
prev_right = -1; //back to start
word_it.set_to_list (&words);
for (; //cant use cycle pt due to inserted combos at start of list
(prev_right < 0) || !word_it.at_first (); word_it.forward ()) {
//cant use cycle pt due to inserted combos at start of list
for (; (prev_right < 0) || !word_it.at_first (); word_it.forward ()) {
word = word_it.data ();
if (!word->part_of_combo) {
box = word->word->bounding_box ();
@@ -510,7 +503,7 @@ void dump_words(WERD_RES_LIST &perm, inT16 score, inT16 mode, BOOL8 improved) {
}

for (word_res_it.mark_cycle_pt ();
!word_res_it.cycled_list (); word_res_it.forward ()) {
!word_res_it.cycled_list (); word_res_it.forward ()) {
if (!word_res_it.data ()->part_of_combo)
tprintf("%s/%1d ",
word_res_it.data()->best_choice->unichar_string().string(),
@@ -521,7 +514,7 @@ void dump_words(WERD_RES_LIST &perm, inT16 score, inT16 mode, BOOL8 improved) {
else if (improved) {
tprintf ("FIX SPACING \"%s\" => \"", initial_str.string ());
for (word_res_it.mark_cycle_pt ();
!word_res_it.cycled_list (); word_res_it.forward ()) {
!word_res_it.cycled_list (); word_res_it.forward ()) {
if (!word_res_it.data ()->part_of_combo)
tprintf ("%s/%1d ",
word_res_it.data()->best_choice->unichar_string().string(),
Expand All @@ -534,16 +527,15 @@ void dump_words(WERD_RES_LIST &perm, inT16 score, inT16 mode, BOOL8 improved) {
}


/*************************************************************************
* uniformly_spaced()
/**
* @name uniformly_spaced()
* Return true if one of the following are true:
* - All inter-char gaps are the same width
* - The largest gap is no larger than twice the mean/median of the others
* - The largest gap is < 64/5 = 13 and all others are <= 0
* - All inter-char gaps are the same width
* - The largest gap is no larger than twice the mean/median of the others
* - The largest gap is < 64/5 = 13 and all others are <= 0
* **** REMEMBER - WE'RE NOW WORKING WITH A BLN WERD !!!
*************************************************************************/
BOOL8 uniformly_spaced( //sensible word
WERD_RES *word) {
*/
BOOL8 uniformly_spaced(WERD_RES *word) {
PBLOB_IT blob_it;
TBOX box;
inT16 prev_right = -MAX_INT16;
@@ -564,10 +556,10 @@ BOOL8 uniformly_spaced( //sensible word
for (blob_it.mark_cycle_pt (); !blob_it.cycled_list (); blob_it.forward ()) {
box = blob_it.data ()->bounding_box ();
if ((prev_right > -MAX_INT16) &&
(!fixsp_ignore_punct ||
(!punct_chars.contains (word->best_choice->unichar_string()
[offset - word->best_choice->unichar_lengths()[i - 1]]) &&
!punct_chars.contains (word->best_choice->unichar_string()[offset])))) {
(!fixsp_ignore_punct ||
(!punct_chars.contains (word->best_choice->unichar_string()
[offset - word->best_choice->unichar_lengths()[i - 1]]) &&
!punct_chars.contains (word->best_choice->unichar_string()[offset])))) {
gap = box.left () - prev_right;
if (gap < max_gap)
gap_stats.add (gap, 1);
@@ -639,14 +631,14 @@ BOOL8 fixspace_thinks_word_done(WERD_RES *word) {
}


/*************************************************************************
* fix_sp_fp_word()
namespace tesseract {
/**
* @name fix_sp_fp_word()
* Test the current word to see if it can be split by deleting noise blobs. If
* so, do the buisiness.
* so, do the business.
* Return with the iterator pointing to the same place if the word is unchanged,
* or the last of the replacement words.
*************************************************************************/
namespace tesseract {
*/
void Tesseract::fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row,
BLOCK* block) {
WERD_RES *word_res;
@@ -730,11 +722,11 @@ void Tesseract::fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row,
} // namespace tesseract


/*************************************************************************
/**
* break_noisiest_blob_word()
* Find the word with the blob which looks like the worst noise.
* Break the word into two, deleting the noise blob.
*************************************************************************/
*/
void break_noisiest_blob_word(WERD_RES_LIST &words) {
WERD_RES_IT word_it(&words);
WERD_RES_IT worst_word_it;
@@ -947,14 +939,14 @@ void fixspace_dbg(WERD_RES *word) {
}


/*************************************************************************
/**
* fp_eval_word_spacing()
* Evaluation function for fixed pitch word lists.
*
* Basically, count the number of "nice" characters - those which are in tess
* acceptable words or in dict words and are not rejected.
* Penalise any potential noise chars
*************************************************************************/
*/
namespace tesseract {
inT16 Tesseract::fp_eval_word_spacing(WERD_RES_LIST &word_res_list) {
WERD_RES_IT word_it(&word_res_list);
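The pattern in this file repeats throughout the commit: the old banner-style comment blocks are rewritten as Doxygen comments using @name, @param, and @param[out] tags, and parameter descriptions that previously sat as inline comments in the argument list move up into the comment block. A minimal before/after sketch of that conversion follows; the function is hypothetical, included only to illustrate the comment format, not taken from the diff.

#include <vector>

// Old banner style (the kind this commit removes):
/*************************************************************************
 * example_count()
 * Count the items in a list.
 *************************************************************************/

// New Doxygen style (the kind this commit introduces):
/**
 * @name example_count()
 * Count the items in a list.
 *
 * @param items list of items to count
 * @return number of items counted
 */
int example_count(const std::vector<int> &items) {
  return static_cast<int>(items.size());
}
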
27 changes: 13 additions & 14 deletions ccmain/fixxht.cpp
@@ -258,17 +258,16 @@ void re_estimate_x_ht( //improve for 1 word

/* Now make some estimates */

if ((est_x_ht > 0) ||
(est_caps_ht > 0) ||
((ambig_lc_x_est > 0) && (ambig_lc_x_est != ambig_uc_caps_est))) {
if ((est_x_ht > 0) || (est_caps_ht > 0) ||
((ambig_lc_x_est > 0) && (ambig_lc_x_est != ambig_uc_caps_est))) {
/* There is some sensible data to go on so make the most of it. */
if (debug_x_ht_level >= 20)
tprintf ("Mode20:C: Sensible Data\n", ambig_lc_x_est);
if (est_x_ht > 0) {
est_x_ht_certain = TRUE;
if (est_caps_ht == 0) {
if ((ambig_uc_caps_est > ambig_lc_x_est) &&
(ambig_uc_caps_est > est_x_ht + x_ht_ok_variation))
(ambig_uc_caps_est > est_x_ht + x_ht_ok_variation))
est_caps_ht = ambig_uc_caps_est;
else
est_caps_ht = est_x_ht / x_ht_fraction_of_caps_ht;
@@ -371,10 +370,10 @@ void re_estimate_x_ht( //improve for 1 word
tprintf ("Mode20:J: No comment due to no rejects\n");
}
else if (x_ht_limit_flip_trials &&
((max_blob_ht < marginally_above_x_ht) ||
((ambig_lc_x_est > 0) &&
(ambig_lc_x_est == ambig_uc_caps_est) &&
(ambig_lc_x_est < marginally_above_x_ht)))) {
((max_blob_ht < marginally_above_x_ht) ||
((ambig_lc_x_est > 0) &&
(ambig_lc_x_est == ambig_uc_caps_est) &&
(ambig_lc_x_est < marginally_above_x_ht)))) {
no_comment = TRUE;
if (debug_x_ht_level >= 20)
tprintf ("Mode20:K: No comment as close to xht %f < %f\n",
Expand All @@ -401,7 +400,7 @@ void re_estimate_x_ht( //improve for 1 word
*/
else {
if (max_blob_ht <
(bln_x_height + bln_x_height / x_ht_fraction_of_caps_ht) / 2.0) {
(bln_x_height + bln_x_height / x_ht_fraction_of_caps_ht) / 2.0) {
trial = TRUE;
est_x_ht = x_ht_fraction_of_caps_ht * max_blob_ht;
est_caps_ht = max_blob_ht;
Expand All @@ -423,9 +422,9 @@ void re_estimate_x_ht( //improve for 1 word
/* Sanity check - reject word if fails */

if (!no_comment &&
((est_x_ht > 2 * bln_x_height) ||
(est_x_ht / word_res->denorm.scale () <= min_sane_x_ht_pixels) ||
(est_caps_ht <= est_x_ht) || (est_caps_ht >= 2.5 * est_x_ht))) {
((est_x_ht > 2 * bln_x_height) ||
(est_x_ht / word_res->denorm.scale () <= min_sane_x_ht_pixels) ||
(est_caps_ht <= est_x_ht) || (est_caps_ht >= 2.5 * est_x_ht))) {
no_comment = TRUE;
if (!trial && rej_use_xht) {
if (debug_x_ht_level >= 2) {
@@ -485,7 +484,7 @@ void re_estimate_x_ht( //improve for 1 word

#ifndef SECURE_NAMES
if (((*trial_x_ht > 0) && (debug_x_ht_level >= 3)) ||
(debug_x_ht_level >= 5)) {
(debug_x_ht_level >= 5)) {
tprintf ("%s ", word_str);
word_res->reject_map.print (debug_fp);
tprintf
@@ -699,7 +698,7 @@ void improve_estimate(WERD_RES *word_res,
!blob_it.cycled_list (); blob_it.forward (),
offset += word_res->best_choice->unichar_lengths()[i++]) {
if ((STRING (chs_ambig_caps_x).contains (word_str[offset])) &&
(!dodgy_blob (blob_it.data ()))) {
(!dodgy_blob (blob_it.data ()))) {
blob_box = blob_it.data ()->bounding_box ();
blob_ht_above_baseline = blob_box.top () - bln_baseline_offset;
strncpy(temp_char, word_str + offset,
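The @@ -423,9 +422,9 @@ hunk above re-indents the sanity check in re_estimate_x_ht() that rejects the word when the estimated x-height or caps-height falls outside plausible bounds. A stand-alone sketch of that test, with the globals and the denorm scale passed in as parameters; the free function and its parameter names are illustrative only, not part of the source.

// Sketch of the sanity test in re_estimate_x_ht(); the thresholds mirror the
// @@ -423,9 +422,9 @@ hunk above, but this free-standing signature is illustrative only.
bool x_ht_estimate_sane(float est_x_ht, float est_caps_ht,
                        float bln_x_height, float denorm_scale,
                        float min_sane_x_ht_pixels) {
  if (est_x_ht > 2 * bln_x_height) return false;                      // x-height implausibly large
  if (est_x_ht / denorm_scale <= min_sane_x_ht_pixels) return false;  // too small in image pixels
  if (est_caps_ht <= est_x_ht) return false;                          // caps height must exceed x-height
  if (est_caps_ht >= 2.5f * est_x_ht) return false;                   // caps implausibly tall relative to x-height
  return true;
}
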
