diff --git a/ccmain/control.cpp b/ccmain/control.cpp index a765a97c8a..3abf216e34 100644 --- a/ccmain/control.cpp +++ b/ccmain/control.cpp @@ -93,8 +93,7 @@ BOOL8 Tesseract::recog_interactive(PAGE_RES_IT* pr_it) { WordData word_data(*pr_it); SetupWordPassN(2, &word_data); - classify_word_and_language(&Tesseract::classify_word_pass2, pr_it, - &word_data); + classify_word_and_language(2, pr_it, &word_data); if (tessedit_debug_quality_metrics) { WERD_RES* word_res = pr_it->word(); word_char_quality(word_res, pr_it->row()->row, &char_qual, &good_char_qual); @@ -190,6 +189,7 @@ void Tesseract::SetupWordPassN(int pass_n, WordData* word) { if (word->word->x_height == 0.0f) word->word->x_height = word->row->x_height(); } + word->lang_words.truncate(0); for (int s = 0; s <= sub_langs_.size(); ++s) { // The sub_langs_.size() entry is for the master language. Tesseract* lang_t = s < sub_langs_.size() ? sub_langs_[s] : this; @@ -249,15 +249,23 @@ bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor, while (pr_it->word() != NULL && pr_it->word() != word->word) pr_it->forward(); ASSERT_HOST(pr_it->word() != NULL); - WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1 - : &Tesseract::classify_word_pass2; - classify_word_and_language(recognizer, pr_it, word); - if (tessedit_dump_choices) { + bool make_next_word_fuzzy = false; + if (!AnyLSTMLang() && + ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) { + // Needs to be setup again to see the new outlines in the chopped_word. 
+      SetupWordPassN(pass_n, word);
+    }
+
+    classify_word_and_language(pass_n, pr_it, word);
+    if (tessedit_dump_choices || debug_noise_removal) {
       tprintf("Pass%d: %s [%s]\n", pass_n,
               word->word->best_choice->unichar_string().string(),
               word->word->best_choice->debug_string().string());
     }
     pr_it->forward();
+    if (make_next_word_fuzzy && pr_it->word() != NULL) {
+      pr_it->MakeCurrentWordFuzzy();
+    }
   }
   return true;
 }
@@ -898,6 +906,359 @@ static bool WordsAcceptable(const PointerVector& words) {
   return true;
 }
+// Moves good-looking "noise"/diacritics from the reject list to the main
+// blob list on the current word. Returns true if anything was done, and
+// sets make_next_word_fuzzy if blob(s) were added to the end of the word.
+//   pass: recognition pass number, forwarded to the classification helpers.
+//   pr_it: iterator positioned on the word to process; the word may be made
+//     fuzzy through it.
+//   make_next_word_fuzzy: output flag, always reset to false on entry.
+// Does nothing and returns false if the word has no rejected blobs, no real
+// blobs, or more than noise_maxperword rejected blobs.
+bool Tesseract::ReassignDiacritics(int pass, PAGE_RES_IT* pr_it,
+                                   bool* make_next_word_fuzzy) {
+  *make_next_word_fuzzy = false;
+  WERD* real_word = pr_it->word()->word;
+  if (real_word->rej_cblob_list()->empty() ||
+      real_word->cblob_list()->empty() ||
+      real_word->rej_cblob_list()->length() > noise_maxperword)
+    return false;
+  real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle);
+  // Get the noise outlines into a vector with matching bool map.
+  GenericVector outlines;
+  real_word->GetNoiseOutlines(&outlines);
+  GenericVector word_wanted;
+  GenericVector overlapped_any_blob;
+  GenericVector target_blobs;
+  AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it,
+                                     &word_wanted, &overlapped_any_blob,
+                                     &target_blobs);
+  // Filter the outlines that overlapped any blob and put them into the word
+  // now. This simplifies the remaining task and also makes it more accurate
+  // as it has more completed blobs to work on.
+  GenericVector wanted;
+  GenericVector wanted_blobs;
+  GenericVector wanted_outlines;
+  int num_overlapped = 0;
+  int num_overlapped_used = 0;
+  for (int i = 0; i < overlapped_any_blob.size(); ++i) {
+    if (overlapped_any_blob[i]) {
+      ++num_overlapped;
+      if (word_wanted[i]) ++num_overlapped_used;
+      wanted.push_back(word_wanted[i]);
+      wanted_blobs.push_back(target_blobs[i]);
+      wanted_outlines.push_back(outlines[i]);
+      // Mark the slot as handled so that only non-overlapping outlines
+      // remain non-NULL for the second stage.
+      outlines[i] = NULL;
+    }
+  }
+  real_word->AddSelectedOutlines(wanted, wanted_blobs, wanted_outlines, NULL);
+  AssignDiacriticsToNewBlobs(outlines, pass, real_word, pr_it, &word_wanted,
+                             &target_blobs);
+  int non_overlapped = 0;
+  int non_overlapped_used = 0;
+  for (int i = 0; i < word_wanted.size(); ++i) {
+    if (word_wanted[i]) ++non_overlapped_used;
+    // Every outline still non-NULL here did not overlap a blob. Count it in
+    // the total, not in the "used" count: counting it as used made the
+    // function report work done even when no outline was actually assigned.
+    if (outlines[i] != NULL) ++non_overlapped;
+  }
+  if (debug_noise_removal) {
+    tprintf("Used %d/%d overlapped %d/%d non-overlaped diacritics on word:",
+            num_overlapped_used, num_overlapped, non_overlapped_used,
+            non_overlapped);
+    real_word->bounding_box().print();
+  }
+  // Now we have decided which outlines we want, put them into the real_word.
+  if (real_word->AddSelectedOutlines(word_wanted, target_blobs, outlines,
+                                     make_next_word_fuzzy)) {
+    pr_it->MakeCurrentWordFuzzy();
+  }
+  // TODO(rays) Parts of combos have a deep copy of the real word, and need
+  // to have their noise outlines moved/assigned in the same way!!
+  return num_overlapped_used != 0 || non_overlapped_used != 0;
+}
+
+// Attempts to put noise/diacritic outlines into the blobs that they overlap.
+// Input: a set of noisy outlines that probably belong to the real_word.
+// Output: word_wanted indicates which outlines are to be assigned to a blob,
+// target_blobs indicates which to assign to, and overlapped_any_blob is
+// true for all outlines that overlapped a blob.
+void Tesseract::AssignDiacriticsToOverlappingBlobs(
+    const GenericVector& outlines, int pass, WERD* real_word,
+    PAGE_RES_IT* pr_it, GenericVector* word_wanted,
+    GenericVector* overlapped_any_blob,
+    GenericVector* target_blobs) {
+  GenericVector blob_wanted;
+  // All three outputs are reset to one entry per outline.
+  word_wanted->init_to_size(outlines.size(), false);
+  overlapped_any_blob->init_to_size(outlines.size(), false);
+  target_blobs->init_to_size(outlines.size(), NULL);
+  // For each real blob, find the outlines that seriously overlap it.
+  // A single blob could be several merged characters, so there can be quite
+  // a few outlines overlapping, and the full engine needs to be used to chop
+  // and join to get a sensible result.
+  C_BLOB_IT blob_it(real_word->cblob_list());
+  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
+    C_BLOB* blob = blob_it.data();
+    TBOX blob_box = blob->bounding_box();
+    // blob_wanted is the per-blob candidate mask; it is rebuilt for each blob.
+    blob_wanted.init_to_size(outlines.size(), false);
+    int num_blob_outlines = 0;
+    for (int i = 0; i < outlines.size(); ++i) {
+      // Outlines already claimed by an earlier blob are not offered again.
+      if (blob_box.major_x_overlap(outlines[i]->bounding_box()) &&
+          !(*word_wanted)[i]) {
+        blob_wanted[i] = true;
+        (*overlapped_any_blob)[i] = true;
+        ++num_blob_outlines;
+      }
+    }
+    if (debug_noise_removal) {
+      tprintf("%d noise outlines overlap blob at:", num_blob_outlines);
+      blob_box.print();
+    }
+    // If any outlines overlap the blob, and not too many, classify the blob
+    // (using the full engine, languages and all), and choose the maximal
+    // combination of outlines that doesn't hurt the end-result classification
+    // by too much. Mark them as wanted.
+    // Note that the upper bound is strict: exactly noise_maxperblob
+    // candidates are already too many.
+    if (0 < num_blob_outlines && num_blob_outlines < noise_maxperblob) {
+      if (SelectGoodDiacriticOutlines(pass, noise_cert_basechar, pr_it, blob,
+                                      outlines, num_blob_outlines,
+                                      &blob_wanted)) {
+        for (int i = 0; i < blob_wanted.size(); ++i) {
+          if (blob_wanted[i]) {
+            // Claim the outline and record where it is going.
+            (*word_wanted)[i] = true;
+            (*target_blobs)[i] = blob;
+          }
+        }
+      }
+    }
+  }
+}
+
+// Attempts to assign non-overlapping outlines to their nearest blobs or
+// make new blobs out of them.
+// Input: outlines, in which NULL entries are skipped and act as separators
+// between runs of adjacent candidate outlines.
+// Output: word_wanted and target_blobs are reset to one entry per outline;
+// a wanted outline with a NULL target is to become a stand-alone blob.
+void Tesseract::AssignDiacriticsToNewBlobs(
+    const GenericVector& outlines, int pass, WERD* real_word,
+    PAGE_RES_IT* pr_it, GenericVector* word_wanted,
+    GenericVector* target_blobs) {
+  GenericVector blob_wanted;
+  word_wanted->init_to_size(outlines.size(), false);
+  target_blobs->init_to_size(outlines.size(), NULL);
+  // Check for outlines that need to be turned into stand-alone blobs.
+  for (int i = 0; i < outlines.size(); ++i) {
+    if (outlines[i] == NULL) continue;
+    // Get a set of adjacent outlines that don't overlap any existing blob.
+    blob_wanted.init_to_size(outlines.size(), false);
+    int num_blob_outlines = 0;
+    TBOX total_ol_box(outlines[i]->bounding_box());
+    // This inner loop advances the outer index i to the end of the run (a
+    // NULL entry or the end of the vector); the outer ++i then steps past it.
+    while (i < outlines.size() && outlines[i] != NULL) {
+      blob_wanted[i] = true;
+      total_ol_box += outlines[i]->bounding_box();
+      ++i;
+      ++num_blob_outlines;
+    }
+    // Find the insertion point.
+    // Advance while the next blob still starts at or left of the run's left
+    // edge, so blob_it ends on the candidate "left" blob.
+    C_BLOB_IT blob_it(real_word->cblob_list());
+    while (!blob_it.at_last() &&
+           blob_it.data_relative(1)->bounding_box().left() <=
+               total_ol_box.left()) {
+      blob_it.forward();
+    }
+    // Choose which combination of them we actually want and where to put
+    // them.
+    if (debug_noise_removal)
+      tprintf("Num blobless outlines = %d\n", num_blob_outlines);
+    C_BLOB* left_blob = blob_it.data();
+    TBOX left_box = left_blob->bounding_box();
+    C_BLOB* right_blob = blob_it.at_last() ? NULL : blob_it.data_relative(1);
+    // Three attempts in order: join the left blob, join the right blob, or
+    // stand alone between blobs. Each uses its own certainty threshold.
+    // blob_wanted is reused across attempts: SelectGoodDiacriticOutlines
+    // only overwrites it when it returns true, so a failed attempt leaves
+    // the full run selected for the next one.
+    if ((left_box.x_overlap(total_ol_box) || right_blob == NULL ||
+         !right_blob->bounding_box().x_overlap(total_ol_box)) &&
+        SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, left_blob,
+                                    outlines, num_blob_outlines,
+                                    &blob_wanted)) {
+      if (debug_noise_removal) tprintf("Added to left blob\n");
+      for (int j = 0; j < blob_wanted.size(); ++j) {
+        if (blob_wanted[j]) {
+          (*word_wanted)[j] = true;
+          (*target_blobs)[j] = left_blob;
+        }
+      }
+    } else if (right_blob != NULL &&
+               (!left_box.x_overlap(total_ol_box) ||
+                right_blob->bounding_box().x_overlap(total_ol_box)) &&
+               SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it,
+                                           right_blob, outlines,
+                                           num_blob_outlines, &blob_wanted)) {
+      if (debug_noise_removal) tprintf("Added to right blob\n");
+      for (int j = 0; j < blob_wanted.size(); ++j) {
+        if (blob_wanted[j]) {
+          (*word_wanted)[j] = true;
+          (*target_blobs)[j] = right_blob;
+        }
+      }
+    } else if (SelectGoodDiacriticOutlines(pass, noise_cert_punc, pr_it, NULL,
+                                           outlines, num_blob_outlines,
+                                           &blob_wanted)) {
+      if (debug_noise_removal) tprintf("Fitted between blobs\n");
+      for (int j = 0; j < blob_wanted.size(); ++j) {
+        if (blob_wanted[j]) {
+          (*word_wanted)[j] = true;
+          // A NULL target marks the outline as wanted without an owner blob.
+          (*target_blobs)[j] = NULL;
+        }
+      }
+    }
+  }
+}
+
+// Starting with ok_outlines set to indicate which outlines overlap the blob,
+// chooses the optimal set (approximately) and returns true if any outlines
+// are desired, in which case ok_outlines indicates which ones.
+bool Tesseract::SelectGoodDiacriticOutlines(
+    int pass, float certainty_threshold, PAGE_RES_IT* pr_it, C_BLOB* blob,
+    const GenericVector& outlines, int num_outlines,
+    GenericVector* ok_outlines) {
+  STRING best_str;
+  // Without a blob the target is the raw threshold. With a blob, the target
+  // is the blob's own certainty, moved towards certainty_threshold by the
+  // fraction noise_cert_factor.
+  float target_cert = certainty_threshold;
+  if (blob != NULL) {
+    float target_c2;
+    target_cert = ClassifyBlobAsWord(pass, pr_it, blob, &best_str, &target_c2);
+    if (debug_noise_removal) {
+      tprintf("No Noise blob classified as %s=%g(%g) at:", best_str.string(),
+              target_cert, target_c2);
+      blob->bounding_box().print();
+    }
+    target_cert -= (target_cert - certainty_threshold) * noise_cert_factor;
+  }
+  GenericVector test_outlines = *ok_outlines;
+  // Start with all the outlines in.
+  STRING all_str;
+  GenericVector best_outlines = *ok_outlines;
+  float best_cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
+                                             pr_it, blob, &all_str);
+  if (debug_noise_removal) {
+    TBOX ol_box;
+    for (int i = 0; i < test_outlines.size(); ++i) {
+      if (test_outlines[i]) ol_box += outlines[i]->bounding_box();
+    }
+    tprintf("All Noise blob classified as %s=%g, delta=%g at:",
+            all_str.string(), best_cert, best_cert - target_cert);
+    ol_box.print();
+  }
+  // Iteratively zero out the bit that improves the certainty the most, until
+  // we get past the threshold, have zero bits, or fail to improve.
+  // NOTE(review): the parenthesized clause below contains both blob == NULL
+  // and blob != NULL, so it is always true; as written, the loop stops only
+  // when one outline is left or no removal improved best_cert. Confirm
+  // whether stopping once best_cert reaches target_cert was intended.
+  int best_index = 0;  // To zero out.
+  while (num_outlines > 1 && best_index >= 0 &&
+         (blob == NULL || best_cert < target_cert || blob != NULL)) {
+    // Find the best bit to zero out.
+    best_index = -1;
+    for (int i = 0; i < outlines.size(); ++i) {
+      if (test_outlines[i]) {
+        // Trial removal of outline i; it is restored at the end of the body.
+        test_outlines[i] = false;
+        STRING str;
+        float cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
+                                              pr_it, blob, &str);
+        if (debug_noise_removal) {
+          TBOX ol_box;
+          for (int j = 0; j < outlines.size(); ++j) {
+            if (test_outlines[j]) ol_box += outlines[j]->bounding_box();
+            tprintf("%d", test_outlines[j]);
+          }
+          tprintf(" blob classified as %s=%g, delta=%g) at:", str.string(),
+                  cert, cert - target_cert);
+          ol_box.print();
+        }
+        // Only a strict improvement replaces the best combination so far.
+        if (cert > best_cert) {
+          best_cert = cert;
+          best_index = i;
+          best_outlines = test_outlines;
+        }
+        test_outlines[i] = true;
+      }
+    }
+    if (best_index >= 0) {
+      // Make the best removal of this round permanent.
+      test_outlines[best_index] = false;
+      --num_outlines;
+    }
+  }
+  // ok_outlines is overwritten only on success; on failure the caller's mask
+  // is left exactly as it was passed in.
+  if (best_cert >= target_cert) {
+    // Save the best combination.
+    *ok_outlines = best_outlines;
+    if (debug_noise_removal) {
+      tprintf("%s noise combination ", blob ? "Adding" : "New");
+      for (int i = 0; i < best_outlines.size(); ++i) {
+        tprintf("%d", best_outlines[i]);
+      }
+      tprintf(" yields certainty %g, beating target of %g\n", best_cert,
+              target_cert);
+    }
+    return true;
+  }
+  return false;
+}
+
+// Classifies the given blob plus the outlines flagged by ok_outlines, undoes
+// the inclusion of the outlines, and returns the certainty of the raw choice.
+// If blob is NULL, a temporary blob is built from the flagged outlines and
+// the value returned is -c2 (as computed by ClassifyBlobAsWord) instead of
+// the certainty.
+float Tesseract::ClassifyBlobPlusOutlines(
+    const GenericVector& ok_outlines,
+    const GenericVector& outlines, int pass_n, PAGE_RES_IT* pr_it,
+    C_BLOB* blob, STRING* best_str) {
+  C_OUTLINE_IT ol_it;
+  // first_to_keep doubles as the "blob was supplied by the caller" flag and
+  // as the marker for where the blob's own outlines begin.
+  C_OUTLINE* first_to_keep = NULL;
+  if (blob != NULL) {
+    // Add the required outlines to the blob.
+    ol_it.set_to_list(blob->out_list());
+    first_to_keep = ol_it.data();
+  }
+  for (int i = 0; i < ok_outlines.size(); ++i) {
+    if (ok_outlines[i]) {
+      // This outline is to be added.
+      if (blob == NULL) {
+        blob = new C_BLOB(outlines[i]);
+        ol_it.set_to_list(blob->out_list());
+      } else {
+        ol_it.add_before_stay_put(outlines[i]);
+      }
+    }
+  }
+  // NOTE(review): if blob was NULL on entry and no flag in ok_outlines is
+  // set, blob is still NULL here - verify that callers never do that.
+  float c2;
+  float cert = ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2);
+  ol_it.move_to_first();
+  if (first_to_keep == NULL) {
+    // We created blob. Empty its outlines and delete it.
+    // The outlines are extracted rather than deleted with the blob,
+    // presumably because the caller's outlines vector still refers to them.
+    for (; !ol_it.empty(); ol_it.forward()) ol_it.extract();
+    delete blob;
+    cert = -c2;
+  } else {
+    // Remove the outlines that we put in.
+    for (; ol_it.data() != first_to_keep; ol_it.forward()) {
+      ol_it.extract();
+    }
+  }
+  return cert;
+}
+
+// Classifies the given blob (part of word_data->word->word) as an individual
+// word, using languages, chopper etc, returning only the certainty of the
+// best raw choice, and undoing all the work done to fake out the word.
+// Also outputs the raw choice's text in best_str and, in c2, the value
+// certainty^2 / rating (0 when the rating is not positive).
+float Tesseract::ClassifyBlobAsWord(int pass_n, PAGE_RES_IT* pr_it,
+                                    C_BLOB* blob, STRING* best_str, float* c2) {
+  WERD* real_word = pr_it->word()->word;
+  // The blob is deep-copied, so the caller's blob is not consumed here.
+  WERD* word = real_word->ConstructFromSingleBlob(
+      real_word->flag(W_BOL), real_word->flag(W_EOL), C_BLOB::deep_copy(blob));
+  WERD_RES* word_res = pr_it->InsertSimpleCloneWord(*pr_it->word(), word);
+  // Get a new iterator that points to the new word.
+  PAGE_RES_IT it(pr_it->page_res);
+  while (it.word() != word_res && it.word() != NULL) it.forward();
+  ASSERT_HOST(it.word() == word_res);
+  WordData wd(it);
+  // Force full initialization.
+  // Setup always uses pass 1 here, whatever pass_n is; only the
+  // classification call below uses pass_n.
+  SetupWordPassN(1, &wd);
+  classify_word_and_language(pass_n, &it, &wd);
+  if (debug_noise_removal) {
+    tprintf("word xheight=%g, row=%g, range=[%g,%g]\n", word_res->x_height,
+            wd.row->x_height(), wd.word->raw_choice->min_x_height(),
+            wd.word->raw_choice->max_x_height());
+  }
+  float cert = wd.word->raw_choice->certainty();
+  float rat = wd.word->raw_choice->rating();
+  *c2 = rat > 0.0f ? cert * cert / rat : 0.0f;
+  *best_str = wd.word->raw_choice->unichar_string();
+  // Undo the insertion of the temporary word and resynchronize the caller's
+  // iterator with the modified page results.
+  it.DeleteCurrentWord();
+  pr_it->ResetWordIterator();
+  return cert;
+}
+
 // Generic function for classifying a word. Can be used either for pass1 or
 // pass2 according to the function passed to recognizer.
 // word_data holds the word to be recognized, and its block and row, and
@@ -906,9 +1267,10 @@ static bool WordsAcceptable(const PointerVector& words) {
 // Recognizes in the current language, and if successful that is all.
 // If recognition was not successful, tries all available languages until
 // it gets a successful result or runs out of languages. Keeps the best result.
-void Tesseract::classify_word_and_language(WordRecognizer recognizer,
-                                           PAGE_RES_IT* pr_it,
+void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it,
                                            WordData* word_data) {
+  WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1
+                                          : &Tesseract::classify_word_pass2;
   // Best result so far.
   PointerVector best_words;
   // Points to the best result. May be word or in lang_words.
diff --git a/ccmain/fixspace.cpp b/ccmain/fixspace.cpp index 17c4f96ed1..0a561ac9a0 100644 --- a/ccmain/fixspace.cpp +++ b/ccmain/fixspace.cpp @@ -205,8 +205,7 @@ void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row, if ((!word->part_of_combo) && (word->box_word == NULL)) { WordData word_data(block, row, word); SetupWordPassN(2, &word_data); - classify_word_and_language(&Tesseract::classify_word_pass2, NULL, - &word_data); + classify_word_and_language(2, NULL, &word_data); } prev_word_best_choice_ = word->best_choice; } diff --git a/ccmain/pageiterator.cpp b/ccmain/pageiterator.cpp index c8e025c13f..ed03ceaba5 100644 --- a/ccmain/pageiterator.cpp +++ b/ccmain/pageiterator.cpp @@ -26,15 +26,23 @@ namespace tesseract { -PageIterator::PageIterator(PAGE_RES* page_res, Tesseract* tesseract, - int scale, int scaled_yres, - int rect_left, int rect_top, +PageIterator::PageIterator(PAGE_RES* page_res, Tesseract* tesseract, int scale, + int scaled_yres, int rect_left, int rect_top, int rect_width, int rect_height) - : page_res_(page_res), tesseract_(tesseract), - word_(NULL), word_length_(0), blob_index_(0), cblob_it_(NULL), - scale_(scale), scaled_yres_(scaled_yres), - rect_left_(rect_left), rect_top_(rect_top), - rect_width_(rect_width), rect_height_(rect_height) { + : page_res_(page_res), + tesseract_(tesseract), + word_(NULL), + word_length_(0), + blob_index_(0), + cblob_it_(NULL), + include_upper_dots_(false), + include_lower_dots_(false), + scale_(scale), + scaled_yres_(scaled_yres), + rect_left_(rect_left), + rect_top_(rect_top), + rect_width_(rect_width), + rect_height_(rect_height) { it_ = new PAGE_RES_IT(page_res); PageIterator::Begin(); } @@ -50,12 +58,20 @@ PageIterator::~PageIterator() { * objects at a higher level. 
*/ PageIterator::PageIterator(const PageIterator& src) - : page_res_(src.page_res_), tesseract_(src.tesseract_), - word_(NULL), word_length_(src.word_length_), - blob_index_(src.blob_index_), cblob_it_(NULL), - scale_(src.scale_), scaled_yres_(src.scaled_yres_), - rect_left_(src.rect_left_), rect_top_(src.rect_top_), - rect_width_(src.rect_width_), rect_height_(src.rect_height_) { + : page_res_(src.page_res_), + tesseract_(src.tesseract_), + word_(NULL), + word_length_(src.word_length_), + blob_index_(src.blob_index_), + cblob_it_(NULL), + include_upper_dots_(src.include_upper_dots_), + include_lower_dots_(src.include_lower_dots_), + scale_(src.scale_), + scaled_yres_(src.scaled_yres_), + rect_left_(src.rect_left_), + rect_top_(src.rect_top_), + rect_width_(src.rect_width_), + rect_height_(src.rect_height_) { it_ = new PAGE_RES_IT(*src.it_); BeginWord(src.blob_index_); } @@ -63,6 +79,8 @@ PageIterator::PageIterator(const PageIterator& src) const PageIterator& PageIterator::operator=(const PageIterator& src) { page_res_ = src.page_res_; tesseract_ = src.tesseract_; + include_upper_dots_ = src.include_upper_dots_; + include_lower_dots_ = src.include_lower_dots_; scale_ = src.scale_; scaled_yres_ = src.scaled_yres_; rect_left_ = src.rect_left_; @@ -252,16 +270,19 @@ bool PageIterator::BoundingBoxInternal(PageIteratorLevel level, PARA *para = NULL; switch (level) { case RIL_BLOCK: - box = it_->block()->block->bounding_box(); + box = it_->block()->block->restricted_bounding_box(include_upper_dots_, + include_lower_dots_); break; case RIL_PARA: para = it_->row()->row->para(); // explicit fall-through. 
case RIL_TEXTLINE: - box = it_->row()->row->bounding_box(); + box = it_->row()->row->restricted_bounding_box(include_upper_dots_, + include_lower_dots_); break; case RIL_WORD: - box = it_->word()->word->bounding_box(); + box = it_->word()->word->restricted_bounding_box(include_upper_dots_, + include_lower_dots_); break; case RIL_SYMBOL: if (cblob_it_ == NULL) @@ -387,39 +408,23 @@ Pix* PageIterator::GetBinaryImage(PageIteratorLevel level) const { int left, top, right, bottom; if (!BoundingBoxInternal(level, &left, &top, &right, &bottom)) return NULL; - Pix* pix = NULL; - switch (level) { - case RIL_BLOCK: - case RIL_PARA: - int bleft, btop, bright, bbottom; - BoundingBoxInternal(RIL_BLOCK, &bleft, &btop, &bright, &bbottom); - pix = it_->block()->block->render_mask(); - // AND the mask and the image. - pixRasterop(pix, 0, 0, pixGetWidth(pix), pixGetHeight(pix), - PIX_SRC & PIX_DST, tesseract_->pix_binary(), - bleft, btop); - if (level == RIL_PARA) { - // RIL_PARA needs further attention: - // clip the paragraph from the block mask. - Box* box = boxCreate(left - bleft, top - btop, - right - left, bottom - top); - Pix* pix2 = pixClipRectangle(pix, box, NULL); - boxDestroy(&box); - pixDestroy(&pix); - pix = pix2; - } - break; - case RIL_TEXTLINE: - case RIL_WORD: - case RIL_SYMBOL: - if (level == RIL_SYMBOL && cblob_it_ != NULL && - cblob_it_->data()->area() != 0) - return cblob_it_->data()->render(); - // Just clip from the bounding box. - Box* box = boxCreate(left, top, right - left, bottom - top); - pix = pixClipRectangle(tesseract_->pix_binary(), box, NULL); - boxDestroy(&box); - break; + if (level == RIL_SYMBOL && cblob_it_ != NULL && + cblob_it_->data()->area() != 0) + return cblob_it_->data()->render(); + Box* box = boxCreate(left, top, right - left, bottom - top); + Pix* pix = pixClipRectangle(tesseract_->pix_binary(), box, NULL); + boxDestroy(&box); + if (level == RIL_BLOCK || level == RIL_PARA) { + // Clip to the block polygon as well. 
+ TBOX mask_box; + Pix* mask = it_->block()->block->render_mask(&mask_box); + int mask_x = left - mask_box.left(); + int mask_y = top - (tesseract_->ImageHeight() - mask_box.top()); + // AND the mask and pix, putting the result in pix. + pixRasterop(pix, MAX(0, -mask_x), MAX(0, -mask_y), pixGetWidth(pix), + pixGetHeight(pix), PIX_SRC & PIX_DST, mask, MAX(0, mask_x), + MAX(0, mask_y)); + pixDestroy(&mask); } return pix; } @@ -452,17 +457,24 @@ Pix* PageIterator::GetImage(PageIteratorLevel level, int padding, Box* box = boxCreate(*left, *top, right - *left, bottom - *top); Pix* grey_pix = pixClipRectangle(original_img, box, NULL); boxDestroy(&box); - if (level == RIL_BLOCK) { - Pix* mask = it_->block()->block->render_mask(); - Pix* expanded_mask = pixCreate(right - *left, bottom - *top, 1); - pixRasterop(expanded_mask, padding, padding, - pixGetWidth(mask), pixGetHeight(mask), - PIX_SRC, mask, 0, 0); + if (level == RIL_BLOCK || level == RIL_PARA) { + // Clip to the block polygon as well. + TBOX mask_box; + Pix* mask = it_->block()->block->render_mask(&mask_box); + // Copy the mask registered correctly into an image the size of grey_pix. 
+ int mask_x = *left - mask_box.left(); + int mask_y = *top - (pixGetHeight(original_img) - mask_box.top()); + int width = pixGetWidth(grey_pix); + int height = pixGetHeight(grey_pix); + Pix* resized_mask = pixCreate(width, height, 1); + pixRasterop(resized_mask, MAX(0, -mask_x), MAX(0, -mask_y), width, height, + PIX_SRC, mask, MAX(0, mask_x), MAX(0, mask_y)); pixDestroy(&mask); - pixDilateBrick(expanded_mask, expanded_mask, 2*padding + 1, 2*padding + 1); - pixInvert(expanded_mask, expanded_mask); - pixSetMasked(grey_pix, expanded_mask, MAX_UINT32); - pixDestroy(&expanded_mask); + pixDilateBrick(resized_mask, resized_mask, 2 * padding + 1, + 2 * padding + 1); + pixInvert(resized_mask, resized_mask); + pixSetMasked(grey_pix, resized_mask, MAX_UINT32); + pixDestroy(&resized_mask); } return grey_pix; } diff --git a/ccmain/pageiterator.h b/ccmain/pageiterator.h index 27b02ddf8f..56c78150a8 100644 --- a/ccmain/pageiterator.h +++ b/ccmain/pageiterator.h @@ -179,6 +179,21 @@ class TESS_API PageIterator { // If an image rectangle has been set in the API, then returned coordinates // relate to the original (full) image, rather than the rectangle. + /** + * Controls what to include in a bounding box. Bounding boxes of all levels + * between RIL_WORD and RIL_BLOCK can include or exclude potential diacritics. + * Between layout analysis and recognition, it isn't known where all + * diacritics belong, so this control is used to include or exclude some + * diacritics that are above or below the main body of the word. In most cases + * where the placement is obvious, and after recognition, it doesn't make as + * much difference, as the diacritics will already be included in the word. + */ + void SetBoundingBoxComponents(bool include_upper_dots, + bool include_lower_dots) { + include_upper_dots_ = include_upper_dots; + include_lower_dots_ = include_lower_dots; + } + /** * Returns the bounding rectangle of the current object at the given level. 
* See comment on coordinate system above. @@ -332,6 +347,9 @@ class TESS_API PageIterator { * Owned by this ResultIterator. */ C_BLOB_IT* cblob_it_; + /** Control over what to include in bounding boxes. */ + bool include_upper_dots_; + bool include_lower_dots_; /** Parameters saved from the Thresholder. Needed to rebuild coordinates.*/ int scale_; int scaled_yres_; diff --git a/ccmain/pagesegmain.cpp b/ccmain/pagesegmain.cpp index 396be13048..6ced2d4c40 100644 --- a/ccmain/pagesegmain.cpp +++ b/ccmain/pagesegmain.cpp @@ -134,12 +134,20 @@ int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks, // UNLV file present. Use PSM_SINGLE_BLOCK. pageseg_mode = PSM_SINGLE_BLOCK; } + // The diacritic_blobs holds noise blobs that may be diacritics. They + // are separated out on areas of the image that seem noisy and short-circuit + // the layout process, going straight from the initial partition creation + // right through to after word segmentation, where they are added to the + // rej_cblobs list of the most appropriate word. From there classification + // will determine whether they are used. + BLOBNBOX_LIST diacritic_blobs; int auto_page_seg_ret_val = 0; TO_BLOCK_LIST to_blocks; if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) || PSM_SPARSE(pageseg_mode)) { - auto_page_seg_ret_val = - AutoPageSeg(pageseg_mode, blocks, &to_blocks, osd_tess, osr); + auto_page_seg_ret_val = AutoPageSeg( + pageseg_mode, blocks, &to_blocks, + enable_noise_removal ? 
&diacritic_blobs : NULL, osd_tess, osr); if (pageseg_mode == PSM_OSD_ONLY) return auto_page_seg_ret_val; // To create blobs from the image region bounds uncomment this line: @@ -171,7 +179,7 @@ int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks, textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_, pix_thresholds_, pix_grey_, splitting || cjk_mode, - blocks, &to_blocks); + &diacritic_blobs, blocks, &to_blocks); return auto_page_seg_ret_val; } @@ -197,7 +205,6 @@ static void WriteDebugBackgroundImage(bool printable, Pix* pix_binary) { pixDestroy(&grey_pix); } - /** * Auto page segmentation. Divide the page image into blocks of uniform * text linespacing and images. @@ -207,9 +214,14 @@ static void WriteDebugBackgroundImage(bool printable, Pix* pix_binary) { * The output goes in the blocks list with corresponding TO_BLOCKs in the * to_blocks list. * - * If single_column is true, then no attempt is made to divide the image - * into columns, but multiple blocks are still made if the text is of - * non-uniform linespacing. + * If !PSM_COL_FIND_ENABLED(pageseg_mode), then no attempt is made to divide + * the image into columns, but multiple blocks are still made if the text is + * of non-uniform linespacing. + * + * If diacritic_blobs is non-null, then diacritics/noise blobs, that would + * confuse layout analysis by causing textline overlap, are placed there, + * with the expectation that they will be reassigned to words later and + * noise/diacriticness determined via classification. * * If osd (orientation and script detection) is true then that is performed * as well. If only_osd is true, then only orientation and script detection is @@ -217,9 +229,10 @@ static void WriteDebugBackgroundImage(bool printable, Pix* pix_binary) { * another Tesseract that was initialized especially for osd, and the results * will be output into osr (orientation and script result). 
*/ -int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, - BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks, - Tesseract* osd_tess, OSResults* osr) { +int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST* blocks, + TO_BLOCK_LIST* to_blocks, + BLOBNBOX_LIST* diacritic_blobs, Tesseract* osd_tess, + OSResults* osr) { if (textord_debug_images) { WriteDebugBackgroundImage(textord_debug_printable, pix_binary_); } @@ -247,10 +260,9 @@ int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, if (equ_detect_) { finder->SetEquationDetect(equ_detect_); } - result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_, - to_block, photomask_pix, - pix_thresholds_, pix_grey_, - &found_blocks, to_blocks); + result = finder->FindBlocks( + pageseg_mode, scaled_color_, scaled_factor_, to_block, photomask_pix, + pix_thresholds_, pix_grey_, &found_blocks, diacritic_blobs, to_blocks); if (result >= 0) finder->GetDeskewVectors(&deskew_, &reskew_); delete finder; diff --git a/ccmain/pgedit.cpp b/ccmain/pgedit.cpp index 7c8f626b6b..ea44ead7c9 100644 --- a/ccmain/pgedit.cpp +++ b/ccmain/pgedit.cpp @@ -655,7 +655,8 @@ void show_point(PAGE_RES* page_res, float x, float y) { FCOORD pt(x, y); PAGE_RES_IT pr_it(page_res); - char msg[160]; + const int kBufsize = 512; + char msg[kBufsize]; char *msg_ptr = msg; msg_ptr += sprintf(msg_ptr, "Pt:(%0.3f, %0.3f) ", x, y); diff --git a/ccmain/recogtraining.cpp b/ccmain/recogtraining.cpp index 2dc94886ed..27d7e97ea0 100644 --- a/ccmain/recogtraining.cpp +++ b/ccmain/recogtraining.cpp @@ -207,8 +207,7 @@ void Tesseract::ambigs_classify_and_output(const char *label, fflush(stdout); WordData word_data(*pr_it); SetupWordPassN(1, &word_data); - classify_word_and_language(&Tesseract::classify_word_pass1, - pr_it, &word_data); + classify_word_and_language(1, pr_it, &word_data); WERD_RES* werd_res = word_data.word; WERD_CHOICE *best_choice = werd_res->best_choice; ASSERT_HOST(best_choice != NULL); diff --git a/ccmain/tesseractclass.cpp 
b/ccmain/tesseractclass.cpp index c262bbc95e..25819e8cdd 100644 --- a/ccmain/tesseractclass.cpp +++ b/ccmain/tesseractclass.cpp @@ -55,507 +55,569 @@ namespace tesseract { Tesseract::Tesseract() - : BOOL_MEMBER(tessedit_resegment_from_boxes, false, - "Take segmentation and labeling from box file", - this->params()), - BOOL_MEMBER(tessedit_resegment_from_line_boxes, false, - "Conversion of word/line box file to char box file", - this->params()), - BOOL_MEMBER(tessedit_train_from_boxes, false, - "Generate training data from boxed chars", this->params()), - BOOL_MEMBER(tessedit_make_boxes_from_boxes, false, - "Generate more boxes from boxed chars", this->params()), - BOOL_MEMBER(tessedit_dump_pageseg_images, false, - "Dump intermediate images made during page segmentation", - this->params()), - // The default for pageseg_mode is the old behaviour, so as not to - // upset anything that relies on that. - INT_MEMBER(tessedit_pageseg_mode, PSM_SINGLE_BLOCK, - "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block," - " 5=line, 6=word, 7=char" - " (Values from PageSegMode enum in publictypes.h)", - this->params()), - INT_INIT_MEMBER(tessedit_ocr_engine_mode, tesseract::OEM_TESSERACT_ONLY, - "Which OCR engine(s) to run (Tesseract, Cube, both)." - " Defaults to loading and running only Tesseract" - " (no Cube,no combiner)." 
- " Values from OcrEngineMode enum in tesseractclass.h)", - this->params()), - STRING_MEMBER(tessedit_char_blacklist, "", - "Blacklist of chars not to recognize", this->params()), - STRING_MEMBER(tessedit_char_whitelist, "", - "Whitelist of chars to recognize", this->params()), - STRING_MEMBER(tessedit_char_unblacklist, "", - "List of chars to override tessedit_char_blacklist", - this->params()), - BOOL_MEMBER(tessedit_ambigs_training, false, - "Perform training for ambiguities", this->params()), - INT_MEMBER(pageseg_devanagari_split_strategy, - tesseract::ShiroRekhaSplitter::NO_SPLIT, - "Whether to use the top-line splitting process for Devanagari " - "documents while performing page-segmentation.", this->params()), - INT_MEMBER(ocr_devanagari_split_strategy, - tesseract::ShiroRekhaSplitter::NO_SPLIT, - "Whether to use the top-line splitting process for Devanagari " - "documents while performing ocr.", this->params()), - STRING_MEMBER(tessedit_write_params_to_file, "", - "Write all parameters to the given file.", this->params()), - BOOL_MEMBER(tessedit_adaption_debug, false, "Generate and print debug" - " information for adaption", this->params()), - INT_MEMBER(bidi_debug, 0, "Debug level for BiDi", this->params()), - INT_MEMBER(applybox_debug, 1, "Debug level", this->params()), - INT_MEMBER(applybox_page, 0, - "Page number to apply boxes from", this->params()), - STRING_MEMBER(applybox_exposure_pattern, ".exp", "Exposure value follows" - " this pattern in the image filename. The name of the image" - " files are expected to be in the form" - " [lang].[fontname].exp[num].tif", this->params()), - BOOL_MEMBER(applybox_learn_chars_and_char_frags_mode, false, - "Learn both character fragments (as is done in the" - " special low exposure mode) as well as unfragmented" - " characters.", this->params()), - BOOL_MEMBER(applybox_learn_ngrams_mode, false, "Each bounding box" - " is assumed to contain ngrams. 
Only learn the ngrams" - " whose outlines overlap horizontally.", this->params()), - BOOL_MEMBER(tessedit_display_outwords, false, - "Draw output words", this->params()), - BOOL_MEMBER(tessedit_dump_choices, false, - "Dump char choices", this->params()), - BOOL_MEMBER(tessedit_timing_debug, false, "Print timing stats", - this->params()), - BOOL_MEMBER(tessedit_fix_fuzzy_spaces, true, - "Try to improve fuzzy spaces", this->params()), - BOOL_MEMBER(tessedit_unrej_any_wd, false, - "Dont bother with word plausibility", this->params()), - BOOL_MEMBER(tessedit_fix_hyphens, true, - "Crunch double hyphens?", this->params()), - BOOL_MEMBER(tessedit_redo_xheight, true, - "Check/Correct x-height", this->params()), - BOOL_MEMBER(tessedit_enable_doc_dict, true, - "Add words to the document dictionary", this->params()), - BOOL_MEMBER(tessedit_debug_fonts, false, - "Output font info per char", this->params()), - BOOL_MEMBER(tessedit_debug_block_rejection, false, - "Block and Row stats", this->params()), - BOOL_MEMBER(tessedit_enable_bigram_correction, true, - "Enable correction based on the word bigram dictionary.", - this->params()), - BOOL_MEMBER(tessedit_enable_dict_correction, false, - "Enable single word correction based on the dictionary.", - this->params()), - INT_MEMBER(tessedit_bigram_debug, 0, - "Amount of debug output for bigram correction.", - this->params()), - INT_MEMBER(debug_x_ht_level, 0, "Reestimate debug", this->params()), - BOOL_MEMBER(debug_acceptable_wds, false, - "Dump word pass/fail chk", this->params()), - STRING_MEMBER(chs_leading_punct, "('`\"", - "Leading punctuation", this->params()), - STRING_MEMBER(chs_trailing_punct1, ").,;:?!", - "1st Trailing punctuation", this->params()), - STRING_MEMBER(chs_trailing_punct2, ")'`\"", - "2nd Trailing punctuation", this->params()), - double_MEMBER(quality_rej_pc, 0.08, - "good_quality_doc lte rejection limit", this->params()), - double_MEMBER(quality_blob_pc, 0.0, - "good_quality_doc gte good blobs limit", 
this->params()), - double_MEMBER(quality_outline_pc, 1.0, - "good_quality_doc lte outline error limit", this->params()), - double_MEMBER(quality_char_pc, 0.95, - "good_quality_doc gte good char limit", this->params()), - INT_MEMBER(quality_min_initial_alphas_reqd, 2, - "alphas in a good word", this->params()), - INT_MEMBER(tessedit_tess_adaption_mode, 0x27, - "Adaptation decision algorithm for tess", this->params()), - BOOL_MEMBER(tessedit_minimal_rej_pass1, false, - "Do minimal rejection on pass 1 output", this->params()), - BOOL_MEMBER(tessedit_test_adaption, false, - "Test adaption criteria", this->params()), - BOOL_MEMBER(tessedit_matcher_log, false, - "Log matcher activity", this->params()), - INT_MEMBER(tessedit_test_adaption_mode, 3, - "Adaptation decision algorithm for tess", this->params()), - BOOL_MEMBER(test_pt, false, "Test for point", this->params()), - double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params()), - double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params()), - INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.", - this->params()), - BOOL_MEMBER(paragraph_text_based, true, - "Run paragraph detection on the post-text-recognition " - "(more accurate)", this->params()), - INT_MEMBER(cube_debug_level, 0, "Print cube debug info.", this->params()), - STRING_MEMBER(outlines_odd, "%| ", "Non standard number of outlines", - this->params()), - STRING_MEMBER(outlines_2, "ij!?%\":;", - "Non standard number of outlines", this->params()), - BOOL_MEMBER(docqual_excuse_outline_errs, false, - "Allow outline errs in unrejection?", this->params()), - BOOL_MEMBER(tessedit_good_quality_unrej, true, - "Reduce rejection on good docs", this->params()), - BOOL_MEMBER(tessedit_use_reject_spaces, true, - "Reject spaces?", this->params()), - double_MEMBER(tessedit_reject_doc_percent, 65.00, - "%rej allowed before rej whole doc", this->params()), - double_MEMBER(tessedit_reject_block_percent, 45.00, - "%rej allowed before rej whole block", 
this->params()), - double_MEMBER(tessedit_reject_row_percent, 40.00, - "%rej allowed before rej whole row", this->params()), - double_MEMBER(tessedit_whole_wd_rej_row_percent, 70.00, - "Number of row rejects in whole word rejects" - "which prevents whole row rejection", this->params()), - BOOL_MEMBER(tessedit_preserve_blk_rej_perfect_wds, true, - "Only rej partially rejected words in block rejection", - this->params()), - BOOL_MEMBER(tessedit_preserve_row_rej_perfect_wds, true, - "Only rej partially rejected words in row rejection", - this->params()), - BOOL_MEMBER(tessedit_dont_blkrej_good_wds, false, - "Use word segmentation quality metric", this->params()), - BOOL_MEMBER(tessedit_dont_rowrej_good_wds, false, - "Use word segmentation quality metric", this->params()), - INT_MEMBER(tessedit_preserve_min_wd_len, 2, - "Only preserve wds longer than this", this->params()), - BOOL_MEMBER(tessedit_row_rej_good_docs, true, - "Apply row rejection to good docs", this->params()), - double_MEMBER(tessedit_good_doc_still_rowrej_wd, 1.1, - "rej good doc wd if more than this fraction rejected", - this->params()), - BOOL_MEMBER(tessedit_reject_bad_qual_wds, true, - "Reject all bad quality wds", this->params()), - BOOL_MEMBER(tessedit_debug_doc_rejection, false, - "Page stats", this->params()), - BOOL_MEMBER(tessedit_debug_quality_metrics, false, - "Output data to debug file", this->params()), - BOOL_MEMBER(bland_unrej, false, - "unrej potential with no chekcs", this->params()), - double_MEMBER(quality_rowrej_pc, 1.1, - "good_quality_doc gte good char limit", this->params()), - BOOL_MEMBER(unlv_tilde_crunching, true, - "Mark v.bad words for tilde crunch", this->params()), - BOOL_MEMBER(hocr_font_info, false, - "Add font info to hocr output", this->params()), - BOOL_MEMBER(crunch_early_merge_tess_fails, true, - "Before word crunch?", this->params()), - BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false, - "Take out ~^ early?", this->params()), - 
double_MEMBER(crunch_terrible_rating, 80.0, - "crunch rating lt this", this->params()), - BOOL_MEMBER(crunch_terrible_garbage, true, "As it says", this->params()), - double_MEMBER(crunch_poor_garbage_cert, -9.0, - "crunch garbage cert lt this", this->params()), - double_MEMBER(crunch_poor_garbage_rate, 60, - "crunch garbage rating lt this", this->params()), - double_MEMBER(crunch_pot_poor_rate, 40, - "POTENTIAL crunch rating lt this", this->params()), - double_MEMBER(crunch_pot_poor_cert, -8.0, - "POTENTIAL crunch cert lt this", this->params()), - BOOL_MEMBER(crunch_pot_garbage, true, - "POTENTIAL crunch garbage", this->params()), - double_MEMBER(crunch_del_rating, 60, - "POTENTIAL crunch rating lt this", this->params()), - double_MEMBER(crunch_del_cert, -10.0, - "POTENTIAL crunch cert lt this", this->params()), - double_MEMBER(crunch_del_min_ht, 0.7, - "Del if word ht lt xht x this", this->params()), - double_MEMBER(crunch_del_max_ht, 3.0, - "Del if word ht gt xht x this", this->params()), - double_MEMBER(crunch_del_min_width, 3.0, - "Del if word width lt xht x this", this->params()), - double_MEMBER(crunch_del_high_word, 1.5, - "Del if word gt xht x this above bl", this->params()), - double_MEMBER(crunch_del_low_word, 0.5, - "Del if word gt xht x this below bl", this->params()), - double_MEMBER(crunch_small_outlines_size, 0.6, - "Small if lt xht x this", this->params()), - INT_MEMBER(crunch_rating_max, 10, - "For adj length in rating per ch", this->params()), - INT_MEMBER(crunch_pot_indicators, 1, - "How many potential indicators needed", this->params()), - BOOL_MEMBER(crunch_leave_ok_strings, true, - "Dont touch sensible strings", this->params()), - BOOL_MEMBER(crunch_accept_ok, true, - "Use acceptability in okstring", this->params()), - BOOL_MEMBER(crunch_leave_accept_strings, false, - "Dont pot crunch sensible strings", this->params()), - BOOL_MEMBER(crunch_include_numerals, false, - "Fiddle alpha figures", this->params()), - 
INT_MEMBER(crunch_leave_lc_strings, 4, - "Dont crunch words with long lower case strings", - this->params()), - INT_MEMBER(crunch_leave_uc_strings, 4, - "Dont crunch words with long lower case strings", - this->params()), - INT_MEMBER(crunch_long_repetitions, 3, - "Crunch words with long repetitions", this->params()), - INT_MEMBER(crunch_debug, 0, "As it says", this->params()), - INT_MEMBER(fixsp_non_noise_limit, 1, - "How many non-noise blbs either side?", this->params()), - double_MEMBER(fixsp_small_outlines_size, 0.28, - "Small if lt xht x this", this->params()), - BOOL_MEMBER(tessedit_prefer_joined_punct, false, - "Reward punctation joins", this->params()), - INT_MEMBER(fixsp_done_mode, 1, - "What constitues done for spacing", this->params()), - INT_MEMBER(debug_fix_space_level, 0, - "Contextual fixspace debug", this->params()), - STRING_MEMBER(numeric_punctuation, ".,", - "Punct. chs expected WITHIN numbers", this->params()), - INT_MEMBER(x_ht_acceptance_tolerance, 8, - "Max allowed deviation of blob top outside of font data", - this->params()), - INT_MEMBER(x_ht_min_change, 8, - "Min change in xht before actually trying it", this->params()), - INT_MEMBER(superscript_debug, 0, "Debug level for sub & superscript fixer", - this->params()), - double_MEMBER(superscript_worse_certainty, 2.0, "How many times worse " - "certainty does a superscript position glyph need to be for " - "us to try classifying it as a char with a different " - "baseline?", this->params()), - double_MEMBER(superscript_bettered_certainty, 0.97, "What reduction in " - "badness do we think sufficient to choose a superscript " - "over what we'd thought. For example, a value of 0.6 means " - "we want to reduce badness of certainty by at least 40%", - this->params()), - double_MEMBER(superscript_scaledown_ratio, 0.4, - "A superscript scaled down more than this is unbelievably " - "small. 
For example, 0.3 means we expect the font size to " - "be no smaller than 30% of the text line font size.", - this->params()), - double_MEMBER(subscript_max_y_top, 0.5, - "Maximum top of a character measured as a multiple of " - "x-height above the baseline for us to reconsider whether " - "it's a subscript.", this->params()), - double_MEMBER(superscript_min_y_bottom, 0.3, - "Minimum bottom of a character measured as a multiple of " - "x-height above the baseline for us to reconsider whether " - "it's a superscript.", this->params()), - BOOL_MEMBER(tessedit_write_block_separators, false, - "Write block separators in output", this->params()), - BOOL_MEMBER(tessedit_write_rep_codes, false, - "Write repetition char code", this->params()), - BOOL_MEMBER(tessedit_write_unlv, false, - "Write .unlv output file", this->params()), - BOOL_MEMBER(tessedit_create_txt, true, - "Write .txt output file", this->params()), - BOOL_MEMBER(tessedit_create_hocr, false, - "Write .html hOCR output file", this->params()), - BOOL_MEMBER(tessedit_create_pdf, false, - "Write .pdf output file", this->params()), - STRING_MEMBER(unrecognised_char, "|", - "Output char for unidentified blobs", this->params()), - INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()), - INT_MEMBER(suspect_space_level, 100, - "Min suspect level for rejecting spaces", this->params()), - INT_MEMBER(suspect_short_words, 2, - "Dont Suspect dict wds longer than this", this->params()), - BOOL_MEMBER(suspect_constrain_1Il, false, - "UNLV keep 1Il chars rejected", this->params()), - double_MEMBER(suspect_rating_per_ch, 999.9, - "Dont touch bad rating limit", this->params()), - double_MEMBER(suspect_accept_rating, -999.9, - "Accept good rating limit", this->params()), - BOOL_MEMBER(tessedit_minimal_rejection, false, - "Only reject tess failures", this->params()), - BOOL_MEMBER(tessedit_zero_rejection, false, - "Dont reject ANYTHING", this->params()), - BOOL_MEMBER(tessedit_word_for_word, false, - "Make output 
have exactly one word per WERD", this->params()), - BOOL_MEMBER(tessedit_zero_kelvin_rejection, false, - "Dont reject ANYTHING AT ALL", this->params()), - BOOL_MEMBER(tessedit_consistent_reps, true, - "Force all rep chars the same", this->params()), - INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm", this->params()), - BOOL_MEMBER(tessedit_rejection_debug, false, - "Adaption debug", this->params()), - BOOL_MEMBER(tessedit_flip_0O, true, - "Contextual 0O O0 flips", this->params()), - double_MEMBER(tessedit_lower_flip_hyphen, 1.5, - "Aspect ratio dot/hyphen test", this->params()), - double_MEMBER(tessedit_upper_flip_hyphen, 1.8, - "Aspect ratio dot/hyphen test", this->params()), - BOOL_MEMBER(rej_trust_doc_dawg, false, - "Use DOC dawg in 11l conf. detector", this->params()), - BOOL_MEMBER(rej_1Il_use_dict_word, false, - "Use dictword test", this->params()), - BOOL_MEMBER(rej_1Il_trust_permuter_type, true, - "Dont double check", this->params()), - BOOL_MEMBER(rej_use_tess_accepted, true, - "Individual rejection control", this->params()), - BOOL_MEMBER(rej_use_tess_blanks, true, - "Individual rejection control", this->params()), - BOOL_MEMBER(rej_use_good_perm, true, - "Individual rejection control", this->params()), - BOOL_MEMBER(rej_use_sensible_wd, false, - "Extend permuter check", this->params()), - BOOL_MEMBER(rej_alphas_in_number_perm, false, - "Extend permuter check", this->params()), - double_MEMBER(rej_whole_of_mostly_reject_word_fract, 0.85, - "if >this fract", this->params()), - INT_MEMBER(tessedit_image_border, 2, - "Rej blbs near image edge limit", this->params()), - STRING_MEMBER(ok_repeated_ch_non_alphanum_wds, "-?*\075", - "Allow NN to unrej", this->params()), - STRING_MEMBER(conflict_set_I_l_1, "Il1[]", - "Il1 conflict set", this->params()), - INT_MEMBER(min_sane_x_ht_pixels, 8, - "Reject any x-ht lt or eq than this", this->params()), - BOOL_MEMBER(tessedit_create_boxfile, false, - "Output text with boxes", this->params()), - 
INT_MEMBER(tessedit_page_number, -1, "-1 -> All pages" - " , else specifc page to process", this->params()), - BOOL_MEMBER(tessedit_write_images, false, - "Capture the image from the IPE", this->params()), - BOOL_MEMBER(interactive_display_mode, false, "Run interactively?", - this->params()), - STRING_MEMBER(file_type, ".tif", "Filename extension", this->params()), - BOOL_MEMBER(tessedit_override_permuter, true, - "According to dict_word", this->params()), - INT_MEMBER(tessdata_manager_debug_level, 0, "Debug level for" - " TessdataManager functions.", this->params()), - STRING_MEMBER(tessedit_load_sublangs, "", - "List of languages to load with this one", this->params()), - BOOL_MEMBER(tessedit_use_primary_params_model, false, - "In multilingual mode use params model of the" - " primary language", this->params()), - double_MEMBER(min_orientation_margin, 7.0, - "Min acceptable orientation margin", this->params()), - BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding", - this->params()), - BOOL_MEMBER(textord_use_cjk_fp_model, FALSE, "Use CJK fixed pitch model", - this->params()), - BOOL_MEMBER(poly_allow_detailed_fx, false, - "Allow feature extractors to see the original outline", - this->params()), - BOOL_INIT_MEMBER(tessedit_init_config_only, false, - "Only initialize with the config file. 
Useful if the " - "instance is not going to be used for OCR but say only " - "for layout analysis.", this->params()), - BOOL_MEMBER(textord_equation_detect, false, "Turn on equation detector", - this->params()), - BOOL_MEMBER(textord_tabfind_vertical_text, true, - "Enable vertical detection", this->params()), - BOOL_MEMBER(textord_tabfind_force_vertical_text, false, - "Force using vertical text page mode", this->params()), - double_MEMBER(textord_tabfind_vertical_text_ratio, 0.5, - "Fraction of textlines deemed vertical to use vertical page " - "mode", this->params()), - double_MEMBER(textord_tabfind_aligned_gap_fraction, 0.75, - "Fraction of height used as a minimum gap for aligned blobs.", - this->params()), - INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible", - this->params()), - BOOL_MEMBER(preserve_interword_spaces, false, - "Preserve multiple interword spaces", this->params()), - BOOL_MEMBER(include_page_breaks, FALSE, - "Include page separator string in output text after each " - "image/page.", this->params()), - STRING_MEMBER(page_separator, "\f", - "Page separator (default is form feed control character)", + : BOOL_MEMBER(tessedit_resegment_from_boxes, false, + "Take segmentation and labeling from box file", this->params()), + BOOL_MEMBER(tessedit_resegment_from_line_boxes, false, + "Conversion of word/line box file to char box file", + this->params()), + BOOL_MEMBER(tessedit_train_from_boxes, false, + "Generate training data from boxed chars", this->params()), + BOOL_MEMBER(tessedit_make_boxes_from_boxes, false, + "Generate more boxes from boxed chars", this->params()), + BOOL_MEMBER(tessedit_dump_pageseg_images, false, + "Dump intermediate images made during page segmentation", + this->params()), + // The default for pageseg_mode is the old behaviour, so as not to + // upset anything that relies on that. 
+ INT_MEMBER( + tessedit_pageseg_mode, PSM_SINGLE_BLOCK, + "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block," + " 5=line, 6=word, 7=char" + " (Values from PageSegMode enum in publictypes.h)", + this->params()), + INT_INIT_MEMBER(tessedit_ocr_engine_mode, tesseract::OEM_TESSERACT_ONLY, + "Which OCR engine(s) to run (Tesseract, Cube, both)." + " Defaults to loading and running only Tesseract" + " (no Cube,no combiner)." + " Values from OcrEngineMode enum in tesseractclass.h)", + this->params()), + STRING_MEMBER(tessedit_char_blacklist, "", + "Blacklist of chars not to recognize", this->params()), + STRING_MEMBER(tessedit_char_whitelist, "", + "Whitelist of chars to recognize", this->params()), + STRING_MEMBER(tessedit_char_unblacklist, "", + "List of chars to override tessedit_char_blacklist", + this->params()), + BOOL_MEMBER(tessedit_ambigs_training, false, + "Perform training for ambiguities", this->params()), + INT_MEMBER(pageseg_devanagari_split_strategy, + tesseract::ShiroRekhaSplitter::NO_SPLIT, + "Whether to use the top-line splitting process for Devanagari " + "documents while performing page-segmentation.", + this->params()), + INT_MEMBER(ocr_devanagari_split_strategy, + tesseract::ShiroRekhaSplitter::NO_SPLIT, + "Whether to use the top-line splitting process for Devanagari " + "documents while performing ocr.", + this->params()), + STRING_MEMBER(tessedit_write_params_to_file, "", + "Write all parameters to the given file.", this->params()), + BOOL_MEMBER(tessedit_adaption_debug, false, + "Generate and print debug" + " information for adaption", + this->params()), + INT_MEMBER(bidi_debug, 0, "Debug level for BiDi", this->params()), + INT_MEMBER(applybox_debug, 1, "Debug level", this->params()), + INT_MEMBER(applybox_page, 0, "Page number to apply boxes from", + this->params()), + STRING_MEMBER(applybox_exposure_pattern, ".exp", + "Exposure value follows" + " this pattern in the image filename. 
The name of the image" + " files are expected to be in the form" + " [lang].[fontname].exp[num].tif", + this->params()), + BOOL_MEMBER(applybox_learn_chars_and_char_frags_mode, false, + "Learn both character fragments (as is done in the" + " special low exposure mode) as well as unfragmented" + " characters.", + this->params()), + BOOL_MEMBER(applybox_learn_ngrams_mode, false, + "Each bounding box" + " is assumed to contain ngrams. Only learn the ngrams" + " whose outlines overlap horizontally.", + this->params()), + BOOL_MEMBER(tessedit_display_outwords, false, "Draw output words", + this->params()), + BOOL_MEMBER(tessedit_dump_choices, false, "Dump char choices", + this->params()), + BOOL_MEMBER(tessedit_timing_debug, false, "Print timing stats", + this->params()), + BOOL_MEMBER(tessedit_fix_fuzzy_spaces, true, + "Try to improve fuzzy spaces", this->params()), + BOOL_MEMBER(tessedit_unrej_any_wd, false, + "Dont bother with word plausibility", this->params()), + BOOL_MEMBER(tessedit_fix_hyphens, true, "Crunch double hyphens?", + this->params()), + BOOL_MEMBER(tessedit_redo_xheight, true, "Check/Correct x-height", + this->params()), + BOOL_MEMBER(tessedit_enable_doc_dict, true, + "Add words to the document dictionary", this->params()), + BOOL_MEMBER(tessedit_debug_fonts, false, "Output font info per char", + this->params()), + BOOL_MEMBER(tessedit_debug_block_rejection, false, "Block and Row stats", + this->params()), + BOOL_MEMBER(tessedit_enable_bigram_correction, true, + "Enable correction based on the word bigram dictionary.", + this->params()), + BOOL_MEMBER(tessedit_enable_dict_correction, false, + "Enable single word correction based on the dictionary.", + this->params()), + INT_MEMBER(tessedit_bigram_debug, 0, + "Amount of debug output for bigram correction.", + this->params()), + BOOL_MEMBER(enable_noise_removal, true, + "Remove and conditionally reassign small outlines when they" + " confuse layout analysis, determining diacritics vs noise", + 
this->params()), + INT_MEMBER(debug_noise_removal, 0, "Debug reassignment of small outlines", + this->params()), + // Worst (min) certainty, for which a diacritic is allowed to make the + // base + // character worse and still be included. + double_MEMBER(noise_cert_basechar, -8.0, + "Hingepoint for base char certainty", this->params()), + // Worst (min) certainty, for which a non-overlapping diacritic is allowed + // to make the base character worse and still be included. + double_MEMBER(noise_cert_disjoint, -1.0, + "Hingepoint for disjoint certainty", this->params()), + // Worst (min) certainty, for which a diacritic is allowed to make a new + // stand-alone blob. + double_MEMBER(noise_cert_punc, -3.0, + "Threshold for new punc char certainty", this->params()), + // Factor of certainty margin for adding diacritics to not count as worse. + double_MEMBER(noise_cert_factor, 0.375, + "Scaling on certainty diff from Hingepoint", + this->params()), + INT_MEMBER(noise_maxperblob, 8, "Max diacritics to apply to a blob", + this->params()), + INT_MEMBER(noise_maxperword, 16, "Max diacritics to apply to a word", + this->params()), + INT_MEMBER(debug_x_ht_level, 0, "Reestimate debug", this->params()), + BOOL_MEMBER(debug_acceptable_wds, false, "Dump word pass/fail chk", + this->params()), + STRING_MEMBER(chs_leading_punct, "('`\"", "Leading punctuation", + this->params()), + STRING_MEMBER(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation", + this->params()), + STRING_MEMBER(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation", + this->params()), + double_MEMBER(quality_rej_pc, 0.08, + "good_quality_doc lte rejection limit", this->params()), + double_MEMBER(quality_blob_pc, 0.0, + "good_quality_doc gte good blobs limit", this->params()), + double_MEMBER(quality_outline_pc, 1.0, + "good_quality_doc lte outline error limit", this->params()), + double_MEMBER(quality_char_pc, 0.95, + "good_quality_doc gte good char limit", this->params()), + 
INT_MEMBER(quality_min_initial_alphas_reqd, 2, "alphas in a good word", + this->params()), + INT_MEMBER(tessedit_tess_adaption_mode, 0x27, + "Adaptation decision algorithm for tess", this->params()), + BOOL_MEMBER(tessedit_minimal_rej_pass1, false, + "Do minimal rejection on pass 1 output", this->params()), + BOOL_MEMBER(tessedit_test_adaption, false, "Test adaption criteria", + this->params()), + BOOL_MEMBER(tessedit_matcher_log, false, "Log matcher activity", + this->params()), + INT_MEMBER(tessedit_test_adaption_mode, 3, + "Adaptation decision algorithm for tess", this->params()), + BOOL_MEMBER(test_pt, false, "Test for point", this->params()), + double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params()), + double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params()), + INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.", + this->params()), + BOOL_MEMBER(paragraph_text_based, true, + "Run paragraph detection on the post-text-recognition " + "(more accurate)", + this->params()), + INT_MEMBER(cube_debug_level, 0, "Print cube debug info.", this->params()), + STRING_MEMBER(outlines_odd, "%| ", "Non standard number of outlines", + this->params()), + STRING_MEMBER(outlines_2, "ij!?%\":;", "Non standard number of outlines", + this->params()), + BOOL_MEMBER(docqual_excuse_outline_errs, false, + "Allow outline errs in unrejection?", this->params()), + BOOL_MEMBER(tessedit_good_quality_unrej, true, + "Reduce rejection on good docs", this->params()), + BOOL_MEMBER(tessedit_use_reject_spaces, true, "Reject spaces?", + this->params()), + double_MEMBER(tessedit_reject_doc_percent, 65.00, + "%rej allowed before rej whole doc", this->params()), + double_MEMBER(tessedit_reject_block_percent, 45.00, + "%rej allowed before rej whole block", this->params()), + double_MEMBER(tessedit_reject_row_percent, 40.00, + "%rej allowed before rej whole row", this->params()), + double_MEMBER(tessedit_whole_wd_rej_row_percent, 70.00, + "Number of row rejects in whole word 
rejects" + "which prevents whole row rejection", + this->params()), + BOOL_MEMBER(tessedit_preserve_blk_rej_perfect_wds, true, + "Only rej partially rejected words in block rejection", + this->params()), + BOOL_MEMBER(tessedit_preserve_row_rej_perfect_wds, true, + "Only rej partially rejected words in row rejection", + this->params()), + BOOL_MEMBER(tessedit_dont_blkrej_good_wds, false, + "Use word segmentation quality metric", this->params()), + BOOL_MEMBER(tessedit_dont_rowrej_good_wds, false, + "Use word segmentation quality metric", this->params()), + INT_MEMBER(tessedit_preserve_min_wd_len, 2, + "Only preserve wds longer than this", this->params()), + BOOL_MEMBER(tessedit_row_rej_good_docs, true, + "Apply row rejection to good docs", this->params()), + double_MEMBER(tessedit_good_doc_still_rowrej_wd, 1.1, + "rej good doc wd if more than this fraction rejected", + this->params()), + BOOL_MEMBER(tessedit_reject_bad_qual_wds, true, + "Reject all bad quality wds", this->params()), + BOOL_MEMBER(tessedit_debug_doc_rejection, false, "Page stats", + this->params()), + BOOL_MEMBER(tessedit_debug_quality_metrics, false, + "Output data to debug file", this->params()), + BOOL_MEMBER(bland_unrej, false, "unrej potential with no chekcs", + this->params()), + double_MEMBER(quality_rowrej_pc, 1.1, + "good_quality_doc gte good char limit", this->params()), + BOOL_MEMBER(unlv_tilde_crunching, true, + "Mark v.bad words for tilde crunch", this->params()), + BOOL_MEMBER(hocr_font_info, false, "Add font info to hocr output", + this->params()), + BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?", + this->params()), + BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false, + "Take out ~^ early?", this->params()), + double_MEMBER(crunch_terrible_rating, 80.0, "crunch rating lt this", + this->params()), + BOOL_MEMBER(crunch_terrible_garbage, true, "As it says", this->params()), + double_MEMBER(crunch_poor_garbage_cert, -9.0, + "crunch garbage cert lt this", 
this->params()), + double_MEMBER(crunch_poor_garbage_rate, 60, + "crunch garbage rating lt this", this->params()), + double_MEMBER(crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this", + this->params()), + double_MEMBER(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this", + this->params()), + BOOL_MEMBER(crunch_pot_garbage, true, "POTENTIAL crunch garbage", + this->params()), + double_MEMBER(crunch_del_rating, 60, "POTENTIAL crunch rating lt this", + this->params()), + double_MEMBER(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this", + this->params()), + double_MEMBER(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this", + this->params()), + double_MEMBER(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this", + this->params()), + double_MEMBER(crunch_del_min_width, 3.0, + "Del if word width lt xht x this", this->params()), + double_MEMBER(crunch_del_high_word, 1.5, + "Del if word gt xht x this above bl", this->params()), + double_MEMBER(crunch_del_low_word, 0.5, + "Del if word gt xht x this below bl", this->params()), + double_MEMBER(crunch_small_outlines_size, 0.6, "Small if lt xht x this", + this->params()), + INT_MEMBER(crunch_rating_max, 10, "For adj length in rating per ch", + this->params()), + INT_MEMBER(crunch_pot_indicators, 1, + "How many potential indicators needed", this->params()), + BOOL_MEMBER(crunch_leave_ok_strings, true, "Dont touch sensible strings", + this->params()), + BOOL_MEMBER(crunch_accept_ok, true, "Use acceptability in okstring", + this->params()), + BOOL_MEMBER(crunch_leave_accept_strings, false, + "Dont pot crunch sensible strings", this->params()), + BOOL_MEMBER(crunch_include_numerals, false, "Fiddle alpha figures", + this->params()), + INT_MEMBER(crunch_leave_lc_strings, 4, + "Dont crunch words with long lower case strings", + this->params()), + INT_MEMBER(crunch_leave_uc_strings, 4, + "Dont crunch words with long lower case strings", + this->params()), + INT_MEMBER(crunch_long_repetitions, 3, + "Crunch words 
with long repetitions", this->params()), + INT_MEMBER(crunch_debug, 0, "As it says", this->params()), + INT_MEMBER(fixsp_non_noise_limit, 1, + "How many non-noise blbs either side?", this->params()), + double_MEMBER(fixsp_small_outlines_size, 0.28, "Small if lt xht x this", + this->params()), + BOOL_MEMBER(tessedit_prefer_joined_punct, false, + "Reward punctation joins", this->params()), + INT_MEMBER(fixsp_done_mode, 1, "What constitues done for spacing", + this->params()), + INT_MEMBER(debug_fix_space_level, 0, "Contextual fixspace debug", + this->params()), + STRING_MEMBER(numeric_punctuation, ".,", + "Punct. chs expected WITHIN numbers", this->params()), + INT_MEMBER(x_ht_acceptance_tolerance, 8, + "Max allowed deviation of blob top outside of font data", + this->params()), + INT_MEMBER(x_ht_min_change, 8, + "Min change in xht before actually trying it", this->params()), + INT_MEMBER(superscript_debug, 0, + "Debug level for sub & superscript fixer", this->params()), + double_MEMBER( + superscript_worse_certainty, 2.0, + "How many times worse " + "certainty does a superscript position glyph need to be for " + "us to try classifying it as a char with a different " + "baseline?", + this->params()), + double_MEMBER( + superscript_bettered_certainty, 0.97, + "What reduction in " + "badness do we think sufficient to choose a superscript " + "over what we'd thought. For example, a value of 0.6 means " + "we want to reduce badness of certainty by at least 40%", + this->params()), + double_MEMBER(superscript_scaledown_ratio, 0.4, + "A superscript scaled down more than this is unbelievably " + "small. 
For example, 0.3 means we expect the font size to " + "be no smaller than 30% of the text line font size.", + this->params()), + double_MEMBER(subscript_max_y_top, 0.5, + "Maximum top of a character measured as a multiple of " + "x-height above the baseline for us to reconsider whether " + "it's a subscript.", + this->params()), + double_MEMBER(superscript_min_y_bottom, 0.3, + "Minimum bottom of a character measured as a multiple of " + "x-height above the baseline for us to reconsider whether " + "it's a superscript.", + this->params()), + BOOL_MEMBER(tessedit_write_block_separators, false, + "Write block separators in output", this->params()), + BOOL_MEMBER(tessedit_write_rep_codes, false, "Write repetition char code", + this->params()), + BOOL_MEMBER(tessedit_write_unlv, false, "Write .unlv output file", + this->params()), + BOOL_MEMBER(tessedit_create_txt, true, "Write .txt output file", + this->params()), + BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file", + this->params()), + BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file", + this->params()), + STRING_MEMBER(unrecognised_char, "|", + "Output char for unidentified blobs", this->params()), + INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()), + INT_MEMBER(suspect_space_level, 100, + "Min suspect level for rejecting spaces", this->params()), + INT_MEMBER(suspect_short_words, 2, + "Dont Suspect dict wds longer than this", this->params()), + BOOL_MEMBER(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected", + this->params()), + double_MEMBER(suspect_rating_per_ch, 999.9, "Dont touch bad rating limit", + this->params()), + double_MEMBER(suspect_accept_rating, -999.9, "Accept good rating limit", + this->params()), + BOOL_MEMBER(tessedit_minimal_rejection, false, + "Only reject tess failures", this->params()), + BOOL_MEMBER(tessedit_zero_rejection, false, "Dont reject ANYTHING", + this->params()), + BOOL_MEMBER(tessedit_word_for_word, false, + "Make 
output have exactly one word per WERD", this->params()), + BOOL_MEMBER(tessedit_zero_kelvin_rejection, false, + "Dont reject ANYTHING AT ALL", this->params()), + BOOL_MEMBER(tessedit_consistent_reps, true, + "Force all rep chars the same", this->params()), + INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm", + this->params()), + BOOL_MEMBER(tessedit_rejection_debug, false, "Adaption debug", + this->params()), + BOOL_MEMBER(tessedit_flip_0O, true, "Contextual 0O O0 flips", + this->params()), + double_MEMBER(tessedit_lower_flip_hyphen, 1.5, + "Aspect ratio dot/hyphen test", this->params()), + double_MEMBER(tessedit_upper_flip_hyphen, 1.8, + "Aspect ratio dot/hyphen test", this->params()), + BOOL_MEMBER(rej_trust_doc_dawg, false, + "Use DOC dawg in 11l conf. detector", this->params()), + BOOL_MEMBER(rej_1Il_use_dict_word, false, "Use dictword test", + this->params()), + BOOL_MEMBER(rej_1Il_trust_permuter_type, true, "Dont double check", + this->params()), + BOOL_MEMBER(rej_use_tess_accepted, true, "Individual rejection control", + this->params()), + BOOL_MEMBER(rej_use_tess_blanks, true, "Individual rejection control", + this->params()), + BOOL_MEMBER(rej_use_good_perm, true, "Individual rejection control", + this->params()), + BOOL_MEMBER(rej_use_sensible_wd, false, "Extend permuter check", + this->params()), + BOOL_MEMBER(rej_alphas_in_number_perm, false, "Extend permuter check", + this->params()), + double_MEMBER(rej_whole_of_mostly_reject_word_fract, 0.85, + "if >this fract", this->params()), + INT_MEMBER(tessedit_image_border, 2, "Rej blbs near image edge limit", + this->params()), + STRING_MEMBER(ok_repeated_ch_non_alphanum_wds, "-?*\075", + "Allow NN to unrej", this->params()), + STRING_MEMBER(conflict_set_I_l_1, "Il1[]", "Il1 conflict set", + this->params()), + INT_MEMBER(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this", + this->params()), + BOOL_MEMBER(tessedit_create_boxfile, false, "Output text with boxes", + this->params()), + 
INT_MEMBER(tessedit_page_number, -1, + "-1 -> All pages" + " , else specifc page to process", + this->params()), + BOOL_MEMBER(tessedit_write_images, false, + "Capture the image from the IPE", this->params()), + BOOL_MEMBER(interactive_display_mode, false, "Run interactively?", + this->params()), + STRING_MEMBER(file_type, ".tif", "Filename extension", this->params()), + BOOL_MEMBER(tessedit_override_permuter, true, "According to dict_word", + this->params()), + INT_MEMBER(tessdata_manager_debug_level, 0, + "Debug level for" + " TessdataManager functions.", + this->params()), + STRING_MEMBER(tessedit_load_sublangs, "", + "List of languages to load with this one", this->params()), + BOOL_MEMBER(tessedit_use_primary_params_model, false, + "In multilingual mode use params model of the" + " primary language", + this->params()), + double_MEMBER(min_orientation_margin, 7.0, + "Min acceptable orientation margin", this->params()), + BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding", + this->params()), + BOOL_MEMBER(textord_use_cjk_fp_model, FALSE, "Use CJK fixed pitch model", + this->params()), + BOOL_MEMBER(poly_allow_detailed_fx, false, + "Allow feature extractors to see the original outline", + this->params()), + BOOL_INIT_MEMBER(tessedit_init_config_only, false, + "Only initialize with the config file. 
Useful if the " + "instance is not going to be used for OCR but say only " + "for layout analysis.", + this->params()), + BOOL_MEMBER(textord_equation_detect, false, "Turn on equation detector", + this->params()), + BOOL_MEMBER(textord_tabfind_vertical_text, true, + "Enable vertical detection", this->params()), + BOOL_MEMBER(textord_tabfind_force_vertical_text, false, + "Force using vertical text page mode", this->params()), + double_MEMBER( + textord_tabfind_vertical_text_ratio, 0.5, + "Fraction of textlines deemed vertical to use vertical page " + "mode", + this->params()), + double_MEMBER( + textord_tabfind_aligned_gap_fraction, 0.75, + "Fraction of height used as a minimum gap for aligned blobs.", + this->params()), + INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible", + this->params()), + BOOL_MEMBER(preserve_interword_spaces, false, + "Preserve multiple interword spaces", this->params()), + BOOL_MEMBER(include_page_breaks, FALSE, + "Include page separator string in output text after each " + "image/page.", + this->params()), + STRING_MEMBER(page_separator, "\f", + "Page separator (default is form feed control character)", + this->params()), - // The following parameters were deprecated and removed from their original - // locations. The parameters are temporarily kept here to give Tesseract - // users a chance to updated their [lang].traineddata and config files - // without introducing failures during Tesseract initialization. - // TODO(ocr-team): remove these parameters from the code once we are - // reasonably sure that Tesseract users have updated their data files. - // - // BEGIN DEPRECATED PARAMETERS - BOOL_MEMBER(textord_tabfind_vertical_horizontal_mix, true, - "find horizontal lines such as headers in vertical page mode", - this->params()), - INT_MEMBER(tessedit_ok_mode, 5, - "Acceptance decision algorithm", this->params()), - BOOL_INIT_MEMBER(load_fixed_length_dawgs, true, "Load fixed length dawgs" - " (e.g. 
for non-space delimited languages)", - this->params()), - INT_MEMBER(segment_debug, 0, "Debug the whole segmentation process", - this->params()), - BOOL_MEMBER(permute_debug, 0, "Debug char permutation process", - this->params()), - double_MEMBER(bestrate_pruning_factor, 2.0, "Multiplying factor of" - " current best rate to prune other hypotheses", - this->params()), - BOOL_MEMBER(permute_script_word, 0, - "Turn on word script consistency permuter", - this->params()), - BOOL_MEMBER(segment_segcost_rating, 0, - "incorporate segmentation cost in word rating?", - this->params()), - double_MEMBER(segment_reward_script, 0.95, - "Score multipler for script consistency within a word. " - "Being a 'reward' factor, it should be <= 1. " - "Smaller value implies bigger reward.", - this->params()), - BOOL_MEMBER(permute_fixed_length_dawg, 0, - "Turn on fixed-length phrasebook search permuter", - this->params()), - BOOL_MEMBER(permute_chartype_word, 0, - "Turn on character type (property) consistency permuter", - this->params()), - double_MEMBER(segment_reward_chartype, 0.97, - "Score multipler for char type consistency within a word. ", - this->params()), - double_MEMBER(segment_reward_ngram_best_choice, 0.99, - "Score multipler for ngram permuter's best choice" - " (only used in the Han script path).", - this->params()), - BOOL_MEMBER(ngram_permuter_activated, false, - "Activate character-level n-gram-based permuter", - this->params()), - BOOL_MEMBER(permute_only_top, false, "Run only the top choice permuter", - this->params()), - INT_MEMBER(language_model_fixed_length_choices_depth, 3, - "Depth of blob choice lists to explore" - " when fixed length dawgs are on", - this->params()), - BOOL_MEMBER(use_new_state_cost, FALSE, - "use new state cost heuristics for segmentation state" - " evaluation", this->params()), - double_MEMBER(heuristic_segcost_rating_base, 1.25, - "base factor for adding segmentation cost into word rating." 
- "It's a multiplying factor, the larger the value above 1, " - "the bigger the effect of segmentation cost.", - this->params()), - double_MEMBER(heuristic_weight_rating, 1.0, - "weight associated with char rating in combined cost of" - "state", this->params()), - double_MEMBER(heuristic_weight_width, 1000.0, - "weight associated with width evidence in combined cost of" - " state", this->params()), - double_MEMBER(heuristic_weight_seamcut, 0.0, - "weight associated with seam cut in combined cost of state", - this->params()), - double_MEMBER(heuristic_max_char_wh_ratio, 2.0, - "max char width-to-height ratio allowed in segmentation", - this->params()), - BOOL_MEMBER(enable_new_segsearch, true, - "Enable new segmentation search path.", this->params()), - double_MEMBER(segsearch_max_fixed_pitch_char_wh_ratio, 2.0, - "Maximum character width-to-height ratio for" - " fixed-pitch fonts", - this->params()), - // END DEPRECATED PARAMETERS + // The following parameters were deprecated and removed from their + // original + // locations. The parameters are temporarily kept here to give Tesseract + // users a chance to update their [lang].traineddata and config files + // without introducing failures during Tesseract initialization. + // TODO(ocr-team): remove these parameters from the code once we are + // reasonably sure that Tesseract users have updated their data files. + // + // BEGIN DEPRECATED PARAMETERS + BOOL_MEMBER(textord_tabfind_vertical_horizontal_mix, true, + "find horizontal lines such as headers in vertical page mode", + this->params()), + INT_MEMBER(tessedit_ok_mode, 5, "Acceptance decision algorithm", + this->params()), + BOOL_INIT_MEMBER(load_fixed_length_dawgs, true, + "Load fixed length dawgs" + " (e.g. 
for non-space delimited languages)", + this->params()), + INT_MEMBER(segment_debug, 0, "Debug the whole segmentation process", + this->params()), + BOOL_MEMBER(permute_debug, 0, "Debug char permutation process", + this->params()), + double_MEMBER(bestrate_pruning_factor, 2.0, + "Multiplying factor of" + " current best rate to prune other hypotheses", + this->params()), + BOOL_MEMBER(permute_script_word, 0, + "Turn on word script consistency permuter", this->params()), + BOOL_MEMBER(segment_segcost_rating, 0, + "incorporate segmentation cost in word rating?", + this->params()), + double_MEMBER(segment_reward_script, 0.95, + "Score multipler for script consistency within a word. " + "Being a 'reward' factor, it should be <= 1. " + "Smaller value implies bigger reward.", + this->params()), + BOOL_MEMBER(permute_fixed_length_dawg, 0, + "Turn on fixed-length phrasebook search permuter", + this->params()), + BOOL_MEMBER(permute_chartype_word, 0, + "Turn on character type (property) consistency permuter", + this->params()), + double_MEMBER(segment_reward_chartype, 0.97, + "Score multipler for char type consistency within a word. ", + this->params()), + double_MEMBER(segment_reward_ngram_best_choice, 0.99, + "Score multipler for ngram permuter's best choice" + " (only used in the Han script path).", + this->params()), + BOOL_MEMBER(ngram_permuter_activated, false, + "Activate character-level n-gram-based permuter", + this->params()), + BOOL_MEMBER(permute_only_top, false, "Run only the top choice permuter", + this->params()), + INT_MEMBER(language_model_fixed_length_choices_depth, 3, + "Depth of blob choice lists to explore" + " when fixed length dawgs are on", + this->params()), + BOOL_MEMBER(use_new_state_cost, FALSE, + "use new state cost heuristics for segmentation state" + " evaluation", + this->params()), + double_MEMBER(heuristic_segcost_rating_base, 1.25, + "base factor for adding segmentation cost into word rating." 
+ "It's a multiplying factor, the larger the value above 1, " + "the bigger the effect of segmentation cost.", + this->params()), + double_MEMBER(heuristic_weight_rating, 1.0, + "weight associated with char rating in combined cost of" + "state", + this->params()), + double_MEMBER(heuristic_weight_width, 1000.0, + "weight associated with width evidence in combined cost of" + " state", + this->params()), + double_MEMBER(heuristic_weight_seamcut, 0.0, + "weight associated with seam cut in combined cost of state", + this->params()), + double_MEMBER(heuristic_max_char_wh_ratio, 2.0, + "max char width-to-height ratio allowed in segmentation", + this->params()), + BOOL_MEMBER(enable_new_segsearch, true, + "Enable new segmentation search path.", this->params()), + double_MEMBER(segsearch_max_fixed_pitch_char_wh_ratio, 2.0, + "Maximum character width-to-height ratio for" + " fixed-pitch fonts", + this->params()), + // END DEPRECATED PARAMETERS - backup_config_file_(NULL), - pix_binary_(NULL), - cube_binary_(NULL), - pix_grey_(NULL), - pix_thresholds_(NULL), - source_resolution_(0), - textord_(this), - right_to_left_(false), - scaled_color_(NULL), - scaled_factor_(-1), - deskew_(1.0f, 0.0f), - reskew_(1.0f, 0.0f), - most_recently_used_(this), - font_table_size_(0), + backup_config_file_(NULL), + pix_binary_(NULL), + cube_binary_(NULL), + pix_grey_(NULL), + pix_thresholds_(NULL), + source_resolution_(0), + textord_(this), + right_to_left_(false), + scaled_color_(NULL), + scaled_factor_(-1), + deskew_(1.0f, 0.0f), + reskew_(1.0f, 0.0f), + most_recently_used_(this), + font_table_size_(0), #ifndef ANDROID_BUILD - cube_cntxt_(NULL), - tess_cube_combiner_(NULL), + cube_cntxt_(NULL), + tess_cube_combiner_(NULL), #endif - equ_detect_(NULL) { + equ_detect_(NULL) { } Tesseract::~Tesseract() { diff --git a/ccmain/tesseractclass.h b/ccmain/tesseractclass.h index bd03fff642..d488fd30f3 100644 --- a/ccmain/tesseractclass.h +++ b/ccmain/tesseractclass.h @@ -283,8 +283,8 @@ class Tesseract 
: public Wordrec { int SegmentPage(const STRING* input_file, BLOCK_LIST* blocks, Tesseract* osd_tess, OSResults* osr); void SetupWordScripts(BLOCK_LIST* blocks); - int AutoPageSeg(PageSegMode pageseg_mode, - BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks, + int AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST* blocks, + TO_BLOCK_LIST* to_blocks, BLOBNBOX_LIST* diacritic_blobs, Tesseract* osd_tess, OSResults* osr); ColumnFinder* SetupPageSegAndDetectOrientation( bool single_column, bool osd, bool only_osd, @@ -328,8 +328,46 @@ class Tesseract : public Wordrec { WordRecognizer recognizer, WERD_RES** in_word, PointerVector* best_words); - void classify_word_and_language(WordRecognizer recognizer, - PAGE_RES_IT* pr_it, + // Moves good-looking "noise"/diacritics from the reject list to the main + // blob list on the current word. Returns true if anything was done, and + // sets make_next_word_fuzzy if blob(s) were added to the end of the word. + bool ReassignDiacritics(int pass, PAGE_RES_IT* pr_it, + bool* make_next_word_fuzzy); + // Attempts to put noise/diacritic outlines into the blobs that they overlap. + // Input: a set of noisy outlines that probably belong to the real_word. + // Output: outlines that overlapped blobs are set to NULL and put back into + // the word, either in the blobs or in the reject list. + void AssignDiacriticsToOverlappingBlobs( + const GenericVector& outlines, int pass, WERD* real_word, + PAGE_RES_IT* pr_it, GenericVector* word_wanted, + GenericVector* overlapped_any_blob, + GenericVector* target_blobs); + // Attempts to assign non-overlapping outlines to their nearest blobs or + // make new blobs out of them. 
+ void AssignDiacriticsToNewBlobs(const GenericVector& outlines, + int pass, WERD* real_word, PAGE_RES_IT* pr_it, + GenericVector* word_wanted, + GenericVector* target_blobs); + // Starting with ok_outlines set to indicate which outlines overlap the blob, + // chooses the optimal set (approximately) and returns true if any outlines + // are desired, in which case ok_outlines indicates which ones. + bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, + PAGE_RES_IT* pr_it, C_BLOB* blob, + const GenericVector& outlines, + int num_outlines, + GenericVector* ok_outlines); + // Classifies the given blob plus the outlines flagged by ok_outlines, undoes + // the inclusion of the outlines, and returns the certainty of the raw choice. + float ClassifyBlobPlusOutlines(const GenericVector& ok_outlines, + const GenericVector& outlines, + int pass_n, PAGE_RES_IT* pr_it, C_BLOB* blob, + STRING* best_str); + // Classifies the given blob (part of word_data->word->word) as an individual + // word, using languages, chopper etc, returning only the certainty of the + // best raw choice, and undoing all the work done to fake out the word. + float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT* pr_it, C_BLOB* blob, + STRING* best_str, float* c2); + void classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it, WordData* word_data); void classify_word_pass1(const WordData& word_data, WERD_RES** in_word, @@ -808,6 +846,24 @@ class Tesseract : public Wordrec { "Enable single word correction based on the dictionary."); INT_VAR_H(tessedit_bigram_debug, 0, "Amount of debug output for bigram " "correction."); + BOOL_VAR_H(enable_noise_removal, true, + "Remove and conditionally reassign small outlines when they" + " confuse layout analysis, determining diacritics vs noise"); + INT_VAR_H(debug_noise_removal, 0, "Debug reassignment of small outlines"); + // Worst (min) certainty, for which a diacritic is allowed to make the base + // character worse and still be included. 
+ double_VAR_H(noise_cert_basechar, -8.0, "Hingepoint for base char certainty"); + // Worst (min) certainty, for which a non-overlapping diacritic is allowed to + // make the base character worse and still be included. + double_VAR_H(noise_cert_disjoint, -2.5, "Hingepoint for disjoint certainty"); + // Worst (min) certainty, for which a diacritic is allowed to make a new + // stand-alone blob. + double_VAR_H(noise_cert_punc, -2.5, "Threshold for new punc char certainty"); + // Factor of certainty margin for adding diacritics to not count as worse. + double_VAR_H(noise_cert_factor, 0.375, + "Scaling on certainty diff from Hingepoint"); + INT_VAR_H(noise_maxperblob, 8, "Max diacritics to apply to a blob"); + INT_VAR_H(noise_maxperword, 16, "Max diacritics to apply to a word"); INT_VAR_H(debug_x_ht_level, 0, "Reestimate debug"); BOOL_VAR_H(debug_acceptable_wds, false, "Dump word pass/fail chk"); STRING_VAR_H(chs_leading_punct, "('`\"", "Leading punctuation"); diff --git a/ccstruct/blobbox.h b/ccstruct/blobbox.h index bd26e1be95..b09d82f4da 100644 --- a/ccstruct/blobbox.h +++ b/ccstruct/blobbox.h @@ -137,6 +137,9 @@ class BLOBNBOX:public ELIST_LINK cblob_ptr = srcblob; area = static_cast(srcblob->area()); } + ~BLOBNBOX() { + if (owns_cblob_) delete cblob_ptr; + } static BLOBNBOX* RealBlob(C_OUTLINE* outline) { C_BLOB* blob = new C_BLOB(outline); return new BLOBNBOX(blob); @@ -387,6 +390,7 @@ class BLOBNBOX:public ELIST_LINK void set_base_char_blob(BLOBNBOX* blob) { base_char_blob_ = blob; } + void set_owns_cblob(bool value) { owns_cblob_ = value; } bool UniquelyVertical() const { return vert_possible_ && !horz_possible_; @@ -450,6 +454,7 @@ class BLOBNBOX:public ELIST_LINK // construction time. void ConstructionInit() { cblob_ptr = NULL; + owns_cblob_ = false; area = 0; area_stroke_width_ = 0.0f; horz_stroke_width_ = 0.0f; @@ -525,6 +530,10 @@ class BLOBNBOX:public ELIST_LINK bool vert_possible_; // Could be part of vertical flow. 
bool leader_on_left_; // There is a leader to the left. bool leader_on_right_; // There is a leader to the right. + // Iff true, then the destructor should delete the cblob_ptr. + // TODO(rays) migrate all uses to correctly setting this flag instead of + // deleting the C_BLOB before deleting the BLOBNBOX. + bool owns_cblob_; }; class TO_ROW: public ELIST2_LINK diff --git a/ccstruct/ocrblock.cpp b/ccstruct/ocrblock.cpp index a328e03887..ad7893b05a 100644 --- a/ccstruct/ocrblock.cpp +++ b/ccstruct/ocrblock.cpp @@ -86,6 +86,18 @@ void BLOCK::rotate(const FCOORD& rotation) { box = *poly_block()->bounding_box(); } +// Returns the bounding box including the desired combination of upper and +// lower noise/diacritic elements. +TBOX BLOCK::restricted_bounding_box(bool upper_dots, bool lower_dots) const { + TBOX box; + // This is a read-only iteration of the rows in the block. + ROW_IT it(const_cast(&rows)); + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + box += it.data()->restricted_bounding_box(upper_dots, lower_dots); + } + return box; +} + /** * BLOCK::reflect_polygon_in_y_axis * diff --git a/ccstruct/ocrblock.h b/ccstruct/ocrblock.h index 207c1e8579..c93aaf8a4c 100644 --- a/ccstruct/ocrblock.h +++ b/ccstruct/ocrblock.h @@ -161,10 +161,14 @@ class BLOCK:public ELIST_LINK, public PDBLK median_size_.set_y(y); } - Pix* render_mask() { - return PDBLK::render_mask(re_rotation_); + Pix* render_mask(TBOX* mask_box) { + return PDBLK::render_mask(re_rotation_, mask_box); } + // Returns the bounding box including the desired combination of upper and + // lower noise/diacritic elements. + TBOX restricted_bounding_box(bool upper_dots, bool lower_dots) const; + // Reflects the polygon in the y-axis and recomputes the bounding_box. // Does nothing to any contained rows/words/blobs etc. 
void reflect_polygon_in_y_axis(); diff --git a/ccstruct/ocrrow.cpp b/ccstruct/ocrrow.cpp index a7ad6ba791..c6f919ca12 100644 --- a/ccstruct/ocrrow.cpp +++ b/ccstruct/ocrrow.cpp @@ -80,6 +80,17 @@ ROW::ROW( //constructor rmargin_ = 0; } +// Returns the bounding box including the desired combination of upper and +// lower noise/diacritic elements. +TBOX ROW::restricted_bounding_box(bool upper_dots, bool lower_dots) const { + TBOX box; + // This is a read-only iteration of the words in the row. + WERD_IT it(const_cast(&words)); + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + box += it.data()->restricted_bounding_box(upper_dots, lower_dots); + } + return box; +} /********************************************************************** * ROW::recalc_bounding_box diff --git a/ccstruct/ocrrow.h b/ccstruct/ocrrow.h index 1a23889279..45384b710f 100644 --- a/ccstruct/ocrrow.h +++ b/ccstruct/ocrrow.h @@ -85,6 +85,9 @@ class ROW:public ELIST_LINK TBOX bounding_box() const { //return bounding box return bound_box; } + // Returns the bounding box including the desired combination of upper and + // lower noise/diacritic elements. + TBOX restricted_bounding_box(bool upper_dots, bool lower_dots) const; void set_lmargin(inT16 lmargin) { lmargin_ = lmargin; diff --git a/ccstruct/pageres.cpp b/ccstruct/pageres.cpp index 5304451929..9c1b13c5c3 100644 --- a/ccstruct/pageres.cpp +++ b/ccstruct/pageres.cpp @@ -1258,23 +1258,16 @@ int PAGE_RES_IT::cmp(const PAGE_RES_IT &other) const { return 0; } -// Inserts the new_word and a corresponding WERD_RES before the current -// position. The simple fields of the WERD_RES are copied from clone_res and -// the resulting WERD_RES is returned for further setup with best_choice etc. +// Inserts the new_word as a combination owned by a corresponding WERD_RES +// before the current position. The simple fields of the WERD_RES are copied +// from clone_res and the resulting WERD_RES is returned for further setup +// with best_choice etc. 
WERD_RES* PAGE_RES_IT::InsertSimpleCloneWord(const WERD_RES& clone_res, WERD* new_word) { - // Insert new_word into the ROW. - WERD_IT w_it(row()->row->word_list()); - for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { - WERD* word = w_it.data(); - if (word == word_res->word) - break; - } - ASSERT_HOST(!w_it.cycled_list()); - w_it.add_before_then_move(new_word); // Make a WERD_RES for the new_word. WERD_RES* new_res = new WERD_RES(new_word); new_res->CopySimpleFields(clone_res); + new_res->combination = true; // Insert into the appropriate place in the ROW_RES. WERD_RES_IT wr_it(&row()->word_res_list); for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) { @@ -1477,6 +1470,33 @@ void PAGE_RES_IT::DeleteCurrentWord() { ResetWordIterator(); } +// Makes the current word a fuzzy space if not already fuzzy. Updates +// corresponding part of combo if required. +void PAGE_RES_IT::MakeCurrentWordFuzzy() { + WERD* real_word = word_res->word; + if (!real_word->flag(W_FUZZY_SP) && !real_word->flag(W_FUZZY_NON)) { + real_word->set_flag(W_FUZZY_SP, true); + tprintf("Made word fuzzy at:"); + real_word->bounding_box().print(); + if (word_res->combination) { + // The next word should be the corresponding part of combo, but we have + // already stepped past it, so find it by search. 
+ WERD_RES_IT wr_it(&row()->word_res_list); + for (wr_it.mark_cycle_pt(); + !wr_it.cycled_list() && wr_it.data() != word_res; wr_it.forward()) { + } + wr_it.forward(); + ASSERT_HOST(wr_it.data()->part_of_combo); + real_word = wr_it.data()->word; + ASSERT_HOST(!real_word->flag(W_FUZZY_SP) && + !real_word->flag(W_FUZZY_NON)); + real_word->set_flag(W_FUZZY_SP, true); + tprintf("Made part of combo word fuzzy at:"); + real_word->bounding_box().print(); + } + } +} + /************************************************************************* * PAGE_RES_IT::restart_page * diff --git a/ccstruct/pageres.h b/ccstruct/pageres.h index 75798113d4..a6a8404275 100644 --- a/ccstruct/pageres.h +++ b/ccstruct/pageres.h @@ -708,6 +708,10 @@ class PAGE_RES_IT { // Deletes the current WERD_RES and its underlying WERD. void DeleteCurrentWord(); + // Makes the current word a fuzzy space if not already fuzzy. Updates + // corresponding part of combo if required. + void MakeCurrentWordFuzzy(); + WERD_RES *forward() { // Get next word. return internal_forward(false, false); } @@ -747,9 +751,9 @@ class PAGE_RES_IT { return next_block_res; } void rej_stat_word(); // for page/block/row + void ResetWordIterator(); private: - void ResetWordIterator(); WERD_RES *internal_forward(bool new_block, bool empty_ok); WERD_RES * prev_word_res; // previous word diff --git a/ccstruct/pdblock.cpp b/ccstruct/pdblock.cpp index 97365b53e7..cf3289f2e7 100644 --- a/ccstruct/pdblock.cpp +++ b/ccstruct/pdblock.cpp @@ -77,7 +77,6 @@ void PDBLK::set_sides( //set vertex lists right_it.add_list_before (right); } - /********************************************************************** * PDBLK::contains * @@ -126,7 +125,7 @@ void PDBLK::move( // reposition block // Returns a binary Pix mask with a 1 pixel for every pixel within the // block. Rotates the coordinate system by rerotation prior to rendering. 
-Pix* PDBLK::render_mask(const FCOORD& rerotation) { +Pix* PDBLK::render_mask(const FCOORD& rerotation, TBOX* mask_box) { TBOX rotated_box(box); rotated_box.rotate(rerotation); Pix* pix = pixCreate(rotated_box.width(), rotated_box.height(), 1); @@ -163,6 +162,7 @@ Pix* PDBLK::render_mask(const FCOORD& rerotation) { pixRasterop(pix, 0, 0, rotated_box.width(), rotated_box.height(), PIX_SET, NULL, 0, 0); } + if (mask_box != NULL) *mask_box = rotated_box; return pix; } diff --git a/ccstruct/pdblock.h b/ccstruct/pdblock.h index 34f5518e3c..0dd0bf2ef8 100644 --- a/ccstruct/pdblock.h +++ b/ccstruct/pdblock.h @@ -89,7 +89,9 @@ class PDBLK // Returns a binary Pix mask with a 1 pixel for every pixel within the // block. Rotates the coordinate system by rerotation prior to rendering. - Pix* render_mask(const FCOORD& rerotation); + // If not NULL, mask_box is filled with the position box of the returned + // mask image. + Pix *render_mask(const FCOORD &rerotation, TBOX *mask_box); #ifndef GRAPHICS_DISABLED ///draw histogram diff --git a/ccstruct/werd.cpp b/ccstruct/werd.cpp index 24c8a41b33..aaaee9cc23 100644 --- a/ccstruct/werd.cpp +++ b/ccstruct/werd.cpp @@ -160,23 +160,37 @@ WERD* WERD::ConstructFromSingleBlob(bool bol, bool eol, C_BLOB* blob) { * row being marked as FUZZY space. */ -TBOX WERD::bounding_box() { - TBOX box; // box being built - C_BLOB_IT rej_cblob_it = &rej_cblobs; // rejected blobs - - for (rej_cblob_it.mark_cycle_pt(); !rej_cblob_it.cycled_list(); - rej_cblob_it.forward()) { - box += rej_cblob_it.data()->bounding_box(); +TBOX WERD::bounding_box() const { return restricted_bounding_box(true, true); } + +// Returns the bounding box including the desired combination of upper and +// lower noise/diacritic elements. +TBOX WERD::restricted_bounding_box(bool upper_dots, bool lower_dots) const { + TBOX box = true_bounding_box(); + int bottom = box.bottom(); + int top = box.top(); + // This is a read-only iteration of the rejected blobs. 
+ C_BLOB_IT it(const_cast(&rej_cblobs)); + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + TBOX dot_box = it.data()->bounding_box(); + if ((upper_dots || dot_box.bottom() <= top) && + (lower_dots || dot_box.top() >= bottom)) { + box += dot_box; + } } + return box; +} - C_BLOB_IT it = &cblobs; // blobs of WERD +// Returns the bounding box of only the good blobs. +TBOX WERD::true_bounding_box() const { + TBOX box; // box being built + // This is a read-only iteration of the good blobs. + C_BLOB_IT it(const_cast(&cblobs)); for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { box += it.data()->bounding_box(); } return box; } - /** * WERD::move * @@ -489,3 +503,101 @@ WERD* WERD::ConstructWerdWithNewBlobs(C_BLOB_LIST* all_blobs, } return new_werd; } + +// Removes noise from the word by moving small outlines to the rej_cblobs +// list, based on the size_threshold. +void WERD::CleanNoise(float size_threshold) { + C_BLOB_IT blob_it(&cblobs); + C_BLOB_IT rej_it(&rej_cblobs); + for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { + C_BLOB* blob = blob_it.data(); + C_OUTLINE_IT ol_it(blob->out_list()); + for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) { + C_OUTLINE* outline = ol_it.data(); + TBOX ol_box = outline->bounding_box(); + int ol_size = + ol_box.width() > ol_box.height() ? ol_box.width() : ol_box.height(); + if (ol_size < size_threshold) { + // This outline is too small. Move it to a separate blob in the + // reject blobs list. + C_BLOB* rej_blob = new C_BLOB(ol_it.extract()); + rej_it.add_after_then_move(rej_blob); + } + } + if (blob->out_list()->empty()) delete blob_it.extract(); + } +} + +// Extracts all the noise outlines and stuffs the pointers into the given +// vector of outlines. Afterwards, the outlines vector owns the pointers. 
+void WERD::GetNoiseOutlines(GenericVector* outlines) { + C_BLOB_IT rej_it(&rej_cblobs); + for (rej_it.mark_cycle_pt(); !rej_it.empty(); rej_it.forward()) { + C_BLOB* blob = rej_it.extract(); + C_OUTLINE_IT ol_it(blob->out_list()); + outlines->push_back(ol_it.extract()); + delete blob; + } +} + +// Adds the selected outlines to the indicated real blobs, and puts the rest +// back in rej_cblobs where they came from. Where the target_blobs entry is +// NULL, a run of wanted outlines is put into a single new blob. +// Ownership of the outlines is transferred back to the word. (Hence +// GenericVector and not PointerVector.) +// Returns true if any new blob was added to the start of the word, which +// suggests that it might need joining to the word before it, and likewise +// sets make_next_word_fuzzy true if any new blob was added to the end. +bool WERD::AddSelectedOutlines(const GenericVector& wanted, + const GenericVector& target_blobs, + const GenericVector& outlines, + bool* make_next_word_fuzzy) { + bool outline_added_to_start = false; + if (make_next_word_fuzzy != NULL) *make_next_word_fuzzy = false; + C_BLOB_IT rej_it(&rej_cblobs); + for (int i = 0; i < outlines.size(); ++i) { + C_OUTLINE* outline = outlines[i]; + if (outline == NULL) continue; // Already used it. + if (wanted[i]) { + C_BLOB* target_blob = target_blobs[i]; + TBOX noise_box = outline->bounding_box(); + if (target_blob == NULL) { + target_blob = new C_BLOB(outline); + // Need to find the insertion point. + C_BLOB_IT blob_it(&cblobs); + for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); + blob_it.forward()) { + C_BLOB* blob = blob_it.data(); + TBOX blob_box = blob->bounding_box(); + if (blob_box.left() > noise_box.left()) { + if (blob_it.at_first() && !flag(W_FUZZY_SP) && !flag(W_FUZZY_NON)) { + // We might want to join this word to its predecessor. 
+ outline_added_to_start = true; + } + blob_it.add_before_stay_put(target_blob); + break; + } + } + if (blob_it.cycled_list()) { + blob_it.add_to_end(target_blob); + if (make_next_word_fuzzy != NULL) *make_next_word_fuzzy = true; + } + // Add all consecutive wanted, but null-blob outlines to same blob. + C_OUTLINE_IT ol_it(target_blob->out_list()); + while (i + 1 < outlines.size() && wanted[i + 1] && + target_blobs[i + 1] == NULL) { + ++i; + ol_it.add_to_end(outlines[i]); + } + } else { + // Insert outline into this blob. + C_OUTLINE_IT ol_it(target_blob->out_list()); + ol_it.add_to_end(outline); + } + } else { + // Put back on noise list. + rej_it.add_to_end(new C_BLOB(outline)); + } + } + return outline_added_to_start; +} diff --git a/ccstruct/werd.h b/ccstruct/werd.h index 43ecb84b6e..f9a89fb5b5 100644 --- a/ccstruct/werd.h +++ b/ccstruct/werd.h @@ -114,7 +114,13 @@ class WERD : public ELIST2_LINK { script_id_ = id; } - TBOX bounding_box(); // compute bounding box + // Returns the (default) bounding box including all the dots. + TBOX bounding_box() const; // compute bounding box + // Returns the bounding box including the desired combination of upper and + // lower noise/diacritic elements. + TBOX restricted_bounding_box(bool upper_dots, bool lower_dots) const; + // Returns the bounding box of only the good blobs. + TBOX true_bounding_box() const; const char *text() const { return correct.string(); } void set_text(const char *new_text) { correct = new_text; } @@ -155,6 +161,26 @@ class WERD : public ELIST2_LINK { void plot_rej_blobs(ScrollView *window); #endif // GRAPHICS_DISABLED + // Removes noise from the word by moving small outlines to the rej_cblobs + // list, based on the size_threshold. + void CleanNoise(float size_threshold); + + // Extracts all the noise outlines and stuffs the pointers into the given + // vector of outlines. Afterwards, the outlines vector owns the pointers. 
+ void GetNoiseOutlines(GenericVector *outlines); + // Adds the selected outlines to the indicated real blobs, and puts the rest + // back in rej_cblobs where they came from. Where the target_blobs entry is + // NULL, a run of wanted outlines is put into a single new blob. + // Ownership of the outlines is transferred back to the word. (Hence + // GenericVector and not PointerVector.) + // Returns true if any new blob was added to the start of the word, which + // suggests that it might need joining to the word before it, and likewise + // sets make_next_word_fuzzy true if any new blob was added to the end. + bool AddSelectedOutlines(const GenericVector &wanted, + const GenericVector &target_blobs, + const GenericVector &outlines, + bool *make_next_word_fuzzy); + private: uinT8 blanks; // no of blanks uinT8 dummy; // padding diff --git a/textord/colfind.cpp b/textord/colfind.cpp index b9b10649af..41b3895602 100644 --- a/textord/colfind.cpp +++ b/textord/colfind.cpp @@ -286,22 +286,27 @@ void ColumnFinder::CorrectOrientation(TO_BLOCK* block, // thresholds_pix is expected to be present iff grey_pix is present and // can be an integer factor reduction of the grey_pix. It represents the // thresholds that were used to create the binary_pix from the grey_pix. +// If diacritic_blobs is non-null, then diacritics/noise blobs, that would +// confuse layout analysis by causing textline overlap, are placed there, +// with the expectation that they will be reassigned to words later and +// noise/diacriticness determined via classification. // Returns -1 if the user hits the 'd' key in the blocks window while running // in debug mode, which requests a retry with more debug info. 
-int ColumnFinder::FindBlocks(PageSegMode pageseg_mode, - Pix* scaled_color, int scaled_factor, - TO_BLOCK* input_block, Pix* photo_mask_pix, - Pix* thresholds_pix, Pix* grey_pix, - BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks) { +int ColumnFinder::FindBlocks(PageSegMode pageseg_mode, Pix* scaled_color, + int scaled_factor, TO_BLOCK* input_block, + Pix* photo_mask_pix, Pix* thresholds_pix, + Pix* grey_pix, BLOCK_LIST* blocks, + BLOBNBOX_LIST* diacritic_blobs, + TO_BLOCK_LIST* to_blocks) { pixOr(photo_mask_pix, photo_mask_pix, nontext_map_); stroke_width_->FindLeaderPartitions(input_block, &part_grid_); stroke_width_->RemoveLineResidue(&big_parts_); FindInitialTabVectors(NULL, min_gutter_width_, tabfind_aligned_gap_fraction_, input_block); SetBlockRuleEdges(input_block); - stroke_width_->GradeBlobsIntoPartitions(rerotate_, input_block, nontext_map_, - denorm_, cjk_script_, &projection_, - &part_grid_, &big_parts_); + stroke_width_->GradeBlobsIntoPartitions( + rerotate_, input_block, nontext_map_, denorm_, cjk_script_, &projection_, + diacritic_blobs, &part_grid_, &big_parts_); if (!PSM_SPARSE(pageseg_mode)) { ImageFind::FindImagePartitions(photo_mask_pix, rotation_, rerotate_, input_block, this, &part_grid_, &big_parts_); @@ -1134,9 +1139,13 @@ void ColumnFinder::GridMergePartitions() { neighbour->Print(); } rsearch.RemoveBBox(); - gsearch.RepositionIterator(); + if (!modified_box) { + // We are going to modify part, so remove it and re-insert it after. + gsearch.RemoveBBox(); + rsearch.RepositionIterator(); + modified_box = true; + } part->Absorb(neighbour, WidthCB()); - modified_box = true; } else if (debug) { tprintf("Neighbour failed hgap test\n"); } @@ -1151,7 +1160,6 @@ void ColumnFinder::GridMergePartitions() { // or it will never be found by a full search. // Because the box has changed, it has to be removed first, otherwise // add_sorted may fail to keep a single copy of the pointer. 
- gsearch.RemoveBBox(); part_grid_.InsertBBox(true, true, part); gsearch.RepositionIterator(); } diff --git a/textord/colfind.h b/textord/colfind.h index 04ad1684de..eedd4c407e 100644 --- a/textord/colfind.h +++ b/textord/colfind.h @@ -155,13 +155,15 @@ class ColumnFinder : public TabFind { // thresholds_pix is expected to be present iff grey_pix is present and // can be an integer factor reduction of the grey_pix. It represents the // thresholds that were used to create the binary_pix from the grey_pix. + // Small blobs that confuse the segmentation into lines are placed into + // diacritic_blobs, with the intention that they be put into the most + // appropriate word after the rest of layout analysis. // Returns -1 if the user hits the 'd' key in the blocks window while running // in debug mode, which requests a retry with more debug info. - int FindBlocks(PageSegMode pageseg_mode, - Pix* scaled_color, int scaled_factor, - TO_BLOCK* block, Pix* photo_mask_pix, - Pix* thresholds_pix, Pix* grey_pix, - BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks); + int FindBlocks(PageSegMode pageseg_mode, Pix* scaled_color, int scaled_factor, + TO_BLOCK* block, Pix* photo_mask_pix, Pix* thresholds_pix, + Pix* grey_pix, BLOCK_LIST* blocks, + BLOBNBOX_LIST* diacritic_blobs, TO_BLOCK_LIST* to_blocks); // Get the rotation required to deskew, and its inverse rotation. void GetDeskewVectors(FCOORD* deskew, FCOORD* reskew); diff --git a/textord/colpartition.cpp b/textord/colpartition.cpp index e9ce568aa3..565c660bb2 100644 --- a/textord/colpartition.cpp +++ b/textord/colpartition.cpp @@ -297,6 +297,25 @@ void ColPartition::DisownBoxesNoAssert() { } } +// NULLs the owner of the blobs in this partition that are owned by this +// partition and not leader blobs, removing them from the boxes_ list, thus +// turning this partition back to a leader partition if it contains a leader, +// or otherwise leaving it empty. Returns true if any boxes remain. 
+bool ColPartition::ReleaseNonLeaderBoxes() { + BLOBNBOX_C_IT bb_it(&boxes_); + for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) { + BLOBNBOX* bblob = bb_it.data(); + if (bblob->flow() != BTFT_LEADER) { + if (bblob->owner() == this) bblob->set_owner(NULL); + bb_it.extract(); + } + } + if (bb_it.empty()) return false; + flow_ = BTFT_LEADER; + ComputeLimits(); + return true; +} + // Delete the boxes that this partition owns. void ColPartition::DeleteBoxes() { // Although the boxes_ list is a C_LIST, in some cases it owns the @@ -831,6 +850,10 @@ ColPartition* ColPartition::SplitAt(int split_x) { bbox->set_owner(split_part); } } + if (it.empty()) { + // Possible if split-x passes through the first blob. + it.add_list_after(&split_part->boxes_); + } ASSERT_HOST(!it.empty()); if (split_part->IsEmpty()) { // Split part ended up with nothing. Possible if split_x passes @@ -1130,6 +1153,7 @@ bool ColPartition::MarkAsLeaderIfMonospaced() { if (best_end != NULL && best_end->total_cost() < blob_count) { // Good enough. Call it a leader. 
result = true; + bool modified_blob_list = false; for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { BLOBNBOX* blob = it.data(); TBOX box = blob->bounding_box(); @@ -1139,6 +1163,7 @@ bool ColPartition::MarkAsLeaderIfMonospaced() { blob->bounding_box().right(); if (blob->bounding_box().width() + gap > max_step) { it.extract(); + modified_blob_list = true; continue; } } @@ -1147,12 +1172,14 @@ bool ColPartition::MarkAsLeaderIfMonospaced() { it.data_relative(-1)->bounding_box().right(); if (blob->bounding_box().width() + gap > max_step) { it.extract(); + modified_blob_list = true; break; } } blob->set_region_type(BRT_TEXT); blob->set_flow(BTFT_LEADER); } + if (modified_blob_list) ComputeLimits(); blob_type_ = BRT_TEXT; flow_ = BTFT_LEADER; } else if (textord_debug_tabfind) { diff --git a/textord/colpartition.h b/textord/colpartition.h index 7f6cd64328..1b35d48545 100644 --- a/textord/colpartition.h +++ b/textord/colpartition.h @@ -481,6 +481,11 @@ class ColPartition : public ELIST2_LINK { // Any blobs that are not owned by this partition get to keep their owner // without an assert failure. void DisownBoxesNoAssert(); + // NULLs the owner of the blobs in this partition that are owned by this + // partition and not leader blobs, removing them from the boxes_ list, thus + // turning this partition back to a leader partition if it contains a leader, + // or otherwise leaving it empty. Returns true if any boxes remain. + bool ReleaseNonLeaderBoxes(); // Delete the boxes that this partition owns. void DeleteBoxes(); diff --git a/textord/colpartitiongrid.cpp b/textord/colpartitiongrid.cpp index 6cd8f31c93..800cbcb3c9 100644 --- a/textord/colpartitiongrid.cpp +++ b/textord/colpartitiongrid.cpp @@ -324,6 +324,40 @@ static bool TestCompatibleCandidates(const ColPartition& part, bool debug, return true; } +// Computes and returns the total overlap of all partitions in the grid. 
+// If overlap_grid is non-null, it is filled with a grid that holds empty +// partitions representing the union of all overlapped partitions. +int ColPartitionGrid::ComputeTotalOverlap(ColPartitionGrid** overlap_grid) { + int total_overlap = 0; + // Iterate the ColPartitions in the grid. + ColPartitionGridSearch gsearch(this); + gsearch.StartFullSearch(); + ColPartition* part; + while ((part = gsearch.NextFullSearch()) != NULL) { + ColPartition_CLIST neighbors; + const TBOX& part_box = part->bounding_box(); + FindOverlappingPartitions(part_box, part, &neighbors); + ColPartition_C_IT n_it(&neighbors); + bool any_part_overlap = false; + for (n_it.mark_cycle_pt(); !n_it.cycled_list(); n_it.forward()) { + const TBOX& n_box = n_it.data()->bounding_box(); + int overlap = n_box.intersection(part_box).area(); + if (overlap > 0 && overlap_grid != NULL) { + if (*overlap_grid == NULL) { + *overlap_grid = new ColPartitionGrid(gridsize(), bleft(), tright()); + } + (*overlap_grid)->InsertBBox(true, true, n_it.data()->ShallowCopy()); + if (!any_part_overlap) { + (*overlap_grid)->InsertBBox(true, true, part->ShallowCopy()); + } + } + any_part_overlap = true; + total_overlap += overlap; + } + } + return total_overlap; +} + // Finds all the ColPartitions in the grid that overlap with the given // box and returns them SortByBoxLeft(ed) and uniqued in the given list. // Any partition equal to not_this (may be NULL) is excluded. 
@@ -901,6 +935,7 @@ void ColPartitionGrid::ReTypeBlobs(BLOBNBOX_LIST* im_blobs) { while ((part = gsearch.NextFullSearch()) != NULL) { BlobRegionType blob_type = part->blob_type(); BlobTextFlowType flow = part->flow(); + bool any_blobs_moved = false; if (blob_type == BRT_POLYIMAGE || blob_type == BRT_RECTIMAGE) { BLOBNBOX_C_IT blob_it(part->boxes()); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { @@ -918,6 +953,7 @@ void ColPartitionGrid::ReTypeBlobs(BLOBNBOX_LIST* im_blobs) { ASSERT_HOST(blob->cblob()->area() != 0); blob->set_owner(NULL); blob_it.extract(); + any_blobs_moved = true; } else { blob->set_region_type(blob_type); if (blob->flow() != BTFT_LEADER) @@ -938,6 +974,11 @@ void ColPartitionGrid::ReTypeBlobs(BLOBNBOX_LIST* im_blobs) { delete blob; } } + } else if (any_blobs_moved) { + gsearch.RemoveBBox(); + part->ComputeLimits(); + InsertBBox(true, true, part); + gsearch.RepositionIterator(); } } } @@ -1048,6 +1089,24 @@ void ColPartitionGrid::DeleteUnknownParts(TO_BLOCK* block) { block->DeleteUnownedNoise(); } +// Deletes all the partitions in the grid that are NOT of flow type BTFT_LEADER. +void ColPartitionGrid::DeleteNonLeaderParts() { + ColPartitionGridSearch gsearch(this); + gsearch.StartFullSearch(); + ColPartition* part; + while ((part = gsearch.NextFullSearch()) != NULL) { + if (part->flow() != BTFT_LEADER) { + gsearch.RemoveBBox(); + if (part->ReleaseNonLeaderBoxes()) { + InsertBBox(true, true, part); + gsearch.RepositionIterator(); + } else { + delete part; + } + } + } +} + // Finds and marks text partitions that represent figure captions. 
void ColPartitionGrid::FindFigureCaptions() { // For each image region find its best candidate text caption region, diff --git a/textord/colpartitiongrid.h b/textord/colpartitiongrid.h index 40946e5746..94e7da2c43 100644 --- a/textord/colpartitiongrid.h +++ b/textord/colpartitiongrid.h @@ -63,6 +63,11 @@ class ColPartitionGrid : public BBGrid* confirm_cb, ColPartition* part); + // Computes and returns the total overlap of all partitions in the grid. + // If overlap_grid is non-null, it is filled with a grid that holds empty + // partitions representing the union of all overlapped partitions. + int ComputeTotalOverlap(ColPartitionGrid** overlap_grid); + // Finds all the ColPartitions in the grid that overlap with the given // box and returns them SortByBoxLeft(ed) and uniqued in the given list. // Any partition equal to not_this (may be NULL) is excluded. @@ -165,6 +170,10 @@ class ColPartitionGrid : public BBGridConstructProjection(block, rerotation, nontext_map_); if (textord_tabfind_show_strokewidths) { ScrollView* line_blobs_win = MakeWindow(0, 0, "Initial textline Blobs"); @@ -375,7 +379,19 @@ void StrokeWidth::GradeBlobsIntoPartitions(const FCOORD& rerotation, // Clear and re Insert to take advantage of the removed diacritics. Clear(); InsertBlobs(block); - FindInitialPartitions(rerotation, block, part_grid, big_parts); + FCOORD skew; + FindTextlineFlowDirection(true); + PartitionFindResult r = FindInitialPartitions( + rerotation, true, block, diacritic_blobs, part_grid, big_parts, &skew); + if (r == PFR_NOISE) { + tprintf("Detected %d diacritics\n", diacritic_blobs->length()); + // Noise was found, and removed. 
+ Clear(); + InsertBlobs(block); + FindTextlineFlowDirection(true); + r = FindInitialPartitions(rerotation, false, block, diacritic_blobs, + part_grid, big_parts, &skew); + } nontext_map_ = NULL; projection_ = NULL; denorm_ = NULL; @@ -1220,10 +1236,17 @@ void StrokeWidth::SmoothNeighbourTypes(BLOBNBOX* blob, bool reset_all) { // minimize overlap and smoothes the types with neighbours and the color // image if provided. rerotation is used to rotate the coordinate space // back to the nontext_map_ image. -void StrokeWidth::FindInitialPartitions(const FCOORD& rerotation, - TO_BLOCK* block, - ColPartitionGrid* part_grid, - ColPartition_LIST* big_parts) { +// If find_problems is true, detects possible noise pollution by the amount +// of partition overlap that is created by the diacritics. If excessive, the +// noise is separated out into diacritic blobs, and PFR_NOISE is returned. +// [TODO(rays): if the partition overlap is caused by heavy skew, deskews +// the components, saves the skew_angle and returns PFR_SKEW.] If the return +// is not PFR_OK, the job is incomplete, and FindInitialPartitions must be +// called again after cleaning up the partly done work. +PartitionFindResult StrokeWidth::FindInitialPartitions( + const FCOORD& rerotation, bool find_problems, TO_BLOCK* block, + BLOBNBOX_LIST* diacritic_blobs, ColPartitionGrid* part_grid, + ColPartition_LIST* big_parts, FCOORD* skew_angle) { FindVerticalTextChains(part_grid); FindHorizontalTextChains(part_grid); if (textord_tabfind_show_strokewidths) { @@ -1231,6 +1254,10 @@ void StrokeWidth::FindInitialPartitions(const FCOORD& rerotation, part_grid->DisplayBoxes(chains_win_); projection_->DisplayProjection(); } + if (find_problems) { + // TODO(rays) Do something to find skew, set skew_angle and return if there + // is some. 
+ } part_grid->SplitOverlappingPartitions(big_parts); EasyMerges(part_grid); RemoveLargeUnusedBlobs(block, part_grid, big_parts); @@ -1239,8 +1266,14 @@ void StrokeWidth::FindInitialPartitions(const FCOORD& rerotation, rerotation)); while (part_grid->GridSmoothNeighbours(BTFT_NEIGHBOURS, nontext_map_, grid_box, rerotation)); + int pre_overlap = part_grid->ComputeTotalOverlap(NULL); TestDiacritics(part_grid, block); MergeDiacritics(block, part_grid); + if (find_problems && diacritic_blobs != NULL && + DetectAndRemoveNoise(pre_overlap, grid_box, block, part_grid, + diacritic_blobs)) { + return PFR_NOISE; + } if (textord_tabfind_show_strokewidths) { textlines_win_ = MakeWindow(400, 400, "GoodTextline blobs"); part_grid->DisplayBoxes(textlines_win_); @@ -1260,6 +1293,57 @@ void StrokeWidth::FindInitialPartitions(const FCOORD& rerotation, smoothed_win_ = MakeWindow(800, 400, "Smoothed blobs"); part_grid->DisplayBoxes(smoothed_win_); } + return PFR_OK; +} + +// Detects noise by a significant increase in partition overlap from +// pre_overlap to now, and removes noise from the union of all the overlapping +// partitions, placing the blobs in diacritic_blobs. Returns true if any noise +// was found and removed. +bool StrokeWidth::DetectAndRemoveNoise(int pre_overlap, const TBOX& grid_box, + TO_BLOCK* block, + ColPartitionGrid* part_grid, + BLOBNBOX_LIST* diacritic_blobs) { + ColPartitionGrid* noise_grid = NULL; + int post_overlap = part_grid->ComputeTotalOverlap(&noise_grid); + if (pre_overlap == 0) pre_overlap = 1; + BLOBNBOX_IT diacritic_it(diacritic_blobs); + if (noise_grid != NULL) { + if (post_overlap > pre_overlap * kNoiseOverlapGrowthFactor && + post_overlap > grid_box.area() * kNoiseOverlapAreaFactor) { + // This is noisy enough to fix. 
+ if (textord_tabfind_show_strokewidths) { + ScrollView* noise_win = MakeWindow(1000, 500, "Noise Areas"); + noise_grid->DisplayBoxes(noise_win); + } + part_grid->DeleteNonLeaderParts(); + BLOBNBOX_IT blob_it(&block->noise_blobs); + ColPartitionGridSearch rsearch(noise_grid); + for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { + BLOBNBOX* blob = blob_it.data(); + blob->ClearNeighbours(); + if (!blob->IsDiacritic() || blob->owner() != NULL) + continue; // Not a noise candidate. + TBOX blob_box(blob->bounding_box()); + TBOX search_box(blob->bounding_box()); + search_box.pad(gridsize(), gridsize()); + rsearch.StartRectSearch(search_box); + ColPartition* part = rsearch.NextRectSearch(); + if (part != NULL) { + // Consider blob as possible noise. + blob->set_owns_cblob(true); + blob->compute_bounding_box(); + diacritic_it.add_after_then_move(blob_it.extract()); + } + } + noise_grid->DeleteParts(); + delete noise_grid; + return true; + } + noise_grid->DeleteParts(); + delete noise_grid; + } + return false; } // Helper verifies that blob's neighbour in direction dir is good to add to a diff --git a/textord/strokewidth.h b/textord/strokewidth.h index 5d649b5708..12cb3c91f6 100644 --- a/textord/strokewidth.h +++ b/textord/strokewidth.h @@ -41,6 +41,14 @@ enum LeftOrRight { LR_RIGHT }; +// Return value from FindInitialPartitions indicates detection of severe +// skew or noise. +enum PartitionFindResult { + PFR_OK, // Everything is OK. + PFR_SKEW, // Skew was detected and rotated. + PFR_NOISE // Noise was detected and removed. +}; + /** * The StrokeWidth class holds all the normal and large blobs. * It is used to find good large blobs and move them to the normal blobs @@ -110,12 +118,10 @@ class StrokeWidth : public BlobGrid { // part_grid is the output grid of textline partitions. // Large blobs that cause overlap are put in separate partitions and added // to the big_parts list. 
- void GradeBlobsIntoPartitions(const FCOORD& rerotation, - TO_BLOCK* block, - Pix* nontext_pix, - const DENORM* denorm, - bool cjk_script, - TextlineProjection* projection, + void GradeBlobsIntoPartitions(const FCOORD& rerotation, TO_BLOCK* block, + Pix* nontext_pix, const DENORM* denorm, + bool cjk_script, TextlineProjection* projection, + BLOBNBOX_LIST* diacritic_blobs, ColPartitionGrid* part_grid, ColPartition_LIST* big_parts); @@ -205,10 +211,26 @@ class StrokeWidth : public BlobGrid { // minimize overlap and smoothes the types with neighbours and the color // image if provided. rerotation is used to rotate the coordinate space // back to the nontext_map_ image. - void FindInitialPartitions(const FCOORD& rerotation, - TO_BLOCK* block, - ColPartitionGrid* part_grid, - ColPartition_LIST* big_parts); + // If find_problems is true, detects possible noise pollution by the amount + // of partition overlap that is created by the diacritics. If excessive, the + // noise is separated out into diacritic blobs, and PFR_NOISE is returned. + // [TODO(rays): if the partition overlap is caused by heavy skew, deskews + // the components, saves the skew_angle and returns PFR_SKEW.] If the return + // is not PFR_OK, the job is incomplete, and FindInitialPartitions must be + // called again after cleaning up the partly done work. + PartitionFindResult FindInitialPartitions(const FCOORD& rerotation, + bool find_problems, TO_BLOCK* block, + BLOBNBOX_LIST* diacritic_blobs, + ColPartitionGrid* part_grid, + ColPartition_LIST* big_parts, + FCOORD* skew_angle); + // Detects noise by a significant increase in partition overlap from + // pre_overlap to now, and removes noise from the union of all the overlapping + // partitions, placing the blobs in diacritic_blobs. Returns true if any noise + // was found and removed. 
+ bool DetectAndRemoveNoise(int pre_overlap, const TBOX& grid_box, + TO_BLOCK* block, ColPartitionGrid* part_grid, + BLOBNBOX_LIST* diacritic_blobs); // Finds vertical chains of text-like blobs and puts them in ColPartitions. void FindVerticalTextChains(ColPartitionGrid* part_grid); // Finds horizontal chains of text-like blobs and puts them in ColPartitions. diff --git a/textord/tablefind.cpp b/textord/tablefind.cpp index 888fe145f5..2e38bada0b 100644 --- a/textord/tablefind.cpp +++ b/textord/tablefind.cpp @@ -974,12 +974,12 @@ bool TableFinder::HasLeaderAdjacent(const ColPartition& part) { hsearch.StartSideSearch(x, bottom, top); ColPartition* leader = NULL; while ((leader = hsearch.NextSideSearch(right_to_left)) != NULL) { - // This should not happen, they are in different grids. - ASSERT_HOST(&part != leader); // The leader could be a horizontal ruling in the grid. // Make sure it is actually a leader. if (leader->flow() != BTFT_LEADER) continue; + // This should not happen, they are in different grids. + ASSERT_HOST(&part != leader); // Make sure the leader shares a page column with the partition, // otherwise we are spreading across columns. if (!part.IsInSameColumnAs(*leader)) diff --git a/textord/textord.cpp b/textord/textord.cpp index cf2fc04fe3..6156e45b3b 100644 --- a/textord/textord.cpp +++ b/textord/textord.cpp @@ -268,7 +268,7 @@ Textord::~Textord() { void Textord::TextordPage(PageSegMode pageseg_mode, const FCOORD& reskew, int width, int height, Pix* binary_pix, Pix* thresholds_pix, Pix* grey_pix, - bool use_box_bottoms, + bool use_box_bottoms, BLOBNBOX_LIST* diacritic_blobs, BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks) { page_tr_.set_x(width); page_tr_.set_y(height); @@ -340,9 +340,9 @@ void Textord::TextordPage(PageSegMode pageseg_mode, const FCOORD& reskew, make_single_word(pageseg_mode == PSM_SINGLE_CHAR, to_block->get_rows(), to_block->block->row_list()); } - cleanup_blocks(PSM_WORD_FIND_ENABLED(pageseg_mode), blocks); // Remove empties. 
- + cleanup_blocks(PSM_WORD_FIND_ENABLED(pageseg_mode), blocks); + TransferDiacriticsToBlockGroups(diacritic_blobs, blocks); // Compute the margins for each row in the block, to be used later for // paragraph detection. BLOCK_IT b_it(blocks); diff --git a/textord/textord.h b/textord/textord.h index b99541efce..cc9cb1d341 100644 --- a/textord/textord.h +++ b/textord/textord.h @@ -22,6 +22,7 @@ #define TESSERACT_TEXTORD_TEXTORD_H__ #include "ccstruct.h" +#include "bbgrid.h" #include "blobbox.h" #include "gap_map.h" #include "publictypes.h" // For PageSegMode. @@ -35,6 +36,35 @@ class ScrollView; namespace tesseract { +// A simple class that can be used by BBGrid to hold a word and an expanded +// bounding box that makes it easy to find words to put diacritics. +class WordWithBox { + public: + WordWithBox() : word_(NULL) {} + explicit WordWithBox(WERD *word) + : word_(word), bounding_box_(word->bounding_box()) { + int height = bounding_box_.height(); + bounding_box_.pad(height, height); + } + + const TBOX &bounding_box() const { return bounding_box_; } + // Returns the bounding box of only the good blobs. + TBOX true_bounding_box() const { return word_->true_bounding_box(); } + C_BLOB_LIST *RejBlobs() const { return word_->rej_cblob_list(); } + const WERD *word() const { return word_; } + + private: + // Borrowed pointer to a real word somewhere that must outlive this class. + WERD *word_; + // Cached expanded bounding box of the word, padded all round by its height. + TBOX bounding_box_; +}; + +// Make it usable by BBGrid. +CLISTIZEH(WordWithBox) +typedef BBGrid WordGrid; +typedef GridSearch WordSearch; + class Textord { public: explicit Textord(CCStruct* ccstruct); @@ -47,11 +77,13 @@ class Textord { // thresholds_pix is expected to be present iff grey_pix is present and // can be an integer factor reduction of the grey_pix. It represents the // thresholds that were used to create the binary_pix from the grey_pix. 
- void TextordPage(PageSegMode pageseg_mode, const FCOORD& reskew, - int width, int height, Pix* binary_pix, - Pix* thresholds_pix, Pix* grey_pix, - bool use_box_bottoms, - BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks); + // diacritic_blobs contain small confusing components that should be added + // to the appropriate word(s) in case they are really diacritics. + void TextordPage(PageSegMode pageseg_mode, const FCOORD &reskew, int width, + int height, Pix *binary_pix, Pix *thresholds_pix, + Pix *grey_pix, bool use_box_bottoms, + BLOBNBOX_LIST *diacritic_blobs, BLOCK_LIST *blocks, + TO_BLOCK_LIST *to_blocks); // If we were supposed to return only a single textline, and there is more // than one, clean up and leave only the best. @@ -212,6 +244,17 @@ class Textord { // Remove outlines that are a tiny fraction in either width or height // of the word height. void clean_small_noise_from_words(ROW *row); + // Groups blocks by rotation, then, for each group, makes a WordGrid and calls + // TransferDiacriticsToWords to copy the diacritic blobs to the most + // appropriate words in the group of blocks. Source blobs are not touched. + void TransferDiacriticsToBlockGroups(BLOBNBOX_LIST* diacritic_blobs, + BLOCK_LIST* blocks); + // Places a copy of blobs that are near a word (after applying rotation to the + // blob) in the most appropriate word, unless there is doubt, in which case a + // blob can end up in two words. Source blobs are not touched. 
+ void TransferDiacriticsToWords(BLOBNBOX_LIST *diacritic_blobs, + const FCOORD &rotation, WordGrid *word_grid); + public: // makerow.cpp /////////////////////////////////////////// BOOL_VAR_H(textord_single_height_mode, false, diff --git a/textord/topitch.cpp b/textord/topitch.cpp index 3136a9417e..e918f14c36 100644 --- a/textord/topitch.cpp +++ b/textord/topitch.cpp @@ -283,12 +283,13 @@ void fix_row_pitch(TO_ROW *bad_row, // row to fix bad_row->space_threshold = (bad_row->min_space + bad_row->max_nonspace) / 2; bad_row->space_size = bad_row->fixed_pitch; - if (bad_row->char_cells.empty ()) + if (bad_row->char_cells.empty() && !bad_row->blob_list()->empty()) { tune_row_pitch (bad_row, &bad_row->projection, bad_row->projection_left, bad_row->projection_right, (bad_row->fixed_pitch + bad_row->max_nonspace * 3) / 4, bad_row->fixed_pitch, sp_sd, mid_cuts, &bad_row->char_cells, FALSE); + } } else if (bad_row->pitch_decision == PITCH_CORR_PROP || bad_row->pitch_decision == PITCH_DEF_PROP) { @@ -1279,13 +1280,13 @@ float tune_row_pitch2( //find fp cells best_sp_sd = initial_pitch; - if (textord_disable_pitch_test) { + best_pitch = static_cast(initial_pitch); + if (textord_disable_pitch_test || best_pitch <= textord_pitch_range) { return initial_pitch; } sum_proj = new STATS[textord_pitch_range * 2 + 1]; if (sum_proj == NULL) return initial_pitch; - best_pitch = (inT32) initial_pitch; for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range; pitch_delta++) @@ -1293,12 +1294,12 @@ float tune_row_pitch2( //find fp cells best_pitch + pitch_delta + 1); for (pixel = projection_left; pixel <= projection_right; pixel++) { - for (pitch_delta = -textord_pitch_range; - pitch_delta <= textord_pitch_range; pitch_delta++) - sum_proj[textord_pitch_range + - pitch_delta].add ((pixel - projection_left) % (best_pitch + - pitch_delta), - projection->pile_count (pixel)); + for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range; + pitch_delta++) { + 
sum_proj[textord_pitch_range + pitch_delta].add( + (pixel - projection_left) % (best_pitch + pitch_delta), + projection->pile_count(pixel)); + } } best_count = sum_proj[textord_pitch_range].pile_count (0); best_delta = 0; @@ -1427,7 +1428,7 @@ float compute_pitch_sd( //find fp cells if (blob_it.empty ()) return space_size * 10; #ifndef GRAPHICS_DISABLED - if (testing_on && to_win > 0) { + if (testing_on && to_win != NULL) { blob_box = blob_it.data ()->bounding_box (); projection->plot (to_win, projection_left, row->intercept (), 1.0f, -1.0f, ScrollView::CORAL); @@ -1476,7 +1477,7 @@ float compute_pitch_sd( //find fp cells tprintf ("\n"); } #ifndef GRAPHICS_DISABLED - if (textord_show_fixed_cuts && blob_count > 0 && to_win > 0) + if (textord_show_fixed_cuts && blob_count > 0 && to_win != NULL) plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list); #endif seg_it.set_to_list (&seg_list); @@ -1566,7 +1567,7 @@ float compute_pitch_sd2( //find fp cells return initial_pitch * 10; } #ifndef GRAPHICS_DISABLED - if (testing_on && to_win > 0) { + if (testing_on && to_win != NULL) { projection->plot (to_win, projection_left, row->intercept (), 1.0f, -1.0f, ScrollView::CORAL); } @@ -1602,7 +1603,7 @@ float compute_pitch_sd2( //find fp cells tprintf ("\n"); } #ifndef GRAPHICS_DISABLED - if (textord_show_fixed_cuts && blob_count > 0 && to_win > 0) + if (textord_show_fixed_cuts && blob_count > 0 && to_win != NULL) plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list); #endif seg_it.set_to_list (&seg_list); diff --git a/textord/tordmain.cpp b/textord/tordmain.cpp index eb229eaa1a..e9e59261da 100644 --- a/textord/tordmain.cpp +++ b/textord/tordmain.cpp @@ -38,13 +38,18 @@ #include "allheaders.h" -const ERRCODE BLOCKLESS_BLOBS = "Warning:some blobs assigned to no block"; +// Gridsize for word grid when reassigning diacritics to words. Not critical. 
+const int kWordGridSize = 50; #undef EXTERN #define EXTERN #define MAX_NEAREST_DIST 600 //for block skew stats +namespace tesseract { + +CLISTIZE(WordWithBox) + /********************************************************************** * SetBlobStrokeWidth * @@ -143,7 +148,6 @@ void SetBlobStrokeWidth(Pix* pix, BLOBNBOX* blob) { } } - /********************************************************************** * assign_blobs_to_blocks2 * @@ -193,7 +197,6 @@ void assign_blobs_to_blocks2(Pix* pix, } } -namespace tesseract { /********************************************************************** * find_components * @@ -400,7 +403,7 @@ void Textord::cleanup_nontext_block(BLOCK* block) { * Delete empty blocks, rows from the page. **********************************************************************/ -void Textord::cleanup_blocks(bool clean_noise, BLOCK_LIST *blocks) { +void Textord::cleanup_blocks(bool clean_noise, BLOCK_LIST* blocks) { BLOCK_IT block_it = blocks; //iterator ROW_IT row_it; //row iterator @@ -420,18 +423,18 @@ void Textord::cleanup_blocks(bool clean_noise, BLOCK_LIST *blocks) { if (clean_noise) { row_it.set_to_list(block->row_list()); for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { + ROW* row = row_it.data(); ++num_rows_all; - clean_small_noise_from_words(row_it.data()); - if ((textord_noise_rejrows && !row_it.data()->word_list()->empty() && - clean_noise_from_row(row_it.data())) || - row_it.data()->word_list()->empty()) { + clean_small_noise_from_words(row); + if ((textord_noise_rejrows && !row->word_list()->empty() && + clean_noise_from_row(row)) || + row->word_list()->empty()) { delete row_it.extract(); // lose empty row. 
} else { if (textord_noise_rejwords) clean_noise_from_words(row_it.data()); if (textord_blshift_maxshift >= 0) - tweak_row_baseline(row_it.data(), - textord_blshift_maxshift, + tweak_row_baseline(row, textord_blshift_maxshift, textord_blshift_xfraction); ++num_rows; } @@ -640,16 +643,16 @@ void Textord::clean_noise_from_words( //remove empties && (!word_it.at_first () || !blob_it.at_first ())) dot_count += 2; } - if (dot_count > 2) { + if (dot_count > 2 && !word->flag(W_REP_CHAR)) { if (dot_count > norm_count * textord_noise_normratio * 2) word_dud[word_index] = 2; else if (dot_count > norm_count * textord_noise_normratio) word_dud[word_index] = 1; else word_dud[word_index] = 0; - } - else + } else { word_dud[word_index] = 0; + } if (word_dud[word_index] == 2) dud_words++; else @@ -661,11 +664,11 @@ void Textord::clean_noise_from_words( //remove empties for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { if (word_dud[word_index] == 2 || (word_dud[word_index] == 1 && dud_words > ok_words)) { - word = word_it.data (); //current word - //rejected blobs - blob_it.set_to_list (word->rej_cblob_list ()); - //move from blobs - blob_it.add_list_after (word->cblob_list ()); + word = word_it.data(); // Current word. + // Previously we threw away the entire word. + // Now just aggressively throw all small blobs into the reject list, where + // the classifier can decide whether they are actually needed. + word->CleanNoise(textord_noise_sizelimit * row->x_height()); } word_index++; } @@ -705,6 +708,176 @@ void Textord::clean_small_noise_from_words(ROW *row) { } } } + +// Local struct to hold a group of blocks. +struct BlockGroup { + BlockGroup() : rotation(1.0f, 0.0f), angle(0.0f), min_xheight(1.0f) {} + explicit BlockGroup(BLOCK* block) + : bounding_box(block->bounding_box()), + rotation(block->re_rotation()), + angle(block->re_rotation().angle()), + min_xheight(block->x_height()) { + blocks.push_back(block); + } + // Union of block bounding boxes. 
+ TBOX bounding_box; + // Common rotation of the blocks. + FCOORD rotation; + // Angle of rotation. + float angle; + // Min xheight of the blocks. + float min_xheight; + // Collection of borrowed pointers to the blocks in the group. + GenericVector blocks; +}; + +// Groups blocks by rotation, then, for each group, makes a WordGrid and calls +// TransferDiacriticsToWords to copy the diacritic blobs to the most +// appropriate words in the group of blocks. Source blobs are not touched. +void Textord::TransferDiacriticsToBlockGroups(BLOBNBOX_LIST* diacritic_blobs, + BLOCK_LIST* blocks) { + // Angle difference larger than this is too much to consider equal. + // They should only be in multiples of M_PI/2 anyway. + const double kMaxAngleDiff = 0.01; // About 0.6 degrees. + PointerVector groups; + BLOCK_IT bk_it(blocks); + for (bk_it.mark_cycle_pt(); !bk_it.cycled_list(); bk_it.forward()) { + BLOCK* block = bk_it.data(); + if (block->poly_block() != NULL && !block->poly_block()->IsText()) { + continue; + } + // Linear search of the groups to find a matching rotation. + float block_angle = block->re_rotation().angle(); + int best_g = 0; + float best_angle_diff = MAX_FLOAT32; + for (int g = 0; g < groups.size(); ++g) { + double angle_diff = fabs(block_angle - groups[g]->angle); + if (angle_diff > M_PI) angle_diff = fabs(angle_diff - 2.0 * M_PI); + if (angle_diff < best_angle_diff) { + best_angle_diff = angle_diff; + best_g = g; + } + } + if (best_angle_diff > kMaxAngleDiff) { + groups.push_back(new BlockGroup(block)); + } else { + groups[best_g]->blocks.push_back(block); + groups[best_g]->bounding_box += block->bounding_box(); + float x_height = block->x_height(); + if (x_height < groups[best_g]->min_xheight) + groups[best_g]->min_xheight = x_height; + } + } + // Now process each group of blocks. 
+ PointerVector word_ptrs; + for (int g = 0; g < groups.size(); ++g) { + const BlockGroup* group = groups[g]; + tprintf("group %d, xh=%g, %d blocks\n", g, group->min_xheight, + group->blocks.size()); + WordGrid word_grid(group->min_xheight, group->bounding_box.botleft(), + group->bounding_box.topright()); + for (int b = 0; b < group->blocks.size(); ++b) { + tprintf("block %d, %d rows\n", b, group->blocks[b]->row_list()->length()); + ROW_IT row_it(group->blocks[b]->row_list()); + for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { + ROW* row = row_it.data(); + tprintf("%d words in row\n", row->word_list()->length()); + // Put the words of the row into the grid. + WERD_IT w_it(row->word_list()); + for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { + WERD* word = w_it.data(); + WordWithBox* box_word = new WordWithBox(word); + word_grid.InsertBBox(true, true, box_word); + // Save the pointer where it will be auto-deleted. + word_ptrs.push_back(box_word); + } + } + } + FCOORD rotation = group->rotation; + // Make it a forward rotation that will transform blob coords to block. + rotation.set_y(-rotation.y()); + TransferDiacriticsToWords(diacritic_blobs, rotation, &word_grid); + } +} + +// Places a copy of blobs that are near a word (after applying rotation to the +// blob) in the most appropriate word, unless there is doubt, in which case a +// blob can end up in two words. Source blobs are not touched. +void Textord::TransferDiacriticsToWords(BLOBNBOX_LIST* diacritic_blobs, + const FCOORD& rotation, + WordGrid* word_grid) { + WordSearch ws(word_grid); + BLOBNBOX_IT b_it(diacritic_blobs); + // Apply rotation to each blob before finding the nearest words. The rotation + // allows us to only consider above/below placement and not left/right on + // vertical text, because all text is horizontal here. 
+ for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { + BLOBNBOX* blobnbox = b_it.data(); + TBOX blob_box = blobnbox->bounding_box(); + blob_box.rotate(rotation); + ws.StartRectSearch(blob_box); + // Above/below refer to word position relative to diacritic. Since some + // scripts eg Kannada/Telugu habitually put diacritics below words, and + // others eg Thai/Vietnamese/Latin put most diacritics above words, try + // for both if there isn't much in it. + WordWithBox* best_above_word = NULL; + WordWithBox* best_below_word = NULL; + int best_above_distance = 0; + int best_below_distance = 0; + for (WordWithBox* word = ws.NextRectSearch(); word != NULL; + word = ws.NextRectSearch()) { + if (word->word()->flag(W_REP_CHAR)) continue; + TBOX word_box = word->true_bounding_box(); + int x_distance = blob_box.x_gap(word_box); + int y_distance = blob_box.y_gap(word_box); + if (x_distance > 0) { + // Arbitrarily divide x-distance by 2 if there is a major y overlap, + // and the word is to the left of the diacritic. If the + // diacritic is a dropped broken character between two words, this will + // help send all the pieces to a single word, instead of splitting them + // over the 2 words. 
+ if (word_box.major_y_overlap(blob_box) && + blob_box.left() > word_box.right()) { + x_distance /= 2; + } + y_distance += x_distance; + } + if (word_box.y_middle() > blob_box.y_middle() && + (best_above_word == NULL || y_distance < best_above_distance)) { + best_above_word = word; + best_above_distance = y_distance; + } + if (word_box.y_middle() <= blob_box.y_middle() && + (best_below_word == NULL || y_distance < best_below_distance)) { + best_below_word = word; + best_below_distance = y_distance; + } + } + bool above_good = + best_above_word != NULL && + (best_below_word == NULL || + best_above_distance < best_below_distance + blob_box.height()); + bool below_good = + best_below_word != NULL && best_below_word != best_above_word && + (best_above_word == NULL || + best_below_distance < best_above_distance + blob_box.height()); + if (below_good) { + C_BLOB* copied_blob = C_BLOB::deep_copy(blobnbox->cblob()); + copied_blob->rotate(rotation); + // Put the blob into the word's reject blobs list. + C_BLOB_IT blob_it(best_below_word->RejBlobs()); + blob_it.add_to_end(copied_blob); + } + if (above_good) { + C_BLOB* copied_blob = C_BLOB::deep_copy(blobnbox->cblob()); + copied_blob->rotate(rotation); + // Put the blob into the word's reject blobs list. + C_BLOB_IT blob_it(best_above_word->RejBlobs()); + blob_it.add_to_end(copied_blob); + } + } +} + } // tesseract /********************************************************************** @@ -820,33 +993,3 @@ void tweak_row_baseline(ROW *row, free_mem(xstarts); free_mem(coeffs); } - -/********************************************************************** - * blob_y_order - * - * Sort function to sort blobs in y from page top. 
- **********************************************************************/ - -inT32 blob_y_order( //sort function - void *item1, //items to compare - void *item2) { - //converted ptr - BLOBNBOX *blob1 = *(BLOBNBOX **) item1; - //converted ptr - BLOBNBOX *blob2 = *(BLOBNBOX **) item2; - - if (blob1->bounding_box ().bottom () > blob2->bounding_box ().bottom ()) - return -1; - else if (blob1->bounding_box ().bottom () < - blob2->bounding_box ().bottom ()) - return 1; - else { - if (blob1->bounding_box ().left () < blob2->bounding_box ().left ()) - return -1; - else if (blob1->bounding_box ().left () > - blob2->bounding_box ().left ()) - return 1; - else - return 0; - } -} diff --git a/textord/tordmain.h b/textord/tordmain.h index 340ff1aabe..cb5a6a1ef2 100644 --- a/textord/tordmain.h +++ b/textord/tordmain.h @@ -29,29 +29,14 @@ struct Pix; namespace tesseract { class Tesseract; -} -void make_blocks_from_blobs( //convert & textord - TBLOB *tessblobs, //tess style input - const char *filename, //blob file - ICOORD page_tr, //top right - BOOL8 do_shift, //shift tess coords - BLOCK_LIST *blocks //block list - ); void SetBlobStrokeWidth(Pix* pix, BLOBNBOX* blob); void assign_blobs_to_blocks2(Pix* pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *port_blocks); -void textord_page( //make rows & words - ICOORD page_tr, //top right - BLOCK_LIST *blocks, //block list - TO_BLOCK_LIST *land_blocks, //rotated for landscape - TO_BLOCK_LIST *port_blocks, //output list - tesseract::Tesseract* - ); +} // namespace tesseract + void tweak_row_baseline(ROW *row, double blshift_maxshift, double blshift_xfraction); -inT32 blob_y_order( //sort function - void *item1, //items to compare - void *item2); + #endif