diff --git a/ccmain/control.cpp b/ccmain/control.cpp
index a765a97c8a..3abf216e34 100644
--- a/ccmain/control.cpp
+++ b/ccmain/control.cpp
@@ -93,8 +93,7 @@ BOOL8 Tesseract::recog_interactive(PAGE_RES_IT* pr_it) {
 
   WordData word_data(*pr_it);
   SetupWordPassN(2, &word_data);
-  classify_word_and_language(&Tesseract::classify_word_pass2, pr_it,
-                             &word_data);
+  classify_word_and_language(2, pr_it, &word_data);
   if (tessedit_debug_quality_metrics) {
     WERD_RES* word_res = pr_it->word();
     word_char_quality(word_res, pr_it->row()->row, &char_qual, &good_char_qual);
@@ -190,6 +189,7 @@ void Tesseract::SetupWordPassN(int pass_n, WordData* word) {
       if (word->word->x_height == 0.0f)
         word->word->x_height = word->row->x_height();
     }
+    word->lang_words.truncate(0);
     for (int s = 0; s <= sub_langs_.size(); ++s) {
       // The sub_langs_.size() entry is for the master language.
       Tesseract* lang_t = s < sub_langs_.size() ? sub_langs_[s] : this;
@@ -249,15 +249,23 @@ bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor,
     while (pr_it->word() != NULL && pr_it->word() != word->word)
       pr_it->forward();
     ASSERT_HOST(pr_it->word() != NULL);
-    WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1
-                                            : &Tesseract::classify_word_pass2;
-    classify_word_and_language(recognizer, pr_it, word);
-    if (tessedit_dump_choices) {
+    bool make_next_word_fuzzy = false;
+    if (!AnyLSTMLang() &&
+        ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) {
+      // Needs to be setup again to see the new outlines in the chopped_word.
+      SetupWordPassN(pass_n, word);
+    }
+
+    classify_word_and_language(pass_n, pr_it, word);
+    if (tessedit_dump_choices || debug_noise_removal) {
       tprintf("Pass%d: %s [%s]\n", pass_n,
               word->word->best_choice->unichar_string().string(),
               word->word->best_choice->debug_string().string());
     }
     pr_it->forward();
+    if (make_next_word_fuzzy && pr_it->word() != NULL) {
+      pr_it->MakeCurrentWordFuzzy();
+    }
   }
   return true;
 }
@@ -898,6 +906,359 @@ static bool WordsAcceptable(const PointerVector<WERD_RES>& words) {
   return true;
 }
 
+// Moves good-looking "noise"/diacritics from the reject list to the main
+// blob list on the current word. Returns true if any outline was used, and
+// sets make_next_word_fuzzy if blob(s) were added to the end of the word.
+// Returns false early if either list is empty or rejects > noise_maxperword.
+bool Tesseract::ReassignDiacritics(int pass, PAGE_RES_IT* pr_it,
+                                   bool* make_next_word_fuzzy) {
+  *make_next_word_fuzzy = false;
+  WERD* real_word = pr_it->word()->word;
+  if (real_word->rej_cblob_list()->empty() ||
+      real_word->cblob_list()->empty() ||
+      real_word->rej_cblob_list()->length() > noise_maxperword)
+    return false;
+  real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle);
+  // Get the noise outlines into a vector with matching bool map.
+  GenericVector<C_OUTLINE*> outlines;
+  real_word->GetNoiseOutlines(&outlines);
+  GenericVector<bool> word_wanted;
+  GenericVector<bool> overlapped_any_blob;
+  GenericVector<C_BLOB*> target_blobs;
+  AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it,
+                                     &word_wanted, &overlapped_any_blob,
+                                     &target_blobs);
+  // Filter the outlines that overlapped any blob and put them into the word
+  // now, NULLing their slots in outlines. This simplifies the remaining task
+  // and makes it more accurate as it has more completed blobs to work on.
+  GenericVector<bool> wanted;
+  GenericVector<C_BLOB*> wanted_blobs;
+  GenericVector<C_OUTLINE*> wanted_outlines;
+  int num_overlapped = 0;
+  int num_overlapped_used = 0;
+  for (int i = 0; i < overlapped_any_blob.size(); ++i) {
+    if (!overlapped_any_blob[i]) continue;
+    ++num_overlapped;
+    if (word_wanted[i]) ++num_overlapped_used;
+    wanted.push_back(word_wanted[i]);
+    wanted_blobs.push_back(target_blobs[i]);
+    wanted_outlines.push_back(outlines[i]);
+    outlines[i] = NULL;
+  }
+  real_word->AddSelectedOutlines(wanted, wanted_blobs, wanted_outlines, NULL);
+  AssignDiacriticsToNewBlobs(outlines, pass, real_word, pr_it, &word_wanted,
+                             &target_blobs);
+  int non_overlapped = 0;       // Outlines left over after the step above.
+  int non_overlapped_used = 0;  // Those of them that were wanted.
+  for (int i = 0; i < word_wanted.size(); ++i) {
+    if (word_wanted[i]) ++non_overlapped_used;
+    if (outlines[i] != NULL) ++non_overlapped;
+  }
+  if (debug_noise_removal) {
+    tprintf("Used %d/%d overlapped %d/%d non-overlaped diacritics on word:",
+            num_overlapped_used, num_overlapped, non_overlapped_used,
+            non_overlapped);
+    real_word->bounding_box().print();
+  }
+  // Now we have decided which outlines we want, put them into the real_word.
+  if (real_word->AddSelectedOutlines(word_wanted, target_blobs, outlines,
+                                     make_next_word_fuzzy)) {
+    pr_it->MakeCurrentWordFuzzy();
+  }
+  // TODO(rays) Parts of combos have a deep copy of the real word, and need
+  // to have their noise outlines moved/assigned in the same way!!
+  return num_overlapped_used != 0 || non_overlapped_used != 0;
+}
+
+// Tries to attach noise/diacritic outlines to the blobs of real_word that
+// they overlap horizontally.
+// Input: outlines, the noisy outlines that probably belong to real_word.
+// Output, all sized to match outlines: overlapped_any_blob[i] is true if
+//   outline i overlapped some blob, word_wanted[i] is true if it is to be
+//   assigned to a blob, and target_blobs[i] is then the blob to assign to.
+void Tesseract::AssignDiacriticsToOverlappingBlobs(
+    const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word,
+    PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted,
+    GenericVector<bool>* overlapped_any_blob,
+    GenericVector<C_BLOB*>* target_blobs) {
+  const int num_outlines = outlines.size();
+  word_wanted->init_to_size(num_outlines, false);
+  overlapped_any_blob->init_to_size(num_outlines, false);
+  target_blobs->init_to_size(num_outlines, NULL);
+  // Scratch flags, reset for each blob: the outlines being tried on it.
+  GenericVector<bool> candidates;
+  // A single blob could be several merged characters, so quite a few
+  // outlines may overlap it, and the full engine is needed to chop and join
+  // to get a sensible result.
+  C_BLOB_IT blob_it(real_word->cblob_list());
+  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
+    C_BLOB* blob = blob_it.data();
+    TBOX blob_box = blob->bounding_box();
+    // Gather the not-yet-claimed outlines that seriously overlap this blob.
+    candidates.init_to_size(num_outlines, false);
+    int num_candidates = 0;
+    for (int ol = 0; ol < num_outlines; ++ol) {
+      if (!blob_box.major_x_overlap(outlines[ol]->bounding_box())) continue;
+      if ((*word_wanted)[ol]) continue;
+      candidates[ol] = true;
+      (*overlapped_any_blob)[ol] = true;
+      ++num_candidates;
+    }
+    if (debug_noise_removal) {
+      tprintf("%d noise outlines overlap blob at:", num_candidates);
+      blob_box.print();
+    }
+    // Nothing to do if no outlines overlap the blob, or too many do.
+    if (num_candidates <= 0 || num_candidates >= noise_maxperblob) continue;
+    // Classify the blob (using the full engine, languages and all), and
+    // choose the maximal combination of outlines that doesn't hurt the
+    // end-result classification by too much.
+    if (!SelectGoodDiacriticOutlines(pass, noise_cert_basechar, pr_it, blob,
+                                     outlines, num_candidates, &candidates)) {
+      continue;
+    }
+    // Claim the chosen outlines and record where they are going.
+    for (int ol = 0; ol < candidates.size(); ++ol) {
+      if (!candidates[ol]) continue;
+      (*word_wanted)[ol] = true;
+      (*target_blobs)[ol] = blob;
+    }
+  }
+}
+
+// Handles the outlines that overlapped no blob, ie the non-NULL entries of
+// outlines. Each run of adjacent ones is offered to the blob on its left,
+// then the one on its right, then tried as a new blob of its own.
+// Output: word_wanted indicates which outlines were accepted, and
+//   target_blobs which blob each goes to (NULL meaning a new blob).
+void Tesseract::AssignDiacriticsToNewBlobs(
+    const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word,
+    PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted,
+    GenericVector<C_BLOB*>* target_blobs) {
+  const int num_outlines = outlines.size();
+  word_wanted->init_to_size(num_outlines, false);
+  target_blobs->init_to_size(num_outlines, NULL);
+  // Scratch flags, reset for each run: the outlines being placed.
+  GenericVector<bool> run_wanted;
+  for (int i = 0; i < num_outlines; ++i) {
+    if (outlines[i] == NULL) continue;
+    // Get the run of adjacent blobless outlines that starts here, and its
+    // bounding box. This leaves i on the entry that terminates the run.
+    run_wanted.init_to_size(num_outlines, false);
+    const int run_start = i;
+    TBOX run_box(outlines[i]->bounding_box());
+    for (; i < num_outlines && outlines[i] != NULL; ++i) {
+      run_wanted[i] = true;
+      run_box += outlines[i]->bounding_box();
+    }
+    const int run_length = i - run_start;
+    // Find the insertion point: step forward while the next blob starts at
+    // or before the left edge of the run.
+    C_BLOB_IT blob_it(real_word->cblob_list());
+    while (!blob_it.at_last() &&
+           blob_it.data_relative(1)->bounding_box().left() <=
+               run_box.left()) {
+      blob_it.forward();
+    }
+    if (debug_noise_removal)
+      tprintf("Num blobless outlines = %d\n", run_length);
+    // The candidate hosts are the blobs either side of the insertion point.
+    C_BLOB* left_blob = blob_it.data();
+    C_BLOB* right_blob = blob_it.at_last() ? NULL : blob_it.data_relative(1);
+    const bool left_overlaps = left_blob->bounding_box().x_overlap(run_box);
+    const bool right_overlaps =
+        right_blob != NULL && right_blob->bounding_box().x_overlap(run_box);
+    // Choose which combination of them we actually want and where to put
+    // them. The left blob is tried if it overlaps the run or the right one
+    // doesn't, the right blob if it exists and overlaps or the left doesn't.
+    // A successful selection cuts run_wanted down to the chosen outlines.
+    C_BLOB* target = NULL;
+    if ((left_overlaps || !right_overlaps) &&
+        SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, left_blob,
+                                    outlines, run_length, &run_wanted)) {
+      if (debug_noise_removal) tprintf("Added to left blob\n");
+      target = left_blob;
+    } else if (right_blob != NULL && (!left_overlaps || right_overlaps) &&
+               SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it,
+                                           right_blob, outlines, run_length,
+                                           &run_wanted)) {
+      if (debug_noise_removal) tprintf("Added to right blob\n");
+      target = right_blob;
+    } else if (SelectGoodDiacriticOutlines(pass, noise_cert_punc, pr_it, NULL,
+                                           outlines, run_length,
+                                           &run_wanted)) {
+      // Good enough to stand alone: a NULL target requests a new blob.
+      if (debug_noise_removal) tprintf("Fitted between blobs\n");
+    } else {
+      continue;  // No placement was good enough, so the run stays unwanted.
+    }
+    // Claim the chosen outlines and record where they are going.
+    for (int j = 0; j < run_wanted.size(); ++j) {
+      if (!run_wanted[j]) continue;
+      (*word_wanted)[j] = true;
+      (*target_blobs)[j] = target;
+    }
+  }
+}
+
+// Starting with ok_outlines flagging the candidate outlines for blob (NULL
+// means a new blob), greedily drops outlines to maximize the certainty.
+// Returns true, with ok_outlines set to the best subset, iff that reaches
+// the target: certainty_threshold, blended with blob's own certainty if any.
+bool Tesseract::SelectGoodDiacriticOutlines(
+    int pass, float certainty_threshold, PAGE_RES_IT* pr_it, C_BLOB* blob,
+    const GenericVector<C_OUTLINE*>& outlines, int num_outlines,
+    GenericVector<bool>* ok_outlines) {
+  STRING best_str;
+  float target_cert = certainty_threshold;
+  if (blob != NULL) {
+    float target_c2;
+    target_cert = ClassifyBlobAsWord(pass, pr_it, blob, &best_str, &target_c2);
+    if (debug_noise_removal) {
+      tprintf("No Noise blob classified as %s=%g(%g) at:", best_str.string(),
+              target_cert, target_c2);
+      blob->bounding_box().print();
+    }
+    target_cert -= (target_cert - certainty_threshold) * noise_cert_factor;
+  }
+  GenericVector<bool> test_outlines = *ok_outlines;
+  // Start with all the outlines in.
+  STRING all_str;
+  GenericVector<bool> best_outlines = *ok_outlines;
+  float best_cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
+                                             pr_it, blob, &all_str);
+  if (debug_noise_removal) {
+    TBOX ol_box;
+    for (int i = 0; i < test_outlines.size(); ++i) {
+      if (test_outlines[i]) ol_box += outlines[i]->bounding_box();
+    }
+    tprintf("All Noise blob classified as %s=%g, delta=%g at:",
+            all_str.string(), best_cert, best_cert - target_cert);
+    ol_box.print();
+  }
+  // Repeatedly zero out the bit that improves the certainty the most, until
+  // one bit is left or nothing improves. The target is only tested afterwards.
+  int best_index = 0;  // To zero out.
+  while (num_outlines > 1 && best_index >= 0) {
+    // Find the best bit to zero out.
+    best_index = -1;
+    for (int i = 0; i < outlines.size(); ++i) {
+      if (test_outlines[i]) {
+        test_outlines[i] = false;
+        STRING str;
+        float cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
+                                              pr_it, blob, &str);
+        if (debug_noise_removal) {
+          TBOX ol_box;
+          for (int j = 0; j < outlines.size(); ++j) {
+            if (test_outlines[j]) ol_box += outlines[j]->bounding_box();
+            tprintf("%d", test_outlines[j]);
+          }
+          tprintf(" blob classified as %s=%g, delta=%g) at:", str.string(),
+                  cert, cert - target_cert);
+          ol_box.print();
+        }
+        if (cert > best_cert) {
+          best_cert = cert;
+          best_index = i;
+          best_outlines = test_outlines;
+        }
+        test_outlines[i] = true;
+      }
+    }
+    if (best_index >= 0) {
+      test_outlines[best_index] = false;
+      --num_outlines;
+    }
+  }
+  if (best_cert >= target_cert) {
+    // Save the best combination.
+    *ok_outlines = best_outlines;
+    if (debug_noise_removal) {
+      tprintf("%s noise combination ", blob ? "Adding" : "New");
+      for (int i = 0; i < best_outlines.size(); ++i) {
+        tprintf("%d", best_outlines[i]);
+      }
+      tprintf(" yields certainty %g, beating target of %g\n", best_cert,
+              target_cert);
+    }
+    return true;
+  }
+  return false;
+}
+
+// Classifies the given blob plus the outlines flagged by ok_outlines, then
+// undoes their inclusion. A NULL blob means: classify the outlines alone.
+float Tesseract::ClassifyBlobPlusOutlines(
+    const GenericVector<bool>& ok_outlines,
+    const GenericVector<C_OUTLINE*>& outlines, int pass_n, PAGE_RES_IT* pr_it,
+    C_BLOB* blob, STRING* best_str) {
+  C_OUTLINE_IT ol_it;
+  C_OUTLINE* first_to_keep = NULL;  // First original outline of blob, if any.
+  if (blob != NULL) {
+    // Remember the first original outline: new ones are added before it.
+    ol_it.set_to_list(blob->out_list());
+    first_to_keep = ol_it.data();
+  }
+  for (int i = 0; i < ok_outlines.size(); ++i) {
+    if (ok_outlines[i]) {
+      // This outline is to be added; the first one seeds a blob if none given.
+      if (blob == NULL) {
+        blob = new C_BLOB(outlines[i]);
+        ol_it.set_to_list(blob->out_list());
+      } else {
+        ol_it.add_before_stay_put(outlines[i]);
+      }
+    }
+  }
+  float c2;  // Set by ClassifyBlobAsWord.
+  float cert = ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2);
+  ol_it.move_to_first();
+  if (first_to_keep == NULL) {
+    // We created blob. Empty its outlines and delete it.
+    for (; !ol_it.empty(); ol_it.forward()) ol_it.extract();
+    delete blob;
+    cert = -c2;  // A created blob is scored by -c2 instead of the certainty.
+  } else {
+    // Remove the outlines that we put in, stopping at the first original one.
+    for (; ol_it.data() != first_to_keep; ol_it.forward()) {
+      ol_it.extract();
+    }
+  }
+  return cert;  // Raw-choice certainty, or -c2 if blob was created here.
+}
+
+// Classifies a deep copy of the given blob as an individual word (using
+// languages, chopper etc) via a temporary clone word that is deleted again.
+// Returns the certainty of the raw choice; also sets best_str and c2.
+float Tesseract::ClassifyBlobAsWord(int pass_n, PAGE_RES_IT* pr_it,
+                                    C_BLOB* blob, STRING* best_str, float* c2) {
+  WERD* real_word = pr_it->word()->word;
+  C_BLOB* blob_copy = C_BLOB::deep_copy(blob);
+  WERD* tmp_word = real_word->ConstructFromSingleBlob(
+      real_word->flag(W_BOL), real_word->flag(W_EOL), blob_copy);
+  WERD_RES* tmp_res = pr_it->InsertSimpleCloneWord(*pr_it->word(), tmp_word);
+  // Walk a fresh iterator forward until it sits on the clone word.
+  PAGE_RES_IT tmp_it(pr_it->page_res);
+  while (tmp_it.word() != NULL && tmp_it.word() != tmp_res) tmp_it.forward();
+  ASSERT_HOST(tmp_it.word() == tmp_res);
+  WordData tmp_data(tmp_it);
+  SetupWordPassN(1, &tmp_data);  // Pass 1 forces full initialization.
+  classify_word_and_language(pass_n, &tmp_it, &tmp_data);
+  WERD_CHOICE* raw = tmp_data.word->raw_choice;
+  if (debug_noise_removal) {
+    tprintf("word xheight=%g, row=%g, range=[%g,%g]\n", tmp_res->x_height,
+            tmp_data.row->x_height(), raw->min_x_height(), raw->max_x_height());
+  }
+  const float cert = raw->certainty();
+  const float rating = raw->rating();
+  *c2 = rating > 0.0f ? cert * cert / rating : 0.0f;
+  *best_str = raw->unichar_string();
+  tmp_it.DeleteCurrentWord();
+  pr_it->ResetWordIterator();
+  return cert;
+}
+
 // Generic function for classifying a word. Can be used either for pass1 or
 // pass2 according to the function passed to recognizer.
 // word_data holds the word to be recognized, and its block and row, and
@@ -906,9 +1267,10 @@ static bool WordsAcceptable(const PointerVector<WERD_RES>& words) {
 // Recognizes in the current language, and if successful that is all.
 // If recognition was not successful, tries all available languages until
 // it gets a successful result or runs out of languages. Keeps the best result.
-void Tesseract::classify_word_and_language(WordRecognizer recognizer,
-                                           PAGE_RES_IT* pr_it,
+void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it,
                                            WordData* word_data) {
+  WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1
+                                          : &Tesseract::classify_word_pass2;
   // Best result so far.
   PointerVector<WERD_RES> best_words;
   // Points to the best result. May be word or in lang_words.
diff --git a/ccmain/fixspace.cpp b/ccmain/fixspace.cpp
index 17c4f96ed1..0a561ac9a0 100644
--- a/ccmain/fixspace.cpp
+++ b/ccmain/fixspace.cpp
@@ -205,8 +205,7 @@ void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row,
     if ((!word->part_of_combo) && (word->box_word == NULL)) {
       WordData word_data(block, row, word);
       SetupWordPassN(2, &word_data);
-      classify_word_and_language(&Tesseract::classify_word_pass2, NULL,
-                                 &word_data);
+      classify_word_and_language(2, NULL, &word_data);
     }
     prev_word_best_choice_ = word->best_choice;
   }
diff --git a/ccmain/pageiterator.cpp b/ccmain/pageiterator.cpp
index c8e025c13f..ed03ceaba5 100644
--- a/ccmain/pageiterator.cpp
+++ b/ccmain/pageiterator.cpp
@@ -26,15 +26,23 @@
 
 namespace tesseract {
 
-PageIterator::PageIterator(PAGE_RES* page_res, Tesseract* tesseract,
-                           int scale, int scaled_yres,
-                           int rect_left, int rect_top,
+PageIterator::PageIterator(PAGE_RES* page_res, Tesseract* tesseract, int scale,
+                           int scaled_yres, int rect_left, int rect_top,
                            int rect_width, int rect_height)
-  : page_res_(page_res), tesseract_(tesseract),
-    word_(NULL), word_length_(0), blob_index_(0), cblob_it_(NULL),
-    scale_(scale), scaled_yres_(scaled_yres),
-    rect_left_(rect_left), rect_top_(rect_top),
-    rect_width_(rect_width), rect_height_(rect_height) {
+    : page_res_(page_res),
+      tesseract_(tesseract),
+      word_(NULL),
+      word_length_(0),
+      blob_index_(0),
+      cblob_it_(NULL),
+      include_upper_dots_(false),
+      include_lower_dots_(false),
+      scale_(scale),
+      scaled_yres_(scaled_yres),
+      rect_left_(rect_left),
+      rect_top_(rect_top),
+      rect_width_(rect_width),
+      rect_height_(rect_height) {
   it_ = new PAGE_RES_IT(page_res);
   PageIterator::Begin();
 }
@@ -50,12 +58,20 @@ PageIterator::~PageIterator() {
  * objects at a higher level.
  */
 PageIterator::PageIterator(const PageIterator& src)
-  : page_res_(src.page_res_), tesseract_(src.tesseract_),
-    word_(NULL), word_length_(src.word_length_),
-    blob_index_(src.blob_index_), cblob_it_(NULL),
-    scale_(src.scale_), scaled_yres_(src.scaled_yres_),
-    rect_left_(src.rect_left_), rect_top_(src.rect_top_),
-    rect_width_(src.rect_width_), rect_height_(src.rect_height_) {
+    : page_res_(src.page_res_),
+      tesseract_(src.tesseract_),
+      word_(NULL),
+      word_length_(src.word_length_),
+      blob_index_(src.blob_index_),
+      cblob_it_(NULL),
+      include_upper_dots_(src.include_upper_dots_),
+      include_lower_dots_(src.include_lower_dots_),
+      scale_(src.scale_),
+      scaled_yres_(src.scaled_yres_),
+      rect_left_(src.rect_left_),
+      rect_top_(src.rect_top_),
+      rect_width_(src.rect_width_),
+      rect_height_(src.rect_height_) {
   it_ = new PAGE_RES_IT(*src.it_);
   BeginWord(src.blob_index_);
 }
@@ -63,6 +79,8 @@ PageIterator::PageIterator(const PageIterator& src)
 const PageIterator& PageIterator::operator=(const PageIterator& src) {
   page_res_ = src.page_res_;
   tesseract_ = src.tesseract_;
+  include_upper_dots_ = src.include_upper_dots_;
+  include_lower_dots_ = src.include_lower_dots_;
   scale_ = src.scale_;
   scaled_yres_ = src.scaled_yres_;
   rect_left_ = src.rect_left_;
@@ -252,16 +270,19 @@ bool PageIterator::BoundingBoxInternal(PageIteratorLevel level,
   PARA *para = NULL;
   switch (level) {
     case RIL_BLOCK:
-      box = it_->block()->block->bounding_box();
+      box = it_->block()->block->restricted_bounding_box(include_upper_dots_,
+                                                         include_lower_dots_);
       break;
     case RIL_PARA:
       para = it_->row()->row->para();
       // explicit fall-through.
     case RIL_TEXTLINE:
-      box = it_->row()->row->bounding_box();
+      box = it_->row()->row->restricted_bounding_box(include_upper_dots_,
+                                                     include_lower_dots_);
       break;
     case RIL_WORD:
-      box = it_->word()->word->bounding_box();
+      box = it_->word()->word->restricted_bounding_box(include_upper_dots_,
+                                                       include_lower_dots_);
       break;
     case RIL_SYMBOL:
       if (cblob_it_ == NULL)
@@ -387,39 +408,23 @@ Pix* PageIterator::GetBinaryImage(PageIteratorLevel level) const {
   int left, top, right, bottom;
   if (!BoundingBoxInternal(level, &left, &top, &right, &bottom))
     return NULL;
-  Pix* pix = NULL;
-  switch (level) {
-    case RIL_BLOCK:
-    case RIL_PARA:
-      int bleft, btop, bright, bbottom;
-      BoundingBoxInternal(RIL_BLOCK, &bleft, &btop, &bright, &bbottom);
-      pix = it_->block()->block->render_mask();
-      // AND the mask and the image.
-      pixRasterop(pix, 0, 0, pixGetWidth(pix), pixGetHeight(pix),
-                  PIX_SRC & PIX_DST, tesseract_->pix_binary(),
-                  bleft, btop);
-      if (level == RIL_PARA) {
-        // RIL_PARA needs further attention:
-        //   clip the paragraph from the block mask.
-        Box* box = boxCreate(left - bleft, top - btop,
-                             right - left, bottom - top);
-        Pix* pix2 = pixClipRectangle(pix, box, NULL);
-        boxDestroy(&box);
-        pixDestroy(&pix);
-        pix = pix2;
-      }
-      break;
-    case RIL_TEXTLINE:
-    case RIL_WORD:
-    case RIL_SYMBOL:
-      if (level == RIL_SYMBOL && cblob_it_ != NULL &&
-          cblob_it_->data()->area() != 0)
-        return cblob_it_->data()->render();
-      // Just clip from the bounding box.
-      Box* box = boxCreate(left, top, right - left, bottom - top);
-      pix = pixClipRectangle(tesseract_->pix_binary(), box, NULL);
-      boxDestroy(&box);
-      break;
+  if (level == RIL_SYMBOL && cblob_it_ != NULL &&
+      cblob_it_->data()->area() != 0)
+    return cblob_it_->data()->render();
+  Box* box = boxCreate(left, top, right - left, bottom - top);
+  Pix* pix = pixClipRectangle(tesseract_->pix_binary(), box, NULL);
+  boxDestroy(&box);
+  if (level == RIL_BLOCK || level == RIL_PARA) {
+    // Clip to the block polygon as well.
+    TBOX mask_box;
+    Pix* mask = it_->block()->block->render_mask(&mask_box);
+    int mask_x = left - mask_box.left();
+    int mask_y = top - (tesseract_->ImageHeight() - mask_box.top());
+    // AND the mask and pix, putting the result in pix.
+    pixRasterop(pix, MAX(0, -mask_x), MAX(0, -mask_y), pixGetWidth(pix),
+                pixGetHeight(pix), PIX_SRC & PIX_DST, mask, MAX(0, mask_x),
+                MAX(0, mask_y));
+    pixDestroy(&mask);
   }
   return pix;
 }
@@ -452,17 +457,24 @@ Pix* PageIterator::GetImage(PageIteratorLevel level, int padding,
   Box* box = boxCreate(*left, *top, right - *left, bottom - *top);
   Pix* grey_pix = pixClipRectangle(original_img, box, NULL);
   boxDestroy(&box);
-  if (level == RIL_BLOCK) {
-    Pix* mask = it_->block()->block->render_mask();
-    Pix* expanded_mask = pixCreate(right - *left, bottom - *top, 1);
-    pixRasterop(expanded_mask, padding, padding,
-                pixGetWidth(mask), pixGetHeight(mask),
-                PIX_SRC, mask, 0, 0);
+  if (level == RIL_BLOCK || level == RIL_PARA) {
+    // Clip to the block polygon as well.
+    TBOX mask_box;
+    Pix* mask = it_->block()->block->render_mask(&mask_box);
+    // Copy the mask registered correctly into an image the size of grey_pix.
+    int mask_x = *left - mask_box.left();
+    int mask_y = *top - (pixGetHeight(original_img) - mask_box.top());
+    int width = pixGetWidth(grey_pix);
+    int height = pixGetHeight(grey_pix);
+    Pix* resized_mask = pixCreate(width, height, 1);
+    pixRasterop(resized_mask, MAX(0, -mask_x), MAX(0, -mask_y), width, height,
+                PIX_SRC, mask, MAX(0, mask_x), MAX(0, mask_y));
     pixDestroy(&mask);
-    pixDilateBrick(expanded_mask, expanded_mask, 2*padding + 1, 2*padding + 1);
-    pixInvert(expanded_mask, expanded_mask);
-    pixSetMasked(grey_pix, expanded_mask, MAX_UINT32);
-    pixDestroy(&expanded_mask);
+    pixDilateBrick(resized_mask, resized_mask, 2 * padding + 1,
+                   2 * padding + 1);
+    pixInvert(resized_mask, resized_mask);
+    pixSetMasked(grey_pix, resized_mask, MAX_UINT32);
+    pixDestroy(&resized_mask);
   }
   return grey_pix;
 }
diff --git a/ccmain/pageiterator.h b/ccmain/pageiterator.h
index 27b02ddf8f..56c78150a8 100644
--- a/ccmain/pageiterator.h
+++ b/ccmain/pageiterator.h
@@ -179,6 +179,21 @@ class TESS_API PageIterator {
   // If an image rectangle has been set in the API, then returned coordinates
   // relate to the original (full) image, rather than the rectangle.
 
+  /**
+   * Selects whether bounding boxes at the levels from RIL_WORD to RIL_BLOCK
+   * include potential diacritics lying above (include_upper_dots) and/or
+   * below (include_lower_dots) the main body of the text. Until recognition
+   * has run it isn't known where all diacritics belong, so this lets the
+   * caller choose to include or exclude them. After recognition, and where
+   * the placement is obvious, it makes less difference, as the diacritics
+   * will already be included in the word.
+   */
+  void SetBoundingBoxComponents(bool include_upper_dots,
+                                bool include_lower_dots) {
+    include_lower_dots_ = include_lower_dots;
+    include_upper_dots_ = include_upper_dots;
+  }
+
   /**
    * Returns the bounding rectangle of the current object at the given level.
    * See comment on coordinate system above.
@@ -332,6 +347,9 @@ class TESS_API PageIterator {
    * Owned by this ResultIterator.
    */
   C_BLOB_IT* cblob_it_;
+  /** Control over what to include in bounding boxes. */
+  bool include_upper_dots_;
+  bool include_lower_dots_;
   /** Parameters saved from the Thresholder. Needed to rebuild coordinates.*/
   int scale_;
   int scaled_yres_;
diff --git a/ccmain/pagesegmain.cpp b/ccmain/pagesegmain.cpp
index 396be13048..6ced2d4c40 100644
--- a/ccmain/pagesegmain.cpp
+++ b/ccmain/pagesegmain.cpp
@@ -134,12 +134,20 @@ int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
     // UNLV file present. Use PSM_SINGLE_BLOCK.
     pageseg_mode = PSM_SINGLE_BLOCK;
   }
+  // The diacritic_blobs holds noise blobs that may be diacritics. They
+  // are separated out on areas of the image that seem noisy and short-circuit
+  // the layout process, going straight from the initial partition creation
+  // right through to after word segmentation, where they are added to the
+  // rej_cblobs list of the most appropriate word. From there classification
+  // will determine whether they are used.
+  BLOBNBOX_LIST diacritic_blobs;
   int auto_page_seg_ret_val = 0;
   TO_BLOCK_LIST to_blocks;
   if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) ||
       PSM_SPARSE(pageseg_mode)) {
-    auto_page_seg_ret_val =
-        AutoPageSeg(pageseg_mode, blocks, &to_blocks, osd_tess, osr);
+    auto_page_seg_ret_val = AutoPageSeg(
+        pageseg_mode, blocks, &to_blocks,
+        enable_noise_removal ? &diacritic_blobs : NULL, osd_tess, osr);
     if (pageseg_mode == PSM_OSD_ONLY)
       return auto_page_seg_ret_val;
     // To create blobs from the image region bounds uncomment this line:
@@ -171,7 +179,7 @@ int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
 
   textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_,
                        pix_thresholds_, pix_grey_, splitting || cjk_mode,
-                       blocks, &to_blocks);
+                       &diacritic_blobs, blocks, &to_blocks);
   return auto_page_seg_ret_val;
 }
 
@@ -197,7 +205,6 @@ static void WriteDebugBackgroundImage(bool printable, Pix* pix_binary) {
   pixDestroy(&grey_pix);
 }
 
-
 /**
  * Auto page segmentation. Divide the page image into blocks of uniform
  * text linespacing and images.
@@ -207,9 +214,14 @@ static void WriteDebugBackgroundImage(bool printable, Pix* pix_binary) {
  * The output goes in the blocks list with corresponding TO_BLOCKs in the
  * to_blocks list.
  *
- * If single_column is true, then no attempt is made to divide the image
- * into columns, but multiple blocks are still made if the text is of
- * non-uniform linespacing.
+ * If !PSM_COL_FIND_ENABLED(pageseg_mode), then no attempt is made to divide
+ * the image into columns, but multiple blocks are still made if the text is
+ * of non-uniform linespacing.
+ *
+ * If diacritic_blobs is non-null, then diacritics/noise blobs, that would
+ * confuse layout analysis by causing textline overlap, are placed there,
+ * with the expectation that they will be reassigned to words later and
+ * noise/diacriticness determined via classification.
  *
  * If osd (orientation and script detection) is true then that is performed
  * as well. If only_osd is true, then only orientation and script detection is
@@ -217,9 +229,10 @@ static void WriteDebugBackgroundImage(bool printable, Pix* pix_binary) {
  * another Tesseract that was initialized especially for osd, and the results
  * will be output into osr (orientation and script result).
  */
-int Tesseract::AutoPageSeg(PageSegMode pageseg_mode,
-                           BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks,
-                           Tesseract* osd_tess, OSResults* osr) {
+int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST* blocks,
+                           TO_BLOCK_LIST* to_blocks,
+                           BLOBNBOX_LIST* diacritic_blobs, Tesseract* osd_tess,
+                           OSResults* osr) {
   if (textord_debug_images) {
     WriteDebugBackgroundImage(textord_debug_printable, pix_binary_);
   }
@@ -247,10 +260,9 @@ int Tesseract::AutoPageSeg(PageSegMode pageseg_mode,
     if (equ_detect_) {
       finder->SetEquationDetect(equ_detect_);
     }
-    result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_,
-                                to_block, photomask_pix,
-                                pix_thresholds_, pix_grey_,
-                                &found_blocks, to_blocks);
+    result = finder->FindBlocks(
+        pageseg_mode, scaled_color_, scaled_factor_, to_block, photomask_pix,
+        pix_thresholds_, pix_grey_, &found_blocks, diacritic_blobs, to_blocks);
     if (result >= 0)
       finder->GetDeskewVectors(&deskew_, &reskew_);
     delete finder;
diff --git a/ccmain/pgedit.cpp b/ccmain/pgedit.cpp
index 7c8f626b6b..ea44ead7c9 100644
--- a/ccmain/pgedit.cpp
+++ b/ccmain/pgedit.cpp
@@ -655,7 +655,8 @@ void show_point(PAGE_RES* page_res, float x, float y) {
   FCOORD pt(x, y);
   PAGE_RES_IT pr_it(page_res);
 
-  char msg[160];
+  const int kBufsize = 512;
+  char msg[kBufsize];
   char *msg_ptr = msg;
 
   msg_ptr += sprintf(msg_ptr, "Pt:(%0.3f, %0.3f) ", x, y);
diff --git a/ccmain/recogtraining.cpp b/ccmain/recogtraining.cpp
index 2dc94886ed..27d7e97ea0 100644
--- a/ccmain/recogtraining.cpp
+++ b/ccmain/recogtraining.cpp
@@ -207,8 +207,7 @@ void Tesseract::ambigs_classify_and_output(const char *label,
   fflush(stdout);
   WordData word_data(*pr_it);
   SetupWordPassN(1, &word_data);
-  classify_word_and_language(&Tesseract::classify_word_pass1,
-                             pr_it, &word_data);
+  classify_word_and_language(1, pr_it, &word_data);
   WERD_RES* werd_res = word_data.word;
   WERD_CHOICE *best_choice = werd_res->best_choice;
   ASSERT_HOST(best_choice != NULL);
diff --git a/ccmain/tesseractclass.cpp b/ccmain/tesseractclass.cpp
index c262bbc95e..25819e8cdd 100644
--- a/ccmain/tesseractclass.cpp
+++ b/ccmain/tesseractclass.cpp
@@ -55,507 +55,569 @@
 namespace tesseract {
 
 Tesseract::Tesseract()
-  : BOOL_MEMBER(tessedit_resegment_from_boxes, false,
-                "Take segmentation and labeling from box file",
-                this->params()),
-    BOOL_MEMBER(tessedit_resegment_from_line_boxes, false,
-                "Conversion of word/line box file to char box file",
-                this->params()),
-    BOOL_MEMBER(tessedit_train_from_boxes, false,
-                "Generate training data from boxed chars", this->params()),
-    BOOL_MEMBER(tessedit_make_boxes_from_boxes, false,
-                "Generate more boxes from boxed chars", this->params()),
-    BOOL_MEMBER(tessedit_dump_pageseg_images, false,
-               "Dump intermediate images made during page segmentation",
-               this->params()),
-    // The default for pageseg_mode is the old behaviour, so as not to
-    // upset anything that relies on that.
-    INT_MEMBER(tessedit_pageseg_mode, PSM_SINGLE_BLOCK,
-               "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block,"
-               " 5=line, 6=word, 7=char"
-               " (Values from PageSegMode enum in publictypes.h)",
-               this->params()),
-    INT_INIT_MEMBER(tessedit_ocr_engine_mode, tesseract::OEM_TESSERACT_ONLY,
-                    "Which OCR engine(s) to run (Tesseract, Cube, both)."
-                    " Defaults to loading and running only Tesseract"
-                    " (no Cube,no combiner)."
-                    " Values from OcrEngineMode enum in tesseractclass.h)",
-               this->params()),
-    STRING_MEMBER(tessedit_char_blacklist, "",
-                  "Blacklist of chars not to recognize", this->params()),
-    STRING_MEMBER(tessedit_char_whitelist, "",
-                  "Whitelist of chars to recognize", this->params()),
-    STRING_MEMBER(tessedit_char_unblacklist, "",
-                  "List of chars to override tessedit_char_blacklist",
-                  this->params()),
-    BOOL_MEMBER(tessedit_ambigs_training, false,
-                "Perform training for ambiguities", this->params()),
-    INT_MEMBER(pageseg_devanagari_split_strategy,
-              tesseract::ShiroRekhaSplitter::NO_SPLIT,
-              "Whether to use the top-line splitting process for Devanagari "
-              "documents while performing page-segmentation.", this->params()),
-    INT_MEMBER(ocr_devanagari_split_strategy,
-              tesseract::ShiroRekhaSplitter::NO_SPLIT,
-              "Whether to use the top-line splitting process for Devanagari "
-              "documents while performing ocr.", this->params()),
-    STRING_MEMBER(tessedit_write_params_to_file, "",
-                  "Write all parameters to the given file.", this->params()),
-    BOOL_MEMBER(tessedit_adaption_debug, false, "Generate and print debug"
-                " information for adaption", this->params()),
-    INT_MEMBER(bidi_debug, 0, "Debug level for BiDi", this->params()),
-    INT_MEMBER(applybox_debug, 1, "Debug level", this->params()),
-    INT_MEMBER(applybox_page, 0,
-               "Page number to apply boxes from", this->params()),
-    STRING_MEMBER(applybox_exposure_pattern, ".exp", "Exposure value follows"
-                  " this pattern in the image filename. The name of the image"
-                  " files are expected to be in the form"
-                  " [lang].[fontname].exp[num].tif", this->params()),
-    BOOL_MEMBER(applybox_learn_chars_and_char_frags_mode, false,
-               "Learn both character fragments (as is done in the"
-               " special low exposure mode) as well as unfragmented"
-               " characters.", this->params()),
-    BOOL_MEMBER(applybox_learn_ngrams_mode, false, "Each bounding box"
-                " is assumed to contain ngrams. Only learn the ngrams"
-                " whose outlines overlap horizontally.", this->params()),
-    BOOL_MEMBER(tessedit_display_outwords, false,
-                "Draw output words", this->params()),
-    BOOL_MEMBER(tessedit_dump_choices, false,
-                "Dump char choices", this->params()),
-    BOOL_MEMBER(tessedit_timing_debug, false, "Print timing stats",
-                this->params()),
-    BOOL_MEMBER(tessedit_fix_fuzzy_spaces, true,
-                "Try to improve fuzzy spaces", this->params()),
-    BOOL_MEMBER(tessedit_unrej_any_wd, false,
-                "Dont bother with word plausibility", this->params()),
-    BOOL_MEMBER(tessedit_fix_hyphens, true,
-                "Crunch double hyphens?", this->params()),
-    BOOL_MEMBER(tessedit_redo_xheight, true,
-                "Check/Correct x-height", this->params()),
-    BOOL_MEMBER(tessedit_enable_doc_dict, true,
-                "Add words to the document dictionary", this->params()),
-    BOOL_MEMBER(tessedit_debug_fonts, false,
-                "Output font info per char", this->params()),
-    BOOL_MEMBER(tessedit_debug_block_rejection, false,
-                "Block and Row stats", this->params()),
-    BOOL_MEMBER(tessedit_enable_bigram_correction, true,
-                "Enable correction based on the word bigram dictionary.",
-                this->params()),
-    BOOL_MEMBER(tessedit_enable_dict_correction, false,
-                "Enable single word correction based on the dictionary.",
-                this->params()),
-    INT_MEMBER(tessedit_bigram_debug, 0,
-               "Amount of debug output for bigram correction.",
-               this->params()),
-    INT_MEMBER(debug_x_ht_level, 0, "Reestimate debug", this->params()),
-    BOOL_MEMBER(debug_acceptable_wds, false,
-                "Dump word pass/fail chk", this->params()),
-    STRING_MEMBER(chs_leading_punct, "('`\"",
-                  "Leading punctuation", this->params()),
-    STRING_MEMBER(chs_trailing_punct1, ").,;:?!",
-                  "1st Trailing punctuation", this->params()),
-    STRING_MEMBER(chs_trailing_punct2, ")'`\"",
-                  "2nd Trailing punctuation", this->params()),
-    double_MEMBER(quality_rej_pc, 0.08,
-                  "good_quality_doc lte rejection limit", this->params()),
-    double_MEMBER(quality_blob_pc, 0.0,
-                  "good_quality_doc gte good blobs limit", this->params()),
-    double_MEMBER(quality_outline_pc, 1.0,
-                  "good_quality_doc lte outline error limit", this->params()),
-    double_MEMBER(quality_char_pc, 0.95,
-                  "good_quality_doc gte good char limit", this->params()),
-    INT_MEMBER(quality_min_initial_alphas_reqd, 2,
-               "alphas in a good word", this->params()),
-    INT_MEMBER(tessedit_tess_adaption_mode, 0x27,
-               "Adaptation decision algorithm for tess", this->params()),
-    BOOL_MEMBER(tessedit_minimal_rej_pass1, false,
-                "Do minimal rejection on pass 1 output", this->params()),
-    BOOL_MEMBER(tessedit_test_adaption, false,
-                "Test adaption criteria", this->params()),
-    BOOL_MEMBER(tessedit_matcher_log, false,
-                "Log matcher activity", this->params()),
-    INT_MEMBER(tessedit_test_adaption_mode, 3,
-               "Adaptation decision algorithm for tess", this->params()),
-    BOOL_MEMBER(test_pt, false, "Test for point", this->params()),
-    double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params()),
-    double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params()),
-    INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.",
-               this->params()),
-    BOOL_MEMBER(paragraph_text_based, true,
-                "Run paragraph detection on the post-text-recognition "
-                "(more accurate)", this->params()),
-    INT_MEMBER(cube_debug_level, 0, "Print cube debug info.", this->params()),
-    STRING_MEMBER(outlines_odd, "%| ", "Non standard number of outlines",
-                  this->params()),
-    STRING_MEMBER(outlines_2, "ij!?%\":;",
-                  "Non standard number of outlines", this->params()),
-    BOOL_MEMBER(docqual_excuse_outline_errs, false,
-                "Allow outline errs in unrejection?", this->params()),
-    BOOL_MEMBER(tessedit_good_quality_unrej, true,
-                "Reduce rejection on good docs", this->params()),
-    BOOL_MEMBER(tessedit_use_reject_spaces, true,
-                "Reject spaces?", this->params()),
-    double_MEMBER(tessedit_reject_doc_percent, 65.00,
-                  "%rej allowed before rej whole doc", this->params()),
-    double_MEMBER(tessedit_reject_block_percent, 45.00,
-                  "%rej allowed before rej whole block", this->params()),
-    double_MEMBER(tessedit_reject_row_percent, 40.00,
-                "%rej allowed before rej whole row", this->params()),
-    double_MEMBER(tessedit_whole_wd_rej_row_percent, 70.00,
-                  "Number of row rejects in whole word rejects"
-                  "which prevents whole row rejection", this->params()),
-    BOOL_MEMBER(tessedit_preserve_blk_rej_perfect_wds, true,
-                "Only rej partially rejected words in block rejection",
-                this->params()),
-    BOOL_MEMBER(tessedit_preserve_row_rej_perfect_wds, true,
-                "Only rej partially rejected words in row rejection",
-                this->params()),
-    BOOL_MEMBER(tessedit_dont_blkrej_good_wds, false,
-                "Use word segmentation quality metric", this->params()),
-    BOOL_MEMBER(tessedit_dont_rowrej_good_wds, false,
-                "Use word segmentation quality metric", this->params()),
-    INT_MEMBER(tessedit_preserve_min_wd_len, 2,
-               "Only preserve wds longer than this", this->params()),
-    BOOL_MEMBER(tessedit_row_rej_good_docs, true,
-                "Apply row rejection to good docs", this->params()),
-    double_MEMBER(tessedit_good_doc_still_rowrej_wd, 1.1,
-                  "rej good doc wd if more than this fraction rejected",
-                  this->params()),
-    BOOL_MEMBER(tessedit_reject_bad_qual_wds, true,
-                "Reject all bad quality wds", this->params()),
-    BOOL_MEMBER(tessedit_debug_doc_rejection, false,
-                "Page stats", this->params()),
-    BOOL_MEMBER(tessedit_debug_quality_metrics, false,
-                "Output data to debug file", this->params()),
-    BOOL_MEMBER(bland_unrej, false,
-                "unrej potential with no chekcs", this->params()),
-    double_MEMBER(quality_rowrej_pc, 1.1,
-                  "good_quality_doc gte good char limit", this->params()),
-    BOOL_MEMBER(unlv_tilde_crunching, true,
-                "Mark v.bad words for tilde crunch", this->params()),
-    BOOL_MEMBER(hocr_font_info, false,
-                "Add font info to hocr output", this->params()),
-    BOOL_MEMBER(crunch_early_merge_tess_fails, true,
-                "Before word crunch?", this->params()),
-    BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false,
-                "Take out ~^ early?", this->params()),
-    double_MEMBER(crunch_terrible_rating, 80.0,
-                  "crunch rating lt this", this->params()),
-    BOOL_MEMBER(crunch_terrible_garbage, true, "As it says", this->params()),
-    double_MEMBER(crunch_poor_garbage_cert, -9.0,
-                  "crunch garbage cert lt this", this->params()),
-    double_MEMBER(crunch_poor_garbage_rate, 60,
-                  "crunch garbage rating lt this", this->params()),
-    double_MEMBER(crunch_pot_poor_rate, 40,
-                  "POTENTIAL crunch rating lt this", this->params()),
-    double_MEMBER(crunch_pot_poor_cert, -8.0,
-                  "POTENTIAL crunch cert lt this", this->params()),
-    BOOL_MEMBER(crunch_pot_garbage, true,
-                "POTENTIAL crunch garbage", this->params()),
-    double_MEMBER(crunch_del_rating, 60,
-                  "POTENTIAL crunch rating lt this", this->params()),
-    double_MEMBER(crunch_del_cert, -10.0,
-                  "POTENTIAL crunch cert lt this", this->params()),
-    double_MEMBER(crunch_del_min_ht, 0.7,
-                  "Del if word ht lt xht x this", this->params()),
-    double_MEMBER(crunch_del_max_ht, 3.0,
-                  "Del if word ht gt xht x this", this->params()),
-    double_MEMBER(crunch_del_min_width, 3.0,
-                  "Del if word width lt xht x this", this->params()),
-    double_MEMBER(crunch_del_high_word, 1.5,
-                  "Del if word gt xht x this above bl", this->params()),
-    double_MEMBER(crunch_del_low_word, 0.5,
-                  "Del if word gt xht x this below bl", this->params()),
-    double_MEMBER(crunch_small_outlines_size, 0.6,
-                  "Small if lt xht x this", this->params()),
-    INT_MEMBER(crunch_rating_max, 10,
-               "For adj length in rating per ch", this->params()),
-    INT_MEMBER(crunch_pot_indicators, 1,
-               "How many potential indicators needed", this->params()),
-    BOOL_MEMBER(crunch_leave_ok_strings, true,
-                "Dont touch sensible strings", this->params()),
-    BOOL_MEMBER(crunch_accept_ok, true,
-                "Use acceptability in okstring", this->params()),
-    BOOL_MEMBER(crunch_leave_accept_strings, false,
-                "Dont pot crunch sensible strings", this->params()),
-    BOOL_MEMBER(crunch_include_numerals, false,
-                "Fiddle alpha figures", this->params()),
-    INT_MEMBER(crunch_leave_lc_strings, 4,
-               "Dont crunch words with long lower case strings",
-               this->params()),
-    INT_MEMBER(crunch_leave_uc_strings, 4,
-               "Dont crunch words with long lower case strings",
-               this->params()),
-    INT_MEMBER(crunch_long_repetitions, 3,
-               "Crunch words with long repetitions", this->params()),
-    INT_MEMBER(crunch_debug, 0, "As it says", this->params()),
-    INT_MEMBER(fixsp_non_noise_limit, 1,
-               "How many non-noise blbs either side?", this->params()),
-    double_MEMBER(fixsp_small_outlines_size, 0.28,
-                  "Small if lt xht x this", this->params()),
-    BOOL_MEMBER(tessedit_prefer_joined_punct, false,
-                "Reward punctation joins", this->params()),
-    INT_MEMBER(fixsp_done_mode, 1,
-               "What constitues done for spacing", this->params()),
-    INT_MEMBER(debug_fix_space_level, 0,
-               "Contextual fixspace debug", this->params()),
-    STRING_MEMBER(numeric_punctuation, ".,",
-                  "Punct. chs expected WITHIN numbers", this->params()),
-    INT_MEMBER(x_ht_acceptance_tolerance, 8,
-               "Max allowed deviation of blob top outside of font data",
-               this->params()),
-    INT_MEMBER(x_ht_min_change, 8,
-               "Min change in xht before actually trying it", this->params()),
-    INT_MEMBER(superscript_debug, 0, "Debug level for sub & superscript fixer",
-               this->params()),
-    double_MEMBER(superscript_worse_certainty, 2.0, "How many times worse "
-                  "certainty does a superscript position glyph need to be for "
-                  "us to try classifying it as a char with a different "
-                  "baseline?", this->params()),
-    double_MEMBER(superscript_bettered_certainty, 0.97, "What reduction in "
-                  "badness do we think sufficient to choose a superscript "
-                  "over what we'd thought.  For example, a value of 0.6 means "
-                  "we want to reduce badness of certainty by at least 40%",
-                  this->params()),
-    double_MEMBER(superscript_scaledown_ratio, 0.4,
-                  "A superscript scaled down more than this is unbelievably "
-                  "small.  For example, 0.3 means we expect the font size to "
-                  "be no smaller than 30% of the text line font size.",
-                  this->params()),
-    double_MEMBER(subscript_max_y_top, 0.5,
-                  "Maximum top of a character measured as a multiple of "
-                  "x-height above the baseline for us to reconsider whether "
-                  "it's a subscript.", this->params()),
-    double_MEMBER(superscript_min_y_bottom, 0.3,
-                  "Minimum bottom of a character measured as a multiple of "
-                  "x-height above the baseline for us to reconsider whether "
-                  "it's a superscript.", this->params()),
-    BOOL_MEMBER(tessedit_write_block_separators, false,
-                "Write block separators in output", this->params()),
-    BOOL_MEMBER(tessedit_write_rep_codes, false,
-                "Write repetition char code", this->params()),
-    BOOL_MEMBER(tessedit_write_unlv, false,
-                "Write .unlv output file", this->params()),
-    BOOL_MEMBER(tessedit_create_txt, true,
-                "Write .txt output file", this->params()),
-    BOOL_MEMBER(tessedit_create_hocr, false,
-                "Write .html hOCR output file", this->params()),
-    BOOL_MEMBER(tessedit_create_pdf, false,
-                "Write .pdf output file", this->params()),
-    STRING_MEMBER(unrecognised_char, "|",
-                  "Output char for unidentified blobs", this->params()),
-    INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()),
-    INT_MEMBER(suspect_space_level, 100,
-               "Min suspect level for rejecting spaces", this->params()),
-    INT_MEMBER(suspect_short_words, 2,
-               "Dont Suspect dict wds longer than this", this->params()),
-    BOOL_MEMBER(suspect_constrain_1Il, false,
-                "UNLV keep 1Il chars rejected", this->params()),
-    double_MEMBER(suspect_rating_per_ch, 999.9,
-                  "Dont touch bad rating limit", this->params()),
-    double_MEMBER(suspect_accept_rating, -999.9,
-                  "Accept good rating limit", this->params()),
-    BOOL_MEMBER(tessedit_minimal_rejection, false,
-                "Only reject tess failures", this->params()),
-    BOOL_MEMBER(tessedit_zero_rejection, false,
-                "Dont reject ANYTHING", this->params()),
-    BOOL_MEMBER(tessedit_word_for_word, false,
-                "Make output have exactly one word per WERD", this->params()),
-    BOOL_MEMBER(tessedit_zero_kelvin_rejection, false,
-                "Dont reject ANYTHING AT ALL", this->params()),
-    BOOL_MEMBER(tessedit_consistent_reps, true,
-                "Force all rep chars the same", this->params()),
-    INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm", this->params()),
-    BOOL_MEMBER(tessedit_rejection_debug, false,
-                "Adaption debug", this->params()),
-    BOOL_MEMBER(tessedit_flip_0O, true,
-                "Contextual 0O O0 flips", this->params()),
-    double_MEMBER(tessedit_lower_flip_hyphen, 1.5,
-                  "Aspect ratio dot/hyphen test", this->params()),
-    double_MEMBER(tessedit_upper_flip_hyphen, 1.8,
-                  "Aspect ratio dot/hyphen test", this->params()),
-    BOOL_MEMBER(rej_trust_doc_dawg, false,
-                "Use DOC dawg in 11l conf. detector", this->params()),
-    BOOL_MEMBER(rej_1Il_use_dict_word, false,
-                "Use dictword test", this->params()),
-    BOOL_MEMBER(rej_1Il_trust_permuter_type, true,
-                "Dont double check", this->params()),
-    BOOL_MEMBER(rej_use_tess_accepted, true,
-                "Individual rejection control", this->params()),
-    BOOL_MEMBER(rej_use_tess_blanks, true,
-                "Individual rejection control", this->params()),
-    BOOL_MEMBER(rej_use_good_perm, true,
-                "Individual rejection control", this->params()),
-    BOOL_MEMBER(rej_use_sensible_wd, false,
-                "Extend permuter check", this->params()),
-    BOOL_MEMBER(rej_alphas_in_number_perm, false,
-                "Extend permuter check", this->params()),
-    double_MEMBER(rej_whole_of_mostly_reject_word_fract, 0.85,
-                  "if >this fract", this->params()),
-    INT_MEMBER(tessedit_image_border, 2,
-               "Rej blbs near image edge limit", this->params()),
-    STRING_MEMBER(ok_repeated_ch_non_alphanum_wds, "-?*\075",
-                  "Allow NN to unrej", this->params()),
-    STRING_MEMBER(conflict_set_I_l_1, "Il1[]",
-                  "Il1 conflict set", this->params()),
-    INT_MEMBER(min_sane_x_ht_pixels, 8,
-               "Reject any x-ht lt or eq than this", this->params()),
-    BOOL_MEMBER(tessedit_create_boxfile, false,
-                "Output text with boxes", this->params()),
-    INT_MEMBER(tessedit_page_number, -1, "-1 -> All pages"
-               " , else specifc page to process", this->params()),
-    BOOL_MEMBER(tessedit_write_images, false,
-                "Capture the image from the IPE", this->params()),
-    BOOL_MEMBER(interactive_display_mode, false, "Run interactively?",
-                this->params()),
-    STRING_MEMBER(file_type, ".tif", "Filename extension", this->params()),
-    BOOL_MEMBER(tessedit_override_permuter, true,
-                "According to dict_word", this->params()),
-    INT_MEMBER(tessdata_manager_debug_level, 0, "Debug level for"
-               " TessdataManager functions.", this->params()),
-    STRING_MEMBER(tessedit_load_sublangs, "",
-                  "List of languages to load with this one", this->params()),
-    BOOL_MEMBER(tessedit_use_primary_params_model, false,
-                "In multilingual mode use params model of the"
-                " primary language", this->params()),
-    double_MEMBER(min_orientation_margin, 7.0,
-                  "Min acceptable orientation margin", this->params()),
-    BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding",
-                this->params()),
-    BOOL_MEMBER(textord_use_cjk_fp_model, FALSE, "Use CJK fixed pitch model",
-                this->params()),
-    BOOL_MEMBER(poly_allow_detailed_fx, false,
-                "Allow feature extractors to see the original outline",
-                this->params()),
-    BOOL_INIT_MEMBER(tessedit_init_config_only, false,
-                     "Only initialize with the config file. Useful if the "
-                     "instance is not going to be used for OCR but say only "
-                     "for layout analysis.", this->params()),
-    BOOL_MEMBER(textord_equation_detect, false, "Turn on equation detector",
-                this->params()),
-    BOOL_MEMBER(textord_tabfind_vertical_text, true,
-                "Enable vertical detection", this->params()),
-    BOOL_MEMBER(textord_tabfind_force_vertical_text, false,
-                "Force using vertical text page mode", this->params()),
-    double_MEMBER(textord_tabfind_vertical_text_ratio, 0.5,
-                  "Fraction of textlines deemed vertical to use vertical page "
-                  "mode", this->params()),
-    double_MEMBER(textord_tabfind_aligned_gap_fraction, 0.75,
-                  "Fraction of height used as a minimum gap for aligned blobs.",
-                  this->params()),
-    INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible",
-                this->params()),
-    BOOL_MEMBER(preserve_interword_spaces, false,
-                "Preserve multiple interword spaces", this->params()),
-    BOOL_MEMBER(include_page_breaks, FALSE,
-                "Include page separator string in output text after each "
-                "image/page.", this->params()),
-    STRING_MEMBER(page_separator, "\f",
-                  "Page separator (default is form feed control character)",
+    : BOOL_MEMBER(tessedit_resegment_from_boxes, false,
+                  "Take segmentation and labeling from box file",
                   this->params()),
+      BOOL_MEMBER(tessedit_resegment_from_line_boxes, false,
+                  "Conversion of word/line box file to char box file",
+                  this->params()),
+      BOOL_MEMBER(tessedit_train_from_boxes, false,
+                  "Generate training data from boxed chars", this->params()),
+      BOOL_MEMBER(tessedit_make_boxes_from_boxes, false,
+                  "Generate more boxes from boxed chars", this->params()),
+      BOOL_MEMBER(tessedit_dump_pageseg_images, false,
+                  "Dump intermediate images made during page segmentation",
+                  this->params()),
+      // The default for pageseg_mode is the old behaviour, so as not to
+      // upset anything that relies on that.
+      INT_MEMBER(
+          tessedit_pageseg_mode, PSM_SINGLE_BLOCK,
+          "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block,"
+          " 5=line, 6=word, 7=char"
+          " (Values from PageSegMode enum in publictypes.h)",
+          this->params()),
+      INT_INIT_MEMBER(tessedit_ocr_engine_mode, tesseract::OEM_TESSERACT_ONLY,
+                      "Which OCR engine(s) to run (Tesseract, Cube, both)."
+                      " Defaults to loading and running only Tesseract"
+                      " (no Cube,no combiner)."
+                      " Values from OcrEngineMode enum in tesseractclass.h)",
+                      this->params()),
+      STRING_MEMBER(tessedit_char_blacklist, "",
+                    "Blacklist of chars not to recognize", this->params()),
+      STRING_MEMBER(tessedit_char_whitelist, "",
+                    "Whitelist of chars to recognize", this->params()),
+      STRING_MEMBER(tessedit_char_unblacklist, "",
+                    "List of chars to override tessedit_char_blacklist",
+                    this->params()),
+      BOOL_MEMBER(tessedit_ambigs_training, false,
+                  "Perform training for ambiguities", this->params()),
+      INT_MEMBER(pageseg_devanagari_split_strategy,
+                 tesseract::ShiroRekhaSplitter::NO_SPLIT,
+                 "Whether to use the top-line splitting process for Devanagari "
+                 "documents while performing page-segmentation.",
+                 this->params()),
+      INT_MEMBER(ocr_devanagari_split_strategy,
+                 tesseract::ShiroRekhaSplitter::NO_SPLIT,
+                 "Whether to use the top-line splitting process for Devanagari "
+                 "documents while performing ocr.",
+                 this->params()),
+      STRING_MEMBER(tessedit_write_params_to_file, "",
+                    "Write all parameters to the given file.", this->params()),
+      BOOL_MEMBER(tessedit_adaption_debug, false,
+                  "Generate and print debug"
+                  " information for adaption",
+                  this->params()),
+      INT_MEMBER(bidi_debug, 0, "Debug level for BiDi", this->params()),
+      INT_MEMBER(applybox_debug, 1, "Debug level", this->params()),
+      INT_MEMBER(applybox_page, 0, "Page number to apply boxes from",
+                 this->params()),
+      STRING_MEMBER(applybox_exposure_pattern, ".exp",
+                    "Exposure value follows"
+                    " this pattern in the image filename. The name of the image"
+                    " files are expected to be in the form"
+                    " [lang].[fontname].exp[num].tif",
+                    this->params()),
+      BOOL_MEMBER(applybox_learn_chars_and_char_frags_mode, false,
+                  "Learn both character fragments (as is done in the"
+                  " special low exposure mode) as well as unfragmented"
+                  " characters.",
+                  this->params()),
+      BOOL_MEMBER(applybox_learn_ngrams_mode, false,
+                  "Each bounding box"
+                  " is assumed to contain ngrams. Only learn the ngrams"
+                  " whose outlines overlap horizontally.",
+                  this->params()),
+      BOOL_MEMBER(tessedit_display_outwords, false, "Draw output words",
+                  this->params()),
+      BOOL_MEMBER(tessedit_dump_choices, false, "Dump char choices",
+                  this->params()),
+      BOOL_MEMBER(tessedit_timing_debug, false, "Print timing stats",
+                  this->params()),
+      BOOL_MEMBER(tessedit_fix_fuzzy_spaces, true,
+                  "Try to improve fuzzy spaces", this->params()),
+      BOOL_MEMBER(tessedit_unrej_any_wd, false,
+                  "Dont bother with word plausibility", this->params()),
+      BOOL_MEMBER(tessedit_fix_hyphens, true, "Crunch double hyphens?",
+                  this->params()),
+      BOOL_MEMBER(tessedit_redo_xheight, true, "Check/Correct x-height",
+                  this->params()),
+      BOOL_MEMBER(tessedit_enable_doc_dict, true,
+                  "Add words to the document dictionary", this->params()),
+      BOOL_MEMBER(tessedit_debug_fonts, false, "Output font info per char",
+                  this->params()),
+      BOOL_MEMBER(tessedit_debug_block_rejection, false, "Block and Row stats",
+                  this->params()),
+      BOOL_MEMBER(tessedit_enable_bigram_correction, true,
+                  "Enable correction based on the word bigram dictionary.",
+                  this->params()),
+      BOOL_MEMBER(tessedit_enable_dict_correction, false,
+                  "Enable single word correction based on the dictionary.",
+                  this->params()),
+      INT_MEMBER(tessedit_bigram_debug, 0,
+                 "Amount of debug output for bigram correction.",
+                 this->params()),
+      BOOL_MEMBER(enable_noise_removal, true,
+                  "Remove and conditionally reassign small outlines when they"
+                  " confuse layout analysis, determining diacritics vs noise",
+                  this->params()),
+      INT_MEMBER(debug_noise_removal, 0, "Debug reassignment of small outlines",
+                 this->params()),
+      // Worst (min) certainty, for which a diacritic is
+      // allowed to make the base character worse and
+      // still be included.
+      double_MEMBER(noise_cert_basechar, -8.0,
+                    "Hingepoint for base char certainty", this->params()),
+      // Worst (min) certainty, for which a non-overlapping diacritic is allowed
+      // to make the base character worse and still be included.
+      double_MEMBER(noise_cert_disjoint, -1.0,
+                    "Hingepoint for disjoint certainty", this->params()),
+      // Worst (min) certainty, for which a diacritic is allowed to make a new
+      // stand-alone blob.
+      double_MEMBER(noise_cert_punc, -3.0,
+                    "Threshold for new punc char certainty", this->params()),
+      // Factor of certainty margin for adding diacritics to not count as worse.
+      double_MEMBER(noise_cert_factor, 0.375,
+                    "Scaling on certainty diff from Hingepoint",
+                    this->params()),
+      INT_MEMBER(noise_maxperblob, 8, "Max diacritics to apply to a blob",
+                 this->params()),
+      INT_MEMBER(noise_maxperword, 16, "Max diacritics to apply to a word",
+                 this->params()),
+      INT_MEMBER(debug_x_ht_level, 0, "Reestimate debug", this->params()),
+      BOOL_MEMBER(debug_acceptable_wds, false, "Dump word pass/fail chk",
+                  this->params()),
+      STRING_MEMBER(chs_leading_punct, "('`\"", "Leading punctuation",
+                    this->params()),
+      STRING_MEMBER(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation",
+                    this->params()),
+      STRING_MEMBER(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation",
+                    this->params()),
+      double_MEMBER(quality_rej_pc, 0.08,
+                    "good_quality_doc lte rejection limit", this->params()),
+      double_MEMBER(quality_blob_pc, 0.0,
+                    "good_quality_doc gte good blobs limit", this->params()),
+      double_MEMBER(quality_outline_pc, 1.0,
+                    "good_quality_doc lte outline error limit", this->params()),
+      double_MEMBER(quality_char_pc, 0.95,
+                    "good_quality_doc gte good char limit", this->params()),
+      INT_MEMBER(quality_min_initial_alphas_reqd, 2, "alphas in a good word",
+                 this->params()),
+      INT_MEMBER(tessedit_tess_adaption_mode, 0x27,
+                 "Adaptation decision algorithm for tess", this->params()),
+      BOOL_MEMBER(tessedit_minimal_rej_pass1, false,
+                  "Do minimal rejection on pass 1 output", this->params()),
+      BOOL_MEMBER(tessedit_test_adaption, false, "Test adaption criteria",
+                  this->params()),
+      BOOL_MEMBER(tessedit_matcher_log, false, "Log matcher activity",
+                  this->params()),
+      INT_MEMBER(tessedit_test_adaption_mode, 3,
+                 "Adaptation decision algorithm for tess", this->params()),
+      BOOL_MEMBER(test_pt, false, "Test for point", this->params()),
+      double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params()),
+      double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params()),
+      INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.",
+                 this->params()),
+      BOOL_MEMBER(paragraph_text_based, true,
+                  "Run paragraph detection on the post-text-recognition "
+                  "(more accurate)",
+                  this->params()),
+      INT_MEMBER(cube_debug_level, 0, "Print cube debug info.", this->params()),
+      STRING_MEMBER(outlines_odd, "%| ", "Non standard number of outlines",
+                    this->params()),
+      STRING_MEMBER(outlines_2, "ij!?%\":;", "Non standard number of outlines",
+                    this->params()),
+      BOOL_MEMBER(docqual_excuse_outline_errs, false,
+                  "Allow outline errs in unrejection?", this->params()),
+      BOOL_MEMBER(tessedit_good_quality_unrej, true,
+                  "Reduce rejection on good docs", this->params()),
+      BOOL_MEMBER(tessedit_use_reject_spaces, true, "Reject spaces?",
+                  this->params()),
+      double_MEMBER(tessedit_reject_doc_percent, 65.00,
+                    "%rej allowed before rej whole doc", this->params()),
+      double_MEMBER(tessedit_reject_block_percent, 45.00,
+                    "%rej allowed before rej whole block", this->params()),
+      double_MEMBER(tessedit_reject_row_percent, 40.00,
+                    "%rej allowed before rej whole row", this->params()),
+      double_MEMBER(tessedit_whole_wd_rej_row_percent, 70.00,
+                    "Number of row rejects in whole word rejects"
+                    "which prevents whole row rejection",
+                    this->params()),
+      BOOL_MEMBER(tessedit_preserve_blk_rej_perfect_wds, true,
+                  "Only rej partially rejected words in block rejection",
+                  this->params()),
+      BOOL_MEMBER(tessedit_preserve_row_rej_perfect_wds, true,
+                  "Only rej partially rejected words in row rejection",
+                  this->params()),
+      BOOL_MEMBER(tessedit_dont_blkrej_good_wds, false,
+                  "Use word segmentation quality metric", this->params()),
+      BOOL_MEMBER(tessedit_dont_rowrej_good_wds, false,
+                  "Use word segmentation quality metric", this->params()),
+      INT_MEMBER(tessedit_preserve_min_wd_len, 2,
+                 "Only preserve wds longer than this", this->params()),
+      BOOL_MEMBER(tessedit_row_rej_good_docs, true,
+                  "Apply row rejection to good docs", this->params()),
+      double_MEMBER(tessedit_good_doc_still_rowrej_wd, 1.1,
+                    "rej good doc wd if more than this fraction rejected",
+                    this->params()),
+      BOOL_MEMBER(tessedit_reject_bad_qual_wds, true,
+                  "Reject all bad quality wds", this->params()),
+      BOOL_MEMBER(tessedit_debug_doc_rejection, false, "Page stats",
+                  this->params()),
+      BOOL_MEMBER(tessedit_debug_quality_metrics, false,
+                  "Output data to debug file", this->params()),
+      BOOL_MEMBER(bland_unrej, false, "unrej potential with no chekcs",
+                  this->params()),
+      double_MEMBER(quality_rowrej_pc, 1.1,
+                    "good_quality_doc gte good char limit", this->params()),
+      BOOL_MEMBER(unlv_tilde_crunching, true,
+                  "Mark v.bad words for tilde crunch", this->params()),
+      BOOL_MEMBER(hocr_font_info, false, "Add font info to hocr output",
+                  this->params()),
+      BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?",
+                  this->params()),
+      BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false,
+                  "Take out ~^ early?", this->params()),
+      double_MEMBER(crunch_terrible_rating, 80.0, "crunch rating lt this",
+                    this->params()),
+      BOOL_MEMBER(crunch_terrible_garbage, true, "As it says", this->params()),
+      double_MEMBER(crunch_poor_garbage_cert, -9.0,
+                    "crunch garbage cert lt this", this->params()),
+      double_MEMBER(crunch_poor_garbage_rate, 60,
+                    "crunch garbage rating lt this", this->params()),
+      double_MEMBER(crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this",
+                    this->params()),
+      double_MEMBER(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this",
+                    this->params()),
+      BOOL_MEMBER(crunch_pot_garbage, true, "POTENTIAL crunch garbage",
+                  this->params()),
+      double_MEMBER(crunch_del_rating, 60, "POTENTIAL crunch rating lt this",
+                    this->params()),
+      double_MEMBER(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this",
+                    this->params()),
+      double_MEMBER(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this",
+                    this->params()),
+      double_MEMBER(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this",
+                    this->params()),
+      double_MEMBER(crunch_del_min_width, 3.0,
+                    "Del if word width lt xht x this", this->params()),
+      double_MEMBER(crunch_del_high_word, 1.5,
+                    "Del if word gt xht x this above bl", this->params()),
+      double_MEMBER(crunch_del_low_word, 0.5,
+                    "Del if word gt xht x this below bl", this->params()),
+      double_MEMBER(crunch_small_outlines_size, 0.6, "Small if lt xht x this",
+                    this->params()),
+      INT_MEMBER(crunch_rating_max, 10, "For adj length in rating per ch",
+                 this->params()),
+      INT_MEMBER(crunch_pot_indicators, 1,
+                 "How many potential indicators needed", this->params()),
+      BOOL_MEMBER(crunch_leave_ok_strings, true, "Dont touch sensible strings",
+                  this->params()),
+      BOOL_MEMBER(crunch_accept_ok, true, "Use acceptability in okstring",
+                  this->params()),
+      BOOL_MEMBER(crunch_leave_accept_strings, false,
+                  "Dont pot crunch sensible strings", this->params()),
+      BOOL_MEMBER(crunch_include_numerals, false, "Fiddle alpha figures",
+                  this->params()),
+      INT_MEMBER(crunch_leave_lc_strings, 4,
+                 "Dont crunch words with long lower case strings",
+                 this->params()),
+      INT_MEMBER(crunch_leave_uc_strings, 4,
+                 "Dont crunch words with long lower case strings",
+                 this->params()),
+      INT_MEMBER(crunch_long_repetitions, 3,
+                 "Crunch words with long repetitions", this->params()),
+      INT_MEMBER(crunch_debug, 0, "As it says", this->params()),
+      INT_MEMBER(fixsp_non_noise_limit, 1,
+                 "How many non-noise blbs either side?", this->params()),
+      double_MEMBER(fixsp_small_outlines_size, 0.28, "Small if lt xht x this",
+                    this->params()),
+      BOOL_MEMBER(tessedit_prefer_joined_punct, false,
+                  "Reward punctation joins", this->params()),
+      INT_MEMBER(fixsp_done_mode, 1, "What constitues done for spacing",
+                 this->params()),
+      INT_MEMBER(debug_fix_space_level, 0, "Contextual fixspace debug",
+                 this->params()),
+      STRING_MEMBER(numeric_punctuation, ".,",
+                    "Punct. chs expected WITHIN numbers", this->params()),
+      INT_MEMBER(x_ht_acceptance_tolerance, 8,
+                 "Max allowed deviation of blob top outside of font data",
+                 this->params()),
+      INT_MEMBER(x_ht_min_change, 8,
+                 "Min change in xht before actually trying it", this->params()),
+      INT_MEMBER(superscript_debug, 0,
+                 "Debug level for sub & superscript fixer", this->params()),
+      double_MEMBER(
+          superscript_worse_certainty, 2.0,
+          "How many times worse "
+          "certainty does a superscript position glyph need to be for "
+          "us to try classifying it as a char with a different "
+          "baseline?",
+          this->params()),
+      double_MEMBER(
+          superscript_bettered_certainty, 0.97,
+          "What reduction in "
+          "badness do we think sufficient to choose a superscript "
+          "over what we'd thought.  For example, a value of 0.6 means "
+          "we want to reduce badness of certainty by at least 40%",
+          this->params()),
+      double_MEMBER(superscript_scaledown_ratio, 0.4,
+                    "A superscript scaled down more than this is unbelievably "
+                    "small.  For example, 0.3 means we expect the font size to "
+                    "be no smaller than 30% of the text line font size.",
+                    this->params()),
+      double_MEMBER(subscript_max_y_top, 0.5,
+                    "Maximum top of a character measured as a multiple of "
+                    "x-height above the baseline for us to reconsider whether "
+                    "it's a subscript.",
+                    this->params()),
+      double_MEMBER(superscript_min_y_bottom, 0.3,
+                    "Minimum bottom of a character measured as a multiple of "
+                    "x-height above the baseline for us to reconsider whether "
+                    "it's a superscript.",
+                    this->params()),
+      BOOL_MEMBER(tessedit_write_block_separators, false,
+                  "Write block separators in output", this->params()),
+      BOOL_MEMBER(tessedit_write_rep_codes, false, "Write repetition char code",
+                  this->params()),
+      BOOL_MEMBER(tessedit_write_unlv, false, "Write .unlv output file",
+                  this->params()),
+      BOOL_MEMBER(tessedit_create_txt, true, "Write .txt output file",
+                  this->params()),
+      BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file",
+                  this->params()),
+      BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file",
+                  this->params()),
+      STRING_MEMBER(unrecognised_char, "|",
+                    "Output char for unidentified blobs", this->params()),
+      INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()),
+      INT_MEMBER(suspect_space_level, 100,
+                 "Min suspect level for rejecting spaces", this->params()),
+      INT_MEMBER(suspect_short_words, 2,
+                 "Dont Suspect dict wds longer than this", this->params()),
+      BOOL_MEMBER(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected",
+                  this->params()),
+      double_MEMBER(suspect_rating_per_ch, 999.9, "Dont touch bad rating limit",
+                    this->params()),
+      double_MEMBER(suspect_accept_rating, -999.9, "Accept good rating limit",
+                    this->params()),
+      BOOL_MEMBER(tessedit_minimal_rejection, false,
+                  "Only reject tess failures", this->params()),
+      BOOL_MEMBER(tessedit_zero_rejection, false, "Dont reject ANYTHING",
+                  this->params()),
+      BOOL_MEMBER(tessedit_word_for_word, false,
+                  "Make output have exactly one word per WERD", this->params()),
+      BOOL_MEMBER(tessedit_zero_kelvin_rejection, false,
+                  "Dont reject ANYTHING AT ALL", this->params()),
+      BOOL_MEMBER(tessedit_consistent_reps, true,
+                  "Force all rep chars the same", this->params()),
+      INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm",
+                 this->params()),
+      BOOL_MEMBER(tessedit_rejection_debug, false, "Adaption debug",
+                  this->params()),
+      BOOL_MEMBER(tessedit_flip_0O, true, "Contextual 0O O0 flips",
+                  this->params()),
+      double_MEMBER(tessedit_lower_flip_hyphen, 1.5,
+                    "Aspect ratio dot/hyphen test", this->params()),
+      double_MEMBER(tessedit_upper_flip_hyphen, 1.8,
+                    "Aspect ratio dot/hyphen test", this->params()),
+      BOOL_MEMBER(rej_trust_doc_dawg, false,
+                  "Use DOC dawg in 11l conf. detector", this->params()),
+      BOOL_MEMBER(rej_1Il_use_dict_word, false, "Use dictword test",
+                  this->params()),
+      BOOL_MEMBER(rej_1Il_trust_permuter_type, true, "Dont double check",
+                  this->params()),
+      BOOL_MEMBER(rej_use_tess_accepted, true, "Individual rejection control",
+                  this->params()),
+      BOOL_MEMBER(rej_use_tess_blanks, true, "Individual rejection control",
+                  this->params()),
+      BOOL_MEMBER(rej_use_good_perm, true, "Individual rejection control",
+                  this->params()),
+      BOOL_MEMBER(rej_use_sensible_wd, false, "Extend permuter check",
+                  this->params()),
+      BOOL_MEMBER(rej_alphas_in_number_perm, false, "Extend permuter check",
+                  this->params()),
+      double_MEMBER(rej_whole_of_mostly_reject_word_fract, 0.85,
+                    "if >this fract", this->params()),
+      INT_MEMBER(tessedit_image_border, 2, "Rej blbs near image edge limit",
+                 this->params()),
+      STRING_MEMBER(ok_repeated_ch_non_alphanum_wds, "-?*\075",
+                    "Allow NN to unrej", this->params()),
+      STRING_MEMBER(conflict_set_I_l_1, "Il1[]", "Il1 conflict set",
+                    this->params()),
+      INT_MEMBER(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this",
+                 this->params()),
+      BOOL_MEMBER(tessedit_create_boxfile, false, "Output text with boxes",
+                  this->params()),
+      INT_MEMBER(tessedit_page_number, -1,
+                 "-1 -> All pages"
+                 " , else specifc page to process",
+                 this->params()),
+      BOOL_MEMBER(tessedit_write_images, false,
+                  "Capture the image from the IPE", this->params()),
+      BOOL_MEMBER(interactive_display_mode, false, "Run interactively?",
+                  this->params()),
+      STRING_MEMBER(file_type, ".tif", "Filename extension", this->params()),
+      BOOL_MEMBER(tessedit_override_permuter, true, "According to dict_word",
+                  this->params()),
+      INT_MEMBER(tessdata_manager_debug_level, 0,
+                 "Debug level for"
+                 " TessdataManager functions.",
+                 this->params()),
+      STRING_MEMBER(tessedit_load_sublangs, "",
+                    "List of languages to load with this one", this->params()),
+      BOOL_MEMBER(tessedit_use_primary_params_model, false,
+                  "In multilingual mode use params model of the"
+                  " primary language",
+                  this->params()),
+      double_MEMBER(min_orientation_margin, 7.0,
+                    "Min acceptable orientation margin", this->params()),
+      BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding",
+                  this->params()),
+      BOOL_MEMBER(textord_use_cjk_fp_model, FALSE, "Use CJK fixed pitch model",
+                  this->params()),
+      BOOL_MEMBER(poly_allow_detailed_fx, false,
+                  "Allow feature extractors to see the original outline",
+                  this->params()),
+      BOOL_INIT_MEMBER(tessedit_init_config_only, false,
+                       "Only initialize with the config file. Useful if the "
+                       "instance is not going to be used for OCR but say only "
+                       "for layout analysis.",
+                       this->params()),
+      BOOL_MEMBER(textord_equation_detect, false, "Turn on equation detector",
+                  this->params()),
+      BOOL_MEMBER(textord_tabfind_vertical_text, true,
+                  "Enable vertical detection", this->params()),
+      BOOL_MEMBER(textord_tabfind_force_vertical_text, false,
+                  "Force using vertical text page mode", this->params()),
+      double_MEMBER(
+          textord_tabfind_vertical_text_ratio, 0.5,
+          "Fraction of textlines deemed vertical to use vertical page "
+          "mode",
+          this->params()),
+      double_MEMBER(
+          textord_tabfind_aligned_gap_fraction, 0.75,
+          "Fraction of height used as a minimum gap for aligned blobs.",
+          this->params()),
+      INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible",
+                 this->params()),
+      BOOL_MEMBER(preserve_interword_spaces, false,
+                  "Preserve multiple interword spaces", this->params()),
+      BOOL_MEMBER(include_page_breaks, FALSE,
+                  "Include page separator string in output text after each "
+                  "image/page.",
+                  this->params()),
+      STRING_MEMBER(page_separator, "\f",
+                    "Page separator (default is form feed control character)",
+                    this->params()),
 
-    // The following parameters were deprecated and removed from their original
-    // locations. The parameters are temporarily kept here to give Tesseract
-    // users a chance to updated their [lang].traineddata and config files
-    // without introducing failures during Tesseract initialization.
-    // TODO(ocr-team): remove these parameters from the code once we are
-    // reasonably sure that Tesseract users have updated their data files.
-    //
-    // BEGIN DEPRECATED PARAMETERS
-    BOOL_MEMBER(textord_tabfind_vertical_horizontal_mix, true,
-                "find horizontal lines such as headers in vertical page mode",
-                this->params()),
-    INT_MEMBER(tessedit_ok_mode, 5,
-               "Acceptance decision algorithm", this->params()),
-    BOOL_INIT_MEMBER(load_fixed_length_dawgs, true, "Load fixed length dawgs"
-                     " (e.g. for non-space delimited languages)",
-                     this->params()),
-    INT_MEMBER(segment_debug, 0, "Debug the whole segmentation process",
-               this->params()),
-    BOOL_MEMBER(permute_debug, 0, "Debug char permutation process",
-                this->params()),
-    double_MEMBER(bestrate_pruning_factor, 2.0, "Multiplying factor of"
-                  " current best rate to prune other hypotheses",
-                  this->params()),
-    BOOL_MEMBER(permute_script_word, 0,
-                "Turn on word script consistency permuter",
-                this->params()),
-    BOOL_MEMBER(segment_segcost_rating, 0,
-                "incorporate segmentation cost in word rating?",
-                this->params()),
-    double_MEMBER(segment_reward_script, 0.95,
-                  "Score multipler for script consistency within a word. "
-                  "Being a 'reward' factor, it should be <= 1. "
-                  "Smaller value implies bigger reward.",
-                  this->params()),
-    BOOL_MEMBER(permute_fixed_length_dawg, 0,
-                "Turn on fixed-length phrasebook search permuter",
-                this->params()),
-    BOOL_MEMBER(permute_chartype_word, 0,
-                "Turn on character type (property) consistency permuter",
-                this->params()),
-    double_MEMBER(segment_reward_chartype, 0.97,
-                  "Score multipler for char type consistency within a word. ",
-                  this->params()),
-    double_MEMBER(segment_reward_ngram_best_choice, 0.99,
-                  "Score multipler for ngram permuter's best choice"
-                  " (only used in the Han script path).",
-                  this->params()),
-    BOOL_MEMBER(ngram_permuter_activated, false,
-                "Activate character-level n-gram-based permuter",
-                this->params()),
-    BOOL_MEMBER(permute_only_top, false, "Run only the top choice permuter",
-                this->params()),
-    INT_MEMBER(language_model_fixed_length_choices_depth, 3,
-               "Depth of blob choice lists to explore"
-               " when fixed length dawgs are on",
-               this->params()),
-    BOOL_MEMBER(use_new_state_cost, FALSE,
-                "use new state cost heuristics for segmentation state"
-                " evaluation", this->params()),
-    double_MEMBER(heuristic_segcost_rating_base, 1.25,
-                  "base factor for adding segmentation cost into word rating."
-                  "It's a multiplying factor, the larger the value above 1, "
-                  "the bigger the effect of segmentation cost.",
-                  this->params()),
-    double_MEMBER(heuristic_weight_rating, 1.0,
-                  "weight associated with char rating in combined cost of"
-                  "state", this->params()),
-    double_MEMBER(heuristic_weight_width, 1000.0,
-                  "weight associated with width evidence in combined cost of"
-                  " state", this->params()),
-    double_MEMBER(heuristic_weight_seamcut, 0.0,
-                  "weight associated with seam cut in combined cost of state",
-                  this->params()),
-    double_MEMBER(heuristic_max_char_wh_ratio, 2.0,
-                  "max char width-to-height ratio allowed in segmentation",
-                  this->params()),
-    BOOL_MEMBER(enable_new_segsearch, true,
-                "Enable new segmentation search path.", this->params()),
-    double_MEMBER(segsearch_max_fixed_pitch_char_wh_ratio, 2.0,
-                  "Maximum character width-to-height ratio for"
-                  " fixed-pitch fonts",
-                  this->params()),
-    // END DEPRECATED PARAMETERS
+      // The following parameters were deprecated and removed from their
+      // original locations. The parameters are temporarily kept here to
+      // give Tesseract users a chance to update their [lang].traineddata
+      // and config files without introducing failures during Tesseract
+      // initialization.
+      // TODO(ocr-team): remove these parameters from the code once we are
+      // reasonably sure that Tesseract users have updated their data files.
+      //
+      // BEGIN DEPRECATED PARAMETERS
+      BOOL_MEMBER(textord_tabfind_vertical_horizontal_mix, true,
+                  "find horizontal lines such as headers in vertical page mode",
+                  this->params()),
+      INT_MEMBER(tessedit_ok_mode, 5, "Acceptance decision algorithm",
+                 this->params()),
+      BOOL_INIT_MEMBER(load_fixed_length_dawgs, true,
+                       "Load fixed length dawgs"
+                       " (e.g. for non-space delimited languages)",
+                       this->params()),
+      INT_MEMBER(segment_debug, 0, "Debug the whole segmentation process",
+                 this->params()),
+      BOOL_MEMBER(permute_debug, 0, "Debug char permutation process",
+                  this->params()),
+      double_MEMBER(bestrate_pruning_factor, 2.0,
+                    "Multiplying factor of"
+                    " current best rate to prune other hypotheses",
+                    this->params()),
+      BOOL_MEMBER(permute_script_word, 0,
+                  "Turn on word script consistency permuter", this->params()),
+      BOOL_MEMBER(segment_segcost_rating, 0,
+                  "incorporate segmentation cost in word rating?",
+                  this->params()),
+      double_MEMBER(segment_reward_script, 0.95,
+                    "Score multipler for script consistency within a word. "
+                    "Being a 'reward' factor, it should be <= 1. "
+                    "Smaller value implies bigger reward.",
+                    this->params()),
+      BOOL_MEMBER(permute_fixed_length_dawg, 0,
+                  "Turn on fixed-length phrasebook search permuter",
+                  this->params()),
+      BOOL_MEMBER(permute_chartype_word, 0,
+                  "Turn on character type (property) consistency permuter",
+                  this->params()),
+      double_MEMBER(segment_reward_chartype, 0.97,
+                    "Score multipler for char type consistency within a word. ",
+                    this->params()),
+      double_MEMBER(segment_reward_ngram_best_choice, 0.99,
+                    "Score multipler for ngram permuter's best choice"
+                    " (only used in the Han script path).",
+                    this->params()),
+      BOOL_MEMBER(ngram_permuter_activated, false,
+                  "Activate character-level n-gram-based permuter",
+                  this->params()),
+      BOOL_MEMBER(permute_only_top, false, "Run only the top choice permuter",
+                  this->params()),
+      INT_MEMBER(language_model_fixed_length_choices_depth, 3,
+                 "Depth of blob choice lists to explore"
+                 " when fixed length dawgs are on",
+                 this->params()),
+      BOOL_MEMBER(use_new_state_cost, FALSE,
+                  "use new state cost heuristics for segmentation state"
+                  " evaluation",
+                  this->params()),
+      double_MEMBER(heuristic_segcost_rating_base, 1.25,
+                    "base factor for adding segmentation cost into word rating."
+                    "It's a multiplying factor, the larger the value above 1, "
+                    "the bigger the effect of segmentation cost.",
+                    this->params()),
+      double_MEMBER(heuristic_weight_rating, 1.0,
+                    "weight associated with char rating in combined cost of"
+                    "state",
+                    this->params()),
+      double_MEMBER(heuristic_weight_width, 1000.0,
+                    "weight associated with width evidence in combined cost of"
+                    " state",
+                    this->params()),
+      double_MEMBER(heuristic_weight_seamcut, 0.0,
+                    "weight associated with seam cut in combined cost of state",
+                    this->params()),
+      double_MEMBER(heuristic_max_char_wh_ratio, 2.0,
+                    "max char width-to-height ratio allowed in segmentation",
+                    this->params()),
+      BOOL_MEMBER(enable_new_segsearch, true,
+                  "Enable new segmentation search path.", this->params()),
+      double_MEMBER(segsearch_max_fixed_pitch_char_wh_ratio, 2.0,
+                    "Maximum character width-to-height ratio for"
+                    " fixed-pitch fonts",
+                    this->params()),
+      // END DEPRECATED PARAMETERS
 
-    backup_config_file_(NULL),
-    pix_binary_(NULL),
-    cube_binary_(NULL),
-    pix_grey_(NULL),
-    pix_thresholds_(NULL),
-    source_resolution_(0),
-    textord_(this),
-    right_to_left_(false),
-    scaled_color_(NULL),
-    scaled_factor_(-1),
-    deskew_(1.0f, 0.0f),
-    reskew_(1.0f, 0.0f),
-    most_recently_used_(this),
-    font_table_size_(0),
+      backup_config_file_(NULL),
+      pix_binary_(NULL),
+      cube_binary_(NULL),
+      pix_grey_(NULL),
+      pix_thresholds_(NULL),
+      source_resolution_(0),
+      textord_(this),
+      right_to_left_(false),
+      scaled_color_(NULL),
+      scaled_factor_(-1),
+      deskew_(1.0f, 0.0f),
+      reskew_(1.0f, 0.0f),
+      most_recently_used_(this),
+      font_table_size_(0),
 #ifndef ANDROID_BUILD
-    cube_cntxt_(NULL),
-    tess_cube_combiner_(NULL),
+      cube_cntxt_(NULL),
+      tess_cube_combiner_(NULL),
 #endif
-    equ_detect_(NULL) {
+      equ_detect_(NULL) {
 }
 
 Tesseract::~Tesseract() {
diff --git a/ccmain/tesseractclass.h b/ccmain/tesseractclass.h
index bd03fff642..d488fd30f3 100644
--- a/ccmain/tesseractclass.h
+++ b/ccmain/tesseractclass.h
@@ -283,8 +283,8 @@ class Tesseract : public Wordrec {
   int SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
                   Tesseract* osd_tess, OSResults* osr);
   void SetupWordScripts(BLOCK_LIST* blocks);
-  int AutoPageSeg(PageSegMode pageseg_mode,
-                  BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks,
+  int AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST* blocks,
+                  TO_BLOCK_LIST* to_blocks, BLOBNBOX_LIST* diacritic_blobs,
                   Tesseract* osd_tess, OSResults* osr);
   ColumnFinder* SetupPageSegAndDetectOrientation(
       bool single_column, bool osd, bool only_osd,
@@ -328,8 +328,46 @@ class Tesseract : public Wordrec {
                         WordRecognizer recognizer,
                         WERD_RES** in_word,
                         PointerVector<WERD_RES>* best_words);
-  void classify_word_and_language(WordRecognizer recognizer,
-                                  PAGE_RES_IT* pr_it,
+  // Moves good-looking "noise"/diacritics from the reject list to the main
+  // blob list on the current word. Returns true if anything was done, and
+  // sets make_next_word_fuzzy if blob(s) were added to the end of the word.
+  bool ReassignDiacritics(int pass, PAGE_RES_IT* pr_it,
+                          bool* make_next_word_fuzzy);
+  // Attempts to put noise/diacritic outlines into the blobs that they overlap.
+  // Input: a set of noisy outlines that probably belong to the real_word.
+  // Output: outlines that overlapped blobs are set to NULL and put back into
+  // the word, either in the blobs or in the reject list.
+  void AssignDiacriticsToOverlappingBlobs(
+      const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word,
+      PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted,
+      GenericVector<bool>* overlapped_any_blob,
+      GenericVector<C_BLOB*>* target_blobs);
+  // Attempts to assign non-overlapping outlines to their nearest blobs or
+  // make new blobs out of them.
+  void AssignDiacriticsToNewBlobs(const GenericVector<C_OUTLINE*>& outlines,
+                                  int pass, WERD* real_word, PAGE_RES_IT* pr_it,
+                                  GenericVector<bool>* word_wanted,
+                                  GenericVector<C_BLOB*>* target_blobs);
+  // Starting with ok_outlines set to indicate which outlines overlap the blob,
+  // chooses the optimal set (approximately) and returns true if any outlines
+  // are desired, in which case ok_outlines indicates which ones.
+  bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold,
+                                   PAGE_RES_IT* pr_it, C_BLOB* blob,
+                                   const GenericVector<C_OUTLINE*>& outlines,
+                                   int num_outlines,
+                                   GenericVector<bool>* ok_outlines);
+  // Classifies the given blob plus the outlines flagged by ok_outlines, undoes
+  // the inclusion of the outlines, and returns the certainty of the raw choice.
+  float ClassifyBlobPlusOutlines(const GenericVector<bool>& ok_outlines,
+                                 const GenericVector<C_OUTLINE*>& outlines,
+                                 int pass_n, PAGE_RES_IT* pr_it, C_BLOB* blob,
+                                 STRING* best_str);
+  // Classifies the given blob (part of word_data->word->word) as an individual
+  // word, using languages, chopper etc, returning only the certainty of the
+  // best raw choice, and undoing all the work done to fake out the word.
+  float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT* pr_it, C_BLOB* blob,
+                           STRING* best_str, float* c2);
+  void classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it,
                                   WordData* word_data);
   void classify_word_pass1(const WordData& word_data,
                            WERD_RES** in_word,
@@ -808,6 +846,24 @@ class Tesseract : public Wordrec {
              "Enable single word correction based on the dictionary.");
   INT_VAR_H(tessedit_bigram_debug, 0, "Amount of debug output for bigram "
             "correction.");
+  BOOL_VAR_H(enable_noise_removal, true,
+             "Remove and conditionally reassign small outlines when they"
+             " confuse layout analysis, determining diacritics vs noise");
+  INT_VAR_H(debug_noise_removal, 0, "Debug reassignment of small outlines");
+  // Worst (min) certainty, for which a diacritic is allowed to make the base
+  // character worse and still be included.
+  double_VAR_H(noise_cert_basechar, -8.0, "Hingepoint for base char certainty");
+  // Worst (min) certainty, for which a non-overlapping diacritic is allowed to
+  // make the base character worse and still be included.
+  double_VAR_H(noise_cert_disjoint, -2.5, "Hingepoint for disjoint certainty");
+  // Worst (min) certainty, for which a diacritic is allowed to make a new
+  // stand-alone blob.
+  double_VAR_H(noise_cert_punc, -2.5, "Threshold for new punc char certainty");
+  // Factor of certainty margin for adding diacritics to not count as worse.
+  double_VAR_H(noise_cert_factor, 0.375,
+               "Scaling on certainty diff from Hingepoint");
+  INT_VAR_H(noise_maxperblob, 8, "Max diacritics to apply to a blob");
+  INT_VAR_H(noise_maxperword, 16, "Max diacritics to apply to a word");
   INT_VAR_H(debug_x_ht_level, 0, "Reestimate debug");
   BOOL_VAR_H(debug_acceptable_wds, false, "Dump word pass/fail chk");
   STRING_VAR_H(chs_leading_punct, "('`\"", "Leading punctuation");
diff --git a/ccstruct/blobbox.h b/ccstruct/blobbox.h
index bd26e1be95..b09d82f4da 100644
--- a/ccstruct/blobbox.h
+++ b/ccstruct/blobbox.h
@@ -137,6 +137,9 @@ class BLOBNBOX:public ELIST_LINK
       cblob_ptr = srcblob;
       area = static_cast<int>(srcblob->area());
     }
+    ~BLOBNBOX() {
+      if (owns_cblob_) delete cblob_ptr;  // Only when ownership was granted.
+    }
     static BLOBNBOX* RealBlob(C_OUTLINE* outline) {
       C_BLOB* blob = new C_BLOB(outline);
       return new BLOBNBOX(blob);
@@ -387,6 +390,7 @@ class BLOBNBOX:public ELIST_LINK
     void set_base_char_blob(BLOBNBOX* blob) {
       base_char_blob_ = blob;
     }
+    void set_owns_cblob(bool value) { owns_cblob_ = value; }
 
     bool UniquelyVertical() const {
       return vert_possible_ && !horz_possible_;
@@ -450,6 +454,7 @@ class BLOBNBOX:public ELIST_LINK
   // construction time.
   void ConstructionInit() {
     cblob_ptr = NULL;
+    owns_cblob_ = false;
     area = 0;
     area_stroke_width_ = 0.0f;
     horz_stroke_width_ = 0.0f;
@@ -525,6 +530,10 @@ class BLOBNBOX:public ELIST_LINK
   bool vert_possible_;           // Could be part of vertical flow.
   bool leader_on_left_;          // There is a leader to the left.
   bool leader_on_right_;         // There is a leader to the right.
+  // Iff true, then the destructor should delete the cblob_ptr.
+  // TODO(rays) migrate all uses to correctly setting this flag instead of
+  // deleting the C_BLOB before deleting the BLOBNBOX.
+  bool owns_cblob_;
 };
 
 class TO_ROW: public ELIST2_LINK
diff --git a/ccstruct/ocrblock.cpp b/ccstruct/ocrblock.cpp
index a328e03887..ad7893b05a 100644
--- a/ccstruct/ocrblock.cpp
+++ b/ccstruct/ocrblock.cpp
@@ -86,6 +86,18 @@ void BLOCK::rotate(const FCOORD& rotation) {
   box = *poly_block()->bounding_box();
 }
 
+// Accumulates (via TBOX +=) the restricted bounding box of each row in the
+// block, passing upper_dots and lower_dots through to every row unchanged.
+TBOX BLOCK::restricted_bounding_box(bool upper_dots, bool lower_dots) const {
+  // ROW_IT needs a non-const list pointer; the rows are only read here.
+  ROW_IT row_it(const_cast<ROW_LIST*>(&rows));
+  TBOX total;
+  for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
+    total += row_it.data()->restricted_bounding_box(upper_dots, lower_dots);
+  }
+  return total;
+}
+
 /**
  * BLOCK::reflect_polygon_in_y_axis
  *
diff --git a/ccstruct/ocrblock.h b/ccstruct/ocrblock.h
index 207c1e8579..c93aaf8a4c 100644
--- a/ccstruct/ocrblock.h
+++ b/ccstruct/ocrblock.h
@@ -161,10 +161,14 @@ class BLOCK:public ELIST_LINK, public PDBLK
     median_size_.set_y(y);
   }
 
-  Pix* render_mask() {
-    return PDBLK::render_mask(re_rotation_);
+  Pix* render_mask(TBOX* mask_box) {
+    return PDBLK::render_mask(re_rotation_, mask_box);
   }
 
+  // Returns the bounding box including the desired combination of upper and
+  // lower noise/diacritic elements.
+  TBOX restricted_bounding_box(bool upper_dots, bool lower_dots) const;
+
   // Reflects the polygon in the y-axis and recomputes the bounding_box.
   // Does nothing to any contained rows/words/blobs etc.
   void reflect_polygon_in_y_axis();
diff --git a/ccstruct/ocrrow.cpp b/ccstruct/ocrrow.cpp
index a7ad6ba791..c6f919ca12 100644
--- a/ccstruct/ocrrow.cpp
+++ b/ccstruct/ocrrow.cpp
@@ -80,6 +80,17 @@ ROW::ROW(                 //constructor
   rmargin_ = 0;
 }
 
+// Accumulates (via TBOX +=) the restricted bounding box of each word in the
+// row, passing upper_dots and lower_dots through to every word unchanged.
+TBOX ROW::restricted_bounding_box(bool upper_dots, bool lower_dots) const {
+  // WERD_IT needs a non-const list pointer; the words are only read here.
+  WERD_IT word_it(const_cast<WERD_LIST *>(&words));
+  TBOX total;
+  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
+    total += word_it.data()->restricted_bounding_box(upper_dots, lower_dots);
+  }
+  return total;
+}
 
 /**********************************************************************
  * ROW::recalc_bounding_box
diff --git a/ccstruct/ocrrow.h b/ccstruct/ocrrow.h
index 1a23889279..45384b710f 100644
--- a/ccstruct/ocrrow.h
+++ b/ccstruct/ocrrow.h
@@ -85,6 +85,9 @@ class ROW:public ELIST_LINK
     TBOX bounding_box() const {  //return bounding box
       return bound_box;
     }
+    // Returns the bounding box including the desired combination of upper and
+    // lower noise/diacritic elements.
+    TBOX restricted_bounding_box(bool upper_dots, bool lower_dots) const;
 
     void set_lmargin(inT16 lmargin) {
       lmargin_ = lmargin;
diff --git a/ccstruct/pageres.cpp b/ccstruct/pageres.cpp
index 5304451929..9c1b13c5c3 100644
--- a/ccstruct/pageres.cpp
+++ b/ccstruct/pageres.cpp
@@ -1258,23 +1258,16 @@ int PAGE_RES_IT::cmp(const PAGE_RES_IT &other) const {
   return 0;
 }
 
-// Inserts the new_word and a corresponding WERD_RES before the current
-// position. The simple fields of the WERD_RES are copied from clone_res and
-// the resulting WERD_RES is returned for further setup with best_choice etc.
+// Inserts the new_word as a combination owned by a corresponding WERD_RES
+// before the current position. The simple fields of the WERD_RES are copied
+// from clone_res and the resulting WERD_RES is returned for further setup
+// with best_choice etc.
 WERD_RES* PAGE_RES_IT::InsertSimpleCloneWord(const WERD_RES& clone_res,
                                              WERD* new_word) {
-  // Insert new_word into the ROW.
-  WERD_IT w_it(row()->row->word_list());
-  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
-    WERD* word = w_it.data();
-    if (word == word_res->word)
-      break;
-  }
-  ASSERT_HOST(!w_it.cycled_list());
-  w_it.add_before_then_move(new_word);
   // Make a WERD_RES for the new_word.
   WERD_RES* new_res = new WERD_RES(new_word);
   new_res->CopySimpleFields(clone_res);
+  new_res->combination = true;
   // Insert into the appropriate place in the ROW_RES.
   WERD_RES_IT wr_it(&row()->word_res_list);
   for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) {
@@ -1477,6 +1470,33 @@ void PAGE_RES_IT::DeleteCurrentWord() {
   ResetWordIterator();
 }
 
+// Sets W_FUZZY_SP on the current word unless it already has W_FUZZY_SP or
+// W_FUZZY_NON. For a combination, the following part-of-combo word is too.
+void PAGE_RES_IT::MakeCurrentWordFuzzy() {
+  WERD* real_word = word_res->word;
+  if (!real_word->flag(W_FUZZY_SP) && !real_word->flag(W_FUZZY_NON)) {
+    real_word->set_flag(W_FUZZY_SP, true);
+    tprintf("Made word fuzzy at:");  // NOTE(review): printed unconditionally.
+    real_word->bounding_box().print();
+    if (word_res->combination) {
+      // The next word should be the corresponding part of combo, but we have
+      // already stepped past it, so find it by search.
+      WERD_RES_IT wr_it(&row()->word_res_list);
+      for (wr_it.mark_cycle_pt();
+           !wr_it.cycled_list() && wr_it.data() != word_res; wr_it.forward()) {
+      }  // Empty body: the loop only positions wr_it on word_res.
+      wr_it.forward();  // Step to the entry right after the combination.
+      ASSERT_HOST(wr_it.data()->part_of_combo);
+      real_word = wr_it.data()->word;
+      ASSERT_HOST(!real_word->flag(W_FUZZY_SP) &&
+                  !real_word->flag(W_FUZZY_NON));
+      real_word->set_flag(W_FUZZY_SP, true);
+      tprintf("Made part of combo word fuzzy at:");
+      real_word->bounding_box().print();
+    }
+  }
+}
+
 /*************************************************************************
  * PAGE_RES_IT::restart_page
  *
diff --git a/ccstruct/pageres.h b/ccstruct/pageres.h
index 75798113d4..a6a8404275 100644
--- a/ccstruct/pageres.h
+++ b/ccstruct/pageres.h
@@ -708,6 +708,10 @@ class PAGE_RES_IT {
   // Deletes the current WERD_RES and its underlying WERD.
   void DeleteCurrentWord();
 
+  // Makes the current word a fuzzy space if not already fuzzy. Updates
+  // corresponding part of combo if required.
+  void MakeCurrentWordFuzzy();
+
   WERD_RES *forward() {  // Get next word.
     return internal_forward(false, false);
   }
@@ -747,9 +751,9 @@ class PAGE_RES_IT {
     return next_block_res;
   }
   void rej_stat_word();  // for page/block/row
+  void ResetWordIterator();
 
  private:
-  void ResetWordIterator();
   WERD_RES *internal_forward(bool new_block, bool empty_ok);
 
   WERD_RES * prev_word_res;    // previous word
diff --git a/ccstruct/pdblock.cpp b/ccstruct/pdblock.cpp
index 97365b53e7..cf3289f2e7 100644
--- a/ccstruct/pdblock.cpp
+++ b/ccstruct/pdblock.cpp
@@ -77,7 +77,6 @@ void PDBLK::set_sides(                       //set vertex lists
   right_it.add_list_before (right);
 }
 
-
 /**********************************************************************
  * PDBLK::contains
  *
@@ -126,7 +125,7 @@ void PDBLK::move(                  // reposition block
 
 // Returns a binary Pix mask with a 1 pixel for every pixel within the
 // block. Rotates the coordinate system by rerotation prior to rendering.
-Pix* PDBLK::render_mask(const FCOORD& rerotation) {
+Pix* PDBLK::render_mask(const FCOORD& rerotation, TBOX* mask_box) {
   TBOX rotated_box(box);
   rotated_box.rotate(rerotation);
   Pix* pix = pixCreate(rotated_box.width(), rotated_box.height(), 1);
@@ -163,6 +162,7 @@ Pix* PDBLK::render_mask(const FCOORD& rerotation) {
     pixRasterop(pix, 0, 0, rotated_box.width(), rotated_box.height(),
                 PIX_SET, NULL, 0, 0);
   }
+  if (mask_box != NULL) *mask_box = rotated_box;
   return pix;
 }
 
diff --git a/ccstruct/pdblock.h b/ccstruct/pdblock.h
index 34f5518e3c..0dd0bf2ef8 100644
--- a/ccstruct/pdblock.h
+++ b/ccstruct/pdblock.h
@@ -89,7 +89,9 @@ class PDBLK
 
     // Returns a binary Pix mask with a 1 pixel for every pixel within the
     // block. Rotates the coordinate system by rerotation prior to rendering.
-    Pix* render_mask(const FCOORD& rerotation);
+    // If not NULL, mask_box is filled with the position box of the returned
+    // mask image.
+    Pix *render_mask(const FCOORD &rerotation, TBOX *mask_box);
 
     #ifndef GRAPHICS_DISABLED
     ///draw histogram
diff --git a/ccstruct/werd.cpp b/ccstruct/werd.cpp
index 24c8a41b33..aaaee9cc23 100644
--- a/ccstruct/werd.cpp
+++ b/ccstruct/werd.cpp
@@ -160,23 +160,37 @@ WERD* WERD::ConstructFromSingleBlob(bool bol, bool eol, C_BLOB* blob) {
  * row being marked as FUZZY space.
  */
 
-TBOX WERD::bounding_box() {
-  TBOX box;                       // box being built
-  C_BLOB_IT rej_cblob_it = &rej_cblobs;  // rejected blobs
-
-  for (rej_cblob_it.mark_cycle_pt(); !rej_cblob_it.cycled_list();
-       rej_cblob_it.forward()) {
-    box += rej_cblob_it.data()->bounding_box();
+TBOX WERD::bounding_box() const { return restricted_bounding_box(true, true); }
+
+// Returns true_bounding_box() grown by rejected blobs; a blob lying wholly
+// above (below) it is included only if upper_dots (lower_dots) is true.
+TBOX WERD::restricted_bounding_box(bool upper_dots, bool lower_dots) const {
+  TBOX result = true_bounding_box();
+  // Fix the limits now, so that dots added later cannot move them.
+  const int good_top = result.top();
+  const int good_bottom = result.bottom();
+  C_BLOB_IT rej_it(const_cast<C_BLOB_LIST*>(&rej_cblobs));  // Read-only use.
+  for (rej_it.mark_cycle_pt(); !rej_it.cycled_list(); rej_it.forward()) {
+    const TBOX dot_box = rej_it.data()->bounding_box();
+    // A dot wholly outside the good blobs is kept only if its flag allows.
+    if (!upper_dots && dot_box.bottom() > good_top) continue;
+    if (!lower_dots && dot_box.top() < good_bottom) continue;
+    result += dot_box;
+  }
+  return result;
+}
 
-  C_BLOB_IT it = &cblobs;    // blobs of WERD
+// Returns the bounding box of only the good blobs.
+TBOX WERD::true_bounding_box() const {
+  TBOX box;  // box being built
+  // This is a read-only iteration of the good blobs.
+  C_BLOB_IT it(const_cast<C_BLOB_LIST*>(&cblobs));
   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
     box += it.data()->bounding_box();
   }
   return box;
 }
 
-
 /**
  * WERD::move
  *
@@ -489,3 +503,101 @@ WERD* WERD::ConstructWerdWithNewBlobs(C_BLOB_LIST* all_blobs,
   }
   return new_werd;
 }
+
+// Scans every outline of every blob in cblobs and re-homes each one whose
+// larger bounding-box side is below size_threshold as a new rej_cblobs blob.
+void WERD::CleanNoise(float size_threshold) {
+  C_BLOB_IT good_it(&cblobs);
+  C_BLOB_IT noise_it(&rej_cblobs);
+  for (good_it.mark_cycle_pt(); !good_it.cycled_list(); good_it.forward()) {
+    C_BLOB* current = good_it.data();
+    C_OUTLINE_IT outline_it(current->out_list());
+    for (outline_it.mark_cycle_pt(); !outline_it.cycled_list();
+         outline_it.forward()) {
+      const TBOX bounds = outline_it.data()->bounding_box();
+      // An outline is measured by the longer side of its bounding box.
+      int longest = bounds.height();
+      if (bounds.width() > longest) longest = bounds.width();
+      if (!(longest < size_threshold)) continue;  // Large enough to stay.
+      // Too small: wrap the outline in a blob of its own on the reject list.
+      C_BLOB* reject = new C_BLOB(outline_it.extract());
+      noise_it.add_after_then_move(reject);
+    }
+    // Blobs left without any outline are removed from the word and freed.
+    if (current->out_list()->empty()) delete good_it.extract();
+  }
+}
+
+// Drains rej_cblobs: extracts one outline from each rejected blob, appends
+// it to outlines (whose holder then owns it), and deletes the blob.
+void WERD::GetNoiseOutlines(GenericVector<C_OUTLINE*>* outlines) {
+  C_BLOB_IT noise_it(&rej_cblobs);
+  for (noise_it.mark_cycle_pt(); !noise_it.empty(); noise_it.forward()) {
+    C_BLOB* husk = noise_it.extract();
+    C_OUTLINE* freed_outline = C_OUTLINE_IT(husk->out_list()).extract();
+    outlines->push_back(freed_outline);
+    delete husk;
+  }
+}
+
+// Adds the selected outlines to the indicated real blobs, and puts the rest
+// back in rej_cblobs where they came from. Where the target_blobs entry is
+// NULL, a run of wanted outlines is put into a single new blob.
+// Ownership of the outlines is transferred back to the word. (Hence
+// GenericVector and not PointerVector.) NULL entries in outlines are skipped.
+// Returns true if any new blob was added to the start of the word, which
+// suggests that it might need joining to the word before it, and likewise
+// sets make_next_word_fuzzy true if any new blob was added to the end.
+bool WERD::AddSelectedOutlines(const GenericVector<bool>& wanted,
+                               const GenericVector<C_BLOB*>& target_blobs,
+                               const GenericVector<C_OUTLINE*>& outlines,
+                               bool* make_next_word_fuzzy) {
+  bool outline_added_to_start = false;
+  if (make_next_word_fuzzy != NULL) *make_next_word_fuzzy = false;
+  C_BLOB_IT rej_it(&rej_cblobs);
+  for (int i = 0; i < outlines.size(); ++i) {
+    C_OUTLINE* outline = outlines[i];
+    if (outline == NULL) continue;  // Already used it.
+    if (wanted[i]) {
+      C_BLOB* target_blob = target_blobs[i];
+      TBOX noise_box = outline->bounding_box();
+      if (target_blob == NULL) {
+        target_blob = new C_BLOB(outline);
+        // Insert before the first blob whose left edge is right of the noise.
+        C_BLOB_IT blob_it(&cblobs);
+        for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
+             blob_it.forward()) {
+          C_BLOB* blob = blob_it.data();
+          TBOX blob_box = blob->bounding_box();
+          if (blob_box.left() > noise_box.left()) {
+            if (blob_it.at_first() && !flag(W_FUZZY_SP) && !flag(W_FUZZY_NON)) {
+              // We might want to join this word to its predecessor.
+              outline_added_to_start = true;
+            }
+            blob_it.add_before_stay_put(target_blob);
+            break;
+          }
+        }
+        if (blob_it.cycled_list()) {  // No blob lies to the right: append.
+          blob_it.add_to_end(target_blob);
+          if (make_next_word_fuzzy != NULL) *make_next_word_fuzzy = true;
+        }
+        // Add all consecutive wanted, non-NULL, null-blob outlines to same blob.
+        C_OUTLINE_IT ol_it(target_blob->out_list());
+        while (i + 1 < outlines.size() && outlines[i + 1] != NULL &&
+               wanted[i + 1] && target_blobs[i + 1] == NULL) {
+          ++i;
+          ol_it.add_to_end(outlines[i]);
+        }
+      } else {
+        // Insert outline into this blob.
+        C_OUTLINE_IT ol_it(target_blob->out_list());
+        ol_it.add_to_end(outline);
+      }
+    } else {
+      // Put back on noise list.
+      rej_it.add_to_end(new C_BLOB(outline));
+    }
+  }
+  return outline_added_to_start;
+}
diff --git a/ccstruct/werd.h b/ccstruct/werd.h
index 43ecb84b6e..f9a89fb5b5 100644
--- a/ccstruct/werd.h
+++ b/ccstruct/werd.h
@@ -114,7 +114,13 @@ class WERD : public ELIST2_LINK {
       script_id_ = id;
     }
 
-    TBOX bounding_box();  // compute bounding box
+    // Returns the (default) bounding box including all the dots.
+    TBOX bounding_box() const;  // compute bounding box
+    // Returns the bounding box including the desired combination of upper and
+    // lower noise/diacritic elements.
+    TBOX restricted_bounding_box(bool upper_dots, bool lower_dots) const;
+    // Returns the bounding box of only the good blobs.
+    TBOX true_bounding_box() const;
 
     const char *text() const { return correct.string(); }
     void set_text(const char *new_text) { correct = new_text; }
@@ -155,6 +161,26 @@ class WERD : public ELIST2_LINK {
     void plot_rej_blobs(ScrollView *window);
     #endif  // GRAPHICS_DISABLED
 
+    // Removes noise from the word by moving small outlines to the rej_cblobs
+    // list, based on the size_threshold.
+    void CleanNoise(float size_threshold);
+
+    // Extracts all the noise outlines and stuffs the pointers into the given
+    // vector of outlines. Afterwards, the outlines vector owns the pointers.
+    void GetNoiseOutlines(GenericVector<C_OUTLINE *> *outlines);
+    // Adds the selected outlines to the indicated real blobs, and puts the rest
+    // back in rej_cblobs where they came from. Where the target_blobs entry is
+    // NULL, a run of wanted outlines is put into a single new blob.
+    // Ownership of the outlines is transferred back to the word. (Hence
+    // GenericVector and not PointerVector.)
+    // Returns true if any new blob was added to the start of the word, which
+    // suggests that it might need joining to the word before it, and likewise
+    // sets make_next_word_fuzzy true if any new blob was added to the end.
+    bool AddSelectedOutlines(const GenericVector<bool> &wanted,
+                             const GenericVector<C_BLOB *> &target_blobs,
+                             const GenericVector<C_OUTLINE *> &outlines,
+                             bool *make_next_word_fuzzy);
+
  private:
     uinT8 blanks;                // no of blanks
     uinT8 dummy;                 // padding
diff --git a/textord/colfind.cpp b/textord/colfind.cpp
index b9b10649af..41b3895602 100644
--- a/textord/colfind.cpp
+++ b/textord/colfind.cpp
@@ -286,22 +286,27 @@ void ColumnFinder::CorrectOrientation(TO_BLOCK* block,
 // thresholds_pix is expected to be present iff grey_pix is present and
 // can be an integer factor reduction of the grey_pix. It represents the
 // thresholds that were used to create the binary_pix from the grey_pix.
+// If diacritic_blobs is non-null, then diacritics/noise blobs, that would
+// confuse layout analysis by causing textline overlap, are placed there,
+// with the expectation that they will be reassigned to words later and
+// noise/diacriticness determined via classification.
 // Returns -1 if the user hits the 'd' key in the blocks window while running
 // in debug mode, which requests a retry with more debug info.
-int ColumnFinder::FindBlocks(PageSegMode pageseg_mode,
-                             Pix* scaled_color, int scaled_factor,
-                             TO_BLOCK* input_block, Pix* photo_mask_pix,
-                             Pix* thresholds_pix, Pix* grey_pix,
-                             BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks) {
+int ColumnFinder::FindBlocks(PageSegMode pageseg_mode, Pix* scaled_color,
+                             int scaled_factor, TO_BLOCK* input_block,
+                             Pix* photo_mask_pix, Pix* thresholds_pix,
+                             Pix* grey_pix, BLOCK_LIST* blocks,
+                             BLOBNBOX_LIST* diacritic_blobs,
+                             TO_BLOCK_LIST* to_blocks) {
   pixOr(photo_mask_pix, photo_mask_pix, nontext_map_);
   stroke_width_->FindLeaderPartitions(input_block, &part_grid_);
   stroke_width_->RemoveLineResidue(&big_parts_);
   FindInitialTabVectors(NULL, min_gutter_width_, tabfind_aligned_gap_fraction_,
                         input_block);
   SetBlockRuleEdges(input_block);
-  stroke_width_->GradeBlobsIntoPartitions(rerotate_, input_block, nontext_map_,
-                                          denorm_, cjk_script_, &projection_,
-                                          &part_grid_, &big_parts_);
+  stroke_width_->GradeBlobsIntoPartitions(
+      rerotate_, input_block, nontext_map_, denorm_, cjk_script_, &projection_,
+      diacritic_blobs, &part_grid_, &big_parts_);
   if (!PSM_SPARSE(pageseg_mode)) {
     ImageFind::FindImagePartitions(photo_mask_pix, rotation_, rerotate_,
                                    input_block, this, &part_grid_, &big_parts_);
@@ -1134,9 +1139,13 @@ void ColumnFinder::GridMergePartitions() {
             neighbour->Print();
           }
           rsearch.RemoveBBox();
-          gsearch.RepositionIterator();
+          if (!modified_box) {
+            // We are going to modify part, so remove it and re-insert it after.
+            gsearch.RemoveBBox();
+            rsearch.RepositionIterator();
+            modified_box = true;
+          }
           part->Absorb(neighbour, WidthCB());
-          modified_box = true;
         } else if (debug) {
           tprintf("Neighbour failed hgap test\n");
         }
@@ -1151,7 +1160,6 @@ void ColumnFinder::GridMergePartitions() {
       // or it will never be found by a full search.
       // Because the box has changed, it has to be removed first, otherwise
       // add_sorted may fail to keep a single copy of the pointer.
-      gsearch.RemoveBBox();
       part_grid_.InsertBBox(true, true, part);
       gsearch.RepositionIterator();
     }
diff --git a/textord/colfind.h b/textord/colfind.h
index 04ad1684de..eedd4c407e 100644
--- a/textord/colfind.h
+++ b/textord/colfind.h
@@ -155,13 +155,15 @@ class ColumnFinder : public TabFind {
   // thresholds_pix is expected to be present iff grey_pix is present and
   // can be an integer factor reduction of the grey_pix. It represents the
   // thresholds that were used to create the binary_pix from the grey_pix.
+  // Small blobs that confuse the segmentation into lines are placed into
+  // diacritic_blobs, with the intention that they be put into the most
+  // appropriate word after the rest of layout analysis.
   // Returns -1 if the user hits the 'd' key in the blocks window while running
   // in debug mode, which requests a retry with more debug info.
-  int FindBlocks(PageSegMode pageseg_mode,
-                 Pix* scaled_color, int scaled_factor,
-                 TO_BLOCK* block, Pix* photo_mask_pix,
-                 Pix* thresholds_pix, Pix* grey_pix,
-                 BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks);
+  int FindBlocks(PageSegMode pageseg_mode, Pix* scaled_color, int scaled_factor,
+                 TO_BLOCK* block, Pix* photo_mask_pix, Pix* thresholds_pix,
+                 Pix* grey_pix, BLOCK_LIST* blocks,
+                 BLOBNBOX_LIST* diacritic_blobs, TO_BLOCK_LIST* to_blocks);
 
   // Get the rotation required to deskew, and its inverse rotation.
   void GetDeskewVectors(FCOORD* deskew, FCOORD* reskew);
diff --git a/textord/colpartition.cpp b/textord/colpartition.cpp
index e9ce568aa3..565c660bb2 100644
--- a/textord/colpartition.cpp
+++ b/textord/colpartition.cpp
@@ -297,6 +297,25 @@ void ColPartition::DisownBoxesNoAssert() {
   }
 }
 
+// Extracts every non-leader blob from boxes_, clearing its owner if that
+// owner is this partition. If no blobs are left afterwards returns false;
+// otherwise sets flow_ to BTFT_LEADER, recomputes the limits and returns
+// true.
+bool ColPartition::ReleaseNonLeaderBoxes() {
+  BLOBNBOX_C_IT box_it(&boxes_);
+  for (box_it.mark_cycle_pt(); !box_it.cycled_list(); box_it.forward()) {
+    BLOBNBOX* candidate = box_it.data();
+    if (candidate->flow() == BTFT_LEADER) continue;  // Leaders stay put.
+    if (candidate->owner() == this) candidate->set_owner(NULL);
+    box_it.extract();
+  }
+  if (box_it.empty()) return false;  // Every blob was released.
+  // Only leader blobs are left, so the partition becomes a leader again.
+  flow_ = BTFT_LEADER;
+  ComputeLimits();
+  return true;
+}
+
 // Delete the boxes that this partition owns.
 void ColPartition::DeleteBoxes() {
   // Although the boxes_ list is a C_LIST, in some cases it owns the
@@ -831,6 +850,10 @@ ColPartition* ColPartition::SplitAt(int split_x) {
         bbox->set_owner(split_part);
     }
   }
+  if (it.empty()) {
+    // Possible if split_x passes through the first blob.
+    it.add_list_after(&split_part->boxes_);
+  }
   ASSERT_HOST(!it.empty());
   if (split_part->IsEmpty()) {
     // Split part ended up with nothing. Possible if split_x passes
@@ -1130,6 +1153,7 @@ bool ColPartition::MarkAsLeaderIfMonospaced() {
     if (best_end != NULL && best_end->total_cost() < blob_count) {
       // Good enough. Call it a leader.
       result = true;
+      bool modified_blob_list = false;
       for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
         BLOBNBOX* blob = it.data();
         TBOX box = blob->bounding_box();
@@ -1139,6 +1163,7 @@ bool ColPartition::MarkAsLeaderIfMonospaced() {
                      blob->bounding_box().right();
           if (blob->bounding_box().width() + gap > max_step) {
             it.extract();
+            modified_blob_list = true;
             continue;
           }
         }
@@ -1147,12 +1172,14 @@ bool ColPartition::MarkAsLeaderIfMonospaced() {
                      it.data_relative(-1)->bounding_box().right();
           if (blob->bounding_box().width() + gap > max_step) {
             it.extract();
+            modified_blob_list = true;
             break;
           }
         }
         blob->set_region_type(BRT_TEXT);
         blob->set_flow(BTFT_LEADER);
       }
+      if (modified_blob_list) ComputeLimits();
       blob_type_ = BRT_TEXT;
       flow_ = BTFT_LEADER;
     } else if (textord_debug_tabfind) {
diff --git a/textord/colpartition.h b/textord/colpartition.h
index 7f6cd64328..1b35d48545 100644
--- a/textord/colpartition.h
+++ b/textord/colpartition.h
@@ -481,6 +481,11 @@ class ColPartition : public ELIST2_LINK {
   // Any blobs that are not owned by this partition get to keep their owner
   // without an assert failure.
   void DisownBoxesNoAssert();
+  // NULLs the owner of the blobs in this partition that are owned by this
+  // partition and not leader blobs, removing them from the boxes_ list, thus
+  // turning this partition back to a leader partition if it contains a leader,
+  // or otherwise leaving it empty. Returns true if any boxes remain.
+  bool ReleaseNonLeaderBoxes();
 
   // Delete the boxes that this partition owns.
   void DeleteBoxes();
diff --git a/textord/colpartitiongrid.cpp b/textord/colpartitiongrid.cpp
index 6cd8f31c93..800cbcb3c9 100644
--- a/textord/colpartitiongrid.cpp
+++ b/textord/colpartitiongrid.cpp
@@ -324,6 +324,40 @@ static bool TestCompatibleCandidates(const ColPartition& part, bool debug,
   return true;
 }
 
+// Returns the sum, over every partition in the grid, of its bounding-box
+// intersection area with each partition FindOverlappingPartitions reports.
+// If overlap_grid is non-NULL, *overlap_grid is created on first need and
+// receives shallow copies of partitions involved in a positive-area overlap.
+int ColPartitionGrid::ComputeTotalOverlap(ColPartitionGrid** overlap_grid) {
+  int overlap_sum = 0;
+  ColPartitionGridSearch full_search(this);
+  full_search.StartFullSearch();
+  for (ColPartition* part = full_search.NextFullSearch(); part != NULL;
+       part = full_search.NextFullSearch()) {
+    const TBOX& part_box = part->bounding_box();
+    ColPartition_CLIST overlappers;
+    FindOverlappingPartitions(part_box, part, &overlappers);
+    // True once any neighbour has been visited, whatever its overlap was.
+    bool seen_neighbour = false;
+    ColPartition_C_IT it(&overlappers);
+    for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
+      ColPartition* neighbour = it.data();
+      int area = neighbour->bounding_box().intersection(part_box).area();
+      if (overlap_grid != NULL && area > 0) {
+        if (*overlap_grid == NULL)
+          *overlap_grid = new ColPartitionGrid(gridsize(), bleft(), tright());
+        (*overlap_grid)->InsertBBox(true, true, neighbour->ShallowCopy());
+        // part itself is copied only if its first neighbour overlaps it.
+        if (!seen_neighbour)
+          (*overlap_grid)->InsertBBox(true, true, part->ShallowCopy());
+      }
+      seen_neighbour = true;
+      overlap_sum += area;
+    }
+  }
+  return overlap_sum;
+}
+
 // Finds all the ColPartitions in the grid that overlap with the given
 // box and returns them SortByBoxLeft(ed) and uniqued in the given list.
 // Any partition equal to not_this (may be NULL) is excluded.
@@ -901,6 +935,7 @@ void ColPartitionGrid::ReTypeBlobs(BLOBNBOX_LIST* im_blobs) {
   while ((part = gsearch.NextFullSearch()) != NULL) {
     BlobRegionType blob_type = part->blob_type();
     BlobTextFlowType flow = part->flow();
+    bool any_blobs_moved = false;
     if (blob_type == BRT_POLYIMAGE || blob_type == BRT_RECTIMAGE) {
       BLOBNBOX_C_IT blob_it(part->boxes());
       for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
@@ -918,6 +953,7 @@ void ColPartitionGrid::ReTypeBlobs(BLOBNBOX_LIST* im_blobs) {
           ASSERT_HOST(blob->cblob()->area() != 0);
           blob->set_owner(NULL);
           blob_it.extract();
+          any_blobs_moved = true;
         } else {
           blob->set_region_type(blob_type);
           if (blob->flow() != BTFT_LEADER)
@@ -938,6 +974,11 @@ void ColPartitionGrid::ReTypeBlobs(BLOBNBOX_LIST* im_blobs) {
           delete blob;
         }
       }
+    } else if (any_blobs_moved) {
+      gsearch.RemoveBBox();
+      part->ComputeLimits();
+      InsertBBox(true, true, part);
+      gsearch.RepositionIterator();
     }
   }
 }
@@ -1048,6 +1089,24 @@ void ColPartitionGrid::DeleteUnknownParts(TO_BLOCK* block) {
   block->DeleteUnownedNoise();
 }
 
+// Strips every partition whose flow is not BTFT_LEADER: a partition that keeps
+// boxes after ReleaseNonLeaderBoxes() is re-inserted, any other is deleted.
+void ColPartitionGrid::DeleteNonLeaderParts() {
+  ColPartitionGridSearch search(this);
+  search.StartFullSearch();
+  for (ColPartition* part = search.NextFullSearch(); part != NULL;
+       part = search.NextFullSearch()) {
+    if (part->flow() == BTFT_LEADER) continue;  // Leaders stay untouched.
+    search.RemoveBBox();
+    if (!part->ReleaseNonLeaderBoxes()) {
+      delete part;  // Nothing left in the partition.
+      continue;
+    }
+    InsertBBox(true, true, part);
+    search.RepositionIterator();
+  }
+}
+
 // Finds and marks text partitions that represent figure captions.
 void ColPartitionGrid::FindFigureCaptions() {
   // For each image region find its best candidate text caption region,
diff --git a/textord/colpartitiongrid.h b/textord/colpartitiongrid.h
index 40946e5746..94e7da2c43 100644
--- a/textord/colpartitiongrid.h
+++ b/textord/colpartitiongrid.h
@@ -63,6 +63,11 @@ class ColPartitionGrid : public BBGrid<ColPartition,
                                      const ColPartition*>* confirm_cb,
                  ColPartition* part);
 
+  // Computes and returns the total overlap of all partitions in the grid.
+  // If overlap_grid is non-null, it is filled with a grid that holds empty
+  // partitions representing the union of all overlapped partitions.
+  int ComputeTotalOverlap(ColPartitionGrid** overlap_grid);
+
   // Finds all the ColPartitions in the grid that overlap with the given
   // box and returns them SortByBoxLeft(ed) and uniqued in the given list.
   // Any partition equal to not_this (may be NULL) is excluded.
@@ -165,6 +170,10 @@ class ColPartitionGrid : public BBGrid<ColPartition,
   // all the blobs in them.
   void DeleteUnknownParts(TO_BLOCK* block);
 
+  // Deletes all the partitions in the grid that are NOT of flow type
+  // BTFT_LEADER.
+  void DeleteNonLeaderParts();
+
   // Finds and marks text partitions that represent figure captions.
   void FindFigureCaptions();
 
diff --git a/textord/strokewidth.cpp b/textord/strokewidth.cpp
index e6c16abcc3..5aa4b481a6 100644
--- a/textord/strokewidth.cpp
+++ b/textord/strokewidth.cpp
@@ -109,6 +109,13 @@ const float kSizeRatioToReject = 2.0;
 const int kMaxLargeOverlaps = 3;
 // Expansion factor for search box for good neighbours.
 const double kNeighbourSearchFactor = 2.5;
+// Factor of increase of overlap when adding diacritics to make an image noisy.
+const double kNoiseOverlapGrowthFactor = 4.0;
+// Fraction of the image size to add overlap when adding diacritics for an
+// image to qualify as noisy.
+const double kNoiseOverlapAreaFactor = 1.0 / 512;
+// Ratio of perimeter^2/area for a blob to be considered noise vs i dot.
+const double kShapePerimeterRatio = 3.0;
 
 StrokeWidth::StrokeWidth(int gridsize,
                          const ICOORD& bleft, const ICOORD& tright)
@@ -343,14 +350,11 @@ void StrokeWidth::RemoveLineResidue(ColPartition_LIST* big_part_list) {
 // part_grid is the output grid of textline partitions.
 // Large blobs that cause overlap are put in separate partitions and added
 // to the big_parts list.
-void StrokeWidth::GradeBlobsIntoPartitions(const FCOORD& rerotation,
-                                           TO_BLOCK* block,
-                                           Pix* nontext_pix,
-                                           const DENORM* denorm,
-                                           bool cjk_script,
-                                           TextlineProjection* projection,
-                                           ColPartitionGrid* part_grid,
-                                           ColPartition_LIST* big_parts) {
+void StrokeWidth::GradeBlobsIntoPartitions(
+    const FCOORD& rerotation, TO_BLOCK* block, Pix* nontext_pix,
+    const DENORM* denorm, bool cjk_script, TextlineProjection* projection,
+    BLOBNBOX_LIST* diacritic_blobs, ColPartitionGrid* part_grid,
+    ColPartition_LIST* big_parts) {
   nontext_map_ = nontext_pix;
   projection_ = projection;
   denorm_ = denorm;
@@ -363,7 +367,7 @@ void StrokeWidth::GradeBlobsIntoPartitions(const FCOORD& rerotation,
   if (cjk_script) {
     FixBrokenCJK(block);
   }
-  FindTextlineFlowDirection(true);
+  FindTextlineFlowDirection(false);
   projection_->ConstructProjection(block, rerotation, nontext_map_);
   if (textord_tabfind_show_strokewidths) {
     ScrollView* line_blobs_win = MakeWindow(0, 0, "Initial textline Blobs");
@@ -375,7 +379,19 @@ void StrokeWidth::GradeBlobsIntoPartitions(const FCOORD& rerotation,
   // Clear and re Insert to take advantage of the removed diacritics.
   Clear();
   InsertBlobs(block);
-  FindInitialPartitions(rerotation, block, part_grid, big_parts);
+  FCOORD skew;
+  FindTextlineFlowDirection(true);
+  PartitionFindResult r = FindInitialPartitions(
+      rerotation, true, block, diacritic_blobs, part_grid, big_parts, &skew);
+  if (r == PFR_NOISE) {
+    tprintf("Detected %d diacritics\n", diacritic_blobs->length());
+    // Noise was found, and removed.
+    Clear();
+    InsertBlobs(block);
+    FindTextlineFlowDirection(true);
+    r = FindInitialPartitions(rerotation, false, block, diacritic_blobs,
+                              part_grid, big_parts, &skew);
+  }
   nontext_map_ = NULL;
   projection_ = NULL;
   denorm_ = NULL;
@@ -1220,10 +1236,17 @@ void StrokeWidth::SmoothNeighbourTypes(BLOBNBOX* blob, bool reset_all) {
 // minimize overlap and smoothes the types with neighbours and the color
 // image if provided. rerotation is used to rotate the coordinate space
 // back to the nontext_map_ image.
-void StrokeWidth::FindInitialPartitions(const FCOORD& rerotation,
-                                        TO_BLOCK* block,
-                                        ColPartitionGrid* part_grid,
-                                        ColPartition_LIST* big_parts) {
+// If find_problems is true, detects possible noise pollution by the amount
+// of partition overlap that is created by the diacritics. If excessive, the
+// noise is separated out into diacritic blobs, and PFR_NOISE is returned.
+// [TODO(rays): if the partition overlap is caused by heavy skew, deskews
+// the components, saves the skew_angle and returns PFR_SKEW.] If the return
+// is not PFR_OK, the job is incomplete, and FindInitialPartitions must be
+// called again after cleaning up the partly done work.
+PartitionFindResult StrokeWidth::FindInitialPartitions(
+    const FCOORD& rerotation, bool find_problems, TO_BLOCK* block,
+    BLOBNBOX_LIST* diacritic_blobs, ColPartitionGrid* part_grid,
+    ColPartition_LIST* big_parts, FCOORD* skew_angle) {
   FindVerticalTextChains(part_grid);
   FindHorizontalTextChains(part_grid);
   if (textord_tabfind_show_strokewidths) {
@@ -1231,6 +1254,10 @@ void StrokeWidth::FindInitialPartitions(const FCOORD& rerotation,
     part_grid->DisplayBoxes(chains_win_);
     projection_->DisplayProjection();
   }
+  if (find_problems) {
+    // TODO(rays) Do something to find skew, set skew_angle and return if there
+    // is some.
+  }
   part_grid->SplitOverlappingPartitions(big_parts);
   EasyMerges(part_grid);
   RemoveLargeUnusedBlobs(block, part_grid, big_parts);
@@ -1239,8 +1266,14 @@ void StrokeWidth::FindInitialPartitions(const FCOORD& rerotation,
                                          rerotation));
   while (part_grid->GridSmoothNeighbours(BTFT_NEIGHBOURS, nontext_map_,
                                          grid_box, rerotation));
+  int pre_overlap = part_grid->ComputeTotalOverlap(NULL);
   TestDiacritics(part_grid, block);
   MergeDiacritics(block, part_grid);
+  if (find_problems && diacritic_blobs != NULL &&
+      DetectAndRemoveNoise(pre_overlap, grid_box, block, part_grid,
+                           diacritic_blobs)) {
+    return PFR_NOISE;
+  }
   if (textord_tabfind_show_strokewidths) {
     textlines_win_ = MakeWindow(400, 400, "GoodTextline blobs");
     part_grid->DisplayBoxes(textlines_win_);
@@ -1260,6 +1293,57 @@ void StrokeWidth::FindInitialPartitions(const FCOORD& rerotation,
     smoothed_win_ = MakeWindow(800, 400, "Smoothed blobs");
     part_grid->DisplayBoxes(smoothed_win_);
   }
+  return PFR_OK;
+}
+
+// Detects noise by a significant increase in partition overlap from
+// pre_overlap to now. When the increase is large enough, the non-leader
+// partitions of part_grid are deleted and the unowned diacritic candidates in
+// block->noise_blobs that lie near an overlapped partition are moved to
+// diacritic_blobs. Returns true if the page was judged noisy in this way.
+bool StrokeWidth::DetectAndRemoveNoise(int pre_overlap, const TBOX& grid_box,
+                                       TO_BLOCK* block,
+                                       ColPartitionGrid* part_grid,
+                                       BLOBNBOX_LIST* diacritic_blobs) {
+  ColPartitionGrid* noise_grid = NULL;
+  int post_overlap = part_grid->ComputeTotalOverlap(&noise_grid);
+  // No grid was made, so no partitions overlap and there is no noise.
+  if (noise_grid == NULL) return false;
+  // Avoid a zero baseline for the growth-factor comparison.
+  if (pre_overlap == 0) pre_overlap = 1;
+  bool noisy = post_overlap > pre_overlap * kNoiseOverlapGrowthFactor &&
+               post_overlap > grid_box.area() * kNoiseOverlapAreaFactor;
+  if (noisy) {
+    // This is noisy enough to fix.
+    if (textord_tabfind_show_strokewidths) {
+      ScrollView* noise_win = MakeWindow(1000, 500, "Noise Areas");
+      noise_grid->DisplayBoxes(noise_win);
+    }
+    part_grid->DeleteNonLeaderParts();
+    BLOBNBOX_IT diacritic_it(diacritic_blobs);
+    BLOBNBOX_IT blob_it(&block->noise_blobs);
+    ColPartitionGridSearch rsearch(noise_grid);
+    for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
+      BLOBNBOX* blob = blob_it.data();
+      blob->ClearNeighbours();
+      if (!blob->IsDiacritic() || blob->owner() != NULL)
+        continue;  // Not a noise candidate.
+      // Look for any overlapped partition within gridsize() of the blob.
+      TBOX search_box(blob->bounding_box());
+      search_box.pad(gridsize(), gridsize());
+      rsearch.StartRectSearch(search_box);
+      if (rsearch.NextRectSearch() != NULL) {
+        // Consider blob as possible noise.
+        blob->set_owns_cblob(true);
+        blob->compute_bounding_box();
+        diacritic_it.add_after_then_move(blob_it.extract());
+      }
+    }
+  }
+  // The grid and its shallow-copied partitions are needed in neither case.
+  noise_grid->DeleteParts();
+  delete noise_grid;
+  return noisy;
 }
 
 // Helper verifies that blob's neighbour in direction dir is good to add to a
diff --git a/textord/strokewidth.h b/textord/strokewidth.h
index 5d649b5708..12cb3c91f6 100644
--- a/textord/strokewidth.h
+++ b/textord/strokewidth.h
@@ -41,6 +41,14 @@ enum LeftOrRight {
   LR_RIGHT
 };
 
+// Return value from FindInitialPartitions. Anything other than PFR_OK means
+// the job is incomplete and FindInitialPartitions must be called again.
+enum PartitionFindResult {
+  PFR_OK,    // Partitioning completed normally.
+  PFR_SKEW,  // Skew was detected and rotated (detection is still a TODO).
+  PFR_NOISE  // Noise was detected and moved out to the diacritic blobs.
+};
+
 /**
  * The StrokeWidth class holds all the normal and large blobs.
  * It is used to find good large blobs and move them to the normal blobs
@@ -110,12 +118,10 @@ class StrokeWidth : public BlobGrid {
   // part_grid is the output grid of textline partitions.
   // Large blobs that cause overlap are put in separate partitions and added
   // to the big_parts list.
-  void GradeBlobsIntoPartitions(const FCOORD& rerotation,
-                                TO_BLOCK* block,
-                                Pix* nontext_pix,
-                                const DENORM* denorm,
-                                bool cjk_script,
-                                TextlineProjection* projection,
+  void GradeBlobsIntoPartitions(const FCOORD& rerotation, TO_BLOCK* block,
+                                Pix* nontext_pix, const DENORM* denorm,
+                                bool cjk_script, TextlineProjection* projection,
+                                BLOBNBOX_LIST* diacritic_blobs,
                                 ColPartitionGrid* part_grid,
                                 ColPartition_LIST* big_parts);
 
@@ -205,10 +211,26 @@ class StrokeWidth : public BlobGrid {
   // minimize overlap and smoothes the types with neighbours and the color
   // image if provided. rerotation is used to rotate the coordinate space
   // back to the nontext_map_ image.
-  void FindInitialPartitions(const FCOORD& rerotation,
-                             TO_BLOCK* block,
-                             ColPartitionGrid* part_grid,
-                             ColPartition_LIST* big_parts);
+  // If find_problems is true, detects possible noise pollution by the amount
+  // of partition overlap that is created by the diacritics. If excessive, the
+  // noise is separated out into diacritic blobs, and PFR_NOISE is returned.
+  // [TODO(rays): if the partition overlap is caused by heavy skew, deskews
+  // the components, saves the skew_angle and returns PFR_SKEW.] If the return
+  // is not PFR_OK, the job is incomplete, and FindInitialPartitions must be
+  // called again after cleaning up the partly done work.
+  PartitionFindResult FindInitialPartitions(const FCOORD& rerotation,
+                                            bool find_problems, TO_BLOCK* block,
+                                            BLOBNBOX_LIST* diacritic_blobs,
+                                            ColPartitionGrid* part_grid,
+                                            ColPartition_LIST* big_parts,
+                                            FCOORD* skew_angle);
+  // Detects noise by a significant increase in partition overlap from
+  // pre_overlap to now, and removes noise from the union of all the overlapping
+  // partitions, placing the blobs in diacritic_blobs. Returns true if any noise
+  // was found and removed.
+  bool DetectAndRemoveNoise(int pre_overlap, const TBOX& grid_box,
+                            TO_BLOCK* block, ColPartitionGrid* part_grid,
+                            BLOBNBOX_LIST* diacritic_blobs);
   // Finds vertical chains of text-like blobs and puts them in ColPartitions.
   void FindVerticalTextChains(ColPartitionGrid* part_grid);
   // Finds horizontal chains of text-like blobs and puts them in ColPartitions.
diff --git a/textord/tablefind.cpp b/textord/tablefind.cpp
index 888fe145f5..2e38bada0b 100644
--- a/textord/tablefind.cpp
+++ b/textord/tablefind.cpp
@@ -974,12 +974,12 @@ bool TableFinder::HasLeaderAdjacent(const ColPartition& part) {
     hsearch.StartSideSearch(x, bottom, top);
     ColPartition* leader = NULL;
     while ((leader = hsearch.NextSideSearch(right_to_left)) != NULL) {
-      // This should not happen, they are in different grids.
-      ASSERT_HOST(&part != leader);
       // The leader could be a horizontal ruling in the grid.
       // Make sure it is actually a leader.
       if (leader->flow() != BTFT_LEADER)
         continue;
+      // This should not happen, they are in different grids.
+      ASSERT_HOST(&part != leader);
       // Make sure the leader shares a page column with the partition,
       // otherwise we are spreading across columns.
       if (!part.IsInSameColumnAs(*leader))
diff --git a/textord/textord.cpp b/textord/textord.cpp
index cf2fc04fe3..6156e45b3b 100644
--- a/textord/textord.cpp
+++ b/textord/textord.cpp
@@ -268,7 +268,7 @@ Textord::~Textord() {
 void Textord::TextordPage(PageSegMode pageseg_mode, const FCOORD& reskew,
                           int width, int height, Pix* binary_pix,
                           Pix* thresholds_pix, Pix* grey_pix,
-                          bool use_box_bottoms,
+                          bool use_box_bottoms, BLOBNBOX_LIST* diacritic_blobs,
                           BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks) {
   page_tr_.set_x(width);
   page_tr_.set_y(height);
@@ -340,9 +340,9 @@ void Textord::TextordPage(PageSegMode pageseg_mode, const FCOORD& reskew,
     make_single_word(pageseg_mode == PSM_SINGLE_CHAR,
                      to_block->get_rows(), to_block->block->row_list());
   }
-  cleanup_blocks(PSM_WORD_FIND_ENABLED(pageseg_mode), blocks);
   // Remove empties.
-
+  cleanup_blocks(PSM_WORD_FIND_ENABLED(pageseg_mode), blocks);
+  TransferDiacriticsToBlockGroups(diacritic_blobs, blocks);
   // Compute the margins for each row in the block, to be used later for
   // paragraph detection.
   BLOCK_IT b_it(blocks);
diff --git a/textord/textord.h b/textord/textord.h
index b99541efce..cc9cb1d341 100644
--- a/textord/textord.h
+++ b/textord/textord.h
@@ -22,6 +22,7 @@
 #define TESSERACT_TEXTORD_TEXTORD_H__
 
 #include "ccstruct.h"
+#include "bbgrid.h"
 #include "blobbox.h"
 #include "gap_map.h"
 #include "publictypes.h"  // For PageSegMode.
@@ -35,6 +36,35 @@ class ScrollView;
 
 namespace tesseract {
 
+// Pairs a borrowed WERD with a cached copy of its bounding box padded on all
+// sides by the word height, which makes it easy to find words near a diacritic.
+class WordWithBox {
+ public:
+  WordWithBox() : word_(NULL) {}
+  explicit WordWithBox(WERD *word) : word_(word) {
+    bounding_box_ = word->bounding_box();
+    const int padding = bounding_box_.height();
+    bounding_box_.pad(padding, padding);
+  }
+  // Returns the cached, padded bounding box.
+  const TBOX &bounding_box() const { return bounding_box_; }
+  // Returns the bounding box of only the good blobs.
+  TBOX true_bounding_box() const { return word_->true_bounding_box(); }
+  C_BLOB_LIST *RejBlobs() const { return word_->rej_cblob_list(); }
+  const WERD *word() const { return word_; }
+
+ private:
+  // Borrowed pointer to a real word somewhere that must outlive this class.
+  WERD *word_;
+  // Cached expanded bounding box of the word, padded all round by its height.
+  TBOX bounding_box_;
+};
+
+// Make it usable by BBGrid.
+CLISTIZEH(WordWithBox)
+typedef BBGrid<WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT> WordGrid;
+typedef GridSearch<WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT> WordSearch;
+
 class Textord {
  public:
   explicit Textord(CCStruct* ccstruct);
@@ -47,11 +77,13 @@ class Textord {
   // thresholds_pix is expected to be present iff grey_pix is present and
   // can be an integer factor reduction of the grey_pix. It represents the
   // thresholds that were used to create the binary_pix from the grey_pix.
-  void TextordPage(PageSegMode pageseg_mode, const FCOORD& reskew,
-                   int width, int height, Pix* binary_pix,
-                   Pix* thresholds_pix, Pix* grey_pix,
-                   bool use_box_bottoms,
-                   BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks);
+  // diacritic_blobs contain small confusing components that should be added
+  // to the appropriate word(s) in case they are really diacritics.
+  void TextordPage(PageSegMode pageseg_mode, const FCOORD &reskew, int width,
+                   int height, Pix *binary_pix, Pix *thresholds_pix,
+                   Pix *grey_pix, bool use_box_bottoms,
+                   BLOBNBOX_LIST *diacritic_blobs, BLOCK_LIST *blocks,
+                   TO_BLOCK_LIST *to_blocks);
 
   // If we were supposed to return only a single textline, and there is more
   // than one, clean up and leave only the best.
@@ -212,6 +244,17 @@ class Textord {
   // Remove outlines that are a tiny fraction in either width or height
   // of the word height.
   void clean_small_noise_from_words(ROW *row);
+  // Groups blocks by rotation, then, for each group, makes a WordGrid and calls
+  // TransferDiacriticsToWords to copy the diacritic blobs to the most
+  // appropriate words in the group of blocks. Source blobs are not touched.
+  void TransferDiacriticsToBlockGroups(BLOBNBOX_LIST* diacritic_blobs,
+                                       BLOCK_LIST* blocks);
+  // Places a copy of blobs that are near a word (after applying rotation to the
+  // blob) in the most appropriate word, unless there is doubt, in which case a
+  // blob can end up in two words. Source blobs are not touched.
+  void TransferDiacriticsToWords(BLOBNBOX_LIST *diacritic_blobs,
+                                 const FCOORD &rotation, WordGrid *word_grid);
+
  public:
   // makerow.cpp ///////////////////////////////////////////
   BOOL_VAR_H(textord_single_height_mode, false,
diff --git a/textord/topitch.cpp b/textord/topitch.cpp
index 3136a9417e..e918f14c36 100644
--- a/textord/topitch.cpp
+++ b/textord/topitch.cpp
@@ -283,12 +283,13 @@ void fix_row_pitch(TO_ROW *bad_row,        // row to fix
     bad_row->space_threshold =
       (bad_row->min_space + bad_row->max_nonspace) / 2;
     bad_row->space_size = bad_row->fixed_pitch;
-    if (bad_row->char_cells.empty ())
+    if (bad_row->char_cells.empty() && !bad_row->blob_list()->empty()) {
       tune_row_pitch (bad_row, &bad_row->projection,
         bad_row->projection_left, bad_row->projection_right,
         (bad_row->fixed_pitch +
         bad_row->max_nonspace * 3) / 4, bad_row->fixed_pitch,
         sp_sd, mid_cuts, &bad_row->char_cells, FALSE);
+    }
   }
   else if (bad_row->pitch_decision == PITCH_CORR_PROP
   || bad_row->pitch_decision == PITCH_DEF_PROP) {
@@ -1279,13 +1280,13 @@ float tune_row_pitch2(                             //find fp cells
 
   best_sp_sd = initial_pitch;
 
-  if (textord_disable_pitch_test) {
+  best_pitch = static_cast<int>(initial_pitch);
+  if (textord_disable_pitch_test || best_pitch <= textord_pitch_range) {
     return initial_pitch;
   }
   sum_proj = new STATS[textord_pitch_range * 2 + 1];
   if (sum_proj == NULL)
     return initial_pitch;
-  best_pitch = (inT32) initial_pitch;
 
   for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range;
     pitch_delta++)
@@ -1293,12 +1294,12 @@ float tune_row_pitch2(                             //find fp cells
       best_pitch +
       pitch_delta + 1);
   for (pixel = projection_left; pixel <= projection_right; pixel++) {
-    for (pitch_delta = -textord_pitch_range;
-      pitch_delta <= textord_pitch_range; pitch_delta++)
-    sum_proj[textord_pitch_range +
-        pitch_delta].add ((pixel - projection_left) % (best_pitch +
-        pitch_delta),
-        projection->pile_count (pixel));
+    for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range;
+         pitch_delta++) {
+      sum_proj[textord_pitch_range + pitch_delta].add(
+          (pixel - projection_left) % (best_pitch + pitch_delta),
+          projection->pile_count(pixel));
+    }
   }
   best_count = sum_proj[textord_pitch_range].pile_count (0);
   best_delta = 0;
@@ -1427,7 +1428,7 @@ float compute_pitch_sd(                            //find fp cells
   if (blob_it.empty ())
     return space_size * 10;
 #ifndef GRAPHICS_DISABLED
-  if (testing_on && to_win > 0) {
+  if (testing_on && to_win != NULL) {
     blob_box = blob_it.data ()->bounding_box ();
     projection->plot (to_win, projection_left,
       row->intercept (), 1.0f, -1.0f, ScrollView::CORAL);
@@ -1476,7 +1477,7 @@ float compute_pitch_sd(                            //find fp cells
       tprintf ("\n");
     }
 #ifndef GRAPHICS_DISABLED
-    if (textord_show_fixed_cuts && blob_count > 0 && to_win > 0)
+    if (textord_show_fixed_cuts && blob_count > 0 && to_win != NULL)
       plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list);
 #endif
     seg_it.set_to_list (&seg_list);
@@ -1566,7 +1567,7 @@ float compute_pitch_sd2(                            //find fp cells
     return initial_pitch * 10;
   }
 #ifndef GRAPHICS_DISABLED
-  if (testing_on && to_win > 0) {
+  if (testing_on && to_win != NULL) {
     projection->plot (to_win, projection_left,
       row->intercept (), 1.0f, -1.0f, ScrollView::CORAL);
   }
@@ -1602,7 +1603,7 @@ float compute_pitch_sd2(                            //find fp cells
     tprintf ("\n");
   }
 #ifndef GRAPHICS_DISABLED
-  if (textord_show_fixed_cuts && blob_count > 0 && to_win > 0)
+  if (textord_show_fixed_cuts && blob_count > 0 && to_win != NULL)
     plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list);
 #endif
   seg_it.set_to_list (&seg_list);
diff --git a/textord/tordmain.cpp b/textord/tordmain.cpp
index eb229eaa1a..e9e59261da 100644
--- a/textord/tordmain.cpp
+++ b/textord/tordmain.cpp
@@ -38,13 +38,18 @@
 
 #include "allheaders.h"
 
-const ERRCODE BLOCKLESS_BLOBS = "Warning:some blobs assigned to no block";
+// Gridsize for word grid when reassigning diacritics to words. Not critical.
+const int kWordGridSize = 50;
 
 #undef EXTERN
 #define EXTERN
 
 #define MAX_NEAREST_DIST  600    //for block skew stats
 
+namespace tesseract {
+
+CLISTIZE(WordWithBox)
+
 /**********************************************************************
  * SetBlobStrokeWidth
  *
@@ -143,7 +148,6 @@ void SetBlobStrokeWidth(Pix* pix, BLOBNBOX* blob) {
   }
 }
 
-
 /**********************************************************************
  * assign_blobs_to_blocks2
  *
@@ -193,7 +197,6 @@ void assign_blobs_to_blocks2(Pix* pix,
   }
 }
 
-namespace tesseract {
 /**********************************************************************
  * find_components
  *
@@ -400,7 +403,7 @@ void Textord::cleanup_nontext_block(BLOCK* block) {
  * Delete empty blocks, rows from the page.
  **********************************************************************/
 
-void Textord::cleanup_blocks(bool clean_noise, BLOCK_LIST *blocks) {
+void Textord::cleanup_blocks(bool clean_noise, BLOCK_LIST* blocks) {
   BLOCK_IT block_it = blocks;    //iterator
   ROW_IT row_it;                 //row iterator
 
@@ -420,18 +423,18 @@ void Textord::cleanup_blocks(bool clean_noise, BLOCK_LIST *blocks) {
     if (clean_noise) {
       row_it.set_to_list(block->row_list());
       for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
+        ROW* row = row_it.data();
         ++num_rows_all;
-        clean_small_noise_from_words(row_it.data());
-        if ((textord_noise_rejrows && !row_it.data()->word_list()->empty() &&
-             clean_noise_from_row(row_it.data())) ||
-            row_it.data()->word_list()->empty()) {
+        clean_small_noise_from_words(row);
+        if ((textord_noise_rejrows && !row->word_list()->empty() &&
+             clean_noise_from_row(row)) ||
+            row->word_list()->empty()) {
           delete row_it.extract();  // lose empty row.
         } else {
           if (textord_noise_rejwords)
             clean_noise_from_words(row_it.data());
           if (textord_blshift_maxshift >= 0)
-            tweak_row_baseline(row_it.data(),
-                               textord_blshift_maxshift,
+            tweak_row_baseline(row, textord_blshift_maxshift,
                                textord_blshift_xfraction);
           ++num_rows;
         }
@@ -640,16 +643,16 @@ void Textord::clean_noise_from_words(          //remove empties
         && (!word_it.at_first () || !blob_it.at_first ()))
         dot_count += 2;
     }
-    if (dot_count > 2) {
+    if (dot_count > 2 && !word->flag(W_REP_CHAR)) {
       if (dot_count > norm_count * textord_noise_normratio * 2)
         word_dud[word_index] = 2;
       else if (dot_count > norm_count * textord_noise_normratio)
         word_dud[word_index] = 1;
       else
         word_dud[word_index] = 0;
-    }
-    else
+    } else {
       word_dud[word_index] = 0;
+    }
     if (word_dud[word_index] == 2)
       dud_words++;
     else
@@ -661,11 +664,11 @@ void Textord::clean_noise_from_words(          //remove empties
   for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) {
     if (word_dud[word_index] == 2
     || (word_dud[word_index] == 1 && dud_words > ok_words)) {
-      word = word_it.data ();    //current word
-                                 //rejected blobs
-      blob_it.set_to_list (word->rej_cblob_list ());
-                                 //move from blobs
-      blob_it.add_list_after (word->cblob_list ());
+      word = word_it.data();  // Current word.
+      // Previously we threw away the entire word.
+      // Now just aggressively throw all small blobs into the reject list, where
+      // the classifier can decide whether they are actually needed.
+      word->CleanNoise(textord_noise_sizelimit * row->x_height());
     }
     word_index++;
   }
@@ -705,6 +708,176 @@ void Textord::clean_small_noise_from_words(ROW *row) {
     }
   }
 }
+
+// Local struct to hold a group of blocks that share a common rotation.
+struct BlockGroup {
+  BlockGroup() : rotation(1.0f, 0.0f), angle(0.0f), min_xheight(1.0f) {}
+  explicit BlockGroup(BLOCK* block)
+      : bounding_box(block->bounding_box()),
+        rotation(block->re_rotation()),
+        angle(block->re_rotation().angle()),
+        min_xheight(block->x_height()) {
+    blocks.push_back(block);
+  }
+  // Union of block bounding boxes.
+  TBOX bounding_box;
+  // Common rotation of the blocks: re_rotation() of the constructor's block.
+  FCOORD rotation;
+  // Angle of rotation: re_rotation().angle() of the constructor's block.
+  float angle;
+  // Min xheight of the blocks.
+  float min_xheight;
+  // Collection of borrowed pointers to the blocks in the group.
+  GenericVector<BLOCK*> blocks;
+};
+
+// Groups blocks by rotation, then, for each group, makes a WordGrid and calls
+// TransferDiacriticsToWords to copy the diacritic blobs to the most
+// appropriate words in the group of blocks. Source blobs are not touched.
+// Blocks that have a non-text poly_block are ignored. A block joins the group
+// whose angle is closest, if within kMaxAngleDiff, else starts a new group.
+void Textord::TransferDiacriticsToBlockGroups(BLOBNBOX_LIST* diacritic_blobs,
+                                              BLOCK_LIST* blocks) {
+  // Angle difference larger than this is too much to consider equal.
+  // They should only be in multiples of M_PI/2 anyway.
+  const double kMaxAngleDiff = 0.01;  // About 0.6 degrees.
+  PointerVector<BlockGroup> groups;
+  BLOCK_IT bk_it(blocks);
+  for (bk_it.mark_cycle_pt(); !bk_it.cycled_list(); bk_it.forward()) {
+    BLOCK* block = bk_it.data();
+    if (block->poly_block() != NULL && !block->poly_block()->IsText()) {
+      continue;
+    }
+    // Linear search of the groups to find a matching rotation.
+    float block_angle = block->re_rotation().angle();
+    int best_g = 0;
+    float best_angle_diff = MAX_FLOAT32;
+    for (int g = 0; g < groups.size(); ++g) {
+      double angle_diff = fabs(block_angle - groups[g]->angle);
+      // Angles are periodic, so measure the difference the short way round.
+      if (angle_diff > M_PI) angle_diff = fabs(angle_diff - 2.0 * M_PI);
+      if (angle_diff < best_angle_diff) {
+        best_angle_diff = angle_diff;
+        best_g = g;
+      }
+    }
+    if (best_angle_diff > kMaxAngleDiff) {
+      groups.push_back(new BlockGroup(block));
+    } else {
+      groups[best_g]->blocks.push_back(block);
+      groups[best_g]->bounding_box += block->bounding_box();
+      float x_height = block->x_height();
+      if (x_height < groups[best_g]->min_xheight)
+        groups[best_g]->min_xheight = x_height;
+    }
+  }
+  // Now process each group of blocks.
+  // Wrappers for all groups live here so that they are auto-deleted on exit.
+  PointerVector<WordWithBox> word_ptrs;
+  for (int g = 0; g < groups.size(); ++g) {
+    const BlockGroup* group = groups[g];
+    WordGrid word_grid(group->min_xheight, group->bounding_box.botleft(),
+                       group->bounding_box.topright());
+    for (int b = 0; b < group->blocks.size(); ++b) {
+      ROW_IT row_it(group->blocks[b]->row_list());
+      for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
+        ROW* row = row_it.data();
+        // Put the words of the row into the grid.
+        WERD_IT w_it(row->word_list());
+        for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
+          WERD* word = w_it.data();
+          WordWithBox* box_word = new WordWithBox(word);
+          word_grid.InsertBBox(true, true, box_word);
+          // Save the pointer where it will be auto-deleted.
+          word_ptrs.push_back(box_word);
+        }
+      }
+    }
+    FCOORD rotation = group->rotation;
+    // Make it a forward rotation that will transform blob coords to block.
+    rotation.set_y(-rotation.y());
+    TransferDiacriticsToWords(diacritic_blobs, rotation, &word_grid);
+  }
+}
+
+// Copies each diacritic blob (rotated by rotation) into the reject list of the
+// nearest word above and/or below it in word_grid; when neither is clearly
+// closer, both words get a copy. Source blobs are not touched.
+void Textord::TransferDiacriticsToWords(BLOBNBOX_LIST* diacritic_blobs,
+                                        const FCOORD& rotation,
+                                        WordGrid* word_grid) {
+  WordSearch ws(word_grid);
+  BLOBNBOX_IT b_it(diacritic_blobs);
+  // Apply rotation to each blob before finding the nearest words. The rotation
+  // allows us to only consider above/below placement and not left/right on
+  // vertical text, because all text is horizontal here.
+  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
+    BLOBNBOX* blobnbox = b_it.data();
+    TBOX blob_box = blobnbox->bounding_box();
+    blob_box.rotate(rotation);
+    ws.StartRectSearch(blob_box);
+    // Above/below refer to word position relative to diacritic. Since some
+    // scripts eg Kannada/Telugu habitually put diacritics below words, and
+    // others eg Thai/Vietnamese/Latin put most diacritics above words, try
+    // for both if there isn't much in it.
+    WordWithBox* best_above_word = NULL;
+    WordWithBox* best_below_word = NULL;
+    int best_above_distance = 0;
+    int best_below_distance = 0;
+    for (WordWithBox* word = ws.NextRectSearch(); word != NULL;
+         word = ws.NextRectSearch()) {
+      if (word->word()->flag(W_REP_CHAR)) continue;
+      TBOX word_box = word->true_bounding_box();
+      int x_distance = blob_box.x_gap(word_box);
+      int y_distance = blob_box.y_gap(word_box);
+      if (x_distance > 0) {
+        // Arbitrarily divide x-distance by 2 if there is a major y overlap,
+        // and the word is to the left of the diacritic. If the
+        // diacritic is a dropped broken character between two words, this will
+        // help send all the pieces to a single word, instead of splitting them
+        // over the 2 words.
+        if (word_box.major_y_overlap(blob_box) &&
+            blob_box.left() > word_box.right()) {
+          x_distance /= 2;
+        }
+        y_distance += x_distance;
+      }
+      if (word_box.y_middle() > blob_box.y_middle() &&
+          (best_above_word == NULL || y_distance < best_above_distance)) {
+        best_above_word = word;
+        best_above_distance = y_distance;
+      }
+      if (word_box.y_middle() <= blob_box.y_middle() &&
+          (best_below_word == NULL || y_distance < best_below_distance)) {
+        best_below_word = word;
+        best_below_distance = y_distance;
+      }
+    }
+    bool above_good =  // Word above exists, within a blob height of below.
+        best_above_word != NULL &&
+        (best_below_word == NULL ||
+         best_above_distance < best_below_distance + blob_box.height());
+    bool below_good =  // Word below exists, within a blob height of above.
+        best_below_word != NULL && best_below_word != best_above_word &&
+        (best_above_word == NULL ||
+         best_below_distance < best_above_distance + blob_box.height());
+    if (below_good) {
+      C_BLOB* copied_blob = C_BLOB::deep_copy(blobnbox->cblob());
+      copied_blob->rotate(rotation);
+      // Put the blob into the word's reject blobs list.
+      C_BLOB_IT blob_it(best_below_word->RejBlobs());
+      blob_it.add_to_end(copied_blob);
+    }
+    if (above_good) {
+      C_BLOB* copied_blob = C_BLOB::deep_copy(blobnbox->cblob());
+      copied_blob->rotate(rotation);
+      // Put the blob into the word's reject blobs list.
+      C_BLOB_IT blob_it(best_above_word->RejBlobs());
+      blob_it.add_to_end(copied_blob);
+    }
+  }
+}
+
 }  // tesseract
 
 /**********************************************************************
@@ -820,33 +993,3 @@ void tweak_row_baseline(ROW *row,
   free_mem(xstarts);
   free_mem(coeffs);
 }
-
-/**********************************************************************
- * blob_y_order
- *
- * Sort function to sort blobs in y from page top.
- **********************************************************************/
-
-inT32 blob_y_order(              //sort function
-                   void *item1,  //items to compare
-                   void *item2) {
-                                 //converted ptr
-  BLOBNBOX *blob1 = *(BLOBNBOX **) item1;
-                                 //converted ptr
-  BLOBNBOX *blob2 = *(BLOBNBOX **) item2;
-
-  if (blob1->bounding_box ().bottom () > blob2->bounding_box ().bottom ())
-    return -1;
-  else if (blob1->bounding_box ().bottom () <
-    blob2->bounding_box ().bottom ())
-    return 1;
-  else {
-    if (blob1->bounding_box ().left () < blob2->bounding_box ().left ())
-      return -1;
-    else if (blob1->bounding_box ().left () >
-      blob2->bounding_box ().left ())
-      return 1;
-    else
-      return 0;
-  }
-}
diff --git a/textord/tordmain.h b/textord/tordmain.h
index 340ff1aabe..cb5a6a1ef2 100644
--- a/textord/tordmain.h
+++ b/textord/tordmain.h
@@ -29,29 +29,14 @@
 struct Pix;
 namespace tesseract {
 class Tesseract;
-}
 
-void make_blocks_from_blobs(                       //convert & textord
-                            TBLOB *tessblobs,      //tess style input
-                            const char *filename,  //blob file
-                            ICOORD page_tr,        //top right
-                            BOOL8 do_shift,        //shift tess coords
-                            BLOCK_LIST *blocks     //block list
-                           );
 void SetBlobStrokeWidth(Pix* pix, BLOBNBOX* blob);
 void assign_blobs_to_blocks2(Pix* pix, BLOCK_LIST *blocks,
                              TO_BLOCK_LIST *port_blocks);
-void textord_page(                             //make rows & words
-                  ICOORD page_tr,              //top right
-                  BLOCK_LIST *blocks,          //block list
-                  TO_BLOCK_LIST *land_blocks,  //rotated for landscape
-                  TO_BLOCK_LIST *port_blocks,  //output list
-                  tesseract::Tesseract*
-                 );
+}  // namespace tesseract
+
 void tweak_row_baseline(ROW *row,
                         double blshift_maxshift,
                         double blshift_xfraction);
-inT32 blob_y_order(              //sort function
-                   void *item1,  //items to compare
-                   void *item2);
+
 #endif