Add new parameter for invert_threshold (tesseract-ocr#3852)

Change the default value of the inversion threshold used during recognition from 0.5 (previously hard-coded) to 0.7; training keeps 0.5.

Signed-off-by: Stefan Weil <sw@weilnetz.de>
UB-Mannheim · Jun 26, 2022 · 96861b5
1 parent 0df584e
commit 96861b5
Show file tree

Hide file tree

Showing 6 changed files with 20 additions and 12 deletions.
diff --git a/src/ccmain/linerec.cpp b/src/ccmain/linerec.cpp
@@ -250,7 +250,8 @@ void Tesseract::LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word,
   }
 
   bool do_invert = tessedit_do_invert;
-  lstm_recognizer_->RecognizeLine(*im_data, do_invert, classify_debug_level > 0,
+  float threshold = do_invert ? double(invert_threshold) : 0.0f;
+  lstm_recognizer_->RecognizeLine(*im_data, threshold, classify_debug_level > 0,
                                   kWorstDictCertainty / kCertaintyScale, word_box, words,
                                   lstm_choice_mode, lstm_choice_iterations);
   delete im_data;

diff --git a/src/ccmain/tesseractclass.cpp b/src/ccmain/tesseractclass.cpp
@@ -63,8 +63,11 @@ Tesseract::Tesseract()
                   "Break input into lines and remap boxes if present", this->params())
     , BOOL_MEMBER(tessedit_dump_pageseg_images, false,
                   "Dump intermediate images made during page segmentation", this->params())
-    , BOOL_MEMBER(tessedit_do_invert, true, "Try inverting the image in `LSTMRecognizeWord`",
+    , BOOL_MEMBER(tessedit_do_invert, true, "Try inverted line image if necessary",
                   this->params())
+    , double_MEMBER(invert_threshold, 0.7,
+                    "For lines with a mean confidence below this value, OCR is also tried with an inverted image",
+                    this->params())
     ,
     // The default for pageseg_mode is the old behaviour, so as not to
     // upset anything that relies on that.

diff --git a/src/ccmain/tesseractclass.h b/src/ccmain/tesseractclass.h
@@ -756,6 +756,7 @@ class TESS_API Tesseract : public Wordrec {
   BOOL_VAR_H(tessedit_train_line_recognizer);
   BOOL_VAR_H(tessedit_dump_pageseg_images);
   BOOL_VAR_H(tessedit_do_invert);
+  double_VAR_H(invert_threshold);
   INT_VAR_H(tessedit_pageseg_mode);
   INT_VAR_H(thresholding_method);
   BOOL_VAR_H(thresholding_debug);

diff --git a/src/lstm/lstmrecognizer.cpp b/src/lstm/lstmrecognizer.cpp
@@ -244,14 +244,15 @@ bool LSTMRecognizer::LoadDictionary(const ParamsVectors *params, const std::stri
 
 // Recognizes the line image, contained within image_data, returning the
 // ratings matrix and matching box_word for each WERD_RES in the output.
-void LSTMRecognizer::RecognizeLine(const ImageData &image_data, bool invert, bool debug,
+void LSTMRecognizer::RecognizeLine(const ImageData &image_data,
+                                   float invert_threshold, bool debug,
                                    double worst_dict_cert, const TBOX &line_box,
                                    PointerVector<WERD_RES> *words, int lstm_choice_mode,
                                    int lstm_choice_amount) {
   NetworkIO outputs;
   float scale_factor;
   NetworkIO inputs;
-  if (!RecognizeLine(image_data, invert, debug, false, false, &scale_factor, &inputs, &outputs)) {
+  if (!RecognizeLine(image_data, invert_threshold, debug, false, false, &scale_factor, &inputs, &outputs)) {
     return;
   }
   if (search_ == nullptr) {
@@ -317,7 +318,8 @@ void LSTMRecognizer::OutputStats(const NetworkIO &outputs, float *min_output, fl
 
 // Recognizes the image_data, returning the labels,
 // scores, and corresponding pairs of start, end x-coords in coords.
-bool LSTMRecognizer::RecognizeLine(const ImageData &image_data, bool invert, bool debug,
+bool LSTMRecognizer::RecognizeLine(const ImageData &image_data,
+                                   float invert_threshold, bool debug,
                                    bool re_invert, bool upside_down, float *scale_factor,
                                    NetworkIO *inputs, NetworkIO *outputs) {
   // This ensures consistent recognition results.
@@ -345,10 +347,10 @@ bool LSTMRecognizer::RecognizeLine(const ImageData &image_data, bool invert, boo
   Input::PreparePixInput(network_->InputShape(), pix, &randomizer_, inputs);
   network_->Forward(debug, *inputs, nullptr, &scratch_space_, outputs);
   // Check for auto inversion.
-  if (invert) {
+  if (invert_threshold > 0.0f) {
     float pos_min, pos_mean, pos_sd;
     OutputStats(*outputs, &pos_min, &pos_mean, &pos_sd);
-    if (pos_mean < 0.5f) {
+    if (pos_mean < invert_threshold) {
       // Run again inverted and see if it is any better.
       NetworkIO inv_inputs, inv_outputs;
       inv_inputs.set_int_mode(IsIntMode());

diff --git a/src/lstm/lstmrecognizer.h b/src/lstm/lstmrecognizer.h
@@ -244,11 +244,12 @@ class TESS_API LSTMRecognizer {
 
   // Recognizes the line image, contained within image_data, returning the
   // recognized tesseract WERD_RES for the words.
-  // If invert, tries inverted as well if the normal interpretation doesn't
-  // produce a good enough result. The line_box is used for computing the
+  // If invert_threshold > 0, tries inverted as well if the normal
+  // interpretation doesn't produce a result which at least reaches
+  // that threshold. The line_box is used for computing the
   // box_word in the output words. worst_dict_cert is the worst certainty that
   // will be used in a dictionary word.
-  void RecognizeLine(const ImageData &image_data, bool invert, bool debug, double worst_dict_cert,
+  void RecognizeLine(const ImageData &image_data, float invert_threshold, bool debug, double worst_dict_cert,
                      const TBOX &line_box, PointerVector<WERD_RES> *words, int lstm_choice_mode = 0,
                      int lstm_choice_amount = 5);
 
@@ -263,7 +264,7 @@ class TESS_API LSTMRecognizer {
   // improve the results. This ensures that outputs contains the correct
   // forward outputs for the best photometric interpretation.
   // inputs is filled with the used inputs to the network.
-  bool RecognizeLine(const ImageData &image_data, bool invert, bool debug, bool re_invert,
+  bool RecognizeLine(const ImageData &image_data, float invert_threshold, bool debug, bool re_invert,
                      bool upside_down, float *scale_factor, NetworkIO *inputs, NetworkIO *outputs);
 
   // Converts an array of labels to utf-8, whether or not the labels are

diff --git a/src/training/unicharset/lstmtrainer.cpp b/src/training/unicharset/lstmtrainer.cpp
@@ -948,7 +948,7 @@ Trainability LSTMTrainer::PrepareForBackward(const ImageData *trainingdata,
   float image_scale;
   NetworkIO inputs;
   bool invert = trainingdata->boxes().empty();
-  if (!RecognizeLine(*trainingdata, invert, debug, invert, upside_down,
+  if (!RecognizeLine(*trainingdata, invert ? 0.5f : 0.0f, debug, invert, upside_down,
                      &image_scale, &inputs, fwd_outputs)) {
     tprintf("Image %s not trainable\n", trainingdata->imagefilename().c_str());
     return UNENCODABLE;