Skip to content

Commit

Permalink
Add new parameter for invert_threshold (tesseract-ocr#3852)
Browse files Browse the repository at this point in the history
Change the default inversion threshold from the hard-coded 0.5 to 0.7.

Signed-off-by: Stefan Weil <sw@weilnetz.de>
  • Loading branch information
stweil authored Jun 26, 2022
1 parent 0df584e commit 96861b5
Show file tree
Hide file tree
Showing 6 changed files with 20 additions and 12 deletions.
3 changes: 2 additions & 1 deletion src/ccmain/linerec.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -250,7 +250,8 @@ void Tesseract::LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word,
}

bool do_invert = tessedit_do_invert;
lstm_recognizer_->RecognizeLine(*im_data, do_invert, classify_debug_level > 0,
float threshold = do_invert ? double(invert_threshold) : 0.0f;
lstm_recognizer_->RecognizeLine(*im_data, threshold, classify_debug_level > 0,
kWorstDictCertainty / kCertaintyScale, word_box, words,
lstm_choice_mode, lstm_choice_iterations);
delete im_data;
Expand Down
5 changes: 4 additions & 1 deletion src/ccmain/tesseractclass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,11 @@ Tesseract::Tesseract()
"Break input into lines and remap boxes if present", this->params())
, BOOL_MEMBER(tessedit_dump_pageseg_images, false,
"Dump intermediate images made during page segmentation", this->params())
, BOOL_MEMBER(tessedit_do_invert, true, "Try inverting the image in `LSTMRecognizeWord`",
, BOOL_MEMBER(tessedit_do_invert, true, "Try inverted line image if necessary",
this->params())
, double_MEMBER(invert_threshold, 0.7,
"For lines with a mean confidence below this value, OCR is also tried with an inverted image",
this->params())
,
// The default for pageseg_mode is the old behaviour, so as not to
// upset anything that relies on that.
Expand Down
1 change: 1 addition & 0 deletions src/ccmain/tesseractclass.h
Original file line number Diff line number Diff line change
Expand Up @@ -756,6 +756,7 @@ class TESS_API Tesseract : public Wordrec {
BOOL_VAR_H(tessedit_train_line_recognizer);
BOOL_VAR_H(tessedit_dump_pageseg_images);
BOOL_VAR_H(tessedit_do_invert);
double_VAR_H(invert_threshold);
INT_VAR_H(tessedit_pageseg_mode);
INT_VAR_H(thresholding_method);
BOOL_VAR_H(thresholding_debug);
Expand Down
12 changes: 7 additions & 5 deletions src/lstm/lstmrecognizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -244,14 +244,15 @@ bool LSTMRecognizer::LoadDictionary(const ParamsVectors *params, const std::stri

// Recognizes the line image, contained within image_data, returning the
// ratings matrix and matching box_word for each WERD_RES in the output.
void LSTMRecognizer::RecognizeLine(const ImageData &image_data, bool invert, bool debug,
void LSTMRecognizer::RecognizeLine(const ImageData &image_data,
float invert_threshold, bool debug,
double worst_dict_cert, const TBOX &line_box,
PointerVector<WERD_RES> *words, int lstm_choice_mode,
int lstm_choice_amount) {
NetworkIO outputs;
float scale_factor;
NetworkIO inputs;
if (!RecognizeLine(image_data, invert, debug, false, false, &scale_factor, &inputs, &outputs)) {
if (!RecognizeLine(image_data, invert_threshold, debug, false, false, &scale_factor, &inputs, &outputs)) {
return;
}
if (search_ == nullptr) {
Expand Down Expand Up @@ -317,7 +318,8 @@ void LSTMRecognizer::OutputStats(const NetworkIO &outputs, float *min_output, fl

// Recognizes the image_data, returning the labels,
// scores, and corresponding pairs of start, end x-coords in coords.
bool LSTMRecognizer::RecognizeLine(const ImageData &image_data, bool invert, bool debug,
bool LSTMRecognizer::RecognizeLine(const ImageData &image_data,
float invert_threshold, bool debug,
bool re_invert, bool upside_down, float *scale_factor,
NetworkIO *inputs, NetworkIO *outputs) {
// This ensures consistent recognition results.
Expand Down Expand Up @@ -345,10 +347,10 @@ bool LSTMRecognizer::RecognizeLine(const ImageData &image_data, bool invert, boo
Input::PreparePixInput(network_->InputShape(), pix, &randomizer_, inputs);
network_->Forward(debug, *inputs, nullptr, &scratch_space_, outputs);
// Check for auto inversion.
if (invert) {
if (invert_threshold > 0.0f) {
float pos_min, pos_mean, pos_sd;
OutputStats(*outputs, &pos_min, &pos_mean, &pos_sd);
if (pos_mean < 0.5f) {
if (pos_mean < invert_threshold) {
// Run again inverted and see if it is any better.
NetworkIO inv_inputs, inv_outputs;
inv_inputs.set_int_mode(IsIntMode());
Expand Down
9 changes: 5 additions & 4 deletions src/lstm/lstmrecognizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -244,11 +244,12 @@ class TESS_API LSTMRecognizer {

// Recognizes the line image, contained within image_data, returning the
// recognized tesseract WERD_RES for the words.
// If invert, tries inverted as well if the normal interpretation doesn't
// produce a good enough result. The line_box is used for computing the
// If invert_threshold > 0, also tries the inverted image when the mean
// confidence of the normal interpretation is below that threshold.
// The line_box is used for computing the
// box_word in the output words. worst_dict_cert is the worst certainty that
// will be used in a dictionary word.
void RecognizeLine(const ImageData &image_data, bool invert, bool debug, double worst_dict_cert,
void RecognizeLine(const ImageData &image_data, float invert_threshold, bool debug, double worst_dict_cert,
const TBOX &line_box, PointerVector<WERD_RES> *words, int lstm_choice_mode = 0,
int lstm_choice_amount = 5);

Expand All @@ -263,7 +264,7 @@ class TESS_API LSTMRecognizer {
// improve the results. This ensures that outputs contains the correct
// forward outputs for the best photometric interpretation.
// inputs is filled with the used inputs to the network.
bool RecognizeLine(const ImageData &image_data, bool invert, bool debug, bool re_invert,
bool RecognizeLine(const ImageData &image_data, float invert_threshold, bool debug, bool re_invert,
bool upside_down, float *scale_factor, NetworkIO *inputs, NetworkIO *outputs);

// Converts an array of labels to utf-8, whether or not the labels are
Expand Down
2 changes: 1 addition & 1 deletion src/training/unicharset/lstmtrainer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -948,7 +948,7 @@ Trainability LSTMTrainer::PrepareForBackward(const ImageData *trainingdata,
float image_scale;
NetworkIO inputs;
bool invert = trainingdata->boxes().empty();
if (!RecognizeLine(*trainingdata, invert, debug, invert, upside_down,
if (!RecognizeLine(*trainingdata, invert ? 0.5f : 0.0f, debug, invert, upside_down,
&image_scale, &inputs, fwd_outputs)) {
tprintf("Image %s not trainable\n", trainingdata->imagefilename().c_str());
return UNENCODABLE;
Expand Down

0 comments on commit 96861b5

Please sign in to comment.