Skip to content

Commit

Permalink
If there are no explicit renderers, default to TessTextRenderer
Browse files Browse the repository at this point in the history
Revert fd429c3, 43834da, 05de195.

See tesseract-ocr#49, tesseract-ocr#59.

The code in this commit solves the issue in a more elegant way, IMHO.

Now you can use:
  * `tesseract eurotext.tif eurotext txt pdf`
  * `tesseract eurotext.tif eurotext txt hocr`
  * `tesseract eurotext.tif eurotext txt hocr pdf`

NOTE:
  With `tesseract eurotext.tif eurotext`
  or `tesseract eurotext.tif eurotext txt`
  the psm will be set to '3', but...
  With `tesseract eurotext.tif eurotext txt pdf`
  or `tesseract eurotext.tif eurotext txt hocr`
  the psm will be set to '1'.
  • Loading branch information
amitdo committed Dec 11, 2015
1 parent d4e0c64 commit c2f5e9b
Show file tree
Hide file tree
Showing 8 changed files with 38 additions and 21 deletions.
48 changes: 33 additions & 15 deletions api/tesseractmain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,16 +176,16 @@ void PrintLangsList(tesseract::TessBaseAPI* api) {
/**
* We have 2 possible sources of pagesegmode: a config file and
* the command line. For backwards compatibility reasons, the
* default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the
* default for this program is tesseract::PSM_AUTO. We will let
* the config file take priority, so the command-line default
* can take priority over the tesseract default, so we use the
* value from the command line only if the retrieved mode
* is still tesseract::PSM_SINGLE_BLOCK, indicating no change
* in any config file. Therefore the only way to force
* tesseract::PSM_SINGLE_BLOCK is from the command line.
* It would be simpler if we could set the value before Init,
* but that doesn't work.
* default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the
* default for this program is tesseract::PSM_AUTO. We will let
* the config file take priority, so the command-line default
* can take priority over the tesseract default, so we use the
* value from the command line only if the retrieved mode
* is still tesseract::PSM_SINGLE_BLOCK, indicating no change
* in any config file. Therefore the only way to force
* tesseract::PSM_SINGLE_BLOCK is from the command line.
* It would be simpler if we could set the value before Init,
* but that doesn't work.
*/
void FixPageSegMode(tesseract::TessBaseAPI* api,
tesseract::PageSegMode pagesegmode) {
Expand Down Expand Up @@ -295,19 +295,37 @@ void PreloadRenderers(tesseract::TessBaseAPI* api,
if (b) {
bool font_info;
api->GetBoolVariable("hocr_font_info", &font_info);
renderers->push_back(new tesseract::TessHOcrRenderer(outputbase, font_info));
renderers->push_back(
new tesseract::TessHOcrRenderer(outputbase, font_info));
}

api->GetBoolVariable("tessedit_create_pdf", &b);
if (b) {
renderers->push_back(new tesseract::TessPDFRenderer(outputbase,
api->GetDatapath()));
api->GetDatapath()));
}

api->GetBoolVariable("tessedit_write_unlv", &b);
if (b) renderers->push_back(new tesseract::TessUnlvRenderer(outputbase));
if (b) {
renderers->push_back(new tesseract::TessUnlvRenderer(outputbase));
}

api->GetBoolVariable("tessedit_create_boxfile", &b);
if (b) renderers->push_back(new tesseract::TessBoxTextRenderer(outputbase));
if (b) {
renderers->push_back(new tesseract::TessBoxTextRenderer(outputbase));
}

// disable text renderer when using one of these configs:
// ambigs.train, box.train, box.train.stderr, linebox, rebox
bool disable_text_renderer =
(api->GetBoolVariable("tessedit_ambigs_training", &b) && b) ||
(api->GetBoolVariable("tessedit_resegment_from_boxes", &b) && b) ||
(api->GetBoolVariable("tessedit_make_boxes_from_boxes", &b) && b);

api->GetBoolVariable("tessedit_create_txt", &b);
if (b) renderers->push_back(new tesseract::TessTextRenderer(outputbase));
if (b || (renderers->empty() && !disable_text_renderer) {
renderers->push_back(new tesseract::TessTextRenderer(outputbase));
}
}

if (!renderers->empty()) {
Expand Down
2 changes: 1 addition & 1 deletion ccmain/tesseractclass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -381,7 +381,7 @@ Tesseract::Tesseract()
this->params()),
BOOL_MEMBER(tessedit_write_unlv, false, "Write .unlv output file",
this->params()),
BOOL_MEMBER(tessedit_create_txt, true, "Write .txt output file",
BOOL_MEMBER(tessedit_create_txt, false, "Write .txt output file",
this->params()),
BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file",
this->params()),
Expand Down
2 changes: 1 addition & 1 deletion ccmain/tesseractclass.h
Original file line number Diff line number Diff line change
Expand Up @@ -1001,7 +1001,7 @@ class Tesseract : public Wordrec {
BOOL_VAR_H(tessedit_write_rep_codes, false,
"Write repetition char code");
BOOL_VAR_H(tessedit_write_unlv, false, "Write .unlv output file");
BOOL_VAR_H(tessedit_create_txt, true, "Write .txt output file");
BOOL_VAR_H(tessedit_create_txt, false, "Write .txt output file");
BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file");
BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file");
STRING_VAR_H(unrecognised_char, "|",
Expand Down
1 change: 0 additions & 1 deletion tessdata/configs/hocr
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
tessedit_create_txt 0
tessedit_create_hocr 1
tessedit_pageseg_mode 1
1 change: 0 additions & 1 deletion tessdata/configs/makebox
Original file line number Diff line number Diff line change
@@ -1,2 +1 @@
tessedit_create_txt 0
tessedit_create_boxfile 1
1 change: 0 additions & 1 deletion tessdata/configs/pdf
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
tessedit_create_txt 0
tessedit_create_pdf 1
tessedit_pageseg_mode 1
3 changes: 3 additions & 0 deletions tessdata/configs/txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# This config file should be used with other config files that create renderers.
# usage example: tesseract eurotext.tif eurotext txt hocr pdf
tessedit_create_txt 1
1 change: 0 additions & 1 deletion tessdata/configs/unlv
Original file line number Diff line number Diff line change
@@ -1,3 +1,2 @@
tessedit_create_txt 0
tessedit_write_unlv 1
tessedit_pageseg_mode 6

0 comments on commit c2f5e9b

Please sign in to comment.