Skip to content

Commit

Permalink
Add a new renderer to create box files from images for LSTM training
Browse files Browse the repository at this point in the history
(cherry picked from commit 921da6b)

fix typo

(cherry picked from commit 7bd1a0c)

Add lstmboxrenderer to CMakeLists

(cherry picked from commit cfef3a8)

fix formatting

(cherry picked from commit 7ba2b01)
  • Loading branch information
Shreeshrii committed Feb 5, 2019
1 parent 56725de commit 9c89cd5
Show file tree
Hide file tree
Showing 10 changed files with 150 additions and 1 deletion.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,7 @@ set(tesseract_src ${tesseract_src}
src/api/renderer.cpp
src/api/altorenderer.cpp
src/api/hocrrenderer.cpp
src/api/lstmboxrenderer.cpp
src/api/pdfrenderer.cpp
)

Expand Down
1 change: 1 addition & 0 deletions src/api/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ endif
libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp
libtesseract_api_la_SOURCES += altorenderer.cpp
libtesseract_api_la_SOURCES += hocrrenderer.cpp
libtesseract_api_la_SOURCES += lstmboxrenderer.cpp
libtesseract_api_la_SOURCES += pdfrenderer.cpp
libtesseract_api_la_SOURCES += renderer.cpp

Expand Down
8 changes: 8 additions & 0 deletions src/api/baseapi.h
Original file line number Diff line number Diff line change
Expand Up @@ -613,6 +613,14 @@ class TESS_API TessBaseAPI {
* Returned string must be freed with the delete [] operator.
*/
char* GetTSVText(int page_number);

/**
* Make a box file for LSTM training from the internal data structures.
* Emits one row per recognized symbol ("<text> <left> <bottom> <right> <top>
* <page>"), plus extra separator rows where a new word or text line begins.
* Vertical coordinates are flipped against the image height, so they are
* measured from the bottom of the image.
* page_number is a 0-based page index that will appear in the box file.
* Returns nullptr if the API is not initialized or recognition fails.
* Returned string must be freed with the delete [] operator.
*/
char* GetLSTMBOXText(int page_number);

/**
* The recognized text is returned as a char* which is coded in the same
Expand Down
2 changes: 1 addition & 1 deletion src/api/hocrrenderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,7 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
if (grapheme && grapheme[0] != 0) {
if (hocr_boxes) {
res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
hocr_str << "<span class='ocrx_cinfo' title='x_bboxes "
hocr_str << "\n <span class='ocrx_cinfo' title='x_bboxes "
<< left << " " << top << " " << right << " " << bottom
<< "; x_conf " << res_it->Confidence(RIL_SYMBOL) << "'>";
}
Expand Down
110 changes: 110 additions & 0 deletions src/api/lstmboxrenderer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
/**********************************************************************
* File: lstmboxrenderer.cpp
* Description: Renderer for creating box file for LSTM training.
* based on the tsv renderer.
*
* (C) Copyright 2006, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/


#include <locale> // for std::locale::classic
#include <memory> // for std::unique_ptr
#include <sstream> // for std::stringstream
#include "baseapi.h" // for TessBaseAPI
#include "renderer.h"
#include "tesseractclass.h" // for Tesseract

namespace tesseract {

/**
* Create a UTF8 box file for LSTM training from the internal data structures.
* page_number is a 0-base page index that will appear in the box file.
* Returned string must be freed with the delete [] operator.
*/

char* TessBaseAPI::GetLSTMBOXText(int page_number) {
if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0))
return nullptr;

STRING lstm_box_str("");

int page_num = page_number;
bool first_word = true;

LTRResultIterator* res_it = GetLTRIterator();
while (!res_it->Empty(RIL_BLOCK)) {
if (res_it->Empty(RIL_SYMBOL)) {
res_it->Next(RIL_SYMBOL);
continue;
}

int left, top, right, bottom;

if (!first_word) {
if (res_it->IsAtBeginningOf(RIL_WORD)) {
lstm_box_str.add_str_int(" ", left);
lstm_box_str.add_str_int(" ", image_height_ - bottom);
lstm_box_str.add_str_int(" ", right + 2);
lstm_box_str.add_str_int(" ", image_height_ - top);
lstm_box_str.add_str_int(" ", page_num); // level 5 - word
lstm_box_str += "\n"; // end of row for word
}
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
lstm_box_str.add_str_int("\t ", left);
lstm_box_str.add_str_int(" ", image_height_ - bottom);
lstm_box_str.add_str_int(" ", right + 5);
lstm_box_str.add_str_int(" ", image_height_ - top);
lstm_box_str.add_str_int(" ", page_num); // level 4 - line
lstm_box_str += "\n"; // end of row for line
}
}
first_word=false;
res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);

do {
lstm_box_str +=std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get();
res_it->Next(RIL_SYMBOL);
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_SYMBOL));

lstm_box_str.add_str_int(" ", left);
lstm_box_str.add_str_int(" ", image_height_ - bottom);
lstm_box_str.add_str_int(" ", right);
lstm_box_str.add_str_int(" ", image_height_ - top);
lstm_box_str.add_str_int(" ", page_num); // level 6 - symbol
lstm_box_str += "\n"; // end of row

}

char* ret = new char[lstm_box_str.length() + 1];
strcpy(ret, lstm_box_str.string());
delete res_it;
return ret;
}

/**********************************************************************
* LSTMBOX Renderer interface implementation
**********************************************************************/
// Builds the renderer by delegating to the base class with "box" as its
// second argument (presumably the output file extension - the matching
// config variable is described as writing a .box file).
TessLSTMBOXRenderer::TessLSTMBOXRenderer(const char *outputbase)
    : TessResultRenderer(outputbase, "box") {}

// Asks the API for the LSTM box text of the current image and appends it to
// this renderer's output. Returns false when the API produced no text, true
// once the text has been appended.
bool TessLSTMBOXRenderer::AddImageHandler(TessBaseAPI* api) {
  // The buffer from GetLSTMBOXText is array-allocated, so it is held in an
  // array unique_ptr and released automatically when this function returns.
  const std::unique_ptr<const char[]> box_text(
      api->GetLSTMBOXText(imagenum()));

  if (box_text != nullptr) {
    AppendString(box_text.get());
    return true;
  }
  return false;
}

} // namespace tesseract.
11 changes: 11 additions & 0 deletions src/api/renderer.h
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,17 @@ class TESS_API TessUnlvRenderer : public TessResultRenderer {
virtual bool AddImageHandler(TessBaseAPI* api);
};

/**
* Renders tesseract output as box-file text for LSTM training.
* For each image, the text returned by TessBaseAPI::GetLSTMBOXText is
* appended to the output, which is opened with the "box" suffix.
*/
class TESS_API TessLSTMBOXRenderer : public TessResultRenderer {
public:
// outputbase is forwarded unchanged to the TessResultRenderer constructor.
explicit TessLSTMBOXRenderer(const char *outputbase);

protected:
// Appends the box text for the API's current image; returns false if the
// API returned no text.
virtual bool AddImageHandler(TessBaseAPI* api);
};

/**
* Renders tesseract output into a plain UTF-8 text string
*/
Expand Down
14 changes: 14 additions & 0 deletions src/api/tesseractmain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,20 @@ static void PreloadRenderers(
}
}

api->GetBoolVariable("tessedit_create_lstmbox", &b);
if (b) {
tesseract::TessLSTMBOXRenderer* renderer =
new tesseract::TessLSTMBOXRenderer(outputbase);
if (renderer->happy()) {
renderers->push_back(renderer);
} else {
delete renderer;
tprintf("Error, could not create LSTM BOX output file: %s\n",
strerror(errno));
error = true;
}
}

api->GetBoolVariable("tessedit_create_boxfile", &b);
if (b) {
tesseract::TessBoxTextRenderer* renderer =
Expand Down
2 changes: 2 additions & 0 deletions src/ccmain/tesseractclass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,8 @@ Tesseract::Tesseract()
this->params()),
BOOL_MEMBER(tessedit_create_alto, false, "Write .xml ALTO file",
this->params()),
BOOL_MEMBER(tessedit_create_lstmbox, false, "Write .box file for LSTM training",
this->params()),
BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file",
this->params()),
BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file",
Expand Down
1 change: 1 addition & 0 deletions src/ccmain/tesseractclass.h
Original file line number Diff line number Diff line change
Expand Up @@ -1040,6 +1040,7 @@ class Tesseract : public Wordrec {
BOOL_VAR_H(tessedit_create_txt, false, "Write .txt output file");
BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file");
BOOL_VAR_H(tessedit_create_alto, false, "Write .xml ALTO output file");
BOOL_VAR_H(tessedit_create_lstmbox, false, "Write .box file for LSTM training");
BOOL_VAR_H(tessedit_create_tsv, false, "Write .tsv output file");
BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file");
BOOL_VAR_H(textonly_pdf, false,
Expand Down
1 change: 1 addition & 0 deletions tessdata/configs/lstmbox
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
tessedit_create_lstmbox 1

0 comments on commit 9c89cd5

Please sign in to comment.