Skip to content

Commit

Permalink
Add font info to hocr output - fix issue 1219
Browse files Browse the repository at this point in the history
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1132 d0cd1f9f-072b-0410-8dd7-cf729c803f20
  • Loading branch information
zdenop committed Aug 3, 2014
1 parent 19ddc89 commit 1156098
Show file tree
Hide file tree
Showing 8 changed files with 45 additions and 10 deletions.
19 changes: 13 additions & 6 deletions api/baseapi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1356,6 +1356,8 @@ char* TessBaseAPI::GetHOCRText(int page_number) {

int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
int page_id = page_number + 1; // hOCR uses 1-based page numbers.
bool font_info = false;
GetBoolVariable("hocr_font_info", &font_info);

STRING hocr_str("");

Expand Down Expand Up @@ -1428,12 +1430,23 @@ char* TessBaseAPI::GetHOCRText(int page_number) {
hocr_str.add_str_int("<span class='ocrx_word' id='word_", page_id);
hocr_str.add_str_int("_", wcnt);
int left, top, right, bottom;
bool bold, italic, underlined, monospace, serif, smallcaps;
int pointsize, font_id;
const char *font_name;
res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
font_name = res_it->WordFontAttributes(&bold, &italic, &underlined,
&monospace, &serif, &smallcaps,
&pointsize, &font_id);
hocr_str.add_str_int("' title='bbox ", left);
hocr_str.add_str_int(" ", top);
hocr_str.add_str_int(" ", right);
hocr_str.add_str_int(" ", bottom);
hocr_str.add_str_int("; x_wconf ", res_it->Confidence(RIL_WORD));
// Optional hOCR font properties (x_font / x_fsize) for this word.
// They are emitted only when the hocr_font_info variable is enabled AND
// WordFontAttributes() actually handed back a font name.
// NOTE(review): WordFontAttributes() is documented to return NULL when no
// font information is available; HOcrEscape() presumably walks the string
// and would dereference that NULL. pointsize is presumably not populated in
// that case either, so x_fsize is skipped together with x_font -- confirm
// against LTRResultIterator::WordFontAttributes.
if (font_info && font_name) {
  hocr_str += "; x_font ";
  HOcrEscape(font_name, hocr_str);
  hocr_str.add_str_int("; x_fsize ", pointsize);
}
hocr_str += "'";
if (res_it->WordRecognitionLanguage()) {
hocr_str += " lang='";
Expand All @@ -1447,12 +1460,6 @@ char* TessBaseAPI::GetHOCRText(int page_number) {
break;
}
hocr_str += ">";
bool bold, italic, underlined, monospace, serif, smallcaps;
int pointsize, font_id;
// TODO(rays): Is hOCR interested in the font name?
(void) res_it->WordFontAttributes(&bold, &italic, &underlined,
&monospace, &serif, &smallcaps,
&pointsize, &font_id);
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
Expand Down
5 changes: 5 additions & 0 deletions api/capi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@ TESS_API TessResultRenderer* TESS_CALL TessHOcrRendererCreate(const char* output
return new TessHOcrRenderer(outputbase);
}

/* C wrapper: returns a newly allocated TessHOcrRenderer for outputbase.
 * font_info is a C-style BOOL; any non-zero value is forwarded as true. */
TESS_API TessResultRenderer* TESS_CALL TessHOcrRendererCreate2(const char* outputbase, BOOL font_info)
{
    const bool want_font_info = (font_info != 0);
    TessResultRenderer* created = new TessHOcrRenderer(outputbase, want_font_info);
    return created;
}

TESS_API TessResultRenderer* TESS_CALL TessPDFRendererCreate(const char* outputbase, const char* datadir)
{
return new TessPDFRenderer(outputbase, datadir);
Expand Down
3 changes: 2 additions & 1 deletion api/capi.h
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ TESS_API void TESS_CALL TessDeleteBlockList(BLOCK_LIST* block_list);
/* Renderer API */
TESS_API TessResultRenderer* TESS_CALL TessTextRendererCreate(const char* outputbase);
TESS_API TessResultRenderer* TESS_CALL TessHOcrRendererCreate(const char* outputbase);
TESS_API TessResultRenderer* TESS_CALL TessHOcrRendererCreate2(const char* outputbase, BOOL font_info);
TESS_API TessResultRenderer* TESS_CALL TessPDFRendererCreate(const char* outputbase, const char* datadir);
TESS_API TessResultRenderer* TESS_CALL TessUnlvRendererCreate(const char* outputbase);
TESS_API TessResultRenderer* TESS_CALL TessBoxTextRendererCreate(const char* outputbase);
Expand Down Expand Up @@ -161,7 +162,7 @@ TESS_API int TESS_CALL TessBaseAPIInit1(TessBaseAPI* handle, const char* datap
TESS_API int TESS_CALL TessBaseAPIInit2(TessBaseAPI* handle, const char* datapath, const char* language, TessOcrEngineMode oem);
TESS_API int TESS_CALL TessBaseAPIInit3(TessBaseAPI* handle, const char* datapath, const char* language);

TESS_API int TESS_CALL TessBaseAPIInit4(TessBaseAPI* handle, const char* datapath, const char* language, TessOcrEngineMode mode,
TESS_API int TESS_CALL TessBaseAPIInit4(TessBaseAPI* handle, const char* datapath, const char* language, TessOcrEngineMode mode,
char** configs, int configs_size,
char** vars_vec, char** vars_values, size_t vars_vec_size,
BOOL set_only_non_debug_params);
Expand Down
13 changes: 12 additions & 1 deletion api/renderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,12 @@ bool TessTextRenderer::AddImageHandler(TessBaseAPI* api) {
**********************************************************************/
// Builds an hOCR renderer for outputbase (base class is given the "hocr"
// tag) with font information switched off.
TessHOcrRenderer::TessHOcrRenderer(const char *outputbase)
    : TessResultRenderer(outputbase, "hocr"),
      font_info_(false) {
}

// Builds an hOCR renderer for outputbase (base class is given the "hocr"
// tag); font_info is stored and later decides whether font-related
// capabilities are written by this renderer.
TessHOcrRenderer::TessHOcrRenderer(const char *outputbase, bool font_info)
    : TessResultRenderer(outputbase, "hocr"),
      font_info_(font_info) {
}

bool TessHOcrRenderer::BeginDocumentHandler() {
Expand All @@ -139,7 +145,12 @@ bool TessHOcrRenderer::BeginDocumentHandler() {
" <meta name='ocr-system' content='tesseract " TESSERACT_VERSION_STR
"' />\n"
" <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par"
" ocr_line ocrx_word'/>\n"
" ocr_line ocrx_word");
if (font_info_)
AppendString(
" ocrp_lang ocrp_dir ocrp_font ocrp_fsize ocrp_wconf");
AppendString(
"'/>\n"
"</head>\n<body>\n");

return true;
Expand Down
4 changes: 4 additions & 0 deletions api/renderer.h
Original file line number Diff line number Diff line change
Expand Up @@ -150,12 +150,16 @@ class TESS_API TessTextRenderer : public TessResultRenderer {
*/
class TESS_API TessHOcrRenderer : public TessResultRenderer {
 public:
  // Creates a renderer with font information disabled.
  explicit TessHOcrRenderer(const char *outputbase);
  // Creates a renderer; font_info selects whether font information is
  // printed.
  explicit TessHOcrRenderer(const char *outputbase, bool font_info);

 protected:
  // Renderer hooks (presumably overriding the TessResultRenderer
  // virtuals): document prologue, one page of output, document epilogue.
  virtual bool BeginDocumentHandler();
  virtual bool AddImageHandler(TessBaseAPI* api);
  virtual bool EndDocumentHandler();

 private:
  // True when font information should be printed.
  bool font_info_;
};

/**
Expand Down
7 changes: 5 additions & 2 deletions api/tesseractmain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -290,8 +290,11 @@ int main(int argc, char **argv) {
tesseract::TessResultRenderer* renderer = NULL;
bool b;
api.GetBoolVariable("tessedit_create_hocr", &b);
if (b)
renderer = new tesseract::TessHOcrRenderer(outputbase);
if (b) {
  // Start from a defined value: GetBoolVariable() fills font_info through
  // the out-pointer, and if that lookup ever leaves it untouched the
  // renderer must not be handed an indeterminate bool. GetHOCRText
  // initialises its own copy to false in the same way.
  bool font_info = false;
  api.GetBoolVariable("hocr_font_info", &font_info);
  renderer = new tesseract::TessHOcrRenderer(outputbase, font_info);
}

api.GetBoolVariable("tessedit_create_pdf", &b);
if (b && renderer == NULL)
Expand Down
2 changes: 2 additions & 0 deletions ccmain/tesseractclass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,8 @@ Tesseract::Tesseract()
"good_quality_doc gte good char limit", this->params()),
BOOL_MEMBER(unlv_tilde_crunching, true,
"Mark v.bad words for tilde crunch", this->params()),
BOOL_MEMBER(hocr_font_info, false,
"Add font info to hocr output", this->params()),
BOOL_MEMBER(crunch_early_merge_tess_fails, true,
"Before word crunch?", this->params()),
BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false,
Expand Down
2 changes: 2 additions & 0 deletions ccmain/tesseractclass.h
Original file line number Diff line number Diff line change
Expand Up @@ -836,6 +836,8 @@ class Tesseract : public Wordrec {
"good_quality_doc gte good char limit");
BOOL_VAR_H(unlv_tilde_crunching, true,
"Mark v.bad words for tilde crunch");
BOOL_VAR_H(hocr_font_info, false,
"Add font info to hocr output");
BOOL_VAR_H(crunch_early_merge_tess_fails, true, "Before word crunch?");
BOOL_VAR_H(crunch_early_convert_bad_unlv_chs, false, "Take out ~^ early?");
double_VAR_H(crunch_terrible_rating, 80.0, "crunch rating lt this");
Expand Down

0 comments on commit 1156098

Please sign in to comment.