From 1156098567bb6aa9b7cfc5ea67d15a4d7e313c1b Mon Sep 17 00:00:00 2001 From: zdenop Date: Sun, 3 Aug 2014 16:22:12 +0000 Subject: [PATCH] Add font info to hocr output - fix issue 1219 git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1132 d0cd1f9f-072b-0410-8dd7-cf729c803f20 --- api/baseapi.cpp | 19 +++++++++++++------ api/capi.cpp | 5 +++++ api/capi.h | 3 ++- api/renderer.cpp | 13 ++++++++++++- api/renderer.h | 4 ++++ api/tesseractmain.cpp | 7 +++++-- ccmain/tesseractclass.cpp | 2 ++ ccmain/tesseractclass.h | 2 ++ 8 files changed, 45 insertions(+), 10 deletions(-) diff --git a/api/baseapi.cpp b/api/baseapi.cpp index c6fa75da72..b5c5f34a47 100644 --- a/api/baseapi.cpp +++ b/api/baseapi.cpp @@ -1356,6 +1356,8 @@ char* TessBaseAPI::GetHOCRText(int page_number) { int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1; int page_id = page_number + 1; // hOCR uses 1-based page numbers. + bool font_info = false; + GetBoolVariable("hocr_font_info", &font_info); STRING hocr_str(""); @@ -1428,12 +1430,23 @@ char* TessBaseAPI::GetHOCRText(int page_number) { hocr_str.add_str_int("WordRecognitionLanguage()) { hocr_str += " lang='"; @@ -1447,12 +1460,6 @@ char* TessBaseAPI::GetHOCRText(int page_number) { break; } hocr_str += ">"; - bool bold, italic, underlined, monospace, serif, smallcaps; - int pointsize, font_id; - // TODO(rays): Is hOCR interested in the font name? - (void) res_it->WordFontAttributes(&bold, &italic, &underlined, - &monospace, &serif, &smallcaps, - &pointsize, &font_id); bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD); bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD); bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD); diff --git a/api/capi.cpp b/api/capi.cpp index 74405e9261..9511b16db6 100644 --- a/api/capi.cpp +++ b/api/capi.cpp @@ -42,6 +42,11 @@ TESS_API TessResultRenderer* TESS_CALL TessHOcrRendererCreate(const char* output return new TessHOcrRenderer(outputbase); } +TESS_API TessResultRenderer* TESS_CALL TessHOcrRendererCreate2(const char* outputbase, BOOL font_info) +{ + return new TessHOcrRenderer(outputbase, font_info); +} + TESS_API TessResultRenderer* TESS_CALL TessPDFRendererCreate(const char* outputbase, const char* datadir) { return new TessPDFRenderer(outputbase, datadir); diff --git a/api/capi.h b/api/capi.h index cbf5d0ab59..e813916cfc 100644 --- a/api/capi.h +++ b/api/capi.h @@ -100,6 +100,7 @@ TESS_API void TESS_CALL TessDeleteBlockList(BLOCK_LIST* block_list); /* Renderer API */ TESS_API TessResultRenderer* TESS_CALL TessTextRendererCreate(const char* outputbase); TESS_API TessResultRenderer* TESS_CALL TessHOcrRendererCreate(const char* outputbase); +TESS_API TessResultRenderer* TESS_CALL TessHOcrRendererCreate2(const char* outputbase, BOOL font_info); TESS_API TessResultRenderer* TESS_CALL TessPDFRendererCreate(const char* outputbase, const char* datadir); TESS_API TessResultRenderer* TESS_CALL TessUnlvRendererCreate(const char* outputbase); TESS_API TessResultRenderer* TESS_CALL TessBoxTextRendererCreate(const char* outputbase); @@ -161,7 +162,7 @@ TESS_API int TESS_CALL TessBaseAPIInit1(TessBaseAPI* handle, const char* datap TESS_API int TESS_CALL TessBaseAPIInit2(TessBaseAPI* handle, const char* datapath, const char* language, TessOcrEngineMode oem); TESS_API int TESS_CALL TessBaseAPIInit3(TessBaseAPI* handle, const char* datapath, const char* language); -TESS_API int TESS_CALL TessBaseAPIInit4(TessBaseAPI* handle, const char* datapath, const char* language, TessOcrEngineMode mode, +TESS_API int TESS_CALL TessBaseAPIInit4(TessBaseAPI* handle, const char* datapath, const char* language, TessOcrEngineMode mode, char** configs, int configs_size, char** vars_vec, char** vars_values, size_t vars_vec_size, BOOL set_only_non_debug_params); diff --git a/api/renderer.cpp b/api/renderer.cpp index d03b16d14f..7a2c4b2a1b 100644 --- a/api/renderer.cpp +++ b/api/renderer.cpp @@ -122,6 +122,12 @@ bool TessTextRenderer::AddImageHandler(TessBaseAPI* api) { **********************************************************************/ TessHOcrRenderer::TessHOcrRenderer(const char *outputbase) : TessResultRenderer(outputbase, "hocr") { + font_info_ = false; +} + +TessHOcrRenderer::TessHOcrRenderer(const char *outputbase, bool font_info) + : TessResultRenderer(outputbase, "hocr") { + font_info_ = font_info; } bool TessHOcrRenderer::BeginDocumentHandler() { @@ -139,7 +145,12 @@ bool TessHOcrRenderer::BeginDocumentHandler() { " \n" " \n" + " ocr_line ocrx_word"); + if (font_info_) + AppendString( + " ocrp_lang ocrp_dir ocrp_font ocrp_fsize ocrp_wconf"); + AppendString( + "'/>\n" "\n\n"); return true; diff --git a/api/renderer.h b/api/renderer.h index 39be8d3fa4..1ad14731e5 100644 --- a/api/renderer.h +++ b/api/renderer.h @@ -150,12 +150,16 @@ class TESS_API TessTextRenderer : public TessResultRenderer { */ class TESS_API TessHOcrRenderer : public TessResultRenderer { public: + explicit TessHOcrRenderer(const char *outputbase, bool font_info); explicit TessHOcrRenderer(const char *outputbase); protected: virtual bool BeginDocumentHandler(); virtual bool AddImageHandler(TessBaseAPI* api); virtual bool EndDocumentHandler(); + +private: + bool font_info_; // whether to print font information }; /** diff --git a/api/tesseractmain.cpp b/api/tesseractmain.cpp index dd40f0599a..e58be46b87 100644 --- a/api/tesseractmain.cpp +++ b/api/tesseractmain.cpp @@ -290,8 +290,11 @@ int main(int argc, char **argv) { tesseract::TessResultRenderer* renderer = NULL; bool b; api.GetBoolVariable("tessedit_create_hocr", &b); - if (b) - renderer = new tesseract::TessHOcrRenderer(outputbase); + if (b) { + bool font_info; + api.GetBoolVariable("hocr_font_info", &font_info); + renderer = new tesseract::TessHOcrRenderer(outputbase, font_info); + } api.GetBoolVariable("tessedit_create_pdf", &b); if (b && renderer == NULL) diff --git a/ccmain/tesseractclass.cpp b/ccmain/tesseractclass.cpp index c9ed79f552..b6ffc57073 100644 --- a/ccmain/tesseractclass.cpp +++ b/ccmain/tesseractclass.cpp @@ -208,6 +208,8 @@ Tesseract::Tesseract() "good_quality_doc gte good char limit", this->params()), BOOL_MEMBER(unlv_tilde_crunching, true, "Mark v.bad words for tilde crunch", this->params()), + BOOL_MEMBER(hocr_font_info, false, + "Add font info to hocr output", this->params()), BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?", this->params()), BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false, diff --git a/ccmain/tesseractclass.h b/ccmain/tesseractclass.h index 867393462c..2d8b8d8ffb 100644 --- a/ccmain/tesseractclass.h +++ b/ccmain/tesseractclass.h @@ -836,6 +836,8 @@ class Tesseract : public Wordrec { "good_quality_doc gte good char limit"); BOOL_VAR_H(unlv_tilde_crunching, true, "Mark v.bad words for tilde crunch"); + BOOL_VAR_H(hocr_font_info, false, + "Add font info to hocr output"); BOOL_VAR_H(crunch_early_merge_tess_fails, true, "Before word crunch?"); BOOL_VAR_H(crunch_early_convert_bad_unlv_chs, false, "Take out ~^ early?"); double_VAR_H(crunch_terrible_rating, 80.0, "crunch rating lt this");