diff --git a/api/baseapi.cpp b/api/baseapi.cpp index cd6985da8e..f59363f939 100644 --- a/api/baseapi.cpp +++ b/api/baseapi.cpp @@ -1379,14 +1379,14 @@ static void AddBaselineCoordsTohOCR(const PageIterator *it, } static void AddIdTohOCR(STRING* hocr_str, const std::string base, int num1, int num2) { - unsigned long bufsize = base.length() + 2 * kMaxIntSize; - char id_buffer[bufsize]; + const unsigned long BUFSIZE = 64; + char id_buffer[BUFSIZE]; if (num2 >= 0) { - snprintf(id_buffer, bufsize - 1, "%s_%d_%d", base.c_str(), num1, num2); + snprintf(id_buffer, BUFSIZE - 1, "%s_%d_%d", base.c_str(), num1, num2); } else { - snprintf(id_buffer, bufsize - 1, "%s_%d", base.c_str(), num1); + snprintf(id_buffer, BUFSIZE - 1, "%s_%d", base.c_str(), num1); } - id_buffer[bufsize - 1] = '\0'; + id_buffer[BUFSIZE - 1] = '\0'; *hocr_str += " id='"; *hocr_str += id_buffer; *hocr_str += "'"; @@ -1444,6 +1444,8 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) { int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1; int page_id = page_number + 1; // hOCR uses 1-based page numbers. + bool para_is_ltr = true; // Default direction is LTR + const char* paragraph_lang = NULL; bool font_info = false; GetBoolVariable("hocr_font_info", &font_info); @@ -1493,18 +1495,24 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) { // Open any new block/paragraph/textline. if (res_it->IsAtBeginningOf(RIL_BLOCK)) { + para_is_ltr = true; // reset to default direction hocr_str += "
IsAtBeginningOf(RIL_PARA)) { hocr_str += "\n

ParagraphIsLtr()) { - hocr_str += " dir='ltr'"; - } else { + para_is_ltr = res_it->ParagraphIsLtr(); + if (!para_is_ltr) { hocr_str += " dir='rtl'"; } AddIdTohOCR(&hocr_str, "par", page_id, pcnt); + paragraph_lang = res_it->WordRecognitionLanguage(); + if (paragraph_lang) { + hocr_str += " lang='"; + hocr_str += paragraph_lang; + hocr_str += "'"; + } AddBoxTohOCR(res_it, RIL_PARA, &hocr_str); } if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { @@ -1537,14 +1545,18 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) { hocr_str.add_str_int("; x_fsize ", pointsize); } hocr_str += "'"; - if (res_it->WordRecognitionLanguage()) { + const char* lang = res_it->WordRecognitionLanguage(); + if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) { hocr_str += " lang='"; - hocr_str += res_it->WordRecognitionLanguage(); + hocr_str += lang; hocr_str += "'"; } switch (res_it->WordDirection()) { - case DIR_LEFT_TO_RIGHT: hocr_str += " dir='ltr'"; break; - case DIR_RIGHT_TO_LEFT: hocr_str += " dir='rtl'"; break; + // Only emit direction if different from current paragraph direction + case DIR_LEFT_TO_RIGHT: if (!para_is_ltr) hocr_str += " dir='ltr'"; break; + case DIR_RIGHT_TO_LEFT: if (para_is_ltr) hocr_str += " dir='rtl'"; break; + case DIR_MIX: + case DIR_NEUTRAL: default: // Do nothing. break; } @@ -1574,6 +1586,7 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) { if (last_word_in_para) { hocr_str += "\n

\n"; pcnt++; + para_is_ltr = true; // back to default direction } if (last_word_in_block) { hocr_str += "
\n";