Skip to content

Commit

Permalink
Merge pull request tesseract-ocr#227 from tfmorris/hocr-both
Browse files Browse the repository at this point in the history
hOCR improvements
  • Loading branch information
zdenop committed Feb 17, 2016
2 parents e028274 + 6c44775 commit 8c7d158
Showing 1 changed file with 25 additions and 12 deletions.
37 changes: 25 additions & 12 deletions api/baseapi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1379,14 +1379,14 @@ static void AddBaselineCoordsTohOCR(const PageIterator *it,
}

static void AddIdTohOCR(STRING* hocr_str, const std::string base, int num1, int num2) {
unsigned long bufsize = base.length() + 2 * kMaxIntSize;
char id_buffer[bufsize];
const unsigned long BUFSIZE = 64;
char id_buffer[BUFSIZE];
if (num2 >= 0) {
snprintf(id_buffer, bufsize - 1, "%s_%d_%d", base.c_str(), num1, num2);
snprintf(id_buffer, BUFSIZE - 1, "%s_%d_%d", base.c_str(), num1, num2);
} else {
snprintf(id_buffer, bufsize - 1, "%s_%d", base.c_str(), num1);
snprintf(id_buffer, BUFSIZE - 1, "%s_%d", base.c_str(), num1);
}
id_buffer[bufsize - 1] = '\0';
id_buffer[BUFSIZE - 1] = '\0';
*hocr_str += " id='";
*hocr_str += id_buffer;
*hocr_str += "'";
Expand Down Expand Up @@ -1444,6 +1444,8 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {

int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
int page_id = page_number + 1; // hOCR uses 1-based page numbers.
bool para_is_ltr = true; // Default direction is LTR
const char* paragraph_lang = NULL;
bool font_info = false;
GetBoolVariable("hocr_font_info", &font_info);

Expand Down Expand Up @@ -1493,18 +1495,24 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {

// Open any new block/paragraph/textline.
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
para_is_ltr = true; // reset to default direction
hocr_str += " <div class='ocr_carea'";
AddIdTohOCR(&hocr_str, "block", page_id, bcnt);
AddBoxTohOCR(res_it, RIL_BLOCK, &hocr_str);
}
if (res_it->IsAtBeginningOf(RIL_PARA)) {
hocr_str += "\n <p class='ocr_par'";
if (res_it->ParagraphIsLtr()) {
hocr_str += " dir='ltr'";
} else {
para_is_ltr = res_it->ParagraphIsLtr();
if (!para_is_ltr) {
hocr_str += " dir='rtl'";
}
AddIdTohOCR(&hocr_str, "par", page_id, pcnt);
paragraph_lang = res_it->WordRecognitionLanguage();
if (paragraph_lang) {
hocr_str += " lang='";
hocr_str += paragraph_lang;
hocr_str += "'";
}
AddBoxTohOCR(res_it, RIL_PARA, &hocr_str);
}
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
Expand Down Expand Up @@ -1537,14 +1545,18 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
hocr_str.add_str_int("; x_fsize ", pointsize);
}
hocr_str += "'";
if (res_it->WordRecognitionLanguage()) {
const char* lang = res_it->WordRecognitionLanguage();
if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) {
hocr_str += " lang='";
hocr_str += res_it->WordRecognitionLanguage();
hocr_str += lang;
hocr_str += "'";
}
switch (res_it->WordDirection()) {
case DIR_LEFT_TO_RIGHT: hocr_str += " dir='ltr'"; break;
case DIR_RIGHT_TO_LEFT: hocr_str += " dir='rtl'"; break;
// Only emit direction if different from current paragraph direction
case DIR_LEFT_TO_RIGHT: if (!para_is_ltr) hocr_str += " dir='ltr'"; break;
case DIR_RIGHT_TO_LEFT: if (para_is_ltr) hocr_str += " dir='rtl'"; break;
case DIR_MIX:
case DIR_NEUTRAL:
default: // Do nothing.
break;
}
Expand Down Expand Up @@ -1574,6 +1586,7 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
if (last_word_in_para) {
hocr_str += "\n </p>\n";
pcnt++;
para_is_ltr = true; // back to default direction
}
if (last_word_in_block) {
hocr_str += " </div>\n";
Expand Down

0 comments on commit 8c7d158

Please sign in to comment.