Skip to content

Commit

Permalink
fixed hocr to produce valid document (according to http://validator.w3.org/)
Browse files Browse the repository at this point in the history
  • Loading branch information
zdenop@gmail.com committed Nov 17, 2010
1 parent 515ac2d commit 7511d76
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 27 deletions.
36 changes: 19 additions & 17 deletions api/baseapi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -720,18 +720,18 @@ static bool IsParagraphBreak(TBOX bbox_cur, TBOX bbox_prev,
// Check if the distance between lines is larger than the normal leading,
if (fabs((float)(bbox_cur.bottom() - bbox_prev.bottom())) > line_height * 2)
return true;

// Check if the distance between left bounds of the two lines is nearly the
// same as between their right bounds (if so, then both lines probably belong
// to the same paragraph, maybe a centered one).
if (fabs((float)((bbox_cur.left() - bbox_prev.left()) -
(bbox_prev.right() - bbox_cur.right()))) < line_height)
return false;

// Check if there is a paragraph indent at this line (either -ve or +ve).
if (fabs((float)(bbox_cur.left() - bbox_prev.left())) > line_height)
return true;

// Check if both current and previous line don't reach the right bound of the
// block, but the distance is different. This will cause all lines in a verse
// to be treated as separate paragraphs, but most probably will not split
Expand All @@ -740,7 +740,7 @@ static bool IsParagraphBreak(TBOX bbox_cur, TBOX bbox_prev,
right - bbox_cur.right() > line_height &&
right - bbox_prev.right() > line_height)
return true;

return false;
}

Expand All @@ -761,7 +761,7 @@ char* TessBaseAPI::GetHOCRText(int page_id) {
if (tesseract_ == NULL ||
(page_res_ == NULL && Recognize(NULL) < 0))
return NULL;

PAGE_RES_IT page_res_it(page_res_);
ROW_RES *row = NULL; // current row
ROW *real_row = NULL, *prev_row = NULL;
Expand All @@ -783,37 +783,37 @@ char* TessBaseAPI::GetHOCRText(int page_id) {
for (page_res_it.restart_page(); page_res_it.word () != NULL;
page_res_it.forward()) {
if (block != page_res_it.block ()) {

if (block != NULL) {
hocr_str += "</span>\n</p>\n</div>\n";
}

block = page_res_it.block (); // current block
real_block = block->block;
real_row = NULL;
row = NULL;

hocr_str.add_str_int("<div class='ocr_carea' id='block_", page_id);
hocr_str.add_str_int("_", bcnt++);
AddBoxTohOCR(real_block->bounding_box(), image_height_, &hocr_str);
hocr_str += "\n<p class='ocr_par'>\n";
}
if (row != page_res_it.row ()) {

if (row != NULL) {
hocr_str += "</span>\n";
}
prev_row = real_row;

row = page_res_it.row (); // current row
real_row = row->row;
if (prev_row != NULL &&

if (prev_row != NULL &&
IsParagraphBreak(real_row->bounding_box(), prev_row->bounding_box(),
real_block->bounding_box().right(),
real_row->x_height() + real_row->ascenders()))
hocr_str += "</p>\n<p class='ocr_par'>\n";

hocr_str.add_str_int("<span class='ocr_line' id='line_", page_id);
hocr_str.add_str_int("_", lcnt++);
AddBoxTohOCR(real_row->bounding_box(), image_height_, &hocr_str);
Expand All @@ -834,10 +834,10 @@ char* TessBaseAPI::GetHOCRText(int page_id) {
if (word->italic > 0)
hocr_str += "<em>";
int i;
// escape special characters
// escape special characters
for (i = 0;
choice->unichar_string()[i] != '\0';
i++) {
i++) {
if (choice->unichar_string()[i] == '<') { hocr_str += "&lt;"; }
else if (choice->unichar_string()[i] == '>') { hocr_str += "&gt;"; }
else if (choice->unichar_string()[i] == '&') { hocr_str += "&amp;"; }
Expand All @@ -854,8 +854,10 @@ char* TessBaseAPI::GetHOCRText(int page_id) {
hocr_str += " ";
}
}
hocr_str += "</span>\n</p>\n";
hocr_str += "</div>\n</div>\n";
if (block != NULL)
hocr_str += "</span>\n</p>\n</div>\n";
hocr_str += "</div>\n";

char *ret = new char[hocr_str.length() + 1];
strcpy(ret, hocr_str.string());
return ret;
Expand Down
14 changes: 8 additions & 6 deletions api/tesseractmain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -391,12 +391,14 @@ int main(int argc, char **argv) {
}
if (output_hocr) {
const char html_header[] =
"<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\""
" \"http://www.w3.org/TR/html4/loose.dtd\">\n"
"<html>\n<head>\n<title></title>\n"
"<meta http-equiv=\"Content-Type\" content=\"text/html;"
"charset=utf-8\" >\n<meta name='ocr-system' content='tesseract'>\n"
"</head>\n<body>\n";
"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n"
" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
"<html xmlns=\"http://www.w3.org/1999/xhtml\">\n<head>\n"
" <title>OCR Output</title>\n"
" <meta http-equiv=\"Content-Type\" content=\"text/html;"
"charset=utf-8\" />\n <meta name='ocr-system' "
"content='tesseract-ocr 3.00' />\n <meta name='ocr-capabilities'"
" content='ocr_page' />\n</head>\n<body>\n";
fprintf(fout, "%s", html_header);
}
fwrite(text_out.string(), 1, text_out.length(), fout);
Expand Down
4 changes: 2 additions & 2 deletions tessdata/configs/Makefile.am
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
datadir = @datadir@/tessdata/configs
data_DATA = inter makebox box.train unlv ambigs.train api_config kannada box.train.stderr logfile digits
EXTRA_DIST = inter makebox box.train unlv ambigs.train api_config kannada box.train.stderr logfile digits
data_DATA = inter makebox box.train unlv ambigs.train api_config kannada box.train.stderr logfile digits hocr
EXTRA_DIST = inter makebox box.train unlv ambigs.train api_config kannada box.train.stderr logfile digits hocr
4 changes: 2 additions & 2 deletions tessdata/configs/Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -198,8 +198,8 @@ target_alias = @target_alias@
top_build_prefix = @top_build_prefix@
top_builddir = @top_builddir@
top_srcdir = @top_srcdir@
data_DATA = inter makebox box.train unlv ambigs.train api_config kannada box.train.stderr logfile digits
EXTRA_DIST = inter makebox box.train unlv ambigs.train api_config kannada box.train.stderr logfile digits
data_DATA = inter makebox box.train unlv ambigs.train api_config kannada box.train.stderr logfile digits hocr
EXTRA_DIST = inter makebox box.train unlv ambigs.train api_config kannada box.train.stderr logfile digits hocr
all: all-am

.SUFFIXES:
Expand Down

0 comments on commit 7511d76

Please sign in to comment.