Skip to content

Commit

Permalink
Fixed bidi handling in PDF output
Browse files Browse the repository at this point in the history
  • Loading branch information
theraysmith committed Oct 9, 2014
1 parent f927728 commit d9699c4
Show file tree
Hide file tree
Showing 4 changed files with 1,492 additions and 488 deletions.
282 changes: 166 additions & 116 deletions api/pdfrenderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,165 +60,216 @@ long dist2(int x1, int y1, int x2, int y2) {
return (x2 - x1) * (x2 - x1) + (y2 - y1) * (y2 - y1);
}

// Projects a word onto its (straight) text line baseline and reports
// where the word starts and how long it is. Copy-paste in viewers such
// as evince gets confused when the baseline wanders from word to word,
// so every word origin is snapped onto the line baseline.
//
// Inputs are pixel coordinates as reported by Tesseract, which always
// lists baselines left-to-right regardless of reading order. For
// right-to-left text the word's end points are therefore taken in the
// opposite order, so that the origin is the start of the word in
// reading order.
//
// Outputs are in the native PDF coordinate system: origin in the bottom
// left, unit is the point (1/72 inch).
//   x0, y0 - origin of the word, projected onto the line baseline.
//   length - length of the word baseline.
void GetWordBaseline(int writing_direction, int ppi, int height,
                     int word_x1, int word_y1, int word_x2, int word_y2,
                     int line_x1, int line_y1, int line_x2, int line_y2,
                     double *x0, double *y0, double *length) {
  // Pick the word end points in reading order.
  const bool right_to_left =
      (writing_direction == WRITING_DIRECTION_RIGHT_TO_LEFT);
  const int start_x = right_to_left ? word_x2 : word_x1;
  const int start_y = right_to_left ? word_y2 : word_y1;
  const int end_x = right_to_left ? word_x1 : word_x2;
  const int end_y = right_to_left ? word_y1 : word_y2;

  // Direction vector of the line baseline and its squared length.
  const int dir_x = line_x2 - line_x1;
  const int dir_y = line_y2 - line_y1;
  const double line_len2 = dist2(line_x1, line_y1, line_x2, line_y2);

  // A degenerate (zero length) line has no direction to project onto,
  // so fall back to its single point.
  double proj_x = line_x1;
  double proj_y = line_y1;
  if (line_len2 != 0) {
    // Orthogonal projection of the word origin onto the line,
    // parameterized from the line's second end point.
    const double t = ((start_x - line_x2) * dir_x +
                      (start_y - line_y2) * dir_y) / line_len2;
    proj_x = line_x2 + t * dir_x;
    proj_y = line_y2 + t * dir_y;
  }

  const double pixel_length =
      sqrt(static_cast<double>(dist2(start_x, start_y, end_x, end_y)));

  // Convert pixels to points; the y axis is measured from the bottom.
  *x0 = proj_x * 72 / ppi;
  *y0 = height - (proj_y * 72.0 / ppi);
  *length = pixel_length * 72.0 / ppi;
}

// Computes the coefficients a, b, c, d of the affine matrix that
// describes the rotation of a text line. The matrix sets the coordinate
// system used for placing text in the PDF file. The rotation angle is
// derived from the two end points of the line baseline. Right-to-left
// text such as Arabic or Hebrew is additionally reflected over the
// Y-axis:
//
//                                           RTL
// [ x' ] = [ a b ][ x ] = [-1 0 ] [ cos sin ][ x ]
// [ y' ]   [ c d ][ y ]   [ 0 1 ] [-sin cos ][ y ]
//
// Every other writing direction gets the plain rotation.
// TODO(jbreiden) Consider using the vertical PDF writing mode for
// WRITING_DIRECTION_TOP_TO_BOTTOM.
void AffineMatrix(int writing_direction,
                  int line_x1, int line_y1, int line_x2, int line_y2,
                  double *a, double *b, double *c, double *d) {
  const double delta_y = static_cast<double>(line_y1 - line_y2);
  const double delta_x = static_cast<double>(line_x2 - line_x1);
  const double theta = atan2(delta_y, delta_x);
  const double cos_theta = cos(theta);
  const double sin_theta = sin(theta);

  // Only the first row changes sign when the text is mirrored.
  const bool mirrored = (writing_direction == WRITING_DIRECTION_RIGHT_TO_LEFT);
  *a = mirrored ? -cos_theta : cos_theta;
  *b = mirrored ? -sin_theta : sin_theta;
  *c = -sin_theta;
  *d = cos_theta;
}

// There are some really stupid PDF viewers in the wild, such as
// 'Preview' which ships with the Mac. They do a better job with text
// selection and highlighting when given perfectly flat baseline
// instead of very slightly tilted. We clip small tilts to appease
// these viewers. I chose this threshold large enough to absorb noise,
// but small enough that lines probably won't cross each other if the
// whole page is tilted at almost exactly the clipping threshold.
//
// Inputs:
//   ppi            - image resolution in pixels per inch.
//   x1, y1, x2, y2 - end points of the baseline, in pixels.
// Outputs:
//   line_x1..line_y2 - the baseline after clipping. The x coordinates
//     are always copied through unchanged. If the baseline rises less
//     than 2 points over a run of more than 2 points, both y
//     coordinates are replaced by their (integer) midpoint.
//
// Rise and run are computed in floating point. With the integer ppi an
// all-integer expression would truncate, so runs between 2 and 3 points
// would wrongly escape clipping. A non-positive ppi (resolution
// unknown) leaves the baseline untouched instead of dividing by zero.
void ClipBaseline(int ppi, int x1, int y1, int x2, int y2,
                  int *line_x1, int *line_y1,
                  int *line_x2, int *line_y2) {
  *line_x1 = x1;
  *line_y1 = y1;
  *line_x2 = x2;
  *line_y2 = y2;
  if (ppi <= 0)
    return;  // Cannot convert pixels to points without a resolution.
  const double rise = abs(y2 - y1) * 72.0 / ppi;
  const double run = abs(x2 - x1) * 72.0 / ppi;
  if (rise < 2.0 && 2.0 < run)
    *line_y1 = *line_y2 = (y1 + y2) / 2;
}

char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
double width, double height) {
double ppi = api->GetSourceYResolution();
STRING pdf_str("");
double old_x = 0.0, old_y = 0.0;
int old_pointsize = 0;
double ppi = api->GetSourceYResolution();

// TODO(jbreiden) Slightly cleaner from an abstraction standpoint
// if this were to live inside a separate text object.
// These initial conditions are all arbitrary and will be overwritten
double old_x = 0.0, old_y = 0.0;
int old_fontsize = 0;
tesseract::WritingDirection old_writing_direction =
WRITING_DIRECTION_LEFT_TO_RIGHT;
bool new_block = true;
int fontsize = 0;
double a = 1;
double b = 0;
double c = 0;
double d = 1;

// TODO(jbreiden) This marries the text and image together.
// Slightly cleaner from an abstraction standpoint if this were to
// live inside a separate text object.
pdf_str += "q ";
pdf_str.add_str_double("", prec(width));
pdf_str += " 0 0 ";
pdf_str.add_str_double("", prec(height));
pdf_str += " 0 0 cm /Im1 Do Q\n";

ResultIterator *res_it = api->GetIterator();

while (!res_it->Empty(RIL_BLOCK)) {
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
pdf_str += "BT\n3 Tr\n"; // Begin text object, use invisible ink
old_pointsize = 0.0; // Every block will declare its font
pdf_str += "BT\n3 Tr"; // Begin text object, use invisible ink
old_fontsize = 0; // Every block will declare its fontsize
new_block = true; // Every block will declare its affine matrix
}

int line_x1, line_y1, line_x2, line_y2;
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
res_it->Baseline(RIL_TEXTLINE,
&line_x1, &line_y1, &line_x2, &line_y2);
double rise = abs(line_y2 - line_y1) * 72 / ppi;
double run = abs(line_x2 - line_x1) * 72 / ppi;
// There are some really stupid PDF viewers in the wild, such as
// 'Preview' which ships with the Mac. They might do a better
// job with text selection and highlighting when given perfectly
// straight text instead of very slightly tilted text. I chose
// this threshold large enough to absorb noise, but small enough
// that lines probably won't cross each other if the whole page
// is tilted at almost exactly the clipping threshold.
if (rise < 2.0 && 2.0 < run)
line_y1 = line_y2 = (line_y1 + line_y2) / 2;
int x1, y1, x2, y2;
res_it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2);
ClipBaseline(ppi, x1, y1, x2, y2, &line_x1, &line_y1, &line_x2, &line_y2);
}

if (res_it->Empty(RIL_WORD)) {
res_it->Next(RIL_WORD);
continue;
}

int word_x1, word_y1, word_x2, word_y2;
res_it->Baseline(RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2);

// The critical one is writing_direction
tesseract::Orientation orientation;
// Writing direction changes at a per-word granularity
tesseract::WritingDirection writing_direction;
tesseract::TextlineOrder textline_order;
float deskew_angle;
res_it->Orientation(&orientation, &writing_direction,
&textline_order, &deskew_angle);

// Unlike Tesseract, we always want the word baseline in reading order.
if (writing_direction == WRITING_DIRECTION_RIGHT_TO_LEFT) {
Swap(&word_x1, &word_x2);
Swap(&word_y1, &word_y2);
}

// Viewers like evince can get really confused during copy-paste
// when the baseline wanders around. I've decided to force every
// word to match the (straight) baseline. The math below is just
// projecting the word origin onto the baseline. All numbers are
// in the native PDF coordinate system, which has the origin in
// the bottom left and the unit is points, which is 1/72 inch.
double word_length;
double x, y;
{
int px = word_x1;
int py = word_y1;
double l2 = dist2(line_x1, line_y1, line_x2, line_y2);
if (l2 == 0) {
x = line_x1;
y = line_y1;
} else {
double t = ((px - line_x2) * (line_x2 - line_x1) +
(py - line_y2) * (line_y2 - line_y1)) / l2;
x = line_x2 + t * (line_x2 - line_x1);
y = line_y2 + t * (line_y2 - line_y1);
tesseract::Orientation orientation;
tesseract::TextlineOrder textline_order;
float deskew_angle;
res_it->Orientation(&orientation, &writing_direction,
&textline_order, &deskew_angle);
if (writing_direction != WRITING_DIRECTION_TOP_TO_BOTTOM) {
switch (res_it->WordDirection()) {
case DIR_LEFT_TO_RIGHT:
writing_direction = WRITING_DIRECTION_LEFT_TO_RIGHT;
break;
case DIR_RIGHT_TO_LEFT:
writing_direction = WRITING_DIRECTION_RIGHT_TO_LEFT;
break;
default:
writing_direction = old_writing_direction;
}
}
word_length = sqrt(static_cast<double>(dist2(word_x1, word_y1,
word_x2, word_y2)));
word_length = word_length * 72.0 / ppi;
x = x * 72 / ppi;
y = height - (y * 72.0 / ppi);
}

int pointsize = 0;
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
// Calculate the rotation angle in the PDF coordinate system,
// which has the origin in the bottom left. The Tesseract
// coordinate system has the origin in the upper left.
//
// PDF is kind of like turtle graphics, and we orient the
// turtle (errr... initial cursor position) with an affine
// transformation.
//
// Rotate RTL Translate
//
// [ x' y' 1 ] = [ x y 1 ] [ cos𝜃 -sin𝜃 0 ] [ -1 0 0 ] [ 1 0 0 ]
// [ sin𝜃 cos𝜃 0 ] [ 0 1 0 ] [ 0 1 0 ]
// [ 0 0 1 ] [ 0 0 1 ] [ x y 1 ]
//
double theta = atan2(static_cast<double>(line_y1 - line_y2),
static_cast<double>(line_x2 - line_x1));
double a, b, c, d;
a = cos(theta);
b = sin(theta);
c = -sin(theta);
d = cos(theta);
switch(writing_direction) {
case WRITING_DIRECTION_RIGHT_TO_LEFT:
a = -a;
b = -b;
c = -c;
break;
case WRITING_DIRECTION_TOP_TO_BOTTOM:
// TODO(jbreiden) Consider switching PDF writing mode to vertical.
break;
default:
break;
}
// Where is word origin and how long is it?
double x, y, word_length;
{
int word_x1, word_y1, word_x2, word_y2;
res_it->Baseline(RIL_WORD, &word_x1, &word_y1, &word_x2, &word_y2);
GetWordBaseline(writing_direction, ppi, height,
word_x1, word_y1, word_x2, word_y2,
line_x1, line_y1, line_x2, line_y2,
&x, &y, &word_length);
}

pdf_str.add_str_double("", prec(a)); // . This affine matrix
if (writing_direction != old_writing_direction || new_block) {
AffineMatrix(writing_direction,
line_x1, line_y1, line_x2, line_y2, &a, &b, &c, &d);
pdf_str.add_str_double(" ", prec(a)); // . This affine matrix
pdf_str.add_str_double(" ", prec(b)); // . sets the coordinate
pdf_str.add_str_double(" ", prec(c)); // . system for all
pdf_str.add_str_double(" ", prec(d)); // . text in the entire
pdf_str.add_str_double(" ", prec(x)); // . line.
pdf_str.add_str_double(" ", prec(d)); // . text that follows.
pdf_str.add_str_double(" ", prec(x)); // .
pdf_str.add_str_double(" ", prec(y)); // .
pdf_str += (" Tm "); // Place cursor absolutely
new_block = false;
} else {
double offset = sqrt(static_cast<double>(dist2(old_x, old_y, x, y)));
pdf_str.add_str_double(" ", prec(offset)); // Delta x in pts
pdf_str.add_str_double(" ", 0); // Delta y in pts
pdf_str += (" Td "); // Relative moveto
double dx = x - old_x;
double dy = y - old_y;
pdf_str.add_str_double(" ", prec(dx * a + dy * b));
pdf_str.add_str_double(" ", prec(dx * c + dy * d));
pdf_str += (" Td "); // Relative moveto
}
old_x = x;
old_y = y;
old_writing_direction = writing_direction;

// Adjust font size on a per word granularity. Pay attention to
// pointsize, old_pointsize, and pdf_str. We've found that for
// in Arabic, Tesseract will happily return a pointsize of zero,
// fontsize, old_fontsize, and pdf_str. We've found that for
// Arabic, Tesseract will happily return a fontsize of zero,
// so we make up a default number to protect ourselves.
{
bool bold, italic, underlined, monospace, serif, smallcaps;
int font_id;
res_it->WordFontAttributes(&bold, &italic, &underlined, &monospace,
&serif, &smallcaps, &pointsize, &font_id);
const int kDefaultPointSize = 8;
if (pointsize <= 0)
pointsize = kDefaultPointSize;
if (pointsize != old_pointsize) {
&serif, &smallcaps, &fontsize, &font_id);
const int kDefaultFontsize = 8;
if (fontsize <= 0)
fontsize = kDefaultFontsize;
if (fontsize != old_fontsize) {
char textfont[20];
snprintf(textfont, sizeof(textfont), "/f-0-0 %d Tf ", pointsize);
snprintf(textfont, sizeof(textfont), "/f-0-0 %d Tf ", fontsize);
pdf_str += textfont;
old_pointsize = pointsize;
old_fontsize = fontsize;
}
}

Expand All @@ -243,9 +294,9 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
delete []grapheme;
res_it->Next(RIL_SYMBOL);
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
if (word_length > 0 && pdf_word_len > 0 && pointsize > 0) {
if (word_length > 0 && pdf_word_len > 0 && fontsize > 0) {
double h_stretch =
kCharWidth * prec(100.0 * word_length / (pointsize * pdf_word_len));
kCharWidth * prec(100.0 * word_length / (fontsize * pdf_word_len));
pdf_str.add_str_double("", h_stretch);
pdf_str += " Tz"; // horizontal stretch
pdf_str += " [ ";
Expand Down Expand Up @@ -449,7 +500,7 @@ bool TessPDFRenderer::imageToPDFObj(Pix *pix,

L_COMP_DATA *cid = NULL;
const int kJpegQuality = 85;
l_generateCIDataForPdf(filename, pix, kJpegQuality, &cid);

// TODO(jbreiden) Leptonica 1.71 doesn't correctly handle certain
// types of PNG files, especially if there are 2 samples per pixel.
// We can get rid of this logic after Leptonica 1.72 is released and
Expand Down Expand Up @@ -747,5 +798,4 @@ bool TessPDFRenderer::EndDocumentHandler() {
AppendString(buf);
return true;
}

} // namespace tesseract
2 changes: 1 addition & 1 deletion api/renderer.h
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ class TESS_API TessPDFRenderer : public TessResultRenderer {
double width, double height);
// Turn an image into a PDF object. Only transcode if we have to.
static bool imageToPDFObj(Pix *pix, char *filename, long int objnum,
char **pdf_object, long int *pdf_object_size);
char **pdf_object, long int *pdf_object_size);
};


Expand Down
Binary file modified tessdata/pdf.ttf
Binary file not shown.
Loading

0 comments on commit d9699c4

Please sign in to comment.