Reinstated handling of rotated text

unidoc · gunnsth · Jun 30, 2020 · May 19, 2020 · May 19, 2020 · May 20, 2020
commit 3f1df971e5108ed5cc5617b24466de1f8a4bebd4
diff --git a/extractor/README.md b/extractor/README.md
@@ -59,9 +59,9 @@ TODO
 -----
 
 * Remove serial code?
-* Remove verbose* logging?
-* Reinstate rotated text handling.
+* Remove `verbose*` logging?
 * Come up with a better name for *reading* direction. Scanning direction? [Word order](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2694615/)?
+* Handle diagonal text.
 * Get R to L text extraction working.
 * Get top to bottom text extraction working.
 * Remove TM from ligature map.
diff --git a/extractor/text.go b/extractor/text.go
@@ -838,8 +838,7 @@ func (to *textObject) renderText(data []byte) error {
 		} else {
 			// TODO: This lookup seems confusing. Went from bytes <-> charcodes already.
 			// NOTE: This is needed to register runes by the font encoder - for subsetting (optimization).
-			original, ok := font.Encoder().CharcodeToRune(code)
-			if ok {
+			if original, ok := font.Encoder().CharcodeToRune(code); ok {
 				mark.original = string(original)
 			}
 		}
@@ -923,8 +922,25 @@ func (pt PageText) Tables() []TextTable {
 // The comments above the TextMark definition describe how to use the []TextMark to
 // maps substrings of the page text to locations on the PDF page.
 func (pt *PageText) computeViews() {
-	common.Log.Trace("ToTextLocation: %d elements", len(pt.marks))
-	paras := makeTextPage(pt.marks, pt.pageSize, 0)
+	// Extract text paragraphs one orientation at a time.
+	// If there are texts with several orientations on a page then all the text of the same
+	// orientation gets extracted together.
+	var paras paraList
+	n := len(pt.marks)
+	for orient := 0; orient < 360 && n > 0; orient += 90 {
+		marks := make([]*textMark, 0, len(pt.marks)-n)
+		for _, tm := range pt.marks {
+			if tm.orient == orient {
+				marks = append(marks, tm)
+			}
+		}
+		if len(marks) > 0 {
+			parasOrient := makeTextPage(marks, pt.pageSize)
+			paras = append(paras, parasOrient...)
+			n -= len(marks)
+		}
+	}
+	// Build the public viewable fields from the paraList.
 	b := new(bytes.Buffer)
 	paras.writeText(b)
 	pt.viewText = b.String()

diff --git a/extractor/text_const.go b/extractor/text_const.go
@@ -26,6 +26,8 @@ const (
 
 // The following constants are the tuning parameter for text extracton
 const (
+	// Change in angle of text in degrees that we treat as a different orientation.
+	orientationGranularity = 10
 	// Size of depth bins in points
 	depthBinPoints = 6
 

diff --git a/extractor/text_mark.go b/extractor/text_mark.go
@@ -17,15 +17,17 @@ import (
 // textMark represents text drawn on a page and its position in device coordinates.
 // All dimensions are in device coordinates.
 type textMark struct {
-	serial             int              // Sequence number for debugging.
-	model.PdfRectangle                  // Bounding box.
-	text               string           // The text (decoded via ToUnicode).
-	original           string           // Original text (decoded).
-	font               *model.PdfFont   // The font the mark was drawn with.
-	fontsize           float64          // The font size the mark was drawn with.
-	charspacing        float64          // TODO (peterwilliams97: Should this be exposed in TextMark?
-	trm                transform.Matrix // The current text rendering matrix (TRM above).
-	end                transform.Point  // The end of character device coordinates.
+	serial             int                // Sequence number for debugging.
+	model.PdfRectangle                    // Bounding box oriented so character base is at bottom
+	orient             int                // Text orientation in degrees; diagonal text is currently treated as 0.
+	text               string             // The text (decoded via ToUnicode).
+	original           string             // Original text (decoded).
+	font               *model.PdfFont     // The font the mark was drawn with.
+	fontsize           float64            // The font size the mark was drawn with.
+	charspacing        float64            // TODO(peterwilliams97): Should this be exposed in TextMark?
+	trm                transform.Matrix   // The current text rendering matrix (TRM above).
+	end                transform.Point    // The end of character device coordinates.
+	originaBBox        model.PdfRectangle // Bounding box without orientation correction.
 }
 
 // newTextMark returns a textMark for text `text` rendered with text rendering matrix (TRM) `trm`
@@ -34,7 +36,7 @@ type textMark struct {
 func (to *textObject) newTextMark(text string, trm transform.Matrix, end transform.Point,
 	spaceWidth float64, font *model.PdfFont, charspacing float64) (textMark, bool) {
 	theta := trm.Angle()
-	orient := nearestMultiple(theta, 10)
+	orient := nearestMultiple(theta, orientationGranularity)
 	var height float64
 	if orient%180 != 90 {
 		height = trm.ScalingFactorY()
@@ -51,7 +53,12 @@ func (to *textObject) newTextMark(text string, trm transform.Matrix, end transfo
 		bbox.Ury -= height
 	case 270:
 		bbox.Urx += height
+	case 0:
+		bbox.Ury += height
 	default:
+		// This is a hack to capture diagonal text.
+		// TODO(peterwilliams97): Extract diagonal text.
+		orient = 0
 		bbox.Ury += height
 	}
 	if bbox.Llx > bbox.Urx {
@@ -68,20 +75,52 @@ func (to *textObject) newTextMark(text string, trm transform.Matrix, end transfo
 	}
 	bbox = clipped
 
+	// The orientedBBox is bbox rotated and translated so the base of the character is at Lly.
+	orientedBBox := bbox
+	orientedMBox := to.e.mediaBox
+
+	switch orient % 360 {
+	case 90:
+		orientedMBox.Urx, orientedMBox.Ury = orientedMBox.Ury, orientedMBox.Urx
+		orientedBBox = model.PdfRectangle{
+			Llx: orientedMBox.Urx - bbox.Ury,
+			Urx: orientedMBox.Urx - bbox.Lly,
+			Lly: bbox.Llx,
+			Ury: bbox.Urx}
+	case 180:
+		orientedBBox = model.PdfRectangle{
+			Llx: bbox.Llx,
+			Urx: bbox.Urx,
+			Lly: orientedMBox.Ury - bbox.Lly,
+			Ury: orientedMBox.Ury - bbox.Ury}
+	case 270:
+		orientedMBox.Urx, orientedMBox.Ury = orientedMBox.Ury, orientedMBox.Urx
+		orientedBBox = model.PdfRectangle{
+			Llx: bbox.Ury,
+			Urx: bbox.Lly,
+			Lly: orientedMBox.Ury - bbox.Llx,
+			Ury: orientedMBox.Ury - bbox.Urx}
+	}
+	if orientedBBox.Llx > orientedBBox.Urx {
+		orientedBBox.Llx, orientedBBox.Urx = orientedBBox.Urx, orientedBBox.Llx
+	}
+	if orientedBBox.Lly > orientedBBox.Ury {
+		orientedBBox.Lly, orientedBBox.Ury = orientedBBox.Ury, orientedBBox.Lly
+	}
+
 	tm := textMark{
 		text:         text,
-		PdfRectangle: bbox,
+		PdfRectangle: orientedBBox,
+		originaBBox:  bbox,
 		font:         font,
 		fontsize:     height,
 		charspacing:  charspacing,
 		trm:          trm,
 		end:          end,
+		orient:       orient,
 		serial:       serial.mark,
 	}
 	serial.mark++
-	if !isTextSpace(tm.text) && tm.Width() == 0.0 {
-		common.Log.Debug("ERROR: Zero width text. tm=%s", tm.String())
-	}
 	if verboseGeom {
 		common.Log.Info("newTextMark: start=%.2f end=%.2f %s", start, end, tm.String())
 	}
@@ -106,7 +145,7 @@ func (tm *textMark) ToTextMark() TextMark {
 		count:    int64(tm.serial),
 		Text:     tm.text,
 		Original: tm.original,
-		BBox:     tm.PdfRectangle,
+		BBox:     tm.originaBBox,
 		Font:     tm.font,
 		FontSize: tm.fontsize,
 	}

diff --git a/extractor/text_page.go b/extractor/text_page.go
@@ -38,7 +38,7 @@ import (
 // 3) Detect textParas arranged as cells in a table and convert each one to a textPara containing a
 //    textTable.
 // 4) Sort the textParas in reading order.
-func makeTextPage(marks []*textMark, pageSize model.PdfRectangle, rot int) paraList {
+func makeTextPage(marks []*textMark, pageSize model.PdfRectangle) paraList {
 	common.Log.Trace("makeTextPage: %d elements pageSize=%.2f", len(marks), pageSize)
 	if len(marks) == 0 {
 		return nil

diff --git a/extractor/text_test.go b/extractor/text_test.go
@@ -214,21 +214,21 @@ var fileExtractionTests = []struct {
 		},
 	},
 	// TODO(peterwilliams97): Reinstate rotation handling and this text.
-	// {filename: "000026.pdf",
-	// 	pageTerms: map[int][]string{
-	// 		1: []string{"Fresh Flower",
-	// 			"Care & Handling",
-	// 		},
-	// 	},
-	// },
+	{filename: "000026.pdf",
+		pageTerms: map[int][]string{
+			1: {"Fresh Flower",
+				"Care & Handling",
+			},
+		},
+	},
 	{filename: "search_sim_key.pdf",
 		pageTerms: map[int][]string{
 			2: {"A cryptographic scheme which enables searching",
 				"Untrusted server should not be able to search for a word without authorization",
 			},
 		},
 	},
-	{filename: "Theil_inequality.pdf",
+	{filename: "Theil_inequality.pdf", // 270° rotated file.
 		pageTerms: map[int][]string{
 			1: {"London School of Economics and Political Science"},
 			4: {"The purpose of this paper is to set Theil’s approach"},
@@ -273,10 +273,6 @@ var fileExtractionTests = []struct {
 			1: {"entropy of a system of n identical resonators in a stationary radiation field"},
 		},
 	},
-	// Case where combineDiacritics was combining ' and " with preceeding letters.
-	// NOTE(peterwilliams97): Part of the reason this test fails is that we don't currently read
-	// Type0:CIDFontType0 font metrics and assume zero displacemet so that we place the ' and " too
-	// close to the preceeding letters.
 	{filename: "/rfc6962.txt.pdf",
 		pageTerms: map[int][]string{
 			4: {"timestamps for certificates they then don’t log",
@@ -288,15 +284,14 @@ var fileExtractionTests = []struct {
 			10: {"الله"},
 		},
 	},
-	// TODO(peterwilliams97): Reinstate these 2 tests when diacritic combination is fixed.
-	// {filename: "Ito_Formula.pdf",
-	// 	pageTerms: map[int][]string{
-	// 		1: {"In the Itô stochastic calculus",
-	// 			"In standard, non-stochastic calculus, one computes a derivative"},
-	// 		2: {"Financial Economics Itô’s Formula"},
-	// 	},
-	// },
-	{filename: "thanh.pdf",
+	{filename: "Ito_Formula.pdf", // 90° rotated, with diacritics in separate textMarks from their base characters.
+		pageTerms: map[int][]string{
+			1: {"In the Itô stochastic calculus",
+				"In standard, non-stochastic calculus, one computes a derivative"},
+			2: {"Financial Economics Itô’s Formula"},
+		},
+	},
+	{filename: "thanh.pdf", // Diacritics in separate textMarks from their base characters.
 		pageTerms: map[int][]string{
 			1: {"Hàn Thế Thành"},
 			6: {"Petr Olšák"},