Addressed issues in PR review

unidoc · gunnsth · Jun 30, 2020 · May 19, 2020 · May 19, 2020 · May 20, 2020
commit 3cca58106533ad41cb3027d16cd85e670450480b
diff --git a/extractor/README.md b/extractor/README.md
@@ -58,10 +58,7 @@ The entire order of extracted text from a page is expressed in `paraList.writeTe
 TODO
 -----
 
-* Remove serial code?
 * Remove `verbose*` logging?
-* Come up with a better name for *reading* direction. Scanning direction? [Word order](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2694615/)?
 * Handle diagonal text.
 * Get R to L text extraction working.
 * Get top to bottom text extraction working.
-* Remove TM from ligature map.
diff --git a/extractor/const.go b/extractor/const.go
@@ -5,4 +5,9 @@
 
 package extractor
 
+import "errors"
+
 var isTesting = false
+var (
+	errTypeCheck = errors.New("type check error")
+)
diff --git a/extractor/extractor.go b/extractor/extractor.go
@@ -35,7 +35,6 @@ type Extractor struct {
 
 // New returns an Extractor instance for extracting content from the input PDF page.
 func New(page *model.PdfPage) (*Extractor, error) {
-	serial.reset()
 	contents, err := page.GetAllContentStreams()
 	if err != nil {
 		return nil, err
@@ -61,7 +60,6 @@ func New(page *model.PdfPage) (*Extractor, error) {
 }
 
 // NewFromContents creates a new extractor from contents and page resources.
-// XXX(peterwilliams97). Does anyone use this?
 func NewFromContents(contents string, resources *model.PdfPageResources) (*Extractor, error) {
 	e := &Extractor{
 		contents:    contents,

diff --git a/extractor/image.go b/extractor/image.go
@@ -124,7 +124,7 @@ func (ctx *imageExtractContext) processOperand(op *contentstream.ContentStreamOp
 		name, ok := core.GetName(op.Params[0])
 		if !ok {
 			common.Log.Debug("ERROR: Type")
-			return core.ErrTypeError
+			return errTypeCheck
 		}
 
 		_, xtype := resources.GetXObjectByName(*name)

diff --git a/extractor/text.go b/extractor/text.go
@@ -1065,7 +1065,7 @@ func (ma *TextMarkArray) BBox() (model.PdfRectangle, bool) {
 //      bbox, ok := spanMarks.BBox()
 //      // handle errors
 type TextMark struct {
-	// Text is the extracted text. It has been decoded to Unicode via ToUnicode().
+	// Text is the extracted text.
 	Text string
 	// Original is the text in the PDF. It has not been decoded like `Text`.
 	Original string
@@ -1084,8 +1084,6 @@ type TextMark struct {
 	// spaces (line breaks) when we see characters that are over a threshold horizontal (vertical)
 	//  distance  apart. See wordJoiner (lineJoiner) in PageText.computeViews().
 	Meta bool
-	// For debugging
-	count int64
 }
 
 // String returns a string describing `tm`.
@@ -1102,8 +1100,8 @@ func (tm TextMark) String() string {
 	if tm.Meta {
 		meta = " *M*"
 	}
-	return fmt.Sprintf("{@%04d TextMark: %d %q=%02x (%6.2f, %6.2f) (%6.2f, %6.2f) %s%s}",
-		tm.count, tm.Offset, tm.Text, []rune(tm.Text), b.Llx, b.Lly, b.Urx, b.Ury, font, meta)
+	return fmt.Sprintf("{TextMark: %d %q=%02x (%6.2f, %6.2f) (%6.2f, %6.2f) %s%s}",
+		tm.Offset, tm.Text, []rune(tm.Text), b.Llx, b.Lly, b.Urx, b.Ury, font, meta)
 }
 
 // spaceMark is a special TextMark used for spaces.
@@ -1119,7 +1117,15 @@ var spaceMark = TextMark{
 // Cells[y][x] is the (0-offset) x'th column in the table.
 type TextTable struct {
 	W, H  int
-	Cells [][]string
+	Cells [][]TableCell
+}
+
+// TableCell is a cell in a TextTable.
+type TableCell struct {
+	// Text is the extracted text.
+	Text string
+	// Marks holds the TextMarks corresponding to the text in Text.
+	Marks TextMarkArray
 }
 
 // getCurrentFont returns the font on top of the font stack, or DefaultFont if the font stack is

diff --git a/extractor/text_bag.go b/extractor/text_bag.go
@@ -22,7 +22,6 @@ import (
 // In the current implementation, wordBag is a list of word fragment bins arranged by their depth on
 // a page with the word fragments  in each bin are sorted in reading order.
 type wordBag struct {
-	serial             int     // Sequence number for debugging.
 	model.PdfRectangle         // Bounding box of all the textWord in the wordBag.
 	fontsize           float64 // The size of the largest font in the wordBag.
 	// The following fields are for the current bin based implementation
@@ -48,13 +47,11 @@ func newWordBag(word *textWord, pageHeight float64) *wordBag {
 	depthIdx := depthIndex(word.depth)
 	words := []*textWord{word}
 	bag := wordBag{
-		serial:       serial.wordBag,
 		bins:         map[int][]*textWord{depthIdx: words},
 		PdfRectangle: word.PdfRectangle,
 		fontsize:     word.fontsize,
 		pageHeight:   pageHeight,
 	}
-	serial.wordBag++
 	return &bag
 }
 
@@ -67,8 +64,7 @@ func (b *wordBag) String() string {
 			texts = append(texts, w.text)
 		}
 	}
-	return fmt.Sprintf("serial=%d %.2f fontsize=%.2f %d %q",
-		b.serial, b.PdfRectangle, b.fontsize, len(texts), texts)
+	return fmt.Sprintf("%.2f fontsize=%.2f %d %q", b.PdfRectangle, b.fontsize, len(texts), texts)
 }
 
 // scanBand scans the bins for words w:

diff --git a/extractor/text_line.go b/extractor/text_line.go
@@ -15,7 +15,6 @@ import (
 
 // textLine repesents words on the same line within a textPara.
 type textLine struct {
-	serial             int         // Sequence number for debugging.
 	model.PdfRectangle             // Bounding box (union of `marks` bounding boxes).
 	depth              float64     // Distance from bottom of line to top of page.
 	words              []*textWord // Words in this line.
@@ -27,20 +26,18 @@ type textLine struct {
 func newTextLine(b *wordBag, depthIdx int) *textLine {
 	word := b.firstWord(depthIdx)
 	line := textLine{
-		serial:       serial.line,
 		PdfRectangle: word.PdfRectangle,
 		fontsize:     word.fontsize,
 		depth:        word.depth,
 	}
-	serial.line++
 	line.pullWord(b, word, depthIdx)
 	return &line
 }
 
 // String returns a description of `l`.
 func (l *textLine) String() string {
-	return fmt.Sprintf("serial=%d %.2f %6.2f fontsize=%.2f \"%s\"",
-		l.serial, l.depth, l.PdfRectangle, l.fontsize, l.text())
+	return fmt.Sprintf("%.2f %6.2f fontsize=%.2f \"%s\"",
+		l.depth, l.PdfRectangle, l.fontsize, l.text())
 }
 
 // bbox makes textLine implement the `bounded` interface.

diff --git a/extractor/text_mark.go b/extractor/text_mark.go
@@ -17,7 +17,6 @@ import (
 // textMark represents text drawn on a page and its position in device coordinates.
 // All dimensions are in device coordinates.
 type textMark struct {
-	serial             int                // Sequence number for debugging.
 	model.PdfRectangle                    // Bounding box oriented so character base is at bottom
 	orient             int                // Orientation
 	text               string             // The text (decoded via ToUnicode).
@@ -118,20 +117,16 @@ func (to *textObject) newTextMark(text string, trm transform.Matrix, end transfo
 		trm:          trm,
 		end:          end,
 		orient:       orient,
-		serial:       serial.mark,
 	}
-	serial.mark++
 	if verboseGeom {
 		common.Log.Info("newTextMark: start=%.2f end=%.2f %s", start, end, tm.String())
 	}
-
 	return tm, onPage
 }
 
 // String returns a description of `tm`.
 func (tm *textMark) String() string {
-	return fmt.Sprintf("serial=%d %.2f fontsize=%.2f \"%s\"",
-		tm.serial, tm.PdfRectangle, tm.fontsize, tm.text)
+	return fmt.Sprintf("%.2f fontsize=%.2f \"%s\"", tm.PdfRectangle, tm.fontsize, tm.text)
 }
 
 // bbox makes textMark implement the `bounded` interface.
@@ -142,7 +137,6 @@ func (tm *textMark) bbox() model.PdfRectangle {
 // ToTextMark returns the public view of `tm`.
 func (tm *textMark) ToTextMark() TextMark {
 	return TextMark{
-		count:    int64(tm.serial),
 		Text:     tm.text,
 		Original: tm.original,
 		BBox:     tm.originaBBox,

diff --git a/extractor/text_para.go b/extractor/text_para.go
@@ -21,11 +21,10 @@ import (
 type paraList []*textPara
 
 // textPara is a group of words in a rectangular region of a page that get read together.
-// An peragraph in a document might span multiple pages. This is the paragraph framgent on one page.
+// A paragraph in a document might span multiple pages. This is the paragraph fragment on one page.
 // textParas can be tables in which case the content is in `table`, otherwise the content is in `lines`.
 // textTable cells are textParas so this gives one level of recursion
 type textPara struct {
-	serial             int                // Sequence number for debugging.
 	model.PdfRectangle                    // Bounding box.
 	eBBox              model.PdfRectangle // Extended bounding box needed to compute reading order.
 	lines              []*textLine        // The lines in the paragraph. (nil for the table case)
@@ -40,13 +39,7 @@ type textPara struct {
 
 // makeTextPara returns a textPara with bounding rectangle `bbox`.
 func makeTextPara(bbox model.PdfRectangle, lines []*textLine) *textPara {
-	para := textPara{
-		serial:       serial.para,
-		PdfRectangle: bbox,
-		lines:        lines,
-	}
-	serial.para++
-	return &para
+	return &textPara{PdfRectangle: bbox, lines: lines}
 }
 
 // String returns a description of `p`.
@@ -55,8 +48,8 @@ func (p *textPara) String() string {
 	if p.table != nil {
 		table = fmt.Sprintf("[%dx%d] ", p.table.w, p.table.h)
 	}
-	return fmt.Sprintf("serial=%d %6.2f %s%d lines %q",
-		p.serial, p.PdfRectangle, table, len(p.lines), truncate(p.text(), 50))
+	return fmt.Sprintf("%6.2f %s%d lines %q",
+		p.PdfRectangle, table, len(p.lines), truncate(p.text(), 50))
 }
 
 // depth returns the paragraph's depth. which is the depth of its top line.

diff --git a/extractor/text_table.go b/extractor/text_table.go
@@ -231,15 +231,11 @@ func (t *textTable) markCells() {
 // newTablePara returns a textPara containing `t`.
 func (t *textTable) newTablePara() *textPara {
 	bbox := t.computeBbox()
-	para := textPara{
-		serial:       serial.para,
+	return &textPara{
 		PdfRectangle: bbox,
 		eBBox:        bbox,
 		table:        t,
 	}
-	t.log(fmt.Sprintf("newTablePara: serial=%d", para.serial))
-	serial.para++
-	return &para
 }
 
 // computeBbox computes and returns the bounding box of `t`.
@@ -258,11 +254,14 @@ func (t *textTable) computeBbox() model.PdfRectangle {
 
 // toTextTable returns the TextTable corresponding to `t`.
 func (t *textTable) toTextTable() TextTable {
-	cells := make([][]string, t.h)
+	cells := make([][]TableCell, t.h)
 	for y := 0; y < t.h; y++ {
-		cells[y] = make([]string, t.w)
+		cells[y] = make([]TableCell, t.w)
 		for x := 0; x < t.w; x++ {
-			cells[y][x] = t.get(x, y).text()
+			c := t.get(x, y)
+			cells[y][x].Text = c.text()
+			offset := 0
+			cells[y][x].Marks.marks = c.toTextMarks(&offset)
 		}
 	}
 	return TextTable{W: t.w, H: t.h, Cells: cells}

diff --git a/extractor/text_utils.go b/extractor/text_utils.go
@@ -11,24 +11,6 @@ import (
 	"unicode"
 )
 
-// serial is used to add serial numbers to all text* instances.
-var serial serialState
-
-// serialState keeps serial number for text* structs.
-type serialState struct {
-	mark    int // textMark
-	word    int // textWord
-	wordBag int // wordBag
-	line    int // textLine
-	para    int // textPara
-}
-
-// reset resets `serial` to all zeros.
-func (serial *serialState) reset() {
-	var empty serialState
-	*serial = empty
-}
-
 // TOL is the tolerance for coordinates to be consideted equal. It is big enough to cover all
 // rounding errors and small enough that TOL point differences on a page aren't visible.
 const TOL = 1.0e-6

diff --git a/extractor/text_word.go b/extractor/text_word.go
@@ -23,7 +23,6 @@ import (
 //  - A textLine is the textWords at similar depths sorted in reading order.
 //  - All textWords, w, in the textLine that start whole words have w.newWord = true
 type textWord struct {
-	serial             int         // Sequence number for debugging.
 	model.PdfRectangle             // Bounding box (union of `marks` bounding boxes).
 	depth              float64     // Distance from bottom of this word to the top of the page.
 	text               string      // The word fragment text.
@@ -122,21 +121,18 @@ func newTextWord(marks []*textMark, pageSize model.PdfRectangle) *textWord {
 		}
 	}
 
-	word := textWord{
-		serial:       serial.word,
+	return &textWord{
 		PdfRectangle: r,
 		marks:        marks,
 		depth:        pageSize.Ury - r.Lly,
 		fontsize:     fontsize,
 	}
-	serial.word++
-	return &word
 }
 
 // String returns a description of `w`.
 func (w *textWord) String() string {
-	return fmt.Sprintf("serial=%d %.2f %6.2f fontsize=%.2f \"%s\"",
-		w.serial, w.depth, w.PdfRectangle, w.fontsize, w.text)
+	return fmt.Sprintf("%.2f %6.2f fontsize=%.2f \"%s\"",
+		w.depth, w.PdfRectangle, w.fontsize, w.text)
 }
 
 // bbox makes textWord implement the `bounded` interface.

diff --git a/internal/textencoding/glyphs_glyphlist.go b/internal/textencoding/glyphs_glyphlist.go
@@ -148,15 +148,13 @@ var ligatureToString = map[rune]string{
 	'œ':          "oe",
 	'Ꝏ':          "OO",
 	'ꝏ':          "oo",
-	// 'ẞ':          "fs",
-	// 'ß':          "fz",
-	'ﬆ': "st",
-	'ﬅ': "ſt",
-	'Ꜩ': "TZ",
-	'ꜩ': "tz",
-	'ᵫ': "ue",
-	'Ꝡ': "VY",
-	'ꝡ': "vy",
+	'ﬆ':          "st",
+	'ﬅ':          "ſt",
+	'Ꜩ':          "TZ",
+	'ꜩ':          "tz",
+	'ᵫ':          "ue",
+	'Ꝡ':          "VY",
+	'ꝡ':          "vy",
 	// Reverse of ligatureMap
 	0xe000: "ft",
 	0xe001: "fj",