Commented code and removed unused functions.

unidoc · gunnsth · Jun 30, 2020 · May 19, 2020 · May 19, 2020 · May 20, 2020
commit 17bee4d907484f28d93859a2d8141c593cb09377
diff --git a/extractor/text_bound.go b/extractor/text_bound.go
@@ -38,19 +38,6 @@ func diffReading(a, b bounded) float64 {
 	return a.bbox().Llx - b.bbox().Llx
 }
 
-func boundedUnion(objs ...bounded) model.PdfRectangle {
-	rect := objs[0].bbox()
-	for _, r := range objs[1:] {
-		rect = rectUnion(rect, r.bbox())
-	}
-	return rect
-}
-
-// rectContainsBounded returns true if `a` contains `b`.
-func rectContainsBounded(a model.PdfRectangle, b bounded) bool {
-	return rectContainsRect(a, b.bbox())
-}
-
 // rectContainsRect returns true if `a` contains `b`.
 func rectContainsRect(a, b model.PdfRectangle) bool {
 	return a.Llx <= b.Llx && b.Urx <= a.Urx && a.Lly <= b.Lly && b.Ury <= a.Ury
@@ -110,21 +97,6 @@ func partial(overlap func(*wordBag, *textWord, float64) bool,
 	}
 }
 
-// overlapped returns true if `a` and `b` overlap.
-func overlapped(a, b bounded) bool {
-	return overlappedX(a, b) && overlappedY(a, b)
-}
-
-// overlappedX returns true if `a` and `b` overlap in the x direction.
-func overlappedX(a, b bounded) bool {
-	return intersectsX(a.bbox(), b.bbox())
-}
-
-// overlappedY returns true if `a` and `b` overlap in the y direction.
-func overlappedY(a, b bounded) bool {
-	return intersectsY(a.bbox(), b.bbox())
-}
-
 // rectUnion returns the smallest axis-aligned rectangle that contains `b1` and `b2`.
 func rectUnion(b1, b2 model.PdfRectangle) model.PdfRectangle {
 	return model.PdfRectangle{

diff --git a/extractor/text_line.go b/extractor/text_line.go
@@ -43,7 +43,7 @@ func (l *textLine) String() string {
 		l.serial, l.depth, l.PdfRectangle, l.fontsize, l.text())
 }
 
-// bbox makes textLine implementethe `bounded` interface.
+// bbox makes textLine implement the `bounded` interface.
 func (l *textLine) bbox() model.PdfRectangle {
 	return l.PdfRectangle
 }
@@ -104,7 +104,10 @@ func (l *textLine) markWordBoundaries() {
 	}
 }
 
-// endsInHyphen returns true if `l` has at least minHyphenation runes and end in a hyphen.
+// endsInHyphen attempts to detect words that are split between lines.
+// It currently returns true if `l` ends in a hyphen and its last minHyphenation runes don't contain
+// a space.
+// TODO(peterwilliams97): Figure out a better heuristic
 func (l *textLine) endsInHyphen() bool {
 	// Computing l.text() is a little expensive so we filter out simple cases first.
 	lastWord := l.words[len(l.words)-1]
@@ -115,7 +118,6 @@ func (l *textLine) endsInHyphen() bool {
 	if lastWord.newWord && endsInHyphen(runes) {
 		return true
 	}
-
 	return endsInHyphen([]rune(l.text()))
 }
 

diff --git a/extractor/text_para.go b/extractor/text_para.go
@@ -21,15 +21,16 @@ type paraList []*textPara
 
 // textPara is a group of words in a rectangular region of a page that get read together.
 // An peragraph in a document might span multiple pages. This is the paragraph framgent on one page.
-// We start by finding paragraph regions on a page, then we break the words into the textPara into
-// textLines.
+// textParas can be tables in which case the content is in `table`, otherwise the content is in `lines`.
+// textTable cells are textParas so this gives one level of recursion
 type textPara struct {
 	serial             int                // Sequence number for debugging.
 	model.PdfRectangle                    // Bounding box.
 	eBBox              model.PdfRectangle // Extended bounding box needed to compute reading order.
-	lines              []*textLine        // Paragraph text gets broken into lines.
-	table              *textTable         // A table in which the cells which textParas.
-	isCell             bool               // Is this para a cell in a textTable>
+	lines              []*textLine        // The lines in the paragraph. (nil for the table case)
+	table              *textTable         // The table contained in this region if there is one. nil otherwise
+	// The following fields are used for detecting and extracting tables.
+	isCell bool // Is this para a cell in a textTable?
 	// The unique highest para completely below this that overlaps it in the y-direction, if one exists.
 	right *textPara
 	// The unique highest para completely below `this that overlaps it in the x-direction, if one exists.
@@ -57,17 +58,14 @@ func (p *textPara) String() string {
 		p.serial, p.PdfRectangle, table, len(p.lines), truncate(p.text(), 50))
 }
 
-// text returns the text  of the lines in `p`.
-func (p *textPara) text() string {
-	w := new(bytes.Buffer)
-	p.writeText(w)
-	return w.String()
-}
-
+// depth returns the paragraph's depth, which is the depth of its top line.
+// We return the top line depth because textPara depth is used to tell if 2 paras have the same
+// depth. English readers compare paragraph depths by their top lines.
 func (p *textPara) depth() float64 {
 	if len(p.lines) > 0 {
 		return p.lines[0].depth
 	}
+	// Use the top left cell of the table if there is one
 	return p.table.get(0, 0).depth()
 }
 
@@ -199,8 +197,7 @@ func (p *textPara) fontsize() float64 {
 // The textWords in each line are sorted in reading order and those that start whole words (as
 // opposed to word fragments) have their `newWord` flag set to true.
 func (b *wordBag) arrangeText() *textPara {
-	// Sort the words in `b`'s bins in the reading direction.
-	b.sort()
+	b.sort() // Sort the words in `b`'s bins in the reading direction.
 
 	var lines []*textLine
 
@@ -257,7 +254,6 @@ func (b *wordBag) arrangeText() *textPara {
 
 			line.markWordBoundaries()
 			lines = append(lines, line)
-
 		}
 	}
 
@@ -304,3 +300,11 @@ func (paras paraList) log(title string) {
 		fmt.Printf("%4d: %6.2f %s %q\n", i, para.PdfRectangle, tabl, truncate(text, 50))
 	}
 }
+
+// text returns the text of the lines in `p`.
+// NOTE: For debugging only.
+func (p *textPara) text() string {
+	w := new(bytes.Buffer)
+	p.writeText(w)
+	return w.String()
+}
diff --git a/extractor/text_table.go b/extractor/text_table.go
@@ -13,10 +13,21 @@ import (
 	"github.com/unidoc/unipdf/v3/model"
 )
 
+// textTable is a table of `w` x `h` textPara cells.
 type textTable struct {
-	model.PdfRectangle
-	w, h  int
-	cells map[uint64]*textPara
+	model.PdfRectangle                      // Bounding rectangle.
+	w, h               int                  // w=number of columns. h=number of rows.
+	cells              map[uint64]*textPara // The cells
+}
+
+// String returns a description of `t`.
+func (t *textTable) String() string {
+	return fmt.Sprintf("%d x %d", t.w, t.h)
+}
+
+// bbox makes textTable implement the `bounded` interface.
+func (t *textTable) bbox() model.PdfRectangle {
+	return t.PdfRectangle
 }
 
 // extractTables converts the`paras` that are table cells to tables containing those cells.
@@ -27,22 +38,17 @@ func (paras paraList) extractTables() paraList {
 	if len(paras) < minTableParas {
 		return paras
 	}
-
 	tables := paras.findTables()
-
 	if verboseTable {
 		common.Log.Info("combined tables %d ================", len(tables))
 		for i, t := range tables {
 			t.log(fmt.Sprintf("combined %d", i))
 		}
 	}
-
-	paras = paras.applyTables(tables)
-
-	return paras
+	return paras.applyTables(tables)
 }
 
-// findTables returns all the 2x2 table candidateds in `paras`.
+// findTables returns all the tables in `paras`.
 func (paras paraList) findTables() []*textTable {
 	paras.addNeighbours()
 	// Pre-sort by reading direction then depth
@@ -72,17 +78,17 @@ func (paras paraList) findTables() []*textTable {
 	return tables
 }
 
-// Attempr to build the smallest possible table fragment of 2 x 2 cells.
-// If it can be built then return it. Otherwise return nil.
+// isAtom attempts to build the smallest possible table fragment of 2 x 2 cells.
+// If a table can be built then it is returned. Otherwise nil is returned.
 // The smallest possible table is
 //   a b
 //   c d
 // where
-//   a is `para`
-//   b is immediately to the right of a and overlaps it in the y axis
-//   c is immediately below a and ooverlaps it in the x axis
-//   d is immediately to the right of c and overlaps it in the x axis and
-//        immediately below b and ooverlaps it in the y axis
+//   a is `para`.
+//   b is immediately to the right of a and overlaps it in the y axis.
+//   c is immediately below a and overlaps it in the x axis.
+//   d is immediately to the right of c and overlaps it in the y axis and
+//        immediately below b and overlaps it in the x axis.
 //   None of a, b, c or d are cells in existing tables.
 func (para *textPara) isAtom() *textTable {
 	a := para
@@ -97,7 +103,7 @@ func (para *textPara) isAtom() *textTable {
 	return nil
 }
 
-// newTable returns a table containg the a, b, c, d elements from isAtom().
+// newTableAtom returns a table containing the a, b, c, d elements from isAtom().
 func newTableAtom(a, b, c, d *textPara) *textTable {
 	t := &textTable{w: 2, h: 2, cells: map[uint64]*textPara{}}
 	t.put(0, 0, a)
@@ -107,6 +113,11 @@ func newTableAtom(a, b, c, d *textPara) *textTable {
 	return t
 }
 
+// growTable grows `t` to the largest w x h it can while remaining a valid table.
+// It repeatedly tries to extend by one row and/or column
+//    - down and right, then
+//    - down, then
+//    - right.
 func (t *textTable) growTable() {
 	growDown := func(down paraList) {
 		t.h++
@@ -150,6 +161,7 @@ func (t *textTable) growTable() {
 	}
 }
 
+// getDown returns the row of cells below `t` if they are a valid extension to `t` or nil if they aren't.
 func (t *textTable) getDown() paraList {
 	cells := make(paraList, t.w)
 	for x := 0; x < t.w; x++ {
@@ -167,6 +179,8 @@ func (t *textTable) getDown() paraList {
 	return cells
 }
 
+// getRight returns the column of cells to the right of `t` if they are a valid extension to `t` or nil
+// if they aren't.
 func (t *textTable) getRight() paraList {
 	cells := make(paraList, t.h)
 	for y := 0; y < t.h; y++ {
@@ -184,7 +198,7 @@ func (t *textTable) getRight() paraList {
 	return cells
 }
 
-// applyTables replaces the paras that re  cells in `tables` with paras containing the tables in
+// applyTables replaces the paras that are cells in `tables` with paras containing the tables in
 //`tables`. This, of course, reduces the number of paras.
 func (paras paraList) applyTables(tables []*textTable) paraList {
 	consumed := map[*textPara]struct{}{}
@@ -214,20 +228,7 @@ func (t *textTable) markCells() {
 	}
 }
 
-func (t *textTable) log(title string) {
-	if !verboseTable {
-		return
-	}
-	common.Log.Info("~~~ %s: %s: %d x %d\n      %6.2f", title, fileLine(1, false),
-		t.w, t.h, t.PdfRectangle)
-	for y := 0; y < t.h; y++ {
-		for x := 0; x < t.w; x++ {
-			p := t.get(x, y)
-			fmt.Printf("%4d %2d: %6.2f %q\n", x, y, p.PdfRectangle, truncate(p.text(), 50))
-		}
-	}
-}
-
+// newTablePara returns a textPara containing `t`.
 func (t *textTable) newTablePara() *textPara {
 	bbox := t.computeBbox()
 	para := textPara{
@@ -241,6 +242,7 @@ func (t *textTable) newTablePara() *textPara {
 	return &para
 }
 
+// computeBbox computes and returns the bounding box of `t`.
 func (t *textTable) computeBbox() model.PdfRectangle {
 	r := t.get(0, 0).PdfRectangle
 	for x := 1; x < t.w; x++ {
@@ -266,26 +268,32 @@ func (t *textTable) toTextTable() TextTable {
 	return TextTable{W: t.w, H: t.h, Cells: cells}
 }
 
-func cellIndex(x, y int) uint64 {
-	return uint64(x)*0x1000000 + uint64(y)
-}
-
+// get returns the cell at `x`, `y`.
 func (t *textTable) get(x, y int) *textPara {
 	return t.cells[cellIndex(x, y)]
 }
 
+// put sets the cell at `x`, `y` to `cell`.
 func (t *textTable) put(x, y int, cell *textPara) {
 	t.cells[cellIndex(x, y)] = cell
 }
 
-func (t *textTable) del(x, y int) {
-	delete(t.cells, cellIndex(x, y))
-}
-
-func (t *textTable) bbox() model.PdfRectangle {
-	return t.PdfRectangle
+// cellIndex returns a number that will be different for different `x` and `y` for any table found
+// in a PDF, provided the table is less than 2^24 cells wide and high (`x` is scaled by 0x1000000).
+func cellIndex(x, y int) uint64 {
+	return uint64(x)*0x1000000 + uint64(y)
 }
 
-func (t *textTable) String() string {
-	return fmt.Sprintf("%d x %d", t.w, t.h)
+func (t *textTable) log(title string) {
+	if !verboseTable {
+		return
+	}
+	common.Log.Info("~~~ %s: %d x %d\n      %6.2f", title,
+		t.w, t.h, t.PdfRectangle)
+	for y := 0; y < t.h; y++ {
+		for x := 0; x < t.w; x++ {
+			p := t.get(x, y)
+			fmt.Printf("%4d %2d: %6.2f %q\n", x, y, p.PdfRectangle, truncate(p.text(), 50))
+		}
+	}
 }
diff --git a/extractor/text_utils.go b/extractor/text_utils.go
@@ -6,10 +6,7 @@
 package extractor
 
 import (
-	"fmt"
 	"math"
-	"path/filepath"
-	"runtime"
 	"sort"
 )
 
@@ -56,23 +53,6 @@ func maxInt(a, b int) int {
 	return b
 }
 
-// fileLine printed out a file:line string for the caller `skip` levels up the call stack.
-func fileLine(skip int, doSecond bool) string {
-	_, file, line, ok := runtime.Caller(skip + 1)
-	if !ok {
-		file = "???"
-		line = 0
-	} else {
-		file = filepath.Base(file)
-	}
-	depth := fmt.Sprintf("%s:%-4d", file, line)
-	if !doSecond {
-		return depth
-	}
-	_, _, line2, _ := runtime.Caller(skip + 2)
-	return fmt.Sprintf("%s:%-4d", depth, line2)
-}
-
 // addNeighbours fills out the below and right fields of the paras in `paras`.
 // For each para `a`:
 //    a.below is the unique highest para completely below `a` that overlaps it in the x-direction
@@ -147,12 +127,14 @@ func (paras paraList) yNeighbours() map[*textPara][]int {
 	return paras.eventNeighbours(events)
 }
 
+// event is an entry or exit from an interval while scanning.
 type event struct {
-	z     float64
-	enter bool
-	i     int
+	z     float64 // Coordinate in the scanning direction.
+	enter bool    // True if entering the interval, false if leaving.
+	i     int     // Index of the interval
 }
 
+// eventNeighbours returns a map {para: indexes of paras that overlap para in `events`}.
 func (paras paraList) eventNeighbours(events []event) map[*textPara][]int {
 	sort.Slice(events, func(i, j int) bool {
 		ei, ej := events[i], events[j]