Cleaned up some comments and removed a panic

unidoc · gunnsth · Jun 30, 2020 · May 19, 2020 · May 19, 2020 · May 20, 2020
commit 91479a7c2bf934089c6e970c38171c49bfac5bac
diff --git a/extractor/README.md b/extractor/README.md
@@ -22,16 +22,26 @@ HOW TEXT IS EXTRACTED
 * The `textWords`s are grouped into `textParas`s based on their bounding boxes' proximities to other
  textWords.
 * The `textWord`s in each `textPara` are arranged into `textLine`s (`textWord`s of similar depth).
-* Within each `textLine`, `textWord`s are sorted in reading order each one that starts a whole word is marked.
+* Within each `textLine`, `textWord`s are sorted in reading order and each one that starts a whole
+word is marked.
 See `textLine.text()`.
 * `textPara.writeCellText()` shows how to extract the paragraph text from this arrangment.
 * All the `textPara`s on a page are checked to see if they are arranged as cells within a table and,
-if they are, they are combined into `textTable`s and a textPara containing the textTable replaces the
+if they are, they are combined into `textTable`s and a `textPara` containing the `textTable` replaces
 the `textPara`s containing the cells.
-* The textParas, some of which may be tables, in sorted into reading order (the order in which they
+* The `textPara`s, some of which may be tables, are sorted into reading order (the order in which they
 are reading, not in the reading directions).
 
 
+The entire order of extracted text from a page is expressed in `paraList.writeText()`, which:
+
+* Iterates through the `textPara`s, which are sorted in reading order.
+* For each `textPara` with a table, iterates through the table cell `textPara`s.
+* For each (top level or table cell) `textPara` iterates through the `textLine`s.
+* For each `textLine` iterates through the `textWord`s inserting a space before each one that has
+ the `newWord` flag set.
+
+
 ### `textWord` creation
 
 * `makeTextWords()` combines `textMark`s into `textWord`s, word fragments
@@ -54,4 +64,4 @@ TODO
 * Reinstate  diacritic composition.
 * Reinstate duplicate text removal.
 * Reinstate creater_test.go extraction test.
-
+* Come up with a better name for _reading_ direction.
diff --git a/extractor/text_bag.go b/extractor/text_bag.go
@@ -146,12 +146,8 @@ func (b *wordBag) scanBand(title string, para *wordBag,
 	return n
 }
 
-// highestword returns the hight word in b.bins[depthIdx] w: minDepth <= w.depth <= maxDepth.
-func (b *wordBag) highestword(depthIdx int, minDepth, maxDepth float64) *textWord {
-	if len(b.bins) == 0 {
-		panic("bbbin")
-		return nil
-	}
+// highestWord returns the highest word w in b.bins[depthIdx] such that minDepth <= w.depth <= maxDepth.
+func (b *wordBag) highestWord(depthIdx int, minDepth, maxDepth float64) *textWord {
 	for _, word := range b.bins[depthIdx] {
 		if minDepth <= word.depth && word.depth <= maxDepth {
 			return word
@@ -165,7 +161,6 @@ func (b *wordBag) depthBand(minDepth, maxDepth float64) []int {
 	if len(b.bins) == 0 {
 		return nil
 	}
-
 	return b.depthRange(b.getDepthIdx(minDepth), b.getDepthIdx(maxDepth))
 }
 
@@ -219,11 +214,12 @@ func (b *wordBag) empty(depthIdx int) bool {
 	return !ok
 }
 
+// firstWord returns the first word in reading order in bin `depthIdx`.
 func (b *wordBag) firstWord(depthIdx int) *textWord {
 	return b.bins[depthIdx][0]
 }
 
-// stratum returns a copy of `p`.bins[`depthIdx`].
+// stratum returns a copy of `b`.bins[`depthIdx`].
 // stratum is guaranteed to return a non-nil value. It must be called with a valid depth index.
 // NOTE: We need to return a copy because remove() and other functions manipulate the array
 // underlying the slice.

diff --git a/extractor/text_para.go b/extractor/text_para.go
@@ -231,7 +231,7 @@ func (b *wordBag) arrangeText() *textPara {
 				nextDepthIdx := 0      // nextWord's depthIndex
 				// We start with this highest remaining word
 				for _, depthIdx := range b.depthBand(minDepth, maxDepth) {
-					word := b.highestword(depthIdx, minDepth, maxDepth)
+					word := b.highestWord(depthIdx, minDepth, maxDepth)
 					if word == nil {
 						continue
 					}

diff --git a/extractor/text_word.go b/extractor/text_word.go
@@ -28,7 +28,7 @@ type textWord struct {
 	text               string      // The word fragment text.
 	marks              []*textMark // Marks in this word.
 	fontsize           float64     // Largest fontsize in the word.
-	newWord            bool        // Is this word fragemet the start of  a new word?
+	newWord            bool        // Is this word fragment the start of  a new word?
 }
 
 // makeTextPage combines `marks`, the textMarks on a page, into word fragments.