Combine diacritic textMarks in text extraction

unidoc · gunnsth · Jun 30, 2020 · May 19, 2020 · May 19, 2020 · May 20, 2020
commit 933021cfef936110526e1b818d9eb5c6b7de33b9
diff --git a/extractor/README.md b/extractor/README.md
@@ -10,7 +10,6 @@ In English text,
 - the *reading* direction is left to right, increasing X in the PDF coordinate system.
 - the *depth* directon is top to bottom, decreasing Y in the PDF coordinate system.
 
-
 HOW TEXT IS EXTRACTED
 ---------------------
 
@@ -62,8 +61,7 @@ TODO
 * Remove serial code?
 * Remove verbose* logging?
 * Reinstate rotated text handling.
-* Reinstate  diacritic composition.
-* Reinstate duplicate text removal.
-* Come up with a better name for *reading* direction.
+* Come up with a better name for *reading* direction. Scanning direction? [Word order](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC2694615/)?
 * Get R to L text extraction working.
 * Get top to bottom text extraction working.
+* Remove TM from ligature map.
diff --git a/extractor/text.go b/extractor/text.go
@@ -12,7 +12,6 @@ import (
 	"math"
 	"sort"
 	"strings"
-	"unicode"
 
 	"github.com/unidoc/unipdf/v3/common"
 	"github.com/unidoc/unipdf/v3/contentstream"
@@ -877,16 +876,6 @@ func (to *textObject) moveTo(tx, ty float64) {
 	to.tm = to.tlm
 }
 
-// isTextSpace returns true if `text` contains nothing but space code points.
-func isTextSpace(text string) bool {
-	for _, r := range text {
-		if !unicode.IsSpace(r) {
-			return false
-		}
-	}
-	return true
-}
-
 // PageText represents the layout of text on a device page.
 type PageText struct {
 	marks      []*textMark        // Texts and their positions on a PDF page.

diff --git a/extractor/text_bag.go b/extractor/text_bag.go
@@ -289,7 +289,7 @@ func mergWordBags(paraWords []*wordBag) []*wordBag {
 			}
 			para1 := paraWords[i1]
 			r := para0.PdfRectangle
-			r.Llx -= para0.fontsize * 0.99
+			r.Llx -= para0.fontsize
 			if rectContainsRect(r, para1.PdfRectangle) {
 				para0.absorb(para1)
 				absorbed[i1] = struct{}{}

diff --git a/extractor/text_const.go b/extractor/text_const.go
@@ -18,8 +18,10 @@ const (
 
 // The following constants control the approaches used in the code.
 const (
-	doHyphens = true
-	useEBBox  = false
+	doHyphens           = true
+	doRemoveDuplicates  = true
+	doCombineDiacritics = true
+	useEBBox            = false
 )
 
 // The following constants are the tuning parameter for text extracton
@@ -67,13 +69,18 @@ const (
 	// Maximum spacing between characters within a line.
 	maxIntraLineGapR = 0.02
 
-	// Max difference in coordinates of duplicated textWords.
+	// Maximum difference in coordinates of duplicated textWords.
 	maxDuplicateWordR = 0.2
 
+	// Maximum distance from a character to its diacritic marks as a fraction of the character size.
+	diacriticRadiusR = 0.5
+
+	// Minimum number of runes in the first half of a hyphenated word.
 	minHyphenation = 4
 
 	// The distance we look down from the top of a wordBag for the leftmost word.
 	topWordRangeR = 4.0
-	// minimum number of cells in a textTable
+
+	// Minimum number of cells in a textTable
 	minTableParas = 6
 )
diff --git a/extractor/text_para.go b/extractor/text_para.go
@@ -70,6 +70,13 @@ func (p *textPara) depth() float64 {
 	return p.table.get(0, 0).depth()
 }
 
+// text is a convenience function that returns the text of `p`, including tables.
+func (p *textPara) text() string {
+	w := new(bytes.Buffer)
+	p.writeText(w)
+	return w.String()
+}
+
 // writeText writes the text of `p` including tables to `w`.
 func (p *textPara) writeText(w io.Writer) {
 	if p.table == nil {
@@ -133,7 +140,7 @@ func (p *textPara) writeCellText(w io.Writer) {
 }
 
 // toCellTextMarks creates the TextMarkArray corresponding to the extracted text created by
-// paras `paras`.writeCellText().
+// `p`.writeCellText().
 func (p *textPara) toCellTextMarks(offset *int) []TextMark {
 	var marks []TextMark
 	for il, line := range p.lines {
@@ -150,7 +157,7 @@ func (p *textPara) toCellTextMarks(offset *int) []TextMark {
 	return marks
 }
 
-// removeLastTextMarkRune removes the last run from `marks`.
+// removeLastTextMarkRune removes the last rune from `marks`.
 func removeLastTextMarkRune(marks []TextMark, offset *int) []TextMark {
 	tm := marks[len(marks)-1]
 	runes := []rune(tm.Text)
@@ -235,7 +242,9 @@ func (b *wordBag) removeDuplicates() {
 func (b *wordBag) arrangeText() *textPara {
 	b.sort() // Sort the words in `b`'s bins in the reading direction.
 
-	b.removeDuplicates()
+	if doRemoveDuplicates {
+		b.removeDuplicates()
+	}
 
 	var lines []*textLine
 
@@ -342,11 +351,3 @@ func (paras paraList) log(title string) {
 		fmt.Printf("%4d: %6.2f %s %q\n", i, para.PdfRectangle, tabl, truncate(text, 50))
 	}
 }
-
-// text returns the text  of the lines in `p`.
-// NOTE: For debugging only/
-func (p *textPara) text() string {
-	w := new(bytes.Buffer)
-	p.writeText(w)
-	return w.String()
-}
diff --git a/extractor/text_utils.go b/extractor/text_utils.go
@@ -8,6 +8,7 @@ package extractor
 import (
 	"math"
 	"sort"
+	"unicode"
 )
 
 // serial is used to add serial numbers to all text* instances.
@@ -178,3 +179,71 @@ func (paras paraList) eventNeighbours(events []event) map[*textPara][]int {
 	}
 	return paraNeighbors
 }
+
+// isTextSpace returns true if `text` contains nothing but space code points.
+func isTextSpace(text string) bool {
+	for _, r := range text {
+		if !unicode.IsSpace(r) {
+			return false
+		}
+	}
+	return true
+}
+
+// combiningDiacritic returns the combining version of `text` if text contains a single uncombined
+// diacritic rune.
+func combiningDiacritic(text string) (string, bool) {
+	runes := []rune(text)
+	if len(runes) != 1 {
+		return "", false
+	}
+	combining, isDiacritic := diacriticsToCombining[runes[0]]
+	return combining, isDiacritic
+}
+
+var (
+	// diacriticsToCombining is a map of diacritic runes to their combining diacritic equivalents.
+	// These values were copied from PDFBox (https://svn.apache.org/repos/asf/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java).
+	diacriticsToCombining = map[rune]string{
+		0x0060: "\u0300", //   ` -> ò
+		0x02CB: "\u0300", //   ˋ -> ò
+		0x0027: "\u0301", //   ' -> ó
+		0x00B4: "\u0301", //   ´ -> ó
+		0x02B9: "\u0301", //   ʹ -> ó
+		0x02CA: "\u0301", //   ˊ -> ó
+		0x005E: "\u0302", //   ^ -> ô
+		0x02C6: "\u0302", //   ˆ -> ô
+		0x007E: "\u0303", //   ~ -> õ
+		0x02DC: "\u0303", //   ˜ -> õ
+		0x00AF: "\u0304", //   ¯ -> ō
+		0x02C9: "\u0304", //   ˉ -> ō
+		0x02D8: "\u0306", //   ˘ -> ŏ
+		0x02D9: "\u0307", //   ˙ -> ȯ
+		0x00A8: "\u0308", //   ¨ -> ö
+		0x00B0: "\u030A", //   ° -> o̊
+		0x02DA: "\u030A", //   ˚ -> o̊
+		0x02BA: "\u030B", //   ʺ -> ő
+		0x02DD: "\u030B", //   ˝ -> ő
+		0x02C7: "\u030C", //   ˇ -> ǒ
+		0x02C8: "\u030D", //   ˈ -> o̍
+		0x0022: "\u030E", //   " -> o̎
+		0x02BB: "\u0312", //   ʻ -> o̒
+		0x02BC: "\u0313", //   ʼ -> o̓
+		0x0486: "\u0313", //   ҆ -> o̓
+		0x055A: "\u0313", //   ՚ -> o̓
+		0x02BD: "\u0314", //   ʽ -> o̔
+		0x0485: "\u0314", //   ҅ -> o̔
+		0x0559: "\u0314", //   ՙ -> o̔
+		0x02D4: "\u031D", //   ˔ -> o̝
+		0x02D5: "\u031E", //   ˕ -> o̞
+		0x02D6: "\u031F", //   ˖ -> o̟
+		0x02D7: "\u0320", //   ˗ -> o̠
+		0x02B2: "\u0321", //   ʲ -> o̡
+		0x00B8: "\u0327", //   ¸ -> o̧
+		0x02CC: "\u0329", //   ˌ -> o̩
+		0x02B7: "\u032B", //   ʷ -> o̫
+		0x02CD: "\u0331", //   ˍ -> o̱
+		0x005F: "\u0332", //   _ -> o̲
+		0x204E: "\u0359", //   ⁎ -> o͙
+	}
+)
diff --git a/extractor/text_word.go b/extractor/text_word.go
@@ -12,6 +12,7 @@ import (
 
 	"github.com/unidoc/unipdf/v3/common"
 	"github.com/unidoc/unipdf/v3/model"
+	"golang.org/x/text/unicode/norm"
 )
 
 // textWord represents a word fragment.
@@ -59,16 +60,38 @@ func makeTextWords(marks []*textMark, pageSize model.PdfRectangle) []*textWord {
 	}
 
 	for _, tm := range marks {
-		isSpace := isTextSpace(tm.text)
-		if newWord == nil && !isSpace {
-			newWord = newTextWord([]*textMark{tm}, pageSize)
-			continue
+		if doCombineDiacritics {
+			// Combine diacritic marks into neighbouring non-diacritic marks.
+			if newWord != nil && len(newWord.marks) > 0 {
+				prev := newWord.marks[len(newWord.marks)-1]
+				text, isDiacritic := combiningDiacritic(tm.text)
+				prevText, prevDiacritic := combiningDiacritic(prev.text)
+				if isDiacritic && !prevDiacritic && prev.inDiacriticArea(tm) {
+					newWord.addDiacritic(text)
+					continue
+				}
+				if prevDiacritic && !isDiacritic && tm.inDiacriticArea(prev) {
+					// The previous mark was the diacritic: replace it with this mark, then attach its combining form.
+					newWord.marks = newWord.marks[:len(newWord.marks)-1]
+					newWord.addMark(tm, pageSize)
+					newWord.addDiacritic(prevText)
+					continue
+				}
+			}
 		}
+
+		// Check for spaces between words.
+		isSpace := isTextSpace(tm.text)
 		if isSpace {
 			addNewWord()
 			continue
 		}
 
+		if newWord == nil && !isSpace {
+			newWord = newTextWord([]*textMark{tm}, pageSize)
+			continue
+		}
+
 		fontsize := newWord.fontsize
 		depthGap := math.Abs(getDepth(pageSize, tm)-newWord.depth) / fontsize
 		readingGap := gapReading(tm, newWord) / fontsize
@@ -89,6 +112,15 @@ func makeTextWords(marks []*textMark, pageSize model.PdfRectangle) []*textWord {
 	return words
 }
 
+// inDiacriticArea returns true if `diacritic` is in the area where it could be a diacritic of `tm`.
+func (tm *textMark) inDiacriticArea(diacritic *textMark) bool {
+	dLlx := tm.Llx - diacritic.Llx
+	dUrx := tm.Urx - diacritic.Urx
+	dLly := tm.Lly - diacritic.Lly
+	return math.Abs(dLlx+dUrx) < tm.Width()*diacriticRadiusR &&
+		math.Abs(dLly) < tm.Height()*diacriticRadiusR
+}
+
 // newTextWord creates a textWords containing `marks`.
 // `pageSize` is used to calculate the word's depth on the page.
 func newTextWord(marks []*textMark, pageSize model.PdfRectangle) *textWord {
@@ -123,7 +155,7 @@ func (w *textWord) bbox() model.PdfRectangle {
 	return w.PdfRectangle
 }
 
-// addMark adds textMark `tm` to word `w`.
+// addMark adds textMark `tm` to `w`.
 // `pageSize` is used to calculate the word's depth on the page.
 func (w *textWord) addMark(tm *textMark, pageSize model.PdfRectangle) {
 	w.marks = append(w.marks, tm)
@@ -134,6 +166,14 @@ func (w *textWord) addMark(tm *textMark, pageSize model.PdfRectangle) {
 	w.depth = pageSize.Ury - w.PdfRectangle.Lly
 }
 
+// addDiacritic adds combining diacritic `text` to `w`.
+// It appends the diacritic to the text of the last mark and doesn't update the word's size.
+func (w *textWord) addDiacritic(text string) {
+	lastMark := w.marks[len(w.marks)-1]
+	lastMark.text = lastMark.text + text
+	lastMark.text = norm.NFKC.String(lastMark.text)
+}
+
 // absorb combines `word` into `w`.
 func (w *textWord) absorb(word *textWord) {
 	w.PdfRectangle = rectUnion(w.PdfRectangle, word.PdfRectangle)