diff --git a/extractor/README.md b/extractor/README.md new file mode 100644 index 000000000..15646ea6b --- /dev/null +++ b/extractor/README.md @@ -0,0 +1,63 @@ +TEXT EXTRACTION CODE +==================== + +There are two [directions](https://www.w3.org/International/questions/qa-scripts.en#directions)\. + +- *reading* +- *depth* + +In English text, +- the *reading* direction is left to right, increasing X in the PDF coordinate system. +- the *depth* direction is top to bottom, decreasing Y in the PDF coordinate system. + +HOW TEXT IS EXTRACTED +--------------------- + +`text_page.go` **makeTextPage()** is the top level text extraction function. It returns an ordered +list of `textPara`s which are described below. + +* A page's `textMark`s are obtained from its content stream. They are in the order they occur in the content stream. +* The `textMark`s are grouped into word fragments called `textWord`s by scanning through the textMarks + and splitting on space characters and the gaps between marks. +* The `textWord`s are grouped into rectangular regions based on their bounding boxes' proximities + to other `textWords`. These rectangular regions are called `textPara`s. (In the current implementation + there is an intermediate step where the `textWords` are divided into containers called `wordBags`.) +* The `textWord`s in each `textPara` are arranged into `textLine`s (`textWord`s of similar depth). +* Within each `textLine`, `textWord`s are sorted in reading order and each one that starts a whole +word is marked by setting its `newWord` flag to true. (See `textLine.text()`.) +* All the `textPara`s on a page are checked to see if they are arranged as cells within a table and, +if they are, they are combined into `textTable`s and a `textPara` containing the `textTable` replaces +the `textPara`s containing the cells. +* The `textPara`s, some of which may be tables, are sorted into reading order (the order in which they +are read, not in the *reading* direction). 
+ + +The entire order of extracted text from a page is expressed in `paraList.writeText()`. + +* This function iterates through the `textPara`s, which are sorted in reading order. +* For each `textPara` with a table, it iterates through the table cell `textPara`s. (See + `textPara.writeCellText()`.) +* For each (top level or table cell) `textPara`, it iterates through the `textLine`s. +* For each `textLine`, it iterates through the `textWord`s inserting a space before each one that has + the `newWord` flag set. + + +### `textWord` creation + +* `makeTextWords()` combines `textMark`s into `textWord`s, word fragments. +* `textWord`s are the atoms of the text extraction code. + +### `textPara` creation + +* `dividePage()` combines `textWord`s that are close to each other into groups in rectangular + regions called `wordBags`. +* `wordBag.arrangeText()` arranges the `textWord`s in the rectangular regions into `textLine`s, + groups textWords of about the same depth sorted left to right. +* `textLine.markWordBoundaries()` marks the `textWord`s in each `textLine` that start whole words. + +TODO +----- + +* Handle diagonal text. +* Get R to L text extraction working. +* Get top to bottom text extraction working. diff --git a/extractor/extractor.go b/extractor/extractor.go index 0441ce587..06abaef0f 100644 --- a/extractor/extractor.go +++ b/extractor/extractor.go @@ -6,6 +6,8 @@ package extractor import ( + "fmt" + "github.com/unidoc/unipdf/v3/model" ) @@ -14,20 +16,21 @@ type Extractor struct { // stream contents and resources for page contents string resources *model.PdfPageResources + mediaBox model.PdfRectangle - // fontCache is a simple LRU cache that is used to prevent redundant constructions of PdfFont's from - // PDF objects. NOTE: This is not a conventional glyph cache. It only caches PdfFont's. + // fontCache is a simple LRU cache that is used to prevent redundant constructions of PdfFonts + // from PDF objects. NOTE: This is not a conventional glyph cache. 
It only caches PdfFonts. fontCache map[string]fontEntry // text results from running extractXYText on forms within the page. - // TODO(peterwilliams): Cache this map accross all pages in a PDF to speed up processig. + // TODO(peterwilliams97): Cache this map across all pages in a PDF to speed up processing. formResults map[string]textResult // accessCount is used to set fontEntry.access to an incrementing number. accessCount int64 // textCount is an incrementing number used to identify XYTest objects. - textCount int64 + textCount int } // New returns an Extractor instance for extracting content from the input PDF page. @@ -42,7 +45,18 @@ func New(page *model.PdfPage) (*Extractor, error) { // fmt.Printf("%s\n", contents) // fmt.Println("========================= ::: =========================") - return NewFromContents(contents, page.Resources) + mediaBox, err := page.GetMediaBox() + if err != nil { + return nil, fmt.Errorf("extractor requires mediaBox. %v", err) + } + e := &Extractor{ + contents: contents, + resources: page.Resources, + mediaBox: *mediaBox, + fontCache: map[string]fontEntry{}, + formResults: map[string]textResult{}, + } + return e, nil } // NewFromContents creates a new extractor from contents and page resources. diff --git a/extractor/text.go b/extractor/text.go index 42399df2c..9a18dfe3c 100644 --- a/extractor/text.go +++ b/extractor/text.go @@ -6,26 +6,26 @@ package extractor import ( + "bytes" "errors" "fmt" "image/color" "math" "sort" "strings" - "unicode" "github.com/unidoc/unipdf/v3/common" "github.com/unidoc/unipdf/v3/contentstream" "github.com/unidoc/unipdf/v3/core" + "github.com/unidoc/unipdf/v3/internal/textencoding" "github.com/unidoc/unipdf/v3/internal/transform" "github.com/unidoc/unipdf/v3/model" - "golang.org/x/text/unicode/norm" + "golang.org/x/xerrors" ) -var ( - errType = errors.New("type check error") - errRange = errors.New("range check error") -) +// maxFormStack is the maximum form stack recursion depth. 
It has to be low enough to avoid a stack +// overflow and high enough to accommodate customers' PDFs +const maxFormStack = 20 // ExtractText processes and extracts all text data in content streams and returns as a string. // It takes into account character encodings in the PDF file, which are decoded by @@ -47,6 +47,8 @@ func (e *Extractor) ExtractTextWithStats() (extracted string, numChars int, numM } // ExtractPageText returns the text contents of `e` (an Extractor for a page) as a PageText. +// TODO(peterwilliams97): The stats complicate this function signature and aren't very useful. +// Replace with a function like Extract() (*PageText, error) func (e *Extractor) ExtractPageText() (*PageText, int, int, error) { pt, numChars, numMisses, err := e.extractPageText(e.contents, e.resources, transform.IdentityMatrix(), 0) if err != nil { @@ -61,15 +63,27 @@ func (e *Extractor) ExtractPageText() (*PageText, int, int, error) { // extractPageText returns the text contents of content stream `e` and resouces `resources` as a // PageText. // This can be called on a page or a form XObject. -func (e *Extractor) extractPageText(contents string, resources *model.PdfPageResources, parentCTM transform.Matrix, level int) ( +func (e *Extractor) extractPageText(contents string, resources *model.PdfPageResources, + parentCTM transform.Matrix, level int) ( *PageText, int, int, error) { common.Log.Trace("extractPageText: level=%d", level) - pageText := &PageText{} - state := newTextState() - fontStack := fontStacker{} - to := newTextObject(e, resources, contentstream.GraphicsState{}, &state, &fontStack) + pageText := &PageText{pageSize: e.mediaBox} + state := newTextState(e.mediaBox) + var savedStates stateStack + to := newTextObject(e, resources, contentstream.GraphicsState{}, &state, &savedStates) var inTextObj bool + if level > maxFormStack { + err := errors.New("form stack overflow") + common.Log.Debug("ERROR: extractPageText. 
recursion level=%d err=%v", level, err) + return pageText, state.numChars, state.numMisses, err + } + + // Uncomment the following 3 statements to log the content stream. + // common.Log.Info("contents* %d -----------------------------", len(contents)) + // fmt.Println(contents) + // common.Log.Info("contents+ -----------------------------") + cstreamParser := contentstream.NewContentStreamParser(contents) operations, err := cstreamParser.Parse() if err != nil { @@ -83,28 +97,20 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes func(op *contentstream.ContentStreamOperation, gs contentstream.GraphicsState, resources *model.PdfPageResources) error { operand := op.Operand + + if verboseGeom { + common.Log.Info("&&& op=%s", op) + } + switch operand { - case "q": - if !fontStack.empty() { - common.Log.Trace("Save font state: %s\n%s", - fontStack.peek(), fontStack.String()) - fontStack.push(fontStack.peek()) - } - if state.tfont != nil { - common.Log.Trace("Save font state: %s\n->%s\n%s", - fontStack.peek(), state.tfont, fontStack.String()) - fontStack.push(state.tfont) - } - case "Q": - if !fontStack.empty() { - common.Log.Trace("Restore font state: %s\n->%s\n%s", - fontStack.peek(), fontStack.get(-2), fontStack.String()) - fontStack.pop() - } - if len(fontStack) >= 2 { - common.Log.Trace("Restore font state: %s\n->%s\n%s", - state.tfont, fontStack.peek(), fontStack.String()) - state.tfont = fontStack.pop() + case "q": // Push current graphics state to the stack. + savedStates.push(&state) + case "Q": // Pop graphics state from the stack. 
+ if !savedStates.empty() { + state = *savedStates.top() + if len(savedStates) >= 2 { + savedStates.pop() + } } case "BT": // Begin text // Begin a text object, initializing the text matrix, Tm, and @@ -120,7 +126,7 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes graphicsState := gs graphicsState.CTM = parentCTM.Mult(graphicsState.CTM) - to = newTextObject(e, resources, graphicsState, &state, &fontStack) + to = newTextObject(e, resources, graphicsState, &state, &savedStates) case "ET": // End Text // End text object, discarding text matrix. If the current // text object contains text marks, they are added to the @@ -240,7 +246,8 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes return err } err = to.setFont(name, size) - if err != nil { + to.invalidFont = xerrors.Is(err, core.ErrNotSupported) + if err != nil && !to.invalidFont { return err } case "Tm": // Set text matrix. @@ -302,14 +309,14 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes // Handle XObjects by recursing through form XObjects. if len(op.Params) == 0 { common.Log.Debug("ERROR: expected XObject name operand for Do operator. Got %+v.", op.Params) - return errRange + return core.ErrRangeError } // Get XObject name. 
name, ok := core.GetName(op.Params[0]) if !ok { common.Log.Debug("ERROR: invalid Do operator XObject name operand: %+v.", op.Params[0]) - return errType + return core.ErrTypeError } _, xtype := resources.GetXObjectByName(*name) @@ -366,6 +373,7 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes return pageText, state.numChars, state.numMisses, err } +// textResult is used for holding results of PDF form processing type textResult struct { pageText PageText numChars int @@ -439,18 +447,13 @@ func (to *textObject) showTextAdjusted(args *core.PdfObjectArray) error { } td := translationMatrix(transform.Point{X: dx, Y: dy}) to.tm.Concat(td) - common.Log.Trace("showTextAdjusted: dx,dy=%3f,%.3f Tm=%s", dx, dy, to.tm) case *core.PdfObjectString: charcodes, ok := core.GetStringBytes(o) if !ok { common.Log.Trace("showTextAdjusted: Bad string arg. o=%s args=%+v", o, args) return core.ErrTypeError } - err := to.renderText(charcodes) - if err != nil { - common.Log.Debug("Render text error: %v", err) - return err - } + to.renderText(charcodes) default: common.Log.Debug("ERROR: showTextAdjusted. Unexpected type (%T) args=%+v", o, args) return core.ErrTypeError @@ -473,6 +476,9 @@ func (to *textObject) setCharSpacing(x float64) { return } to.state.tc = x + if verboseGeom { + common.Log.Info("setCharSpacing: %.2f state=%s", x, to.state.String()) + } } // setFont "Tf". Set font. @@ -480,21 +486,18 @@ func (to *textObject) setFont(name string, size float64) error { if to == nil { return nil } + to.state.tfs = size font, err := to.getFont(name) - if err == nil { - to.state.tfont = font - if len(*to.fontStack) == 0 { - to.fontStack.push(font) - } else { - (*to.fontStack)[len(*to.fontStack)-1] = font - } - } else if err == model.ErrFontNotSupported { - // TODO(peterwilliams97): Do we need to handle this case in a special way? 
+ if err != nil { return err + } + to.state.tfont = font + if to.savedStates.empty() { + to.savedStates.push(to.state) } else { - return err + to.savedStates.top().tfont = to.state.tfont } - to.state.tfs = size + return nil } @@ -569,67 +572,56 @@ func (to *textObject) checkOp(op *contentstream.ContentStreamOperation, numParam return true, nil } -// fontStacker is the PDF font stack implementation. -type fontStacker []*model.PdfFont +// stateStack is the PDF textState stack implementation. +type stateStack []*textState -// String returns a string describing the current state of the font stack. -func (fontStack *fontStacker) String() string { - parts := []string{"---- font stack"} - for i, font := range *fontStack { +// String returns a string describing the current state of the textState stack. +func (savedStates *stateStack) String() string { + parts := []string{fmt.Sprintf("---- font stack: %d", len(*savedStates))} + for i, state := range *savedStates { s := "" - if font != nil { - s = font.String() + if state != nil { + s = state.String() } parts = append(parts, fmt.Sprintf("\t%2d: %s", i, s)) } return strings.Join(parts, "\n") } -// push pushes `font` onto the font stack. -func (fontStack *fontStacker) push(font *model.PdfFont) { - *fontStack = append(*fontStack, font) +// push pushes a copy of `state` onto the textState stack. +func (savedStates *stateStack) push(state *textState) { + s := *state + *savedStates = append(*savedStates, &s) } -// pop pops and returns the element on the top of the font stack if there is one or nil if there isn't. -func (fontStack *fontStacker) pop() *model.PdfFont { - if fontStack.empty() { +// pop pops and returns a copy of the last state on the textState stack there is one or nil if +// there isn't. 
+func (savedStates *stateStack) pop() *textState { + if savedStates.empty() { return nil } - font := (*fontStack)[len(*fontStack)-1] - *fontStack = (*fontStack)[:len(*fontStack)-1] - return font + state := *(*savedStates)[len(*savedStates)-1] + *savedStates = (*savedStates)[:len(*savedStates)-1] + return &state } -// peek returns the element on the top of the font stack if there is one or nil if there isn't. -func (fontStack *fontStacker) peek() *model.PdfFont { - if fontStack.empty() { +// top returns the last saved state if there is one or nil if there isn't. +// NOTE: The return is a pointer. Modifying it will modify the stack. +func (savedStates *stateStack) top() *textState { + if savedStates.empty() { return nil } - return (*fontStack)[len(*fontStack)-1] + return (*savedStates)[savedStates.size()-1] } -// get returns the `idx`'th element of the font stack if there is one or nil if there isn't. -// idx = 0: bottom of font stack -// idx = len(fontstack) - 1: top of font stack -// idx = -n is same as dx = len(fontstack) - n, so fontstack.get(-1) is same as fontstack.peek() -func (fontStack *fontStacker) get(idx int) *model.PdfFont { - if idx < 0 { - idx += fontStack.size() - } - if idx < 0 || idx > fontStack.size()-1 { - return nil - } - return (*fontStack)[idx] +// empty returns true if the textState stack is empty. +func (savedStates *stateStack) empty() bool { + return len(*savedStates) == 0 } -// empty returns true if the font stack is empty. -func (fontStack *fontStacker) empty() bool { - return len(*fontStack) == 0 -} - -// size returns the number of elements in the font stack. -func (fontStack *fontStacker) size() int { - return len(*fontStack) +// size returns the number of elements in the textState stack. +func (savedStates *stateStack) size() int { + return len(*savedStates) } // 9.3 Text State Parameters and Operators (page 243) @@ -639,19 +631,30 @@ func (fontStack *fontStacker) size() int { // textState represents the text state. 
type textState struct { - tc float64 // Character spacing. Unscaled text space units. - tw float64 // Word spacing. Unscaled text space units. - th float64 // Horizontal scaling. - tl float64 // Leading. Unscaled text space units. Used by TD,T*,'," see Table 108. - tfs float64 // Text font size. - tmode RenderMode // Text rendering mode. - trise float64 // Text rise. Unscaled text space units. Set by Ts. - tfont *model.PdfFont // Text font. + tc float64 // Character spacing. Unscaled text space units. + tw float64 // Word spacing. Unscaled text space units. + th float64 // Horizontal scaling. + tl float64 // Leading. Unscaled text space units. Used by TD,T*,'," see Table 108. + tfs float64 // Text font size. + tmode RenderMode // Text rendering mode. + trise float64 // Text rise. Unscaled text space units. Set by Ts. + tfont *model.PdfFont // Text font. + mediaBox model.PdfRectangle // For debugging numChars int numMisses int } +// String returns a description of `state`. +func (state *textState) String() string { + fontName := "[NOT SET]" + if state.tfont != nil { + fontName = state.tfont.BaseFont() + } + return fmt.Sprintf("tc=%.2f tw=%.2f tfs=%.2f font=%q", + state.tc, state.tw, state.tfs, fontName) +} + // 9.4.1 General (page 248) // A PDF text object consists of operators that may show text strings, move the text position, and // set text state and certain other parameters. In addition, two parameters may be specified only @@ -669,35 +672,37 @@ type textState struct { // textObject represents a PDF text object. type textObject struct { - e *Extractor - resources *model.PdfPageResources - gs contentstream.GraphicsState - fontStack *fontStacker - state *textState - tm transform.Matrix // Text matrix. For the character pointer. - tlm transform.Matrix // Text line matrix. For the start of line pointer. - marks []textMark // Text marks get written here. 
+ e *Extractor + resources *model.PdfPageResources + gs contentstream.GraphicsState + state *textState + savedStates *stateStack + tm transform.Matrix // Text matrix. For the character pointer. + tlm transform.Matrix // Text line matrix. For the start of line pointer. + marks []*textMark // Text marks get written here. + invalidFont bool // Flag that gets set true when we can't handle the current font. } // newTextState returns a default textState. -func newTextState() textState { +func newTextState(mediaBox model.PdfRectangle) textState { return textState{ - th: 100, - tmode: RenderModeFill, + th: 100, + tmode: RenderModeFill, + mediaBox: mediaBox, } } // newTextObject returns a default textObject. func newTextObject(e *Extractor, resources *model.PdfPageResources, gs contentstream.GraphicsState, - state *textState, fontStack *fontStacker) *textObject { + state *textState, savedStates *stateStack) *textObject { return &textObject{ - e: e, - resources: resources, - gs: gs, - fontStack: fontStack, - state: state, - tm: transform.IdentityMatrix(), - tlm: transform.IdentityMatrix(), + e: e, + resources: resources, + gs: gs, + savedStates: savedStates, + state: state, + tm: transform.IdentityMatrix(), + tlm: transform.IdentityMatrix(), } } @@ -720,7 +725,13 @@ func (to *textObject) getStrokeColor() color.Color { } // renderText processes and renders byte array `data` for extraction purposes. +// It extracts textMarks based on the charcodes in `data`; the current text and graphics states +// are tracked in `to`. func (to *textObject) renderText(data []byte) error { + if to.invalidFont { + common.Log.Debug("renderText: Invalid font. 
Not processing.") + return nil + } font := to.getCurrentFont() charcodes := font.BytesToCharcodes(data) texts, numChars, numMisses := font.CharcodesToStrings(charcodes) @@ -748,6 +759,9 @@ func (to *textObject) renderText(data []byte) error { tfs*th, 0, 0, tfs, 0, state.trise) + if verboseGeom { + common.Log.Info("renderText: %d codes=%+v texts=%q", len(charcodes), charcodes, texts) + } common.Log.Trace("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, len(texts)) @@ -760,7 +774,6 @@ func (to *textObject) renderText(data []byte) error { continue } - // TODO(gunnsth): Assuming 1:1 charcode[i] <-> rune[i] mapping. code := charcodes[i] // The location of the text on the page in device coordinates is given by trm, the text // rendering matrix. @@ -771,7 +784,7 @@ func (to *textObject) renderText(data []byte) error { // w is the unscaled movement at the end of a word. w := 0.0 - if string(r) == " " { + if len(r) == 1 && r[0] == 32 { w = state.tw } @@ -788,25 +801,40 @@ func (to *textObject) renderText(data []byte) error { // t is the displacement of the text cursor when the character is rendered. t0 := transform.Point{X: (c.X*tfs + w) * th} t := transform.Point{X: (c.X*tfs + state.tc + w) * th} + if verboseGeom { + common.Log.Info("tfs=%.2f tc=%.2f tw=%.2f th=%.2f", tfs, state.tc, state.tw, th) + common.Log.Info("dx,dy=%.3f t0=%.2f t=%.2f", c, t0, t) + } // td, td0 are t, t0 in matrix form. // td0 is where this character ends. td is where the next character starts. 
td0 := translationMatrix(t0) td := translationMatrix(t) + end := to.gs.CTM.Mult(to.tm).Mult(td0) + + if verboseGeom { + common.Log.Info("end:\n\tCTM=%s\n\t tm=%s\n"+ + "\t td=%s xlat=%s\n"+ + "\ttd0=%s\n\t → %s xlat=%s", + to.gs.CTM, to.tm, + td, translation(to.gs.CTM.Mult(to.tm).Mult(td)), + td0, end, translation(end)) + } - common.Log.Trace("\"%c\" stateMatrix=%s CTM=%s Tm=%s", r, stateMatrix, to.gs.CTM, to.tm) - common.Log.Trace("tfs=%.3f th=%.3f Tc=%.3f w=%.3f (Tw=%.3f)", tfs, th, state.tc, w, state.tw) - common.Log.Trace("m=%s c=%+v t0=%+v td0=%s trm0=%s", m, c, t0, td0, td0.Mult(to.tm).Mult(to.gs.CTM)) - - mark := to.newTextMark( - text, + mark, onPage := to.newTextMark( + textencoding.ExpandLigatures(r), trm, - translation(to.gs.CTM.Mult(to.tm).Mult(td0)), + translation(end), math.Abs(spaceWidth*trm.ScalingFactorX()), font, to.state.tc, fillColor, strokeColor) + + if !onPage { + common.Log.Debug("Text mark outside page. Skipping") + continue + } if font == nil { common.Log.Debug("ERROR: No font.") } else if font.Encoder() == nil { @@ -814,17 +842,15 @@ func (to *textObject) renderText(data []byte) error { } else { // TODO: This lookup seems confusing. Went from bytes <-> charcodes already. // NOTE: This is needed to register runes by the font encoder - for subsetting (optimization). - original, ok := font.Encoder().CharcodeToRune(code) - if ok { + if original, ok := font.Encoder().CharcodeToRune(code); ok { mark.original = string(original) } } common.Log.Trace("i=%d code=%d mark=%s trm=%s", i, code, mark, trm) - to.marks = append(to.marks, mark) + to.marks = append(to.marks, &mark) // update the text matrix by the displacement of the text location. to.tm.Concat(td) - common.Log.Trace("to.tm=%s", to.tm) } return nil @@ -853,127 +879,13 @@ func (to *textObject) moveTo(tx, ty float64) { to.tm = to.tlm } -// textMark represents text drawn on a page and its position in device coordinates. -// All dimensions are in device coordinates. 
-type textMark struct { - text string // The text (decoded via ToUnicode). - original string // Original text (decoded). - bbox model.PdfRectangle // Text bounding box. - orient int // The text orientation in degrees. This is the current TRM rounded to 10°. - orientedStart transform.Point // Left of text in orientation where text is horizontal. - orientedEnd transform.Point // Right of text in orientation where text is horizontal. - height float64 // Text height. - spaceWidth float64 // Best guess at the width of a space in the font the text was rendered with. - font *model.PdfFont // The font the mark was drawn with. - fontsize float64 // The font size the mark was drawn with. - charspacing float64 // TODO (peterwilliams97: Should this be exposed in TextMark? - trm transform.Matrix // The current text rendering matrix (TRM above). - end transform.Point // The end of character device coordinates. - count int64 // To help with reading debug logs. - fillColor color.Color // Text fill color. - strokeColor color.Color // Text stroke color. -} - -// newTextMark returns a textMark for text `text` rendered with text rendering matrix (TRM) `trm` -// and end of character device coordinates `end`. `spaceWidth` is our best guess at the width of a -// space in the font the text is rendered in device coordinates. 
-func (to *textObject) newTextMark(text string, trm transform.Matrix, end transform.Point, - spaceWidth float64, font *model.PdfFont, charspacing float64, - fillColor, strokeColor color.Color) textMark { - to.e.textCount++ - theta := trm.Angle() - orient := nearestMultiple(theta, 10) - var height float64 - if orient%180 != 90 { - height = trm.ScalingFactorY() - } else { - height = trm.ScalingFactorX() - } - - start := translation(trm) - bbox := model.PdfRectangle{Llx: start.X, Lly: start.Y, Urx: end.X, Ury: end.Y} - switch orient % 360 { - case 90: - bbox.Urx -= height - case 180: - bbox.Ury -= height - case 270: - bbox.Urx += height - default: - bbox.Ury += height - } - tm := textMark{ - text: text, - orient: orient, - bbox: bbox, - orientedStart: start.Rotate(theta), - orientedEnd: end.Rotate(theta), - height: math.Abs(height), - spaceWidth: spaceWidth, - font: font, - fontsize: to.state.tfs, - charspacing: charspacing, - trm: trm, - end: end, - count: to.e.textCount, - fillColor: fillColor, - strokeColor: strokeColor, - } - if !isTextSpace(tm.text) && tm.Width() == 0.0 { - common.Log.Debug("ERROR: Zero width text. tm=%s\n\tm=%#v", tm, tm) - } - return tm -} - -// isTextSpace returns true if `text` contains nothing but space code points. -func isTextSpace(text string) bool { - for _, r := range text { - if !unicode.IsSpace(r) { - return false - } - } - return true -} - -// nearestMultiple return the integer multiple of `m` that is closest to `x`. -func nearestMultiple(x float64, m int) int { - if m == 0 { - m = 1 - } - fac := float64(m) - return int(math.Round(x/fac) * fac) -} - -// String returns a string describing `tm`. -func (tm textMark) String() string { - return fmt.Sprintf("textMark{@%03d [%.3f,%.3f] w=%.1f %d° %q}", - tm.count, tm.orientedStart.X, tm.orientedStart.Y, tm.Width(), tm.orient, - truncate(tm.text, 100)) -} - -// Width returns the width of `tm`.text in the text direction. 
-func (tm textMark) Width() float64 { - return math.Abs(tm.orientedStart.X - tm.orientedEnd.X) -} - -// ToTextMark returns the public view of `tm`. -func (tm textMark) ToTextMark() TextMark { - return TextMark{ - Text: tm.text, - Original: tm.original, - BBox: tm.bbox, - Font: tm.font, - FontSize: tm.fontsize, - FillColor: tm.fillColor, - StrokeColor: tm.strokeColor, - } -} - // PageText represents the layout of text on a device page. type PageText struct { - marks []textMark // Texts and their positions on a PDF page. - viewText string // Extracted page text. - viewMarks []TextMark // Public view of `marks`. + marks []*textMark // Texts and their positions on a PDF page. + viewText string // Extracted page text. + viewMarks []TextMark // Public view of text marks. + viewTables []TextTable // Public view of text tables. + pageSize model.PdfRectangle // Page size. Used to calculate depth. } // String returns a string describing `pt`. @@ -987,11 +899,6 @@ func (pt PageText) String() string { return strings.Join(parts, "\n") } -// length returns the number of elements in `pt.marks`. -func (pt PageText) length() int { - return len(pt.marks) -} - // Text returns the extracted page text. func (pt PageText) Text() string { return pt.viewText @@ -1009,6 +916,42 @@ func (pt PageText) Marks() *TextMarkArray { return &TextMarkArray{marks: pt.viewMarks} } +// Tables returns the tables extracted from the page. +func (pt PageText) Tables() []TextTable { + return pt.viewTables +} + +// computeViews processes the page TextMarks sorting by position and populates `pt.viewText` and +// `pt.viewMarks` which represent the text and marks in the order which it is read on the page. +// The comments above the TextMark definition describe how to use the []TextMark to +// maps substrings of the page text to locations on the PDF page. +func (pt *PageText) computeViews() { + // Extract text paragraphs one orientation at a time. 
+ // If there are texts with several orientations on a page then all the text of the same + // orientation gets extracted together. + var paras paraList + n := len(pt.marks) + for orient := 0; orient < 360 && n > 0; orient += 90 { + marks := make([]*textMark, 0, len(pt.marks)-n) + for _, tm := range pt.marks { + if tm.orient == orient { + marks = append(marks, tm) + } + } + if len(marks) > 0 { + parasOrient := makeTextPage(marks, pt.pageSize) + paras = append(paras, parasOrient...) + n -= len(marks) + } + } + // Build the public viewable fields from the paraList + b := new(bytes.Buffer) + paras.writeText(b) + pt.viewText = b.String() + pt.viewMarks = paras.toTextMarks() + pt.viewTables = paras.tables() +} + // TextMarkArray is a collection of TextMarks. type TextMarkArray struct { marks []TextMark @@ -1043,7 +986,11 @@ func (ma *TextMarkArray) Len() int { return len(ma.marks) } -// RangeOffset returns the TextMarks in `ma` that have `start` <= TextMark.Offset < `end`. +// RangeOffset returns the TextMarks in `ma` that overlap text[start:end] in the extracted text. +// These are tm: `start` <= tm.Offset + len(tm.Text) && tm.Offset < `end` where +// `start` and `end` are offsets in the extracted text. +// NOTE: TextMarks can contain multiple characters. e.g. "ffi" for the ffi ligature so the first and +// last elements of the returned TextMarkArray may only partially overlap text[start:end]. func (ma *TextMarkArray) RangeOffset(start, end int) (*TextMarkArray, error) { if ma == nil { return nil, errors.New("ma==nil") @@ -1062,7 +1009,7 @@ func (ma *TextMarkArray) RangeOffset(start, end int) (*TextMarkArray, error) { end = ma.marks[n-1].Offset + 1 } - iStart := sort.Search(n, func(i int) bool { return ma.marks[i].Offset >= start }) + iStart := sort.Search(n, func(i int) bool { return ma.marks[i].Offset+len(ma.marks[i].Text)-1 >= start }) if !(0 <= iStart && iStart < n) { err := fmt.Errorf("Out of range. 
start=%d iStart=%d len=%d\n\tfirst=%v\n\t last=%v", start, iStart, n, ma.marks[0], ma.marks[n-1]) @@ -1076,34 +1023,28 @@ func (ma *TextMarkArray) RangeOffset(start, end int) (*TextMarkArray, error) { } if iEnd <= iStart { // This should never happen. - return nil, fmt.Errorf("start=%d end=%d iStart=%d iEnd=%d", start, end, iStart, iEnd) + return nil, fmt.Errorf("iEnd <= iStart: start=%d end=%d iStart=%d iEnd=%d", + start, end, iStart, iEnd) } return &TextMarkArray{marks: ma.marks[iStart:iEnd]}, nil } // BBox returns the smallest axis-aligned rectangle that encloses all the TextMarks in `ma`. func (ma *TextMarkArray) BBox() (model.PdfRectangle, bool) { - if len(ma.marks) == 0 { - return model.PdfRectangle{}, false - } - bbox := ma.marks[0].BBox - for _, tm := range ma.marks[1:] { - if isTextSpace(tm.Text) { + var bbox model.PdfRectangle + found := false + for _, tm := range ma.marks { + if tm.Meta || isTextSpace(tm.Text) { continue } - bbox = rectUnion(bbox, tm.BBox) - } - return bbox, true -} - -// rectUnion returns the smallest axis-aligned rectangle that contains `b1` and `b2`. -func rectUnion(b1, b2 model.PdfRectangle) model.PdfRectangle { - return model.PdfRectangle{ - Llx: math.Min(b1.Llx, b2.Llx), - Lly: math.Min(b1.Lly, b2.Lly), - Urx: math.Max(b1.Urx, b2.Urx), - Ury: math.Max(b1.Ury, b2.Ury), + if found { + bbox = rectUnion(bbox, tm.BBox) + } else { + bbox = tm.BBox + found = true + } } + return bbox, found } // TextMark represents extracted text on a page with information regarding both textual content, @@ -1128,7 +1069,7 @@ func rectUnion(b1, b2 model.PdfRectangle) model.PdfRectangle { // bbox, ok := spanMarks.BBox() // // handle errors type TextMark struct { - // Text is the extracted text. It has been decoded to Unicode via ToUnicode(). + // Text is the extracted text. Text string // Original is the text in the PDF. It has not been decoded like `Text`. 
Original string @@ -1169,491 +1110,48 @@ func (tm TextMark) String() string { if tm.Meta { meta = " *M*" } - return fmt.Sprintf("{TextMark: %d %q=%02x (%5.1f, %5.1f) (%5.1f, %5.1f) %s%s}", + return fmt.Sprintf("{TextMark: %d %q=%02x (%6.2f, %6.2f) (%6.2f, %6.2f) %s%s}", tm.Offset, tm.Text, []rune(tm.Text), b.Llx, b.Lly, b.Urx, b.Ury, font, meta) } -// computeViews processes the page TextMarks sorting by position and populates `pt.viewText` and -// `pt.viewMarks` which represent the text and marks in the order which it is read on the page. -// The comments above the TextMark definition describe how to use the []TextMark to -// maps substrings of the page text to locations on the PDF page. -func (pt *PageText) computeViews() { - fontHeight := pt.height() - // We sort with a y tolerance to allow for subscripts, diacritics etc. - tol := minFloat(fontHeight*0.19, 5.0) - common.Log.Trace("ToTextLocation: %d elements fontHeight=%.1f tol=%.1f", len(pt.marks), fontHeight, tol) - // Uncomment the 2 following Debug statements to see the effects of sorting. 
- // common.Log.Debug("computeViews: Before sorting %s", pt) - pt.sortPosition(tol) - // common.Log.Debug("computeViews: After sorting %s", pt) - lines := pt.toLines(tol) - texts := make([]string, len(lines)) - for i, l := range lines { - texts[i] = strings.Join(l.words(), wordJoiner) - } - text := strings.Join(texts, lineJoiner) - var marks []TextMark - offset := 0 - for i, l := range lines { - for j, tm := range l.marks { - tm.Offset = offset - marks = append(marks, tm) - offset += len(tm.Text) - if j == len(l.marks)-1 { - break - } - if wordJoinerLen > 0 { - tm := TextMark{ - Offset: offset, - Text: wordJoiner, - Meta: true, - } - marks = append(marks, tm) - offset += wordJoinerLen - } - } - if i == len(lines)-1 { - break - } - if lineJoinerLen > 0 { - tm := TextMark{ - Offset: offset, - Text: lineJoiner, - Meta: true, - } - marks = append(marks, tm) - offset += lineJoinerLen - } - } - pt.viewText = text - pt.viewMarks = marks -} - -// height returns the max height of the elements in `pt.marks`. -func (pt PageText) height() float64 { - fontHeight := 0.0 - for _, tm := range pt.marks { - if tm.height > fontHeight { - fontHeight = tm.height - } - } - return fontHeight -} - -const ( - // wordJoiner is added between text marks in extracted text. - wordJoiner = "" - // lineJoiner is added between lines in extracted text. - lineJoiner = "\n" -) - -var ( - wordJoinerLen = len(wordJoiner) - lineJoinerLen = len(lineJoiner) - // spaceMark is a special TextMark used for spaces. - spaceMark = TextMark{ - Text: " ", - Original: " ", - Meta: true, - } -) - -// sortPosition sorts a text list by its elements' positions on a page. -// Sorting is by orientation then top to bottom, left to right when page is orientated so that text -// is horizontal. -// Text is considered to be on different lines if the lines' orientedStart.Y differs by more than `tol`. 
-func (pt *PageText) sortPosition(tol float64) { - if len(pt.marks) == 0 { - return - } - - // For grouping data vertically into lines, it is necessary to have the data presorted by - // descending y position. - sort.SliceStable(pt.marks, func(i, j int) bool { - ti, tj := pt.marks[i], pt.marks[j] - if ti.orient != tj.orient { - return ti.orient < tj.orient - } - return ti.orientedStart.Y >= tj.orientedStart.Y - }) - - // Cluster the marks into y-clusters by relative y proximity. Each cluster is our guess of what - // makes up a line of text. - clusters := make([]int, len(pt.marks)) - cluster := 0 - clusters[0] = cluster - for i := 1; i < len(pt.marks); i++ { - if pt.marks[i-1].orient != pt.marks[i].orient { - cluster++ - } else { - if pt.marks[i-1].orientedStart.Y-pt.marks[i].orientedStart.Y > tol { - cluster++ - } - } - clusters[i] = cluster - } - - // Sort by y-cluster and x. - sort.SliceStable(pt.marks, func(i, j int) bool { - ti, tj := pt.marks[i], pt.marks[j] - if ti.orient != tj.orient { - return ti.orient < tj.orient - } - if clusters[i] != clusters[j] { - return clusters[i] < clusters[j] - } - return ti.orientedStart.X < tj.orientedStart.X - }) +// spaceMark is a special TextMark used for spaces. +var spaceMark = TextMark{ + Text: "[X]", + Original: " ", + Meta: true, + FillColor: color.White, + StrokeColor: color.White, } -// textLine represents a line of text on a page. -type textLine struct { - x float64 // x position of line. - y float64 // y position of line. - h float64 // height of line text. - dxList []float64 // x distance between successive words in line. - marks []TextMark // TextMarks in the line. +// TextTable represents a table. +// Cells are ordered top-to-bottom, left-to-right. +// Cells[y] is the (0-offset) y'th row in the table. +// Cells[y][x] is the (0-offset) x'th column in the table. +type TextTable struct { + W, H int + Cells [][]TableCell } -// words returns the texts in `tl`. 
-func (tl textLine) words() []string { - var texts []string - for _, tm := range tl.marks { - texts = append(texts, tm.Text) - } - return texts -} - -// toLines returns the text and positions in `pt.marks` as a slice of textLine. -// NOTE: Caller must sort the text list top-to-bottom, left-to-right (for orientation adjusted so -// that text is horizontal) before calling this function. -func (pt PageText) toLines(tol float64) []textLine { - // We divide `pt.marks` into slices which contain texts with the same orientation, extract the - // lines for each orientation then return the concatenation of these lines sorted by orientation. - tlOrient := make(map[int][]textMark, len(pt.marks)) - for _, tm := range pt.marks { - tlOrient[tm.orient] = append(tlOrient[tm.orient], tm) - } - var lines []textLine - for _, o := range orientKeys(tlOrient) { - lns := PageText{marks: tlOrient[o]}.toLinesOrient(tol) - lines = append(lines, lns...) - } - return lines -} - -// toLinesOrient returns the text and positions in `pt.marks` as a slice of textLine. -// NOTE: This function only works on text lists where all text is the same orientation so it should -// only be called from toLines. -// Caller must sort the text list top-to-bottom, left-to-right (for orientation adjusted so -// that text is horizontal) before calling this function. -func (pt PageText) toLinesOrient(tol float64) []textLine { - if len(pt.marks) == 0 { - return []textLine{} - } - var marks []TextMark - var lines []textLine - var xx []float64 - y := pt.marks[0].orientedStart.Y - - scanning := false - - averageCharWidth := exponAve{} - wordSpacing := exponAve{} - lastEndX := 0.0 // lastEndX is pt.marks[i-1].orientedEnd.X - - for _, tm := range pt.marks { - if tm.orientedStart.Y+tol < y { - if len(marks) > 0 { - tl := newLine(y, xx, marks) - if averageCharWidth.running { - // FIXME(peterwilliams97): Fix and reinstate combineDiacritics. 
- // tl = combineDiacritics(tl, averageCharWidth.ave) - tl = removeDuplicates(tl, averageCharWidth.ave) - } - lines = append(lines, tl) - } - marks = []TextMark{} - xx = []float64{} - y = tm.orientedStart.Y - scanning = false - } - - // Detect text movements that represent spaces on the printed page. - // We use a heuristic from PdfBox: If the next character starts to the right of where a - // character after a space at "normal spacing" would start, then there is a space before it. - // The tricky thing to guess here is the width of a space at normal spacing. - // We follow PdfBox and use min(deltaSpace, deltaCharWidth). - deltaSpace := 0.0 - if tm.spaceWidth == 0 { - deltaSpace = math.MaxFloat64 - } else { - wordSpacing.update(tm.spaceWidth) - deltaSpace = wordSpacing.ave * 0.5 - } - averageCharWidth.update(tm.Width()) - deltaCharWidth := averageCharWidth.ave * 0.3 - - isSpace := false - nextWordX := lastEndX + minFloat(deltaSpace, deltaCharWidth) - if scanning && !isTextSpace(tm.text) { - isSpace = nextWordX < tm.orientedStart.X - } - common.Log.Trace("tm=%s", tm) - common.Log.Trace("width=%.2f delta=%.2f deltaSpace=%.2g deltaCharWidth=%.2g", - tm.Width(), minFloat(deltaSpace, deltaCharWidth), deltaSpace, deltaCharWidth) - common.Log.Trace("%+q [%.1f, %.1f] lastEndX=%.2f nextWordX=%.2f (%.2f) isSpace=%t", - tm.text, tm.orientedStart.X, tm.orientedStart.Y, lastEndX, nextWordX, - nextWordX-tm.orientedStart.X, isSpace) - - if isSpace { - marks = append(marks, spaceMark) - xx = append(xx, (lastEndX+tm.orientedStart.X)*0.5) - } - - // Add the text to the line. 
- lastEndX = tm.orientedEnd.X - marks = append(marks, tm.ToTextMark()) - xx = append(xx, tm.orientedStart.X) - scanning = true - common.Log.Trace("lastEndX=%.2f", lastEndX) - } - if len(marks) > 0 { - tl := newLine(y, xx, marks) - if averageCharWidth.running { - tl = removeDuplicates(tl, averageCharWidth.ave) - } - lines = append(lines, tl) - } - return lines -} - -// orientKeys returns the keys of `tlOrient` as a sorted slice. -func orientKeys(tlOrient map[int][]textMark) []int { - keys := []int{} - for k := range tlOrient { - keys = append(keys, k) - } - sort.Ints(keys) - return keys -} - -// exponAve implements an exponential average. -type exponAve struct { - ave float64 // Current average value. - running bool // Has `ave` been set? -} - -// update updates the exponential average `exp`.ave with latest value `x` and returns `exp`.ave. -func (exp *exponAve) update(x float64) float64 { - if !exp.running { - exp.ave = x - exp.running = true - } else { - // NOTE(peterwilliams97): 0.5 is a guess. It may be possible to improve average character - // and space width estimation by tuning this value. It may be that different exponents - // would work better for character and space estimation. - exp.ave = (exp.ave + x) * 0.5 - } - return exp.ave -} - -// newLine returns the textLine representation of strings `words` with y coordinate `y` and x -// coordinates `xx` and height `h`. -func newLine(y float64, xx []float64, marks []TextMark) textLine { - dxList := make([]float64, len(xx)-1) - for i := 1; i < len(xx); i++ { - dxList[i-1] = xx[i] - xx[i-1] - } - return textLine{ - x: xx[0], - y: y, - dxList: dxList, - marks: marks, - } -} - -// removeDuplicates returns `tl` with duplicate characters removed. `charWidth` is the average -// character width for the line. -func removeDuplicates(tl textLine, charWidth float64) textLine { - if len(tl.dxList) == 0 || len(tl.marks) == 0 { - return tl - } - // NOTE(peterwilliams97) 0.3 is a guess. 
It may be possible to tune this to a better value. - tol := charWidth * 0.3 - marks := []TextMark{tl.marks[0]} - var dxList []float64 - - tm0 := tl.marks[0] - for i, dx := range tl.dxList { - tm := tl.marks[i+1] - if tm.Text != tm0.Text || dx > tol { - marks = append(marks, tm) - dxList = append(dxList, dx) - } - tm0 = tm - } - return textLine{ - x: tl.x, - y: tl.y, - dxList: dxList, - marks: marks, - } -} - -// combineDiacritics returns `line` with diacritics close to characters combined with the characters. -// `charWidth` is the average character width for the line. -// We have to do this because PDF can render diacritics separately to the characters they attach to -// in extracted text. -func combineDiacritics(tl textLine, charWidth float64) textLine { - if len(tl.dxList) == 0 || len(tl.marks) == 0 { - return tl - } - // NOTE(peterwilliams97) 0.2 is a guess. It may be possible to tune this to a better value. - tol := charWidth * 0.2 - common.Log.Trace("combineDiacritics: charWidth=%.2f tol=%.2f", charWidth, tol) - - var marks []TextMark - var dxList []float64 - tm := marks[0] - w, c := countDiacritic(tm.Text) - delta := 0.0 - dx0 := 0.0 - parts := []string{w} - numChars := c - - for i, dx := range tl.dxList { - tm = marks[i+1] - w, c := countDiacritic(tm.Text) - if numChars+c <= 1 && delta+dx <= tol { - if len(parts) == 0 { - dx0 = dx - } else { - delta += dx - } - parts = append(parts, w) - numChars += c - } else { - if len(parts) > 0 { - if len(marks) > 0 { - dxList = append(dxList, dx0) - } - tm.Text = combine(parts) - marks = append(marks, tm) - } - parts = []string{w} - numChars = c - dx0 = dx - delta = 0.0 - } - } - if len(parts) > 0 { - if len(marks) > 0 { - dxList = append(dxList, dx0) - } - tm.Text = combine(parts) - marks = append(marks, tm) - } - if len(marks) != len(dxList)+1 { - common.Log.Error("Inconsistent: \nwords=%d \ndxList=%d %.2f", - len(marks), len(dxList), dxList) - return tl - } - return textLine{ - x: tl.x, - y: tl.y, - dxList: dxList, 
- marks: marks, - } -} - -// combine combines any diacritics in `parts` with the single non-diacritic character in `parts`. -func combine(parts []string) string { - if len(parts) == 1 { - // Must be a non-diacritic. - return parts[0] - } - - // We need to put the diacritics before the non-diacritic for NFKC normalization to work. - diacritic := map[string]bool{} - for _, w := range parts { - r := []rune(w)[0] - diacritic[w] = unicode.Is(unicode.Mn, r) || unicode.Is(unicode.Sk, r) - } - sort.SliceStable(parts, func(i, j int) bool { return !diacritic[parts[i]] && diacritic[parts[j]] }) - - // Construct the NFKC-normalized concatenation of the diacritics and the non-diacritic. - for i, w := range parts { - parts[i] = strings.TrimSpace(norm.NFKC.String(w)) - } - return strings.Join(parts, "") -} - -// countDiacritic returns the combining diacritic version of `w` (usually itself) and the number of -// non-diacritics in `w` (0 or 1). -func countDiacritic(w string) (string, int) { - runes := []rune(w) - if len(runes) != 1 { - return w, 1 - } - r := runes[0] - c := 1 - if (unicode.Is(unicode.Mn, r) || unicode.Is(unicode.Sk, r)) && - r != '\'' && r != '"' && r != '`' { - c = 0 - } - if w2, ok := diacritics[r]; ok { - c = 0 - w = w2 - } - return w, c -} - -// diacritics is a map of diacritic characters that are not classified as unicode.Mn or unicode.Sk -// and the corresponding unicode.Mn or unicode.Sk characters. This map was copied from PdfBox. 
-// (https://svn.apache.org/repos/asf/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java) -var diacritics = map[rune]string{ - 0x0060: "\u0300", - 0x02CB: "\u0300", - 0x0027: "\u0301", - 0x02B9: "\u0301", - 0x02CA: "\u0301", - 0x005e: "\u0302", - 0x02C6: "\u0302", - 0x007E: "\u0303", - 0x02C9: "\u0304", - 0x00B0: "\u030A", - 0x02BA: "\u030B", - 0x02C7: "\u030C", - 0x02C8: "\u030D", - 0x0022: "\u030E", - 0x02BB: "\u0312", - 0x02BC: "\u0313", - 0x0486: "\u0313", - 0x055A: "\u0313", - 0x02BD: "\u0314", - 0x0485: "\u0314", - 0x0559: "\u0314", - 0x02D4: "\u031D", - 0x02D5: "\u031E", - 0x02D6: "\u031F", - 0x02D7: "\u0320", - 0x02B2: "\u0321", - 0x02CC: "\u0329", - 0x02B7: "\u032B", - 0x02CD: "\u0331", - 0x005F: "\u0332", - 0x204E: "\u0359", +// TableCell is a cell in a TextTable. +type TableCell struct { + // Text is the extracted text. + Text string + // Marks returns the TextMarks corresponding to the text in Text. + Marks TextMarkArray } // getCurrentFont returns the font on top of the font stack, or DefaultFont if the font stack is // empty. func (to *textObject) getCurrentFont() *model.PdfFont { - if to.fontStack.empty() { + var font *model.PdfFont + if !to.savedStates.empty() { + font = to.savedStates.top().tfont + } + if font == nil { common.Log.Debug("ERROR: No font defined. Using default.") return model.DefaultFont() } - return to.fontStack.peek() + return font } // getFont returns the font named `name` if it exists in the page's resources or an error if it diff --git a/extractor/text_bag.go b/extractor/text_bag.go new file mode 100644 index 000000000..88e529a3d --- /dev/null +++ b/extractor/text_bag.go @@ -0,0 +1,375 @@ +/* + * This file is subject to the terms and conditions defined in + * file 'LICENSE.md', which is part of this source code package. 
+ */ + +package extractor + +import ( + "fmt" + "math" + "sort" + "strings" + + "github.com/unidoc/unipdf/v3/common" + "github.com/unidoc/unipdf/v3/model" +) + +// wordBag is just a list of textWords in a rectangular region. It is needed for efficient +// comparison of the bounding boxes of the words to arrange them into paragraph regions. +// The implementation is not important as long as it implements the main function scanBand() +// efficiently. +// In the current implementation, wordBag is a list of word fragment bins arranged by their depth on +// a page with the word fragments in each bin are sorted in reading order. +type wordBag struct { + model.PdfRectangle // Bounding box of all the textWord in the wordBag. + fontsize float64 // The size of the largest font in the wordBag. + // The following fields are for the current bin based implementation + pageHeight float64 // Used to calculate depths + bins map[int][]*textWord // bins[n] = w: n*depthBinPoints <= w.depth < (n+1)*depthBinPoints +} + +// makeWordBag return a wordBag containg `words` +// In the current implementation, it does this by putting the words into the appropriate depth bins. +// Caller must check that `words` has at least one element. +func makeWordBag(words []*textWord, pageHeight float64) *wordBag { + b := newWordBag(words[0], pageHeight) + for _, w := range words[1:] { + depthIdx := depthIndex(w.depth) + b.bins[depthIdx] = append(b.bins[depthIdx], w) + } + b.sort() + return b +} + +// newWordBag returns a wordBag with page height `pageHeight` with the single word fragment `word`. +func newWordBag(word *textWord, pageHeight float64) *wordBag { + depthIdx := depthIndex(word.depth) + words := []*textWord{word} + bag := wordBag{ + bins: map[int][]*textWord{depthIdx: words}, + PdfRectangle: word.PdfRectangle, + fontsize: word.fontsize, + pageHeight: pageHeight, + } + return &bag +} + +// String returns a description of `b`. 
+func (b *wordBag) String() string { + var texts []string + for _, depthIdx := range b.depthIndexes() { + words, _ := b.bins[depthIdx] + for _, w := range words { + texts = append(texts, w.text) + } + } + return fmt.Sprintf("%.2f fontsize=%.2f %d %q", b.PdfRectangle, b.fontsize, len(texts), texts) +} + +// scanBand scans the bins for words w: +// `minDepth` <= w.depth <= `maxDepth` && // in the depth diraction +// `readingOverlap`(`para`, w) && // in the reading directon +// math.Abs(w.fontsize-fontsize) > `fontTol`*fontsize // font size tolerance +// and applies `moveWord`(depthIdx, s,para w) to them. +// If `detectOnly` is true, moveWord is not applied. +// If `freezeDepth` is true, minDepth and maxDepth are not updated in scan as words are added. +func (b *wordBag) scanBand(title string, para *wordBag, + readingOverlap func(para *wordBag, word *textWord) bool, + minDepth, maxDepth, fontTol float64, + detectOnly, freezeDepth bool) int { + fontsize := para.fontsize + lineDepth := lineDepthR * fontsize + n := 0 + minDepth0, maxDepth0 := minDepth, maxDepth + var newWords []*textWord + for _, depthIdx := range b.depthBand(minDepth-lineDepth, maxDepth+lineDepth) { + for _, word := range b.bins[depthIdx] { + if !(minDepth-lineDepth <= word.depth && word.depth <= maxDepth+lineDepth) { + continue + } + if !readingOverlap(para, word) { + continue + } + fontRatio1 := math.Abs(word.fontsize-fontsize) / fontsize + fontRatio2 := word.fontsize / fontsize + fontRatio := math.Min(fontRatio1, fontRatio2) + if fontTol > 0 { + if fontRatio > fontTol { + continue + } + } + + if !detectOnly { + para.pullWord(b, word, depthIdx) + } + newWords = append(newWords, word) + n++ + if !freezeDepth { + if word.depth < minDepth { + minDepth = word.depth + } + if word.depth > maxDepth { + maxDepth = word.depth + } + } + // Has no effect on results + // fontsize = para.fontsize + // lineDepth = lineDepthR * fontsize + if detectOnly { + break + } + } + } + if verbose { + if len(title) > 0 { + 
common.Log.Info("scanBand: %s [%.2f %.2f]->[%.2f %.2f] para=%.2f fontsize=%.2f %q", + title, + minDepth0, maxDepth0, + minDepth, maxDepth, + para.PdfRectangle, para.fontsize, truncate(para.text(), 20)) + for i, word := range newWords { + fmt.Printf(" %q", word.text) + if i >= 5 { + break + } + } + if len(newWords) > 0 { + fmt.Println() + } + } + } + return n +} + +// highestWord returns the hight word in b.bins[depthIdx] w: minDepth <= w.depth <= maxDepth. +func (b *wordBag) highestWord(depthIdx int, minDepth, maxDepth float64) *textWord { + for _, word := range b.bins[depthIdx] { + if minDepth <= word.depth && word.depth <= maxDepth { + return word + } + } + return nil +} + +// depthBand returns the indexes of the bins with depth: `minDepth` <= depth <= `maxDepth`. +func (b *wordBag) depthBand(minDepth, maxDepth float64) []int { + if len(b.bins) == 0 { + return nil + } + return b.depthRange(b.getDepthIdx(minDepth), b.getDepthIdx(maxDepth)) +} + +// depthRange returns the sorted keys of b.bins for depths indexes [`minDepth`,`maxDepth`). +func (b *wordBag) depthRange(minDepthIdx, maxDepthIdx int) []int { + indexes := b.depthIndexes() + var rangeIndexes []int + for _, depthIdx := range indexes { + if minDepthIdx <= depthIdx && depthIdx <= maxDepthIdx { + rangeIndexes = append(rangeIndexes, depthIdx) + } + } + return rangeIndexes +} + +// firstReadingIndex returns the index of the bin containing the left-most word near the top of `b`. +// Precisely, this is the index of the depth bin that starts with that word with the smallest +// reading direction value in the depth region `minDepthIndex` < depth <= minDepthIndex+ 4*fontsize +// The point of this function is to find the top-most left-most word in `b` that is not a superscript. 
+func (b *wordBag) firstReadingIndex(minDepthIdx int) int { + fontsize := b.firstWord(minDepthIdx).fontsize + minDepth := float64(minDepthIdx+1) * depthBinPoints + maxDepth := minDepth + topWordRangeR*fontsize + firstReadingIdx := minDepthIdx + for _, depthIdx := range b.depthBand(minDepth, maxDepth) { + if diffReading(b.firstWord(depthIdx), b.firstWord(firstReadingIdx)) < 0 { + firstReadingIdx = depthIdx + } + } + return firstReadingIdx +} + +// getDepthIdx returns the index into `b.bins` for depth axis value `depth`. +// Caller must check that len(b.bins) > 0. +func (b *wordBag) getDepthIdx(depth float64) int { + indexes := b.depthIndexes() + depthIdx := depthIndex(depth) + if depthIdx < indexes[0] { + return indexes[0] + } + if depthIdx > indexes[len(indexes)-1] { + return indexes[len(indexes)-1] + } + return depthIdx +} + +// empty returns true if the depth bin with index `depthIdx` is empty. +// NOTE: We delete bins as soon as they become empty so we just have to check for the bin's existence. +func (b *wordBag) empty(depthIdx int) bool { + _, ok := b.bins[depthIdx] + return !ok +} + +// firstWord returns the first word in reading order in bin `depthIdx`. +func (b *wordBag) firstWord(depthIdx int) *textWord { + return b.bins[depthIdx][0] +} + +// stratum returns a copy of `b`.bins[`depthIdx`]. +// stratum is guaranteed to return a non-nil value. It must be called with a valid depth index. +// NOTE: We need to return a copy because remove() and other functions manipulate the array +// underlying the slice. +func (b *wordBag) stratum(depthIdx int) []*textWord { + words := b.bins[depthIdx] + dup := make([]*textWord, len(words)) + copy(dup, words) + return dup +} + +// pullWord adds `word` to `b` and removes it from `bag`. +// `depthIdx` is the depth index of `word` in all wordBags. +// TODO(peterwilliams97): Compute depthIdx from `word` instead of passing it around. 
+func (b *wordBag) pullWord(bag *wordBag, word *textWord, depthIdx int) { + b.PdfRectangle = rectUnion(b.PdfRectangle, word.PdfRectangle) + if word.fontsize > b.fontsize { + b.fontsize = word.fontsize + } + b.bins[depthIdx] = append(b.bins[depthIdx], word) + bag.removeWord(word, depthIdx) +} + +// removeWord removes `word`from `b`. +// In the current implementation it removes `word`from `b`.bins[`depthIdx`]. +// NOTE: We delete bins as soon as they become empty to save code that calls other wordBag +// functions from having to check for empty bins. +// TODO(peterwilliams97): Find a more efficient way of doing this. +func (b *wordBag) removeWord(word *textWord, depthIdx int) { + words := removeWord(b.stratum(depthIdx), word) + if len(words) == 0 { + delete(b.bins, depthIdx) + } else { + b.bins[depthIdx] = words + } +} + +// mergeWordBags merges the bags less than a character width to the left of a bag into that bag. +func mergeWordBags(paraWords []*wordBag) []*wordBag { + if len(paraWords) <= 1 { + return paraWords + } + if verbose { + common.Log.Info("mergeWordBags:") + } + sort.Slice(paraWords, func(i, j int) bool { + pi, pj := paraWords[i], paraWords[j] + ai := pi.Width() * pi.Height() + aj := pj.Width() * pj.Height() + if ai != aj { + return ai > aj + } + if pi.Height() != pj.Height() { + return pi.Height() > pj.Height() + } + return i < j + }) + var merged []*wordBag + absorbed := map[int]struct{}{} + for i0 := 0; i0 < len(paraWords); i0++ { + if _, ok := absorbed[i0]; ok { + continue + } + para0 := paraWords[i0] + for i1 := i0 + 1; i1 < len(paraWords); i1++ { + if _, ok := absorbed[i0]; ok { + continue + } + para1 := paraWords[i1] + r := para0.PdfRectangle + r.Llx -= para0.fontsize + if rectContainsRect(r, para1.PdfRectangle) { + para0.absorb(para1) + absorbed[i1] = struct{}{} + } + } + merged = append(merged, para0) + } + + if len(paraWords) != len(merged)+len(absorbed) { + common.Log.Error("mergeWordBags: %d->%d absorbed=%d", + len(paraWords), len(merged), 
len(absorbed)) + } + return merged +} + +// absorb combines the words from `bag` into `b`. +func (b *wordBag) absorb(bag *wordBag) { + for depthIdx, words := range bag.bins { + for _, word := range words { + b.pullWord(bag, word, depthIdx) + } + } +} + +// depthIndex returns a bin index for depth `depth`. +// The returned depthIdx obeys the following rule. +// depthIdx * depthBinPoints <= depth <= (depthIdx+1) * depthBinPoint +func depthIndex(depth float64) int { + var depthIdx int + if depth >= 0 { + depthIdx = int(depth / depthBinPoints) + } else { + depthIdx = int(depth/depthBinPoints) - 1 + } + return depthIdx +} + +// depthIndexes returns the sorted keys of b.bins. +func (b *wordBag) depthIndexes() []int { + if len(b.bins) == 0 { + return nil + } + indexes := make([]int, len(b.bins)) + i := 0 + for idx := range b.bins { + indexes[i] = idx + i++ + } + sort.Ints(indexes) + return indexes +} + +// sort sorts the word fragments in each bin in `b` in the reading direction. +func (b *wordBag) sort() { + for _, bin := range b.bins { + sort.Slice(bin, func(i, j int) bool { return diffReading(bin[i], bin[j]) < 0 }) + } +} + +// minDepth returns the minimum depth that word fragments in `b` touch. +func (b *wordBag) minDepth() float64 { + return b.pageHeight - (b.Ury - b.fontsize) +} + +// maxDepth returns the maximum depth that word fragments in `b` touch. +func (b *wordBag) maxDepth() float64 { + return b.pageHeight - b.Lly +} + +// The following functions are used only for logging. + +func (b *wordBag) text() string { + words := b.allWords() + texts := make([]string, len(words)) + for i, w := range words { + texts[i] = w.text + } + return strings.Join(texts, " ") +} + +func (b *wordBag) allWords() []*textWord { + var wordList []*textWord + for _, words := range b.bins { + wordList = append(wordList, words...) 
+ } + return wordList +} diff --git a/extractor/text_bound.go b/extractor/text_bound.go new file mode 100644 index 000000000..2b0832629 --- /dev/null +++ b/extractor/text_bound.go @@ -0,0 +1,136 @@ +/* + * This file is subject to the terms and conditions defined in + * file 'LICENSE.md', which is part of this source code package. + */ + +package extractor + +import ( + "math" + + "github.com/unidoc/unipdf/v3/model" +) + +/* + * Sorting functions. + * + * There are two directions: + * - reading. Left to right in English + * - depth (aka non-reading). Top to botttom in English. + * + * Text is read in reading then depth order. + * + * TODO(peterwilliams97): Add support for other reading orders and page rotations + */ + +// bounded is an object with a bounding box. A mark, word, line or para. +type bounded interface { + bbox() model.PdfRectangle +} + +// getDepth returns the depth of `a` on a page of size `pageSize`. +func getDepth(pageSize model.PdfRectangle, a bounded) float64 { + return pageSize.Ury - a.bbox().Lly +} + +// diffReading returns `a` - `b` in the reading direction. +func diffReading(a, b bounded) float64 { + return a.bbox().Llx - b.bbox().Llx +} + +// rectContainsRect returns true if `a` contains `b`. +func rectContainsRect(a, b model.PdfRectangle) bool { + return a.Llx <= b.Llx && b.Urx <= a.Urx && a.Lly <= b.Lly && b.Ury <= a.Ury +} + +// diffDepth returns `a` - `b` in the depth direction. +func diffDepth(a, b bounded) float64 { + return bboxDepth(a) - bboxDepth(b) +} + +// diffReadingDepth returns `a` - `b` in the reading then depth direction.. 
+func diffReadingDepth(a, b bounded) float64 { + diff := diffReading(a, b) + if !isZero(diff) { + return diff + } + return diffDepth(a, b) +} + +// diffDepthReading returns `a` - `b` in the depth then reading directions +func diffDepthReading(a, b bounded) float64 { + cmp := diffDepth(a, b) + if !isZero(cmp) { + return cmp + } + return diffReading(a, b) +} + +// gapReading returns the reading direction gap between `a` and the following object `b` in the +// reading direction. +func gapReading(a, b bounded) float64 { + return a.bbox().Llx - b.bbox().Urx +} + +// bboxDepth returns the relative depth of `b`. Depth is only used for comparison so we don't care +// about its absolute value +func bboxDepth(b bounded) float64 { + return -b.bbox().Lly +} + +// readingOverlapLeft returns true is the left of `word` is in within `para` or delta to its right +func readingOverlapLeft(para *wordBag, word *textWord, delta float64) bool { + return para.Urx <= word.Llx && word.Llx < para.Urx+delta +} + +// readingOverlapPlusGap returns true if `word` overlaps [para.Llx-maxIntraReadingGap, para.Urx+maxIntraReadingGap] +// in the reading direction. +func readingOverlapPlusGap(para *wordBag, word *textWord, maxIntraReadingGap float64) bool { + return word.Llx < para.Urx+maxIntraReadingGap && para.Llx-maxIntraReadingGap < word.Urx +} + +// partial return 'overlap`(*wordBag, *textWord, `param`) bool. +func partial(overlap func(*wordBag, *textWord, float64) bool, + param float64) func(*wordBag, *textWord) bool { + return func(para *wordBag, word *textWord) bool { + return overlap(para, word, param) + } +} + +// rectUnion returns the smallest axis-aligned rectangle that contains `b1` and `b2`. 
+func rectUnion(b1, b2 model.PdfRectangle) model.PdfRectangle { + return model.PdfRectangle{ + Llx: math.Min(b1.Llx, b2.Llx), + Lly: math.Min(b1.Lly, b2.Lly), + Urx: math.Max(b1.Urx, b2.Urx), + Ury: math.Max(b1.Ury, b2.Ury), + } +} + +// rectIntersection returns the largest axis-aligned rectangle that is contained by `b1` and `b2`. +func rectIntersection(b1, b2 model.PdfRectangle) (model.PdfRectangle, bool) { + if !intersects(b1, b2) { + return model.PdfRectangle{}, false + } + return model.PdfRectangle{ + Llx: math.Max(b1.Llx, b2.Llx), + Urx: math.Min(b1.Urx, b2.Urx), + Lly: math.Max(b1.Lly, b2.Lly), + Ury: math.Min(b1.Ury, b2.Ury), + }, true +} + +// intersects returns true if `r0` and `r1` overlap in the x and y axes. +func intersects(b1, b2 model.PdfRectangle) bool { + return intersectsX(b1, b2) && intersectsY(b1, b2) +} + +// intersectsX returns true if `r0` and `r1` overlap in the x axis. +func intersectsX(r0, r1 model.PdfRectangle) bool { + return r1.Llx <= r0.Urx && r0.Llx <= r1.Urx +} + +// intersectsY returns true if `r0` and `r1` overlap in the y axis. +func intersectsY(r0, r1 model.PdfRectangle) bool { + return r0.Lly <= r1.Ury && r1.Lly <= r0.Ury +} diff --git a/extractor/text_const.go b/extractor/text_const.go new file mode 100644 index 000000000..b3b463bb7 --- /dev/null +++ b/extractor/text_const.go @@ -0,0 +1,88 @@ +/* + * This file is subject to the terms and conditions defined in + * file 'LICENSE.md', which is part of this source code package. + */ + +package extractor + +// The follow constant configure debugging. +const ( + verbose = false + verboseGeom = false + verbosePage = false + verbosePara = false + verboseParaLine = verbosePara && false + verboseParaWord = verboseParaLine && false + verboseTable = false +) + +// The following constants control the approaches used in the code. 
+const (
+	doHyphens           = true
+	doRemoveDuplicates  = true
+	doCombineDiacritics = true
+	useEBBox            = false
+)
+
+// The following constants are the tuning parameters for text extraction.
+const (
+	// Change in angle of text in degrees that we treat as a different orientation.
+	orientationGranularity = 10
+	// Size of depth bins in points.
+	depthBinPoints = 6
+
+	// Variation in line depth as a fraction of font size. +lineDepthR for subscripts, -lineDepthR for
+	// superscripts.
+	lineDepthR = 0.5
+
+	// All constants that end in R are relative to font size.
+
+	maxWordAdvanceR = 0.11
+
+	maxKerningR = 0.19
+	maxLeadingR = 0.04
+
+	// Max difference in font sizes allowed within a word.
+	maxIntraWordFontTolR = 0.04
+
+	// Maximum gap between a word and a para in the depth direction for which we pull the word
+	// into the para, as a fraction of the font size.
+	maxIntraDepthGapR = 1.0
+	// Max difference in font size for word and para for the above case.
+	maxIntraDepthFontTolR = 0.04
+
+	// Maximum gap between a word and a para in the reading direction for which we pull the word
+	// into the para.
+	maxIntraReadingGapR = 0.4
+	// Max difference in font size for word and para for the above case.
+	maxIntraReadingFontTol = 0.7
+
+	// Minimum spacing between paras in the reading direction.
+	minInterReadingGapR = 1.0
+	// Max difference in font size for word and para for the above case.
+	minInterReadingFontTol = 0.1
+
+	// Maximum inter-word spacing.
+	maxIntraWordGapR = 1.4
+
+	// Maximum overlap between characters allowed within a line.
+	maxIntraLineOverlapR = 0.46
+
+	// Maximum spacing between characters within a line.
+	maxIntraLineGapR = 0.02
+
+	// Maximum difference in coordinates of duplicated textWords.
+	maxDuplicateWordR = 0.2
+
+	// Maximum distance from a character to its diacritic marks as a fraction of the character size.
+ diacriticRadiusR = 0.5 + + // Minimum number of rumes in the first half of a hyphenated word + minHyphenation = 4 + + // The distance we look down from the top of a wordBag for the leftmost word. + topWordRangeR = 4.0 + + // Minimum number of cells in a textTable + minTableParas = 6 +) diff --git a/extractor/text_line.go b/extractor/text_line.go new file mode 100644 index 000000000..6d89d2b99 --- /dev/null +++ b/extractor/text_line.go @@ -0,0 +1,126 @@ +/* + * This file is subject to the terms and conditions defined in + * file 'LICENSE.md', which is part of this source code package. + */ + +package extractor + +import ( + "fmt" + "strings" + "unicode" + + "github.com/unidoc/unipdf/v3/model" +) + +// textLine repesents words on the same line within a textPara. +type textLine struct { + model.PdfRectangle // Bounding box (union of `marks` bounding boxes). + depth float64 // Distance from bottom of line to top of page. + words []*textWord // Words in this line. + fontsize float64 // Largest word font size. +} + +// newTextLine creates a line with the font and bbox size of the first word in `b`, removes the word +// from `b` and adds it to the line. +func newTextLine(b *wordBag, depthIdx int) *textLine { + word := b.firstWord(depthIdx) + line := textLine{ + PdfRectangle: word.PdfRectangle, + fontsize: word.fontsize, + depth: word.depth, + } + line.pullWord(b, word, depthIdx) + return &line +} + +// String returns a description of `l`. +func (l *textLine) String() string { + return fmt.Sprintf("%.2f %6.2f fontsize=%.2f \"%s\"", + l.depth, l.PdfRectangle, l.fontsize, l.text()) +} + +// bbox makes textLine implement the `bounded` interface. +func (l *textLine) bbox() model.PdfRectangle { + return l.PdfRectangle +} + +// text returns the extracted text contained in line. 
+func (l *textLine) text() string {
+	var words []string
+	for _, w := range l.words {
+		if w.newWord {
+			words = append(words, " ")
+		}
+		words = append(words, w.text)
+	}
+	return strings.Join(words, "")
+}
+
+// toTextMarks returns the TextMarks contained in `l`.text().
+// `offset` is used to give the TextMarks the correct Offset values.
+func (l *textLine) toTextMarks(offset *int) []TextMark {
+	var marks []TextMark
+	for _, w := range l.words {
+		if w.newWord {
+			marks = appendSpaceMark(marks, offset, " ")
+		}
+		wordMarks := w.toTextMarks(offset)
+		marks = append(marks, wordMarks...)
+	}
+	return marks
+}
+
+// pullWord removes `word` from bag and appends it to `l`.
+func (l *textLine) pullWord(bag *wordBag, word *textWord, depthIdx int) {
+	l.appendWord(word)
+	bag.removeWord(word, depthIdx)
+}
+
+// appendWord appends `word` to `l`.
+// `l.PdfRectangle` is increased to bound the new word.
+// `l.fontsize` is the largest of the fontsizes of the words in line.
+func (l *textLine) appendWord(word *textWord) {
+	l.words = append(l.words, word)
+	l.PdfRectangle = rectUnion(l.PdfRectangle, word.PdfRectangle)
+	if word.fontsize > l.fontsize {
+		l.fontsize = word.fontsize
+	}
+	if word.depth > l.depth {
+		l.depth = word.depth
+	}
+}
+
+// markWordBoundaries marks the word fragments that are the first fragments in whole words.
+func (l *textLine) markWordBoundaries() {
+	maxGap := maxIntraLineGapR * l.fontsize
+	for i, w := range l.words[1:] {
+		if gapReading(w, l.words[i]) >= maxGap {
+			w.newWord = true
+		}
+	}
+}
+
+// endsInHyphen attempts to detect words that are split between lines.
+// It currently returns true if `l` ends in a hyphen and its last minHyphenation runes don't contain
+// a space.
+// TODO(peterwilliams97): Figure out a better heuristic.
+func (l *textLine) endsInHyphen() bool {
+	// Computing l.text() is a little expensive so we filter out simple cases first.
+ lastWord := l.words[len(l.words)-1] + runes := []rune(lastWord.text) + if !unicode.Is(unicode.Hyphen, runes[len(runes)-1]) { + return false + } + if lastWord.newWord && endsInHyphen(runes) { + return true + } + return endsInHyphen([]rune(l.text())) +} + +// endsInHyphen returns true if `runes` ends with a hyphenated word. +func endsInHyphen(runes []rune) bool { + return len(runes) >= minHyphenation && + unicode.Is(unicode.Hyphen, runes[len(runes)-1]) && + !unicode.IsSpace(runes[len(runes)-2]) +} diff --git a/extractor/text_mark.go b/extractor/text_mark.go new file mode 100644 index 000000000..7888d3420 --- /dev/null +++ b/extractor/text_mark.go @@ -0,0 +1,189 @@ +/* + * This file is subject to the terms and conditions defined in + * file 'LICENSE.md', which is part of this source code package. + */ + +package extractor + +import ( + "fmt" + "image/color" + "math" + + "github.com/unidoc/unipdf/v3/common" + "github.com/unidoc/unipdf/v3/internal/transform" + "github.com/unidoc/unipdf/v3/model" +) + +// textMark represents text drawn on a page and its position in device coordinates. +// All dimensions are in device coordinates. +type textMark struct { + model.PdfRectangle // Bounding box oriented so character base is at bottom + orient int // Orientation + text string // The text (decoded via ToUnicode). + original string // Original text (decoded). + font *model.PdfFont // The font the mark was drawn with. + fontsize float64 // The font size the mark was drawn with. + charspacing float64 // TODO (peterwilliams97: Should this be exposed in TextMark? + trm transform.Matrix // The current text rendering matrix (TRM above). + end transform.Point // The end of character device coordinates. + originaBBox model.PdfRectangle // Bounding box without orientation correction. + fillColor color.Color // Text fill color. + strokeColor color.Color // Text stroke color. 
+} + +// newTextMark returns a textMark for text `text` rendered with text rendering matrix (TRM) `trm` +// and end of character device coordinates `end`. `spaceWidth` is our best guess at the width of a +// space in the font the text is rendered in device coordinates. +func (to *textObject) newTextMark(text string, trm transform.Matrix, end transform.Point, + spaceWidth float64, font *model.PdfFont, charspacing float64, + fillColor, strokeColor color.Color) (textMark, bool) { + theta := trm.Angle() + orient := nearestMultiple(theta, orientationGranularity) + var height float64 + if orient%180 != 90 { + height = trm.ScalingFactorY() + } else { + height = trm.ScalingFactorX() + } + + start := translation(trm) + bbox := model.PdfRectangle{Llx: start.X, Lly: start.Y, Urx: end.X, Ury: end.Y} + switch orient % 360 { + case 90: + bbox.Urx -= height + case 180: + bbox.Ury -= height + case 270: + bbox.Urx += height + case 0: + bbox.Ury += height + default: + // This is a hack to capture diagonal text. + // TODO(peterwilliams97): Extract diagonal text. + orient = 0 + bbox.Ury += height + } + if bbox.Llx > bbox.Urx { + bbox.Llx, bbox.Urx = bbox.Urx, bbox.Llx + } + if bbox.Lly > bbox.Ury { + bbox.Lly, bbox.Ury = bbox.Ury, bbox.Lly + } + + clipped, onPage := rectIntersection(bbox, to.e.mediaBox) + if !onPage { + common.Log.Debug("Text mark outside page. bbox=%g mediaBox=%g text=%q", + bbox, to.e.mediaBox, text) + } + bbox = clipped + + // The orientedBBox is bbox rotated and translated so the base of the character is at Lly. 
+ orientedBBox := bbox + orientedMBox := to.e.mediaBox + + switch orient % 360 { + case 90: + orientedMBox.Urx, orientedMBox.Ury = orientedMBox.Ury, orientedMBox.Urx + orientedBBox = model.PdfRectangle{ + Llx: orientedMBox.Urx - bbox.Ury, + Urx: orientedMBox.Urx - bbox.Lly, + Lly: bbox.Llx, + Ury: bbox.Urx} + case 180: + orientedBBox = model.PdfRectangle{ + Llx: orientedMBox.Urx - bbox.Llx, + Urx: orientedMBox.Urx - bbox.Urx, + Lly: orientedMBox.Ury - bbox.Lly, + Ury: orientedMBox.Ury - bbox.Ury} + case 270: + orientedMBox.Urx, orientedMBox.Ury = orientedMBox.Ury, orientedMBox.Urx + orientedBBox = model.PdfRectangle{ + Llx: bbox.Ury, + Urx: bbox.Lly, + Lly: orientedMBox.Ury - bbox.Llx, + Ury: orientedMBox.Ury - bbox.Urx} + } + if orientedBBox.Llx > orientedBBox.Urx { + orientedBBox.Llx, orientedBBox.Urx = orientedBBox.Urx, orientedBBox.Llx + } + if orientedBBox.Lly > orientedBBox.Ury { + orientedBBox.Lly, orientedBBox.Ury = orientedBBox.Ury, orientedBBox.Lly + } + + tm := textMark{ + text: text, + PdfRectangle: orientedBBox, + originaBBox: bbox, + font: font, + fontsize: height, + charspacing: charspacing, + trm: trm, + end: end, + orient: orient, + fillColor: fillColor, + strokeColor: strokeColor, + } + if verboseGeom { + common.Log.Info("newTextMark: start=%.2f end=%.2f %s", start, end, tm.String()) + } + return tm, onPage +} + +// String returns a description of `tm`. +func (tm *textMark) String() string { + return fmt.Sprintf("%.2f fontsize=%.2f \"%s\"", tm.PdfRectangle, tm.fontsize, tm.text) +} + +// bbox makes textMark implement the `bounded` interface. +func (tm *textMark) bbox() model.PdfRectangle { + return tm.PdfRectangle +} + +// ToTextMark returns the public view of `tm`. 
+func (tm *textMark) ToTextMark() TextMark { + return TextMark{ + Text: tm.text, + Original: tm.original, + BBox: tm.originaBBox, + Font: tm.font, + FontSize: tm.fontsize, + FillColor: tm.fillColor, + StrokeColor: tm.strokeColor, + } +} + +// inDiacriticArea returns true if `diacritic` is in the area where it could be a diacritic of `tm`. +func (tm *textMark) inDiacriticArea(diacritic *textMark) bool { + dLlx := tm.Llx - diacritic.Llx + dUrx := tm.Urx - diacritic.Urx + dLly := tm.Lly - diacritic.Lly + return math.Abs(dLlx+dUrx) < tm.Width()*diacriticRadiusR && + math.Abs(dLly) < tm.Height()*diacriticRadiusR +} + +// appendTextMark appends `mark` to `marks` and updates `offset`, the offset of `mark` in the extracted +// text. +func appendTextMark(marks []TextMark, offset *int, mark TextMark) []TextMark { + mark.Offset = *offset + marks = append(marks, mark) + *offset += len(mark.Text) + return marks +} + +// appendSpaceMark appends a spaceMark with space character `space` to `marks` and updates `offset`, +// the offset of `mark` in the extracted text. +func appendSpaceMark(marks []TextMark, offset *int, spaceChar string) []TextMark { + mark := spaceMark + mark.Text = spaceChar + return appendTextMark(marks, offset, mark) +} + +// nearestMultiple return the integer multiple of `m` that is closest to `x`. +func nearestMultiple(x float64, m int) int { + if m == 0 { + m = 1 + } + fac := float64(m) + return int(math.Round(x/fac) * fac) +} diff --git a/extractor/text_page.go b/extractor/text_page.go new file mode 100644 index 000000000..6bd8e7089 --- /dev/null +++ b/extractor/text_page.go @@ -0,0 +1,430 @@ +/* + * This file is subject to the terms and conditions defined in + * file 'LICENSE.md', which is part of this source code package. + */ + +package extractor + +import ( + "fmt" + "io" + "math" + "sort" + + "github.com/unidoc/unipdf/v3/common" + "github.com/unidoc/unipdf/v3/model" +) + +// makeTextPage builds a paraList from `marks`, the textMarks on a page. 
+// The paraList contains the page arranged as
+// - a list of textPara in reading order
+// - each textPara contains list of textLine (text lines or parts of text lines) in reading order
+// - each textLine contains a list of textWord (words or parts of words) in reading order
+// The paraList is thus an ordering of words on a page.
+// - Users of the paraList are expected to work with words. This should be adequate for most uses
+//   as words are the basic unit of meaning in written language.
+// - However we provide links back from the extracted text to the textMarks as follows.
+//   * paraList.writeText() returns the extracted text for a page
+//   * paras.toTextMarks() returns a TextMarkArray containing the marks
+//   * TextMarkArray.RangeOffset(lo, hi) returns the marks corresponding to offsets [lo:hi] in the
+//     extracted text.
+// NOTE: The "parts of words" occur because of hyphenation. We do some weak coordinate based
+//       dehyphenation. Callers who need strong dehyphenation should use NLP libraries.
+// The "parts of lines" are an implementation detail. Line fragments are combined in
+// paraList.writeText().
+// ALGORITHM:
+// 1) Group the textMarks into textWords based on their bounding boxes.
+// 2) Group the textWords into textParas based on their bounding boxes.
+// 3) Detect textParas arranged as cells in a table and convert each one to a textPara containing a
+//    textTable.
+// 4) Sort the textParas in reading order.
+func makeTextPage(marks []*textMark, pageSize model.PdfRectangle) paraList {
+	common.Log.Trace("makeTextPage: %d elements pageSize=%.2f", len(marks), pageSize)
+	if len(marks) == 0 {
+		return nil
+	}
+
+	// Group the marks into word fragments.
+	words := makeTextWords(marks, pageSize)
+	if len(words) == 0 {
+		return nil
+	}
+
+	// Put the word fragments into a container that facilitates the grouping of words into paragraphs.
+ pageWords := makeWordBag(words, pageSize.Ury) + + // Divide the page into rectangular regions for each paragraph and creata a wordBag for each one. + paraWords := dividePage(pageWords, pageSize.Ury) + paraWords = mergeWordBags(paraWords) + + // Arrange the contents of each paragraph wordBag into lines and the lines into whole words. + paras := make(paraList, 0, len(paraWords)) + for _, bag := range paraWords { + para := bag.arrangeText() + if para != nil { + paras = append(paras, para) + } + } + + // Find paras that are cells in tables, convert the tables to paras and remove the cell paras. + if len(paras) >= minTableParas { + paras = paras.extractTables() + } + + // Sort the paras into reading order. + paras.sortReadingOrder() + paras.log("sorted in reading order") + + return paras +} + +// dividePage divides `pageWords`, the page wordBag, into a list of paragraph wordBags. +func dividePage(pageWords *wordBag, pageHeight float64) []*wordBag { + var paraWordBags []*wordBag + + // We move words from `page` to paras until there no words left in page. + // We do this by iterating through `page` in depth bin order and, for each surving bin (see + // below), creating a paragraph with seed word, `words[0]` in the code below. + // We then move words from around the `para` region from `page` to `para` . + // This may empty some page bins before we iterate to them + // Some bins are emptied before they iterated to (seee "surving bin" above). + // If a `page` survives until it is iterated to then at least one `para` will be built around it. + + for _, depthIdx := range pageWords.depthIndexes() { + changed := false + for !pageWords.empty(depthIdx) { + // Start a new paragraph region `paraWords`. + // Build `paraWords` out from the left-most (lowest in reading direction) word `words`[0], + // in the bins in and below `depthIdx`. + + // `firstWord` is the left-most word from the bins in and a few lines below `depthIdx`. We + // seed 'paraWords` with this word. 
+ firstReadingIdx := pageWords.firstReadingIndex(depthIdx) + firstWord := pageWords.firstWord(firstReadingIdx) + paraWords := newWordBag(firstWord, pageHeight) + pageWords.removeWord(firstWord, firstReadingIdx) + if verbosePage { + common.Log.Info("words[0]=%s", firstWord.String()) + } + + // The following 3 numbers define whether words should be added to `paraWords`. + minInterReadingGap := minInterReadingGapR * paraWords.fontsize + maxIntraReadingGap := maxIntraReadingGapR * paraWords.fontsize + maxIntraDepthGap := maxIntraDepthGapR * paraWords.fontsize + + // Add words to `paraWords` until we pass through the following loop without adding a + // new word. + for running := true; running; running = changed { + changed = false + + // Add words that are within maxIntraDepthGap of `paraWords` in the depth direction. + // i.e. Stretch paraWords in the depth direction, vertically for English text. + if verbosePage { + common.Log.Info("paraWords depth %.2f - %.2f maxIntraDepthGap=%.2f ", + paraWords.minDepth(), paraWords.maxDepth(), maxIntraDepthGap) + } + if pageWords.scanBand("vertical", paraWords, partial(readingOverlapPlusGap, 0), + paraWords.minDepth()-maxIntraDepthGap, paraWords.maxDepth()+maxIntraDepthGap, + maxIntraDepthFontTolR, false, false) > 0 { + changed = true + } + // Add words that are within maxIntraReadingGap of `paraWords` in the reading direction. + // i.e. Stretch paraWords in the reading direction, horizontall for English text. + if pageWords.scanBand("horizontal", paraWords, partial(readingOverlapPlusGap, maxIntraReadingGap), + paraWords.minDepth(), paraWords.maxDepth(), + maxIntraReadingFontTol, false, false) > 0 { + changed = true + } + // The above stretching has got as far as it can go. Repeating it won't pull in more words. + + // Only try to combine other words if we can't grow paraWords in the simple way above. + if changed { + continue + } + + // In the following cases, we don't expand `paraWords` while scanning. 
We look for words + // around paraWords. If we find them, we add them then expand `paraWords` when we are done. + // This pulls the numbers to the left of paraWords into paraWords + // e.g. From + // Regulatory compliance + // Archiving + // Document search + // to + // 1. Regulatory compliance + // 2. Archiving + // 3. Document search + + // If there are words to the left of `paraWords`, add them. + // We need to limit the number of words. + n := pageWords.scanBand("", paraWords, partial(readingOverlapLeft, minInterReadingGap), + paraWords.minDepth(), paraWords.maxDepth(), + minInterReadingFontTol, true, false) + if n > 0 { + r := (paraWords.maxDepth() - paraWords.minDepth()) / paraWords.fontsize + if (n > 1 && float64(n) > 0.3*r) || n <= 10 { + if pageWords.scanBand("other", paraWords, partial(readingOverlapLeft, minInterReadingGap), + paraWords.minDepth(), paraWords.maxDepth(), + minInterReadingFontTol, false, true) > 0 { + changed = true + } + } + } + } + paraWordBags = append(paraWordBags, paraWords) + } + } + + return paraWordBags +} + +// writeText writes the text in `paras` to `w`. +func (paras paraList) writeText(w io.Writer) { + for ip, para := range paras { + para.writeText(w) + if ip != len(paras)-1 { + if sameLine(para, paras[ip+1]) { + w.Write([]byte(" ")) + } else { + w.Write([]byte("\n")) + w.Write([]byte("\n")) + } + } + } + w.Write([]byte("\n")) + w.Write([]byte("\n")) +} + +// toTextMarks creates the TextMarkArray corresponding to the extracted text created by +// `paras`.writeText(). +func (paras paraList) toTextMarks() []TextMark { + offset := 0 + var marks []TextMark + for ip, para := range paras { + paraMarks := para.toTextMarks(&offset) + marks = append(marks, paraMarks...) 
+ if ip != len(paras)-1 { + if sameLine(para, paras[ip+1]) { + marks = appendSpaceMark(marks, &offset, " ") + } else { + marks = appendSpaceMark(marks, &offset, "\n") + marks = appendSpaceMark(marks, &offset, "\n") + } + } + } + marks = appendSpaceMark(marks, &offset, "\n") + marks = appendSpaceMark(marks, &offset, "\n") + return marks +} + +// sameLine returms true if `para1` and `para2` are on the same line. +func sameLine(para1, para2 *textPara) bool { + return isZero(para1.depth() - para2.depth()) +} + +// tables returns the tables from all the paras that contain them. +func (paras paraList) tables() []TextTable { + var tables []TextTable + for _, para := range paras { + if para.table != nil { + tables = append(tables, para.table.toTextTable()) + } + } + return tables +} + +// sortReadingOrder sorts `paras` in reading order. +func (paras paraList) sortReadingOrder() { + common.Log.Trace("sortReadingOrder: paras=%d ===========x=============", len(paras)) + if len(paras) <= 1 { + return + } + paras.computeEBBoxes() + sort.Slice(paras, func(i, j int) bool { return diffDepthReading(paras[i], paras[j]) <= 0 }) + order := paras.topoOrder() + paras.reorder(order) +} + +// topoOrder returns the ordering of the topological sort of `paras` using readBefore() to determine +// the incoming nodes to each node. +func (paras paraList) topoOrder() []int { + if verbosePage { + common.Log.Info("topoOrder:") + } + n := len(paras) + visited := make([]bool, n) + order := make([]int, 0, n) + llyOrder := paras.llyOrdering() + + // sortNode recursively sorts below node `idx` in the adjacency matrix. + var sortNode func(idx int) + sortNode = func(idx int) { + visited[idx] = true + for i := 0; i < n; i++ { + if !visited[i] { + if paras.readBefore(llyOrder, idx, i) { + sortNode(i) + } + } + } + order = append(order, idx) // Should prepend but it's cheaper to append and reverse later. 
+	}
+
+	for idx := 0; idx < n; idx++ {
+		if !visited[idx] {
+			sortNode(idx)
+		}
+	}
+
+	return reversed(order)
+}
+
+// readBefore returns true if paras[`i`] comes before paras[`j`].
+// readBefore defines an ordering over `paras`.
+// a = paras[i], b = paras[j]
+// 1. Line segment `a` comes before line segment `b` if their ranges of x-coordinates overlap and if
+//    line segment `a` is above line segment `b` on the page.
+// 2. Line segment `a` comes before line segment `b` if `a` is entirely to the left of `b` and if
+//    there does not exist a line segment `c` whose y-coordinates are between `a` and `b` and whose
+//    range of x coordinates overlaps both `a` and `b`.
+// From Thomas M. Breuel "High Performance Document Layout Analysis".
+func (paras paraList) readBefore(ordering []int, i, j int) bool {
+	a, b := paras[i], paras[j]
+	// Breuel's rule 1
+	if overlappedXPara(a, b) && a.Lly > b.Lly {
+		return true
+	}
+
+	// Breuel's rule 2
+	if !(a.eBBox.Urx < b.eBBox.Llx) {
+		return false
+	}
+
+	lo, hi := a.Lly, b.Lly
+	if lo > hi {
+		hi, lo = lo, hi
+	}
+	llx := math.Max(a.eBBox.Llx, b.eBBox.Llx)
+	urx := math.Min(a.eBBox.Urx, b.eBBox.Urx)
+
+	llyOrder := paras.llyRange(ordering, lo, hi)
+	for _, k := range llyOrder {
+		if k == i || k == j {
+			continue
+		}
+		c := paras[k]
+		if c.eBBox.Llx <= urx && llx <= c.eBBox.Urx {
+			return false
+		}
+	}
+	return true
+}
+
+// overlappedXPara returns true if `r0` and `r1` overlap on the x-axis.
+func overlappedXPara(r0, r1 *textPara) bool {
+	return intersectsX(r0.eBBox, r1.eBBox)
+}
+
+// llyOrdering is an ordering over the indexes of `paras`, sorted by Lly in increasing order.
+func (paras paraList) llyOrdering() []int { + ordering := make([]int, len(paras)) + for i := range paras { + ordering[i] = i + } + sort.SliceStable(ordering, func(i, j int) bool { + oi, oj := ordering[i], ordering[j] + return paras[oi].Lly < paras[oj].Lly + }) + return ordering +} + +// llyRange returns the indexes in `paras` of paras p: lo <= p.Llx < hi +func (paras paraList) llyRange(ordering []int, lo, hi float64) []int { + n := len(paras) + if hi < paras[ordering[0]].Lly || lo > paras[ordering[n-1]].Lly { + return nil + } + + // i0 is the lowest i: lly(i) >= lo + // i1 is the lowest i: lly(i) > hi + i0 := sort.Search(n, func(i int) bool { return paras[ordering[i]].Lly >= lo }) + i1 := sort.Search(n, func(i int) bool { return paras[ordering[i]].Lly > hi }) + + return ordering[i0:i1] +} + +// computeEBBoxes computes the eBBox fields in the elements of `paras`. +// The EBBoxs are the regions around the paras that don't intersect paras in other columns. +// This is needed for sortReadingOrder to work with skinny paras in a column of fat paras. The +// sorting assumes the skinny para bounding box is as wide as the fat para bounding boxes. +func (paras paraList) computeEBBoxes() { + if verbose { + common.Log.Info("computeEBBoxes:") + } + + for _, para := range paras { + para.eBBox = para.PdfRectangle + } + paraYNeighbours := paras.yNeighbours() + + for i, aa := range paras { + a := aa.eBBox + // [llx, urx] is the reading direction interval for which no paras overlap `a`. + llx, urx := -1.0e9, +1.0e9 + + for _, j := range paraYNeighbours[aa] { + b := paras[j].eBBox + if b.Urx < a.Llx { // `b` to left of `a`. no x overlap. + llx = math.Max(llx, b.Urx) + } else if a.Urx < b.Llx { // `b` to right of `a`. no x overlap. + urx = math.Min(urx, b.Llx) + } + } + + // llx extends left from `a` and overlaps no other paras. + // urx extends right from `a` and overlaps no other paras. 
+ + // Go through all paras below `a` within interval [llx, urx] in the reading direction and + // expand `a` as far as possible to left and right without overlapping any of them. + for j, bb := range paras { + b := bb.eBBox + if i == j || b.Ury > a.Lly { + continue + } + + if llx <= b.Llx && b.Llx < a.Llx { + // If `b` is completely to right of `llx`, extend `a` left to `b`. + a.Llx = b.Llx + } else if b.Urx <= urx && a.Urx < b.Urx { + // If `b` is completely to left of `urx`, extend `a` right to `b`. + a.Urx = b.Urx + } + } + if verbose { + fmt.Printf("%4d: %6.2f->%6.2f %q\n", i, aa.eBBox, a, truncate(aa.text(), 50)) + } + aa.eBBox = a + } + if useEBBox { + for _, para := range paras { + para.PdfRectangle = para.eBBox + } + } +} + +// reversed return `order` reversed. +func reversed(order []int) []int { + rev := make([]int, len(order)) + for i, v := range order { + rev[len(order)-1-i] = v + } + return rev +} + +// reorder reorders `para` to the order in `order`. +func (paras paraList) reorder(order []int) { + sorted := make(paraList, len(paras)) + for i, k := range order { + sorted[i] = paras[k] + } + copy(paras, sorted) +} diff --git a/extractor/text_para.go b/extractor/text_para.go new file mode 100644 index 000000000..9982ffa9d --- /dev/null +++ b/extractor/text_para.go @@ -0,0 +1,354 @@ +/* + * This file is subject to the terms and conditions defined in + * file 'LICENSE.md', which is part of this source code package. + */ + +package extractor + +import ( + "bytes" + "fmt" + "io" + "math" + "sort" + + "github.com/unidoc/unipdf/v3/common" + "github.com/unidoc/unipdf/v3/model" +) + +// paraList is a sequence of textPara. We use it so often that it is convenient to have its own +// type so we can have methods on it. +type paraList []*textPara + +// textPara is a group of words in a rectangular region of a page that get read together. +// A paragraph in a document might span multiple pages. This is a paragraph fragment on one page. 
+// textParas can be tables in which case the content is in `table`, otherwise the content is in `lines`. +// textTable cells are textParas so this gives one level of recursion +type textPara struct { + model.PdfRectangle // Bounding box. + eBBox model.PdfRectangle // Extended bounding box needed to compute reading order. + lines []*textLine // The lines in the paragraph. (nil for the table case) + table *textTable // The table contained in this region if there is one. nil otherwise + // The following fields are used for detecting and extracting tables. + isCell bool // Is this para a cell in a textTable? + // The unique highest para completely to the left of this that overlaps it in the y-direction, if one exists.. + left *textPara + // The unique highest para completely to the right of this that overlaps it in the y-direction, if one exists. + right *textPara + // The unique highest para completely above this that overlaps it in the x-direction, if one exists. + above *textPara + // The unique highest para completely below this that overlaps it in the x-direction, if one exists. + below *textPara +} + +// makeTextPara returns a textPara with bounding rectangle `bbox`. +func makeTextPara(bbox model.PdfRectangle, lines []*textLine) *textPara { + return &textPara{PdfRectangle: bbox, lines: lines} +} + +// String returns a description of `p`. +func (p *textPara) String() string { + table := "" + if p.table != nil { + table = fmt.Sprintf("[%dx%d] ", p.table.w, p.table.h) + } + return fmt.Sprintf("%6.2f %s%d lines %q", + p.PdfRectangle, table, len(p.lines), truncate(p.text(), 50)) +} + +// depth returns the paragraph's depth. which is the depth of its top line. +// We return the top line depth because textPara depth is used to tell if 2 paras have the same +// depth. English readers compare paragraph depths by their top lines. 
+func (p *textPara) depth() float64 { + if len(p.lines) > 0 { + return p.lines[0].depth + } + // Use the top left cell of the table if there is one + return p.table.get(0, 0).depth() +} + +// text is a convenience function that returns the text `p` including tables. +func (p *textPara) text() string { + w := new(bytes.Buffer) + p.writeText(w) + return w.String() +} + +// writeText writes the text of `p` including tables to `w`. +func (p *textPara) writeText(w io.Writer) { + if p.table == nil { + p.writeCellText(w) + return + } + for y := 0; y < p.table.h; y++ { + for x := 0; x < p.table.w; x++ { + cell := p.table.get(x, y) + if cell == nil { + w.Write([]byte("\t")) + } else { + cell.writeCellText(w) + } + w.Write([]byte(" ")) + } + if y < p.table.h-1 { + w.Write([]byte("\n")) + } + } +} + +// toTextMarks creates the TextMarkArray corresponding to the extracted text created by +// paras `p`.writeText(). +func (p *textPara) toTextMarks(offset *int) []TextMark { + if p.table == nil { + return p.toCellTextMarks(offset) + } + var marks []TextMark + for y := 0; y < p.table.h; y++ { + for x := 0; x < p.table.w; x++ { + cell := p.table.get(x, y) + if cell == nil { + marks = appendSpaceMark(marks, offset, "\t") + } else { + cellMarks := cell.toCellTextMarks(offset) + marks = append(marks, cellMarks...) + } + marks = appendSpaceMark(marks, offset, " ") + } + if y < p.table.h-1 { + marks = appendSpaceMark(marks, offset, "\n") + } + } + return marks +} + +// writeCellText writes the text of `p` not including tables to `w`. +func (p *textPara) writeCellText(w io.Writer) { + for il, line := range p.lines { + lineText := line.text() + reduced := doHyphens && line.endsInHyphen() && il != len(p.lines)-1 + if reduced { // Line ending with hyphen. Remove it. 
+ lineText = removeLastRune(lineText) + } + w.Write([]byte(lineText)) + if !(reduced || il == len(p.lines)-1) { + w.Write([]byte(getSpace(line.depth, p.lines[il+1].depth))) + } + } +} + +// toCellTextMarks creates the TextMarkArray corresponding to the extracted text created by +// paras `p`.writeCellText(). +func (p *textPara) toCellTextMarks(offset *int) []TextMark { + var marks []TextMark + for il, line := range p.lines { + lineMarks := line.toTextMarks(offset) + reduced := doHyphens && line.endsInHyphen() && il != len(p.lines)-1 + if reduced { // Line ending with hyphen. Remove it. + lineMarks = removeLastTextMarkRune(lineMarks, offset) + } + marks = append(marks, lineMarks...) + if !(reduced || il == len(p.lines)-1) { + marks = appendSpaceMark(marks, offset, getSpace(line.depth, p.lines[il+1].depth)) + } + } + return marks +} + +// removeLastTextMarkRune removes the last rune from `marks`. +func removeLastTextMarkRune(marks []TextMark, offset *int) []TextMark { + tm := marks[len(marks)-1] + runes := []rune(tm.Text) + if len(runes) == 1 { + marks = marks[:len(marks)-1] + tm1 := marks[len(marks)-1] + *offset = tm1.Offset + len(tm1.Text) + } else { + text := removeLastRune(tm.Text) + *offset += len(text) - len(tm.Text) + tm.Text = text + } + return marks +} + +// removeLastRune removes the last run from `text`. +func removeLastRune(text string) string { + runes := []rune(text) + return string(runes[:len(runes)-1]) +} + +// getSpace returns the space to insert between lines of depth `depth1` and `depth2`. +// Next line is the same depth so it's the same line as this one in the extracted text +func getSpace(depth1, depth2 float64) string { + eol := !isZero(depth1 - depth2) + if eol { + return "\n" + } + return " " +} + +// bbox makes textPara implement the `bounded` interface. +func (p *textPara) bbox() model.PdfRectangle { + return p.PdfRectangle +} + +// fontsize return the para's fontsize which we take to be the first line's fontsize. 
+// Caller must check that `p` has at least one line. +func (p *textPara) fontsize() float64 { + return p.lines[0].fontsize +} + +// removeDuplicates removes duplicate word fragments such as those used for bolding. +func (b *wordBag) removeDuplicates() { + for _, depthIdx := range b.depthIndexes() { + if len(b.bins[depthIdx]) == 0 { + continue + } + word := b.bins[depthIdx][0] + delta := maxDuplicateWordR * word.fontsize + minDepth := word.depth + for _, idx := range b.depthBand(minDepth, minDepth+delta) { + duplicates := map[*textWord]struct{}{} + words := b.bins[idx] + for _, w := range words { + if w != word && w.text == word.text && + math.Abs(w.Llx-word.Llx) < delta && + math.Abs(w.Urx-word.Urx) < delta && + math.Abs(w.Lly-word.Lly) < delta && + math.Abs(w.Ury-word.Ury) < delta { + duplicates[w] = struct{}{} + } + } + if len(duplicates) > 0 { + i := 0 + for _, w := range words { + if _, ok := duplicates[w]; !ok { + words[i] = w + i++ + } + } + b.bins[idx] = words[:len(words)-len(duplicates)] + if len(b.bins[idx]) == 0 { + delete(b.bins, idx) + } + } + } + } +} + +// arrangeText arranges the word fragments (textWords) in `b` into lines and words. +// The lines are groups of textWords of similar depths. +// The textWords in each line are sorted in reading order and those that start whole words (as +// opposed to word fragments) have their `newWord` flag set to true. +func (b *wordBag) arrangeText() *textPara { + b.sort() // Sort the words in `b`'s bins in the reading direction. + + if doRemoveDuplicates { + b.removeDuplicates() + } + + var lines []*textLine + + // Build the lines by iterating through the words from top to bottom. + // In the current implementation, we do this by emptying the word bins in increasing depth order. + for _, depthIdx := range b.depthIndexes() { + for !b.empty(depthIdx) { + + // firstWord is the left-most word near the top of the bin with index `depthIdx`. 
As we + // are scanning down `b`, this is the left-most word near the top of the `b` + firstReadingIdx := b.firstReadingIndex(depthIdx) + firstWord := b.firstWord(firstReadingIdx) + // Create a new line. + line := newTextLine(b, firstReadingIdx) + + // Compute the search range based on `b` first word fontsize. + fontsize := firstWord.fontsize + minDepth := firstWord.depth - lineDepthR*fontsize + maxDepth := firstWord.depth + lineDepthR*fontsize + maxIntraWordGap := maxIntraWordGapR * fontsize + maxIntraLineOverlap := maxIntraLineOverlapR * fontsize + + // Find the rest of the words in the line that starts with `firstWord` + // Search down from `minDepth`, half a line above `firstWord` to `maxDepth`, half a line + // below `firstWord` for the leftmost word to the right of the last word in `line`. + remainingWords: + for { + var nextWord *textWord // The next word to add to `line` if there is one. + nextDepthIdx := 0 // nextWord's depthIndex + // We start with this highest remaining word + for _, depthIdx := range b.depthBand(minDepth, maxDepth) { + word := b.highestWord(depthIdx, minDepth, maxDepth) + if word == nil { + continue + } + gap := gapReading(word, line.words[len(line.words)-1]) + if gap < -maxIntraLineOverlap { // Reverted too far to left. Can't be same line. + break remainingWords + } + if gap > maxIntraWordGap { // Advanced too far too right. Might not be same line. + continue + } + if nextWord != nil && diffReading(word, nextWord) >= 0 { // Not leftmost world + continue + } + nextWord = word + nextDepthIdx = depthIdx + } + if nextWord == nil { // No more words in this line. + break + } + // remove `nextWord` from `b` and append it to `line`. 
+ line.pullWord(b, nextWord, nextDepthIdx) + } + + line.markWordBoundaries() + lines = append(lines, line) + } + } + + if len(lines) == 0 { + return nil + } + + sort.Slice(lines, func(i, j int) bool { + return diffDepthReading(lines[i], lines[j]) < 0 + }) + + para := makeTextPara(b.PdfRectangle, lines) + + if verbosePara { + common.Log.Info("arrangeText !!! para=%s", para.String()) + if verboseParaLine { + for i, line := range para.lines { + fmt.Printf("%4d: %s\n", i, line.String()) + if verboseParaWord { + for j, word := range line.words { + fmt.Printf("%8d: %s\n", j, word.String()) + for k, mark := range word.marks { + fmt.Printf("%12d: %s\n", k, mark.String()) + } + } + } + } + } + } + return para +} + +// log logs the contents of `paras`. +func (paras paraList) log(title string) { + if !verbosePage { + return + } + common.Log.Info("%8s: %d paras =======-------=======", title, len(paras)) + for i, para := range paras { + if para == nil { + continue + } + text := para.text() + tabl := " " + if para.table != nil { + tabl = fmt.Sprintf("[%dx%d]", para.table.w, para.table.h) + } + fmt.Printf("%4d: %6.2f %s %q\n", i, para.PdfRectangle, tabl, truncate(text, 50)) + } +} diff --git a/extractor/text_table.go b/extractor/text_table.go new file mode 100644 index 000000000..d1eb5cbfd --- /dev/null +++ b/extractor/text_table.go @@ -0,0 +1,303 @@ +/* + * This file is subject to the terms and conditions defined in + * file 'LICENSE.md', which is part of this source code package. + */ + +package extractor + +import ( + "fmt" + "sort" + + "github.com/unidoc/unipdf/v3/common" + "github.com/unidoc/unipdf/v3/model" +) + +// textTable is a table of `w` x `h` textPara cells. +type textTable struct { + model.PdfRectangle // Bounding rectangle. + w, h int // w=number of columns. h=number of rows. + cells map[uint64]*textPara // The cells +} + +// String returns a description of `t`. 
+func (t *textTable) String() string {
+	return fmt.Sprintf("%d x %d", t.w, t.h)
+}
+
+// bbox makes textTable implement the `bounded` interface.
+func (t *textTable) bbox() model.PdfRectangle {
+	return t.PdfRectangle
+}
+
+// extractTables converts the `paras` that are table cells to tables containing those cells.
+func (paras paraList) extractTables() paraList {
+	if verboseTable {
+		common.Log.Debug("extractTables=%d ===========x=============", len(paras))
+	}
+	if len(paras) < minTableParas {
+		return paras
+	}
+	tables := paras.findTables()
+	if verboseTable {
+		common.Log.Info("combined tables %d ================", len(tables))
+		for i, t := range tables {
+			t.log(fmt.Sprintf("combined %d", i))
+		}
+	}
+	return paras.applyTables(tables)
+}
+
+// findTables returns all the tables in `paras`.
+func (paras paraList) findTables() []*textTable {
+	paras.addNeighbours()
+	// Pre-sort by reading direction then depth
+	sort.Slice(paras, func(i, j int) bool {
+		return diffReadingDepth(paras[i], paras[j]) < 0
+	})
+
+	var tables []*textTable
+	for _, para := range paras {
+		if para.isCell {
+			continue
+		}
+		table := para.isAtom()
+		if table == nil {
+			continue
+		}
+
+		table.growTable()
+		if table.w*table.h < minTableParas {
+			continue
+		}
+		table.markCells()
+		table.log("grown")
+		tables = append(tables, table)
+
+	}
+	return tables
+}
+
+// isAtom attempts to build the smallest possible table fragment of 2 x 2 cells.
+// If a table can be built then it is returned. Otherwise nil is returned.
+// The smallest possible table is
+// a b
+// c d
+// where
+// a is `para`.
+// b is immediately to the right of a and overlaps it in the y axis.
+// c is immediately below a and overlaps it in the x axis.
+// d is immediately to the right of c and overlaps it in the y axis and
+// immediately below b and overlaps it in the x axis.
+// None of a, b, c or d are cells in existing tables.
+func (para *textPara) isAtom() *textTable {
+	a := para
+	b := para.right
+	c := para.below
+	if !(b != nil && !b.isCell && c != nil && !c.isCell) {
+		return nil
+	}
+	d := b.below
+	if !(d != nil && !d.isCell && d == c.right) {
+		return nil
+	}
+
+	if b.left != a || c.above != a || d.left != c || d.above != b {
+		return nil
+	}
+	return newTableAtom(a, b, c, d)
+}
+
+// newTableAtom returns a table containing the a, b, c, d elements from isAtom().
+func newTableAtom(a, b, c, d *textPara) *textTable {
+	t := &textTable{w: 2, h: 2, cells: map[uint64]*textPara{}}
+	t.put(0, 0, a)
+	t.put(1, 0, b)
+	t.put(0, 1, c)
+	t.put(1, 1, d)
+	return t
+}
+
+// growTable grows `t` to the largest w x h it can while remaining a valid table.
+// It repeatedly tries to extend by one row and/or column
+// - down and right, then
+// - down, then
+// - right.
+func (t *textTable) growTable() {
+	growDown := func(down paraList) {
+		t.h++
+		for x := 0; x < t.w; x++ {
+			cell := down[x]
+			t.put(x, t.h-1, cell)
+		}
+	}
+	growRight := func(right paraList) {
+		t.w++
+		for y := 0; y < t.h; y++ {
+			cell := right[y]
+			t.put(t.w-1, y, cell)
+		}
+	}
+
+	for {
+		changed := false
+		down := t.getDown()
+		right := t.getRight()
+		if down != nil && right != nil {
+			downRight := down[len(down)-1]
+			if downRight != nil && !downRight.isCell && downRight == right[len(right)-1] {
+				growDown(down)
+				growRight(right)
+				t.put(t.w-1, t.h-1, downRight)
+				changed = true
+			}
+		}
+		if !changed && down != nil {
+			growDown(down)
+			changed = true
+		}
+		if !changed && right != nil {
+			growRight(right)
+			changed = true
+		}
+		if !changed {
+			break
+		}
+	}
+}
+
+// getDown returns the row of cells below `t` if they are a valid extension to `t` or nil if they aren't.
+func (t *textTable) getDown() paraList {
+	cells := make(paraList, t.w)
+	for x := 0; x < t.w; x++ {
+		cell := t.get(x, t.h-1).below
+		if cell == nil || cell.isCell {
+			return nil
+		}
+		cells[x] = cell
+	}
+	for x := 0; x < t.w-1; x++ {
+		if cells[x].right != cells[x+1] {
+			return nil
+		}
+	}
+	return cells
+}
+
+// getRight returns the column of cells to the right of `t` if they are a valid extension to `t` or
+// nil if they aren't.
+func (t *textTable) getRight() paraList {
+	cells := make(paraList, t.h)
+	for y := 0; y < t.h; y++ {
+		cell := t.get(t.w-1, y).right
+		if cell == nil || cell.isCell {
+			return nil
+		}
+		cells[y] = cell
+	}
+	for y := 0; y < t.h-1; y++ {
+		if cells[y].below != cells[y+1] {
+			return nil
+		}
+	}
+	return cells
+}
+
+// applyTables replaces the paras that are cells in `tables` with paras containing the tables in
+// `tables`. This, of course, reduces the number of paras.
+func (paras paraList) applyTables(tables []*textTable) paraList {
+	consumed := map[*textPara]struct{}{}
+	var tabled paraList
+	for _, table := range tables {
+		for _, para := range table.cells {
+			consumed[para] = struct{}{}
+		}
+		tabled = append(tabled, table.newTablePara())
+	}
+	for _, para := range paras {
+		if _, ok := consumed[para]; !ok {
+			tabled = append(tabled, para)
+		}
+	}
+	return tabled
+}
+
+// markCells marks the paras that are cells in `t` with isCell=true so that they won't be considered
+// as cell candidates for tables in the future.
+func (t *textTable) markCells() {
+	for y := 0; y < t.h; y++ {
+		for x := 0; x < t.w; x++ {
+			para := t.get(x, y)
+			para.isCell = true
+		}
+	}
+}
+
+// newTablePara returns a textPara containing `t`.
+func (t *textTable) newTablePara() *textPara {
+	bbox := t.computeBbox()
+	return &textPara{
+		PdfRectangle: bbox,
+		eBBox:        bbox,
+		table:        t,
+	}
+}
+
+// computeBbox computes and returns the bounding box of `t`.
+func (t *textTable) computeBbox() model.PdfRectangle {
+	r := t.get(0, 0).PdfRectangle
+	for x := 1; x < t.w; x++ {
+		r = rectUnion(r, t.get(x, 0).PdfRectangle)
+	}
+	for y := 1; y < t.h; y++ {
+		for x := 0; x < t.w; x++ {
+			r = rectUnion(r, t.get(x, y).PdfRectangle)
+		}
+	}
+	return r
+}
+
+// toTextTable returns the TextTable corresponding to `t`.
+func (t *textTable) toTextTable() TextTable {
+	cells := make([][]TableCell, t.h)
+	for y := 0; y < t.h; y++ {
+		cells[y] = make([]TableCell, t.w)
+		for x := 0; x < t.w; x++ {
+			c := t.get(x, y)
+			cells[y][x].Text = c.text()
+			offset := 0
+			cells[y][x].Marks.marks = c.toTextMarks(&offset)
+		}
+	}
+	return TextTable{W: t.w, H: t.h, Cells: cells}
+}
+
+// get returns the cell at `x`, `y`.
+func (t *textTable) get(x, y int) *textPara {
+	return t.cells[cellIndex(x, y)]
+}
+
+// put sets the cell at `x`, `y` to `cell`.
+func (t *textTable) put(x, y int, cell *textPara) {
+	t.cells[cellIndex(x, y)] = cell
+}
+
+// cellIndex returns a number that will be different for different `x` and `y` for any table found
+// in a PDF which will be less than 2^32 wide and high.
+func cellIndex(x, y int) uint64 { + return uint64(x)*0x1000000 + uint64(y) +} + +func (t *textTable) log(title string) { + if !verboseTable { + return + } + common.Log.Info("~~~ %s: %d x %d\n %6.2f", title, + t.w, t.h, t.PdfRectangle) + for y := 0; y < t.h; y++ { + for x := 0; x < t.w; x++ { + p := t.get(x, y) + fmt.Printf("%4d %2d: %6.2f %q\n", x, y, p.PdfRectangle, truncate(p.text(), 50)) + } + } +} diff --git a/extractor/text_test.go b/extractor/text_test.go index 89b920f3c..445f5bc62 100644 --- a/extractor/text_test.go +++ b/extractor/text_test.go @@ -19,10 +19,10 @@ import ( "sort" "strings" "testing" + "unicode/utf8" "github.com/unidoc/unipdf/v3/common" "github.com/unidoc/unipdf/v3/creator" - "github.com/unidoc/unipdf/v3/internal/transform" "github.com/unidoc/unipdf/v3/model" "golang.org/x/text/unicode/norm" ) @@ -41,8 +41,9 @@ const ( var ( // forceTest should be set to true to force running all tests. // NOTE: Setting environment variable UNIDOC_EXTRACT_FORCETEST = 1 sets this to true. - forceTest = os.Getenv("UNIDOC_EXTRACT_FORCETEST") == "1" - corpusFolder = os.Getenv("UNIDOC_EXTRACT_TESTDATA") + forceTest = os.Getenv("UNIDOC_EXTRACT_FORCETEST") == "1" + corpusFolder = os.Getenv("UNIDOC_EXTRACT_TESTDATA") + referenceFolder = filepath.Join(corpusFolder, "reference") ) // doStress is set to true to run stress tests with the -extractor-stresstest command line option. 
@@ -67,7 +68,7 @@ func TestTextExtractionFragments(t *testing.T) { BT /UniDocCourier 24 Tf (Hello World!)Tj - 0 -10 Td + 0 -25 Td (Doink)Tj ET `, @@ -76,27 +77,27 @@ func TestTextExtractionFragments(t *testing.T) { { name: "landscape", contents: ` - BT - /UniDocCourier 24 Tf - 0 1 -1 0 0 0 Tm - (Hello World!)Tj - 0 -10 Td - (Doink)Tj - ET - `, + BT + /UniDocCourier 24 Tf + 0 1 -1 0 0 0 Tm + (Hello World!)Tj + 0 -25 Td + (Doink)Tj + ET + `, text: "Hello World!\nDoink", }, { name: "180 degree rotation", contents: ` - BT - /UniDocCourier 24 Tf - -1 0 0 -1 0 0 Tm - (Hello World!)Tj - 0 -10 Td - (Doink)Tj - ET - `, + BT + /UniDocCourier 24 Tf + -1 0 0 -1 0 0 Tm + (Hello World!)Tj + 0 -25 Td + (Doink)Tj + ET + `, text: "Hello World!\nDoink", }, { @@ -104,9 +105,9 @@ func TestTextExtractionFragments(t *testing.T) { contents: ` BT /UniDocHelvetica 24 Tf - 0 -1 1 0 0 0 Tm + (Hello World!)Tj - 0 -10 Td + 0 -25 Td (Doink)Tj ET `, @@ -125,12 +126,13 @@ func TestTextExtractionFragments(t *testing.T) { for _, f := range fragmentTests { t.Run(f.name, func(t *testing.T) { - e := Extractor{resources: resources, contents: f.contents} + e := Extractor{resources: resources, contents: f.contents, mediaBox: r(-200, -200, 600, 800)} text, err := e.ExtractText() if err != nil { t.Fatalf("Error extracting text: %q err=%v", f.name, err) return } + text = strings.TrimRight(text, "\n") if text != f.text { t.Fatalf("Text mismatch: %q Got %q. Expected %q", f.name, text, f.text) return @@ -148,6 +150,7 @@ func TestTextExtractionFiles(t *testing.T) { return } for _, test := range fileExtractionTests { + // TODO(peterwilliams97): Remove non-lazy test. testExtractFileOptions(t, test.filename, test.pageTerms, false) testExtractFileOptions(t, test.filename, test.pageTerms, true) } @@ -171,7 +174,7 @@ func TestTermMarksFiles(t *testing.T) { if !doStress { t.Skip("skipping stress test") } - common.Log.Info("Running text stress tests. 
go test --short to skip these.") + common.Log.Info("Running text stress tests.") if len(corpusFolder) == 0 && !forceTest { t.Log("Corpus folder not set - skipping") return @@ -179,50 +182,15 @@ func TestTermMarksFiles(t *testing.T) { testTermMarksFiles(t) } -// TestTextSort checks that PageText.sortPosition() gives expected results -func TestTextSort(t *testing.T) { - // marks0 is in the expected sort order for tol=15 - marks0 := []textMark{ - // y difference > tol => sorts by Y descending - textMark{orientedStart: transform.Point{X: 300, Y: 160}, text: "00"}, - textMark{orientedStart: transform.Point{X: 200, Y: 140}, text: "01"}, - textMark{orientedStart: transform.Point{X: 100, Y: 120}, text: "02"}, - - // y difference < tol => sort by X ascending for approx same Y - textMark{orientedStart: transform.Point{X: 100, Y: 30}, text: "10"}, - textMark{orientedStart: transform.Point{X: 200, Y: 40}, text: "11"}, - textMark{orientedStart: transform.Point{X: 300, Y: 50}, text: "12"}, - - // y difference < tol => sorts by X descending for approx same Y, different from previous Y - textMark{orientedStart: transform.Point{X: 100, Y: 3}, text: "20"}, - textMark{orientedStart: transform.Point{X: 200, Y: 4}, text: "21"}, - textMark{orientedStart: transform.Point{X: 300, Y: 5}, text: "22"}, - } - - // marks is a copy of marks0 with its order scrambled. - marks := make([]textMark, len(marks0)) - copy(marks, marks0) - sort.Slice(marks, func(i, j int) bool { - ti, tj := marks[i], marks[j] - if ti.orientedStart.X != tj.orientedStart.X { - return ti.orientedStart.X > tj.orientedStart.X - } - if ti.orient != tj.orient { - return ti.orient > tj.orient - } - return ti.orientedStart.Y < tj.orientedStart.Y - }) - - // Copy marks to PageText and sort them. This should give the same order as marks0. - pt := PageText{marks: marks} - pt.sortPosition(15) - - // Check that marks order is the same as marks0. 
- for i, m0 := range marks0 { - m := pt.marks[i] - if m.orientedStart.X != m0.orientedStart.X || m.orientedStart.Y != m0.orientedStart.Y { - t.Fatalf("i=%d m=%v != m0=%v", i, m, m0) - } +// TestTextExtractionReference compares the text extracted from pages of PDF files to reference text +// files. +func TestTextExtractionReference(t *testing.T) { + if len(corpusFolder) == 0 && !forceTest { + t.Log("Corpus folder not set - skipping") + return + } + for _, er := range extractReferenceTests { + er.runTest(t) } } @@ -236,7 +204,7 @@ var fileExtractionTests = []struct { }{ {filename: "reader.pdf", pageTerms: map[int][]string{ - 1: []string{"A Research UNIX Reader:", + 1: {"A Research UNIX Reader:", "Annotated Excerpts from the Programmer’s Manual,", "1. Introduction", "To keep the size of this report", @@ -246,93 +214,87 @@ var fileExtractionTests = []struct { }, {filename: "000026.pdf", pageTerms: map[int][]string{ - 1: []string{"Fresh Flower", - "Care & Handling
", + 1: {"Fresh Flower", + "Care & Handling", }, }, }, {filename: "search_sim_key.pdf", pageTerms: map[int][]string{ - 2: []string{"A cryptographic scheme which enables searching", + 2: {"A cryptographic scheme which enables searching", "Untrusted server should not be able to search for a word without authorization", }, }, }, - {filename: "Theil_inequality.pdf", + {filename: "Theil_inequality.pdf", // 270° rotated file. pageTerms: map[int][]string{ - 1: []string{"London School of Economics and Political Science"}, - 4: []string{"The purpose of this paper is to set Theil’s approach"}, + 1: {"London School of Economics and Political Science"}, + 4: {"The purpose of this paper is to set Theil’s approach"}, }, }, {filename: "8207.pdf", pageTerms: map[int][]string{ - 1: []string{"In building graphic systems for use with raster devices,"}, - 2: []string{"The imaging model specifies how geometric shapes and colors are"}, - 3: []string{"The transformation matrix T that maps application defined"}, + 1: {"In building graphic systems for use with raster devices,"}, + 2: {"The imaging model specifies how geometric shapes and colors are"}, + 3: {"The transformation matrix T that maps application defined"}, }, }, {filename: "ling-2013-0040ad.pdf", pageTerms: map[int][]string{ - 1: []string{"Although the linguistic variation among texts is continuous"}, - 2: []string{"distinctions. For example, much of the research on spoken/written"}, + 1: {"Although the linguistic variation among texts is continuous"}, + 2: {"distinctions. 
For example, much of the research on spoken/written"}, }, }, {filename: "26-Hazard-Thermal-environment.pdf", pageTerms: map[int][]string{ - 1: []string{"OHS Body of Knowledge"}, - 2: []string{"Copyright notice and licence terms"}, + 1: {"OHS Body of Knowledge"}, + 2: {"Copyright notice and licence terms"}, }, }, {filename: "Threshold_survey.pdf", pageTerms: map[int][]string{ - 1: []string{"clustering, entropy, object attributes, spatial correlation, and local"}, + 1: {"clustering, entropy, object attributes, spatial correlation, and local"}, }, }, {filename: "circ2.pdf", pageTerms: map[int][]string{ - 1: []string{"Understanding and complying with copyright law can be a challenge"}, + 1: {"Understanding and complying with copyright law can be a challenge"}, }, }, {filename: "rare_word.pdf", pageTerms: map[int][]string{ - 6: []string{"words in the test set, we increase the BLEU score"}, + 6: {"words in the test set, we increase the BLEU score"}, }, }, {filename: "Planck_Wien.pdf", pageTerms: map[int][]string{ - 1: []string{"entropy of a system of n identical resonators in a stationary radiation field"}, + 1: {"entropy of a system of n identical resonators in a stationary radiation field"}, }, }, - // Case where combineDiacritics was combining ' and " with preceeding letters. - // NOTE(peterwilliams97): Part of the reason this test fails is that we don't currently read - // Type0:CIDFontType0 font metrics and assume zero displacemet so that we place the ' and " too - // close to the preceeding letters. {filename: "/rfc6962.txt.pdf", pageTerms: map[int][]string{ - 4: []string{ - "timestamps for certificates they then don’t log", + 4: {"timestamps for certificates they then don’t log", `The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD",`}, }, }, {filename: "Saudi.pdf", pageTerms: map[int][]string{ - 10: []string{"الله"}, + 10: {"الله"}, + }, + }, + {filename: "Ito_Formula.pdf", // 90° rotated with diacritics in different textMarks to base. 
+ pageTerms: map[int][]string{ + 1: {"In the Itô stochastic calculus", + "In standard, non-stochastic calculus, one computes a derivative"}, + 2: {"Financial Economics Itô’s Formula"}, + }, + }, + {filename: "thanh.pdf", // Diacritics in different textMarks to base. + pageTerms: map[int][]string{ + 1: {"Hàn Thế Thành"}, + 6: {"Petr Olšák"}, }, }, - // TODO(peterwilliams97): Reinstate these 2 tests when diacritic combination is fixed. - // {filename: "Ito_Formula.pdf", - // pageTerms: map[int][]string{ - // 1: []string{ - // "In the Itô stochastic calculus", - // "In standard, non-stochastic calculus, one computes a derivative"}, - // 2: []string{"Financial Economics Itô’s Formula"}, - // }, - // }, - // {filename: "thanh.pdf", - // pageTerms: map[int][]string{ - // 1: []string{"Hàn Thé̂ Thành"}, - // }, - // }, } // testExtractFile tests the ExtractTextWithStats text extractor on `filename` and compares the @@ -344,7 +306,7 @@ func testExtractFileOptions(t *testing.T, filename string, pageTerms map[int][]s if forceTest { t.Fatalf("filepath=%q does not exist", filepath) } - t.Logf("%s not found", filepath) + t.Logf("%q not found", filepath) return } @@ -381,7 +343,6 @@ func extractPageTexts(t *testing.T, filename string, lazy bool) (int, map[int]st } pageText := map[int]string{} for pageNum := 1; pageNum <= numPages; pageNum++ { - page, err := pdfReader.GetPage(pageNum) if err != nil { t.Fatalf("GetPage failed. filename=%q page=%d err=%v", filename, pageNum, err) @@ -395,7 +356,6 @@ func extractPageTexts(t *testing.T, filename string, lazy bool) (int, map[int]st if err != nil { t.Fatalf("ExtractTextWithStats failed. filename=%q page=%d err=%v", filename, pageNum, err) } - // TODO(peterwilliams97): Improve text extraction space insertion so we don't need reduceSpaces. pageText[pageNum] = reduceSpaces(text) } return numPages, pageText @@ -443,11 +403,11 @@ func (c pageContents) matchTerms() []string { // textLocTests are the extracted text location tests. 
All coordinates are multiples of 0.5 points. var textLocTests = []textLocTest{ - textLocTest{ + { filename: "prop-price-list-2017.pdf", numPages: 1, contents: map[int]pageContents{ - 1: pageContents{ + 1: { terms: []string{ "PRICE LIST", "THING ONE", "$99", @@ -461,7 +421,6 @@ var textLocTests = []textLocTest{ l(2, "I", 231.9, 725.2, 245.2, 773.2), l(3, "C", 245.2, 725.2, 279.9, 773.2), l(4, "E", 279.9, 725.2, 312.0, 773.2), - l(5, " ", 312.0, 725.2, 325.3, 773.2), l(6, "L", 325.3, 725.2, 354.6, 773.2), l(7, "I", 354.6, 725.2, 368.0, 773.2), l(8, "S", 368.0, 725.2, 400.0, 773.2), @@ -473,11 +432,11 @@ var textLocTests = []textLocTest{ }, }, }, - textLocTest{ + { filename: "pol_e.pdf", numPages: 2, contents: map[int]pageContents{ - 1: pageContents{ + 1: { marks: []TextMark{ l(3914, "W", 177.0, 136.5, 188.0, 148.0), l(3915, "T", 187.5, 136.5, 194.5, 148.0), @@ -490,24 +449,25 @@ var textLocTests = []textLocTest{ }, }, }, - textLocTest{ + { filename: "thanh.pdf", numPages: 6, contents: map[int]pageContents{ - 1: pageContents{ + 1: { terms: []string{ "result is a set of Type 1 fonts that is similar to the Blue Sky fonts", "provide Vietnamese letters with the same quality of outlines and hints", "Vietnamese letters and VNR fonts", - "Vietnamese accents can be divided into three the Czech and Polish version of CMR fonts", - "kinds of diacritic marks: tone, vowel and consonant. 
about 2 years until the first version", + "Vietnamese accents can be divided into", + "kinds of diacritic marks: tone, vowel and consonant.", + "about 2 years until the first version was released", }, termBBox: map[string]model.PdfRectangle{ "the Blue Sky fonts": r(358.0, 532.5, 439.0, 542.5), "Vietnamese letters with the same quality": r(165.5, 520.5, 344.5, 530.5), }, }, - 2: pageContents{ + 2: { terms: []string{ "number of glyphs needed for each font is 47", "which 22 are Vietnamese accents and letters.", @@ -529,13 +489,13 @@ var textLocTests = []textLocTest{ }, }, }, - textLocTest{ + { filename: "unicodeexample.pdf", numPages: 6, contents: map[int]pageContents{ - 2: pageContents{ + 2: { terms: []string{ - "Österreich", "Johann Strauß", + "Österreich", "Johann Strauss", "Azərbaycan", "Vaqif Səmədoğlu", "Азәрбајҹан", "Вагиф Сәмәдоғлу", }, @@ -559,21 +519,21 @@ var textLocTests = []textLocTest{ }, }, }, - textLocTest{ + { filename: "AF+handout+scanned.pdf", numPages: 3, contents: map[int]pageContents{ - 1: pageContents{ + 1: { termBBox: map[string]model.PdfRectangle{ "reserved": r(505.0, 488.5, 538.5, 497.0), }, }, - 2: pageContents{ + 2: { termBBox: map[string]model.PdfRectangle{ "atrium": r(452.78, 407.76, 503.78, 416.26), }, }, - 3: pageContents{ + 3: { termBBox: map[string]model.PdfRectangle{ "treatment": r(348.0, 302.0, 388.0, 311.5), }, @@ -589,6 +549,7 @@ func (e textLocTest) testDocTextAndMarks(t *testing.T, lazy bool) { common.Log.Debug("textLocTest.testDocTextAndMarks: %s", desc) filename := filepath.Join(corpusFolder, e.filename) + common.Log.Debug("testDocTextAndMarks: %q", filename) f, err := os.Open(filename) if err != nil { t.Fatalf("Couldn't open filename=%q err=%v", filename, err) @@ -627,6 +588,8 @@ func (c pageContents) testPageTextAndMarks(t *testing.T, l *markupList, desc str page *model.PdfPage) { text, textMarks := pageTextAndMarks(t, desc, page) + common.Log.Debug("testPageTextAndMarks ===================") + 
common.Log.Debug("text====================\n%s\n======================", text) // 1) Check that all expected terms are found in `text`. for i, term := range c.terms { common.Log.Debug("%d: %q", i, term) @@ -635,12 +598,7 @@ func (c pageContents) testPageTextAndMarks(t *testing.T, l *markupList, desc str } } - // 2) Check that all expected TextMarks are in `textMarks`. - offsetMark := marksMap(textMarks) - for i, tm := range c.marks { - common.Log.Debug("%d: %v", i, tm) - checkContains(t, desc, offsetMark, tm) - } + // 2) is missing for historical reasons. // 3) Check that locationsIndex() finds TextMarks in `textMarks` corresponding to some // substrings of `text`. @@ -685,10 +643,8 @@ func testTermMarksFiles(t *testing.T) { t.Fatalf("Glob(%q) failed. err=%v", pattern, err) } for i, filename := range pathList { - for _, lazy := range []bool{false, true} { - common.Log.Info("%4d of %d: %q lazy=%t", i+1, len(pathList), filename, lazy) - tryTestTermMarksFile(t, filename, lazy) - } + common.Log.Info("%4d of %d: %q", i+1, len(pathList), filename) + tryTestTermMarksFile(t, filename, true) } } @@ -726,10 +682,85 @@ func tryTestTermMarksFile(t *testing.T, filename string, lazy bool) { } } +// extractReferenceTests compare text extracted from a page of a PDF file to a reference text file. +var extractReferenceTests = []extractReference{ + {"ChapterK.pdf", 1}, + {"Garnaut.pdf", 1}, + {"rise.pdf", 2}, + {"pioneer.pdf", 1}, + {"women.pdf", 20}, + {"status.pdf", 2}, + {"recognition.pdf", 1}, + {"eu.pdf", 5}, + {"we-dms.pdf", 1}, + {"Productivity.pdf", 1}, + {"Nuance.pdf", 1}, +} + +// extractReference describes a PDF file and page number. +type extractReference struct { + filename string + pageNum int +} + +// runTest runs the test described by `er`. It checks that the text extracted from the page of the +// PDF matches the reference text file. 
+func (er extractReference) runTest(t *testing.T) { + compareExtractedTextToReference(t, er.pdfPath(), er.pageNum, er.textPath()) +} + +// pdfPath returns the path of the PDF file for test `er`. +func (er extractReference) pdfPath() string { + return filepath.Join(corpusFolder, er.filename) +} + +// textPath returns the path of the text reference file for test `er`. +func (er extractReference) textPath() string { + pageStr := fmt.Sprintf("page%03d", er.pageNum) + return changeDirExt(referenceFolder, er.filename, pageStr, ".txt") +} + +// compareExtractedTextToReference extracts text from (1-offset) page `pageNum` of PDF `filename` +// and checks that it matches the text in reference file `textPath`. +func compareExtractedTextToReference(t *testing.T, filename string, pageNum int, textPath string) { + f, err := os.Open(filename) + if err != nil { + common.Log.Info("Couldn't open. skipping. filename=%q err=%v", filename, err) + return + } + defer f.Close() + pdfReader, err := openPdfReader(f, true) + if err != nil { + common.Log.Info("openPdfReader failed. skipping. filename=%q err=%v", filename, err) + return + } + expectedText, err := readTextFile(textPath) + if err != nil { + common.Log.Info("readTextFile failed. skipping. textPath=%q err=%v", textPath, err) + return + } + + desc := fmt.Sprintf("filename=%q pageNum=%d", filename, pageNum) + page, err := pdfReader.GetPage(pageNum) + if err != nil { + common.Log.Info("GetPage failed. skipping. 
%s err=%v", desc, err) + return + } + actualText, _ := pageTextAndMarks(t, desc, page) + + actualText = reduceSpaces(norm.NFKC.String(actualText)) + expectedText = reduceSpaces(norm.NFKC.String(expectedText)) + if actualText != expectedText { + common.Log.Info("actual =====================\n%s\n=====================", actualText) + common.Log.Info("expected =====================\n%s\n=====================", expectedText) + t.Fatalf("Text mismatch filename=%q page=%d", filename, pageNum) + } +} + // testTermMarksMulti checks that textMarks.RangeOffset() finds the TextMarks in `textMarks` // corresponding to some substrings of `text` with lengths 1-20. func testTermMarksMulti(t *testing.T, text string, textMarks *TextMarkArray) { - m := len([]rune(text)) + m := utf8.RuneCountInString(text) if m > 20 { m = 20 } @@ -750,16 +781,34 @@ func testTermMarks(t *testing.T, text string, textMarks *TextMarkArray, n int) { if n > len(runes)/2 { n = len(runes) / 2 } - runeString := runeStringIndex(text) - for ofsRune := 0; ofsRune < len(runes)-n; ofsRune++ { - term := string(runes[ofsRune : ofsRune+n]) - ofs0 := runeString[ofsRune] - ofs1 := runeString[ofsRune+n] + delta := 5 + for ofs := 0; ofs < len(runes)-2*n; ofs++ { + term := string(runes[ofs : ofs+n]) + ofs0 := len(string(runes[:ofs])) + ofs1 := len(string(runes[:ofs+n])) + ofs0d := ofs0 - delta + ofs1d := ofs1 + delta + if ofs0d < 0 { + ofs0d = 0 + } + if ofs1d > len(text) { + ofs1d = len(text) + } + show := fmt.Sprintf("<%s|%s|%s>", text[ofs0d:ofs0], text[ofs0:ofs1], text[ofs1:ofs1d]) + { + show = fmt.Sprintf("%q", show) + runes := []rune(show) + show = string(runes[1 : len(runes)-1]) + } - // Get TextMarks spanned `term` with RangeOffset(). + // Get TextMarks spanning `term` with RangeOffset(). 
spanArray, err := textMarks.RangeOffset(ofs0, ofs1) if err != nil { + if n <= 2 { + // Could be ligatures + continue + } t.Fatalf("textMarks.RangeOffset failed term=%q=text[%d:%d]=%02x err=%v", term, ofs0, ofs1, text[ofs0:ofs1], err) } @@ -772,29 +821,46 @@ func testTermMarks(t *testing.T, text string, textMarks *TextMarkArray, n int) { mark0 := spanMarks[0] mark1 := spanMarks[spanArray.Len()-1] - if !strings.HasPrefix(term, mark0.Text) { - t.Fatalf("mark0 is not a prefix for term=%q=text[%d:%d]=%02x mark0=%v", - term, ofs0, ofs1, text[ofs0:ofs1], mark0) + if len(mark0.Text) <= len(term) { + if !startWith(term, mark0.Text) { + for i, tm := range spanMarks { + fmt.Printf("%4d: %s\n", i, tm) + } + t.Fatalf("mark0 is not a prefix for term=%s=text[%d:%d]=%02x mark0=%v", + show, ofs0, ofs1, text[ofs0:ofs1], mark0) + } } - if !strings.HasSuffix(term, mark1.Text) { - t.Fatalf("mark1 is not a suffix for term=%q=text[%d:%d]=%v mark1=%v", - term, ofs0, ofs1, text[ofs0:ofs1], mark1) + if len(mark1.Text) <= len(term) { + if !endsWith(term, mark1.Text) { + for i, tm := range spanMarks { + fmt.Printf("%4d: %s\n", i, tm) + } + t.Fatalf("mark1 is not a suffix for term=%s=text[%d:%d]=%v mark1=%v", + show, ofs0, ofs1, text[ofs0:ofs1], mark1) + } } } } -// runeStringIndex returns a map of indexes of `[]rune(text)`` to the corresponding indexes in `text`. -func runeStringIndex(text string) map[int]int { - runeString := map[int]int{} - runeIdx := 0 - for strIdx, _ := range text { - runeString[runeIdx] = strIdx - runeIdx++ +// startWith returns true if the start of `str` overlaps the end of `sub`. +func startWith(str, sub string) bool { + for n := 0; n < len(sub); n++ { + if strings.HasPrefix(str, sub[n:]) { + return true + } + // common.Log.Error("!startsWith: str=%q sub=%q sub[%d:]=%q", str, sub, n, sub[n:]) } - if len(runeString) != len([]rune(text)) { - panic("d") + return false +} + +// endsWith returns true if the end of `str` overlaps the start of `sub`. 
+func endsWith(str, sub string) bool { + for n := len(sub); n >= 1; n-- { + if strings.HasSuffix(str, sub[:n]) { + return true + } } - return runeString + return false } // checkContains checks that `offsetMark` contains `expectedMark`. @@ -882,7 +948,7 @@ func pageTextAndMarks(t *testing.T, desc string, page *model.PdfPage) (string, * text := pageText.Text() textMarks := pageText.Marks() - { // Some extra debugging to see how the code works. Not needed by test. + if false { // Some extra debugging to see how the code works. Not needed by test. common.Log.Debug("text=>>>%s<<<\n", text) common.Log.Debug("textMarks=%s %q", textMarks, desc) for i, tm := range textMarks.Elements() { @@ -916,7 +982,7 @@ func containsTerms(t *testing.T, terms []string, actualText string) bool { for _, w := range terms { w = norm.NFKC.String(w) if !strings.Contains(actualText, w) { - t.Errorf("No match for %q", w) + t.Fatalf("No match for %q", w) return false } } @@ -940,7 +1006,7 @@ func checkFileExists(filepath string) bool { // sortedKeys returns the keys of `m` as a sorted slice. func sortedKeys(m map[int][]string) []int { - keys := []int{} + keys := make([]int, 0, len(m)) for k := range m { keys = append(keys, k) } @@ -1081,3 +1147,32 @@ func (l *markupList) saveOutputPdf() { l.t.Fatalf("WriteFile failed. 
metaPath=%q err=%v", metaPath, err) } } + +// changeDirExt inserts `qualifier` into `filename` before its extension then changes its +// directory to `dirName` and extrension to `extName`, +func changeDirExt(dirName, filename, qualifier, extName string) string { + if dirName == "" { + return "" + } + base := filepath.Base(filename) + ext := filepath.Ext(base) + base = base[:len(base)-len(ext)] + if len(qualifier) > 0 { + base = fmt.Sprintf("%s.%s", base, qualifier) + } + filename = fmt.Sprintf("%s%s", base, extName) + path := filepath.Join(dirName, filename) + common.Log.Debug("changeDirExt(%q,%q,%q)->%q", dirName, base, extName, path) + return path +} + +// readTextFile return the contents of `filename` as a string. +func readTextFile(filename string) (string, error) { + f, err := os.Open(filename) + if err != nil { + return "", err + } + defer f.Close() + b, err := ioutil.ReadAll(f) + return string(b), err +} diff --git a/extractor/text_utils.go b/extractor/text_utils.go new file mode 100644 index 000000000..9e095f656 --- /dev/null +++ b/extractor/text_utils.go @@ -0,0 +1,275 @@ +/* + * This file is subject to the terms and conditions defined in + * file 'LICENSE.md', which is part of this source code package. + */ + +package extractor + +import ( + "math" + "sort" + "unicode" +) + +// TOL is the tolerance for coordinates to be consideted equal. It is big enough to cover all +// rounding errors and small enough that TOL point differences on a page aren't visible. +const TOL = 1.0e-6 + +// isZero returns true if x is with TOL of 0.0 +func isZero(x float64) bool { + return math.Abs(x) < TOL +} + +// minInt return the lesser of `a` and `b`. +func minInt(a, b int) int { + if a < b { + return a + } + return b +} + +// maxInt return the greater of `a` and `b`. +func maxInt(a, b int) int { + if a > b { + return a + } + return b +} + +// addNeighbours fills out the below and right fields of the paras in `paras`. 
+// For each para `a`: +// a.below is the unique highest para completely below `a` that overlaps it in the x-direction +// a.right is the unique leftmost para completely to the right of `a` that overlaps it in the y-direction +func (paras paraList) addNeighbours() { + paraNeighbours := paras.yNeighbours() + for _, para := range paras { + var left *textPara + dup := false + for _, k := range paraNeighbours[para] { + b := paras[k] + if b.Urx <= para.Llx { + if left == nil { + left = b + } else { + if b.Llx > left.Llx { + left = b + dup = false + } else if b.Llx == left.Llx { + dup = true + } + } + } + } + if !dup { + para.left = left + } + } + for _, para := range paras { + var right *textPara + dup := false + for _, k := range paraNeighbours[para] { + b := paras[k] + if b.Llx >= para.Urx { + if right == nil { + right = b + } else { + if b.Llx < right.Llx { + right = b + dup = false + } else if b.Llx == right.Llx { + dup = true + } + } + } + } + if !dup { + para.right = right + } + } + + paraNeighbours = paras.xNeighbours() + for _, para := range paras { + var above *textPara + dup := false + for _, i := range paraNeighbours[para] { + b := paras[i] + if b.Lly >= para.Ury { + if above == nil { + above = b + } else { + if b.Ury < above.Ury { + above = b + dup = false + } else if b.Ury == above.Ury { + dup = true + } + } + } + } + if !dup { + para.above = above + } + } + for _, para := range paras { + var below *textPara + dup := false + for _, i := range paraNeighbours[para] { + b := paras[i] + if b.Ury <= para.Lly { + if below == nil { + below = b + } else { + if b.Ury > below.Ury { + below = b + dup = false + } else if b.Ury == below.Ury { + dup = true + } + } + } + } + if !dup { + para.below = below + } + } +} + +// xNeighbours returns a map {para: indexes of paras that x-overlap para}. 
+func (paras paraList) xNeighbours() map[*textPara][]int { + events := make([]event, 2*len(paras)) + for i, para := range paras { + events[2*i] = event{para.Llx, true, i} + events[2*i+1] = event{para.Urx, false, i} + } + return paras.eventNeighbours(events) +} + +// yNeighbours returns a map {para: indexes of paras that y-overlap para}. +func (paras paraList) yNeighbours() map[*textPara][]int { + events := make([]event, 2*len(paras)) + for i, para := range paras { + events[2*i] = event{para.Lly, true, i} + events[2*i+1] = event{para.Ury, false, i} + } + return paras.eventNeighbours(events) +} + +// event is an entry or exit from an interval while scanning. +type event struct { + z float64 // Coordinate in the scanning direction. + enter bool // True if entering the interval, false it leaving. + i int // Index of the interval +} + +// eventNeighbours returns a map {para: indexes of paras that overlap para in `events`}. +func (paras paraList) eventNeighbours(events []event) map[*textPara][]int { + sort.Slice(events, func(i, j int) bool { + ei, ej := events[i], events[j] + zi, zj := ei.z, ej.z + if zi != zj { + return zi < zj + } + if ei.enter != ej.enter { + return ei.enter + } + return i < j + }) + + overlaps := map[int]map[int]struct{}{} + olap := map[int]struct{}{} + for _, e := range events { + if e.enter { + overlaps[e.i] = map[int]struct{}{} + for i := range olap { + if i != e.i { + overlaps[e.i][i] = struct{}{} + overlaps[i][e.i] = struct{}{} + } + } + olap[e.i] = struct{}{} + } else { + delete(olap, e.i) + } + } + + paraNeighbors := map[*textPara][]int{} + for i, olap := range overlaps { + para := paras[i] + neighbours := make([]int, len(olap)) + k := 0 + for j := range olap { + neighbours[k] = j + k++ + } + paraNeighbors[para] = neighbours + } + return paraNeighbors +} + +// isTextSpace returns true if `text` contains nothing but space code points. 
+func isTextSpace(text string) bool { + for _, r := range text { + if !unicode.IsSpace(r) { + return false + } + } + return true +} + +// combiningDiacritic returns the combining version of `text` if text contains a single uncombined +// diacritic rune. +func combiningDiacritic(text string) (string, bool) { + runes := []rune(text) + if len(runes) != 1 { + return "", false + } + combining, isDiacritic := diacriticsToCombining[runes[0]] + return combining, isDiacritic +} + +var ( + // diacriticsToCombining is a map of diacritic runes to their combining diacritic equivalents. + // These values were copied from (https://svn.apache.org/repos/asf/pdfbox/trunk/pdfbox/src/main/java/org/apache/pdfbox/text/TextPosition.java) + diacriticsToCombining = map[rune]string{ + 0x0060: "\u0300", // ` -> ò + 0x02CB: "\u0300", // ˋ -> ò + 0x0027: "\u0301", // ' -> ó + 0x00B4: "\u0301", // ´ -> ó + 0x02B9: "\u0301", // ʹ -> ó + 0x02CA: "\u0301", // ˊ -> ó + 0x005E: "\u0302", // ^ -> ô + 0x02C6: "\u0302", // ˆ -> ô + 0x007E: "\u0303", // ~ -> õ + 0x02DC: "\u0303", // ˜ -> õ + 0x00AF: "\u0304", // ¯ -> ō + 0x02C9: "\u0304", // ˉ -> ō + 0x02D8: "\u0306", // ˘ -> ŏ + 0x02D9: "\u0307", // ˙ -> ȯ + 0x00A8: "\u0308", // ¨ -> ö + 0x00B0: "\u030A", // ° -> o̊ + 0x02DA: "\u030A", // ˚ -> o̊ + 0x02BA: "\u030B", // ʺ -> ő + 0x02DD: "\u030B", // ˝ -> ő + 0x02C7: "\u030C", // ˇ -> ǒ + 0x02C8: "\u030D", // ˈ -> o̍ + 0x0022: "\u030E", // " -> o̎ + 0x02BB: "\u0312", // ʻ -> o̒ + 0x02BC: "\u0313", // ʼ -> o̓ + 0x0486: "\u0313", // ҆ -> o̓ + 0x055A: "\u0313", // ՚ -> o̓ + 0x02BD: "\u0314", // ʽ -> o̔ + 0x0485: "\u0314", // ҅ -> o̔ + 0x0559: "\u0314", // ՙ -> o̔ + 0x02D4: "\u031D", // ˔ -> o̝ + 0x02D5: "\u031E", // ˕ -> o̞ + 0x02D6: "\u031F", // ˖ -> o̟ + 0x02D7: "\u0320", // ˗ -> o̠ + 0x02B2: "\u0321", // ʲ -> o̡ + 0x00B8: "\u0327", // ¸ -> o̧ + 0x02CC: "\u0329", // ˌ -> o̩ + 0x02B7: "\u032B", // ʷ -> o̫ + 0x02CD: "\u0331", // ˍ -> o̱ + 0x005F: "\u0332", // _ -> o̲ + 0x204E: "\u0359", // ⁎ -> o͙ + } +) 
diff --git a/extractor/text_word.go b/extractor/text_word.go new file mode 100644 index 000000000..eefa1f21b --- /dev/null +++ b/extractor/text_word.go @@ -0,0 +1,205 @@ +/* + * This file is subject to the terms and conditions defined in + * file 'LICENSE.md', which is part of this source code package. + */ + +package extractor + +import ( + "fmt" + "math" + "strings" + + "github.com/unidoc/unipdf/v3/common" + "github.com/unidoc/unipdf/v3/model" + "golang.org/x/text/unicode/norm" +) + +// textWord represents a word fragment. +// makeTextWords() shows how textWords are created. +// We don't see whole words until textWords are eventually sorted into textLines in +// wordBag.arrangeText(). textLines are slices of textWord that define whole words by the +// newWord marker on those fragments that start whole words. +// - A textLine is the textWords at similar depths sorted in reading order. +// - All textWords, w, in the textLine that start whole words have w.newWord = true +type textWord struct { + model.PdfRectangle // Bounding box (union of `marks` bounding boxes). + depth float64 // Distance from bottom of this word to the top of the page. + text string // The word fragment text. + marks []*textMark // Marks in this word. + fontsize float64 // Largest fontsize in the word. + newWord bool // Is this word fragment the start of a new word? +} + +// makeTextPage combines `marks`, the textMarks on a page, into word fragments. +// `pageSize` is used to calculate the words` depths depth on the page. +// Algorithm: +// 1. `marks` are in the order they were rendered in the PDF. +// 2. Successive marks are combined into a word fragment unless +// One mark is a space character. +// They are separated by more than maxWordAdvanceR*fontsize in the reading direction +// They are not within the location allowed by horizontal and vertical variations allowed by +// reasonable kerning and leading. 
+// TODO(peterwilliams97): Check for overlapping textWords for cases such as diacritics, bolding by +// repeating and others. +func makeTextWords(marks []*textMark, pageSize model.PdfRectangle) []*textWord { + var words []*textWord // The words. + var newWord *textWord // The word being built. + + // addNewWord adds `newWord` to `words` and resets `newWord` to nil. + addNewWord := func() { + if newWord != nil { + text := newWord.computeText() + if !isTextSpace(text) { + newWord.text = text + words = append(words, newWord) + } + newWord = nil + } + } + + for _, tm := range marks { + if doCombineDiacritics && newWord != nil && len(newWord.marks) > 0 { + // Combine diacritic marks into neighbourimg non-diacritics marks. + prev := newWord.marks[len(newWord.marks)-1] + text, isDiacritic := combiningDiacritic(tm.text) + prevText, prevDiacritic := combiningDiacritic(prev.text) + if isDiacritic && !prevDiacritic && prev.inDiacriticArea(tm) { + newWord.addDiacritic(text) + continue + } + if prevDiacritic && !isDiacritic && tm.inDiacriticArea(prev) { + // If the previous mark was the diacritic, merge it into this mark and re-append it + newWord.marks = newWord.marks[:len(newWord.marks)-1] + newWord.appendMark(tm, pageSize) + newWord.addDiacritic(prevText) + continue + } + } + + // Check for spaces between words. + isSpace := isTextSpace(tm.text) + if isSpace { + addNewWord() + continue + } + + if newWord == nil && !isSpace { + newWord = newTextWord([]*textMark{tm}, pageSize) + continue + } + + fontsize := newWord.fontsize + depthGap := math.Abs(getDepth(pageSize, tm)-newWord.depth) / fontsize + readingGap := gapReading(tm, newWord) / fontsize + + // These are the conditions for `tm` to be from a new word. + // - Gap between words in reading position is larger than a space. + // - Change in reading position is too negative to be just a kerning adjustment. + // - Change in depth is too large to be just a leading adjustment. 
+ if readingGap >= maxWordAdvanceR || !(-maxKerningR <= readingGap && depthGap <= maxLeadingR) { + addNewWord() + newWord = newTextWord([]*textMark{tm}, pageSize) + continue + } + newWord.appendMark(tm, pageSize) + } + addNewWord() + + return words +} + +// newTextWord creates a textWords containing `marks`. +// `pageSize` is used to calculate the word's depth on the page. +func newTextWord(marks []*textMark, pageSize model.PdfRectangle) *textWord { + r := marks[0].PdfRectangle + fontsize := marks[0].fontsize + for _, tm := range marks[1:] { + r = rectUnion(r, tm.PdfRectangle) + if tm.fontsize > fontsize { + fontsize = tm.fontsize + } + } + + return &textWord{ + PdfRectangle: r, + marks: marks, + depth: pageSize.Ury - r.Lly, + fontsize: fontsize, + } +} + +// String returns a description of `w`. +func (w *textWord) String() string { + return fmt.Sprintf("%.2f %6.2f fontsize=%.2f \"%s\"", + w.depth, w.PdfRectangle, w.fontsize, w.text) +} + +// bbox makes textWord implement the `bounded` interface. +func (w *textWord) bbox() model.PdfRectangle { + return w.PdfRectangle +} + +// appendMark adds textMark `tm` to `w`. +// `pageSize` is used to calculate the word's depth on the page. +func (w *textWord) appendMark(tm *textMark, pageSize model.PdfRectangle) { + w.marks = append(w.marks, tm) + w.PdfRectangle = rectUnion(w.PdfRectangle, tm.PdfRectangle) + if tm.fontsize > w.fontsize { + w.fontsize = tm.fontsize + } + w.depth = pageSize.Ury - w.PdfRectangle.Lly +} + +// addDiacritic adds combining diacritic `text` `tm` to `w`. +// It adds the diacritic to the last mark and doesn't update the size +func (w *textWord) addDiacritic(text string) { + lastMark := w.marks[len(w.marks)-1] + lastMark.text = lastMark.text + text + lastMark.text = norm.NFKC.String(lastMark.text) +} + +// absorb combines `word` into `w`. +func (w *textWord) absorb(word *textWord) { + w.PdfRectangle = rectUnion(w.PdfRectangle, word.PdfRectangle) + w.marks = append(w.marks, word.marks...) 
+} + +// text returns the text in `w`. +func (w *textWord) computeText() string { + texts := make([]string, len(w.marks)) + for i, tm := range w.marks { + texts[i] = tm.text + } + return strings.Join(texts, "") +} + +// toTextMarks returns the TextMarks contained in `w`.text(). +// `offset` is used to give the TextMarks the correct Offset values. +func (w *textWord) toTextMarks(offset *int) []TextMark { + var marks []TextMark + for _, tm := range w.marks { + marks = appendTextMark(marks, offset, tm.ToTextMark()) + } + return marks +} + +// removeWord returns `words` with `word` removed. +// Caller must check that `words` contains `word`, +// TODO(peterwilliams97): Optimize +func removeWord(words []*textWord, word *textWord) []*textWord { + for i, w := range words { + if w == word { + return removeWordAt(words, i) + } + } + common.Log.Error("removeWord: words doesn't contain word=%s", word) + return nil +} + +// removeWord returns `words` with `words[idx]` removed. +func removeWordAt(words []*textWord, idx int) []*textWord { + n := len(words) + copy(words[idx:], words[idx+1:]) + return words[:n-1] +} diff --git a/extractor/utils.go b/extractor/utils.go index bacc600e0..de5dfc4b6 100644 --- a/extractor/utils.go +++ b/extractor/utils.go @@ -41,22 +41,6 @@ func toFloatXY(objs []core.PdfObject) (x, y float64, err error) { return floats[0], floats[1], nil } -// minFloat returns the lesser of `a` and `b`. -func minFloat(a, b float64) float64 { - if a < b { - return a - } - return b -} - -// maxFloat returns the greater of `a` and `b`. -func maxFloat(a, b float64) float64 { - if a > b { - return a - } - return b -} - func procBuf(pt *PageText) { if isTesting { return @@ -73,7 +57,7 @@ func procBuf(pt *PageText) { buf.WriteString(pt.viewText) s := "- [Unlicensed UniDoc - Get a license on https://unidoc.io]" - if buf.Len() > 100 { + if buf.Len() > 102 { s = "... 
[Truncated - Unlicensed UniDoc - Get a license on https://unidoc.io]" buf.Truncate(buf.Len() - 100) } diff --git a/go.mod b/go.mod index 6c007954c..14bd743b6 100644 --- a/go.mod +++ b/go.mod @@ -15,4 +15,5 @@ require ( golang.org/x/image v0.0.0-20181116024801-cd38e8056d9b golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1 // indirect golang.org/x/text v0.3.2 + golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 ) diff --git a/go.sum b/go.sum index e75663e46..1afa04fed 100644 --- a/go.sum +++ b/go.sum @@ -56,6 +56,8 @@ golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543 h1:E7g+9GITq07hpfrRu66IVDexMakfv52eLZ2CXBWiKr4= +golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= diff --git a/internal/textencoding/glyphs_glyphlist.go b/internal/textencoding/glyphs_glyphlist.go index a5ba8f63c..3f0d34bde 100644 --- a/internal/textencoding/glyphs_glyphlist.go +++ b/internal/textencoding/glyphs_glyphlist.go @@ -11,6 +11,7 @@ package textencoding import ( + "bytes" "fmt" "regexp" "strconv" @@ -83,6 +84,16 @@ func RuneToGlyph(r rune) (GlyphName, bool) { return glyph, ok } +// ExpandLigatures returns `runes` as a string with ligatures expanded +func ExpandLigatures(runes []rune) string { + var buffer bytes.Buffer + for _, r := range runes { + s := RuneToString(r) + 
buffer.WriteString(s) + } + return buffer.String() +} + // RuneToString converts rune `r` to a string. It unpacks `ligatures`. func RuneToString(r rune) string { if s, ok := ligatureToString[r]; ok { @@ -137,8 +148,6 @@ var ligatureToString = map[rune]string{ 'œ': "oe", 'Ꝏ': "OO", 'ꝏ': "oo", - 'ẞ': "fs", - 'ß': "fz", 'st': "st", 'ſt': "ſt", 'Ꜩ': "TZ", diff --git a/internal/textencoding/simple.go b/internal/textencoding/simple.go index 2ddd385c7..615b3443b 100644 --- a/internal/textencoding/simple.go +++ b/internal/textencoding/simple.go @@ -7,6 +7,7 @@ package textencoding import ( "errors" + "fmt" "sort" "sync" "unicode/utf8" @@ -54,7 +55,7 @@ func NewSimpleTextEncoder(baseName string, differences map[CharCode]GlyphName) ( fnc, ok := simple[baseName] if !ok { common.Log.Debug("ERROR: NewSimpleTextEncoder. Unknown encoding %q", baseName) - return nil, errors.New("unsupported font encoding") + return nil, fmt.Errorf("unsupported font encoding: %q (%v)", baseName, core.ErrNotSupported) } enc := fnc() if len(differences) != 0 { diff --git a/model/const.go b/model/const.go index d6efcac48..6366a0406 100644 --- a/model/const.go +++ b/model/const.go @@ -7,6 +7,9 @@ package model import ( "errors" + "fmt" + + "github.com/unidoc/unipdf/v3/core" ) // Errors when parsing/loading data in PDF. 
@@ -18,8 +21,8 @@ var ( errRangeError = errors.New("range check error") ErrEncrypted = errors.New("file needs to be decrypted first") ErrNoFont = errors.New("font not defined") - ErrFontNotSupported = errors.New("unsupported font") - ErrType1CFontNotSupported = errors.New("Type1C fonts are not currently supported") - ErrType3FontNotSupported = errors.New("Type3 fonts are not currently supported") - ErrTTCmapNotSupported = errors.New("unsupported TrueType cmap format") + ErrFontNotSupported = fmt.Errorf("unsupported font (%v)", core.ErrNotSupported) + ErrType1CFontNotSupported = fmt.Errorf("Type1C fonts are not currently supported (%v)", core.ErrNotSupported) + ErrType3FontNotSupported = fmt.Errorf("Type3 fonts are not currently supported (%v)", core.ErrNotSupported) + ErrTTCmapNotSupported = fmt.Errorf("unsupported TrueType cmap format (%v)", core.ErrNotSupported) ) diff --git a/model/font.go b/model/font.go index 7860752ae..d1e06ffb3 100644 --- a/model/font.go +++ b/model/font.go @@ -11,6 +11,7 @@ import ( "fmt" "sort" "strings" + "unicode/utf8" "github.com/unidoc/unipdf/v3/common" "github.com/unidoc/unipdf/v3/core" @@ -485,14 +486,8 @@ func (font *PdfFont) CharcodesToStrings(charcodes []textencoding.CharCode) ([]st // encoding and use the glyph indices as character codes, as described following Table 118. func (font *PdfFont) CharcodeBytesToUnicode(data []byte) (string, int, int) { runes, _, numMisses := font.CharcodesToUnicodeWithStats(font.BytesToCharcodes(data)) - - var buffer bytes.Buffer - for _, r := range runes { - buffer.WriteString(textencoding.RuneToString(r)) - } - - str := buffer.String() - return str, len([]rune(str)), numMisses + str := textencoding.ExpandLigatures(runes) + return str, utf8.RuneCountInString(str), numMisses } // CharcodesToUnicode converts the character codes `charcodes` to a slice of runes. 
diff --git a/model/font_composite.go b/model/font_composite.go index 829d2036d..53e57e240 100644 --- a/model/font_composite.go +++ b/model/font_composite.go @@ -16,14 +16,12 @@ import ( "sort" "strings" - "github.com/unidoc/unitype" - "github.com/unidoc/unipdf/v3/common" "github.com/unidoc/unipdf/v3/core" - "github.com/unidoc/unipdf/v3/internal/cmap" "github.com/unidoc/unipdf/v3/internal/textencoding" "github.com/unidoc/unipdf/v3/model/internal/fonts" + "github.com/unidoc/unitype" ) /* @@ -685,7 +683,7 @@ func parseCIDFontWidthsArray(w core.PdfObject) (map[textencoding.CharCode]float6 fontWidths := map[textencoding.CharCode]float64{} wArrLen := wArr.Len() for i := 0; i < wArrLen-1; i++ { - obj0 := wArr.Get(i) + obj0 := core.TraceToDirectObject(wArr.Get(i)) n, ok0 := core.GetIntVal(obj0) if !ok0 { return nil, fmt.Errorf("Bad font W obj0: i=%d %#v", i, obj0) @@ -695,7 +693,7 @@ func parseCIDFontWidthsArray(w core.PdfObject) (map[textencoding.CharCode]float6 return nil, fmt.Errorf("Bad font W array: arr2=%+v", wArr) } - obj1 := wArr.Get(i) + obj1 := core.TraceToDirectObject(wArr.Get(i)) switch obj1.(type) { case *core.PdfObjectArray: arr, _ := core.GetArray(obj1) diff --git a/model/font_test.go b/model/font_test.go index 4592005a6..8bf3307b5 100644 --- a/model/font_test.go +++ b/model/font_test.go @@ -10,6 +10,7 @@ import ( "fmt" "io/ioutil" "testing" + "unicode/utf8" "github.com/stretchr/testify/require" @@ -374,7 +375,7 @@ var charcodeBytesToUnicodeTest = []fontFragmentTest{ 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255}, " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`" + "abcdefghijklmnopqrstuvwxyz{|}~€‚ƒ„…†‡ˆ‰Š‹OEŽ‘’“”•–—˜™š›oežŸ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·" + - "¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞfzàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ", + "¸¹º»¼½¾¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ", }, {"Helvetica built-in", "./testdata/font/simple.txt", 5, @@ -387,7 +388,7 @@ var charcodeBytesToUnicodeTest = 
[]fontFragmentTest{ 184, 185, 186, 187, 188, 189, 191, 193, 194, 195, 196, 197, 198, 199, 225, 227, 232, 241, 245, 248, 249, 250, 251}, ` !"#$%&’()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_‘abcdefghijklmnopqrstuvwxyz{|}~` + - `¡¢£⁄¥ƒ§¤'“«‹›fifl–†‡·¶•‚„”»…‰¿` + "`" + `´ˆ˜¯˘˙ƪŁæıłøoefz`, + `¡¢£⁄¥ƒ§¤'“«‹›fifl–†‡·¶•‚„”»…‰¿` + "`" + `´ˆ˜¯˘˙ƪŁæıłøoeß`, }, {"Symbol built-in", "./testdata/font/simple.txt", 3, @@ -434,7 +435,7 @@ var charcodeBytesToUnicodeTest = []fontFragmentTest{ 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255}, " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`" + - "abcdefghijklmnopqrstuvwxyz{|}~ÄÅÇÉÑÖÜáàâäãåçéèêëíìîïñóòôöõúùûü†°¢£§•¶fz®©™´¨≠ÆØ∞" + + "abcdefghijklmnopqrstuvwxyz{|}~ÄÅÇÉÑÖÜáàâäãåçéèêëíìîïñóòôöõúùûü†°¢£§•¶ß®©™´¨≠ÆØ∞" + "±≤≥¥µ∂∑∏π∫ªºΩæø¿¡¬√ƒ≈∆«»…ÀÃÕOEoe–—“”‘’÷◊ÿŸ⁄€‹›fifl‡·‚„‰ÂÊÁËÈÍÎÏÌÓÔÒÚÛÙıˆ˜¯˘˙˚¸˝˛ˇ", }, {"Test beginbfchar and beginbfrange cmap entries", @@ -608,9 +609,9 @@ func (f *fontFragmentTest) check(t *testing.T) { } } } - if numChars != len([]rune(actualText)) { + if numChars != utf8.RuneCountInString(actualText) { t.Errorf("Incorrect numChars. 
%s numChars=%d expected=%d\n%+v\n%c", - f, numChars, len([]rune(actualText)), []rune(actualText), []rune(actualText)) + f, numChars, utf8.RuneCountInString(actualText), []rune(actualText), []rune(actualText)) } } diff --git a/model/internal/fonts/ttfparser.go b/model/internal/fonts/ttfparser.go index 42d0a94c8..bb1148dbf 100644 --- a/model/internal/fonts/ttfparser.go +++ b/model/internal/fonts/ttfparser.go @@ -209,7 +209,8 @@ func (t *ttfParser) Parse() (TtfType, error) { } if version == "OTTO" { // See https://docs.microsoft.com/en-us/typography/opentype/spec/otff - return TtfType{}, errors.New("fonts based on PostScript outlines are not supported") + return TtfType{}, fmt.Errorf("fonts based on PostScript outlines are not supported (%v)", + core.ErrNotSupported) } if version != "\x00\x01\x00\x00" && version != "true" { // This is not an error. In the font_test.go example axes.txt we see version "true". @@ -376,7 +377,7 @@ func (t *ttfParser) parseCmapSubtable31(offset31 int64) error { t.f.Seek(int64(t.tables["cmap"])+offset31, os.SEEK_SET) format := t.ReadUShort() if format != 4 { - return fmt.Errorf("unexpected subtable format: %d", format) + return fmt.Errorf("unexpected subtable format: %d (%v)", format, core.ErrNotSupported) } t.Skip(2 * 2) // length, language segCount := int(t.ReadUShort() / 2)