Skip to content

Commit

Permalink
Merge branch 'development' of https://github.com/unidoc/unipdf into c…
Browse files Browse the repository at this point in the history
…olumns
  • Loading branch information
peterwilliams97 committed Jun 25, 2020
2 parents 3cca581 + 4f96762 commit b39f205
Show file tree
Hide file tree
Showing 4 changed files with 145 additions and 4 deletions.
68 changes: 68 additions & 0 deletions creator/creator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3138,6 +3138,74 @@ func TestReferencedPageDestinations(t *testing.T) {
testPages(buf, 6, 5)
}

// TestExtractTextColor writes styled paragraphs whose chunks carry distinct
// fill colors, reads the generated PDF back through the extractor and checks
// that every extracted mark has the text and fill color it was created with.
func TestExtractTextColor(t *testing.T) {
	red := ColorRGBFrom8bit(255, 0, 0)
	green := ColorRGBFrom8bit(0, 255, 0)
	blue := ColorRGBFrom8bit(0, 0, 255)

	// Test data.
	type textMark struct {
		text  string
		color Color
	}

	lines := [][]textMark{
		{
			{text: "a", color: red},
			{text: "b", color: green},
			{text: "c", color: blue},
		},
		{
			{text: "x", color: green},
			{text: "y", color: blue},
			{text: "z", color: red},
		},
	}

	// Create output file: one styled paragraph per line, one chunk per mark.
	c := New()

	for _, line := range lines {
		p := c.NewStyledParagraph()
		for _, mark := range line {
			p.Append(mark.text).Style.Color = mark.color
		}
		require.NoError(t, c.Draw(p))
	}

	buf := bytes.NewBuffer(nil)
	require.NoError(t, c.Write(buf))

	// Extract output file.
	reader, err := model.NewPdfReader(bytes.NewReader(buf.Bytes()))
	require.NoError(t, err)

	for _, page := range reader.PageList {
		ex, err := extractor.New(page)
		require.NoError(t, err)

		pageText, _, _, err := ex.ExtractPageText()
		require.NoError(t, err)
		marks := pageText.Marks().Elements()

		// offset is the index in marks of the first mark of the current line.
		// It is tracked explicitly so lines of different lengths keep working.
		offset := 0
		for _, line := range lines {
			for j, inMark := range line {
				idx := offset + j
				// Fail with a readable message instead of an index panic when
				// fewer marks than expected were extracted.
				require.True(t, idx < len(marks),
					"mark index %d out of range: only %d marks extracted", idx, len(marks))

				outMark := marks[idx]
				require.Equal(t, inMark.text, outMark.Text)
				require.NotNil(t, outMark.FillColor)
				outR, outG, outB, _ := outMark.FillColor.RGBA()

				// Compare the fill color of the input mark with the one
				// of the extracted mark. RGBA() yields 16-bit components, so
				// they are shifted down to 8 bits before comparing.
				inR, inG, inB := inMark.color.ToRGB()
				require.Equal(t, uint32(inR*255), outR>>8)
				require.Equal(t, uint32(inG*255), outG>>8)
				require.Equal(t, uint32(inB*255), outB>>8)
			}
			// Skip the marks of this line plus one separator mark between
			// lines (presumably the line-break meta mark; the index
			// arithmetic this replaces made the same assumption).
			offset += len(line) + 1
		}
	}
}

var errRenderNotSupported = errors.New("rendering pdf is not supported on this system")

// renderPDFToPNGs uses ghostscript (gs) to render specified PDF file into a set of PNG images (one per page).
Expand Down
44 changes: 41 additions & 3 deletions extractor/text.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"bytes"
"errors"
"fmt"
"image/color"
"math"
"sort"
"strings"
Expand Down Expand Up @@ -94,7 +95,6 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes
processor.AddHandler(contentstream.HandlerConditionEnumAllOperands, "",
func(op *contentstream.ContentStreamOperation, gs contentstream.GraphicsState,
resources *model.PdfPageResources) error {

operand := op.Operand

if verboseGeom {
Expand Down Expand Up @@ -353,6 +353,14 @@ func (e *Extractor) extractPageText(contents string, resources *model.PdfPageRes
pageText.marks = append(pageText.marks, formResult.pageText.marks...)
state.numChars += formResult.numChars
state.numMisses += formResult.numMisses
case "rg", "g", "k", "cs", "sc", "scn":
// Set non-stroking color/colorspace.
to.gs.ColorspaceNonStroking = gs.ColorspaceNonStroking
to.gs.ColorNonStroking = gs.ColorNonStroking
case "RG", "G", "K", "CS", "SC", "SCN":
// Set stroking color/colorspace.
to.gs.ColorspaceStroking = gs.ColorspaceStroking
to.gs.ColorStroking = gs.ColorStroking
}
return nil
})
Expand Down Expand Up @@ -726,6 +734,16 @@ func (to *textObject) reset() {
to.marks = nil
}

// getFillColor converts the non-stroking colorspace/color pair stored in the
// text object's graphics state into a Go color and returns it.
func (to *textObject) getFillColor() color.Color {
	space, fill := to.gs.ColorspaceNonStroking, to.gs.ColorNonStroking
	return pdfColorToGoColor(space, fill)
}

// getStrokeColor converts the stroking colorspace/color pair stored in the
// text object's graphics state into a Go color and returns it.
func (to *textObject) getStrokeColor() color.Color {
	space, stroke := to.gs.ColorspaceStroking, to.gs.ColorStroking
	return pdfColorToGoColor(space, stroke)
}

// renderText processes and renders byte array `data` for extraction purposes.
// It extracts textMarks based on the charcodes in `data`; the current text and graphics states
// are tracked in `to`.
Expand Down Expand Up @@ -767,6 +785,9 @@ func (to *textObject) renderText(data []byte) error {

common.Log.Trace("renderText: %d codes=%+v runes=%q", len(charcodes), charcodes, len(texts))

fillColor := to.getFillColor()
strokeColor := to.getStrokeColor()

for i, text := range texts {
r := []rune(text)
if len(r) == 1 && r[0] == '\x00' {
Expand Down Expand Up @@ -826,12 +847,15 @@ func (to *textObject) renderText(data []byte) error {
translation(end),
math.Abs(spaceWidth*trm.ScalingFactorX()),
font,
to.state.tc)
to.state.tc,
fillColor,
strokeColor)

if !onPage {
common.Log.Debug("Text mark outside page. Skipping")
continue
}
if font == nil {
if font == nil {
common.Log.Debug("ERROR: No font.")
} else if font.Encoder() == nil {
common.Log.Debug("ERROR: No encoding. font=%s", font)
Expand Down Expand Up @@ -875,6 +899,14 @@ func (to *textObject) moveTo(tx, ty float64) {
to.tm = to.tlm
}









// PageText represents the layout of text on a device page.
type PageText struct {
marks []*textMark // Texts and their positions on a PDF page.
Expand Down Expand Up @@ -1084,6 +1116,12 @@ type TextMark struct {
// spaces (line breaks) when we see characters that are over a threshold horizontal (vertical)
// distance apart. See wordJoiner (lineJoiner) in PageText.computeViews().
Meta bool
// FillColor is the fill color of the text.
// The color is nil for spaces and line breaks (i.e. the Meta field is true).
FillColor color.Color
// StrokeColor is the stroke color of the text.
// The color is nil for spaces and line breaks (i.e. the Meta field is true).
StrokeColor color.Color
}

// String returns a string describing `tm`.
Expand Down
8 changes: 7 additions & 1 deletion extractor/text_mark.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ package extractor

import (
"fmt"
"image/color"
"math"

"github.com/unidoc/unipdf/v3/common"
Expand All @@ -27,13 +28,16 @@ type textMark struct {
trm transform.Matrix // The current text rendering matrix (TRM above).
end transform.Point // The end of character device coordinates.
originaBBox model.PdfRectangle // Bounding box without orientation correction.
fillColor color.Color // Text fill color.
strokeColor color.Color // Text stroke color.
}

// newTextMark returns a textMark for text `text` rendered with text rendering matrix (TRM) `trm`
// and end of character device coordinates `end`. `spaceWidth` is our best guess at the width of a
// space in the font the text is rendered in device coordinates.
func (to *textObject) newTextMark(text string, trm transform.Matrix, end transform.Point,
spaceWidth float64, font *model.PdfFont, charspacing float64) (textMark, bool) {
spaceWidth float64, font *model.PdfFont, charspacing float64,
fillColor, strokeColor color.Color) (textMark, bool) {
theta := trm.Angle()
orient := nearestMultiple(theta, orientationGranularity)
var height float64
Expand Down Expand Up @@ -117,6 +121,8 @@ func (to *textObject) newTextMark(text string, trm transform.Matrix, end transfo
trm: trm,
end: end,
orient: orient,
fillColor: fillColor,
strokeColor: strokeColor,
}
if verboseGeom {
common.Log.Info("newTextMark: start=%.2f end=%.2f %s", start, end, tm.String())
Expand Down
29 changes: 29 additions & 0 deletions extractor/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,12 @@ package extractor
import (
"bytes"
"fmt"
"image/color"

"github.com/unidoc/unipdf/v3/common"
"github.com/unidoc/unipdf/v3/common/license"
"github.com/unidoc/unipdf/v3/core"
"github.com/unidoc/unipdf/v3/model"
)

// RenderMode specifies the text rendering mode (Tmode), which determines whether showing text shall cause
Expand Down Expand Up @@ -76,3 +79,29 @@ func truncate(s string, n int) string {
}
return s[:n]
}

// pdfColorToGoColor converts the specified color to a Go color, using the
// provided colorspace. If unsuccessful (nil colorspace or color, a failed
// conversion, or a conversion result that is not a device RGB color),
// color.Black is returned.
//
// On success the result is an opaque color.NRGBA. Each RGB component is
// clamped to the [0, 1] range and rounded to the nearest 8-bit value.
func pdfColorToGoColor(space model.PdfColorspace, c model.PdfColor) color.Color {
	if space == nil || c == nil {
		return color.Black
	}

	conv, err := space.ColorToRGB(c)
	if err != nil {
		common.Log.Debug("WARN: could not convert color %v (%v) to RGB: %s", c, space, err)
		return color.Black
	}
	rgb, ok := conv.(*model.PdfColorDeviceRGB)
	if !ok {
		common.Log.Debug("WARN: converted color is not in the RGB colorspace: %v", conv)
		return color.Black
	}

	// toByte maps a color component to 0..255. Converting an out-of-range or
	// NaN float to uint8 is implementation-dependent in Go, so such values are
	// clamped first; rounding (instead of truncating) keeps a value such as
	// 0.5019 at 128 rather than dropping it to 127.
	toByte := func(v float64) uint8 {
		if !(v > 0) { // Also catches NaN.
			return 0
		}
		if v >= 1 {
			return 255
		}
		return uint8(v*255 + 0.5)
	}

	return color.NRGBA{
		R: toByte(rgb.R()),
		G: toByte(rgb.G()),
		B: toByte(rgb.B()),
		A: 255,
	}
}

0 comments on commit b39f205

Please sign in to comment.