Skip to content

Commit

Permalink
Font subsetting and font optimization improvements (unidoc#362)
Browse files Browse the repository at this point in the history
* Track runes in IdentityEncoder (for subsetting), track decoded runes

* Working with the identity encoder in font_composite.go

* Add GetFilterArray to multi encoder.  Add comments.

* Add NewFromContents constructor to extractor only requiring contents and resources

* golint fixes

* Optimizer compress streams - improved detection of raw streams

* Optimize - CleanContentStream optimizer that removes redundant operands

* WIP Optimize - clean fonts

Will support both font file reduction and subsetting. (WIP)

* Optimize - image processing - try combined DCT and Flate

* Update options.go

* Update optimizer.go

* Create utils.go for optimize with common methods needed for optimization

* Optimizer - add font subsetting method

Covers XObject Forms, annotations, etc. Uses the extractor package to extract text marks that record which fonts and glyphs are used. The truetype package is used for subsetting.

* Add some comments

* Fix cmap parsing rune conversion

* Error checking for extractor.  Add some comments.

* Update Jenkinsfile

* Update modules
  • Loading branch information
gunnsth authored Jun 16, 2020
1 parent 99ef1b8 commit 11f692b
Show file tree
Hide file tree
Showing 20 changed files with 851 additions and 95 deletions.
26 changes: 12 additions & 14 deletions Jenkinsfile
Original file line number Diff line number Diff line change
@@ -1,13 +1,10 @@
node {
// Install the desired Go version
def root = tool name: 'go 1.11.5', type: 'go'
def root = tool name: 'go 1.14.3', type: 'go'

env.GOROOT="${root}"
env.GOPATH="${WORKSPACE}/gopath"
// Hack for 1.11.5 testing work.
env.CGO_ENABLED="0"
env.PATH="${root}/bin:${env.GOPATH}/bin:${env.PATH}"
env.GOCACHE="off"
env.GOBIN="${WORKSPACE}/bin"
env.PATH="${root}/bin:${env.GOBIN}:${env.PATH}"
env.UNIDOC_EXTRACT_FORCETEST="1"
env.UNIDOC_E2E_FORCE_TESTS="1"
env.UNIDOC_EXTRACT_TESTDATA="/home/jenkins/corpus/unidoc-extractor-testdata"
Expand All @@ -19,13 +16,13 @@ node {
env.UNIDOC_JBIG2_TESTDATA="/home/jenkins/corpus/jbig2-testdata"
env.UNIDOC_FDFMERGE_TESTDATA="/home/jenkins/corpus/fdfmerge-testdata"
env.UNIDOC_GS_BIN_PATH="/usr/bin/gs"
// Hack for 1.11.5 testing work.
env.CGO_ENABLED="0"

env.TMPDIR="${WORKSPACE}/temp"
sh "mkdir -p ${env.GOBIN}"
sh "mkdir -p ${env.TMPDIR}"

dir("${GOPATH}/src/github.com/unidoc/unipdf") {
dir("${WORKSPACE}/unipdf") {
sh 'go version'

stage('Checkout') {
Expand All @@ -35,11 +32,9 @@ node {

stage('Prepare') {
// Get linter and other build tools.
sh 'go get -u golang.org/x/lint/golint'
sh 'go get golang.org/x/lint/golint'
sh 'go get github.com/tebeka/go2xunit'
sh 'go get github.com/t-yuki/gocover-cobertura'
// Get all dependencies (for tests also).
sh 'go get -t ./...'
}

stage('Linting') {
Expand All @@ -53,7 +48,7 @@ node {
stage('Testing') {
// Go test - No tolerance.
sh "rm -f ${env.TMPDIR}/*.pdf"
sh '2>&1 go test -v ./... | tee gotest.txt'
sh '2>&1 go test -count=1 -v ./... | tee gotest.txt'
}

stage('Check generated PDFs') {
Expand All @@ -62,7 +57,7 @@ node {
}

stage('Test coverage') {
sh 'go test -coverprofile=coverage.out -covermode=atomic -coverpkg=./... ./...'
sh 'go test -count=1 -coverprofile=coverage.out -covermode=atomic -coverpkg=./... ./...'
sh '/home/jenkins/codecov.sh'
sh 'gocover-cobertura < coverage.out > coverage.xml'
step([$class: 'CoberturaPublisher', coberturaReportFile: 'coverage.xml'])
Expand All @@ -80,7 +75,7 @@ node {
}
}

dir("${GOPATH}/src/github.com/unidoc/unipdf-examples") {
dir("${WORKSPACE}/unipdf-examples") {
stage('Build examples') {
// Output environment variables (useful for debugging).
sh("printenv")
Expand All @@ -97,6 +92,9 @@ node {

echo "Pulling unipdf-examples on branch ${examplesBranch}"
git url: 'https://github.com/unidoc/unidoc-examples.git', branch: examplesBranch

// Use replace directive to use disk version of unipdf.
sh 'echo "replace github.com/unidoc/unipdf/v3 => ../unipdf" >>go.mod'

// Dependencies for examples.
sh './build_examples.sh'
Expand Down
21 changes: 14 additions & 7 deletions core/encoding.go
Original file line number Diff line number Diff line change
Expand Up @@ -948,7 +948,6 @@ func newDCTEncoderFromStream(streamObj *PdfObjectStream, multiEnc *MultiEncoder)
return nil, err
}
encoded = e

}

bufReader := bytes.NewReader(encoded)
Expand Down Expand Up @@ -2158,6 +2157,9 @@ func newMultiEncoderFromStream(streamObj *PdfObjectStream) (*MultiEncoder, error

// GetFilterName returns the names of the underlying encoding filters,
// separated by spaces.
// Note: This is just a string, should not be used in /Filter dictionary entry. Use GetFilterArray for that.
// TODO(v4): Refactor to GetFilter() which can be used for /Filter (either Name or Array), this can be
// renamed to String() as a pretty string to use in debugging etc.
func (enc *MultiEncoder) GetFilterName() string {
name := ""
for idx, encoder := range enc.encoders {
Expand All @@ -2169,6 +2171,16 @@ func (enc *MultiEncoder) GetFilterName() string {
return name
}

// GetFilterArray collects the filter name of every underlying encoder, in
// encoder order, into a PDF array of name objects suitable for use as the
// /Filter entry of a stream dictionary.
func (enc *MultiEncoder) GetFilterArray() *PdfObjectArray {
	filters := make([]PdfObject, 0, len(enc.encoders))
	for _, encoder := range enc.encoders {
		filters = append(filters, MakeName(encoder.GetFilterName()))
	}
	return MakeArray(filters...)
}

// MakeDecodeParams makes a new instance of an encoding dictionary based on
// the current encoder settings.
func (enc *MultiEncoder) MakeDecodeParams() PdfObject {
Expand Down Expand Up @@ -2201,12 +2213,7 @@ func (enc *MultiEncoder) AddEncoder(encoder StreamEncoder) {
// MakeStreamDict makes a new instance of an encoding dictionary for a stream object.
func (enc *MultiEncoder) MakeStreamDict() *PdfObjectDictionary {
dict := MakeDict()

names := make([]PdfObject, len(enc.encoders))
for i, e := range enc.encoders {
names[i] = MakeName(e.GetFilterName())
}
dict.Set("Filter", MakeArray(names...))
dict.Set("Filter", enc.GetFilterArray())

// Pass all values from children, except Filter and DecodeParms.
for _, encoder := range enc.encoders {
Expand Down
7 changes: 6 additions & 1 deletion extractor/extractor.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,14 @@ func New(page *model.PdfPage) (*Extractor, error) {
// fmt.Printf("%s\n", contents)
// fmt.Println("========================= ::: =========================")

return NewFromContents(contents, page.Resources)
}

// NewFromContents creates a new extractor from contents and page resources.
func NewFromContents(contents string, resources *model.PdfPageResources) (*Extractor, error) {
e := &Extractor{
contents: contents,
resources: page.Resources,
resources: resources,
fontCache: map[string]fontEntry{},
formResults: map[string]textResult{},
}
Expand Down
9 changes: 8 additions & 1 deletion extractor/text.go
Original file line number Diff line number Diff line change
Expand Up @@ -439,7 +439,11 @@ func (to *textObject) showTextAdjusted(args *core.PdfObjectArray) error {
common.Log.Trace("showTextAdjusted: Bad string arg. o=%s args=%+v", o, args)
return core.ErrTypeError
}
to.renderText(charcodes)
err := to.renderText(charcodes)
if err != nil {
common.Log.Debug("Render text error: %v", err)
return err
}
default:
common.Log.Debug("ERROR: showTextAdjusted. Unexpected type (%T) args=%+v", o, args)
return core.ErrTypeError
Expand Down Expand Up @@ -736,6 +740,7 @@ func (to *textObject) renderText(data []byte) error {
continue
}

// TODO(gunnsth): Assuming 1:1 charcode[i] <-> rune[i] mapping.
code := charcodes[i]
// The location of the text on the page in device coordinates is given by trm, the text
// rendering matrix.
Expand Down Expand Up @@ -785,6 +790,8 @@ func (to *textObject) renderText(data []byte) error {
} else if font.Encoder() == nil {
common.Log.Debug("ERROR: No encoding. font=%s", font)
} else {
// TODO: This lookup seems confusing. Went from bytes <-> charcodes already.
// NOTE: This is needed to register runes by the font encoder - for subsetting (optimization).
original, ok := font.Encoder().CharcodeToRune(code)
if ok {
mark.original = string(original)
Expand Down
4 changes: 1 addition & 3 deletions extractor/text_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,7 @@ var doStress bool
func init() {
flag.BoolVar(&doStress, "extractor-stresstest", false, "Run text extractor stress tests.")
common.SetLogger(common.NewConsoleLogger(common.LogLevelInfo))
if flag.Lookup("test.v") != nil {
isTesting = true
}
isTesting = true
}

// TestTextExtractionFragments tests text extraction on the PDF fragments in `fragmentTests`.
Expand Down
5 changes: 4 additions & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,15 @@ go 1.11
require (
github.com/adrg/sysfont v0.1.0
github.com/boombuler/barcode v1.0.0
github.com/davecgh/go-spew v1.1.1
github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0
github.com/sirupsen/logrus v1.6.0
github.com/stretchr/testify v1.4.0
github.com/unidoc/pkcs7 v0.0.0-20200411230602-d883fd70d1df
github.com/unidoc/timestamp v0.0.0-20200412005513-91597fd3793a
github.com/unidoc/unitype v0.1.0
github.com/unidoc/unitype v0.2.0
golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5
golang.org/x/image v0.0.0-20181116024801-cd38e8056d9b
golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1 // indirect
golang.org/x/text v0.3.2
)
10 changes: 10 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGw
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/konsorten/go-windows-terminal-sequences v1.0.2 h1:DB17ag19krx9CFsz4o3enTrPXyIXCl+2iCXH/aMAp9s=
github.com/konsorten/go-windows-terminal-sequences v1.0.2/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/konsorten/go-windows-terminal-sequences v1.0.3 h1:CE8S1cTafDpPvMhIxNJKvHsGVBgn1xWYf1NbHQhywc8=
github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
Expand All @@ -24,6 +26,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/sirupsen/logrus v1.5.0 h1:1N5EYkVAPEywqZRJd7cwnRtCb6xJx7NH3T3WUTF980Q=
github.com/sirupsen/logrus v1.5.0/go.mod h1:+F7Ogzej0PZc/94MaYx/nvG9jOFMD2osvC3s+Squfpo=
github.com/sirupsen/logrus v1.6.0 h1:UBcNElsrwanuuMsnGSlYmtmgbb23qDR5dG+6X6Oo89I=
github.com/sirupsen/logrus v1.6.0/go.mod h1:7uNnSEd1DgxDLC74fIahvMZmmYsHGZGEOFrfsX/uA88=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk=
Expand All @@ -34,6 +38,10 @@ github.com/unidoc/timestamp v0.0.0-20200412005513-91597fd3793a h1:RLtvUhe4DsUDl6
github.com/unidoc/timestamp v0.0.0-20200412005513-91597fd3793a/go.mod h1:j+qMWZVpZFTvDey3zxUkSgPJZEX33tDgU/QIA0IzCUw=
github.com/unidoc/unitype v0.1.0 h1:6zJYMl8XdwFBD45Cmg8Ge13WyE92jwLuK1tk2IsRb9s=
github.com/unidoc/unitype v0.1.0/go.mod h1:mafyug7zYmDOusqa7G0dJV45qp4b6TDAN+pHN7ZUIBU=
github.com/unidoc/unitype v0.1.1-0.20200524232639-77d42b645b02 h1:zVMJh0ehLc0amGBcqIh7HWikIGXGBGpmW+Lvz1YVYH8=
github.com/unidoc/unitype v0.1.1-0.20200524232639-77d42b645b02/go.mod h1:mafyug7zYmDOusqa7G0dJV45qp4b6TDAN+pHN7ZUIBU=
github.com/unidoc/unitype v0.2.0 h1:N+ZKjwz8UDU0qa1IYzstDLffvQEctFo+bo6b6ZqW+9M=
github.com/unidoc/unitype v0.2.0/go.mod h1:mafyug7zYmDOusqa7G0dJV45qp4b6TDAN+pHN7ZUIBU=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5 h1:58fnuSXlxZmFdJyvtTFVmVhcMLU6v5fEb/ok4wyqtNU=
golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
Expand All @@ -45,6 +53,8 @@ golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7w
golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200413165638-669c56c373c4 h1:opSr2sbRXk5X5/givKrrKj9HXxFpW2sdCiP8MJSKLQY=
golang.org/x/sys v0.0.0-20200413165638-669c56c373c4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1 h1:ogLJMz+qpzav7lGMh10LMvAkM/fAoGlaiiHYiFYdm80=
golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=
Expand Down
56 changes: 45 additions & 11 deletions internal/textencoding/identity.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,47 +13,81 @@ import (
"github.com/unidoc/unipdf/v3/core"
)

// IdentityEncoder represents a 2-byte identity encoding
// IdentityEncoder represents a 2-byte identity encoding.
// NOTE: In many cases this is just used to encode/decode to glyph index and does not have a unicode
// meaning, except via the ToUnicode maps.
// TODO: The use of runes as indicators for glyph indices and not-utf8 runes is not good and confusing.
// Might be better to combine the Identity encoder with a ToUnicode map and keep track of the actual
// runes and character codes, CMaps together.
type IdentityEncoder struct {
	// baseName is the name of the predefined encoding; it is what String()
	// returns and, when non-empty, what ToPdfObject() emits as a PDF name.
	baseName string

	// runes registered by encoder for tracking what runes are used for subsetting.
	// The map is created lazily on first registration, so it is nil until
	// RuneToCharcode or CharcodeToRune has been called.
	registeredMap map[rune]struct{}
}

// NewIdentityTextEncoder returns a new IdentityEncoder based on the predefined
// encoding `baseName`.
func NewIdentityTextEncoder(baseName string) IdentityEncoder {
return IdentityEncoder{baseName}
func NewIdentityTextEncoder(baseName string) *IdentityEncoder {
return &IdentityEncoder{
baseName: baseName,
}
}

// RegisteredRunes returns every rune that the encoder has recorded as used.
// The result is a freshly allocated slice; its order follows map iteration
// and is therefore unspecified.
func (enc *IdentityEncoder) RegisteredRunes() []rune {
	used := make([]rune, 0, len(enc.registeredMap))
	for r := range enc.registeredMap {
		used = append(used, r)
	}
	return used
}

// String returns a string that describes `enc`.
func (enc IdentityEncoder) String() string {
func (enc *IdentityEncoder) String() string {
return enc.baseName
}

// Encode converts the Go unicode string to a PDF encoded string.
func (enc IdentityEncoder) Encode(str string) []byte {
func (enc *IdentityEncoder) Encode(str string) []byte {
return encodeString16bit(enc, str)
}

// Decode converts PDF encoded string to a Go unicode string.
func (enc IdentityEncoder) Decode(raw []byte) string {
func (enc *IdentityEncoder) Decode(raw []byte) string {
return decodeString16bit(enc, raw)
}

// RuneToCharcode converts rune `r` to a PDF character code.
// The bool return flag is true if there was a match, and false otherwise.
func (enc IdentityEncoder) RuneToCharcode(r rune) (CharCode, bool) {
// TODO: Here the `r` is an actual rune.
func (enc *IdentityEncoder) RuneToCharcode(r rune) (CharCode, bool) {
if enc.registeredMap == nil {
enc.registeredMap = map[rune]struct{}{}
}
enc.registeredMap[r] = struct{}{} // Register use (subsetting).

return CharCode(r), true
}

// CharcodeToRune converts PDF character code `code` to a rune.
// The bool return flag is true if there was a match, and false otherwise.
func (enc IdentityEncoder) CharcodeToRune(code CharCode) (rune, bool) {
// TODO: Here the `r` is not necessarily an actual rune but a glyph index (unless both).
func (enc *IdentityEncoder) CharcodeToRune(code CharCode) (rune, bool) {
if enc.registeredMap == nil {
enc.registeredMap = map[rune]struct{}{}
}

// TODO: The rune(code) is confusing and is not an actual utf8 rune.
enc.registeredMap[rune(code)] = struct{}{}
return rune(code), true
}

// RuneToGlyph returns the glyph name for rune `r`.
// The bool return flag is true if there was a match, and false otherwise.
func (enc IdentityEncoder) RuneToGlyph(r rune) (GlyphName, bool) {
func (enc *IdentityEncoder) RuneToGlyph(r rune) (GlyphName, bool) {
if r == ' ' {
return "space", true
}
Expand All @@ -63,7 +97,7 @@ func (enc IdentityEncoder) RuneToGlyph(r rune) (GlyphName, bool) {

// GlyphToRune returns the rune corresponding to glyph name `glyph`.
// The bool return flag is true if there was a match, and false otherwise.
func (enc IdentityEncoder) GlyphToRune(glyph GlyphName) (rune, bool) {
func (enc *IdentityEncoder) GlyphToRune(glyph GlyphName) (rune, bool) {
// String with "uniXXXX" format where XXXX is the hexcode.
if glyph == "space" {
return ' ', true
Expand All @@ -78,7 +112,7 @@ func (enc IdentityEncoder) GlyphToRune(glyph GlyphName) (rune, bool) {
}

// ToPdfObject returns a nil as it is not truly a PDF object and should not be attempted to store in file.
func (enc IdentityEncoder) ToPdfObject() core.PdfObject {
func (enc *IdentityEncoder) ToPdfObject() core.PdfObject {
if enc.baseName != "" {
return core.MakeName(enc.baseName)
}
Expand Down
11 changes: 11 additions & 0 deletions internal/textencoding/simple.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,9 @@ type simpleEncoding struct {
// one byte encoding: CharCode <-> byte
encode map[rune]byte
decode map[byte]rune

// runes registered by encoder for tracking what runes are used for subsetting.
registeredMap map[rune]struct{}
}

// Encode converts the Go unicode string to a PDF encoded string.
Expand Down Expand Up @@ -213,6 +216,10 @@ func (enc *simpleEncoding) Charcodes() []CharCode {

// RuneToCharcode converts rune `r` to a PDF character code via the encode table.
// The bool return flag is true if there was a match, and false otherwise.
// The rune is recorded in registeredMap on every call, whether or not it
// matched, so that the set of requested runes is available for subsetting.
func (enc *simpleEncoding) RuneToCharcode(r rune) (CharCode, bool) {
	// Create the registry lazily on first use.
	if enc.registeredMap == nil {
		enc.registeredMap = make(map[rune]struct{})
	}
	enc.registeredMap[r] = struct{}{} // Register use (subsetting).

	code, found := enc.encode[r]
	return CharCode(code), found
}

Expand All @@ -222,6 +229,10 @@ func (enc *simpleEncoding) CharcodeToRune(code CharCode) (rune, bool) {
}
b := byte(code)
r, ok := enc.decode[b]
if enc.registeredMap == nil {
enc.registeredMap = map[rune]struct{}{}
}
enc.registeredMap[r] = struct{}{} // Register use (subsetting).
return r, ok
}

Expand Down
Loading

0 comments on commit 11f692b

Please sign in to comment.