Font subsetting and font optimization improvements (unidoc#362)

* Track runes in IdentityEncoder (for subsetting), track decoded runes * Working with the identity encoder in font_composite.go * Add GetFilterArray to multi encoder. Add comments. * Add NewFromContents constructor to extractor only requiring contents and resources * golint fixes * Optimizer compress streams - improved detection of raw streams * Optimize - CleanContentStream optimizer that removes redundant operands * WIP Optimize - clean fonts. Will support both font file reduction and subsetting. (WIP) * Optimize - image processing - try combined DCT and Flate * Update options.go * Update optimizer.go * Create utils.go for optimize with common methods needed for optimization * Optimizer - add font subsetting method. Covers XObject Forms, annotations, etc. Uses extractor package to extract text marks covering what fonts and glyphs are used. Package truetype used for subsetting. * Add some comments * Fix cmap parsing rune conversion * Error checking for extractor. Add some comments. * Update Jenkinsfile * Update modules
oliverpool · Jun 16, 2020 · 11f692b · 11f692b
1 parent 99ef1b8
commit 11f692b
Show file tree

Hide file tree

Showing 20 changed files with 851 additions and 95 deletions.
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -1,13 +1,10 @@
 node {
     // Install the desired Go version
-    def root = tool name: 'go 1.11.5', type: 'go'
+    def root = tool name: 'go 1.14.3', type: 'go'
 
     env.GOROOT="${root}"
-    env.GOPATH="${WORKSPACE}/gopath"
-    // Hack for 1.11.5 testing work.
-    env.CGO_ENABLED="0"
-    env.PATH="${root}/bin:${env.GOPATH}/bin:${env.PATH}"
-    env.GOCACHE="off"
+    env.GOBIN="${WORKSPACE}/bin"
+    env.PATH="${root}/bin:${env.GOBIN}:${env.PATH}"
     env.UNIDOC_EXTRACT_FORCETEST="1"
     env.UNIDOC_E2E_FORCE_TESTS="1"
     env.UNIDOC_EXTRACT_TESTDATA="/home/jenkins/corpus/unidoc-extractor-testdata"
@@ -19,13 +16,13 @@ node {
     env.UNIDOC_JBIG2_TESTDATA="/home/jenkins/corpus/jbig2-testdata"
     env.UNIDOC_FDFMERGE_TESTDATA="/home/jenkins/corpus/fdfmerge-testdata"
     env.UNIDOC_GS_BIN_PATH="/usr/bin/gs"
-    // Hack for 1.11.5 testing work.
     env.CGO_ENABLED="0"
 
     env.TMPDIR="${WORKSPACE}/temp"
+    sh "mkdir -p ${env.GOBIN}"
     sh "mkdir -p ${env.TMPDIR}"
 
-    dir("${GOPATH}/src/github.com/unidoc/unipdf") {
+    dir("${WORKSPACE}/unipdf") {
         sh 'go version'
 
         stage('Checkout') {
@@ -35,11 +32,9 @@ node {
 
         stage('Prepare') {
             // Get linter and other build tools.
-            sh 'go get -u golang.org/x/lint/golint'
+            sh 'go get golang.org/x/lint/golint'
             sh 'go get github.com/tebeka/go2xunit'
             sh 'go get github.com/t-yuki/gocover-cobertura'
-            // Get all dependencies (for tests also).
-            sh 'go get -t ./...'
         }
 
         stage('Linting') {
@@ -53,7 +48,7 @@ node {
         stage('Testing') {
             // Go test - No tolerance.
             sh "rm -f ${env.TMPDIR}/*.pdf"
-            sh '2>&1 go test -v ./... | tee gotest.txt'
+            sh '2>&1 go test -count=1 -v ./... | tee gotest.txt'
         }
 
         stage('Check generated PDFs') {
@@ -62,7 +57,7 @@ node {
         }
 
         stage('Test coverage') {
-            sh 'go test -coverprofile=coverage.out -covermode=atomic -coverpkg=./... ./...'
+            sh 'go test -count=1 -coverprofile=coverage.out -covermode=atomic -coverpkg=./... ./...'
             sh '/home/jenkins/codecov.sh'
             sh 'gocover-cobertura < coverage.out > coverage.xml'
             step([$class: 'CoberturaPublisher', coberturaReportFile: 'coverage.xml'])
@@ -80,7 +75,7 @@ node {
         }
     }
 
-    dir("${GOPATH}/src/github.com/unidoc/unipdf-examples") {
+    dir("${WORKSPACE}/unipdf-examples") {
         stage('Build examples') {
             // Output environment variables (useful for debugging).
             sh("printenv")
@@ -97,6 +92,9 @@ node {
 
             echo "Pulling unipdf-examples on branch ${examplesBranch}"
             git url: 'https://github.com/unidoc/unidoc-examples.git', branch: examplesBranch
+
+            // Use replace directive to use disk version of unipdf.
+            sh 'echo "replace github.com/unidoc/unipdf/v3 => ../unipdf" >>go.mod'
 
             // Dependencies for examples.
             sh './build_examples.sh'

diff --git a/core/encoding.go b/core/encoding.go
@@ -948,7 +948,6 @@ func newDCTEncoderFromStream(streamObj *PdfObjectStream, multiEnc *MultiEncoder)
 			return nil, err
 		}
 		encoded = e
-
 	}
 
 	bufReader := bytes.NewReader(encoded)
@@ -2158,6 +2157,9 @@ func newMultiEncoderFromStream(streamObj *PdfObjectStream) (*MultiEncoder, error
 
 // GetFilterName returns the names of the underlying encoding filters,
 // separated by spaces.
+// Note: This is just a string, should not be used in /Filter dictionary entry. Use GetFilterArray for that.
+// TODO(v4): Refactor to GetFilter() which can be used for /Filter (either Name or Array), this can be
+//  renamed to String() as a pretty string to use in debugging etc.
 func (enc *MultiEncoder) GetFilterName() string {
 	name := ""
 	for idx, encoder := range enc.encoders {
@@ -2169,6 +2171,16 @@ func (enc *MultiEncoder) GetFilterName() string {
 	return name
 }
 
+// GetFilterArray returns the names of the underlying encoding filters in an array that
+// can be used as /Filter entry.
+func (enc *MultiEncoder) GetFilterArray() *PdfObjectArray {
+	names := make([]PdfObject, len(enc.encoders))
+	for i, e := range enc.encoders {
+		names[i] = MakeName(e.GetFilterName())
+	}
+	return MakeArray(names...)
+}
+
 // MakeDecodeParams makes a new instance of an encoding dictionary based on
 // the current encoder settings.
 func (enc *MultiEncoder) MakeDecodeParams() PdfObject {
@@ -2201,12 +2213,7 @@ func (enc *MultiEncoder) AddEncoder(encoder StreamEncoder) {
 // MakeStreamDict makes a new instance of an encoding dictionary for a stream object.
 func (enc *MultiEncoder) MakeStreamDict() *PdfObjectDictionary {
 	dict := MakeDict()
-
-	names := make([]PdfObject, len(enc.encoders))
-	for i, e := range enc.encoders {
-		names[i] = MakeName(e.GetFilterName())
-	}
-	dict.Set("Filter", MakeArray(names...))
+	dict.Set("Filter", enc.GetFilterArray())
 
 	// Pass all values from children, except Filter and DecodeParms.
 	for _, encoder := range enc.encoders {

diff --git a/extractor/extractor.go b/extractor/extractor.go
@@ -42,9 +42,14 @@ func New(page *model.PdfPage) (*Extractor, error) {
 	// fmt.Printf("%s\n", contents)
 	// fmt.Println("========================= ::: =========================")
 
+	return NewFromContents(contents, page.Resources)
+}
+
+// NewFromContents creates a new extractor from contents and page resources.
+func NewFromContents(contents string, resources *model.PdfPageResources) (*Extractor, error) {
 	e := &Extractor{
 		contents:    contents,
-		resources:   page.Resources,
+		resources:   resources,
 		fontCache:   map[string]fontEntry{},
 		formResults: map[string]textResult{},
 	}

diff --git a/extractor/text.go b/extractor/text.go
@@ -439,7 +439,11 @@ func (to *textObject) showTextAdjusted(args *core.PdfObjectArray) error {
 				common.Log.Trace("showTextAdjusted: Bad string arg. o=%s args=%+v", o, args)
 				return core.ErrTypeError
 			}
-			to.renderText(charcodes)
+			err := to.renderText(charcodes)
+			if err != nil {
+				common.Log.Debug("Render text error: %v", err)
+				return err
+			}
 		default:
 			common.Log.Debug("ERROR: showTextAdjusted. Unexpected type (%T) args=%+v", o, args)
 			return core.ErrTypeError
@@ -736,6 +740,7 @@ func (to *textObject) renderText(data []byte) error {
 			continue
 		}
 
+		// TODO(gunnsth): Assuming 1:1 charcode[i] <-> rune[i] mapping.
 		code := charcodes[i]
 		// The location of the text on the page in device coordinates is given by trm, the text
 		// rendering matrix.
@@ -785,6 +790,8 @@ func (to *textObject) renderText(data []byte) error {
 		} else if font.Encoder() == nil {
 			common.Log.Debug("ERROR: No encoding. font=%s", font)
 		} else {
+			// TODO: This lookup seems confusing. Went from bytes <-> charcodes already.
+			// NOTE: This is needed to register runes by the font encoder - for subsetting (optimization).
 			original, ok := font.Encoder().CharcodeToRune(code)
 			if ok {
 				mark.original = string(original)

diff --git a/extractor/text_test.go b/extractor/text_test.go
@@ -51,9 +51,7 @@ var doStress bool
 func init() {
 	flag.BoolVar(&doStress, "extractor-stresstest", false, "Run text extractor stress tests.")
 	common.SetLogger(common.NewConsoleLogger(common.LogLevelInfo))
-	if flag.Lookup("test.v") != nil {
-		isTesting = true
-	}
+	isTesting = true
 }
 
 // TestTextExtractionFragments tests text extraction on the PDF fragments in `fragmentTests`.

diff --git a/go.mod b/go.mod
@@ -5,12 +5,15 @@ go 1.11
 require (
 	github.com/adrg/sysfont v0.1.0
 	github.com/boombuler/barcode v1.0.0
+	github.com/davecgh/go-spew v1.1.1
 	github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0
+	github.com/sirupsen/logrus v1.6.0
 	github.com/stretchr/testify v1.4.0
 	github.com/unidoc/pkcs7 v0.0.0-20200411230602-d883fd70d1df
 	github.com/unidoc/timestamp v0.0.0-20200412005513-91597fd3793a
-	github.com/unidoc/unitype v0.1.0
+	github.com/unidoc/unitype v0.2.0
 	golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5
 	golang.org/x/image v0.0.0-20181116024801-cd38e8056d9b
+	golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1 // indirect
 	golang.org/x/text v0.3.2
 )
diff --git a/go.sum b/go.sum
@@ -15,6 +15,8 @@ github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGw
 github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
 github.com/konsorten/go-windows-terminal-sequences v1.0.2 h1:DB17ag19krx9CFsz4o3enTrPXyIXCl+2iCXH/aMAp9s=
 github.com/konsorten/go-windows-terminal-sequences v1.0.2/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
+github.com/konsorten/go-windows-terminal-sequences v1.0.3 h1:CE8S1cTafDpPvMhIxNJKvHsGVBgn1xWYf1NbHQhywc8=
+github.com/konsorten/go-windows-terminal-sequences v1.0.3/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
 github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
 github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
 github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
@@ -24,6 +26,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/sirupsen/logrus v1.5.0 h1:1N5EYkVAPEywqZRJd7cwnRtCb6xJx7NH3T3WUTF980Q=
 github.com/sirupsen/logrus v1.5.0/go.mod h1:+F7Ogzej0PZc/94MaYx/nvG9jOFMD2osvC3s+Squfpo=
+github.com/sirupsen/logrus v1.6.0 h1:UBcNElsrwanuuMsnGSlYmtmgbb23qDR5dG+6X6Oo89I=
+github.com/sirupsen/logrus v1.6.0/go.mod h1:7uNnSEd1DgxDLC74fIahvMZmmYsHGZGEOFrfsX/uA88=
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
 github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
 github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk=
@@ -34,6 +38,10 @@ github.com/unidoc/timestamp v0.0.0-20200412005513-91597fd3793a h1:RLtvUhe4DsUDl6
 github.com/unidoc/timestamp v0.0.0-20200412005513-91597fd3793a/go.mod h1:j+qMWZVpZFTvDey3zxUkSgPJZEX33tDgU/QIA0IzCUw=
 github.com/unidoc/unitype v0.1.0 h1:6zJYMl8XdwFBD45Cmg8Ge13WyE92jwLuK1tk2IsRb9s=
 github.com/unidoc/unitype v0.1.0/go.mod h1:mafyug7zYmDOusqa7G0dJV45qp4b6TDAN+pHN7ZUIBU=
+github.com/unidoc/unitype v0.1.1-0.20200524232639-77d42b645b02 h1:zVMJh0ehLc0amGBcqIh7HWikIGXGBGpmW+Lvz1YVYH8=
+github.com/unidoc/unitype v0.1.1-0.20200524232639-77d42b645b02/go.mod h1:mafyug7zYmDOusqa7G0dJV45qp4b6TDAN+pHN7ZUIBU=
+github.com/unidoc/unitype v0.2.0 h1:N+ZKjwz8UDU0qa1IYzstDLffvQEctFo+bo6b6ZqW+9M=
+github.com/unidoc/unitype v0.2.0/go.mod h1:mafyug7zYmDOusqa7G0dJV45qp4b6TDAN+pHN7ZUIBU=
 golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
 golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5 h1:58fnuSXlxZmFdJyvtTFVmVhcMLU6v5fEb/ok4wyqtNU=
 golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
@@ -45,6 +53,8 @@ golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7w
 golang.org/x/sys v0.0.0-20190422165155-953cdadca894/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/sys v0.0.0-20200413165638-669c56c373c4 h1:opSr2sbRXk5X5/givKrrKj9HXxFpW2sdCiP8MJSKLQY=
 golang.org/x/sys v0.0.0-20200413165638-669c56c373c4/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1 h1:ogLJMz+qpzav7lGMh10LMvAkM/fAoGlaiiHYiFYdm80=
+golang.org/x/sys v0.0.0-20200615200032-f1bc736245b1/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg=
 golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
 golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=

diff --git a/internal/textencoding/identity.go b/internal/textencoding/identity.go
@@ -13,47 +13,81 @@ import (
 	"github.com/unidoc/unipdf/v3/core"
 )
 
-// IdentityEncoder represents an 2-byte identity encoding
+// IdentityEncoder represents a 2-byte identity encoding.
+// NOTE: In many cases this is just used to encode/decode to glyph index and does not have a unicode
+//  meaning, except via the ToUnicode maps.
+// TODO: The use of runes as indicators for glyph indices and not-utf8 runes is not good and confusing.
+//  Might be better to combine the Identity encoder with a ToUnicode map and keep track of the actual
+//  runes and character codes, CMaps together.
 type IdentityEncoder struct {
 	baseName string
+
+	// runes registered by encoder for tracking what runes are used for subsetting.
+	registeredMap map[rune]struct{}
 }
 
 // NewIdentityTextEncoder returns a new IdentityEncoder based on predefined
 // encoding `baseName` and difference map `differences`.
-func NewIdentityTextEncoder(baseName string) IdentityEncoder {
-	return IdentityEncoder{baseName}
+func NewIdentityTextEncoder(baseName string) *IdentityEncoder {
+	return &IdentityEncoder{
+		baseName: baseName,
+	}
+}
+
+// RegisteredRunes returns the slice of runes that have been registered as used by the encoder.
+func (enc *IdentityEncoder) RegisteredRunes() []rune {
+	runes := make([]rune, len(enc.registeredMap))
+	i := 0
+	for r := range enc.registeredMap {
+		runes[i] = r
+		i++
+	}
+	return runes
 }
 
 // String returns a string that describes `enc`.
-func (enc IdentityEncoder) String() string {
+func (enc *IdentityEncoder) String() string {
 	return enc.baseName
 }
 
 // Encode converts the Go unicode string to a PDF encoded string.
-func (enc IdentityEncoder) Encode(str string) []byte {
+func (enc *IdentityEncoder) Encode(str string) []byte {
 	return encodeString16bit(enc, str)
 }
 
 // Decode converts PDF encoded string to a Go unicode string.
-func (enc IdentityEncoder) Decode(raw []byte) string {
+func (enc *IdentityEncoder) Decode(raw []byte) string {
 	return decodeString16bit(enc, raw)
 }
 
 // RuneToCharcode converts rune `r` to a PDF character code.
 // The bool return flag is true if there was a match, and false otherwise.
-func (enc IdentityEncoder) RuneToCharcode(r rune) (CharCode, bool) {
+// TODO: Here the `r` is an actual rune.
+func (enc *IdentityEncoder) RuneToCharcode(r rune) (CharCode, bool) {
+	if enc.registeredMap == nil {
+		enc.registeredMap = map[rune]struct{}{}
+	}
+	enc.registeredMap[r] = struct{}{} // Register use (subsetting).
+
 	return CharCode(r), true
 }
 
 // CharcodeToRune converts PDF character code `code` to a rune.
 // The bool return flag is true if there was a match, and false otherwise.
-func (enc IdentityEncoder) CharcodeToRune(code CharCode) (rune, bool) {
+// TODO: Here the `r` is not necessarily an actual rune but a glyph index (unless both).
+func (enc *IdentityEncoder) CharcodeToRune(code CharCode) (rune, bool) {
+	if enc.registeredMap == nil {
+		enc.registeredMap = map[rune]struct{}{}
+	}
+
+	// TODO: The rune(code) is confusing and is not an actual utf8 rune.
+	enc.registeredMap[rune(code)] = struct{}{}
 	return rune(code), true
 }
 
 // RuneToGlyph returns the glyph name for rune `r`.
 // The bool return flag is true if there was a match, and false otherwise.
-func (enc IdentityEncoder) RuneToGlyph(r rune) (GlyphName, bool) {
+func (enc *IdentityEncoder) RuneToGlyph(r rune) (GlyphName, bool) {
 	if r == ' ' {
 		return "space", true
 	}
@@ -63,7 +97,7 @@ func (enc IdentityEncoder) RuneToGlyph(r rune) (GlyphName, bool) {
 
 // GlyphToRune returns the rune corresponding to glyph name `glyph`.
 // The bool return flag is true if there was a match, and false otherwise.
-func (enc IdentityEncoder) GlyphToRune(glyph GlyphName) (rune, bool) {
+func (enc *IdentityEncoder) GlyphToRune(glyph GlyphName) (rune, bool) {
 	// String with "uniXXXX" format where XXXX is the hexcode.
 	if glyph == "space" {
 		return ' ', true
@@ -78,7 +112,7 @@ func (enc IdentityEncoder) GlyphToRune(glyph GlyphName) (rune, bool) {
 }
 
 // ToPdfObject returns a nil as it is not truly a PDF object and should not be attempted to store in file.
-func (enc IdentityEncoder) ToPdfObject() core.PdfObject {
+func (enc *IdentityEncoder) ToPdfObject() core.PdfObject {
 	if enc.baseName != "" {
 		return core.MakeName(enc.baseName)
 	}

diff --git a/internal/textencoding/simple.go b/internal/textencoding/simple.go
@@ -103,6 +103,9 @@ type simpleEncoding struct {
 	// one byte encoding: CharCode <-> byte
 	encode map[rune]byte
 	decode map[byte]rune
+
+	// runes registered by encoder for tracking what runes are used for subsetting.
+	registeredMap map[rune]struct{}
 }
 
 // Encode converts the Go unicode string to a PDF encoded string.
@@ -213,6 +216,10 @@ func (enc *simpleEncoding) Charcodes() []CharCode {
 
 func (enc *simpleEncoding) RuneToCharcode(r rune) (CharCode, bool) {
 	b, ok := enc.encode[r]
+	if enc.registeredMap == nil {
+		enc.registeredMap = map[rune]struct{}{}
+	}
+	enc.registeredMap[r] = struct{}{} // Register use (subsetting).
 	return CharCode(b), ok
 }
 
@@ -222,6 +229,10 @@ func (enc *simpleEncoding) CharcodeToRune(code CharCode) (rune, bool) {
 	}
 	b := byte(code)
 	r, ok := enc.decode[b]
+	if enc.registeredMap == nil {
+		enc.registeredMap = map[rune]struct{}{}
+	}
+	enc.registeredMap[r] = struct{}{} // Register use (subsetting).
 	return r, ok
 }