Skip to content

Commit

Permalink
Subsetting fixes (unidoc#346)
Browse files Browse the repository at this point in the history
* Update unitype lib which improves subsetting

* Add text extraction check to creator font subsetting example

Helps ensure ToUnicode map is set correctly.

* Clean up import

* Fix spelling
  • Loading branch information
gunnsth authored May 12, 2020
1 parent aef6e5e commit ad2a1e9
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 11 deletions.
38 changes: 32 additions & 6 deletions creator/creator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (
"github.com/unidoc/unipdf/v3/common"
"github.com/unidoc/unipdf/v3/contentstream/draw"
"github.com/unidoc/unipdf/v3/core"
"github.com/unidoc/unipdf/v3/extractor"
"github.com/unidoc/unipdf/v3/model"
"github.com/unidoc/unipdf/v3/model/optimize"
)
Expand Down Expand Up @@ -696,12 +697,31 @@ func TestParagraphChinese(t *testing.T) {
require.NoError(t, err)
}

testWriteAndRender(t, creator, "2_p_nihao.pdf")
fname := tempFile("2_p_nihao.pdf")
fname := testWrite(t, creator, "2_p_nihao.pdf")
st, err := os.Stat(fname)
require.NoError(t, err)
t.Logf("output size: %d (%.2f MB)", st.Size(), float64(st.Size())/1024/1024)

t.Logf("output size: %d (%d MB)", st.Size(), st.Size()/1024/1024)
// Check if text is extracted correctly (tests the ToUnicode map).
f, err := os.Open(fname)
require.NoError(t, err)
defer f.Close()
r, err := model.NewPdfReaderLazy(f)
require.NoError(t, err)
p, err := r.GetPage(1)
require.NoError(t, err)
e, err := extractor.New(p)
require.NoError(t, err)
text, err := e.ExtractText()
require.NoError(t, err)
expected := strings.Join(lines, "\n")
if len(text) > len(expected) {
// Trim off extra license data.
text = text[:len(expected)]
}
require.Equal(t, expected, text)

testRender(t, fname)
}

// Test paragraph with composite font and various unicode characters.
Expand Down Expand Up @@ -3113,14 +3133,20 @@ func hashFile(file string) (string, error) {
return hex.EncodeToString(h.Sum(nil)), nil
}

func testWriteAndRender(t *testing.T, c *Creator, pname string) {
func testWriteAndRender(t *testing.T, c *Creator, pname string) string {
pname = testWrite(t, c, pname)
testRender(t, pname)
return pname
}

func testWrite(t *testing.T, c *Creator, pname string) string {
pname = tempFile(pname)
err := c.WriteToFile(pname)
if err != nil {
t.Errorf("Fail: %v\n", err)
return
return pname
}
testRender(t, pname)
return pname
}

func testRender(t *testing.T, pdfPath string) {
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ require (
github.com/stretchr/testify v1.4.0
github.com/unidoc/pkcs7 v0.0.0-20200411230602-d883fd70d1df
github.com/unidoc/timestamp v0.0.0-20200412005513-91597fd3793a
github.com/unidoc/unitype v0.0.0-20200426000514-43fb032b9ce6
github.com/unidoc/unitype v0.1.0
golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5
golang.org/x/image v0.0.0-20181116024801-cd38e8056d9b
golang.org/x/text v0.3.2
Expand Down
9 changes: 5 additions & 4 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -15,25 +15,25 @@ github.com/golang/freetype v0.0.0-20170609003504-e2365dfdc4a0/go.mod h1:E/TSTwGw
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/konsorten/go-windows-terminal-sequences v1.0.2 h1:DB17ag19krx9CFsz4o3enTrPXyIXCl+2iCXH/aMAp9s=
github.com/konsorten/go-windows-terminal-sequences v1.0.2/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
github.com/kr/pretty v0.1.0 h1:L/CwN0zerZDmRFUapSPitk6f+Q3+0za1rQkzVuMiMFI=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/sirupsen/logrus v1.5.0 h1:1N5EYkVAPEywqZRJd7cwnRtCb6xJx7NH3T3WUTF980Q=
github.com/sirupsen/logrus v1.5.0/go.mod h1:+F7Ogzej0PZc/94MaYx/nvG9jOFMD2osvC3s+Squfpo=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk=
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/unidoc/pkcs7 v0.0.0-20200411230602-d883fd70d1df h1:1RV3lxQ6L6xGFNhngpP9iMjJPSwvH3p17JNbK9u5274=
github.com/unidoc/pkcs7 v0.0.0-20200411230602-d883fd70d1df/go.mod h1:UEzOZUEpJfDpywVJMUT8QiugqEZC29pDq7kdIZhWCr8=
github.com/unidoc/timestamp v0.0.0-20200412005513-91597fd3793a h1:RLtvUhe4DsUDl66m7MJ8OqBjq8jpWBXPK6/RKtqeTkc=
github.com/unidoc/timestamp v0.0.0-20200412005513-91597fd3793a/go.mod h1:j+qMWZVpZFTvDey3zxUkSgPJZEX33tDgU/QIA0IzCUw=
github.com/unidoc/unitype v0.0.0-20200426000514-43fb032b9ce6 h1:wKQZP0/WXDQ6kqniHSpdXol+895jojF+Sk4XORxxi3A=
github.com/unidoc/unitype v0.0.0-20200426000514-43fb032b9ce6/go.mod h1:mafyug7zYmDOusqa7G0dJV45qp4b6TDAN+pHN7ZUIBU=
github.com/unidoc/unitype v0.1.0 h1:6zJYMl8XdwFBD45Cmg8Ge13WyE92jwLuK1tk2IsRb9s=
github.com/unidoc/unitype v0.1.0/go.mod h1:mafyug7zYmDOusqa7G0dJV45qp4b6TDAN+pHN7ZUIBU=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5 h1:58fnuSXlxZmFdJyvtTFVmVhcMLU6v5fEb/ok4wyqtNU=
golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
Expand All @@ -51,6 +51,7 @@ golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs=
golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 h1:qIbj1fsPNlZgppZ+VLlY7N33q108Sa+fhmuc+sWQYwY=
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
1 change: 1 addition & 0 deletions internal/textencoding/truetype.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ type TrueTypeFontEncoder struct {
}

// SubsetRegistered subsets `enc` to only registered runes (that have been registered via encoding).
// NOTE: Make sure to call this soon before writing (once all needed runes have been registered).
func (enc *TrueTypeFontEncoder) SubsetRegistered() {
common.Log.Info("TTF Subset: Pruning")
for r := range enc.runeToGIDMap {
Expand Down
2 changes: 2 additions & 0 deletions model/font.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ type PdfFont struct {
// SubsetRegistered subsets the font to only the glyphs that have been registered by the encoder.
// NOTE: This only works on fonts that support subsetting. For unsupported fonts this is a no-op, although a debug
// message is emitted. Currently supported fonts are embedded Truetype CID fonts (type 0).
// NOTE: Make sure to call this soon before writing (once all needed runes have been registered).
// If using package creator, use its EnableFontSubsetting method instead.
func (font *PdfFont) SubsetRegistered() error {
switch t := font.context.(type) {
case *pdfFontType0:
Expand Down

0 comments on commit ad2a1e9

Please sign in to comment.