Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

huff0: Improve 4X decompression speed 5-10% #437

Merged
merged 4 commits into from
Jan 8, 2022
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
huff0: 4X Decompression experiment
Significant degradations, abandoned.

```
λ benchcmp before.txt after.txt
benchmark                                            old ns/op     new ns/op     delta
BenchmarkDecompress4XNoTable/digits-32               166962        183522        +9.92%
BenchmarkDecompress4XNoTable/gettysburg-32           2747          2783          +1.31%
BenchmarkDecompress4XNoTable/twain-32                579833        581783        +0.34%
BenchmarkDecompress4XNoTable/low-ent.10k-32          56815         65143         +14.66%
BenchmarkDecompress4XNoTable/superlow-ent-10k-32     15217         17399         +14.34%
BenchmarkDecompress4XNoTable/case1-32                223           215           -3.63%
BenchmarkDecompress4XNoTable/case2-32                178           173           -3.03%
BenchmarkDecompress4XNoTable/case3-32                186           178           -4.10%
BenchmarkDecompress4XNoTable/pngdata.001-32          78199         79470         +1.63%
BenchmarkDecompress4XNoTable/normcount2-32           295           280           -5.22%
BenchmarkDecompress4XTable/digits-32                 169105        186653        +10.38%
BenchmarkDecompress4XTable/gettysburg-32             4113          4243          +3.16%
BenchmarkDecompress4XTable/twain-32                  580482        589091        +1.48%
BenchmarkDecompress4XTable/low-ent.10k-32            57488         66318         +15.36%
BenchmarkDecompress4XTable/superlow-ent-10k-32       15801         18111         +14.62%
BenchmarkDecompress4XTable/case1-32                  2060          2062          +0.10%
BenchmarkDecompress4XTable/case2-32                  2004          2015          +0.55%
BenchmarkDecompress4XTable/case3-32                  2026          2040          +0.69%
BenchmarkDecompress4XTable/pngdata.001-32            81603         83582         +2.43%
BenchmarkDecompress4XTable/normcount2-32             1426          1401          -1.75%
```
  • Loading branch information
klauspost committed Sep 7, 2021
commit 9886fdcbbdf3e303746ae94b6ab5f6f0a9154c49
179 changes: 101 additions & 78 deletions huff0/decompress.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,8 @@ type dEntrySingle struct {

// double-symbols decoding
type dEntryDouble struct {
seq uint16
seq [2]byte
nBits uint8
len uint8
}

// Uses special code for all tables that are < 8 bits.
Expand Down Expand Up @@ -914,7 +913,7 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
out := dst
dstEvery := (dstSize + 3) / 4

shift := (8 - d.actualTableLog) & 7
shift := (56 + (8 - d.actualTableLog)) & 63

const tlSize = 1 << 8
single := d.dt.single[:tlSize]
Expand All @@ -938,37 +937,41 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
br[stream].fillFast()
br[stream2].fillFast()

v := single[br[stream].peekByteFast()>>shift].entry
v := single[uint8(br[stream].value>>shift)].entry
v2 := single[uint8(br[stream2].value>>shift)].entry
buf[off+bufoff*stream] = uint8(v >> 8)
br[stream].advance(uint8(v))

v2 := single[br[stream2].peekByteFast()>>shift].entry
buf[off+bufoff*stream2] = uint8(v2 >> 8)
br[stream2].advance(uint8(v2))
br[stream].bitsRead += uint8(v)
br[stream].value <<= v & 63
br[stream2].bitsRead += uint8(v2)
br[stream2].value <<= v2 & 63

v = single[br[stream].peekByteFast()>>shift].entry
v = single[uint8(br[stream].value>>shift)].entry
v2 = single[uint8(br[stream2].value>>shift)].entry
buf[off+bufoff*stream+1] = uint8(v >> 8)
br[stream].advance(uint8(v))

v2 = single[br[stream2].peekByteFast()>>shift].entry
buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
br[stream2].advance(uint8(v2))
br[stream].bitsRead += uint8(v)
br[stream].value <<= v & 63
br[stream2].bitsRead += uint8(v2)
br[stream2].value <<= v2 & 63

v = single[br[stream].peekByteFast()>>shift].entry
v = single[uint8(br[stream].value>>shift)].entry
v2 = single[uint8(br[stream2].value>>shift)].entry
buf[off+bufoff*stream+2] = uint8(v >> 8)
br[stream].advance(uint8(v))

v2 = single[br[stream2].peekByteFast()>>shift].entry
buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
br[stream2].advance(uint8(v2))
br[stream].bitsRead += uint8(v)
br[stream].value <<= v & 63
br[stream2].bitsRead += uint8(v2)
br[stream2].value <<= v2 & 63

v = single[br[stream].peekByteFast()>>shift].entry
v = single[uint8(br[stream].value>>shift)].entry
v2 = single[uint8(br[stream2].value>>shift)].entry
buf[off+bufoff*stream+3] = uint8(v >> 8)
br[stream].advance(uint8(v))

v2 = single[br[stream2].peekByteFast()>>shift].entry
buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
br[stream2].advance(uint8(v2))
br[stream].bitsRead += uint8(v)
br[stream].value <<= v & 63
br[stream2].bitsRead += uint8(v2)
br[stream2].value <<= v2 & 63
}

{
Expand All @@ -977,37 +980,41 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
br[stream].fillFast()
br[stream2].fillFast()

v := single[br[stream].peekByteFast()>>shift].entry
v := single[uint8(br[stream].value>>shift)].entry
v2 := single[uint8(br[stream2].value>>shift)].entry
buf[off+bufoff*stream] = uint8(v >> 8)
br[stream].advance(uint8(v))

v2 := single[br[stream2].peekByteFast()>>shift].entry
buf[off+bufoff*stream2] = uint8(v2 >> 8)
br[stream2].advance(uint8(v2))
br[stream].bitsRead += uint8(v)
br[stream].value <<= v & 63
br[stream2].bitsRead += uint8(v2)
br[stream2].value <<= v2 & 63

v = single[br[stream].peekByteFast()>>shift].entry
v = single[uint8(br[stream].value>>shift)].entry
v2 = single[uint8(br[stream2].value>>shift)].entry
buf[off+bufoff*stream+1] = uint8(v >> 8)
br[stream].advance(uint8(v))

v2 = single[br[stream2].peekByteFast()>>shift].entry
buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
br[stream2].advance(uint8(v2))
br[stream].bitsRead += uint8(v)
br[stream].value <<= v & 63
br[stream2].bitsRead += uint8(v2)
br[stream2].value <<= v2 & 63

v = single[br[stream].peekByteFast()>>shift].entry
v = single[uint8(br[stream].value>>shift)].entry
v2 = single[uint8(br[stream2].value>>shift)].entry
buf[off+bufoff*stream+2] = uint8(v >> 8)
br[stream].advance(uint8(v))

v2 = single[br[stream2].peekByteFast()>>shift].entry
buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
br[stream2].advance(uint8(v2))
br[stream].bitsRead += uint8(v)
br[stream].value <<= v & 63
br[stream2].bitsRead += uint8(v2)
br[stream2].value <<= v2 & 63

v = single[br[stream].peekByteFast()>>shift].entry
v = single[uint8(br[stream].value>>shift)].entry
v2 = single[uint8(br[stream2].value>>shift)].entry
buf[off+bufoff*stream+3] = uint8(v >> 8)
br[stream].advance(uint8(v))

v2 = single[br[stream2].peekByteFast()>>shift].entry
buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
br[stream2].advance(uint8(v2))
br[stream].bitsRead += uint8(v)
br[stream].value <<= v & 63
br[stream2].bitsRead += uint8(v2)
br[stream2].value <<= v2 & 63
}

off += 4
Expand Down Expand Up @@ -1073,7 +1080,7 @@ func (d *Decoder) decompress4X8bit(dst, src []byte) ([]byte, error) {
}

// Read value and increment offset.
v := single[br.peekByteFast()>>shift].entry
v := single[uint8(br.value>>shift)].entry
nBits := uint8(v)
br.advance(nBits)
bitsLeft -= int(nBits)
Expand Down Expand Up @@ -1121,7 +1128,7 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
out := dst
dstEvery := (dstSize + 3) / 4

const shift = 0
const shift = 56
const tlSize = 1 << 8
const tlMask = tlSize - 1
single := d.dt.single[:tlSize]
Expand All @@ -1145,37 +1152,45 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
br[stream].fillFast()
br[stream2].fillFast()

v := single[br[stream].peekByteFast()>>shift].entry
v := single[uint8(br[stream].value>>shift)].entry
buf[off+bufoff*stream] = uint8(v >> 8)
br[stream].advance(uint8(v))
br[stream].bitsRead += uint8(v)
br[stream].value <<= v & 63

v2 := single[br[stream2].peekByteFast()>>shift].entry
v2 := single[uint8(br[stream2].value>>shift)].entry
buf[off+bufoff*stream2] = uint8(v2 >> 8)
br[stream2].advance(uint8(v2))
br[stream2].bitsRead += uint8(v2)
br[stream2].value <<= v2 & 63

v = single[br[stream].peekByteFast()>>shift].entry
v = single[uint8(br[stream].value>>shift)].entry
buf[off+bufoff*stream+1] = uint8(v >> 8)
br[stream].advance(uint8(v))
br[stream].bitsRead += uint8(v)
br[stream].value <<= v & 63

v2 = single[br[stream2].peekByteFast()>>shift].entry
v2 = single[uint8(br[stream2].value>>shift)].entry
buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
br[stream2].advance(uint8(v2))
br[stream2].bitsRead += uint8(v2)
br[stream2].value <<= v2 & 63

v = single[br[stream].peekByteFast()>>shift].entry
v = single[uint8(br[stream].value>>shift)].entry
buf[off+bufoff*stream+2] = uint8(v >> 8)
br[stream].advance(uint8(v))
br[stream].bitsRead += uint8(v)
br[stream].value <<= v & 63

v2 = single[br[stream2].peekByteFast()>>shift].entry
v2 = single[uint8(br[stream2].value>>shift)].entry
buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
br[stream2].advance(uint8(v2))
br[stream2].bitsRead += uint8(v2)
br[stream2].value <<= v2 & 63

v = single[br[stream].peekByteFast()>>shift].entry
v = single[uint8(br[stream].value>>shift)].entry
buf[off+bufoff*stream+3] = uint8(v >> 8)
br[stream].advance(uint8(v))
br[stream].bitsRead += uint8(v)
br[stream].value <<= v & 63

v2 = single[br[stream2].peekByteFast()>>shift].entry
v2 = single[uint8(br[stream2].value>>shift)].entry
buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
br[stream2].advance(uint8(v2))
br[stream2].bitsRead += uint8(v2)
br[stream2].value <<= v2 & 63
}

{
Expand All @@ -1184,37 +1199,45 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
br[stream].fillFast()
br[stream2].fillFast()

v := single[br[stream].peekByteFast()>>shift].entry
v := single[uint8(br[stream].value>>shift)].entry
buf[off+bufoff*stream] = uint8(v >> 8)
br[stream].advance(uint8(v))
br[stream].bitsRead += uint8(v)
br[stream].value <<= v & 63

v2 := single[br[stream2].peekByteFast()>>shift].entry
v2 := single[uint8(br[stream2].value>>shift)].entry
buf[off+bufoff*stream2] = uint8(v2 >> 8)
br[stream2].advance(uint8(v2))
br[stream2].bitsRead += uint8(v2)
br[stream2].value <<= v2 & 63

v = single[br[stream].peekByteFast()>>shift].entry
v = single[uint8(br[stream].value>>shift)].entry
buf[off+bufoff*stream+1] = uint8(v >> 8)
br[stream].advance(uint8(v))
br[stream].bitsRead += uint8(v)
br[stream].value <<= v & 63

v2 = single[br[stream2].peekByteFast()>>shift].entry
v2 = single[uint8(br[stream2].value>>shift)].entry
buf[off+bufoff*stream2+1] = uint8(v2 >> 8)
br[stream2].advance(uint8(v2))
br[stream2].bitsRead += uint8(v2)
br[stream2].value <<= v2 & 63

v = single[br[stream].peekByteFast()>>shift].entry
v = single[uint8(br[stream].value>>shift)].entry
buf[off+bufoff*stream+2] = uint8(v >> 8)
br[stream].advance(uint8(v))
br[stream].bitsRead += uint8(v)
br[stream].value <<= v & 63

v2 = single[br[stream2].peekByteFast()>>shift].entry
v2 = single[uint8(br[stream2].value>>shift)].entry
buf[off+bufoff*stream2+2] = uint8(v2 >> 8)
br[stream2].advance(uint8(v2))
br[stream2].bitsRead += uint8(v2)
br[stream2].value <<= v2 & 63

v = single[br[stream].peekByteFast()>>shift].entry
v = single[uint8(br[stream].value>>shift)].entry
buf[off+bufoff*stream+3] = uint8(v >> 8)
br[stream].advance(uint8(v))
br[stream].bitsRead += uint8(v)
br[stream].value <<= v & 63

v2 = single[br[stream2].peekByteFast()>>shift].entry
v2 = single[uint8(br[stream2].value>>shift)].entry
buf[off+bufoff*stream2+3] = uint8(v2 >> 8)
br[stream2].advance(uint8(v2))
br[stream2].bitsRead += uint8(v2)
br[stream2].value <<= v2 & 63
}

off += 4
Expand Down Expand Up @@ -1280,7 +1303,7 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
}

// Read value and increment offset.
v := single[br.peekByteFast()>>shift].entry
v := single[br.peekByteFast()].entry
nBits := uint8(v)
br.advance(nBits)
bitsLeft -= int(nBits)
Expand Down