-
Notifications
You must be signed in to change notification settings - Fork 0
/
dict-decoder.mbt
194 lines (177 loc) · 6.74 KB
/
dict-decoder.mbt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
// This file is based on the Go implementation found here:
// https://cs.opensource.google/go/go/+/refs/tags/go1.23.1:src/compress/flate/dict_decoder.go
// which has the copyright notice:
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
///| DictDecoder implements the LZ77 sliding dictionary as used in decompression.
/// LZ77 decompresses data through sequences of two forms of commands:
///
/// - Literal insertions: Runs of one or more symbols are inserted into the data
/// stream as is. This is accomplished through the write_byte method for a
/// single symbol, or combinations of write_slice/write_mark for multiple symbols.
/// Any valid stream must start with a literal insertion if no preset dictionary
/// is used.
///
/// - Backward copies: Runs of one or more symbols are copied from previously
/// emitted data. Backward copies come as the tuple (dist, length) where dist
/// determines how far back in the stream to copy from and length determines how
/// many bytes to copy. Note that it is valid for the length to be greater than
/// the distance. Since LZ77 uses forward copies, that situation is used to
/// perform a form of run-length encoding on repeated runs of symbols.
/// The write_copy and try_write_copy are used to implement this command.
///
/// For performance reasons, this implementation performs little to no sanity
/// checks about the arguments. As such, the invariants documented for each
/// method call must be respected.
struct DictDecoder {
hist : Slice[Byte] // []byte // Sliding window history
// Invariant: 0 <= rd_pos <= wr_pos <= len(hist)
mut wr_pos : Int // Current output position in buffer
mut rd_pos : Int // Have emitted hist[:rd_pos] already
mut full : Bool // Has a full window length been written yet?
}
///| DictDecoder::new initializes DictDecoder to have a sliding window dictionary of the given
/// size. If a preset dict is provided, it will initialize the dictionary with
/// the contents of dict.
fn DictDecoder::new(size : Int, dict : Slice[Byte]) -> DictDecoder {
let dd = {
hist: Slice::new(Array::make(size, b'\x00')),
wr_pos: 0,
rd_pos: 0,
full: false,
}
let mut dict = dict
if dict.length() > size {
dict = dict[dict.length() - size:]
}
dd.wr_pos = slice_copy(dd.hist, dict)
if dd.wr_pos == size {
dd.wr_pos = 0
dd.full = true
}
dd.rd_pos = dd.wr_pos
dd
}
///| hist_size reports the total amount of historical data in the dictionary.
fn hist_size(self : DictDecoder) -> Int {
if self.full {
return self.hist.length()
}
self.wr_pos
}
///| avail_read reports the number of bytes that can be flushed by read_flush.
fn avail_read(self : DictDecoder) -> Int {
self.wr_pos - self.rd_pos
}
///| avail_write reports the available amount of output buffer space.
fn avail_write(self : DictDecoder) -> Int {
self.hist.length() - self.wr_pos
}
///| write_slice returns a slice of the available buffer to write data to.
///
/// This invariant will be kept: len(s) <= avail_write()
fn write_slice(self : DictDecoder) -> Slice[Byte] {
self.hist[self.wr_pos:]
}
///| write_mark advances the writer pointer by cnt.
///
/// This invariant must be kept: 0 <= cnt <= avail_write()
fn write_mark(self : DictDecoder, cnt : Int) -> Unit {
self.wr_pos += cnt
}
///| write_byte writes a single byte to the dictionary.
///
/// This invariant must be kept: 0 < avail_write()
fn write_byte(self : DictDecoder, c : Byte) -> Unit {
self.hist[self.wr_pos] = c
self.wr_pos += 1
}
///| write_copy copies a string at a given (dist, length) to the output.
/// This returns the number of bytes copied and may be less than the requested
/// length if the available space in the output buffer is too small.
///
/// This invariant must be kept: 0 < dist <= hist_size()
fn write_copy(self : DictDecoder, dist : Int, length : Int) -> Int {
let dst_base = self.wr_pos
let mut dst_pos = dst_base
let mut src_pos = dst_pos - dist
let mut end_pos = dst_pos + length
if end_pos > self.hist.length() {
end_pos = self.hist.length()
}
// Copy non-overlapping section after destination position.
//
// This section is non-overlapping in that the copy length for this section
// is always less than or equal to the backwards distance. This can occur
// if a distance refers to data that wraps-around in the buffer.
// Thus, a backwards copy is performed here; that is, the exact bytes in
// the source prior to the copy is placed in the destination.
if src_pos < 0 {
src_pos += self.hist.length()
dst_pos += slice_copy(self.hist[dst_pos:end_pos], self.hist[src_pos:])
src_pos = 0
}
// Copy possibly overlapping section before destination position.
//
// This section can overlap if the copy length for this section is larger
// than the backwards distance. This is allowed by LZ77 so that repeated
// strings can be succinctly represented using (dist, length) pairs.
// Thus, a forwards copy is performed here; that is, the bytes copied is
// possibly dependent on the resulting bytes in the destination as the copy
// progresses along.
while dst_pos < end_pos {
dst_pos += slice_copy(
self.hist[dst_pos:end_pos],
self.hist[src_pos:dst_pos],
)
}
//
self.wr_pos = dst_pos
return dst_pos - dst_base
}
///| try_write_copy tries to copy a string at a given (distance, length) to the
/// output. This specialized version is optimized for short distances.
///
/// This method is designed to be inlined for performance reasons.
///
/// This invariant must be kept: 0 < dist <= hist_size()
fn try_write_copy(self : DictDecoder, dist : Int, length : Int) -> Int {
let mut dst_pos = self.wr_pos
let end_pos = dst_pos + length
if dst_pos < dist || end_pos > self.hist.length() {
return 0
}
let dst_base = dst_pos
let src_pos = dst_pos - dist
// Copy possibly overlapping section before destination position.
while dst_pos < end_pos {
dst_pos += slice_copy(
self.hist[dst_pos:end_pos],
self.hist[src_pos:dst_pos],
)
}
self.wr_pos = dst_pos
return dst_pos - dst_base
}
///|
fn slice_copy(dst : Slice[Byte], src : Slice[Byte]) -> Int {
let n = @math.minimum(dst.length(), src.length())
for i = 0; i < n; i = i + 1 {
dst[i] = src[i]
}
n
}
///| read_flush returns a slice of the historical buffer that is ready to be
/// emitted to the user. The data returned by read_flush must be fully consumed
/// before calling any other DictDecoder methods.
fn read_flush(self : DictDecoder) -> Slice[Byte] {
let to_read = self.hist[self.rd_pos:self.wr_pos]
self.rd_pos = self.wr_pos
if self.wr_pos == self.hist.length() {
self.wr_pos = 0
self.rd_pos = 0
self.full = true
}
return to_read
}