-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrenc.py
66 lines (55 loc) · 2.72 KB
/
crenc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
"""crEnc encoding based on yEnc and optimized for inline HTML / JS text compression and image encoding
In the spirit of yEnc (why encode?), we only encode symbols where absolutely required.
If the HTML or JS charset can be set to a single-byte encoding such as cp1252 (or latin1),
the only symbol requiring special treatment is the carriage-return (CR), hence crEnc,
which can be dealt with by simple backslash escaping.
We embed the data in a JS template literal (delimited by backticks), so we also escape backslash, ` and ${
giving us an effective 253 byte values out of 256,
with an overhead of ~ 3/256 ~ 1.2% (compared to 33.3% for Base64).
JS does the unescaping, so the decoder only needs to take care of HTML character overrides for NUL and codes in 128 - 159.
An optimal global character modular offset can be applied to minimize escaping, similar to dynEncode (enabled by default).
A minimalistic JS decoder code is generated.
References:
https://en.wikipedia.org/wiki/Binary-to-text_encoding
http://www.yenc.org
https://github.com/eshaz/simple-yenc
https://github.com/eshaz/simple-yenc#what-is-dynencode
https://html.spec.whatwg.org/multipage/parsing.html#table-charref-overrides
https://stackoverflow.com/questions/10080605/special-character-u0098-read-as-u02dc-using-charcodeat/#10081375
"""
from typing import Optional, Tuple
if not __package__:
import default_vars, webify
else:
# noinspection PyPackages
from . import default_vars, webify
def encode(data: bytes, offset: int = 0) -> bytes:
    """Shift every byte by ``offset`` modulo 256, then escape the result.

    Args:
        data: Raw bytes to encode.
        offset: Value added to each byte (wrapping at 256). ``0`` leaves the
            bytes untouched before escaping.

    Returns:
        Whatever ``webify.escape`` returns for the (possibly shifted) bytes.
    """
    if offset:
        # One 256-entry table plus bytes.translate does the modular shift in a
        # single C-level pass instead of a Python-level loop over every byte;
        # this matters because optimize_encode calls us once per candidate offset.
        shift_table = bytes((value + offset) & 255 for value in range(256))
        # bytes(data) keeps bytearray / iterable-of-int inputs working as before.
        data = bytes(data).translate(shift_table)
    return webify.escape(data)
def optimize_encode(data: bytes) -> Tuple[bytes, int, int]:
    """Try all 256 modular offsets and keep the shortest encoding.

    Every offset in ``range(256)`` is passed to ``encode``. The lowest offset
    that yields the minimal output length wins.

    Args:
        data: Raw bytes to encode.

    Returns:
        A tuple ``(encoded, best_offset, saved)``: the output of
        ``encode(data, best_offset)``, the chosen offset, and how many bytes
        shorter that output is than the offset-0 encoding.
    """
    # Offset 0 is the baseline every other candidate is measured against.
    best_out = encode(data, 0)
    best_offset = 0
    baseline_length = best_length = len(best_out)
    for offset in range(1, 256):
        candidate = encode(data, offset)
        # Strict '<' keeps the lowest offset among equally short encodings.
        if len(candidate) < best_length:
            # Keep the winning output so it need not be re-encoded afterwards
            # (assumes encode is deterministic for a given data/offset pair).
            best_out, best_offset, best_length = candidate, offset, len(candidate)
    return best_out, best_offset, baseline_length - best_length
def get_js_decoder(data: bytes,
                   offset: Optional[int] = None,
                   output_var: str = default_vars.bytearray
                   ) -> bytes:
    """Return a JS statement that rebuilds ``data`` as a ``Uint8Array``.

    The generated statement has the shape::

        <output_var>=Uint8Array.from(`<encoded data>`,c=><decoder expression>)

    followed by a newline.

    Args:
        data: Raw bytes to embed.
        offset: Modular byte offset applied before escaping. ``None`` (the
            default) searches for the best offset with ``optimize_encode``;
            any other value is reduced modulo 256.
        output_var: Name of the JS variable that receives the array.

    Returns:
        The JS statement as bytes: the prefix is UTF-8 encoded, the payload is
        the output of ``encode``, and the suffix is latin-1 encoded.
    """
    if offset is None:
        encoded, offset, _ = optimize_encode(data)  # Time-consuming op.
    else:
        # encode() already works modulo 256. Normalising here keeps the
        # generated JS valid for negative offsets, which would otherwise be
        # emitted as "(...)--N".
        offset &= 255
        encoded = encode(data, offset)

    # Position p of the lookup string stands for byte value 129 + p, so the
    # string must cover 0x81..0x9F with exactly one character per byte value.
    # Byte values that are expected to arrive with a char code below 256 never
    # reach indexOf and only need a placeholder to keep the positions aligned
    # (0x81, 0x8D, 0x8F, 0x90, 0x9D - the bytes without a cp1252 remapping;
    # NOTE(review): confirm against the WHATWG windows-1252 index).
    # 0x80 is deliberately absent: indexOf returns -1, giving 129 - 1 = 128.
    placeholders = (0x81, 0x8D, 0x8F, 0x90, 0x9D)
    lookup = ''.join(' ' if code in placeholders else chr(code)
                     for code in range(0x81, 0xA0))

    # %65533 folds U+FFFD (presumably what a NUL byte is turned into by the
    # HTML parser) back to 0; char codes above 255 are remapped characters and
    # go through the lookup, everything else is already the byte value.
    function = f"(i=c.charCodeAt()%65533)>>8?129+'{lookup}'.indexOf(c):i"
    if offset:
        # Uint8Array.from wraps the result modulo 256, undoing the encode shift.
        function = f'({function})-{offset}'

    first_part = f'{output_var}=Uint8Array.from(`'
    last_part = f"`,c=>{function})\n"
    # The lookup characters must be written as single raw bytes, hence latin-1.
    return first_part.encode() + encoded + last_part.encode('l1')