Skip to content

Commit

Permalink
Less duplicated data in unicode_cat_of
Browse files Browse the repository at this point in the history
  • Loading branch information
Kijewski committed Mar 1, 2024
1 parent f461cb5 commit 0f53dbd
Show file tree
Hide file tree
Showing 2 changed files with 1,749 additions and 5,173 deletions.
82 changes: 49 additions & 33 deletions scripts/make_unicode_categories.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env python

from argparse import ArgumentParser
from collections import defaultdict
from collections import defaultdict, OrderedDict
from functools import reduce
from pathlib import Path
from re import match
Expand Down Expand Up @@ -30,7 +30,7 @@ def main(input_file, output_file):
"nd": IdentifierPart,
}

planes = defaultdict(lambda: [0] * 0x10000)
planes = defaultdict(lambda: [0] * 0x1000)

for input_line in input_file:
m = match(r"^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s+;\s+([A-Z][a-z])", input_line)
Expand All @@ -43,19 +43,19 @@ def main(input_file, output_file):
end = int(end or start, 16)
start = int(start, 16)
for i in range(start, end + 1):
planes[i // 0x10000][i % 0x10000] = idx
planes[i // 0x1000][i % 0x1000] = idx

# per: https://spec.json5.org/#white-space
for i in (0x9, 0xA, 0xB, 0xC, 0xD, 0x20, 0xA0, 0x2028, 0x2028, 0x2029, 0xFEFF):
planes[0][i] = WhiteSpace
planes[i // 0x1000][i % 0x1000] = WhiteSpace

# per: https://www.ecma-international.org/ecma-262/5.1/#sec-7.6
for i in (ord("$"), ord("_"), ord("\\")):
planes[0][i] = IdentifierStart
planes[i // 0x1000][i % 0x1000] = IdentifierStart

# per: https://www.ecma-international.org/ecma-262/5.1/#sec-7.6
for i in (0x200C, 0x200D):
planes[0][i] = IdentifierPart
planes[i // 0x1000][i % 0x1000] = IdentifierPart

print("#ifndef JSON5EncoderCpp_unicode_cat_of", file=output_file)
print("#define JSON5EncoderCpp_unicode_cat_of", file=output_file)
Expand All @@ -70,50 +70,66 @@ def main(input_file, output_file):
print(file=output_file)
print("static unsigned unicode_cat_of(std::uint32_t codepoint) {", file=output_file)

print(" static std::uint8_t plane_X[0x10000 / 4] = {0};", file=output_file)
print(file=output_file)
demiplane_to_idx = OrderedDict() # demiplane_idx → data_idx
data_to_idx = [None] * 272 # demiplane data → data_idx
for i in range(272):
plane_data = ""
plane = planes[i]
while plane and plane[-1] == 0:
plane.pop()

for plane_idx, plane_data in planes.items():
print(
" static std::uint8_t plane_" + str(plane_idx) + "[0x10000 / 4] = {",
file=output_file,
)
for chunk in chunked(plane_data, 4 * 16):
print(" ", end="", file=output_file)
for chunk in chunked(plane, 4 * 16):
plane_data += " "
for value in chunked(chunk, 4):
value = reduce(lambda a, i: ((a << 2) | i), reversed(value), 0)
print("0x{:02x}u".format(value), end=", ", file=output_file)
print(file=output_file)
print(" };", file=output_file)
print(file=output_file)

print(" static std::uint8_t *planes[17] = {", end="", file=output_file)
for plane_idx in range(0, 17):
if plane_idx % 8 == 0:
print("\n ", end="", file=output_file)
if plane_idx in planes:
print("plane_" + str(plane_idx) + ", ", end="", file=output_file)
else:
print("plane_X, ", end="", file=output_file)
plane_data += "0x{:02x}u, ".format(value)
plane_data = plane_data[:-1] + "\n"

produced_idx = demiplane_to_idx.get(plane_data)
if produced_idx is None:
produced_idx = i
demiplane_to_idx[plane_data] = produced_idx

print(
" static std::uint8_t data{:03d}[0x1000 / 4] = {{".format(
produced_idx
),
file=output_file,
)
print(plane_data, file=output_file, end="")
print(" };", file=output_file)

data_to_idx[i] = produced_idx
print(file=output_file)

print(" // A 'demiplane' is a 1/16th of a Unicode plane.", file=output_file)
print(" static std::uint8_t *demiplanes[272] = {", end="", file=output_file)
for i in range(272):
if i % 8 == 0:
print("\n ", end="", file=output_file)
print(" data{:03d},".format(data_to_idx[i]), end="", file=output_file)
print(file=output_file)
print(" };", file=output_file)
print(file=output_file)

print(
" std::uint16_t plane_idx = std::uint16_t(codepoint / 0x10000);",
" std::uint16_t demiplane_idx = std::uint16_t(codepoint / 0x1000);",
file=output_file,
)
print(
" if (JSON5EncoderCpp_expect(demiplane_idx >= 272, false)) return 1;",
file=output_file,
)
print(
" if (JSON5EncoderCpp_expect(plane_idx > 16, false)) return 1;",
" std::uint16_t datum_idx = std::uint16_t(codepoint & 0x0fff);",
file=output_file,
)
print(
" std::uint16_t datum_idx = std::uint16_t(codepoint & 0xffff);",
" const std::uint8_t *demiplane = demiplanes[demiplane_idx];",
file=output_file,
)
print(" const std::uint8_t *plane = planes[plane_idx];", file=output_file)
print(
" return (plane[datum_idx / 4] >> (2 * (datum_idx % 4))) % 4;",
" return (demiplane[datum_idx / 4] >> (2 * (datum_idx % 4))) % 4;",
file=output_file,
)
print("}", file=output_file)
Expand Down
Loading

0 comments on commit 0f53dbd

Please sign in to comment.