# ===--- GYBUnicodeDataUtils.py ----------------------*- coding: utf-8 -*-===//
#
# This source file is part of the Swift.org open source project
#
# Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
# Licensed under Apache License v2.0 with Runtime Library Exception
#
# See https://swift.org/LICENSE.txt for license information
# See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors

import codecs
import re


class UnicodeProperty(object):
    """Abstract base class for Unicode properties."""

    def __init__(self):
        raise NotImplementedError(
            "UnicodeProperty.__init__ is not implemented.")

    def get_default_value(self):
        raise NotImplementedError(
            "UnicodeProperty.get_default_value is not implemented.")

    def get_value(self, cp):
        raise NotImplementedError(
            "UnicodeProperty.get_value is not implemented.")

    def to_numeric_value(self, value):
        raise NotImplementedError(
            "UnicodeProperty.to_numeric_value is not implemented.")

    def get_numeric_value(self, cp):
        raise NotImplementedError(
            "UnicodeProperty.get_numeric_value is not implemented.")


class GraphemeClusterBreakPropertyTable(UnicodeProperty):
    """Grapheme_Cluster_Break property."""

    # An array of tuples (start_code_point, end_code_point, value).
    property_value_ranges = []

    property_values = [None for i in range(0, 0x110000)]

    # Note: Numeric values (including the names) should be consistent with
    # '_GraphemeClusterBreakPropertyValue' enum on the Swift side, and with
    # 'GraphemeClusterBreakProperty' in the compiler C++ code.  If there is a
    # reason for either of those to differ, then this mapping can be
    # overridden after an instance of this class is created.
    numeric_value_table = {
        'Other': 0,
        'CR': 1,
        'LF': 2,
        'Control': 3,
        'Extend': 4,
        'Regional_Indicator': 5,
        'Prepend': 6,
        'SpacingMark': 7,
        'L': 8,
        'V': 9,
        'T': 10,
        'LV': 11,
        'LVT': 12,
    }

    def __init__(self, grapheme_break_property_file_name):
        # Build 'self.symbolic_values' -- an array that maps numeric property
        # values to symbolic values.
        self.symbolic_values = \
            [None] * (max(self.numeric_value_table.values()) + 1)
        for k, v in self.numeric_value_table.items():
            self.symbolic_values[v] = k

        # Load the data file.
        with codecs.open(
                grapheme_break_property_file_name,
                encoding='utf-8',
                errors='strict') as f:
            for line in f:
                # Strip comments.
                line = re.sub('#.*', '', line)

                # Single code point?  (Note: the value may contain an
                # underscore, e.g. 'Regional_Indicator'.)
                m = re.match('([0-9A-F]+) +; +([a-zA-Z_]+) ', line)
                if m:
                    code_point = int(m.group(1), 16)
                    value = m.group(2)
                    self.property_value_ranges += \
                        [(code_point, code_point, value)]
                    continue

                # Range of code points?
                m = re.match(
                    r'([0-9A-F]+)\.\.([0-9A-F]+) +; +([a-zA-Z_]+) ', line)
                if m:
                    start_code_point = int(m.group(1), 16)
                    end_code_point = int(m.group(2), 16)
                    value = m.group(3)
                    self.property_value_ranges += \
                        [(start_code_point, end_code_point, value)]

        # Prepare a flat lookup table for fast access.
        for cp in range(0, 0x110000):
            self.property_values[cp] = self.get_default_value()
        for start_code_pt, end_code_pt, val in self.property_value_ranges:
            for cp in range(start_code_pt, end_code_pt + 1):
                self.property_values[cp] = val

    def get_default_value(self):
        return 'Other'

    def get_value(self, cp):
        return self.property_values[cp]

    def to_numeric_value(self, value):
        return self.numeric_value_table[value]

    def get_numeric_value(self, cp):
        return self.to_numeric_value(self.get_value(cp))
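

# Illustrative sketch (not used by the build): loading and querying the
# table above.  The file path is an assumption; callers pass the path to a
# local copy of GraphemeBreakProperty.txt from the Unicode Character
# Database.
def _example_query_grapheme_cluster_break(data_file_path):
    table = GraphemeClusterBreakPropertyTable(data_file_path)
    # U+000D CARRIAGE RETURN carries Grapheme_Cluster_Break=CR ...
    assert table.get_value(0x000d) == 'CR'
    assert table.get_numeric_value(0x000d) == 1
    # ... while unassigned code points fall back to the default value.
    assert table.get_default_value() == 'Other'
    return table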

# BMP code points are 16-bit values.  The code point value is split as
# follows:
#
#            8 bits                    8 bits
#   +-------------------------+-------------------------+
#   | 15 14 13 12 11 10  9  8 |  7  6  5  4  3  2  1  0 |
#   +-------------------------+-------------------------+
#     first-level index         data offset
#
# Supplementary code points (U+XXXX where XXXX > 0xffff) are 21-bit values.
# The code point value is split as follows:
#
#        5 bits            8 bits                    8 bits
#   +----------------+-------------------------+-------------------------+
#   | 20 19 18 17 16 | 15 14 13 12 11 10  9  8 |  7  6  5  4  3  2  1  0 |
#   +----------------+-------------------------+-------------------------+
#     first-level      second-level index        data offset
#     index
#
# The actual numbers of bits are just trie parameters.  They affect the size
# of the lookup tables (and thus, lookup time), but do not change the overall
# structure of the trie.
#
# Here and below 'supp' stands for 'supplementary characters'.
#
# Property data for BMP code points is stored as a one-stage trie.  A trie
# with one lookup table consists of two memory blocks:
#
#      First-level lookup table
#   +-----+-----+-----+-----+--...--+
#   |  *  |  *  |  *  |  *  |       |
#   +--|--+--|--+--|--+--|--+--...--+
#      |     |     |     \     The references don't form
#      |     \_____|      \___,  a systematic pattern
#      |           |          |
#      |           |          |  Data storage
#   +--V-------++--V-------++-V--------++---...---+
#   |   data   ||   data   ||   data   ||         |
#   +----------++----------++----------++---...---+
#
# In order to fetch data for a given code point, you need to:
# * load from the first-level lookup table using the first-level index; this
#   will give you the number of the data block that you should use;
# * load from the data block applying the data offset.
#
# Property data for supplementary code points is stored as a two-stage trie.
# A trie with two-stage lookup tables consists of three memory blocks.  The
# following drawing explains how it is implemented:
#
#      First-level lookup table
#   +-----+-----+-----+-----+-----+--...--+
#   |  *  |  *  |  *  |  *  |  *  |       |
#   +--|--+--|--+--|--+--|--+--|--+--...--+
#      |     |     |     |     \      The references don't form
#    ,_/     |     \_____|      \___,   a systematic pattern
#   /        |           |          |
#   |        |           |          |   Second-level lookup table
#   +-V--------++-V--------++-V--------++-V--------++---...---+
#   | ******** || ******** || ******** ||          ||         |
#   +-||||||||-++-||||||||-++-||||||||-++----------++---...---+
#     ||||||||    ||||||||    ||||||||     The references (one per entry)
#     VVVVVVVV    VVVVVVVV    VVVVVVVV     don't form a systematic pattern
#                                          either
#      Data storage
#   +--------++--------++--------++--------++--------++---...---+
#   |  data  ||  data  ||  data  ||  data  ||        ||         |
#   +--------++--------++--------++--------++--------++---...---+
#
# In order to fetch data for a given code point, you need to:
# * load from the first-level lookup table using the first-level index; this
#   will give you the number of the second-level lookup table that you
#   should use;
# * load from the chosen second-level lookup table using the second-level
#   index, which will give you the number of the data block that you should
#   use;
# * load from the data block applying the data offset.
#
# First- and second-level lookup tables in the general case contain 16-bit
# words; that is sufficient to store a trie that does not compress at all.
# But in many cases, after trie compression there will be fewer than 256
# unique second-level lookup tables and/or data storage blocks, which allows
# one to use 8-bit words in the lookup tables.
#
# The bitwidth of the data depends on the application of the trie.
#
# The supp tables contain entries for BMP code units to simplify the trie
# implementation, but those BMP entries are filled with the default value, so
# they compress well.
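#
# As a concrete worked example with the default parameters used below (8 bits
# of BMP first-level index, 5 + 8 bits of supp indexes, 8-bit data offsets):
#
#   BMP code point U+0915:
#     first-level index = 0x0915 >> 8   = 0x09
#     data offset       = 0x0915 & 0xff = 0x15
#
#   Supplementary code point U+1F1E6:
#     first-level index  = 0x1F1E6 >> 16         = 0x01
#     second-level index = (0x1F1E6 >> 8) & 0xff = 0xf1
#     data offset        = 0x1F1E6 & 0xff        = 0xe6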
class UnicodeTrieGenerator(object):
    # Note: if you change any of these parameters, don't forget to update the
    # ASCII art above.
    bmp_first_level_index_bits = 8
    supp_first_level_index_bits = 5
    supp_second_level_index_bits = 8

    def get_bmp_first_level_index(self, cp):
        return cp >> self.bmp_data_offset_bits

    def get_bmp_data_offset(self, cp):
        return cp & ((1 << self.bmp_data_offset_bits) - 1)

    def get_supp_first_level_index(self, cp):
        return cp >> \
            (self.supp_second_level_index_bits + self.supp_data_offset_bits)

    def get_supp_second_level_index(self, cp):
        return (cp >> self.supp_data_offset_bits) & \
            ((1 << self.supp_second_level_index_bits) - 1)

    def get_supp_data_offset(self, cp):
        return cp & ((1 << self.supp_data_offset_bits) - 1)

    def __init__(self):
        """Create a trie generator with default parameters."""
        pass

    def create_tables(self):
        """Compute derived parameter values and create internal data
        structures.

        Don't change parameter values after calling this method.
        """
        self.bmp_data_offset_bits = 16 - self.bmp_first_level_index_bits
        self.supp_data_offset_bits = \
            21 - self.supp_first_level_index_bits - \
            self.supp_second_level_index_bits

        # The maximum value of the first-level index for supp tables.  It is
        # not equal to ((1 << supp_first_level_index_bits) - 1), because the
        # maximum Unicode code point value is not 2^21 - 1 (0x1fffff), it is
        # 0x10ffff.
        self.supp_first_level_index_max = \
            0x10ffff >> \
            (self.supp_second_level_index_bits + self.supp_data_offset_bits)

        # A mapping from BMP first-level index to BMP data block index.
        self.bmp_lookup = \
            [i for i in range(0, 1 << self.bmp_first_level_index_bits)]

        # An array of BMP data blocks.
        self.bmp_data = [
            [-1 for i in range(0, 1 << self.bmp_data_offset_bits)]
            for i in range(0, 1 << self.bmp_first_level_index_bits)
        ]

        # A mapping from supp first-level index to an index of the
        # second-level lookup table.
        self.supp_lookup1 = \
            [i for i in range(0, self.supp_first_level_index_max + 1)]

        # An array of second-level lookup tables.  Each second-level lookup
        # table is a mapping from a supp second-level index to a supp data
        # block index.
        self.supp_lookup2 = [
            [j for j in range(i << self.supp_second_level_index_bits,
                              (i + 1) << self.supp_second_level_index_bits)]
            for i in range(0, self.supp_first_level_index_max + 1)
        ]

        # An array of supp data blocks.
        self.supp_data = [
            [-1 for i in range(0, 1 << self.supp_data_offset_bits)]
            for i in range(0, (self.supp_first_level_index_max + 1) *
                           (1 << self.supp_second_level_index_bits))
        ]
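
    # Right after create_tables(), every lookup table is an identity mapping
    # (bmp_lookup[i] == i, and supp_lookup2 block 'i' holds the consecutive
    # run of data block indexes starting at
    # i << supp_second_level_index_bits), so each code point initially owns a
    # distinct data slot.  freeze() below is what introduces sharing, by
    # deduplicating identical blocks.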

    def splat(self, value):
        for i in range(0, len(self.bmp_data)):
            for j in range(0, len(self.bmp_data[i])):
                self.bmp_data[i][j] = value
        for i in range(0, len(self.supp_data)):
            for j in range(0, len(self.supp_data[i])):
                self.supp_data[i][j] = value

    def set_value(self, cp, value):
        if cp <= 0xffff:
            data_block_index = self.bmp_lookup[
                self.get_bmp_first_level_index(cp)]
            self.bmp_data[data_block_index][
                self.get_bmp_data_offset(cp)] = value
        else:
            second_lookup_index = self.supp_lookup1[
                self.get_supp_first_level_index(cp)]
            data_block_index = self.supp_lookup2[second_lookup_index][
                self.get_supp_second_level_index(cp)]
            self.supp_data[data_block_index][
                self.get_supp_data_offset(cp)] = value

    def get_value(self, cp):
        if cp <= 0xffff:
            data_block_index = self.bmp_lookup[
                self.get_bmp_first_level_index(cp)]
            return self.bmp_data[data_block_index][
                self.get_bmp_data_offset(cp)]
        else:
            second_lookup_index = self.supp_lookup1[
                self.get_supp_first_level_index(cp)]
            data_block_index = self.supp_lookup2[second_lookup_index][
                self.get_supp_second_level_index(cp)]
            return self.supp_data[data_block_index][
                self.get_supp_data_offset(cp)]

    def fill_from_unicode_property(self, unicode_property):
        self.splat(unicode_property.get_default_value())
        for cp in range(0, 0x110000):
            self.set_value(cp, unicode_property.get_value(cp))

    def verify(self, unicode_property):
        for cp in range(0, 0x110000):
            expected_value = unicode_property.get_value(cp)
            actual_value = self.get_value(cp)
            assert expected_value == actual_value
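
    # Note: the deduplication in freeze() below compares blocks pairwise,
    # which is quadratic in the number of blocks but acceptable at these
    # table sizes.  For a property like Grapheme_Cluster_Break most
    # supplementary code points carry the default value, so the supp tables
    # typically collapse to a handful of unique blocks (an observation about
    # typical UCD data, not a guarantee).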

    def freeze(self):
        """Compress internal trie representation.

        Don't mutate the trie after calling this method.
        """
        def remap_indexes(indexes, old_idx, new_idx):
            def map_index(idx):
                if idx == old_idx:
                    return new_idx
                elif idx > old_idx:
                    return idx - 1
                else:
                    return idx
            return list(map(map_index, indexes))

        # If self.bmp_data contains identical data blocks, keep the first
        # one, remove duplicates and change the indexes in self.bmp_lookup to
        # point to the first one.
        i = 0
        while i < len(self.bmp_data):
            j = i + 1
            while j < len(self.bmp_data):
                if self.bmp_data[i] == self.bmp_data[j]:
                    self.bmp_data.pop(j)
                    self.bmp_lookup = \
                        remap_indexes(self.bmp_lookup, old_idx=j, new_idx=i)
                else:
                    j += 1
            i += 1

        # For supp tables, perform bottom-up deduplication: first,
        # deduplicate data blocks.  The algorithm is the same as above, but
        # operates on self.supp_data/supp_lookup2.
        i = 0
        while i < len(self.supp_data):
            j = i + 1
            while j < len(self.supp_data):
                if self.supp_data[i] == self.supp_data[j]:
                    self.supp_data.pop(j)
                    for k in range(0, len(self.supp_lookup2)):
                        self.supp_lookup2[k] = \
                            remap_indexes(self.supp_lookup2[k],
                                          old_idx=j, new_idx=i)
                else:
                    j += 1
            i += 1

        # Next, deduplicate second-level lookup tables.  Same as above, but
        # for supp_lookup1/supp_lookup2.
        i = 0
        while i < len(self.supp_lookup2):
            j = i + 1
            while j < len(self.supp_lookup2):
                if self.supp_lookup2[i] == self.supp_lookup2[j]:
                    self.supp_lookup2.pop(j)
                    self.supp_lookup1 = \
                        remap_indexes(self.supp_lookup1, old_idx=j,
                                      new_idx=i)
                else:
                    j += 1
            i += 1

    def _int_to_le_bytes(self, data, width):
        if width == 1:
            assert data & ~0xff == 0
            return [data]
        if width == 2:
            assert data & ~0xffff == 0
            # Little-endian: the low byte first, then the high byte.
            return [data & 0xff, data >> 8]
        assert False

    def _int_list_to_le_bytes(self, ints, width):
        return [
            byte
            for elt in ints
            for byte in self._int_to_le_bytes(elt, width)]

    def serialize(self, unicode_property):
        self.bmp_lookup_bytes_per_entry = 1 if len(self.bmp_data) < 256 else 2
        self.bmp_data_bytes_per_entry = 1

        self.supp_lookup1_bytes_per_entry = \
            1 if len(self.supp_lookup2) < 256 else 2
        self.supp_lookup2_bytes_per_entry = \
            1 if len(self.supp_data) < 256 else 2
        self.supp_data_bytes_per_entry = 1

        bmp_lookup_words = list(self.bmp_lookup)
        bmp_data_words = [
            unicode_property.to_numeric_value(elt)
            for block in self.bmp_data
            for elt in block]
        supp_lookup1_words = list(self.supp_lookup1)
        supp_lookup2_words = [
            elt for block in self.supp_lookup2 for elt in block]
        supp_data_words = [
            unicode_property.to_numeric_value(elt)
            for block in self.supp_data
            for elt in block]

        bmp_lookup_bytes = self._int_list_to_le_bytes(
            bmp_lookup_words, self.bmp_lookup_bytes_per_entry)
        bmp_data_bytes = self._int_list_to_le_bytes(
            bmp_data_words, self.bmp_data_bytes_per_entry)
        supp_lookup1_bytes = self._int_list_to_le_bytes(
            supp_lookup1_words, self.supp_lookup1_bytes_per_entry)
        supp_lookup2_bytes = self._int_list_to_le_bytes(
            supp_lookup2_words, self.supp_lookup2_bytes_per_entry)
        supp_data_bytes = self._int_list_to_le_bytes(
            supp_data_words, self.supp_data_bytes_per_entry)

        self.trie_bytes = []
        self.bmp_lookup_bytes_offset = 0
        self.trie_bytes += bmp_lookup_bytes
        self.bmp_data_bytes_offset = len(self.trie_bytes)
        self.trie_bytes += bmp_data_bytes
        self.supp_lookup1_bytes_offset = len(self.trie_bytes)
        self.trie_bytes += supp_lookup1_bytes
        self.supp_lookup2_bytes_offset = len(self.trie_bytes)
        self.trie_bytes += supp_lookup2_bytes
        self.supp_data_bytes_offset = len(self.trie_bytes)
        self.trie_bytes += supp_data_bytes
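

# Illustrative end-to-end sketch (not used by the build) showing how the
# pieces above fit together; 'data_file_path' is assumed to point to a local
# copy of GraphemeBreakProperty.txt from the Unicode Character Database.
def _example_build_grapheme_cluster_break_trie(data_file_path):
    prop = GraphemeClusterBreakPropertyTable(data_file_path)
    trie = UnicodeTrieGenerator()
    trie.create_tables()
    trie.fill_from_unicode_property(prop)
    trie.verify(prop)  # Optional, but cheap insurance before compressing.
    trie.freeze()
    trie.serialize(prop)
    # The flattened little-endian byte array, plus the offsets of each
    # sub-table (bmp_lookup_bytes_offset and friends), are now available.
    return trie.trie_bytes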


def get_extended_grapheme_cluster_rules_matrix(grapheme_cluster_break_table):
    any_value = grapheme_cluster_break_table.symbolic_values

    # Rules to determine extended grapheme cluster boundaries, as defined in
    # 'Grapheme Break Chart',
    # http://www.unicode.org/Public/6.3.0/ucd/auxiliary/GraphemeBreakTest.html,
    # Unicode 6.3.0.
    #
    # The Unicode 7.0.0 draft does not change these rules.
    #
    # As in the referenced document, the rules are specified in order of
    # decreasing priority.
    rules = [
        (['CR'], 'no_boundary', ['LF']),
        (['Control', 'CR', 'LF'], 'boundary', any_value),
        (any_value, 'boundary', ['Control', 'CR', 'LF']),
        (['L'], 'no_boundary', ['L', 'V', 'LV', 'LVT']),
        (['LV', 'V'], 'no_boundary', ['V', 'T']),
        (['LVT', 'T'], 'no_boundary', ['T']),
        (['Regional_Indicator'], 'no_boundary', ['Regional_Indicator']),
        (any_value, 'no_boundary', ['Extend']),
        (any_value, 'no_boundary', ['SpacingMark']),
        (['Prepend'], 'no_boundary', any_value),
        (any_value, 'boundary', any_value),
    ]

    # Expand the rules into a matrix.
    rules_matrix = {}
    for first in any_value:
        rules_matrix[first] = dict.fromkeys(any_value, None)

    # Iterate over rules in the order of increasing priority, so that
    # higher-priority rules overwrite the actions set by lower-priority ones.
    for first_list, action, second_list in reversed(rules):
        for first in first_list:
            for second in second_list:
                rules_matrix[first][second] = action

    # Make sure we can pack one row of the matrix into a 'uint16_t'.
    assert len(any_value) <= 16

    result = []
    for first in any_value:
        # Retrieve a row that corresponds to this first code point.
        row = rules_matrix[first]

        # Change strings into bits.
        bits = [row[second] == 'no_boundary' for second in any_value]

        # Pack bits into an integer.
        packed = sum([bits[i] * pow(2, i) for i in range(0, len(bits))])

        result += [packed]

    return result


def get_grapheme_cluster_break_tests_as_utf8(grapheme_break_test_file_name):
    def _convert_line(line):
        # Strip comments.
        line = re.sub('#.*', '', line).strip()

        if line == "":
            return None

        test = ""
        curr_bytes = 0
        boundaries = []

        # Match a list of code points.
        for token in line.split(" "):
            if token == u"÷":
                boundaries += [curr_bytes]
            elif token == u"×":
                pass
            else:
                code_point = int(token, 16)
                # Tests from the Unicode spec have isolated surrogates in
                # them.  Our segmentation algorithm works on UTF-8 sequences,
                # so encoding a surrogate would produce an invalid code unit
                # sequence.  Instead of trying to emulate the maximal subpart
                # algorithm for inserting U+FFFD in Python, we just replace
                # every isolated surrogate with U+200B, which also has
                # Grapheme_Cluster_Break equal to 'Control', and test
                # separately that we handle ill-formed UTF-8 sequences.
                if code_point >= 0xd800 and code_point <= 0xdfff:
                    code_point = 0x200b
                code_point = (b'\\U%(cp)08x' % {b'cp': code_point}).decode(
                    'unicode_escape', 'strict')
                as_utf8_bytes = bytearray(
                    code_point.encode('utf8', 'strict'))
                as_utf8_escaped = ''.join(
                    ['\\x%(byte)02x' % {'byte': byte}
                     for byte in as_utf8_bytes])
                test += as_utf8_escaped
                curr_bytes += len(as_utf8_bytes)

        return (test, boundaries)

    # Self-test.
    assert (_convert_line(u'÷ 0903 × 0308 ÷ AC01 ÷ # abc') ==
            ('\\xe0\\xa4\\x83\\xcc\\x88\\xea\\xb0\\x81', [0, 5, 8]))
    assert _convert_line(u'÷ D800 ÷ # abc') == ('\\xe2\\x80\\x8b', [0, 3])

    result = []
    with codecs.open(
            grapheme_break_test_file_name,
            encoding='utf-8',
            errors='strict') as f:
        for line in f:
            test = _convert_line(line)
            if test:
                result += [test]
    return result


def get_grapheme_cluster_break_tests_as_unicode_scalars(
        grapheme_break_test_file_name):
    def _convert_line(line):
        # Strip comments.
        line = re.sub('#.*', '', line).strip()

        if line == "":
            return None

        test = []
        curr_code_points = 0
        boundaries = []

        # Match a list of code points.
        for token in line.split(" "):
            if token == u"÷":
                boundaries += [curr_code_points]
            elif token == u"×":
                pass
            else:
                code_point = int(token, 16)
                # Tests from the Unicode spec have isolated surrogates in
                # them.  Our segmentation algorithm works on UTF-16
                # sequences, so encoding a surrogate would produce an invalid
                # code unit sequence.  Instead of trying to emulate the
                # maximal subpart algorithm for inserting U+FFFD in Python,
                # we just replace every isolated surrogate with U+200B, which
                # also has Grapheme_Cluster_Break equal to 'Control', and
                # test separately that we handle ill-formed UTF-16 sequences.
                if code_point >= 0xd800 and code_point <= 0xdfff:
                    code_point = 0x200b
                test += [code_point]
                curr_code_points += 1

        return (test, boundaries)

    # Self-test.
    assert (_convert_line(u'÷ 0903 × 0308 ÷ AC01 ÷ # abc') ==
            ([0x0903, 0x0308, 0xac01], [0, 2, 3]))
    assert _convert_line(u'÷ D800 ÷ # abc') == ([0x200b], [0, 1])

    result = []
    # Decode the file as UTF-8 so that the '÷' and '×' tokens compare equal
    # to the string literals above.
    with codecs.open(
            grapheme_break_test_file_name,
            encoding='utf-8',
            errors='strict') as f:
        for line in f:
            test = _convert_line(line)
            if test:
                result += [test]
    return result
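

# Illustrative sketch (not used by the build): one way the packed rules
# matrix produced by get_extended_grapheme_cluster_rules_matrix() could be
# consulted.  Bit i of row r is set when there is NO extended grapheme
# cluster boundary between a first code point with numeric property value r
# and a second code point with numeric property value i.  For example, there
# is no boundary between U+000D (CR) and U+000A (LF).
def _example_has_grapheme_cluster_boundary(table, first_cp, second_cp):
    matrix = get_extended_grapheme_cluster_rules_matrix(table)
    row = matrix[table.get_numeric_value(first_cp)]
    return not ((row >> table.get_numeric_value(second_cp)) & 1)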