diff --git a/lib/Platform/Unicode/UnicodeData.inc b/lib/Platform/Unicode/UnicodeData.inc
index 00b935b00ed..a03d6b26a01 100644
--- a/lib/Platform/Unicode/UnicodeData.inc
+++ b/lib/Platform/Unicode/UnicodeData.inc
@@ -5,11 +5,28 @@
 // SpecialCasing.txt SHA1: 67fad2f44098864ce4362ea2434a85b82a7566ec
 // *** DO NOT EDIT BY HAND ***

+/// An inclusive range of Unicode characters.
 struct UnicodeRange {
   uint32_t first;
   uint32_t second;
 };

+/// A UnicodeTransformRange expresses a mapping such as case folding.
+/// A character cp is mapped to cp + delta if (cp - start) % modulo == 0.
+struct UnicodeTransformRange {
+  /// The first codepoint of the range.
+  unsigned start : 24;
+
+  /// The number of characters in the range.
+  unsigned count : 8;
+
+  /// The signed delta amount.
+  int delta : 24;
+
+  /// The modulo amount.
+  unsigned modulo : 8;
+};
+
 // UNICODE_LETTERS Lu Ll Lt Lm Lo Nl
 // static constexpr uint32_t UNICODE_LETTERS_SIZE = 335;
 static constexpr UnicodeRange UNICODE_LETTERS[] = {
@@ -319,3 +336,64 @@ static constexpr UnicodePrecanonicalizationMapping UNICODE_PRECANONS[] = {
     {0x212A, {}},
     {0x212B, {}},
     {0xA64A, {0x1C88, 0xA64B}}};
+
+// static constexpr uint32_t LEGACY_CANONS_SIZE = 173;
+static constexpr UnicodeTransformRange LEGACY_CANONS[] = {
+    {0x0061, 26, -32, 1}, {0x00B5, 1, 743, 1}, {0x00E0, 23, -32, 1},
+    {0x00F8, 7, -32, 1}, {0x00FF, 1, 121, 1}, {0x0101, 47, -1, 2},
+    {0x0133, 5, -1, 2}, {0x013A, 15, -1, 2}, {0x014B, 45, -1, 2},
+    {0x017A, 5, -1, 2}, {0x0180, 1, 195, 1}, {0x0183, 3, -1, 2},
+    {0x0188, 5, -1, 4}, {0x0192, 1, -1, 1}, {0x0195, 1, 97, 1},
+    {0x0199, 1, -1, 1}, {0x019A, 1, 163, 1}, {0x019E, 1, 130, 1},
+    {0x01A1, 5, -1, 2}, {0x01A8, 6, -1, 5}, {0x01B0, 5, -1, 4},
+    {0x01B6, 4, -1, 3}, {0x01BD, 1, -1, 1}, {0x01BF, 1, 56, 1},
+    {0x01C5, 1, -1, 1}, {0x01C6, 1, -2, 1}, {0x01C8, 1, -1, 1},
+    {0x01C9, 1, -2, 1}, {0x01CB, 1, -1, 1}, {0x01CC, 1, -2, 1},
+    {0x01CE, 15, -1, 2}, {0x01DD, 1, -79, 1}, {0x01DF, 17, -1, 2},
+    {0x01F2, 1, -1, 1}, {0x01F3, 1, -2, 1}, {0x01F5, 5, -1, 4},
+    {0x01FB, 37, -1, 2}, {0x0223, 17, -1, 2}, {0x023C, 1, -1, 1},
+    {0x023F, 2, 10815, 1}, {0x0242, 6, -1, 5}, {0x0249, 7, -1, 2},
+    {0x0250, 1, 10783, 1}, {0x0251, 1, 10780, 1}, {0x0252, 1, 10782, 1},
+    {0x0253, 1, -210, 1}, {0x0254, 1, -206, 1}, {0x0256, 2, -205, 1},
+    {0x0259, 1, -202, 1}, {0x025B, 1, -203, 1}, {0x025C, 1, 42319, 1},
+    {0x0260, 1, -205, 1}, {0x0261, 1, 42315, 1}, {0x0263, 1, -207, 1},
+    {0x0265, 1, 42280, 1}, {0x0266, 1, 42308, 1}, {0x0268, 1, -209, 1},
+    {0x0269, 1, -211, 1}, {0x026A, 1, 42308, 1}, {0x026B, 1, 10743, 1},
+    {0x026C, 1, 42305, 1}, {0x026F, 1, -211, 1}, {0x0271, 1, 10749, 1},
+    {0x0272, 1, -213, 1}, {0x0275, 1, -214, 1}, {0x027D, 1, 10727, 1},
+    {0x0280, 1, -218, 1}, {0x0282, 1, 42307, 1}, {0x0283, 1, -218, 1},
+    {0x0287, 1, 42282, 1}, {0x0288, 1, -218, 1}, {0x0289, 1, -69, 1},
+    {0x028A, 2, -217, 1}, {0x028C, 1, -71, 1}, {0x0292, 1, -219, 1},
+    {0x029D, 1, 42261, 1}, {0x029E, 1, 42258, 1}, {0x0345, 1, 84, 1},
+    {0x0371, 3, -1, 2}, {0x0377, 1, -1, 1}, {0x037B, 3, 130, 1},
+    {0x03AC, 1, -38, 1}, {0x03AD, 3, -37, 1}, {0x03B1, 17, -32, 1},
+    {0x03C2, 1, -31, 1}, {0x03C3, 9, -32, 1}, {0x03CC, 1, -64, 1},
+    {0x03CD, 2, -63, 1}, {0x03D0, 1, -62, 1}, {0x03D1, 1, -57, 1},
+    {0x03D5, 1, -47, 1}, {0x03D6, 1, -54, 1}, {0x03D7, 1, -8, 1},
+    {0x03D9, 23, -1, 2}, {0x03F0, 1, -86, 1}, {0x03F1, 1, -80, 1},
+    {0x03F2, 1, 7, 1}, {0x03F3, 1, -116, 1}, {0x03F5, 1, -96, 1},
+    {0x03F8, 4, -1, 3}, {0x0430, 32, -32, 1}, {0x0450, 16, -80, 1},
+    {0x0461, 33, -1, 2}, {0x048B, 53, -1, 2}, {0x04C2, 13, -1, 2},
+    {0x04CF, 1, -15, 1}, {0x04D1, 95, -1, 2}, {0x0561, 38, -48, 1},
+    {0x10D0, 43, 3008, 1}, {0x10FD, 3, 3008, 1}, {0x13F8, 6, -8, 1},
+    {0x1C80, 1, -6254, 1}, {0x1C81, 1, -6253, 1}, {0x1C82, 1, -6244, 1},
+    {0x1C83, 2, -6242, 1}, {0x1C85, 1, -6243, 1}, {0x1C86, 1, -6236, 1},
+    {0x1C87, 1, -6181, 1}, {0x1C88, 1, 35266, 1}, {0x1D79, 1, 35332, 1},
+    {0x1D7D, 1, 3814, 1}, {0x1D8E, 1, 35384, 1}, {0x1E01, 149, -1, 2},
+    {0x1E9B, 1, -59, 1}, {0x1EA1, 95, -1, 2}, {0x1F00, 8, 8, 1},
+    {0x1F10, 6, 8, 1}, {0x1F20, 8, 8, 1}, {0x1F30, 8, 8, 1},
+    {0x1F40, 6, 8, 1}, {0x1F51, 7, 8, 2}, {0x1F60, 8, 8, 1},
+    {0x1F70, 2, 74, 1}, {0x1F72, 4, 86, 1}, {0x1F76, 2, 100, 1},
+    {0x1F78, 2, 128, 1}, {0x1F7A, 2, 112, 1}, {0x1F7C, 2, 126, 1},
+    {0x1FB0, 2, 8, 1}, {0x1FBE, 1, -7205, 1}, {0x1FD0, 2, 8, 1},
+    {0x1FE0, 2, 8, 1}, {0x1FE5, 1, 7, 1}, {0x214E, 1, -28, 1},
+    {0x2170, 16, -16, 1}, {0x2184, 1, -1, 1}, {0x24D0, 26, -26, 1},
+    {0x2C30, 47, -48, 1}, {0x2C61, 1, -1, 1}, {0x2C65, 1, -10795, 1},
+    {0x2C66, 1, -10792, 1}, {0x2C68, 5, -1, 2}, {0x2C73, 4, -1, 3},
+    {0x2C81, 99, -1, 2}, {0x2CEC, 3, -1, 2}, {0x2CF3, 1, -1, 1},
+    {0x2D00, 38, -7264, 1}, {0x2D27, 7, -7264, 6}, {0xA641, 45, -1, 2},
+    {0xA681, 27, -1, 2}, {0xA723, 13, -1, 2}, {0xA733, 61, -1, 2},
+    {0xA77A, 3, -1, 2}, {0xA77F, 9, -1, 2}, {0xA78C, 6, -1, 5},
+    {0xA793, 1, -1, 1}, {0xA794, 1, 48, 1}, {0xA797, 19, -1, 2},
+    {0xA7B5, 11, -1, 2}, {0xA7C3, 1, -1, 1}, {0xAB53, 1, -928, 1},
+    {0xAB70, 80, -38864, 1}, {0xFF41, 26, -32, 1}};
diff --git a/utils/genUnicodeTable.py b/utils/genUnicodeTable.py
index 215db789f46..53c9152f102 100755
--- a/utils/genUnicodeTable.py
+++ b/utils/genUnicodeTable.py
@@ -48,8 +48,24 @@ def print_header(unicodedata_sha1, specialcasing_sha1):
 // SpecialCasing.txt SHA1: ${specialcasing_sha1}
 // *** DO NOT EDIT BY HAND ***

+/// An inclusive range of Unicode characters.
 struct UnicodeRange {
   uint32_t first;
   uint32_t second;
 };
+/// A UnicodeTransformRange expresses a mapping such as case folding.
+/// A character cp is mapped to cp + delta if (cp - start) % modulo == 0.
+struct UnicodeTransformRange {
+  /// The first codepoint of the range.
+  unsigned start:24;
+
+  /// The number of characters in the range.
+  unsigned count:8;
+
+  /// The signed delta amount.
+  int delta:24;
+
+  /// The modulo amount.
+  unsigned modulo:8;
+};
 """,
         today=str(datetime.date.today()),
         unicodedata_sha1=unicodedata_sha1,
@@ -114,6 +130,58 @@ def print_categories(unicode_data_lines):
         run_interval(unicode_data_lines, cat.split())


+def stride_from(p1, p2):
+    return p2[0] - p1[0]
+
+
+def delta_within(p):
+    return p[1] - p[0]
+
+
+def as_hex(cp):
+    return "0x%.4X" % cp
+
+
+class DeltaMapBlock(object):
+    def __init__(self):
+        self.pairs = []
+
+    def stride(self):
+        return stride_from(self.pairs[0], self.pairs[1])
+
+    def delta(self):
+        return delta_within(self.pairs[0])
+
+    def can_append(self, pair):
+        if not self.pairs:
+            return True
+        if pair[0] - self.pairs[0][0] >= 256:
+            return False
+        if self.delta() != delta_within(pair):
+            return False
+        return len(self.pairs) < 2 or self.stride() == stride_from(self.pairs[-1], pair)
+
+    @staticmethod
+    def append_to_list(blocks, p):
+        if not blocks or not blocks[-1].can_append(p):
+            blocks.append(DeltaMapBlock())
+        blocks[-1].pairs.append(p)
+
+    def output(self):
+        pairs = self.pairs
+        if not pairs:
+            return ""
+
+        first = pairs[0][0]
+        last = pairs[-1][0]
+        modulo = self.stride() if len(pairs) >= 2 else 1
+        delta = self.delta()
+        code = Template("{$first, $count, $delta, $modulo}").substitute(
+            first=as_hex(first), count=last - first + 1, delta=delta, modulo=modulo
+        )
+        return code.strip()
+
+
 class CaseMap(object):
     """Unicode case mapping helper.

@@ -186,6 +254,29 @@ def canonicalize(self, ch):
         return upper_ch


+def print_canonicalizations(casemap):
+    blocks = []
+    for cp in casemap.codepoints:
+        # legacy does not decode surrogate pairs, so we can skip large code points.
+        if cp > 0xFFFF:
+            continue
+        canon_cp = casemap.canonicalize(cp)
+        if cp != canon_cp:
+            DeltaMapBlock.append_to_list(blocks, (cp, canon_cp))
+
+    print_template(
+        """
+// static constexpr uint32_t ${name}_SIZE = ${entry_count};
+static constexpr UnicodeTransformRange ${name}[] = {
+${entry_text}
+};
+""",
+        name="LEGACY_CANONS",
+        entry_count=len(blocks),
+        entry_text=",\n".join(b.output() for b in blocks),
+    )
+
+
 def print_precanonicalizations(casemap):
     """Print a table of pre-canonicalizations.

@@ -265,5 +356,9 @@ def as_hex(cp):
     )
     udata_lines = unicode_data.decode("utf-8").splitlines()
     special_lines = special_casing.decode("utf-8").splitlines()
+    casemap = CaseMap(
+        unicode_data_lines=udata_lines, special_casing_lines=special_lines
+    )
     print_categories(udata_lines)
-    print_precanonicalizations(CaseMap(udata_lines, special_lines))
+    print_precanonicalizations(casemap)
+    print_canonicalizations(casemap)
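Note (reviewer sketch, not part of the patch): the snippet below illustrates how a consumer of the generated table could apply a UnicodeTransformRange. The helper name applyTransformRanges and the linear scan are assumptions for illustration and are not the lookup code Hermes actually uses; the UnicodeTransformRange struct and the LEGACY_CANONS table from UnicodeData.inc are assumed to be in scope.

#include <cstddef>
#include <cstdint>

// Returns the canonicalized codepoint for cp, or cp unchanged if no range applies.
// Assumes `ranges` is sorted by `start` and the ranges do not overlap, as generated.
inline uint32_t applyTransformRanges(
    uint32_t cp,
    const UnicodeTransformRange *ranges,
    size_t size) {
  for (size_t i = 0; i < size; ++i) {
    const UnicodeTransformRange &r = ranges[i];
    if (cp < r.start)
      break; // Sorted by start: no later range can contain cp.
    if (cp >= r.start + r.count)
      continue; // cp lies past this range; keep scanning.
    // Inside the range, only codepoints aligned to the stride (modulo) are mapped.
    if ((cp - r.start) % r.modulo != 0)
      return cp;
    return cp + r.delta;
  }
  return cp;
}

For example, applyTransformRanges(0x0061, LEGACY_CANONS, 173) yields 0x0041 ('a' maps to 'A' through {0x0061, 26, -32, 1}), while 0x0102 comes back unchanged because (0x0102 - 0x0101) % 2 != 0 within {0x0101, 47, -1, 2}.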