Skip to content

Commit

Permalink
Add legacy canonicalizations to unicode tables
Browse files Browse the repository at this point in the history
Summary:
This introduces UnicodeTransformRange, which describes a range of Unicode
characters that are transformed (for example, case-folded) by adding a
fixed delta to each affected character.

Use this to express legacy (non-Unicode) canonicalizations for regexp,
and update the Unicode tables.

Reviewed By: avp

Differential Revision: D17413823

fbshipit-source-id: b7d8b253e8cb4bb370a4ba11137a002740962489
  • Loading branch information
Peter Ammon authored and facebook-github-bot committed Oct 3, 2019
1 parent 25ca98f commit 9ef3c7c
Show file tree
Hide file tree
Showing 2 changed files with 174 additions and 1 deletion.
78 changes: 78 additions & 0 deletions lib/Platform/Unicode/UnicodeData.inc
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,28 @@
// SpecialCasing.txt SHA1: 67fad2f44098864ce4362ea2434a85b82a7566ec
// *** DO NOT EDIT BY HAND ***

/// A contiguous span of Unicode characters; both endpoints are included.
struct UnicodeRange {
  /// First character of the range.
  uint32_t first;

  /// Last character of the range (inclusive).
  uint32_t second;
};

/// A UnicodeTransformRange expresses a mapping such as case folding over a
/// run of characters. For a character cp in [start, start + count), cp is
/// mapped to cp + delta when (cp - start) is a multiple of modulo.
/// Example: {0x0101, 47, -1, 2} maps 0x0101, 0x0103, 0x0105, ... down by one.
struct UnicodeTransformRange {
  /// The first codepoint of the range.
  unsigned start : 24;

  /// The number of characters in the range, counted from start.
  /// Stored in 8 bits, so a single entry covers at most 255 characters.
  unsigned count : 8;

  /// The signed delta amount added to a transformed character.
  /// Declared explicitly signed: the signedness of a plain `int` bit-field
  /// is implementation-defined in C and in C++ before C++14.
  signed int delta : 24;

  /// The modulo amount: the spacing between transformed characters within
  /// the range (1 means every character in the range is transformed).
  unsigned modulo : 8;
};

// UNICODE_LETTERS Lu Ll Lt Lm Lo Nl
// static constexpr uint32_t UNICODE_LETTERS_SIZE = 335;
static constexpr UnicodeRange UNICODE_LETTERS[] = {
Expand Down Expand Up @@ -319,3 +336,64 @@ static constexpr UnicodePrecanonicalizationMapping UNICODE_PRECANONS[] = {
{0x212A, {}},
{0x212B, {}},
{0xA64A, {0x1C88, 0xA64B}}};

// LEGACY_CANONS: table of canonicalizations emitted by
// print_canonicalizations() in utils/genUnicodeTable.py. The generator skips
// code points above 0xFFFF and only records code points whose canonical form
// differs from the code point itself.
// Each entry is {start, count, delta, modulo}: within the `count` characters
// beginning at `start`, every `modulo`-th character (counting from `start`)
// maps to itself plus `delta`. Entries are listed in ascending order of start.
// static constexpr uint32_t LEGACY_CANONS_SIZE = 173;
static constexpr UnicodeTransformRange LEGACY_CANONS[] = {
{0x0061, 26, -32, 1}, {0x00B5, 1, 743, 1}, {0x00E0, 23, -32, 1},
{0x00F8, 7, -32, 1}, {0x00FF, 1, 121, 1}, {0x0101, 47, -1, 2},
{0x0133, 5, -1, 2}, {0x013A, 15, -1, 2}, {0x014B, 45, -1, 2},
{0x017A, 5, -1, 2}, {0x0180, 1, 195, 1}, {0x0183, 3, -1, 2},
{0x0188, 5, -1, 4}, {0x0192, 1, -1, 1}, {0x0195, 1, 97, 1},
{0x0199, 1, -1, 1}, {0x019A, 1, 163, 1}, {0x019E, 1, 130, 1},
{0x01A1, 5, -1, 2}, {0x01A8, 6, -1, 5}, {0x01B0, 5, -1, 4},
{0x01B6, 4, -1, 3}, {0x01BD, 1, -1, 1}, {0x01BF, 1, 56, 1},
{0x01C5, 1, -1, 1}, {0x01C6, 1, -2, 1}, {0x01C8, 1, -1, 1},
{0x01C9, 1, -2, 1}, {0x01CB, 1, -1, 1}, {0x01CC, 1, -2, 1},
{0x01CE, 15, -1, 2}, {0x01DD, 1, -79, 1}, {0x01DF, 17, -1, 2},
{0x01F2, 1, -1, 1}, {0x01F3, 1, -2, 1}, {0x01F5, 5, -1, 4},
{0x01FB, 37, -1, 2}, {0x0223, 17, -1, 2}, {0x023C, 1, -1, 1},
{0x023F, 2, 10815, 1}, {0x0242, 6, -1, 5}, {0x0249, 7, -1, 2},
{0x0250, 1, 10783, 1}, {0x0251, 1, 10780, 1}, {0x0252, 1, 10782, 1},
{0x0253, 1, -210, 1}, {0x0254, 1, -206, 1}, {0x0256, 2, -205, 1},
{0x0259, 1, -202, 1}, {0x025B, 1, -203, 1}, {0x025C, 1, 42319, 1},
{0x0260, 1, -205, 1}, {0x0261, 1, 42315, 1}, {0x0263, 1, -207, 1},
{0x0265, 1, 42280, 1}, {0x0266, 1, 42308, 1}, {0x0268, 1, -209, 1},
{0x0269, 1, -211, 1}, {0x026A, 1, 42308, 1}, {0x026B, 1, 10743, 1},
{0x026C, 1, 42305, 1}, {0x026F, 1, -211, 1}, {0x0271, 1, 10749, 1},
{0x0272, 1, -213, 1}, {0x0275, 1, -214, 1}, {0x027D, 1, 10727, 1},
{0x0280, 1, -218, 1}, {0x0282, 1, 42307, 1}, {0x0283, 1, -218, 1},
{0x0287, 1, 42282, 1}, {0x0288, 1, -218, 1}, {0x0289, 1, -69, 1},
{0x028A, 2, -217, 1}, {0x028C, 1, -71, 1}, {0x0292, 1, -219, 1},
{0x029D, 1, 42261, 1}, {0x029E, 1, 42258, 1}, {0x0345, 1, 84, 1},
{0x0371, 3, -1, 2}, {0x0377, 1, -1, 1}, {0x037B, 3, 130, 1},
{0x03AC, 1, -38, 1}, {0x03AD, 3, -37, 1}, {0x03B1, 17, -32, 1},
{0x03C2, 1, -31, 1}, {0x03C3, 9, -32, 1}, {0x03CC, 1, -64, 1},
{0x03CD, 2, -63, 1}, {0x03D0, 1, -62, 1}, {0x03D1, 1, -57, 1},
{0x03D5, 1, -47, 1}, {0x03D6, 1, -54, 1}, {0x03D7, 1, -8, 1},
{0x03D9, 23, -1, 2}, {0x03F0, 1, -86, 1}, {0x03F1, 1, -80, 1},
{0x03F2, 1, 7, 1}, {0x03F3, 1, -116, 1}, {0x03F5, 1, -96, 1},
{0x03F8, 4, -1, 3}, {0x0430, 32, -32, 1}, {0x0450, 16, -80, 1},
{0x0461, 33, -1, 2}, {0x048B, 53, -1, 2}, {0x04C2, 13, -1, 2},
{0x04CF, 1, -15, 1}, {0x04D1, 95, -1, 2}, {0x0561, 38, -48, 1},
{0x10D0, 43, 3008, 1}, {0x10FD, 3, 3008, 1}, {0x13F8, 6, -8, 1},
{0x1C80, 1, -6254, 1}, {0x1C81, 1, -6253, 1}, {0x1C82, 1, -6244, 1},
{0x1C83, 2, -6242, 1}, {0x1C85, 1, -6243, 1}, {0x1C86, 1, -6236, 1},
{0x1C87, 1, -6181, 1}, {0x1C88, 1, 35266, 1}, {0x1D79, 1, 35332, 1},
{0x1D7D, 1, 3814, 1}, {0x1D8E, 1, 35384, 1}, {0x1E01, 149, -1, 2},
{0x1E9B, 1, -59, 1}, {0x1EA1, 95, -1, 2}, {0x1F00, 8, 8, 1},
{0x1F10, 6, 8, 1}, {0x1F20, 8, 8, 1}, {0x1F30, 8, 8, 1},
{0x1F40, 6, 8, 1}, {0x1F51, 7, 8, 2}, {0x1F60, 8, 8, 1},
{0x1F70, 2, 74, 1}, {0x1F72, 4, 86, 1}, {0x1F76, 2, 100, 1},
{0x1F78, 2, 128, 1}, {0x1F7A, 2, 112, 1}, {0x1F7C, 2, 126, 1},
{0x1FB0, 2, 8, 1}, {0x1FBE, 1, -7205, 1}, {0x1FD0, 2, 8, 1},
{0x1FE0, 2, 8, 1}, {0x1FE5, 1, 7, 1}, {0x214E, 1, -28, 1},
{0x2170, 16, -16, 1}, {0x2184, 1, -1, 1}, {0x24D0, 26, -26, 1},
{0x2C30, 47, -48, 1}, {0x2C61, 1, -1, 1}, {0x2C65, 1, -10795, 1},
{0x2C66, 1, -10792, 1}, {0x2C68, 5, -1, 2}, {0x2C73, 4, -1, 3},
{0x2C81, 99, -1, 2}, {0x2CEC, 3, -1, 2}, {0x2CF3, 1, -1, 1},
{0x2D00, 38, -7264, 1}, {0x2D27, 7, -7264, 6}, {0xA641, 45, -1, 2},
{0xA681, 27, -1, 2}, {0xA723, 13, -1, 2}, {0xA733, 61, -1, 2},
{0xA77A, 3, -1, 2}, {0xA77F, 9, -1, 2}, {0xA78C, 6, -1, 5},
{0xA793, 1, -1, 1}, {0xA794, 1, 48, 1}, {0xA797, 19, -1, 2},
{0xA7B5, 11, -1, 2}, {0xA7C3, 1, -1, 1}, {0xAB53, 1, -928, 1},
{0xAB70, 80, -38864, 1}, {0xFF41, 26, -32, 1}};
97 changes: 96 additions & 1 deletion utils/genUnicodeTable.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,24 @@ def print_header(unicodedata_sha1, specialcasing_sha1):
// SpecialCasing.txt SHA1: ${specialcasing_sha1}
// *** DO NOT EDIT BY HAND ***
/// An inclusive range of Unicode characters.
struct UnicodeRange { uint32_t first; uint32_t second; };
/// A UnicodeTransformRange expresses a mapping such as case folding.
/// A character cp is mapped to cp + delta if cp is 0 for the given modulus.
struct UnicodeTransformRange {
/// The first codepoint of the range.
unsigned start:24;
/// The number of characters in the range.
unsigned count:8;
/// The signed delta amount.
int delta:24;
/// The modulo amount.
unsigned modulo:8;
};
""",
today=str(datetime.date.today()),
unicodedata_sha1=unicodedata_sha1,
Expand Down Expand Up @@ -114,6 +130,58 @@ def print_categories(unicode_data_lines):
run_interval(unicode_data_lines, cat.split())


def stride_from(p1, p2):
    """Return how far the source code point of p2 lies past that of p1.

    Each argument is a (source, target) pair; only the source (index 0) is used.
    """
    earlier_source = p1[0]
    later_source = p2[0]
    return later_source - earlier_source


def delta_within(p):
    """Return the signed offset from a pair's source to its target.

    p is a (source, target) pair; the result is target - source.
    """
    source = p[0]
    target = p[1]
    return target - source


def as_hex(cp):
    """Format cp as a 0x-prefixed uppercase hex literal of at least four digits."""
    hex_literal = "0x%.4X" % cp
    return hex_literal


class DeltaMapBlock(object):
    """A run of (source, target) pairs that collapses into one table entry.

    All pairs in a block share the same target - source delta, and their
    source code points are evenly spaced. output() renders the block as a
    single "{start, count, delta, modulo}" UnicodeTransformRange initializer.
    """

    # UnicodeTransformRange stores `count` in an 8-bit unsigned bit-field, so
    # one block may cover at most this many consecutive code points. The
    # stride (emitted as `modulo`, also 8 bits) is always smaller than the
    # covered span, so this bound keeps it in range too.
    MAX_COUNT = 255

    def __init__(self):
        # List of (source, target) tuples, in the order they were appended.
        self.pairs = []

    def stride(self):
        """Return the spacing between the first two source code points.

        Requires at least two pairs in the block.
        """
        return stride_from(self.pairs[0], self.pairs[1])

    def delta(self):
        """Return the target - source delta of the block's first pair."""
        return delta_within(self.pairs[0])

    def can_append(self, pair):
        """Return True if pair can join this block without breaking it.

        A pair fits when the block is empty, or when all of these hold:
        the covered span still fits in MAX_COUNT, the pair has the block's
        delta, and (once a stride is established) the pair continues it.
        """
        if not self.pairs:
            return True
        # output() emits count = last - first + 1; that value must fit the
        # 8-bit count field, so reject a pair that would push it past 255.
        if pair[0] - self.pairs[0][0] + 1 > self.MAX_COUNT:
            return False
        if self.delta() != delta_within(pair):
            return False
        # With a single pair there is no stride yet; any spacing is accepted.
        return len(self.pairs) < 2 or self.stride() == stride_from(self.pairs[-1], pair)

    @staticmethod
    def append_to_list(blocks, p):
        """Append p to the last block in blocks, starting a new block if needed."""
        if not blocks or not blocks[-1].can_append(p):
            blocks.append(DeltaMapBlock())
        blocks[-1].pairs.append(p)

    def output(self):
        """Return the block as a "{start, count, delta, modulo}" initializer.

        Returns an empty string for a block with no pairs. A single-pair
        block is emitted with modulo 1.
        """
        pairs = self.pairs
        if not pairs:
            return ""

        first = pairs[0][0]
        last = pairs[-1][0]
        modulo = self.stride() if len(pairs) >= 2 else 1
        delta = self.delta()
        code = Template("{$first, $count, $delta, $modulo}").substitute(
            first=as_hex(first), count=last - first + 1, delta=delta, modulo=modulo
        )
        return code.strip()


class CaseMap(object):
"""Unicode case mapping helper.
Expand Down Expand Up @@ -186,6 +254,29 @@ def canonicalize(self, ch):
return upper_ch


def print_canonicalizations(casemap):
    """Print the LEGACY_CANONS table of UnicodeTransformRange entries.

    Walks casemap.codepoints, groups every code point whose canonical form
    differs from itself into DeltaMapBlocks, and prints one initializer per
    block.
    """
    blocks = []
    for cp in casemap.codepoints:
        # legacy does not decode surrogate pairs, so only code points that
        # fit in 16 bits are considered.
        if cp <= 0xFFFF:
            canon_cp = casemap.canonicalize(cp)
            if canon_cp != cp:
                DeltaMapBlock.append_to_list(blocks, (cp, canon_cp))

    rendered_entries = [block.output() for block in blocks]
    print_template(
        """
// static constexpr uint32_t ${name}_SIZE = ${entry_count};
static constexpr UnicodeTransformRange ${name}[] = {
${entry_text}
};
""",
        name="LEGACY_CANONS",
        entry_count=len(blocks),
        entry_text=",\n".join(rendered_entries),
    )


def print_precanonicalizations(casemap):
"""Print a table of pre-canonicalizations.
Expand Down Expand Up @@ -265,5 +356,9 @@ def as_hex(cp):
)
udata_lines = unicode_data.decode("utf-8").splitlines()
special_lines = special_casing.decode("utf-8").splitlines()
casemap = CaseMap(
unicode_data_lines=udata_lines, special_casing_lines=special_lines
)
print_categories(udata_lines)
print_precanonicalizations(CaseMap(udata_lines, special_lines))
print_precanonicalizations(casemap)
print_canonicalizations(casemap)

0 comments on commit 9ef3c7c

Please sign in to comment.