forked from facebook/hermes
-
Notifications
You must be signed in to change notification settings - Fork 0
/
genUnicodeTable.py
executable file
·269 lines (224 loc) · 8.98 KB
/
genUnicodeTable.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the LICENSE
# file in the root directory of this source tree.
# -*- coding: utf-8 -*-
# Generates list of unicode ranges belonging to a set of categories
# Usage: genUnicodeTable.py
import datetime
import hashlib
import sys
import urllib.request
from string import Template
# Source files from the Unicode Character Database (UCD).
UNICODE_DATA_URL = "ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt"
UNICODE_SPECIAL_CASING_URL = "ftp://ftp.unicode.org/Public/UNIDATA/SpecialCasing.txt"
# Unicode data field indexes. See UnicodeData.txt.
CODEPOINT_FIELD = 0  # hex codepoint, e.g. "0041"
GENERAL_CATEGORY_FIELD = 2  # general category abbreviation, e.g. "Lu"
UPPERCASE_FIELD = 12  # simple uppercase mapping (hex; empty if identity)
LOWERCASE_FIELD = 13  # simple lowercase mapping (hex; empty if identity)
def print_template(s, **kwargs):
    """Substitute the keyword arguments into template string ``s`` and print.

    The rendered text is stripped of leading/trailing whitespace and
    followed by one blank line.
    """
    rendered = Template(s).substitute(**kwargs).strip()
    print(rendered)
    print()
def print_header(unicodedata_sha1, specialcasing_sha1):
    """Emit the generated-file banner and the C++ UnicodeRange struct.

    The banner records the download date and the SHA1 of each input file
    so a generated table can be traced back to its exact sources.
    """
    fields = {
        "today": str(datetime.date.today()),
        "unicodedata_sha1": unicodedata_sha1,
        "specialcasing_sha1": specialcasing_sha1,
    }
    print_template(
        """
//
// File generated by genUnicodeTable.py
// using Unicode data files downloaded on ${today}
// UnicodeData.txt SHA1: ${unicodedata_sha1}
// SpecialCasing.txt SHA1: ${specialcasing_sha1}
// *** DO NOT EDIT BY HAND ***
struct UnicodeRange { uint32_t first; uint32_t second; };
""",
        **fields,
    )
def run_interval(unicode_data_lines, args):
    """Print a UnicodeRange table for a set of general categories.

    ``args[0]`` is the C++ identifier to emit; the remaining entries are
    Unicode general-category abbreviations (e.g. "Lu", "Nd"). Runs of
    consecutive data lines whose category is in that set are coalesced
    into closed [first, last] codepoint intervals.
    """
    name = args[0]
    wanted = set(args[1:])
    intervals = []
    start = None  # first codepoint of the currently open interval, or None
    prev_cp = 0  # codepoint parsed from the previous data line
    for line in unicode_data_lines:
        fields = line.split(";")
        cp = int(fields[CODEPOINT_FIELD], 16)
        if fields[GENERAL_CATEGORY_FIELD] in wanted:
            # Open a new interval only if one isn't already in progress.
            if start is None:
                start = cp
        elif start is not None:
            # Category run ended on the previous line: close the interval.
            intervals.append((start, prev_cp))
            start = None
        prev_cp = cp
    # Flush an interval that runs to the end of the data.
    if start is not None:
        intervals.append((start, prev_cp))
    print_template(
        """
// ${args}
// static constexpr uint32_t ${name}_SIZE = $interval_count;
static constexpr UnicodeRange ${name}[] = {
${intervals}
};
""",
        args=" ".join(args),
        name=name,
        interval_count=len(intervals),
        intervals="\n".join(
            "{%s, %s}," % (hex(lo), hex(hi)) for lo, hi in intervals
        ),
    )
def print_categories(unicode_data_lines):
    """Output UnicodeRanges for the Unicode General Categories we need."""
    # Each spec is the emitted table name followed by its categories.
    specs = (
        ("UNICODE_LETTERS", "Lu", "Ll", "Lt", "Lm", "Lo", "Nl"),
        ("UNICODE_COMBINING_MARK", "Mn", "Mc"),
        ("UNICODE_DIGIT", "Nd"),
        ("UNICODE_CONNECTOR_PUNCTUATION", "Pc"),
    )
    for spec in specs:
        run_interval(unicode_data_lines, list(spec))
class CaseMap(object):
"""Unicode case mapping helper.
This class holds the list of codepoints, and their uppercase and
lowercase mappings.
"""
def __init__(self, unicode_data_lines, special_casing_lines):
"""Construct with the lines from UnicodeData and SpecialCasing."""
self.toupper = {}
self.tolower = {}
self.codepoints = []
for line in unicode_data_lines:
fields = line.split(";")
self.__set_casemap(
fields[CODEPOINT_FIELD],
upper=fields[UPPERCASE_FIELD],
lower=fields[LOWERCASE_FIELD],
)
self.codepoints.extend(self.toupper.keys())
# Apply special cases. This is to support ES5.1 Canonicalize, which is
# cast in terms of toUpperCase(). The desire here is to have a
# locale-independent result. Thus we ignore SpecialCasing rules that
# are locale specific. We can also get away with ignoring
# context-sensitive rules because Canonicalize only considers one
# character. Thus ignore any rules that have a condition.
# Format is codepoint, lower, title, upper, condition
for line in special_casing_lines:
# Trim comments
line = line.split("#")[0]
fields = line.split(";")
if len(fields) < 5:
continue
cps, lower, title, upper, condition = fields[:5]
# Title is unused
_ = title # noqa: F841
if not condition.strip():
self.__set_casemap(cps, upper=upper, lower=lower)
def __set_casemap(self, cp, upper, lower):
"""Set a case mapping.
Mark the upper and lower case forms of cp. If a form is empty,
the character is its own case mapping.
All parameters are code points encoded via hex into a string.
"""
# Parse the codepoint from hex.
cp = int(cp, 16)
# "The simple uppercase is omitted in the data file if the uppercase
# is the same as the code point itself."
# The same is true for the lowercase.
# Skip eszett or anything else that maps to more than one character.
self.toupper[cp] = int(upper, 16) if upper and len(upper.split()) == 1 else cp
self.tolower[cp] = int(lower, 16) if lower and len(lower.split()) == 1 else cp
def canonicalize(self, ch):
"""Canonicalize a character per ES5.1 15.10.2.8."""
upper_ch = self.toupper[ch]
# "If u does not consist of a single character, return ch"
# We only store 1-1 mappings.
# "If ch's code unit value is greater than or equal to decimal 128
# and cu's code unit value is less than decimal 128, then return ch"
# That is, only ASCII may canonicalize to ASCII.
if upper_ch < 128 and ch >= 128:
return ch
return upper_ch
def print_precanonicalizations(casemap):
    """Print the table of pre-canonicalizations.

    For each canonicalized codepoint, emit the codepoints that
    canonicalize to it (its "pre-canonicalizations"). Entries whose
    pre-canonicalizations are exactly the character plus its lowercase
    form are trivial and are omitted to keep the table small.
    """
    # Invert the simple case mappings: canonical form -> source codepoints.
    inverse = {}
    for cp in casemap.codepoints:
        inverse.setdefault(casemap.canonicalize(cp), []).append(cp)
    # Drop the "obvious" entries: just the character and its lowercase.
    for canon_cp in list(inverse):
        if set(inverse[canon_cp]) == {canon_cp, casemap.tolower[canon_cp]}:
            del inverse[canon_cp]

    # Each entry leads with the canonicalized codepoint, followed by the
    # codepoints that canonicalize to it (excluding the codepoint itself),
    # e.g. {0x01C4, {0x01C5, 0x01C6}}.
    def as_hex(cp):
        return "0x{:04X}".format(cp)

    entries = []
    for canon_cp in sorted(inverse):
        forms = ", ".join(
            as_hex(cp) for cp in inverse[canon_cp] if cp != canon_cp
        )
        entries.append("{%s, {%s}}" % (as_hex(canon_cp), forms))
    # Print the table.
    print_template(
        """
struct UnicodePrecanonicalizationMapping {
/// The canonicalized form of the character.
uint16_t canonicalized;
/// A list of up to 3 characters which canonicalize to this character.
/// The value 3 is significant because it is the maximum number of
/// pre-canonicalizations of any character.
/// 0 (NUL) is used to indicate none.
uint16_t forms[3];
};
// The precanonicalizations is a list of exceptional canocializations.
// That is, each canonicalized input character maps to a list of forms that
// canonicalize to it, per the algorithm given in ES5 15.10.2.8. However, if a
// character is only canonicalized to by itself and its lowercase variant, that
// is omitted from the table; this helps keep the table small. Note some
// entries are empty; this indicates that c != uppercase(lowercase(c)). Note
// also this table is sorted.
static constexpr uint32_t UNICODE_PRECANONS_SIZE = $entry_count;
static constexpr UnicodePrecanonicalizationMapping UNICODE_PRECANONS[] = {
$entries
};
""",
        entry_count=len(entries),
        entries=",\n".join(entries),
    )
if __name__ == "__main__":
    # Fetch both UCD files up front so their hashes can be recorded in
    # the generated header; progress goes to stderr so stdout stays a
    # clean generated file.
    print("Fetching %s..." % UNICODE_DATA_URL, file=sys.stderr)
    with urllib.request.urlopen(UNICODE_DATA_URL) as response:
        unicode_data = response.read()
    print("Fetching %s..." % UNICODE_SPECIAL_CASING_URL, file=sys.stderr)
    with urllib.request.urlopen(UNICODE_SPECIAL_CASING_URL) as response:
        special_casing = response.read()
    print_header(
        hashlib.sha1(unicode_data).hexdigest(),
        hashlib.sha1(special_casing).hexdigest(),
    )
    data_lines = unicode_data.decode("utf-8").splitlines()
    casing_lines = special_casing.decode("utf-8").splitlines()
    print_categories(data_lines)
    print_precanonicalizations(CaseMap(data_lines, casing_lines))