forked from facebook/hermes
-
Notifications
You must be signed in to change notification settings - Fork 0
/
genUnicodeTable.py
executable file
·269 lines (224 loc) · 8.98 KB
/
genUnicodeTable.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the LICENSE
# file in the root directory of this source tree.
# -*- coding: utf-8 -*-
# Generates list of unicode ranges belonging to a set of categories
# Usage: genUnicodeTable.py
import datetime
import hashlib
import sys
import urllib.request
from string import Template
# Source files from the Unicode Character Database (UCD).
UNICODE_DATA_URL = "ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt"
UNICODE_SPECIAL_CASING_URL = "ftp://ftp.unicode.org/Public/UNIDATA/SpecialCasing.txt"
# Unicode data field indexes. See UnicodeData.txt.
CODEPOINT_FIELD = 0  # hex codepoint, e.g. "0041"
GENERAL_CATEGORY_FIELD = 2  # general category abbreviation, e.g. "Lu"
UPPERCASE_FIELD = 12  # simple uppercase mapping (hex; empty if identity)
LOWERCASE_FIELD = 13  # simple lowercase mapping (hex; empty if identity)
def print_template(s, **kwargs):
    """Substitute the keyword arguments into template string ``s`` and print.

    The rendered text is stripped of leading/trailing whitespace and
    followed by one blank line.
    """
    rendered = Template(s).substitute(**kwargs).strip()
    print(rendered)
    print()
def print_header(unicodedata_sha1, specialcasing_sha1):
    """Emit the generated-file banner and the C++ UnicodeRange struct.

    The banner records the download date and the SHA1 of each input file
    so a generated table can be traced back to its exact sources.
    """
    fields = {
        "today": str(datetime.date.today()),
        "unicodedata_sha1": unicodedata_sha1,
        "specialcasing_sha1": specialcasing_sha1,
    }
    print_template(
        """
//
// File generated by genUnicodeTable.py
// using Unicode data files downloaded on ${today}
// UnicodeData.txt SHA1: ${unicodedata_sha1}
// SpecialCasing.txt SHA1: ${specialcasing_sha1}
// *** DO NOT EDIT BY HAND ***
struct UnicodeRange { uint32_t first; uint32_t second; };
""",
        **fields,
    )
def run_interval(unicode_data_lines, args):
    """Print a UnicodeRange table for a set of general categories.

    ``args[0]`` is the C++ identifier to emit; the remaining entries are
    Unicode general-category abbreviations (e.g. "Lu", "Nd"). Runs of
    consecutive data lines whose category is in that set are coalesced
    into closed [first, last] codepoint intervals.
    """
    name = args[0]
    wanted = set(args[1:])
    intervals = []
    start = None  # first codepoint of the currently open interval, or None
    prev_cp = 0  # codepoint parsed from the previous data line
    for line in unicode_data_lines:
        fields = line.split(";")
        cp = int(fields[CODEPOINT_FIELD], 16)
        if fields[GENERAL_CATEGORY_FIELD] in wanted:
            # Open a new interval only if one isn't already in progress.
            if start is None:
                start = cp
        elif start is not None:
            # Category run ended on the previous line: close the interval.
            intervals.append((start, prev_cp))
            start = None
        prev_cp = cp
    # Flush an interval that runs to the end of the data.
    if start is not None:
        intervals.append((start, prev_cp))
    print_template(
        """
// ${args}
// static constexpr uint32_t ${name}_SIZE = $interval_count;
static constexpr UnicodeRange ${name}[] = {
${intervals}
};
""",
        args=" ".join(args),
        name=name,
        interval_count=len(intervals),
        intervals="\n".join(
            "{%s, %s}," % (hex(lo), hex(hi)) for lo, hi in intervals
        ),
    )
def print_categories(unicode_data_lines):
    """Output UnicodeRanges for the Unicode General Categories we need."""
    # Each spec is the emitted table name followed by its categories.
    specs = (
        ("UNICODE_LETTERS", "Lu", "Ll", "Lt", "Lm", "Lo", "Nl"),
        ("UNICODE_COMBINING_MARK", "Mn", "Mc"),
        ("UNICODE_DIGIT", "Nd"),
        ("UNICODE_CONNECTOR_PUNCTUATION", "Pc"),
    )
    for spec in specs:
        run_interval(unicode_data_lines, list(spec))
class CaseMap(object):
"""Unicode case mapping helper.
This class holds the list of codepoints, and their uppercase and
lowercase mappings.
"""
def __init__(self, unicode_data_lines, special_casing_lines):
"""Construct with the lines from UnicodeData and SpecialCasing."""
self.toupper = {}
self.tolower = {}
self.codepoints = []
for line in unicode_data_lines:
fields = line.split(";")
self.__set_casemap(
fields[CODEPOINT_FIELD],
upper=fields[UPPERCASE_FIELD],
lower=fields[LOWERCASE_FIELD],
)
self.codepoints.extend(self.toupper.keys())
# Apply special cases. This is to support ES5.1 Canonicalize, which is
# cast in terms of toUpperCase(). The desire here is to have a
# locale-independent result. Thus we ignore SpecialCasing rules that
# are locale specific. We can also get away with ignoring
# context-sensitive rules because Canonicalize only considers one
# character. Thus ignore any rules that have a condition.
# Format is codepoint, lower, title, upper, condition
for line in special_casing_lines:
# Trim comments
line = line.split("#")[0]
fields = line.split(";")
if len(fields) < 5:
continue
cps, lower, title, upper, condition = fields[:5]
# Title is unused
_ = title # noqa: F841
if not condition.strip():
self.__set_casemap(cps, upper=upper, lower=lower)
def __set_casemap(self, cp, upper, lower):
"""Set a case mapping.
Mark the upper and lower case forms of cp. If a form is empty,
the character is its own case mapping.
All parameters are code points encoded via hex into a string.
"""
# Parse the codepoint from hex.
cp = int(cp, 16)
# "The simple uppercase is omitted in the data file if the uppercase
# is the same as the code point itself."
# The same is true for the lowercase.
# Skip eszett or anything else that maps to more than one character.
self.toupper[cp] = int(upper, 16) if upper and len(upper.split()) == 1 else cp
self.tolower[cp] = int(lower, 16) if lower and len(lower.split()) == 1 else cp
def canonicalize(self, ch):
"""Canonicalize a character per ES5.1 15.10.2.8."""
upper_ch = self.toupper[ch]
# "If u does not consist of a single character, return ch"
# We only store 1-1 mappings.
# "If ch's code unit value is greater than or equal to decimal 128
# and cu's code unit value is less than decimal 128, then return ch"
# That is, only ASCII may canonicalize to ASCII.
if upper_ch < 128 and ch >= 128:
return ch
return upper_ch
def print_precanonicalizations(casemap):
    """Print the table of pre-canonicalizations.

    For each canonicalized codepoint, emit the codepoints that
    canonicalize to it (its "pre-canonicalizations"). Entries whose
    pre-canonicalizations are exactly the character plus its lowercase
    form are trivial and are omitted to keep the table small.
    """
    # Invert the simple case mappings: canonical form -> source codepoints.
    inverse = {}
    for cp in casemap.codepoints:
        inverse.setdefault(casemap.canonicalize(cp), []).append(cp)
    # Drop the "obvious" entries: just the character and its lowercase.
    for canon_cp in list(inverse):
        if set(inverse[canon_cp]) == {canon_cp, casemap.tolower[canon_cp]}:
            del inverse[canon_cp]

    # Each entry leads with the canonicalized codepoint, followed by the
    # codepoints that canonicalize to it (excluding the codepoint itself),
    # e.g. {0x01C4, {0x01C5, 0x01C6}}.
    def as_hex(cp):
        return "0x{:04X}".format(cp)

    entries = []
    for canon_cp in sorted(inverse):
        forms = ", ".join(
            as_hex(cp) for cp in inverse[canon_cp] if cp != canon_cp
        )
        entries.append("{%s, {%s}}" % (as_hex(canon_cp), forms))
    # Print the table.
    print_template(
        """
struct UnicodePrecanonicalizationMapping {
/// The canonicalized form of the character.
uint16_t canonicalized;
/// A list of up to 3 characters which canonicalize to this character.
/// The value 3 is significant because it is the maximum number of
/// pre-canonicalizations of any character.
/// 0 (NUL) is used to indicate none.
uint16_t forms[3];
};
// The precanonicalizations is a list of exceptional canocializations.
// That is, each canonicalized input character maps to a list of forms that
// canonicalize to it, per the algorithm given in ES5 15.10.2.8. However, if a
// character is only canonicalized to by itself and its lowercase variant, that
// is omitted from the table; this helps keep the table small. Note some
// entries are empty; this indicates that c != uppercase(lowercase(c)). Note
// also this table is sorted.
static constexpr uint32_t UNICODE_PRECANONS_SIZE = $entry_count;
static constexpr UnicodePrecanonicalizationMapping UNICODE_PRECANONS[] = {
$entries
};
""",
        entry_count=len(entries),
        entries=",\n".join(entries),
    )
if __name__ == "__main__":
    # Fetch both UCD files up front so their hashes can be recorded in
    # the generated header; progress goes to stderr so stdout stays a
    # clean generated file.
    print("Fetching %s..." % UNICODE_DATA_URL, file=sys.stderr)
    with urllib.request.urlopen(UNICODE_DATA_URL) as response:
        unicode_data = response.read()
    print("Fetching %s..." % UNICODE_SPECIAL_CASING_URL, file=sys.stderr)
    with urllib.request.urlopen(UNICODE_SPECIAL_CASING_URL) as response:
        special_casing = response.read()
    print_header(
        hashlib.sha1(unicode_data).hexdigest(),
        hashlib.sha1(special_casing).hexdigest(),
    )
    data_lines = unicode_data.decode("utf-8").splitlines()
    casing_lines = special_casing.decode("utf-8").splitlines()
    print_categories(data_lines)
    print_precanonicalizations(CaseMap(data_lines, casing_lines))