Skip to content

Commit f6ee005

Browse files
committed
Better Unicode support for word navigation.
1 parent 744c61e commit f6ee005

File tree

2 files changed

+154
-12
lines changed

2 files changed

+154
-12
lines changed

InputField.lua

+51-12
Original file line numberDiff line numberDiff line change
@@ -232,13 +232,52 @@ do
232232
return set
233233
end
234234

235-
local PUNCTUATION = "!\"#$%&'()*+,-./:;<=>?@[\\]^`{|}~"; PUNCTUATION = newSet{ PUNCTUATION:byte(1, #PUNCTUATION) } -- @Incomplete: Unicode punctuation.
236-
local WHITESPACE = newSet{ 9,10,11,12,13,32 } -- Horizontal tab, line feed, vertical tab, form feed, carriage return, space. @Incomplete: Unicode whitespace.
237-
238-
local function getCodepointCharType(c)
239-
return PUNCTUATION[c] and "punctuation"
240-
or WHITESPACE[c] and "whitespace"
241-
or "word"
235+
-- ASCII punctuation set, keyed by byte value. The list covers every printable
-- ASCII symbol except '_', so an underscore is classified as a word character.
local ASCII_PUNCTUATION
do
	local chars = "!\"#$%&'()*+,-./:;<=>?@[\\]^`{|}~"
	ASCII_PUNCTUATION = newSet{ chars:byte(1, #chars) }
end

-- ASCII whitespace set, keyed by byte value.
local ASCII_WHITESPACE = newSet{
	9,  -- Horizontal tab.
	10, -- Line feed.
	11, -- Vertical tab.
	12, -- Form feed.
	13, -- Carriage return.
	32, -- Space.
}
237+
238+
-- Generated by tools/generateUnicodeInfo.lua:
239+
240+
-- PUNCTUATION (8330, 97+230)
241+
local UNICODE_PUNCTUATION = newSet{180,187,191,215,247,749,885,894,903,1014,1154,1470,1472,1475,1478,1563,1748,1758,1769,2142,2416,2557,2678,2928,3191,3199,3204,3407,3449,
242+
3572,3647,3663,3892,3894,3896,3973,4347,5120,6464,7379,8125,8468,8485,8487,8489,8494,8527,11632,12336,12448,12539,12880,42611,42622,43260,43359,43867,44011,64297,
243+
65952,66463,66512,66927,67671,67871,67903,68223,68296,69293,70093,70107,70313,70749,70854,71739,72162,73727,92917,94178,113820,113823,119365,120513,120539,120571,120597,120629,120655,120687,
244+
120713,120745,120771,123215,123647,126124,126128,126254}
245+
local ranges = {161,169,171,172,174,177,182,184,706,709,722,735,741,747,751,767,900,901,1370,1375,1417,1418,
246+
1421,1423,1523,1524,1542,1551,1566,1567,1642,1645,1789,1790,1792,1805,2038,2041,2046,2047,2096,2110,2404,2405,2546,2547,
247+
2554,2555,2800,2801,3059,3066,3674,3675,3841,3863,3866,3871,3898,3901,4030,4037,4039,4044,4046,4058,4170,4175,4254,4255,
248+
4960,4968,5008,5017,5741,5742,5787,5788,5867,5869,5941,5942,6100,6102,6104,6107,6144,6154,6468,6469,6622,6655,6686,6687,
249+
6816,6822,6824,6829,7002,7018,7028,7036,7164,7167,7227,7231,7294,7295,7360,7367,8127,8129,8141,8143,8157,8159,8173,8175,
250+
8189,8190,8208,8231,8240,8286,8314,8318,8330,8334,8352,8383,8448,8449,8451,8454,8456,8457,8470,8472,8478,8483,8506,8507,
251+
8512,8516,8522,8525,8586,8587,8592,9254,9280,9290,9372,9449,9472,10101,10132,11123,11126,11157,11159,11263,11493,11498,11513,11516,
252+
11518,11519,11776,11822,11824,11858,11904,11929,11931,12019,12032,12245,12272,12283,12289,12292,12296,12320,12342,12343,12349,12351,12443,12444,
253+
12688,12689,12694,12703,12736,12771,12800,12830,12842,12871,12896,12927,12938,12976,12992,13311,19904,19967,42128,42182,42238,42239,42509,42511,
254+
42738,42743,42752,42774,42784,42785,42889,42890,43048,43051,43062,43065,43124,43127,43214,43215,43256,43258,43310,43311,43457,43469,43486,43487,
255+
43612,43615,43639,43641,43742,43743,43760,43761,43882,43883,64434,64449,64830,64831,65020,65021,65040,65049,65072,65106,65108,65126,65128,65131,
256+
65281,65295,65306,65312,65339,65344,65371,65381,65504,65510,65512,65518,65532,65533,65792,65794,65847,65855,65913,65929,65932,65934,65936,65948,
257+
66000,66044,67703,67704,68176,68184,68336,68342,68409,68415,68505,68508,69461,69465,69703,69709,69819,69820,69822,69825,69952,69955,70004,70005,
258+
70085,70088,70109,70111,70200,70205,70731,70735,70746,70747,71105,71127,71233,71235,71264,71276,71484,71487,72004,72006,72255,72262,72346,72348,
259+
72350,72354,72769,72773,72816,72817,73463,73464,73685,73713,74864,74868,92782,92783,92983,92991,92996,92997,93847,93850,118784,119029,119040,119078,
260+
119081,119140,119146,119148,119171,119172,119180,119209,119214,119272,119296,119361,119552,119638,120832,121343,121399,121402,121453,121460,121462,121475,121477,121483,
261+
125278,125279,126704,126705,126976,127019,127024,127123,127136,127150,127153,127167,127169,127183,127185,127221,127245,127405,127462,127490,127504,127547,127552,127560,
262+
127568,127569,127584,127589,127744,128727,128736,128748,128752,128764,128768,128883,128896,128984,128992,129003,129024,129035,129040,129095,129104,129113,129120,129159,
263+
129168,129197,129200,129201,129280,129400,129402,129483,129485,129619,129632,129645,129648,129652,129656,129658,129664,129670,129680,129704,129712,129718,129728,129730,
264+
129744,129750,129792,129938,129940,129994}
265+
-- 'ranges' is a flat list of inclusive pairs: {from1,to1, from2,to2, ...}.
-- Mark every codepoint inside each pair as punctuation.
for pairIndex = 1, #ranges, 2 do
	local first = ranges[pairIndex]
	local last  = ranges[pairIndex+1]

	for cp = first, last do
		UNICODE_PUNCTUATION[cp] = true
	end
end
266+
267+
-- WHITESPACE (18, 5+2)
268+
-- Non-ASCII whitespace: standalone codepoints go straight into the set, then
-- the inclusive {from,to} pairs in 'ranges' are expanded into it.
local UNICODE_WHITESPACE = newSet{ 160, 5760, 8239, 8287, 12288 }
local ranges = { 8192,8202, 8232,8233 }

for pairIndex = 1, #ranges, 2 do
	local first = ranges[pairIndex]
	local last  = ranges[pairIndex+1]

	for cp = first, last do
		UNICODE_WHITESPACE[cp] = true
	end
end
271+
272+
273+
274+
-- charType = getCodepointCharType( codepoint )
-- charType = "punctuation"|"whitespace"|"word"
--
-- The ASCII sets are consulted first. Any other codepoint at or below 127 is
-- reported as "word" right away, so the Unicode sets are only looked at for
-- codepoints above 127.
local function getCodepointCharType(cp)
	if ASCII_PUNCTUATION[cp] then  return "punctuation"  end
	if ASCII_WHITESPACE[cp]  then  return "whitespace"   end

	if cp <= 127 then  return "word"  end

	if UNICODE_PUNCTUATION[cp] then  return "punctuation"  end
	if UNICODE_WHITESPACE[cp]  then  return "whitespace"   end

	return "word"
end
243282

244283
local codepoints = {}
@@ -265,17 +304,17 @@ do
265304
pos = pos + direction
266305

267306
-- Check for end of string.
268-
local prevC = codepoints[pos]
269-
local nextC = codepoints[pos+direction]
307+
local prevCp = codepoints[pos]
308+
local nextCp = codepoints[pos+direction]
270309

271-
if not (prevC and nextC) then
310+
if not (prevCp and nextCp) then
272311
pos = pos + direction
273312
break
274313
end
275314

276315
-- Check for word bound.
277-
local prevType = getCodepointCharType(prevC)
278-
local nextType = getCodepointCharType(nextC)
316+
local prevType = getCodepointCharType(prevCp)
317+
local nextType = getCodepointCharType(nextCp)
279318

280319
if nextType ~= prevType and not (nextType ~= "whitespace" and prevType == "whitespace") then
281320
if direction < 0 then pos = pos-1 end

tools/generateUnicodeInfo.lua

+103
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
--
2+
-- Unicode info generator
3+
--
4+
-- How to generate:
5+
-- > Put UnicodeData.txt in "local/UnicodeData.txt". (https://unicode.org/Public/UNIDATA/UnicodeData.txt)
6+
-- > Run: lua tools/generateUnicodeInfo.lua
7+
-- > Copy printed info
8+
--
9+
10+
-- Codepoints grouped by the first letter of their General_Category.
-- cpSetByCategory[majorCategory][codepoint] = true
local cpSetByCategory = {}

-- Highest codepoint seen in the data file (never lower than 128).
local maxCp = 128

do
	local lineNumber = 0

	for line in io.lines"local/UnicodeData.txt" do
		lineNumber = lineNumber + 1

		--[[
		Fields:
		1.  Codepoint
		2.  Name
		3.  General_Category
		4.  Canonical_Combining_Class
		5.  Bidi_Class
		6.  Decomposition_Type
		7.  Decomposition_Mapping
		8.  Numeric_Type
		9.  Numeric_Value
		10. Bidi_Mirrored
		11. Unicode_1_Name (Obsolete as of 6.2.0)
		12. ISO_Comment (Obsolete as of 5.2.0; Deprecated and Stabilized as of 6.0.0)
		13. Simple_Uppercase_Mapping
		14. Simple_Lowercase_Mapping
		15. Simple_Titlecase_Mapping
		]]

		-- Blank lines carry no data. Anything else must have the first three fields.
		if line:find"%S" then
			local cpHex, category = line:match"^(%x+);[^;]*;([^;]*)"

			-- Fail with the file position instead of a nil-related error further down.
			if not cpHex or category == "" then
				error(("local/UnicodeData.txt:%d: Unrecognized line: %s"):format(lineNumber, line))
			end

			-- NOTE(review): UnicodeData.txt appears to encode some large blocks as a
			-- "<..., First>"/"<..., Last>" pair of lines; only those two endpoints get
			-- recorded here. Confirm no category requested below relies on such blocks.
			local cp = tonumber(cpHex, 16)
			category = category:sub(1, 1) -- Generalize categories more (e.g. make 'Zs' into 'Z').

			cpSetByCategory[category] = cpSetByCategory[category] or {}
			cpSetByCategory[category][cp] = true

			maxCp = math.max(maxCp, cp)
		end
	end
end

-- Prefix written in front of every generated line.
local INDENTATION = string.rep("\t", 1)
44+
45+
--
-- generateInfoForCharacterType( name, categories )
--
-- Prints (to standard output) Lua source that builds a set named UNICODE_<name>
-- containing every codepoint >= 128 whose major category is listed in
-- 'categories'. Isolated codepoints are emitted as a newSet{...} literal and
-- runs of consecutive codepoints as a flat list of inclusive from,to pairs
-- followed by a loop that expands them.
--
-- name:       String used in the generated variable name and header comment.
-- categories: Array of one-letter major categories (keys of cpSetByCategory).
--
-- Raises an error if a requested category has no entry in cpSetByCategory.
--
local function generateInfoForCharacterType(name, categories)
	-- Merge the requested categories into one set of non-ASCII codepoints.
	local cpSetJoined = {}
	local count = 0

	for _, category in ipairs(categories) do
		local cpSet = cpSetByCategory[category]

		if not cpSet then
			-- Most likely a misspelled category or an incomplete data file.
			error(("No codepoints found for category '%s'."):format(tostring(category)), 2)
		end

		for cp in pairs(cpSet) do
			-- Skip ASCII, and don't count a codepoint twice if categories repeat.
			if cp >= 128 and not cpSetJoined[cp] then
				cpSetJoined[cp] = true
				count = count + 1
			end
		end
	end

	-- Split the set into isolated codepoints and runs of consecutive ones.
	-- The scan is ascending, so both lists come out already sorted.
	local singles = {}
	local ranges = {}
	local lastStartCp

	for cp = 128, maxCp do
		if cpSetJoined[cp] then
			if not cpSetJoined[cp-1] then
				lastStartCp = cp
			end
			if not cpSetJoined[cp+1] then
				if cp == lastStartCp then
					table.insert(singles, cp)
				else
					table.insert(ranges, {from=lastStartCp, to=cp})
				end
			end
		end
	end

	-- Header comment: total codepoints, then number of singles + number of ranges.
	io.write(INDENTATION, "-- ", name, " (", count, ", ", #singles, "+", #ranges, ")\n")

	-- Isolated codepoints, with a line break before every 30th item.
	io.write(INDENTATION, "local UNICODE_", name, " = newSet{")
	for i, cp in ipairs(singles) do
		if i > 1 then io.write(",") end
		if i % 30 == 0 then io.write("\n\t", INDENTATION) end
		io.write(cp)
	end
	io.write("}\n")

	-- Ranges as a flat from,to list, with a line break before every 12th pair.
	io.write(INDENTATION, "local ranges = {")
	for i, range in ipairs(ranges) do
		if i > 1 then io.write(",") end
		if i %12 == 0 then io.write("\n\t", INDENTATION) end
		io.write(range.from, ",", range.to)
	end
	io.write("}\n")
	io.write(INDENTATION, "for i = 1, #ranges, 2 do for cp = ranges[i], ranges[i+1] do UNICODE_", name, "[cp] = true end end\n")

	io.write("\n")
end
101+
102+
-- Print the two tables. Each call lists the major categories to merge.
generateInfoForCharacterType(
	"PUNCTUATION",
	{
		"P", -- Punctuation.
		"S", -- Symbol.
	}
)
generateInfoForCharacterType(
	"WHITESPACE",
	{
		"Z", -- Separator.
	}
)

0 commit comments

Comments
 (0)