|
232 | 232 | return set
|
233 | 233 | end
|
234 | 234 |
|
235 |
| - local PUNCTUATION = "!\"#$%&'()*+,-./:;<=>?@[\\]^`{|}~"; PUNCTUATION = newSet{ PUNCTUATION:byte(1, #PUNCTUATION) } -- @Incomplete: Unicode punctuation. |
236 |
| - local WHITESPACE = newSet{ 9,10,11,12,13,32 } -- Horizontal tab, line feed, vertical tab, form feed, carriage return, space. @Incomplete: Unicode whitespace. |
237 |
| - |
238 |
| - local function getCodepointCharType(c) |
239 |
| - return PUNCTUATION[c] and "punctuation" |
240 |
| - or WHITESPACE[c] and "whitespace" |
241 |
| - or "word" |
-- ASCII punctuation: every byte of the literal below is handed to newSet.
-- The resulting set is indexed by codepoint in getCodepointCharType().
local ASCII_PUNCTUATION = newSet{ ("!\"#$%&'()*+,-./:;<=>?@[\\]^`{|}~"):byte(1, -1) }

-- ASCII whitespace: horizontal tab (9), line feed (10), vertical tab (11),
-- form feed (12), carriage return (13) and space (32).
local ASCII_WHITESPACE = newSet{ 9, 10, 11, 12, 13, 32 }
| 237 | + |
| 238 | + -- Generated by tools/generateUnicodeInfo.lua. The set starts from individually listed codepoints; 'ranges' then holds flat (first, last) pairs that the loop after it expands inclusively into the same set: |
| 239 | + |
| 240 | + -- PUNCTUATION (8330, 97+230) -- presumably: 8330 codepoints in total, 97 listed individually + 230 ranges; confirm against the generator. |
| 241 | + local UNICODE_PUNCTUATION = newSet{180,187,191,215,247,749,885,894,903,1014,1154,1470,1472,1475,1478,1563,1748,1758,1769,2142,2416,2557,2678,2928,3191,3199,3204,3407,3449, |
| 242 | + 3572,3647,3663,3892,3894,3896,3973,4347,5120,6464,7379,8125,8468,8485,8487,8489,8494,8527,11632,12336,12448,12539,12880,42611,42622,43260,43359,43867,44011,64297, |
| 243 | + 65952,66463,66512,66927,67671,67871,67903,68223,68296,69293,70093,70107,70313,70749,70854,71739,72162,73727,92917,94178,113820,113823,119365,120513,120539,120571,120597,120629,120655,120687, |
| 244 | + 120713,120745,120771,123215,123647,126124,126128,126254} |
| 245 | + local ranges = {161,169,171,172,174,177,182,184,706,709,722,735,741,747,751,767,900,901,1370,1375,1417,1418, |
| 246 | + 1421,1423,1523,1524,1542,1551,1566,1567,1642,1645,1789,1790,1792,1805,2038,2041,2046,2047,2096,2110,2404,2405,2546,2547, |
| 247 | + 2554,2555,2800,2801,3059,3066,3674,3675,3841,3863,3866,3871,3898,3901,4030,4037,4039,4044,4046,4058,4170,4175,4254,4255, |
| 248 | + 4960,4968,5008,5017,5741,5742,5787,5788,5867,5869,5941,5942,6100,6102,6104,6107,6144,6154,6468,6469,6622,6655,6686,6687, |
| 249 | + 6816,6822,6824,6829,7002,7018,7028,7036,7164,7167,7227,7231,7294,7295,7360,7367,8127,8129,8141,8143,8157,8159,8173,8175, |
| 250 | + 8189,8190,8208,8231,8240,8286,8314,8318,8330,8334,8352,8383,8448,8449,8451,8454,8456,8457,8470,8472,8478,8483,8506,8507, |
| 251 | + 8512,8516,8522,8525,8586,8587,8592,9254,9280,9290,9372,9449,9472,10101,10132,11123,11126,11157,11159,11263,11493,11498,11513,11516, |
| 252 | + 11518,11519,11776,11822,11824,11858,11904,11929,11931,12019,12032,12245,12272,12283,12289,12292,12296,12320,12342,12343,12349,12351,12443,12444, |
| 253 | + 12688,12689,12694,12703,12736,12771,12800,12830,12842,12871,12896,12927,12938,12976,12992,13311,19904,19967,42128,42182,42238,42239,42509,42511, |
| 254 | + 42738,42743,42752,42774,42784,42785,42889,42890,43048,43051,43062,43065,43124,43127,43214,43215,43256,43258,43310,43311,43457,43469,43486,43487, |
| 255 | + 43612,43615,43639,43641,43742,43743,43760,43761,43882,43883,64434,64449,64830,64831,65020,65021,65040,65049,65072,65106,65108,65126,65128,65131, |
| 256 | + 65281,65295,65306,65312,65339,65344,65371,65381,65504,65510,65512,65518,65532,65533,65792,65794,65847,65855,65913,65929,65932,65934,65936,65948, |
| 257 | + 66000,66044,67703,67704,68176,68184,68336,68342,68409,68415,68505,68508,69461,69465,69703,69709,69819,69820,69822,69825,69952,69955,70004,70005, |
| 258 | + 70085,70088,70109,70111,70200,70205,70731,70735,70746,70747,71105,71127,71233,71235,71264,71276,71484,71487,72004,72006,72255,72262,72346,72348, |
| 259 | + 72350,72354,72769,72773,72816,72817,73463,73464,73685,73713,74864,74868,92782,92783,92983,92991,92996,92997,93847,93850,118784,119029,119040,119078, |
| 260 | + 119081,119140,119146,119148,119171,119172,119180,119209,119214,119272,119296,119361,119552,119638,120832,121343,121399,121402,121453,121460,121462,121475,121477,121483, |
| 261 | + 125278,125279,126704,126705,126976,127019,127024,127123,127136,127150,127153,127167,127169,127183,127185,127221,127245,127405,127462,127490,127504,127547,127552,127560, |
| 262 | + 127568,127569,127584,127589,127744,128727,128736,128748,128752,128764,128768,128883,128896,128984,128992,129003,129024,129035,129040,129095,129104,129113,129120,129159, |
| 263 | + 129168,129197,129200,129201,129280,129400,129402,129483,129485,129619,129632,129645,129648,129652,129656,129658,129664,129670,129680,129704,129712,129718,129728,129730, |
| 264 | + 129744,129750,129792,129938,129940,129994} |
| 265 | + for i = 1, #ranges, 2 do for cp = ranges[i], ranges[i+1] do UNICODE_PUNCTUATION[cp] = true end end |
| 266 | + |
-- WHITESPACE (18, 5+2)
-- Non-ASCII whitespace: 18 codepoints in total, made up of the 5 codepoints
-- listed individually plus the 2 inclusive (first, last) ranges expanded below.
local UNICODE_WHITESPACE = newSet{ 160, 5760, 8239, 8287, 12288 }
local ranges = { 8192, 8202, 8232, 8233 }
for pairStart = 1, #ranges, 2 do
	local first, last = ranges[pairStart], ranges[pairStart + 1]
	for codepoint = first, last do
		UNICODE_WHITESPACE[codepoint] = true
	end
end
| 271 | + |
| 272 | + |
| 273 | + |
-- Classify a codepoint as "punctuation", "whitespace" or "word".
--
-- The ASCII sets are consulted first. Any other codepoint up to 127 is a
-- "word" character, so the Unicode sets are only consulted for cp > 127.
-- Anything found in none of the sets is a "word" character.
local function getCodepointCharType(cp)
	if ASCII_PUNCTUATION[cp] then
		return "punctuation"
	elseif ASCII_WHITESPACE[cp] then
		return "whitespace"
	elseif cp <= 127 then
		return "word"
	elseif UNICODE_PUNCTUATION[cp] then
		return "punctuation"
	elseif UNICODE_WHITESPACE[cp] then
		return "whitespace"
	end

	return "word"
end
|
243 | 282 |
|
243 | 282 | local codepoints = {} -- NOTE(review): presumably the array that the word-bound scan further down indexes as codepoints[pos] - confirm where it is filled.
|
|
265 | 304 | pos = pos + direction
|
266 | 305 |
|
267 | 306 | -- Check for end of string.
|
268 |
| - local prevC = codepoints[pos] |
269 |
| - local nextC = codepoints[pos+direction] |
| 307 | + local prevCp = codepoints[pos] |
| 308 | + local nextCp = codepoints[pos+direction] |
270 | 309 |
|
271 |
| - if not (prevC and nextC) then |
| 310 | + if not (prevCp and nextCp) then |
272 | 311 | pos = pos + direction
|
273 | 312 | break
|
274 | 313 | end
|
275 | 314 |
|
276 | 315 | -- Check for word bound.
|
277 |
| - local prevType = getCodepointCharType(prevC) |
278 |
| - local nextType = getCodepointCharType(nextC) |
| 316 | + local prevType = getCodepointCharType(prevCp) |
| 317 | + local nextType = getCodepointCharType(nextCp) |
279 | 318 |
|
280 | 319 | if nextType ~= prevType and not (nextType ~= "whitespace" and prevType == "whitespace") then
|
281 | 320 | if direction < 0 then pos = pos-1 end
|
|
0 commit comments