-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathlex.go
553 lines (492 loc) · 12.9 KB
/
lex.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
package parser
import (
"fmt"
"strconv"
"strings"
"unicode"
"unicode/utf16"
"unicode/utf8"
)
// token represents a single token in the input stream.
// Name: mnemonic name (numeric).
// Val: string value of the token from the original stream.
// Pos: position - offset from beginning of stream.
type token struct {
// tok identifies the token, either a rune or a negative number
// representing a special token.
tok rune
// val contains the string representation of the token. When tok is
// goString, val will be a fully-parsed Go string. When tok is invalid,
// val will be the error message.
val string
// pos represents the position of the start of the token.
pos int
}
const (
// Non-rune special tokens are negative.
invalid = -(iota + 1)
eof
identifier
integer
number
goString
blankSpace
boolTrue
boolFalse
jsonNull
)
// Special values.
const (
// Numeric bases.
decimal = 10
hex = 16
// Token literals.
nullByte = 0x00
backslash = '\\'
)
// name returns the token name, a string for special tokens and a quoted rune
// or escaped rune.
func (tok token) name() string {
switch tok.tok {
case invalid:
return "invalid"
case eof:
return "eof"
case identifier:
return "identifier"
case integer:
return "integer"
case number:
return "number"
case goString:
return "string"
case blankSpace:
return "blank space"
default:
return strconv.QuoteRune(tok.tok)
}
}
// String returns a string representation of the token.
func (tok token) String() string {
return fmt.Sprintf("Token{%v, %q, %v}", tok.name(), tok.val, tok.pos)
}
// err returns an error for invalid tokens and nil for all other tokens.
func (tok token) err() error {
if tok.tok != invalid {
return nil
}
return fmt.Errorf("%w: %v at %v", ErrPathParse, tok.val, tok.pos)
}
// errToken creates and returns an error token.
func (lex *lexer) errToken(pos int, msg string) token {
// Set r to invalid so scan() ceases to scan.
lex.r = invalid
return token{invalid, msg, pos}
}
// lexer for [RFC 9535] JSONPath queries.
//
// Create a new lexer with NewLexer and then call NextToken repeatedly to get
// tokens from the stream. The lexer will return a token with the name EOF when
// done.
//
// Based on the public domain [TableGen lexer] by [Eli Bendersky].
//
// [RFC 9535]: https://www.rfc-editor.org/rfc/rfc9535.html
// [TableGen lexer]: https://github.com/eliben/code-for-blog/blob/main/2014/tablegen-lexer-go/lexer-string/lexer.go
// [Eli Bendersky]: https://eli.thegreenplace.net
type lexer struct {
buf string
// Current rune.
r rune
// Position of the current rune in buf.
rPos int
// Position of the next rune in buf.
nextPos int
// Last scanned token.
prev token
}
// newLexer creates a new lexer for the given input.
func newLexer(buf string) *lexer {
lex := lexer{buf, -1, 0, 0, token{}}
// Prime the lexer by calling .next
lex.next()
return &lex
}
// scan returns the next token.
func (lex *lexer) scan() token {
switch {
case lex.r < 0:
lex.prev = token{eof, "", lex.rPos}
case lex.r == '$':
if isIdentRune(lex.peek(), 0) {
lex.prev = lex.scanIdentifier()
} else {
lex.prev = token{lex.r, "", lex.rPos}
lex.next()
}
case isIdentRune(lex.r, 0):
lex.prev = lex.scanIdentifier()
case isDigit(lex.r) || lex.r == '-':
lex.prev = lex.scanNumber()
case lex.r == '"' || lex.r == '\'':
lex.prev = lex.scanString()
case lex.isBlankSpace(lex.r):
lex.prev = lex.scanBlankSpace()
default:
lex.prev = token{lex.r, "", lex.rPos}
lex.next()
}
return lex.prev
}
// next advances the lexer's internal state to point to the next rune in the
// input.
func (lex *lexer) next() rune {
if lex.nextPos < len(lex.buf) {
lex.rPos = lex.nextPos
r, w := rune(lex.buf[lex.nextPos]), 1
if r >= utf8.RuneSelf {
r, w = utf8.DecodeRuneInString(lex.buf[lex.nextPos:])
}
lex.nextPos += w
lex.r = r
} else {
lex.rPos = len(lex.buf)
lex.r = eof
}
return lex.r
}
// peek returns the next byte in the stream (the one after lex.r).
// Note: a single byte is peeked at - if there's a rune longer than a byte
// there, only its first byte is returned. Returns eof if there is no next
// byte.
func (lex *lexer) peek() rune {
if lex.nextPos < len(lex.buf) {
return rune(lex.buf[lex.nextPos])
}
return rune(eof)
}
// scanBlankSpace scan and returns a token of blank spaces.
func (lex *lexer) scanBlankSpace() token {
startPos := lex.rPos
for lex.isBlankSpace(lex.r) {
lex.next()
}
return token{blankSpace, lex.buf[startPos:lex.rPos], startPos}
}
// skipBlankSpace skips blank spaces from the current position until the next
// non-blank space token.
func (lex *lexer) skipBlankSpace() rune {
if lex.isBlankSpace(lex.r) {
lex.scan()
}
return lex.r
}
// isBlankSpace returns true if r is blank space.
func (lex *lexer) isBlankSpace(r rune) bool {
switch r {
case '\t', '\n', '\r', ' ':
return true
}
return false
}
// peekPastBlankSpace returns the next non-blank space rune from the current
// position without advancing the internal state.
func (lex *lexer) peekPastBlankSpace() rune {
np := lex.nextPos
for np < len(lex.buf) {
r := rune(lex.buf[np])
if !lex.isBlankSpace(r) {
return r
}
np++
}
return rune(eof)
}
// scanIdentifier scans an identifier, including shorthand names and
// constants. lex.r should be the first rune in the identifier, and
// isIdentRune(lex.r, 0) should have already returned true.
func (lex *lexer) scanIdentifier() token {
buf := new(strings.Builder)
startPos := lex.rPos
escaped := false
// Scan the identifier as long as we have legit identifier runes.
for isIdentRune(lex.r, 1) {
buf.WriteRune(lex.r)
lex.next()
}
tok := token{identifier, buf.String(), startPos}
if !escaped {
// Set proper name for literals.
switch tok.val {
case "true":
tok.tok = boolTrue
case "false":
tok.tok = boolFalse
case "null":
tok.tok = jsonNull
}
}
return tok
}
// isIdentRune is a predicate controlling the characters accepted as the ith
// rune in an identifier. These follow JSONPath [shorthand notation syntax].
//
// [shorthand notation syntax]: https://www.rfc-editor.org/rfc/rfc9535.html#section-2.5.1.1-2
func isIdentRune(r rune, i int) bool {
if i == 0 && ('0' <= r && r <= '9') {
return false
}
return (r >= 'a' && r <= 'z') ||
('A' <= r && r <= 'Z') ||
r == '_' ||
('0' <= r && r <= '9') ||
(0x80 <= r && r <= 0xd7ff) ||
(0xE000 <= r && r <= 0x10FFFF)
}
// scanNumber scans an integer or decimal number.
func (lex *lexer) scanNumber() token {
startPos := lex.rPos
// Start with integer.
switch lex.r {
case '0':
next := lex.next()
if next != '.' && next != 'e' {
if isDigit(next) {
// No leading zeros for integers.
return lex.errToken(startPos, "invalid number literal")
}
// Standalone zero.
return token{integer, "0", startPos}
}
case '-':
// ["-"] DIGIT1 *DIGIT / (int / "-0") [ frac ] [ exp ]
next := lex.next()
switch {
case next == '0':
if isDigit(lex.peek()) {
// No leading zeros.
return lex.errToken(startPos, "invalid number literal")
}
case !isDigit(next):
// Need digits after decimal.
return lex.errToken(startPos, "invalid number literal")
}
fallthrough
default:
for isDigit(lex.r) {
lex.next()
}
}
// We have the integer part. Is there a decimal and/or exponent?
switch lex.r {
case '.':
// Parse fraction: "." 1*DIGIT
if !isDigit(lex.next()) {
// No digits after decimal.
return lex.errToken(startPos, "invalid number literal")
}
// Collect remaining digits.
for isDigit(lex.r) {
lex.next()
}
switch lex.r {
case 'e', 'E':
// Exponent.
return lex.scanExponent(startPos)
}
return token{number, lex.buf[startPos:lex.rPos], startPos}
case 'e', 'E':
// Exponent.
return lex.scanExponent(startPos)
default:
// Just an integer.
return token{integer, lex.buf[startPos:lex.rPos], startPos}
}
}
// scanExponent scans the exponent part of a decimal number. lex.r should be
// set to 'e' and expect to be followed by an optional '-' or '+' and then one
// or more digits.
func (lex *lexer) scanExponent(startPos int) token {
// Parse exponent: "e" [ "-" / "+" ] 1*DIGIT
switch lex.next() {
case '-', '+':
if !isDigit(lex.next()) {
// No digit after + or -
return lex.errToken(startPos, "invalid number literal")
}
default:
if !isDigit(lex.r) {
// No digit after e
return lex.errToken(startPos, "invalid number literal")
}
}
// Consume remaining digits.
for isDigit(lex.r) {
lex.next()
}
return token{number, lex.buf[startPos:lex.rPos], startPos}
}
// scanString scans and parses a single- or double-quoted JavaScript string.
// Token.Val contains the parsed value. Returns an error token on error.
func (lex *lexer) scanString() token {
startPos := lex.rPos
q := lex.r
lex.next()
buf := new(strings.Builder)
NEXT:
for lex.r > 0 {
switch {
case isUnescaped(lex.r, q):
// Regular character.
buf.WriteRune(lex.r)
lex.next()
case lex.r == '\\':
if !lex.writeEscape(q, buf) {
pos := lex.rPos
lex.next()
return lex.errToken(pos, "invalid escape after backslash")
}
default:
// End of string or buffer.
break NEXT
}
}
if lex.r != q {
return lex.errToken(lex.rPos, "unterminated string literal")
}
lex.next()
return token{goString, buf.String(), startPos}
}
// writeEscape handles string escapes in the context of a string. Set q to '
// or " to indicate a single or double-quoted string context, respectively,
// and -1 for an identifier. lex.r should be set to the rune following the
// backslash that initiated the escape.
func (lex *lexer) writeEscape(q rune, buf *strings.Builder) bool {
// Starting an escape sequence.
next := lex.next()
if r := unescape(next, q); r > 0 {
// A single-character escape.
buf.WriteRune(r)
lex.next()
return true
}
if next == '\u0075' { // uXXXX U+XXXX
// \uXXXX unicode escape.
if r := lex.parseUnicode(); r >= 0 {
buf.WriteRune(r)
lex.next()
return true
}
}
return false
}
// Returns true if r is a regular, non-escaped character. Pass q as " or ' to
// indicate the scanning of a double or single quotation string.
func isUnescaped(r rune, q rune) bool {
switch {
case r == q:
return false
case r >= '\u0020' && r <= '\u005b': // omit 0x5C \
return true
case r >= '\u005d' && r <= '\ud7ff': // skip surrogate code points
return true
case r >= '\ue000' && r <= '\U0010FFFF':
return true
default:
return false
}
}
// Returns the value corresponding to escape code r. Pass q as " or ' to
// indicate the scanning of a double or single quotation string.
func unescape(r rune, q rune) rune {
switch r {
case q: // ' or "
return q
case '\u0062': // b BS backspace U+0008
return '\u0008'
case '\u0066': // f FF form feed U+000C
return '\u000c'
case '\u006e': // n LF line feed U+000A
return '\u000a'
case '\u0072': // r CR carriage return U+000D
return '\u000d'
case '\u0074': // t HT horizontal tab U+0009
return '\u0009'
case '/': // / slash (solidus) U+002F
return '/'
case '\\': // \ backslash (reverse solidus) U+005C
return '\\'
default:
return 0
}
}
// Parses a \u unicode escape sequence. Returns invalid (-1) on error.
func (lex *lexer) parseUnicode() rune {
if !isHexDigit(lex.next()) {
return rune(invalid)
}
if lex.r != 'd' && lex.r != 'D' {
// non-surrogate DIGIT / "A"/"B"/"C" / "E"/"F"
return lex.scanUnicode()
}
switch lex.peek() {
case '0', '1', '2', '3', '4', '5', '6', '7':
// non-surrogate "D" %x30-37 2HEXDIG
return lex.scanUnicode()
}
// potential high-surrogate "D" ("8"/"9"/"A"/"B") 2HEXDIG
high := lex.scanUnicode()
// Must be followed by \u
if high < nullByte || lex.next() != '\\' || lex.next() != 'u' {
return rune(invalid)
}
// potential low-surrogate "D" ("C"/"D"/"E"/"F") 2HEXDIG
lex.next()
low := lex.scanUnicode()
if low < nullByte {
return rune(invalid)
}
// Merge and return the surrogate pair, if valid.
if dec := utf16.DecodeRune(high, low); dec != unicode.ReplacementChar {
return dec
}
// Adjust position to start of second \u escape for error reporting.
lex.rPos -= 3
return rune(invalid)
}
// scanUnicode scans the current rune plus the next four and merges them into
// the resulting unicode rune. Returns invalid (-1) on error.
func (lex *lexer) scanUnicode() rune {
rr := hexChar(lex.r)
for i := range 3 {
c := hexChar(lex.next())
if c < nullByte {
// Reset to before first rune for error reporting.
lex.rPos -= i + 1
return c
}
rr = rr*hex + c
}
return rr
}
// isHexDigit returns true if r represents a hex digit matching [0-9a-fA-F].
func isHexDigit(r rune) bool {
return isDigit(r) || (r >= 'A' && r <= 'F') || (r >= 'a' && r <= 'f')
}
// hexChar returns the byte value corresponding to hex character c.
func hexChar(c rune) rune {
switch {
case '0' <= c && c <= '9':
return c - '0'
case 'a' <= c && c <= 'f':
return c - 'a' + decimal
case 'A' <= c && c <= 'F':
return c - 'A' + decimal
default:
return rune(invalid)
}
}
// isDigit returns true if r is a digit ([0-9]).
func isDigit(r rune) bool {
return '0' <= r && r <= '9'
}