-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathregex_lexer.py
141 lines (103 loc) · 2.76 KB
/
regex_lexer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
from typing import Iterator
from curses.ascii import isalpha
class Token:
    """Base class for everything the lexer emits.

    Subclasses are expected to override ``__repr__``; the base version
    raises.  ``__str__`` simply reuses ``repr`` so both conversions agree.
    """

    def __str__(self) -> str:
        # One rendering per token: str() is defined in terms of repr().
        return repr(self)

    def __repr__(self) -> str:
        # No default rendering exists for the abstract base.
        raise NotImplementedError()
class SymbolT(Token):
    """Token that carries a symbol.

    The lexer in this file creates one from each run of consecutive
    symbol characters, passing the joined text as ``symbol``.
    """

    def __init__(self, symbol) -> None:
        super().__init__()
        # Stored as given; rendered inside ``S<...>`` by __repr__.
        self.symbol = symbol

    def __repr__(self) -> str:
        return f'S<{self.symbol}>'
class RepeatT(Token):
    """Token the lexer emits for a ``*`` character."""

    def __repr__(self) -> str:
        return '*'
class PlusT(Token):
    """Token the lexer emits for a ``+`` character."""

    def __repr__(self) -> str:
        return '+'
class ThenT(Token):
    """Token the lexer emits for a ``>`` character."""

    def __repr__(self) -> str:
        return '>'
class OrT(Token):
    """Token the lexer emits for a ``|`` character."""

    def __repr__(self) -> str:
        return '|'
class NotT(Token):
    """Token the lexer emits for a ``!`` character."""

    def __repr__(self) -> str:
        return '!'
class AnyT(Token):
    """Token the lexer emits for a ``.`` character."""

    def __repr__(self) -> str:
        return '.'
class NonappearT(Token):
    """Token the lexer emits for a ``_`` that is not part of a symbol."""

    def __repr__(self) -> str:
        return '_'
class NonemptyT(Token):
    """Token the lexer emits for a ``:`` that is not part of a symbol."""

    def __repr__(self) -> str:
        return ':'
class ComplementT(Token):
    """Token the lexer emits for a ``~`` character."""

    def __repr__(self) -> str:
        return '~'
class AndT(Token):
    """Token the lexer emits for a ``&`` character."""

    def __repr__(self) -> str:
        return '&'
class OpenT(Token):
    """Token the lexer emits for a ``(`` character."""

    def __repr__(self) -> str:
        return '('
class CloseT(Token):
    """Token the lexer emits for a ``)`` character."""

    def __repr__(self) -> str:
        return ')'
class OpenMulT(Token):
    """Token the lexer emits for a ``{`` character."""

    def __repr__(self) -> str:
        return '{'
class CloseMulT(Token):
    """Token the lexer emits for a ``}`` character."""

    def __repr__(self) -> str:
        return '}'
class EndT(Token):
    """Token the lexer yields last, after all input has been consumed."""

    def __repr__(self) -> str:
        return 'End'
def lex(src: str) -> Iterator['Token']:
    """Split a source string into a stream of tokens.

    Characters are consumed left to right:

    * ASCII letters, numeric characters (``str.isnumeric``), ``$`` and ``#``
      start or extend a symbol.
    * ``_`` and ``:`` extend a symbol that is already in progress; on their
      own they are operator tokens (``NonappearT`` / ``NonemptyT``).
    * A space ends any symbol in progress and produces no token.
    * Every other recognised character maps to exactly one operator token.

    Args:
        src: The text to tokenize.

    Yields:
        A ``SymbolT`` for each run of symbol characters, one operator token
        per operator character, and finally a single ``EndT``.

    Raises:
        ValueError: If ``src`` contains a character that is neither a symbol
            character, a space, nor a known operator.  The message names the
            character, its index and the full source, so nothing has to be
            printed to stdout to diagnose the failure.
    """
    symbol_buffer = []
    for index, c in enumerate(src):
        # `_` and `:` are only symbol characters when a symbol is already
        # being collected; otherwise they fall through to the operators.
        if isalpha(c) or c.isnumeric() or c in '$#' or (c in '_:' and symbol_buffer):
            symbol_buffer.append(c)
            continue

        # Any non-symbol character terminates the symbol in progress.
        if symbol_buffer:
            yield SymbolT(''.join(symbol_buffer))
            symbol_buffer = []

        if c == ' ':
            continue
        elif c == '|':
            yield OrT()
        elif c == '>':
            yield ThenT()
        elif c == '&':
            yield AndT()
        elif c == '!':
            yield NotT()
        elif c == '.':
            yield AnyT()
        elif c == '_':
            yield NonappearT()
        elif c == ':':
            yield NonemptyT()
        elif c == '*':
            yield RepeatT()
        elif c == '+':
            yield PlusT()
        elif c == '~':
            yield ComplementT()
        elif c == '(':
            yield OpenT()
        elif c == ')':
            yield CloseT()
        elif c == '{':
            yield OpenMulT()
        elif c == '}':
            yield CloseMulT()
        else:
            # Carry the context in the exception instead of printing it.
            raise ValueError(
                f'unrecognized character `{c}` at index {index} in {src!r}'
            )

    # Flush a symbol that runs up to the end of the input.
    if symbol_buffer:
        yield SymbolT(''.join(symbol_buffer))
    yield EndT()