lexer.py
"""Lexer: converts source text into a flat list of (token_type, text) pairs."""

import re

# Token definitions. Order matters: multi-character operators (e.g. "++:",
# "==", ">=", "**", "//") must appear before their single-character prefixes
# ("+", "=", ">", "*", "/"), and keywords before the general IDENTIFIER rule.
# Keyword patterns use \b word boundaries so that identifiers such as
# "order" or "format" are not split into a keyword plus a trailing identifier.
TOKEN_TYPES = [
    ("NUMBER", r"\d+"),
    ("STRING", r'"([^"\\]|\\.)*"|\'([^\'\\]|\\.)*\''),
    ("BOOLEAN", r"\b(True|False)\b"),
    ("STARTBLOCK", r"{"),
    ("ENDBLOCK", r"}"),
    ("OR", r"\bor\b"),
    ("AND", r"\band\b"),
    ("NOT", r"\bnot\b"),
    ("PLUSPLUSTILL", r"\+\+:"),
    ("ISEQUAL", r"=="),
    ("NOTEQUAL", r"!="),
    ("GREATEREQUAL", r">="),
    ("LESSEQUAL", r"<="),
    ("GREATER", r">"),
    ("LESS", r"<"),
    ("POWER", r"\*\*"),
    ("SQUAREROOT", r"//"),
    ("PLUS", r"\+"),
    ("MINUS", r"-"),
    ("TIMES", r"\*"),
    ("DIVIDE", r"/"),
    ("MODULAS", r"%"),
    ("EQUALS", r"="),
    ("PRINT", r"\bsay\b"),
    ("FOR", r"\bfor\b"),
    ("BREAK", r";"),
    ("WHILE", r"\bwhile\b"),
    ("IF", r"\bif\b"),
    ("ELSE", r"\belse\b"),
    ("IDENTIFIER", r"[a-zA-Z_][a-zA-Z_0-9]*"),
    ("LEFTPAREN", r"\("),
    ("RIGHTPAREN", r"\)"),
    ("NEWLINE", r"\n"),
    ("SKIP", r"[ \t]+"),
    ("COMMENT", r"#.*"),
    ("MISMATCH", r"."),
]

# One combined pattern with a named group per token type; alternatives are
# tried in list order, so the ordering rules above carry over.
master_pattern = re.compile("|".join(f"(?P<{name}>{pattern})" for name, pattern in TOKEN_TYPES))


class Lexer:
    def __init__(self, source_code):
        self.source_code = source_code
        self.tokens = []
        self.tokenize()

    def tokenize(self):
        # Scan the source with the combined pattern; match.lastgroup names
        # the token type whose alternative matched.
        for match in master_pattern.finditer(self.source_code):
            token_type = match.lastgroup
            text = match.group()
            if token_type in ("SKIP", "COMMENT", "NEWLINE"):
                # Skip whitespace, comments, and newlines (not added to tokens).
                continue
            if token_type == "MISMATCH":
                raise SyntaxError(f"Unexpected character: {text!r} at position {match.start()}")
            self.tokens.append((token_type, text))

    def get_tokens(self):
        # Return the collected (token_type, text) pairs.
        return self.tokens
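

# A minimal usage sketch. The snippet below is a hypothetical input, chosen
# only to exercise a few of the token rules defined above ("say" is this
# language's print keyword):
if __name__ == "__main__":
    source = 'say "hello"\nx = 1 + 2  # a comment\nif x >= 3 { say x }'
    for token in Lexer(source).get_tokens():
        print(token)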