-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlexer.py
235 lines (207 loc) · 9.84 KB
/
lexer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
#Imports the sys module, which provides access to some variables maintained by the Python interpreter and to functions that interact with the interpreter.
import sys
# A class responsible for tokenizing the input source code.
class Lexer:
    """Scans a source string and produces a flat list of Token objects."""

    def __init__(self, input):
        """Initialize the lexer over *input* and prime the first character."""
        self.source = input   # raw source text being scanned
        self.curChar = ''     # character currently under examination
        self.curPos = -1      # index of curChar within self.source
        self.tokenList = []   # tokens accumulated by getTokens()
        self.error = None     # error slot (not set by this class itself)
        self.nextChar()       # load the first character

    def nextChar(self):
        """Advance to the next character; '\\0' signals end of input."""
        self.curPos += 1
        if self.curPos >= len(self.source):
            self.curChar = '\0'
        else:
            self.curChar = self.source[self.curPos]

    def peek(self):
        """Return the next character without consuming it ('\\0' at end)."""
        if self.curPos + 1 >= len(self.source):
            return '\0'
        return self.source[self.curPos + 1]

    def abort(self, message):
        """Exit the program with a lexing-error message."""
        sys.exit("Lexing error. " + message)

    def skipWhitespace(self):
        """Skip spaces and carriage returns (tabs and newlines are tokens)."""
        while self.curChar == ' ' or self.curChar == '\r':
            self.nextChar()

    def skipComment(self):
        """Skip a '#' comment up to (not including) the terminating newline.

        Bug fix: also stop at '\\0' so a comment on the final line without a
        trailing newline no longer loops forever.
        """
        if self.curChar == '#':
            while self.curChar != '\n' and self.curChar != '\0':
                self.nextChar()

    def getTokens(self):
        """Tokenize the whole source; returns the token list ending in TT_EOF."""
        while self.curChar != '\0':
            self.skipWhitespace()
            self.skipComment()
            # Bug fix: skipping may consume the rest of the input; previously
            # this fell through to the else branch and aborted on '\0'.
            if self.curChar == '\0':
                break
            token = None
            if self.curChar == '+':
                token = Token("TT_PLUS", self.curChar, self.curPos, self.curPos)
                self.tokenList.append(token)
            elif self.curChar == '-':
                token = Token("TT_MINUS", self.curChar, self.curPos, self.curPos)
                self.tokenList.append(token)
            elif self.curChar == '*':
                if self.peek() == '*':
                    lastChar = self.curChar
                    self.nextChar()
                    token = Token("TT_POW", lastChar + self.curChar, self.curPos - 1, self.curPos)
                else:
                    token = Token("TT_MULT", self.curChar, self.curPos, self.curPos)
                self.tokenList.append(token)
            elif self.curChar == '/':
                token = Token("TT_DIV", self.curChar, self.curPos, self.curPos)
                self.tokenList.append(token)
            elif self.curChar == '=':
                if self.peek() == '=':
                    lastChar = self.curChar
                    self.nextChar()
                    token = Token("TT_EQEQ", lastChar + self.curChar, self.curPos - 1, self.curPos)
                else:
                    token = Token("TT_EQ", self.curChar, self.curPos, self.curPos)
                self.tokenList.append(token)
            elif self.curChar == '>':
                if self.peek() == '=':
                    lastChar = self.curChar
                    self.nextChar()
                    token = Token("TT_GTEQ", lastChar + self.curChar, self.curPos - 1, self.curPos)
                else:
                    token = Token("TT_GT", self.curChar, self.curPos, self.curPos)
                self.tokenList.append(token)
            elif self.curChar == '<':
                if self.peek() == '=':
                    lastChar = self.curChar
                    self.nextChar()
                    token = Token("TT_LTEQ", lastChar + self.curChar, self.curPos - 1, self.curPos)
                else:
                    token = Token("TT_LT", self.curChar, self.curPos, self.curPos)
                self.tokenList.append(token)
            elif self.curChar == '!':
                if self.peek() == '=':
                    lastChar = self.curChar
                    self.nextChar()
                    # Bug fix: start position is the '!' (curPos - 1), matching
                    # the other two-character operators.
                    token = Token("TT_NTEQ", lastChar + self.curChar, self.curPos - 1, self.curPos)
                    self.tokenList.append(token)
                else:
                    self.abort("Expected !=, got !" + self.peek())
            elif self.curChar == '\"' or self.curChar == '\'':
                # Double- and single-quoted strings share identical handling;
                # the two original branches are merged on the opening quote.
                quote = self.curChar
                self.nextChar()
                startPos = self.curPos
                while self.curChar != quote:
                    # Bug fix: an unterminated string previously looped forever
                    # once '\0' was reached.
                    if self.curChar == '\0':
                        self.abort("Illegal character in string.")
                    if self.curChar == '\r' or self.curChar == '\n' or self.curChar == '\t' or self.curChar == '\\' or self.curChar == '%':
                        self.abort("Illegal character in string.")
                    self.nextChar()
                # Bug fix: slice ends at curPos (the closing quote) so the
                # token value no longer includes the closing quote character.
                tokText = self.source[startPos : self.curPos]
                token = Token("TT_STRING", tokText, startPos - 1, self.curPos)
                self.tokenList.append(token)
            elif self.curChar.isdigit():
                startPos = self.curPos
                while self.peek().isdigit():
                    self.nextChar()
                if self.peek() == '.':  # optional fractional part
                    self.nextChar()
                    if not self.peek().isdigit():
                        self.abort("Illegal character in number.")
                    while self.peek().isdigit():
                        self.nextChar()
                tokText = self.source[startPos : self.curPos + 1]
                token = Token("TT_NUMBER", tokText, startPos, self.curPos)
                self.tokenList.append(token)
            elif self.curChar.isalpha():
                startPos = self.curPos
                while self.peek().isalnum():
                    self.nextChar()
                # Reject identifiers glued onto a number (e.g. "1abc").
                # Bug fix: guard startPos > 0 so the lookbehind does not wrap
                # to self.source[-1] for an identifier at position 0.
                if startPos > 0 and self.source[startPos - 1].isdigit():
                    self.abort(f"Invalid Identifier: {self.source[startPos-1 : self.curPos +1]}")
                tokText = self.source[startPos : self.curPos + 1]
                keyword = isKeyWord(tokText)
                if not keyword:
                    token = IdentToken("TT_IDENT", tokText, startPos, self.curPos)
                else:
                    token = Token("TT_KEYW", tokText, startPos, self.curPos)
                self.tokenList.append(token)
            elif self.curChar == '(':
                token = Token("TT_LPAREN", self.curChar, self.curPos, self.curPos)
                self.tokenList.append(token)
            elif self.curChar == ')':
                token = Token("TT_RPAREN", self.curChar, self.curPos, self.curPos)
                self.tokenList.append(token)
            elif self.curChar == '[':
                token = Token("TT_LSQPAREN", self.curChar, self.curPos, self.curPos)
                self.tokenList.append(token)
            elif self.curChar == ']':
                token = Token("TT_RSQPAREN", self.curChar, self.curPos, self.curPos)
                self.tokenList.append(token)
            elif self.curChar == ':':
                token = Token("TT_COLON", self.curChar, self.curPos, self.curPos)
                self.tokenList.append(token)
            elif self.curChar == ',':
                token = Token("TT_COMMA", self.curChar, self.curPos, self.curPos)
                self.tokenList.append(token)
            elif self.curChar == '\t':
                token = Token("TT_TAB", '', self.curPos, self.curPos)
                self.tokenList.append(token)
            elif self.curChar == '\n':
                token = Token("TT_NWL", '', self.curPos, self.curPos)
                self.tokenList.append(token)
            else:
                self.abort("Unknown token: " + self.curChar)
            self.nextChar()
        self.tokenList.append(Token("TT_EOF"))
        return self.tokenList
# A class representing a lexical token with type, value, and position.
class Token:
    def __init__(self, type_, value=None, start=None, end=None):
        """Store the token's type tag, raw text, and source positions."""
        self.type = type_    # token type tag, e.g. "TT_PLUS"
        self.value = value   # raw token text (None for TT_EOF)
        self.start = start   # index of the token's first character
        self.end = end       # index of the token's last character

    def __repr__(self):
        """Readable form; truthiness kept so ''-valued tokens print bare."""
        if self.value: return f'{self.type}:\"{self.value}\"'
        return f'{self.type}'

    def read(self, obj):
        """Return the token's value; TT_NUMBER is converted (and cached) as float.

        *obj* is unused here; subclasses use it for storage lookups.
        Bug fix: compare against None instead of relying on truthiness, so a
        zero-valued TT_NUMBER (and ''-valued tokens) are returned correctly
        on every read instead of yielding None.
        """
        if self.value is None:
            return None
        if self.type == "TT_NUMBER":
            self.value = float(self.value)
        return self.value

# IdentToken is a subclass of Token for identifier tokens.
class IdentToken(Token):
    def __init__(self, type_, value=None, start=None, end=None):
        super().__init__(type_, value, start, end)

    def __repr__(self):
        if self.value:
            return f'{self.type}:\"{self.value}\"'
        return f'{self.type}'

    def read(self, obj):
        """Look up the identifier's value in obj.storage; exit if undefined.

        Bug fix: the original truth-tested the stored value, so a stored falsy
        value (0, '') silently returned None; it also used a bare `except:`
        and performed a double lookup. Now the value is returned directly and
        only a missing name / missing storage triggers the exit.
        """
        try:
            return obj.storage[self.value]
        except (AttributeError, KeyError):
            sys.exit(f"'{self.value}' doesn't exists")
# A function to check if a given token is a keyword.
def isKeyWord(token):
    """Return True when *token* is one of the language's reserved words."""
    return token in ("print",)