-
Notifications
You must be signed in to change notification settings - Fork 0
/
lexer.py
319 lines (265 loc) · 9.68 KB
/
lexer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
from enum import Enum, auto
from parse.source_location import SourceLocation
class Keywords(Enum):
    """Reserved words of the language.

    Each member's value is the exact spelling of the keyword as it appears
    in source text, so a word can be checked with a value lookup.
    """

    # Declarations and imports
    Let = 'let'

    # Conditionals
    If = 'if'
    Else = 'else'
    Elif = 'elif'

    # Functions and modules
    Func = 'func'
    Import = 'import'
    Return = 'return'

    # Loops
    While = 'while'
    For = 'for'
    In = 'in'

    # Meta-programming
    Macro = 'macro'
    Mixin = 'mixin'

    # Exception handling
    Try = 'try'
    Catch = 'catch'
    Finally = 'finally'
class TokenType(Enum):
    """Classification of a lexed token.

    Operator and punctuation members use their source spelling as the value,
    which lets ``get_type`` resolve them with a plain value lookup. The
    remaining members are categories that have no fixed spelling.
    """

    # Category members carry explicit integers instead of ``auto()``:
    # ``auto()`` cannot be mixed with string values on Python 3.13+ (it
    # raises TypeError while the class is being created). The numbers are
    # the ones the legacy ``auto()`` algorithm produced for this class.
    NoneToken = 1

    # Grouping
    LParen = '('
    RParen = ')'
    LBrace = '{'
    RBrace = '}'
    LBracket = '['
    RBracket = ']'

    # Arithmetic, assignment and punctuation
    Plus = '+'
    Minus = '-'
    Multiply = '*'
    Exponentiation = '**'
    Divide = '/'
    Equals = '='
    Semicolon = ';'
    Colon = ':'
    Dot = '.'
    Comma = ','
    Not = '!'
    Modulus = '%'

    # Comparison and logic
    LessThan = '<'
    LessThanEqual = '<='
    GreaterThan = '>'
    GreaterThanEqual = '>='
    And = '&&'
    Or = '||'

    # Bitwise
    BitwiseOr = '|'
    BitwiseAnd = '&'
    BitwiseXor = '^'
    BitwiseNot = '~'
    BitwiseLShift = '<<'
    BitwiseRShift = '>>'

    Compare = '=='
    NotCompare = '!='
    Spaceship = '<=>'
    Arrow = '->'

    # Compound assignment
    PlusEquals = '+='
    MinusEquals = '-='
    MultiplyEquals = '*='
    DivideEquals = '/='
    ModulusEquals = '%='
    BitwiseOrEquals = '|='
    BitwiseAndEquals = '&='
    BitwiseXorEquals = '^='
    BitwiseLShiftEquals = '<<='
    BitwiseRShiftEquals = '>>='

    # Categories without a fixed spelling
    Identifier = 2
    Number = 3
    String = 4
    Keyword = 5

    def get_type(self, value):
        """Classify the raw token text ``value``.

        This is written as a plain function and is invoked with the enum
        class passed explicitly, i.e. ``TokenType.get_type(TokenType, value)``.

        Args:
            self: The ``TokenType`` class (or a member); only used to reach
                the value-to-member map.
            value: Token text to classify.

        Returns:
            ``None`` for an empty string; the matching operator/punctuation
            member when ``value`` is one of the member values; otherwise
            ``Number``, ``String``, ``Keyword`` or ``Identifier``.
        """
        if value == '':
            return None

        # Exact spelling of an operator or punctuation member.
        if value in self._value2member_map_:
            return TokenType(value)

        first = value[0]

        # Anything starting with a digit or a dot is treated as a number
        # (a lone '.' was already resolved to Dot above).
        if first.isdigit() or first == '.':
            return TokenType.Number

        # The lexer keeps the surrounding quotes in the token text.
        if first in ('"', '\'') and value[-1] == first:
            return TokenType.String

        if value in Keywords._value2member_map_:
            return TokenType.Keyword

        # Nothing else matched, so it must be an identifier.
        return TokenType.Identifier

    def has_value(self, value):
        """Return True when ``value`` is the value of some member."""
        return value in self._value2member_map_
class LexerToken:
    """One token: its text, its classification and a location.

    Attributes are plain and public:
        type: The given ``token_type``, or the result of classifying
            ``value`` when no type was given.
        value: The token text exactly as passed in.
        location: A pair that starts out as ``(0, 0)``.
    """

    def __init__(self, value, token_type=None):
        # Only classify the text when the caller did not pin a type.
        self.type = (
            token_type
            if token_type is not None
            else TokenType.get_type(TokenType, value)
        )
        self.value = value
        self.location = (0, 0)

    def __str__(self):
        template = "LexerToken[Type:{0}, Value:'{1}']"
        return template.format(self.type, self.value)

    def __repr__(self):
        # The debug representation is the same text as the display form.
        return self.__str__()
# Shared sentinel token: empty text, explicitly typed as TokenType.NoneToken.
LexerToken.NONE = LexerToken('', TokenType.NoneToken)
class Lexer():
    """Scanner that splits source text into a list of ``LexerToken`` objects.

    The lexer walks ``data`` one character at a time, accumulating the
    pending token in ``token_data`` and updating ``source_location`` (row and
    column) as characters are consumed.
    """

    # Single characters that end the pending token and start an operator.
    SPLITABLES = "(){}[];:+-*/=.,!|&~<>^%"

    # Operators longer than one character, longest first so that the first
    # match is also the longest one.
    MULTICHAR_SPLITABLES = (
        '<=>', '<<=', '>>=',
        '**', '|=', '&=', '^=',
        '==', '!=', '<=', '>=',
        '+=', '-=', '*=', '/=',
        '%=', '->', '&&', '||',
        '<<', '>>',
    )

    # Character following a backslash inside a string -> replacement text.
    ESCAPE_CHARS = {
        'n': '\n',
        'b': '\b',
        't': '\t',
        'v': '\v',
        'a': '\a',
        'r': '\r',
        'f': '\f',
        # Kept as a literal backslash + 's' (spelled without the invalid
        # '\s' escape sequence, which newer Pythons warn about).
        's': '\\s',
        # NOTE: never matched at the moment, because only a single
        # character is read after the backslash.
        '033': '\033',
        '\\': '\\',
        '\'': '\'',
        '"': '"',
    }

    def __init__(self, data, source_location):
        """Prepare to lex ``data``.

        Args:
            data: The source text.
            source_location: Object whose ``row`` and ``col`` attributes are
                reset to 1 here and advanced while reading; its ``col_row``
                attribute is copied onto every pushed token.
        """
        self.tokens = []
        self.data = data
        self.token_data = ""
        self.index = 0
        self.source_location = source_location
        # Positions reported for error handling are 1-based.
        self.source_location.row = 1
        self.source_location.col = 1

    def read_char(self, amt=1):
        """Consume ``amt`` characters and return the first one.

        Returns '' (consuming nothing) when fewer than ``amt`` characters
        remain. Row/column tracking accounts for every consumed character,
        including newlines in the middle of the consumed span.
        """
        end = self.index + amt
        if end > len(self.data):
            return ''
        first = self.data[self.index]
        location = self.source_location
        for ch in self.data[self.index:end]:
            if ch == '\n':
                location.col = 1
                location.row += 1
            else:
                location.col += 1
        self.index = end
        return first

    def peek_char(self, offset=1):
        """Return the character ``offset`` places from the read position.

        Nothing is consumed; '' is returned when the position is outside
        the buffer (in either direction).
        """
        idx = self.index + offset
        if idx < 0 or idx >= len(self.data):
            return ''
        return self.data[idx]

    def push_token(self):
        """Turn the pending ``token_data`` into a token and reset it.

        Raises:
            ValueError: If there is no pending token text.
        """
        if self.token_data == '':
            raise ValueError('tokendata blank')
        token = LexerToken(self.token_data)
        token.location = self.source_location.col_row
        self.tokens.append(token)
        self.token_data = ""

    def skip_whitespace(self):
        """Consume consecutive whitespace; return True if any was consumed."""
        skipped = False
        while self.peek_char(0).isspace():
            self.read_char()
            skipped = True
        return skipped

    def _at_comment_start(self):
        """Return True when the read position starts '#', '//' or '/*'."""
        current = self.peek_char(0)
        if current == '#':
            return True
        return current == '/' and self.peek_char(1) in ('/', '*')

    def _skip_comment(self):
        """Consume one comment starting at the read position.

        '#*' ... '*#' and '/*' ... '*/' are block comments; any other '#' or
        '//' comment runs to the end of the line. An unterminated block
        comment extends to the end of the input.
        """
        opener = self.read_char()
        if self.peek_char(0) == '*':
            self.read_char()  # the '*' that opens the block comment
            closer = '*' + opener
            found = self.data.find(closer, self.index)
            stop = len(self.data) if found == -1 else found + len(closer)
            # Go through read_char so row/column stay correct.
            while self.index < stop:
                self.read_char()
            return
        # Line comment: consume through the newline, or to the end of input.
        while self.read_char() not in ('\n', ''):
            pass

    def _read_escape(self):
        """Consume a backslash escape inside a string and append its value."""
        self.read_char()  # the backslash itself
        escape_char = self.read_char()
        if escape_char in self.ESCAPE_CHARS:
            self.token_data += self.ESCAPE_CHARS[escape_char]
        else:
            print("Error: Unknown escape character '{}'".format(escape_char))

    def _read_operator(self):
        """Consume one operator into ``token_data`` (longest match wins)."""
        for tok in self.MULTICHAR_SPLITABLES:
            if self.data.startswith(tok, self.index):
                self.read_char(len(tok))
                self.token_data = tok
                return
        self.token_data = self.read_char()

    def _read_number(self):
        """Consume an integer or float literal into ``token_data``."""
        is_float = False
        while self.peek_char(0).isdigit():
            self.token_data += self.read_char()
            if not is_float and self.peek_char(0) == '.':
                # A dot not followed by a digit is not a fraction; it could
                # be something like `1.to_str()`.
                if not self.peek_char(1).isdigit():
                    break
                self.token_data += self.read_char()
                is_float = True

    def lex(self):
        """Tokenize the whole input and return ``self.tokens``.

        String tokens keep their surrounding quotes; escape sequences inside
        them are replaced using ``ESCAPE_CHARS``. Comments and whitespace
        outside strings produce no tokens.
        """
        self.skip_whitespace()
        string_type = None  # the quote character of the open string, if any

        while self.peek_char(0) != '':
            current = self.peek_char(0)

            if string_type is not None:
                if current == '\\':
                    self._read_escape()
                    continue
                # Only the quote that opened the string can close it.
                if current == string_type:
                    string_type = None
                self.token_data += self.read_char()
                continue

            if self._at_comment_start():
                # Flush text directly before the comment so it does not
                # merge with whatever follows the comment.
                if self.token_data:
                    self.push_token()
                self._skip_comment()
                self.skip_whitespace()
                continue

            if self.skip_whitespace():
                # Whitespace outside a string ends the pending token.
                if self.token_data:
                    self.push_token()
                continue

            if current in self.SPLITABLES:
                if self.token_data:
                    self.push_token()
                self._read_operator()
                self.push_token()
                self.skip_whitespace()
                continue

            # A digit only starts a number when no token is in progress;
            # otherwise it is part of an identifier such as `a1b`.
            if current.isdigit() and not self.token_data:
                self._read_number()
                self.push_token()
                self.skip_whitespace()
                continue

            if current in ('"', '\''):
                string_type = current
            self.token_data += self.read_char()

        # Still some data left in token_data: push it as the final token.
        if self.token_data != '':
            self.push_token()
        return self.tokens