from enum import Enum, auto

from parse.source_location import SourceLocation


class Keywords(Enum):
    Let = 'let'
    If = 'if'
    Else = 'else'
    Elif = 'elif'
    Func = 'func'
    Import = 'import'
    Return = 'return'
    While = 'while'
    For = 'for'
    In = 'in'
    Macro = 'macro'
    Mixin = 'mixin'
    Try = 'try'
    Catch = 'catch'
    Finally = 'finally'


class TokenType(Enum):
    NoneToken = auto()

    LParen = '('
    RParen = ')'
    LBrace = '{'
    RBrace = '}'
    LBracket = '['
    RBracket = ']'

    Plus = '+'
    Minus = '-'
    Multiply = '*'
    Exponentiation = '**'
    Divide = '/'
    Equals = '='
    Semicolon = ';'
    Colon = ':'
    Dot = '.'
    Comma = ','
    Not = '!'
    Question = '?'
    Modulus = '%'

    LessThan = '<'
    LessThanEqual = '<='
    GreaterThan = '>'
    GreaterThanEqual = '>='

    And = '&&'
    Or = '||'

    BitwiseOr = '|'
    BitwiseAnd = '&'
    BitwiseXor = '^'
    BitwiseNot = '~'
    BitwiseLShift = '<<'
    BitwiseRShift = '>>'

    Compare = '=='
    NotCompare = '!='
    Spaceship = '<=>'

    Arrow = '->'

    PlusEquals = '+='
    MinusEquals = '-='
    MultiplyEquals = '*='
    DivideEquals = '/='
    ModulusEquals = '%='
    BitwiseOrEquals = '|='
    BitwiseAndEquals = '&='
    BitwiseXorEquals = '^='
    BitwiseLShiftEquals = '<<='
    BitwiseRShiftEquals = '>>='

    Identifier = auto()
    Number = auto()
    String = auto()
    Keyword = auto()

    @staticmethod
    def get_type(value):
        """Classify a raw token string into a TokenType."""
        if value == '':
            return None

        # exact operator/punctuation match
        if value in TokenType._value2member_map_:
            return TokenType(value)

        # numeric literal: integer, hex (0x...), or float
        if value[0].isdigit() or value[0] == '.':
            return TokenType.Number

        # string literal (still wrapped in its quotes at this point)
        if (value[0] == '"' and value[-1] == '"') or (value[0] == '\'' and value[-1] == '\''):
            return TokenType.String

        # reserved word
        if value in Keywords._value2member_map_:
            return TokenType.Keyword

        # anything else must be an identifier
        return TokenType.Identifier

    def has_value(self, value):
        # check whether the raw string maps to a member of this enum
        return value in self._value2member_map_


class LexerToken():
    def __init__(self, value, token_type=None):
        # classify the token from its raw text unless a type is given explicitly
        if token_type is None:
            self.type = TokenType.get_type(value)
        else:
            self.type = token_type
        self.value = value
        self.location = (0, 0)

    def __str__(self):
        return "LexerToken[Type:{0}, Value:'{1}']".format(self.type, self.value)

    def __repr__(self):
        return self.__str__()

# sentinel token representing 'no token'
LexerToken.NONE = LexerToken('', TokenType.NoneToken)


class Lexer():
    def __init__(self, data, source_location):
        self.tokens = []
        self.data = data
        self.token_data = ""
        self.index = 0

        # row/column tracking for error reporting
        self.source_location = source_location
        self.source_location.row = 1
        self.source_location.col = 1

    # return the current character and advance through the buffer
    def read_char(self, amt=1):
        if self.index + amt > len(self.data):
            return ''
        rval = self.data[self.index]
        self.index += amt
        self.source_location.col += 1
        if rval == '\n':
            self.source_location.col = 1
            self.source_location.row += 1
        return rval

    # return a character relative to the current index without advancing
    def peek_char(self, offset=1):
        idx = self.index + offset
        if idx >= len(self.data) or idx < 0:
            return ''
        return self.data[idx]

    def push_token(self):
        if self.token_data == '':
            raise Exception('token data is blank')
        token = LexerToken(self.token_data)
        token.location = self.source_location.col_row
        self.tokens.append(token)
        self.token_data = ""

    def skip_whitespace(self):
        if not self.peek_char(0).isspace():
            return False
        while self.peek_char(0).isspace():
            self.read_char()
        return True

    def lex(self):
        splitables = "(){}[];:+-*/=.,!?|&~<>^%"
        # longest operators first, so '<<=' is not matched as '<<' then '='
        multichar_splitables = [
            '<=>', '<<=', '>>=',
            '**', '==', '!=', '<=', '>=',
            '+=', '-=', '*=', '/=', '%=',
            '|=', '&=', '^=',
            '->', '&&', '||', '<<', '>>',
        ]
        # escape sequences recognised inside string literals; only
        # single-character escapes can match, since exactly one character
        # is read after the backslash
        escape_chars = {
            'n': '\n',
            'b': '\b',
            't': '\t',
            'v': '\v',
            'a': '\a',
            'r': '\r',
            'f': '\f',
            '\\': '\\',
            '\'': '\'',
            '"': '"',
        }

        self.skip_whitespace()

        # quote character of the string literal currently being read, if any
        string_type = None

        while self.peek_char(0) != '':
            # escape sequences inside string literals
            if string_type and self.peek_char(0) == '\\':
                # skip the backslash
                self.read_char()
                escape_char = self.read_char()
                if escape_char in escape_chars:
                    self.token_data += escape_chars[escape_char]
                else:
                    print("Error: Unknown escape character '{}'".format(escape_char))
                continue

            # comments: '#', '//', '#* ... *#', and '/* ... */'
            if string_type is None and (self.peek_char(0) == '#'
                    or (self.peek_char(0) == '/' and self.peek_char(1) == '/')
                    or (self.peek_char(0) == '/' and self.peek_char(1) == '*')):
                self.read_char()

                if self.peek_char(-1) == '#' and self.peek_char(0) == '*':
                    # multiline comment: skip the '*' character
                    self.read_char()
                    # read until the closing '*#' (or EOF)
                    while self.peek_char(0) != '' and not (self.peek_char(0) == '*' and self.peek_char(1) == '#'):
                        self.read_char()
                    # skip the '*#' characters
                    self.read_char()
                    self.read_char()
                    # push any pending token and skip trailing whitespace
                    if self.token_data != '':
                        self.push_token()
                    self.skip_whitespace()
                elif self.peek_char(-1) == '/' and self.peek_char(0) == '*':
                    # multiline comment: skip the '*' character
                    self.read_char()
                    # read until the closing '*/' (or EOF)
                    while self.peek_char(0) != '' and not (self.peek_char(0) == '*' and self.peek_char(1) == '/'):
                        self.read_char()
                    # skip the '*/' characters
                    self.read_char()
                    self.read_char()
                    # push any pending token and skip trailing whitespace
                    if self.token_data != '':
                        self.push_token()
                    self.skip_whitespace()
                else:
                    # single-line comment: read to end of line (or EOF)
                    while self.peek_char(0) != '' and self.read_char() != '\n':
                        pass
                    # skip any whitespace after the comment
                    self.skip_whitespace()
                continue

            # whitespace outside of a string ends the current token
            elif string_type is None and self.skip_whitespace():
                self.push_token()
                continue

            elif self.peek_char(0) in splitables and string_type is None:
                # push any token accumulated before the operator
                if not self.peek_char(-1).isspace() and self.peek_char(-1) not in splitables and len(self.token_data) > 0:
                    self.push_token()

                # try to match a multi-character operator first
                multichar = False
                for tok in multichar_splitables:
                    if self.data.startswith(tok, self.index):
                        for _ in range(len(tok)):
                            self.token_data += self.read_char()
                        multichar = True
                        break

                # otherwise it is a single-character operator
                if not multichar:
                    self.token_data = self.read_char()

                self.push_token()
                self.skip_whitespace()
                continue

            elif self.peek_char(0).isdigit() and string_type is None:
                is_float = False
                while self.peek_char(0).isdigit():
                    self.token_data += self.read_char()
                    if not is_float and self.peek_char(0) == '.':
                        # a '.' followed by a non-digit is not a decimal point;
                        # it could be a method call such as `1.to_str()`
                        if not self.peek_char(1).isdigit():
                            break
                        self.token_data += self.read_char()
                        is_float = True
                self.push_token()
                self.skip_whitespace()
                continue

            # string delimiters: a matching quote opens or closes a string
            if self.peek_char(0) == '"':
                # close the currently open double-quoted string
                if string_type == '"':
                    string_type = None
                # if no string is open, open a new one
                elif string_type is None:
                    string_type = '"'
                # inside a single-quoted string, '"' is ordinary text
            elif self.peek_char(0) == '\'':
                if string_type == '\'':
                    string_type = None
                elif string_type is None:
                    string_type = '\''

            self.token_data += self.read_char()

        # any data left in token_data becomes the final token
        if self.token_data != '':
            self.push_token()

        return self.tokens
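
# Minimal usage sketch. Assumption: SourceLocation can be constructed with a
# filename and exposes the `row`, `col`, and `col_row` members used above; its
# actual constructor signature lives in parse/source_location.py and may differ.
if __name__ == '__main__':
    # exercises keywords, operators, a comment, and a string escape
    source = 'let x = 1 + 2;  # trailing comment\nprint("hi\\n")'
    lexer = Lexer(source, SourceLocation('<demo>'))
    for token in lexer.lex():
        print(token)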