started implementing new and improved lexer
This commit is contained in:
@@ -1,8 +1,98 @@
|
||||
"""Lexer.py: Definition for Lexer class"""
|
||||
|
||||
from function_scope import Function_scope
|
||||
import logging
|
||||
import re
|
||||
from typing import List, overload
|
||||
|
||||
from interpreter.function_scope import Function_scope
|
||||
from interpreter._token import Token, make_token
|
||||
class Lexer:
    """Lexes the provided Java source file into a flat list of tokens.

    Construct with a file name, then call :meth:`lex`.  Tracks the current
    line number for diagnostics while scanning.
    """

    # Separator set: single characters (plus newline and space) that either
    # stand alone as one-character tokens or terminate a multi-character run.
    # Raw string avoids invalid-escape warnings for \( \) \{ \} \+ \* etc.
    # NOTE(review): '=' is absent, so '=' lexes as part of an identifier-style
    # run — confirm whether that is intended.
    TOKEN_MATCH = re.compile(r"\(|\)|\{|\}|;|(\n)|\+|-|\*|/|<|>|,| ")

    def __init__(self, file_name: str) -> None:
        """Read the whole source file into memory and reset the cursor."""
        with open(file_name) as f:
            self.source_text = f.read()
        self.source_index = 0  # cursor into source_text
        self.line_number = 1   # 1-based, bumped on every consumed '\n'

    def lex(self) -> List["Token"]:
        """Scan the entire source and return the token list.

        Currently only logs each lexeme; appending real Token objects is
        still disabled until make_token() handles every tag.
        """
        tokens = []

        while char := self._consume():
            if char.isspace():
                continue

            if self._handle_comments(char):
                continue

            token = self._get_token(char)
            logging.debug(f"found token \"{token}\" on line {self.line_number}")
            # TODO: enable once make_token() is finished:
            #tokens.append(make_token(token))

        return tokens

    def _get_token(self, char: str) -> str:
        """Starting from *char*, accumulate characters up to the next separator.

        A *char* that is itself a separator is returned as a one-character
        token; otherwise characters are consumed until TOKEN_MATCH matches.
        """
        token = char

        if not Lexer.TOKEN_MATCH.match(token):
            while (token_char := self._peek()):
                if Lexer.TOKEN_MATCH.match(token_char):
                    break
                token += self._consume()

        return token

    def _handle_comments(self, char: str) -> bool:
        """Skip '//' line comments and '/* */' block comments.

        Returns True when *char* began a comment (which has then been fully
        consumed), False otherwise.
        """
        if char == '/' and self._peek() == '/':
            self._get_line()  # skip the rest of the line
            return True
        if char == '/' and self._peek() == '*':
            self._consume()  # consume the '*'
            # BUGFIX: scan for the two-character terminator '*/' instead of
            # stopping at the first '/', which broke on '/' inside a comment.
            prev = ''
            while (c := self._consume()):
                if prev == '*' and c == '/':
                    break
                prev = c
            return True
        return False

    def _get_line(self) -> str:
        """Consume and return the rest of the current line/statement.

        The terminating '\\n' or ';' is consumed but not returned.
        """
        return self._consume_until(re.compile(r"(\n)|;"))

    def _peek(self, offset: int = 0) -> str:
        """Return the character *offset* positions ahead without consuming it.

        Returns '' at (or past) end of input.
        """
        if (self.source_index + offset) >= len(self.source_text):
            return ''
        # BUGFIX: honour the offset — it was previously ignored when indexing.
        return self.source_text[self.source_index + offset]

    def _consume(self) -> str:
        """Return the current character and advance the cursor.

        Returns '' at end of input (the cursor still advances, matching the
        original behaviour).  Increments line_number on '\\n'.
        """
        char = self._peek()

        if char == '\n':
            self.line_number += 1

        self.source_index += 1
        return char

    @overload
    def _consume_until(self, end_token: str) -> str: ...

    @overload
    def _consume_until(self, end_pattern: re.Pattern) -> str: ...

    def _consume_until(self, end_token) -> str:
        """Consume up to and including the terminator; return the text before it.

        *end_token* is either a literal character or a compiled pattern that
        is tested one character at a time.  The terminator is consumed but
        excluded from the returned string; at EOF everything read is returned.
        """
        res = ""

        if isinstance(end_token, str):
            while self._peek() and (char := self._consume()) != end_token:
                res += char
            return res

        elif isinstance(end_token, re.Pattern):
            while self._peek() and not end_token.match(char := self._consume()):
                res += char
            return res
58
interpreter/_token.py
Normal file
58
interpreter/_token.py
Normal file
@@ -0,0 +1,58 @@
|
||||
"""Private definitions for Token class used by the Lexer"""
|
||||
|
||||
import re
|
||||
|
||||
from enum import IntEnum
|
||||
from typing import Union
|
||||
|
||||
# Integer literals and the boolean literals 'true'/'false'.
# NOTE(review): unanchored — with .match() this also accepts prefixes such as
# 'true_flag'; confirm whether fullmatch semantics are intended at call sites.
NUMERIC_CONSTANT_PATTERN = re.compile("([0-9]+)|(true)|(false)")
# Keywords lexed into KEY_WORD tokens.
KEYWORD_PATTERN = re.compile("(return)|(continue)|(break)")
|
||||
|
||||
class Token_type(IntEnum):
    """Kinds of token produced by the lexer.

    Values are explicit so they remain stable if members are reordered.
    The original declarations carried trailing commas, which made every
    value a 1-tuple that the enum machinery happened to unpack — the
    commas are removed here so each member's value is a plain int.
    """
    UNKNOWN = -1            # tag the lexer could not classify
    LEFT_PAREN = 0
    RIGTH_PAREN = 1         # historical misspelling kept for compatibility
    RIGHT_PAREN = 1         # alias with the correct spelling (same member)
    LEFT_CURLY = 2
    RIGHT_CURLY = 3
    LEFT_BRACKET = 4
    RIGHT_BRACKET = 5
    COMMA = 6
    NUMERIC_CONSTANT = 7
    IF_STATEMENT = 8
    WHILE_STATEMENT = 9
    DO_WHILE_STATEMENT = 10
    FOR_STATEMENT = 11
    # NOTE: 12 is unassigned in the original numbering; preserved as-is.
    KEY_WORD = 13
    STRING_LITERAL = 14
|
||||
|
||||
class Token:
    """A single lexed token: a Token_type plus the optional source text.

    *content* is only populated for tokens whose spelling matters
    (e.g. numeric constants); punctuation tokens carry None.
    """

    def __init__(self, type: "Token_type", content: Union[str, None] = None) -> None:
        # 'type' shadows the builtin, but the name is kept so existing
        # keyword-argument callers are unaffected.
        self.type = type
        self.content = content

    def __repr__(self) -> str:
        # Added for debuggability; the default object repr was useless in logs.
        return f"Token({self.type!r}, {self.content!r})"
|
||||
|
||||
def make_token(tag: str) -> "Token":
    """Convert a lexeme *tag* into a Token.

    Punctuation maps to its dedicated type; numeric/boolean constants and
    keywords are checked with fullmatch (BUGFIX: the previous .match was a
    prefix match, so e.g. 'true_flag' or '123abc' were misclassified as
    NUMERIC_CONSTANT).  Keywords matched by KEYWORD_PATTERN — previously
    declared but never used — now yield KEY_WORD tokens.  Anything still
    unrecognised becomes an UNKNOWN token carrying the raw text instead of
    silently returning None.
    """
    if tag == '(':
        return Token(Token_type.LEFT_PAREN)
    elif tag == ')':
        return Token(Token_type.RIGTH_PAREN)
    elif tag == '{':
        return Token(Token_type.LEFT_CURLY)
    elif tag == '}':
        return Token(Token_type.RIGHT_CURLY)
    elif tag == '[':
        return Token(Token_type.LEFT_BRACKET)
    elif tag == ']':
        return Token(Token_type.RIGHT_BRACKET)
    elif tag == ',':
        return Token(Token_type.COMMA)
    elif NUMERIC_CONSTANT_PATTERN.fullmatch(tag):
        return Token(Token_type.NUMERIC_CONSTANT, tag)
    elif tag == 'if':
        return Token(Token_type.IF_STATEMENT)
    elif tag == 'while':
        return Token(Token_type.WHILE_STATEMENT)
    elif tag == 'do':
        return Token(Token_type.DO_WHILE_STATEMENT)
    elif tag == 'for':
        return Token(Token_type.FOR_STATEMENT)
    elif KEYWORD_PATTERN.fullmatch(tag):
        return Token(Token_type.KEY_WORD, tag)
    # TODO: string literals (STRING_LITERAL) are still unhandled.
    return Token(Token_type.UNKNOWN, tag)
|
||||
Reference in New Issue
Block a user