Started implementing a new and improved lexer

This commit is contained in:
weckyy702
2021-03-28 17:02:22 +02:00
parent 3860250e31
commit 5662e87c83
2 changed files with 149 additions and 1 deletions

View File

@@ -1,8 +1,98 @@
"""Lexer.py: Definition for Lexer class"""
from function_scope import Function_scope
import logging
import re
from typing import List, overload
from interpreter.function_scope import Function_scope
from interpreter._token import Token, make_token
class Lexer:
    """Lex the provided Java source file into a flat list of token strings.

    The lexer walks the source text one character at a time, skipping
    whitespace and comments, and groups the remaining characters into
    tokens using TOKEN_MATCH as the set of single-character delimiters.
    """

    # Single-character tokens / separators.  Raw string so the escapes
    # reach the regex engine untouched (non-raw "\(" only works by
    # accident and is deprecated).
    TOKEN_MATCH = re.compile(r"\(|\)|\{|\}|;|(\n)|\+|-|\*|/|<|>|,| ")

    def __init__(self, file_name: str) -> None:
        """Read the whole source file into memory and reset the cursor."""
        with open(file_name) as f:
            self.source_text = f.read()
        self.source_index = 0  # cursor into source_text
        self.line_number = 1   # 1-based line counter for diagnostics

    def lex(self) -> List["Token"]:
        """Scan the whole source and return the collected tokens.

        NOTE(review): token collection is still commented out (make_token
        is unfinished), so this currently logs every token it finds and
        returns an empty list.
        """
        tokens: List["Token"] = []
        while char := self._consume():
            if char.isspace():
                continue
            if self._handle_comments(char):
                continue
            token = self._get_token(char)
            logging.debug(f"found token \"{token}\" on line {self.line_number}")
            #tokens.append(make_token(token))
        return tokens

    def _get_token(self, char: str) -> str:
        """Accumulate characters starting at *char* until a delimiter is next.

        A *char* that is itself a delimiter is returned as a one-char token.
        """
        token = char
        if not Lexer.TOKEN_MATCH.match(token):
            while token_char := self._peek():
                if Lexer.TOKEN_MATCH.match(token_char):
                    break
                token += self._consume()
        return token

    def _handle_comments(self, char: str) -> bool:
        """Skip line (//) and block (/* */) comments.

        Returns True when *char* opened a comment (which has then been
        consumed in full), False otherwise.
        """
        if char == '/' and self._peek() == '/':
            self._get_line()  # skip the entire line
            return True
        if char == '/' and self._peek() == '*':
            self._consume()  # eat the '*'
            # Fix: skip until the matching "*/" two-character terminator.
            # The old code consumed until the next bare '/', which broke
            # on any block comment containing a '/' in its text.
            while c := self._consume():
                if c == '*' and self._peek() == '/':
                    self._consume()
                    break
            return True
        return False

    def _get_line(self) -> str:
        """Consume and return the rest of the current line or statement."""
        return self._consume_until(re.compile(r"(\n)|;"))

    def _peek(self, offset: int = 0) -> str:
        """Return the character *offset* positions ahead without consuming it.

        Returns '' when peeking past the end of the source.
        """
        if (self.source_index + offset) >= len(self.source_text):
            return ''
        # Fix: honour *offset* — the old code always returned the character
        # at source_index regardless of the offset argument.
        return self.source_text[self.source_index + offset]

    def _consume(self) -> str:
        """Return the next character and advance the cursor past it.

        Returns '' at end of input.  Tracks line_number on newlines.
        """
        char = self._peek()
        if char == '\n':
            self.line_number += 1
        self.source_index += 1
        return char

    @overload
    def _consume_until(self, end_token: str) -> str: ...
    @overload
    def _consume_until(self, end_pattern: re.Pattern) -> str: ...

    def _consume_until(self, end_token) -> str:
        """Consume characters up to the first occurrence of end_token.

        The terminator itself is consumed but not included in the result.
        Raises TypeError for unsupported argument types (the old code
        silently returned None).
        """
        res = ""
        if isinstance(end_token, str):
            while self._peek() and (char := self._consume()) != end_token:
                res += char
            return res
        if isinstance(end_token, re.Pattern):
            while self._peek() and not end_token.match(char := self._consume()):
                res += char
            return res
        raise TypeError(f"end_token must be str or re.Pattern, got {type(end_token).__name__}")

58
interpreter/_token.py Normal file
View File

@@ -0,0 +1,58 @@
"""Private definitions for Token class used by the Lexer"""
import re
from enum import IntEnum
from typing import Union
# Pattern for integer and boolean literals recognised by make_token.
NUMERIC_CONSTANT_PATTERN = re.compile("|".join(("([0-9]+)", "(true)", "(false)")))
# Pattern for flow-control keywords recognised by make_token.
KEYWORD_PATTERN = re.compile("|".join(("(return)", "(continue)", "(break)")))
class Token_type(IntEnum):
    """Kinds of tokens produced by make_token.

    Fix: the original member values carried trailing commas
    ("LEFT_PAREN=0,"), making each value a one-element tuple that only
    resolved to an int via Enum's tuple-as-constructor-args unpacking.
    The plain ints below yield the exact same member values.
    """
    UNKNOWN = -1
    LEFT_PAREN = 0
    RIGTH_PAREN = 1  # NOTE(review): typo for RIGHT_PAREN, kept so callers don't break
    LEFT_CURLY = 2
    RIGHT_CURLY = 3
    LEFT_BRACKET = 4
    RIGHT_BRACKET = 5
    COMMA = 6
    NUMERIC_CONSTANT = 7
    IF_STATEMENT = 8
    WHILE_STATEMENT = 9
    DO_WHILE_STATEMENT = 10
    FOR_STATEMENT = 11
    KEY_WORD = 13  # NOTE(review): 12 is skipped in the original numbering — confirm intent
    STRING_LITERAL = 14
class Token:
    """A single lexed token: its kind plus optional payload text.

    `content` carries the source text for value-bearing tokens
    (numeric constants, keywords) and is None for pure punctuation.
    """

    def __init__(self, type: "Token_type", content: Union[str, None] = None) -> None:
        # `type` shadows the builtin, but renaming the parameter would
        # break keyword callers, so it is kept as-is.
        self.type = type
        self.content = content

    def __repr__(self) -> str:
        # Added for debuggability; no caller can depend on the default
        # object repr, so this is backward-compatible.
        return f"{self.__class__.__name__}({self.type!r}, {self.content!r})"
def make_token(tag: str) -> Token:
    """Map a raw token string from the lexer to a typed Token.

    Punctuation and known statement keywords map to dedicated token
    types; numeric/boolean constants and generic keywords keep their
    source text as the token's content.  Anything else falls back to
    Token_type.UNKNOWN (the original fell off the end and returned
    None, which no caller could use).
    """
    # Dispatch table instead of a long if/elif chain for the 1:1 cases.
    simple = {
        '(': Token_type.LEFT_PAREN,
        ')': Token_type.RIGTH_PAREN,
        '{': Token_type.LEFT_CURLY,
        '}': Token_type.RIGHT_CURLY,
        '[': Token_type.LEFT_BRACKET,
        ']': Token_type.RIGHT_BRACKET,
        ',': Token_type.COMMA,
        'if': Token_type.IF_STATEMENT,
        'while': Token_type.WHILE_STATEMENT,
        'do': Token_type.DO_WHILE_STATEMENT,
        'for': Token_type.FOR_STATEMENT,
    }
    if tag in simple:
        return Token(simple[tag])
    # Fix: fullmatch instead of match — match() accepted any tag that
    # merely *starts* with a number (e.g. "123abc").
    if NUMERIC_CONSTANT_PATTERN.fullmatch(tag):
        return Token(Token_type.NUMERIC_CONSTANT, tag)
    # Fix: KEYWORD_PATTERN was defined but never used.
    if KEYWORD_PATTERN.fullmatch(tag):
        return Token(Token_type.KEY_WORD, tag)
    #TODO: string literals and identifiers still need dedicated handling
    return Token(Token_type.UNKNOWN, tag)