133 lines
4.1 KiB
Python
133 lines
4.1 KiB
Python
"""Lexer.py: Definition for Lexer class"""
|
|
|
|
import logging
|
|
import re
|
|
from typing import List, overload
|
|
|
|
from interpreter.function_scope import Function_scope
|
|
from interpreter._token import Token, make_token, Token_type
|
|
class Lexer:
|
|
"""This class will lex the provided Java source and generate a list of Function_scopes"""
|
|
|
|
TOKEN_MATCH = re.compile("\(|\)|\{|\}|;|(\n)|\+|-|\*|/|<|>|,| ")
|
|
|
|
def __init__(self, file_name: str) -> None:
|
|
with open(file_name) as f:
|
|
self.source_text = f.read()
|
|
self.source_index = 0
|
|
self.line_number = 1
|
|
|
|
self._tokens = []
|
|
self._token_index = 0
|
|
|
|
self.token_type_pattern = re.compile('(char)|(int)|(void)|(double)')
|
|
|
|
def _get_tokens(self):
|
|
|
|
while char := self._consume():
|
|
|
|
if char.isspace():
|
|
continue
|
|
|
|
if self._handle_comments(char):
|
|
continue
|
|
|
|
token = self._get_token(char)
|
|
#logging.debug(f"found token \"{token}\" on line {self.line_number}")
|
|
self._tokens.append(make_token(token, self.token_type_pattern))
|
|
|
|
def get_scopes(self) -> List[Function_scope]:
|
|
if not self._tokens:
|
|
self._get_tokens()
|
|
|
|
while token := self._consume_token():
|
|
if token.type == Token_type.UNKNOWN:
|
|
logging.debug(token)
|
|
elif token.type == Token_type.TYPE_NAME:
|
|
if self._peek_token().type != Token_type.UNKNOWN:
|
|
logging.error("Illegal identifier after Type name!")
|
|
raise Exception("Illegal identifier after Type name!")
|
|
elif self._peek_token(1).type == Token_type.LEFT_PAREN:
|
|
logging.debug(f"Function definition found: {token.content} {self._peek_token().content} ()")
|
|
self._consume_token()
|
|
self._consume_token()
|
|
args = ""
|
|
while function_token := self._consume_token():
|
|
if function_token.type == Token_type.RIGTH_PAREN:
|
|
break
|
|
print(function_token.type)
|
|
|
|
|
|
|
|
def _get_token(self, char: str) -> str:
|
|
token = char
|
|
|
|
if not re.match(Lexer.TOKEN_MATCH, token):
|
|
|
|
while (token_char := self._peek()):
|
|
if re.match(Lexer.TOKEN_MATCH, token_char):
|
|
break
|
|
token += self._consume()
|
|
|
|
return token
|
|
|
|
def _handle_comments(self, char: str) -> bool:
|
|
if char == '/' and self._peek() == '/':
|
|
self._get_line() #skip the entire line
|
|
return True
|
|
elif char == '/' and self._peek() == '*':
|
|
self._consume()
|
|
self._consume_until('/') #skip until closing character. Will probably bug out at some point
|
|
return True
|
|
return False
|
|
|
|
def _get_line(self) -> str:
|
|
return self._consume_until(re.compile("(\n)|;"))
|
|
|
|
def _peek(self, offset:int = 0) -> str:
|
|
if (self.source_index + offset) >= len(self.source_text):
|
|
return ''
|
|
char = self.source_text[self.source_index]
|
|
|
|
return char
|
|
|
|
def _consume(self) -> str:
|
|
char = self._peek()
|
|
|
|
if char == '\n':
|
|
self.line_number += 1
|
|
|
|
self.source_index += 1
|
|
return char
|
|
|
|
def _peek_token(self, offset:int=0):
|
|
if (self._token_index+offset) >= len(self._tokens):
|
|
return None
|
|
return self._tokens[self._token_index+offset]
|
|
|
|
def _consume_token(self):
|
|
token = self._peek_token()
|
|
self._token_index+=1
|
|
return token
|
|
|
|
|
|
@overload
|
|
def _consume_until(self, end_token: str) -> str:...
|
|
|
|
@overload
|
|
def _consume_until(self, end_pattern:re.Pattern) -> str:...
|
|
|
|
def _consume_until(self, end_token) -> str:
|
|
res = ""
|
|
|
|
if isinstance(end_token, str):
|
|
while self._peek() and (char:= self._consume()) != end_token:
|
|
res += char
|
|
|
|
return res
|
|
|
|
elif isinstance(end_token, re.Pattern):
|
|
while self._peek() and not end_token.match(char:= self._consume()):
|
|
res += char
|
|
|
|
return res |