403 lines
16 KiB
Python
403 lines
16 KiB
Python
"""Lexer.py: Definition for Lexer class"""
|
|
from interpreter.Tokenizer import Tokenizer
|
|
from os import linesep
|
|
from draw.Iinstruction import *
|
|
from typing import List, Optional, Union, Tuple
|
|
import logging
|
|
|
|
from interpreter.function_scope import Function_scope
|
|
from interpreter._token import Token, Token_type
|
|
from errors.custom import JavaSyntaxError
|
|
|
|
def print_arr(arr):
    """Print a (possibly nested) list, one element per line.

    Every list level is wrapped in a "[" line and a "]" line; nested lists
    are printed recursively, anything else is handed to print() unchanged.
    """
    print("[")
    for item in arr:
        if not isinstance(item, list):
            print(item)
            continue
        # nested list: descend one level
        print_arr(item)
    print("]")
|
|
|
|
class Lexer:
    """Group a flat token list into function scopes filled with instructions.

    The token stream is read "line" by "line", where a line ends at the next
    SEMICOLON, LEFT_CURLY or RIGHT_CURLY token. Lines that look like a
    function header open a new Function_scope; every other top-level line is
    collected as a global instruction.
    """

    def __init__(self, tokens: List[Token]) -> None:
        """Store the token stream and start reading at its first token."""
        self._tokens = tokens
        self._token_index = 0

        self._scopes: List[Function_scope] = []

        # in case the tokenizer finds valid tokens in the global scope, they will be saved here
        self._global_instructions = []

    def _peek(self, offset: int = 0) -> Optional[Token]:
        """Return the token `offset` positions from the cursor without consuming it.

        Returns None when the requested position lies outside the token list.
        """
        index = self._token_index + offset
        # A negative index must not wrap around to the end of the list.
        if index < 0 or index >= len(self._tokens):
            return None
        return self._tokens[index]

    def _consume(self):
        """Return the current token and advance the cursor; None at end of input."""
        token = self._peek()
        # Do not move past the end: otherwise _peek(-1) could no longer reach
        # the last real token when an error location is needed.
        if token is not None:
            self._token_index += 1
        return token

    def get_instructions(self):
        """Process the whole token stream and return the list of scopes.

        One Function_scope is created per function definition. A final
        "<Global scope>" entry holding the global instructions is appended
        last (see add_globals_to_scope_list).
        """
        scopes = []

        while self._peek():
            line_tokens = self.get_line_tokens()

            if self._is_function_def(line_tokens):
                func_name, func_return_type, func_args = self._construct_function_header_from_tokens(line_tokens)
                current_scope = Function_scope(func_name, func_return_type, func_args)

                instructions = self._get_instructions_in_scope()
                current_scope._add_instructions(instructions)

                scopes.append(current_scope)
            else:
                # something was declared in global scope
                instruction = self._construct_instruction_from_tokens(line_tokens)
                # unrecognised lines yield None; skip them like the scoped variant does
                if instruction is not None:
                    self._global_instructions.append(instruction)

        self.add_globals_to_scope_list(scopes)
        return scopes

    def _get_instructions_in_scope(self):
        """Collect instructions until the RIGHT_CURLY that closes the current scope.

        Raises:
            JavaSyntaxError: if the input ends before a closing curly is found.
        """
        instructions = []

        while self._peek():
            line_tokens = self.get_line_tokens()
            instruction = self._construct_instruction_from_tokens(line_tokens)
            if instruction:
                instructions.append(instruction)

            delimiter_token = line_tokens[-1]
            if delimiter_token.type == Token_type.RIGHT_CURLY:
                return instructions

        last_token = self._peek(-1)
        location = last_token.location if last_token is not None else "<end of input>"
        raise JavaSyntaxError(f"{location}: Missing right curly!")

    def get_tokens_until(self, delimiter_types: List[Token_type]) -> List[Token]:
        """Consume tokens up to and including the first one whose type is a delimiter.

        If the input ends first, all remaining tokens are returned.
        """
        tokens = []
        while token := self._consume():
            tokens.append(token)
            if token.type in delimiter_types:
                break
        return tokens

    def get_line_tokens(self):
        """Consume one "line": everything up to the next ';', '{' or '}' (inclusive)."""
        return self.get_tokens_until([Token_type.SEMICOLON, Token_type.LEFT_CURLY, Token_type.RIGHT_CURLY])

    def add_globals_to_scope_list(self, scope_list: List[Function_scope]):
        """Append a "<Global scope>" Function_scope holding the global instructions."""
        global_scope = Function_scope("<Global scope>", "void", [])
        global_scope._add_instructions(self._global_instructions)

        scope_list.append(global_scope)

    def _is_function_def(self, tokens: List[Token]) -> bool:
        """Tell whether a line has the shape TYPE_NAME IDENTIFIER ( ... {."""
        # The shape needs four distinct positions (0, 1, 2 and the last one);
        # shorter lines cannot match and would otherwise raise IndexError.
        if len(tokens) < 4:
            return False
        return (
            tokens[0].type == Token_type.TYPE_NAME
            and tokens[1].type == Token_type.UNKNOWN
            and tokens[2].type == Token_type.LEFT_PAREN
            and tokens[-1].type == Token_type.LEFT_CURLY
        )

    def _construct_instruction_from_tokens(self, tokens: List[Token]):
        """Dispatch on the type of the first token of a line.

        Returns the instruction built by the matching handler, or None when
        the first token starts none of the known constructs.
        """
        instruction_token = tokens[0]

        if instruction_token.type == Token_type.IF_STATEMENT:
            return self._handle_if_construct(tokens)

        if instruction_token.type == Token_type.WHILE_STATEMENT:
            return self._handle_while_construct(tokens)

        if instruction_token.type == Token_type.DO_WHILE_STATEMENT:
            return self._handle_do_while_construct(tokens)

        if instruction_token.type == Token_type.FOR_STATEMENT:
            return self._handle_for_construct(tokens)

        if instruction_token.type == Token_type.TYPE_NAME:
            return self._handle_type_name_construct(tokens)

        if instruction_token.type == Token_type.UNKNOWN:
            return self._handle_generic_construct(tokens)

        return None

    def _construct_function_header_from_tokens(self, tokens: List[Token]) -> Tuple[str, str, List[str]]:
        """Extract (name, return type, argument list) from a function header line.

        Raises:
            JavaSyntaxError: if the header is malformed.
        """
        _ensure_correct_function_structure(tokens)

        function_return_type = tokens[0].content
        function_name = tokens[1].content

        # tokens[3:-2] is everything between '(' and the trailing ') {'
        argument_list = _get_function_argument_list_from_tokens(tokens[3:-2])

        return function_name, function_return_type, argument_list

    def _construct_variable_def_from_tokens(self, tokens: List[Token]) -> str:
        """Describe a variable declaration (with or without initial value) as text.

        Raises:
            JavaSyntaxError: if the declaration is malformed.
        """
        _ensure_correct_variable_structure(tokens)

        variable_type = tokens[0].content
        variable_name = tokens[1].content

        if tokens[2].type == Token_type.SEMICOLON:
            return f"declare variable '{variable_name}' of type {variable_type}"

        # tokens[2] is the equal sign, the value expression starts right after it
        variable_value = self._construct_source_line_from_tokens(tokens[3:])

        return f"declare variable '{variable_name}' of type {variable_type} with value {variable_value}"

    def _construct_source_line_from_tokens(self, tokens: List[Token]) -> str:
        """Join token contents with single spaces, stopping at the first SEMICOLON.

        TODO: make this function smarter
        """
        parts = []
        for token in tokens:
            if token.type == Token_type.SEMICOLON:
                break
            parts.append(token.content)
        return " ".join(parts)

    # Handler functions for different types of language structures

    def _handle_if_construct(self, tokens: List[Token]):
        """Build an if_instruction from an 'if ( ... ) {' line and the blocks that follow."""
        logging.debug("Found if construct")

        _ensure_correct_if_structure(tokens)

        # strip 'if (' in front and ') {' at the end
        condition_str = self._construct_source_line_from_tokens(tokens[2:-2])

        true_case = self._get_instructions_in_scope()
        false_case = self._handle_else_construct()

        return if_instruction(condition_str, true_case, false_case)

    def _handle_else_construct(self):
        """Parse an optional else / else-if branch following an if block.

        Returns a list of instructions for the false case, or None when the
        next token is not an else (including when the input has ended).
        """
        next_token = self._peek()
        if next_token is None or next_token.type != Token_type.ELSE_STATEMENT:
            return None

        following_token = self._peek(1)
        if following_token is not None and following_token.type == Token_type.IF_STATEMENT:
            logging.debug("Found if-else construct")
            # drop the leading 'else' so the rest is a regular if line
            else_if_tokens = self.get_line_tokens()[1:]
            return [self._handle_if_construct(else_if_tokens)]

        logging.debug("Found else construct")
        # consume the 'else {' line itself
        self.get_line_tokens()
        return self._get_instructions_in_scope()

    def _handle_while_construct(self, tokens: List[Token]):
        """Build a head-controlled loop from a 'while ( ... ) {' line and its body."""
        logging.debug("Found while construct")

        _ensure_correct_while_structure(tokens)

        condtion_str = self._construct_source_line_from_tokens(tokens[2:-2])

        loop_instructions = self._get_instructions_in_scope()

        return while_instruction_front(condtion_str, loop_instructions)

    def _handle_do_while_construct(self, tokens: List[Token]):
        """Build a tail-controlled loop from 'do {', its body and 'while ( ... );'."""
        logging.debug("Found do-while construct")

        _ensure_correct_do_while_structure_part_1(tokens)

        loop_instructions = self._get_instructions_in_scope()

        while_tokens = self.get_line_tokens()
        _ensure_correct_do_while_structure_part_2(while_tokens)
        condtion_str = self._construct_source_line_from_tokens(while_tokens[2:-2])

        return while_instruction_back(condtion_str, loop_instructions)

    def _handle_for_construct(self, tokens: List[Token]):
        """Build a for_instruction from a for header and its body.

        `tokens` normally ends at the first ';' of the header, so the rest of
        the header up to the opening curly is read here. The increment, if
        any, is appended to the loop body as its last instruction.
        """
        logging.debug("Found for construct")

        # Work on a copy so the caller's line is not modified, and only read
        # on when the header is not already terminated by '{' (a malformed
        # 'for (x) {' would otherwise swallow tokens up to the next '{').
        tokens = list(tokens)
        if tokens[-1].type != Token_type.LEFT_CURLY:
            tokens.extend(self.get_tokens_until([Token_type.LEFT_CURLY]))

        _ensure_correct_for_structure(tokens)

        variable_tokens, condition_tokens, increment_tokens = _get_for_arguments_from_tokens(tokens[2:])

        variable_str = ""
        if len(variable_tokens) > 1:  # if we got more than just a semicolon
            if variable_tokens[0].type == Token_type.TYPE_NAME:
                variable_str = self._construct_variable_def_from_tokens(variable_tokens)
            else:
                # initialiser without declaration, e.g. 'for (i = 0; ...)'
                variable_str = self._construct_source_line_from_tokens(variable_tokens)

        condition_str = "true"
        if condition_tokens:
            condition_str = self._construct_source_line_from_tokens(condition_tokens)

        increment_instruction = None
        if increment_tokens:
            increment_instruction = generic_instruction(self._construct_source_line_from_tokens(increment_tokens))

        loop_instructions = self._get_instructions_in_scope()

        if increment_instruction:
            loop_instructions.append(increment_instruction)

        return for_instruction(variable_str, condition_str, loop_instructions)

    def _handle_type_name_construct(self, tokens: List[Token]):
        """Wrap a variable declaration line in a generic_instruction."""
        logging.debug("Found Type name construct")

        return generic_instruction(self._construct_variable_def_from_tokens(tokens))

    def _handle_generic_construct(self, tokens: List[Token]):
        """Wrap any other line, rendered as plain text, in a generic_instruction."""
        logging.debug("Found generic instruction")

        return generic_instruction(self._construct_source_line_from_tokens(tokens))
|
|
|
|
|
|
|
|
def _ensure_correct_function_structure(tokens: List[Token]):
    """Validate that `tokens` form a function header.

    Expected shape: TYPE_NAME IDENTIFIER ( ... ) {

    Raises:
        JavaSyntaxError: on the first structural rule that is violated; the
            message is prefixed with the location of the offending token.
    """
    if len(tokens) < 5:
        raise JavaSyntaxError(f"{tokens[0].location}: Ill-formed function declaration! Expected at least 5 tokens, got {len(tokens)}")
    if tokens[-1].type != Token_type.LEFT_CURLY:
        raise JavaSyntaxError(f"{tokens[-1].location}: Ill-formed function declaration! Expected last token to be LEFT_CURLY, got {str(tokens[-1].type)}")
    if tokens[1].type != Token_type.UNKNOWN:
        raise JavaSyntaxError(f"{tokens[1].location}: Illegal token after function return type! Expected UNKNOWN, got {str(tokens[1].type)}")
    if tokens[2].type != Token_type.LEFT_PAREN:
        # the check is for the opening parenthesis, so report LEFT_PAREN
        raise JavaSyntaxError(f"{tokens[2].location}: Illegal token after function name! Expected LEFT_PAREN, got {str(tokens[2].type)}")
    # NOTE: RIGTH_PAREN is the member name as referenced throughout this file
    if tokens[-2].type != Token_type.RIGTH_PAREN:
        raise JavaSyntaxError(f"{tokens[-2].location}: Illegal token after function parameter list! Expected RIGHT_PAREN, got {str(tokens[-2].type)}")
|
|
|
|
def _ensure_correct_variable_structure(tokens: List[Token]):
    """Validate that `tokens` form a variable declaration.

    Expected shape: TYPE_NAME IDENTIFIER ;   or   TYPE_NAME IDENTIFIER = EXPRESSION ;

    Raises:
        JavaSyntaxError: on the first structural rule that is violated.
    """
    token_count = len(tokens)
    if token_count < 3:
        raise JavaSyntaxError(f"{tokens[0].location}: Ill-formed type construct! Expected at least 3 tokens, got {token_count}")

    name_token = tokens[1]
    follow_token = tokens[2]

    if name_token.type != Token_type.UNKNOWN:
        raise JavaSyntaxError(f"{name_token.location}: Illegal token after type name! Expected UNKNOWN, got {str(name_token.type)}")

    if follow_token.type not in [Token_type.SEMICOLON, Token_type.EQUAL_SIGN]:
        raise JavaSyntaxError(f"{follow_token.location}: Illegal token after variable name! Expected SEMICOLON or EQUAL_SIGN, got {str(follow_token.type)}")

    # an assignment needs at least: TYPE NAME = VALUE ;
    is_assignment = follow_token.type == Token_type.EQUAL_SIGN
    if is_assignment and token_count < 5:
        raise JavaSyntaxError(f"{follow_token.location}: Ill-formed assignment expression! Expected at least 5 tokens, got {token_count}")
|
|
|
|
def _ensure_correct_if_structure(tokens: List[Token]):
    """Validate that `tokens` form the head of an if statement.

    Expected shape: IF ( ... ) {
    The opening curly is technically not needed, but it is required here anyway.

    Raises:
        JavaSyntaxError: on the first structural rule that is violated.
    """
    token_count = len(tokens)
    if token_count < 5:
        raise JavaSyntaxError(f"{tokens[0].location}: Ill-formed if construct! Expected at least 5 tokens, got {token_count}")

    after_keyword = tokens[1]
    before_curly = tokens[-2]
    final_token = tokens[-1]

    if final_token.type != Token_type.LEFT_CURLY:
        raise JavaSyntaxError(f"{final_token.location}: Ill-formed if construct! Expected last token to be LEFT_CURLY, got {str(final_token.type)}")

    if after_keyword.type != Token_type.LEFT_PAREN:
        raise JavaSyntaxError(f"{after_keyword.location}: Illegal token after if token! Expected LEFT_PAREN, got {str(after_keyword.type)}")

    if before_curly.type != Token_type.RIGTH_PAREN:
        raise JavaSyntaxError(f"{before_curly.location}: Illegal token after conditional expression! Expected RIGHT_PAREN, got {str(before_curly.type)}")
|
|
|
|
def _ensure_correct_while_structure(tokens: List[Token]):
    """Validate that `tokens` form the head of a while loop.

    Expected shape: WHILE ( ... ) {
    The curly might not be required by the standard, but it is required here.

    Raises:
        JavaSyntaxError: on the first structural rule that is violated.
    """
    token_count = len(tokens)
    if token_count < 5:
        raise JavaSyntaxError(f"{tokens[0].location}: Ill-formed while construct! Expected at least 5 tokens, got {token_count}")

    after_keyword = tokens[1]
    before_curly = tokens[-2]
    final_token = tokens[-1]

    if final_token.type != Token_type.LEFT_CURLY:
        raise JavaSyntaxError(f"{final_token.location}: Ill-formed while construct! Expected last token to be LEFT_CURLY, got {str(final_token.type)}")

    if after_keyword.type != Token_type.LEFT_PAREN:
        raise JavaSyntaxError(f"{after_keyword.location}: Illegal token after while token! Expected LEFT_PAREN, got {str(after_keyword.type)}")

    if before_curly.type != Token_type.RIGTH_PAREN:
        raise JavaSyntaxError(f"{before_curly.location}: Illegal token after while condition! Expected RIGHT_PAREN, got {str(before_curly.type)}")
|
|
|
|
def _ensure_correct_do_while_structure_part_1(tokens: List[Token]):
    """Validate the opening part of a do-while loop.

    Expected shape: do {

    Raises:
        JavaSyntaxError: if there are not exactly two tokens or the second
            one is not LEFT_CURLY.
    """
    if len(tokens) != 2:
        raise JavaSyntaxError(f"{tokens[0].location}: Ill-formed do-while construct! Expected 2 tokens, got {len(tokens)}")
    if tokens[1].type != Token_type.LEFT_CURLY:
        # prefix the location like every other structure check in this file
        raise JavaSyntaxError(f"{tokens[1].location}: Illegal token after do token! Expected LEFT_CURLY, got {str(tokens[1].type)}")
|
|
|
|
def _ensure_correct_do_while_structure_part_2(tokens: List[Token]):
    """Validate the closing part of a do-while loop.

    Expected shape: while ( ... ) ;

    Raises:
        JavaSyntaxError: on the first structural rule that is violated.
    """
    if len(tokens) < 5:
        raise JavaSyntaxError(f"{tokens[0].location}: Ill-formed do-while construct! Expected at least 5 tokens, got {len(tokens)}")
    if tokens[0].type != Token_type.WHILE_STATEMENT:
        # report the token that was actually checked (index 0, not 1)
        raise JavaSyntaxError(f"{tokens[0].location}: Illegal token after do block! Expected WHILE_STATEMENT, got {str(tokens[0].type)}")
    if tokens[-1].type != Token_type.SEMICOLON:
        raise JavaSyntaxError(f"{tokens[-1].location}: Ill-formed do-while construct! Expected last token to be SEMICOLON, got {str(tokens[-1].type)}")
    if tokens[1].type != Token_type.LEFT_PAREN:
        raise JavaSyntaxError(f"{tokens[1].location}: Illegal token after while token! Expected LEFT_PAREN, got {str(tokens[1].type)}")
    if tokens[-2].type != Token_type.RIGTH_PAREN:
        raise JavaSyntaxError(f"{tokens[-2].location}: Illegal token after do-while condition! Expected RIGHT_PAREN, got {str(tokens[-2].type)}")
|
|
|
|
def _ensure_correct_for_structure(tokens: List[Token]):
    """Validate that `tokens` form the complete head of a for loop.

    Expected shape: for ( ...? ; ...? ; ...? ) {

    Raises:
        JavaSyntaxError: on the first structural rule that is violated.
    """
    if len(tokens) < 6:
        raise JavaSyntaxError(f"{tokens[0].location}: Ill-formed for loop construct! Expected at least 6 tokens, got {len(tokens)}")
    if tokens[-1].type != Token_type.LEFT_CURLY:
        raise JavaSyntaxError(f"{tokens[-1].location}: Ill-formed for loop construct! Expected last token to be LEFT_CURLY, got {str(tokens[-1].type)}")
    if tokens[1].type != Token_type.LEFT_PAREN:
        raise JavaSyntaxError(f"{tokens[1].location}: Illegal token after for token! Expected LEFT_PAREN, got {str(tokens[1].type)}")
    if tokens[-2].type != Token_type.RIGTH_PAREN:
        raise JavaSyntaxError(f"{tokens[-2].location}: Illegal token after for loop increment! Expected RIGHT_PAREN, got {str(tokens[-2].type)}")

    # Count by token type so the result does not depend on how Token
    # instances are constructed or compared for equality.
    semicolon_count = sum(1 for token in tokens if token.type == Token_type.SEMICOLON)
    if semicolon_count != 2:
        raise JavaSyntaxError(f"{tokens[0].location}: Ill-formed for loop construct! Expected exactly 2 SEMICOLON tokens, got {semicolon_count}")
|
|
|
|
|
|
|
|
def _get_function_argument_list_from_tokens(tokens: List[Token]) -> List[str]:
    """Render the tokens of a parameter list as one string per parameter.

    The tokens are split at COMMA tokens; the contents of each segment are
    joined with single spaces. Empty segments are skipped, so an empty
    parameter list yields [] rather than [""].
    """
    arg_tokens = _get_seperated_token_list(tokens, [Token_type.COMMA])

    return [
        " ".join(token.content for token in arg)
        for arg in arg_tokens
        if arg  # an empty segment is not an argument
    ]
|
|
|
|
def _get_seperated_token_list(tokens: List[Token], seperator_types: List[Token_type]) -> List[List[Token]]:
|
|
token_segments = []
|
|
tokens_in_segment = []
|
|
|
|
for token in tokens:
|
|
if token.type in seperator_types:
|
|
token_segments.append(tokens_in_segment)
|
|
tokens_in_segment = []
|
|
continue
|
|
tokens_in_segment.append(token)
|
|
|
|
token_segments.append(tokens_in_segment)
|
|
|
|
return token_segments
|
|
|
|
|
|
def _get_for_arguments_from_tokens(tokens: List[Token]) -> Tuple[List[Token], List[Token], List[Token]]:
    """Split the tokens following 'for (' into initialiser, condition and increment.

    Returns a tuple of three token lists:
      * the initialiser, including its terminating SEMICOLON,
      * the condition, without its terminating SEMICOLON,
      * the increment, i.e. everything before the LEFT_CURLY minus the last
        token in front of it.

    Raises:
        IndexError: if a required delimiter is missing from `tokens`.
    """

    def _index_of(wanted_type, start: int) -> int:
        # Linear scan; indexing past the end raises IndexError on purpose.
        position = start
        while tokens[position].type != wanted_type:
            position += 1
        return position

    first_semicolon = _index_of(Token_type.SEMICOLON, 0)
    second_semicolon = _index_of(Token_type.SEMICOLON, first_semicolon + 1)
    opening_curly = _index_of(Token_type.LEFT_CURLY, second_semicolon + 1)

    variable_tokens = list(tokens[:first_semicolon + 1])
    condition_tokens = list(tokens[first_semicolon + 1:second_semicolon])
    increment_tokens = list(tokens[second_semicolon + 1:opening_curly])

    # drop the token right before the curly (the header's closing parenthesis)
    return variable_tokens, condition_tokens, increment_tokens[:-1]