Files
2021-04-16 11:46:32 +02:00

403 lines
16 KiB
Python

"""Lexer.py: Definition for Lexer class"""
from interpreter.Tokenizer import Tokenizer
from os import linesep
from draw.Iinstruction import *
from typing import List, Optional, Union, Tuple
import logging
from interpreter.function_scope import Function_scope
from interpreter._token import Token, Token_type
from errors.custom import JavaSyntaxError
def print_arr(arr):
    """Print a possibly nested list, one element per line, between brackets.

    A line containing "[" is printed first and a line containing "]" last.
    Elements that are themselves lists are printed recursively in the same
    way; every other element is handed to ``print`` unchanged.
    """
    print("[")
    for item in arr:
        # Lists are descended into, anything else is printed as-is.
        emit = print_arr if isinstance(item, list) else print
        emit(item)
    print("]")
class Lexer:
    """Group a flat token list into function scopes and their instructions.

    The token list is walked front to back. A line shaped like a function
    header opens a new ``Function_scope`` whose body is read up to the
    matching closing curly brace; every other top-level line is collected
    into a synthetic "<Global scope>" scope.
    """

    def __init__(self, tokens: List[Token]) -> None:
        """Store ``tokens`` and place the read position on the first token."""
        self._tokens = tokens
        self._token_index = 0
        self._scopes: List[Function_scope] = []
        # Instructions found outside of any function body are saved here.
        self._global_instructions = []

    def _peek(self, offset: int = 0) -> Optional[Token]:
        """Return the token ``offset`` positions away from the read position.

        Returns ``None`` when that position lies outside the token list.
        Negative offsets look backwards and never wrap around to the end of
        the list.
        """
        index = self._token_index + offset
        if 0 <= index < len(self._tokens):
            return self._tokens[index]
        return None

    def _consume(self) -> Optional[Token]:
        """Return the current token and step past it, or ``None`` at the end.

        Once the input is exhausted the read position is no longer advanced,
        so ``_peek(-1)`` keeps referring to the last token for error messages.
        """
        token = self._peek()
        if token is not None:
            self._token_index += 1
        return token

    def get_instructions(self):
        """Process the whole token list.

        Returns:
            A list with one ``Function_scope`` per function definition, in
            source order, followed by the global scope.

        Raises:
            JavaSyntaxError: if a construct is malformed or a block is never
                closed.
        """
        scopes = []
        while self._peek() is not None:
            line_tokens = self.get_line_tokens()
            if self._is_function_def(line_tokens):
                func_name, func_return_type, func_args = self._construct_function_header_from_tokens(line_tokens)
                current_scope = Function_scope(func_name, func_return_type, func_args)
                current_scope._add_instructions(self._get_instructions_in_scope())
                scopes.append(current_scope)
                continue
            # Something was declared in global scope. Lines that do not map to
            # an instruction yield None and are skipped, exactly as inside
            # function bodies.
            instruction = self._construct_instruction_from_tokens(line_tokens)
            if instruction is not None:
                self._global_instructions.append(instruction)
        self.add_globals_to_scope_list(scopes)
        return scopes

    def _get_instructions_in_scope(self):
        """Collect instructions up to the closing curly brace of the current block.

        Returns:
            The instructions of the block, in source order.

        Raises:
            JavaSyntaxError: if the input ends before the block is closed.
        """
        instructions = []
        while self._peek() is not None:
            line_tokens = self.get_line_tokens()
            instruction = self._construct_instruction_from_tokens(line_tokens)
            if instruction:
                instructions.append(instruction)
            # A line terminated by '}' closes this block.
            if line_tokens[-1].type == Token_type.RIGHT_CURLY:
                return instructions
        last_token = self._peek(-1)
        location = last_token.location if last_token is not None else "<end of input>"
        raise JavaSyntaxError(f"{location}: Missing right curly!")

    def get_tokens_until(self, delimiter_types: List[Token_type]) -> List[Token]:
        """Consume tokens up to and including the first delimiter token.

        If the input ends before a delimiter is found, all remaining tokens
        are returned.
        """
        tokens = []
        while (token := self._consume()) is not None:
            tokens.append(token)
            if token.type in delimiter_types:
                break
        return tokens

    def get_line_tokens(self):
        """Consume one "line": all tokens up to and including the next ';', '{' or '}'."""
        return self.get_tokens_until([Token_type.SEMICOLON, Token_type.LEFT_CURLY, Token_type.RIGHT_CURLY])

    def add_globals_to_scope_list(self, scope_list: List[Function_scope]):
        """Append a "<Global scope>" scope holding the global instructions to ``scope_list``."""
        global_scope = Function_scope("<Global scope>", "void", [])
        global_scope._add_instructions(self._global_instructions)
        scope_list.append(global_scope)

    def _is_function_def(self, tokens: List[Token]) -> bool:
        """Return True if ``tokens`` has the shape TYPE_NAME IDENTIFIER ( ... {."""
        # Shorter lines cannot have that shape; checking the length first also
        # keeps the index accesses below in bounds.
        if len(tokens) < 4:
            return False
        return (
            tokens[0].type == Token_type.TYPE_NAME
            and tokens[1].type == Token_type.UNKNOWN
            and tokens[2].type == Token_type.LEFT_PAREN
            and tokens[-1].type == Token_type.LEFT_CURLY
        )

    def _construct_instruction_from_tokens(self, tokens: List[Token]):
        """Dispatch on the first token of a line and build the matching instruction.

        Returns:
            The constructed instruction, or ``None`` if the first token does
            not start any construct handled here (for example a lone '}').
        """
        instruction_token = tokens[0]
        if instruction_token.type == Token_type.IF_STATEMENT:
            return self._handle_if_construct(tokens)
        elif instruction_token.type == Token_type.WHILE_STATEMENT:
            return self._handle_while_construct(tokens)
        elif instruction_token.type == Token_type.DO_WHILE_STATEMENT:
            return self._handle_do_while_construct(tokens)
        elif instruction_token.type == Token_type.FOR_STATEMENT:
            return self._handle_for_construct(tokens)
        elif instruction_token.type == Token_type.TYPE_NAME:
            return self._handle_type_name_construct(tokens)
        elif instruction_token.type == Token_type.UNKNOWN:
            return self._handle_generic_construct(tokens)
        return None

    def _construct_function_header_from_tokens(self, tokens: List[Token]) -> Tuple[str, str, List[str]]:
        """Validate a function header and split it into its parts.

        Returns:
            A ``(name, return_type, argument_list)`` tuple.

        Raises:
            JavaSyntaxError: if the header is malformed.
        """
        _ensure_correct_function_structure(tokens)
        function_return_type = tokens[0].content
        function_name = tokens[1].content
        # tokens[3:-2] is everything between the parentheses.
        argument_list = _get_function_argument_list_from_tokens(tokens[3:-2])
        return function_name, function_return_type, argument_list

    def _construct_variable_def_from_tokens(self, tokens: List[Token]) -> str:
        """Describe a variable declaration, with or without initial value, as text.

        Raises:
            JavaSyntaxError: if the declaration is malformed.
        """
        _ensure_correct_variable_structure(tokens)
        variable_type = tokens[0].content
        variable_name = tokens[1].content
        if tokens[2].type == Token_type.SEMICOLON:
            return f"declare variable '{variable_name}' of type {variable_type}"
        # tokens[2] is the equal sign, the value expression starts after it.
        variable_value = self._construct_source_line_from_tokens(tokens[3:])
        return f"declare variable '{variable_name}' of type {variable_type} with value {variable_value}"

    def _construct_source_line_from_tokens(self, tokens: List[Token]) -> str:
        """Join the token contents with single spaces, stopping at the first semicolon."""
        # TODO: make this function smarter
        contents = []
        for token in tokens:
            if token.type == Token_type.SEMICOLON:
                break
            contents.append(token.content)
        return ' '.join(contents)

    # Handler functions for different types of language structures

    def _handle_if_construct(self, tokens: List[Token]):
        """Build an if instruction from its header tokens and the following block(s)."""
        logging.debug("Found if construct")
        _ensure_correct_if_structure(tokens)
        # tokens[2:-2] is the condition between the parentheses.
        condition_str = self._construct_source_line_from_tokens(tokens[2:-2])
        true_case = self._get_instructions_in_scope()
        false_case = self._handle_else_construct()
        return if_instruction(condition_str, true_case, false_case)

    def _handle_else_construct(self):
        """Read the else / else-if branch following an if block, if there is one.

        Returns:
            The instructions of the else branch, a one-element list holding
            the nested if instruction for ``else if``, or ``None`` when the
            next token is not an else (or the input has ended).
        """
        next_token = self._peek()
        if next_token is None or next_token.type != Token_type.ELSE_STATEMENT:
            return None
        token_after_else = self._peek(1)
        if token_after_else is not None and token_after_else.type == Token_type.IF_STATEMENT:
            logging.debug("Found if-else construct")
            # Drop the leading 'else' so the rest is handled like a plain if.
            else_if_tokens = self.get_line_tokens()[1:]
            return [self._handle_if_construct(else_if_tokens)]
        logging.debug("Found else construct")
        # Consume the else header itself before reading its block.
        self.get_line_tokens()
        return self._get_instructions_in_scope()

    def _handle_while_construct(self, tokens: List[Token]):
        """Build a head-controlled while instruction from header tokens and the following block."""
        logging.debug("Found while construct")
        _ensure_correct_while_structure(tokens)
        condition_str = self._construct_source_line_from_tokens(tokens[2:-2])
        loop_instructions = self._get_instructions_in_scope()
        return while_instruction_front(condition_str, loop_instructions)

    def _handle_do_while_construct(self, tokens: List[Token]):
        """Build a do-while instruction: ``do {`` block followed by ``while ( ... );``."""
        logging.debug("Found do-while construct")
        _ensure_correct_do_while_structure_part_1(tokens)
        loop_instructions = self._get_instructions_in_scope()
        # The condition follows the block, on its own "line".
        while_tokens = self.get_line_tokens()
        _ensure_correct_do_while_structure_part_2(while_tokens)
        condition_str = self._construct_source_line_from_tokens(while_tokens[2:-2])
        return while_instruction_back(condition_str, loop_instructions)

    def _handle_for_construct(self, tokens: List[Token]):
        """Build a for instruction from the loop header and the following block.

        ``tokens`` holds the header only up to the delimiter at which
        ``get_line_tokens`` stopped; the rest of the header is read here.
        The increment expression, if present, is appended to the loop body
        as its last instruction.
        """
        logging.debug("Found for construct")
        # Read on up to the opening curly. A new list is built so the caller's
        # token list is left unmodified.
        header_tokens = [*tokens, *self.get_tokens_until([Token_type.LEFT_CURLY])]
        _ensure_correct_for_structure(header_tokens)
        variable_tokens, condition_tokens, increment_tokens = _get_for_arguments_from_tokens(header_tokens[2:])
        variable_str = ""
        if len(variable_tokens) > 1:  # if we got more than just a semicolon
            variable_str = self._construct_variable_def_from_tokens(variable_tokens)
        condition_str = "true"
        if condition_tokens:
            condition_str = self._construct_source_line_from_tokens(condition_tokens)
        increment_instruction = None
        if increment_tokens:
            increment_instruction = generic_instruction(self._construct_source_line_from_tokens(increment_tokens))
        loop_instructions = self._get_instructions_in_scope()
        if increment_instruction:
            loop_instructions.append(increment_instruction)
        return for_instruction(variable_str, condition_str, loop_instructions)

    def _handle_type_name_construct(self, tokens: List[Token]):
        """Build a generic instruction describing a variable declaration."""
        logging.debug("Found Type name construct")
        return generic_instruction(self._construct_variable_def_from_tokens(tokens))

    def _handle_generic_construct(self, tokens: List[Token]):
        """Build a generic instruction from the line's source text."""
        logging.debug("Found generic instruction")
        return generic_instruction(self._construct_source_line_from_tokens(tokens))
def _ensure_correct_function_structure(tokens: List[Token]):
    """Validate that ``tokens`` form a function header.

    Expected shape: TYPE_NAME IDENTIFIER ( ... ) {

    Raises:
        JavaSyntaxError: on the first violated rule; the message starts with
            the location of the offending token.
    """
    if len(tokens) < 5:
        raise JavaSyntaxError(f"{tokens[0].location}: Ill-formed function declaration! Expected at least 5 tokens, got {len(tokens)}")
    if tokens[-1].type != Token_type.LEFT_CURLY:
        raise JavaSyntaxError(f"{tokens[-1].location}: Ill-formed function declaration! Expected last token to be LEFT_CURLY, got {str(tokens[-1].type)}")
    if tokens[1].type != Token_type.UNKNOWN:
        raise JavaSyntaxError(f"{tokens[1].location}: Illegal token after function return type! Expected UNKNWON, got {str(tokens[1].type)}")
    if tokens[2].type != Token_type.LEFT_PAREN:
        # The token after the name must open the parameter list, so the
        # message names LEFT_PAREN (it used to claim LEFT_CURLY).
        raise JavaSyntaxError(f"{tokens[2].location}: Illegal token after funtion name! Expected LEFT_PAREN, got {str(tokens[2].type)}")
    if tokens[-2].type != Token_type.RIGTH_PAREN:
        raise JavaSyntaxError(f"{tokens[-2].location}: Illegal token after function parameter list! Expected RIGHT_PAREN, got {str(tokens[-2].type)}")
def _ensure_correct_variable_structure(tokens: List[Token]):
    """Validate that ``tokens`` form a variable declaration.

    Expected shape: TYPE_NAME IDENTIFIER followed by either ``;`` or
    ``= EXPRESSION ;``.

    Raises:
        JavaSyntaxError: on the first violated rule.
    """
    token_count = len(tokens)
    if token_count < 3:
        raise JavaSyntaxError(f"{tokens[0].location}: Ill-formed type construct! Expected at least 3 tokens, got {token_count}")

    # With three or more tokens both positions inspected below exist.
    name_token, follow_token = tokens[1], tokens[2]
    if name_token.type != Token_type.UNKNOWN:
        raise JavaSyntaxError(f"{name_token.location}: Illegal token after type name! Expected UNKNOWN, got {str(name_token.type)}")
    if follow_token.type not in [Token_type.SEMICOLON, Token_type.EQUAL_SIGN]:
        raise JavaSyntaxError(f"{follow_token.location}: Illegal token after variable name! Expected SEMICOLON or EQUAL_SIGN, got {str(follow_token.type)}")

    # An assignment needs at least: type, name, '=', one value token, ';'.
    is_assignment = follow_token.type == Token_type.EQUAL_SIGN
    if is_assignment and token_count < 5:
        raise JavaSyntaxError(f"{follow_token.location}: Ill-formed assignment expression! Expected at least 5 tokens, got {token_count}")
def _ensure_correct_if_structure(tokens: List[Token]):
    """Validate that ``tokens`` form an if header of the shape ``if ( ... ) {``.

    The opening curly brace is technically optional in the language, but it
    is required here anyway.

    Raises:
        JavaSyntaxError: on the first violated rule.
    """
    token_count = len(tokens)
    if token_count < 5:
        raise JavaSyntaxError(f"{tokens[0].location}: Ill-formed if construct! Expected at least 5 tokens, got {token_count}")

    # With five or more tokens every position inspected below exists.
    paren_open, paren_close, block_open = tokens[1], tokens[-2], tokens[-1]
    if block_open.type != Token_type.LEFT_CURLY:
        raise JavaSyntaxError(f"{block_open.location}: Ill-formed if construct! Expected last token to be LEFT_CURLY, got {str(block_open.type)}")
    if paren_open.type != Token_type.LEFT_PAREN:
        raise JavaSyntaxError(f"{paren_open.location}: Illegal token after if token! Expected LEFT_PAREN, got {str(paren_open.type)}")
    if paren_close.type != Token_type.RIGTH_PAREN:
        raise JavaSyntaxError(f"{paren_close.location}: Illegal token after conditional expression! Expected RIGHT_PAREN, got {str(paren_close.type)}")
def _ensure_correct_while_structure(tokens: List[Token]):
    """Validate that ``tokens`` form a while header of the shape ``while ( ... ) {``.

    The opening curly brace might not be required by the language standard,
    but it is required here.

    Raises:
        JavaSyntaxError: on the first violated rule.
    """
    token_count = len(tokens)
    if token_count < 5:
        raise JavaSyntaxError(f"{tokens[0].location}: Ill-formed while construct! Expected at least 5 tokens, got {token_count}")

    # With five or more tokens every position inspected below exists.
    paren_open, paren_close, block_open = tokens[1], tokens[-2], tokens[-1]
    if block_open.type != Token_type.LEFT_CURLY:
        raise JavaSyntaxError(f"{block_open.location}: Ill-formed while construct! Expected last token to be LEFT_CURLY, got {str(block_open.type)}")
    if paren_open.type != Token_type.LEFT_PAREN:
        raise JavaSyntaxError(f"{paren_open.location}: Illegal token after while token! Expected LEFT_PAREN, got {str(paren_open.type)}")
    if paren_close.type != Token_type.RIGTH_PAREN:
        raise JavaSyntaxError(f"{paren_close.location}: Illegal token after while condition! Expected RIGHT_PAREN, got {str(paren_close.type)}")
def _ensure_correct_do_while_structure_part_1(tokens: List[Token]):
    """Validate the opening part of a do-while loop, which must be exactly ``do {``.

    Raises:
        JavaSyntaxError: if there are not exactly two tokens, or the second
            one is not an opening curly brace.
    """
    token_count = len(tokens)
    if token_count != 2:
        raise JavaSyntaxError(f"{tokens[0].location}: Ill-formed do-while construct! Expected 2 tokens, got {token_count}")

    block_open_type = tokens[1].type
    if block_open_type != Token_type.LEFT_CURLY:
        raise JavaSyntaxError(f"Illegal token after do token! Expected LEFT_CURLY, got {str(block_open_type)}")
def _ensure_correct_do_while_structure_part_2(tokens: List[Token]):
    """Validate the closing part of a do-while loop: ``while ( ... ) ;``.

    Raises:
        JavaSyntaxError: on the first violated rule. This includes an empty
            ``tokens`` list, which is what the caller passes when the input
            ends right after the do block.
    """
    if len(tokens) < 5:
        # There is no token to take a location from when the list is empty,
        # so the location prefix is omitted in that case.
        location_prefix = f"{tokens[0].location}: " if tokens else ""
        raise JavaSyntaxError(f"{location_prefix}Ill-formed do while contruct! Expected at least 5 tokens, got {len(tokens)}")
    if tokens[0].type != Token_type.WHILE_STATEMENT:
        # Report the token that was actually checked (index 0, not 1).
        raise JavaSyntaxError(f"{tokens[0].location}: Illegal token after do block! Expected WHILE_STATEMENT, got {str(tokens[0].type)}")
    if tokens[-1].type != Token_type.SEMICOLON:
        raise JavaSyntaxError(f"{tokens[-1].location}: Ill-formed do-while construct! Expected last token to be SEMICOLON, got {str(tokens[-1].type)}")
    if tokens[1].type != Token_type.LEFT_PAREN:
        raise JavaSyntaxError(f"{tokens[1].location}: Illegal token after while token! Expected LEFT_PAREN, got {str(tokens[1].type)}")
    if tokens[-2].type != Token_type.RIGTH_PAREN:
        raise JavaSyntaxError(f"{tokens[-2].location}: Illegal token after do-while condition! Expected RIGHT_PAREN, got {str(tokens[-2].type)}")
def _ensure_correct_for_structure(tokens: List[Token]):
    """Validate that ``tokens`` form a for header: ``for ( ...? ; ...? ; ...? ) {``.

    Each of the three header parts may be empty, but both separating
    semicolons must be present.

    Raises:
        JavaSyntaxError: on the first violated rule.
    """
    if len(tokens) < 6:
        raise JavaSyntaxError(f"{tokens[0].location}: Illf-formed for loop construct! Expected at least 6 tokens, got {len(tokens)}")
    if tokens[-1].type != Token_type.LEFT_CURLY:
        raise JavaSyntaxError(f"{tokens[-1].location}: Ill-formed for loop construct! Expected last token to be LEFT_CURLY, got {str(tokens[-1].type)}")
    if tokens[1].type != Token_type.LEFT_PAREN:
        raise JavaSyntaxError(f"{tokens[1].location}: Illegal token after for token! Expected LEFT_PAREN, got {str(tokens[1].type)}")
    if tokens[-2].type != Token_type.RIGTH_PAREN:
        raise JavaSyntaxError(f"{tokens[-2].location}: Illegal token after for loop increment! Expected RIGHT_PAREN, got {str(tokens[-2].type)}")
    # Count by token type, like every other check here, instead of comparing
    # against a freshly constructed Token (which depends on how Token
    # implements equality and on its constructor defaults).
    semicolon_count = sum(1 for token in tokens if token.type == Token_type.SEMICOLON)
    if semicolon_count != 2:
        raise JavaSyntaxError(f"Ill-formed for loop construct! Expected exactly 2 SEMICOLON tokens, got {semicolon_count}")
def _get_function_argument_list_from_tokens(tokens: List[Token]) -> List[str]:
    """Turn the tokens of a parameter list into one string per parameter.

    Parameters are separated at COMMA tokens; the contents of the tokens
    belonging to one parameter are joined with single spaces.

    Returns:
        One string per parameter, or an empty list when ``tokens`` is empty
        (a function without parameters).
    """
    # Without this guard an empty parameter list would be reported as a
    # single parameter consisting of the empty string.
    if not tokens:
        return []
    arg_tokens = _get_seperated_token_list(tokens, [Token_type.COMMA])
    return [' '.join(token.content for token in arg) for arg in arg_tokens]
def _get_seperated_token_list(tokens: List[Token], seperator_types: List[Token_type]) -> List[List[Token]]:
    """Split ``tokens`` into segments at every token whose type is a separator.

    Separator tokens themselves are dropped. The result always contains at
    least one segment; adjacent separators and a trailing separator produce
    empty segments.
    """
    segments = [[]]
    for token in tokens:
        if token.type in seperator_types:
            # A separator closes the current segment and opens a new one.
            segments.append([])
        else:
            segments[-1].append(token)
    return segments
def _get_for_arguments_from_tokens(tokens: List[Token]) -> Tuple[List[Token], List[Token], List[Token]]:
    """Split the tokens following ``for (`` into the three parts of the loop header.

    Returns:
        ``(variable_tokens, condition_tokens, increment_tokens)`` where
        ``variable_tokens`` still ends with its semicolon, ``condition_tokens``
        excludes its semicolon, and ``increment_tokens`` excludes both the
        opening curly brace and the token directly in front of it.

    Raises:
        IndexError: if ``tokens`` ends before both semicolons and the opening
            curly brace have been seen.
    """
    position = 0

    def take_through(stop_type):
        """Collect tokens from the current position up to and including ``stop_type``."""
        nonlocal position
        collected = []
        while True:
            current = tokens[position]
            position += 1
            collected.append(current)
            if current.type == stop_type:
                return collected

    variable_tokens = take_through(Token_type.SEMICOLON)
    condition_tokens = take_through(Token_type.SEMICOLON)[:-1]
    increment_tokens = take_through(Token_type.LEFT_CURLY)[:-1]
    # The last remaining increment token is the one in front of the curly
    # brace (the closing parenthesis of a well-formed header); drop it too.
    return variable_tokens, condition_tokens, increment_tokens[:-1]