From 3bc93fa3a74363cbd8a7dce026b889fc179d0087 Mon Sep 17 00:00:00 2001 From: weckyy702 Date: Thu, 1 Apr 2021 17:00:19 +0200 Subject: [PATCH] Added instruction generation code to Lexer --- draw/Iinstruction.py | 2 +- interpreter/Lexer.py | 156 +++++++++++++++++++++++++++------------ interpreter/Tokenizer.py | 10 ++- interpreter/_token.py | 54 ++++++++------ 4 files changed, 148 insertions(+), 74 deletions(-) diff --git a/draw/Iinstruction.py b/draw/Iinstruction.py index 79bcb54..0d2fd79 100644 --- a/draw/Iinstruction.py +++ b/draw/Iinstruction.py @@ -133,7 +133,7 @@ class if_instruction(Iinstruction): res += '\t'+str(inst)+";\n" res += "}" if self.false_case: - res += " else {" + res += " else {\n" for inst in self.true_case: res += '\t'+str(inst)+";\n" res += "}" diff --git a/interpreter/Lexer.py b/interpreter/Lexer.py index e7da0ca..3ca5ce5 100644 --- a/interpreter/Lexer.py +++ b/interpreter/Lexer.py @@ -1,4 +1,5 @@ """Lexer.py: Definition for Lexer class""" +from interpreter.Tokenizer import Tokenizer from os import linesep from draw.Iinstruction import * from typing import List, Optional, Union, Tuple @@ -43,48 +44,37 @@ class Lexer: if self._is_function_def(line_tokens): func_name, func_return_type, func_args = self._construct_function_header_from_tokens(line_tokens) + fs = Function_scope(func_name, func_return_type, func_args) + instructions = self._get_instructions_in_scope() - scopes.append(Function_scope(func_name, func_return_type, func_args)) - scopes.append(instructions) + fs._add_instructions(instructions) + + scopes.append(fs) else: - line = "" - for token in line_tokens[:-1]: - line += token.content + ' ' - - self._construct_instruction_from_tokens(line_tokens) - - scopes.append([line]) - + #something was declared in global scope + self._global_instructions.append(self._construct_instruction_from_tokens(line_tokens)) + self.add_globals_to_scope_list(scopes) return scopes def _get_instructions_in_scope(self): - lines = [] + instructions = [] while self._peek(): line_tokens = self.get_line_tokens() - - if len(line_tokens) > 1: - line = "" - for token in line_tokens[:-1]: - line += token.content + ' ' - - self._construct_instruction_from_tokens(line_tokens) - - lines.append(line) + instruction = self._construct_instruction_from_tokens(line_tokens) + if instruction: + instructions.append(instruction) delimiter_token = line_tokens[-1] - if delimiter_token.type == Token_type.RIGHT_CURLY: - return lines - if delimiter_token.type == Token_type.LEFT_CURLY: - instructions = self._get_instructions_in_scope() - for line in instructions: - lines.append('\t'+line) + return instructions + + raise JavaSyntaxError(f"Missing right curly!") @@ -96,6 +86,13 @@ class Lexer: break return tokens + + + def add_globals_to_scope_list(self, scope_list: List[Function_scope]): + global_scope = Function_scope("", "void", []) + global_scope._add_instructions(self._global_instructions) + + scope_list.append(global_scope) def _is_function_def(self, tokens: List[Token]) -> bool: @@ -103,45 +100,110 @@ class Lexer: return tokens[0].type == Token_type.TYPE_NAME and tokens[1].type == Token_type.UNKNOWN and tokens[2].type == Token_type.LEFT_PAREN and tokens[-1].type == Token_type.LEFT_CURLY + def _construct_instruction_from_tokens(self, tokens: List[Token]): instruction_token = tokens[0] if instruction_token.type == Token_type.IF_STATEMENT: - logging.debug("Found if construct") - for token in tokens: - print('\t', token) + return self._handle_if_construct(tokens) elif instruction_token.type == Token_type.WHILE_STATEMENT: - logging.debug("Found while construct") - for token in tokens: - print('\t', token) + return self._handle_while_construct(tokens) elif instruction_token.type == Token_type.DO_WHILE_STATEMENT: - logging.debug("Found do-while construct") - for token in tokens: - print('\t', token) + return self._handle_do_while_construct(tokens) elif instruction_token.type == Token_type.FOR_STATEMENT: - #TODO: change that - logging.debug("Found for construct") - tokens.extend(self.get_line_tokens()) - tokens.extend(self.get_line_tokens()) - for token in tokens: - print('\t', token) + return self._handle_for_construct(tokens) elif instruction_token.type == Token_type.TYPE_NAME: - logging.debug("Found Type name construct") - for token in tokens: - print('\t', token) + return self._handle_type_name_construct(tokens) elif instruction_token.type == Token_type.UNKNOWN: - logging.debug("Found generic instruction") - for token in tokens: - print('\t', token) + return self._handle_generic_construct(tokens) def _construct_function_header_from_tokens(self, tokens: List[Token]) -> Tuple[str, str, List[str]]: return "name", "return_type", ["int arg1", "String arg2"] + def _construct_variable_def_from_tokens(self, tokens: List[Token]) -> str: + #token_list: TYPE_NAME IDENTIFIER ;|( = EXPRESSION) + _ensure_correct_variable_structure(tokens) + if var_value: + return f"decalare variable '{'name'}' of type {'type'} type with value {'value'}" + return f"declare variable '{'name'}' of type {'type'}" + + """Handler functions for different types of language structures""" + + def _handle_if_construct(self, tokens: List[Token]): + logging.debug("Found if construct") + + true_case = self._get_instructions_in_scope() + false_case = self._handle_else_construct() + + return if_instruction("if_instruction", true_case, false_case) + + def _handle_else_construct(self): + if self._peek().type == Token_type.ELSE_STATEMENT: + if self._peek(1).type == Token_type.IF_STATEMENT: + logging.debug("Found if-else construct") + else_if_tokens = self.get_line_tokens()[1:] + return [self._handle_if_construct(else_if_tokens)] + else: + logging.debug("Found else construct") + self.get_line_tokens() + return self._get_instructions_in_scope() + return None + + def _handle_while_construct(self, tokens: List[Token]): + logging.debug("Found while construct") + + loop_instructions = self._get_instructions_in_scope() + + return while_instruction_front("while_instruction", loop_instructions) + + def _handle_do_while_construct(self, tokens: List[Token]): + logging.debug("Found do-while construct") + + loop_instructions = self._get_instructions_in_scope() + + self.get_line_tokens() + + return while_instruction_back("while_instruction_back", loop_instructions) + + + + def _handle_for_construct(self, tokens: List[Token]): + #TODO: change that + logging.debug("Found for construct") + tokens.extend(self.get_line_tokens()) + tokens.extend(self.get_line_tokens()) + + loop_instructions = self._get_instructions_in_scope() + + loop_instructions.append(generic_instruction("increment")) + + return for_instruction("for_instruction", loop_instructions) + + def _handle_type_name_construct(self, tokens: List[Token]): + logging.debug("Found Type name construct") + _ensure_correct_variable_structure(tokens) + return generic_instruction("type_name_construct") + + def _handle_generic_construct(self, tokens: List[Token]): + logging.debug("Found generic instruction") + + return generic_instruction("generic_instruction") + +def _ensure_correct_variable_structure(tokens: List[Token]): + #variable structure: TYPE_NAME IDENTIFIER ;|( = EXPRESSION) + if len(tokens) < 3: + raise JavaSyntaxError(f"{tokens[0].location}: Ill-formed type construct! Expected at least 3 tokens, got {len(tokens)}") + if tokens[1].type != Token_type.UNKNOWN: + raise JavaSyntaxError(f"{tokens[1].location}: Illegal token after type name! Expected UNKNOWN, got {str(tokens[1].type)}") + if not tokens[2].type in [Token_type.SEMICOLON, Token_type.EQUAL_SIGN]: + raise JavaSyntaxError(f"{tokens[2].location}: Illegal token after variable name! Expected SEMICOLON or EQUAL_SIGN, got {str(tokens[2].type)}") + if tokens[2].type == Token_type.EQUAL_SIGN and len(tokens) < 5: + raise JavaSyntaxError(f"{tokens[2].location}: Ill-formed assignment expression! Expected at least 5 tokens, got {len(tokens)}") # def get_scopes(self) -> List[Function_scope]: diff --git a/interpreter/Tokenizer.py b/interpreter/Tokenizer.py index f67f215..4fb208a 100644 --- a/interpreter/Tokenizer.py +++ b/interpreter/Tokenizer.py @@ -4,12 +4,12 @@ import logging import re from typing import List, Optional -from interpreter._token import Token, make_token +from interpreter._token import Token, make_token, SourceLocation class Tokenizer: """This class will take the provided source file and convert it to a list of tokens""" - TOKEN_MATCH = re.compile(r"""\(|\)|\{|\}|;|(\n)|\+|-|\*|/|<|>|,| """) #TODO: make this modular + TOKEN_MATCH = re.compile(r"""\(|\)|\{|\}|;|(\n)|\+|-|\*|/|<|>|,| """) def __init__(self, file_name: str) -> None: with open(file_name) as f: @@ -17,8 +17,12 @@ class Tokenizer: self.source_index = 0 self.line_number = 1 + self.source_text = re.sub("(private)|(public)|(protected)", "", self.source_text) + self.type_name_pattern = re.compile('(char)|(int)|(void)|(double)|(Pixel)') #TODO: make this modular + self._filename = file_name + def get_tokens(self) -> List[Token]: tokens = [] @@ -33,7 +37,7 @@ class Tokenizer: token = self._get_token(char) logging.debug(f"found token \"{token}\" on line {self.line_number}") - tokens.append(make_token(token, self.type_name_pattern)) + tokens.append(make_token(token, SourceLocation(self._filename, self.line_number), self.type_name_pattern)) return tokens diff --git a/interpreter/_token.py b/interpreter/_token.py index 59653aa..1fad304 100644 --- a/interpreter/_token.py +++ b/interpreter/_token.py @@ -33,55 +33,63 @@ class Token_type(IntEnum): STRING_LITERAL=17 TYPE_NAME=18 +class SourceLocation: + def __init__(self, filename: str, line: int) -> None: + self.filename = filename + self.line = line + + def __str__(self) -> str: + return f"File {self.filename}, line {self.line}" + class Token: - def __init__(self, type: Token_type, content: Union[str, None]=None) -> None: + def __init__(self, type: Token_type, location: SourceLocation, content: Union[str, None]=None) -> None: self.type = type self.content = content + self.location = location def __str__(self) -> str: if self.content: return f"{str(self.type)}: {self.content}" return f"{self.type}" -def make_token(tag: str, type_name_pattern:re.Pattern) -> Token: +def make_token(tag: str, location: SourceLocation, type_name_pattern:re.Pattern) -> Token: if tag == '(': - return Token(Token_type.LEFT_PAREN, tag) + return Token(Token_type.LEFT_PAREN, location, tag) elif tag == ')': - return Token(Token_type.RIGTH_PAREN, tag) + return Token(Token_type.RIGTH_PAREN, location, tag) elif tag == '{': - return Token(Token_type.LEFT_CURLY, tag) + return Token(Token_type.LEFT_CURLY, location, tag) elif tag == '}': - return Token(Token_type.RIGHT_CURLY, tag) + return Token(Token_type.RIGHT_CURLY, location, tag) elif tag == '[': - return Token(Token_type.LEFT_BRACKET, tag) + return Token(Token_type.LEFT_BRACKET, location, tag) elif tag == ']': - return Token(Token_type.RIGHT_BRACKET, tag) + return Token(Token_type.RIGHT_BRACKET, location, tag) elif tag == ',': - return Token(Token_type.COMMA, tag) + return Token(Token_type.COMMA, location, tag) elif tag == '=': - return Token(Token_type.EQUAL_SIGN, tag) + return Token(Token_type.EQUAL_SIGN, location, tag) elif tag == ';': - return Token(Token_type.SEMICOLON, tag) + return Token(Token_type.SEMICOLON, location, tag) elif MATH_OP_PATTERN.match(tag): - return Token(Token_type.MATH_OP, tag) + return Token(Token_type.MATH_OP, location, tag) elif NUMERIC_CONSTANT_PATTERN.match(tag): - return Token(Token_type.NUMERIC_CONSTANT, tag) + return Token(Token_type.NUMERIC_CONSTANT, location, tag) elif tag == "if": - return Token(Token_type.IF_STATEMENT, tag) + return Token(Token_type.IF_STATEMENT, location, tag) elif tag == "else": - return Token(Token_type.ELSE_STATEMENT, tag) + return Token(Token_type.ELSE_STATEMENT, location, tag) elif tag == "while": - return Token(Token_type.WHILE_STATEMENT, tag) + return Token(Token_type.WHILE_STATEMENT, location, tag) elif tag == "do": - return Token(Token_type.DO_WHILE_STATEMENT, tag) + return Token(Token_type.DO_WHILE_STATEMENT, location, tag) elif tag == "for": - return Token(Token_type.FOR_STATEMENT, tag) + return Token(Token_type.FOR_STATEMENT, location, tag) elif KEYWORD_PATTERN.match(tag): - return Token(Token_type.KEY_WORD, tag) + return Token(Token_type.KEY_WORD, location, tag) elif STRING_LITERAL_PATTERN.match(tag): - return Token(Token_type.STRING_LITERAL, tag) + return Token(Token_type.STRING_LITERAL, location, tag) elif type_name_pattern.match(tag): - return Token(Token_type.TYPE_NAME, tag) + return Token(Token_type.TYPE_NAME, location, tag) else: - logging.info(f"found unknown token {tag}... Function or variable name?") - return Token(Token_type.UNKNOWN, tag) \ No newline at end of file + return Token(Token_type.UNKNOWN, location, tag) \ No newline at end of file