"""Lexer for Java source: splits raw text into a flat list of Token objects.

Consolidated and repaired from a mangled patch that added a new Lexer
(interpreter/Lexer.py) plus a private token module (interpreter/_token.py).

NOTE(review): the original imported ``Function_scope`` (unused in the shown
code) and its class docstring claimed the lexer "generates Function_scopes",
but ``lex()`` returns Tokens — presumably scope grouping comes later; the
unused import is dropped here. Confirm against the rest of the project.
"""

import logging
import re
from enum import IntEnum
from typing import List, Optional

# Matches an unsigned integer literal or a boolean literal.
NUMERIC_CONSTANT_PATTERN = re.compile(r"(?:[0-9]+|true|false)")
# Reserved words that become KEY_WORD tokens.
KEYWORD_PATTERN = re.compile(r"(?:return|continue|break)")


class Token_type(IntEnum):
    """Kinds of lexical tokens produced by the Lexer."""

    UNKNOWN = -1
    LEFT_PAREN = 0
    RIGTH_PAREN = 1   # sic: original (misspelled) name kept for compatibility
    RIGHT_PAREN = 1   # correctly spelled alias of RIGTH_PAREN
    LEFT_CURLY = 2
    RIGHT_CURLY = 3
    LEFT_BRACKET = 4
    RIGHT_BRACKET = 5
    COMMA = 6
    NUMERIC_CONSTANT = 7
    IF_STATEMENT = 8
    WHILE_STATEMENT = 9
    DO_WHILE_STATEMENT = 10
    FOR_STATEMENT = 11
    KEY_WORD = 13
    STRING_LITERAL = 14


class Token:
    """A single lexical token: a type tag plus optional text content."""

    def __init__(self, type: Token_type, content: Optional[str] = None) -> None:
        self.type = type
        self.content = content

    def __repr__(self) -> str:
        return f"Token({self.type!r}, {self.content!r})"


# Single-character tags with a fixed token type.
_SIMPLE_TOKENS = {
    '(': Token_type.LEFT_PAREN,
    ')': Token_type.RIGTH_PAREN,
    '{': Token_type.LEFT_CURLY,
    '}': Token_type.RIGHT_CURLY,
    '[': Token_type.LEFT_BRACKET,
    ']': Token_type.RIGHT_BRACKET,
    ',': Token_type.COMMA,
}

# Words that introduce a statement construct.
_STATEMENT_TOKENS = {
    'if': Token_type.IF_STATEMENT,
    'while': Token_type.WHILE_STATEMENT,
    'do': Token_type.DO_WHILE_STATEMENT,
    'for': Token_type.FOR_STATEMENT,
}


def make_token(tag: str) -> Token:
    """Map a raw token string onto a :class:`Token`.

    Fixes versus the patch: uses ``fullmatch`` (``match`` wrongly accepted
    e.g. ``"true123"`` as a numeric constant by prefix), actually applies the
    previously-unused KEYWORD_PATTERN, and returns an UNKNOWN token instead
    of silently falling through to ``None``.
    """
    if tag in _SIMPLE_TOKENS:
        return Token(_SIMPLE_TOKENS[tag])
    if tag in _STATEMENT_TOKENS:
        return Token(_STATEMENT_TOKENS[tag])
    if NUMERIC_CONSTANT_PATTERN.fullmatch(tag):
        return Token(Token_type.NUMERIC_CONSTANT, tag)
    if KEYWORD_PATTERN.fullmatch(tag):
        return Token(Token_type.KEY_WORD, tag)
    # TODO: string literals, operators, identifiers.
    return Token(Token_type.UNKNOWN, tag)


class Lexer:
    """Lexes Java source text into a flat list of Tokens."""

    # Single characters that are tokens themselves and/or terminate a
    # multi-character token (same alternatives as the original pattern,
    # written as a character class in a raw string).
    TOKEN_MATCH = re.compile(r"[(){};\n+\-*/<>, ]")

    def __init__(self, file_name: str) -> None:
        # Read the whole file up front; lexing then walks the string.
        with open(file_name) as f:
            self.source_text = f.read()
        self.source_index = 0
        self.line_number = 1

    def lex(self) -> List[Token]:
        """Tokenize the entire source text and return the token list."""
        tokens: List[Token] = []

        while char := self._consume():
            if char.isspace():
                continue
            if self._handle_comments(char):
                continue

            token = self._get_token(char)
            # Lazy %-style args: no string formatting unless DEBUG is on.
            logging.debug('found token "%s" on line %d', token, self.line_number)
            # Fix: the append was commented out, so lex() always returned [].
            tokens.append(make_token(token))

        return tokens

    def _get_token(self, char: str) -> str:
        """Accumulate a multi-character token starting at *char*.

        A character matching TOKEN_MATCH is a complete token by itself;
        otherwise characters are consumed until the next such delimiter.
        """
        token = char
        if not Lexer.TOKEN_MATCH.match(token):
            while nxt := self._peek():
                if Lexer.TOKEN_MATCH.match(nxt):
                    break
                token += self._consume()
        return token

    def _handle_comments(self, char: str) -> bool:
        """Skip ``//`` line comments and ``/* */`` block comments.

        Returns True when *char* started a comment (which is then consumed).
        """
        if char == '/' and self._peek() == '/':
            self._get_line()  # skip the rest of the line
            return True
        if char == '/' and self._peek() == '*':
            self._consume()  # the '*'
            # Fix: the patch consumed only until the first '/', which broke
            # on comments containing '/'. Scan for the real "*/" terminator.
            while closing := self._consume():
                if closing == '*' and self._peek() == '/':
                    self._consume()
                    break
            return True
        return False

    def _get_line(self) -> str:
        """Consume and return text up to the next newline or ';'."""
        return self._consume_until(re.compile(r"[\n;]"))

    def _peek(self, offset: int = 0) -> str:
        """Return the char *offset* positions ahead without consuming.

        Returns '' past end of input.
        """
        pos = self.source_index + offset
        if pos >= len(self.source_text):
            return ''
        # Fix: the original ignored *offset* and always read source_index.
        return self.source_text[pos]

    def _consume(self) -> str:
        """Return the current char and advance, tracking line numbers."""
        char = self._peek()
        if char == '\n':
            self.line_number += 1
        self.source_index += 1
        return char

    def _consume_until(self, end_token) -> str:
        """Consume chars until *end_token* matches; return what was skipped.

        *end_token* is either a literal character (str) or a compiled
        re.Pattern. The terminating character is consumed but not returned.
        """
        res = ""
        if isinstance(end_token, str):
            while self._peek() and (ch := self._consume()) != end_token:
                res += ch
        elif isinstance(end_token, re.Pattern):
            while self._peek() and not end_token.match(ch := self._consume()):
                res += ch
        return res