split interpretation into two steps: Tokenization and Lexing

This commit is contained in:
weckyy702
2021-03-31 19:47:37 +02:00
parent 53534ee707
commit 49f3c84e60
5 changed files with 462 additions and 138 deletions

92
interpreter/Tokenizer.py Normal file
View File

@@ -0,0 +1,92 @@
"""Tokenizer.py: Definition for Tokenizer class"""
import logging
import re
from typing import List, Optional
from interpreter._token import Token, make_token
class Tokenizer:
    """Convert the contents of a source file into a flat list of tokens.

    The whole file is read into memory up front; the scanner then walks it
    character by character, skipping whitespace and comments and grouping
    the remaining characters into tokens delimited by TOKEN_MATCH.
    """

    # Single-character tokens / token separators.  Matching any one of these
    # terminates the token currently being accumulated.
    TOKEN_MATCH = re.compile(r"""\(|\)|\{|\}|;|(\n)|\+|-|\*|/|<|>|,| """) #TODO: make this modular

    def __init__(self, file_name: str) -> None:
        """Load *file_name* into memory and reset the scan position.

        :param file_name: path of the source file to tokenize
        :raises OSError: if the file cannot be opened or read
        """
        # Explicit encoding so decoding does not depend on the platform default.
        with open(file_name, encoding="utf-8") as f:
            self.source_text = f.read()
        self.source_index = 0   # index of the next character to read
        self.line_number = 1    # 1-based, maintained for diagnostics
        # Known type names, consulted by make_token when classifying a token.
        self.type_name_pattern = re.compile('(char)|(int)|(void)|(double)|(Pixel)') #TODO: make this modular

    def get_tokens(self) -> "List[Token]":
        """Scan the whole source text and return the resulting Token list."""
        tokens = []
        while char := self._consume():
            if char.isspace():
                continue
            if self._handle_comments(char):
                continue
            token = self._get_token(char)
            # Lazy %-args: the message is only built when DEBUG logging is on.
            logging.debug('found token "%s" on line %s', token, self.line_number)
            tokens.append(make_token(token, self.type_name_pattern))
        return tokens

    def _get_token(self, char: str) -> str:
        """Accumulate a full token starting with the already-consumed *char*.

        A character matching TOKEN_MATCH is a complete token on its own;
        otherwise characters are appended until the next separator is seen
        (the separator itself is left unconsumed).
        """
        token = char
        if not Tokenizer.TOKEN_MATCH.match(token):
            while next_char := self._peek():
                if Tokenizer.TOKEN_MATCH.match(next_char):
                    break
                token += self._consume()
        return token

    def _handle_comments(self, char: str) -> bool:
        """Skip a comment introduced by *char*; return True if one was skipped.

        Supports ``//`` line comments and ``/* ... */`` block comments.
        """
        if char == '/' and self._peek() == '/':
            self._get_line() #skip the entire line
            return True
        elif char == '/' and self._peek() == '*':
            self._consume()  # step past the '*' that opens the comment
            self._consume_multiline_comment()
            return True
        return False

    def _get_line(self) -> str:
        """Consume and return the rest of the current line (newline excluded)."""
        return self._consume_until('\n')

    def _peek(self, offset: int = 0) -> str:
        """Return the character *offset* positions ahead without consuming it.

        Returns the empty string when looking past the end of the input.
        """
        if (self.source_index + offset) >= len(self.source_text):
            return ''
        # Bug fix: the offset was previously ignored when indexing, so every
        # _peek(n) with n > 0 returned the current character instead.
        return self.source_text[self.source_index + offset]

    def _consume(self) -> str:
        """Advance past the next character and return it ('' at end of input)."""
        char = self._peek()
        if char == '\n':
            self.line_number += 1
        self.source_index += 1
        return char

    def _consume_multiline_comment(self) -> None:
        """Consume characters until just past a closing '*/' (or end of input)."""
        while self._peek():
            if self._consume() == '*' and self._peek() == '/':
                self._consume()
                break

    def _consume_until(self, end_tag: str) -> str:
        """Consume and return characters up to *end_tag*.

        The *end_tag* character itself is consumed but not included in the
        result; stops early at end of input.
        """
        res = ""
        while self._peek() and (char := self._consume()) != end_tag:
            res += char
        return res