112 lines
3.3 KiB
Python
112 lines
3.3 KiB
Python
"""Tokenizer.py: Definition for Tokenizer class"""
|
|
|
|
from errors.custom import JavaSyntaxError
|
|
import logging
|
|
import re
|
|
from typing import List, Optional
|
|
|
|
from interpreter._token import Token, make_token, SourceLocation
|
|
|
|
class Tokenizer:
    """Convert a Java source file into a flat list of tokens.

    The source text is read once in ``__init__`` and then scanned one
    character at a time.  Whitespace and ``//`` / ``/* */`` comments are
    skipped; everything else is grouped into tags that end at the next
    delimiter character (see ``TOKEN_MATCH``).

    Limitations visible in this class: string/char literals receive no special
    treatment, and characters that are not listed in ``TOKEN_MATCH`` (for
    example ``=``) do not end a token.
    """

    # Characters that are tokens on their own and that terminate the token
    # currently being scanned.  ``\s`` makes every whitespace character (tabs
    # included) a delimiter; the ``(\n)`` group is kept ahead of it so group 1
    # still identifies a newline match.
    TOKEN_MATCH = re.compile(r"""\(|\)|\{|\}|;|(\n)|\+|-|\*|/|<|>|,|\s""")

    # Modifiers that are dropped from the source before scanning.  The word
    # boundaries keep identifiers such as ``finalCount`` or ``finally`` intact.
    _MODIFIER_PATTERN = re.compile(r"\b(?:private|public|protected|final)\b")

    def __init__(self, file_name: str) -> None:
        """Read ``file_name`` and prepare the scanner state.

        Args:
            file_name: Path of the source file to tokenize.  It is also stored
                and attached to the location of every produced token.

        Raises:
            OSError: If the file cannot be opened or read.
        """
        with open(file_name) as f:
            self.source_text = f.read()

        # Scanner position: index of the next unread character, the current
        # 1-based line, and the number of characters consumed on that line.
        self.source_index = 0
        self.line_number = 1
        self.column_number = 0

        # Access/finality modifiers are removed up front, as whole words only.
        self.source_text = self._MODIFIER_PATTERN.sub("", self.source_text)

        # Handed to make_token for every tag; presumably used there to
        # recognise type names -- confirm against make_token.
        self.type_name_pattern = re.compile('(char)|(int)|(void)|(double)|(boolean)|(Pixel)|(String)')  # TODO: make this modular

        self._filename = file_name

        # Running counts of '{' and '}' tags, compared at the end of get_tokens.
        self._left_curly_number = 0
        self._right_curly_number = 0

    def get_tokens(self) -> "List[Token]":
        """Scan the remaining source text and return its tokens.

        The location attached to each token is the scanner position *after*
        the token's last character has been consumed.

        Returns:
            The tokens in source order, each built by ``make_token``.

        Raises:
            JavaSyntaxError: If the number of ``{`` tags differs from the
                number of ``}`` tags.
        """
        tokens = []

        while char := self._consume():
            if char.isspace():
                continue

            if self._handle_comments(char):
                continue

            tag = self._get_token(char)
            logging.debug("found tag \"%s\" on line %s", tag, self.line_number)

            if tag == "{":
                self._left_curly_number += 1
            elif tag == "}":
                self._right_curly_number += 1

            location = SourceLocation(self._filename, self.line_number, self.column_number)
            tokens.append(make_token(tag, location, self.type_name_pattern))

        if self._left_curly_number != self._right_curly_number:
            raise JavaSyntaxError(f"Ill-formed Java program! Expected equal number of '{'{'}' and '{'}'}' tokens, got {self._left_curly_number} and {self._right_curly_number}")

        return tokens

    def _get_token(self, char: str) -> str:
        """Return the full tag that starts with the already-consumed ``char``.

        A delimiter character is a complete tag by itself.  Any other
        character starts a tag that extends up to, but not including, the next
        delimiter or the end of the source.
        """
        if Tokenizer.TOKEN_MATCH.match(char):
            return char

        pieces = [char]
        while next_char := self._peek():
            if Tokenizer.TOKEN_MATCH.match(next_char):
                break
            pieces.append(self._consume())

        return "".join(pieces)

    def _handle_comments(self, char: str) -> bool:
        """Skip a comment if the already-consumed ``char`` starts one.

        Returns:
            True if a ``//`` or ``/* */`` comment was consumed, False if
            ``char`` does not begin a comment (nothing further is consumed).
        """
        if char != '/':
            return False

        following = self._peek()
        if following == '/':
            self._get_line()  # skip the entire line
            return True
        if following == '*':
            self._consume()  # the '*' that opens the block comment
            self._consume_multiline_comment()
            return True

        return False

    def _get_line(self) -> str:
        """Consume through the next newline and return the text before it."""
        return self._consume_until('\n')

    def _peek(self, offset: int = 0) -> str:
        """Return the character ``offset`` positions ahead without consuming it.

        Returns:
            The character at that position, or ``''`` when the position lies
            outside the source text.
        """
        position = self.source_index + offset
        if not 0 <= position < len(self.source_text):
            return ''
        return self.source_text[position]

    def _consume(self) -> str:
        """Consume and return the next character, updating line and column.

        Returns:
            The consumed character, or ``''`` at the end of the source, in
            which case the scanner position is left unchanged.
        """
        char = self._peek()
        if not char:
            return char

        self.source_index += 1
        if char == '\n':
            # Start of a new line: nothing consumed on it yet.
            self.line_number += 1
            self.column_number = 0
        else:
            self.column_number += 1

        return char

    def _consume_multiline_comment(self) -> None:
        """Consume up to and including the closing ``*/`` of a block comment.

        If the comment is never closed, the rest of the source is consumed
        and no error is raised.
        """
        while self._peek():
            if self._consume() == '*' and self._peek() == '/':
                self._consume()
                break

    def _consume_until(self, end_tag: str) -> str:
        """Consume characters until ``end_tag`` or the end of the source.

        ``end_tag`` itself is consumed but is not part of the result.

        Returns:
            The characters consumed before ``end_tag``.
        """
        pieces = []
        while self._peek() and (char := self._consume()) != end_tag:
            pieces.append(char)

        return "".join(pieces)