Scanning.py 16.39 KiB
import string
# class ScanDFA():
# scans the input line by line, and char by char for each line
# explicitly recognize whitespace when scanning, but discard whitespace tokens at the end
##################### Token ##########################################
# A Token is a pair (name, lexeme)
class Token():
def __init__(self, name, lex):
self.name = name
self.lex = lex
##################### Joos Tokens Map ###############################
# For tokens that are recognized as another name in the maximal munch scanner
# e.g. all keywords are scanned as ID first, as well as boolean literals, and null
# Key: lexeme, Value: name, use lexeme to reassign those tokens correct names
idToTokenDict = dict({
# Keywords
'abstract': 'ABSTRACT',
'boolean': 'BOOLEAN',
'byte': 'BYTE',
'char': 'CHAR',
'class': 'CLASS',
'else': 'ELSE',
'extends': 'EXTENDS',
'final': 'FINAL',
'for': 'FOR',
'if': 'IF',
'implements': 'IMPLEMENTS',
'import': 'IMPORT',
'instanceof': 'INSTANCEOF',
'int': 'INT',
'interface': 'INTERFACE',
'native': 'NATIVE',
'new': 'NEW',
'package': 'PACKAGE',
'protected': 'PROTECTED',
'public': 'PUBLIC',
'return': 'RETURN',
'short': 'SHORT',
'static': 'STATIC',
'void': 'VOID',
'while': 'WHILE',
# Literals
'true': 'LITERALBOOL',
'false': 'LITERALBOOL',
'null': 'NULL'
})
def idsToTokens(tokens):
for t in tokens:
if t.name == 'ID':
if t.lex in idToTokenDict:
t.name = idToTokenDict.get(t.lex)
# a set that contains keywords that are in Java but not in Joos
wrongJavaKeyWordDict = {
'assert',
'break',
'case',
'catch',
'const',
'continue',
'default',
'do',
'double',
'enum',
'finally',
'float',
'goto',
'long',
'private',
'strictfp',
'switch',
'super',
'synchronized',
'throw',
'throws',
'transient',
'try',
'volatile',
'_',
'Long',
'Float'
}
# a set that contains unsed operators or seperators
wrongJavaOpDict = {
'ELLIPSIS',
'ARROW'
}
######################## DFA Stuff ###################################
################# Joos DFA Tokens ###################################
JoosDFATokens = set([
# Identifiers or Keywords (keywords are mapped to their token name in idToTokenDict above)
'ID',
# Literals and names (note: 'this' is considered as a keyword)
'NUM', # number (excludes 0)
'ZERO', # 0
'NEGATIVENUM', # to do integer range checking, will be mapped to NUM after checking
'LITERALBOOL', # true or false
'LITERALCHAR', # character e.g. 'c', includes escape characters?
'LITERALSTRING', # string e.g. "hello", includes escape sequences
'NULL', # null
'COMPID', # compound ids e.g. java.util.vectors
'IMPORTALL', # compound names that have the import all wildcard e.g. java.util.*
# Operators:
'ASSIGN', # =
'GT', # >
'LT', # <
'NOT', # !
'EQUAL', # ==
'GE', # >=
'LE', # <=
'NE', # !=
'AND', # &&
'OR', # ||
'ADD', # +
'SUB', # -
'MULT', # *
'DIV', # /
'MOD', # %
'BITAND', # &
'BITOR', # |
# Separators:
'SEMICO', # ;
'LPAREN', # (
'RPAREN', # )
'LBRACK', # {
'RBRACK', # }
'LSQRBRACK', # [
'RSQRBRACK', # ]
'COMMA', # ,
'PERIOD', # .
# Unused Seperator (TODO: refactor this)
'ELLIPSIS', # ...
])
##################### Transition function ############################
# returns next state after transition on one input character
# Note: recognize keywords as ID, then convert them to different tokens later
def JoosTransition(input, state):
if (state == 'WHITESPACE'):
if (input == ' ' or ord(input) == 9 or ord(input) == 12):
return 'WHITESPACE'
else:
return None
elif state == 'NEWLINE':
if input == '\n':
return 'NEWLINE'
if input == '\r':
return 'NEWLINEC'
return None
elif state == 'NEWLINEC':
if input == '\r' or ord(input) == 10:
return 'NEWLINEC'
if input == '\n':
return 'NEWLINE'
return None
elif (state == 'START'):
if (input.isalpha() or input == '_' or input == '$'):
return 'ID'
if (input.isdigit()):
if (input == '0'):
return 'ZERO'
return 'NUM'
# whitespace
if (input == ' ' or ord(input) == 9 or ord(input) == 12):
return 'WHITESPACE'
# newline
if input == '\n':
return 'NEWLINE'
if input == '\r':
return 'NEWLINEC'
# operators
if (input == '='):
return 'ASSIGN'
if (input == '>'):
return 'GT'
if (input == '<'):
return 'LT'
if (input == '!'):
return 'NOT'
if (input == '+'):
return 'ADD'
if (input == '-'):
return 'SUB'
if (input == '*'):
return 'MULT'
if (input == '/'):
return 'DIV'
if (input == '&'):
return 'BITAND'
if (input == '|'):
return 'BITOR'
if (input == '%'):
return 'MOD'
# separators
if (input == ';'):
return 'SEMICO'
if (input == '('):
return 'LPAREN'
if (input == ')'):
return 'RPAREN'
if (input == '{'):
return 'LBRACK'
if (input == '}'):
return 'RBRACK'
if (input == '['):
return 'LSQRBRACK'
if (input == ']'):
return 'RSQRBRACK'
if (input == ','):
return 'COMMA'
if (input == '.'):
return 'PERIOD'
# literals
if (input == '\"'):
return 'LSTRING'
if (input == '\''):
return 'LCHAR'
# Handling all operators that are not allowed in Joos (some cases are handled elsewhere) and negative numbers
elif state == 'ADD':
if input == '+':
return 'OPDISCARD'
if input == '=':
return 'OPDISCARD'
return None
elif state == 'SUB':
if (input == '>'):
return 'OPDISCARD'
if input.isdigit():
return 'NEGATIVENUM'
if input == '-':
return 'OPDISCARD'
if input == '=':
return 'OPDISCARD'
return None
elif state == 'MULT':
if input == '=':
return 'OPDISCARD'
return None
elif state == 'MOD':
if input == '=':
return 'OPDISCARD'
return None
elif (state == 'ID'):
# dealing with compound names
if input == '.':
return 'HALFCOMP'
if (input.isalpha() or input.isdigit() or input == '_' or input == '$'):
return 'ID'
return None
# Compound names
elif state == 'HALFCOMP':
if input.isalpha():
return 'COMPID'
if input == '*':
return 'IMPORTALL'
if input == '/':
return 'DIV'
return None
elif state == 'COMPID':
if input == '.':
return 'HALFCOMP'
if input.isalpha() or input.isdigit() or input == '_' or input == '$':
return 'COMPID'
return None
elif (state == 'NUM'):
if(input.isdigit()):
return 'NUM'
if input == '.':
return 'FLOAT' # not accepted
return None
elif state == 'NEGATIVENUM':
if input.isdigit():
return 'NEGATIVENUM'
if input == '.':
return 'FLOAT' # not accepted
return None
# string literal
elif (state == 'LSTRING'):
if (input == '\\'):
return 'STRINGESC'
if (input == '\"'):
return 'LITERALSTRING'
return 'LSTRING'
elif (state == 'STRINGESC'):
if input == 'u':
return 'UNICODE' #going to be discarded
if input.isdigit() and 0 <= int(input) <= 7:
return 'LSTRING'
if input not in ('n', 'r', 't', 'v', '\\','\'', '"', '?', 'b' ):
return None
return 'LSTRING'
elif(state == 'CHAROCTAVEESC'):
if input.isdigit() and 0 <= int(input) <= 7:
return 'CHAROCTAVEESC1'
if (input == '\''):
return 'LITERALCHAR'
return None
elif(state == 'CHAROCTAVEESC1'):
if input.isdigit() and 0 <= int(input) <= 7:
return 'CHAREND'
if (input == '\''):
return 'LITERALCHAR'
return None
# char literal
elif (state == 'LCHAR'):
if (input == '\\'):
return 'CHARESC'
return 'CHAREND'
elif (state == 'CHARESC'):
if input.isdigit() and 0 <= int(input) <= 7:
return 'CHAROCTAVEESC'
if input not in ('n', 'r', 't', 'v', '\\','\'', '"', '?', 'b' ):
return None
return 'CHAREND'
elif (state == 'CHAREND'):
if (input == '\''):
return 'LITERALCHAR'
return None
# length 2 operators
elif (state == 'ASSIGN'):
if (input == '='):
return 'EQUAL'
return None
elif (state == 'GT'):
if (input == '='):
return 'GE'
elif input == '>':
return 'OPDISCARD'
return None
elif (state == 'LT'):
if (input == '='):
return 'LE'
elif input == '<':
return 'OPDISCARD'
return None
elif (state == 'NOT'):
if (input == '='):
return 'NE'
return None
elif (state == 'BITAND'):
if (input == '&'):
return 'AND'
elif input == '=':
return 'OPDISCARD'
return None
elif (state == 'BITOR'):
if (input == '|'):
return 'OR'
elif input == '=':
return 'OPDISCARD'
return None
# Comments
elif(state == 'DIV'):
if (input == '/'):
return 'COMMENT'
elif (input == '*'):
return 'LCOMMENT'
elif input == '=':
return 'OPDISCARD'
return None
elif(state == 'LCOMMENT'):
if (input == '*'):
return 'LCOM2'
return 'LCOMMENT'
elif state == 'LCOM2':
if input == '/':
return 'RCOMMENT'
if input == '*':
return 'LCOM3'
return 'LCOMMENT'
elif state == 'LCOM3':
if input == '/':
return 'RCOMMENT'
return 'LCOMMENT'
elif state == 'COMMENT':
if input == '\n':
return 'NEWLINE'
if input == '\r':
return 'NEWLINEC'
return 'COMMENT'
# Recognizing unused operators to be filtered later
elif(state == 'PERIOD'):
if(input == '.'):
return 'PERIOD2'
return None
elif(state == 'PERIOD2'):
if(input == '.'):
return 'ELLIPSIS'
return None
# Handling hexidecimal literals (error case)
elif state == 'ZERO':
if input == 'x':
return 'HEXLITERAL'
elif input == '.':
return 'FLOAT'
else:
return None
##################### Other DFA elements ##############################
#TODO: remove alphabets since it's unecessary in our DFA implementation
specialChars = set(list(".;:,@{}()[]<>!?+-*/&|^%=''\"\\"))
JoosAccept = JoosDFATokens.union({'WHITESPACE', 'COMMENT', 'NEWLINE', 'LCOMMENT', 'RCOMMENT', 'NEWLINEC', 'LCOM2', 'LCOM3'})
JoosStates = JoosAccept.union({'START', 'PERIOD2', 'HALFCOMP', 'HEXLITERAL', 'OPDISCARD', 'FLOAT', 'UNICODE'})
JoosAlphabet = set(string.ascii_lowercase).union(set(string.ascii_uppercase)).union(set(string.digits)).union(specialChars)
######################### DFA #######################################
class DFA ():
def __init__(self, states, alphabet, transition, start, accept):
self.states = states
self.alphabet = alphabet
self.transition = transition
self.start = start
self.accept = accept
JoosDFA = DFA(
states = JoosDFATokens,
alphabet = JoosAlphabet,
start = 'START',
accept = JoosAccept,
transition = JoosTransition
)
################### Simplified Maximal Munch ###########################
def SMM(input, dfa):
# list of tokens scanned
scanned = []
while (input):
seenInput = ""
state = dfa.start
while (input):
# print("input is: ", input[0])
if ord(input[0]) > 127:
return (None, "Not ASCII")
newState = dfa.transition(input[0], state)
if not newState:
break
seenInput += input[0]
input = input[1:]
state = newState
if (state in dfa.accept):
scanned.append(Token(state, seenInput))
else:
if input:
m = ord(input[0])
return (None, m)
return (scanned, "success")
################# Scan ################################################
def scan(input):
result = SMM(input, JoosDFA)
tokens = result[0]
# Handling error in munching
if (tokens is None):
return (None, "Error on Scanning character: " + str(result[1]))
if (tokens):
# Handle erroneous tokens (return None and error string)
multiLineCommentFlag = False
indexRange = len(tokens)
for index,token in enumerate(tokens):
# dealing with numbers that start with 0 (e.g. 09)
if token.name == 'ZERO':
if index < indexRange-1:
if tokens[index+1].name == 'NUM':
return (None, "wrong integer literal: starts with 0")
# Checking integer range
elif token.name == 'NEGATIVENUM' and int(token.lex) < -2147483648:
return (None, "negative int too large")
# dealing with keywords in Java but not in Joos
elif token.name == 'ID' and token.lex in wrongJavaKeyWordDict:
return (None, "keyword in Java but not in Joos")
# dealing with operators and seperators in Java but not in Joos
elif token.name in wrongJavaOpDict:
return (None, "operator in Java but not in Joos")
# TODO: need to figure out what is allowed in compID and refactor this checking for explicit this calls
elif token.name == 'ID' and token.lex is 'this':
return (None, "explicit this call not allowed")
# Checking wrong keywords in compIDs
elif token.name == 'COMPID':
temp = token.lex.split('.')
compIDIndexRange = len(temp)
for i, t in enumerate(temp):
if i == 0 and t in wrongJavaKeyWordDict:
return (None, "wrong keyword in comp id")
if i is not 0 and t == 'this':
return (None, "this cannot be in the middle of a compid")
# Checking if the multi line comment has a closing tag
if token.name == 'LCOMMENT':
multiLineCommentFlag = True
if token.name == 'RCOMMENT':
multiLineCommentFlag = False
# Checking for missing closing */ for multi line comments
if multiLineCommentFlag is True:
return (None, "missing closing characters for multi line comments")
idsToTokens(tokens)
# remove whitespace, newline characters and comments
tokens = list(filter(lambda t: t.name not in ("COMMENT", 'LCOMMENT', 'RCOMMENT', 'NEWLINE', 'NEWLINEC', 'LCOM2', 'LCOM3'), tokens))
for index,token in enumerate(tokens):
if token.name == 'NUM' and int(token.lex) > 2147483647 and (index is 0 or tokens[index-1].name is not 'SUB'):
return (None, "positive int too large")
tokens = list(filter(lambda t: t.name is not 'WHITESPACE', tokens))
tokensNew = []
for index, t in enumerate(tokens):
if t.name is 'NEGATIVENUM':
tokensNew.append(Token('SUB', '-'))
tokensNew.append(Token('NUM', t.lex[1:]))
else:
tokensNew.append(t)
return (tokensNew, "success")