import string # class ScanDFA(): # scans the input line by line, and char by char for each line # explicitly recognize whitespace when scanning, but discard whitespace tokens at the end ##################### Token ########################################## # A Token is a pair (name, lexeme) class Token(): def __init__(self, name, lex): self.name = name self.lex = lex ##################### Joos Tokens Map ############################### # For tokens that are recognized as another name in the maximal munch scanner # e.g. all keywords are scanned as ID first, as well as boolean literals, and null # Key: lexeme, Value: name, use lexeme to reassign those tokens correct names idToTokenDict = dict({ # Keywords 'abstract': 'ABSTRACT', 'boolean': 'BOOLEAN', 'byte': 'BYTE', 'char': 'CHAR', 'class': 'CLASS', 'else': 'ELSE', 'extends': 'EXTENDS', 'final': 'FINAL', 'for': 'FOR', 'if': 'IF', 'implements': 'IMPLEMENTS', 'import': 'IMPORT', 'instanceof': 'INSTANCEOF', 'int': 'INT', 'interface': 'INTERFACE', 'native': 'NATIVE', 'new': 'NEW', 'package': 'PACKAGE', 'protected': 'PROTECTED', 'public': 'PUBLIC', 'return': 'RETURN', 'short': 'SHORT', 'static': 'STATIC', 'this': 'THIS', 'void': 'VOID', 'while': 'WHILE', # Literals 'true': 'LITERALBOOL', 'false': 'LITERALBOOL', 'null': 'NULL' }) def idsToTokens(tokens): for t in tokens: if t.name == 'ID': if t.lex in idToTokenDict: t.name = idToTokenDict.get(t.lex) # a set that contains keywords that are in Java but not in Joos wrongJavaKeyWordDict = { 'assert', 'break', 'case', 'catch', 'const', 'continue', 'default', 'do', 'double', 'enum', 'finally', 'float', 'goto', 'long', 'private', 'strictfp', 'switch', 'synchronized', 'throw', 'throws', 'transient', 'try', 'volatile', '_', 'Long', 'Float' } # a set that contains unsed operators or seperators wrongJavaOpDict = { 'ELLIPSIS', 'ARROW' } ######################## DFA Stuff ################################### ################# Joos DFA Tokens ################################### JoosDFATokens = set([ # Identifiers or Keywords (keywords are mapped to their token name in idToTokenDict above) 'ID', # Literals and names (note: 'this' is considered as a keyword) 'NUM', # number (excludes 0) 'ZERO', # 0 'LITERALBOOL', # true or false 'LITERALCHAR', # character e.g. 'c', includes escape characters? 'LITERALSTRING', # string e.g. "hello", includes escape sequences 'NULL', # null 'COMPID', # compound ids e.g. java.util.vectors 'IMPORTALL', # compound names that have the import all wildcard e.g. java.util.* # Operators: 'ASSIGN', # = 'GT', # > 'LT', # < 'NOT', # ! 'EQUAL', # == 'GE', # >= 'LE', # <= 'NE', # != 'AND', # && 'OR', # || 'ADD', # + 'SUB', # - 'MULT', # * 'DIV', # / 'MOD', # % 'BITAND', # & 'BITOR', # | # Separators: 'SEMICO', # ; 'LPAREN', # ( 'RPAREN', # ) 'LBRACK', # { 'RBRACK', # } 'LSQRBRACK', # [ 'RSQRBRACK', # ] 'COMMA', # , 'PERIOD', # . # Unused Seperator (TODO: refactor this) 'ELLIPSIS', # ... ]) ##################### Transition function ############################ # returns next state after transition on one input character # Note: recognize keywords as ID, then convert them to different tokens later def JoosTransition(input, state): if (state == 'WHITESPACE'): if (input == ' ' or ord(input) == 9 or ord(input) == 12): return 'WHITESPACE' else: return None elif state == 'NEWLINE': if input == '\n': return 'NEWLINE' if input == '\r': return 'NEWLINEC' return None elif state == 'NEWLINEC': if input == '\r' or ord(input) == 10: return 'NEWLINEC' if input == '\n': return 'NEWLINE' return None elif (state == 'START'): if (input.isalpha() or input == '_' or input == '$'): return 'ID' if (input.isdigit()): if (input == '0'): return 'ZERO' return 'NUM' # whitespace if (input == ' ' or ord(input) == 9 or ord(input) == 12): return 'WHITESPACE' # newline if input == '\n': return 'NEWLINE' if input == '\r': return 'NEWLINEC' # operators if (input == '='): return 'ASSIGN' if (input == '>'): return 'GT' if (input == '<'): return 'LT' if (input == '!'): return 'NOT' if (input == '+'): return 'ADD' if (input == '-'): return 'SUB' if (input == '*'): return 'MULT' if (input == '/'): return 'DIV' if (input == '&'): return 'BITAND' if (input == '|'): return 'BITOR' if (input == '%'): return 'MOD' # separators if (input == ';'): return 'SEMICO' if (input == '('): return 'LPAREN' if (input == ')'): return 'RPAREN' if (input == '{'): return 'LBRACK' if (input == '}'): return 'RBRACK' if (input == '['): return 'LSQRBRACK' if (input == ']'): return 'RSQRBRACK' if (input == ','): return 'COMMA' if (input == '.'): return 'PERIOD' # literals if (input == '\"'): return 'LSTRING' if (input == '\''): return 'LCHAR' # Handling all operators that are not allowed in Joos (some cases are handled elsewhere) elif state == 'ADD': if input == '+': return 'OPDISCARD' if input == '=': return 'OPDISCARD' return None elif state == 'SUB': if (input == '>'): return 'OPDISCARD' if input == '-': return 'OPDISCARD' if input == '=': return 'OPDISCARD' return None elif state == 'MULT': if input == '=': return 'OPDISCARD' return None elif state == 'MOD': if input == '=': return 'OPDISCARD' return None elif (state == 'ID'): # dealing with compound names if input == '.': return 'HALFCOMP' if (input.isalpha() or input.isdigit() or input == '_' or input == '$'): return 'ID' return None # Compound names elif state == 'HALFCOMP': if input.isalpha(): return 'COMPID' if input == '*': return 'IMPORTALL' if input == '/': return 'DIV' return None elif state == 'COMPID': if input == '.': return 'HALFCOMP' if input.isalpha() or input.isdigit(): return 'COMPID' return None elif (state == 'NUM'): if(input.isdigit()): return 'NUM' if input == '.': return 'FLOAT' # not accepted return None # string literal elif (state == 'LSTRING'): if (input == '\\'): return 'STRINGESC' if (input == '\"'): return 'LITERALSTRING' return 'LSTRING' elif (state == 'STRINGESC'): if input == 'u': return 'UNICODE' #going to be discarded if input.isdigit() and 0 <= int(input) <= 7: return 'LSTRING' if input not in ('n', 'r', 't', 'v', '\\','\'', '"', '?', 'b' ): return None return 'LSTRING' elif(state == 'CHAROCTAVEESC'): if input.isdigit() and 0 <= int(input) <= 7: return 'CHAROCTAVEESC1' if (input == '\''): return 'LITERALCHAR' return None elif(state == 'CHAROCTAVEESC1'): if input.isdigit() and 0 <= int(input) <= 7: return 'CHAREND' if (input == '\''): return 'LITERALCHAR' return None # char literal elif (state == 'LCHAR'): if (input == '\\'): return 'CHARESC' return 'CHAREND' elif (state == 'CHARESC'): if input.isdigit() and 0 <= int(input) <= 7: return 'CHAROCTAVEESC' if input not in ('n', 'r', 't', 'v', '\\','\'', '"', '?', 'b' ): return None return 'CHAREND' elif (state == 'CHAREND'): if (input == '\''): return 'LITERALCHAR' return None # length 2 operators elif (state == 'ASSIGN'): if (input == '='): return 'EQUAL' return None elif (state == 'GT'): if (input == '='): return 'GE' elif input == '>': return 'OPDISCARD' return None elif (state == 'LT'): if (input == '='): return 'LE' elif input == '<': return 'OPDISCARD' return None elif (state == 'NOT'): if (input == '='): return 'NE' return None elif (state == 'BITAND'): if (input == '&'): return 'AND' elif input == '=': return 'OPDISCARD' return None elif (state == 'BITOR'): if (input == '|'): return 'OR' elif input == '=': return 'OPDISCARD' return None # Comments elif(state == 'DIV'): if (input == '/'): return 'COMMENT' elif (input == '*'): return 'LCOMMENT' elif input == '=': return 'OPDISCARD' return None elif(state == 'LCOMMENT'): if (input == '*'): return 'LCOM2' return 'LCOMMENT' elif state == 'LCOM2': if input == '/': return 'RCOMMENT' if input == '*': return 'LCOM3' return 'LCOMMENT' elif state == 'LCOM3': if input == '/': return 'RCOMMENT' return 'LCOMMENT' elif state == 'COMMENT': if input == '\n': return 'NEWLINE' if input == '\r': return 'NEWLINEC' return 'COMMENT' # Recognizing unused operators to be filtered later elif(state == 'PERIOD'): if(input == '.'): return 'PERIOD2' return None elif(state == 'PERIOD2'): if(input == '.'): return 'ELLIPSIS' return None # Handling hexidecimal literals (error case) elif state == 'ZERO': if input == 'x': return 'HEXLITERAL' elif input == '.': return 'FLOAT' else: return None ##################### Other DFA elements ############################## #TODO: remove alphabets since it's unecessary in our DFA implementation specialChars = set(list(".;:,@{}()[]<>!?+-*/&|^%=''\"\\")) JoosAccept = JoosDFATokens.union({'WHITESPACE', 'COMMENT', 'NEWLINE', 'LCOMMENT', 'RCOMMENT', 'NEWLINEC', 'LCOM2', 'LCOM3'}) JoosStates = JoosAccept.union({'START', 'PERIOD2', 'HALFCOMP', 'HEXLITERAL', 'OPDISCARD', 'FLOAT', 'UNICODE'}) JoosAlphabet = set(string.ascii_lowercase).union(set(string.ascii_uppercase)).union(set(string.digits)).union(specialChars) ######################### DFA ####################################### class DFA (): def __init__(self, states, alphabet, transition, start, accept): self.states = states self.alphabet = alphabet self.transition = transition self.start = start self.accept = accept JoosDFA = DFA( states = JoosDFATokens, alphabet = JoosAlphabet, start = 'START', accept = JoosAccept, transition = JoosTransition ) ################### Simplified Maximal Munch ########################### def SMM(input, dfa): # list of tokens scanned scanned = [] while (input): seenInput = "" state = dfa.start while (input): # print("input is: ", input[0]) if ord(input[0]) > 127: return (None, "Not ASCII") newState = dfa.transition(input[0], state) if not newState: break seenInput += input[0] input = input[1:] state = newState if (state in dfa.accept): scanned.append(Token(state, seenInput)) else: if input: m = ord(input[0]) return (None, m) return (scanned, "success") ################# Scan ################################################ def scan(input): result = SMM(input, JoosDFA) tokens = result[0] # Handling error in munching if (tokens is None): return (None, "Error on Scanning character: " + str(result[1])) if (tokens): # Handle erroneous tokens (return None and error string) multiLineCommentFlag = False indexRange = len(tokens) for index,token in enumerate(tokens): # dealing with numbers that start with 0 (e.g. 09) if token.name == 'ZERO': if index < indexRange-1: if tokens[index+1].name == 'NUM': return (None, "wrong integer literal: starts with 0") # Checking integer range (does not cover all edge cases) # elif token.name == 'NUM' and index > 0 and tokens[index-1].name == 'SUB' and int(token.lex) > 2147483648: # return (None, "integer too small") # elif token.name == 'NUM' and int(token.lex) > 2147483647 and (index is 0 or tokens[index-1].name is not 'SUB'): # return (None, "interger too large") # dealing with keywords in Java but not in Joos elif token.name == 'ID' and token.lex in wrongJavaKeyWordDict: return (None, "keyword in Java but not in Joos") # dealing with operators and seperators in Java but not in Joos elif token.name in wrongJavaOpDict: return (None, "operator in Java but not in Joos") # TODO: need to figure out what is allowed in compID and refactor this checking for explicit this calls elif token.name == 'ID' and token.lex is 'this': return (None, "explicit this call not allowed") elif token.name == 'ID' and token.lex is 'super': return (None, "explicit super call not allowed") # Checking wrong keywords in compIDs elif token.name == 'COMPID': temp = token.lex.split('.') compIDIndexRange = len(temp) for i, t in enumerate(temp): if i == 0 and t in wrongJavaKeyWordDict: return (None, "wrong keyword in comp id") if i is not 0 and (t == 'Class' or t == 'class'): return (None, "wrong keyword in comp id") # Checking if the multi line comment has a closing tag if token.name == 'LCOMMENT': multiLineCommentFlag = True if token.name == 'RCOMMENT': multiLineCommentFlag = False # Checking for missing closing */ for multi line comments if multiLineCommentFlag is True: return (None, "missing closing characters for multi line comments") idsToTokens(tokens) # remove whitespace, newline characters and comments tokens = list(filter(lambda t: t.name not in ("WHITESPACE", "COMMENT", 'LCOMMENT', 'RCOMMENT', 'NEWLINE', 'NEWLINEC', 'LCOM2', 'LCOM3'), tokens)) for index,token in enumerate(tokens): # Checking integer range (does not cover all edge cases) if token.name == 'NUM' and index > 0 and tokens[index-1].name == 'SUB' and int(token.lex) > 2147483648: return (None, "integer too small") elif token.name == 'NUM' and int(token.lex) > 2147483647 and (index is 0 or tokens[index-1].name is not 'SUB'): return (None, "interger too large") return (tokens, "success")