Scanning.py

import string

# class ScanDFA():
# scans the input line by line, and char by char for each line
# explicitly recognize whitespace when scanning, but discard whitespace tokens at the end

##################### Token ##########################################
# A Token is a pair (name, lexeme)
class Token():
    def __init__(self, name, lex):
        self.name = name
        self.lex = lex

##################### Joos Tokens Map ###############################

# For tokens that are recognized as another name in the maximal munch scanner
# e.g. all keywords are scanned as ID first, as well as boolean literals, and null
# Key: lexeme, Value: name, use lexeme to reassign those tokens correct names

idToTokenDict = dict({
    # Keywords
    'abstract': 'ABSTRACT',
    'boolean': 'BOOLEAN',
    'byte': 'BYTE',
    'char': 'CHAR',
    'class': 'CLASS',
    'else': 'ELSE',
    'extends': 'EXTENDS',
    'final': 'FINAL',
    'for': 'FOR',
    'if': 'IF',
    'implements': 'IMPLEMENTS',
    'import': 'IMPORT',
    'instanceof': 'INSTANCEOF',
    'int': 'INT',
    'interface': 'INTERFACE',
    'native': 'NATIVE',
    'new': 'NEW',
    'package': 'PACKAGE',
    'protected': 'PROTECTED',
    'public': 'PUBLIC',
    'return': 'RETURN',
    'short': 'SHORT',
    'static': 'STATIC',
    'void': 'VOID',
    'while': 'WHILE',

    # Literals
    'true': 'LITERALBOOL',
    'false': 'LITERALBOOL',
    'null': 'NULL'
    })

def idsToTokens(tokens):
    for t in tokens:
        if t.name == 'ID':
            if t.lex in idToTokenDict:
                t.name = idToTokenDict.get(t.lex)


# a set that contains keywords that are in Java but not in Joos
wrongJavaKeyWordDict = {
    'assert',
    'break',
    'case',
    'catch',
    'const',
    'continue',
    'default',
    'do',
    'double',
    'enum',
    'finally',
    'float',
    'goto',
    'long',
    'private',
    'strictfp',
    'switch',
    'super',
    'synchronized',
    'throw',
    'throws',
    'transient',
    'try',
    'volatile',
    '_',
    'Long',
    'Float'
}

# a set that contains unsed operators or seperators
wrongJavaOpDict = {
    'ELLIPSIS',
    'ARROW'
}

######################## DFA Stuff ###################################

#################  Joos DFA Tokens ###################################
JoosDFATokens = set([
    # Identifiers or Keywords (keywords are mapped to their token name in idToTokenDict above)
    'ID',

    # Literals and names (note: 'this' is considered as a keyword)
    'NUM',            # number (excludes 0)
    'ZERO',           # 0
    'NEGATIVENUM',    # to do integer range checking, will be mapped to NUM after checking
    'LITERALBOOL',    # true or false
    'LITERALCHAR',    # character e.g. 'c', includes escape characters?
    'LITERALSTRING',  # string e.g. "hello", includes escape sequences
    'NULL',           # null
    'COMPID',         # compound ids e.g. java.util.vectors
    'IMPORTALL',      # compound names that have the import all wildcard e.g. java.util.*

    # Operators:
    'ASSIGN',         # =
    'GT',             # >
    'LT',             # <
    'NOT',            # !
    'EQUAL',          # ==
    'GE',             # >=
    'LE',             # <=
    'NE',             # !=
    'AND',            # &&
    'OR',             # ||
    'ADD',            # +
    'SUB',            # -
    'MULT',           # *
    'DIV',            # /
    'MOD',            # %
    'BITAND',         # &
    'BITOR',          # |

    # Separators:
    'SEMICO',         # ;
    'LPAREN',         # (
    'RPAREN',         # )
    'LBRACK',         # {
    'RBRACK',         # }
    'LSQRBRACK',      # [
    'RSQRBRACK',      # ]
    'COMMA',          # ,
    'PERIOD',         # .

    # Unused Seperator (TODO: refactor this)
    'ELLIPSIS',       # ...

    ])

##################### Transition function ############################

# returns next state after transition on one input character
# Note: recognize keywords as ID, then convert them to different tokens later
def JoosTransition(input, state):
    if (state == 'WHITESPACE'):
        if (input == ' ' or ord(input) == 9 or ord(input) == 12):
            return 'WHITESPACE'
        else:
            return None
    elif state == 'NEWLINE':
        if input == '\n':
            return 'NEWLINE'
        if input == '\r':
            return 'NEWLINEC'
        return None
    elif state == 'NEWLINEC':
        if input == '\r' or ord(input) == 10:
            return 'NEWLINEC'
        if input == '\n':
            return 'NEWLINE'
        return None

    elif (state == 'START'):
        if (input.isalpha() or input == '_' or input == '$'):
            return 'ID'
        if (input.isdigit()):
            if (input == '0'):
                return 'ZERO'
            return 'NUM'

        # whitespace
        if (input == ' ' or ord(input) == 9 or ord(input) == 12):
            return 'WHITESPACE'
        # newline
        if input == '\n':
            return 'NEWLINE'
        if input == '\r':
            return 'NEWLINEC'

        # operators
        if (input == '='):
            return 'ASSIGN'
        if (input == '>'):
            return 'GT'
        if (input == '<'):
            return 'LT'
        if (input == '!'):
            return 'NOT'
        if (input == '+'):
            return 'ADD'
        if (input == '-'):
            return 'SUB'
        if (input == '*'):
            return 'MULT'
        if (input == '/'):
            return 'DIV'
        if (input == '&'):
            return 'BITAND'
        if (input == '|'):
            return 'BITOR'
        if (input == '%'):
            return 'MOD'

        # separators
        if (input == ';'):
            return 'SEMICO'
        if (input == '('):
            return 'LPAREN'
        if (input == ')'):
            return 'RPAREN'
        if (input == '{'):
            return 'LBRACK'
        if (input == '}'):
            return 'RBRACK'
        if (input == '['):
            return 'LSQRBRACK'
        if (input == ']'):
            return 'RSQRBRACK'
        if (input == ','):
            return 'COMMA'
        if (input == '.'):
            return 'PERIOD'

        # literals
        if (input == '\"'):
            return 'LSTRING'
        if (input == '\''):
            return 'LCHAR'

    # Handling all operators that are not allowed in Joos (some cases are handled elsewhere) and negative numbers
    elif state == 'ADD':
        if input == '+':
            return 'OPDISCARD'
        if input == '=':
            return 'OPDISCARD'
        return None
    elif state == 'SUB':
        if (input == '>'):
            return 'OPDISCARD'
        if input.isdigit():
            return 'NEGATIVENUM'
        if input == '-':
            return 'OPDISCARD'
        if input == '=':
            return 'OPDISCARD'
        return None
    elif state == 'MULT':
        if input == '=':
            return 'OPDISCARD'
        return None
    elif state == 'MOD':
        if input == '=':
            return 'OPDISCARD'
        return None


    elif (state == 'ID'):
        # dealing with compound names
        if input == '.':
            return 'HALFCOMP'
        if (input.isalpha() or input.isdigit() or input == '_' or input == '$'):
            return 'ID'
        return None

    # Compound names
    elif state == 'HALFCOMP':
        if input.isalpha():
            return 'COMPID'
        if input == '*':
            return 'IMPORTALL'
        if input == '/':
            return 'DIV'
        return None
    elif state == 'COMPID':
        if input == '.':
            return 'HALFCOMP'
        if input.isalpha() or input.isdigit() or input == '_' or input == '$':
            return 'COMPID'
        return None

    elif (state == 'NUM'):
        if(input.isdigit()):
            return 'NUM'
        if input == '.':
            return 'FLOAT' # not accepted
        return None

    elif state == 'NEGATIVENUM':
        if input.isdigit():
            return 'NEGATIVENUM'
        if input == '.':
            return 'FLOAT' # not accepted
        return None

    # string literal
    elif (state == 'LSTRING'):
        if (input == '\\'):
            return 'STRINGESC'
        if (input == '\"'):
            return 'LITERALSTRING'
        return 'LSTRING'
    elif (state == 'STRINGESC'):
        if input == 'u':
            return 'UNICODE' #going to be discarded
        if input.isdigit() and  0  <= int(input) <= 7:
            return 'LSTRING'
        if input not in ('n', 'r', 't', 'v', '\\','\'', '"', '?', 'b' ):
            return None
        return 'LSTRING'

    elif(state == 'CHAROCTAVEESC'):
        if input.isdigit() and  0 <= int(input) <= 7:
            return 'CHAROCTAVEESC1'
        if (input == '\''):
            return 'LITERALCHAR'
        return None
    elif(state == 'CHAROCTAVEESC1'):
        if input.isdigit() and  0 <= int(input) <= 7:
            return 'CHAREND'
        if (input == '\''):
            return 'LITERALCHAR'
        return None

    # char literal
    elif (state == 'LCHAR'):
        if (input == '\\'):
            return 'CHARESC'
        return 'CHAREND'
    elif (state == 'CHARESC'):
        if input.isdigit() and  0 <= int(input) <= 7:
            return 'CHAROCTAVEESC'
        if input not in ('n', 'r', 't', 'v', '\\','\'', '"', '?', 'b' ):
            return None
        return 'CHAREND'
    elif (state == 'CHAREND'):
        if (input == '\''):
            return 'LITERALCHAR'
        return None

    # length 2 operators
    elif (state == 'ASSIGN'):
        if (input == '='):
            return 'EQUAL'
        return None
    elif (state == 'GT'):
        if (input == '='):
            return 'GE'
        elif input == '>':
            return 'OPDISCARD'
        return None
    elif (state == 'LT'):
        if (input == '='):
            return 'LE'
        elif input == '<':
            return 'OPDISCARD'
        return None
    elif (state == 'NOT'):
        if (input == '='):
            return 'NE'
        return None
    elif (state == 'BITAND'):
        if (input == '&'):
            return 'AND'
        elif input == '=':
            return 'OPDISCARD'
        return None
    elif (state == 'BITOR'):
        if (input == '|'):
            return 'OR'
        elif input == '=':
            return 'OPDISCARD'
        return None


    # Comments
    elif(state == 'DIV'):
        if (input == '/'):
            return 'COMMENT'
        elif (input == '*'):
            return 'LCOMMENT'
        elif input == '=':
            return 'OPDISCARD'
        return None
    elif(state == 'LCOMMENT'):
        if (input == '*'):
            return 'LCOM2'
        return 'LCOMMENT'
    elif state == 'LCOM2':
        if input == '/':
            return 'RCOMMENT'
        if input == '*':
            return 'LCOM3'
        return 'LCOMMENT'
    elif state == 'LCOM3':
        if input == '/':
            return 'RCOMMENT'
        return 'LCOMMENT'
    elif state == 'COMMENT':
        if input == '\n':
            return 'NEWLINE'
        if input == '\r':
            return 'NEWLINEC'
        return 'COMMENT'

    # Recognizing unused operators to be filtered later
    elif(state == 'PERIOD'):
        if(input == '.'):
            return 'PERIOD2'
        return None
    elif(state == 'PERIOD2'):
        if(input == '.'):
            return 'ELLIPSIS'
        return None

    # Handling hexidecimal literals (error case)
    elif state == 'ZERO':
        if input == 'x':
            return 'HEXLITERAL'
        elif input == '.':
            return 'FLOAT'

    else:
        return None

##################### Other DFA elements ##############################
#TODO: remove alphabets since it's unecessary in our DFA implementation
specialChars = set(list(".;:,@{}()[]<>!?+-*/&|^%=''\"\\"))
JoosAccept = JoosDFATokens.union({'WHITESPACE', 'COMMENT', 'NEWLINE', 'LCOMMENT', 'RCOMMENT', 'NEWLINEC', 'LCOM2', 'LCOM3'})
JoosStates = JoosAccept.union({'START', 'PERIOD2', 'HALFCOMP', 'HEXLITERAL', 'OPDISCARD', 'FLOAT', 'UNICODE'})
JoosAlphabet = set(string.ascii_lowercase).union(set(string.ascii_uppercase)).union(set(string.digits)).union(specialChars)

#########################  DFA  #######################################
class DFA ():
    def __init__(self, states, alphabet, transition, start, accept):
            self.states = states
            self.alphabet = alphabet
            self.transition = transition
            self.start = start
            self.accept = accept

JoosDFA = DFA(
    states = JoosDFATokens,
    alphabet = JoosAlphabet,
    start = 'START',
    accept = JoosAccept,
    transition = JoosTransition
    )

################### Simplified Maximal Munch ###########################

def SMM(input, dfa):
    # list of tokens scanned
    scanned = []
    while (input):
        seenInput = ""
        state = dfa.start
        while (input):
            # print("input is: ", input[0])
            if ord(input[0]) > 127:
                return (None, "Not ASCII")
            newState = dfa.transition(input[0], state)
            if not newState:
                break
            seenInput += input[0]
            input = input[1:]
            state = newState
        if (state in dfa.accept):
            scanned.append(Token(state, seenInput))
        else:
            if input:
                m = ord(input[0])
                return (None, m)

    return (scanned, "success")

################# Scan ################################################

def scan(input):
    result = SMM(input, JoosDFA)
    tokens = result[0]
    # Handling error in munching
    if (tokens is None):
        return (None, "Error on Scanning character: " + str(result[1]))

    if (tokens):
        # Handle erroneous tokens (return None and error string)
        multiLineCommentFlag = False
        indexRange = len(tokens)
        for index,token in enumerate(tokens):
            # dealing with numbers that start with 0 (e.g. 09)
            if token.name == 'ZERO':
                if index < indexRange-1:
                    if tokens[index+1].name == 'NUM':
                        return (None, "wrong integer literal: starts with 0")
            # Checking integer range
            elif token.name == 'NEGATIVENUM' and int(token.lex) < -2147483648:
                return (None, "negative int too large")

            # dealing with keywords in Java but not in Joos
            elif token.name == 'ID' and token.lex in wrongJavaKeyWordDict:
                return (None, "keyword in Java but not in Joos")
            # dealing with operators and seperators in Java but not in Joos
            elif token.name in wrongJavaOpDict:
                return (None, "operator in Java but not in Joos")
            # TODO: need to figure out what is allowed in compID and refactor this checking for explicit this calls
            elif token.name == 'ID' and token.lex is 'this':
                return (None, "explicit this call not allowed")

            # Checking wrong keywords in compIDs
            elif token.name == 'COMPID':
                temp = token.lex.split('.')
                compIDIndexRange = len(temp)
                for i, t in enumerate(temp):
                    if i == 0 and t in wrongJavaKeyWordDict:
                        return (None, "wrong keyword in comp id")
                    if i is not 0 and t == 'this':
                        return (None, "this cannot be in the middle of a compid")

            # Checking if the multi line comment has a closing tag
            if token.name == 'LCOMMENT':
                multiLineCommentFlag = True
            if token.name == 'RCOMMENT':
                multiLineCommentFlag = False


        # Checking for missing closing */ for multi line comments
        if multiLineCommentFlag is True:
            return (None, "missing closing characters for multi line comments")

        idsToTokens(tokens)

        # remove whitespace, newline characters and comments
        tokens = list(filter(lambda t: t.name not in ("COMMENT", 'LCOMMENT', 'RCOMMENT', 'NEWLINE', 'NEWLINEC', 'LCOM2', 'LCOM3'), tokens))

        for index,token in enumerate(tokens):
            if token.name == 'NUM' and int(token.lex) > 2147483647 and (index is 0 or tokens[index-1].name is not 'SUB'):
                return (None, "positive int too large")

        tokens = list(filter(lambda t: t.name is not 'WHITESPACE', tokens))

        tokensNew = []
        for index, t in enumerate(tokens):
            if t.name is 'NEGATIVENUM':
                tokensNew.append(Token('SUB', '-'))
                tokensNew.append(Token('NUM', t.lex[1:]))
            else:
                tokensNew.append(t)

    return (tokensNew, "success")