Skip to content
Snippets Groups Projects
Commit 02f53fd3 authored by Xun Yang's avatar Xun Yang
Browse files

simple scanner

parent 8f682989
No related branches found
No related tags found
No related merge requests found
import re # regex
import string import string
# class ScanDFA(): # class ScanDFA():
...@@ -7,71 +6,124 @@ import string ...@@ -7,71 +6,124 @@ import string
##################### Token ##########################################
# A Token is a pair (name, lexeme)
class Token():
    """A scanned token: pairs a token name with the matched lexeme."""
    def __init__(self, name, lex):
        self.name = name  # token kind, e.g. 'ID', 'NUM', 'ASSIGN'
        self.lex = lex    # the exact source text that was matched
##################### Joos Tokens Map ###############################
# For tokens that are recognized as another name in the maximal munch scanner
# e.g. all keywords are scanned as ID first.
# Key: lexeme, Value: name; used to reassign those tokens their correct names.
# TODO: place entries in lexicographical order for readability
keywordDict = {
    'boolean': 'BOOLEAN',
    'class': 'CLASS',
    'protected': 'PROTECTED',
    'public': 'PUBLIC',
}

def tokenToKeywords(tokens):
    """Rename, in place, every ID token whose lexeme is a reserved keyword."""
    for token in tokens:
        if token.name == 'ID' and token.lex in keywordDict:
            token.name = keywordDict[token.lex]
######################## DFA Stuff ###################################
################# Joos DFA Tokens ###################################
# Token names the maximal munch DFA can emit directly.
JoosDFATokens = {
    'ID',      # string that is an identifier or keyword
    # Literals:
    'NUM',     # number
    # Operators:
    'ASSIGN',  # =
    # Separators:
    'SEMICO',  # ;
    'LPAREN',  # (
    'RPAREN',  # )
    'LBRACK',  # {
    'RBRACK',  # }
}
##################### Transition function ############################
# Returns the next state after a transition on one input character,
# or None when no transition exists (the munch stops here).
# Keywords are recognized as ID here and renamed by tokenToKeywords later.
def JoosTransition(input, state):
    if (state == 'WHITESPACE'):
        # a whitespace run continues only over blanks and newlines
        if (input in (' ', '\n')):
            return 'WHITESPACE'
        return None
    elif (state != 'START' and input in (' ', '\n')):
        # whitespace terminates every other in-progress token
        return None
    elif (state == 'START'):
        if (input.isalpha()):
            return 'ID'
        if (input.isdigit()):
            return 'NUM'
        # whitespace and comments
        if (input in (' ', '\n')):
            return 'WHITESPACE'
        # operators
        if (input == '='):
            return 'ASSIGN'
        # separators
        if (input == ';'):
            return 'SEMICO'
        if (input == '('):
            return 'LPAREN'
        if (input == ')'):
            return 'RPAREN'
        if (input == '{'):
            return 'LBRACK'
        if (input == '}'):
            return 'RBRACK'
        # no rule for this character
        return None
    elif (state == 'ID'):
        # identifiers continue over letters and digits
        if (input.isalpha() or input.isdigit()):
            return 'ID'
        return None
    elif (state == 'NUM'):
        # FIX: number literals continue over further digits. The original
        # had no rule for state 'NUM', so "12" was munched as two separate
        # single-digit NUM tokens instead of one.
        if (input.isdigit()):
            return 'NUM'
        return None
    else:
        return None
##################### Other DFA elements ##############################
#TODO: add operand and separator characters to alphabet
specialChars = set(".;,{}()[]<>!+-*/=''\"\\")
# accepting states: every token name plus the two throwaway states
JoosAccept = JoosDFATokens | {'WHITESPACE', 'COMMENT'}
JoosStates = JoosAccept | {'START'}  #TODO: add intermediate states here
JoosAlphabet = (
    set(string.ascii_lowercase)
    | set(string.ascii_uppercase)
    | set(string.digits)
    | specialChars
)
######################### DFA #######################################
class DFA ():
    """A deterministic finite automaton.

    transition is a callable (char, state) -> next state or None when
    no transition is defined for that pair.
    """
    def __init__(self, states, alphabet, transition, start, accept):
        self.states = states          # set of all state names
        self.alphabet = alphabet      # set of valid input characters
        self.transition = transition  # transition function
        self.start = start            # name of the start state
        self.accept = accept          # set of accepting state names
# The Joos scanner DFA, assembled from the pieces defined above.
# NOTE(review): states is JoosDFATokens here rather than JoosStates
# (which also holds 'START'/'WHITESPACE'/'COMMENT') — confirm intended.
JoosDFA = DFA(
    states=JoosDFATokens,
    alphabet=JoosAlphabet,
    start='START',
    accept=JoosAccept,
    transition=JoosTransition,
)
...@@ -80,26 +132,40 @@ JoosDFA = DFA( ...@@ -80,26 +132,40 @@ JoosDFA = DFA(
def SMM(input, dfa):
    """Simplified maximal munch scan.

    Repeatedly runs `dfa` as far as it will go on the remaining input and
    emits a Token for each longest accepted prefix. On a munch failure
    (the run does not end in an accepting state) an error is printed and
    the tokens scanned so far are returned.
    """
    # list of tokens scanned
    scanned = []
    while (input):
        seenInput = ""
        state = dfa.start
        # consume characters while the DFA still has a transition
        while (input):
            newState = dfa.transition(input[0], state)
            if not newState:
                break
            seenInput += input[0]
            input = input[1:]
            state = newState
        if (state in dfa.accept):
            scanned.append(Token(state, seenInput))
        else:
            # FIX: the original did print(ord(input), ...), which raises
            # TypeError whenever the remaining input is not exactly one
            # character; report the offending input safely instead.
            print("ERROR on Maximal Munch, remaining input:", repr(input[:20]))
            break
    return scanned
################# Scan ################################################
def scan(input):
    """Scan a string into a list of Joos Tokens.

    Whitespace/comment tokens are dropped and keyword lexemes scanned
    as ID are renamed to their keyword token names.
    """
    tokens = SMM(input, JoosDFA)
    if (tokens):
        # TODO: handle edge cases (e.g. check int range, error on ++
        tokenToKeywords(tokens)
        # remove whitespace and comments.
        # FIX: materialize to a list — the original returned a lazy
        # single-pass `filter` object here but a plain list on the empty
        # path, so the result's type and re-iterability varied with input.
        tokens = [t for t in tokens if t.name not in ("WHITESPACE", "COMMENT")]
    return tokens

# TODO:
# check range for digits
# handle string literals
# is 00 valid in java?
# We might not need alphabet for the DFA (alphabet is imply by the transition rules)
import sys
from Scanning import scan from Scanning import scan
# Lines of Tokens # Lines of Tokens
...@@ -13,5 +15,6 @@ for line in lines: ...@@ -13,5 +15,6 @@ for line in lines:
# Print each scanned line as a row of "(name,lexeme), " pairs.
# NOTE(review): `tlines` is built earlier in this file (outside this hunk).
for tline in tlines:
    s = "".join(
        '(' + token.name + ',' + token.lex + '), '
        for token in tline
        if token.name and token.lex
    )
    print(s)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment