scanning rough structure

ef21ba75 · Xun Yang · ef21ba75
Commit ef21ba75 authored 5 years ago by Xun Yang
--- a/Scanning.py
+++ b/Scanning.py
+import re # regex
+import string
+
+
+# class ScanDFA():
+# scans the input line by line, and char by char for each line
+# explicitly recognize whitespace when scanning, but discard whitespace tokens at the end
+
+#################  Joos Token Names in 5 categories ##########################
+
+# Names of the token kinds known to the scanner, grouped by category.
+JoosTokens = {
+    # Literals:
+    'INT',
+
+    # Operators:
+    '+',
+
+    # Separators:
+    # NOTE(review): JoosTransition returns 'LPAREN' / 'RPAREN', neither of
+    # which is in this set -- confirm whether 'L(' was meant to be 'LPAREN'.
+    'L(',
+
+    # Keywords:
+    # NOTE(review): the empty string and 'SQL', 'Git', 'Tableau', 'SAS' do not
+    # look like Joos token names -- confirm they belong here.
+    'ID', '', 'SQL', 'Git', 'Tableau', 'SAS',
+}
+##################### Token ##########################################
+# A Token is a pair (name, lexeme)
+class Token:
+    """A scanned token: a (name, lexeme) pair.
+
+    name -- the token kind
+    lex  -- the lexeme, i.e. the input text that produced this token
+    """
+
+    def __init__(self, name, lex):
+        self.name = name
+        self.lex = lex
+
+    def __repr__(self):
+        # Readable form for debugging scanner output.
+        return 'Token({!r}, {!r})'.format(self.name, self.lex)
+
+##################### Transition function ############################
+
+# Matches exactly three digits, a dash, three digits, a dash, four digits.
+# NOTE(review): the name suggests an identifier pattern, but this pattern can
+# never match a single character -- confirm the intended regex.
+idRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
+
+def JoosTransition(input, state):
+    """Return the next state for `input` seen in `state`, or None if undefined.
+
+    From 'START', '(' leads to 'LPAREN' and ')' leads to 'RPAREN'; any other
+    input has no transition. From every other state the result is 'ID' when
+    `input` fully matches idRegex, and None otherwise.
+
+    NOTE(review): the identifier check only runs when state is not 'START';
+    confirm that is intended rather than an indentation slip.
+    """
+    if state != 'START':
+        return 'ID' if idRegex.fullmatch(input) is not None else None
+    if input == '(':
+        return 'LPAREN'
+    if input == ')':
+        return 'RPAREN'
+    return None
+
+##################### Other DFA elements ##############################
+
+#########################  DFA  #######################################
+class DFA:
+    """Deterministic finite automaton described by its five components.
+
+    states     -- collection of state names
+    alphabet   -- collection of input symbols
+    transition -- callable taking (symbol, state) and returning the next
+                  state, or None when no transition is defined
+    start      -- name of the start state
+    accepting  -- collection of accepting state names
+    """
+
+    def __init__(self, states, alphabet, transition, start, accepting):
+        self.states = states
+        self.alphabet = alphabet
+        self.transition = transition
+        self.start = start
+        self.accepting = accepting
+
+    def recognize(self, input, state=None):
+        """Return True if consuming all of `input` ends in an accepting state.
+
+        input -- sequence of symbols to run through the automaton
+        state -- state to begin in; defaults to the start state
+
+        Returns False as soon as a symbol has no transition from the
+        current state.
+        """
+        if state is None:
+            state = self.start
+        # Iterative walk: avoids the recursion limit on long inputs.
+        for symbol in input:
+            state = self.transition(symbol, state)
+            if state is None:
+                return False
+        return state in self.accepting
+
+# The scanner's DFA: every token name doubles as a state and as an accepting
+# state.
+# NOTE(review): 'START' is used as the start state but is not a member of
+# JoosTokens, so it is not in `states` -- confirm whether it should be added.
+JoosDFA = DFA(
+    states=JoosTokens,
+    alphabet=set(string.ascii_lowercase)
+    .union(set(string.ascii_uppercase))
+    .union(set(list(".;,{}()[]<>!+-*/=''\"\\"))),
+    # TODO: add operand and separator characters to alphabet
+    start='START',
+    # DFA.__init__ names this parameter `accepting`; `accept=` raised TypeError.
+    accepting=JoosTokens,
+    transition=JoosTransition,
+)
+
+################### Simplified Maximal Munch ###########################
+
+def SMM(input, dfa):
+    """Split `input` into tokens using simplified maximal munch.
+
+    input -- sequence of symbols to scan (e.g. a string)
+    dfa   -- automaton providing `start`, `accepting` and
+             `transition(symbol, state)`, which returns None when no
+             transition is defined
+
+    Starting from dfa.start, symbols are consumed for as long as a transition
+    exists. If the state reached is accepting, a Token(state, lexeme) is
+    recorded, where the lexeme is the consumed slice of `input`, and scanning
+    restarts from dfa.start. Otherwise "ERROR on Maximal Munch" is printed
+    and scanning stops.
+
+    Returns the list of tokens scanned before the end of input or the first
+    error.
+    """
+    scanned = []
+    pos = 0
+    end = len(input)
+    while pos < end:
+        state = dfa.start
+        tokenStart = pos
+        # Munch: advance while the DFA has a transition on the next symbol.
+        while pos < end:
+            nextState = dfa.transition(input[pos], state)
+            if nextState is None:
+                break
+            state = nextState
+            pos += 1
+        # Require progress so an accepting start state cannot emit empty
+        # tokens forever.
+        if pos > tokenStart and state in dfa.accepting:
+            scanned.append(Token(state, input[tokenStart:pos]))
+        else:
+            print("ERROR on Maximal Munch")
+            # Stop here: retrying from the same position would never end.
+            break
+    return scanned
+
+################# Scan ################################################
+def scan(input):
+    """Scan `input` with JoosDFA and return its tokens as a list.
+
+    Tokens named "WHITESPACE" or "COMMENT" are dropped from the result.
+    """
+    tokens = SMM(input, JoosDFA)
+    # TODO: handle edge cases (e.g. check int range, error on ++)
+
+    # remove whitespace and comments; build a real list so callers can index,
+    # take len() and iterate more than once (filter() is a one-shot iterator)
+    return [t for t in tokens if t.name not in ("WHITESPACE", "COMMENT")]