Skip to content
Snippets Groups Projects
Commit 02f53fd3 authored by Xun Yang's avatar Xun Yang
Browse files

simple scanner

parent 8f682989
No related branches found
No related tags found
No related merge requests found
import re # regex
import string import string
# class ScanDFA(): # class ScanDFA():
...@@ -7,71 +6,124 @@ import string ...@@ -7,71 +6,124 @@ import string
##################### Token ##########################################
# A Token is a pair (name, lexeme)
class Token():
    """A scanned token: pairs a token name with the matched lexeme."""
    def __init__(self, name, lex):
        self.name = name  # token kind, e.g. 'ID', 'NUM', 'ASSIGN'
        self.lex = lex    # the exact source text that was matched
##################### Joos Tokens Map ###############################
# For tokens that are recognized as another name in the maximal munch scanner
# e.g. all keywords are scanned as ID first.
# Key: lexeme, Value: name; used to reassign those tokens their correct names.
# TODO: place entries in lexicographical order for readability
keywordDict = {
    'boolean': 'BOOLEAN',
    'class': 'CLASS',
    'protected': 'PROTECTED',
    'public': 'PUBLIC',
}

def tokenToKeywords(tokens):
    """Rename, in place, every ID token whose lexeme is a reserved keyword."""
    for token in tokens:
        if token.name == 'ID' and token.lex in keywordDict:
            token.name = keywordDict[token.lex]
######################## DFA Stuff ###################################
################# Joos DFA Tokens ###################################
# Token names the maximal munch DFA can emit directly.
JoosDFATokens = {
    'ID',      # string that is an identifier or keyword
    # Literals:
    'NUM',     # number
    # Operators:
    'ASSIGN',  # =
    # Separators:
    'SEMICO',  # ;
    'LPAREN',  # (
    'RPAREN',  # )
    'LBRACK',  # {
    'RBRACK',  # }
}
##################### Transition function ############################
# Returns the next state after a transition on one input character,
# or None when no transition exists (the munch stops here).
# Keywords are recognized as ID here and renamed by tokenToKeywords later.
def JoosTransition(input, state):
    if (state == 'WHITESPACE'):
        # a whitespace run continues only over blanks and newlines
        if (input in (' ', '\n')):
            return 'WHITESPACE'
        return None
    elif (state != 'START' and input in (' ', '\n')):
        # whitespace terminates every other in-progress token
        return None
    elif (state == 'START'):
        if (input.isalpha()):
            return 'ID'
        if (input.isdigit()):
            return 'NUM'
        # whitespace and comments
        if (input in (' ', '\n')):
            return 'WHITESPACE'
        # operators
        if (input == '='):
            return 'ASSIGN'
        # separators
        if (input == ';'):
            return 'SEMICO'
        if (input == '('):
            return 'LPAREN'
        if (input == ')'):
            return 'RPAREN'
        if (input == '{'):
            return 'LBRACK'
        if (input == '}'):
            return 'RBRACK'
        # no rule for this character
        return None
    elif (state == 'ID'):
        # identifiers continue over letters and digits
        if (input.isalpha() or input.isdigit()):
            return 'ID'
        return None
    elif (state == 'NUM'):
        # FIX: number literals continue over further digits. The original
        # had no rule for state 'NUM', so "12" was munched as two separate
        # single-digit NUM tokens instead of one.
        if (input.isdigit()):
            return 'NUM'
        return None
    else:
        return None
##################### Other DFA elements ##############################
#TODO: add operand and separator characters to alphabet
specialChars = set(".;,{}()[]<>!+-*/=''\"\\")
# accepting states: every token name plus the two throwaway states
JoosAccept = JoosDFATokens | {'WHITESPACE', 'COMMENT'}
JoosStates = JoosAccept | {'START'}  #TODO: add intermediate states here
JoosAlphabet = (
    set(string.ascii_lowercase)
    | set(string.ascii_uppercase)
    | set(string.digits)
    | specialChars
)
######################### DFA #######################################
class DFA ():
    """A deterministic finite automaton.

    transition is a callable (char, state) -> next state or None when
    no transition is defined for that pair.
    """
    def __init__(self, states, alphabet, transition, start, accept):
        self.states = states          # set of all state names
        self.alphabet = alphabet      # set of valid input characters
        self.transition = transition  # transition function
        self.start = start            # name of the start state
        self.accept = accept          # set of accepting state names
# The Joos scanner DFA, assembled from the pieces defined above.
# NOTE(review): states is JoosDFATokens here rather than JoosStates
# (which also holds 'START'/'WHITESPACE'/'COMMENT') — confirm intended.
JoosDFA = DFA(
    states=JoosDFATokens,
    alphabet=JoosAlphabet,
    start='START',
    accept=JoosAccept,
    transition=JoosTransition,
)
...@@ -80,26 +132,40 @@ JoosDFA = DFA( ...@@ -80,26 +132,40 @@ JoosDFA = DFA(
def SMM(input, dfa):
    """Simplified maximal munch scan.

    Repeatedly runs `dfa` as far as it will go on the remaining input and
    emits a Token for each longest accepted prefix. On a munch failure
    (the run does not end in an accepting state) an error is printed and
    the tokens scanned so far are returned.
    """
    # list of tokens scanned
    scanned = []
    while (input):
        seenInput = ""
        state = dfa.start
        # consume characters while the DFA still has a transition
        while (input):
            newState = dfa.transition(input[0], state)
            if not newState:
                break
            seenInput += input[0]
            input = input[1:]
            state = newState
        if (state in dfa.accept):
            scanned.append(Token(state, seenInput))
        else:
            # FIX: the original did print(ord(input), ...), which raises
            # TypeError whenever the remaining input is not exactly one
            # character; report the offending input safely instead.
            print("ERROR on Maximal Munch, remaining input:", repr(input[:20]))
            break
    return scanned
################# Scan ################################################
def scan(input):
    """Scan a string into a list of Joos Tokens.

    Whitespace/comment tokens are dropped and keyword lexemes scanned
    as ID are renamed to their keyword token names.
    """
    tokens = SMM(input, JoosDFA)
    if (tokens):
        # TODO: handle edge cases (e.g. check int range, error on ++
        tokenToKeywords(tokens)
        # remove whitespace and comments.
        # FIX: materialize to a list — the original returned a lazy
        # single-pass `filter` object here but a plain list on the empty
        # path, so the result's type and re-iterability varied with input.
        tokens = [t for t in tokens if t.name not in ("WHITESPACE", "COMMENT")]
    return tokens

# TODO:
# check range for digits
# handle string literals
# is 00 valid in java?
# We might not need alphabet for the DFA (alphabet is imply by the transition rules)
import sys
from Scanning import scan from Scanning import scan
# Lines of Tokens # Lines of Tokens
...@@ -13,5 +15,6 @@ for line in lines: ...@@ -13,5 +15,6 @@ for line in lines:
# Print each scanned line as a row of "(name,lexeme), " pairs.
# NOTE(review): `tlines` is built earlier in this file (outside this hunk).
for tline in tlines:
    s = "".join(
        '(' + token.name + ',' + token.lex + '), '
        for token in tline
        if token.name and token.lex
    )
    print(s)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment