Commit 02f53fd3, authored 5 years ago by Xun Yang
simple scanner
parent 8f682989
Showing 2 changed files with 115 additions and 46 deletions:

  Scanning.py   +111 −45
  TestScan.py   +4 −1
Scanning.py   +111 −45   View file @ 02f53fd3

-import re # regex
 import string
 # class ScanDFA():
...
@@ -7,71 +6,124 @@ import string
 ##################### Token ##########################################
 # A Token is a pair (name, lexeme)
-class Token() {
+class Token():
     def __init__(self, name, lex):
         self.name = name
         self.lex = lex
-}
-################# Joos Token Names in 5 categoeis ##########################
-JoosTokens = set([
+##################### Joos Tokens Map ##########################
+# For tokens that are recognized as another name in the maximal munch scanner
+# e.g. all keywords are scanned as ID first
+# Key: lexeme, Value: name, use lexeme to reassign those tokens correct names
+# TODO: place entries in lexicographical order for readability
+keywordDict = dict({
+    'boolean': 'BOOLEAN',
+    'class': 'CLASS',
+    'protected': 'PROTECTED',
+    'public': 'PUBLIC',
+})
+
+def tokenToKeywords(tokens):
+    for t in tokens:
+        if t.name == 'ID':
+            if t.lex in keywordDict:
+                t.name = keywordDict.get(t.lex)
+
+######################## DFA Stuff ###################################
+################# Joos DFA Tokens ###################################
+JoosDFATokens = set([
+    'ID',        # string that is an identifier or keyword
     # Literals:
-    'INT',
+    'NUM',       # number
     # Operators:
-    '+',
+    'ASSIGN',    # =
     # Separators:
-    'L(',
-    # Keywords:
-    'ID',
-    '',
-    'SQL',
-    'Git',
-    'Tableau',
-    'SAS',
+    'SEMICO',    # ;
+    'LPAREN',    # (
+    'RPAREN',    # )
+    'LBRACK',    # {
+    'RBRACK',    # }
 ])
 ##################### Transition function ############################
-idRegex = re.compile(r'\d\d\d-\d\d\d-\d\d\d\d')
+# returns next state after transition on one input character
+# recognize keywords as ID, then convert them to different tokens later
 def JoosTransition(input, state):
-    if (state == 'START'):
+    if (state == 'WHITESPACE'):
+        if (input in (' ', '\n')):
+            return 'WHITESPACE'
+        else:
+            return None
+    elif (state != 'START' and input in (' ', '\n')):
+        return None
+    elif (state == 'START'):
+        if (input.isalpha()):
+            return 'ID'
+        if (input.isdigit()):
+            return 'NUM'
+        # whitespace and comments
+        if (input == ' '):
+            return 'WHITESPACE'
+        if (input == '\n'):
+            return 'WHITESPACE'
+        # operators
+        if (input == '='):
+            return 'ASSIGN'
+        # separators
+        if (input == ';'):
+            return 'SEMICO'
         if (input == '('):
             return 'LPAREN'
         if (input == ')'):
             return 'RPAREN'
+        if (input == '{'):
+            return 'LBRACK'
+        if (input == '}'):
+            return 'RBRACK'
-    elif (idRegex.fullmatch(input) != None):
-        return 'ID'
+    elif (state == 'ID'):
+        if (input.isalpha() or input.isdigit()):
+            return 'ID'
+        return None
+    else:
+        return None
 ##################### Other DFA elements ##############################
+#TODO: add operand and separator characters to alphabet
+specialChars = set(list(".;,{}()[]<>!+-*/=''\"\\"))
-JoosAccept = JoosTokens.union({'WHITESPACE', 'COMMENT'})
+JoosAccept = JoosDFATokens.union({'WHITESPACE', 'COMMENT'})
 JoosStates = JoosAccept.union({'START'}) #TODO: add intermediate states here
-JoosAlphabet = set(string.ascii_lowercase).union(set(string.ascii_uppercase)).union(set(string.digits)).union(set(list(".;,{}()[]<>!+-*/=''\"\\")))
-#TODO: add operand and separator characters to alphabet
+JoosAlphabet = set(string.ascii_lowercase).union(set(string.ascii_uppercase)).union(set(string.digits)).union(specialChars)
 ######################### DFA #######################################
 class DFA():
-    def __init__(self, states, alphabet, transition, start, accepting):
+    def __init__(self, states, alphabet, transition, start, accept):
         self.states = states
         self.alphabet = alphabet
         self.transition = transition
         self.start = start
-        self.accepting = accepting
+        self.accept = accept
-    def recognize(input, state):
-        if (input):
-            return state in self.accepting
-        elif (self.transition.isDefinedAt(input[0], state)):
-            recognize(input[1:], self.transition(state, input[0]))
-        else:
-            return false

 JoosDFA = DFA(
-    states=JoosTokens,
+    states=JoosDFATokens,
     alphabet=JoosAlphabet,
     start='START',
-    accept=JoosTokens,
+    accept=JoosAccept,
     transition=JoosTransition
 )
...
@@ -80,26 +132,40 @@ JoosDFA = DFA(
 def SMM(input, dfa):
     # list of tokens scanned
     scanned = []
-    seenInput = []
-    state = dfa.start
     while (input):
-        while (input and transition):
-            state = dfa.transition(input[0], state)
-            seenInput.append(input[0])
+        seenInput = ""
+        state = dfa.start
+        while (input):
+            newState = dfa.transition(input[0], state)
+            if not newState:
+                break
+            seenInput += input[0]
             input = input[1:]
+            state = newState
         if (state in dfa.accept):
             scanned.append(Token(state, seenInput))
         else:
-            print("ERROR on Maximal Munch")
-            seenInput = []
-            state = dfa.start
+            print(ord(input[0]), "ERROR on Maximal Munch")
+            break
     return scanned
 ################# Scan ################################################
 def scan(input):
     tokens = SMM(input, JoosDFA)
     # TODO: handle edge cases (e.g. check int range, error on ++)
+    if (tokens):
+        tokenToKeywords(tokens)
     # remove whitespace and comments
     tokens = filter(lambda t: t.name not in ("WHITESPACE", "COMMENT"), tokens)
     return tokens
+
+# TODO:
+# check range for digits
+# handle string literals
+# is 00 valid in java?
+# We might not need alphabet for the DFA (the alphabet is implied by the transition rules)
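
For reference, below is a minimal sketch of how the scanner introduced in this commit could be exercised from a separate script. It assumes the new version of Scanning.py shown above; the demo file name, the sample input string, and the expected token stream are illustrative only and are not part of the commit.

# demo_scan.py -- illustrative usage sketch, not part of this commit
from Scanning import scan

# Keywords are scanned as ID first and renamed via keywordDict inside scan();
# WHITESPACE tokens are filtered out before the tokens are returned.
source = "public class A { x = 1 ; }"

tokens = list(scan(source))   # scan() returns a filter object, so materialize it
for t in tokens:
    print(t.name, t.lex)

# With the DFA above, the expected token names are roughly:
#   PUBLIC, CLASS, ID, LBRACK, ID, ASSIGN, NUM, SEMICO, RBRACK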
TestScan.py   +4 −1   View file @ 02f53fd3

+import sys
 from Scanning import scan
 # Lines of Tokens
...
@@ -13,5 +15,6 @@ for line in lines:
 for tline in tlines:
     s = ""
     for token in tline:
-        s += '(' + token.name + ',' + token.lex + '),'
+        if (token.name and token.lex):
+            s += '(' + token.name + ',' + token.lex + '),'
     print(s)
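
Only a few lines of TestScan.py appear in this diff, so here is a hedged sketch of a driver in the same spirit. Reading the source lines from sys.stdin and the construction of lines/tlines are assumptions inferred from the hunk header above, not code taken from the commit.

# Illustrative driver in the spirit of TestScan.py -- the stdin handling below
# is an assumption; only the token-printing loop is shown in the diff above.
import sys
from Scanning import scan

lines = sys.stdin.read().splitlines()            # lines of source text (assumed)
tlines = [list(scan(line)) for line in lines]    # lines of Tokens, as in the diff

for tline in tlines:
    s = ""
    for token in tline:
        if (token.name and token.lex):
            s += '(' + token.name + ',' + token.lex + '),'
    print(s)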