Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
E
ece459-w23-a2
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Patrick Lam
ece459-w23-a2
Commits
7223dc4c
Commit
7223dc4c
authored
2 years ago
by
Patrick Lam
Browse files
Options
Downloads
Patches
Plain Diff
code cleanup
parent
a93df11f
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
src/main.rs
+19
-32
19 additions, 32 deletions
src/main.rs
src/packages/parser.rs
+63
-118
63 additions, 118 deletions
src/packages/parser.rs
with
82 additions
and
150 deletions
src/main.rs
+
19
−
32
View file @
7223dc4c
...
...
@@ -73,7 +73,7 @@ fn derive_2grams_from_trigram(trigram:&str) -> Vec<String> {
format!
(
"{}^{}"
,
grams
[
1
],
grams
[
2
])];
}
enum
LogFormat
{
pub
enum
LogFormat
{
Linux
,
OpenStack
,
Spark
,
...
...
@@ -93,62 +93,49 @@ fn main() {
let
args
=
Args
::
parse
();
let
mut
input_fn
=
None
;
let
mut
log_format
=
None
;
let
mut
log_format_opt
=
None
;
// hey, please let me know (email) if there's a more idiomatic way to do this
if
let
Some
(
raw_linux
)
=
args
.raw_linux
{
log_format
=
Some
(
Linux
);
log_format
_opt
=
Some
(
Linux
);
input_fn
=
Some
(
raw_linux
);
}
else
if
let
Some
(
raw_openstack
)
=
args
.raw_openstack
{
log_format
=
Some
(
OpenStack
);
log_format
_opt
=
Some
(
OpenStack
);
input_fn
=
Some
(
raw_openstack
);
}
else
if
let
Some
(
raw_spark
)
=
args
.raw_spark
{
log_format
=
Some
(
Spark
);
log_format
_opt
=
Some
(
Spark
);
input_fn
=
Some
(
raw_spark
);
}
else
if
let
Some
(
raw_hdfs
)
=
args
.raw_hdfs
{
log_format
=
Some
(
HDFS
);
log_format
_opt
=
Some
(
HDFS
);
input_fn
=
Some
(
raw_hdfs
);
}
else
if
let
Some
(
raw_hpc
)
=
args
.raw_hpc
{
log_format
=
Some
(
HPC
);
log_format
_opt
=
Some
(
HPC
);
input_fn
=
Some
(
raw_hpc
);
}
else
if
let
Some
(
raw_proxifier
)
=
args
.raw_proxifier
{
log_format
=
Some
(
Proxifier
);
log_format
_opt
=
Some
(
Proxifier
);
input_fn
=
Some
(
raw_proxifier
);
}
else
if
let
Some
(
raw_android
)
=
args
.raw_android
{
log_format
=
Some
(
Android
);
log_format
_opt
=
Some
(
Android
);
input_fn
=
Some
(
raw_android
);
}
else
if
let
Some
(
raw_healthapp
)
=
args
.raw_healthapp
{
log_format
=
Some
(
HealthApp
);
log_format
_opt
=
Some
(
HealthApp
);
input_fn
=
Some
(
raw_healthapp
);
}
let
log_format
=
match
log_format_opt
{
None
=>
panic!
(
"must specify a raw input file"
),
Some
(
lf
)
=>
lf
,
};
let
cutoff
=
match
args
.cutoff
{
None
=>
3
,
Some
(
c
)
=>
c
,
};
let
(
double_dict
,
triple_dict
,
_all_token_list
)
=
match
log_format
{
Some
(
Linux
)
=>
packages
::
parser
::
parse_raw_linux
(
input_fn
.unwrap
()),
Some
(
OpenStack
)
=>
packages
::
parser
::
parse_raw_openstack
(
input_fn
.unwrap
()),
Some
(
Spark
)
=>
packages
::
parser
::
parse_raw_spark
(
input_fn
.unwrap
()),
Some
(
HDFS
)
=>
packages
::
parser
::
parse_raw_hdfs
(
input_fn
.unwrap
()),
Some
(
HPC
)
=>
packages
::
parser
::
parse_raw_hpc
(
input_fn
.unwrap
()),
Some
(
Proxifier
)
=>
packages
::
parser
::
parse_raw_proxifier
(
input_fn
.unwrap
()),
Some
(
Android
)
=>
packages
::
parser
::
parse_raw_android
(
input_fn
.unwrap
()),
Some
(
HealthApp
)
=>
packages
::
parser
::
parse_raw_healthapp
(
input_fn
.unwrap
()),
None
=>
panic!
(
"no log format specified"
),
};
let
(
double_dict
,
triple_dict
,
_all_token_list
)
=
packages
::
parser
::
parse_raw
(
input_fn
.unwrap
(),
&
log_format
);
view_double_and_triple_dicts
(
&
double_dict
,
&
triple_dict
);
let
(
format_string_re
,
censored_regexps
)
=
match
log_format
{
Some
(
Linux
)
=>
(
packages
::
parser
::
regex_generator
(
packages
::
parser
::
linux_format
()),
packages
::
parser
::
linux_censored_regexps
()),
Some
(
OpenStack
)
=>
(
packages
::
parser
::
regex_generator
(
packages
::
parser
::
openstack_format
()),
packages
::
parser
::
openstack_censored_regexps
()),
Some
(
Spark
)
=>
(
packages
::
parser
::
regex_generator
(
packages
::
parser
::
spark_format
()),
packages
::
parser
::
spark_censored_regexps
()),
Some
(
HDFS
)
=>
(
packages
::
parser
::
regex_generator
(
packages
::
parser
::
hdfs_format
()),
packages
::
parser
::
hdfs_censored_regexps
()),
Some
(
HPC
)
=>
(
packages
::
parser
::
regex_generator
(
packages
::
parser
::
hpc_format
()),
packages
::
parser
::
hpc_censored_regexps
()),
Some
(
Proxifier
)
=>
(
packages
::
parser
::
regex_generator
(
packages
::
parser
::
proxifier_format
()),
packages
::
parser
::
proxifier_censored_regexps
()),
Some
(
Android
)
=>
(
packages
::
parser
::
regex_generator
(
packages
::
parser
::
android_format
()),
packages
::
parser
::
android_censored_regexps
()),
Some
(
HealthApp
)
=>
(
packages
::
parser
::
regex_generator
(
packages
::
parser
::
healthapp_format
()),
packages
::
parser
::
healthapp_censored_regexps
()),
None
=>
panic!
(
"no log format specified"
),
};
let
(
format_string_re
,
censored_regexps
)
=
(
packages
::
parser
::
regex_generator
(
packages
::
parser
::
format_string
(
&
log_format
)),
packages
::
parser
::
censored_regexps
(
&
log_format
));
//let sample_string = "Jun 23 23:30:05 combo sshd(pam_unix)[26190]: authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=218.22.3.51 user=root authentication".to_string();
// add befores and afters to the sample string, yielding extended_sample_string
...
...
This diff is collapsed.
Click to expand it.
src/packages/parser.rs
+
63
−
118
View file @
7223dc4c
...
...
@@ -5,83 +5,70 @@ use regex::Regex;
use
std
::
collections
::
HashMap
;
use
std
::
collections
::
BTreeSet
;
/// Returns the format template for Linux syslog lines.
///
/// The template is a regex fragment with `<Field>` placeholders. The
/// `\[<PID>\]` part is an optional group, so the brackets are escaped with a
/// single backslash each. The literal is kept on one line: splitting it would
/// embed newline characters in the returned template.
pub fn linux_format() -> String {
    String::from("<Month> <Date> <Time> <Level> <Component>(\\[<PID>\\])?: <Content>")
}
/// Compiles the regular expressions used to censor Linux log content.
///
/// Panics if any pattern fails to compile (the patterns are fixed literals).
pub fn linux_censored_regexps() -> Vec<Regex> {
    let patterns = [
        r"(\d+\.){3}\d+",
        r"\w{3} \w{3} \d{2} \d{2}:\d{2}:\d{2} \d{4}",
        r"\d{2}:\d{2}:\d{2}",
    ];
    patterns
        .iter()
        .map(|pattern| Regex::new(pattern).unwrap())
        .collect()
}
/// Returns the format template for OpenStack log lines.
///
/// The surrounding single quotes are part of the returned template.
pub fn openstack_format() -> String {
    const TEMPLATE: &str =
        r"'<Logrecord> <Date> <Time> <Pid> <Level> <Component> \[<ADDR>\] <Content>'";
    TEMPLATE.to_owned()
}
/// Compiles the regular expressions used to censor OpenStack log content.
///
/// Panics if any pattern fails to compile (the patterns are fixed literals).
pub fn openstack_censored_regexps() -> Vec<Regex> {
    // Regex::new(r"\d+") is deliberately left out: it would censor all
    // numbers, which may not be what we want.
    let patterns = [r"((\d+\.){3}\d+,?)+", r"/.+?\s"];
    patterns
        .iter()
        .map(|pattern| Regex::new(pattern).unwrap())
        .collect()
}
/// Returns the format template for Spark log lines.
pub fn spark_format() -> String {
    const TEMPLATE: &str = "<Date> <Time> <Level> <Component>: <Content>";
    TEMPLATE.to_owned()
}
/// Compiles the regular expressions used to censor Spark log content.
///
/// Panics if any pattern fails to compile (the patterns are fixed literals).
pub fn spark_censored_regexps() -> Vec<Regex> {
    let patterns = [
        r"(\d+\.){3}\d+",
        r"\b[KGTM]?B\b",
        r"([\w-]+\.){2,}[\w-]+",
    ];
    patterns
        .iter()
        .map(|pattern| Regex::new(pattern).unwrap())
        .collect()
}
pub
fn
hdfs_format
()
->
String
{
return
"<Date> <Time> <Pid> <Level> <Component>: <Content>"
.to_string
();
use
crate
::
LogFormat
;
use
crate
::
LogFormat
::
Linux
;
use
crate
::
LogFormat
::
OpenStack
;
use
crate
::
LogFormat
::
Spark
;
use
crate
::
LogFormat
::
HDFS
;
use
crate
::
LogFormat
::
HPC
;
use
crate
::
LogFormat
::
Proxifier
;
use
crate
::
LogFormat
::
Android
;
use
crate
::
LogFormat
::
HealthApp
;
pub
fn
format_string
(
lf
:
&
LogFormat
)
->
String
{
match
lf
{
Linux
=>
r"<Month> <Date> <Time> <Level> <Component>(\\[<PID>\\])?: <Content>"
.to_string
(),
OpenStack
=>
r"'<Logrecord> <Date> <Time> <Pid> <Level> <Component> \[<ADDR>\] <Content>'"
.to_string
(),
Spark
=>
r"<Date> <Time> <Level> <Component>: <Content>"
.to_string
(),
HDFS
=>
r"<Date> <Time> <Pid> <Level> <Component>: <Content>"
.to_string
(),
HPC
=>
r"<LogId> <Node> <Component> <State> <Time> <Flag> <Content>"
.to_string
(),
Proxifier
=>
r"[<Time>] <Program> - <Content>"
.to_string
(),
Android
=>
r"<Date> <Time> <Pid> <Tid> <Level> <Component>: <Content>"
.to_string
(),
HealthApp
=>
"<Time>
\\
|<Component>
\\
|<Pid>
\\
|<Content>"
.to_string
()
}
}
pub
fn
hdfs_censored_regexps
()
->
Vec
<
Regex
>
{
return
vec!
[
Regex
::
new
(
r"blk_(|-)[0-9]+"
)
.unwrap
(),
// block id
pub
fn
censored_regexps
(
lf
:
&
LogFormat
)
->
Vec
<
Regex
>
{
match
lf
{
Linux
=>
vec!
[
Regex
::
new
(
r"(\d+\.){3}\d+"
)
.unwrap
(),
Regex
::
new
(
r"\w{3} \w{3} \d{2} \d{2}:\d{2}:\d{2} \d{4}"
)
.unwrap
(),
Regex
::
new
(
r"\d{2}:\d{2}:\d{2}"
)
.unwrap
()],
OpenStack
=>
vec!
[
Regex
::
new
(
r"((\d+\.){3}\d+,?)+"
)
.unwrap
(),
Regex
::
new
(
r"/.+?\s"
)
.unwrap
()],
// I commented out Regex::new(r"\d+").unwrap() because that censors all numbers, which may not be what we want?
Spark
=>
vec!
[
Regex
::
new
(
r"(\d+\.){3}\d+"
)
.unwrap
(),
Regex
::
new
(
r"\b[KGTM]?B\b"
)
.unwrap
(),
Regex
::
new
(
r"([\w-]+\.){2,}[\w-]+"
)
.unwrap
()],
HDFS
=>
vec!
[
Regex
::
new
(
r"blk_(|-)[0-9]+"
)
.unwrap
(),
// block id
Regex
::
new
(
r"(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)"
)
.unwrap
()
// IP
];
// oops, numbers require lookbehind, which rust doesn't support, sigh
// Regex::new(r"(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$").unwrap()]; // Numbers
}
/// Returns the format template for HPC log lines.
pub fn hpc_format() -> String {
    const TEMPLATE: &str = "<LogId> <Node> <Component> <State> <Time> <Flag> <Content>";
    TEMPLATE.to_owned()
}
/// Compiles the regular expression used to censor HPC log content.
///
/// Panics if the pattern fails to compile (it is a fixed literal).
pub fn hpc_censored_regexps() -> Vec<Regex> {
    let equals_number = Regex::new(r"=\d+").unwrap();
    vec![equals_number]
}
/// Returns the format template for Proxifier log lines.
pub fn proxifier_format() -> String {
    const TEMPLATE: &str = "[<Time>] <Program> - <Content>";
    TEMPLATE.to_owned()
}
/// Compiles the regular expressions used to censor Proxifier log content.
///
/// Panics if any pattern fails to compile (the patterns are fixed literals).
pub fn proxifier_censored_regexps() -> Vec<Regex> {
    let patterns = [
        r"<\d+\ssec",
        r"([\w-]+\.)+[\w-]+(:\d+)?",
        r"\d{2}:\d{2}(:\d{2})*",
        r"[KGTM]B",
    ];
    patterns
        .iter()
        .map(|pattern| Regex::new(pattern).unwrap())
        .collect()
}
/// Returns the format template for Android log lines.
pub fn android_format() -> String {
    const TEMPLATE: &str = "<Date> <Time> <Pid> <Tid> <Level> <Component>: <Content>";
    TEMPLATE.to_owned()
}
/// Compiles the regular expressions used to censor Android log content.
///
/// Panics if any pattern fails to compile (the patterns are fixed literals).
pub fn android_censored_regexps() -> Vec<Regex> {
    let patterns = [
        r"(/[\w-]+)+",
        r"([\w-]+\.){2,}[\w-]+",
        r"\b(\-?\+?\d+)\b|\b0[Xx][a-fA-F\d]+\b|\b[a-fA-F\d]{4,}\b",
    ];
    patterns
        .iter()
        .map(|pattern| Regex::new(pattern).unwrap())
        .collect()
}
/// Returns the format template for HealthApp log lines.
///
/// Fields are separated by `|`, escaped as `\|` because the template is a
/// regex fragment. The literal is kept on one line: splitting it would embed
/// newline characters in the returned template.
pub fn healthapp_format() -> String {
    String::from("<Time>\\|<Component>\\|<Pid>\\|<Content>")
}
pub
fn
healthapp_censored_regexps
()
->
Vec
<
Regex
>
{
return
vec!
[];
],
// oops, numbers require lookbehind, which rust doesn't support, sigh
// Regex::new(r"(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$").unwrap()]; // Numbers
HPC
=>
vec!
[
Regex
::
new
(
r"=\d+"
)
.unwrap
()],
Proxifier
=>
vec!
[
Regex
::
new
(
r"<\d+\ssec"
)
.unwrap
(),
Regex
::
new
(
r"([\w-]+\.)+[\w-]+(:\d+)?"
)
.unwrap
(),
Regex
::
new
(
r"\d{2}:\d{2}(:\d{2})*"
)
.unwrap
(),
Regex
::
new
(
r"[KGTM]B"
)
.unwrap
()],
Android
=>
vec!
[
Regex
::
new
(
r"(/[\w-]+)+"
)
.unwrap
(),
Regex
::
new
(
r"([\w-]+\.){2,}[\w-]+"
)
.unwrap
(),
Regex
::
new
(
r"\b(\-?\+?\d+)\b|\b0[Xx][a-fA-F\d]+\b|\b[a-fA-F\d]{4,}\b"
)
.unwrap
()],
HealthApp
=>
vec!
[],
}
}
// https://doc.rust-lang.org/rust-by-example/std_misc/file/read_lines.html
...
...
@@ -307,50 +294,8 @@ fn test_dictionary_builder_process_line_lookahead_is_some() {
assert_eq!
(
trpl
,
trpl_oracle
);
}
/// Runs `dictionary_builder` on `raw_fn` with the Linux format and censoring
/// regexes, prints the sizes of the three results, and returns them as
/// `(double dictionary, triple dictionary, token list)`.
pub fn parse_raw_linux(
    raw_fn: String,
) -> (HashMap<String, i32>, HashMap<String, i32>, Vec<String>) {
    let (pairs, triples, tokens) =
        dictionary_builder(raw_fn, linux_format(), linux_censored_regexps());
    let (pair_count, triple_count, token_count) = (pairs.len(), triples.len(), tokens.len());
    println!(
        "double dictionary list len {}, triple {}, all tokens {}",
        pair_count, triple_count, token_count
    );
    (pairs, triples, tokens)
}
/// Runs `dictionary_builder` on `raw_fn` with the OpenStack format and
/// censoring regexes, prints the sizes of the three results, and returns them
/// as `(double dictionary, triple dictionary, token list)`.
pub fn parse_raw_openstack(
    raw_fn: String,
) -> (HashMap<String, i32>, HashMap<String, i32>, Vec<String>) {
    let (pairs, triples, tokens) =
        dictionary_builder(raw_fn, openstack_format(), openstack_censored_regexps());
    let (pair_count, triple_count, token_count) = (pairs.len(), triples.len(), tokens.len());
    println!(
        "double dictionary list len {}, triple {}, all tokens {}",
        pair_count, triple_count, token_count
    );
    (pairs, triples, tokens)
}
/// Runs `dictionary_builder` on `raw_fn` with the Spark format and censoring
/// regexes, prints the sizes of the three results, and returns them as
/// `(double dictionary, triple dictionary, token list)`.
pub fn parse_raw_spark(
    raw_fn: String,
) -> (HashMap<String, i32>, HashMap<String, i32>, Vec<String>) {
    let (pairs, triples, tokens) =
        dictionary_builder(raw_fn, spark_format(), spark_censored_regexps());
    let (pair_count, triple_count, token_count) = (pairs.len(), triples.len(), tokens.len());
    println!(
        "double dictionary list len {}, triple {}, all tokens {}",
        pair_count, triple_count, token_count
    );
    (pairs, triples, tokens)
}
/// Runs `dictionary_builder` on `raw_fn` with the HDFS format and censoring
/// regexes, prints the sizes of the three results, and returns them as
/// `(double dictionary, triple dictionary, token list)`.
pub fn parse_raw_hdfs(
    raw_fn: String,
) -> (HashMap<String, i32>, HashMap<String, i32>, Vec<String>) {
    let (pairs, triples, tokens) =
        dictionary_builder(raw_fn, hdfs_format(), hdfs_censored_regexps());
    let (pair_count, triple_count, token_count) = (pairs.len(), triples.len(), tokens.len());
    println!(
        "double dictionary list len {}, triple {}, all tokens {}",
        pair_count, triple_count, token_count
    );
    (pairs, triples, tokens)
}
/// Runs `dictionary_builder` on `raw_fn` with the HPC format and censoring
/// regexes, prints the sizes of the three results, and returns them as
/// `(double dictionary, triple dictionary, token list)`.
pub fn parse_raw_hpc(
    raw_fn: String,
) -> (HashMap<String, i32>, HashMap<String, i32>, Vec<String>) {
    let (pairs, triples, tokens) =
        dictionary_builder(raw_fn, hpc_format(), hpc_censored_regexps());
    let (pair_count, triple_count, token_count) = (pairs.len(), triples.len(), tokens.len());
    println!(
        "double dictionary list len {}, triple {}, all tokens {}",
        pair_count, triple_count, token_count
    );
    (pairs, triples, tokens)
}
/// Runs `dictionary_builder` on `raw_fn` with the Proxifier format and
/// censoring regexes, prints the sizes of the three results, and returns them
/// as `(double dictionary, triple dictionary, token list)`.
pub fn parse_raw_proxifier(
    raw_fn: String,
) -> (HashMap<String, i32>, HashMap<String, i32>, Vec<String>) {
    let (pairs, triples, tokens) =
        dictionary_builder(raw_fn, proxifier_format(), proxifier_censored_regexps());
    let (pair_count, triple_count, token_count) = (pairs.len(), triples.len(), tokens.len());
    println!(
        "double dictionary list len {}, triple {}, all tokens {}",
        pair_count, triple_count, token_count
    );
    (pairs, triples, tokens)
}
/// Runs `dictionary_builder` on `raw_fn` with the Android format and censoring
/// regexes, prints the sizes of the three results, and returns them as
/// `(double dictionary, triple dictionary, token list)`.
pub fn parse_raw_android(
    raw_fn: String,
) -> (HashMap<String, i32>, HashMap<String, i32>, Vec<String>) {
    let (pairs, triples, tokens) =
        dictionary_builder(raw_fn, android_format(), android_censored_regexps());
    let (pair_count, triple_count, token_count) = (pairs.len(), triples.len(), tokens.len());
    println!(
        "double dictionary list len {}, triple {}, all tokens {}",
        pair_count, triple_count, token_count
    );
    (pairs, triples, tokens)
}
pub
fn
parse_raw_healthapp
(
raw_fn
:
String
)
->
(
HashMap
<
String
,
i32
>
,
HashMap
<
String
,
i32
>
,
Vec
<
String
>
)
{
let
(
double_dict
,
triple_dict
,
all_token_list
)
=
dictionary_builder
(
raw_fn
,
healthapp_format
(),
healthapp_censored_regexps
());
/// Runs `dictionary_builder` on `raw_fn` using the format template and
/// censoring regexes selected by `lf`, prints the sizes of the three results,
/// and returns them as `(double dictionary, triple dictionary, token list)`.
pub fn parse_raw(
    raw_fn: String,
    lf: &LogFormat,
) -> (HashMap<String, i32>, HashMap<String, i32>, Vec<String>) {
    // `lf` is already a reference, so it is passed through as-is.
    let (pairs, triples, tokens) =
        dictionary_builder(raw_fn, format_string(lf), censored_regexps(lf));
    let (pair_count, triple_count, token_count) = (pairs.len(), triples.len(), tokens.len());
    println!(
        "double dictionary list len {}, triple {}, all tokens {}",
        pair_count, triple_count, token_count
    );
    (pairs, triples, tokens)
}
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment