Skip to content
Snippets Groups Projects
Commit 7223dc4c authored by Patrick Lam's avatar Patrick Lam
Browse files

code cleanup

parent a93df11f
No related branches found
No related tags found
No related merge requests found
......@@ -73,7 +73,7 @@ fn derive_2grams_from_trigram(trigram:&str) -> Vec<String> {
format!("{}^{}", grams[1], grams[2])];
}
enum LogFormat {
pub enum LogFormat {
Linux,
OpenStack,
Spark,
......@@ -93,62 +93,49 @@ fn main() {
let args = Args::parse();
let mut input_fn = None;
let mut log_format = None;
let mut log_format_opt = None;
// hey, please let me know (email) if there's a more idiomatic way to do this
if let Some(raw_linux) = args.raw_linux {
log_format = Some(Linux);
log_format_opt = Some(Linux);
input_fn = Some(raw_linux);
} else if let Some(raw_openstack) = args.raw_openstack {
log_format = Some(OpenStack);
log_format_opt = Some(OpenStack);
input_fn = Some(raw_openstack);
} else if let Some(raw_spark) = args.raw_spark {
log_format = Some(Spark);
log_format_opt = Some(Spark);
input_fn = Some(raw_spark);
} else if let Some(raw_hdfs) = args.raw_hdfs {
log_format = Some(HDFS);
log_format_opt = Some(HDFS);
input_fn = Some(raw_hdfs);
} else if let Some(raw_hpc) = args.raw_hpc {
log_format = Some(HPC);
log_format_opt = Some(HPC);
input_fn = Some(raw_hpc);
} else if let Some(raw_proxifier) = args.raw_proxifier {
log_format = Some(Proxifier);
log_format_opt = Some(Proxifier);
input_fn = Some(raw_proxifier);
} else if let Some(raw_android) = args.raw_android {
log_format = Some(Android);
log_format_opt = Some(Android);
input_fn = Some(raw_android);
} else if let Some(raw_healthapp) = args.raw_healthapp {
log_format = Some(HealthApp);
log_format_opt = Some(HealthApp);
input_fn = Some(raw_healthapp);
}
let log_format = match log_format_opt {
None => panic!("must specify a raw input file"),
Some(lf) => lf,
};
let cutoff = match args.cutoff {
None => 3,
Some(c) => c,
};
let (double_dict, triple_dict, _all_token_list) = match log_format {
Some(Linux) => packages::parser::parse_raw_linux(input_fn.unwrap()),
Some(OpenStack) => packages::parser::parse_raw_openstack(input_fn.unwrap()),
Some(Spark) => packages::parser::parse_raw_spark(input_fn.unwrap()),
Some(HDFS) => packages::parser::parse_raw_hdfs(input_fn.unwrap()),
Some(HPC) => packages::parser::parse_raw_hpc(input_fn.unwrap()),
Some(Proxifier) => packages::parser::parse_raw_proxifier(input_fn.unwrap()),
Some(Android) => packages::parser::parse_raw_android(input_fn.unwrap()),
Some(HealthApp) => packages::parser::parse_raw_healthapp(input_fn.unwrap()),
None => panic!("no log format specified"),
};
let (double_dict, triple_dict, _all_token_list) =
packages::parser::parse_raw(input_fn.unwrap(), &log_format);
view_double_and_triple_dicts(&double_dict, &triple_dict);
let (format_string_re, censored_regexps) = match log_format {
Some(Linux) => (packages::parser::regex_generator(packages::parser::linux_format()), packages::parser::linux_censored_regexps()),
Some(OpenStack) => (packages::parser::regex_generator(packages::parser::openstack_format()), packages::parser::openstack_censored_regexps()),
Some(Spark) => (packages::parser::regex_generator(packages::parser::spark_format()), packages::parser::spark_censored_regexps()),
Some(HDFS) => (packages::parser::regex_generator(packages::parser::hdfs_format()), packages::parser::hdfs_censored_regexps()),
Some(HPC) => (packages::parser::regex_generator(packages::parser::hpc_format()), packages::parser::hpc_censored_regexps()),
Some(Proxifier) => (packages::parser::regex_generator(packages::parser::proxifier_format()), packages::parser::proxifier_censored_regexps()),
Some(Android) => (packages::parser::regex_generator(packages::parser::android_format()), packages::parser::android_censored_regexps()),
Some(HealthApp) => (packages::parser::regex_generator(packages::parser::healthapp_format()), packages::parser::healthapp_censored_regexps()),
None => panic!("no log format specified"),
};
let (format_string_re, censored_regexps) =
(packages::parser::regex_generator(packages::parser::format_string(&log_format)), packages::parser::censored_regexps(&log_format));
//let sample_string = "Jun 23 23:30:05 combo sshd(pam_unix)[26190]: authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=218.22.3.51 user=root authentication".to_string();
// add befores and afters to the sample string, yielding extended_sample_string
......
......@@ -5,83 +5,70 @@ use regex::Regex;
use std::collections::HashMap;
use std::collections::BTreeSet;
/// Header template for Linux logs, as handed to `regex_generator` and
/// `dictionary_builder`.
///
/// The optional `[<PID>]` group carries escaped brackets (`\[` / `\]`) in the
/// resulting string.
pub fn linux_format() -> String {
    String::from("<Month> <Date> <Time> <Level> <Component>(\\[<PID>\\])?: <Content>")
}
/// Compiled censoring patterns used with the Linux log format.
///
/// Patterns are compiled in the order listed; an invalid pattern panics via
/// `unwrap`.
pub fn linux_censored_regexps() -> Vec<Regex> {
    [
        // four dot-separated digit runs (IPv4-style)
        r"(\d+\.){3}\d+",
        // two 3-character words followed by "DD HH:MM:SS YYYY"
        r"\w{3} \w{3} \d{2} \d{2}:\d{2}:\d{2} \d{4}",
        // bare HH:MM:SS
        r"\d{2}:\d{2}:\d{2}",
    ]
    .iter()
    .map(|pattern| Regex::new(pattern).unwrap())
    .collect()
}
/// Header template for OpenStack logs, as handed to `regex_generator` and
/// `dictionary_builder`.
///
/// Note that the surrounding single quotes are part of the returned string.
pub fn openstack_format() -> String {
    String::from(r"'<Logrecord> <Date> <Time> <Pid> <Level> <Component> \[<ADDR>\] <Content>'")
}
/// Compiled censoring patterns used with the OpenStack log format.
///
/// Patterns are compiled in the order listed; an invalid pattern panics via
/// `unwrap`.
pub fn openstack_censored_regexps() -> Vec<Regex> {
    [
        // one or more dotted digit quads, each optionally followed by a comma
        r"((\d+\.){3}\d+,?)+",
        // a '/' up to and including the next whitespace character (lazy)
        r"/.+?\s",
    ]
    .iter()
    .map(|pattern| Regex::new(pattern).unwrap())
    .collect()
    // I commented out Regex::new(r"\d+").unwrap() because that censors all numbers, which may not be what we want?
}
/// Header template for Spark logs, as handed to `regex_generator` and
/// `dictionary_builder`.
pub fn spark_format() -> String {
    String::from("<Date> <Time> <Level> <Component>: <Content>")
}
/// Compiled censoring patterns used with the Spark log format.
///
/// Patterns are compiled in the order listed; an invalid pattern panics via
/// `unwrap`.
pub fn spark_censored_regexps() -> Vec<Regex> {
    [
        // four dot-separated digit runs (IPv4-style)
        r"(\d+\.){3}\d+",
        // standalone B / KB / GB / TB / MB token
        r"\b[KGTM]?B\b",
        // three or more dot-separated word/hyphen segments
        r"([\w-]+\.){2,}[\w-]+",
    ]
    .iter()
    .map(|pattern| Regex::new(pattern).unwrap())
    .collect()
}
pub fn hdfs_format() -> String {
return "<Date> <Time> <Pid> <Level> <Component>: <Content>".to_string();
use crate::LogFormat;
use crate::LogFormat::Linux;
use crate::LogFormat::OpenStack;
use crate::LogFormat::Spark;
use crate::LogFormat::HDFS;
use crate::LogFormat::HPC;
use crate::LogFormat::Proxifier;
use crate::LogFormat::Android;
use crate::LogFormat::HealthApp;
pub fn format_string(lf: &LogFormat) -> String {
match lf {
Linux =>
r"<Month> <Date> <Time> <Level> <Component>(\\[<PID>\\])?: <Content>".to_string(),
OpenStack =>
r"'<Logrecord> <Date> <Time> <Pid> <Level> <Component> \[<ADDR>\] <Content>'".to_string(),
Spark =>
r"<Date> <Time> <Level> <Component>: <Content>".to_string(),
HDFS =>
r"<Date> <Time> <Pid> <Level> <Component>: <Content>".to_string(),
HPC =>
r"<LogId> <Node> <Component> <State> <Time> <Flag> <Content>".to_string(),
Proxifier =>
r"[<Time>] <Program> - <Content>".to_string(),
Android =>
r"<Date> <Time> <Pid> <Tid> <Level> <Component>: <Content>".to_string(),
HealthApp =>
"<Time>\\|<Component>\\|<Pid>\\|<Content>".to_string()
}
}
pub fn hdfs_censored_regexps() -> Vec<Regex> {
return vec![Regex::new(r"blk_(|-)[0-9]+").unwrap(), // block id
pub fn censored_regexps(lf: &LogFormat) -> Vec<Regex> {
match lf {
Linux =>
vec![Regex::new(r"(\d+\.){3}\d+").unwrap(),
Regex::new(r"\w{3} \w{3} \d{2} \d{2}:\d{2}:\d{2} \d{4}").unwrap(),
Regex::new(r"\d{2}:\d{2}:\d{2}").unwrap()],
OpenStack =>
vec![Regex::new(r"((\d+\.){3}\d+,?)+").unwrap(),
Regex::new(r"/.+?\s").unwrap()],
// I commented out Regex::new(r"\d+").unwrap() because that censors all numbers, which may not be what we want?
Spark =>
vec![Regex::new(r"(\d+\.){3}\d+").unwrap(),
Regex::new(r"\b[KGTM]?B\b").unwrap(),
Regex::new(r"([\w-]+\.){2,}[\w-]+").unwrap()],
HDFS =>
vec![Regex::new(r"blk_(|-)[0-9]+").unwrap(), // block id
Regex::new(r"(/|)([0-9]+\.){3}[0-9]+(:[0-9]+|)(:|)").unwrap() // IP
];
// oops, numbers require lookbehind, which rust doesn't support, sigh
// Regex::new(r"(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$").unwrap()]; // Numbers
}
/// Header template for HPC logs, as handed to `regex_generator` and
/// `dictionary_builder`.
pub fn hpc_format() -> String {
    String::from("<LogId> <Node> <Component> <State> <Time> <Flag> <Content>")
}
/// Compiled censoring patterns used with the HPC log format.
///
/// An invalid pattern panics via `unwrap`.
pub fn hpc_censored_regexps() -> Vec<Regex> {
    // '=' immediately followed by one or more digits
    let equals_number = Regex::new(r"=\d+").unwrap();
    vec![equals_number]
}
/// Header template for Proxifier logs, as handed to `regex_generator` and
/// `dictionary_builder`.
pub fn proxifier_format() -> String {
    String::from("[<Time>] <Program> - <Content>")
}
/// Compiled censoring patterns used with the Proxifier log format.
///
/// Patterns are compiled in the order listed; an invalid pattern panics via
/// `unwrap`.
pub fn proxifier_censored_regexps() -> Vec<Regex> {
    [
        // "<N sec" style durations
        r"<\d+\ssec",
        // dot-separated word/hyphen segments with an optional ":port"
        r"([\w-]+\.)+[\w-]+(:\d+)?",
        // NN:NN followed by any number of further ":NN" groups
        r"\d{2}:\d{2}(:\d{2})*",
        // KB / GB / TB / MB
        r"[KGTM]B",
    ]
    .iter()
    .map(|pattern| Regex::new(pattern).unwrap())
    .collect()
}
/// Header template for Android logs, as handed to `regex_generator` and
/// `dictionary_builder`.
pub fn android_format() -> String {
    String::from("<Date> <Time> <Pid> <Tid> <Level> <Component>: <Content>")
}
/// Compiled censoring patterns used with the Android log format.
///
/// Patterns are compiled in the order listed; an invalid pattern panics via
/// `unwrap`.
pub fn android_censored_regexps() -> Vec<Regex> {
    [
        // one or more "/segment" groups (path-like)
        r"(/[\w-]+)+",
        // three or more dot-separated word/hyphen segments
        r"([\w-]+\.){2,}[\w-]+",
        // signed decimal, 0x-prefixed hex, or a run of 4+ hex digits
        r"\b(\-?\+?\d+)\b|\b0[Xx][a-fA-F\d]+\b|\b[a-fA-F\d]{4,}\b",
    ]
    .iter()
    .map(|pattern| Regex::new(pattern).unwrap())
    .collect()
}
/// Header template for HealthApp logs, as handed to `regex_generator` and
/// `dictionary_builder`.
///
/// Fields are separated by an escaped pipe (`\|`) in the resulting string.
pub fn healthapp_format() -> String {
    String::from("<Time>\\|<Component>\\|<Pid>\\|<Content>")
}
pub fn healthapp_censored_regexps() -> Vec<Regex> {
return vec![];
],
// oops, numbers require lookbehind, which rust doesn't support, sigh
// Regex::new(r"(?<=[^A-Za-z0-9])(\-?\+?\d+)(?=[^A-Za-z0-9])|[0-9]+$").unwrap()]; // Numbers
HPC =>
vec![Regex::new(r"=\d+").unwrap()],
Proxifier =>
vec![Regex::new(r"<\d+\ssec").unwrap(),
Regex::new(r"([\w-]+\.)+[\w-]+(:\d+)?").unwrap(),
Regex::new(r"\d{2}:\d{2}(:\d{2})*").unwrap(),
Regex::new(r"[KGTM]B").unwrap()],
Android =>
vec![Regex::new(r"(/[\w-]+)+").unwrap(),
Regex::new(r"([\w-]+\.){2,}[\w-]+").unwrap(),
Regex::new(r"\b(\-?\+?\d+)\b|\b0[Xx][a-fA-F\d]+\b|\b[a-fA-F\d]{4,}\b").unwrap()],
HealthApp => vec![],
}
}
// https://doc.rust-lang.org/rust-by-example/std_misc/file/read_lines.html
......@@ -307,50 +294,8 @@ fn test_dictionary_builder_process_line_lookahead_is_some() {
assert_eq!(trpl, trpl_oracle);
}
/// Runs `dictionary_builder` on `raw_fn` with the Linux format and censoring
/// regexps, prints the size of each of the three results, and returns them
/// unchanged.
pub fn parse_raw_linux(raw_fn: String) -> (HashMap<String, i32>, HashMap<String, i32>, Vec<String>) {
    let (pairs, triples, tokens) =
        dictionary_builder(raw_fn, linux_format(), linux_censored_regexps());
    println!(
        "double dictionary list len {}, triple {}, all tokens {}",
        pairs.len(),
        triples.len(),
        tokens.len()
    );
    (pairs, triples, tokens)
}
/// Runs `dictionary_builder` on `raw_fn` with the OpenStack format and
/// censoring regexps, prints the size of each of the three results, and
/// returns them unchanged.
pub fn parse_raw_openstack(raw_fn: String) -> (HashMap<String, i32>, HashMap<String, i32>, Vec<String>) {
    let (pairs, triples, tokens) =
        dictionary_builder(raw_fn, openstack_format(), openstack_censored_regexps());
    println!(
        "double dictionary list len {}, triple {}, all tokens {}",
        pairs.len(),
        triples.len(),
        tokens.len()
    );
    (pairs, triples, tokens)
}
/// Runs `dictionary_builder` on `raw_fn` with the Spark format and censoring
/// regexps, prints the size of each of the three results, and returns them
/// unchanged.
pub fn parse_raw_spark(raw_fn: String) -> (HashMap<String, i32>, HashMap<String, i32>, Vec<String>) {
    let (pairs, triples, tokens) =
        dictionary_builder(raw_fn, spark_format(), spark_censored_regexps());
    println!(
        "double dictionary list len {}, triple {}, all tokens {}",
        pairs.len(),
        triples.len(),
        tokens.len()
    );
    (pairs, triples, tokens)
}
/// Runs `dictionary_builder` on `raw_fn` with the HDFS format and censoring
/// regexps, prints the size of each of the three results, and returns them
/// unchanged.
pub fn parse_raw_hdfs(raw_fn: String) -> (HashMap<String, i32>, HashMap<String, i32>, Vec<String>) {
    let (pairs, triples, tokens) =
        dictionary_builder(raw_fn, hdfs_format(), hdfs_censored_regexps());
    println!(
        "double dictionary list len {}, triple {}, all tokens {}",
        pairs.len(),
        triples.len(),
        tokens.len()
    );
    (pairs, triples, tokens)
}
/// Runs `dictionary_builder` on `raw_fn` with the HPC format and censoring
/// regexps, prints the size of each of the three results, and returns them
/// unchanged.
pub fn parse_raw_hpc(raw_fn: String) -> (HashMap<String, i32>, HashMap<String, i32>, Vec<String>) {
    let (pairs, triples, tokens) =
        dictionary_builder(raw_fn, hpc_format(), hpc_censored_regexps());
    println!(
        "double dictionary list len {}, triple {}, all tokens {}",
        pairs.len(),
        triples.len(),
        tokens.len()
    );
    (pairs, triples, tokens)
}
/// Runs `dictionary_builder` on `raw_fn` with the Proxifier format and
/// censoring regexps, prints the size of each of the three results, and
/// returns them unchanged.
pub fn parse_raw_proxifier(raw_fn: String) -> (HashMap<String, i32>, HashMap<String, i32>, Vec<String>) {
    let (pairs, triples, tokens) =
        dictionary_builder(raw_fn, proxifier_format(), proxifier_censored_regexps());
    println!(
        "double dictionary list len {}, triple {}, all tokens {}",
        pairs.len(),
        triples.len(),
        tokens.len()
    );
    (pairs, triples, tokens)
}
/// Runs `dictionary_builder` on `raw_fn` with the Android format and
/// censoring regexps, prints the size of each of the three results, and
/// returns them unchanged.
pub fn parse_raw_android(raw_fn: String) -> (HashMap<String, i32>, HashMap<String, i32>, Vec<String>) {
    let (pairs, triples, tokens) =
        dictionary_builder(raw_fn, android_format(), android_censored_regexps());
    println!(
        "double dictionary list len {}, triple {}, all tokens {}",
        pairs.len(),
        triples.len(),
        tokens.len()
    );
    (pairs, triples, tokens)
}
pub fn parse_raw_healthapp(raw_fn: String) -> (HashMap<String, i32>, HashMap<String, i32>, Vec<String>) {
let (double_dict, triple_dict, all_token_list) = dictionary_builder(raw_fn, healthapp_format(), healthapp_censored_regexps());
/// Runs `dictionary_builder` on `raw_fn` using the template and censoring
/// regexps selected by `lf`, prints the size of each of the three results,
/// and returns them unchanged.
///
/// # Arguments
/// * `raw_fn` - forwarded as the first argument of `dictionary_builder`.
/// * `lf` - log format used to look up the template (`format_string`) and
///   the censoring patterns (`censored_regexps`).
pub fn parse_raw(raw_fn: String, lf:&LogFormat) -> (HashMap<String, i32>, HashMap<String, i32>, Vec<String>) {
    let template = format_string(lf);
    let censors = censored_regexps(lf);
    let (pairs, triples, tokens) = dictionary_builder(raw_fn, template, censors);
    println!(
        "double dictionary list len {}, triple {}, all tokens {}",
        pairs.len(),
        triples.len(),
        tokens.len()
    );
    (pairs, triples, tokens)
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment