Skip to content
Snippets Groups Projects
Commit eb137d52 authored by Patrick Lam's avatar Patrick Lam
Browse files

refactor to linux/openstack input options

parent 1de50455
No related branches found
No related tags found
No related merge requests found
......@@ -28,7 +28,7 @@ fn derive_2grams(trigram:&str) -> Vec<String> {
fn main() {
let args = Args::parse();
let (double_dict, triple_dict, all_token_list) = packages::parser::parse_raw(args.raw);
let (double_dict, triple_dict, all_token_list) = packages::parser::parse_raw_linux(args.raw);
packages::parser::print_dict(&triple_dict);
const CUTOFF : i32 = 72;
......
......@@ -5,6 +5,30 @@ use regex::Regex;
use std::collections::HashMap;
use std::collections::BTreeSet;
/// Returns the log-line template for Linux syslog-style input.
///
/// Each `<Name>` placeholder marks a header field; the text between
/// placeholders (including the optional `(\[<PID>\])?` group) is regex
/// syntax that is kept as-is when the template is expanded.
fn linux_format() -> String {
    String::from("<Month> <Date> <Time> <Level> <Component>(\\[<PID>\\])?: <Content>")
}
/// Builds the domain-specific regexps used for Linux logs.
///
/// The patterns are, in order:
/// 1. four dot-separated digit groups (e.g. `2.34.4.5`),
/// 2. a full timestamp such as `Fri Jun 17 20:55:07 2005`,
/// 3. a bare `HH:MM:SS` time.
///
/// Panics only if one of the hard-coded patterns fails to compile.
fn linux_regexps() -> Vec<Regex> {
    let patterns = [
        r"(\d+\.){3}\d+",
        r"\w{3} \w{3} \d{2} \d{2}:\d{2}:\d{2} \d{4}",
        r"\d{2}:\d{2}:\d{2}",
    ];
    patterns
        .iter()
        .map(|pattern| Regex::new(pattern).unwrap())
        .collect()
}
/// Returns the log-line template for OpenStack input.
///
/// Like `linux_format`, this is a *template* made of `<Name>` placeholders,
/// not a finished regular expression: `regex_generator_helper` is what turns
/// each placeholder into a `(?P<Name>.*?)` group and each run of spaces into
/// `\s+`. Returning an already-expanded regex here would be re-expanded by
/// the helper (every `<Name>` inside `(?P<Name>...)` matches the placeholder
/// pattern) and yield an unbalanced, uncompilable expression.
fn openstack_format() -> String {
    String::from("<Logrecord> <Date> <Time> <Pid> <Level> <Component> (\\[<ADDR>\\])? <Content>")
}
/// Builds the domain-specific regexps used for OpenStack logs.
///
/// The patterns are, in order:
/// 1. one or more dotted-quad digit groups, each optionally followed by a comma,
/// 2. a `/` followed by the shortest run of characters up to a whitespace,
/// 3. any run of digits.
///
/// Panics only if one of the hard-coded patterns fails to compile.
fn openstack_regexps() -> Vec<Regex> {
    let patterns = [
        r"((\d+\.){3}\d+,?)+",
        r"/.+?\s",
        r"\d+",
    ];
    patterns
        .iter()
        .map(|pattern| Regex::new(pattern).unwrap())
        .collect()
}
/// Returns the default set of domain-specific regexps.
///
/// Currently hard-wired to the Linux set from `linux_regexps`.
fn regexps() -> Vec<Regex> {
    linux_regexps()
}
// https://doc.rust-lang.org/rust-by-example/std_misc/file/read_lines.html
// The output is wrapped in a Result to allow matching on errors
// Returns an Iterator to the Reader of the lines of the file.
......@@ -14,29 +38,37 @@ where P: AsRef<Path>, {
Ok(io::BufReader::new(file).lines())
}
fn regex_generator(_format: String) -> Regex {
// let mut headers : Vec<String> = Vec::new();
// let splitters_re = Regex::new(r"(<[^<>]+>)").unwrap();
// let mut r = String::new();
// this is wrong, want to split_inclusive by some lambda
// for (k, splitter) in format.split_inclusive(splitters_re).enumerate() {
// println!("{} {}", k, splitter);
// if k % 2 == 0 {
// r.push_str(&splitter.replace(" +", r"\s+",));
// } else {
// let brackets : &[_] = &['<', '>'];
// let header = splitter.trim_matches(brackets).to_string();
// println!("{}", header);
// r.push_str(format!("(?P<{}>.*?)", header).as_str());
// headers.push(header);
// }
// }
// turns out I can just hardcode the answer for now
let r = r"(?P<Month>.*?)\s+(?P<Date>.*?)\s+(?P<Time>.*?)\s+(?P<Level>.*?)\s+(?P<Component>.*?)(\[(?P<PID>.*?)\])?:\s+(?P<Content>.*?)";
return Regex::new(format!("^{}$", r).as_str()).unwrap();
/// Expands a log-format template into the body of a regular expression.
///
/// Every `<Name>` placeholder becomes a lazy named capture group
/// `(?P<Name>.*?)`. The text between placeholders is copied through
/// unchanged (it may itself contain regex syntax such as `(\[` ... `\])?`),
/// except that each run of one or more spaces is replaced by `\s+`.
///
/// Literal text before the first placeholder and after the last one is
/// preserved as well, so templates such as `[<Time>] <Content>` keep their
/// leading bracket. The result carries no `^`/`$` anchors; `regex_generator`
/// adds those.
fn regex_generator_helper(format: String) -> String {
    let splitters_re = Regex::new(r"(<[^<>]+>)").unwrap();
    let spaces_re = Regex::new(r" +").unwrap();
    let brackets : &[_] = &['<', '>'];
    let mut r = String::new();
    // Byte offset just past the previously consumed placeholder (0 at start).
    let mut prev_end = 0;
    for m in splitters_re.find_iter(&format) {
        // Separator text since the previous placeholder; `replace_all` so that
        // every run of spaces in it is generalized, not just the first one.
        r.push_str(&spaces_re.replace_all(&format[prev_end..m.start()], r"\s+"));
        let header = m.as_str().trim_matches(brackets);
        r.push_str(&format!("(?P<{}>.*?)", header));
        prev_end = m.end();
    }
    // Trailing literal text after the last placeholder (empty for most formats).
    r.push_str(&spaces_re.replace_all(&format[prev_end..], r"\s+"));
    r
}
/// Compiles a log-format template into a regex anchored to the whole line.
///
/// The template is expanded by `regex_generator_helper` and wrapped in
/// `^...$`. Panics if the expanded pattern is not a valid regex.
fn regex_generator(format: String) -> Regex {
    let anchored = format!("^{}$", regex_generator_helper(format));
    Regex::new(&anchored).unwrap()
}
/// Checks the template expansion for the Linux and OpenStack formats.
#[test]
fn test_regex_generator_helper() {
    // (template, expected expanded regex body)
    let cases = [
        (
            r"<Month> <Date> <Time> <Level> <Component>(\[<PID>\])?: <Content>",
            r"(?P<Month>.*?)\s+(?P<Date>.*?)\s+(?P<Time>.*?)\s+(?P<Level>.*?)\s+(?P<Component>.*?)(\[(?P<PID>.*?)\])?:\s+(?P<Content>.*?)",
        ),
        (
            r"<Logrecord> <Date> <Time> <Pid> <Level> <Component> (\[<ADDR>\])? <Content>",
            r"(?P<Logrecord>.*?)\s+(?P<Date>.*?)\s+(?P<Time>.*?)\s+(?P<Pid>.*?)\s+(?P<Level>.*?)\s+(?P<Component>.*?)\s+(\[(?P<ADDR>.*?)\])?\s+(?P<Content>.*?)",
        ),
    ];
    for (template, expected) in cases.iter() {
        assert_eq!(regex_generator_helper(template.to_string()), *expected);
    }
}
/// Replaces provided (domain-specific) regexps with <*> in the log_line.
......@@ -51,10 +83,7 @@ fn apply_domain_specific_re(log_line: String, domain_specific_re:&Vec<Regex>) ->
#[test]
fn test_apply_domain_specific_re() {
let line = "q2.34.4.5 Jun 14 15:16:02 combo sshd(pam_unix)[19937]: check pass; Fri Jun 17 20:55:07 2005 user unknown".to_string();
let linux_res = vec![Regex::new(r"(\d+\.){3}\d+").unwrap(),
Regex::new(r"\w{3} \w{3} \d{2} \d{2}:\d{2}:\d{2} \d{4}").unwrap(),
Regex::new(r"\d{2}:\d{2}:\d{2}").unwrap()];
let censored_line = apply_domain_specific_re(line, &linux_res);
let censored_line = apply_domain_specific_re(line, &linux_regexps());
assert_eq!(censored_line, " q<*> Jun 14 <*> combo sshd(pam_unix)[19937]: check pass; <*> user unknown");
}
......@@ -71,9 +100,8 @@ fn token_splitter(log_line: String, re:&Regex, domain_specific_re:&Vec<Regex>) -
#[test]
fn test_token_splitter() {
let line = "Jun 14 15:16:02 combo sshd(pam_unix)[19937]: check pass; user unknown".to_string();
let re = regex_generator("<Month> <Date> <Time> <Level> <Component>(\\[<PID>\\])?: <Content>".to_string());
let linux_res = vec![Regex::new(r"(\d+\.){3}\d+").unwrap(), Regex::new(r"\d{2}:\d{2}:\d{2}").unwrap()];
let split_line = token_splitter(line, &re, &linux_res);
let re = regex_generator(linux_format());
let split_line = token_splitter(line, &re, &linux_regexps());
assert_eq!(split_line, vec!["check", "pass;", "user", "unknown"]);
}
......@@ -154,12 +182,11 @@ fn dictionary_builder(raw_fn: String, format: String, regexps: Vec<Regex>) -> (H
#[test]
fn test_dictionary_builder_process_line_lookahead_is_none() {
let line = "Jun 14 15:16:02 combo sshd(pam_unix)[19937]: check pass; user unknown".to_string();
let re = regex_generator("<Month> <Date> <Time> <Level> <Component>(\\[<PID>\\])?: <Content>".to_string());
let linux_res = vec![Regex::new(r"(\d+\.){3}\d+").unwrap(), Regex::new(r"\d{2}:\d{2}:\d{2}").unwrap()];
let re = regex_generator(linux_format());
let mut dbl = HashMap::new();
let mut trpl = HashMap::new();
let mut all_token_list = vec![];
process_dictionary_builder_line(line, None, &re, &linux_res, &mut dbl, &mut trpl, &mut all_token_list);
process_dictionary_builder_line(line, None, &re, &linux_regexps(), &mut dbl, &mut trpl, &mut all_token_list);
let mut dbl_oracle = HashMap::new();
dbl_oracle.insert("user^unknown".to_string(), 1);
......@@ -177,12 +204,11 @@ fn test_dictionary_builder_process_line_lookahead_is_none() {
fn test_dictionary_builder_process_line_lookahead_is_some() {
let line = "Jun 14 15:16:02 combo sshd(pam_unix)[19937]: check pass; user unknown".to_string();
let next_line = "Jun 14 15:16:02 combo sshd(pam_unix)[19937]: baz bad".to_string();
let re = regex_generator("<Month> <Date> <Time> <Level> <Component>(\\[<PID>\\])?: <Content>".to_string());
let linux_res = vec![Regex::new(r"(\d+\.){3}\d+").unwrap(), Regex::new(r"\d{2}:\d{2}:\d{2}").unwrap()];
let re = regex_generator(linux_format());
let mut dbl = HashMap::new();
let mut trpl = HashMap::new();
let mut all_token_list = vec![];
process_dictionary_builder_line(line, Some(next_line), &re, &linux_res, &mut dbl, &mut trpl, &mut all_token_list);
process_dictionary_builder_line(line, Some(next_line), &re, &linux_regexps(), &mut dbl, &mut trpl, &mut all_token_list);
let mut dbl_oracle = HashMap::new();
dbl_oracle.insert("unknown^baz".to_string(), 1);
......@@ -199,20 +225,21 @@ fn test_dictionary_builder_process_line_lookahead_is_some() {
assert_eq!(trpl, trpl_oracle);
}
pub fn parse_raw(raw_fn: String) -> (HashMap<String, i32>, HashMap<String, i32>, Vec<String>) {
let linux_format : String = "<Month> <Date> <Time> <Level> <Component>(\\[<PID>\\])?: <Content>".to_string(); // Linux format
let linux_re : Vec<Regex> = vec![Regex::new(r"\w{3} \w{3} (\d{2}| \d{1}) \d{2}:\d{2}:\d{2} \d{4}").unwrap(),
Regex::new(r"(\d+\.){3}\d+").unwrap(),
Regex::new(r"\d{2}:\d{2}:\d{2}").unwrap()
];
let (double_dict, triple_dict, all_token_list) = dictionary_builder(raw_fn, linux_format, linux_re);
/// Parses the raw log file `raw_fn` using the Linux format and regexps.
///
/// Returns `(double_dict, triple_dict, all_token_list)` exactly as produced
/// by `dictionary_builder`, after printing the three sizes to stdout.
pub fn parse_raw_linux(raw_fn: String) -> (HashMap<String, i32>, HashMap<String, i32>, Vec<String>) {
    let dictionaries = dictionary_builder(raw_fn, linux_format(), linux_regexps());
    println!(
        "double dictionary list len {}, triple {}, all tokens {}",
        dictionaries.0.len(),
        dictionaries.1.len(),
        dictionaries.2.len()
    );
    dictionaries
}
/// Parses the raw log file `raw_fn` using the OpenStack format and regexps.
///
/// Returns `(double_dict, triple_dict, all_token_list)` exactly as produced
/// by `dictionary_builder`, after printing the three sizes to stdout.
pub fn parse_raw_openstack(raw_fn: String) -> (HashMap<String, i32>, HashMap<String, i32>, Vec<String>) {
    let dictionaries = dictionary_builder(raw_fn, openstack_format(), openstack_regexps());
    println!(
        "double dictionary list len {}, triple {}, all tokens {}",
        dictionaries.0.len(),
        dictionaries.1.len(),
        dictionaries.2.len()
    );
    dictionaries
}
#[test]
fn test_parse_raw() {
let (double_dict, triple_dict, all_token_list) = parse_raw("data/from_paper.log".to_string());
fn test_parse_raw_linux() {
let (double_dict, triple_dict, all_token_list) = parse_raw_linux("data/from_paper.log".to_string());
let all_token_list_oracle = vec!["Found".to_string(), "block".to_string(), "rdd_42_20".to_string(), "locally".to_string(),"rdd_42_22".to_string(), "rdd_42_23".to_string(), "rdd_42_24".to_string()];
assert_eq!(all_token_list, all_token_list_oracle);
let mut double_dict_oracle = HashMap::new();
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment