refactor to linux/openstack input options

eb137d52 · Patrick Lam · 1de50455 · eb137d52 · eb137d52
Commit eb137d52 authored 2 years ago by Patrick Lam
--- a/src/main.rs
+++ b/src/main.rs
@@ -28,7 +28,7 @@ fn derive_2grams(trigram:&str) -> Vec<String> {
 fn main() {
    let args = Args::parse();

-    let (double_dict, triple_dict, all_token_list) = packages::parser::parse_raw(args.raw);
+    let (double_dict, triple_dict, all_token_list) = packages::parser::parse_raw_linux(args.raw);
    packages::parser::print_dict(&triple_dict);

    const CUTOFF : i32 = 72;

--- a/src/packages/parser.rs
+++ b/src/packages/parser.rs
@@ -5,6 +5,30 @@ use regex::Regex;
 use std::collections::HashMap;
 use std::collections::BTreeSet;

+fn linux_format() -> String { // `<Header>` template for Linux log lines; expanded by regex_generator
+    String::from("<Month> <Date> <Time> <Level> <Component>(\\[<PID>\\])?: <Content>")
+}
+
+fn linux_regexps() -> Vec<Regex> { // Linux patterns handed to apply_domain_specific_re, which replaces matches with <*>
+    [r"(\d+\.){3}\d+",
+     r"\w{3} \w{3} \d{2} \d{2}:\d{2}:\d{2} \d{4}",
+     r"\d{2}:\d{2}:\d{2}"].iter().map(|pat| Regex::new(pat).unwrap()).collect()
+}
+
+fn openstack_format() -> String { // `<Header>` template for OpenStack log lines, same convention as linux_format
+    return r"<Logrecord> <Date> <Time> <Pid> <Level> <Component> (\[<ADDR>\])? <Content>".to_string(); // must stay a template: regex_generator does the expansion
+}
+
+fn openstack_regexps() -> Vec<Regex> { // OpenStack counterpart of linux_regexps
+    [r"((\d+\.){3}\d+,?)+",
+     r"/.+?\s",
+     r"\d+"].iter().map(|pat| Regex::new(pat).unwrap()).collect()
+}
+
+fn regexps() -> Vec<Regex> { // default pattern set: currently the Linux one
+    linux_regexps()
+}
+
 // https://doc.rust-lang.org/rust-by-example/std_misc/file/read_lines.html
 // The output is wrapped in a Result to allow matching on errors
 // Returns an Iterator to the Reader of the lines of the file.
@@ -14,29 +38,37 @@ where P: AsRef<Path>, {
    Ok(io::BufReader::new(file).lines())
 }

-fn regex_generator(_format: String) -> Regex {
-    // let mut headers : Vec<String> = Vec::new();
-    // let splitters_re = Regex::new(r"(<[^<>]+>)").unwrap();
-
-    // let mut r = String::new();
-    // this is wrong, want to split_inclusive by some lambda
-    // for (k, splitter) in format.split_inclusive(splitters_re).enumerate() {
-    //     println!("{} {}", k, splitter);
-    //     if k % 2 == 0 {
-    //         r.push_str(&splitter.replace(" +", r"\s+",));
-    //     } else {
-    //         let brackets : &[_] = &['<', '>'];
-    //         let header = splitter.trim_matches(brackets).to_string();
-    //         println!("{}", header);
-    //         r.push_str(format!("(?P<{}>.*?)", header).as_str());
-    //         headers.push(header);
-    //     }
-    // }
-
-    // turns out I can just hardcode the answer for now
-    let r = r"(?P<Month>.*?)\s+(?P<Date>.*?)\s+(?P<Time>.*?)\s+(?P<Level>.*?)\s+(?P<Component>.*?)(\[(?P<PID>.*?)\])?:\s+(?P<Content>.*?)";
-
-    return Regex::new(format!("^{}$", r).as_str()).unwrap();
+fn regex_generator_helper(format: String) -> String { // expands a `<Header>` template into a regex source string
+    let splitters_re = Regex::new(r"(<[^<>]+>)").unwrap(); // one match per `<Header>` placeholder
+    let spaces_re = Regex::new(r" +").unwrap();
+    let brackets : &[_] = &['<', '>'];
+
+    let mut r = String::new();
+    let mut prev_end = 0;
+    for m in splitters_re.find_iter(&format) {
+        // Literal text before this header (including any before the first one); every space run becomes \s+.
+        r.push_str(&spaces_re.replace_all(&format[prev_end..m.start()], r"\s+"));
+        let header = m.as_str().trim_matches(brackets);
+        r.push_str(format!("(?P<{}>.*?)", header).as_str());
+        prev_end = m.end();
+    }
+    // Literal text after the last header must survive too (e.g. a closing bracket).
+    r.push_str(&spaces_re.replace_all(&format[prev_end..], r"\s+"));
+
+    return r;
+}
+
+fn regex_generator(format: String) -> Regex { // compiles the expanded template, anchored with ^ and $
+    Regex::new(&format!("^{}$", regex_generator_helper(format))).unwrap()
+}
+
+#[test]
+fn test_regex_generator_helper() { // pins the exact regex source produced for the Linux and OpenStack templates
+    let linux_format = r"<Month> <Date> <Time> <Level> <Component>(\[<PID>\])?: <Content>".to_string();
+    assert_eq!(regex_generator_helper(linux_format), r"(?P<Month>.*?)\s+(?P<Date>.*?)\s+(?P<Time>.*?)\s+(?P<Level>.*?)\s+(?P<Component>.*?)(\[(?P<PID>.*?)\])?:\s+(?P<Content>.*?)");
+
+    let openstack_format = r"<Logrecord> <Date> <Time> <Pid> <Level> <Component> (\[<ADDR>\])? <Content>".to_string();
+    assert_eq!(regex_generator_helper(openstack_format), r"(?P<Logrecord>.*?)\s+(?P<Date>.*?)\s+(?P<Time>.*?)\s+(?P<Pid>.*?)\s+(?P<Level>.*?)\s+(?P<Component>.*?)\s+(\[(?P<ADDR>.*?)\])?\s+(?P<Content>.*?)");
 }

 /// Replaces provided (domain-specific) regexps with <*> in the log_line.
@@ -51,10 +83,7 @@ fn apply_domain_specific_re(log_line: String, domain_specific_re:&Vec<Regex>) ->
 #[test]
 fn test_apply_domain_specific_re() {
    let line = "q2.34.4.5 Jun 14 15:16:02 combo sshd(pam_unix)[19937]: check pass; Fri Jun 17 20:55:07 2005 user unknown".to_string();
-    let linux_res = vec![Regex::new(r"(\d+\.){3}\d+").unwrap(), 
-                         Regex::new(r"\w{3} \w{3} \d{2} \d{2}:\d{2}:\d{2} \d{4}").unwrap(),
-                         Regex::new(r"\d{2}:\d{2}:\d{2}").unwrap()];
-    let censored_line = apply_domain_specific_re(line, &linux_res);
+    let censored_line = apply_domain_specific_re(line, &linux_regexps());
    assert_eq!(censored_line, " q<*> Jun 14 <*> combo sshd(pam_unix)[19937]: check pass; <*> user unknown");
 }

@@ -71,9 +100,8 @@ fn token_splitter(log_line: String, re:&Regex, domain_specific_re:&Vec<Regex>) -
 #[test]
 fn test_token_splitter() {
    let line = "Jun 14 15:16:02 combo sshd(pam_unix)[19937]: check pass; user unknown".to_string();
-    let re = regex_generator("<Month> <Date> <Time> <Level> <Component>(\\[<PID>\\])?: <Content>".to_string());
-    let linux_res = vec![Regex::new(r"(\d+\.){3}\d+").unwrap(), Regex::new(r"\d{2}:\d{2}:\d{2}").unwrap()];
-    let split_line = token_splitter(line, &re, &linux_res);
+    let re = regex_generator(linux_format());
+    let split_line = token_splitter(line, &re, &linux_regexps());
    assert_eq!(split_line, vec!["check", "pass;", "user", "unknown"]);
 }

@@ -154,12 +182,11 @@ fn dictionary_builder(raw_fn: String, format: String, regexps: Vec<Regex>) -> (H
 #[test]
 fn test_dictionary_builder_process_line_lookahead_is_none() {
    let line = "Jun 14 15:16:02 combo sshd(pam_unix)[19937]: check pass; user unknown".to_string();
-    let re = regex_generator("<Month> <Date> <Time> <Level> <Component>(\\[<PID>\\])?: <Content>".to_string());
-    let linux_res = vec![Regex::new(r"(\d+\.){3}\d+").unwrap(), Regex::new(r"\d{2}:\d{2}:\d{2}").unwrap()];
+    let re = regex_generator(linux_format());
    let mut dbl = HashMap::new();
    let mut trpl = HashMap::new();
    let mut all_token_list = vec![];
-    process_dictionary_builder_line(line, None, &re, &linux_res, &mut dbl, &mut trpl, &mut all_token_list);
+    process_dictionary_builder_line(line, None, &re, &linux_regexps(), &mut dbl, &mut trpl, &mut all_token_list);

    let mut dbl_oracle = HashMap::new();
    dbl_oracle.insert("user^unknown".to_string(), 1);
@@ -177,12 +204,11 @@ fn test_dictionary_builder_process_line_lookahead_is_none() {
 fn test_dictionary_builder_process_line_lookahead_is_some() {
    let line = "Jun 14 15:16:02 combo sshd(pam_unix)[19937]: check pass; user unknown".to_string();
    let next_line = "Jun 14 15:16:02 combo sshd(pam_unix)[19937]: baz bad".to_string();
-    let re = regex_generator("<Month> <Date> <Time> <Level> <Component>(\\[<PID>\\])?: <Content>".to_string());
-    let linux_res = vec![Regex::new(r"(\d+\.){3}\d+").unwrap(), Regex::new(r"\d{2}:\d{2}:\d{2}").unwrap()];
+    let re = regex_generator(linux_format());
    let mut dbl = HashMap::new();
    let mut trpl = HashMap::new();
    let mut all_token_list = vec![];
-    process_dictionary_builder_line(line, Some(next_line), &re, &linux_res, &mut dbl, &mut trpl, &mut all_token_list);
+    process_dictionary_builder_line(line, Some(next_line), &re, &linux_regexps(), &mut dbl, &mut trpl, &mut all_token_list);

    let mut dbl_oracle = HashMap::new();
    dbl_oracle.insert("unknown^baz".to_string(), 1);
@@ -199,20 +225,21 @@ fn test_dictionary_builder_process_line_lookahead_is_some() {
    assert_eq!(trpl, trpl_oracle);
 }

-pub fn parse_raw(raw_fn: String) -> (HashMap<String, i32>, HashMap<String, i32>, Vec<String>) {
-    let linux_format : String = "<Month> <Date> <Time> <Level> <Component>(\\[<PID>\\])?: <Content>".to_string(); // Linux format
-    let linux_re : Vec<Regex> = vec![Regex::new(r"\w{3} \w{3} (\d{2}| \d{1}) \d{2}:\d{2}:\d{2} \d{4}").unwrap(),
-                                     Regex::new(r"(\d+\.){3}\d+").unwrap(),
-                                     Regex::new(r"\d{2}:\d{2}:\d{2}").unwrap()
-                                     ];
-    let (double_dict, triple_dict, all_token_list) = dictionary_builder(raw_fn, linux_format, linux_re);
+pub fn parse_raw_linux(raw_fn: String) -> (HashMap<String, i32>, HashMap<String, i32>, Vec<String>) {
+    let dicts = dictionary_builder(raw_fn, linux_format(), linux_regexps()); // (double dict, triple dict, all tokens)
+    println!("double dictionary list len {}, triple {}, all tokens {}", dicts.0.len(), dicts.1.len(), dicts.2.len()); // size summary on stdout
+    dicts
+}
+
+pub fn parse_raw_openstack(raw_fn: String) -> (HashMap<String, i32>, HashMap<String, i32>, Vec<String>) { // OpenStack counterpart of parse_raw_linux
+    let (double_dict, triple_dict, all_token_list) = dictionary_builder(raw_fn, openstack_format(), openstack_regexps()); // same builder, OpenStack format and patterns
    println!("double dictionary list len {}, triple {}, all tokens {}", double_dict.len(), triple_dict.len(), all_token_list.len());
    return (double_dict, triple_dict, all_token_list);
 }

 #[test]
-fn test_parse_raw() {
-    let (double_dict, triple_dict, all_token_list) = parse_raw("data/from_paper.log".to_string());
+fn test_parse_raw_linux() {
+    let (double_dict, triple_dict, all_token_list) = parse_raw_linux("data/from_paper.log".to_string());
    let all_token_list_oracle = vec!["Found".to_string(), "block".to_string(), "rdd_42_20".to_string(), "locally".to_string(),"rdd_42_22".to_string(), "rdd_42_23".to_string(), "rdd_42_24".to_string()];
    assert_eq!(all_token_list, all_token_list_oracle);
    let mut double_dict_oracle = HashMap::new();