looking at the past

91cfb4ef · Patrick Lam · e00e9e47 · 91cfb4ef
Commit 91cfb4ef authored 2 years ago by Patrick Lam
--- a/src/packages/parser.rs
+++ b/src/packages/parser.rs
@@ -74,44 +74,81 @@ fn test_token_splitter() {
    assert_eq!(split_line, vec!["check", "pass;", "user", "unknown"]);
 }

+fn process_dictionary_builder_line(line: String, regexp:&Regex, regexps:&Vec<Regex>, dbl: &mut HashMap<String, i32>, trpl: &mut HashMap<String, i32>, all_token_list: &mut Vec<String>, prev1: Option<String>, prev2: Option<String>) -> (Option<String>, Option<String>){ // Tokenizes one line, counts its token pairs in `dbl` and triples in `trpl`, and returns (last, second-to-last) token for the next call.
+    let mut tokens = token_splitter(line, &regexp, &regexps);
+    if tokens.is_empty() { // nothing to count for this line
+        return (None, None); // the caller gets no carried-over tokens for the next line
+    }
+    tokens.iter().for_each(|t| all_token_list.push(t.clone())); // record every token of this line, in order
+    let last1 = match tokens.len() { // last token of this line (captured before `tokens` is consumed below)
+        0 => None, // not reachable: the empty case returned above
+        n => Some(tokens[n-1].clone())
+    };
+    let last2 = match tokens.len() { // second-to-last token of this line, if it has at least two tokens
+        0 => None,
+        1 => None,
+        n => Some(tokens[n-2].clone())
+    };
+
+    // Cross the line boundary, as mentioned in the paper: put the caller-supplied `prev1` in front of this line's tokens.
+    let mut tokens2 = match prev1 {
+        None => tokens,
+        Some(x) => { let mut t = vec![x]; t.append(&mut tokens); t }
+    };
+
+    for doubles in tokens2.windows(2) { // every adjacent pair, including (prev1, first token) when prev1 is Some
+        let double_tmp = format!("{}^{}", doubles[0], doubles[1]); // map key is "a^b"
+        if let Some(count) = dbl.get(&double_tmp) {
+            dbl.insert(double_tmp, count+1);
+        } else {
+            dbl.insert(double_tmp, 1);
+        }
+    }
+
+    let tokens3 = match prev2 { // additionally put `prev2` in front so triples can start before this line
+        None => tokens2,
+        Some(x) => { let mut t = vec![x]; t.append(&mut tokens2); t }
+    };
+    for triples in tokens3.windows(3) { // every run of three adjacent tokens
+        let triple_tmp = format!("{}^{}^{}", triples[0], triples[1], triples[2]); // map key is "a^b^c"
+        if let Some(count) = trpl.get(&triple_tmp) {
+            trpl.insert(triple_tmp, count+1);
+        } else {
+            trpl.insert(triple_tmp, 1);
+        }
+    }
+    return (last1, last2); // handed back so the caller can pass them as prev1/prev2 for the next line
+}
+
 fn dictionary_builder(raw_fn: String, format: String, regexps: Vec<Regex>) -> (HashMap<String, i32>, HashMap<String, i32>, Vec<String>) {
-    let mut dbl = HashMap::new(); dbl.insert("dictionary^DHT".to_string(), -1);
-    let mut trpl = HashMap::new(); trpl.insert("dictionary^DHT^triple".to_string(), -1);
+    let mut dbl = HashMap::new(); // dbl.insert("dictionary^DHT".to_string(), -1);
+    let mut trpl = HashMap::new(); // trpl.insert("dictionary^DHT^triple".to_string(), -1);
    let mut all_token_list = vec![];
    let regex = regex_generator(format);

+    let mut prev1 = None; let mut prev2 = None; // last / second-to-last token of the previous line; None at the start and after a line that yields no tokens
+
    if let Ok(lines) = read_lines(raw_fn) {
        for line in lines {
            if let Ok(ip) = line {
-                let tokens = token_splitter(ip, &regex, &regexps);
-                if tokens.is_empty() {
-                    continue;
-                }
-                tokens.iter().for_each(|t| all_token_list.push(t.clone()));
-                // todo: across line boundaries, as mentioned in the paper; right now we don't cross lines.
-                for triples in tokens.windows(3) {
-                    let triple_tmp = format!("{}^{}^{}", triples[0], triples[1], triples[2]);
-                    if let Some(count) = trpl.get(&triple_tmp) {
-                        trpl.insert(triple_tmp, count+1);
-                    } else {
-                        trpl.insert(triple_tmp, 1);
-                    }
-                }
-
-                for doubles in tokens.windows(2) {
-                    let double_tmp = format!("{}^{}", doubles[0], doubles[1]);
-                    if let Some(count) = dbl.get(&double_tmp) {
-                        dbl.insert(double_tmp, count+1);
-                    } else {
-                        dbl.insert(double_tmp, 1);
-                    }
-                }
+                (prev1, prev2) = process_dictionary_builder_line(ip, &regex, &regexps, &mut dbl, &mut trpl, &mut all_token_list, prev1, prev2); // count this line's pairs/triples and carry its trailing tokens into the next iteration
            }
        }
    }
    return (dbl, trpl, all_token_list)
 }

+#[test]
+fn test_dictionary_builder_process_line() { // single line with no carried-over tokens (prev1 = prev2 = None)
+    let line = "Jun 14 15:16:02 combo sshd(pam_unix)[19937]: check pass; user unknown".to_string();
+    let re = regex_generator("<Month> <Date> <Time> <Level> <Component>(\\[<PID>\\])?: <Content>".to_string());
+    let linux_res = vec![Regex::new(r"(\d+\.){3}\d+").unwrap(), Regex::new(r"\d{2}:\d{2}:\d{2}").unwrap()];
+    let mut dbl = HashMap::new(); let mut trpl = HashMap::new(); let mut all_token_list = vec![]; // kept on one line so the hunk's line count is unchanged
+    let (last1, last2) = process_dictionary_builder_line(line, &re, &linux_res, &mut dbl, &mut trpl, &mut all_token_list, None, None);
+    assert_eq!(all_token_list, vec!["check", "pass;", "user", "unknown"]); // every token of the line is recorded, in order
+    assert_eq!((last1.as_deref(), last2.as_deref()), (Some("unknown"), Some("user"))); // returns (last, second-to-last) token
+}
+
 pub fn parse_raw(raw_fn: String) {
    let linux_format : String = "<Month> <Date> <Time> <Level> <Component>(\\[<PID>\\])?: <Content>".to_string(); // Linux format
    let linux_re : Vec<Regex> = vec![Regex::new(r"(\d+\.){3}\d+").unwrap(), Regex::new(r"\d{2}:\d{2}:\d{2}").unwrap()];