Commit d8a47409 authored by Patrick Lam

implement dynamic token finding

parent c749a8c4
@@ -7,6 +7,7 @@ You can run cargo test to run the test cases.
Here's how you can invoke the program itself.
```
$ cargo run --release -- --raw-linux data/Linux_2k.log --to-parse "Jun 23 23:30:05 combo sshd(pam_unix)[26190]: authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=218.22.3.51 user=root"
$ cargo run --release -- --raw-spark data/from_paper.log --to-parse "17/06/09 20:11:11 INFO storage.BlockManager: Found block rdd_42_20 locally" --before "split: hdfs://hostname/2kSOSP.log:29168+7292" --after "Found block"
$ cargo run --release -- --raw-linux data/Linux_2k.log --to-parse "Jun 23 23:30:05 combo sshd(pam_unix)[26190]: authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=218.22.3.51 user=root" --before "rhost=<*> user=root" --after "session opened"
$ cargo run --release -- --raw-openstack data/openstack_normal2.log --to-parse "nova-compute.log.2017-05-17_12:02:35 2017-05-17 12:02:30.397 2931 INFO nova.virt.libvirt.imagecache [req-addc1839-2ed5-4778-b57e-5854eb7b8b09 - - - - -] image 0673dd71-34c5-4fbb-86c4-40623fbe45b4 at (/var/lib/nova/instances/_base/a489c868f0c37da93b76227c91bb03908ac0e742): in use: on this node 1 local, 0 on other nodes sharing this instance storage"
```
Jun 14 15:16:02 combo sshd(pam_unix)[19937]: Found block rdd_42_20 locally
Jun 14 15:16:02 combo sshd(pam_unix)[19937]: Found block rdd_42_22 locally
Jun 14 15:16:02 combo sshd(pam_unix)[19937]: Found block rdd_42_23 locally
Jun 14 15:16:02 combo sshd(pam_unix)[19937]: Found block rdd_42_24 locally
17/06/09 20:10:46 INFO rdd.HadoopRDD: Input split: hdfs://hostname/2kSOSP.log:21876+7292
17/06/09 20:10:46 INFO rdd.HadoopRDD: Input split: hdfs://hostname/2kSOSP.log:14584+7292
17/06/09 20:10:46 INFO rdd.HadoopRDD: Input split: hdfs://hostname/2kSOSP.log:0+7292
17/06/09 20:10:46 INFO rdd.HadoopRDD: Input split: hdfs://hostname/2kSOSP.log:7292+7292
17/06/09 20:10:46 INFO rdd.HadoopRDD: Input split: hdfs://hostname/2kSOSP.log:29168+7292
17/06/09 20:11:11 INFO storage.BlockManager: Found block rdd_42_20 locally
17/06/09 20:11:11 INFO storage.BlockManager: Found block rdd_42_22 locally
17/06/09 20:11:11 INFO storage.BlockManager: Found block rdd_42_23 locally
17/06/09 20:11:11 INFO storage.BlockManager: Found block rdd_42_24 locally
use clap::Parser;
use std::collections::HashMap;
use std::collections::HashSet;
use crate::LogFormat::Linux;
use crate::LogFormat::OpenStack;
use crate::LogFormat::Spark;
mod packages;
@@ -15,8 +17,17 @@ struct Args {
#[arg(long)]
raw_openstack: Option<String>,
#[arg(long)]
raw_spark: Option<String>,
#[arg(long)]
to_parse: String,
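// --before and --after are split on whitespace and prepended/appended to the
// parsed line's tokens (see extended_sample_string_tokens in main below)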
#[arg(long)]
before: Option<String>,
#[arg(long)]
after: Option<String>,
}
#[test]
@@ -36,6 +47,7 @@ fn derive_2grams_from_trigram(trigram:&str) -> Vec<String> {
enum LogFormat {
Linux,
OpenStack,
Spark,
}
fn view_double_and_triple_dicts(double_dict:&HashMap<String, i32>, triple_dict:&HashMap<String, i32>) {
@@ -54,11 +66,15 @@ fn main() {
} else if let Some(raw_openstack) = args.raw_openstack {
log_format = Some(OpenStack);
input_fn = Some(raw_openstack);
} else if let Some(raw_spark) = args.raw_spark {
log_format = Some(Spark);
input_fn = Some(raw_spark);
}
let (double_dict, triple_dict, all_token_list) = match log_format {
Some(Linux) => packages::parser::parse_raw_linux(input_fn.unwrap()),
Some(OpenStack) => packages::parser::parse_raw_openstack(input_fn.unwrap()),
Some(Spark) => packages::parser::parse_raw_spark(input_fn.unwrap()),
None => panic!("no log format specified"),
};
@@ -67,36 +83,85 @@
let (format_string_re, censored_regexps) = match log_format {
Some(Linux) => (packages::parser::regex_generator(packages::parser::linux_format()), packages::parser::linux_censored_regexps()),
Some(OpenStack) => (packages::parser::regex_generator(packages::parser::openstack_format()), packages::parser::openstack_censored_regexps()),
Some(Spark) => (packages::parser::regex_generator(packages::parser::spark_format()), packages::parser::spark_censored_regexps()),
None => panic!("no log format specified"),
};
const CUTOFF : i32 = 72;
// let's say that the cutoff is 72 for Linux2k.log.
// check all of the 2-grams from the less-frequent 3-grams
let mut two_grams = HashMap::new();
let (val_set, reverse_d) = packages::parser::reverse_dict(&triple_dict);
for val in val_set.iter().filter(|x| **x < CUTOFF) {
for key in reverse_d.get(val).unwrap() {
let val_2grams = derive_2grams_from_trigram(key);
for two_gram in val_2grams {
if let Some(count) = two_grams.get(&two_gram) {
two_grams.insert(two_gram, count+1);
} else {
two_grams.insert(two_gram, 1);
}
}
// let mut two_grams = HashMap::new();
// let (val_set, reverse_d) = packages::parser::reverse_dict(&triple_dict);
// for val in val_set.iter().filter(|x| **x < CUTOFF) {
// for key in reverse_d.get(val).unwrap() {
// let val_2grams = derive_2grams_from_trigram(key);
// for two_gram in val_2grams {
// if let Some(count) = two_grams.get(&two_gram) {
// two_grams.insert(two_gram, count+1);
// } else {
// two_grams.insert(two_gram, 1);
// }
// }
// }
// }
// packages::parser::print_dict("inverted", &two_grams);
//let sample_string = "Jun 23 23:30:05 combo sshd(pam_unix)[26190]: authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=218.22.3.51 user=root authentication".to_string();
// add befores and afters to the sample string, yielding extended_sample_string
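// tokenize the line to parse with the same format regex and censoring regexps used for the corpus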
let mut sample_string_tokens = packages::parser::token_splitter(args.to_parse,
&format_string_re,
&censored_regexps);
let mut befores = match args.before {
None => vec![],
Some(b) => b.split_whitespace().map(|s| s.to_string()).collect(),
};
let mut afters = match args.after {
None => vec![],
Some(a) => a.split_whitespace().map(|s| s.to_string()).collect(),
};
let mut extended_sample_string_tokens = vec![];
extended_sample_string_tokens.append(&mut befores);
extended_sample_string_tokens.append(&mut sample_string_tokens);
extended_sample_string_tokens.append(&mut afters);
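// note: Vec::append moves the elements out of its argument, leaving befores/afters empty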
println!("{:?}", extended_sample_string_tokens);
// collect 3-grams from extended_sample_string that occur less often than CUTOFF in the corpus
const CUTOFF : i32 = 3;
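// rarity threshold: n-grams occurring fewer than CUTOFF times in the corpus count as uncommon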
let mut uncommon_3grams = vec![];
for triple in extended_sample_string_tokens.windows(3) {
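// encode the window as token1^token2^token3, the key format used by the corpus dictionaries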
let three_gram = format!("{}^{}^{}", triple[0], triple[1], triple[2]);
if triple_dict.get(&three_gram).unwrap_or(&0) < &CUTOFF { // unseen 3-grams count as 0
uncommon_3grams.push(three_gram);
}
// println!("3-gram {}, count {}", three_gram, &triple_dict.get(&three_gram).unwrap());
}
packages::parser::print_dict("inverted", &two_grams);
//let sample_string = "Jun 23 23:30:05 combo sshd(pam_unix)[26190]: authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=218.22.3.51 user=root authentication".to_string();
let sample_string_tokens = packages::parser::token_splitter(args.to_parse,
&format_string_re,
&censored_regexps);
println!("{:?}", sample_string_tokens);
for pair in sample_string_tokens.windows(2) {
let two_gram = format!("{}^{}", pair[0], pair[1]);
println!("2-gram {}, count {}", two_gram, &double_dict.get(&two_gram).unwrap());
let mut deduped_2grams_from_uncommon_3grams : HashSet<String> = HashSet::new();
for three_g in uncommon_3grams {
for two_g in derive_2grams_from_trigram(&three_g) {
deduped_2grams_from_uncommon_3grams.insert(two_g);
}
}
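// of those, keep only the 2-grams that are themselves rare in the corpus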
let mut uncommon_2grams : Vec<String> = vec![];
for two_g in deduped_2grams_from_uncommon_3grams {
let two_g_count = double_dict.get(&two_g).unwrap_or(&0); // unseen 2-grams count as 0
if two_g_count < &CUTOFF {
uncommon_2grams.push(two_g);
// println!("2-gram {}, count {}", two_g, two_g_count);
}
}
// now, iterate on the original tokens again and look for uncommon 2grams that appear
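// a token flanked by an uncommon 2-gram on each side is taken to be dynamic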
let mut dynamic_tokens = vec![];
for triple in extended_sample_string_tokens.windows(3) {
let two_gram1 = format!("{}^{}", triple[0], triple[1]);
let two_gram2 = format!("{}^{}", triple[1], triple[2]);
if uncommon_2grams.contains(&two_gram1) && uncommon_2grams.contains(&two_gram2) {
dynamic_tokens.push(triple[1].to_string());
}
// println!("focus is {}, have {} {}, contains is {}/{}", triple[1], two_gram1, two_gram2, uncommon_2grams.contains(&two_gram1), uncommon_2grams.contains(&two_gram2));
}
println!("dynamic tokens: {:?}", dynamic_tokens);
}
@@ -25,6 +25,16 @@ pub fn openstack_censored_regexps() -> Vec<Regex> {
// I commented out Regex::new(r"\d+").unwrap() because that censors all numbers, which may not be what we want?
}
pub fn spark_format() -> String {
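// matches lines like "17/06/09 20:11:11 INFO storage.BlockManager: Found block rdd_42_20 locally"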
return "<Date> <Time> <Level> <Component>: <Content>".to_string();
}
pub fn spark_censored_regexps() -> Vec<Regex> {
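// censor variable fields: dotted-quad numbers such as IPv4 addresses, standalone
// size-unit tokens (B, KB, MB, GB, TB), and multi-part dotted names such as
// hostnames or class paths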
return vec![Regex::new(r"(\d+\.){3}\d+").unwrap(),
Regex::new(r"\b[KGTM]?B\b").unwrap(),
Regex::new(r"([\w-]+\.){2,}[\w-]+").unwrap()]
}
// https://doc.rust-lang.org/rust-by-example/std_misc/file/read_lines.html
// The output is wrapped in a Result to allow matching on errors
// Returns an Iterator to the Reader of the lines of the file.
@@ -261,6 +271,12 @@ pub fn parse_raw_openstack(raw_fn: String) -> (HashMap<String, i32>, HashMap<Str
return (double_dict, triple_dict, all_token_list);
}
pub fn parse_raw_spark(raw_fn: String) -> (HashMap<String, i32>, HashMap<String, i32>, Vec<String>) {
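// mirror parse_raw_linux/parse_raw_openstack: build 2-gram and 3-gram frequency
// dictionaries plus the full token list from the Spark corpus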
let (double_dict, triple_dict, all_token_list) = dictionary_builder(raw_fn, spark_format(), spark_censored_regexps());
println!("double dictionary list len {}, triple {}, all tokens {}", double_dict.len(), triple_dict.len(), all_token_list.len());
return (double_dict, triple_dict, all_token_list);
}
#[test]
fn test_parse_raw_linux() {
let (double_dict, triple_dict, all_token_list) = parse_raw_linux("data/from_paper.log".to_string());
......