Skip to content
Snippets Groups Projects
Commit 5b748c3f authored by Patrick Lam's avatar Patrick Lam
Browse files

reduce to 2grams

parent 2eb11ed1
No related branches found
No related tags found
No related merge requests found
use clap::Parser;
use std::collections::HashMap;
mod packages;
......@@ -10,13 +11,43 @@ struct Args {
raw: String,
}
// A three-token n-gram must split into its two overlapping 2-grams, in order.
#[test]
fn test_derive_2grams() {
    let expected: Vec<&str> = vec!["one^two", "two^three"];
    let actual = derive_2grams("one^two^three");
    assert_eq!(expected, actual);
}
/// Derives the adjacent 2-grams of an n-gram whose tokens are joined by `^`.
///
/// For a trigram `a^b^c` the result is `["a^b", "b^c"]`, in that order.
/// Inputs with fewer than two tokens yield an empty vector (rather than
/// panicking on an out-of-range index), and longer inputs yield one 2-gram
/// per adjacent pair of tokens.
fn derive_2grams(trigram: &str) -> Vec<String> {
    let grams: Vec<&str> = trigram.split('^').collect();
    // `windows(2)` visits each adjacent pair and yields nothing when there are
    // fewer than two tokens, so no index can go out of bounds.
    grams
        .windows(2)
        .map(|pair| format!("{}^{}", pair[0], pair[1]))
        .collect()
}
/// Parses the input given by the `raw` argument, prints the 3-gram dictionary,
/// then counts and prints the 2-grams found in the less frequent 3-grams.
fn main() {
    // 3-grams whose count is below this value are the "less frequent" ones;
    // let's say that the cutoff is 72 for Linux2k.log.
    const CUTOFF: i32 = 72;

    let args = Args::parse();
    // Only the middle value returned by `parse_raw` is used here.
    let (_double_dict, triple_dict, _all_token_list) = packages::parser::parse_raw(args.raw);
    // Pass by reference: `triple_dict` is needed again below.
    packages::parser::print_dict(&triple_dict);

    // check all of the 2-grams from the less-frequent 3-grams
    let mut two_grams = HashMap::new();
    let (val_set, reverse_d) = packages::parser::reverse_dict(&triple_dict);
    for val in val_set.iter().filter(|count| **count < CUTOFF) {
        let keys = reverse_d
            .get(val)
            .expect("reverse_dict should map every value in its value set to its keys");
        for key in keys {
            for two_gram in derive_2grams(key) {
                // Single lookup: start at 0 on first sight, then add one.
                *two_grams.entry(two_gram).or_insert(0) += 1;
            }
        }
    }
    packages::parser::print_dict(&two_grams);
}
......@@ -244,7 +244,7 @@ fn test_parse_raw() {
}
/// Standard MapReduce-style map inversion: given {<k1, v1>, <k2, v2>, <k3, v1>}, returns ([v1, v2] (sorted), {<v1, [k1, k3]>, <v2, [k2]>}).
pub fn reverse_dict(d: HashMap<String, i32>) -> (BTreeSet<i32>, HashMap<i32, Vec<String>>) {
pub fn reverse_dict(d: &HashMap<String, i32>) -> (BTreeSet<i32>, HashMap<i32, Vec<String>>) {
let mut reverse_d: HashMap<i32, Vec<String>> = HashMap::new();
let mut val_set: BTreeSet<i32> = BTreeSet::new();
......@@ -260,7 +260,7 @@ pub fn reverse_dict(d: HashMap<String, i32>) -> (BTreeSet<i32>, HashMap<i32, Vec
return (val_set, reverse_d);
}
pub fn print_dict(d: HashMap<String, i32>) {
pub fn print_dict(d: &HashMap<String, i32>) {
let (val_set, reverse_d) = reverse_dict(d);
println!("printing dict");
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment