From 2fa7178ed1c9eaddc78a3aa29e9e3497f8cc710b Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Sun, 22 Apr 2018 17:34:41 +0200 Subject: [PATCH] cli: Make work to index json lines --- Cargo.lock | 14 ++++++++ Cargo.toml | 3 +- src/bin/raptor-cli.rs | 54 +++++++++++++++++++++++++++++ src/lib.rs | 80 ++++++++++++++++++++++++++++++++++++------- 4 files changed, 137 insertions(+), 14 deletions(-) create mode 100644 src/bin/raptor-cli.rs diff --git a/Cargo.lock b/Cargo.lock index c2eaf62ef..c2c1c20b8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6,6 +6,15 @@ dependencies = [ "nodrop 0.1.12 (registry+https://github.com/rust-lang/crates.io-index)", ] +[[package]] +name = "bincode" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "byteorder 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)", + "serde 1.0.42 (registry+https://github.com/rust-lang/crates.io-index)", +] + [[package]] name = "bitflags" version = "1.0.1" @@ -294,6 +303,7 @@ dependencies = [ name = "raptor" version = "0.1.0" dependencies = [ + "bincode 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", "env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", "fst 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", "futures 0.1.21 (registry+https://github.com/rust-lang/crates.io-index)", @@ -376,6 +386,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" name = "smallvec" version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" +dependencies = [ + "serde 1.0.42 (registry+https://github.com/rust-lang/crates.io-index)", +] [[package]] name = "syn" @@ -625,6 +638,7 @@ dependencies = [ [metadata] "checksum arrayvec 0.4.7 (registry+https://github.com/rust-lang/crates.io-index)" = "a1e964f9e24d588183fcb43503abda40d288c8657dfc27311516ce2f05675aef" +"checksum bincode 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "bda13183df33055cbb84b847becce220d392df502ebe7a4a78d7021771ed94d0" "checksum bitflags 1.0.1 (registry+https://github.com/rust-lang/crates.io-index)" = "b3c30d3802dfb7281680d6285f2ccdaa8c2d8fee41f93805dba5c4cf50dc23cf" "checksum byteorder 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "73b5bdfe7ee3ad0b99c9801d58807a9dbc9e09196365b0203853b99889ab3c87" "checksum bytes 0.4.6 (registry+https://github.com/rust-lang/crates.io-index)" = "1b7db437d718977f6dc9b2e3fd6fc343c02ac6b899b73fdd2179163447bd9ce9" diff --git a/Cargo.toml b/Cargo.toml index 40b7ea480..8cfc26b34 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,6 +4,7 @@ version = "0.1.0" authors = ["Kerollmops "] [dependencies] +bincode = "1.0" env_logger = { version = "0.3", default-features = false } fst = "0.3" futures = "0.1" @@ -13,5 +14,5 @@ tokio-service = "0.1" serde = "1.0" serde_json = "1.0" serde_derive = "1.0" -smallvec = "0.6" +smallvec = { version = "0.6", features = ["serde"] } url = "1.7" diff --git a/src/bin/raptor-cli.rs b/src/bin/raptor-cli.rs new file mode 100644 index 000000000..c54cfe6d0 --- /dev/null +++ b/src/bin/raptor-cli.rs @@ -0,0 +1,54 @@ +// TODO make the raptor binary expose multiple subcommand +// make only one binary + +extern crate fst; +extern crate raptor; +extern crate serde_json; +#[macro_use] extern crate serde_derive; + +use std::fs::File; +use std::io::{BufReader, BufRead}; + +use fst::Streamer; +use serde_json::from_str; + +use raptor::{MultiMapBuilder, MultiMap}; + +#[derive(Debug, Deserialize)] +struct Product { + product_id: u64, + title: String, + ft: String, +} + +fn main() { + let data = File::open("products.json_lines").unwrap(); + let data = BufReader::new(data); + + let mut builder = MultiMapBuilder::new(); + for line in data.lines() { + let line = line.unwrap(); + + let product: Product = from_str(&line).unwrap(); + + // TODO filter words here !!! + let title = product.title.split_whitespace(); + let description = product.ft.split_whitespace(); + let words = title.chain(description); + + for word in words { + builder.insert(word, product.product_id); + } + } + + let map = File::create("map.fst").unwrap(); + let values = File::create("values.vecs").unwrap(); + let (map, values) = builder.build(map, values).unwrap(); + + let map = unsafe { MultiMap::from_paths("map.fst", "values.vecs").unwrap() }; + + let mut stream = map.stream(); + while let Some(x) = stream.next() { + println!("{:?}", x); + } +} diff --git a/src/lib.rs b/src/lib.rs index 89de155ed..c3620158d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,4 @@ +extern crate bincode; extern crate fst; extern crate serde; extern crate serde_json; @@ -5,19 +6,15 @@ extern crate serde_json; extern crate smallvec; use std::io::Write; +use std::fs::File; +use std::path::Path; +use std::str::from_utf8_unchecked; pub use fst::MapBuilder; use smallvec::SmallVec; type SmallVec16 = SmallVec<[T; 16]>; -#[derive(Debug, Serialize)] -struct Product<'a> { - product_id: u64, - title: &'a str, - ft: &'a str, -} - #[derive(Debug)] pub struct MultiMap { map: fst::Map, @@ -25,6 +22,27 @@ pub struct MultiMap { } impl MultiMap { + pub unsafe fn from_paths(map: P, values: Q) -> fst::Result + where + P: AsRef, + Q: AsRef + { + let map = fst::Map::from_path(map)?; + + // TODO handle error !!! + let values_file = File::open(values).unwrap(); + let values = bincode::deserialize_from(values_file).unwrap(); + + Ok(MultiMap { map, values }) + } + + pub fn stream(&self) -> Stream { + Stream { + inner: self.map.stream(), + values: &self.values, + } + } + pub fn contains_key>(&self, key: K) -> bool { self.map.contains_key(key) } @@ -34,22 +52,43 @@ impl MultiMap { } } +pub struct Stream<'a, A: fst::Automaton = fst::automaton::AlwaysMatch> { + inner: fst::map::Stream<'a, A>, + values: &'a [SmallVec16], +} + +impl<'a, 'm, A: fst::Automaton> fst::Streamer<'a> for Stream<'m, A> { + type Item = (&'a str, &'a [u64]); + + fn next(&'a mut self) -> Option { + // Here we can't just `map` because of some borrow rules + match self.inner.next() { + Some((key, i)) => { + let key = unsafe { from_utf8_unchecked(key) }; + Some((key, &*self.values[i as usize])) + }, + None => None, + } + } +} + #[derive(Debug)] -pub struct MultiMapBuilder<'a> { - map: Vec<(&'a str, u64)>, +pub struct MultiMapBuilder { + map: Vec<(String, u64)>, values: Vec>, } -impl<'a> MultiMapBuilder<'a> { - pub fn new() -> MultiMapBuilder<'a> { +impl<'a> MultiMapBuilder { + pub fn new() -> MultiMapBuilder { MultiMapBuilder { map: Vec::new(), values: Vec::new(), } } - pub fn insert(&mut self, key: &'a str, value: u64) { - match self.map.binary_search_by_key(&key, |&(k, _)| k) { + pub fn insert>(&mut self, key: S, value: u64) { + let key = key.into(); + match self.map.binary_search_by_key(&key.as_str(), |&(ref k, _)| k) { Ok(index) => { let (_, index) = self.map[index]; let values = &mut self.values[index as usize]; @@ -78,4 +117,19 @@ impl<'a> MultiMapBuilder<'a> { values: self.values.into_boxed_slice(), }) } + + pub fn build(self, map_wrt: W, mut values_wrt: X) -> fst::Result<(W, X)> + where + W: Write, + X: Write + { + let mut builder = MapBuilder::new(map_wrt)?; + builder.extend_iter(self.map)?; + let map = builder.into_inner()?; + + // TODO handle that !!! + bincode::serialize_into(&mut values_wrt, &self.values).unwrap(); + + Ok((map, values_wrt)) + } }