From 6d57a8af0570e07ca53a77787750b9416c15fa01 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Sat, 5 May 2018 22:13:00 +0200 Subject: [PATCH] feat(search): Add a StreamWithState Streamer --- Cargo.lock | 22 +++++++------- Cargo.toml | 11 +++++-- src/bin/raptor.rs | 77 +++++++++++++++++++++-------------------------- src/lib.rs | 73 ++++++++++++++++++++++++++++++++++++++------ 4 files changed, 118 insertions(+), 65 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 680034b97..c358a2218 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -85,7 +85,7 @@ dependencies = [ [[package]] name = "fst" version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" +source = "git+https://github.com/Kerollmops/fst.git?branch=stream-with-state#a969462433944a22f1356a8bf2affb8e9bde6f67" dependencies = [ "byteorder 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)", "memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)", @@ -161,9 +161,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "levenshtein_automata" version = "0.1.0" -source = "git+https://github.com/tantivy-search/levenshtein-automata.git#ba2b62e3631593c408e2b9b8bb95c430384a331e" +source = "git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst#13a685e087efcf253936342c055166fa5d5c9b9c" dependencies = [ - "fst 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", + "fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=stream-with-state)", ] [[package]] @@ -305,9 +305,9 @@ version = "0.1.0" dependencies = [ "bincode 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", "env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", - "fst 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", + "fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=stream-with-state)", "futures 0.1.21 (registry+https://github.com/rust-lang/crates.io-index)", - "levenshtein_automata 0.1.0 (git+https://github.com/tantivy-search/levenshtein-automata.git)", + "levenshtein_automata 0.1.0 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)", "serde 1.0.45 (registry+https://github.com/rust-lang/crates.io-index)", "serde_derive 1.0.45 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)", @@ -345,7 +345,7 @@ dependencies = [ "proc-macro2 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)", "quote 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)", "serde_derive_internals 0.23.1 (registry+https://github.com/rust-lang/crates.io-index)", - "syn 0.13.6 (registry+https://github.com/rust-lang/crates.io-index)", + "syn 0.13.7 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -354,7 +354,7 @@ version = "0.23.1" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "proc-macro2 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)", - "syn 0.13.6 (registry+https://github.com/rust-lang/crates.io-index)", + "syn 0.13.7 (registry+https://github.com/rust-lang/crates.io-index)", ] [[package]] @@ -384,7 +384,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "syn" -version = "0.13.6" +version = "0.13.7" source = "registry+https://github.com/rust-lang/crates.io-index" dependencies = [ "proc-macro2 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)", @@ -651,7 +651,7 @@ dependencies = [ "checksum crossbeam-utils 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "d636a8b3bcc1b409d7ffd3facef8f21dcb4009626adbd0c5e6c4305c07253c7b" "checksum dtoa 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "09c3753c3db574d215cba4ea76018483895d7bff25a31b49ba45db21c48e50ab" "checksum env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "15abd780e45b3ea4f76b4e9a26ff4843258dd8a3eed2775a0e7368c2e7936c2f" -"checksum fst 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d94485a00b1827b861dd9d1a2cc9764f9044d4c535514c0760a5a2012ef3399f" +"checksum fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=stream-with-state)" = "" "checksum fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82" "checksum fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" "checksum futures 0.1.21 (registry+https://github.com/rust-lang/crates.io-index)" = "1a70b146671de62ec8c8ed572219ca5d594d9b06c0b364d5e67b722fc559b48c" @@ -662,7 +662,7 @@ dependencies = [ "checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" "checksum lazy_static 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "c8f31047daa365f19be14b47c29df4f7c3b581832407daabe6ae77397619237d" "checksum lazycell 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a6f08839bc70ef4a3fe1d566d5350f519c5912ea86be0df1740a7d247c7fc0ef" -"checksum levenshtein_automata 0.1.0 (git+https://github.com/tantivy-search/levenshtein-automata.git)" = "" +"checksum levenshtein_automata 0.1.0 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)" = "" "checksum libc 0.2.40 (registry+https://github.com/rust-lang/crates.io-index)" = "6fd41f331ac7c5b8ac259b8bf82c75c0fb2e469bbf37d2becbba9a6a2221965b" "checksum log 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "e19e8d5c34a3e0e2223db8e060f9e8264aeeb5c5fc64a4ee9965c062211c024b" "checksum log 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "89f010e843f2b1a31dbd316b3b8d443758bc634bed37aabade59c686d644e0a2" @@ -689,7 +689,7 @@ dependencies = [ "checksum slab 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "17b4fcaed89ab08ef143da37bc52adbcc04d4a69014f4c1208d6b51f0c47bc23" "checksum slab 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fdeff4cd9ecff59ec7e3744cbca73dfe5ac35c2aedb2cfba8a1c715a18912e9d" "checksum smallvec 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "4c8cbcd6df1e117c2210e13ab5109635ad68a929fcbb8964dc965b76cb5ee013" -"checksum syn 0.13.6 (registry+https://github.com/rust-lang/crates.io-index)" = "cd06d020ab141832177869072dffb95d84e76c0cc0ab26d6eb38583e07d0403b" +"checksum syn 0.13.7 (registry+https://github.com/rust-lang/crates.io-index)" = "61b8f1b737f929c6516ba46a3133fd6d5215ad8a62f66760f851f7048aebedfb" "checksum take 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b157868d8ac1f56b64604539990685fa7611d8fa9e5476cf0c02cf34d32917c5" "checksum time 0.1.39 (registry+https://github.com/rust-lang/crates.io-index)" = "a15375f1df02096fb3317256ce2cee6a1f42fc84ea5ad5fc8c421cfe40c73098" "checksum tokio 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "7d00555353b013e170ed8bc4e13f648a317d1fd12157dbcae13f7013f6cf29f5" diff --git a/Cargo.toml b/Cargo.toml index 1aafa05d1..9d284c9d9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,8 +6,6 @@ authors = ["Kerollmops "] [dependencies] bincode = "1.0" env_logger = { version = "0.3", default-features = false } -fst = "0.3" -levenshtein_automata = { git = "https://github.com/tantivy-search/levenshtein-automata.git", features = ["fst_automaton"] } futures = "0.1" serde = "1.0" serde_derive = "1.0" @@ -17,5 +15,14 @@ tokio-proto = "0.1" tokio-service = "0.1" url = "1.7" +[dependencies.fst] +git = "https://github.com/Kerollmops/fst.git" +branch = "stream-with-state" + +[dependencies.levenshtein_automata] +git = "https://github.com/Kerollmops/levenshtein-automata.git" +branch = "custom-fst" +features = ["fst_automaton"] + [profile.release] lto = true diff --git a/src/bin/raptor.rs b/src/bin/raptor.rs index 20a285409..f9fe6eca7 100644 --- a/src/bin/raptor.rs +++ b/src/bin/raptor.rs @@ -14,8 +14,8 @@ use std::fs::File; use std::io::{Read, BufReader}; use fst::{IntoStreamer, Streamer}; -use levenshtein_automata::LevenshteinAutomatonBuilder; use futures::future; +use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder; use tokio_minihttp::{Request, Response, Http}; use tokio_proto::TcpServer; use tokio_service::Service; @@ -23,35 +23,18 @@ use tokio_service::Service; use raptor::FstMap; static mut MAP: Option> = None; -static mut LEV_AUT_BLDR_0: Option = None; -static mut LEV_AUT_BLDR_1: Option = None; -static mut LEV_AUT_BLDR_2: Option = None; +static mut LEV_BUILDER_0: Option = None; +static mut LEV_BUILDER_1: Option = None; +static mut LEV_BUILDER_2: Option = None; -struct MainService { - map: &'static FstMap, - lev_aut_bldr_0: &'static LevenshteinAutomatonBuilder, - lev_aut_bldr_1: &'static LevenshteinAutomatonBuilder, - lev_aut_bldr_2: &'static LevenshteinAutomatonBuilder, +struct MainService<'a> { + map: &'a FstMap, + lev_builder_0: &'a LevBuilder, + lev_builder_1: &'a LevBuilder, + lev_builder_2: &'a LevBuilder, } -fn construct_body<'f, S>(mut stream: S) -> String -where - S: 'f + for<'a> Streamer<'a, Item=(&'a str, &'a [u64])> -{ - let mut body = String::new(); - body.push_str(""); - - while let Some((key, values)) = stream.next() { - let values = &values[..values.len().min(10)]; - body.push_str(&format!("{:?} {:?}
", key, values)); - } - - body.push_str(""); - - body -} - -impl Service for MainService { +impl<'a> Service for MainService<'a> { type Request = Request; type Response = Response; type Error = io::Error; @@ -66,19 +49,29 @@ impl Service for MainService { resp.header("Content-Type", "text/html"); resp.header("charset", "utf-8"); - if let Some((_, key)) = url.query_pairs().find(|&(ref k, _)| k == "q") { - let key = key.to_lowercase(); + if let Some((_, query)) = url.query_pairs().find(|&(ref k, _)| k == "q") { + let query = query.to_lowercase(); - let lev = if key.len() <= 4 { - self.lev_aut_bldr_0.build_dfa(&key) - } else if key.len() <= 8 { - self.lev_aut_bldr_1.build_dfa(&key) + let lev = if query.len() <= 4 { + self.lev_builder_0.build_dfa(&query) + } else if query.len() <= 8 { + self.lev_builder_1.build_dfa(&query) } else { - self.lev_aut_bldr_2.build_dfa(&key) + self.lev_builder_2.build_dfa(&query) }; - let stream = self.map.search(lev).into_stream(); - let body = construct_body(stream); + let mut stream = self.map.search(&lev).with_state().into_stream(); + + let mut body = String::new(); + body.push_str(""); + + while let Some((key, values, state)) = stream.next() { + let values = &values[..values.len().min(10)]; + let distance = lev.distance(state); + body.push_str(&format!("

{:?} (dist: {:?}) {:?}

", key, distance, values)); + } + + body.push_str(""); resp.body_vec(body.into_bytes()); } @@ -108,9 +101,9 @@ fn main() { Some(FstMap::from_bytes(map, &values).unwrap()) }; - LEV_AUT_BLDR_0 = Some(LevenshteinAutomatonBuilder::new(0, false)); - LEV_AUT_BLDR_1 = Some(LevenshteinAutomatonBuilder::new(1, false)); - LEV_AUT_BLDR_2 = Some(LevenshteinAutomatonBuilder::new(2, false)); + LEV_BUILDER_0 = Some(LevBuilder::new(0, false)); + LEV_BUILDER_1 = Some(LevBuilder::new(1, false)); + LEV_BUILDER_2 = Some(LevBuilder::new(2, false)); } let addr = "0.0.0.0:8080".parse().unwrap(); @@ -118,9 +111,9 @@ fn main() { unsafe { TcpServer::new(Http, addr).serve(|| Ok(MainService { map: MAP.as_ref().unwrap(), - lev_aut_bldr_0: LEV_AUT_BLDR_0.as_ref().unwrap(), - lev_aut_bldr_1: LEV_AUT_BLDR_1.as_ref().unwrap(), - lev_aut_bldr_2: LEV_AUT_BLDR_2.as_ref().unwrap(), + lev_builder_0: LEV_BUILDER_0.as_ref().unwrap(), + lev_builder_1: LEV_BUILDER_1.as_ref().unwrap(), + lev_builder_2: LEV_BUILDER_2.as_ref().unwrap(), })) } } diff --git a/src/lib.rs b/src/lib.rs index afda8bd7e..870964cd9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -15,15 +15,23 @@ use fst::Automaton; pub use self::fst_map::{FstMap, FstMapBuilder}; use self::fst_map::Values; -pub struct StreamBuilder<'a, T: 'a, A: Automaton> { - inner: fst::map::StreamBuilder<'a, A>, - values: &'a Values, +pub struct StreamBuilder<'m, 'v, T: 'v, A> { + inner: fst::map::StreamBuilder<'m, A>, + values: &'v Values, } -impl<'a, T: 'a, A: Automaton> fst::IntoStreamer<'a> for StreamBuilder<'a, T, A> { - type Item = (&'a str, &'a [T]); +impl<'m, 'v, T: 'v, A> StreamBuilder<'m, 'v, T, A> { + pub fn with_state(self) -> StreamWithStateBuilder<'m, 'v, T, A> { + StreamWithStateBuilder { + inner: self.inner.with_state(), + values: self.values, + } + } +} - type Into = Stream<'a, T, A>; +impl<'m, 'v, 'a, T: 'v + 'a, A: Automaton> fst::IntoStreamer<'a> for StreamBuilder<'m, 'v, T, A> { + type Item = (&'a str, &'a [T]); + type Into = Stream<'m, 'v, T, A>; fn into_stream(self) -> Self::Into { Stream { @@ -33,12 +41,12 @@ impl<'a, T: 'a, A: Automaton> fst::IntoStreamer<'a> for StreamBuilder<'a, T, A> } } -pub struct Stream<'a, T: 'a, A: Automaton = fst::automaton::AlwaysMatch> { - inner: fst::map::Stream<'a, A>, - values: &'a Values, +pub struct Stream<'m, 'v, T: 'v, A: Automaton = fst::automaton::AlwaysMatch> { + inner: fst::map::Stream<'m, A>, + values: &'v Values, } -impl<'a, 'm, T: 'a, A: Automaton> fst::Streamer<'a> for Stream<'m, T, A> { +impl<'m, 'v, 'a, T: 'v + 'a, A: Automaton> fst::Streamer<'a> for Stream<'m, 'v, T, A> { type Item = (&'a str, &'a [T]); fn next(&'a mut self) -> Option { @@ -53,3 +61,48 @@ impl<'a, 'm, T: 'a, A: Automaton> fst::Streamer<'a> for Stream<'m, T, A> { } } } + +pub struct StreamWithStateBuilder<'m, 'v, T: 'v, A> { + inner: fst::map::StreamWithStateBuilder<'m, A>, + values: &'v Values, +} + +impl<'m, 'v, 'a, T: 'v + 'a, A: 'a> fst::IntoStreamer<'a> for StreamWithStateBuilder<'m, 'v, T, A> +where + A: Automaton, + A::State: Clone, +{ + type Item = (&'a str, &'a [T], A::State); + type Into = StreamWithState<'m, 'v, T, A>; + + fn into_stream(self) -> Self::Into { + StreamWithState { + inner: self.inner.into_stream(), + values: self.values, + } + } +} + +pub struct StreamWithState<'m, 'v, T: 'v, A: Automaton = fst::automaton::AlwaysMatch> { + inner: fst::map::StreamWithState<'m, A>, + values: &'v Values, +} + +impl<'m, 'v, 'a, T: 'v + 'a, A: 'a> fst::Streamer<'a> for StreamWithState<'m, 'v, T, A> +where + A: Automaton, + A::State: Clone, +{ + type Item = (&'a str, &'a [T], A::State); + + fn next(&'a mut self) -> Option { + match self.inner.next() { + Some((k, i, state)) => { + let key = unsafe { from_utf8_unchecked(k) }; + let values = unsafe { self.values.get_unchecked(i as usize) }; + Some((key, values, state)) + }, + None => None, + } + } +}