feat(search): Add a StreamWithState Streamer

This commit is contained in:
Kerollmops 2018-05-05 22:13:00 +02:00 committed by Clément Renault
parent 7fba62fc22
commit 6d57a8af05
4 changed files with 118 additions and 65 deletions

22
Cargo.lock generated
View File

@ -85,7 +85,7 @@ dependencies = [
[[package]] [[package]]
name = "fst" name = "fst"
version = "0.3.0" version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "git+https://github.com/Kerollmops/fst.git?branch=stream-with-state#a969462433944a22f1356a8bf2affb8e9bde6f67"
dependencies = [ dependencies = [
"byteorder 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)", "byteorder 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
"memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)", "memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)",
@ -161,9 +161,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]] [[package]]
name = "levenshtein_automata" name = "levenshtein_automata"
version = "0.1.0" version = "0.1.0"
source = "git+https://github.com/tantivy-search/levenshtein-automata.git#ba2b62e3631593c408e2b9b8bb95c430384a331e" source = "git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst#13a685e087efcf253936342c055166fa5d5c9b9c"
dependencies = [ dependencies = [
"fst 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", "fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=stream-with-state)",
] ]
[[package]] [[package]]
@ -305,9 +305,9 @@ version = "0.1.0"
dependencies = [ dependencies = [
"bincode 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", "bincode 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
"env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", "env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
"fst 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)", "fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=stream-with-state)",
"futures 0.1.21 (registry+https://github.com/rust-lang/crates.io-index)", "futures 0.1.21 (registry+https://github.com/rust-lang/crates.io-index)",
"levenshtein_automata 0.1.0 (git+https://github.com/tantivy-search/levenshtein-automata.git)", "levenshtein_automata 0.1.0 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)",
"serde 1.0.45 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.45 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_derive 1.0.45 (registry+https://github.com/rust-lang/crates.io-index)", "serde_derive 1.0.45 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_json 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)",
@ -345,7 +345,7 @@ dependencies = [
"proc-macro2 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)", "proc-macro2 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
"quote 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)", "quote 0.5.2 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_derive_internals 0.23.1 (registry+https://github.com/rust-lang/crates.io-index)", "serde_derive_internals 0.23.1 (registry+https://github.com/rust-lang/crates.io-index)",
"syn 0.13.6 (registry+https://github.com/rust-lang/crates.io-index)", "syn 0.13.7 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
[[package]] [[package]]
@ -354,7 +354,7 @@ version = "0.23.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [ dependencies = [
"proc-macro2 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)", "proc-macro2 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
"syn 0.13.6 (registry+https://github.com/rust-lang/crates.io-index)", "syn 0.13.7 (registry+https://github.com/rust-lang/crates.io-index)",
] ]
[[package]] [[package]]
@ -384,7 +384,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]] [[package]]
name = "syn" name = "syn"
version = "0.13.6" version = "0.13.7"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
dependencies = [ dependencies = [
"proc-macro2 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)", "proc-macro2 0.3.8 (registry+https://github.com/rust-lang/crates.io-index)",
@ -651,7 +651,7 @@ dependencies = [
"checksum crossbeam-utils 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "d636a8b3bcc1b409d7ffd3facef8f21dcb4009626adbd0c5e6c4305c07253c7b" "checksum crossbeam-utils 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "d636a8b3bcc1b409d7ffd3facef8f21dcb4009626adbd0c5e6c4305c07253c7b"
"checksum dtoa 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "09c3753c3db574d215cba4ea76018483895d7bff25a31b49ba45db21c48e50ab" "checksum dtoa 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "09c3753c3db574d215cba4ea76018483895d7bff25a31b49ba45db21c48e50ab"
"checksum env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "15abd780e45b3ea4f76b4e9a26ff4843258dd8a3eed2775a0e7368c2e7936c2f" "checksum env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "15abd780e45b3ea4f76b4e9a26ff4843258dd8a3eed2775a0e7368c2e7936c2f"
"checksum fst 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "d94485a00b1827b861dd9d1a2cc9764f9044d4c535514c0760a5a2012ef3399f" "checksum fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=stream-with-state)" = "<none>"
"checksum fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82" "checksum fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82"
"checksum fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" "checksum fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7"
"checksum futures 0.1.21 (registry+https://github.com/rust-lang/crates.io-index)" = "1a70b146671de62ec8c8ed572219ca5d594d9b06c0b364d5e67b722fc559b48c" "checksum futures 0.1.21 (registry+https://github.com/rust-lang/crates.io-index)" = "1a70b146671de62ec8c8ed572219ca5d594d9b06c0b364d5e67b722fc559b48c"
@ -662,7 +662,7 @@ dependencies = [
"checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" "checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d"
"checksum lazy_static 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "c8f31047daa365f19be14b47c29df4f7c3b581832407daabe6ae77397619237d" "checksum lazy_static 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "c8f31047daa365f19be14b47c29df4f7c3b581832407daabe6ae77397619237d"
"checksum lazycell 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a6f08839bc70ef4a3fe1d566d5350f519c5912ea86be0df1740a7d247c7fc0ef" "checksum lazycell 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a6f08839bc70ef4a3fe1d566d5350f519c5912ea86be0df1740a7d247c7fc0ef"
"checksum levenshtein_automata 0.1.0 (git+https://github.com/tantivy-search/levenshtein-automata.git)" = "<none>" "checksum levenshtein_automata 0.1.0 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)" = "<none>"
"checksum libc 0.2.40 (registry+https://github.com/rust-lang/crates.io-index)" = "6fd41f331ac7c5b8ac259b8bf82c75c0fb2e469bbf37d2becbba9a6a2221965b" "checksum libc 0.2.40 (registry+https://github.com/rust-lang/crates.io-index)" = "6fd41f331ac7c5b8ac259b8bf82c75c0fb2e469bbf37d2becbba9a6a2221965b"
"checksum log 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "e19e8d5c34a3e0e2223db8e060f9e8264aeeb5c5fc64a4ee9965c062211c024b" "checksum log 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "e19e8d5c34a3e0e2223db8e060f9e8264aeeb5c5fc64a4ee9965c062211c024b"
"checksum log 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "89f010e843f2b1a31dbd316b3b8d443758bc634bed37aabade59c686d644e0a2" "checksum log 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "89f010e843f2b1a31dbd316b3b8d443758bc634bed37aabade59c686d644e0a2"
@ -689,7 +689,7 @@ dependencies = [
"checksum slab 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "17b4fcaed89ab08ef143da37bc52adbcc04d4a69014f4c1208d6b51f0c47bc23" "checksum slab 0.3.0 (registry+https://github.com/rust-lang/crates.io-index)" = "17b4fcaed89ab08ef143da37bc52adbcc04d4a69014f4c1208d6b51f0c47bc23"
"checksum slab 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fdeff4cd9ecff59ec7e3744cbca73dfe5ac35c2aedb2cfba8a1c715a18912e9d" "checksum slab 0.4.0 (registry+https://github.com/rust-lang/crates.io-index)" = "fdeff4cd9ecff59ec7e3744cbca73dfe5ac35c2aedb2cfba8a1c715a18912e9d"
"checksum smallvec 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "4c8cbcd6df1e117c2210e13ab5109635ad68a929fcbb8964dc965b76cb5ee013" "checksum smallvec 0.2.1 (registry+https://github.com/rust-lang/crates.io-index)" = "4c8cbcd6df1e117c2210e13ab5109635ad68a929fcbb8964dc965b76cb5ee013"
"checksum syn 0.13.6 (registry+https://github.com/rust-lang/crates.io-index)" = "cd06d020ab141832177869072dffb95d84e76c0cc0ab26d6eb38583e07d0403b" "checksum syn 0.13.7 (registry+https://github.com/rust-lang/crates.io-index)" = "61b8f1b737f929c6516ba46a3133fd6d5215ad8a62f66760f851f7048aebedfb"
"checksum take 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b157868d8ac1f56b64604539990685fa7611d8fa9e5476cf0c02cf34d32917c5" "checksum take 0.1.0 (registry+https://github.com/rust-lang/crates.io-index)" = "b157868d8ac1f56b64604539990685fa7611d8fa9e5476cf0c02cf34d32917c5"
"checksum time 0.1.39 (registry+https://github.com/rust-lang/crates.io-index)" = "a15375f1df02096fb3317256ce2cee6a1f42fc84ea5ad5fc8c421cfe40c73098" "checksum time 0.1.39 (registry+https://github.com/rust-lang/crates.io-index)" = "a15375f1df02096fb3317256ce2cee6a1f42fc84ea5ad5fc8c421cfe40c73098"
"checksum tokio 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "7d00555353b013e170ed8bc4e13f648a317d1fd12157dbcae13f7013f6cf29f5" "checksum tokio 0.1.6 (registry+https://github.com/rust-lang/crates.io-index)" = "7d00555353b013e170ed8bc4e13f648a317d1fd12157dbcae13f7013f6cf29f5"

View File

@ -6,8 +6,6 @@ authors = ["Kerollmops <renault.cle@gmail.com>"]
[dependencies] [dependencies]
bincode = "1.0" bincode = "1.0"
env_logger = { version = "0.3", default-features = false } env_logger = { version = "0.3", default-features = false }
fst = "0.3"
levenshtein_automata = { git = "https://github.com/tantivy-search/levenshtein-automata.git", features = ["fst_automaton"] }
futures = "0.1" futures = "0.1"
serde = "1.0" serde = "1.0"
serde_derive = "1.0" serde_derive = "1.0"
@ -17,5 +15,14 @@ tokio-proto = "0.1"
tokio-service = "0.1" tokio-service = "0.1"
url = "1.7" url = "1.7"
[dependencies.fst]
git = "https://github.com/Kerollmops/fst.git"
branch = "stream-with-state"
[dependencies.levenshtein_automata]
git = "https://github.com/Kerollmops/levenshtein-automata.git"
branch = "custom-fst"
features = ["fst_automaton"]
[profile.release] [profile.release]
lto = true lto = true

View File

@ -14,8 +14,8 @@ use std::fs::File;
use std::io::{Read, BufReader}; use std::io::{Read, BufReader};
use fst::{IntoStreamer, Streamer}; use fst::{IntoStreamer, Streamer};
use levenshtein_automata::LevenshteinAutomatonBuilder;
use futures::future; use futures::future;
use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder;
use tokio_minihttp::{Request, Response, Http}; use tokio_minihttp::{Request, Response, Http};
use tokio_proto::TcpServer; use tokio_proto::TcpServer;
use tokio_service::Service; use tokio_service::Service;
@ -23,35 +23,18 @@ use tokio_service::Service;
use raptor::FstMap; use raptor::FstMap;
static mut MAP: Option<FstMap<u64>> = None; static mut MAP: Option<FstMap<u64>> = None;
static mut LEV_AUT_BLDR_0: Option<LevenshteinAutomatonBuilder> = None; static mut LEV_BUILDER_0: Option<LevBuilder> = None;
static mut LEV_AUT_BLDR_1: Option<LevenshteinAutomatonBuilder> = None; static mut LEV_BUILDER_1: Option<LevBuilder> = None;
static mut LEV_AUT_BLDR_2: Option<LevenshteinAutomatonBuilder> = None; static mut LEV_BUILDER_2: Option<LevBuilder> = None;
struct MainService { struct MainService<'a> {
map: &'static FstMap<u64>, map: &'a FstMap<u64>,
lev_aut_bldr_0: &'static LevenshteinAutomatonBuilder, lev_builder_0: &'a LevBuilder,
lev_aut_bldr_1: &'static LevenshteinAutomatonBuilder, lev_builder_1: &'a LevBuilder,
lev_aut_bldr_2: &'static LevenshteinAutomatonBuilder, lev_builder_2: &'a LevBuilder,
} }
fn construct_body<'f, S>(mut stream: S) -> String impl<'a> Service for MainService<'a> {
where
S: 'f + for<'a> Streamer<'a, Item=(&'a str, &'a [u64])>
{
let mut body = String::new();
body.push_str("<html><body>");
while let Some((key, values)) = stream.next() {
let values = &values[..values.len().min(10)];
body.push_str(&format!("{:?} {:?}</br>", key, values));
}
body.push_str("</body></html>");
body
}
impl Service for MainService {
type Request = Request; type Request = Request;
type Response = Response; type Response = Response;
type Error = io::Error; type Error = io::Error;
@ -66,19 +49,29 @@ impl Service for MainService {
resp.header("Content-Type", "text/html"); resp.header("Content-Type", "text/html");
resp.header("charset", "utf-8"); resp.header("charset", "utf-8");
if let Some((_, key)) = url.query_pairs().find(|&(ref k, _)| k == "q") { if let Some((_, query)) = url.query_pairs().find(|&(ref k, _)| k == "q") {
let key = key.to_lowercase(); let query = query.to_lowercase();
let lev = if key.len() <= 4 { let lev = if query.len() <= 4 {
self.lev_aut_bldr_0.build_dfa(&key) self.lev_builder_0.build_dfa(&query)
} else if key.len() <= 8 { } else if query.len() <= 8 {
self.lev_aut_bldr_1.build_dfa(&key) self.lev_builder_1.build_dfa(&query)
} else { } else {
self.lev_aut_bldr_2.build_dfa(&key) self.lev_builder_2.build_dfa(&query)
}; };
let stream = self.map.search(lev).into_stream(); let mut stream = self.map.search(&lev).with_state().into_stream();
let body = construct_body(stream);
let mut body = String::new();
body.push_str("<html><body>");
while let Some((key, values, state)) = stream.next() {
let values = &values[..values.len().min(10)];
let distance = lev.distance(state);
body.push_str(&format!("<p>{:?} (dist: {:?}) {:?}</p>", key, distance, values));
}
body.push_str("</body></html>");
resp.body_vec(body.into_bytes()); resp.body_vec(body.into_bytes());
} }
@ -108,9 +101,9 @@ fn main() {
Some(FstMap::from_bytes(map, &values).unwrap()) Some(FstMap::from_bytes(map, &values).unwrap())
}; };
LEV_AUT_BLDR_0 = Some(LevenshteinAutomatonBuilder::new(0, false)); LEV_BUILDER_0 = Some(LevBuilder::new(0, false));
LEV_AUT_BLDR_1 = Some(LevenshteinAutomatonBuilder::new(1, false)); LEV_BUILDER_1 = Some(LevBuilder::new(1, false));
LEV_AUT_BLDR_2 = Some(LevenshteinAutomatonBuilder::new(2, false)); LEV_BUILDER_2 = Some(LevBuilder::new(2, false));
} }
let addr = "0.0.0.0:8080".parse().unwrap(); let addr = "0.0.0.0:8080".parse().unwrap();
@ -118,9 +111,9 @@ fn main() {
unsafe { unsafe {
TcpServer::new(Http, addr).serve(|| Ok(MainService { TcpServer::new(Http, addr).serve(|| Ok(MainService {
map: MAP.as_ref().unwrap(), map: MAP.as_ref().unwrap(),
lev_aut_bldr_0: LEV_AUT_BLDR_0.as_ref().unwrap(), lev_builder_0: LEV_BUILDER_0.as_ref().unwrap(),
lev_aut_bldr_1: LEV_AUT_BLDR_1.as_ref().unwrap(), lev_builder_1: LEV_BUILDER_1.as_ref().unwrap(),
lev_aut_bldr_2: LEV_AUT_BLDR_2.as_ref().unwrap(), lev_builder_2: LEV_BUILDER_2.as_ref().unwrap(),
})) }))
} }
} }

View File

@ -15,15 +15,23 @@ use fst::Automaton;
pub use self::fst_map::{FstMap, FstMapBuilder}; pub use self::fst_map::{FstMap, FstMapBuilder};
use self::fst_map::Values; use self::fst_map::Values;
pub struct StreamBuilder<'a, T: 'a, A: Automaton> { pub struct StreamBuilder<'m, 'v, T: 'v, A> {
inner: fst::map::StreamBuilder<'a, A>, inner: fst::map::StreamBuilder<'m, A>,
values: &'a Values<T>, values: &'v Values<T>,
} }
impl<'a, T: 'a, A: Automaton> fst::IntoStreamer<'a> for StreamBuilder<'a, T, A> { impl<'m, 'v, T: 'v, A> StreamBuilder<'m, 'v, T, A> {
type Item = (&'a str, &'a [T]); pub fn with_state(self) -> StreamWithStateBuilder<'m, 'v, T, A> {
StreamWithStateBuilder {
inner: self.inner.with_state(),
values: self.values,
}
}
}
type Into = Stream<'a, T, A>; impl<'m, 'v, 'a, T: 'v + 'a, A: Automaton> fst::IntoStreamer<'a> for StreamBuilder<'m, 'v, T, A> {
type Item = (&'a str, &'a [T]);
type Into = Stream<'m, 'v, T, A>;
fn into_stream(self) -> Self::Into { fn into_stream(self) -> Self::Into {
Stream { Stream {
@ -33,12 +41,12 @@ impl<'a, T: 'a, A: Automaton> fst::IntoStreamer<'a> for StreamBuilder<'a, T, A>
} }
} }
pub struct Stream<'a, T: 'a, A: Automaton = fst::automaton::AlwaysMatch> { pub struct Stream<'m, 'v, T: 'v, A: Automaton = fst::automaton::AlwaysMatch> {
inner: fst::map::Stream<'a, A>, inner: fst::map::Stream<'m, A>,
values: &'a Values<T>, values: &'v Values<T>,
} }
impl<'a, 'm, T: 'a, A: Automaton> fst::Streamer<'a> for Stream<'m, T, A> { impl<'m, 'v, 'a, T: 'v + 'a, A: Automaton> fst::Streamer<'a> for Stream<'m, 'v, T, A> {
type Item = (&'a str, &'a [T]); type Item = (&'a str, &'a [T]);
fn next(&'a mut self) -> Option<Self::Item> { fn next(&'a mut self) -> Option<Self::Item> {
@ -53,3 +61,48 @@ impl<'a, 'm, T: 'a, A: Automaton> fst::Streamer<'a> for Stream<'m, T, A> {
} }
} }
} }
/// Builder for a [`StreamWithState`].
///
/// Wraps an `fst::map::StreamWithStateBuilder` together with a borrowed
/// `Values<T>` table. It is produced by `StreamBuilder::with_state` and turned
/// into a stream through its `fst::IntoStreamer` implementation.
///
/// `'m` is the lifetime parameter of the wrapped fst builder and `'v` is the
/// lifetime of the borrow of the values table.
pub struct StreamWithStateBuilder<'m, 'v, T: 'v, A> {
/// Underlying fst stream builder; its stream yields the automaton state next to each entry.
inner: fst::map::StreamWithStateBuilder<'m, A>,
/// Values table handed over unchanged to the resulting `StreamWithState`.
values: &'v Values<T>,
}
impl<'m, 'v, 'a, T: 'v + 'a, A: 'a> fst::IntoStreamer<'a> for StreamWithStateBuilder<'m, 'v, T, A>
where
A: Automaton,
A::State: Clone,
{
type Item = (&'a str, &'a [T], A::State);
type Into = StreamWithState<'m, 'v, T, A>;
fn into_stream(self) -> Self::Into {
StreamWithState {
inner: self.inner.into_stream(),
values: self.values,
}
}
}
/// Stream yielding `(key, values, state)` triples.
///
/// Wraps an `fst::map::StreamWithState` together with a borrowed `Values<T>`
/// table. The automaton type `A` defaults to `fst::automaton::AlwaysMatch`.
///
/// `'m` is the lifetime parameter of the wrapped fst stream and `'v` is the
/// lifetime of the borrow of the values table.
pub struct StreamWithState<'m, 'v, T: 'v, A: Automaton = fst::automaton::AlwaysMatch> {
/// Underlying fst stream; each entry it yields comes with the automaton state.
inner: fst::map::StreamWithState<'m, A>,
/// Values table; the `u64` yielded by the fst stream is used as an index into it.
values: &'v Values<T>,
}
/// Streams `(key, values, state)` triples out of the wrapped fst stream.
///
/// Each item holds the key viewed as a `&str`, the slice fetched from the
/// `Values<T>` table at the index the fst yielded, and the automaton state
/// returned alongside that entry.
impl<'m, 'v, 'a, T: 'v + 'a, A: 'a> fst::Streamer<'a> for StreamWithState<'m, 'v, T, A>
where
    A: Automaton,
    A::State: Clone,
{
    type Item = (&'a str, &'a [T], A::State);

    /// Advances the inner stream by one entry; returns `None` when the inner
    /// stream returns `None`.
    fn next(&'a mut self) -> Option<Self::Item> {
        let (raw_key, index, state) = self.inner.next()?;

        // SAFETY: UTF-8 validation is skipped here.
        // NOTE(review): presumably every key was inserted as a valid `str`
        // when the map was built — confirm against the builder.
        let key = unsafe { from_utf8_unchecked(raw_key) };

        // SAFETY: the lookup is unchecked.
        // NOTE(review): assumes every index stored in the fst is valid for
        // this `Values` table — confirm against how both are constructed.
        let values = unsafe { self.values.get_unchecked(index as usize) };

        Some((key, values, state))
    }
}