From 1476aa3dbad55fc5ffff6f1351f9ddf7241a6e4c Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Sun, 6 May 2018 12:23:42 +0200 Subject: [PATCH] feat(search): Accept multiple words and do a simple union --- Cargo.lock | 10 +-- Cargo.toml | 14 ++-- src/bin/raptor.rs | 43 ++++++++---- src/fst_map.rs | 170 ++++++++++++++++++++++++++++++++++++++++++++-- src/lib.rs | 8 +-- 5 files changed, 208 insertions(+), 37 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c358a2218..1516fe013 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -85,7 +85,6 @@ dependencies = [ [[package]] name = "fst" version = "0.3.0" -source = "git+https://github.com/Kerollmops/fst.git?branch=stream-with-state#a969462433944a22f1356a8bf2affb8e9bde6f67" dependencies = [ "byteorder 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)", "memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)", @@ -161,9 +160,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" [[package]] name = "levenshtein_automata" version = "0.1.0" -source = "git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst#13a685e087efcf253936342c055166fa5d5c9b9c" dependencies = [ - "fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=stream-with-state)", + "fst 0.3.0", ] [[package]] @@ -305,9 +303,9 @@ version = "0.1.0" dependencies = [ "bincode 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", "env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", - "fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=stream-with-state)", + "fst 0.3.0", "futures 0.1.21 (registry+https://github.com/rust-lang/crates.io-index)", - "levenshtein_automata 0.1.0 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)", + "levenshtein_automata 0.1.0", "serde 1.0.45 (registry+https://github.com/rust-lang/crates.io-index)", "serde_derive 1.0.45 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)", @@ -651,7 +649,6 @@ dependencies = [ "checksum crossbeam-utils 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "d636a8b3bcc1b409d7ffd3facef8f21dcb4009626adbd0c5e6c4305c07253c7b" "checksum dtoa 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "09c3753c3db574d215cba4ea76018483895d7bff25a31b49ba45db21c48e50ab" "checksum env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "15abd780e45b3ea4f76b4e9a26ff4843258dd8a3eed2775a0e7368c2e7936c2f" -"checksum fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=stream-with-state)" = "" "checksum fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82" "checksum fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" "checksum futures 0.1.21 (registry+https://github.com/rust-lang/crates.io-index)" = "1a70b146671de62ec8c8ed572219ca5d594d9b06c0b364d5e67b722fc559b48c" @@ -662,7 +659,6 @@ dependencies = [ "checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" "checksum lazy_static 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "c8f31047daa365f19be14b47c29df4f7c3b581832407daabe6ae77397619237d" "checksum lazycell 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a6f08839bc70ef4a3fe1d566d5350f519c5912ea86be0df1740a7d247c7fc0ef" -"checksum levenshtein_automata 0.1.0 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)" = "" "checksum libc 0.2.40 (registry+https://github.com/rust-lang/crates.io-index)" = "6fd41f331ac7c5b8ac259b8bf82c75c0fb2e469bbf37d2becbba9a6a2221965b" "checksum log 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "e19e8d5c34a3e0e2223db8e060f9e8264aeeb5c5fc64a4ee9965c062211c024b" "checksum log 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "89f010e843f2b1a31dbd316b3b8d443758bc634bed37aabade59c686d644e0a2" diff --git a/Cargo.toml b/Cargo.toml index 9d284c9d9..986d104d8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,13 +16,15 @@ tokio-service = "0.1" url = "1.7" [dependencies.fst] -git = "https://github.com/Kerollmops/fst.git" -branch = "stream-with-state" +path = "../../fst" +# git = "https://github.com/Kerollmops/fst.git" +# branch = "stream-with-state" [dependencies.levenshtein_automata] -git = "https://github.com/Kerollmops/levenshtein-automata.git" -branch = "custom-fst" +path = "../../levenshtein-automata" +# git = "https://github.com/Kerollmops/levenshtein-automata.git" +# branch = "custom-fst" features = ["fst_automaton"] -[profile.release] -lto = true +# [profile.release] +# lto = true diff --git a/src/bin/raptor.rs b/src/bin/raptor.rs index 20394208e..8862d89bd 100644 --- a/src/bin/raptor.rs +++ b/src/bin/raptor.rs @@ -13,14 +13,14 @@ use std::path::Path; use std::fs::File; use std::io::{Read, BufReader}; -use fst::{IntoStreamer, Streamer}; +use fst::Streamer; use futures::future; use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder; use tokio_minihttp::{Request, Response, Http}; use tokio_proto::TcpServer; use tokio_service::Service; -use raptor::FstMap; +use raptor::{FstMap, OpWithStateBuilder}; static mut MAP: Option> = None; static mut LEV_BUILDER_0: Option = None; @@ -52,25 +52,40 @@ impl<'a> Service for MainService<'a> { if let Some((_, query)) = url.query_pairs().find(|&(ref k, _)| k == "q") { let query = query.to_lowercase(); - let lev = if query.len() <= 4 { - self.lev_builder_0.build_dfa(&query) - } else if query.len() <= 8 { - self.lev_builder_1.build_dfa(&query) - } else { - self.lev_builder_2.build_dfa(&query) - }; + let mut automatons = Vec::new(); - let mut stream = self.map.search(&lev).with_state().into_stream(); + for query in query.split_whitespace() { + let lev = if query.len() <= 4 { + self.lev_builder_0.build_dfa(&query) + } else if query.len() <= 8 { + self.lev_builder_1.build_dfa(&query) + } else { + self.lev_builder_2.build_dfa(&query) + }; + automatons.push(lev); + } + + let mut op = OpWithStateBuilder::new(self.map.values()); + + for automaton in automatons.iter().cloned() { + let stream = self.map.as_map().search(automaton).with_state(); + op.push(stream); + } + + let mut stream = op.union(); let mut body = String::new(); body.push_str(""); - while let Some((key, values, state)) = stream.next() { + while let Some((key, ivalues)) = stream.next() { match std::str::from_utf8(key) { Ok(key) => { - let values = &values[..values.len().min(10)]; - let distance = lev.distance(state); - body.push_str(&format!("

{:?} (dist: {:?}) {:?}

", key, distance, values)); + for ivalue in ivalues { + let i = ivalue.index; + let state = ivalue.state; + let distance = automatons[i].distance(state); + body.push_str(&format!("

{:?} (dist: {:?}) {:?}

", key, distance, ivalue.values)); + } }, Err(e) => eprintln!("{:?}", e), } diff --git a/src/fst_map.rs b/src/fst_map.rs index 42b77e115..d97fc9810 100644 --- a/src/fst_map.rs +++ b/src/fst_map.rs @@ -1,5 +1,5 @@ use bincode; -use fst::{self, Map, MapBuilder, Automaton}; +use fst::{self, Automaton}; use serde::de::DeserializeOwned; use serde::ser::Serialize; use std::fs::File; @@ -10,7 +10,7 @@ use {StreamBuilder, Stream}; #[derive(Debug)] pub struct FstMap { - inner: Map, + inner: fst::Map, values: Values, } @@ -21,7 +21,7 @@ impl FstMap { P: AsRef, Q: AsRef { - let inner = Map::from_path(map)?; + let inner = fst::Map::from_path(map)?; // TODO handle errors !!! let values = File::open(values).unwrap(); @@ -35,7 +35,7 @@ impl FstMap { where T: DeserializeOwned { - let inner = Map::from_bytes(map)?; + let inner = fst::Map::from_bytes(map)?; let values = bincode::deserialize(values).unwrap(); Ok(Self { inner, values }) @@ -62,6 +62,19 @@ impl FstMap { values: &self.values, } } + + pub fn op(&self) -> OpBuilder { + // OpBuilder::new(&self.values).add(self.as_inner()) + unimplemented!() + } + + pub fn as_map(&self) -> &fst::Map { + &self.inner + } + + pub fn values(&self) -> &Values { + &self.values + } } #[derive(Debug, Serialize, Deserialize)] @@ -137,7 +150,7 @@ impl FstMapBuilder { pub fn build_memory(self) -> fst::Result> { Ok(FstMap { - inner: Map::from_iter(self.map)?, + inner: fst::Map::from_iter(self.map)?, values: Values::new(self.values), }) } @@ -148,7 +161,7 @@ impl FstMapBuilder { W: Write, X: Write { - let mut builder = MapBuilder::new(map_wrt)?; + let mut builder = fst::MapBuilder::new(map_wrt)?; builder.extend_iter(self.map)?; let map = builder.into_inner()?; let values = Values::new(self.values); @@ -159,3 +172,148 @@ impl FstMapBuilder { Ok((map, values_wrt)) } } + +pub struct OpBuilder<'m, 'v, T: 'v> { + inner: fst::map::OpBuilder<'m>, + values: &'v Values, +} + +impl<'m, 'v, T: 'v> OpBuilder<'m, 'v, T> { + pub fn new(values: &'v Values) -> Self { + OpBuilder { + inner: fst::map::OpBuilder::new(), + values: values, + } + } + + pub fn add(mut self, streamable: I) -> Self + where + I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64)>, + S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64)>, + { + self.push(streamable); + self + } + + pub fn push(&mut self, streamable: I) + where + I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64)>, + S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64)>, + { + self.inner.push(streamable); + } + + pub fn union(self) -> Union<'m, 'v, T> { + Union { + inner: self.inner.union(), + outs: Vec::new(), + values: self.values, + } + } +} + +pub struct Union<'m, 'v, T: 'v> { + inner: fst::map::Union<'m>, + outs: Vec>, + values: &'v Values, +} + +impl<'a, 'm, 'v, T: 'v + 'a> fst::Streamer<'a> for Union<'m, 'v, T> { + type Item = (&'a [u8], &'a [IndexedValues<'a, T>]); + + fn next(&'a mut self) -> Option { + match self.inner.next() { + Some((s, ivalues)) => { + self.outs.clear(); + for ivalue in ivalues { + let index = ivalue.index; + let values = unsafe { self.values.get_unchecked(ivalue.value as usize) }; + self.outs.push(IndexedValues { index, values }) + } + Some((s, &self.outs)) + }, + None => None, + } + } +} + +#[derive(Debug)] +pub struct IndexedValues<'a, T: 'a> { + pub index: usize, + pub values: &'a [T], +} + +pub struct OpWithStateBuilder<'m, 'v, T: 'v, U> { + inner: fst::map::OpWithStateBuilder<'m, U>, + values: &'v Values, +} + +impl<'m, 'v, T: 'v, U: 'static> OpWithStateBuilder<'m, 'v, T, U> { + pub fn new(values: &'v Values) -> Self { + Self { + inner: fst::map::OpWithStateBuilder::new(), + values: values, + } + } + + pub fn add(mut self, streamable: I) -> Self + where + I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64, U)>, + S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64, U)>, + { + self.push(streamable); + self + } + + pub fn push(&mut self, streamable: I) + where + I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64, U)>, + S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64, U)>, + { + self.inner.push(streamable); + } + + pub fn union(self) -> UnionWithState<'m, 'v, T, U> { + UnionWithState { + inner: self.inner.union(), + outs: Vec::new(), + values: self.values, + } + } +} + +pub struct UnionWithState<'m, 'v, T: 'v, U> { + inner: fst::map::UnionWithState<'m, U>, + outs: Vec>, + values: &'v Values, +} + +impl<'a, 'm, 'v, T: 'v + 'a, U: 'a> fst::Streamer<'a> for UnionWithState<'m, 'v, T, U> +where + U: Clone, +{ + type Item = (&'a [u8], &'a [IndexedValuesWithState<'a, T, U>]); + + fn next(&'a mut self) -> Option { + match self.inner.next() { + Some((s, ivalues)) => { + self.outs.clear(); + for ivalue in ivalues { + let index = ivalue.index; + let values = unsafe { self.values.get_unchecked(ivalue.value as usize) }; + let state = ivalue.state.clone(); + self.outs.push(IndexedValuesWithState { index, values, state }) + } + Some((s, &self.outs)) + }, + None => None, + } + } +} + +#[derive(Debug)] +pub struct IndexedValuesWithState<'a, T: 'a, U> { + pub index: usize, + pub values: &'a [T], + pub state: U, +} diff --git a/src/lib.rs b/src/lib.rs index e8eb1397d..0a082d839 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -5,13 +5,13 @@ extern crate serde; mod fst_map; -use std::ops::Range; -use std::io::{Write, BufReader}; -use std::fs::File; -use std::path::Path; use fst::Automaton; pub use self::fst_map::{FstMap, FstMapBuilder}; +pub use self::fst_map::{ + OpBuilder, IndexedValues, + OpWithStateBuilder, IndexedValuesWithState, +}; use self::fst_map::Values; pub struct StreamBuilder<'m, 'v, T: 'v, A> {