feat(search): Accept multiple words and do a simple union

This commit is contained in:
Kerollmops 2018-05-06 12:23:42 +02:00 committed by Clément Renault
parent 758baeb8e1
commit 1476aa3dba
5 changed files with 208 additions and 37 deletions

10
Cargo.lock generated
View File

@ -85,7 +85,6 @@ dependencies = [
[[package]] [[package]]
name = "fst" name = "fst"
version = "0.3.0" version = "0.3.0"
source = "git+https://github.com/Kerollmops/fst.git?branch=stream-with-state#a969462433944a22f1356a8bf2affb8e9bde6f67"
dependencies = [ dependencies = [
"byteorder 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)", "byteorder 1.2.2 (registry+https://github.com/rust-lang/crates.io-index)",
"memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)", "memmap 0.6.2 (registry+https://github.com/rust-lang/crates.io-index)",
@ -161,9 +160,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
[[package]] [[package]]
name = "levenshtein_automata" name = "levenshtein_automata"
version = "0.1.0" version = "0.1.0"
source = "git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst#13a685e087efcf253936342c055166fa5d5c9b9c"
dependencies = [ dependencies = [
"fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=stream-with-state)", "fst 0.3.0",
] ]
[[package]] [[package]]
@ -305,9 +303,9 @@ version = "0.1.0"
dependencies = [ dependencies = [
"bincode 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)", "bincode 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)",
"env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)", "env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)",
"fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=stream-with-state)", "fst 0.3.0",
"futures 0.1.21 (registry+https://github.com/rust-lang/crates.io-index)", "futures 0.1.21 (registry+https://github.com/rust-lang/crates.io-index)",
"levenshtein_automata 0.1.0 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)", "levenshtein_automata 0.1.0",
"serde 1.0.45 (registry+https://github.com/rust-lang/crates.io-index)", "serde 1.0.45 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_derive 1.0.45 (registry+https://github.com/rust-lang/crates.io-index)", "serde_derive 1.0.45 (registry+https://github.com/rust-lang/crates.io-index)",
"serde_json 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)", "serde_json 1.0.17 (registry+https://github.com/rust-lang/crates.io-index)",
@ -651,7 +649,6 @@ dependencies = [
"checksum crossbeam-utils 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "d636a8b3bcc1b409d7ffd3facef8f21dcb4009626adbd0c5e6c4305c07253c7b" "checksum crossbeam-utils 0.3.2 (registry+https://github.com/rust-lang/crates.io-index)" = "d636a8b3bcc1b409d7ffd3facef8f21dcb4009626adbd0c5e6c4305c07253c7b"
"checksum dtoa 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "09c3753c3db574d215cba4ea76018483895d7bff25a31b49ba45db21c48e50ab" "checksum dtoa 0.4.2 (registry+https://github.com/rust-lang/crates.io-index)" = "09c3753c3db574d215cba4ea76018483895d7bff25a31b49ba45db21c48e50ab"
"checksum env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "15abd780e45b3ea4f76b4e9a26ff4843258dd8a3eed2775a0e7368c2e7936c2f" "checksum env_logger 0.3.5 (registry+https://github.com/rust-lang/crates.io-index)" = "15abd780e45b3ea4f76b4e9a26ff4843258dd8a3eed2775a0e7368c2e7936c2f"
"checksum fst 0.3.0 (git+https://github.com/Kerollmops/fst.git?branch=stream-with-state)" = "<none>"
"checksum fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82" "checksum fuchsia-zircon 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "2e9763c69ebaae630ba35f74888db465e49e259ba1bc0eda7d06f4a067615d82"
"checksum fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7" "checksum fuchsia-zircon-sys 0.3.3 (registry+https://github.com/rust-lang/crates.io-index)" = "3dcaa9ae7725d12cdb85b3ad99a434db70b468c09ded17e012d86b5c1010f7a7"
"checksum futures 0.1.21 (registry+https://github.com/rust-lang/crates.io-index)" = "1a70b146671de62ec8c8ed572219ca5d594d9b06c0b364d5e67b722fc559b48c" "checksum futures 0.1.21 (registry+https://github.com/rust-lang/crates.io-index)" = "1a70b146671de62ec8c8ed572219ca5d594d9b06c0b364d5e67b722fc559b48c"
@ -662,7 +659,6 @@ dependencies = [
"checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d" "checksum kernel32-sys 0.2.2 (registry+https://github.com/rust-lang/crates.io-index)" = "7507624b29483431c0ba2d82aece8ca6cdba9382bff4ddd0f7490560c056098d"
"checksum lazy_static 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "c8f31047daa365f19be14b47c29df4f7c3b581832407daabe6ae77397619237d" "checksum lazy_static 1.0.0 (registry+https://github.com/rust-lang/crates.io-index)" = "c8f31047daa365f19be14b47c29df4f7c3b581832407daabe6ae77397619237d"
"checksum lazycell 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a6f08839bc70ef4a3fe1d566d5350f519c5912ea86be0df1740a7d247c7fc0ef" "checksum lazycell 0.6.0 (registry+https://github.com/rust-lang/crates.io-index)" = "a6f08839bc70ef4a3fe1d566d5350f519c5912ea86be0df1740a7d247c7fc0ef"
"checksum levenshtein_automata 0.1.0 (git+https://github.com/Kerollmops/levenshtein-automata.git?branch=custom-fst)" = "<none>"
"checksum libc 0.2.40 (registry+https://github.com/rust-lang/crates.io-index)" = "6fd41f331ac7c5b8ac259b8bf82c75c0fb2e469bbf37d2becbba9a6a2221965b" "checksum libc 0.2.40 (registry+https://github.com/rust-lang/crates.io-index)" = "6fd41f331ac7c5b8ac259b8bf82c75c0fb2e469bbf37d2becbba9a6a2221965b"
"checksum log 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "e19e8d5c34a3e0e2223db8e060f9e8264aeeb5c5fc64a4ee9965c062211c024b" "checksum log 0.3.9 (registry+https://github.com/rust-lang/crates.io-index)" = "e19e8d5c34a3e0e2223db8e060f9e8264aeeb5c5fc64a4ee9965c062211c024b"
"checksum log 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "89f010e843f2b1a31dbd316b3b8d443758bc634bed37aabade59c686d644e0a2" "checksum log 0.4.1 (registry+https://github.com/rust-lang/crates.io-index)" = "89f010e843f2b1a31dbd316b3b8d443758bc634bed37aabade59c686d644e0a2"

View File

@ -16,13 +16,15 @@ tokio-service = "0.1"
url = "1.7" url = "1.7"
[dependencies.fst] [dependencies.fst]
git = "https://github.com/Kerollmops/fst.git" path = "../../fst"
branch = "stream-with-state" # git = "https://github.com/Kerollmops/fst.git"
# branch = "stream-with-state"
[dependencies.levenshtein_automata] [dependencies.levenshtein_automata]
git = "https://github.com/Kerollmops/levenshtein-automata.git" path = "../../levenshtein-automata"
branch = "custom-fst" # git = "https://github.com/Kerollmops/levenshtein-automata.git"
# branch = "custom-fst"
features = ["fst_automaton"] features = ["fst_automaton"]
[profile.release] # [profile.release]
lto = true # lto = true

View File

@ -13,14 +13,14 @@ use std::path::Path;
use std::fs::File; use std::fs::File;
use std::io::{Read, BufReader}; use std::io::{Read, BufReader};
use fst::{IntoStreamer, Streamer}; use fst::Streamer;
use futures::future; use futures::future;
use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder; use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder;
use tokio_minihttp::{Request, Response, Http}; use tokio_minihttp::{Request, Response, Http};
use tokio_proto::TcpServer; use tokio_proto::TcpServer;
use tokio_service::Service; use tokio_service::Service;
use raptor::FstMap; use raptor::{FstMap, OpWithStateBuilder};
static mut MAP: Option<FstMap<u64>> = None; static mut MAP: Option<FstMap<u64>> = None;
static mut LEV_BUILDER_0: Option<LevBuilder> = None; static mut LEV_BUILDER_0: Option<LevBuilder> = None;
@ -52,25 +52,40 @@ impl<'a> Service for MainService<'a> {
if let Some((_, query)) = url.query_pairs().find(|&(ref k, _)| k == "q") { if let Some((_, query)) = url.query_pairs().find(|&(ref k, _)| k == "q") {
let query = query.to_lowercase(); let query = query.to_lowercase();
let lev = if query.len() <= 4 { let mut automatons = Vec::new();
self.lev_builder_0.build_dfa(&query)
} else if query.len() <= 8 {
self.lev_builder_1.build_dfa(&query)
} else {
self.lev_builder_2.build_dfa(&query)
};
let mut stream = self.map.search(&lev).with_state().into_stream(); for query in query.split_whitespace() {
let lev = if query.len() <= 4 {
self.lev_builder_0.build_dfa(&query)
} else if query.len() <= 8 {
self.lev_builder_1.build_dfa(&query)
} else {
self.lev_builder_2.build_dfa(&query)
};
automatons.push(lev);
}
let mut op = OpWithStateBuilder::new(self.map.values());
for automaton in automatons.iter().cloned() {
let stream = self.map.as_map().search(automaton).with_state();
op.push(stream);
}
let mut stream = op.union();
let mut body = String::new(); let mut body = String::new();
body.push_str("<html><body>"); body.push_str("<html><body>");
while let Some((key, values, state)) = stream.next() { while let Some((key, ivalues)) = stream.next() {
match std::str::from_utf8(key) { match std::str::from_utf8(key) {
Ok(key) => { Ok(key) => {
let values = &values[..values.len().min(10)]; for ivalue in ivalues {
let distance = lev.distance(state); let i = ivalue.index;
body.push_str(&format!("<p>{:?} (dist: {:?}) {:?}</p>", key, distance, values)); let state = ivalue.state;
let distance = automatons[i].distance(state);
body.push_str(&format!("<p>{:?} (dist: {:?}) {:?}</p>", key, distance, ivalue.values));
}
}, },
Err(e) => eprintln!("{:?}", e), Err(e) => eprintln!("{:?}", e),
} }

View File

@ -1,5 +1,5 @@
use bincode; use bincode;
use fst::{self, Map, MapBuilder, Automaton}; use fst::{self, Automaton};
use serde::de::DeserializeOwned; use serde::de::DeserializeOwned;
use serde::ser::Serialize; use serde::ser::Serialize;
use std::fs::File; use std::fs::File;
@ -10,7 +10,7 @@ use {StreamBuilder, Stream};
#[derive(Debug)] #[derive(Debug)]
pub struct FstMap<T> { pub struct FstMap<T> {
inner: Map, inner: fst::Map,
values: Values<T>, values: Values<T>,
} }
@ -21,7 +21,7 @@ impl<T> FstMap<T> {
P: AsRef<Path>, P: AsRef<Path>,
Q: AsRef<Path> Q: AsRef<Path>
{ {
let inner = Map::from_path(map)?; let inner = fst::Map::from_path(map)?;
// TODO handle errors !!! // TODO handle errors !!!
let values = File::open(values).unwrap(); let values = File::open(values).unwrap();
@ -35,7 +35,7 @@ impl<T> FstMap<T> {
where where
T: DeserializeOwned T: DeserializeOwned
{ {
let inner = Map::from_bytes(map)?; let inner = fst::Map::from_bytes(map)?;
let values = bincode::deserialize(values).unwrap(); let values = bincode::deserialize(values).unwrap();
Ok(Self { inner, values }) Ok(Self { inner, values })
@ -62,6 +62,19 @@ impl<T> FstMap<T> {
values: &self.values, values: &self.values,
} }
} }
/// Placeholder for obtaining an `OpBuilder` tied to this map.
///
/// # Panics
///
/// Always panics: the body is currently just `unimplemented!()`. The
/// commented-out line appears to sketch the intended implementation.
pub fn op(&self) -> OpBuilder<T> {
// OpBuilder::new(&self.values).add(self.as_inner())
unimplemented!()
}
/// Returns a shared reference to the wrapped `fst::Map` (the `inner` field).
pub fn as_map(&self) -> &fst::Map {
&self.inner
}
/// Returns a shared reference to the `Values<T>` store held next to the map.
pub fn values(&self) -> &Values<T> {
&self.values
}
} }
#[derive(Debug, Serialize, Deserialize)] #[derive(Debug, Serialize, Deserialize)]
@ -137,7 +150,7 @@ impl<T> FstMapBuilder<T> {
pub fn build_memory(self) -> fst::Result<FstMap<T>> { pub fn build_memory(self) -> fst::Result<FstMap<T>> {
Ok(FstMap { Ok(FstMap {
inner: Map::from_iter(self.map)?, inner: fst::Map::from_iter(self.map)?,
values: Values::new(self.values), values: Values::new(self.values),
}) })
} }
@ -148,7 +161,7 @@ impl<T> FstMapBuilder<T> {
W: Write, W: Write,
X: Write X: Write
{ {
let mut builder = MapBuilder::new(map_wrt)?; let mut builder = fst::MapBuilder::new(map_wrt)?;
builder.extend_iter(self.map)?; builder.extend_iter(self.map)?;
let map = builder.into_inner()?; let map = builder.into_inner()?;
let values = Values::new(self.values); let values = Values::new(self.values);
@ -159,3 +172,148 @@ impl<T> FstMapBuilder<T> {
Ok((map, values_wrt)) Ok((map, values_wrt))
} }
} }
/// Builder that pairs an `fst::map::OpBuilder` with a borrowed `Values<T>`.
///
/// Streams are registered on the inner builder; the `Union` produced by
/// `union` uses `values` to turn each raw `u64` output into a `&[T]` slice.
pub struct OpBuilder<'m, 'v, T: 'v> {
/// The wrapped fst builder that collects the pushed streams.
inner: fst::map::OpBuilder<'m>,
/// Store handed on to the resulting `Union` for output lookups.
values: &'v Values<T>,
}
impl<'m, 'v, T: 'v> OpBuilder<'m, 'v, T> {
    /// Creates a builder with no streams registered.
    ///
    /// `values` is kept by reference and handed to the stream created by
    /// `union`, which uses it to look up the slice behind each raw output.
    pub fn new(values: &'v Values<T>) -> Self {
        let inner = fst::map::OpBuilder::new();
        Self { inner, values }
    }

    /// Registers `streamable` and returns the builder, so calls can be chained.
    ///
    /// Has the same effect as `push`, but takes and returns the builder by value.
    pub fn add<I, S>(mut self, streamable: I) -> Self
    where
        I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64)>,
        S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64)>,
    {
        self.inner.push(streamable);
        self
    }

    /// Registers `streamable` on the wrapped fst builder.
    pub fn push<I, S>(&mut self, streamable: I)
    where
        I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64)>,
        S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64)>,
    {
        self.inner.push(streamable);
    }

    /// Consumes the builder and returns the union stream over every
    /// registered stream.
    ///
    /// The returned `Union` starts with an empty output buffer.
    pub fn union(self) -> Union<'m, 'v, T> {
        let OpBuilder { inner, values } = self;
        Union {
            inner: inner.union(),
            outs: Vec::new(),
            values,
        }
    }
}
/// Union stream that yields, for each key, the `Values<T>` slices behind the
/// raw outputs reported by the wrapped `fst::map::Union`.
pub struct Union<'m, 'v, T: 'v> {
/// The wrapped fst union stream.
inner: fst::map::Union<'m>,
/// Scratch buffer: cleared and refilled on every successful `next` call,
/// then lent out to the caller as a slice.
outs: Vec<IndexedValues<'v, T>>,
/// Store used to resolve each raw `u64` output into a `&[T]`.
values: &'v Values<T>,
}
impl<'a, 'm, 'v, T: 'v + 'a> fst::Streamer<'a> for Union<'m, 'v, T> {
    type Item = (&'a [u8], &'a [IndexedValues<'a, T>]);

    /// Advances the wrapped union and returns the next key together with one
    /// `IndexedValues` entry per raw output reported for that key.
    ///
    /// The returned slice borrows an internal buffer that is overwritten by
    /// the following call. Returns `None` once the wrapped stream is exhausted.
    fn next(&'a mut self) -> Option<Self::Item> {
        let (key, raw_outputs) = match self.inner.next() {
            Some(entry) => entry,
            None => return None,
        };

        // Copy the shared reference out so the lookups do not go through `self`.
        let store = self.values;

        self.outs.clear();
        for raw in raw_outputs {
            // SAFETY(review): unchecked lookup — presumably every output stored
            // in the map is a valid index into `Values`; confirm against the
            // code that builds the map.
            let values = unsafe { store.get_unchecked(raw.value as usize) };
            self.outs.push(IndexedValues { index: raw.index, values });
        }

        Some((key, &self.outs))
    }
}
/// One entry of the slice yielded by `Union`: the values behind a single raw
/// output, tagged with the index reported by the wrapped fst stream.
#[derive(Debug)]
pub struct IndexedValues<'a, T: 'a> {
/// Index copied from the wrapped stream's entry.
/// NOTE(review): presumably the position of the originating stream in push
/// order — confirm against the fst crate.
pub index: usize,
/// Slice obtained from the `Values<T>` store for this entry's raw output.
pub values: &'a [T],
}
/// Builder that pairs an `fst::map::OpWithStateBuilder` with a borrowed
/// `Values<T>`.
///
/// Registered streams yield `(key, u64, U)` items; the `UnionWithState`
/// produced by `union` uses `values` to turn each raw `u64` output into a
/// `&[T]` slice while carrying the state `U` along.
pub struct OpWithStateBuilder<'m, 'v, T: 'v, U> {
/// The wrapped fst builder that collects the pushed streams.
inner: fst::map::OpWithStateBuilder<'m, U>,
/// Store handed on to the resulting `UnionWithState` for output lookups.
values: &'v Values<T>,
}
impl<'m, 'v, T: 'v, U: 'static> OpWithStateBuilder<'m, 'v, T, U> {
pub fn new(values: &'v Values<T>) -> Self {
Self {
inner: fst::map::OpWithStateBuilder::new(),
values: values,
}
}
pub fn add<I, S>(mut self, streamable: I) -> Self
where
I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64, U)>,
S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64, U)>,
{
self.push(streamable);
self
}
pub fn push<I, S>(&mut self, streamable: I)
where
I: for<'a> fst::IntoStreamer<'a, Into=S, Item=(&'a [u8], u64, U)>,
S: 'm + for<'a> fst::Streamer<'a, Item=(&'a [u8], u64, U)>,
{
self.inner.push(streamable);
}
pub fn union(self) -> UnionWithState<'m, 'v, T, U> {
UnionWithState {
inner: self.inner.union(),
outs: Vec::new(),
values: self.values,
}
}
}
/// Union stream that yields, for each key, the `Values<T>` slices behind the
/// raw outputs reported by the wrapped `fst::map::UnionWithState`, each paired
/// with a clone of the state attached to that output.
pub struct UnionWithState<'m, 'v, T: 'v, U> {
/// The wrapped fst stateful union stream.
inner: fst::map::UnionWithState<'m, U>,
/// Scratch buffer: cleared and refilled on every successful `next` call,
/// then lent out to the caller as a slice.
outs: Vec<IndexedValuesWithState<'v, T, U>>,
/// Store used to resolve each raw `u64` output into a `&[T]`.
values: &'v Values<T>,
}
impl<'a, 'm, 'v, T: 'v + 'a, U: 'a> fst::Streamer<'a> for UnionWithState<'m, 'v, T, U>
where
    U: Clone,
{
    type Item = (&'a [u8], &'a [IndexedValuesWithState<'a, T, U>]);

    /// Advances the wrapped union and returns the next key together with one
    /// `IndexedValuesWithState` entry per raw output reported for that key.
    ///
    /// Each entry carries a clone of the state attached to the raw output.
    /// The returned slice borrows an internal buffer that is overwritten by
    /// the following call. Returns `None` once the wrapped stream is exhausted.
    fn next(&'a mut self) -> Option<Self::Item> {
        let (key, raw_outputs) = match self.inner.next() {
            Some(entry) => entry,
            None => return None,
        };

        // Copy the shared reference out so the lookups do not go through `self`.
        let store = self.values;

        self.outs.clear();
        for raw in raw_outputs {
            // SAFETY(review): unchecked lookup — presumably every output stored
            // in the map is a valid index into `Values`; confirm against the
            // code that builds the map.
            let values = unsafe { store.get_unchecked(raw.value as usize) };
            self.outs.push(IndexedValuesWithState {
                index: raw.index,
                values,
                state: raw.state.clone(),
            });
        }

        Some((key, &self.outs))
    }
}
/// One entry of the slice yielded by `UnionWithState`: the values behind a
/// single raw output, tagged with the index and state reported by the wrapped
/// fst stream.
#[derive(Debug)]
pub struct IndexedValuesWithState<'a, T: 'a, U> {
/// Index copied from the wrapped stream's entry.
/// NOTE(review): presumably the position of the originating stream in push
/// order — confirm against the fst crate.
pub index: usize,
/// Slice obtained from the `Values<T>` store for this entry's raw output.
pub values: &'a [T],
/// Clone of the state the wrapped stream attached to this entry.
pub state: U,
}

View File

@ -5,13 +5,13 @@ extern crate serde;
mod fst_map; mod fst_map;
use std::ops::Range;
use std::io::{Write, BufReader};
use std::fs::File;
use std::path::Path;
use fst::Automaton; use fst::Automaton;
pub use self::fst_map::{FstMap, FstMapBuilder}; pub use self::fst_map::{FstMap, FstMapBuilder};
pub use self::fst_map::{
OpBuilder, IndexedValues,
OpWithStateBuilder, IndexedValuesWithState,
};
use self::fst_map::Values; use self::fst_map::Values;
pub struct StreamBuilder<'m, 'v, T: 'v, A> { pub struct StreamBuilder<'m, 'v, T: 'v, A> {