From 34e0211567ab269b24a8d76a6c08d998b7a07cac Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Sat, 25 Aug 2018 12:35:29 +0200 Subject: [PATCH] feat: Simplify the RankedStream code logic --- raptor/src/lib.rs | 5 +- raptor/src/rank/mod.rs | 108 +++++++++++++++++++---------------------- 2 files changed, 53 insertions(+), 60 deletions(-) diff --git a/raptor/src/lib.rs b/raptor/src/lib.rs index 39bf59f2a..7e6de58e7 100644 --- a/raptor/src/lib.rs +++ b/raptor/src/lib.rs @@ -1,3 +1,5 @@ +#![feature(nll)] + extern crate fst; extern crate fnv; extern crate group_by; @@ -77,7 +79,8 @@ pub struct Match { /// (i.e. at the start or the end of the attribute). /// /// The index in the attribute is limited to a maximum of `2^32` - /// this is because we index only the first 1000 words in an attribute. + /// this is because we index only the first 1000 words + /// in an attribute. pub attribute_index: u32, /// Whether the word that match is an exact match or a prefix. diff --git a/raptor/src/rank/mod.rs b/raptor/src/rank/mod.rs index 50617d226..54ef0a391 100644 --- a/raptor/src/rank/mod.rs +++ b/raptor/src/rank/mod.rs @@ -60,20 +60,18 @@ fn matches_into_iter(matches: FnvHashMap>, limit: usize) exact, ]; - { - let mut groups = vec![documents.as_mut_slice()]; + let mut groups = vec![documents.as_mut_slice()]; - for sort in sorts { - let mut temp = mem::replace(&mut groups, Vec::new()); - let mut computed = 0; + for sort in sorts { + let temp = mem::replace(&mut groups, Vec::new()); + let mut computed = 0; - for group in temp { - group.sort_unstable_by(sort); - for group in GroupByMut::new(group, |a, b| sort(a, b) == Ordering::Equal) { - computed += group.len(); - groups.push(group); - if computed >= limit { break } - } + for group in temp { + group.sort_unstable_by(sort); + for group in GroupByMut::new(group, |a, b| sort(a, b) == Ordering::Equal) { + computed += group.len(); + groups.push(group); + if computed >= limit { break } } } } @@ -82,7 +80,37 @@ fn matches_into_iter(matches: 
FnvHashMap>, limit: usize) documents.into_iter() } -pub enum RankedStream<'m, 'v> { +pub struct RankedStream<'m, 'v>(RankedStreamInner<'m, 'v>); + +impl<'m, 'v> RankedStream<'m, 'v> { + pub fn new(map: &'m fst::Map, indexes: &'v DocIndexes, automatons: Vec, limit: usize) -> Self { + let mut op = OpWithStateBuilder::new(indexes); + + for automaton in automatons.iter().map(|l| l.dfa.clone()) { + let stream = map.search(automaton).with_state(); + op.push(stream); + } + + let inner = RankedStreamInner::Fed { + inner: op.union(), + automatons: automatons, + limit: limit, + matches: FnvHashMap::default(), + }; + + RankedStream(inner) + } +} + +impl<'m, 'v, 'a> fst::Streamer<'a> for RankedStream<'m, 'v> { + type Item = Document; + + fn next(&'a mut self) -> Option { + self.0.next() + } +} + +enum RankedStreamInner<'m, 'v> { Fed { inner: UnionWithState<'m, 'v, u32>, automatons: Vec, @@ -94,59 +122,27 @@ pub enum RankedStream<'m, 'v> { }, } -impl<'m, 'v> RankedStream<'m, 'v> { - pub fn new(map: &'m fst::Map, indexes: &'v DocIndexes, automatons: Vec, limit: usize) -> Self { - let mut op = OpWithStateBuilder::new(indexes); - - for automaton in automatons.iter().map(|l| l.dfa.clone()) { - let stream = map.search(automaton).with_state(); - op.push(stream); - } - - RankedStream::Fed { - inner: op.union(), - automatons: automatons, - limit: limit, - matches: FnvHashMap::default(), - } - } -} - -impl<'m, 'v, 'a> fst::Streamer<'a> for RankedStream<'m, 'v> { +impl<'m, 'v, 'a> fst::Streamer<'a> for RankedStreamInner<'m, 'v> { type Item = Document; fn next(&'a mut self) -> Option { loop { - // TODO remove that when NLL are here ! 
- let mut transfert_matches = None; - let mut transfert_limit = None; - match self { - RankedStream::Fed { inner, automatons, limit, matches } => { + RankedStreamInner::Fed { inner, automatons, limit, matches } => { match inner.next() { Some((string, indexed_values)) => { for iv in indexed_values { - // TODO extend documents matches by batch of query_index - // that way it will be possible to discard matches that - // have an invalid distance *before* adding them - // to the matches of the documents and, that way, avoid a sort - let automaton = &automatons[iv.index]; let distance = automaton.dfa.distance(iv.state).to_u8(); - // TODO remove the Pool system ! - // this is an internal Pool rule but - // it is more efficient to test that here - // if pool.limitation.is_reached() && distance != 0 { continue } - for di in iv.values { let match_ = Match { query_index: iv.index as u32, distance: distance, attribute: di.attribute, attribute_index: di.attribute_index, - is_exact: string.len() == automaton.query_len, + is_exact: distance == 0 && string.len() == automaton.query_len, }; matches.entry(di.document) .or_insert_with(Vec::new) @@ -155,23 +151,17 @@ impl<'m, 'v, 'a> fst::Streamer<'a> for RankedStream<'m, 'v> { } }, None => { - // TODO remove this when NLL are here ! - transfert_matches = Some(mem::replace(matches, FnvHashMap::default())); - transfert_limit = Some(mem::replace(limit, 0)); + let matches = mem::replace(matches, FnvHashMap::default()); + *self = RankedStreamInner::Pours { + inner: matches_into_iter(matches, *limit).into_iter() + }; }, } }, - RankedStream::Pours { inner } => { + RankedStreamInner::Pours { inner } => { return inner.next() }, } - - // transform the `RankedStream` into a `Pours` - if let (Some(matches), Some(limit)) = (transfert_matches, transfert_limit) { - *self = RankedStream::Pours { - inner: matches_into_iter(matches, limit).into_iter(), - } - } } } }