feat: Simplify the RankedStream code logic

This commit is contained in:
Kerollmops 2018-08-25 12:35:29 +02:00 committed by Clément Renault
parent 9dce74e9c8
commit 34e0211567
2 changed files with 53 additions and 60 deletions

View File

@ -1,3 +1,5 @@
#![feature(nll)]
extern crate fst; extern crate fst;
extern crate fnv; extern crate fnv;
extern crate group_by; extern crate group_by;
@ -77,7 +79,8 @@ pub struct Match {
/// (i.e. at the start or the end of the attribute). /// (i.e. at the start or the end of the attribute).
/// ///
/// The index in the attribute is limited to a maximum of `2^32` /// The index in the attribute is limited to a maximum of `2^32`
/// this is because we index only the first 1000 words in an attribute. /// this is because we index only the first 1000 words
/// in an attribute.
pub attribute_index: u32, pub attribute_index: u32,
/// Whether the word that match is an exact match or a prefix. /// Whether the word that match is an exact match or a prefix.

View File

@ -60,11 +60,10 @@ fn matches_into_iter(matches: FnvHashMap<DocumentId, Vec<Match>>, limit: usize)
exact, exact,
]; ];
{
let mut groups = vec![documents.as_mut_slice()]; let mut groups = vec![documents.as_mut_slice()];
for sort in sorts { for sort in sorts {
let mut temp = mem::replace(&mut groups, Vec::new()); let temp = mem::replace(&mut groups, Vec::new());
let mut computed = 0; let mut computed = 0;
for group in temp { for group in temp {
@ -76,13 +75,42 @@ fn matches_into_iter(matches: FnvHashMap<DocumentId, Vec<Match>>, limit: usize)
} }
} }
} }
}
documents.truncate(limit); documents.truncate(limit);
documents.into_iter() documents.into_iter()
} }
pub enum RankedStream<'m, 'v> { pub struct RankedStream<'m, 'v>(RankedStreamInner<'m, 'v>);
impl<'m, 'v> RankedStream<'m, 'v> {
    /// Builds a ranked stream that searches `map` with every automaton.
    ///
    /// One stateful search stream is opened on `map` per entry of
    /// `automatons` (using a clone of its DFA), and all of them are pushed
    /// into an `OpWithStateBuilder` created from `indexes` and then unioned.
    ///
    /// The returned stream starts in the `Fed` state with an empty match
    /// table. `automatons` and `limit` are stored alongside the union so
    /// they are available while the stream is consumed.
    pub fn new(map: &'m fst::Map, indexes: &'v DocIndexes, automatons: Vec<Levenshtein>, limit: usize) -> Self {
        let mut builder = OpWithStateBuilder::new(indexes);
        for levenshtein in &automatons {
            // The search takes the DFA by value, so each stream gets its own clone.
            builder.push(map.search(levenshtein.dfa.clone()).with_state());
        }

        RankedStream(RankedStreamInner::Fed {
            inner: builder.union(),
            automatons,
            limit,
            matches: FnvHashMap::default(),
        })
    }
}
impl<'m, 'v, 'a> fst::Streamer<'a> for RankedStream<'m, 'v> {
type Item = Document;
fn next(&'a mut self) -> Option<Self::Item> {
self.0.next()
}
}
enum RankedStreamInner<'m, 'v> {
Fed { Fed {
inner: UnionWithState<'m, 'v, u32>, inner: UnionWithState<'m, 'v, u32>,
automatons: Vec<Levenshtein>, automatons: Vec<Levenshtein>,
@ -94,59 +122,27 @@ pub enum RankedStream<'m, 'v> {
}, },
} }
impl<'m, 'v> RankedStream<'m, 'v> { impl<'m, 'v, 'a> fst::Streamer<'a> for RankedStreamInner<'m, 'v> {
pub fn new(map: &'m fst::Map, indexes: &'v DocIndexes, automatons: Vec<Levenshtein>, limit: usize) -> Self {
let mut op = OpWithStateBuilder::new(indexes);
for automaton in automatons.iter().map(|l| l.dfa.clone()) {
let stream = map.search(automaton).with_state();
op.push(stream);
}
RankedStream::Fed {
inner: op.union(),
automatons: automatons,
limit: limit,
matches: FnvHashMap::default(),
}
}
}
impl<'m, 'v, 'a> fst::Streamer<'a> for RankedStream<'m, 'v> {
type Item = Document; type Item = Document;
fn next(&'a mut self) -> Option<Self::Item> { fn next(&'a mut self) -> Option<Self::Item> {
loop { loop {
// TODO remove that when NLL are here !
let mut transfert_matches = None;
let mut transfert_limit = None;
match self { match self {
RankedStream::Fed { inner, automatons, limit, matches } => { RankedStreamInner::Fed { inner, automatons, limit, matches } => {
match inner.next() { match inner.next() {
Some((string, indexed_values)) => { Some((string, indexed_values)) => {
for iv in indexed_values { for iv in indexed_values {
// TODO extend documents matches by batch of query_index
// that way it will be possible to discard matches that
// have an invalid distance *before* adding them
// to the matches of the documents and, that way, avoid a sort
let automaton = &automatons[iv.index]; let automaton = &automatons[iv.index];
let distance = automaton.dfa.distance(iv.state).to_u8(); let distance = automaton.dfa.distance(iv.state).to_u8();
// TODO remove the Pool system !
// this is an internal Pool rule but
// it is more efficient to test that here
// if pool.limitation.is_reached() && distance != 0 { continue }
for di in iv.values { for di in iv.values {
let match_ = Match { let match_ = Match {
query_index: iv.index as u32, query_index: iv.index as u32,
distance: distance, distance: distance,
attribute: di.attribute, attribute: di.attribute,
attribute_index: di.attribute_index, attribute_index: di.attribute_index,
is_exact: string.len() == automaton.query_len, is_exact: distance == 0 && string.len() == automaton.query_len,
}; };
matches.entry(di.document) matches.entry(di.document)
.or_insert_with(Vec::new) .or_insert_with(Vec::new)
@ -155,23 +151,17 @@ impl<'m, 'v, 'a> fst::Streamer<'a> for RankedStream<'m, 'v> {
} }
}, },
None => { None => {
// TODO remove this when NLL are here ! let matches = mem::replace(matches, FnvHashMap::default());
transfert_matches = Some(mem::replace(matches, FnvHashMap::default())); *self = RankedStreamInner::Pours {
transfert_limit = Some(mem::replace(limit, 0)); inner: matches_into_iter(matches, *limit).into_iter()
};
}, },
} }
}, },
RankedStream::Pours { inner } => { RankedStreamInner::Pours { inner } => {
return inner.next() return inner.next()
}, },
} }
// transform the `RankedStream` into a `Pours`
if let (Some(matches), Some(limit)) = (transfert_matches, transfert_limit) {
*self = RankedStream::Pours {
inner: matches_into_iter(matches, limit).into_iter(),
}
}
} }
} }
} }