2018-10-11 22:09:28 +08:00
|
|
|
use std::ops::Range;
|
2018-10-10 22:57:21 +08:00
|
|
|
use std::rc::Rc;
|
2018-10-11 22:09:28 +08:00
|
|
|
use std::{mem, vec, cmp};
|
2018-10-10 22:57:21 +08:00
|
|
|
|
|
|
|
use fnv::FnvHashMap;
|
|
|
|
use fst::Streamer;
|
|
|
|
use group_by::GroupByMut;
|
|
|
|
|
|
|
|
use crate::automaton::{DfaExt, AutomatonExt};
|
|
|
|
use crate::metadata::Metadata;
|
2018-10-11 20:04:41 +08:00
|
|
|
use crate::metadata::ops::OpBuilder;
|
|
|
|
use crate::rank::criterion::Criterion;
|
2018-10-10 22:57:21 +08:00
|
|
|
use crate::rank::Document;
|
2018-10-11 20:04:41 +08:00
|
|
|
use crate::Match;
|
2018-10-10 22:57:21 +08:00
|
|
|
|
2018-10-11 20:04:41 +08:00
|
|
|
#[derive(Clone)]
|
|
|
|
pub struct RankedStreamBuilder<'m, C> {
|
|
|
|
metadata: &'m Metadata,
|
|
|
|
automatons: Vec<Rc<DfaExt>>,
|
|
|
|
criteria: Vec<C>,
|
2018-10-10 22:57:21 +08:00
|
|
|
}
|
|
|
|
|
2018-10-11 20:04:41 +08:00
|
|
|
impl<'m, C> RankedStreamBuilder<'m, C> {
|
|
|
|
pub fn new(metadata: &'m Metadata, automatons: Vec<DfaExt>) -> Self {
|
|
|
|
RankedStreamBuilder {
|
|
|
|
metadata: metadata,
|
|
|
|
automatons: automatons.into_iter().map(Rc::new).collect(),
|
|
|
|
criteria: Vec::new(), // hummm... prefer the criterion::default() ones !
|
|
|
|
}
|
|
|
|
}
|
2018-10-10 22:57:21 +08:00
|
|
|
|
2018-10-11 20:04:41 +08:00
|
|
|
pub fn criteria(&mut self, criteria: Vec<C>) {
|
|
|
|
self.criteria = criteria;
|
2018-10-10 22:57:21 +08:00
|
|
|
}
|
|
|
|
|
2018-10-11 20:04:41 +08:00
|
|
|
pub fn build(&self) -> RankedStream<C> {
|
|
|
|
let mut builder = OpBuilder::with_automatons(self.automatons.clone());
|
|
|
|
builder.push(self.metadata);
|
2018-10-10 22:57:21 +08:00
|
|
|
|
2018-10-11 20:04:41 +08:00
|
|
|
RankedStream {
|
|
|
|
stream: builder.union(),
|
|
|
|
automatons: &self.automatons,
|
|
|
|
criteria: &self.criteria,
|
|
|
|
}
|
2018-10-10 22:57:21 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-10-11 20:04:41 +08:00
|
|
|
pub struct RankedStream<'a, 'm, C> {
|
|
|
|
stream: crate::metadata::ops::Union<'m>,
|
|
|
|
automatons: &'a [Rc<DfaExt>],
|
|
|
|
criteria: &'a [C],
|
2018-10-10 22:57:21 +08:00
|
|
|
}
|
|
|
|
|
2018-10-11 20:04:41 +08:00
|
|
|
impl<'a, 'm, C> RankedStream<'a, 'm, C> {
|
2018-10-11 22:09:28 +08:00
|
|
|
pub fn retrieve_documents(&mut self, range: Range<usize>) -> Vec<Document>
|
2018-10-11 20:04:41 +08:00
|
|
|
where C: Criterion
|
|
|
|
{
|
|
|
|
let mut matches = FnvHashMap::default();
|
|
|
|
|
|
|
|
while let Some((string, indexed_values)) = self.stream.next() {
|
|
|
|
for iv in indexed_values {
|
|
|
|
let automaton = &self.automatons[iv.index];
|
|
|
|
let distance = automaton.eval(string).to_u8();
|
|
|
|
let is_exact = distance == 0 && string.len() == automaton.query_len();
|
|
|
|
|
|
|
|
for di in iv.doc_indexes.as_slice() {
|
|
|
|
let match_ = Match {
|
|
|
|
query_index: iv.index as u32,
|
|
|
|
distance: distance,
|
|
|
|
attribute: di.attribute,
|
|
|
|
attribute_index: di.attribute_index,
|
|
|
|
is_exact: is_exact,
|
|
|
|
};
|
|
|
|
matches.entry(di.document).or_insert_with(Vec::new).push(match_);
|
|
|
|
}
|
2018-10-10 22:57:21 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-10-11 20:04:41 +08:00
|
|
|
// collect matches from an HashMap into a Vec
|
|
|
|
let mut documents: Vec<_> = matches.into_iter().map(|(id, mut matches)| {
|
|
|
|
matches.sort_unstable();
|
|
|
|
unsafe { Document::from_sorted_matches(id, matches) }
|
|
|
|
}).collect();
|
|
|
|
|
|
|
|
let mut groups = vec![documents.as_mut_slice()];
|
|
|
|
|
|
|
|
for criterion in self.criteria {
|
2018-10-11 22:09:28 +08:00
|
|
|
let tmp_groups = mem::replace(&mut groups, Vec::new());
|
|
|
|
let mut current_range = Range { start: 0, end: 0 };
|
|
|
|
|
|
|
|
'grp: for group in tmp_groups {
|
|
|
|
current_range.end += group.len();
|
|
|
|
|
|
|
|
// if a part of the current group is in the range returned
|
|
|
|
// we must sort it and emit the sub-groups
|
|
|
|
if current_range.contains(&range.start) {
|
|
|
|
group.sort_unstable_by(|a, b| criterion.evaluate(a, b));
|
|
|
|
for group in GroupByMut::new(group, |a, b| criterion.eq(a, b)) {
|
|
|
|
groups.push(group);
|
|
|
|
if current_range.end >= range.end { break 'grp }
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
groups.push(group)
|
2018-10-11 20:04:41 +08:00
|
|
|
}
|
2018-10-11 22:09:28 +08:00
|
|
|
|
|
|
|
current_range.start = current_range.end;
|
2018-10-10 22:57:21 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-10-11 22:09:28 +08:00
|
|
|
// TODO find a better algorithm, here we allocate for too many documents
|
|
|
|
// and we do a useless allocation, we should reuse the documents Vec
|
|
|
|
let start = cmp::min(range.start, documents.len());
|
|
|
|
let mut documents = documents.split_off(start);
|
|
|
|
documents.truncate(range.len());
|
2018-10-11 20:04:41 +08:00
|
|
|
documents
|
|
|
|
}
|
2018-10-10 22:57:21 +08:00
|
|
|
}
|