Make bucket candidates optionals

This commit is contained in:
many 2021-05-05 20:46:56 +02:00
parent c620626515
commit e923d51b8f
No known key found for this signature in database
GPG Key ID: 2CEF23B75189EACA
9 changed files with 276 additions and 346 deletions

View File

@ -94,7 +94,6 @@ impl<'t> Criterion for AscDesc<'t> {
None => { None => {
match self.parent.next(params)? { match self.parent.next(params)? {
Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => {
let candidates_is_some = candidates.is_some();
self.query_tree = query_tree; self.query_tree = query_tree;
let candidates = match (&self.query_tree, candidates) { let candidates = match (&self.query_tree, candidates) {
(_, Some(mut candidates)) => { (_, Some(mut candidates)) => {
@ -103,7 +102,7 @@ impl<'t> Criterion for AscDesc<'t> {
}, },
(Some(qt), None) => { (Some(qt), None) => {
let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; let context = CriteriaBuilder::new(&self.rtxn, &self.index)?;
let mut candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), params.wdcache)?; let mut candidates = resolve_query_tree(&context, qt, params.wdcache)?;
candidates -= params.excluded_candidates; candidates -= params.excluded_candidates;
candidates.intersect_with(&self.faceted_candidates); candidates.intersect_with(&self.faceted_candidates);
candidates candidates
@ -111,15 +110,9 @@ impl<'t> Criterion for AscDesc<'t> {
(None, None) => take(&mut self.faceted_candidates), (None, None) => take(&mut self.faceted_candidates),
}; };
// If our parent returns candidates it means that the bucket match bucket_candidates {
// candidates were already computed before and we can use them. Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates,
// None => self.bucket_candidates |= &candidates,
// If not, we must use the just computed candidates as our bucket
// candidates.
if candidates_is_some {
self.bucket_candidates.union_with(&bucket_candidates);
} else {
self.bucket_candidates.union_with(&candidates);
} }
if candidates.is_empty() { if candidates.is_empty() {
@ -143,7 +136,7 @@ impl<'t> Criterion for AscDesc<'t> {
return Ok(Some(CriterionResult { return Ok(Some(CriterionResult {
query_tree: self.query_tree.clone(), query_tree: self.query_tree.clone(),
candidates: Some(candidates), candidates: Some(candidates),
bucket_candidates: take(&mut self.bucket_candidates), bucket_candidates: Some(take(&mut self.bucket_candidates)),
})); }));
}, },
} }

View File

@ -24,13 +24,12 @@ const LEVEL_EXPONENTIATION_BASE: u32 = 4;
/// the system to choose between one algorithm or another. /// the system to choose between one algorithm or another.
const CANDIDATES_THRESHOLD: u64 = 1000; const CANDIDATES_THRESHOLD: u64 = 1000;
type FlattenedQueryTree = Vec<Vec<Vec<Query>>>;
pub struct Attribute<'t> { pub struct Attribute<'t> {
ctx: &'t dyn Context<'t>, ctx: &'t dyn Context<'t>,
query_tree: Option<Operation>, state: Option<(Operation, FlattenedQueryTree, RoaringBitmap)>,
candidates: Option<RoaringBitmap>,
bucket_candidates: RoaringBitmap, bucket_candidates: RoaringBitmap,
parent: Box<dyn Criterion + 't>, parent: Box<dyn Criterion + 't>,
flattened_query_tree: Option<Vec<Vec<Vec<Query>>>>,
current_buckets: Option<btree_map::IntoIter<u64, RoaringBitmap>>, current_buckets: Option<btree_map::IntoIter<u64, RoaringBitmap>>,
} }
@ -38,11 +37,9 @@ impl<'t> Attribute<'t> {
pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Self { pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Self {
Attribute { Attribute {
ctx, ctx,
query_tree: None, state: None,
candidates: None,
bucket_candidates: RoaringBitmap::new(), bucket_candidates: RoaringBitmap::new(),
parent, parent,
flattened_query_tree: None,
current_buckets: None, current_buckets: None,
} }
} }
@ -52,29 +49,25 @@ impl<'t> Criterion for Attribute<'t> {
#[logging_timer::time("Attribute::{}")] #[logging_timer::time("Attribute::{}")]
fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result<Option<CriterionResult>> { fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result<Option<CriterionResult>> {
// remove excluded candidates when next is called, instead of doing it in the loop. // remove excluded candidates when next is called, instead of doing it in the loop.
if let Some(candidates) = self.candidates.as_mut() { if let Some((_, _, allowed_candidates)) = self.state.as_mut() {
*candidates -= params.excluded_candidates; *allowed_candidates -= params.excluded_candidates;
} }
loop { loop {
match (&self.query_tree, &mut self.candidates) { match self.state.take() {
(_, Some(candidates)) if candidates.is_empty() => { Some((query_tree, _, allowed_candidates)) if allowed_candidates.is_empty() => {
return Ok(Some(CriterionResult { return Ok(Some(CriterionResult {
query_tree: self.query_tree.take(), query_tree: Some(query_tree),
candidates: self.candidates.take(), candidates: Some(RoaringBitmap::new()),
bucket_candidates: take(&mut self.bucket_candidates), bucket_candidates: Some(take(&mut self.bucket_candidates)),
})); }));
}, },
(Some(qt), Some(candidates)) => { Some((query_tree, flattened_query_tree, mut allowed_candidates)) => {
let flattened_query_tree = self.flattened_query_tree.get_or_insert_with(|| { let found_candidates = if allowed_candidates.len() < CANDIDATES_THRESHOLD {
flatten_query_tree(&qt)
});
let found_candidates = if candidates.len() < CANDIDATES_THRESHOLD {
let current_buckets = match self.current_buckets.as_mut() { let current_buckets = match self.current_buckets.as_mut() {
Some(current_buckets) => current_buckets, Some(current_buckets) => current_buckets,
None => { None => {
let new_buckets = linear_compute_candidates(self.ctx, flattened_query_tree, candidates)?; let new_buckets = linear_compute_candidates(self.ctx, &flattened_query_tree, &allowed_candidates)?;
self.current_buckets.get_or_insert(new_buckets.into_iter()) self.current_buckets.get_or_insert(new_buckets.into_iter())
}, },
}; };
@ -83,62 +76,60 @@ impl<'t> Criterion for Attribute<'t> {
Some((_score, candidates)) => candidates, Some((_score, candidates)) => candidates,
None => { None => {
return Ok(Some(CriterionResult { return Ok(Some(CriterionResult {
query_tree: self.query_tree.take(), query_tree: Some(query_tree),
candidates: self.candidates.take(), candidates: Some(RoaringBitmap::new()),
bucket_candidates: take(&mut self.bucket_candidates), bucket_candidates: Some(take(&mut self.bucket_candidates)),
})); }));
}, },
} }
} else { } else {
match set_compute_candidates(self.ctx, flattened_query_tree, candidates, params.wdcache)? { match set_compute_candidates(self.ctx, &flattened_query_tree, &allowed_candidates, params.wdcache)? {
Some(candidates) => candidates, Some(candidates) => candidates,
None => { None => {
return Ok(Some(CriterionResult { return Ok(Some(CriterionResult {
query_tree: self.query_tree.take(), query_tree: Some(query_tree),
candidates: self.candidates.take(), candidates: Some(RoaringBitmap::new()),
bucket_candidates: take(&mut self.bucket_candidates), bucket_candidates: Some(take(&mut self.bucket_candidates)),
})); }));
}, },
} }
}; };
candidates.difference_with(&found_candidates); allowed_candidates -= &found_candidates;
self.state = Some((query_tree.clone(), flattened_query_tree, allowed_candidates));
return Ok(Some(CriterionResult { return Ok(Some(CriterionResult {
query_tree: self.query_tree.clone(), query_tree: Some(query_tree),
candidates: Some(found_candidates), candidates: Some(found_candidates),
bucket_candidates: take(&mut self.bucket_candidates), bucket_candidates: Some(take(&mut self.bucket_candidates)),
})); }));
}, },
(Some(qt), None) => { None => {
let mut query_tree_candidates = resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), params.wdcache)?;
query_tree_candidates -= params.excluded_candidates;
self.bucket_candidates |= &query_tree_candidates;
self.candidates = Some(query_tree_candidates);
},
(None, Some(_)) => {
return Ok(Some(CriterionResult {
query_tree: self.query_tree.take(),
candidates: self.candidates.take(),
bucket_candidates: take(&mut self.bucket_candidates),
}));
},
(None, None) => {
match self.parent.next(params)? { match self.parent.next(params)? {
Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { Some(CriterionResult { query_tree: Some(query_tree), candidates, bucket_candidates }) => {
let candidates = match candidates {
Some(candidates) => candidates,
None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)? - params.excluded_candidates,
};
let flattened_query_tree = flatten_query_tree(&query_tree);
match bucket_candidates {
Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates,
None => self.bucket_candidates |= &candidates,
}
self.state = Some((query_tree, flattened_query_tree, candidates));
self.current_buckets = None;
},
Some(CriterionResult { query_tree: None, candidates, bucket_candidates }) => {
return Ok(Some(CriterionResult { return Ok(Some(CriterionResult {
query_tree: None, query_tree: None,
candidates: None, candidates,
bucket_candidates, bucket_candidates,
})); }));
}, },
Some(CriterionResult { query_tree, candidates, bucket_candidates }) => {
self.query_tree = query_tree;
self.candidates = candidates;
self.bucket_candidates |= bucket_candidates;
self.flattened_query_tree = None;
self.current_buckets = None;
},
None => return Ok(None), None => return Ok(None),
} }
}, },
@ -467,7 +458,7 @@ impl<'t, 'q> Eq for Branch<'t, 'q> {}
fn initialize_query_level_iterators<'t, 'q>( fn initialize_query_level_iterators<'t, 'q>(
ctx: &'t dyn Context<'t>, ctx: &'t dyn Context<'t>,
branches: &'q Vec<Vec<Vec<Query>>>, branches: &'q FlattenedQueryTree,
allowed_candidates: &RoaringBitmap, allowed_candidates: &RoaringBitmap,
wdcache: &mut WordDerivationsCache, wdcache: &mut WordDerivationsCache,
) -> anyhow::Result<BinaryHeap<Branch<'t, 'q>>> { ) -> anyhow::Result<BinaryHeap<Branch<'t, 'q>>> {
@ -517,7 +508,7 @@ fn initialize_query_level_iterators<'t, 'q>(
fn set_compute_candidates<'t>( fn set_compute_candidates<'t>(
ctx: &'t dyn Context<'t>, ctx: &'t dyn Context<'t>,
branches: &Vec<Vec<Vec<Query>>>, branches: &FlattenedQueryTree,
allowed_candidates: &RoaringBitmap, allowed_candidates: &RoaringBitmap,
wdcache: &mut WordDerivationsCache, wdcache: &mut WordDerivationsCache,
) -> anyhow::Result<Option<RoaringBitmap>> ) -> anyhow::Result<Option<RoaringBitmap>>
@ -570,11 +561,11 @@ fn set_compute_candidates<'t>(
fn linear_compute_candidates( fn linear_compute_candidates(
ctx: &dyn Context, ctx: &dyn Context,
branches: &Vec<Vec<Vec<Query>>>, branches: &FlattenedQueryTree,
allowed_candidates: &RoaringBitmap, allowed_candidates: &RoaringBitmap,
) -> anyhow::Result<BTreeMap<u64, RoaringBitmap>> ) -> anyhow::Result<BTreeMap<u64, RoaringBitmap>>
{ {
fn compute_candidate_rank(branches: &Vec<Vec<Vec<Query>>>, words_positions: HashMap<String, RoaringBitmap>) -> u64 { fn compute_candidate_rank(branches: &FlattenedQueryTree, words_positions: HashMap<String, RoaringBitmap>) -> u64 {
let mut min_rank = u64::max_value(); let mut min_rank = u64::max_value();
for branch in branches { for branch in branches {
@ -659,10 +650,10 @@ fn linear_compute_candidates(
} }
// TODO can we keep refs of Query // TODO can we keep refs of Query
fn flatten_query_tree(query_tree: &Operation) -> Vec<Vec<Vec<Query>>> { fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree {
use crate::search::criteria::Operation::{And, Or, Consecutive}; use crate::search::criteria::Operation::{And, Or, Consecutive};
fn and_recurse(head: &Operation, tail: &[Operation]) -> Vec<Vec<Vec<Query>>> { fn and_recurse(head: &Operation, tail: &[Operation]) -> FlattenedQueryTree {
match tail.split_first() { match tail.split_first() {
Some((thead, tail)) => { Some((thead, tail)) => {
let tail = and_recurse(thead, tail); let tail = and_recurse(thead, tail);
@ -680,7 +671,7 @@ fn flatten_query_tree(query_tree: &Operation) -> Vec<Vec<Vec<Query>>> {
} }
} }
fn recurse(op: &Operation) -> Vec<Vec<Vec<Query>>> { fn recurse(op: &Operation) -> FlattenedQueryTree {
match op { match op {
And(ops) | Consecutive(ops) => { And(ops) | Consecutive(ops) => {
ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t)) ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t))

View File

@ -1,4 +1,4 @@
use std::{collections::HashMap, mem}; use std::mem::take;
use log::debug; use log::debug;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
@ -60,13 +60,13 @@ impl<'t> Criterion for Exactness<'t> {
self.query_tree = None; self.query_tree = None;
}, },
Some(state) => { Some(state) => {
let (candidates, state) = resolve_state(self.ctx, mem::take(state), &self.query)?; let (candidates, state) = resolve_state(self.ctx, take(state), &self.query)?;
self.state = state; self.state = state;
return Ok(Some(CriterionResult { return Ok(Some(CriterionResult {
query_tree: self.query_tree.clone(), query_tree: self.query_tree.clone(),
candidates: Some(candidates), candidates: Some(candidates),
bucket_candidates: mem::take(&mut self.bucket_candidates), bucket_candidates: Some(take(&mut self.bucket_candidates)),
})); }));
}, },
None => { None => {
@ -74,11 +74,16 @@ impl<'t> Criterion for Exactness<'t> {
Some(CriterionResult { query_tree: Some(query_tree), candidates, bucket_candidates }) => { Some(CriterionResult { query_tree: Some(query_tree), candidates, bucket_candidates }) => {
let candidates = match candidates { let candidates = match candidates {
Some(candidates) => candidates, Some(candidates) => candidates,
None => resolve_query_tree(self.ctx, &query_tree, &mut HashMap::new(), params.wdcache)?, None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)?,
}; };
match bucket_candidates {
Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates,
None => self.bucket_candidates |= &candidates,
}
self.state = Some(State::new(candidates)); self.state = Some(State::new(candidates));
self.query_tree = Some(query_tree); self.query_tree = Some(query_tree);
self.bucket_candidates |= bucket_candidates;
}, },
Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => {
return Ok(Some(CriterionResult { return Ok(Some(CriterionResult {

View File

@ -1,5 +1,3 @@
use std::collections::HashMap;
use log::debug; use log::debug;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
@ -41,19 +39,15 @@ impl<'t> Final<'t> {
}; };
match self.parent.next(&mut criterion_parameters)? { match self.parent.next(&mut criterion_parameters)? {
Some(CriterionResult { query_tree, candidates, mut bucket_candidates }) => { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => {
let candidates = match candidates { let candidates = match (candidates, query_tree.as_ref()) {
Some(candidates) => candidates, (Some(candidates), _) => candidates,
None => { (None, Some(qt)) => resolve_query_tree(self.ctx, qt, &mut self.wdcache)?,
let candidates = match query_tree.as_ref() { (None, None) => self.ctx.documents_ids()?,
Some(qt) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), &mut self.wdcache)?,
None => self.ctx.documents_ids()?,
};
bucket_candidates |= &candidates;
candidates
}
}; };
let bucket_candidates = bucket_candidates.unwrap_or_else(|| candidates.clone());
self.returned_candidates |= &candidates; self.returned_candidates |= &candidates;
return Ok(Some(FinalResult { query_tree, candidates, bucket_candidates })); return Ok(Some(FinalResult { query_tree, candidates, bucket_candidates }));

View File

@ -12,8 +12,8 @@ impl Initial {
pub fn new(query_tree: Option<Operation>, mut candidates: Option<RoaringBitmap>) -> Initial { pub fn new(query_tree: Option<Operation>, mut candidates: Option<RoaringBitmap>) -> Initial {
let answer = CriterionResult { let answer = CriterionResult {
query_tree, query_tree,
candidates: candidates.clone(), candidates: candidates.take(),
bucket_candidates: candidates.take().unwrap_or_default(), bucket_candidates: None,
}; };
Initial { answer: Some(answer) } Initial { answer: Some(answer) }
} }

View File

@ -39,7 +39,7 @@ pub struct CriterionResult {
/// if None, it is up to the child to compute the candidates itself. /// if None, it is up to the child to compute the candidates itself.
candidates: Option<RoaringBitmap>, candidates: Option<RoaringBitmap>,
/// Candidates that come from the current bucket of the initial criterion. /// Candidates that come from the current bucket of the initial criterion.
bucket_candidates: RoaringBitmap, bucket_candidates: Option<RoaringBitmap>,
} }
#[derive(Debug, PartialEq)] #[derive(Debug, PartialEq)]
@ -57,15 +57,6 @@ enum Candidates {
Forbidden(RoaringBitmap) Forbidden(RoaringBitmap)
} }
impl Candidates {
fn into_inner(self) -> RoaringBitmap {
match self {
Self::Allowed(inner) => inner,
Self::Forbidden(inner) => inner,
}
}
}
impl Default for Candidates { impl Default for Candidates {
fn default() -> Self { fn default() -> Self {
Self::Forbidden(RoaringBitmap::new()) Self::Forbidden(RoaringBitmap::new())
@ -236,14 +227,12 @@ impl<'t> CriteriaBuilder<'t> {
pub fn resolve_query_tree<'t>( pub fn resolve_query_tree<'t>(
ctx: &'t dyn Context, ctx: &'t dyn Context,
query_tree: &Operation, query_tree: &Operation,
cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
wdcache: &mut WordDerivationsCache, wdcache: &mut WordDerivationsCache,
) -> anyhow::Result<RoaringBitmap> ) -> anyhow::Result<RoaringBitmap>
{ {
fn resolve_operation<'t>( fn resolve_operation<'t>(
ctx: &'t dyn Context, ctx: &'t dyn Context,
query_tree: &Operation, query_tree: &Operation,
cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
wdcache: &mut WordDerivationsCache, wdcache: &mut WordDerivationsCache,
) -> anyhow::Result<RoaringBitmap> ) -> anyhow::Result<RoaringBitmap>
{ {
@ -252,7 +241,7 @@ pub fn resolve_query_tree<'t>(
match query_tree { match query_tree {
And(ops) => { And(ops) => {
let mut ops = ops.iter().map(|op| { let mut ops = ops.iter().map(|op| {
resolve_operation(ctx, op, cache, wdcache) resolve_operation(ctx, op, wdcache)
}).collect::<anyhow::Result<Vec<_>>>()?; }).collect::<anyhow::Result<Vec<_>>>()?;
ops.sort_unstable_by_key(|cds| cds.len()); ops.sort_unstable_by_key(|cds| cds.len());
@ -296,7 +285,7 @@ pub fn resolve_query_tree<'t>(
Or(_, ops) => { Or(_, ops) => {
let mut candidates = RoaringBitmap::new(); let mut candidates = RoaringBitmap::new();
for op in ops { for op in ops {
let docids = resolve_operation(ctx, op, cache, wdcache)?; let docids = resolve_operation(ctx, op, wdcache)?;
candidates.union_with(&docids); candidates.union_with(&docids);
} }
Ok(candidates) Ok(candidates)
@ -305,7 +294,7 @@ pub fn resolve_query_tree<'t>(
} }
} }
resolve_operation(ctx, query_tree, cache, wdcache) resolve_operation(ctx, query_tree, wdcache)
} }

View File

@ -30,8 +30,8 @@ const PROXIMITY_THRESHOLD: u8 = 0;
pub struct Proximity<'t> { pub struct Proximity<'t> {
ctx: &'t dyn Context<'t>, ctx: &'t dyn Context<'t>,
/// ((max_proximity, query_tree), allowed_candidates) /// (max_proximity, query_tree, allowed_candidates)
state: Option<(Option<(usize, Operation)>, RoaringBitmap)>, state: Option<(u8, Operation, RoaringBitmap)>,
proximity: u8, proximity: u8,
bucket_candidates: RoaringBitmap, bucket_candidates: RoaringBitmap,
parent: Box<dyn Criterion + 't>, parent: Box<dyn Criterion + 't>,
@ -57,114 +57,90 @@ impl<'t> Criterion for Proximity<'t> {
#[logging_timer::time("Proximity::{}")] #[logging_timer::time("Proximity::{}")]
fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result<Option<CriterionResult>> { fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result<Option<CriterionResult>> {
// remove excluded candidates when next is called, instead of doing it in the loop. // remove excluded candidates when next is called, instead of doing it in the loop.
if let Some((_, candidates)) = self.state.as_mut() { if let Some((_, _, allowed_candidates)) = self.state.as_mut() {
*candidates -= params.excluded_candidates; *allowed_candidates -= params.excluded_candidates;
} }
loop { loop {
debug!("Proximity at iteration {} (max prox {:?}) ({:?})", debug!("Proximity at iteration {} (max prox {:?}) ({:?})",
self.proximity, self.proximity,
self.state.as_ref().map(|(qt, _)| qt.as_ref().map(|(mp, _)| mp)), self.state.as_ref().map(|(mp, _, _)| mp),
self.state.as_ref().map(|(_, cd)| cd), self.state.as_ref().map(|(_, _, cd)| cd),
); );
match &mut self.state { match &mut self.state {
Some((_, candidates)) if candidates.is_empty() => { Some((max_prox, _, allowed_candidates)) if allowed_candidates.is_empty() || self.proximity > *max_prox => {
self.state = None; // reset state self.state = None; // reset state
}, },
Some((Some((max_prox, query_tree)), candidates)) => { Some((_, query_tree, allowed_candidates)) => {
if self.proximity as usize > *max_prox { let mut new_candidates = if allowed_candidates.len() <= CANDIDATES_THRESHOLD && self.proximity > PROXIMITY_THRESHOLD {
self.state = None; // reset state if let Some(cache) = self.plane_sweep_cache.as_mut() {
} else { match cache.next() {
let mut new_candidates = if candidates.len() <= CANDIDATES_THRESHOLD && self.proximity > PROXIMITY_THRESHOLD { Some((p, candidates)) => {
if let Some(cache) = self.plane_sweep_cache.as_mut() { self.proximity = p;
match cache.next() { candidates
Some((p, candidates)) => { },
self.proximity = p; None => {
candidates self.state = None; // reset state
}, continue
None => { },
self.state = None; // reset state
continue
},
}
} else {
let cache = resolve_plane_sweep_candidates(
self.ctx,
query_tree,
candidates,
params.wdcache,
)?;
self.plane_sweep_cache = Some(cache.into_iter());
continue
} }
} else { // use set theory based algorithm } else {
resolve_candidates( let cache = resolve_plane_sweep_candidates(
self.ctx, self.ctx,
&query_tree, query_tree,
self.proximity, allowed_candidates,
&mut self.candidates_cache, params.wdcache,
params.wdcache, )?;
)? self.plane_sweep_cache = Some(cache.into_iter());
};
new_candidates.intersect_with(&candidates); continue
candidates.difference_with(&new_candidates); }
self.proximity += 1; } else { // use set theory based algorithm
resolve_candidates(
self.ctx,
&query_tree,
self.proximity,
&mut self.candidates_cache,
params.wdcache,
)?
};
new_candidates &= &*allowed_candidates;
*allowed_candidates -= &new_candidates;
self.proximity += 1;
return Ok(Some(CriterionResult {
query_tree: Some(query_tree.clone()),
candidates: Some(new_candidates),
bucket_candidates: take(&mut self.bucket_candidates),
}));
}
},
Some((None, candidates)) => {
let candidates = take(candidates);
self.state = None; // reset state
return Ok(Some(CriterionResult { return Ok(Some(CriterionResult {
query_tree: None, query_tree: Some(query_tree.clone()),
candidates: Some(candidates.clone()), candidates: Some(new_candidates),
bucket_candidates: candidates, bucket_candidates: Some(take(&mut self.bucket_candidates)),
})); }));
}, },
None => { None => {
match self.parent.next(params)? { match self.parent.next(params)? {
Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { Some(CriterionResult { query_tree: Some(query_tree), candidates, bucket_candidates }) => {
return Ok(Some(CriterionResult { let candidates = match candidates {
query_tree: None, Some(candidates) => candidates,
candidates: None, None => resolve_query_tree(self.ctx, &query_tree, params.wdcache)? - params.excluded_candidates,
bucket_candidates,
}));
},
Some(CriterionResult { query_tree, candidates, bucket_candidates }) => {
let candidates_is_some = candidates.is_some();
let candidates = match (&query_tree, candidates) {
(_, Some(candidates)) => candidates,
(Some(qt), None) => {
let candidates = resolve_query_tree(self.ctx, qt, &mut HashMap::new(), params.wdcache)?;
candidates - params.excluded_candidates
},
(None, None) => RoaringBitmap::new(),
}; };
// If our parent returns candidates it means that the bucket match bucket_candidates {
// candidates were already computed before and we can use them. Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates,
// None => self.bucket_candidates |= &candidates,
// If not, we must use the just computed candidates as our bucket
// candidates.
if candidates_is_some {
self.bucket_candidates.union_with(&bucket_candidates);
} else {
self.bucket_candidates.union_with(&candidates);
} }
let query_tree = query_tree.map(|op| (maximum_proximity(&op), op)); let maximum_proximity = maximum_proximity(&query_tree);
self.state = Some((query_tree, candidates)); self.state = Some((maximum_proximity as u8, query_tree, candidates));
self.proximity = 0; self.proximity = 0;
self.plane_sweep_cache = None; self.plane_sweep_cache = None;
}, },
Some(CriterionResult { query_tree: None, candidates, bucket_candidates }) => {
return Ok(Some(CriterionResult {
query_tree: None,
candidates,
bucket_candidates,
}));
},
None => return Ok(None), None => return Ok(None),
} }
}, },

View File

@ -13,15 +13,19 @@ use super::{
CriterionParameters, CriterionParameters,
CriterionResult, CriterionResult,
query_docids, query_docids,
query_pair_proximity_docids query_pair_proximity_docids,
resolve_query_tree,
}; };
/// Maximum number of typos for a word of any length.
const MAX_TYPOS_PER_WORD: u8 = 2;
pub struct Typo<'t> { pub struct Typo<'t> {
ctx: &'t dyn Context<'t>, ctx: &'t dyn Context<'t>,
query_tree: Option<(usize, Operation)>, /// (max_typos, query_tree, candidates)
number_typos: u8, state: Option<(u8, Operation, Candidates)>,
candidates: Candidates, typos: u8,
bucket_candidates: RoaringBitmap, bucket_candidates: Option<RoaringBitmap>,
parent: Box<dyn Criterion + 't>, parent: Box<dyn Criterion + 't>,
candidates_cache: HashMap<(Operation, u8), RoaringBitmap>, candidates_cache: HashMap<(Operation, u8), RoaringBitmap>,
} }
@ -30,10 +34,9 @@ impl<'t> Typo<'t> {
pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Self { pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Self {
Typo { Typo {
ctx, ctx,
query_tree: None, state: None,
number_typos: 0, typos: 0,
candidates: Candidates::default(), bucket_candidates: None,
bucket_candidates: RoaringBitmap::new(),
parent, parent,
candidates_cache: HashMap::new(), candidates_cache: HashMap::new(),
} }
@ -45,113 +48,101 @@ impl<'t> Criterion for Typo<'t> {
fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result<Option<CriterionResult>> { fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result<Option<CriterionResult>> {
use Candidates::{Allowed, Forbidden}; use Candidates::{Allowed, Forbidden};
// remove excluded candidates when next is called, instead of doing it in the loop. // remove excluded candidates when next is called, instead of doing it in the loop.
match &mut self.candidates { match self.state.as_mut() {
Allowed(candidates) => *candidates -= params.excluded_candidates, Some((_, _, Allowed(candidates))) => *candidates -= params.excluded_candidates,
Forbidden(candidates) => *candidates |= params.excluded_candidates, Some((_, _, Forbidden(candidates))) => *candidates |= params.excluded_candidates,
None => (),
} }
loop { loop {
debug!("Typo at iteration {} ({:?})", self.number_typos, self.candidates); debug!("Typo at iteration {} (max typos {:?}) ({:?})",
self.typos,
self.state.as_ref().map(|(mt, _, _)| mt),
self.state.as_ref().map(|(_, _, cd)| cd),
);
match (&mut self.query_tree, &mut self.candidates) { match self.state.as_mut() {
(_, Allowed(candidates)) if candidates.is_empty() => { Some((max_typos, _, _)) if self.typos > *max_typos => {
return Ok(Some(CriterionResult { self.state = None; // reset state
query_tree: self.query_tree.take().map(|(_, qt)| qt),
candidates: Some(take(&mut self.candidates).into_inner()),
bucket_candidates: take(&mut self.bucket_candidates),
}));
}, },
(Some((max_typos, query_tree)), Allowed(candidates)) => { Some((_, _, Allowed(allowed_candidates))) if allowed_candidates.is_empty() => {
if self.number_typos as usize > *max_typos { self.state = None; // reset state
self.query_tree = None; },
self.candidates = Candidates::default(); Some((_, query_tree, candidates_authorization)) => {
} else { let fst = self.ctx.words_fst();
let fst = self.ctx.words_fst(); let new_query_tree = if self.typos < MAX_TYPOS_PER_WORD {
let new_query_tree = if self.number_typos < 2 { alterate_query_tree(&fst, query_tree.clone(), self.typos, params.wdcache)?
alterate_query_tree(&fst, query_tree.clone(), self.number_typos, params.wdcache)? } else if self.typos == MAX_TYPOS_PER_WORD {
} else if self.number_typos == 2 { // When typos >= MAX_TYPOS_PER_WORD, no more alteration of the query tree is possible,
*query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, params.wdcache)?; // we keep the altered query tree
query_tree.clone() *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.typos, params.wdcache)?;
} else { // we compute the allowed candidates
query_tree.clone() let query_tree_allowed_candidates = resolve_query_tree(self.ctx, query_tree, params.wdcache)?;
// we assign the allowed candidates to the candidates authorization.
*candidates_authorization = match take(candidates_authorization) {
Allowed(allowed_candidates) => Allowed(query_tree_allowed_candidates & allowed_candidates),
Forbidden(forbidden_candidates) => Allowed(query_tree_allowed_candidates - forbidden_candidates),
}; };
query_tree.clone()
let mut new_candidates = resolve_candidates(
self.ctx,
&new_query_tree,
self.number_typos,
&mut self.candidates_cache,
params.wdcache,
)?;
new_candidates.intersect_with(&candidates);
candidates.difference_with(&new_candidates);
self.number_typos += 1;
return Ok(Some(CriterionResult {
query_tree: Some(new_query_tree),
candidates: Some(new_candidates),
bucket_candidates: take(&mut self.bucket_candidates),
}));
}
},
(Some((max_typos, query_tree)), Forbidden(candidates)) => {
if self.number_typos as usize > *max_typos {
self.query_tree = None;
self.candidates = Candidates::default();
} else { } else {
let fst = self.ctx.words_fst(); query_tree.clone()
let new_query_tree = if self.number_typos < 2 { };
alterate_query_tree(&fst, query_tree.clone(), self.number_typos, params.wdcache)?
} else if self.number_typos == 2 {
*query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, params.wdcache)?;
query_tree.clone()
} else {
query_tree.clone()
};
let mut new_candidates = resolve_candidates( let mut candidates = resolve_candidates(
self.ctx, self.ctx,
&new_query_tree, &new_query_tree,
self.number_typos, self.typos,
&mut self.candidates_cache, &mut self.candidates_cache,
params.wdcache, params.wdcache,
)?; )?;
new_candidates.difference_with(&candidates);
candidates.union_with(&new_candidates);
self.number_typos += 1;
self.bucket_candidates.union_with(&new_candidates);
return Ok(Some(CriterionResult { match candidates_authorization {
query_tree: Some(new_query_tree), Allowed(allowed_candidates) => {
candidates: Some(new_candidates), candidates &= &*allowed_candidates;
bucket_candidates: take(&mut self.bucket_candidates), *allowed_candidates -= &candidates;
}));
}
},
(None, Allowed(_)) => {
let candidates = take(&mut self.candidates).into_inner();
return Ok(Some(CriterionResult {
query_tree: None,
candidates: Some(candidates.clone()),
bucket_candidates: candidates,
}));
},
(None, Forbidden(_)) => {
match self.parent.next(params)? {
Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => {
return Ok(Some(CriterionResult {
query_tree: None,
candidates: None,
bucket_candidates,
}));
}, },
Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { Forbidden(forbidden_candidates) => {
self.query_tree = query_tree.map(|op| (maximum_typo(&op), op)); candidates -= &*forbidden_candidates;
self.number_typos = 0; *forbidden_candidates |= &candidates;
self.candidates = candidates.map_or_else(|| { },
}
let bucket_candidates = match self.bucket_candidates.as_mut() {
Some(bucket_candidates) => take(bucket_candidates),
None => candidates.clone(),
};
self.typos += 1;
return Ok(Some(CriterionResult {
query_tree: Some(new_query_tree),
candidates: Some(candidates),
bucket_candidates: Some(bucket_candidates),
}));
},
None => {
match self.parent.next(params)? {
Some(CriterionResult { query_tree: Some(query_tree), candidates, bucket_candidates }) => {
self.bucket_candidates = match (self.bucket_candidates.take(), bucket_candidates) {
(Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc),
(self_bc, parent_bc) => self_bc.or(parent_bc),
};
let candidates = candidates.map_or_else(|| {
Candidates::Forbidden(params.excluded_candidates.clone()) Candidates::Forbidden(params.excluded_candidates.clone())
}, Candidates::Allowed); }, Candidates::Allowed);
self.bucket_candidates.union_with(&bucket_candidates);
let maximum_typos = maximum_typo(&query_tree) as u8;
self.state = Some((maximum_typos, query_tree, candidates));
self.typos = 0;
},
Some(CriterionResult { query_tree: None, candidates, bucket_candidates }) => {
return Ok(Some(CriterionResult {
query_tree: None,
candidates,
bucket_candidates,
}));
}, },
None => return Ok(None), None => return Ok(None),
} }
@ -185,7 +176,6 @@ fn alterate_query_tree(
ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, wdcache)) ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, wdcache))
}, },
Operation::Query(q) => { Operation::Query(q) => {
// TODO may be optimized when number_typos == 0
if let QueryKind::Tolerant { typo, word } = &q.kind { if let QueryKind::Tolerant { typo, word } = &q.kind {
// if no typo is allowed we don't call word_derivations function, // if no typo is allowed we don't call word_derivations function,
// and directly create an Exact query // and directly create an Exact query
@ -384,7 +374,7 @@ mod test {
]), ]),
])), ])),
candidates: Some(candidates_1.clone()), candidates: Some(candidates_1.clone()),
bucket_candidates: candidates_1, bucket_candidates: Some(candidates_1),
}; };
assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1)); assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1));
@ -406,7 +396,7 @@ mod test {
]), ]),
])), ])),
candidates: Some(candidates_2.clone()), candidates: Some(candidates_2.clone()),
bucket_candidates: candidates_2, bucket_candidates: Some(candidates_2),
}; };
assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_2)); assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_2));
@ -428,7 +418,7 @@ mod test {
let expected = CriterionResult { let expected = CriterionResult {
query_tree: None, query_tree: None,
candidates: Some(facet_candidates.clone()), candidates: Some(facet_candidates.clone()),
bucket_candidates: facet_candidates, bucket_candidates: None,
}; };
// first iteration, returns the facet candidates // first iteration, returns the facet candidates
@ -471,7 +461,7 @@ mod test {
]), ]),
])), ])),
candidates: Some(&candidates_1 & &facet_candidates), candidates: Some(&candidates_1 & &facet_candidates),
bucket_candidates: facet_candidates.clone(), bucket_candidates: Some(&candidates_1 & &facet_candidates),
}; };
assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1)); assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_1));
@ -493,7 +483,7 @@ mod test {
]), ]),
])), ])),
candidates: Some(&candidates_2 & &facet_candidates), candidates: Some(&candidates_2 & &facet_candidates),
bucket_candidates: RoaringBitmap::new(), bucket_candidates: Some(&candidates_2 & &facet_candidates),
}; };
assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_2)); assert_eq!(criteria.next(&mut criterion_parameters).unwrap(), Some(expected_2));

View File

@ -1,4 +1,3 @@
use std::collections::HashMap;
use std::mem::take; use std::mem::take;
use log::debug; use log::debug;
@ -11,9 +10,9 @@ pub struct Words<'t> {
ctx: &'t dyn Context<'t>, ctx: &'t dyn Context<'t>,
query_trees: Vec<Operation>, query_trees: Vec<Operation>,
candidates: Option<RoaringBitmap>, candidates: Option<RoaringBitmap>,
bucket_candidates: RoaringBitmap, bucket_candidates: Option<RoaringBitmap>,
parent: Box<dyn Criterion + 't>, parent: Box<dyn Criterion + 't>,
candidates_cache: HashMap<(Operation, u8), RoaringBitmap>, compute_candidates: bool,
} }
impl<'t> Words<'t> { impl<'t> Words<'t> {
@ -22,9 +21,9 @@ impl<'t> Words<'t> {
ctx, ctx,
query_trees: Vec::default(), query_trees: Vec::default(),
candidates: None, candidates: None,
bucket_candidates: RoaringBitmap::new(), bucket_candidates: None,
parent, parent,
candidates_cache: HashMap::default(), compute_candidates: false,
} }
} }
} }
@ -40,55 +39,48 @@ impl<'t> Criterion for Words<'t> {
loop { loop {
debug!("Words at iteration {} ({:?})", self.query_trees.len(), self.candidates); debug!("Words at iteration {} ({:?})", self.query_trees.len(), self.candidates);
match (self.query_trees.pop(), &mut self.candidates) { match self.query_trees.pop() {
(query_tree, Some(candidates)) if candidates.is_empty() => { Some(query_tree) => {
self.query_trees = Vec::new(); let candidates = match self.candidates.as_mut() {
return Ok(Some(CriterionResult { Some(allowed_candidates) if self.compute_candidates => {
query_tree, let mut candidates = resolve_query_tree(self.ctx, &query_tree, params.wdcache)?;
candidates: self.candidates.take(), candidates &= &*allowed_candidates;
bucket_candidates: take(&mut self.bucket_candidates), *allowed_candidates -= &candidates;
})); Some(candidates)
}, },
(Some(qt), Some(candidates)) => { candidates => candidates.cloned(),
let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache, params.wdcache)?; };
found_candidates.intersect_with(&candidates);
candidates.difference_with(&found_candidates); let bucket_candidates = match self.bucket_candidates.as_mut() {
Some(bucket_candidates) => Some(take(bucket_candidates)),
None => None,
};
return Ok(Some(CriterionResult { return Ok(Some(CriterionResult {
query_tree: Some(qt), query_tree: Some(query_tree),
candidates: Some(found_candidates), candidates,
bucket_candidates: take(&mut self.bucket_candidates), bucket_candidates,
})); }));
}, },
(Some(qt), None) => { None => {
return Ok(Some(CriterionResult {
query_tree: Some(qt),
candidates: None,
bucket_candidates: take(&mut self.bucket_candidates),
}));
},
(None, Some(_)) => {
let candidates = self.candidates.take();
return Ok(Some(CriterionResult {
query_tree: None,
candidates: candidates.clone(),
bucket_candidates: candidates.unwrap_or_default(),
}));
},
(None, None) => {
match self.parent.next(params)? { match self.parent.next(params)? {
Some(CriterionResult { query_tree: None, candidates: None, bucket_candidates }) => { Some(CriterionResult { query_tree: Some(query_tree), candidates, bucket_candidates }) => {
self.query_trees = explode_query_tree(query_tree);
self.candidates = candidates;
self.compute_candidates = bucket_candidates.is_some();
self.bucket_candidates = match (self.bucket_candidates.take(), bucket_candidates) {
(Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc),
(self_bc, parent_bc) => self_bc.or(parent_bc),
};
},
Some(CriterionResult { query_tree: None, candidates, bucket_candidates }) => {
return Ok(Some(CriterionResult { return Ok(Some(CriterionResult {
query_tree: None, query_tree: None,
candidates: None, candidates,
bucket_candidates, bucket_candidates,
})); }));
}, },
Some(CriterionResult { query_tree, candidates, bucket_candidates }) => {
self.query_trees = query_tree.map(explode_query_tree).unwrap_or_default();
self.candidates = candidates;
self.bucket_candidates.union_with(&bucket_candidates);
},
None => return Ok(None), None => return Ok(None),
} }
}, },