From 62a70c300d35f569186a05bfe7b91338113a5d9f Mon Sep 17 00:00:00 2001 From: many Date: Tue, 9 Mar 2021 12:04:52 +0100 Subject: [PATCH 1/6] Optimize words criterion --- milli/src/search/criteria/asc_desc.rs | 20 +++++++++--- milli/src/search/criteria/fetcher.rs | 42 +++++++++++++++++++------- milli/src/search/criteria/mod.rs | 17 ++++++----- milli/src/search/criteria/proximity.rs | 16 +++++++--- milli/src/search/criteria/typo.rs | 20 ++++++------ milli/src/search/criteria/words.rs | 37 ++++++++++------------- milli/src/search/mod.rs | 5 ++- 7 files changed, 95 insertions(+), 62 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 29fe26d7e..5b2ec32e8 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -160,9 +160,21 @@ impl<'t> Criterion for AscDesc<'t> { match self.parent.as_mut() { Some(parent) => { match parent.next(wdcache)? { - Some(CriterionResult { query_tree, mut candidates, bucket_candidates }) => { + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { self.query_tree = query_tree; - candidates.intersect_with(&self.faceted_candidates); + let candidates = match (&self.query_tree, candidates) { + (_, Some(mut candidates)) => { + candidates.intersect_with(&self.faceted_candidates); + candidates + }, + (Some(qt), None) => { + let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; + let mut candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), wdcache)?; + candidates.intersect_with(&self.faceted_candidates); + candidates + }, + (None, None) => take(&mut self.faceted_candidates), + }; self.candidates = facet_ordered( self.index, self.rtxn, @@ -183,7 +195,7 @@ impl<'t> Criterion for AscDesc<'t> { return Ok(Some(CriterionResult { query_tree, - candidates: RoaringBitmap::new(), + candidates: Some(RoaringBitmap::new()), bucket_candidates, })); }, @@ -195,7 +207,7 @@ impl<'t> Criterion for AscDesc<'t> { return Ok(Some(CriterionResult { query_tree: self.query_tree.clone(), - candidates, + candidates: Some(candidates), bucket_candidates, })); }, diff --git a/milli/src/search/criteria/fetcher.rs b/milli/src/search/criteria/fetcher.rs index 094efe75e..723b5a13a 100644 --- a/milli/src/search/criteria/fetcher.rs +++ b/milli/src/search/criteria/fetcher.rs @@ -8,12 +8,24 @@ use crate::search::query_tree::Operation; use crate::search::WordDerivationsCache; use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context}; +/// The result of a call to the parent criterion. +#[derive(Debug, Clone, PartialEq)] +pub struct FetcherResult { + /// The query tree that must be used by the children criterion to fetch candidates. + pub query_tree: Option, + /// The candidates that this criterion is allowed to return subsets of. + pub candidates: RoaringBitmap, + /// Candidates that comes from the current bucket of the initial criterion. + pub bucket_candidates: RoaringBitmap, +} + pub struct Fetcher<'t> { ctx: &'t dyn Context, query_tree: Option, candidates: Candidates, parent: Option>, should_get_documents_ids: bool, + wdcache: WordDerivationsCache, } impl<'t> Fetcher<'t> { @@ -29,6 +41,7 @@ impl<'t> Fetcher<'t> { candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), parent: None, should_get_documents_ids: true, + wdcache: WordDerivationsCache::new(), } } @@ -43,13 +56,12 @@ impl<'t> Fetcher<'t> { candidates: Candidates::default(), parent: Some(parent), should_get_documents_ids: true, + wdcache: WordDerivationsCache::new(), } } -} -impl<'t> Criterion for Fetcher<'t> { #[logging_timer::time("Fetcher::{}")] - fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { + pub fn next(&mut self) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; loop { debug!("Fetcher iteration (should_get_documents_ids: {}) ({:?})", @@ -62,14 +74,14 @@ impl<'t> Criterion for Fetcher<'t> { let candidates = take(&mut self.candidates).into_inner(); let candidates = match &self.query_tree { Some(qt) if should_get_documents_ids => { - let mut docids = resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), wdcache)?; + let mut docids = resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), &mut self.wdcache)?; docids.intersect_with(&candidates); docids }, _ => candidates, }; - return Ok(Some(CriterionResult { + return Ok(Some(FetcherResult { query_tree: self.query_tree.take(), candidates: candidates.clone(), bucket_candidates: candidates, @@ -78,15 +90,23 @@ impl<'t> Criterion for Fetcher<'t> { Forbidden(_) => { match self.parent.as_mut() { Some(parent) => { - match parent.next(wdcache)? { - Some(result) => return Ok(Some(result)), + match parent.next(&mut self.wdcache)? { + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + let candidates = match (&query_tree, candidates) { + (_, Some(candidates)) => candidates, + (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), &mut self.wdcache)?, + (None, None) => RoaringBitmap::new(), + }; + + return Ok(Some(FetcherResult { query_tree, candidates, bucket_candidates })) + }, None => if should_get_documents_ids { let candidates = match &self.query_tree { - Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), wdcache)?, + Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), &mut self.wdcache)?, None => self.ctx.documents_ids()?, }; - return Ok(Some(CriterionResult { + return Ok(Some(FetcherResult { query_tree: self.query_tree.clone(), candidates: candidates.clone(), bucket_candidates: candidates, @@ -96,11 +116,11 @@ impl<'t> Criterion for Fetcher<'t> { }, None => if should_get_documents_ids { let candidates = match &self.query_tree { - Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), wdcache)?, + Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), &mut self.wdcache)?, None => self.ctx.documents_ids()?, }; - return Ok(Some(CriterionResult { + return Ok(Some(FetcherResult { query_tree: self.query_tree.clone(), candidates: candidates.clone(), bucket_candidates: candidates, diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index d70942c1c..b2fd7803d 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -14,10 +14,10 @@ use self::asc_desc::AscDesc; use self::proximity::Proximity; use self::fetcher::Fetcher; -pub mod typo; -pub mod words; -pub mod asc_desc; -pub mod proximity; +mod typo; +mod words; +mod asc_desc; +mod proximity; pub mod fetcher; pub trait Criterion { @@ -28,11 +28,12 @@ pub trait Criterion { #[derive(Debug, Clone, PartialEq)] pub struct CriterionResult { /// The query tree that must be used by the children criterion to fetch candidates. - pub query_tree: Option, - /// The candidates that this criterion is allowed to return subsets of. - pub candidates: RoaringBitmap, + query_tree: Option, + /// The candidates that this criterion is allowed to return subsets of, + /// if None, it is up to the child to compute the candidates itself. + candidates: Option, /// Candidates that comes from the current bucket of the initial criterion. - pub bucket_candidates: RoaringBitmap, + bucket_candidates: RoaringBitmap, } /// Either a set of candidates that defines the candidates diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 5b14f699c..cb4fd257b 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -9,7 +9,7 @@ use log::debug; use crate::{DocumentId, Position, search::{query_tree::QueryKind, word_derivations}}; use crate::search::query_tree::{maximum_proximity, Operation, Query}; use crate::search::WordDerivationsCache; -use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; +use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree}; pub struct Proximity<'t> { ctx: &'t dyn Context, @@ -70,7 +70,7 @@ impl<'t> Criterion for Proximity<'t> { (_, Allowed(candidates)) if candidates.is_empty() => { return Ok(Some(CriterionResult { query_tree: self.query_tree.take().map(|(_, qt)| qt), - candidates: take(&mut self.candidates).into_inner(), + candidates: Some(take(&mut self.candidates).into_inner()), bucket_candidates: take(&mut self.bucket_candidates), })); }, @@ -126,7 +126,7 @@ impl<'t> Criterion for Proximity<'t> { return Ok(Some(CriterionResult { query_tree: Some(query_tree.clone()), - candidates: new_candidates, + candidates: Some(new_candidates), bucket_candidates, })); } @@ -155,7 +155,7 @@ impl<'t> Criterion for Proximity<'t> { return Ok(Some(CriterionResult { query_tree: Some(query_tree.clone()), - candidates: new_candidates, + candidates: Some(new_candidates), bucket_candidates, })); } @@ -164,7 +164,7 @@ impl<'t> Criterion for Proximity<'t> { let candidates = take(&mut self.candidates).into_inner(); return Ok(Some(CriterionResult { query_tree: None, - candidates: candidates.clone(), + candidates: Some(candidates.clone()), bucket_candidates: candidates, })); }, @@ -173,6 +173,12 @@ impl<'t> Criterion for Proximity<'t> { Some(parent) => { match parent.next(wdcache)? { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + let candidates = match (&query_tree, candidates) { + (_, Some(candidates)) => candidates, + (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), wdcache)?, + (None, None) => RoaringBitmap::new(), + }; + self.query_tree = query_tree.map(|op| (maximum_proximity(&op), op)); self.proximity = 0; self.candidates = Candidates::Allowed(candidates); diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 4cc0015da..8bead4661 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -63,7 +63,7 @@ impl<'t> Criterion for Typo<'t> { (_, Allowed(candidates)) if candidates.is_empty() => { return Ok(Some(CriterionResult { query_tree: self.query_tree.take().map(|(_, qt)| qt), - candidates: take(&mut self.candidates).into_inner(), + candidates: Some(take(&mut self.candidates).into_inner()), bucket_candidates: take(&mut self.bucket_candidates), })); }, @@ -100,7 +100,7 @@ impl<'t> Criterion for Typo<'t> { return Ok(Some(CriterionResult { query_tree: Some(new_query_tree), - candidates: new_candidates, + candidates: Some(new_candidates), bucket_candidates, })); } @@ -138,7 +138,7 @@ impl<'t> Criterion for Typo<'t> { return Ok(Some(CriterionResult { query_tree: Some(new_query_tree), - candidates: new_candidates, + candidates: Some(new_candidates), bucket_candidates, })); } @@ -147,7 +147,7 @@ impl<'t> Criterion for Typo<'t> { let candidates = take(&mut self.candidates).into_inner(); return Ok(Some(CriterionResult { query_tree: None, - candidates: candidates.clone(), + candidates: Some(candidates.clone()), bucket_candidates: candidates, })); }, @@ -158,7 +158,7 @@ impl<'t> Criterion for Typo<'t> { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { self.query_tree = query_tree.map(|op| (maximum_typo(&op), op)); self.number_typos = 0; - self.candidates = Candidates::Allowed(candidates); + self.candidates = candidates.map_or_else(Candidates::default, Candidates::Allowed); self.bucket_candidates.union_with(&bucket_candidates); }, None => return Ok(None), @@ -394,7 +394,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }), ]), ])), - candidates: candidates_1.clone(), + candidates: Some(candidates_1.clone()), bucket_candidates: candidates_1, }; @@ -416,7 +416,7 @@ mod test { ]), ]), ])), - candidates: candidates_2.clone(), + candidates: Some(candidates_2.clone()), bucket_candidates: candidates_2, }; @@ -434,7 +434,7 @@ mod test { let expected = CriterionResult { query_tree: None, - candidates: facet_candidates.clone(), + candidates: Some(facet_candidates.clone()), bucket_candidates: facet_candidates, }; @@ -472,7 +472,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }), ]), ])), - candidates: &candidates_1 & &facet_candidates, + candidates: Some(&candidates_1 & &facet_candidates), bucket_candidates: candidates_1 & &facet_candidates, }; @@ -494,7 +494,7 @@ mod test { ]), ]), ])), - candidates: &candidates_2 & &facet_candidates, + candidates: Some(&candidates_2 & &facet_candidates), bucket_candidates: candidates_2 & &facet_candidates, }; diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index d94fd0c53..8774eed7c 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -6,12 +6,12 @@ use roaring::RoaringBitmap; use crate::search::query_tree::Operation; use crate::search::WordDerivationsCache; -use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context}; +use super::{resolve_query_tree, Criterion, CriterionResult, Context}; pub struct Words<'t> { ctx: &'t dyn Context, query_trees: Vec, - candidates: Candidates, + candidates: Option, bucket_candidates: RoaringBitmap, parent: Option>, candidates_cache: HashMap<(Operation, u8), RoaringBitmap>, @@ -27,7 +27,7 @@ impl<'t> Words<'t> { Words { ctx, query_trees: query_tree.map(explode_query_tree).unwrap_or_default(), - candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), + candidates, bucket_candidates: RoaringBitmap::new(), parent: None, candidates_cache: HashMap::default(), @@ -38,7 +38,7 @@ impl<'t> Words<'t> { Words { ctx, query_trees: Vec::default(), - candidates: Candidates::default(), + candidates: None, bucket_candidates: RoaringBitmap::new(), parent: Some(parent), candidates_cache: HashMap::default(), @@ -49,20 +49,19 @@ impl<'t> Words<'t> { impl<'t> Criterion for Words<'t> { #[logging_timer::time("Words::{}")] fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { - use Candidates::{Allowed, Forbidden}; loop { debug!("Words at iteration {} ({:?})", self.query_trees.len(), self.candidates); match (self.query_trees.pop(), &mut self.candidates) { - (query_tree, Allowed(candidates)) if candidates.is_empty() => { + (query_tree, Some(candidates)) if candidates.is_empty() => { self.query_trees = Vec::new(); return Ok(Some(CriterionResult { query_tree, - candidates: take(&mut self.candidates).into_inner(), + candidates: self.candidates.take(), bucket_candidates: take(&mut self.bucket_candidates), })); }, - (Some(qt), Allowed(candidates)) => { + (Some(qt), Some(candidates)) => { let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache, wdcache)?; found_candidates.intersect_with(&candidates); candidates.difference_with(&found_candidates); @@ -74,41 +73,37 @@ impl<'t> Criterion for Words<'t> { return Ok(Some(CriterionResult { query_tree: Some(qt), - candidates: found_candidates, + candidates: Some(found_candidates), bucket_candidates, })); }, - (Some(qt), Forbidden(candidates)) => { - let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache, wdcache)?; - found_candidates.difference_with(&candidates); - candidates.union_with(&found_candidates); - + (Some(qt), None) => { let bucket_candidates = match self.parent { Some(_) => take(&mut self.bucket_candidates), - None => found_candidates.clone(), + None => RoaringBitmap::new(), }; return Ok(Some(CriterionResult { query_tree: Some(qt), - candidates: found_candidates, + candidates: None, bucket_candidates, })); }, - (None, Allowed(_)) => { - let candidates = take(&mut self.candidates).into_inner(); + (None, Some(_)) => { + let candidates = self.candidates.take(); return Ok(Some(CriterionResult { query_tree: None, candidates: candidates.clone(), - bucket_candidates: candidates, + bucket_candidates: candidates.unwrap_or_default(), })); }, - (None, Forbidden(_)) => { + (None, None) => { match self.parent.as_mut() { Some(parent) => { match parent.next(wdcache)? { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { self.query_trees = query_tree.map(explode_query_tree).unwrap_or_default(); - self.candidates = Candidates::Allowed(candidates); + self.candidates = candidates; self.bucket_candidates.union_with(&bucket_candidates); }, None => return Ok(None), diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 34b3ffec9..7475ef473 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -11,7 +11,7 @@ use meilisearch_tokenizer::{AnalyzerConfig, Analyzer}; use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; -use crate::search::criteria::{Criterion, CriterionResult}; +use crate::search::criteria::fetcher::FetcherResult; use crate::{Index, DocumentId}; pub use self::facet::FacetIter; @@ -99,9 +99,8 @@ impl<'a> Search<'a> { let mut offset = self.offset; let mut limit = self.limit; let mut documents_ids = Vec::new(); - let mut words_derivations_cache = WordDerivationsCache::new(); let mut initial_candidates = RoaringBitmap::new(); - while let Some(CriterionResult { candidates, bucket_candidates, .. }) = criteria.next(&mut words_derivations_cache)? { + while let Some(FetcherResult { candidates, bucket_candidates, .. }) = criteria.next()? { debug!("Number of candidates found {}", candidates.len()); From 42fd7dea78aa1903da70afb365f5fbcacb653699 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 9 Mar 2021 15:18:30 +0100 Subject: [PATCH 2/6] Remove the useless typo cache --- milli/src/search/criteria/typo.rs | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 8bead4661..5acc7a048 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -16,7 +16,6 @@ pub struct Typo<'t> { bucket_candidates: RoaringBitmap, parent: Option>, candidates_cache: HashMap<(Operation, u8), RoaringBitmap>, - typo_cache: HashMap<(String, bool, u8), Vec<(String, u8)>>, } impl<'t> Typo<'t> { @@ -34,7 +33,6 @@ impl<'t> Typo<'t> { bucket_candidates: RoaringBitmap::new(), parent: None, candidates_cache: HashMap::new(), - typo_cache: HashMap::new(), } } @@ -47,7 +45,6 @@ impl<'t> Typo<'t> { bucket_candidates: RoaringBitmap::new(), parent: Some(parent), candidates_cache: HashMap::new(), - typo_cache: HashMap::new(), } } } @@ -74,9 +71,9 @@ impl<'t> Criterion for Typo<'t> { } else { let fst = self.ctx.words_fst(); let new_query_tree = if self.number_typos < 2 { - alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache, wdcache)? + alterate_query_tree(&fst, query_tree.clone(), self.number_typos, wdcache)? } else if self.number_typos == 2 { - *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache, wdcache)?; + *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, wdcache)?; query_tree.clone() } else { query_tree.clone() @@ -112,9 +109,9 @@ impl<'t> Criterion for Typo<'t> { } else { let fst = self.ctx.words_fst(); let new_query_tree = if self.number_typos < 2 { - alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache, wdcache)? + alterate_query_tree(&fst, query_tree.clone(), self.number_typos, wdcache)? } else if self.number_typos == 2 { - *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache, wdcache)?; + *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, wdcache)?; query_tree.clone() } else { query_tree.clone() @@ -179,7 +176,6 @@ fn alterate_query_tree( words_fst: &fst::Set>, mut query_tree: Operation, number_typos: u8, - typo_cache: &mut HashMap<(String, bool, u8), Vec<(String, u8)>>, wdcache: &mut WordDerivationsCache, ) -> anyhow::Result { @@ -187,7 +183,6 @@ fn alterate_query_tree( words_fst: &fst::Set>, operation: &mut Operation, number_typos: u8, - typo_cache: &mut HashMap<(String, bool, u8), Vec<(String, u8)>>, wdcache: &mut WordDerivationsCache, ) -> anyhow::Result<()> { @@ -195,7 +190,7 @@ fn alterate_query_tree( match operation { And(ops) | Consecutive(ops) | Or(_, ops) => { - ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, typo_cache, wdcache)) + ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, wdcache)) }, Operation::Query(q) => { // TODO may be optimized when number_typos == 0 @@ -209,19 +204,11 @@ fn alterate_query_tree( }); } else { let typo = *typo.min(&number_typos); - let cache_key = (word.clone(), q.prefix, typo); - let words = if let Some(derivations) = typo_cache.get(&cache_key) { - derivations.clone() - } else { - let derivations = word_derivations(word, q.prefix, typo, words_fst, wdcache)?.to_vec(); - typo_cache.insert(cache_key, derivations.clone()); - derivations - }; - + let words = word_derivations(word, q.prefix, typo, words_fst, wdcache)?; let queries = words.into_iter().map(|(word, typo)| { Operation::Query(Query { prefix: false, - kind: QueryKind::Exact { original_typo: typo, word }, + kind: QueryKind::Exact { original_typo: *typo, word: word.to_string() }, }) }).collect(); @@ -234,7 +221,7 @@ fn alterate_query_tree( } } - recurse(words_fst, &mut query_tree, number_typos, typo_cache, wdcache)?; + recurse(words_fst, &mut query_tree, number_typos, wdcache)?; Ok(query_tree) } From facfb4b6151482628e82c90f36a19f97d3ff764a Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 9 Mar 2021 15:55:59 +0100 Subject: [PATCH 3/6] Fix the bucket candidates --- milli/src/search/criteria/asc_desc.rs | 6 +++++- milli/src/search/criteria/proximity.rs | 7 ++++++- milli/src/search/criteria/typo.rs | 8 ++------ 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 5b2ec32e8..6b8afad2c 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -175,6 +175,11 @@ impl<'t> Criterion for AscDesc<'t> { }, (None, None) => take(&mut self.faceted_candidates), }; + if bucket_candidates.is_empty() { + self.bucket_candidates.union_with(&candidates); + } else { + self.bucket_candidates.union_with(&bucket_candidates); + } self.candidates = facet_ordered( self.index, self.rtxn, @@ -183,7 +188,6 @@ impl<'t> Criterion for AscDesc<'t> { self.ascending, candidates, )?; - self.bucket_candidates = bucket_candidates; }, None => return Ok(None), } diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index cb4fd257b..e5f010177 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -179,10 +179,15 @@ impl<'t> Criterion for Proximity<'t> { (None, None) => RoaringBitmap::new(), }; + if bucket_candidates.is_empty() { + self.bucket_candidates.union_with(&candidates); + } else { + self.bucket_candidates.union_with(&bucket_candidates); + } + self.query_tree = query_tree.map(|op| (maximum_proximity(&op), op)); self.proximity = 0; self.candidates = Candidates::Allowed(candidates); - self.bucket_candidates.union_with(&bucket_candidates); self.plane_sweep_cache = None; }, None => return Ok(None), diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 5acc7a048..b17b7561b 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -127,16 +127,12 @@ impl<'t> Criterion for Typo<'t> { new_candidates.difference_with(&candidates); candidates.union_with(&new_candidates); self.number_typos += 1; - - let bucket_candidates = match self.parent { - Some(_) => take(&mut self.bucket_candidates), - None => new_candidates.clone(), - }; + self.bucket_candidates.union_with(&new_candidates); return Ok(Some(CriterionResult { query_tree: Some(new_query_tree), candidates: Some(new_candidates), - bucket_candidates, + bucket_candidates: take(&mut self.bucket_candidates), })); } }, From d301859bbd1c445040abd1bb1b833abcffc71e7c Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 9 Mar 2021 17:48:05 +0100 Subject: [PATCH 4/6] Introduce a special word_derivations function for Proximity --- milli/src/search/criteria/mod.rs | 14 +++-- milli/src/search/criteria/proximity.rs | 73 ++++++++++++++------------ 2 files changed, 48 insertions(+), 39 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index b2fd7803d..22f081871 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -67,7 +67,7 @@ pub trait Context { fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result>; fn words_fst<'t>(&self) -> &'t fst::Set>; fn in_prefix_cache(&self, word: &str) -> bool; - fn docid_word_positions(&self, docid: DocumentId, word: &str) -> heed::Result>; + fn docid_words_positions(&self, docid: DocumentId) -> heed::Result>; } pub struct CriteriaBuilder<'t> { rtxn: &'t heed::RoTxn<'t>, @@ -107,9 +107,13 @@ impl<'a> Context for CriteriaBuilder<'a> { self.words_prefixes_fst.contains(word) } - fn docid_word_positions(&self, docid: DocumentId, word: &str) -> heed::Result> { - let key = (docid, word); - self.index.docid_word_positions.get(self.rtxn, &key) + fn docid_words_positions(&self, docid: DocumentId) -> heed::Result> { + let mut words_positions = HashMap::new(); + for result in self.index.docid_word_positions.prefix_iter(self.rtxn, &(docid, ""))? { + let ((_, word), positions) = result?; + words_positions.insert(word.to_string(), positions); + } + Ok(words_positions) } } @@ -391,7 +395,7 @@ pub mod test { self.word_prefix_docids.contains_key(&word.to_string()) } - fn docid_word_positions(&self, _docid: DocumentId, _word: &str) -> heed::Result> { + fn docid_words_positions(&self, _docid: DocumentId) -> heed::Result> { todo!() } } diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index e5f010177..b62eb8cfd 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -1,14 +1,13 @@ -use std::borrow::Cow; use std::collections::btree_map::{self, BTreeMap}; -use std::collections::hash_map::{HashMap, Entry}; +use std::collections::hash_map::HashMap; use std::mem::take; use roaring::RoaringBitmap; use log::debug; -use crate::{DocumentId, Position, search::{query_tree::QueryKind, word_derivations}}; +use crate::{DocumentId, Position, search::{query_tree::QueryKind}}; use crate::search::query_tree::{maximum_proximity, Operation, Query}; -use crate::search::WordDerivationsCache; +use crate::search::{build_dfa, WordDerivationsCache}; use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree}; pub struct Proximity<'t> { @@ -358,7 +357,7 @@ fn resolve_plane_sweep_candidates( docid: DocumentId, consecutive: bool, rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>, - dwpcache: &mut HashMap>, + words_positions: &HashMap, wdcache: &mut WordDerivationsCache, ) -> anyhow::Result> { @@ -400,7 +399,7 @@ fn resolve_plane_sweep_candidates( let mut groups_positions = Vec::with_capacity(groups_len); for operation in operations { - let positions = resolve_operation(ctx, operation, docid, rocache, dwpcache, wdcache)?; + let positions = resolve_operation(ctx, operation, docid, rocache, words_positions, wdcache)?; groups_positions.push(positions.into_iter()); } @@ -476,7 +475,7 @@ fn resolve_plane_sweep_candidates( query_tree: &'a Operation, docid: DocumentId, rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>, - dwpcache: &mut HashMap>, + words_positions: &HashMap, wdcache: &mut WordDerivationsCache, ) -> anyhow::Result> { @@ -487,44 +486,34 @@ fn resolve_plane_sweep_candidates( } let result = match query_tree { - And(ops) => plane_sweep(ctx, ops, docid, false, rocache, dwpcache, wdcache)?, - Consecutive(ops) => plane_sweep(ctx, ops, docid, true, rocache, dwpcache, wdcache)?, + And(ops) => plane_sweep(ctx, ops, docid, false, rocache, words_positions, wdcache)?, + Consecutive(ops) => plane_sweep(ctx, ops, docid, true, rocache, words_positions, wdcache)?, Or(_, ops) => { let mut result = Vec::new(); for op in ops { - result.extend(resolve_operation(ctx, op, docid, rocache, dwpcache, wdcache)?) + result.extend(resolve_operation(ctx, op, docid, rocache, words_positions, wdcache)?) } result.sort_unstable(); result }, - Operation::Query(Query {prefix, kind}) => { - let fst = ctx.words_fst(); - let words = match kind { + Operation::Query(Query { prefix, kind }) => { + let mut result = Vec::new(); + match kind { QueryKind::Exact { word, .. } => { if *prefix { - Cow::Borrowed(word_derivations(word, true, 0, fst, wdcache)?) + let iter = word_derivations(word, true, 0, &words_positions) + .flat_map(|positions| positions.iter().map(|p| (p, 0, p))); + result.extend(iter); } else { - Cow::Owned(vec![(word.to_string(), 0)]) + if let Some(positions) = words_positions.get(word) { + result.extend(positions.iter().map(|p| (p, 0, p))); + } } }, QueryKind::Tolerant { typo, word } => { - Cow::Borrowed(word_derivations(word, *prefix, *typo, fst, wdcache)?) - } - }; - - let mut result = Vec::new(); - for (word, _) in words.as_ref() { - let positions = match dwpcache.entry(word.to_string()) { - Entry::Occupied(entry) => entry.into_mut(), - Entry::Vacant(entry) => { - let positions = ctx.docid_word_positions(docid, word)?; - entry.insert(positions) - } - }; - - if let Some(positions) = positions { - let iter = positions.iter().map(|p| (p, 0, p)); + let iter = word_derivations(word, *prefix, *typo, &words_positions) + .flat_map(|positions| positions.iter().map(|p| (p, 0, p))); result.extend(iter); } } @@ -538,18 +527,34 @@ fn resolve_plane_sweep_candidates( Ok(result) } - let mut word_positions_cache = HashMap::new(); + fn word_derivations<'a>( + word: &str, + is_prefix: bool, + max_typo: u8, + words_positions: &'a HashMap, + ) -> impl Iterator + { + let dfa = build_dfa(word, max_typo, is_prefix); + words_positions.iter().filter_map(move |(document_word, positions)| { + use levenshtein_automata::Distance; + match dfa.eval(document_word) { + Distance::Exact(_) => Some(positions), + Distance::AtLeast(_) => None, + } + }) + } + let mut resolve_operation_cache = HashMap::new(); let mut candidates = BTreeMap::new(); for docid in allowed_candidates { - word_positions_cache.clear(); + let words_positions = ctx.docid_words_positions(docid)?; resolve_operation_cache.clear(); let positions = resolve_operation( ctx, query_tree, docid, &mut resolve_operation_cache, - &mut word_positions_cache, + &words_positions, wdcache, )?; let best_proximity = positions.into_iter().min_by_key(|(_, proximity, _)| *proximity); From 54b97ed8e1241072cd5e68e14267f61d96b8d5bf Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 10 Mar 2021 10:56:26 +0100 Subject: [PATCH 5/6] Update the fetcher comments --- milli/src/search/criteria/fetcher.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/milli/src/search/criteria/fetcher.rs b/milli/src/search/criteria/fetcher.rs index 723b5a13a..fa204bdf2 100644 --- a/milli/src/search/criteria/fetcher.rs +++ b/milli/src/search/criteria/fetcher.rs @@ -8,12 +8,12 @@ use crate::search::query_tree::Operation; use crate::search::WordDerivationsCache; use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context}; -/// The result of a call to the parent criterion. +/// The result of a call to the fetcher. #[derive(Debug, Clone, PartialEq)] pub struct FetcherResult { - /// The query tree that must be used by the children criterion to fetch candidates. + /// The query tree corresponding to the current bucket of the last criterion. pub query_tree: Option, - /// The candidates that this criterion is allowed to return subsets of. + /// The candidates of the current bucket of the last criterion. pub candidates: RoaringBitmap, /// Candidates that comes from the current bucket of the initial criterion. pub bucket_candidates: RoaringBitmap, From d48008339e33ae615dc467e54e1eb290517117de Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 10 Mar 2021 11:16:30 +0100 Subject: [PATCH 6/6] Introduce two new optional_words and authorize_typos Search options --- milli/src/search/mod.rs | 40 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 7475ef473..ce5a6bc88 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -33,13 +33,24 @@ pub struct Search<'a> { facet_condition: Option, offset: usize, limit: usize, + optional_words: bool, + authorize_typos: bool, rtxn: &'a heed::RoTxn<'a>, index: &'a Index, } impl<'a> Search<'a> { pub fn new(rtxn: &'a heed::RoTxn, index: &'a Index) -> Search<'a> { - Search { query: None, facet_condition: None, offset: 0, limit: 20, rtxn, index } + Search { + query: None, + facet_condition: None, + offset: 0, + limit: 20, + optional_words: true, + authorize_typos: true, + rtxn, + index, + } } pub fn query(&mut self, query: impl Into) -> &mut Search<'a> { @@ -57,6 +68,16 @@ impl<'a> Search<'a> { self } + pub fn optional_words(&mut self, value: bool) -> &mut Search<'a> { + self.optional_words = value; + self + } + + pub fn authorize_typos(&mut self, value: bool) -> &mut Search<'a> { + self.authorize_typos = value; + self + } + pub fn facet_condition(&mut self, condition: FacetCondition) -> &mut Search<'a> { self.facet_condition = Some(condition); self @@ -67,7 +88,9 @@ impl<'a> Search<'a> { let before = Instant::now(); let query_tree = match self.query.as_ref() { Some(query) => { - let builder = QueryTreeBuilder::new(self.rtxn, self.index); + let mut builder = QueryTreeBuilder::new(self.rtxn, self.index); + builder.optional_words(self.optional_words); + builder.authorize_typos(self.authorize_typos); let stop_words = &Set::default(); let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words)); let result = analyzer.analyze(query); @@ -129,12 +152,23 @@ impl<'a> Search<'a> { impl fmt::Debug for Search<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let Search { query, facet_condition, offset, limit, rtxn: _, index: _ } = self; + let Search { + query, + facet_condition, + offset, + limit, + optional_words, + authorize_typos, + rtxn: _, + index: _, + } = self; f.debug_struct("Search") .field("query", query) .field("facet_condition", facet_condition) .field("offset", offset) .field("limit", limit) + .field("optional_words", optional_words) + .field("authorize_typos", authorize_typos) .finish() } }