From 62a70c300d35f569186a05bfe7b91338113a5d9f Mon Sep 17 00:00:00 2001 From: many Date: Tue, 9 Mar 2021 12:04:52 +0100 Subject: [PATCH] Optimize words criterion --- milli/src/search/criteria/asc_desc.rs | 20 +++++++++--- milli/src/search/criteria/fetcher.rs | 42 +++++++++++++++++++------- milli/src/search/criteria/mod.rs | 17 ++++++----- milli/src/search/criteria/proximity.rs | 16 +++++++--- milli/src/search/criteria/typo.rs | 20 ++++++------ milli/src/search/criteria/words.rs | 37 ++++++++++------------- milli/src/search/mod.rs | 5 ++- 7 files changed, 95 insertions(+), 62 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 29fe26d7e..5b2ec32e8 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -160,9 +160,21 @@ impl<'t> Criterion for AscDesc<'t> { match self.parent.as_mut() { Some(parent) => { match parent.next(wdcache)? { - Some(CriterionResult { query_tree, mut candidates, bucket_candidates }) => { + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { self.query_tree = query_tree; - candidates.intersect_with(&self.faceted_candidates); + let candidates = match (&self.query_tree, candidates) { + (_, Some(mut candidates)) => { + candidates.intersect_with(&self.faceted_candidates); + candidates + }, + (Some(qt), None) => { + let context = CriteriaBuilder::new(&self.rtxn, &self.index)?; + let mut candidates = resolve_query_tree(&context, qt, &mut HashMap::new(), wdcache)?; + candidates.intersect_with(&self.faceted_candidates); + candidates + }, + (None, None) => take(&mut self.faceted_candidates), + }; self.candidates = facet_ordered( self.index, self.rtxn, @@ -183,7 +195,7 @@ impl<'t> Criterion for AscDesc<'t> { return Ok(Some(CriterionResult { query_tree, - candidates: RoaringBitmap::new(), + candidates: Some(RoaringBitmap::new()), bucket_candidates, })); }, @@ -195,7 +207,7 @@ impl<'t> Criterion for AscDesc<'t> { return Ok(Some(CriterionResult { query_tree: self.query_tree.clone(), - candidates, + candidates: Some(candidates), bucket_candidates, })); }, diff --git a/milli/src/search/criteria/fetcher.rs b/milli/src/search/criteria/fetcher.rs index 094efe75e..723b5a13a 100644 --- a/milli/src/search/criteria/fetcher.rs +++ b/milli/src/search/criteria/fetcher.rs @@ -8,12 +8,24 @@ use crate::search::query_tree::Operation; use crate::search::WordDerivationsCache; use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context}; +/// The result of a call to the parent criterion. +#[derive(Debug, Clone, PartialEq)] +pub struct FetcherResult { + /// The query tree that must be used by the children criterion to fetch candidates. + pub query_tree: Option, + /// The candidates that this criterion is allowed to return subsets of. + pub candidates: RoaringBitmap, + /// Candidates that comes from the current bucket of the initial criterion. + pub bucket_candidates: RoaringBitmap, +} + pub struct Fetcher<'t> { ctx: &'t dyn Context, query_tree: Option, candidates: Candidates, parent: Option>, should_get_documents_ids: bool, + wdcache: WordDerivationsCache, } impl<'t> Fetcher<'t> { @@ -29,6 +41,7 @@ impl<'t> Fetcher<'t> { candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), parent: None, should_get_documents_ids: true, + wdcache: WordDerivationsCache::new(), } } @@ -43,13 +56,12 @@ impl<'t> Fetcher<'t> { candidates: Candidates::default(), parent: Some(parent), should_get_documents_ids: true, + wdcache: WordDerivationsCache::new(), } } -} -impl<'t> Criterion for Fetcher<'t> { #[logging_timer::time("Fetcher::{}")] - fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { + pub fn next(&mut self) -> anyhow::Result> { use Candidates::{Allowed, Forbidden}; loop { debug!("Fetcher iteration (should_get_documents_ids: {}) ({:?})", @@ -62,14 +74,14 @@ impl<'t> Criterion for Fetcher<'t> { let candidates = take(&mut self.candidates).into_inner(); let candidates = match &self.query_tree { Some(qt) if should_get_documents_ids => { - let mut docids = resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), wdcache)?; + let mut docids = resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), &mut self.wdcache)?; docids.intersect_with(&candidates); docids }, _ => candidates, }; - return Ok(Some(CriterionResult { + return Ok(Some(FetcherResult { query_tree: self.query_tree.take(), candidates: candidates.clone(), bucket_candidates: candidates, @@ -78,15 +90,23 @@ impl<'t> Criterion for Fetcher<'t> { Forbidden(_) => { match self.parent.as_mut() { Some(parent) => { - match parent.next(wdcache)? { - Some(result) => return Ok(Some(result)), + match parent.next(&mut self.wdcache)? { + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + let candidates = match (&query_tree, candidates) { + (_, Some(candidates)) => candidates, + (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), &mut self.wdcache)?, + (None, None) => RoaringBitmap::new(), + }; + + return Ok(Some(FetcherResult { query_tree, candidates, bucket_candidates })) + }, None => if should_get_documents_ids { let candidates = match &self.query_tree { - Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), wdcache)?, + Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), &mut self.wdcache)?, None => self.ctx.documents_ids()?, }; - return Ok(Some(CriterionResult { + return Ok(Some(FetcherResult { query_tree: self.query_tree.clone(), candidates: candidates.clone(), bucket_candidates: candidates, @@ -96,11 +116,11 @@ impl<'t> Criterion for Fetcher<'t> { }, None => if should_get_documents_ids { let candidates = match &self.query_tree { - Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), wdcache)?, + Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new(), &mut self.wdcache)?, None => self.ctx.documents_ids()?, }; - return Ok(Some(CriterionResult { + return Ok(Some(FetcherResult { query_tree: self.query_tree.clone(), candidates: candidates.clone(), bucket_candidates: candidates, diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index d70942c1c..b2fd7803d 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -14,10 +14,10 @@ use self::asc_desc::AscDesc; use self::proximity::Proximity; use self::fetcher::Fetcher; -pub mod typo; -pub mod words; -pub mod asc_desc; -pub mod proximity; +mod typo; +mod words; +mod asc_desc; +mod proximity; pub mod fetcher; pub trait Criterion { @@ -28,11 +28,12 @@ pub trait Criterion { #[derive(Debug, Clone, PartialEq)] pub struct CriterionResult { /// The query tree that must be used by the children criterion to fetch candidates. - pub query_tree: Option, - /// The candidates that this criterion is allowed to return subsets of. - pub candidates: RoaringBitmap, + query_tree: Option, + /// The candidates that this criterion is allowed to return subsets of, + /// if None, it is up to the child to compute the candidates itself. + candidates: Option, /// Candidates that comes from the current bucket of the initial criterion. - pub bucket_candidates: RoaringBitmap, + bucket_candidates: RoaringBitmap, } /// Either a set of candidates that defines the candidates diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 5b14f699c..cb4fd257b 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -9,7 +9,7 @@ use log::debug; use crate::{DocumentId, Position, search::{query_tree::QueryKind, word_derivations}}; use crate::search::query_tree::{maximum_proximity, Operation, Query}; use crate::search::WordDerivationsCache; -use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; +use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids, resolve_query_tree}; pub struct Proximity<'t> { ctx: &'t dyn Context, @@ -70,7 +70,7 @@ impl<'t> Criterion for Proximity<'t> { (_, Allowed(candidates)) if candidates.is_empty() => { return Ok(Some(CriterionResult { query_tree: self.query_tree.take().map(|(_, qt)| qt), - candidates: take(&mut self.candidates).into_inner(), + candidates: Some(take(&mut self.candidates).into_inner()), bucket_candidates: take(&mut self.bucket_candidates), })); }, @@ -126,7 +126,7 @@ impl<'t> Criterion for Proximity<'t> { return Ok(Some(CriterionResult { query_tree: Some(query_tree.clone()), - candidates: new_candidates, + candidates: Some(new_candidates), bucket_candidates, })); } @@ -155,7 +155,7 @@ impl<'t> Criterion for Proximity<'t> { return Ok(Some(CriterionResult { query_tree: Some(query_tree.clone()), - candidates: new_candidates, + candidates: Some(new_candidates), bucket_candidates, })); } @@ -164,7 +164,7 @@ impl<'t> Criterion for Proximity<'t> { let candidates = take(&mut self.candidates).into_inner(); return Ok(Some(CriterionResult { query_tree: None, - candidates: candidates.clone(), + candidates: Some(candidates.clone()), bucket_candidates: candidates, })); }, @@ -173,6 +173,12 @@ impl<'t> Criterion for Proximity<'t> { Some(parent) => { match parent.next(wdcache)? { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + let candidates = match (&query_tree, candidates) { + (_, Some(candidates)) => candidates, + (Some(qt), None) => resolve_query_tree(self.ctx, qt, &mut HashMap::new(), wdcache)?, + (None, None) => RoaringBitmap::new(), + }; + self.query_tree = query_tree.map(|op| (maximum_proximity(&op), op)); self.proximity = 0; self.candidates = Candidates::Allowed(candidates); diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 4cc0015da..8bead4661 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -63,7 +63,7 @@ impl<'t> Criterion for Typo<'t> { (_, Allowed(candidates)) if candidates.is_empty() => { return Ok(Some(CriterionResult { query_tree: self.query_tree.take().map(|(_, qt)| qt), - candidates: take(&mut self.candidates).into_inner(), + candidates: Some(take(&mut self.candidates).into_inner()), bucket_candidates: take(&mut self.bucket_candidates), })); }, @@ -100,7 +100,7 @@ impl<'t> Criterion for Typo<'t> { return Ok(Some(CriterionResult { query_tree: Some(new_query_tree), - candidates: new_candidates, + candidates: Some(new_candidates), bucket_candidates, })); } @@ -138,7 +138,7 @@ impl<'t> Criterion for Typo<'t> { return Ok(Some(CriterionResult { query_tree: Some(new_query_tree), - candidates: new_candidates, + candidates: Some(new_candidates), bucket_candidates, })); } @@ -147,7 +147,7 @@ impl<'t> Criterion for Typo<'t> { let candidates = take(&mut self.candidates).into_inner(); return Ok(Some(CriterionResult { query_tree: None, - candidates: candidates.clone(), + candidates: Some(candidates.clone()), bucket_candidates: candidates, })); }, @@ -158,7 +158,7 @@ impl<'t> Criterion for Typo<'t> { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { self.query_tree = query_tree.map(|op| (maximum_typo(&op), op)); self.number_typos = 0; - self.candidates = Candidates::Allowed(candidates); + self.candidates = candidates.map_or_else(Candidates::default, Candidates::Allowed); self.bucket_candidates.union_with(&bucket_candidates); }, None => return Ok(None), @@ -394,7 +394,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }), ]), ])), - candidates: candidates_1.clone(), + candidates: Some(candidates_1.clone()), bucket_candidates: candidates_1, }; @@ -416,7 +416,7 @@ mod test { ]), ]), ])), - candidates: candidates_2.clone(), + candidates: Some(candidates_2.clone()), bucket_candidates: candidates_2, }; @@ -434,7 +434,7 @@ mod test { let expected = CriterionResult { query_tree: None, - candidates: facet_candidates.clone(), + candidates: Some(facet_candidates.clone()), bucket_candidates: facet_candidates, }; @@ -472,7 +472,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }), ]), ])), - candidates: &candidates_1 & &facet_candidates, + candidates: Some(&candidates_1 & &facet_candidates), bucket_candidates: candidates_1 & &facet_candidates, }; @@ -494,7 +494,7 @@ mod test { ]), ]), ])), - candidates: &candidates_2 & &facet_candidates, + candidates: Some(&candidates_2 & &facet_candidates), bucket_candidates: candidates_2 & &facet_candidates, }; diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index d94fd0c53..8774eed7c 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -6,12 +6,12 @@ use roaring::RoaringBitmap; use crate::search::query_tree::Operation; use crate::search::WordDerivationsCache; -use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context}; +use super::{resolve_query_tree, Criterion, CriterionResult, Context}; pub struct Words<'t> { ctx: &'t dyn Context, query_trees: Vec, - candidates: Candidates, + candidates: Option, bucket_candidates: RoaringBitmap, parent: Option>, candidates_cache: HashMap<(Operation, u8), RoaringBitmap>, @@ -27,7 +27,7 @@ impl<'t> Words<'t> { Words { ctx, query_trees: query_tree.map(explode_query_tree).unwrap_or_default(), - candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), + candidates, bucket_candidates: RoaringBitmap::new(), parent: None, candidates_cache: HashMap::default(), @@ -38,7 +38,7 @@ impl<'t> Words<'t> { Words { ctx, query_trees: Vec::default(), - candidates: Candidates::default(), + candidates: None, bucket_candidates: RoaringBitmap::new(), parent: Some(parent), candidates_cache: HashMap::default(), @@ -49,20 +49,19 @@ impl<'t> Words<'t> { impl<'t> Criterion for Words<'t> { #[logging_timer::time("Words::{}")] fn next(&mut self, wdcache: &mut WordDerivationsCache) -> anyhow::Result> { - use Candidates::{Allowed, Forbidden}; loop { debug!("Words at iteration {} ({:?})", self.query_trees.len(), self.candidates); match (self.query_trees.pop(), &mut self.candidates) { - (query_tree, Allowed(candidates)) if candidates.is_empty() => { + (query_tree, Some(candidates)) if candidates.is_empty() => { self.query_trees = Vec::new(); return Ok(Some(CriterionResult { query_tree, - candidates: take(&mut self.candidates).into_inner(), + candidates: self.candidates.take(), bucket_candidates: take(&mut self.bucket_candidates), })); }, - (Some(qt), Allowed(candidates)) => { + (Some(qt), Some(candidates)) => { let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache, wdcache)?; found_candidates.intersect_with(&candidates); candidates.difference_with(&found_candidates); @@ -74,41 +73,37 @@ impl<'t> Criterion for Words<'t> { return Ok(Some(CriterionResult { query_tree: Some(qt), - candidates: found_candidates, + candidates: Some(found_candidates), bucket_candidates, })); }, - (Some(qt), Forbidden(candidates)) => { - let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache, wdcache)?; - found_candidates.difference_with(&candidates); - candidates.union_with(&found_candidates); - + (Some(qt), None) => { let bucket_candidates = match self.parent { Some(_) => take(&mut self.bucket_candidates), - None => found_candidates.clone(), + None => RoaringBitmap::new(), }; return Ok(Some(CriterionResult { query_tree: Some(qt), - candidates: found_candidates, + candidates: None, bucket_candidates, })); }, - (None, Allowed(_)) => { - let candidates = take(&mut self.candidates).into_inner(); + (None, Some(_)) => { + let candidates = self.candidates.take(); return Ok(Some(CriterionResult { query_tree: None, candidates: candidates.clone(), - bucket_candidates: candidates, + bucket_candidates: candidates.unwrap_or_default(), })); }, - (None, Forbidden(_)) => { + (None, None) => { match self.parent.as_mut() { Some(parent) => { match parent.next(wdcache)? { Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { self.query_trees = query_tree.map(explode_query_tree).unwrap_or_default(); - self.candidates = Candidates::Allowed(candidates); + self.candidates = candidates; self.bucket_candidates.union_with(&bucket_candidates); }, None => return Ok(None), diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 34b3ffec9..7475ef473 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -11,7 +11,7 @@ use meilisearch_tokenizer::{AnalyzerConfig, Analyzer}; use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; -use crate::search::criteria::{Criterion, CriterionResult}; +use crate::search::criteria::fetcher::FetcherResult; use crate::{Index, DocumentId}; pub use self::facet::FacetIter; @@ -99,9 +99,8 @@ impl<'a> Search<'a> { let mut offset = self.offset; let mut limit = self.limit; let mut documents_ids = Vec::new(); - let mut words_derivations_cache = WordDerivationsCache::new(); let mut initial_candidates = RoaringBitmap::new(); - while let Some(CriterionResult { candidates, bucket_candidates, .. }) = criteria.next(&mut words_derivations_cache)? { + while let Some(FetcherResult { candidates, bucket_candidates, .. }) = criteria.next()? { debug!("Number of candidates found {}", candidates.len());