From a3968063436911975fb1fc47e1c1ed7085c13e0c Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 12 Jul 2022 17:56:50 +0200 Subject: [PATCH 1/6] Add settings to force milli to exhaustively compute the total number of hits --- milli/src/search/criteria/initial.rs | 32 ++++++++++++++++++++++------ milli/src/search/criteria/mod.rs | 4 +++- milli/src/search/criteria/typo.rs | 9 ++++---- milli/src/search/mod.rs | 10 +++++++++ milli/tests/search/query_criteria.rs | 2 +- 5 files changed, 45 insertions(+), 12 deletions(-) diff --git a/milli/src/search/criteria/initial.rs b/milli/src/search/criteria/initial.rs index 514dbff96..2aabe9b13 100644 --- a/milli/src/search/criteria/initial.rs +++ b/milli/src/search/criteria/initial.rs @@ -1,17 +1,22 @@ use roaring::RoaringBitmap; use super::{Criterion, CriterionParameters, CriterionResult}; +use crate::search::criteria::{resolve_query_tree, Context}; use crate::search::query_tree::Operation; use crate::Result; -pub struct Initial { +pub struct Initial<'t> { + ctx: &'t dyn Context<'t>, answer: Option, + exhaustive_number_hits: bool, } -impl Initial { +impl<'t> Initial<'t> { pub fn new( + ctx: &'t dyn Context<'t>, query_tree: Option, filtered_candidates: Option, + exhaustive_number_hits: bool, ) -> Initial { let answer = CriterionResult { query_tree, @@ -19,13 +24,28 @@ impl Initial { filtered_candidates, bucket_candidates: None, }; - Initial { answer: Some(answer) } + Initial { ctx, answer: Some(answer), exhaustive_number_hits } } } -impl Criterion for Initial { +impl Criterion for Initial<'_> { #[logging_timer::time("Initial::{}")] - fn next(&mut self, _: &mut CriterionParameters) -> Result> { - Ok(self.answer.take()) + fn next(&mut self, params: &mut CriterionParameters) -> Result> { + self.answer + .take() + .map(|mut answer| { + if self.exhaustive_number_hits && answer.query_tree.is_some() { + let candidates = resolve_query_tree( + self.ctx, + answer.query_tree.as_ref().unwrap(), + &mut params.wdcache, + )?; + + answer.candidates = Some(candidates.clone()); + answer.bucket_candidates = Some(candidates); + } + Ok(answer) + }) + .transpose() } } diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index f48865ba5..6c4fa51d3 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -232,13 +232,15 @@ impl<'t> CriteriaBuilder<'t> { primitive_query: Option>, filtered_candidates: Option, sort_criteria: Option>, + exhaustive_number_hits: bool, ) -> Result> { use crate::criterion::Criterion as Name; let primitive_query = primitive_query.unwrap_or_default(); let mut criterion = - Box::new(Initial::new(query_tree, filtered_candidates)) as Box; + Box::new(Initial::new(self, query_tree, filtered_candidates, exhaustive_number_hits)) + as Box; for name in self.index.criteria(&self.rtxn)? { criterion = match name { Name::Words => Box::new(Words::new(self, criterion)), diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index e9e6fb2f5..f1537ed48 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -368,7 +368,7 @@ mod test { excluded_candidates: &RoaringBitmap::new(), }; - let parent = Initial::new(query_tree, facet_candidates); + let parent = Initial::new(&context, query_tree, facet_candidates, false); let criteria = Typo::new(&context, Box::new(parent)); let result = display_criteria(criteria, criterion_parameters); @@ -405,7 +405,7 @@ mod test { wdcache: &mut WordDerivationsCache::new(), excluded_candidates: &RoaringBitmap::new(), }; - let parent = Initial::new(Some(query_tree), facet_candidates); + let parent = Initial::new(&context, Some(query_tree), facet_candidates, false); let criteria = Typo::new(&context, Box::new(parent)); let result = display_criteria(criteria, criterion_parameters); @@ -439,7 +439,7 @@ mod test { wdcache: &mut WordDerivationsCache::new(), excluded_candidates: &RoaringBitmap::new(), }; - let parent = Initial::new(query_tree, Some(facet_candidates.clone())); + let parent = Initial::new(&context, query_tree, Some(facet_candidates.clone()), false); let criteria = Typo::new(&context, Box::new(parent)); let result = display_criteria(criteria, criterion_parameters); @@ -476,7 +476,8 @@ mod test { wdcache: &mut WordDerivationsCache::new(), excluded_candidates: &RoaringBitmap::new(), }; - let parent = Initial::new(Some(query_tree), Some(facet_candidates.clone())); + let parent = + Initial::new(&context, Some(query_tree), Some(facet_candidates.clone()), false); let criteria = Typo::new(&context, Box::new(parent)); let result = display_criteria(criteria, criterion_parameters); diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 7145c1445..6f1e1b34c 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -47,6 +47,7 @@ pub struct Search<'a> { terms_matching_strategy: TermsMatchingStrategy, authorize_typos: bool, words_limit: usize, + exhaustive_number_hits: bool, rtxn: &'a heed::RoTxn<'a>, index: &'a Index, } @@ -61,6 +62,7 @@ impl<'a> Search<'a> { sort_criteria: None, terms_matching_strategy: TermsMatchingStrategy::default(), authorize_typos: true, + exhaustive_number_hits: false, words_limit: 10, rtxn, index, @@ -107,6 +109,11 @@ impl<'a> Search<'a> { self } + pub fn exhaustive_number_hits(&mut self, exhaustive_number_hits: bool) -> &mut Search<'a> { + self.exhaustive_number_hits = exhaustive_number_hits; + self + } + fn is_typo_authorized(&self) -> Result { let index_authorizes_typos = self.index.authorize_typos(self.rtxn)?; // only authorize typos if both the index and the query allow it. @@ -189,6 +196,7 @@ impl<'a> Search<'a> { primitive_query, filtered_candidates, self.sort_criteria.clone(), + self.exhaustive_number_hits, )?; match self.index.distinct_field(self.rtxn)? { @@ -262,6 +270,7 @@ impl fmt::Debug for Search<'_> { terms_matching_strategy, authorize_typos, words_limit, + exhaustive_number_hits, rtxn: _, index: _, } = self; @@ -273,6 +282,7 @@ impl fmt::Debug for Search<'_> { .field("sort_criteria", sort_criteria) .field("terms_matching_strategy", terms_matching_strategy) .field("authorize_typos", authorize_typos) + .field("exhaustive_number_hits", exhaustive_number_hits) .field("words_limit", words_limit) .finish() } diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index 8b72c8420..f873f56f7 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -51,7 +51,7 @@ macro_rules! test_criterion { }; } -test_criterion!(none_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![], vec![]); +test_criterion!(none_allow_typo, DISALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![], vec![]); test_criterion!(none_disallow_typo, DISALLOW_OPTIONAL_WORDS, DISALLOW_TYPOS, vec![], vec![]); test_criterion!(words_allow_typo, ALLOW_OPTIONAL_WORDS, ALLOW_TYPOS, vec![Words], vec![]); test_criterion!( From d71bc1e69fc68c8cf4b5a3abb503fda7c8afdf1b Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 18 Jul 2022 16:52:45 +0200 Subject: [PATCH 2/6] Compute an exact count when using distinct --- milli/src/search/criteria/initial.rs | 29 ++++++++++++++++----- milli/src/search/criteria/mod.rs | 15 +++++++---- milli/src/search/criteria/typo.rs | 24 +++++++++++++---- milli/src/search/distinct/facet_distinct.rs | 1 + milli/src/search/mod.rs | 28 ++++++++++++++------ 5 files changed, 72 insertions(+), 25 deletions(-) diff --git a/milli/src/search/criteria/initial.rs b/milli/src/search/criteria/initial.rs index 2aabe9b13..bae77fda0 100644 --- a/milli/src/search/criteria/initial.rs +++ b/milli/src/search/criteria/initial.rs @@ -3,32 +3,35 @@ use roaring::RoaringBitmap; use super::{Criterion, CriterionParameters, CriterionResult}; use crate::search::criteria::{resolve_query_tree, Context}; use crate::search::query_tree::Operation; +use crate::search::Distinct; use crate::Result; -pub struct Initial<'t> { +pub struct Initial<'t, D> { ctx: &'t dyn Context<'t>, answer: Option, exhaustive_number_hits: bool, + distinct: Option, } -impl<'t> Initial<'t> { +impl<'t, D> Initial<'t, D> { pub fn new( ctx: &'t dyn Context<'t>, query_tree: Option, filtered_candidates: Option, exhaustive_number_hits: bool, - ) -> Initial { + distinct: Option, + ) -> Initial { let answer = CriterionResult { query_tree, candidates: None, filtered_candidates, bucket_candidates: None, }; - Initial { ctx, answer: Some(answer), exhaustive_number_hits } + Initial { ctx, answer: Some(answer), exhaustive_number_hits, distinct } } } -impl Criterion for Initial<'_> { +impl Criterion for Initial<'_, D> { #[logging_timer::time("Initial::{}")] fn next(&mut self, params: &mut CriterionParameters) -> Result> { self.answer @@ -41,8 +44,20 @@ impl Criterion for Initial<'_> { &mut params.wdcache, )?; - answer.candidates = Some(candidates.clone()); - answer.bucket_candidates = Some(candidates); + let bucket_candidates = match &mut self.distinct { + // may be really time consuming + Some(distinct) => { + let mut bucket_candidates = RoaringBitmap::new(); + for c in distinct.distinct(candidates.clone(), RoaringBitmap::new()) { + bucket_candidates.insert(c?); + } + bucket_candidates + } + None => candidates.clone(), + }; + + answer.candidates = Some(candidates); + answer.bucket_candidates = Some(bucket_candidates); } Ok(answer) }) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 6c4fa51d3..866eaefde 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -13,7 +13,7 @@ use self::typo::Typo; use self::words::Words; use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind}; use crate::search::criteria::geo::Geo; -use crate::search::{word_derivations, WordDerivationsCache}; +use crate::search::{word_derivations, Distinct, WordDerivationsCache}; use crate::{AscDesc as AscDescName, DocumentId, FieldId, Index, Member, Result}; mod asc_desc; @@ -226,21 +226,26 @@ impl<'t> CriteriaBuilder<'t> { Ok(Self { rtxn, index, words_fst, words_prefixes_fst }) } - pub fn build( + pub fn build( &'t self, query_tree: Option, primitive_query: Option>, filtered_candidates: Option, sort_criteria: Option>, exhaustive_number_hits: bool, + distinct: Option, ) -> Result> { use crate::criterion::Criterion as Name; let primitive_query = primitive_query.unwrap_or_default(); - let mut criterion = - Box::new(Initial::new(self, query_tree, filtered_candidates, exhaustive_number_hits)) - as Box; + let mut criterion = Box::new(Initial::new( + self, + query_tree, + filtered_candidates, + exhaustive_number_hits, + distinct, + )) as Box; for name in self.index.criteria(&self.rtxn)? { criterion = match name { Name::Words => Box::new(Words::new(self, criterion)), diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index f1537ed48..605089fae 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -348,6 +348,7 @@ mod test { use super::super::initial::Initial; use super::super::test::TestContext; use super::*; + use crate::search::NoopDistinct; fn display_criteria(mut criteria: Typo, mut parameters: CriterionParameters) -> String { let mut result = String::new(); @@ -368,7 +369,8 @@ mod test { excluded_candidates: &RoaringBitmap::new(), }; - let parent = Initial::new(&context, query_tree, facet_candidates, false); + let parent = + Initial::::new(&context, query_tree, facet_candidates, false, None); let criteria = Typo::new(&context, Box::new(parent)); let result = display_criteria(criteria, criterion_parameters); @@ -405,7 +407,8 @@ mod test { wdcache: &mut WordDerivationsCache::new(), excluded_candidates: &RoaringBitmap::new(), }; - let parent = Initial::new(&context, Some(query_tree), facet_candidates, false); + let parent = + Initial::::new(&context, Some(query_tree), facet_candidates, false, None); let criteria = Typo::new(&context, Box::new(parent)); let result = display_criteria(criteria, criterion_parameters); @@ -439,7 +442,13 @@ mod test { wdcache: &mut WordDerivationsCache::new(), excluded_candidates: &RoaringBitmap::new(), }; - let parent = Initial::new(&context, query_tree, Some(facet_candidates.clone()), false); + let parent = Initial::::new( + &context, + query_tree, + Some(facet_candidates.clone()), + false, + None, + ); let criteria = Typo::new(&context, Box::new(parent)); let result = display_criteria(criteria, criterion_parameters); @@ -476,8 +485,13 @@ mod test { wdcache: &mut WordDerivationsCache::new(), excluded_candidates: &RoaringBitmap::new(), }; - let parent = - Initial::new(&context, Some(query_tree), Some(facet_candidates.clone()), false); + let parent = Initial::::new( + &context, + Some(query_tree), + Some(facet_candidates.clone()), + false, + None, + ); let criteria = Typo::new(&context, Box::new(parent)); let result = display_criteria(criteria, criterion_parameters); diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index 4436d4cda..33e7b4975 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -21,6 +21,7 @@ const DOCID_SIZE: usize = size_of::(); /// care to keep the document we are currently on, and remove it from the excluded list. The next /// iterations will never contain any occurence of a document with the same distinct value as a /// document from previous iterations. +#[derive(Clone)] pub struct FacetDistinct<'a> { distinct: FieldId, index: &'a Index, diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 6f1e1b34c..270beb52a 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -191,21 +191,33 @@ impl<'a> Search<'a> { } let criteria_builder = criteria::CriteriaBuilder::new(self.rtxn, self.index)?; - let criteria = criteria_builder.build( - query_tree, - primitive_query, - filtered_candidates, - self.sort_criteria.clone(), - self.exhaustive_number_hits, - )?; match self.index.distinct_field(self.rtxn)? { - None => self.perform_sort(NoopDistinct, matching_words.unwrap_or_default(), criteria), + None => { + let criteria = criteria_builder.build::( + query_tree, + primitive_query, + filtered_candidates, + self.sort_criteria.clone(), + self.exhaustive_number_hits, + None, + )?; + self.perform_sort(NoopDistinct, matching_words.unwrap_or_default(), criteria) + } Some(name) => { let field_ids_map = self.index.fields_ids_map(self.rtxn)?; match field_ids_map.id(name) { Some(fid) => { let distinct = FacetDistinct::new(fid, self.index, self.rtxn); + + let criteria = criteria_builder.build( + query_tree, + primitive_query, + filtered_candidates, + self.sort_criteria.clone(), + self.exhaustive_number_hits, + Some(distinct.clone()), + )?; self.perform_sort(distinct, matching_words.unwrap_or_default(), criteria) } None => Ok(SearchResult::default()), From cf203b7fde8ddbdf711cde6f53ef0298fc3af8d9 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 20 Jul 2022 15:58:26 +0200 Subject: [PATCH 3/6] Take filter in account when computing the pages candidates --- milli/src/search/criteria/initial.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/milli/src/search/criteria/initial.rs b/milli/src/search/criteria/initial.rs index bae77fda0..9a9565182 100644 --- a/milli/src/search/criteria/initial.rs +++ b/milli/src/search/criteria/initial.rs @@ -38,12 +38,16 @@ impl Criterion for Initial<'_, D> { .take() .map(|mut answer| { if self.exhaustive_number_hits && answer.query_tree.is_some() { - let candidates = resolve_query_tree( + let mut candidates = resolve_query_tree( self.ctx, answer.query_tree.as_ref().unwrap(), &mut params.wdcache, )?; + if let Some(ref filtered_candidates) = answer.filtered_candidates { + candidates &= filtered_candidates; + } + let bucket_candidates = match &mut self.distinct { // may be really time consuming Some(distinct) => { From 6f55e7844ce7a72f2b4ee7e64be9d1e98cec7700 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 17 Oct 2022 14:41:57 +0200 Subject: [PATCH 4/6] Add some code comments --- milli/src/search/criteria/initial.rs | 9 +++++++-- milli/src/search/mod.rs | 2 ++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/milli/src/search/criteria/initial.rs b/milli/src/search/criteria/initial.rs index 9a9565182..14d368d4e 100644 --- a/milli/src/search/criteria/initial.rs +++ b/milli/src/search/criteria/initial.rs @@ -5,7 +5,9 @@ use crate::search::criteria::{resolve_query_tree, Context}; use crate::search::query_tree::Operation; use crate::search::Distinct; use crate::Result; - +/// Initial is a mandatory criterion, it is always the first +/// and is meant to initalize the CriterionResult used by the other criteria. +/// It behave like an [Once Iterator](https://doc.rust-lang.org/std/iter/struct.Once.html) and will return Some(CriterionResult) only one time. pub struct Initial<'t, D> { ctx: &'t dyn Context<'t>, answer: Option, @@ -38,18 +40,21 @@ impl Criterion for Initial<'_, D> { .take() .map(|mut answer| { if self.exhaustive_number_hits && answer.query_tree.is_some() { + // resolve the whole query tree to retrieve an exhastive list of documents matching the query. let mut candidates = resolve_query_tree( self.ctx, answer.query_tree.as_ref().unwrap(), &mut params.wdcache, )?; + // Apply the filters on the documents retrieved with the query tree. if let Some(ref filtered_candidates) = answer.filtered_candidates { candidates &= filtered_candidates; } + // because the bucket_candidates should be an exhastive count of the matching documents, + // we precompute the distinct attributes. let bucket_candidates = match &mut self.distinct { - // may be really time consuming Some(distinct) => { let mut bucket_candidates = RoaringBitmap::new(); for c in distinct.distinct(candidates.clone(), RoaringBitmap::new()) { diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 270beb52a..20003c676 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -109,6 +109,8 @@ impl<'a> Search<'a> { self } + /// Force the search to exhastivelly compute the number of candidates, + /// this will increase the search time but allows finite pagination. pub fn exhaustive_number_hits(&mut self, exhaustive_number_hits: bool) -> &mut Search<'a> { self.exhaustive_number_hits = exhaustive_number_hits; self From 516e838eb4cadd27df43d84a288cd11d1d2e3bb0 Mon Sep 17 00:00:00 2001 From: Many the fish Date: Mon, 17 Oct 2022 18:23:15 +0200 Subject: [PATCH 5/6] Update milli/src/search/criteria/initial.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/criteria/initial.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/criteria/initial.rs b/milli/src/search/criteria/initial.rs index 14d368d4e..195c926be 100644 --- a/milli/src/search/criteria/initial.rs +++ b/milli/src/search/criteria/initial.rs @@ -40,7 +40,7 @@ impl Criterion for Initial<'_, D> { .take() .map(|mut answer| { if self.exhaustive_number_hits && answer.query_tree.is_some() { - // resolve the whole query tree to retrieve an exhastive list of documents matching the query. + // resolve the whole query tree to retrieve an exhaustive list of documents matching the query. let mut candidates = resolve_query_tree( self.ctx, answer.query_tree.as_ref().unwrap(), From 81919a35a27f588b75893fd3b18644feaa6c325a Mon Sep 17 00:00:00 2001 From: Many the fish Date: Mon, 17 Oct 2022 18:23:20 +0200 Subject: [PATCH 6/6] Update milli/src/search/criteria/initial.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Clément Renault --- milli/src/search/criteria/initial.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/criteria/initial.rs b/milli/src/search/criteria/initial.rs index 195c926be..ac61adfe2 100644 --- a/milli/src/search/criteria/initial.rs +++ b/milli/src/search/criteria/initial.rs @@ -52,7 +52,7 @@ impl Criterion for Initial<'_, D> { candidates &= filtered_candidates; } - // because the bucket_candidates should be an exhastive count of the matching documents, + // because the bucket_candidates should be an exhaustive count of the matching documents, // we precompute the distinct attributes. let bucket_candidates = match &mut self.distinct { Some(distinct) => {