From 229405aeb927c1fd700cb4cb091d849ee4c8e689 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 12 Dec 2022 16:54:31 +0100 Subject: [PATCH] Choose implementation strategy of criterion at runtime --- cli/src/main.rs | 4 +- milli/src/lib.rs | 5 +- milli/src/search/criteria/asc_desc.rs | 100 +++++++++++++++++-------- milli/src/search/criteria/attribute.rs | 22 +++++- milli/src/search/criteria/mod.rs | 32 ++++++-- milli/src/search/criteria/proximity.rs | 21 +++++- milli/src/search/mod.rs | 22 ++++++ 7 files changed, 156 insertions(+), 50 deletions(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index dd5489ebc..6b7f2078b 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -13,7 +13,7 @@ use milli::update::UpdateIndexingStep::{ ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition, }; use milli::update::{self, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig}; -use milli::{heed, Index, Object}; +use milli::{heed, CriterionImplementationStrategy, Index, Object}; use structopt::StructOpt; #[global_allocator] @@ -441,7 +441,7 @@ impl Search { if let Some(limit) = limit { search.limit(*limit); } - + search.criterion_implementation_strategy(CriterionImplementationStrategy::OnlyIterative); let result = search.execute()?; let fields_ids_map = index.fields_ids_map(&txn)?; diff --git a/milli/src/lib.rs b/milli/src/lib.rs index b17be8f1f..865195df5 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -42,8 +42,9 @@ pub use self::heed_codec::{ }; pub use self::index::Index; pub use self::search::{ - FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWord, - MatchingWords, Search, SearchResult, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET, + CriterionImplementationStrategy, FacetDistribution, Filter, FormatOptions, MatchBounds, + MatcherBuilder, MatchingWord, MatchingWords, Search, SearchResult, TermsMatchingStrategy, + DEFAULT_VALUES_PER_FACET, }; pub type Result = std::result::Result; diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 6b2199b28..c0fdbada3 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -12,6 +12,7 @@ use crate::heed_codec::ByteSliceRefCodec; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder, InitialCandidates}; use crate::search::facet::{ascending_facet_sort, descending_facet_sort}; use crate::search::query_tree::Operation; +use crate::search::CriterionImplementationStrategy; use crate::{FieldId, Index, Result}; /// Threshold on the number of candidates that will make @@ -29,6 +30,7 @@ pub struct AscDesc<'t> { allowed_candidates: RoaringBitmap, initial_candidates: InitialCandidates, faceted_candidates: RoaringBitmap, + implementation_strategy: CriterionImplementationStrategy, parent: Box, } @@ -38,8 +40,9 @@ impl<'t> AscDesc<'t> { rtxn: &'t heed::RoTxn, parent: Box, field_name: String, + implementation_strategy: CriterionImplementationStrategy, ) -> Result { - Self::new(index, rtxn, parent, field_name, true) + Self::new(index, rtxn, parent, field_name, true, implementation_strategy) } pub fn desc( @@ -47,8 +50,9 @@ impl<'t> AscDesc<'t> { rtxn: &'t heed::RoTxn, parent: Box, field_name: String, + implementation_strategy: CriterionImplementationStrategy, ) -> Result { - Self::new(index, rtxn, parent, field_name, false) + Self::new(index, rtxn, parent, field_name, false, implementation_strategy) } fn new( @@ -57,6 +61,7 @@ impl<'t> AscDesc<'t> { parent: Box, field_name: String, is_ascending: bool, + implementation_strategy: CriterionImplementationStrategy, ) -> Result { let fields_ids_map = index.fields_ids_map(rtxn)?; let field_id = fields_ids_map.id(&field_name); @@ -82,6 +87,7 @@ impl<'t> AscDesc<'t> { allowed_candidates: RoaringBitmap::new(), faceted_candidates, initial_candidates: InitialCandidates::Estimated(RoaringBitmap::new()), + implementation_strategy, parent, }) } @@ -149,6 +155,7 @@ impl<'t> Criterion for AscDesc<'t> { field_id, self.is_ascending, candidates & &self.faceted_candidates, + self.implementation_strategy, )?, None => Box::new(std::iter::empty()), }; @@ -170,6 +177,51 @@ impl<'t> Criterion for AscDesc<'t> { } } +fn facet_ordered_iterative<'t>( + index: &'t Index, + rtxn: &'t heed::RoTxn, + field_id: FieldId, + is_ascending: bool, + candidates: RoaringBitmap, +) -> Result> + 't>> { + let number_iter = iterative_facet_number_ordered_iter( + index, + rtxn, + field_id, + is_ascending, + candidates.clone(), + )?; + let string_iter = + iterative_facet_string_ordered_iter(index, rtxn, field_id, is_ascending, candidates)?; + Ok(Box::new(number_iter.chain(string_iter).map(Ok)) as Box>) +} + +fn facet_ordered_set_based<'t>( + index: &'t Index, + rtxn: &'t heed::RoTxn, + field_id: FieldId, + is_ascending: bool, + candidates: RoaringBitmap, +) -> Result> + 't>> { + let make_iter = if is_ascending { ascending_facet_sort } else { descending_facet_sort }; + + let number_iter = make_iter( + rtxn, + index.facet_id_f64_docids.remap_key_type::>(), + field_id, + candidates.clone(), + )?; + + let string_iter = make_iter( + rtxn, + index.facet_id_string_docids.remap_key_type::>(), + field_id, + candidates, + )?; + + Ok(Box::new(number_iter.chain(string_iter))) +} + /// Returns an iterator over groups of the given candidates in ascending or descending order. /// /// It will either use an iterative or a recursive method on the whole facet database depending @@ -180,36 +232,22 @@ fn facet_ordered<'t>( field_id: FieldId, is_ascending: bool, candidates: RoaringBitmap, + implementation_strategy: CriterionImplementationStrategy, ) -> Result> + 't>> { - if candidates.len() <= CANDIDATES_THRESHOLD { - let number_iter = iterative_facet_number_ordered_iter( - index, - rtxn, - field_id, - is_ascending, - candidates.clone(), - )?; - let string_iter = - iterative_facet_string_ordered_iter(index, rtxn, field_id, is_ascending, candidates)?; - Ok(Box::new(number_iter.chain(string_iter).map(Ok)) as Box>) - } else { - let make_iter = if is_ascending { ascending_facet_sort } else { descending_facet_sort }; - - let number_iter = make_iter( - rtxn, - index.facet_id_f64_docids.remap_key_type::>(), - field_id, - candidates.clone(), - )?; - - let string_iter = make_iter( - rtxn, - index.facet_id_string_docids.remap_key_type::>(), - field_id, - candidates, - )?; - - Ok(Box::new(number_iter.chain(string_iter))) + match implementation_strategy { + CriterionImplementationStrategy::OnlyIterative => { + facet_ordered_iterative(index, rtxn, field_id, is_ascending, candidates) + } + CriterionImplementationStrategy::OnlySetBased => { + facet_ordered_set_based(index, rtxn, field_id, is_ascending, candidates) + } + CriterionImplementationStrategy::Dynamic => { + if candidates.len() <= CANDIDATES_THRESHOLD { + facet_ordered_iterative(index, rtxn, field_id, is_ascending, candidates) + } else { + facet_ordered_set_based(index, rtxn, field_id, is_ascending, candidates) + } + } } } diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index 9da868e1a..d7ec0d382 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -9,7 +9,9 @@ use roaring::RoaringBitmap; use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult}; use crate::search::criteria::{InitialCandidates, Query}; use crate::search::query_tree::{Operation, QueryKind}; -use crate::search::{build_dfa, word_derivations, WordDerivationsCache}; +use crate::search::{ + build_dfa, word_derivations, CriterionImplementationStrategy, WordDerivationsCache, +}; use crate::Result; /// To be able to divide integers by the number of words in the query @@ -30,10 +32,15 @@ pub struct Attribute<'t> { parent: Box, linear_buckets: Option>, set_buckets: Option>>, + implementation_strategy: CriterionImplementationStrategy, } impl<'t> Attribute<'t> { - pub fn new(ctx: &'t dyn Context<'t>, parent: Box) -> Self { + pub fn new( + ctx: &'t dyn Context<'t>, + parent: Box, + implementation_strategy: CriterionImplementationStrategy, + ) -> Self { Attribute { ctx, state: None, @@ -41,6 +48,7 @@ impl<'t> Attribute<'t> { parent, linear_buckets: None, set_buckets: None, + implementation_strategy, } } } @@ -64,7 +72,15 @@ impl<'t> Criterion for Attribute<'t> { })); } Some((query_tree, flattened_query_tree, mut allowed_candidates)) => { - let found_candidates = if allowed_candidates.len() < CANDIDATES_THRESHOLD { + let found_candidates = if matches!( + self.implementation_strategy, + CriterionImplementationStrategy::OnlyIterative + ) || (matches!( + self.implementation_strategy, + CriterionImplementationStrategy::Dynamic + ) && allowed_candidates.len() + < CANDIDATES_THRESHOLD) + { let linear_buckets = match self.linear_buckets.as_mut() { Some(linear_buckets) => linear_buckets, None => { diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index eb83f5515..26d1e243f 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -14,6 +14,7 @@ use self::r#final::Final; use self::typo::Typo; use self::words::Words; use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind}; +use super::CriterionImplementationStrategy; use crate::search::criteria::geo::Geo; use crate::search::{word_derivations, Distinct, WordDerivationsCache}; use crate::{AscDesc as AscDescName, DocumentId, FieldId, Index, Member, Result}; @@ -377,6 +378,7 @@ impl<'t> CriteriaBuilder<'t> { sort_criteria: Option>, exhaustive_number_hits: bool, distinct: Option, + implementation_strategy: CriterionImplementationStrategy, ) -> Result> { use crate::criterion::Criterion as Name; @@ -402,12 +404,14 @@ impl<'t> CriteriaBuilder<'t> { self.rtxn, criterion, field.to_string(), + implementation_strategy, )?), AscDescName::Desc(Member::Field(field)) => Box::new(AscDesc::desc( self.index, self.rtxn, criterion, field.to_string(), + implementation_strategy, )?), AscDescName::Asc(Member::Geo(point)) => { Box::new(Geo::asc(self.index, self.rtxn, criterion, *point)?) @@ -421,15 +425,27 @@ impl<'t> CriteriaBuilder<'t> { } None => criterion, }, - Name::Proximity => Box::new(Proximity::new(self, criterion)), - Name::Attribute => Box::new(Attribute::new(self, criterion)), + Name::Proximity => { + Box::new(Proximity::new(self, criterion, implementation_strategy)) + } + Name::Attribute => { + Box::new(Attribute::new(self, criterion, implementation_strategy)) + } Name::Exactness => Box::new(Exactness::new(self, criterion, &primitive_query)?), - Name::Asc(field) => { - Box::new(AscDesc::asc(self.index, self.rtxn, criterion, field)?) - } - Name::Desc(field) => { - Box::new(AscDesc::desc(self.index, self.rtxn, criterion, field)?) - } + Name::Asc(field) => Box::new(AscDesc::asc( + self.index, + self.rtxn, + criterion, + field, + implementation_strategy, + )?), + Name::Desc(field) => Box::new(AscDesc::desc( + self.index, + self.rtxn, + criterion, + field, + implementation_strategy, + )?), }; } diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index d44ba25dd..2072d0133 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -11,7 +11,7 @@ use super::{ }; use crate::search::criteria::InitialCandidates; use crate::search::query_tree::{maximum_proximity, Operation, Query, QueryKind}; -use crate::search::{build_dfa, WordDerivationsCache}; +use crate::search::{build_dfa, CriterionImplementationStrategy, WordDerivationsCache}; use crate::{Position, Result}; type Cache = HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>; @@ -33,10 +33,15 @@ pub struct Proximity<'t> { parent: Box, candidates_cache: Cache, plane_sweep_cache: Option>, + implementation_strategy: CriterionImplementationStrategy, } impl<'t> Proximity<'t> { - pub fn new(ctx: &'t dyn Context<'t>, parent: Box) -> Self { + pub fn new( + ctx: &'t dyn Context<'t>, + parent: Box, + implementation_strategy: CriterionImplementationStrategy, + ) -> Self { Proximity { ctx, state: None, @@ -45,6 +50,7 @@ impl<'t> Proximity<'t> { parent, candidates_cache: Cache::new(), plane_sweep_cache: None, + implementation_strategy, } } } @@ -72,8 +78,15 @@ impl<'t> Criterion for Proximity<'t> { self.state = None; // reset state } Some((_, query_tree, allowed_candidates)) => { - let mut new_candidates = if allowed_candidates.len() <= CANDIDATES_THRESHOLD - && self.proximity > PROXIMITY_THRESHOLD + let mut new_candidates = if matches!( + self.implementation_strategy, + CriterionImplementationStrategy::OnlyIterative + ) || (matches!( + self.implementation_strategy, + CriterionImplementationStrategy::Dynamic + ) && allowed_candidates.len() + <= CANDIDATES_THRESHOLD + && self.proximity > PROXIMITY_THRESHOLD) { if let Some(cache) = self.plane_sweep_cache.as_mut() { match cache.next() { diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 96cf1e0f1..df59634bb 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -49,6 +49,7 @@ pub struct Search<'a> { authorize_typos: bool, words_limit: usize, exhaustive_number_hits: bool, + criterion_implementation_strategy: CriterionImplementationStrategy, rtxn: &'a heed::RoTxn<'a>, index: &'a Index, } @@ -65,6 +66,7 @@ impl<'a> Search<'a> { authorize_typos: true, exhaustive_number_hits: false, words_limit: 10, + criterion_implementation_strategy: CriterionImplementationStrategy::default(), rtxn, index, } @@ -117,6 +119,14 @@ impl<'a> Search<'a> { self } + pub fn criterion_implementation_strategy( + &mut self, + strategy: CriterionImplementationStrategy, + ) -> &mut Search<'a> { + self.criterion_implementation_strategy = strategy; + self + } + fn is_typo_authorized(&self) -> Result { let index_authorizes_typos = self.index.authorize_typos(self.rtxn)?; // only authorize typos if both the index and the query allow it. @@ -204,6 +214,7 @@ impl<'a> Search<'a> { self.sort_criteria.clone(), self.exhaustive_number_hits, None, + self.criterion_implementation_strategy, )?; self.perform_sort(NoopDistinct, matching_words.unwrap_or_default(), criteria) } @@ -220,6 +231,7 @@ impl<'a> Search<'a> { self.sort_criteria.clone(), self.exhaustive_number_hits, Some(distinct.clone()), + self.criterion_implementation_strategy, )?; self.perform_sort(distinct, matching_words.unwrap_or_default(), criteria) } @@ -288,6 +300,7 @@ impl fmt::Debug for Search<'_> { authorize_typos, words_limit, exhaustive_number_hits, + criterion_implementation_strategy, rtxn: _, index: _, } = self; @@ -300,6 +313,7 @@ impl fmt::Debug for Search<'_> { .field("terms_matching_strategy", terms_matching_strategy) .field("authorize_typos", authorize_typos) .field("exhaustive_number_hits", exhaustive_number_hits) + .field("criterion_implementation_strategy", criterion_implementation_strategy) .field("words_limit", words_limit) .finish() } @@ -313,6 +327,14 @@ pub struct SearchResult { pub documents_ids: Vec, } +#[derive(Debug, Default, Clone, Copy)] +pub enum CriterionImplementationStrategy { + OnlyIterative, + OnlySetBased, + #[default] + Dynamic, +} + #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum TermsMatchingStrategy { // remove last word first