From 6d50ea0830faf554267ad8e27f83d8b5f9b95548 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 7 Dec 2022 16:41:23 +0100 Subject: [PATCH 1/2] add tests --- milli/tests/search/distinct.rs | 117 +++++++++++++++++++++++++++++---- 1 file changed, 105 insertions(+), 12 deletions(-) diff --git a/milli/tests/search/distinct.rs b/milli/tests/search/distinct.rs index c2b7e2c1e..f1e57d288 100644 --- a/milli/tests/search/distinct.rs +++ b/milli/tests/search/distinct.rs @@ -8,7 +8,7 @@ use Criterion::*; use crate::search::{self, EXTERNAL_DOCUMENTS_IDS}; macro_rules! test_distinct { - ($func:ident, $distinct:ident, $criteria:expr, $n_res:expr) => { + ($func:ident, $distinct:ident, $exhaustive:ident, $limit:expr, $criteria:expr, $n_res:expr) => { #[test] fn $func() { let criteria = $criteria; @@ -26,7 +26,8 @@ macro_rules! test_distinct { let mut search = Search::new(&rtxn, &index); search.query(search::TEST_QUERY); - search.limit(EXTERNAL_DOCUMENTS_IDS.len()); + search.limit($limit); + search.exhaustive_number_hits($exhaustive); search.authorize_typos(true); search.terms_matching_strategy(TermsMatchingStrategy::default()); @@ -46,6 +47,7 @@ macro_rules! test_distinct { Some(d.id) } }) + .take($limit) .collect(); let documents_ids = search::internal_to_external_ids(&index, &documents_ids); @@ -54,25 +56,116 @@ macro_rules! test_distinct { }; } +test_distinct!( + exhaustive_distinct_string_default_criteria, + tag, + true, + 1, + vec![Words, Typo, Proximity, Attribute, Exactness], + 3 +); +test_distinct!( + exhaustive_distinct_number_default_criteria, + asc_desc_rank, + true, + 1, + vec![Words, Typo, Proximity, Attribute, Exactness], + 7 +); + test_distinct!( distinct_string_default_criteria, tag, + false, + EXTERNAL_DOCUMENTS_IDS.len(), vec![Words, Typo, Proximity, Attribute, Exactness], 3 ); test_distinct!( distinct_number_default_criteria, asc_desc_rank, + false, + EXTERNAL_DOCUMENTS_IDS.len(), vec![Words, Typo, Proximity, Attribute, Exactness], 7 ); -test_distinct!(distinct_string_criterion_words, tag, vec![Words], 3); -test_distinct!(distinct_number_criterion_words, asc_desc_rank, vec![Words], 7); -test_distinct!(distinct_string_criterion_words_typo, tag, vec![Words, Typo], 3); -test_distinct!(distinct_number_criterion_words_typo, asc_desc_rank, vec![Words, Typo], 7); -test_distinct!(distinct_string_criterion_words_proximity, tag, vec![Words, Proximity], 3); -test_distinct!(distinct_number_criterion_words_proximity, asc_desc_rank, vec![Words, Proximity], 7); -test_distinct!(distinct_string_criterion_words_attribute, tag, vec![Words, Attribute], 3); -test_distinct!(distinct_number_criterion_words_attribute, asc_desc_rank, vec![Words, Attribute], 7); -test_distinct!(distinct_string_criterion_words_exactness, tag, vec![Words, Exactness], 3); -test_distinct!(distinct_number_criterion_words_exactness, asc_desc_rank, vec![Words, Exactness], 7); +test_distinct!( + distinct_string_criterion_words, + tag, + false, + EXTERNAL_DOCUMENTS_IDS.len(), + vec![Words], + 3 +); +test_distinct!( + distinct_number_criterion_words, + asc_desc_rank, + false, + EXTERNAL_DOCUMENTS_IDS.len(), + vec![Words], + 7 +); +test_distinct!( + distinct_string_criterion_words_typo, + tag, + false, + EXTERNAL_DOCUMENTS_IDS.len(), + vec![Words, Typo], + 3 +); +test_distinct!( + distinct_number_criterion_words_typo, + asc_desc_rank, + false, + EXTERNAL_DOCUMENTS_IDS.len(), + vec![Words, Typo], + 7 +); +test_distinct!( + distinct_string_criterion_words_proximity, + tag, + false, + EXTERNAL_DOCUMENTS_IDS.len(), + vec![Words, Proximity], + 3 +); +test_distinct!( + distinct_number_criterion_words_proximity, + asc_desc_rank, + false, + EXTERNAL_DOCUMENTS_IDS.len(), + vec![Words, Proximity], + 7 +); +test_distinct!( + distinct_string_criterion_words_attribute, + tag, + false, + EXTERNAL_DOCUMENTS_IDS.len(), + vec![Words, Attribute], + 3 +); +test_distinct!( + distinct_number_criterion_words_attribute, + asc_desc_rank, + false, + EXTERNAL_DOCUMENTS_IDS.len(), + vec![Words, Attribute], + 7 +); +test_distinct!( + distinct_string_criterion_words_exactness, + tag, + false, + EXTERNAL_DOCUMENTS_IDS.len(), + vec![Words, Exactness], + 3 +); +test_distinct!( + distinct_number_criterion_words_exactness, + asc_desc_rank, + false, + EXTERNAL_DOCUMENTS_IDS.len(), + vec![Words, Exactness], + 7 +); From 55724f24120c55ed2e293ee7438dd8489e77edbe Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 7 Dec 2022 18:29:25 +0100 Subject: [PATCH 2/2] Introduce an initial candidates set that makes the difference between an exhaustive count and an estimation --- milli/src/search/criteria/asc_desc.rs | 20 ++++---- milli/src/search/criteria/attribute.rs | 28 ++++++----- milli/src/search/criteria/exactness.rs | 21 ++++---- milli/src/search/criteria/final.rs | 10 ++-- milli/src/search/criteria/geo.rs | 20 ++++---- milli/src/search/criteria/initial.rs | 20 ++++---- milli/src/search/criteria/mod.rs | 69 +++++++++++++++++++++++++- milli/src/search/criteria/proximity.rs | 22 ++++---- milli/src/search/criteria/typo.rs | 39 +++++++-------- milli/src/search/criteria/words.rs | 21 ++++---- milli/src/search/mod.rs | 11 ++-- 11 files changed, 180 insertions(+), 101 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index fbcf1d3fe..fd01e806d 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -9,7 +9,7 @@ use super::{Criterion, CriterionParameters, CriterionResult}; use crate::facet::FacetType; use crate::heed_codec::facet::FacetGroupKeyCodec; use crate::heed_codec::ByteSliceRefCodec; -use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; +use crate::search::criteria::{resolve_query_tree, CriteriaBuilder, InitialCandidates}; use crate::search::facet::{ascending_facet_sort, descending_facet_sort}; use crate::search::query_tree::Operation; use crate::{FieldId, Index, Result}; @@ -27,7 +27,7 @@ pub struct AscDesc<'t> { query_tree: Option, candidates: Box> + 't>, allowed_candidates: RoaringBitmap, - bucket_candidates: RoaringBitmap, + initial_candidates: InitialCandidates, faceted_candidates: RoaringBitmap, parent: Box, } @@ -81,7 +81,7 @@ impl<'t> AscDesc<'t> { candidates: Box::new(std::iter::empty()), allowed_candidates: RoaringBitmap::new(), faceted_candidates, - bucket_candidates: RoaringBitmap::new(), + initial_candidates: InitialCandidates::Estimated(RoaringBitmap::new()), parent, }) } @@ -106,7 +106,7 @@ impl<'t> Criterion for AscDesc<'t> { query_tree: self.query_tree.clone(), candidates: Some(take(&mut self.allowed_candidates)), filtered_candidates: None, - bucket_candidates: Some(take(&mut self.bucket_candidates)), + initial_candidates: Some(self.initial_candidates.take()), })); } None => match self.parent.next(params)? { @@ -114,7 +114,7 @@ impl<'t> Criterion for AscDesc<'t> { query_tree, candidates, filtered_candidates, - bucket_candidates, + initial_candidates, }) => { self.query_tree = query_tree; let mut candidates = match (&self.query_tree, candidates) { @@ -130,9 +130,11 @@ impl<'t> Criterion for AscDesc<'t> { candidates &= filtered_candidates; } - match bucket_candidates { - Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, - None => self.bucket_candidates |= &candidates, + match initial_candidates { + Some(initial_candidates) => { + self.initial_candidates |= initial_candidates + } + None => self.initial_candidates.map_inplace(|c| c | &candidates), } if candidates.is_empty() { @@ -160,7 +162,7 @@ impl<'t> Criterion for AscDesc<'t> { query_tree: self.query_tree.clone(), candidates: Some(candidates), filtered_candidates: None, - bucket_candidates: Some(take(&mut self.bucket_candidates)), + initial_candidates: Some(self.initial_candidates.take()), })); } } diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs index fd567a7ac..9da868e1a 100644 --- a/milli/src/search/criteria/attribute.rs +++ b/milli/src/search/criteria/attribute.rs @@ -7,7 +7,7 @@ use std::mem::take; use roaring::RoaringBitmap; use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult}; -use crate::search::criteria::Query; +use crate::search::criteria::{InitialCandidates, Query}; use crate::search::query_tree::{Operation, QueryKind}; use crate::search::{build_dfa, word_derivations, WordDerivationsCache}; use crate::Result; @@ -26,7 +26,7 @@ type FlattenedQueryTree = Vec>>; pub struct Attribute<'t> { ctx: &'t dyn Context<'t>, state: Option<(Operation, FlattenedQueryTree, RoaringBitmap)>, - bucket_candidates: RoaringBitmap, + initial_candidates: InitialCandidates, parent: Box, linear_buckets: Option>, set_buckets: Option>>, @@ -37,7 +37,7 @@ impl<'t> Attribute<'t> { Attribute { ctx, state: None, - bucket_candidates: RoaringBitmap::new(), + initial_candidates: InitialCandidates::Estimated(RoaringBitmap::new()), parent, linear_buckets: None, set_buckets: None, @@ -60,7 +60,7 @@ impl<'t> Criterion for Attribute<'t> { query_tree: Some(query_tree), candidates: Some(RoaringBitmap::new()), filtered_candidates: None, - bucket_candidates: Some(take(&mut self.bucket_candidates)), + initial_candidates: Some(self.initial_candidates.take()), })); } Some((query_tree, flattened_query_tree, mut allowed_candidates)) => { @@ -84,7 +84,7 @@ impl<'t> Criterion for Attribute<'t> { query_tree: Some(query_tree), candidates: Some(RoaringBitmap::new()), filtered_candidates: None, - bucket_candidates: Some(take(&mut self.bucket_candidates)), + initial_candidates: Some(self.initial_candidates.take()), })); } } @@ -109,7 +109,7 @@ impl<'t> Criterion for Attribute<'t> { query_tree: Some(query_tree), candidates: Some(RoaringBitmap::new()), filtered_candidates: None, - bucket_candidates: Some(take(&mut self.bucket_candidates)), + initial_candidates: Some(self.initial_candidates.take()), })); } } @@ -124,7 +124,7 @@ impl<'t> Criterion for Attribute<'t> { query_tree: Some(query_tree), candidates: Some(found_candidates), filtered_candidates: None, - bucket_candidates: Some(take(&mut self.bucket_candidates)), + initial_candidates: Some(self.initial_candidates.take()), })); } None => match self.parent.next(params)? { @@ -132,7 +132,7 @@ impl<'t> Criterion for Attribute<'t> { query_tree: Some(query_tree), candidates, filtered_candidates, - bucket_candidates, + initial_candidates, }) => { let mut candidates = match candidates { Some(candidates) => candidates, @@ -148,9 +148,11 @@ impl<'t> Criterion for Attribute<'t> { let flattened_query_tree = flatten_query_tree(&query_tree); - match bucket_candidates { - Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, - None => self.bucket_candidates |= &candidates, + match initial_candidates { + Some(initial_candidates) => { + self.initial_candidates |= initial_candidates + } + None => self.initial_candidates.map_inplace(|c| c | &candidates), } self.state = Some((query_tree, flattened_query_tree, candidates)); @@ -160,13 +162,13 @@ impl<'t> Criterion for Attribute<'t> { query_tree: None, candidates, filtered_candidates, - bucket_candidates, + initial_candidates, }) => { return Ok(Some(CriterionResult { query_tree: None, candidates, filtered_candidates, - bucket_candidates, + initial_candidates, })); } None => return Ok(None), diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index 580031697..b389a5d1e 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -8,6 +8,7 @@ use roaring::RoaringBitmap; use crate::search::criteria::{ resolve_phrase, resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult, + InitialCandidates, }; use crate::search::query_tree::{Operation, PrimitiveQueryPart}; use crate::{absolute_from_relative_position, FieldId, Result}; @@ -16,7 +17,7 @@ pub struct Exactness<'t> { ctx: &'t dyn Context<'t>, query_tree: Option, state: Option, - bucket_candidates: RoaringBitmap, + initial_candidates: InitialCandidates, parent: Box, query: Vec, } @@ -36,7 +37,7 @@ impl<'t> Exactness<'t> { ctx, query_tree: None, state: None, - bucket_candidates: RoaringBitmap::new(), + initial_candidates: InitialCandidates::Estimated(RoaringBitmap::new()), parent, query, }) @@ -68,7 +69,7 @@ impl<'t> Criterion for Exactness<'t> { query_tree: self.query_tree.clone(), candidates: Some(candidates), filtered_candidates: None, - bucket_candidates: Some(take(&mut self.bucket_candidates)), + initial_candidates: Some(self.initial_candidates.take()), })); } None => match self.parent.next(params)? { @@ -76,7 +77,7 @@ impl<'t> Criterion for Exactness<'t> { query_tree: Some(query_tree), candidates, filtered_candidates, - bucket_candidates, + initial_candidates, }) => { let mut candidates = match candidates { Some(candidates) => candidates, @@ -90,9 +91,11 @@ impl<'t> Criterion for Exactness<'t> { candidates &= filtered_candidates; } - match bucket_candidates { - Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, - None => self.bucket_candidates |= &candidates, + match initial_candidates { + Some(initial_candidates) => { + self.initial_candidates |= initial_candidates + } + None => self.initial_candidates.map_inplace(|c| c | &candidates), } self.state = Some(State::new(candidates)); @@ -102,13 +105,13 @@ impl<'t> Criterion for Exactness<'t> { query_tree: None, candidates, filtered_candidates, - bucket_candidates, + initial_candidates, }) => { return Ok(Some(CriterionResult { query_tree: None, candidates, filtered_candidates, - bucket_candidates, + initial_candidates, })); } None => return Ok(None), diff --git a/milli/src/search/criteria/final.rs b/milli/src/search/criteria/final.rs index bd3244143..9f7a147b8 100644 --- a/milli/src/search/criteria/final.rs +++ b/milli/src/search/criteria/final.rs @@ -2,6 +2,7 @@ use log::debug; use roaring::RoaringBitmap; use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult}; +use crate::search::criteria::InitialCandidates; use crate::search::query_tree::Operation; use crate::search::WordDerivationsCache; use crate::Result; @@ -14,7 +15,7 @@ pub struct FinalResult { /// The candidates of the current bucket of the last criterion. pub candidates: RoaringBitmap, /// Candidates that comes from the current bucket of the initial criterion. - pub bucket_candidates: RoaringBitmap, + pub initial_candidates: InitialCandidates, } pub struct Final<'t> { @@ -49,7 +50,7 @@ impl<'t> Final<'t> { query_tree, candidates, filtered_candidates, - bucket_candidates, + initial_candidates, }) => { let mut candidates = match (candidates, query_tree.as_ref()) { (Some(candidates), _) => candidates, @@ -63,11 +64,12 @@ impl<'t> Final<'t> { candidates &= filtered_candidates; } - let bucket_candidates = bucket_candidates.unwrap_or_else(|| candidates.clone()); + let initial_candidates = initial_candidates + .unwrap_or_else(|| InitialCandidates::Estimated(candidates.clone())); self.returned_candidates |= &candidates; - Ok(Some(FinalResult { query_tree, candidates, bucket_candidates })) + Ok(Some(FinalResult { query_tree, candidates, initial_candidates })) } None => Ok(None), } diff --git a/milli/src/search/criteria/geo.rs b/milli/src/search/criteria/geo.rs index 1b08cfac8..0b33e6b2f 100644 --- a/milli/src/search/criteria/geo.rs +++ b/milli/src/search/criteria/geo.rs @@ -4,7 +4,7 @@ use roaring::RoaringBitmap; use rstar::RTree; use super::{Criterion, CriterionParameters, CriterionResult}; -use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; +use crate::search::criteria::{resolve_query_tree, CriteriaBuilder, InitialCandidates}; use crate::{lat_lng_to_xyz, GeoPoint, Index, Result}; pub struct Geo<'t> { @@ -14,7 +14,7 @@ pub struct Geo<'t> { parent: Box, candidates: Box>, allowed_candidates: RoaringBitmap, - bucket_candidates: RoaringBitmap, + initial_candidates: InitialCandidates, rtree: Option>, point: [f64; 2], } @@ -47,7 +47,7 @@ impl<'t> Geo<'t> { ) -> Result { let candidates = Box::new(iter::empty()); let allowed_candidates = index.geo_faceted_documents_ids(rtxn)?; - let bucket_candidates = RoaringBitmap::new(); + let initial_candidates = InitialCandidates::Estimated(RoaringBitmap::new()); let rtree = index.geo_rtree(rtxn)?; Ok(Self { @@ -57,7 +57,7 @@ impl<'t> Geo<'t> { parent, candidates, allowed_candidates, - bucket_candidates, + initial_candidates, rtree, point, }) @@ -77,7 +77,7 @@ impl Criterion for Geo<'_> { query_tree: None, candidates: Some(candidates), filtered_candidates: None, - bucket_candidates: Some(self.bucket_candidates.clone()), + initial_candidates: Some(self.initial_candidates.clone()), })); } None => match self.parent.next(params)? { @@ -85,7 +85,7 @@ impl Criterion for Geo<'_> { query_tree, candidates, filtered_candidates, - bucket_candidates, + initial_candidates, }) => { let mut candidates = match (&query_tree, candidates) { (_, Some(candidates)) => candidates, @@ -100,9 +100,11 @@ impl Criterion for Geo<'_> { candidates &= filtered_candidates; } - match bucket_candidates { - Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, - None => self.bucket_candidates |= &candidates, + match initial_candidates { + Some(initial_candidates) => { + self.initial_candidates |= initial_candidates + } + None => self.initial_candidates.map_inplace(|c| c | &candidates), } if candidates.is_empty() { diff --git a/milli/src/search/criteria/initial.rs b/milli/src/search/criteria/initial.rs index 85daa813b..44c08dc06 100644 --- a/milli/src/search/criteria/initial.rs +++ b/milli/src/search/criteria/initial.rs @@ -1,7 +1,7 @@ use roaring::RoaringBitmap; use super::{Criterion, CriterionParameters, CriterionResult}; -use crate::search::criteria::{resolve_query_tree, Context}; +use crate::search::criteria::{resolve_query_tree, Context, InitialCandidates}; use crate::search::query_tree::Operation; use crate::search::Distinct; use crate::Result; @@ -27,7 +27,7 @@ impl<'t, D> Initial<'t, D> { query_tree, candidates: None, filtered_candidates, - bucket_candidates: None, + initial_candidates: None, }; Initial { ctx, answer: Some(answer), exhaustive_number_hits, distinct } } @@ -41,32 +41,34 @@ impl Criterion for Initial<'_, D> { .map(|mut answer| { if self.exhaustive_number_hits && answer.query_tree.is_some() { // resolve the whole query tree to retrieve an exhaustive list of documents matching the query. + // then remove the potential soft deleted documents. let mut candidates = resolve_query_tree( self.ctx, answer.query_tree.as_ref().unwrap(), params.wdcache, - )?; + )? - params.excluded_candidates; // Apply the filters on the documents retrieved with the query tree. if let Some(ref filtered_candidates) = answer.filtered_candidates { candidates &= filtered_candidates; } - // because the bucket_candidates should be an exhaustive count of the matching documents, + // because the initial_candidates should be an exhaustive count of the matching documents, // we precompute the distinct attributes. - let bucket_candidates = match &mut self.distinct { + let initial_candidates = match &mut self.distinct { Some(distinct) => { - let mut bucket_candidates = RoaringBitmap::new(); + let mut initial_candidates = RoaringBitmap::new(); for c in distinct.distinct(candidates.clone(), RoaringBitmap::new()) { - bucket_candidates.insert(c?); + initial_candidates.insert(c?); } - bucket_candidates + initial_candidates } None => candidates.clone(), }; answer.candidates = Some(candidates); - answer.bucket_candidates = Some(bucket_candidates); + answer.initial_candidates = + Some(InitialCandidates::Exhaustive(initial_candidates)); } Ok(answer) }) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 76718c8ec..eb83f5515 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -1,5 +1,7 @@ use std::borrow::Cow; use std::collections::HashMap; +use std::mem::take; +use std::ops::{BitOr, BitOrAssign}; use roaring::RoaringBitmap; @@ -41,7 +43,7 @@ pub struct CriterionResult { /// The candidates, coming from facet filters, that this criterion is allowed to return subsets of. filtered_candidates: Option, /// Candidates that comes from the current bucket of the initial criterion. - bucket_candidates: Option, + initial_candidates: Option, } #[derive(Debug, PartialEq)] @@ -65,6 +67,71 @@ impl Default for Candidates { } } +/// Either a set of candidates that defines the estimated set of candidates +/// that could be returned, +/// or the Exhaustive set of candidates that will be returned if all possible results are fetched. +#[derive(Debug, Clone, PartialEq)] +pub enum InitialCandidates { + Estimated(RoaringBitmap), + Exhaustive(RoaringBitmap), +} + +impl InitialCandidates { + fn take(&mut self) -> Self { + match self { + Self::Estimated(c) => Self::Estimated(take(c)), + Self::Exhaustive(c) => Self::Exhaustive(take(c)), + } + } + + /// modify the containing roaring bitmap inplace if the set isn't already Exhaustive. + pub fn map_inplace(&mut self, f: F) + where + F: FnOnce(RoaringBitmap) -> RoaringBitmap, + { + if let Self::Estimated(c) = self { + *c = f(take(c)) + } + } + + pub fn into_inner(self) -> RoaringBitmap { + match self { + Self::Estimated(c) => c, + Self::Exhaustive(c) => c, + } + } +} + +impl BitOrAssign for InitialCandidates { + /// Make an union between the containing roaring bitmaps if the set isn't already Exhaustive. + /// In the case of rhs is Exhaustive and not self, then rhs replaces self. + fn bitor_assign(&mut self, rhs: Self) { + if let Self::Estimated(c) = self { + *self = match rhs { + Self::Estimated(rhs) => Self::Estimated(rhs | &*c), + Self::Exhaustive(rhs) => Self::Exhaustive(rhs), + } + } + } +} + +impl BitOr for InitialCandidates { + type Output = Self; + + /// Make an union between the containing roaring bitmaps if the set isn't already Exhaustive. + /// In the case of rhs is Exhaustive and not self, then rhs replaces self. + fn bitor(self, rhs: Self) -> Self::Output { + if let Self::Estimated(c) = self { + match rhs { + Self::Estimated(rhs) => Self::Estimated(rhs | c), + Self::Exhaustive(rhs) => Self::Exhaustive(rhs), + } + } else { + self.clone() + } + } +} + pub trait Context<'c> { fn documents_ids(&self) -> heed::Result; fn word_docids(&self, word: &str) -> heed::Result>; diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 880b3e1ba..d44ba25dd 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -1,6 +1,5 @@ use std::collections::btree_map::{self, BTreeMap}; use std::collections::hash_map::HashMap; -use std::mem::take; use log::debug; use roaring::RoaringBitmap; @@ -10,6 +9,7 @@ use super::{ query_docids, query_pair_proximity_docids, resolve_phrase, resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult, }; +use crate::search::criteria::InitialCandidates; use crate::search::query_tree::{maximum_proximity, Operation, Query, QueryKind}; use crate::search::{build_dfa, WordDerivationsCache}; use crate::{Position, Result}; @@ -29,7 +29,7 @@ pub struct Proximity<'t> { /// (max_proximity, query_tree, allowed_candidates) state: Option<(u8, Operation, RoaringBitmap)>, proximity: u8, - bucket_candidates: RoaringBitmap, + initial_candidates: InitialCandidates, parent: Box, candidates_cache: Cache, plane_sweep_cache: Option>, @@ -41,7 +41,7 @@ impl<'t> Proximity<'t> { ctx, state: None, proximity: 0, - bucket_candidates: RoaringBitmap::new(), + initial_candidates: InitialCandidates::Estimated(RoaringBitmap::new()), parent, candidates_cache: Cache::new(), plane_sweep_cache: None, @@ -115,7 +115,7 @@ impl<'t> Criterion for Proximity<'t> { query_tree: Some(query_tree.clone()), candidates: Some(new_candidates), filtered_candidates: None, - bucket_candidates: Some(take(&mut self.bucket_candidates)), + initial_candidates: Some(self.initial_candidates.take()), })); } None => match self.parent.next(params)? { @@ -123,7 +123,7 @@ impl<'t> Criterion for Proximity<'t> { query_tree: Some(query_tree), candidates, filtered_candidates, - bucket_candidates, + initial_candidates, }) => { let mut candidates = match candidates { Some(candidates) => candidates, @@ -137,9 +137,11 @@ impl<'t> Criterion for Proximity<'t> { candidates &= filtered_candidates; } - match bucket_candidates { - Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates, - None => self.bucket_candidates |= &candidates, + match initial_candidates { + Some(initial_candidates) => { + self.initial_candidates |= initial_candidates + } + None => self.initial_candidates.map_inplace(|c| c | &candidates), } let maximum_proximity = maximum_proximity(&query_tree); @@ -151,13 +153,13 @@ impl<'t> Criterion for Proximity<'t> { query_tree: None, candidates, filtered_candidates, - bucket_candidates, + initial_candidates, }) => { return Ok(Some(CriterionResult { query_tree: None, candidates, filtered_candidates, - bucket_candidates, + initial_candidates, })); } None => return Ok(None), diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs index 2ae35e418..56cffd232 100644 --- a/milli/src/search/criteria/typo.rs +++ b/milli/src/search/criteria/typo.rs @@ -9,7 +9,7 @@ use super::{ query_docids, resolve_query_tree, Candidates, Context, Criterion, CriterionParameters, CriterionResult, }; -use crate::search::criteria::resolve_phrase; +use crate::search::criteria::{resolve_phrase, InitialCandidates}; use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind}; use crate::search::{word_derivations, WordDerivationsCache}; use crate::Result; @@ -22,7 +22,7 @@ pub struct Typo<'t> { /// (max_typos, query_tree, candidates) state: Option<(u8, Operation, Candidates)>, typos: u8, - bucket_candidates: Option, + initial_candidates: Option, parent: Box, candidates_cache: HashMap<(Operation, u8), RoaringBitmap>, } @@ -33,7 +33,7 @@ impl<'t> Typo<'t> { ctx, state: None, typos: 0, - bucket_candidates: None, + initial_candidates: None, parent, candidates_cache: HashMap::new(), } @@ -120,9 +120,9 @@ impl<'t> Criterion for Typo<'t> { } } - let bucket_candidates = match self.bucket_candidates.as_mut() { - Some(bucket_candidates) => take(bucket_candidates), - None => candidates.clone(), + let initial_candidates = match self.initial_candidates.as_mut() { + Some(initial_candidates) => initial_candidates.take(), + None => InitialCandidates::Estimated(candidates.clone()), }; self.typos += 1; @@ -131,7 +131,7 @@ impl<'t> Criterion for Typo<'t> { query_tree: Some(new_query_tree), candidates: Some(candidates), filtered_candidates: None, - bucket_candidates: Some(bucket_candidates), + initial_candidates: Some(initial_candidates), })); } None => match self.parent.next(params)? { @@ -139,14 +139,9 @@ impl<'t> Criterion for Typo<'t> { query_tree: Some(query_tree), candidates, filtered_candidates, - bucket_candidates, + initial_candidates, }) => { - self.bucket_candidates = - match (self.bucket_candidates.take(), bucket_candidates) { - (Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc), - (self_bc, parent_bc) => self_bc.or(parent_bc), - }; - + self.initial_candidates = initial_candidates; let candidates = match candidates.or(filtered_candidates) { Some(candidates) => { Candidates::Allowed(candidates - params.excluded_candidates) @@ -162,13 +157,13 @@ impl<'t> Criterion for Typo<'t> { query_tree: None, candidates, filtered_candidates, - bucket_candidates, + initial_candidates, }) => { return Ok(Some(CriterionResult { query_tree: None, candidates, filtered_candidates, - bucket_candidates, + initial_candidates, })); } None => return Ok(None), @@ -356,7 +351,7 @@ mod test { let result = display_criteria(criteria, criterion_parameters); insta::assert_snapshot!(result, @r###" - CriterionResult { query_tree: None, candidates: None, filtered_candidates: None, bucket_candidates: None } + CriterionResult { query_tree: None, candidates: None, filtered_candidates: None, initial_candidates: None } "###); } @@ -399,7 +394,7 @@ mod test { Exact { word: "split" } Exact { word: "this" } Exact { word: "world" } - ), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, bucket_candidates: Some(RoaringBitmap<[]>) } + ), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, initial_candidates: Some(Estimated(RoaringBitmap<[]>)) } CriterionResult { query_tree: Some(OR AND @@ -408,7 +403,7 @@ mod test { OR Exact { word: "word" } Exact { word: "world" } - ), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, bucket_candidates: Some(RoaringBitmap<[]>) } + ), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, initial_candidates: Some(Estimated(RoaringBitmap<[]>)) } "###); } @@ -434,7 +429,7 @@ mod test { let result = display_criteria(criteria, criterion_parameters); insta::assert_snapshot!(result, @r###" - CriterionResult { query_tree: None, candidates: None, filtered_candidates: Some(RoaringBitmap<8000 values between 986424 and 4294786076>), bucket_candidates: None } + CriterionResult { query_tree: None, candidates: None, filtered_candidates: Some(RoaringBitmap<8000 values between 986424 and 4294786076>), initial_candidates: None } "###); } @@ -482,7 +477,7 @@ mod test { Exact { word: "split" } Exact { word: "this" } Exact { word: "world" } - ), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, bucket_candidates: Some(RoaringBitmap<[]>) } + ), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, initial_candidates: Some(Estimated(RoaringBitmap<[]>)) } CriterionResult { query_tree: Some(OR AND @@ -491,7 +486,7 @@ mod test { OR Exact { word: "word" } Exact { word: "world" } - ), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, bucket_candidates: Some(RoaringBitmap<[]>) } + ), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, initial_candidates: Some(Estimated(RoaringBitmap<[]>)) } "###); } diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index b67b7f6b4..181749b60 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -1,9 +1,8 @@ -use std::mem::take; - use log::debug; use roaring::RoaringBitmap; use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult}; +use crate::search::criteria::InitialCandidates; use crate::search::query_tree::Operation; use crate::Result; @@ -11,7 +10,7 @@ pub struct Words<'t> { ctx: &'t dyn Context<'t>, query_trees: Vec, candidates: Option, - bucket_candidates: Option, + initial_candidates: Option, filtered_candidates: Option, parent: Box, } @@ -22,7 +21,7 @@ impl<'t> Words<'t> { ctx, query_trees: Vec::default(), candidates: None, - bucket_candidates: None, + initial_candidates: None, parent, filtered_candidates: None, } @@ -53,13 +52,13 @@ impl<'t> Criterion for Words<'t> { None => None, }; - let bucket_candidates = self.bucket_candidates.as_mut().map(take); + let initial_candidates = self.initial_candidates.clone(); return Ok(Some(CriterionResult { query_tree: Some(query_tree), candidates, filtered_candidates: self.filtered_candidates.clone(), - bucket_candidates, + initial_candidates, })); } None => match self.parent.next(params)? { @@ -67,14 +66,14 @@ impl<'t> Criterion for Words<'t> { query_tree: Some(query_tree), candidates, filtered_candidates, - bucket_candidates, + initial_candidates, }) => { self.query_trees = explode_query_tree(query_tree); self.candidates = candidates; self.filtered_candidates = filtered_candidates; - self.bucket_candidates = - match (self.bucket_candidates.take(), bucket_candidates) { + self.initial_candidates = + match (self.initial_candidates.take(), initial_candidates) { (Some(self_bc), Some(parent_bc)) => Some(self_bc | parent_bc), (self_bc, parent_bc) => self_bc.or(parent_bc), }; @@ -83,13 +82,13 @@ impl<'t> Criterion for Words<'t> { query_tree: None, candidates, filtered_candidates, - bucket_candidates, + initial_candidates, }) => { return Ok(Some(CriterionResult { query_tree: None, candidates, filtered_candidates, - bucket_candidates, + initial_candidates, })); } None => return Ok(None), diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index f62a37c1b..96cf1e0f1 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -23,6 +23,7 @@ pub use self::matches::{ use self::query_tree::QueryTreeBuilder; use crate::error::UserError; use crate::search::criteria::r#final::{Final, FinalResult}; +use crate::search::criteria::InitialCandidates; use crate::{AscDesc, Criterion, DocumentId, Index, Member, Result}; // Building these factories is not free. @@ -235,11 +236,11 @@ impl<'a> Search<'a> { mut criteria: Final, ) -> Result { let mut offset = self.offset; - let mut initial_candidates = RoaringBitmap::new(); + let mut initial_candidates = InitialCandidates::Estimated(RoaringBitmap::new()); let mut excluded_candidates = self.index.soft_deleted_documents_ids(self.rtxn)?; let mut documents_ids = Vec::new(); - while let Some(FinalResult { candidates, bucket_candidates, .. }) = + while let Some(FinalResult { candidates, initial_candidates: ic, .. }) = criteria.next(&excluded_candidates)? { debug!("Number of candidates found {}", candidates.len()); @@ -247,7 +248,7 @@ impl<'a> Search<'a> { let excluded = take(&mut excluded_candidates); let mut candidates = distinct.distinct(candidates, excluded); - initial_candidates |= bucket_candidates; + initial_candidates |= ic; if offset != 0 { let discarded = candidates.by_ref().take(offset).count(); @@ -265,9 +266,11 @@ impl<'a> Search<'a> { } } + initial_candidates.map_inplace(|c| c - excluded_candidates); + Ok(SearchResult { matching_words, - candidates: initial_candidates - excluded_candidates, + candidates: initial_candidates.into_inner(), documents_ids, }) }