From 8fdf860c17efd131504beba7c8ecfdd3b20335fb Mon Sep 17 00:00:00 2001 From: many Date: Thu, 12 Aug 2021 11:29:20 +0200 Subject: [PATCH 1/2] Remove max values by facet limit for facet distribution --- milli/src/search/facet/facet_distribution.rs | 47 ++------------------ 1 file changed, 3 insertions(+), 44 deletions(-) diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index bfbea76c3..91bf21cf7 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -1,6 +1,6 @@ use std::collections::{BTreeMap, HashSet}; use std::ops::Bound::Unbounded; -use std::{cmp, fmt, mem}; +use std::{fmt, mem}; use heed::types::ByteSlice; use roaring::RoaringBitmap; @@ -13,14 +13,6 @@ use crate::heed_codec::facet::{ use crate::search::facet::{FacetNumberIter, FacetNumberRange, FacetStringIter}; use crate::{FieldId, Index, Result}; -/// The default number of values by facets that will -/// be fetched from the key-value store. -const DEFAULT_VALUES_BY_FACET: usize = 100; - -/// The hard limit in the number of values by facets that will be fetched from -/// the key-value store. Searching for more values could slow down the engine. -const MAX_VALUES_BY_FACET: usize = 1000; - /// Threshold on the number of candidates that will make /// the system to choose between one algorithm or another. const CANDIDATES_THRESHOLD: u64 = 3000; @@ -28,20 +20,13 @@ const CANDIDATES_THRESHOLD: u64 = 3000; pub struct FacetDistribution<'a> { facets: Option>, candidates: Option, - max_values_by_facet: usize, rtxn: &'a heed::RoTxn<'a>, index: &'a Index, } impl<'a> FacetDistribution<'a> { pub fn new(rtxn: &'a heed::RoTxn, index: &'a Index) -> FacetDistribution<'a> { - FacetDistribution { - facets: None, - candidates: None, - max_values_by_facet: DEFAULT_VALUES_BY_FACET, - rtxn, - index, - } + FacetDistribution { facets: None, candidates: None, rtxn, index } } pub fn facets, A: AsRef>(&mut self, names: I) -> &mut Self { @@ -54,11 +39,6 @@ impl<'a> FacetDistribution<'a> { self } - pub fn max_values_by_facet(&mut self, max: usize) -> &mut Self { - self.max_values_by_facet = cmp::min(max, MAX_VALUES_BY_FACET); - self - } - /// There is a small amount of candidates OR we ask for facet string values so we /// decide to iterate over the facet values of each one of them, one by one. fn facet_distribution_from_documents( @@ -72,7 +52,6 @@ impl<'a> FacetDistribution<'a> { FacetType::Number => { let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect(); - let distribution_prelength = distribution.len(); let db = self.index.field_id_docid_facet_f64s; for docid in candidates.into_iter() { key_buffer.truncate(mem::size_of::()); @@ -85,9 +64,6 @@ impl<'a> FacetDistribution<'a> { for result in iter { let ((_, _, value), ()) = result?; *distribution.entry(value.to_string()).or_insert(0) += 1; - if distribution.len() - distribution_prelength == self.max_values_by_facet { - break; - } } } } @@ -110,10 +86,6 @@ impl<'a> FacetDistribution<'a> { .entry(normalized_value) .or_insert_with(|| (original_value, 0)); *count += 1; - - if normalized_distribution.len() == self.max_values_by_facet { - break; - } } } @@ -144,9 +116,6 @@ impl<'a> FacetDistribution<'a> { if !docids.is_empty() { distribution.insert(value.to_string(), docids.len()); } - if distribution.len() == self.max_values_by_facet { - break; - } } Ok(()) @@ -167,9 +136,6 @@ impl<'a> FacetDistribution<'a> { if !docids.is_empty() { distribution.insert(original.to_string(), docids.len()); } - if distribution.len() == self.max_values_by_facet { - break; - } } Ok(()) @@ -189,9 +155,6 @@ impl<'a> FacetDistribution<'a> { for result in range { let ((_, _, value, _), docids) = result?; distribution.insert(value.to_string(), docids.len()); - if distribution.len() == self.max_values_by_facet { - break; - } } let iter = self @@ -205,9 +168,6 @@ impl<'a> FacetDistribution<'a> { for result in iter { let ((_, normalized_value), (original_value, docids)) = result?; normalized_distribution.insert(normalized_value, (original_value, docids.len())); - if distribution.len() == self.max_values_by_facet { - break; - } } let iter = normalized_distribution @@ -289,12 +249,11 @@ impl<'a> FacetDistribution<'a> { impl fmt::Debug for FacetDistribution<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let FacetDistribution { facets, candidates, max_values_by_facet, rtxn: _, index: _ } = self; + let FacetDistribution { facets, candidates, rtxn: _, index: _ } = self; f.debug_struct("FacetDistribution") .field("facets", facets) .field("candidates", candidates) - .field("max_values_by_facet", max_values_by_facet) .finish() } } From 7dbefae1e39d20fb5183ecc1651da063ed2e5d97 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 12 Aug 2021 17:23:39 +0200 Subject: [PATCH 2/2] Make facet string iterator non reducing --- milli/src/search/facet/facet_string.rs | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index 40ea8c04a..ed5322607 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -289,6 +289,7 @@ pub struct FacetStringIter<'t> { field_id: FieldId, level_iters: Vec<(RoaringBitmap, Either, FacetStringLevelZeroRange<'t>>)>, + must_reduce: bool, } impl<'t> FacetStringIter<'t> { @@ -318,7 +319,13 @@ impl<'t> FacetStringIter<'t> { )?), }; - Ok(FacetStringIter { rtxn, db, field_id, level_iters: vec![(documents_ids, highest_iter)] }) + Ok(FacetStringIter { + rtxn, + db, + field_id, + level_iters: vec![(documents_ids, highest_iter)], + must_reduce: false, + }) } fn highest_level( @@ -348,7 +355,9 @@ impl<'t> Iterator for FacetStringIter<'t> { Ok(((level, left, right), (string_bounds, mut docids))) => { docids &= &*documents_ids; if !docids.is_empty() { - *documents_ids -= &docids; + if self.must_reduce { + *documents_ids -= &docids; + } let result = match string_bounds { Some((left, right)) => FacetStringLevelZeroRange::new( @@ -390,7 +399,9 @@ impl<'t> Iterator for FacetStringIter<'t> { Ok((normalized, original, mut docids)) => { docids &= &*documents_ids; if !docids.is_empty() { - *documents_ids -= &docids; + if self.must_reduce { + *documents_ids -= &docids; + } return Some(Ok((normalized, original, docids))); } }