diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index f6ef51ccd..3d277013a 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -9,12 +9,14 @@ use roaring::RoaringBitmap; use crate::error::UserError; use crate::facet::FacetType; use crate::heed_codec::facet::{ - FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, - OrderedF64Codec, + FacetGroupKeyCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, OrderedF64Codec, }; use crate::heed_codec::{ByteSliceRefCodec, StrRefCodec}; use crate::search::facet::facet_distribution_iter; use crate::{FieldId, Index, Result}; +use facet_distribution_iter::{ + count_iterate_over_facet_distribution, lexicographically_iterate_over_facet_distribution, +}; /// The default number of values by facets that will /// be fetched from the key-value store. @@ -24,10 +26,20 @@ pub const DEFAULT_VALUES_PER_FACET: usize = 100; /// the system to choose between one algorithm or another. const CANDIDATES_THRESHOLD: u64 = 3000; +/// How should we fetch the facets? +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum OrderBy { + /// By lexicographic order... + Lexicographic, + /// Or by number of docids in common? + Count, +} + pub struct FacetDistribution<'a> { facets: Option>, candidates: Option, max_values_per_facet: usize, + order_by: OrderBy, rtxn: &'a heed::RoTxn<'a>, index: &'a Index, } @@ -38,6 +50,7 @@ impl<'a> FacetDistribution<'a> { facets: None, candidates: None, max_values_per_facet: DEFAULT_VALUES_PER_FACET, + order_by: OrderBy::Count, rtxn, index, } @@ -53,6 +66,11 @@ impl<'a> FacetDistribution<'a> { self } + pub fn order_by(&mut self, order_by: OrderBy) -> &mut Self { + self.order_by = order_by; + self + } + pub fn candidates(&mut self, candidates: RoaringBitmap) -> &mut Self { self.candidates = Some(candidates); self @@ -134,9 +152,15 @@ impl<'a> FacetDistribution<'a> { &self, field_id: FieldId, candidates: &RoaringBitmap, + order_by: OrderBy, distribution: &mut BTreeMap, ) -> heed::Result<()> { - facet_distribution_iter::lexicographically_iterate_over_facet_distribution( + let search_function = match order_by { + OrderBy::Lexicographic => lexicographically_iterate_over_facet_distribution, + OrderBy::Count => count_iterate_over_facet_distribution, + }; + + search_function( self.rtxn, self.index .facet_id_f64_docids @@ -159,9 +183,15 @@ impl<'a> FacetDistribution<'a> { &self, field_id: FieldId, candidates: &RoaringBitmap, + order_by: OrderBy, distribution: &mut BTreeMap, ) -> heed::Result<()> { - facet_distribution_iter::lexicographically_iterate_over_facet_distribution( + let search_function = match order_by { + OrderBy::Lexicographic => lexicographically_iterate_over_facet_distribution, + OrderBy::Count => count_iterate_over_facet_distribution, + }; + + search_function( self.rtxn, self.index .facet_id_string_docids @@ -189,98 +219,42 @@ impl<'a> FacetDistribution<'a> { ) } - /// Placeholder search, a.k.a. no candidates were specified. We iterate throught the - /// facet values one by one and iterate on the facet level 0 for numbers. - fn facet_values_from_raw_facet_database( - &self, - field_id: FieldId, - ) -> heed::Result> { - let mut distribution = BTreeMap::new(); - - let db = self.index.facet_id_f64_docids; - let mut prefix = vec![]; - prefix.extend_from_slice(&field_id.to_be_bytes()); - prefix.push(0); // read values from level 0 only - - let iter = db - .as_polymorph() - .prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, prefix.as_slice())? - .remap_types::, FacetGroupValueCodec>(); - - for result in iter { - let (key, value) = result?; - distribution.insert(key.left_bound.to_string(), value.bitmap.len()); - if distribution.len() == self.max_values_per_facet { - break; - } - } - - let iter = self - .index - .facet_id_string_docids - .as_polymorph() - .prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, prefix.as_slice())? - .remap_types::, FacetGroupValueCodec>(); - - for result in iter { - let (key, value) = result?; - - let docid = value.bitmap.iter().next().unwrap(); - let key: (FieldId, _, &'a str) = (field_id, docid, key.left_bound); - let original_string = - self.index.field_id_docid_facet_strings.get(self.rtxn, &key)?.unwrap().to_owned(); - - distribution.insert(original_string, value.bitmap.len()); - if distribution.len() == self.max_values_per_facet { - break; - } - } - - Ok(distribution) - } - fn facet_values(&self, field_id: FieldId) -> heed::Result> { - // use FacetType::{Number, String}; - - let candidates = match self.candidates.as_ref() { - Some(candidates) => candidates.clone(), - None => todo!("fetch candidates"), - }; + use FacetType::{Number, String}; let mut distribution = BTreeMap::new(); + match (self.order_by, &self.candidates) { + (OrderBy::Lexicographic, Some(cnd)) if cnd.len() <= CANDIDATES_THRESHOLD => { + // Classic search, candidates were specified, we must return facet values only related + // to those candidates. We also enter here for facet strings for performance reasons. + self.facet_distribution_from_documents(field_id, Number, cnd, &mut distribution)?; + self.facet_distribution_from_documents(field_id, String, cnd, &mut distribution)?; + } + _ => { + let universe; + let candidates; + match &self.candidates { + Some(cnd) => candidates = cnd, + None => { + universe = self.index.documents_ids(self.rtxn)?; + candidates = &universe; + } + } - let number_distribution = facet_distribution_iter::count_iterate_over_facet_distribution( - self.rtxn, - self.index - .facet_id_f64_docids - .remap_key_type::>(), - field_id, - &candidates, - )?; - - for (count, facet_key, _) in number_distribution { - let facet_key = OrderedF64Codec::bytes_decode(facet_key).unwrap(); - distribution.insert(facet_key.to_string(), count); - } - - let string_distribution = facet_distribution_iter::count_iterate_over_facet_distribution( - self.rtxn, - self.index - .facet_id_string_docids - .remap_key_type::>(), - field_id, - &candidates, - )?; - - for (count, facet_key, any_docid) in string_distribution { - let facet_key = StrRefCodec::bytes_decode(facet_key).unwrap(); - - let key: (FieldId, _, &str) = (field_id, any_docid, facet_key); - let original_string = - self.index.field_id_docid_facet_strings.get(self.rtxn, &key)?.unwrap().to_owned(); - - distribution.insert(original_string, count); - } + self.facet_numbers_distribution_from_facet_levels( + field_id, + candidates, + self.order_by, + &mut distribution, + )?; + self.facet_strings_distribution_from_facet_levels( + field_id, + candidates, + self.order_by, + &mut distribution, + )?; + } + }; Ok(distribution) } @@ -381,13 +355,20 @@ impl<'a> FacetDistribution<'a> { impl fmt::Debug for FacetDistribution<'_> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let FacetDistribution { facets, candidates, max_values_per_facet, rtxn: _, index: _ } = - self; + let FacetDistribution { + facets, + candidates, + max_values_per_facet, + order_by, + rtxn: _, + index: _, + } = self; f.debug_struct("FacetDistribution") .field("facets", facets) .field("candidates", candidates) .field("max_values_per_facet", max_values_per_facet) + .field("order_by", order_by) .finish() } } diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index acd936eff..9ff57b34a 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -46,12 +46,16 @@ where } } -pub fn count_iterate_over_facet_distribution<'t>( +pub fn count_iterate_over_facet_distribution<'t, CB>( rtxn: &'t heed::RoTxn<'t>, db: heed::Database, FacetGroupValueCodec>, field_id: u16, candidates: &RoaringBitmap, -) -> Result> { + mut callback: CB, +) -> Result<()> +where + CB: FnMut(&'t [u8], u64, DocumentId) -> Result>, +{ #[derive(Debug, PartialOrd, Ord, PartialEq, Eq)] struct LevelEntry<'t> { /// The number of candidates in this entry. @@ -68,8 +72,6 @@ pub fn count_iterate_over_facet_distribution<'t>( // Represents the list of keys that we must explore. let mut heap = BinaryHeap::new(); - let mut results = Vec::new(); - let highest_level = get_highest_level( rtxn, db.remap_key_type::>(), @@ -103,10 +105,9 @@ pub fn count_iterate_over_facet_distribution<'t>( while let Some(LevelEntry { count, level, left_bound, group_size, any_docid }) = heap.pop() { if let Reverse(0) = level { - results.push((count, left_bound, any_docid)); - // TODO better just call the user callback and ask for a ControlFlow - if results.len() == 20 { - break; + match (callback)(left_bound, count, any_docid)? { + ControlFlow::Continue(_) => (), + ControlFlow::Break(_) => return Ok(()), } } else { let starting_key = @@ -132,11 +133,9 @@ pub fn count_iterate_over_facet_distribution<'t>( } } } - - Ok(results) - } else { - Ok(Default::default()) } + + Ok(()) } /// Iterate over the facets values by lexicographic order.