From 5a904cf29d77ccb7bbeda88373f9017f9c0e388c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 31 Aug 2022 07:50:18 +0200 Subject: [PATCH] Reintroduce facet distribution functionality --- milli/src/search/facet/facet_distribution.rs | 132 +++++++++--------- .../search/facet/facet_distribution_iter.rs | 4 +- milli/src/search/facet/mod.rs | 6 +- 3 files changed, 72 insertions(+), 70 deletions(-) diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index fddf93d4b..670719a9b 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -1,13 +1,18 @@ use std::collections::{BTreeMap, HashSet}; -use std::ops::Bound::Unbounded; +use std::ops::ControlFlow; use std::{fmt, mem}; use heed::types::ByteSlice; +use heed::BytesDecode; use roaring::RoaringBitmap; use crate::error::UserError; use crate::facet::FacetType; +use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; +use crate::heed_codec::facet::new::str_ref::StrRefCodec; +use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice}; use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec}; +use crate::search::facet::facet_distribution_iter; // use crate::search::facet::FacetStringIter; use crate::{FieldId, Index, Result}; @@ -131,22 +136,21 @@ impl<'a> FacetDistribution<'a> { candidates: &RoaringBitmap, distribution: &mut BTreeMap, ) -> heed::Result<()> { - todo!() - // let iter = - // FacetNumberIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?; - - // for result in iter { - // let (value, mut docids) = result?; - // docids &= candidates; - // if !docids.is_empty() { - // distribution.insert(value.to_string(), docids.len()); - // } - // if distribution.len() == self.max_values_per_facet { - // break; - // } - // } - - // Ok(()) + facet_distribution_iter::iterate_over_facet_distribution( + self.rtxn, + &self.index.facet_id_f64_docids.remap_key_type::>(), + field_id, + candidates, + |facet_key, nbr_docids| { + let facet_key = OrderedF64Codec::bytes_decode(facet_key).unwrap(); + distribution.insert(facet_key.to_string(), nbr_docids); + if distribution.len() == self.max_values_per_facet { + ControlFlow::Break(()) + } else { + ControlFlow::Continue(()) + } + }, + ) } fn facet_strings_distribution_from_facet_levels( @@ -155,22 +159,21 @@ impl<'a> FacetDistribution<'a> { candidates: &RoaringBitmap, distribution: &mut BTreeMap, ) -> heed::Result<()> { - todo!() - // let iter = - // FacetStringIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?; - - // for result in iter { - // let (_normalized, original, mut docids) = result?; - // docids &= candidates; - // if !docids.is_empty() { - // distribution.insert(original.to_string(), docids.len()); - // } - // if distribution.len() == self.max_values_per_facet { - // break; - // } - // } - - // Ok(()) + facet_distribution_iter::iterate_over_facet_distribution( + self.rtxn, + &self.index.facet_id_string_docids.remap_key_type::>(), + field_id, + candidates, + |facet_key, nbr_docids| { + let facet_key = StrRefCodec::bytes_decode(facet_key).unwrap(); + distribution.insert(facet_key.to_string(), nbr_docids); + if distribution.len() == self.max_values_per_facet { + ControlFlow::Break(()) + } else { + ControlFlow::Continue(()) + } + }, + ) } /// Placeholder search, a.k.a. no candidates were specified. We iterate throught the @@ -179,43 +182,42 @@ impl<'a> FacetDistribution<'a> { &self, field_id: FieldId, ) -> heed::Result> { - todo!() - // let mut distribution = BTreeMap::new(); + let mut distribution = BTreeMap::new(); - // let db = self.index.facet_id_f64_docids; - // let range = FacetNumberRange::new(self.rtxn, db, field_id, 0, Unbounded, Unbounded)?; + let db = self.index.facet_id_f64_docids; + let mut prefix = vec![]; + prefix.extend_from_slice(&field_id.to_be_bytes()); + prefix.push(0); + let iter = db + .as_polymorph() + .prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, prefix.as_slice())? + .remap_types::, FacetGroupValueCodec>(); - // for result in range { - // let ((_, _, value, _), docids) = result?; - // distribution.insert(value.to_string(), docids.len()); - // if distribution.len() == self.max_values_per_facet { - // break; - // } - // } + for result in iter { + let (key, value) = result?; + distribution.insert(key.left_bound.to_string(), value.bitmap.len()); + if distribution.len() == self.max_values_per_facet { + break; + } + } - // let iter = self - // .index - // .facet_id_string_docids - // .remap_key_type::() - // .prefix_iter(self.rtxn, &field_id.to_be_bytes())? - // .remap_key_type::(); + let iter = self + .index + .facet_id_string_docids + .as_polymorph() + .prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, prefix.as_slice())? + .remap_types::, FacetGroupValueCodec>(); - // let mut normalized_distribution = BTreeMap::new(); - // for result in iter { - // let ((_, normalized_value), group_value) = result?; - // normalized_distribution - // .insert(normalized_value, (normalized_value, group_value.bitmap.len())); - // if normalized_distribution.len() == self.max_values_per_facet { - // break; - // } - // } + // TODO: get the original value of the facet somewhere (in the documents DB?) + for result in iter { + let (key, value) = result?; + distribution.insert(key.left_bound.to_owned(), value.bitmap.len()); + if distribution.len() == self.max_values_per_facet { + break; + } + } - // let iter = normalized_distribution - // .into_iter() - // .map(|(_normalized, (original, count))| (original.to_string(), count)); - // distribution.extend(iter); - - // Ok(distribution) + Ok(distribution) } fn facet_values(&self, field_id: FieldId) -> heed::Result> { diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index 83079028c..9f1031a85 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -1,5 +1,5 @@ use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice}; -use crate::Result; +use heed::Result; use roaring::RoaringBitmap; use std::ops::ControlFlow; @@ -20,7 +20,7 @@ where get_highest_level(rtxn, &db.remap_key_type::>(), field_id)?; if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { - fd.iterate(candidates, highest_level, first_bound, usize::MAX); + fd.iterate(candidates, highest_level, first_bound, usize::MAX)?; return Ok(()); } else { return Ok(()); diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index d27206af2..023d433ad 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -20,7 +20,7 @@ pub(crate) fn get_first_facet_value<'t, BoundCodec>( txn: &'t RoTxn, db: &'t heed::Database, FacetGroupValueCodec>, field_id: u16, -) -> crate::Result> +) -> heed::Result> where BoundCodec: BytesDecode<'t>, { @@ -42,7 +42,7 @@ pub(crate) fn get_last_facet_value<'t, BoundCodec>( txn: &'t RoTxn, db: &'t heed::Database, FacetGroupValueCodec>, field_id: u16, -) -> crate::Result> +) -> heed::Result> where BoundCodec: BytesDecode<'t>, { @@ -65,7 +65,7 @@ pub(crate) fn get_highest_level<'t>( txn: &'t RoTxn<'t>, db: &'t heed::Database, FacetGroupValueCodec>, field_id: u16, -) -> crate::Result { +) -> heed::Result { let field_id_prefix = &field_id.to_be_bytes(); Ok(db .as_polymorph()