diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs new file mode 100644 index 000000000..2dfe3580f --- /dev/null +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -0,0 +1,199 @@ +use roaring::RoaringBitmap; +use std::ops::ControlFlow; + +use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice}; + +use super::{get_first_facet_value, get_highest_level}; + +pub fn iterate_over_facet_distribution<'t, CB>( + rtxn: &'t heed::RoTxn<'t>, + db: &'t heed::Database, FacetGroupValueCodec>, + field_id: u16, + candidates: &RoaringBitmap, + callback: CB, +) where + CB: FnMut(&'t [u8], u64) -> ControlFlow<()>, +{ + let mut fd = FacetDistribution { rtxn, db, field_id, callback }; + let highest_level = + get_highest_level(rtxn, &db.remap_key_type::>(), field_id); + + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id) { + fd.iterate(candidates, highest_level, first_bound, usize::MAX); + return; + } else { + return; + } +} + +struct FacetDistribution<'t, CB> +where + CB: FnMut(&'t [u8], u64) -> ControlFlow<()>, +{ + rtxn: &'t heed::RoTxn<'t>, + db: &'t heed::Database, FacetGroupValueCodec>, + field_id: u16, + callback: CB, +} + +impl<'t, CB> FacetDistribution<'t, CB> +where + CB: FnMut(&'t [u8], u64) -> ControlFlow<()>, +{ + fn iterate_level_0( + &mut self, + candidates: &RoaringBitmap, + starting_bound: &'t [u8], + group_size: usize, + ) -> ControlFlow<()> { + let starting_key = + FacetKey { field_id: self.field_id, level: 0, left_bound: starting_bound }; + let iter = self.db.range(self.rtxn, &(starting_key..)).unwrap().take(group_size); + for el in iter { + let (key, value) = el.unwrap(); + // The range is unbounded on the right and the group size for the highest level is MAX, + // so we need to check that we are not iterating over the next field id + if key.field_id != self.field_id { + return ControlFlow::Break(()); + } + let docids_in_common = value.bitmap.intersection_len(candidates); + if docids_in_common > 0 { + match (self.callback)(key.left_bound, docids_in_common) { + ControlFlow::Continue(_) => {} + ControlFlow::Break(_) => return ControlFlow::Break(()), + } + } + } + return ControlFlow::Continue(()); + } + fn iterate( + &mut self, + candidates: &RoaringBitmap, + level: u8, + starting_bound: &'t [u8], + group_size: usize, + ) -> ControlFlow<()> { + if level == 0 { + return self.iterate_level_0(candidates, starting_bound, group_size); + } + let starting_key = FacetKey { field_id: self.field_id, level, left_bound: starting_bound }; + let iter = self.db.range(&self.rtxn, &(&starting_key..)).unwrap().take(group_size); + + for el in iter { + let (key, value) = el.unwrap(); + // The range is unbounded on the right and the group size for the highest level is MAX, + // so we need to check that we are not iterating over the next field id + if key.field_id != self.field_id { + return ControlFlow::Break(()); + } + let docids_in_common = value.bitmap & candidates; + if docids_in_common.len() > 0 { + let cf = + self.iterate(&docids_in_common, level - 1, key.left_bound, value.size as usize); + match cf { + ControlFlow::Continue(_) => {} + ControlFlow::Break(_) => return ControlFlow::Break(()), + } + } + } + + return ControlFlow::Continue(()); + } +} + +#[cfg(test)] +mod tests { + use crate::{codec::U16Codec, Index}; + use heed::BytesDecode; + use roaring::RoaringBitmap; + use std::ops::ControlFlow; + + use super::iterate_over_facet_distribution; + + fn get_simple_index() -> Index { + let index = Index::::new(4, 8); + let mut txn = index.env.write_txn().unwrap(); + for i in 0..256u16 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + index.insert(&mut txn, 0, &i, &bitmap); + } + txn.commit().unwrap(); + index + } + fn get_random_looking_index() -> Index { + let index = Index::::new(4, 8); + let mut txn = index.env.write_txn().unwrap(); + + let rng = fastrand::Rng::with_seed(0); + let keys = std::iter::from_fn(|| Some(rng.u32(..256))).take(128).collect::>(); + + for (_i, key) in keys.into_iter().enumerate() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(key); + bitmap.insert(key + 100); + index.insert(&mut txn, 0, &(key as u16), &bitmap); + } + txn.commit().unwrap(); + index + } + + #[test] + fn random_looking_index_snap() { + let index = get_random_looking_index(); + insta::assert_display_snapshot!(index) + } + #[test] + fn filter_distribution_all() { + let indexes = [get_simple_index(), get_random_looking_index()]; + for (i, index) in indexes.into_iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = (0..=255).into_iter().collect::(); + let mut results = String::new(); + iterate_over_facet_distribution( + &txn, + &index.db.content, + 0, + &candidates, + |facet, count| { + let facet = U16Codec::bytes_decode(facet).unwrap(); + results.push_str(&format!("{facet}: {count}\n")); + ControlFlow::Continue(()) + }, + ); + insta::assert_snapshot!(format!("filter_distribution_{i}_all"), results); + + txn.commit().unwrap(); + } + } + #[test] + fn filter_distribution_all_stop_early() { + let indexes = [get_simple_index(), get_random_looking_index()]; + for (i, index) in indexes.into_iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = (0..=255).into_iter().collect::(); + let mut results = String::new(); + let mut nbr_facets = 0; + iterate_over_facet_distribution( + &txn, + &index.db.content, + 0, + &candidates, + |facet, count| { + let facet = U16Codec::bytes_decode(facet).unwrap(); + if nbr_facets == 100 { + return ControlFlow::Break(()); + } else { + nbr_facets += 1; + results.push_str(&format!("{facet}: {count}\n")); + + ControlFlow::Continue(()) + } + }, + ); + insta::assert_snapshot!(format!("filter_distribution_{i}_all_stop_early"), results); + + txn.commit().unwrap(); + } + } +} diff --git a/milli/src/search/facet/facet_number.rs b/milli/src/search/facet/facet_number.rs deleted file mode 100644 index 5f7bd5325..000000000 --- a/milli/src/search/facet/facet_number.rs +++ /dev/null @@ -1,335 +0,0 @@ -// use std::ops::Bound::{self, Excluded, Included, Unbounded}; - -// use either::Either::{self, Left, Right}; -// use heed::types::{ByteSlice, DecodeIgnore}; -// use heed::{BytesDecode, BytesEncode, Database, Lazy, LazyDecode, RoRange, RoRevRange}; -// use obkv::Key; -// use roaring::RoaringBitmap; - -// use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; -// use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec}; -// use crate::heed_codec::CboRoaringBitmapCodec; -// use crate::{FieldId, Index}; - -// pub struct FacetNumberRange<'t, 'e> { -// rtxn: &'t heed::RoTxn<'e>, -// db: Database, FacetGroupValueCodec>, -// iter: RoRange<'t, FacetKeyCodec, LazyDecode>, -// max_bound: f64, -// previous: Option<(FacetKey, Lazy<'t, FacetGroupValueCodec>)>, -// field_id: FieldId, -// end: Bound, -// } - -// impl<'t, 'e> FacetNumberRange<'t, 'e> { -// pub fn new( -// rtxn: &'t heed::RoTxn<'e>, -// db: Database, FacetGroupValueCodec>, -// field_id: FieldId, -// level: u8, -// left: Bound, -// right: Bound, -// ) -> heed::Result> { -// let left_bound = match left { -// Included(left_bound) => Included(FacetKey { field_id, level, left_bound }), -// Excluded(left_bound) => Excluded(FacetKey { field_id, level, left_bound }), -// Unbounded => Included(FacetKey { field_id, level, left_bound: f64::MIN }), -// }; - -// let mut iter = db.lazily_decode_data().range(rtxn, &(left_bound, Unbounded))?; -// let mut previous = iter.next().transpose()?; - -// // Compute the maximum end bound by looking at the key of the last element in level 0 -// let mut prefix_level_0 = vec![]; -// prefix_level_0.extend_from_slice(&field_id.to_be_bytes()); -// prefix_level_0.push(level); - -// let mut rev_iter = -// db.as_polymorph().rev_prefix_iter::<_, ByteSlice, ByteSlice>(rtxn, &prefix_level_0)?; - -// let rev_iter_first = rev_iter.next().transpose()?; -// let max_bound = if let Some((max_bound_key, _)) = rev_iter_first { -// let max_bound_key = -// FacetKeyCodec::::bytes_decode(max_bound_key).unwrap(); -// max_bound_key.left_bound -// } else { -// // I can't imagine when that would happen, but let's handle it correctly anyway -// // by making the iterator empty -// previous = None; -// 0.0 // doesn't matter since previous = None so the iterator will always early exit -// // and return None itself -// }; - -// Ok(FacetNumberRange { rtxn, db, iter, field_id, previous, max_bound, end: right }) -// } -// } - -// impl<'t, 'e> Iterator for FacetNumberRange<'t, 'e> { -// type Item = heed::Result<(FacetKey, RoaringBitmap)>; - -// fn next(&mut self) -> Option { -// // The idea here is to return the **previous** element only if the left -// // bound of the current key fits within the range given to the iter -// // if it doesn't, then there is still a chance that it must be returned, -// // but we need to check the actual right bound of the group by looking for -// // the key preceding the first key of the next group in level 0 - -// let (prev_key, prev_value) = self.previous?; - -// let (next_left_bound, next_previous) = if let Some(next) = self.iter.next() { -// let (key, group_value) = match next { -// Ok(n) => n, -// Err(e) => return Some(Err(e)), -// }; -// (key.left_bound, Some((key, group_value))) -// } else { -// // we're at the end of the level iter, so we need to fetch the max bound instead -// (self.max_bound, None) -// }; -// let must_be_returned = match self.end { -// Included(end) => next_left_bound <= end, -// Excluded(end) => next_left_bound < end, -// Unbounded => true, -// }; -// if must_be_returned { -// match prev_value.decode() { -// Ok(group_value) => { -// self.previous = next_previous; -// Some(Ok((prev_key, group_value.bitmap))) -// } -// Err(e) => Some(Err(e)), -// } -// } else { -// // it still possible that we want to return the value (one last time) -// // but to do so, we need to fetch the right bound of the current group -// // this is done by getting the first element at level 0 of the next group -// // then iterating in reverse from it -// // once we have the right bound, we can compare it, and then return or not -// // then we still set self.previous to None so that no other element can return -// // from it? -// let mut level_0_key_prefix = vec![]; -// level_0_key_prefix.extend_from_slice(&self.field_id.to_be_bytes()); -// level_0_key_prefix.push(0); -// let key = -// FacetKey:: { field_id: self.field_id, level: 0, left_bound: next_left_bound }; -// let key_bytes = FacetKeyCodec::::bytes_encode(&key).unwrap(); -// level_0_key_prefix.extend_from_slice(&key_bytes); - -// let mut rev_iter_next_group_level_0 = self -// .db -// .as_polymorph() -// .rev_prefix_iter::<_, ByteSlice, ByteSlice>(&self.rtxn, &level_0_key_prefix) -// .unwrap(); -// let (key_for_right_bound, _) = rev_iter_next_group_level_0.next().unwrap().unwrap(); -// let key_for_right_bound = -// FacetKeyCodec::::bytes_decode(key_for_right_bound).unwrap(); -// let right_bound = key_for_right_bound.left_bound; -// let must_be_returned = match self.end { -// Included(end) => right_bound <= end, -// Excluded(end) => right_bound < end, -// Unbounded => unreachable!(), -// }; -// self.previous = None; -// if must_be_returned { -// match prev_value.decode() { -// Ok(group_value) => Some(Ok((prev_key, group_value.bitmap))), -// Err(e) => Some(Err(e)), -// } -// } else { -// None -// } -// } -// } -// } - -// pub struct FacetNumberRevRange<'t> { -// iter: RoRevRange<'t, FacetKeyCodec, LazyDecode>, -// end: Bound, -// } - -// impl<'t> FacetNumberRevRange<'t> { -// pub fn new( -// rtxn: &'t heed::RoTxn, -// db: Database, FacetGroupValueCodec>, -// field_id: FieldId, -// level: u8, -// left: Bound, -// right: Bound, -// ) -> heed::Result> { -// let left_bound = match left { -// Included(left) => Included(FacetKey { field_id, level, left_bound: left }), -// Excluded(left) => Excluded(FacetKey { field_id, level, left_bound: left }), -// Unbounded => Included(FacetKey { field_id, level, left_bound: f64::MIN }), -// }; -// let right_bound = Included(FacetKey { field_id, level, left_bound: f64::MAX }); -// let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?; -// Ok(FacetNumberRevRange { iter, end: right }) -// } -// } - -// impl<'t> Iterator for FacetNumberRevRange<'t> { -// type Item = heed::Result<(FacetKey, RoaringBitmap)>; - -// fn next(&mut self) -> Option { -// loop { -// match self.iter.next() { -// Some(Ok((FacetKey { field_id, level, left_bound }, docids))) => { -// let must_be_returned = match self.end { -// Included(end) => todo!(), //right <= end, -// Excluded(end) => todo!(), //right < end, -// Unbounded => true, -// }; -// if must_be_returned { -// match docids.decode() { -// Ok(docids) => { -// return Some(Ok(( -// FacetKey { field_id, level, left_bound }, -// docids.bitmap, -// ))) -// } -// Err(e) => return Some(Err(e)), -// } -// } -// continue; -// } -// Some(Err(e)) => return Some(Err(e)), -// None => return None, -// } -// } -// } -// } - -// pub struct FacetNumberIter<'t, 'e> { -// rtxn: &'t heed::RoTxn<'t>, -// db: Database, FacetGroupValueCodec>, -// field_id: FieldId, -// level_iters: Vec<(RoaringBitmap, Either, FacetNumberRevRange<'t>>)>, -// must_reduce: bool, -// } - -// impl<'t, 'e> FacetNumberIter<'t, 'e> { -// /// Create a `FacetNumberIter` that will iterate on the different facet entries -// /// (facet value + documents ids) and that will reduce the given documents ids -// /// while iterating on the different facet levels. -// pub fn new_reducing( -// rtxn: &'t heed::RoTxn<'e>, -// index: &'t Index, -// field_id: FieldId, -// documents_ids: RoaringBitmap, -// ) -> heed::Result> { -// let db = index.facet_id_f64_docids; -// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); -// let highest_iter = -// FacetNumberRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; -// let level_iters = vec![(documents_ids, Left(highest_iter))]; -// Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: true }) -// } - -// /// Create a `FacetNumberIter` that will iterate on the different facet entries in reverse -// /// (facet value + documents ids) and that will reduce the given documents ids -// /// while iterating on the different facet levels. -// pub fn new_reverse_reducing( -// rtxn: &'t heed::RoTxn<'e>, -// index: &'t Index, -// field_id: FieldId, -// documents_ids: RoaringBitmap, -// ) -> heed::Result> { -// let db = index.facet_id_f64_docids; -// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); -// let highest_iter = -// FacetNumberRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; -// let level_iters = vec![(documents_ids, Right(highest_iter))]; -// Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: true }) -// } - -// /// Create a `FacetNumberIter` that will iterate on the different facet entries -// /// (facet value + documents ids) and that will not reduce the given documents ids -// /// while iterating on the different facet levels, possibly returning multiple times -// /// a document id associated with multiple facet values. -// pub fn new_non_reducing( -// rtxn: &'t heed::RoTxn<'e>, -// index: &'t Index, -// field_id: FieldId, -// documents_ids: RoaringBitmap, -// ) -> heed::Result> { -// let db = index.facet_id_f64_docids; -// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); -// let highest_iter = -// FacetNumberRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; -// let level_iters = vec![(documents_ids, Left(highest_iter))]; -// Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: false }) -// } - -// fn highest_level( -// rtxn: &'t heed::RoTxn, -// db: Database, X>, -// fid: FieldId, -// ) -> heed::Result> { -// let level = db -// .remap_types::() -// .prefix_iter(rtxn, &fid.to_be_bytes())? -// .remap_key_type::>() -// .last() -// .transpose()? -// .map(|(key, _)| key.level); -// Ok(level) -// } -// } - -// impl<'t, 'e> Iterator for FacetNumberIter<'t, 'e> { -// type Item = heed::Result<(f64, RoaringBitmap)>; - -// fn next(&mut self) -> Option { -// 'outer: loop { -// let (documents_ids, last) = self.level_iters.last_mut()?; -// let is_ascending = last.is_left(); -// for result in last { -// // If the last iterator must find an empty set of documents it means -// // that we found all the documents in the sub level iterations already, -// // we can pop this level iterator. -// if documents_ids.is_empty() { -// break; -// } - -// match result { -// Ok((key, mut docids)) => { -// docids &= &*documents_ids; -// if !docids.is_empty() { -// if self.must_reduce { -// *documents_ids -= &docids; -// } - -// if level == 0 { -// return Some(Ok((left, docids))); -// } - -// let rtxn = self.rtxn; -// let db = self.db; -// let fid = self.field_id; -// let left = Included(left); -// let right = Included(right); - -// let result = if is_ascending { -// FacetNumberRange::new(rtxn, db, fid, level - 1, left, right) -// .map(Left) -// } else { -// FacetNumberRevRange::new(rtxn, db, fid, level - 1, left, right) -// .map(Right) -// }; - -// match result { -// Ok(iter) => { -// self.level_iters.push((docids, iter)); -// continue 'outer; -// } -// Err(e) => return Some(Err(e)), -// } -// } -// } -// Err(e) => return Some(Err(e)), -// } -// } -// self.level_iters.pop(); -// } -// } -// } diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs new file mode 100644 index 000000000..c9abd9556 --- /dev/null +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -0,0 +1,147 @@ +use roaring::RoaringBitmap; + +use crate::heed_codec::facet::new::{ + FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, +}; + +use super::{get_first_facet_value, get_highest_level}; + +pub fn ascending_facet_sort<'t>( + rtxn: &'t heed::RoTxn<'t>, + db: &'t heed::Database, FacetGroupValueCodec>, + field_id: u16, + candidates: RoaringBitmap, +) -> Box + 't> { + let highest_level = + get_highest_level(rtxn, &db.remap_key_type::>(), field_id); + if let Some(first_bound) = get_first_facet_value::( + rtxn, + &db.remap_key_type::>(), + field_id, + ) { + let first_key = FacetKey { field_id, level: highest_level, left_bound: first_bound }; + let iter = db.range(rtxn, &(first_key..)).unwrap().take(usize::MAX); + + Box::new(AscendingFacetSort { rtxn, db, field_id, stack: vec![(candidates, iter)] }) + } else { + return Box::new(std::iter::empty()); + } +} + +struct AscendingFacetSort<'t, 'e> { + rtxn: &'t heed::RoTxn<'e>, + db: &'t heed::Database, FacetGroupValueCodec>, + field_id: u16, + stack: Vec<( + RoaringBitmap, + std::iter::Take, FacetGroupValueCodec>>, + )>, +} + +impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> { + type Item = (&'t [u8], RoaringBitmap); + + fn next(&mut self) -> Option { + 'outer: loop { + let (documents_ids, deepest_iter) = self.stack.last_mut()?; + for result in deepest_iter { + let ( + FacetKey { level, left_bound, field_id }, + FacetGroupValue { size: group_size, mut bitmap }, + ) = result.unwrap(); + // The range is unbounded on the right and the group size for the highest level is MAX, + // so we need to check that we are not iterating over the next field id + if field_id != self.field_id { + return None; + } + + // If the last iterator found an empty set of documents it means + // that we found all the documents in the sub level iterations already, + // we can pop this level iterator. + if documents_ids.is_empty() { + break; + } + + bitmap &= &*documents_ids; + if !bitmap.is_empty() { + *documents_ids -= &bitmap; + + if level == 0 { + return Some((left_bound, bitmap)); + } + let starting_key_below = + FacetKey { field_id: self.field_id, level: level - 1, left_bound }; + let iter = self + .db + .range(&self.rtxn, &(starting_key_below..)) + .unwrap() + .take(group_size as usize); + + self.stack.push((bitmap, iter)); + continue 'outer; + } + } + self.stack.pop(); + } + } +} + +#[cfg(test)] +mod tests { + use crate::{ + ascending_facet_sort::ascending_facet_sort, codec::U16Codec, display_bitmap, Index, + }; + use heed::BytesDecode; + use roaring::RoaringBitmap; + + fn get_simple_index() -> Index { + let index = Index::::new(4, 8); + let mut txn = index.env.write_txn().unwrap(); + for i in 0..256u16 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + index.insert(&mut txn, 0, &i, &bitmap); + } + txn.commit().unwrap(); + index + } + fn get_random_looking_index() -> Index { + let index = Index::::new(4, 8); + let mut txn = index.env.write_txn().unwrap(); + + let rng = fastrand::Rng::with_seed(0); + let keys = std::iter::from_fn(|| Some(rng.u32(..256))).take(128).collect::>(); + + for (_i, key) in keys.into_iter().enumerate() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(key); + bitmap.insert(key + 100); + index.insert(&mut txn, 0, &(key as u16), &bitmap); + } + txn.commit().unwrap(); + index + } + + #[test] + fn random_looking_index_snap() { + let index = get_random_looking_index(); + insta::assert_display_snapshot!(index) + } + #[test] + fn filter_sort() { + let indexes = [get_simple_index(), get_random_looking_index()]; + for (i, index) in indexes.into_iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = (200..=300).into_iter().collect::(); + let mut results = String::new(); + let iter = ascending_facet_sort(&txn, &index.db.content, 0, candidates); + for (facet, docids) in iter { + let facet = U16Codec::bytes_decode(facet).unwrap(); + results.push_str(&format!("{facet}: {}\n", display_bitmap(&docids))); + } + insta::assert_snapshot!(format!("filter_sort_{i}_ascending"), results); + + txn.commit().unwrap(); + } + } +} diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs new file mode 100644 index 000000000..d3c9d54f8 --- /dev/null +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -0,0 +1,172 @@ +use std::ops::Bound; + +use roaring::RoaringBitmap; + +use crate::heed_codec::facet::new::{ + FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, +}; + +use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; + +fn descending_facet_sort<'t>( + rtxn: &'t heed::RoTxn<'t>, + db: &'t heed::Database, FacetGroupValueCodec>, + field_id: u16, + candidates: RoaringBitmap, +) -> Box + 't> { + let highest_level = get_highest_level(rtxn, db, field_id); + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id) { + let first_key = FacetKey { field_id, level: highest_level, left_bound: first_bound }; + let last_bound = get_last_facet_value::(rtxn, db, field_id).unwrap(); + let last_key = FacetKey { field_id, level: highest_level, left_bound: last_bound }; + let iter = db.rev_range(rtxn, &(first_key..=last_key)).unwrap().take(usize::MAX); + Box::new(DescendingFacetSort { + rtxn, + db, + field_id, + stack: vec![(candidates, iter, Bound::Included(last_bound))], + }) + } else { + return Box::new(std::iter::empty()); + } +} + +struct DescendingFacetSort<'t> { + rtxn: &'t heed::RoTxn<'t>, + db: &'t heed::Database, FacetGroupValueCodec>, + field_id: u16, + stack: Vec<( + RoaringBitmap, + std::iter::Take, FacetGroupValueCodec>>, + Bound<&'t [u8]>, + )>, +} + +impl<'t> Iterator for DescendingFacetSort<'t> { + type Item = (&'t [u8], RoaringBitmap); + + fn next(&mut self) -> Option { + 'outer: loop { + let (documents_ids, deepest_iter, right_bound) = self.stack.last_mut()?; + while let Some(result) = deepest_iter.next() { + let ( + FacetKey { level, left_bound, field_id }, + FacetGroupValue { size: group_size, mut bitmap }, + ) = result.unwrap(); + // The range is unbounded on the right and the group size for the highest level is MAX, + // so we need to check that we are not iterating over the next field id + if field_id != self.field_id { + return None; + } + // If the last iterator found an empty set of documents it means + // that we found all the documents in the sub level iterations already, + // we can pop this level iterator. + if documents_ids.is_empty() { + break; + } + + bitmap &= &*documents_ids; + if !bitmap.is_empty() { + *documents_ids -= &bitmap; + + if level == 0 { + return Some((left_bound, bitmap)); + } + let starting_key_below = FacetKey { field_id, level: level - 1, left_bound }; + + let end_key_kelow = match *right_bound { + Bound::Included(right) => Bound::Included(FacetKey { + field_id, + level: level - 1, + left_bound: right, + }), + Bound::Excluded(right) => Bound::Excluded(FacetKey { + field_id, + level: level - 1, + left_bound: right, + }), + Bound::Unbounded => Bound::Unbounded, + }; + let prev_right_bound = *right_bound; + *right_bound = Bound::Excluded(left_bound); + let iter = self + .db + .rev_range( + &self.rtxn, + &(Bound::Included(starting_key_below), end_key_kelow), + ) + .unwrap() + .take(group_size as usize); + + self.stack.push((bitmap, iter, prev_right_bound)); + continue 'outer; + } + *right_bound = Bound::Excluded(left_bound); + } + self.stack.pop(); + } + } +} + +#[cfg(test)] +mod tests { + use crate::{ + codec::{MyByteSlice, U16Codec}, + descending_facet_sort::descending_facet_sort, + display_bitmap, FacetKeyCodec, Index, + }; + use heed::BytesDecode; + use roaring::RoaringBitmap; + + fn get_simple_index() -> Index { + let index = Index::::new(4, 8); + let mut txn = index.env.write_txn().unwrap(); + for i in 0..256u16 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + index.insert(&mut txn, 0, &i, &bitmap); + } + txn.commit().unwrap(); + index + } + fn get_random_looking_index() -> Index { + let index = Index::::new(4, 8); + let mut txn = index.env.write_txn().unwrap(); + + let rng = fastrand::Rng::with_seed(0); + let keys = std::iter::from_fn(|| Some(rng.u32(..256))).take(128).collect::>(); + + for (_i, key) in keys.into_iter().enumerate() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(key); + bitmap.insert(key + 100); + index.insert(&mut txn, 0, &(key as u16), &bitmap); + } + txn.commit().unwrap(); + index + } + + #[test] + fn random_looking_index_snap() { + let index = get_random_looking_index(); + insta::assert_display_snapshot!(index) + } + #[test] + fn filter_sort_descending() { + let indexes = [get_simple_index(), get_random_looking_index()]; + for (i, index) in indexes.into_iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = (200..=300).into_iter().collect::(); + let mut results = String::new(); + let db = index.db.content.remap_key_type::>(); + let iter = descending_facet_sort(&txn, &db, 0, candidates); + for (facet, docids) in iter { + let facet = U16Codec::bytes_decode(facet).unwrap(); + results.push_str(&format!("{facet}: {}\n", display_bitmap(&docids))); + } + insta::assert_snapshot!(format!("filter_sort_{i}_descending"), results); + + txn.commit().unwrap(); + } + } +} diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs deleted file mode 100644 index b01359503..000000000 --- a/milli/src/search/facet/facet_string.rs +++ /dev/null @@ -1,649 +0,0 @@ -// //! This module contains helpers iterators for facet strings. -// //! -// //! The purpose is to help iterate over the quite complex system of facets strings. A simple -// //! description of the system would be that every facet string value is stored into an LMDB database -// //! and that every value is associated with the document ids which are associated with this facet -// //! string value. -// //! -// //! In reality it is a little bit more complex as we have to create aggregations of runs of facet -// //! string values, those aggregations helps in choosing the right groups of facets to follow. -// //! -// //! ## A typical algorithm run -// //! -// //! If a group of aggregated facets values contains one of the documents ids, we must continue -// //! iterating over the sub-groups. -// //! -// //! If this group is the lowest level and contain at least one document id we yield the associated -// //! facet documents ids. -// //! -// //! If the group doesn't contain one of our documents ids, we continue to the next group at this -// //! same level. -// //! -// //! ## The complexity comes from the strings -// //! -// //! This algorithm is exactly the one that we use for facet numbers. It is quite easy to create -// //! aggregated facet number, groups of facets are easy to define in the LMDB key, we just put the -// //! two numbers bounds, the left and the right bound of the group, both inclusive. -// //! -// //! It is easy to make sure that the groups are ordered, LMDB sort its keys lexicographically and -// //! puting two numbers big-endian encoded one after the other gives us ordered groups. The values -// //! are simple unions of the documents ids coming from the groups below. -// //! -// //! ### Example of what a facet number LMDB database contain -// //! -// //! | level | left-bound | right-bound | documents ids | -// //! |-------|------------|-------------|------------------| -// //! | 0 | 0 | _skipped_ | 1, 2 | -// //! | 0 | 1 | _skipped_ | 6, 7 | -// //! | 0 | 3 | _skipped_ | 4, 7 | -// //! | 0 | 5 | _skipped_ | 2, 3, 4 | -// //! | 1 | 0 | 1 | 1, 2, 6, 7 | -// //! | 1 | 3 | 5 | 2, 3, 4, 7 | -// //! | 2 | 0 | 5 | 1, 2, 3, 4, 6, 7 | -// //! -// //! As you can see the level 0 have two equal bounds, therefore we skip serializing the second -// //! bound, that's the base level where you can directly fetch the documents ids associated with an -// //! exact number. -// //! -// //! The next levels have two different bounds and the associated documents ids are simply the result -// //! of an union of all the documents ids associated with the aggregated groups above. -// //! -// //! ## The complexity of defining groups for facet strings -// //! -// //! As explained above, defining groups of facet numbers is easy, LMDB stores the keys in -// //! lexicographical order, it means that whatever the key represent the bytes are read in their raw -// //! form and a simple `strcmp` will define the order in which keys will be read from the store. -// //! -// //! That's easy for types with a known size, like floats or integers, they are 64 bytes long and -// //! appending one after the other in big-endian is consistent. LMDB will simply sort the keys by the -// //! first number then by the second if the the first number is equal on two keys. -// //! -// //! For strings it is a lot more complex as those types are unsized, it means that the size of facet -// //! strings is different for each facet value. -// //! -// //! ### Basic approach: padding the keys -// //! -// //! A first approach would be to simply define the maximum size of a facet string and pad the keys -// //! with zeroes. The big problem of this approach is that it: -// //! 1. reduces the maximum size of facet strings by half, as we need to put two keys one after the -// //! other. -// //! 2. makes the keys of facet strings very big (approximately 250 bytes), impacting a lot LMDB -// //! performances. -// //! -// //! ### Better approach: number the facet groups -// //! -// //! A better approach would be to number the groups, this way we don't have the downsides of the -// //! previously described approach but we need to be able to describe the groups by using a number. -// //! -// //! #### Example of facet strings with numbered groups -// //! -// //! | level | left-bound | right-bound | left-string | right-string | documents ids | -// //! |-------|------------|-------------|-------------|--------------|------------------| -// //! | 0 | alpha | _skipped_ | _skipped_ | _skipped_ | 1, 2 | -// //! | 0 | beta | _skipped_ | _skipped_ | _skipped_ | 6, 7 | -// //! | 0 | gamma | _skipped_ | _skipped_ | _skipped_ | 4, 7 | -// //! | 0 | omega | _skipped_ | _skipped_ | _skipped_ | 2, 3, 4 | -// //! | 1 | 0 | 1 | alpha | beta | 1, 2, 6, 7 | -// //! | 1 | 2 | 3 | gamma | omega | 2, 3, 4, 7 | -// //! | 2 | 0 | 3 | _skipped_ | _skipped_ | 1, 2, 3, 4, 6, 7 | -// //! -// //! As you can see the level 0 doesn't actually change much, we skip nearly everything, we do not -// //! need to store the facet string value two times. -// //! -// //! The number in the left-bound and right-bound columns are incremental numbers representing the -// //! level 0 strings, .i.e. alpha is 0, beta is 1. Those numbers are just here to keep the ordering -// //! of the LMDB keys. -// //! -// //! In the value, not in the key, you can see that we added two new values: the left-string and the -// //! right-string, which defines the original facet strings associated with the given group. -// //! -// //! We put those two strings inside of the value, this way we do not limit the maximum size of the -// //! facet string values, and the impact on performances is not important as, IIRC, LMDB put big -// //! values on another page, this helps in iterating over keys fast enough and only fetch the page -// //! with the values when required. -// //! -// //! The other little advantage with this solution is that there is no a big overhead, compared with -// //! the facet number levels, we only duplicate the facet strings once for the level 1. -// //! -// //! #### A typical algorithm run -// //! -// //! Note that the algorithm is always moving from the highest level to the lowest one, one level -// //! by one level, this is why it is ok to only store the facets string on the level 1. -// //! -// //! If a group of aggregated facets values, a group with numbers contains one of the documents ids, -// //! we must continue iterating over the sub-groups. To do so: -// //! - If we are at a level >= 2, we just do the same as with the facet numbers, get both bounds -// //! and iterate over the facet groups defined by these numbers over the current level - 1. -// //! - If we are at level 1, we retrieve both keys, the left-string and right-string, from the -// //! value and just do the same as with the facet numbers but with strings: iterate over the -// //! current level - 1 with both keys. -// //! -// //! If this group is the lowest level (level 0) and contain at least one document id we yield the -// //! associated facet documents ids. -// //! -// //! If the group doesn't contain one of our documents ids, we continue to the next group at this -// //! same level. -// //! - -// use std::num::NonZeroU8; -// use std::ops::Bound; -// use std::ops::Bound::{Excluded, Included, Unbounded}; - -// use either::{Either, Left, Right}; -// use heed::types::{ByteSlice, DecodeIgnore}; -// use heed::{Database, LazyDecode, RoRange, RoRevRange}; -// use roaring::RoaringBitmap; - -// use crate::heed_codec::facet::FacetStringZeroBoundsValueCodec; -// use crate::heed_codec::CboRoaringBitmapCodec; -// use crate::{FieldId, Index}; - -// /// An iterator that is used to explore the facets level strings -// /// from the level 1 to infinity. -// /// -// /// It yields the level, group id that an entry covers, the optional group strings -// /// that it covers of the level 0 only if it is an entry from the level 1 and -// /// the roaring bitmap associated. -// pub struct FacetStringGroupRange<'t> { -// iter: RoRange< -// 't, -// FacetLevelValueU32Codec, -// LazyDecode>, -// >, -// end: Bound, -// } - -// impl<'t> FacetStringGroupRange<'t> { -// pub fn new( -// rtxn: &'t heed::RoTxn, -// db: Database, -// field_id: FieldId, -// level: NonZeroU8, -// left: Bound, -// right: Bound, -// ) -> heed::Result> { -// let db = db.remap_types::< -// FacetLevelValueU32Codec, -// FacetStringZeroBoundsValueCodec, -// >(); -// let left_bound = match left { -// Included(left) => Included((field_id, level, left, u32::MIN)), -// Excluded(left) => Excluded((field_id, level, left, u32::MIN)), -// Unbounded => Included((field_id, level, u32::MIN, u32::MIN)), -// }; -// let right_bound = Included((field_id, level, u32::MAX, u32::MAX)); -// let iter = db.lazily_decode_data().range(rtxn, &(left_bound, right_bound))?; -// Ok(FacetStringGroupRange { iter, end: right }) -// } -// } - -// impl<'t> Iterator for FacetStringGroupRange<'t> { -// type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>; - -// fn next(&mut self) -> Option { -// match self.iter.next() { -// Some(Ok(((_fid, level, left, right), docids))) => { -// let must_be_returned = match self.end { -// Included(end) => right <= end, -// Excluded(end) => right < end, -// Unbounded => true, -// }; -// if must_be_returned { -// match docids.decode() { -// Ok((bounds, docids)) => Some(Ok(((level, left, right), (bounds, docids)))), -// Err(e) => Some(Err(e)), -// } -// } else { -// None -// } -// } -// Some(Err(e)) => Some(Err(e)), -// None => None, -// } -// } -// } - -// pub struct FacetStringGroupRevRange<'t> { -// iter: RoRevRange< -// 't, -// FacetLevelValueU32Codec, -// LazyDecode>, -// >, -// end: Bound, -// } - -// impl<'t> FacetStringGroupRevRange<'t> { -// pub fn new( -// rtxn: &'t heed::RoTxn, -// db: Database, -// field_id: FieldId, -// level: NonZeroU8, -// left: Bound, -// right: Bound, -// ) -> heed::Result> { -// let db = db.remap_types::< -// FacetLevelValueU32Codec, -// FacetStringZeroBoundsValueCodec, -// >(); -// let left_bound = match left { -// Included(left) => Included((field_id, level, left, u32::MIN)), -// Excluded(left) => Excluded((field_id, level, left, u32::MIN)), -// Unbounded => Included((field_id, level, u32::MIN, u32::MIN)), -// }; -// let right_bound = Included((field_id, level, u32::MAX, u32::MAX)); -// let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?; -// Ok(FacetStringGroupRevRange { iter, end: right }) -// } -// } - -// impl<'t> Iterator for FacetStringGroupRevRange<'t> { -// type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>; - -// fn next(&mut self) -> Option { -// loop { -// match self.iter.next() { -// Some(Ok(((_fid, level, left, right), docids))) => { -// let must_be_returned = match self.end { -// Included(end) => right <= end, -// Excluded(end) => right < end, -// Unbounded => true, -// }; -// if must_be_returned { -// match docids.decode() { -// Ok((bounds, docids)) => { -// return Some(Ok(((level, left, right), (bounds, docids)))) -// } -// Err(e) => return Some(Err(e)), -// } -// } -// continue; -// } -// Some(Err(e)) => return Some(Err(e)), -// None => return None, -// } -// } -// } -// } - -// /// An iterator that is used to explore the level 0 of the facets string database. -// /// -// /// It yields the facet string and the roaring bitmap associated with it. -// pub struct FacetStringLevelZeroRange<'t> { -// iter: RoRange<'t, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>, -// } - -// impl<'t> FacetStringLevelZeroRange<'t> { -// pub fn new( -// rtxn: &'t heed::RoTxn, -// db: Database, -// field_id: FieldId, -// left: Bound<&str>, -// right: Bound<&str>, -// ) -> heed::Result> { -// fn encode_value<'a>(buffer: &'a mut Vec, field_id: FieldId, value: &str) -> &'a [u8] { -// buffer.extend_from_slice(&field_id.to_be_bytes()); -// buffer.push(0); -// buffer.extend_from_slice(value.as_bytes()); -// &buffer[..] -// } - -// let mut left_buffer = Vec::new(); -// let left_bound = match left { -// Included(value) => Included(encode_value(&mut left_buffer, field_id, value)), -// Excluded(value) => Excluded(encode_value(&mut left_buffer, field_id, value)), -// Unbounded => { -// left_buffer.extend_from_slice(&field_id.to_be_bytes()); -// left_buffer.push(0); -// Included(&left_buffer[..]) -// } -// }; - -// let mut right_buffer = Vec::new(); -// let right_bound = match right { -// Included(value) => Included(encode_value(&mut right_buffer, field_id, value)), -// Excluded(value) => Excluded(encode_value(&mut right_buffer, field_id, value)), -// Unbounded => { -// right_buffer.extend_from_slice(&field_id.to_be_bytes()); -// right_buffer.push(1); // we must only get the level 0 -// Excluded(&right_buffer[..]) -// } -// }; - -// let iter = db -// .remap_key_type::() -// .range(rtxn, &(left_bound, right_bound))? -// .remap_types::(); - -// Ok(FacetStringLevelZeroRange { iter }) -// } -// } - -// impl<'t> Iterator for FacetStringLevelZeroRange<'t> { -// type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; - -// fn next(&mut self) -> Option { -// match self.iter.next() { -// Some(Ok(((_fid, normalized), (original, docids)))) => { -// Some(Ok((normalized, original, docids))) -// } -// Some(Err(e)) => Some(Err(e)), -// None => None, -// } -// } -// } - -// pub struct FacetStringLevelZeroRevRange<'t> { -// iter: RoRevRange<'t, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>, -// } - -// impl<'t> FacetStringLevelZeroRevRange<'t> { -// pub fn new( -// rtxn: &'t heed::RoTxn, -// db: Database, -// field_id: FieldId, -// left: Bound<&str>, -// right: Bound<&str>, -// ) -> heed::Result> { -// fn encode_value<'a>(buffer: &'a mut Vec, field_id: FieldId, value: &str) -> &'a [u8] { -// buffer.extend_from_slice(&field_id.to_be_bytes()); -// buffer.push(0); -// buffer.extend_from_slice(value.as_bytes()); -// &buffer[..] -// } - -// let mut left_buffer = Vec::new(); -// let left_bound = match left { -// Included(value) => Included(encode_value(&mut left_buffer, field_id, value)), -// Excluded(value) => Excluded(encode_value(&mut left_buffer, field_id, value)), -// Unbounded => { -// left_buffer.extend_from_slice(&field_id.to_be_bytes()); -// left_buffer.push(0); -// Included(&left_buffer[..]) -// } -// }; - -// let mut right_buffer = Vec::new(); -// let right_bound = match right { -// Included(value) => Included(encode_value(&mut right_buffer, field_id, value)), -// Excluded(value) => Excluded(encode_value(&mut right_buffer, field_id, value)), -// Unbounded => { -// right_buffer.extend_from_slice(&field_id.to_be_bytes()); -// right_buffer.push(1); // we must only get the level 0 -// Excluded(&right_buffer[..]) -// } -// }; - -// let iter = db -// .remap_key_type::() -// .rev_range(rtxn, &(left_bound, right_bound))? -// .remap_types::(); - -// Ok(FacetStringLevelZeroRevRange { iter }) -// } -// } - -// impl<'t> Iterator for FacetStringLevelZeroRevRange<'t> { -// type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; - -// fn next(&mut self) -> Option { -// match self.iter.next() { -// Some(Ok(((_fid, normalized), (original, docids)))) => { -// Some(Ok((normalized, original, docids))) -// } -// Some(Err(e)) => Some(Err(e)), -// None => None, -// } -// } -// } - -// type EitherStringRange<'t> = Either, FacetStringLevelZeroRange<'t>>; -// type EitherStringRevRange<'t> = -// Either, FacetStringLevelZeroRevRange<'t>>; - -// /// An iterator that is used to explore the facet strings level by level, -// /// it will only return facets strings that are associated with the -// /// candidates documents ids given. -// pub struct FacetStringIter<'t> { -// rtxn: &'t heed::RoTxn<'t>, -// db: Database, -// field_id: FieldId, -// level_iters: Vec<(RoaringBitmap, Either, EitherStringRevRange<'t>>)>, -// must_reduce: bool, -// } - -// impl<'t> FacetStringIter<'t> { -// pub fn new_reducing( -// rtxn: &'t heed::RoTxn, -// index: &'t Index, -// field_id: FieldId, -// documents_ids: RoaringBitmap, -// ) -> heed::Result> { -// let db = index.facet_id_string_docids.remap_types::(); -// let highest_iter = Self::highest_iter(rtxn, index, db, field_id)?; -// Ok(FacetStringIter { -// rtxn, -// db, -// field_id, -// level_iters: vec![(documents_ids, Left(highest_iter))], -// must_reduce: true, -// }) -// } - -// pub fn new_reverse_reducing( -// rtxn: &'t heed::RoTxn, -// index: &'t Index, -// field_id: FieldId, -// documents_ids: RoaringBitmap, -// ) -> heed::Result> { -// let db = index.facet_id_string_docids.remap_types::(); -// let highest_reverse_iter = Self::highest_reverse_iter(rtxn, index, db, field_id)?; -// Ok(FacetStringIter { -// rtxn, -// db, -// field_id, -// level_iters: vec![(documents_ids, Right(highest_reverse_iter))], -// must_reduce: true, -// }) -// } - -// pub fn new_non_reducing( -// rtxn: &'t heed::RoTxn, -// index: &'t Index, -// field_id: FieldId, -// documents_ids: RoaringBitmap, -// ) -> heed::Result> { -// let db = index.facet_id_string_docids.remap_types::(); -// let highest_iter = Self::highest_iter(rtxn, index, db, field_id)?; -// Ok(FacetStringIter { -// rtxn, -// db, -// field_id, -// level_iters: vec![(documents_ids, Left(highest_iter))], -// must_reduce: false, -// }) -// } - -// fn highest_level( -// rtxn: &'t heed::RoTxn, -// db: Database, -// fid: FieldId, -// ) -> heed::Result> { -// Ok(db -// .remap_types::() -// .prefix_iter(rtxn, &fid.to_be_bytes())? // the field id is the first two bits -// .last() -// .transpose()? -// .map(|(key_bytes, _)| key_bytes[2])) // the level is the third bit -// } - -// fn highest_iter( -// rtxn: &'t heed::RoTxn, -// index: &'t Index, -// db: Database, -// field_id: FieldId, -// ) -> heed::Result, FacetStringLevelZeroRange<'t>>> { -// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); -// match NonZeroU8::new(highest_level) { -// Some(highest_level) => FacetStringGroupRange::new( -// rtxn, -// index.facet_id_string_docids, -// field_id, -// highest_level, -// Unbounded, -// Unbounded, -// ) -// .map(Left), -// None => FacetStringLevelZeroRange::new( -// rtxn, -// index.facet_id_string_docids, -// field_id, -// Unbounded, -// Unbounded, -// ) -// .map(Right), -// } -// } - -// fn highest_reverse_iter( -// rtxn: &'t heed::RoTxn, -// index: &'t Index, -// db: Database, -// field_id: FieldId, -// ) -> heed::Result, FacetStringLevelZeroRevRange<'t>>> { -// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); -// match NonZeroU8::new(highest_level) { -// Some(highest_level) => FacetStringGroupRevRange::new( -// rtxn, -// index.facet_id_string_docids, -// field_id, -// highest_level, -// Unbounded, -// Unbounded, -// ) -// .map(Left), -// None => FacetStringLevelZeroRevRange::new( -// rtxn, -// index.facet_id_string_docids, -// field_id, -// Unbounded, -// Unbounded, -// ) -// .map(Right), -// } -// } -// } - -// impl<'t> Iterator for FacetStringIter<'t> { -// type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; - -// fn next(&mut self) -> Option { -// 'outer: loop { -// let (documents_ids, last) = self.level_iters.last_mut()?; -// let is_ascending = last.is_left(); - -// // We remap the different iterator types to make -// // the algorithm less complex to understand. -// let last = match last { -// Left(ascending) => match ascending { -// Left(group) => Left(Left(group)), -// Right(zero_level) => Right(Left(zero_level)), -// }, -// Right(descending) => match descending { -// Left(group) => Left(Right(group)), -// Right(zero_level) => Right(Right(zero_level)), -// }, -// }; - -// match last { -// Left(group) => { -// for result in group { -// match result { -// Ok(((level, left, right), (string_bounds, mut docids))) => { -// docids &= &*documents_ids; -// if !docids.is_empty() { -// if self.must_reduce { -// *documents_ids -= &docids; -// } - -// let result = if is_ascending { -// match string_bounds { -// Some((left, right)) => FacetStringLevelZeroRange::new( -// self.rtxn, -// self.db, -// self.field_id, -// Included(left), -// Included(right), -// ) -// .map(Right), -// None => FacetStringGroupRange::new( -// self.rtxn, -// self.db, -// self.field_id, -// NonZeroU8::new(level.get() - 1).unwrap(), -// Included(left), -// Included(right), -// ) -// .map(Left), -// } -// .map(Left) -// } else { -// match string_bounds { -// Some((left, right)) => { -// FacetStringLevelZeroRevRange::new( -// self.rtxn, -// self.db, -// self.field_id, -// Included(left), -// Included(right), -// ) -// .map(Right) -// } -// None => FacetStringGroupRevRange::new( -// self.rtxn, -// self.db, -// self.field_id, -// NonZeroU8::new(level.get() - 1).unwrap(), -// Included(left), -// Included(right), -// ) -// .map(Left), -// } -// .map(Right) -// }; - -// match result { -// Ok(iter) => { -// self.level_iters.push((docids, iter)); -// continue 'outer; -// } -// Err(e) => return Some(Err(e)), -// } -// } -// } -// Err(e) => return Some(Err(e)), -// } -// } -// } -// Right(zero_level) => { -// // level zero only -// for result in zero_level { -// match result { -// Ok((normalized, original, mut docids)) => { -// docids &= &*documents_ids; -// if !docids.is_empty() { -// if self.must_reduce { -// *documents_ids -= &docids; -// } -// return Some(Ok((normalized, original, docids))); -// } -// } -// Err(e) => return Some(Err(e)), -// } -// } -// } -// } - -// self.level_iters.pop(); -// } -// } -// } diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 13b00d2de..ceedff1e0 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -1,9 +1,79 @@ +use heed::types::ByteSlice; +use heed::{BytesDecode, RoTxn}; + +use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice}; + pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET}; // pub use self::facet_number::{FacetNumberIter, FacetNumberRange, FacetNumberRevRange}; // pub use self::facet_string::FacetStringIter; pub use self::filter::Filter; mod facet_distribution; -mod facet_number; -mod facet_string; +mod facet_distribution_iter; +mod facet_sort_ascending; +mod facet_sort_descending; mod filter; + +fn get_first_facet_value<'t, BoundCodec>( + txn: &'t RoTxn, + db: &'t heed::Database, FacetGroupValueCodec>, + field_id: u16, +) -> Option +where + BoundCodec: BytesDecode<'t>, +{ + let mut level0prefix = vec![]; + level0prefix.extend_from_slice(&field_id.to_be_bytes()); + level0prefix.push(0); + let mut level0_iter_forward = db + .as_polymorph() + .prefix_iter::<_, ByteSlice, ByteSlice>(txn, level0prefix.as_slice()) + .unwrap(); + if let Some(first) = level0_iter_forward.next() { + let (first_key, _) = first.unwrap(); + let first_key = FacetKeyCodec::::bytes_decode(first_key).unwrap(); + Some(first_key.left_bound) + } else { + None + } +} +fn get_last_facet_value<'t, BoundCodec>( + txn: &'t RoTxn, + db: &'t heed::Database, FacetGroupValueCodec>, + field_id: u16, +) -> Option +where + BoundCodec: BytesDecode<'t>, +{ + let mut level0prefix = vec![]; + level0prefix.extend_from_slice(&field_id.to_be_bytes()); + level0prefix.push(0); + let mut level0_iter_backward = db + .as_polymorph() + .rev_prefix_iter::<_, ByteSlice, ByteSlice>(txn, level0prefix.as_slice()) + .unwrap(); + if let Some(last) = level0_iter_backward.next() { + let (last_key, _) = last.unwrap(); + let last_key = FacetKeyCodec::::bytes_decode(last_key).unwrap(); + Some(last_key.left_bound) + } else { + None + } +} +fn get_highest_level<'t>( + txn: &'t RoTxn<'t>, + db: &'t heed::Database, FacetGroupValueCodec>, + field_id: u16, +) -> u8 { + let field_id_prefix = &field_id.to_be_bytes(); + db.as_polymorph() + .rev_prefix_iter::<_, ByteSlice, ByteSlice>(&txn, field_id_prefix) + .unwrap() + .next() + .map(|el| { + let (key, _) = el.unwrap(); + let key = FacetKeyCodec::::bytes_decode(key).unwrap(); + key.level + }) + .unwrap_or(0) +} diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index aaaa445da..fe8c2855e 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -64,7 +64,7 @@ impl<'i> Facets<'i> { } #[logging_timer::time("Facets::{}")] - pub fn execute(mut self, wtxn: &mut heed::RwTxn) -> Result<()> { + pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; // We get the faceted fields to be able to create the facet levels. let faceted_fields = self.index.faceted_fields_ids(wtxn)?.clone(); @@ -172,14 +172,14 @@ impl<'t> CreateFacetsAlgo<'t> { bitmaps.push(docids); if bitmaps.len() == self.level_group_size { - handle_group(&bitmaps, left_bound); + handle_group(&bitmaps, left_bound)?; first_iteration_for_new_group = true; bitmaps.clear(); } } // don't forget to give the leftover bitmaps as well if !bitmaps.is_empty() { - handle_group(&bitmaps, left_bound); + handle_group(&bitmaps, left_bound)?; bitmaps.clear(); } Ok(()) @@ -197,7 +197,7 @@ impl<'t> CreateFacetsAlgo<'t> { handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>, ) -> Result>> { if level == 0 { - self.read_level_0(handle_group); + self.read_level_0(handle_group)?; // Level 0 is already in the database return Ok(vec![]); }