diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index 2eebffbcd..3379d1abe 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -114,14 +114,13 @@ where mod tests { use std::ops::ControlFlow; - use heed::BytesDecode; - use rand::{Rng, SeedableRng}; - use roaring::RoaringBitmap; - use super::iterate_over_facet_distribution; use crate::heed_codec::facet::OrderedF64Codec; use crate::milli_snap; - use crate::search::facet::test::FacetIndex; + use crate::update::facet::tests::FacetIndex; + use heed::BytesDecode; + use rand::{Rng, SeedableRng}; + use roaring::RoaringBitmap; fn get_simple_index() -> FacetIndex { let index = FacetIndex::::new(4, 8, 5); @@ -164,17 +163,11 @@ mod tests { let txn = index.env.read_txn().unwrap(); let candidates = (0..=255).into_iter().collect::(); let mut results = String::new(); - iterate_over_facet_distribution( - &txn, - index.db.content, - 0, - &candidates, - |facet, count| { - let facet = OrderedF64Codec::bytes_decode(facet).unwrap(); - results.push_str(&format!("{facet}: {count}\n")); - ControlFlow::Continue(()) - }, - ) + iterate_over_facet_distribution(&txn, index.content, 0, &candidates, |facet, count| { + let facet = OrderedF64Codec::bytes_decode(facet).unwrap(); + results.push_str(&format!("{facet}: {count}\n")); + ControlFlow::Continue(()) + }) .unwrap(); milli_snap!(results, i); @@ -189,23 +182,17 @@ mod tests { let candidates = (0..=255).into_iter().collect::(); let mut results = String::new(); let mut nbr_facets = 0; - iterate_over_facet_distribution( - &txn, - index.db.content, - 0, - &candidates, - |facet, count| { - let facet = OrderedF64Codec::bytes_decode(facet).unwrap(); - if nbr_facets == 100 { - return ControlFlow::Break(()); - } else { - nbr_facets += 1; - results.push_str(&format!("{facet}: {count}\n")); + iterate_over_facet_distribution(&txn, index.content, 0, &candidates, |facet, count| { + let facet = OrderedF64Codec::bytes_decode(facet).unwrap(); + if nbr_facets == 100 { + return ControlFlow::Break(()); + } else { + nbr_facets += 1; + results.push_str(&format!("{facet}: {count}\n")); - ControlFlow::Continue(()) - } - }, - ) + ControlFlow::Continue(()) + } + }) .unwrap(); milli_snap!(results, i); diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index bb555e1ab..cb5fd14d2 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -263,8 +263,8 @@ mod tests { use super::find_docids_of_facet_within_bounds; use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec}; use crate::milli_snap; - use crate::search::facet::test::FacetIndex; use crate::snapshot_tests::display_bitmap; + use crate::update::facet::tests::FacetIndex; fn get_simple_index() -> FacetIndex { let index = FacetIndex::::new(4, 8, 5); @@ -312,7 +312,7 @@ mod tests { let end = Bound::Included(i); let docids = find_docids_of_facet_within_bounds::( &txn, - index.db.content.remap_key_type::>(), + index.content.remap_key_type::>(), 0, &start, &end, @@ -328,7 +328,7 @@ mod tests { let end = Bound::Excluded(i); let docids = find_docids_of_facet_within_bounds::( &txn, - index.db.content.remap_key_type::>(), + index.content.remap_key_type::>(), 0, &start, &end, @@ -354,7 +354,7 @@ mod tests { let end = Bound::Included(255.); let docids = find_docids_of_facet_within_bounds::( &txn, - index.db.content.remap_key_type::>(), + index.content.remap_key_type::>(), 0, &start, &end, @@ -373,7 +373,7 @@ mod tests { let end = Bound::Excluded(255.); let docids = find_docids_of_facet_within_bounds::( &txn, - index.db.content.remap_key_type::>(), + index.content.remap_key_type::>(), 0, &start, &end, @@ -401,7 +401,7 @@ mod tests { let end = Bound::Included(255. - i); let docids = find_docids_of_facet_within_bounds::( &txn, - index.db.content.remap_key_type::>(), + index.content.remap_key_type::>(), 0, &start, &end, @@ -420,7 +420,7 @@ mod tests { let end = Bound::Excluded(255. - i); let docids = find_docids_of_facet_within_bounds::( &txn, - index.db.content.remap_key_type::>(), + index.content.remap_key_type::>(), 0, &start, &end, diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index fc5fd3d04..f320f9e77 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -89,8 +89,8 @@ mod tests { use crate::heed_codec::facet::OrderedF64Codec; use crate::milli_snap; use crate::search::facet::facet_sort_ascending::ascending_facet_sort; - use crate::search::facet::test::FacetIndex; use crate::snapshot_tests::display_bitmap; + use crate::update::facet::tests::FacetIndex; fn get_simple_index() -> FacetIndex { let index = FacetIndex::::new(4, 8, 5); @@ -133,7 +133,7 @@ mod tests { let txn = index.env.read_txn().unwrap(); let candidates = (200..=300).into_iter().collect::(); let mut results = String::new(); - let iter = ascending_facet_sort(&txn, index.db.content, 0, candidates).unwrap(); + let iter = ascending_facet_sort(&txn, index.content, 0, candidates).unwrap(); for el in iter { let docids = el.unwrap(); results.push_str(&display_bitmap(&docids)); diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs index 42bae42a6..be5fe7841 100644 --- a/milli/src/search/facet/facet_sort_descending.rs +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -122,8 +122,8 @@ mod tests { use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, OrderedF64Codec}; use crate::milli_snap; use crate::search::facet::facet_sort_descending::descending_facet_sort; - use crate::search::facet::test::FacetIndex; use crate::snapshot_tests::display_bitmap; + use crate::update::facet::tests::FacetIndex; fn get_simple_index() -> FacetIndex { let index = FacetIndex::::new(4, 8, 5); @@ -166,7 +166,7 @@ mod tests { let txn = index.env.read_txn().unwrap(); let candidates = (200..=300).into_iter().collect::(); let mut results = String::new(); - let db = index.db.content.remap_key_type::>(); + let db = index.content.remap_key_type::>(); let iter = descending_facet_sort(&txn, db, 0, candidates).unwrap(); for el in iter { let docids = el.unwrap(); diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index ef72658ec..fc71acf37 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -74,149 +74,3 @@ pub(crate) fn get_highest_level<'t>( }) .unwrap_or(0)) } - -#[cfg(test)] -pub mod test { - use std::fmt::Display; - use std::marker::PhantomData; - use std::rc::Rc; - - use heed::{BytesDecode, BytesEncode, Env, RwTxn}; - use roaring::RoaringBitmap; - - use crate::heed_codec::facet::{ - ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, - }; - use crate::snapshot_tests::display_bitmap; - use crate::update::FacetsUpdateIncrementalInner; - - // A dummy index that only contains the facet database, used for testing - pub struct FacetIndex - where - for<'a> BoundCodec: - BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, - { - pub env: Env, - pub db: Database, - _phantom: PhantomData, - } - - // The faecet database and its settings - pub struct Database { - pub content: heed::Database, FacetGroupValueCodec>, - pub group_size: u8, - pub min_level_size: u8, - pub max_group_size: u8, - _tempdir: Rc, - } - - impl FacetIndex - where - for<'a> BoundCodec: - BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, - { - #[cfg(all(test, fuzzing))] - pub fn open_from_tempdir( - tempdir: Rc, - group_size: u8, - max_group_size: u8, - min_level_size: u8, - ) -> FacetIndex { - let group_size = std::cmp::min(127, std::cmp::max(group_size, 2)); // 2 <= x <= 127 - let max_group_size = std::cmp::min(127, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 127 - let min_level_size = std::cmp::max(1, min_level_size); // 1 <= x <= inf - - let mut options = heed::EnvOpenOptions::new(); - let options = options.map_size(4096 * 4 * 10 * 100); - unsafe { - options.flag(heed::flags::Flags::MdbAlwaysFreePages); - } - let env = options.open(tempdir.path()).unwrap(); - let content = env.open_database(None).unwrap().unwrap(); - - FacetIndex { - db: Database { - content, - group_size, - max_group_size, - min_level_size, - _tempdir: tempdir, - }, - env, - _phantom: PhantomData, - } - } - pub fn new( - group_size: u8, - max_group_size: u8, - min_level_size: u8, - ) -> FacetIndex { - let group_size = std::cmp::min(127, std::cmp::max(group_size, 2)); // 2 <= x <= 127 - let max_group_size = std::cmp::min(127, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 127 - let min_level_size = std::cmp::max(1, min_level_size); // 1 <= x <= inf - let mut options = heed::EnvOpenOptions::new(); - let options = options.map_size(4096 * 4 * 100); - let tempdir = tempfile::TempDir::new().unwrap(); - let env = options.open(tempdir.path()).unwrap(); - let content = env.create_database(None).unwrap(); - - FacetIndex { - db: Database { - content, - group_size, - max_group_size, - min_level_size, - _tempdir: Rc::new(tempdir), - }, - env, - _phantom: PhantomData, - } - } - pub fn insert<'a>( - &self, - rwtxn: &'a mut RwTxn, - field_id: u16, - key: &'a >::EItem, - docids: &RoaringBitmap, - ) { - let update = FacetsUpdateIncrementalInner::new(self.db.content); - let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); - update.insert(rwtxn, field_id, &key_bytes, docids).unwrap(); - } - pub fn delete<'a>( - &self, - rwtxn: &'a mut RwTxn, - field_id: u16, - key: &'a >::EItem, - value: u32, - ) { - let update = FacetsUpdateIncrementalInner::new(self.db.content); - let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); - update.delete(rwtxn, field_id, &key_bytes, value).unwrap(); - } - } - - impl Display for FacetIndex - where - for<'a> >::EItem: Sized + Display, - for<'a> BoundCodec: - BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, - { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - let txn = self.env.read_txn().unwrap(); - let mut iter = self.db.content.iter(&txn).unwrap(); - while let Some(el) = iter.next() { - let (key, value) = el.unwrap(); - let FacetGroupKey { field_id, level, left_bound: bound } = key; - let bound = BoundCodec::bytes_decode(bound).unwrap(); - let FacetGroupValue { size, bitmap } = value; - writeln!( - f, - "{field_id:<2} {level:<2} k{bound:<8} {size:<4} {values:?}", - values = display_bitmap(&bitmap) - )?; - } - Ok(()) - } - } -} diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index ad97ed2de..321ae52d4 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -19,7 +19,7 @@ use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; pub struct FacetsUpdateBulk<'i> { index: &'i Index, database: heed::Database, FacetGroupValueCodec>, - level_group_size: u8, + group_size: u8, min_level_size: u8, facet_type: FacetType, // None if level 0 does not need to be updated @@ -42,7 +42,7 @@ impl<'i> FacetsUpdateBulk<'i> { index.facet_id_f64_docids.remap_key_type::>() } }, - level_group_size: 4, + group_size: 4, min_level_size: 5, facet_type, new_data: Some(new_data), @@ -63,7 +63,7 @@ impl<'i> FacetsUpdateBulk<'i> { index.facet_id_f64_docids.remap_key_type::>() } }, - level_group_size: 4, + group_size: 4, min_level_size: 5, facet_type, new_data: None, @@ -74,61 +74,85 @@ impl<'i> FacetsUpdateBulk<'i> { /// /// This setting is always greater than or equal to 2. pub fn level_group_size(mut self, value: u8) -> Self { - self.level_group_size = cmp::max(value, 2); + self.group_size = cmp::max(value, 2); self } /// The minimum number of elements that a level is allowed to have. pub fn min_level_size(mut self, value: u8) -> Self { - self.min_level_size = cmp::max(value, 1); + self.min_level_size = cmp::max(value, 2); self } + #[logging_timer::time("FacetsUpdateBulk::{}")] + pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { + debug!("Computing and writing the facet values levels docids into LMDB on disk..."); + + let Self { index, database, group_size, min_level_size, facet_type, new_data } = self; + + index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; + + let inner = FacetsUpdateBulkInner { db: database, new_data, group_size, min_level_size }; + + let field_ids = index.faceted_fields_ids(wtxn)?.iter().copied().collect::>(); + + inner.update(wtxn, &field_ids, |wtxn, field_id, all_docids| { + index.put_faceted_documents_ids(wtxn, field_id, facet_type, &all_docids)?; + Ok(()) + })?; + + Ok(()) + } +} + +pub(crate) struct FacetsUpdateBulkInner { + pub db: heed::Database, FacetGroupValueCodec>, + pub new_data: Option>, + pub group_size: u8, + pub min_level_size: u8, +} +impl FacetsUpdateBulkInner { + pub fn update( + mut self, + wtxn: &mut RwTxn, + field_ids: &[u16], + mut handle_all_docids: impl FnMut(&mut RwTxn, FieldId, RoaringBitmap) -> Result<()>, + ) -> Result<()> { + self.update_level0(wtxn)?; + for &field_id in field_ids.iter() { + self.clear_levels(wtxn, field_id)?; + } + + for &field_id in field_ids.iter() { + let (level_readers, all_docids) = self.compute_levels_for_field_id(field_id, &wtxn)?; + + handle_all_docids(wtxn, field_id, all_docids)?; + + for level_reader in level_readers { + let mut cursor = level_reader.into_cursor()?; + while let Some((k, v)) = cursor.move_on_next()? { + self.db.remap_types::().put(wtxn, k, v)?; + } + } + } + Ok(()) + } + fn clear_levels(&self, wtxn: &mut heed::RwTxn, field_id: FieldId) -> Result<()> { let left = FacetGroupKey::<&[u8]> { field_id, level: 1, left_bound: &[] }; let right = FacetGroupKey::<&[u8]> { field_id, level: u8::MAX, left_bound: &[] }; let range = left..=right; - self.database.delete_range(wtxn, &range).map(drop)?; + self.db.delete_range(wtxn, &range).map(drop)?; Ok(()) } - - #[logging_timer::time("FacetsUpdateBulk::{}")] - pub fn execute(mut self, wtxn: &mut heed::RwTxn) -> Result<()> { - self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; - debug!("Computing and writing the facet values levels docids into LMDB on disk..."); - - // We get the faceted fields to be able to create the facet levels. - let faceted_fields = self.index.faceted_fields_ids(wtxn)?.clone(); - - for &field_id in faceted_fields.iter() { - self.clear_levels(wtxn, field_id)?; - } - self.update_level0(wtxn)?; - - for &field_id in faceted_fields.iter() { - let (level_readers, all_docids) = self.compute_levels_for_field_id(field_id, &wtxn)?; - - self.index.put_faceted_documents_ids(wtxn, field_id, self.facet_type, &all_docids)?; - - for level_reader in level_readers { - let mut cursor = level_reader.into_cursor()?; - while let Some((k, v)) = cursor.move_on_next()? { - self.database.remap_types::().put(wtxn, k, v)?; - } - } - } - - Ok(()) - } - fn update_level0(&mut self, wtxn: &mut RwTxn) -> Result<()> { let new_data = match self.new_data.take() { Some(x) => x, None => return Ok(()), }; - if self.database.is_empty(wtxn)? { + if self.db.is_empty(wtxn)? { let mut buffer = Vec::new(); - let mut database = self.database.iter_mut(wtxn)?.remap_types::(); + let mut database = self.db.iter_mut(wtxn)?.remap_types::(); let mut cursor = new_data.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { buffer.clear(); @@ -140,7 +164,7 @@ impl<'i> FacetsUpdateBulk<'i> { } } else { let mut buffer = Vec::new(); - let database = self.database.remap_types::(); + let database = self.db.remap_types::(); let mut cursor = new_data.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { @@ -164,47 +188,29 @@ impl<'i> FacetsUpdateBulk<'i> { database.put(wtxn, key, &buffer)?; } } - Ok(()) } - fn compute_levels_for_field_id( &self, field_id: FieldId, txn: &RoTxn, ) -> Result<(Vec>, RoaringBitmap)> { // TODO: first check whether there is anything in level 0? - let algo = ComputeHigherLevels { - rtxn: txn, - db: &self.database, - field_id, - level_group_size: self.level_group_size, - min_level_size: self.min_level_size, - }; let mut all_docids = RoaringBitmap::new(); - let subwriters = algo.compute_higher_levels(32, &mut |bitmaps, _| { + let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |bitmaps, _| { for bitmap in bitmaps { all_docids |= bitmap; } Ok(()) })?; - drop(algo); Ok((subwriters, all_docids)) } -} - -struct ComputeHigherLevels<'t> { - rtxn: &'t heed::RoTxn<'t>, - db: &'t heed::Database, FacetGroupValueCodec>, - field_id: u16, - level_group_size: u8, - min_level_size: u8, -} -impl<'t> ComputeHigherLevels<'t> { - fn read_level_0( + fn read_level_0<'t>( &self, + rtxn: &'t RoTxn, + field_id: u16, handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>, ) -> Result<()> { // we read the elements one by one and @@ -213,13 +219,13 @@ impl<'t> ComputeHigherLevels<'t> { let mut bitmaps = vec![]; let mut level_0_prefix = vec![]; - level_0_prefix.extend_from_slice(&self.field_id.to_be_bytes()); + level_0_prefix.extend_from_slice(&field_id.to_be_bytes()); level_0_prefix.push(0); let level_0_iter = self .db .as_polymorph() - .prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, level_0_prefix.as_slice())? + .prefix_iter::<_, ByteSlice, ByteSlice>(rtxn, level_0_prefix.as_slice())? .remap_types::, FacetGroupValueCodec>(); let mut left_bound: &[u8] = &[]; @@ -235,7 +241,7 @@ impl<'t> ComputeHigherLevels<'t> { } bitmaps.push(docids); - if bitmaps.len() == self.level_group_size as usize { + if bitmaps.len() == self.group_size as usize { handle_group(&bitmaps, left_bound)?; first_iteration_for_new_group = true; bitmaps.clear(); @@ -254,13 +260,15 @@ impl<'t> ComputeHigherLevels<'t> { /// ## Returns: /// A vector of grenad::Reader. The reader at index `i` corresponds to the elements of level `i + 1` /// that must be inserted into the database. - fn compute_higher_levels( + fn compute_higher_levels<'t>( &self, + rtxn: &'t RoTxn, + field_id: u16, level: u8, handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>, ) -> Result>> { if level == 0 { - self.read_level_0(handle_group)?; + self.read_level_0(rtxn, field_id, handle_group)?; // Level 0 is already in the database return Ok(vec![]); } @@ -270,7 +278,7 @@ impl<'t> ComputeHigherLevels<'t> { // of those elements, and their bitmaps, to the level above let mut cur_writer = create_writer(CompressionType::None, None, tempfile::tempfile()?); - let mut cur_writer_len = 0; + let mut cur_writer_len: usize = 0; let mut group_sizes = vec![]; let mut left_bounds = vec![]; @@ -278,8 +286,13 @@ impl<'t> ComputeHigherLevels<'t> { // compute the levels below // in the callback, we fill `cur_writer` with the correct elements for this level - let mut sub_writers = - self.compute_higher_levels(level - 1, &mut |sub_bitmaps, left_bound| { + let mut sub_writers = self.compute_higher_levels( + rtxn, + field_id, + level - 1, + &mut |sub_bitmaps, left_bound| { + // TODO: is this done unnecessarily for all 32 levels? + println!("level: {level}"); let mut combined_bitmap = RoaringBitmap::default(); for bitmap in sub_bitmaps { combined_bitmap |= bitmap; @@ -288,7 +301,7 @@ impl<'t> ComputeHigherLevels<'t> { left_bounds.push(left_bound); bitmaps.push(combined_bitmap); - if bitmaps.len() != self.level_group_size as usize { + if bitmaps.len() != self.group_size as usize { return Ok(()); } let left_bound = left_bounds.first().unwrap(); @@ -297,7 +310,7 @@ impl<'t> ComputeHigherLevels<'t> { for ((bitmap, left_bound), group_size) in bitmaps.drain(..).zip(left_bounds.drain(..)).zip(group_sizes.drain(..)) { - let key = FacetGroupKey { field_id: self.field_id, level, left_bound }; + let key = FacetGroupKey { field_id, level, left_bound }; let key = FacetGroupKeyCodec::::bytes_encode(&key) .ok_or(Error::Encoding)?; let value = FacetGroupValue { size: group_size, bitmap }; @@ -307,15 +320,26 @@ impl<'t> ComputeHigherLevels<'t> { cur_writer_len += 1; } Ok(()) - })?; + }, + )?; // don't forget to insert the leftover elements into the writer as well - if !bitmaps.is_empty() && cur_writer_len >= self.min_level_size { + + // but only do so if the current number of elements to be inserted into this + // levelcould grow to the minimum level size + + if !bitmaps.is_empty() && (cur_writer_len >= self.min_level_size as usize - 1) { + // the length of bitmaps is between 0 and group_size + assert!(bitmaps.len() < self.group_size as usize); + assert!(cur_writer_len > 0); + let left_bound = left_bounds.first().unwrap(); handle_group(&bitmaps, left_bound)?; + + // Note: how many bitmaps are there here? for ((bitmap, left_bound), group_size) in bitmaps.drain(..).zip(left_bounds.drain(..)).zip(group_sizes.drain(..)) { - let key = FacetGroupKey { field_id: self.field_id, level, left_bound }; + let key = FacetGroupKey { field_id, level, left_bound }; let key = FacetGroupKeyCodec::::bytes_encode(&key) .ok_or(Error::Encoding)?; let value = FacetGroupValue { size: group_size, bitmap }; @@ -324,9 +348,12 @@ impl<'t> ComputeHigherLevels<'t> { cur_writer_len += 1; } } - if cur_writer_len > self.min_level_size { + // if we inserted enough elements to reach the minimum level size, then we push the writer + if cur_writer_len as u8 >= self.min_level_size { sub_writers.push(writer_into_reader(cur_writer)?); } else { + // otherwise, if there are still leftover elements, we give them to the level above + // this is necessary in order to get the union of all docids if !bitmaps.is_empty() { handle_group(&bitmaps, left_bounds.first().unwrap())?; } @@ -337,184 +364,90 @@ impl<'t> ComputeHigherLevels<'t> { #[cfg(test)] mod tests { - use std::num::NonZeroUsize; - - use crate::db_snap; - use crate::documents::documents_batch_reader_from_objects; - use crate::index::tests::TempIndex; + use crate::heed_codec::facet::OrderedF64Codec; + use crate::milli_snap; + use crate::update::facet::tests::FacetIndex; + use roaring::RoaringBitmap; + use std::iter::once; #[test] - fn test_facets_number() { - let test = - |name: &str, group_size: Option, min_level_size: Option| { - let mut index = TempIndex::new_with_map_size(4096 * 1000 * 10); // 40MB - index.index_documents_config.autogenerate_docids = true; - index.index_documents_config.facet_level_group_size = group_size; - index.index_documents_config.facet_min_level_size = min_level_size; + fn insert() { + let test = |name: &str, group_size: u8, min_level_size: u8| { + let index = + FacetIndex::::new(group_size, 0 /*NA*/, min_level_size); - index - .update_settings(|settings| { - settings.set_filterable_fields( - IntoIterator::into_iter(["facet".to_owned(), "facet2".to_owned()]) - .collect(), - ); - }) - .unwrap(); - - let mut documents = vec![]; - for i in 0..1_000 { - documents.push(serde_json::json!({ "facet": i }).as_object().unwrap().clone()); - } - for i in 0..100 { - documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone()); - } - let documents = documents_batch_reader_from_objects(documents); - index.add_documents(documents).unwrap(); - db_snap!(index, facet_id_f64_docids, name); - }; - - test("default", None, None); - test("tiny_groups_tiny_levels", NonZeroUsize::new(1), NonZeroUsize::new(1)); - test("small_groups_small_levels", NonZeroUsize::new(2), NonZeroUsize::new(2)); - test("small_groups_large_levels", NonZeroUsize::new(2), NonZeroUsize::new(128)); - test("large_groups_small_levels", NonZeroUsize::new(16), NonZeroUsize::new(2)); - test("large_groups_large_levels", NonZeroUsize::new(16), NonZeroUsize::new(256)); - } - - #[test] - fn test_facets_string() { - let test = |name: &str, - group_size: Option, - min_level_size: Option| { - let mut index = TempIndex::new_with_map_size(4096 * 1000 * 10); // 40MB - index.index_documents_config.autogenerate_docids = true; - index.index_documents_config.facet_level_group_size = group_size; - index.index_documents_config.facet_min_level_size = min_level_size; - - index - .update_settings(|settings| { - settings.set_filterable_fields( - IntoIterator::into_iter(["facet".to_owned(), "facet2".to_owned()]) - .collect(), - ); - }) - .unwrap(); - - let mut documents = vec![]; - for i in 0..100 { - documents.push( - serde_json::json!({ "facet": format!("s{i:X}") }).as_object().unwrap().clone(), - ); + let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new(); + for i in 0..1_000u32 { + // field id = 0, left_bound = i, docids = [i] + elements.push(((0, i as f64), once(i).collect())); } - for i in 0..10 { - documents.push( - serde_json::json!({ "facet2": format!("s{i:X}") }).as_object().unwrap().clone(), - ); + for i in 0..100u32 { + // field id = 1, left_bound = i, docids = [i] + elements.push(((1, i as f64), once(i).collect())); } - let documents = documents_batch_reader_from_objects(documents); + let mut wtxn = index.env.write_txn().unwrap(); + index.bulk_insert(&mut wtxn, &[0, 1], elements.iter()); - index.add_documents(documents).unwrap(); + index.verify_structure_validity(&wtxn, 0); + index.verify_structure_validity(&wtxn, 1); - db_snap!(index, facet_id_string_docids, name); + wtxn.commit().unwrap(); + + milli_snap!(format!("{index}"), name); }; - test("default", None, None); - test("tiny_groups_tiny_levels", NonZeroUsize::new(1), NonZeroUsize::new(1)); + test("default", 4, 5); + test("small_group_small_min_level", 2, 2); + test("small_group_large_min_level", 2, 128); + test("large_group_small_min_level", 16, 2); + test("odd_group_odd_min_level", 7, 3); } - #[test] - fn test_facets_number_incremental_update() { - let test = - |name: &str, group_size: Option, min_level_size: Option| { - let mut index = TempIndex::new_with_map_size(4096 * 1000 * 10); // 40MB - index.index_documents_config.autogenerate_docids = true; - index.index_documents_config.facet_level_group_size = group_size; - index.index_documents_config.facet_min_level_size = min_level_size; + fn insert_delete_field_insert() { + let test = |name: &str, group_size: u8, min_level_size: u8| { + let index = + FacetIndex::::new(group_size, 0 /*NA*/, min_level_size); + let mut wtxn = index.env.write_txn().unwrap(); - index - .update_settings(|settings| { - settings.set_filterable_fields( - IntoIterator::into_iter(["facet".to_owned(), "facet2".to_owned()]) - .collect(), - ); - }) - .unwrap(); + let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new(); + for i in 0..100u32 { + // field id = 0, left_bound = i, docids = [i] + elements.push(((0, i as f64), once(i).collect())); + } + for i in 0..100u32 { + // field id = 1, left_bound = i, docids = [i] + elements.push(((1, i as f64), once(i).collect())); + } + index.bulk_insert(&mut wtxn, &[0, 1], elements.iter()); - let mut documents = vec![]; - for i in 0..1000 { - documents.push(serde_json::json!({ "facet": i }).as_object().unwrap().clone()); - } - for i in 0..100 { - documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone()); - } - let documents_batch = documents_batch_reader_from_objects(documents.clone()); + index.verify_structure_validity(&wtxn, 0); + index.verify_structure_validity(&wtxn, 1); + // delete all the elements for the facet id 0 + for i in 0..100u32 { + index.delete(&mut wtxn, 0, &(i as f64), i); + } + index.verify_structure_validity(&wtxn, 0); + index.verify_structure_validity(&wtxn, 1); - index.add_documents(documents_batch).unwrap(); + let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new(); + // then add some elements again for the facet id 1 + for i in 0..110u32 { + // field id = 1, left_bound = i, docids = [i] + elements.push(((1, i as f64), once(i).collect())); + } + index.verify_structure_validity(&wtxn, 0); + index.verify_structure_validity(&wtxn, 1); + index.bulk_insert(&mut wtxn, &[0, 1], elements.iter()); - let mut documents = vec![]; - for i in 1000..1010 { - documents.push(serde_json::json!({ "facet": i }).as_object().unwrap().clone()); - } - for i in 100..110 { - documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone()); - } - let documents_batch = documents_batch_reader_from_objects(documents.clone()); + wtxn.commit().unwrap(); - index.add_documents(documents_batch).unwrap(); + milli_snap!(format!("{index}"), name); + }; - db_snap!(index, facet_id_f64_docids, name); - }; - - test("default", None, None); - test("tiny_groups_tiny_levels", NonZeroUsize::new(1), NonZeroUsize::new(1)); - } - - #[test] - fn test_facets_number_delete_facet_id_then_bulk_update() { - let test = - |name: &str, group_size: Option, min_level_size: Option| { - let mut index = TempIndex::new_with_map_size(4096 * 1000 * 10); // 40MB - index.index_documents_config.autogenerate_docids = true; - index.index_documents_config.facet_level_group_size = group_size; - index.index_documents_config.facet_min_level_size = min_level_size; - - index - .update_settings(|settings| { - settings.set_filterable_fields( - IntoIterator::into_iter(["facet".to_owned(), "facet2".to_owned()]) - .collect(), - ); - }) - .unwrap(); - - let mut documents = vec![]; - for i in 0..1000 { - documents.push(serde_json::json!({ "facet": i }).as_object().unwrap().clone()); - } - for i in 0..100 { - documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone()); - } - let documents_batch = documents_batch_reader_from_objects(documents.clone()); - - index.add_documents(documents_batch).unwrap(); - - // 1100 facets -> how long is the DB? - - let mut documents = vec![]; - for i in 1000..1010 { - documents.push(serde_json::json!({ "facet": i }).as_object().unwrap().clone()); - } - for i in 100..110 { - documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone()); - } - let documents_batch = documents_batch_reader_from_objects(documents.clone()); - - index.add_documents(documents_batch).unwrap(); - - db_snap!(index, facet_id_f64_docids, name); - }; - - test("default", None, None); - test("tiny_groups_tiny_levels", NonZeroUsize::new(1), NonZeroUsize::new(1)); + test("default", 4, 5); + test("small_group_small_min_level", 2, 2); + test("small_group_large_min_level", 2, 128); + test("large_group_small_min_level", 16, 2); + test("odd_group_odd_min_level", 7, 3); } } diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index 75ca5d55b..14b421242 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -84,15 +84,10 @@ impl<'i> FacetsUpdateIncremental<'i> { } pub struct FacetsUpdateIncrementalInner { - db: heed::Database, FacetGroupValueCodec>, - group_size: u8, - min_level_size: u8, - max_group_size: u8, -} -impl FacetsUpdateIncrementalInner { - pub fn new(db: heed::Database, FacetGroupValueCodec>) -> Self { - Self { db, group_size: 4, min_level_size: 5, max_group_size: 8 } - } + pub db: heed::Database, FacetGroupValueCodec>, + pub group_size: u8, + pub min_level_size: u8, + pub max_group_size: u8, } impl FacetsUpdateIncrementalInner { fn find_insertion_key_value( @@ -528,82 +523,13 @@ impl FacetsUpdateIncrementalInner { #[cfg(test)] mod tests { - use heed::types::ByteSlice; - use heed::{BytesDecode, BytesEncode}; + use crate::heed_codec::facet::{OrderedF64Codec, StrRefCodec}; + use crate::milli_snap; + use crate::update::facet::tests::FacetIndex; use rand::seq::SliceRandom; use rand::{Rng, SeedableRng}; use roaring::RoaringBitmap; - use crate::heed_codec::facet::{ - ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec, OrderedF64Codec, StrRefCodec, - }; - use crate::milli_snap; - use crate::search::facet::get_highest_level; - use crate::search::facet::test::FacetIndex; - - pub fn verify_structure_validity(index: &FacetIndex, field_id: u16) - where - for<'a> C: BytesDecode<'a> + BytesEncode<'a, EItem = >::DItem>, - { - let FacetIndex { env, db, .. } = index; - - let txn = env.write_txn().unwrap(); - let mut field_id_prefix = vec![]; - field_id_prefix.extend_from_slice(&field_id.to_be_bytes()); - - let highest_level = get_highest_level(&txn, index.db.content, field_id).unwrap(); - txn.commit().unwrap(); - - let txn = env.read_txn().unwrap(); - for level_no in (1..=highest_level).rev() { - let mut level_no_prefix = vec![]; - level_no_prefix.extend_from_slice(&field_id.to_be_bytes()); - level_no_prefix.push(level_no); - - let mut iter = db - .content - .as_polymorph() - .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(&txn, &level_no_prefix) - .unwrap(); - while let Some(el) = iter.next() { - let (key, value) = el.unwrap(); - let key = FacetGroupKeyCodec::::bytes_decode(&key).unwrap(); - - let mut prefix_start_below = vec![]; - prefix_start_below.extend_from_slice(&field_id.to_be_bytes()); - prefix_start_below.push(level_no - 1); - prefix_start_below.extend_from_slice(&key.left_bound); - - let start_below = { - let mut start_below_iter = db - .content - .as_polymorph() - .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>( - &txn, - &prefix_start_below, - ) - .unwrap(); - let (key_bytes, _) = start_below_iter.next().unwrap().unwrap(); - FacetGroupKeyCodec::::bytes_decode(&key_bytes).unwrap() - }; - - assert!(value.size > 0 && value.size < db.max_group_size); - - let mut actual_size = 0; - let mut values_below = RoaringBitmap::new(); - let mut iter_below = - db.content.range(&txn, &(start_below..)).unwrap().take(value.size as usize); - while let Some(el) = iter_below.next() { - let (_, value) = el.unwrap(); - actual_size += 1; - values_below |= value.bitmap; - } - assert_eq!(actual_size, value.size, "{key:?} start_below: {start_below:?}"); - - assert_eq!(value.bitmap, values_below); - } - } - } #[test] fn append() { let index = FacetIndex::::new(4, 8, 5); @@ -614,7 +540,9 @@ mod tests { index.insert(&mut txn, 0, &(i as f64), &bitmap); txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + let txn = index.env.read_txn().unwrap(); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}")); } #[test] @@ -641,9 +569,11 @@ mod tests { index.insert(&mut txn, 1, &(i as f64), &bitmap); txn.commit().unwrap(); } - verify_structure_validity(&index, 0); - verify_structure_validity(&index, 1); - verify_structure_validity(&index, 2); + let txn = index.env.read_txn().unwrap(); + index.verify_structure_validity(&txn, 0); + index.verify_structure_validity(&txn, 1); + index.verify_structure_validity(&txn, 2); + txn.commit().unwrap(); milli_snap!(format!("{index}")); } #[test] @@ -670,9 +600,11 @@ mod tests { index.insert(&mut txn, 1, &(i as f64), &bitmap); txn.commit().unwrap(); } - verify_structure_validity(&index, 0); - verify_structure_validity(&index, 1); - verify_structure_validity(&index, 2); + let txn = index.env.read_txn().unwrap(); + index.verify_structure_validity(&txn, 0); + index.verify_structure_validity(&txn, 1); + index.verify_structure_validity(&txn, 2); + txn.commit().unwrap(); milli_snap!(format!("{index}")); } @@ -686,8 +618,9 @@ mod tests { bitmap.insert(i); index.insert(&mut txn, 0, &(i as f64), &bitmap); } + + index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); - verify_structure_validity(&index, 0); milli_snap!(format!("{index}")); } @@ -705,146 +638,138 @@ mod tests { bitmap.insert(key); index.insert(&mut txn, 0, &(key as f64), &bitmap); } + index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); - verify_structure_validity(&index, 0); milli_snap!(format!("{index}")); } #[test] fn merge_values() { let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); let mut keys = (0..256).into_iter().collect::>(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); keys.shuffle(&mut rng); + for (_i, key) in keys.into_iter().enumerate() { let mut bitmap = RoaringBitmap::new(); bitmap.insert(key); bitmap.insert(rng.gen_range(256..512)); - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.insert(&mut txn, 0, &(key as f64), &bitmap); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}")); } #[test] fn delete_from_end() { let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); for i in 0..256 { let mut bitmap = RoaringBitmap::new(); bitmap.insert(i); - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.insert(&mut txn, 0, &(&(i as f64)), &bitmap); - txn.commit().unwrap(); } for i in (200..256).into_iter().rev() { - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.delete(&mut txn, 0, &(i as f64), i as u32); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), 200); + let mut txn = index.env.write_txn().unwrap(); for i in (150..200).into_iter().rev() { - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.delete(&mut txn, 0, &(i as f64), i as u32); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), 150); - + let mut txn = index.env.write_txn().unwrap(); for i in (100..150).into_iter().rev() { - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.delete(&mut txn, 0, &(i as f64), i as u32); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), 100); - + let mut txn = index.env.write_txn().unwrap(); for i in (17..100).into_iter().rev() { - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.delete(&mut txn, 0, &(i as f64), i as u32); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), 17); - let mut txn = index.env.write_txn().unwrap(); for i in (15..17).into_iter().rev() { index.delete(&mut txn, 0, &(i as f64), i as u32); } + index.verify_structure_validity(&txn, 0); txn.commit().unwrap(); - verify_structure_validity(&index, 0); milli_snap!(format!("{index}"), 15); + let mut txn = index.env.write_txn().unwrap(); for i in (0..15).into_iter().rev() { - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.delete(&mut txn, 0, &(i as f64), i as u32); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), 0); } #[test] fn delete_from_start() { let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); for i in 0..256 { let mut bitmap = RoaringBitmap::new(); bitmap.insert(i); - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.insert(&mut txn, 0, &(i as f64), &bitmap); - txn.commit().unwrap(); } for i in 0..128 { - let mut txn = index.env.write_txn().unwrap(); index.delete(&mut txn, 0, &(i as f64), i as u32); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), 127); + let mut txn = index.env.write_txn().unwrap(); for i in 128..216 { - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.delete(&mut txn, 0, &(i as f64), i as u32); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), 215); + let mut txn = index.env.write_txn().unwrap(); for i in 216..256 { - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.delete(&mut txn, 0, &(i as f64), i as u32); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), 255); } #[test] fn delete_shuffled() { let index = FacetIndex::::new(4, 8, 5); - + let mut txn = index.env.write_txn().unwrap(); for i in 0..256 { let mut bitmap = RoaringBitmap::new(); bitmap.insert(i); - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.insert(&mut txn, 0, &(i as f64), &bitmap); - txn.commit().unwrap(); } let mut keys = (0..256).into_iter().collect::>(); @@ -853,36 +778,37 @@ mod tests { for i in 0..128 { let key = keys[i]; - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.delete(&mut txn, 0, &(key as f64), key as u32); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), 127); + let mut txn = index.env.write_txn().unwrap(); for i in 128..216 { let key = keys[i]; - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.delete(&mut txn, 0, &(key as f64), key as u32); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + let mut txn = index.env.write_txn().unwrap(); milli_snap!(format!("{index}"), 215); for i in 216..256 { let key = keys[i]; - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.delete(&mut txn, 0, &(key as f64), key as u32); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), 255); } #[test] fn in_place_level0_insert() { let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + let mut keys = (0..16).into_iter().collect::>(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); keys.shuffle(&mut rng); @@ -890,19 +816,19 @@ mod tests { for &key in keys.iter() { let mut bitmap = RoaringBitmap::new(); bitmap.insert(rng.gen_range(i * 256..(i + 1) * 256)); - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.insert(&mut txn, 0, &(key as f64), &bitmap); - txn.commit().unwrap(); } } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}")); } #[test] fn in_place_level0_delete() { let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); let mut keys = (0..64).into_iter().collect::>(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); @@ -912,27 +838,29 @@ mod tests { let mut bitmap = RoaringBitmap::new(); bitmap.insert(key); bitmap.insert(key + 100); - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); + index.insert(&mut txn, 0, &(key as f64), &bitmap); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), "before_delete"); + let mut txn = index.env.write_txn().unwrap(); + for &key in keys.iter() { - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.delete(&mut txn, 0, &(key as f64), key + 100); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), "after_delete"); } #[test] fn shuffle_merge_string_and_delete() { let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); let mut keys = (1000..1064).into_iter().collect::>(); let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); @@ -942,21 +870,21 @@ mod tests { let mut bitmap = RoaringBitmap::new(); bitmap.insert(key); bitmap.insert(key + 100); - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.insert(&mut txn, 0, &format!("{key:x}").as_str(), &bitmap); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), "before_delete"); + let mut txn = index.env.write_txn().unwrap(); + for &key in keys.iter() { - verify_structure_validity(&index, 0); - let mut txn = index.env.write_txn().unwrap(); + index.verify_structure_validity(&txn, 0); index.delete(&mut txn, 0, &format!("{key:x}").as_str(), key + 100); - txn.commit().unwrap(); } - verify_structure_validity(&index, 0); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); milli_snap!(format!("{index}"), "after_delete"); } @@ -1083,7 +1011,7 @@ mod tests { // assert_eq!(key, &group_key.left_bound); // assert_eq!(values, &group_values.bitmap); // } -// verify_structure_validity(&index, *field_id); +// index.verify_structure_validity(*field_id); // } // index.db.content.clear(&mut txn).unwrap(); diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index 3b46bb421..7298fecc5 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -68,10 +68,244 @@ impl<'i> FacetsUpdate<'i> { } #[cfg(test)] -mod tests { - // here I want to create a benchmark - // to find out at which point it is faster to do it incrementally +pub(crate) mod tests { + use super::bulk::FacetsUpdateBulkInner; + use crate::heed_codec::facet::{ + ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, + }; + use crate::search::facet::get_highest_level; + use crate::snapshot_tests::display_bitmap; + use crate::update::FacetsUpdateIncrementalInner; + use crate::CboRoaringBitmapCodec; + use heed::types::ByteSlice; + use heed::{BytesDecode, BytesEncode, Env, RoTxn, RwTxn}; + use roaring::RoaringBitmap; + use std::fmt::Display; + use std::marker::PhantomData; + use std::rc::Rc; - #[test] - fn update() {} + // A dummy index that only contains the facet database, used for testing + pub struct FacetIndex + where + for<'a> BoundCodec: + BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, + { + pub env: Env, + pub content: heed::Database, FacetGroupValueCodec>, + pub group_size: u8, + pub min_level_size: u8, + pub max_group_size: u8, + _tempdir: Rc, + _phantom: PhantomData, + } + + impl FacetIndex + where + for<'a> BoundCodec: + BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, + { + #[cfg(all(test, fuzzing))] + pub fn open_from_tempdir( + tempdir: Rc, + group_size: u8, + max_group_size: u8, + min_level_size: u8, + ) -> FacetIndex { + let group_size = std::cmp::min(127, std::cmp::max(group_size, 2)); // 2 <= x <= 127 + let max_group_size = std::cmp::min(127, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 127 + let min_level_size = std::cmp::max(1, min_level_size); // 1 <= x <= inf + + let mut options = heed::EnvOpenOptions::new(); + let options = options.map_size(4096 * 4 * 10 * 100); + unsafe { + options.flag(heed::flags::Flags::MdbAlwaysFreePages); + } + let env = options.open(tempdir.path()).unwrap(); + let content = env.open_database(None).unwrap().unwrap(); + + FacetIndex { + db: Database { + content, + group_size, + max_group_size, + min_level_size, + _tempdir: tempdir, + }, + env, + _phantom: PhantomData, + } + } + pub fn new( + group_size: u8, + max_group_size: u8, + min_level_size: u8, + ) -> FacetIndex { + let group_size = std::cmp::min(127, std::cmp::max(group_size, 2)); // 2 <= x <= 127 + let max_group_size = std::cmp::min(127, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 127 + let min_level_size = std::cmp::max(1, min_level_size); // 1 <= x <= inf + let mut options = heed::EnvOpenOptions::new(); + let options = options.map_size(4096 * 4 * 100); + let tempdir = tempfile::TempDir::new().unwrap(); + let env = options.open(tempdir.path()).unwrap(); + let content = env.create_database(None).unwrap(); + + FacetIndex { + content, + group_size, + max_group_size, + min_level_size, + _tempdir: Rc::new(tempdir), + env, + _phantom: PhantomData, + } + } + pub fn insert<'a>( + &self, + wtxn: &'a mut RwTxn, + field_id: u16, + key: &'a >::EItem, + docids: &RoaringBitmap, + ) { + let update = FacetsUpdateIncrementalInner { + db: self.content, + group_size: self.group_size, + min_level_size: self.min_level_size, + max_group_size: self.max_group_size, + }; + let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); + update.insert(wtxn, field_id, &key_bytes, docids).unwrap(); + } + pub fn delete<'a>( + &self, + wtxn: &'a mut RwTxn, + field_id: u16, + key: &'a >::EItem, + value: u32, + ) { + let update = FacetsUpdateIncrementalInner { + db: self.content, + group_size: self.group_size, + min_level_size: self.min_level_size, + max_group_size: self.max_group_size, + }; + let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); + update.delete(wtxn, field_id, &key_bytes, value).unwrap(); + } + + pub fn bulk_insert<'a, 'b>( + &self, + wtxn: &'a mut RwTxn, + field_ids: &[u16], + els: impl IntoIterator< + Item = &'a ((u16, >::EItem), RoaringBitmap), + >, + ) where + for<'c> >::EItem: Sized, + { + let mut new_data = vec![]; + let mut writer = grenad::Writer::new(&mut new_data); + for ((field_id, left_bound), docids) in els { + let left_bound_bytes = BoundCodec::bytes_encode(left_bound).unwrap().into_owned(); + let key: FacetGroupKey<&[u8]> = + FacetGroupKey { field_id: *field_id, level: 0, left_bound: &left_bound_bytes }; + let key = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); + let value = CboRoaringBitmapCodec::bytes_encode(&docids).unwrap(); + writer.insert(&key, &value).unwrap(); + } + writer.finish().unwrap(); + let reader = grenad::Reader::new(std::io::Cursor::new(new_data)).unwrap(); + + let update = FacetsUpdateBulkInner { + db: self.content, + new_data: Some(reader), + group_size: self.group_size, + min_level_size: self.min_level_size, + }; + + update.update(wtxn, field_ids, |_, _, _| Ok(())).unwrap(); + } + + pub fn verify_structure_validity(&self, txn: &RoTxn, field_id: u16) { + let mut field_id_prefix = vec![]; + field_id_prefix.extend_from_slice(&field_id.to_be_bytes()); + + let highest_level = get_highest_level(txn, self.content, field_id).unwrap(); + + for level_no in (1..=highest_level).rev() { + let mut level_no_prefix = vec![]; + level_no_prefix.extend_from_slice(&field_id.to_be_bytes()); + level_no_prefix.push(level_no); + + let mut iter = self + .content + .as_polymorph() + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(txn, &level_no_prefix) + .unwrap(); + while let Some(el) = iter.next() { + let (key, value) = el.unwrap(); + let key = FacetGroupKeyCodec::::bytes_decode(&key).unwrap(); + + let mut prefix_start_below = vec![]; + prefix_start_below.extend_from_slice(&field_id.to_be_bytes()); + prefix_start_below.push(level_no - 1); + prefix_start_below.extend_from_slice(&key.left_bound); + + let start_below = { + let mut start_below_iter = self + .content + .as_polymorph() + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>( + txn, + &prefix_start_below, + ) + .unwrap(); + let (key_bytes, _) = start_below_iter.next().unwrap().unwrap(); + FacetGroupKeyCodec::::bytes_decode(&key_bytes).unwrap() + }; + + assert!(value.size > 0 && value.size < self.max_group_size); + + let mut actual_size = 0; + let mut values_below = RoaringBitmap::new(); + let mut iter_below = self + .content + .range(txn, &(start_below..)) + .unwrap() + .take(value.size as usize); + while let Some(el) = iter_below.next() { + let (_, value) = el.unwrap(); + actual_size += 1; + values_below |= value.bitmap; + } + assert_eq!(actual_size, value.size, "{key:?} start_below: {start_below:?}"); + + assert_eq!(value.bitmap, values_below); + } + } + } + } + + impl Display for FacetIndex + where + for<'a> >::EItem: Sized + Display, + for<'a> BoundCodec: + BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, + { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let txn = self.env.read_txn().unwrap(); + let mut iter = self.content.iter(&txn).unwrap(); + while let Some(el) = iter.next() { + let (key, value) = el.unwrap(); + let FacetGroupKey { field_id, level, left_bound: bound } = key; + let bound = BoundCodec::bytes_decode(bound).unwrap(); + let FacetGroupValue { size, bitmap } = value; + writeln!( + f, + "{field_id:<2} {level:<2} k{bound:<8} {size:<4} {values:?}", + values = display_bitmap(&bitmap) + )?; + } + Ok(()) + } + } } diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/default/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/default.hash.snap similarity index 58% rename from milli/src/update/facet/snapshots/bulk.rs/test_facets_number/default/facet_id_f64_docids.hash.snap rename to milli/src/update/facet/snapshots/bulk.rs/insert/default.hash.snap index 960843592..bef20823c 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/default/facet_id_f64_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/insert/default.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -07718df52f8463335fb8fefcd3ae01f4 +b40dd31a65e033ffc6b35c027ce19506 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/large_group_small_min_level.hash.snap similarity index 58% rename from milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap rename to milli/src/update/facet/snapshots/bulk.rs/insert/large_group_small_min_level.hash.snap index 960843592..74c40e6a3 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/insert/large_group_small_min_level.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -07718df52f8463335fb8fefcd3ae01f4 +7ee22d8e9387e72758f00918eb67e4c6 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/odd_group_odd_min_level.hash.snap similarity index 58% rename from milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap rename to milli/src/update/facet/snapshots/bulk.rs/insert/odd_group_odd_min_level.hash.snap index 960843592..6fb086d35 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/insert/odd_group_odd_min_level.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -07718df52f8463335fb8fefcd3ae01f4 +60f567359382507afdaf45fb075740c3 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_large_min_level.hash.snap similarity index 58% rename from milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap rename to milli/src/update/facet/snapshots/bulk.rs/insert/small_group_large_min_level.hash.snap index 960843592..0271a6c6b 100644 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap +++ b/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_large_min_level.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/facet/bulk.rs --- -07718df52f8463335fb8fefcd3ae01f4 +b986d6e6cbf425685f409a8b417010e1 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_small_min_level.hash.snap new file mode 100644 index 000000000..d801ef19f --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_small_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +ee10dd2ae2b5c6621a89a5d0a9aa8ccc diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/default.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/default.hash.snap new file mode 100644 index 000000000..e9988f527 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/default.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +fa877559eef78b383b496c15a364a2dc diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/large_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/large_group_small_min_level.hash.snap new file mode 100644 index 000000000..aa52901da --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/large_group_small_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +16a96353bc42f2ff3e91611ca4d5b184 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/odd_group_odd_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/odd_group_odd_min_level.hash.snap new file mode 100644 index 000000000..64f5012a4 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/odd_group_odd_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +be1b08073b9d9788d18080c1320151d7 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_large_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_large_min_level.hash.snap new file mode 100644 index 000000000..aa52901da --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_large_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +16a96353bc42f2ff3e91611ca4d5b184 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_small_min_level.hash.snap new file mode 100644 index 000000000..bb0e9aa69 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_small_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +32a45d555df2e001420fea149818d376 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap deleted file mode 100644 index 960843592..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -07718df52f8463335fb8fefcd3ae01f4 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap deleted file mode 100644 index 960843592..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -07718df52f8463335fb8fefcd3ae01f4 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/default/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/default/facet_id_f64_docids.hash.snap deleted file mode 100644 index c2b3896c4..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/default/facet_id_f64_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -9e9175e0a56db39f0dc04fb8f15c28fe diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap deleted file mode 100644 index c2b3896c4..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_number_update/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -9e9175e0a56db39f0dc04fb8f15c28fe diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/default/facet_id_string_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/default/facet_id_string_docids.hash.snap deleted file mode 100644 index 574a3c393..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/default/facet_id_string_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -3e6a91b3c54c614a4787224ac4278ed3 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap deleted file mode 100644 index 574a3c393..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -3e6a91b3c54c614a4787224ac4278ed3 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/default/facet_id_string_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/default/facet_id_string_docids.hash.snap deleted file mode 100644 index c9f8951ac..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/default/facet_id_string_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -b494fb6565707ce401f6d6ac03f46b93 diff --git a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap deleted file mode 100644 index c9f8951ac..000000000 --- a/milli/src/update/facet/snapshots/bulk.rs/test_facets_string_update/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facet/bulk.rs ---- -b494fb6565707ce401f6d6ac03f46b93 diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index b13118e09..952720725 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -17,7 +17,7 @@ pub use self::words_prefixes_fst::WordsPrefixesFst; mod available_documents_ids; mod clear_documents; mod delete_documents; -mod facet; +pub(crate) mod facet; mod index_documents; mod indexer_config; mod prefix_word_pairs; diff --git a/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_pair_proximity_docids.hash.snap b/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_pair_proximity_docids.hash.snap deleted file mode 100644 index e50e50347..000000000 --- a/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_pair_proximity_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/word_prefix_pair_proximity_docids.rs ---- -6873ff1f78d08f2b1a13bb9e37349c01