From cf0cd92ed46df14743b90c091055ccec37988e64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Sat, 16 Jul 2022 20:26:59 +0200 Subject: [PATCH 1/4] Refactor Facets::execute to increase performance --- milli/src/update/facets.rs | 249 +++++++++++++++++++++++++++++++++++-- 1 file changed, 236 insertions(+), 13 deletions(-) diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 4c4963b56..8899f0485 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -1,10 +1,11 @@ use std::fs::File; use std::num::{NonZeroU8, NonZeroUsize}; +use std::ops::RangeInclusive; use std::{cmp, mem}; use grenad::{CompressionType, Reader, Writer}; use heed::types::{ByteSlice, DecodeIgnore}; -use heed::{BytesEncode, Error}; +use heed::{BytesDecode, BytesEncode, Error}; use log::debug; use roaring::RoaringBitmap; use time::OffsetDateTime; @@ -86,13 +87,32 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { clear_field_number_levels(self.wtxn, self.index.facet_id_f64_docids, field_id)?; // Compute and store the faceted numbers documents ids. 
- let number_documents_ids = compute_faceted_numbers_documents_ids( - self.wtxn, - self.index.facet_id_f64_docids.remap_key_type::(), - field_id, - )?; + // let number_documents_ids = compute_faceted_numbers_documents_ids( + // self.wtxn, + // self.index.facet_id_f64_docids.remap_key_type::(), + // field_id, + // )?; - let facet_number_levels = compute_facet_number_levels( + // let facet_number_levels = compute_facet_number_levels( + // self.wtxn, + // self.index.facet_id_f64_docids, + // self.chunk_compression_type, + // self.chunk_compression_level, + // self.level_group_size, + // self.min_level_size, + // field_id, + // )?; + + // println!("printing 1"); + + // let mut cursor = facet_number_levels.into_cursor().unwrap(); + // while let Some((key, bitmap)) = cursor.move_on_next().unwrap() { + // let key = FacetLevelValueF64Codec::bytes_decode(key).unwrap(); + // let bitmap = CboRoaringBitmapCodec::bytes_decode(bitmap).unwrap(); + // println!("{key:?} {bitmap:?}"); + // } + + let (facet_number_levels_2, number_documents_ids) = compute_facet_number_levels_2( self.wtxn, self.index.facet_id_f64_docids, self.chunk_compression_type, @@ -102,6 +122,32 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { field_id, )?; + // let mut writer = create_writer( + // self.chunk_compression_type, + // self.chunk_compression_level, + // tempfile::tempfile()?, + // ); + // for fnl in facet_number_levels_2 { + // let mut cursor = fnl.into_cursor().unwrap(); + // while let Some((key, bitmap)) = cursor.move_on_next().unwrap() { + // writer.insert(key, bitmap).unwrap(); + // } + // } + // let reader = writer_into_reader(writer)?; + // let mut cursor1 = reader.into_cursor().unwrap(); + // let mut cursor2 = facet_number_levels.into_cursor().unwrap(); + // loop { + // let (c1, c2) = (cursor1.move_on_next().unwrap(), cursor2.move_on_next().unwrap()); + // match (c1, c2) { + // (Some((k1, v1)), Some((k2, v2))) => { + // assert_eq!(k1, k2); + // assert_eq!(v1, v2); + // } + // (None, None) => break, + 
// _ => panic!(), + // } + // } + self.index.put_string_faceted_documents_ids( self.wtxn, field_id, @@ -113,12 +159,16 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { &number_documents_ids, )?; - write_into_lmdb_database( - self.wtxn, - *self.index.facet_id_f64_docids.as_polymorph(), - facet_number_levels, - |_, _| Err(InternalError::IndexingMergingKeys { process: "facet number levels" })?, - )?; + for facet_number_levels in facet_number_levels_2 { + write_into_lmdb_database( + self.wtxn, + *self.index.facet_id_f64_docids.as_polymorph(), + facet_number_levels, + |_, _| { + Err(InternalError::IndexingMergingKeys { process: "facet number levels" })? + }, + )?; + } write_into_lmdb_database( self.wtxn, @@ -143,6 +193,177 @@ fn clear_field_number_levels<'t>( db.delete_range(wtxn, &range).map(drop) } +fn compute_facet_number_levels_2<'t>( + rtxn: &'t heed::RoTxn, + db: heed::Database, + compression_type: CompressionType, + compression_level: Option, + level_group_size: NonZeroUsize, + min_level_size: NonZeroUsize, + field_id: FieldId, +) -> Result<(Vec>, RoaringBitmap)> { + let first_level_size = db + .remap_key_type::() + .prefix_iter(rtxn, &field_id.to_be_bytes())? + .remap_types::() + .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; + + let level_0_range = { + let left = (field_id, 0, f64::MIN, f64::MIN); + let right = (field_id, 0, f64::MAX, f64::MAX); + left..=right + }; + + // Groups sizes are always a power of the original level_group_size and therefore a group + // always maps groups of the previous level and never splits previous levels groups in half. + let group_size_iter = (1u8..) 
+ .map(|l| (l, level_group_size.get().pow(l as u32))) + .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()) + .collect::>(); + + // dbg!(first_level_size, min_level_size); + // dbg!(level_group_size); + // dbg!(&group_size_iter); + + let mut number_document_ids = RoaringBitmap::new(); + + if let Some((top_level, _)) = group_size_iter.last() { + let subwriters = recursive_compute_levels( + rtxn, + db, + compression_type, + compression_level, + *top_level, + level_0_range, + level_group_size, + &mut |bitmaps, _, _| { + for bitmap in bitmaps { + number_document_ids |= bitmap; + } + Ok(()) + }, + )?; + Ok((subwriters, number_document_ids)) + } else { + let mut documents_ids = RoaringBitmap::new(); + for result in db.range(rtxn, &level_0_range)? { + let (_key, docids) = result?; + documents_ids |= docids; + } + + Ok((vec![], documents_ids)) + } +} + +fn recursive_compute_levels<'t>( + rtxn: &'t heed::RoTxn, + db: heed::Database, + compression_type: CompressionType, + compression_level: Option, + level: u8, + level_0_range: RangeInclusive<(FieldId, u8, f64, f64)>, + level_group_size: NonZeroUsize, + computed_group_bitmap: &mut dyn FnMut(&[RoaringBitmap], f64, f64) -> Result<()>, +) -> Result>> { + let (field_id, level_0, first_left, first_right) = level_0_range.start().clone(); + assert_eq!(level_0, 0); + assert_eq!(first_left, first_right); + if level == 0 { + let mut bitmaps = vec![]; + + let mut first_f64_value = first_left; + let mut last_f64_value = first_left; + + let mut first_iteration_for_new_group = true; + for db_result_item in db.range(rtxn, &level_0_range)? 
{ + let ((_field_id, _level, left, _right), docids) = db_result_item?; + // println!("level0: {left}"); + assert_eq!(_level, 0); + assert_eq!(left, _right); + if first_iteration_for_new_group { + first_f64_value = left; + first_iteration_for_new_group = false; + } + last_f64_value = left; + bitmaps.push(docids); + + if bitmaps.len() == level_group_size.get() { + // println!("callback first level with {bitmaps:?} {last_f64_value:?}"); + computed_group_bitmap(&bitmaps, first_f64_value, last_f64_value)?; + first_iteration_for_new_group = true; + bitmaps.clear(); + } + } + if !bitmaps.is_empty() { + // println!("end callback first level with {bitmaps:?} {last_f64_value:?}"); + computed_group_bitmap(&bitmaps, first_f64_value, last_f64_value)?; + bitmaps.clear(); + } + + // level 0 isn't actually stored in this DB, since it contains exactly the same information as that other DB + return Ok(vec![]); + } else { + let mut cur_writer = + create_writer(compression_type, compression_level, tempfile::tempfile()?); + + let mut range_for_bitmaps = vec![]; + let mut bitmaps = vec![]; + + let mut sub_writers = recursive_compute_levels( + rtxn, + db, + compression_type, + compression_level, + level - 1, + level_0_range, + level_group_size, + &mut |sub_bitmaps: &[RoaringBitmap], start_range, end_range| { + let mut combined_bitmap = RoaringBitmap::default(); + for bitmap in sub_bitmaps { + combined_bitmap |= bitmap; + } + range_for_bitmaps.push((start_range, end_range)); + + bitmaps.push(combined_bitmap); + if bitmaps.len() == level_group_size.get() { + let start_range = range_for_bitmaps.first().unwrap().0; + let end_range = range_for_bitmaps.last().unwrap().1; + // println!("callback level {} with {bitmaps:?} {last_f64_value:?}", level + 1); + computed_group_bitmap(&bitmaps, start_range, end_range)?; + for (bitmap, (start_range, end_range)) in + bitmaps.drain(..).zip(range_for_bitmaps.drain(..)) + { + // println!("write {field_id} {level} {start_range} {end_range} {bitmap:?}"); + 
write_number_entry( + &mut cur_writer, + field_id, + level, + start_range, + end_range, + &bitmap, + )?; + } + } + // println!("end callback level {level}"); + Ok(()) + }, + )?; + if !bitmaps.is_empty() { + let start_range = range_for_bitmaps.first().unwrap().0; + let end_range = range_for_bitmaps.last().unwrap().1; + // println!("end callback level {} with {bitmaps:?} {last_f64_value:?}", level + 1); + computed_group_bitmap(&bitmaps, start_range, end_range)?; + for (bitmap, (left, right)) in bitmaps.drain(..).zip(range_for_bitmaps.drain(..)) { + // println!("end write: {field_id} {level} {left} {right} {bitmap:?}"); + write_number_entry(&mut cur_writer, field_id, level, left, right, &bitmap)?; + } + } + + sub_writers.push(writer_into_reader(cur_writer)?); + return Ok(sub_writers); + } +} + fn compute_facet_number_levels<'t>( rtxn: &'t heed::RoTxn, db: heed::Database, @@ -175,6 +396,7 @@ fn compute_facet_number_levels<'t>( .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()); for (level, group_size) in group_size_iter { + // dbg!(level, group_size); let mut left = 0.0; let mut right = 0.0; let mut group_docids = RoaringBitmap::new(); @@ -218,6 +440,7 @@ fn write_number_entry( let key = (field_id, level, left, right); let key = FacetLevelValueF64Codec::bytes_encode(&key).ok_or(Error::Encoding)?; let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; + // println!(" w{field_id}-{level}-{left}-{right}"); writer.insert(&key, &data)?; Ok(()) } From 8d4b21a00525cec1095e237b1ec6c0a1473512d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 18 Jul 2022 09:59:08 +0200 Subject: [PATCH 2/4] Switch string facet levels indexation to new algo Write the algorithm once for both numbers and strings --- milli/src/update/facets.rs | 505 ++++++++++++++----------------------- 1 file changed, 185 insertions(+), 320 deletions(-) diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 8899f0485..b3d9f1c58 
100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -1,13 +1,12 @@ -use std::fs::File; -use std::num::{NonZeroU8, NonZeroUsize}; -use std::ops::RangeInclusive; -use std::{cmp, mem}; - use grenad::{CompressionType, Reader, Writer}; use heed::types::{ByteSlice, DecodeIgnore}; use heed::{BytesDecode, BytesEncode, Error}; use log::debug; use roaring::RoaringBitmap; +use std::cmp; +use std::fs::File; +use std::num::{NonZeroU8, NonZeroUsize}; +use std::ops::RangeFrom; use time::OffsetDateTime; use crate::error::InternalError; @@ -66,14 +65,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { field_id, )?; - // Compute and store the faceted strings documents ids. - let string_documents_ids = compute_faceted_strings_documents_ids( - self.wtxn, - self.index.facet_id_string_docids.remap_key_type::(), - field_id, - )?; - - let facet_string_levels = compute_facet_string_levels( + let (facet_string_levels, string_documents_ids) = compute_facet_strings_levels( self.wtxn, self.index.facet_id_string_docids, self.chunk_compression_type, @@ -83,36 +75,26 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { field_id, )?; + self.index.put_string_faceted_documents_ids( + self.wtxn, + field_id, + &string_documents_ids, + )?; + for facet_strings_levels in facet_string_levels { + write_into_lmdb_database( + self.wtxn, + *self.index.facet_id_string_docids.as_polymorph(), + facet_strings_levels, + |_, _| { + Err(InternalError::IndexingMergingKeys { process: "facet string levels" })? + }, + )?; + } + // Clear the facet number levels. clear_field_number_levels(self.wtxn, self.index.facet_id_f64_docids, field_id)?; - // Compute and store the faceted numbers documents ids. 
- // let number_documents_ids = compute_faceted_numbers_documents_ids( - // self.wtxn, - // self.index.facet_id_f64_docids.remap_key_type::(), - // field_id, - // )?; - - // let facet_number_levels = compute_facet_number_levels( - // self.wtxn, - // self.index.facet_id_f64_docids, - // self.chunk_compression_type, - // self.chunk_compression_level, - // self.level_group_size, - // self.min_level_size, - // field_id, - // )?; - - // println!("printing 1"); - - // let mut cursor = facet_number_levels.into_cursor().unwrap(); - // while let Some((key, bitmap)) = cursor.move_on_next().unwrap() { - // let key = FacetLevelValueF64Codec::bytes_decode(key).unwrap(); - // let bitmap = CboRoaringBitmapCodec::bytes_decode(bitmap).unwrap(); - // println!("{key:?} {bitmap:?}"); - // } - - let (facet_number_levels_2, number_documents_ids) = compute_facet_number_levels_2( + let (facet_number_levels_2, number_documents_ids) = compute_facet_number_levels( self.wtxn, self.index.facet_id_f64_docids, self.chunk_compression_type, @@ -122,37 +104,6 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { field_id, )?; - // let mut writer = create_writer( - // self.chunk_compression_type, - // self.chunk_compression_level, - // tempfile::tempfile()?, - // ); - // for fnl in facet_number_levels_2 { - // let mut cursor = fnl.into_cursor().unwrap(); - // while let Some((key, bitmap)) = cursor.move_on_next().unwrap() { - // writer.insert(key, bitmap).unwrap(); - // } - // } - // let reader = writer_into_reader(writer)?; - // let mut cursor1 = reader.into_cursor().unwrap(); - // let mut cursor2 = facet_number_levels.into_cursor().unwrap(); - // loop { - // let (c1, c2) = (cursor1.move_on_next().unwrap(), cursor2.move_on_next().unwrap()); - // match (c1, c2) { - // (Some((k1, v1)), Some((k2, v2))) => { - // assert_eq!(k1, k2); - // assert_eq!(v1, v2); - // } - // (None, None) => break, - // _ => panic!(), - // } - // } - - self.index.put_string_faceted_documents_ids( - self.wtxn, - field_id, - 
&string_documents_ids, - )?; self.index.put_number_faceted_documents_ids( self.wtxn, field_id, @@ -169,31 +120,13 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { }, )?; } - - write_into_lmdb_database( - self.wtxn, - *self.index.facet_id_string_docids.as_polymorph(), - facet_string_levels, - |_, _| Err(InternalError::IndexingMergingKeys { process: "facet string levels" })?, - )?; } Ok(()) } } -fn clear_field_number_levels<'t>( - wtxn: &'t mut heed::RwTxn, - db: heed::Database, - field_id: FieldId, -) -> heed::Result<()> { - let left = (field_id, 1, f64::MIN, f64::MIN); - let right = (field_id, u8::MAX, f64::MAX, f64::MAX); - let range = left..=right; - db.delete_range(wtxn, &range).map(drop) -} - -fn compute_facet_number_levels_2<'t>( +fn compute_facet_number_levels<'t>( rtxn: &'t heed::RoTxn, db: heed::Database, compression_type: CompressionType, @@ -208,11 +141,7 @@ fn compute_facet_number_levels_2<'t>( .remap_types::() .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; - let level_0_range = { - let left = (field_id, 0, f64::MIN, f64::MIN); - let right = (field_id, 0, f64::MAX, f64::MAX); - left..=right - }; + let level_0_start = (field_id, 0, f64::MIN, f64::MIN); // Groups sizes are always a power of the original level_group_size and therefore a group // always maps groups of the previous level and never splits previous levels groups in half. 
@@ -221,32 +150,38 @@ fn compute_facet_number_levels_2<'t>( .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()) .collect::>(); - // dbg!(first_level_size, min_level_size); - // dbg!(level_group_size); - // dbg!(&group_size_iter); - let mut number_document_ids = RoaringBitmap::new(); if let Some((top_level, _)) = group_size_iter.last() { - let subwriters = recursive_compute_levels( - rtxn, - db, - compression_type, - compression_level, - *top_level, - level_0_range, - level_group_size, - &mut |bitmaps, _, _| { - for bitmap in bitmaps { - number_document_ids |= bitmap; - } - Ok(()) - }, - )?; + let subwriters = + recursive_compute_levels::( + rtxn, + db, + compression_type, + compression_level, + *top_level, + level_0_start, + &(level_0_start..), + first_level_size, + level_group_size, + &mut |bitmaps, _, _| { + for bitmap in bitmaps { + number_document_ids |= bitmap; + } + Ok(()) + }, + &|_i, (_field_id, _level, left, _right)| *left, + &|bitmap| bitmap, + &|writer, level, left, right, docids| { + write_number_entry(writer, field_id, level.get(), left, right, &docids)?; + Ok(()) + }, + )?; + Ok((subwriters, number_document_ids)) } else { let mut documents_ids = RoaringBitmap::new(); - for result in db.range(rtxn, &level_0_range)? { + for result in db.range(rtxn, &(level_0_start..))?.take(first_level_size) { let (_key, docids) = result?; documents_ids |= docids; } @@ -255,52 +190,129 @@ fn compute_facet_number_levels_2<'t>( } } -fn recursive_compute_levels<'t>( +fn compute_facet_strings_levels<'t>( rtxn: &'t heed::RoTxn, - db: heed::Database, + db: heed::Database, + compression_type: CompressionType, + compression_level: Option, + level_group_size: NonZeroUsize, + min_level_size: NonZeroUsize, + field_id: FieldId, +) -> Result<(Vec>, RoaringBitmap)> { + let first_level_size = db + .remap_key_type::() + .prefix_iter(rtxn, &field_id.to_be_bytes())? 
+ .remap_types::() + .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; + + let level_0_start = (field_id, ""); + + // Groups sizes are always a power of the original level_group_size and therefore a group + // always maps groups of the previous level and never splits previous levels groups in half. + let group_size_iter = (1u8..) + .map(|l| (l, level_group_size.get().pow(l as u32))) + .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()) + .collect::>(); + + let mut strings_document_ids = RoaringBitmap::new(); + + if let Some((top_level, _)) = group_size_iter.last() { + let subwriters = recursive_compute_levels::< + FacetStringLevelZeroCodec, + FacetStringLevelZeroValueCodec, + (u32, &str), + >( + rtxn, + db, + compression_type, + compression_level, + *top_level, + level_0_start, + &(level_0_start..), + first_level_size, + level_group_size, + &mut |bitmaps, _, _| { + for bitmap in bitmaps { + strings_document_ids |= bitmap; + } + Ok(()) + }, + &|i, (_field_id, value)| (i as u32, *value), + &|value| value.1, + &|writer, level, start_bound, end_bound, docids| { + write_string_entry(writer, field_id, level, start_bound, end_bound, docids)?; + Ok(()) + }, + )?; + + Ok((subwriters, strings_document_ids)) + } else { + let mut documents_ids = RoaringBitmap::new(); + for result in db.range(rtxn, &(level_0_start..))?.take(first_level_size) { + let (_key, (_original_value, docids)) = result?; + documents_ids |= docids; + } + + Ok((vec![], documents_ids)) + } +} + +fn recursive_compute_levels<'t, KeyCodec, ValueCodec, Bound>( + rtxn: &'t heed::RoTxn, + db: heed::Database, compression_type: CompressionType, compression_level: Option, level: u8, - level_0_range: RangeInclusive<(FieldId, u8, f64, f64)>, + level_0_start: >::DItem, + level_0_range: &'t RangeFrom<>::DItem>, + level_0_size: usize, level_group_size: NonZeroUsize, - computed_group_bitmap: &mut dyn FnMut(&[RoaringBitmap], f64, f64) -> Result<()>, -) -> Result>> { - let (field_id, 
level_0, first_left, first_right) = level_0_range.start().clone(); - assert_eq!(level_0, 0); - assert_eq!(first_left, first_right); + computed_group_bitmap: &mut dyn FnMut(&[RoaringBitmap], Bound, Bound) -> Result<()>, + bound_from_db_key: &dyn for<'a> Fn(usize, &'a >::DItem) -> Bound, + bitmap_from_db_value: &dyn Fn(>::DItem) -> RoaringBitmap, + write_entry: &dyn Fn(&mut Writer, NonZeroU8, Bound, Bound, RoaringBitmap) -> Result<()>, +) -> Result>> +where + KeyCodec: for<'a> BytesEncode<'a> + + for<'a> BytesDecode<'a, DItem = >::EItem>, + for<'a> >::EItem: Sized, + ValueCodec: for<'a> BytesEncode<'a> + + for<'a> BytesDecode<'a, DItem = >::EItem>, + for<'a> >::EItem: Sized, + Bound: Copy, +{ if level == 0 { + // base case for the recursion + let mut bitmaps = vec![]; - let mut first_f64_value = first_left; - let mut last_f64_value = first_left; - + let mut start_bound = bound_from_db_key(0, &level_0_start); + let mut end_bound = bound_from_db_key(0, &level_0_start); let mut first_iteration_for_new_group = true; - for db_result_item in db.range(rtxn, &level_0_range)? 
{ - let ((_field_id, _level, left, _right), docids) = db_result_item?; - // println!("level0: {left}"); - assert_eq!(_level, 0); - assert_eq!(left, _right); + for (i, db_result_item) in db.range(rtxn, level_0_range)?.take(level_0_size).enumerate() { + let (key, value) = db_result_item?; + + let bound = bound_from_db_key(i, &key); + let docids = bitmap_from_db_value(value); + if first_iteration_for_new_group { - first_f64_value = left; + start_bound = bound; first_iteration_for_new_group = false; } - last_f64_value = left; + end_bound = bound; bitmaps.push(docids); if bitmaps.len() == level_group_size.get() { - // println!("callback first level with {bitmaps:?} {last_f64_value:?}"); - computed_group_bitmap(&bitmaps, first_f64_value, last_f64_value)?; + computed_group_bitmap(&bitmaps, start_bound, end_bound)?; first_iteration_for_new_group = true; bitmaps.clear(); } } if !bitmaps.is_empty() { - // println!("end callback first level with {bitmaps:?} {last_f64_value:?}"); - computed_group_bitmap(&bitmaps, first_f64_value, last_f64_value)?; + computed_group_bitmap(&bitmaps, start_bound, end_bound)?; bitmaps.clear(); } - - // level 0 isn't actually stored in this DB, since it contains exactly the same information as that other DB + // level 0 is already stored in the DB return Ok(vec![]); } else { let mut cur_writer = @@ -315,7 +327,9 @@ fn recursive_compute_levels<'t>( compression_type, compression_level, level - 1, + level_0_start, level_0_range, + level_0_size, level_group_size, &mut |sub_bitmaps: &[RoaringBitmap], start_range, end_range| { let mut combined_bitmap = RoaringBitmap::default(); @@ -326,36 +340,33 @@ fn recursive_compute_levels<'t>( bitmaps.push(combined_bitmap); if bitmaps.len() == level_group_size.get() { - let start_range = range_for_bitmaps.first().unwrap().0; - let end_range = range_for_bitmaps.last().unwrap().1; - // println!("callback level {} with {bitmaps:?} {last_f64_value:?}", level + 1); - computed_group_bitmap(&bitmaps, start_range, 
end_range)?; - for (bitmap, (start_range, end_range)) in + let start_bound = range_for_bitmaps.first().unwrap().0; + let end_bound = range_for_bitmaps.last().unwrap().1; + computed_group_bitmap(&bitmaps, start_bound, end_bound)?; + for (bitmap, (start_bound, end_bound)) in bitmaps.drain(..).zip(range_for_bitmaps.drain(..)) { - // println!("write {field_id} {level} {start_range} {end_range} {bitmap:?}"); - write_number_entry( + write_entry( &mut cur_writer, - field_id, - level, - start_range, - end_range, - &bitmap, + NonZeroU8::new(level).unwrap(), + start_bound, + end_bound, + bitmap, )?; } } - // println!("end callback level {level}"); Ok(()) }, + bound_from_db_key, + bitmap_from_db_value, + write_entry, )?; if !bitmaps.is_empty() { let start_range = range_for_bitmaps.first().unwrap().0; let end_range = range_for_bitmaps.last().unwrap().1; - // println!("end callback level {} with {bitmaps:?} {last_f64_value:?}", level + 1); computed_group_bitmap(&bitmaps, start_range, end_range)?; for (bitmap, (left, right)) in bitmaps.drain(..).zip(range_for_bitmaps.drain(..)) { - // println!("end write: {field_id} {level} {left} {right} {bitmap:?}"); - write_number_entry(&mut cur_writer, field_id, level, left, right, &bitmap)?; + write_entry(&mut cur_writer, NonZeroU8::new(level).unwrap(), left, right, bitmap)?; } } @@ -364,113 +375,15 @@ fn recursive_compute_levels<'t>( } } -fn compute_facet_number_levels<'t>( - rtxn: &'t heed::RoTxn, +fn clear_field_number_levels<'t>( + wtxn: &'t mut heed::RwTxn, db: heed::Database, - compression_type: CompressionType, - compression_level: Option, - level_group_size: NonZeroUsize, - min_level_size: NonZeroUsize, field_id: FieldId, -) -> Result> { - let first_level_size = db - .remap_key_type::() - .prefix_iter(rtxn, &field_id.to_be_bytes())? 
- .remap_types::() - .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; - - // It is forbidden to keep a cursor and write in a database at the same time with LMDB - // therefore we write the facet levels entries into a grenad file before transfering them. - let mut writer = create_writer(compression_type, compression_level, tempfile::tempfile()?); - - let level_0_range = { - let left = (field_id, 0, f64::MIN, f64::MIN); - let right = (field_id, 0, f64::MAX, f64::MAX); - left..=right - }; - - // Groups sizes are always a power of the original level_group_size and therefore a group - // always maps groups of the previous level and never splits previous levels groups in half. - let group_size_iter = (1u8..) - .map(|l| (l, level_group_size.get().pow(l as u32))) - .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()); - - for (level, group_size) in group_size_iter { - // dbg!(level, group_size); - let mut left = 0.0; - let mut right = 0.0; - let mut group_docids = RoaringBitmap::new(); - - for (i, result) in db.range(rtxn, &level_0_range)?.enumerate() { - let ((_field_id, _level, value, _right), docids) = result?; - - if i == 0 { - left = value; - } else if i % group_size == 0 { - // we found the first bound of the next group, we must store the left - // and right bounds associated with the docids. - write_number_entry(&mut writer, field_id, level, left, right, &group_docids)?; - - // We save the left bound for the new group and also reset the docids. - group_docids = RoaringBitmap::new(); - left = value; - } - - // The right bound is always the bound we run through. 
- group_docids |= docids; - right = value; - } - - if !group_docids.is_empty() { - write_number_entry(&mut writer, field_id, level, left, right, &group_docids)?; - } - } - - writer_into_reader(writer) -} - -fn write_number_entry( - writer: &mut Writer, - field_id: FieldId, - level: u8, - left: f64, - right: f64, - ids: &RoaringBitmap, -) -> Result<()> { - let key = (field_id, level, left, right); - let key = FacetLevelValueF64Codec::bytes_encode(&key).ok_or(Error::Encoding)?; - let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; - // println!(" w{field_id}-{level}-{left}-{right}"); - writer.insert(&key, &data)?; - Ok(()) -} - -fn compute_faceted_strings_documents_ids( - rtxn: &heed::RoTxn, - db: heed::Database, - field_id: FieldId, -) -> Result { - let mut documents_ids = RoaringBitmap::new(); - for result in db.prefix_iter(rtxn, &field_id.to_be_bytes())? { - let (_key, (_original_value, docids)) = result?; - documents_ids |= docids; - } - - Ok(documents_ids) -} - -fn compute_faceted_numbers_documents_ids( - rtxn: &heed::RoTxn, - db: heed::Database, - field_id: FieldId, -) -> Result { - let mut documents_ids = RoaringBitmap::new(); - for result in db.prefix_iter(rtxn, &field_id.to_be_bytes())? 
{ - let (_key, docids) = result?; - documents_ids |= docids; - } - - Ok(documents_ids) +) -> heed::Result<()> { + let left = (field_id, 1, f64::MIN, f64::MIN); + let right = (field_id, u8::MAX, f64::MAX, f64::MAX); + let range = left..=right; + db.delete_range(wtxn, &range).map(drop) } fn clear_field_string_levels<'t>( @@ -484,68 +397,20 @@ fn clear_field_string_levels<'t>( db.remap_key_type::().delete_range(wtxn, &range).map(drop) } -fn compute_facet_string_levels<'t>( - rtxn: &'t heed::RoTxn, - db: heed::Database, - compression_type: CompressionType, - compression_level: Option, - level_group_size: NonZeroUsize, - min_level_size: NonZeroUsize, +fn write_number_entry( + writer: &mut Writer, field_id: FieldId, -) -> Result> { - let first_level_size = db - .remap_key_type::() - .prefix_iter(rtxn, &field_id.to_be_bytes())? - .remap_types::() - .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; - - // It is forbidden to keep a cursor and write in a database at the same time with LMDB - // therefore we write the facet levels entries into a grenad file before transfering them. - let mut writer = create_writer(compression_type, compression_level, tempfile::tempfile()?); - - // Groups sizes are always a power of the original level_group_size and therefore a group - // always maps groups of the previous level and never splits previous levels groups in half. - let group_size_iter = (1u8..) - .map(|l| (l, level_group_size.get().pow(l as u32))) - .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()); - - for (level, group_size) in group_size_iter { - let level = NonZeroU8::new(level).unwrap(); - let mut left = (0, ""); - let mut right = (0, ""); - let mut group_docids = RoaringBitmap::new(); - - // Because we know the size of the level 0 we can use a range iterator that starts - // at the first value of the level and goes to the last by simply counting. 
- for (i, result) in db.range(rtxn, &((field_id, "")..))?.take(first_level_size).enumerate() { - let ((_field_id, value), (_original_value, docids)) = result?; - - if i == 0 { - left = (i as u32, value); - } else if i % group_size == 0 { - // we found the first bound of the next group, we must store the left - // and right bounds associated with the docids. We also reset the docids. - let docids = mem::take(&mut group_docids); - write_string_entry(&mut writer, field_id, level, left, right, docids)?; - - // We save the left bound for the new group. - left = (i as u32, value); - } - - // The right bound is always the bound we run through. - group_docids |= docids; - right = (i as u32, value); - } - - if !group_docids.is_empty() { - let docids = mem::take(&mut group_docids); - write_string_entry(&mut writer, field_id, level, left, right, docids)?; - } - } - - writer_into_reader(writer) + level: u8, + left: f64, + right: f64, + ids: &RoaringBitmap, +) -> Result<()> { + let key = (field_id, level, left, right); + let key = FacetLevelValueF64Codec::bytes_encode(&key).ok_or(Error::Encoding)?; + let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; + writer.insert(&key, &data)?; + Ok(()) } - fn write_string_entry( writer: &mut Writer, field_id: FieldId, From 39687908f1af00264f1bdd1eacdd57c51dfe98cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 20 Jul 2022 09:49:40 +0200 Subject: [PATCH 3/4] Add documentation and comments to facets.rs --- milli/src/update/facets.rs | 193 +++++++++++++++++++++++++++++++++++-- 1 file changed, 184 insertions(+), 9 deletions(-) diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index b3d9f1c58..56529a3c5 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -1,12 +1,138 @@ +/*! +This module initialises the databases that are used to quickly get the list +of documents with a faceted field value falling within a certain range. 
For +example, they can be used to implement filters such as `x >= 3`. + +These databases are `facet_id_string_docids` and `facet_id_f64_docids`. + +## Example with numbers + +In the case of numbers, we start with a sorted list whose keys are +`(field_id, number_value)` and whose value is a roaring bitmap of the document ids +which contain the value `number_value` for the faceted field `field_id`. + +From this list, we want to compute two things: + +1. the bitmap of all documents that contain **any** number for each faceted field +2. a structure that allows us to use a (sort of) binary search to find all documents +containing numbers inside a certain range for a faceted field + +To achieve goal (2), we recursively split the list into chunks. Every time we split it, we +create a new "level" that is several times smaller than the level below it. The base level, +level 0, is the starting list. Level 1 is composed of chunks of up to N elements. Each element +contains a range and a bitmap of docids. Level 2 is composed of chunks up to N^2 elements, etc. + +For example, let's say we have 26 documents which we identify through the letters a-z. +We will focus on a single faceted field. When there are multiple faceted fields, the structure +described below is simply repeated for each field. 
+ +What we want to obtain is the following structure for each faceted field: +```text +┌───────┐ ┌───────────────────────────────────────────────────────────────────────────────┐ +│ all │ │ [a, b, c, d, e, f, g, u, y, z] │ +└───────┘ └───────────────────────────────────────────────────────────────────────────────┘ + ┌───────────────────────────────┬───────────────────────────────┬───────────────┐ +┌───────┐ │ 1.2 – 2 │ 3.4 – 100 │ 102 – 104 │ +│Level 2│ │ │ │ │ +└───────┘ │ [a, b, d, f, z] │ [c, d, e, f, g] │ [u, y] │ + ├───────────────┬───────────────┼───────────────┬───────────────┼───────────────┤ +┌───────┐ │ 1.2 – 1.3 │ 1.6 – 2 │ 3.4 – 12 │ 12.3 – 100 │ 102 – 104 │ +│Level 1│ │ │ │ │ │ │ +└───────┘ │ [a, b, d, z] │ [a, b, f] │ [c, d, g] │ [e, f] │ [u, y] │ + ├───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┤ +┌───────┐ │ 1.2 │ 1.3 │ 1.6 │ 2 │ 3.4 │ 12 │ 12.3 │ 100 │ 102 │ 104 │ +│Level 0│ │ │ │ │ │ │ │ │ │ │ │ +└───────┘ │ [a, b]│ [d, z]│ [b, f]│ [a, f]│ [c, d]│ [g] │ [e] │ [e, f]│ [y] │ [u] │ + └───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┘ +``` + +You can read more about this structure (for strings) in `[crate::search::facet::facet_strings]`. + +To create the levels, we use a recursive algorithm which makes sure that we only need to iterate +over the elements of level 0 once. It is implemented by [`recursive_compute_levels`]. + +## Encoding + +### Numbers +For numbers we use the same encoding for level 0 and the other levels. + +The key is given by `FacetLevelValueF64Codec`. It consists of: +1. The field id : u16 +2. The height of the level : u8 +3. The start bound : f64 +4. The end bound : f64 +Note that at level 0, we have start bound == end bound. + +The value is a serialised `RoaringBitmap`. + +### Strings + +For strings, we use a different encoding for level 0 and the other levels. + +At level 0, the key is given by `FacetStringLevelZeroCodec`. It consists of: +1. The field id : u16 +2. 
The height of the level : u8 <-- always == 0 +3. The normalised string value : &str + +And the value is given by `FacetStringLevelZeroValueCodec`. It consists of: +1. The original string +2. A serialised `RoaringBitmap` + +At level 1 and above, the key is given by `FacetLevelValueU32Codec`. It consists of: +1. The field id : u16 +2. The height of the level : u8 <-- always >= 1 +3. The start bound : u32 +4. The end bound : u32 +where the bounds are indices inside level 0. + +The value is given by `FacetStringZeroBoundsValueCodec`. +If the level is 1, then it consists of: +1. The normalised string of the start bound +2. The normalised string of the end bound +3. A serialised `RoaringBitmap` + +If the level is higher, then it consists only of the serialised roaring bitmap. + +The distinction between the value encoding of level 1 and the levels above it +is to allow us to retrieve the value in level 0 quickly by reading the value of +level 1 (we obtain the string value of the bound and execute a prefix search +in the database).
+ +Therefore, for strings, the structure for a single faceted field looks more like this: +```text +┌───────┐ ┌───────────────────────────────────────────────────────────────────────────────┐ +│ all │ │ [a, b, c, d, e, f, g, u, y, z] │ +└───────┘ └───────────────────────────────────────────────────────────────────────────────┘ + + ┌───────────────────────────────┬───────────────────────────────┬───────────────┐ +┌───────┐ │ 0 – 3 │ 4 – 7 │ 8 – 9 │ +│Level 2│ │ │ │ │ +└───────┘ │ [a, b, d, f, z] │ [c, d, e, f, g] │ [u, y] │ + ├───────────────┬───────────────┼───────────────┬───────────────┼───────────────┤ +┌───────┐ │ 0 – 1 │ 2 – 3 │ 4 – 5 │ 6 – 7 │ 8 – 9 │ +│Level 1│ │ "ab" – "ac" │ "ba" – "bac" │ "gaf" – "gal" │"form" – "wow" │ "woz" – "zz" │ +└───────┘ │ [a, b, d, z] │ [a, b, f] │ [c, d, g] │ [e, f] │ [u, y] │ + ├───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┤ +┌───────┐ │ "ab" │ "ac" │ "ba" │ "bac" │ "gaf" │ "gal" │ "form"│ "wow" │ "woz" │ "zz" │ +│Level 0│ │ "AB" │ " Ac" │ "ba " │ "Bac" │ " GAF"│ "gal" │ "Form"│ " wow"│ "woz" │ "ZZ" │ +└───────┘ │ [a, b]│ [d, z]│ [b, f]│ [a, f]│ [c, d]│ [g] │ [e] │ [e, f]│ [y] │ [u] │ + └───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┘ + +The first line in a cell is its key (without the field id and level height) and the last two +lines are its values. 
+``` +*/ + +use std::cmp; +use std::fs::File; +use std::num::{NonZeroU8, NonZeroUsize}; +use std::ops::RangeFrom; + use grenad::{CompressionType, Reader, Writer}; use heed::types::{ByteSlice, DecodeIgnore}; use heed::{BytesDecode, BytesEncode, Error}; use log::debug; use roaring::RoaringBitmap; -use std::cmp; -use std::fs::File; -use std::num::{NonZeroU8, NonZeroUsize}; -use std::ops::RangeFrom; use time::OffsetDateTime; use crate::error::InternalError; @@ -80,11 +206,11 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { field_id, &string_documents_ids, )?; - for facet_strings_levels in facet_string_levels { + for facet_strings_level in facet_string_levels { write_into_lmdb_database( self.wtxn, *self.index.facet_id_string_docids.as_polymorph(), - facet_strings_levels, + facet_strings_level, |_, _| { Err(InternalError::IndexingMergingKeys { process: "facet string levels" })? }, @@ -94,7 +220,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { // Clear the facet number levels. clear_field_number_levels(self.wtxn, self.index.facet_id_f64_docids, field_id)?; - let (facet_number_levels_2, number_documents_ids) = compute_facet_number_levels( + let (facet_number_levels, number_documents_ids) = compute_facet_number_levels( self.wtxn, self.index.facet_id_f64_docids, self.chunk_compression_type, @@ -110,11 +236,11 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { &number_documents_ids, )?; - for facet_number_levels in facet_number_levels_2 { + for facet_number_level in facet_number_levels { write_into_lmdb_database( self.wtxn, *self.index.facet_id_f64_docids.as_polymorph(), - facet_number_levels, + facet_number_level, |_, _| { Err(InternalError::IndexingMergingKeys { process: "facet number levels" })? }, @@ -257,6 +383,43 @@ fn compute_facet_strings_levels<'t>( } } +/** +Compute a level from the levels below it, with the elements of level 0 already existing in the given `db`. + +This function is generic to work with both numbers and strings. 
The generic type parameters are: +* `KeyCodec`/`ValueCodec`: the codecs used to read the elements of the database. +* `Bound`: part of the range in the levels structure. For example, for numbers, the `Bound` is `f64` +because each chunk in a level contains a range such as (1.2 ..= 4.5). + +## Arguments +* `rtxn` : LMDB read transaction +* `db`: a database which already contains a `level 0` +* `compression_type`/`compression_level`: parameters used to create the `grenad::Writer` that +will contain the new levels +* `level` : the height of the level to create, or `0` to read elements from level 0. +* `level_0_start` : a key in the database that points to the beginning of its level 0 +* `level_0_range` : equivalent to `level_0_start..` +* `level_0_size` : the number of elements in level 0 +* `level_group_size` : the number of elements from the level below that are represented by a +* single element of the new level +* `computed_group_bitmap` : a callback that is called whenever at most `level_group_size` elements +from the level below were read/created. Its arguments are: + 0. the list of bitmaps from each read/created element of the level below + 1. the start bound corresponding to the first element + 2. the end bound corresponding to the last element +* `bound_from_db_key` : finds the `Bound` from a key in the database +* `bitmap_from_db_value` : finds the `RoaringBitmap` from a value in the database +* `write_entry` : writes an element of a level into the writer. The arguments are: + 0. the writer + 1. the height of the level + 2. the start bound + 3. the end bound + 4. the docids of all elements between the start and end bound + +## Return +A vector of grenad::Reader. The reader at index `i` corresponds to the elements of level `i + 1` +that must be inserted into the database. 
+*/ fn recursive_compute_levels<'t, KeyCodec, ValueCodec, Bound>( rtxn: &'t heed::RoTxn, db: heed::Database, @@ -284,6 +447,9 @@ where if level == 0 { // base case for the recursion + // we read the elements one by one and + // 1. keep track of the start and end bounds + // 2. fill the `bitmaps` vector to give it to level 1 once `level_group_size` elements were read let mut bitmaps = vec![]; let mut start_bound = bound_from_db_key(0, &level_0_start); @@ -308,6 +474,7 @@ where bitmaps.clear(); } } + // don't forget to give the leftover bitmaps as well if !bitmaps.is_empty() { computed_group_bitmap(&bitmaps, start_bound, end_bound)?; bitmaps.clear(); @@ -315,12 +482,19 @@ where // level 0 is already stored in the DB return Ok(vec![]); } else { + // level >= 1 + // we compute each element of this level based on the elements of the level below it + // once we have computed `level_group_size` elements, we give the start and end bounds + // of those elements, and their bitmaps, to the level above + let mut cur_writer = create_writer(compression_type, compression_level, tempfile::tempfile()?); let mut range_for_bitmaps = vec![]; let mut bitmaps = vec![]; + // compute the levels below + // in the callback, we fill `cur_writer` with the correct elements for this level let mut sub_writers = recursive_compute_levels( rtxn, db, @@ -361,6 +535,7 @@ where bitmap_from_db_value, write_entry, )?; + // don't forget to insert the leftover elements into the writer as well if !bitmaps.is_empty() { let start_range = range_for_bitmaps.first().unwrap().0; let end_range = range_for_bitmaps.last().unwrap().1; From 6cc975704d48060602f50b337b672244baec4f3b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 17 Aug 2022 12:58:11 +0200 Subject: [PATCH 4/4] Add some documentation to facets.rs --- milli/src/update/facets.rs | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 
56529a3c5..108acae4f 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -165,11 +165,15 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { } } + /// The number of elements from the level below that are represented by a single element in the level above + /// + /// This setting is always greater than or equal to 2. pub fn level_group_size(&mut self, value: NonZeroUsize) -> &mut Self { self.level_group_size = NonZeroUsize::new(cmp::max(value.get(), 2)).unwrap(); self } + /// The minimum number of elements that a level is allowed to have. pub fn min_level_size(&mut self, value: NonZeroUsize) -> &mut Self { self.min_level_size = value; self @@ -252,6 +256,12 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { } } +/// Compute the content of the database levels from its level 0 for the given field id. +/// +/// ## Returns: +/// 1. a vector of grenad::Reader. The reader at index `i` corresponds to the elements of level `i + 1` +/// that must be inserted into the database. +/// 2. a roaring bitmap of all the document ids present in the database fn compute_facet_number_levels<'t>( rtxn: &'t heed::RoTxn, db: heed::Database, @@ -316,6 +326,12 @@ fn compute_facet_number_levels<'t>( } } +/// Compute the content of the database levels from its level 0 for the given field id. +/// +/// ## Returns: +/// 1. a vector of grenad::Reader. The reader at index `i` corresponds to the elements of level `i + 1` +/// that must be inserted into the database. +/// 2. 
a roaring bitmap of all the document ids present in the database fn compute_facet_strings_levels<'t>( rtxn: &'t heed::RoTxn, db: heed::Database, @@ -401,7 +417,7 @@ will contain the new levels * `level_0_range` : equivalent to `level_0_start..` * `level_0_size` : the number of elements in level 0 * `level_group_size` : the number of elements from the level below that are represented by a -* single element of the new level +single element of the new level * `computed_group_bitmap` : a callback that is called whenever at most `level_group_size` elements from the level below were read/created. Its arguments are: 0. the list of bitmaps from each read/created element of the level below