From 27454e9828ef76d85bb530a63a73e4948b902809 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 7 Sep 2022 16:44:08 +0200 Subject: [PATCH] Document and refine facet indexing algorithms --- milli/src/heed_codec/facet/mod.rs | 21 +- milli/src/update/facet/bulk.rs | 88 ++--- milli/src/update/facet/incremental.rs | 440 ++++++++++++++---------- milli/src/update/facet/mod.rs | 125 ++++--- milli/src/update/index_documents/mod.rs | 4 +- 5 files changed, 387 insertions(+), 291 deletions(-) diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 40e395881..2e9f0b212 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -29,31 +29,14 @@ pub fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { } } +/// The key in the [`facet_id_string_docids` and `facet_id_f64_docids`][`Index::facet_id_string_docids`] +/// databases. #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] pub struct FacetGroupKey<T> { pub field_id: u16, pub level: u8, pub left_bound: T, } -impl<'a> FacetGroupKey<&'a [u8]> { - pub fn into_owned(self) -> FacetGroupKey<Vec<u8>> { - FacetGroupKey { - field_id: self.field_id, - level: self.level, - left_bound: self.left_bound.to_vec(), - } - } -} - -impl<'a> FacetGroupKey<Vec<u8>> { - pub fn as_ref(&self) -> FacetGroupKey<&[u8]> { - FacetGroupKey { - field_id: self.field_id, - level: self.level, - left_bound: self.left_bound.as_slice(), - } - } -} #[derive(Debug)] pub struct FacetGroupValue { diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs index 90e287f23..83fa51003 100644 --- a/milli/src/update/facet/bulk.rs +++ b/milli/src/update/facet/bulk.rs @@ -1,24 +1,30 @@ -use std::borrow::Cow; -use std::cmp; -use std::fs::File; - -use grenad::CompressionType; -use heed::types::ByteSlice; -use heed::{BytesEncode, Error, RoTxn, RwTxn}; -use log::debug; -use roaring::RoaringBitmap; -use time::OffsetDateTime; - use crate::facet::FacetType; use crate::heed_codec::facet::{ ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; use crate::update::index_documents::{create_writer, writer_into_reader}; use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; +use grenad::CompressionType; +use heed::types::ByteSlice; +use heed::{BytesEncode, Error, RoTxn, RwTxn}; +use log::debug; +use roaring::RoaringBitmap; +use std::borrow::Cow; +use std::fs::File; +use time::OffsetDateTime; +use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; + +/// Algorithm to insert elements into the `facet_id_(string/f64)_docids` databases +/// by rebuilding the database "from scratch". +/// +/// First, the new elements are inserted into the level 0 of the database. Then, the +/// higher levels are cleared and recomputed from the content of level 0. +/// +/// Finally, the `faceted_documents_ids` value in the main database of `Index` +/// is updated to contain the new set of faceted documents.
pub struct FacetsUpdateBulk<'i> { index: &'i Index, - database: heed::Database, FacetGroupValueCodec>, group_size: u8, min_level_size: u8, facet_type: FacetType, @@ -31,22 +37,10 @@ impl<'i> FacetsUpdateBulk<'i> { index: &'i Index, facet_type: FacetType, new_data: grenad::Reader, + group_size: u8, + min_level_size: u8, ) -> FacetsUpdateBulk<'i> { - FacetsUpdateBulk { - index, - database: match facet_type { - FacetType::String => index - .facet_id_string_docids - .remap_key_type::>(), - FacetType::Number => { - index.facet_id_f64_docids.remap_key_type::>() - } - }, - group_size: 4, - min_level_size: 5, - facet_type, - new_data: Some(new_data), - } + FacetsUpdateBulk { index, group_size, min_level_size, facet_type, new_data: Some(new_data) } } pub fn new_not_updating_level_0( @@ -55,44 +49,31 @@ impl<'i> FacetsUpdateBulk<'i> { ) -> FacetsUpdateBulk<'i> { FacetsUpdateBulk { index, - database: match facet_type { - FacetType::String => index - .facet_id_string_docids - .remap_key_type::>(), - FacetType::Number => { - index.facet_id_f64_docids.remap_key_type::>() - } - }, - group_size: 4, - min_level_size: 5, + group_size: FACET_GROUP_SIZE, + min_level_size: FACET_MIN_LEVEL_SIZE, facet_type, new_data: None, } } - /// The number of elements from the level below that are represented by a single element in the level above - /// - /// This setting is always greater than or equal to 2. - pub fn level_group_size(mut self, value: u8) -> Self { - self.group_size = cmp::max(value, 2); - self - } - - /// The minimum number of elements that a level is allowed to have. - pub fn min_level_size(mut self, value: u8) -> Self { - self.min_level_size = cmp::max(value, 2); - self - } - #[logging_timer::time("FacetsUpdateBulk::{}")] pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { debug!("Computing and writing the facet values levels docids into LMDB on disk..."); - let Self { index, database, group_size, min_level_size, facet_type, new_data } = self; + let Self { index, group_size, min_level_size, facet_type, new_data } = self; + + let db = match facet_type { + FacetType::String => { + index.facet_id_string_docids.remap_key_type::>() + } + FacetType::Number => { + index.facet_id_f64_docids.remap_key_type::>() + } + }; index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; - let inner = FacetsUpdateBulkInner { db: database, new_data, group_size, min_level_size }; + let inner = FacetsUpdateBulkInner { db, new_data, group_size, min_level_size }; let field_ids = index.faceted_fields_ids(wtxn)?.iter().copied().collect::>(); @@ -105,6 +86,7 @@ impl<'i> FacetsUpdateBulk<'i> { } } +/// Implementation of `FacetsUpdateBulk` that is independent of milli's `Index` type pub(crate) struct FacetsUpdateBulkInner { pub db: heed::Database, FacetGroupValueCodec>, pub new_data: Option>, diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs index 14b421242..6be2dbf03 100644 --- a/milli/src/update/facet/incremental.rs +++ b/milli/src/update/facet/incremental.rs @@ -1,16 +1,14 @@ -use std::collections::HashMap; -use std::fs::File; - -use heed::types::ByteSlice; -use heed::{BytesDecode, Error, RoTxn, RwTxn}; -use roaring::RoaringBitmap; - use crate::facet::FacetType; use crate::heed_codec::facet::{ ByteSliceRef, FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, }; use crate::search::facet::get_highest_level; use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; +use heed::types::{ByteSlice, DecodeIgnore}; +use heed::{BytesDecode, Error, RoTxn, RwTxn}; +use 
roaring::RoaringBitmap; +use std::collections::HashMap; +use std::fs::File; enum InsertionResult { InPlace, @@ -18,10 +16,15 @@ enum InsertionResult { } enum DeletionResult { InPlace, - Reduce { prev: Option<Vec<u8>>, next: Option<Vec<u8>> }, - Remove { prev: Option<Vec<u8>>, next: Option<Vec<u8>> }, + Reduce { next: Option<Vec<u8>> }, + Remove { next: Option<Vec<u8>> }, } +/// Algorithm to incrementally insert and delete elements into the +/// `facet_id_(string/f64)_docids` databases. +/// +/// The `faceted_documents_ids` value in the main database of `Index` +/// is also updated to contain the new set of faceted documents. pub struct FacetsUpdateIncremental<'i> { index: &'i Index, inner: FacetsUpdateIncrementalInner, @@ -30,7 +33,14 @@ pub struct FacetsUpdateIncremental<'i> { } impl<'i> FacetsUpdateIncremental<'i> { - pub fn new(index: &'i Index, facet_type: FacetType, new_data: grenad::Reader<File>) -> Self { + pub fn new( + index: &'i Index, + facet_type: FacetType, + new_data: grenad::Reader<File>, + group_size: u8, + min_level_size: u8, + max_group_size: u8, + ) -> Self { FacetsUpdateIncremental { index, inner: FacetsUpdateIncrementalInner { @@ -42,26 +52,15 @@ impl<'i> FacetsUpdateIncremental<'i> { .facet_id_f64_docids .remap_key_type::<FacetGroupKeyCodec<ByteSliceRef>>(), }, - group_size: 4, - max_group_size: 8, - min_level_size: 5, + group_size, + max_group_size, + min_level_size, }, facet_type, new_data, } } - pub fn group_size(mut self, size: u8) -> Self { - self.inner.group_size = size; - self - } - pub fn min_level_size(mut self, size: u8) -> Self { - self.inner.min_level_size = size; - self - } - pub fn max_group_size(mut self, size: u8) -> Self { - self.inner.max_group_size = size; - self - } + pub fn execute(self, wtxn: &'i mut RwTxn) -> crate::Result<()> { let mut new_faceted_docids = HashMap::<FieldId, RoaringBitmap>::default(); @@ -83,6 +82,7 @@ impl<'i> FacetsUpdateIncremental<'i> { } } +/// Implementation of `FacetsUpdateIncremental` that is independent of milli's `Index` type pub struct FacetsUpdateIncrementalInner { pub db: heed::Database<FacetGroupKeyCodec<ByteSliceRef>, FacetGroupValueCodec>, pub group_size: u8, @@ -90,22 +90,36 @@ pub struct FacetsUpdateIncrementalInner { pub max_group_size: u8, } impl FacetsUpdateIncrementalInner { + /// Find the `FacetGroupKey`/`FacetGroupValue` in the database that + /// should be used to insert the new `facet_value` for the given `field_id` and `level`, + /// where `level` must be strictly greater than 0. + /// + /// For example, when inserting the facet value `4`, there are two possibilities: + /// + /// 1. We find a key whose lower bound is 3 followed by a key whose lower bound is 6. Therefore, + /// we know that the implicit range of the first key is 3..6, which contains 4. + /// So the new facet value belongs in that first key/value pair. + /// + /// 2. The first key of the level has a lower bound of `5`. We return this key/value pair + /// but will need to change the lower bound of this key to `4` in order to insert this facet value.
fn find_insertion_key_value( &self, field_id: u16, level: u8, - search_key: &[u8], + facet_value: &[u8], txn: &RoTxn, ) -> Result<(FacetGroupKey>, FacetGroupValue)> { + assert!(level > 0); + let mut prefix = vec![]; prefix.extend_from_slice(&field_id.to_be_bytes()); prefix.push(level); - prefix.extend_from_slice(search_key); + prefix.extend_from_slice(facet_value); let mut prefix_iter = self .db .as_polymorph() - .prefix_iter::<_, ByteSliceRef, FacetGroupValueCodec>(txn, &prefix.as_slice())?; + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(txn, prefix.as_slice())?; if let Some(e) = prefix_iter.next() { let (key_bytes, value) = e?; Ok(( @@ -115,10 +129,10 @@ impl FacetsUpdateIncrementalInner { value, )) } else { - let key = FacetGroupKey { field_id, level, left_bound: search_key }; + let key = FacetGroupKey { field_id, level, left_bound: facet_value }; match self.db.get_lower_than(txn, &key)? { Some((key, value)) => { - if key.level != level || key.field_id != field_id { + if key.level != level { let mut prefix = vec![]; prefix.extend_from_slice(&field_id.to_be_bytes()); prefix.push(level); @@ -126,7 +140,7 @@ impl FacetsUpdateIncrementalInner { let mut iter = self .db .as_polymorph() - .prefix_iter::<_, ByteSliceRef, FacetGroupValueCodec>( + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>( txn, &prefix.as_slice(), )?; @@ -146,15 +160,19 @@ impl FacetsUpdateIncrementalInner { } } + /// Insert the given facet value and corresponding document ids in the level 0 of the database + /// + /// ## Return + /// See documentation of `insert_in_level` fn insert_in_level_0<'t>( &self, txn: &'t mut RwTxn, field_id: u16, - new_key: &[u8], - new_values: &RoaringBitmap, + facet_value: &[u8], + docids: &RoaringBitmap, ) -> Result { - let key = FacetGroupKey { field_id, level: 0, left_bound: new_key }; - let value = FacetGroupValue { bitmap: new_values.clone(), size: 1 }; + let key = FacetGroupKey { field_id, level: 0, left_bound: facet_value }; + let value = FacetGroupValue { bitmap: docids.clone(), size: 1 }; let mut level0_prefix = vec![]; level0_prefix.extend_from_slice(&field_id.to_be_bytes()); @@ -163,7 +181,7 @@ impl FacetsUpdateIncrementalInner { let mut iter = self .db .as_polymorph() - .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(&txn, &level0_prefix)?; + .prefix_iter::<_, ByteSlice, DecodeIgnore>(&txn, &level0_prefix)?; if iter.next().is_none() { drop(iter); @@ -186,143 +204,158 @@ impl FacetsUpdateIncrementalInner { } } } + + /// Insert the given facet value and corresponding document ids in all the levels of the database up to the given `level`. + /// This function works recursively. + /// + /// ## Return + /// Returns the effect of adding the facet value to the database on the given `level`. + /// + /// - `InsertionResult::InPlace` means that inserting the `facet_value` into the `level` did not have + /// an effect on the number of keys in that level. Therefore, it did not increase the number of children + /// of the parent node. + /// + /// - `InsertionResult::Insert` means that inserting the `facet_value` into the `level` resulted + /// in the addition of a new key in that level, and that therefore the number of children + /// of the parent node should be incremented. 
fn insert_in_level<'t>( &self, txn: &'t mut RwTxn, field_id: u16, level: u8, - new_key: &[u8], - new_values: &RoaringBitmap, + facet_value: &[u8], + docids: &RoaringBitmap, ) -> Result { if level == 0 { - return self.insert_in_level_0(txn, field_id, new_key, new_values); + return self.insert_in_level_0(txn, field_id, facet_value, docids); } let max_group_size = self.max_group_size; - let (insertion_key, insertion_value) = - self.find_insertion_key_value(field_id, level, new_key, txn)?; - - let result = self.insert_in_level(txn, field_id, level - 1, new_key.clone(), new_values)?; + let result = self.insert_in_level(txn, field_id, level - 1, facet_value.clone(), docids)?; // level below inserted an element - let insertion_key = { - let mut new_insertion_key = insertion_key.clone(); - let mut modified = false; - - if new_key < insertion_key.left_bound.as_slice() { - new_insertion_key.left_bound = new_key.to_vec(); - modified = true; - } - if modified { - let is_deleted = self.db.delete(txn, &insertion_key.as_ref())?; - assert!(is_deleted); - self.db.put(txn, &new_insertion_key.as_ref(), &insertion_value)?; - } - new_insertion_key - }; + let (insertion_key, insertion_value) = + self.find_insertion_key_value(field_id, level, facet_value, txn)?; match result { - // TODO: this could go above the block recomputing insertion key - // because we know that if we inserted in place, the key is not a new one - // thus it doesn't extend a group + // because we know that we inserted in place, the facet_value is not a new one + // thus it doesn't extend a group, and thus the insertion key computed above is + // still correct InsertionResult::InPlace => { - let mut updated_value = self.db.get(&txn, &insertion_key.as_ref())?.unwrap(); - updated_value.bitmap |= new_values; + let mut updated_value = insertion_value; + updated_value.bitmap |= docids; self.db.put(txn, &insertion_key.as_ref(), &updated_value)?; return Ok(InsertionResult::InPlace); } InsertionResult::Insert => {} } - let mut updated_value = self.db.get(&txn, &insertion_key.as_ref())?.unwrap(); + + // Here we know that inserting the facet value in the level below resulted in the creation + // of a new key. Therefore, it may be the case that we need to modify the left bound of the + // insertion key (see documentation of `find_insertion_key_value` for an example of when that + // could happen). + let insertion_key = { + let mut new_insertion_key = insertion_key.clone(); + let mut key_should_be_modified = false; + + if facet_value < insertion_key.left_bound.as_slice() { + new_insertion_key.left_bound = facet_value.to_vec(); + key_should_be_modified = true; + } + if key_should_be_modified { + let is_deleted = self.db.delete(txn, &insertion_key.as_ref())?; + assert!(is_deleted); + self.db.put(txn, &new_insertion_key.as_ref(), &insertion_value)?; + } + new_insertion_key + }; + // Now we know that the insertion key contains the `facet_value`. + + // We still need to update the insertion value by: + // 1. Incrementing the number of children (since the recursive call returned `InsertionResult::Insert`) + // 2. 
Merge the previous docids with the new one + let mut updated_value = insertion_value; updated_value.size += 1; - if updated_value.size == max_group_size { - let size_left = max_group_size / 2; - let size_right = max_group_size - size_left; - let level_below = level - 1; + if updated_value.size < max_group_size { + updated_value.bitmap |= docids; + self.db.put(txn, &insertion_key.as_ref(), &updated_value)?; - let (start_key, _) = self - .db - .get_greater_than_or_equal_to( - &txn, - &FacetGroupKey { - field_id, - level: level_below, - left_bound: insertion_key.left_bound.as_slice(), - }, - )? - .unwrap(); - - let mut iter = self.db.range(&txn, &(start_key..))?.take(max_group_size as usize); - - let group_left = { - let mut values_left = RoaringBitmap::new(); - - let mut i = 0; - while let Some(next) = iter.next() { - let (_key, value) = next?; - i += 1; - values_left |= &value.bitmap; - if i == size_left { - break; - } - } - - let key = - FacetGroupKey { field_id, level, left_bound: insertion_key.left_bound.clone() }; - let value = FacetGroupValue { size: size_left as u8, bitmap: values_left }; - (key, value) - }; - - let group_right = { - let mut values_right = RoaringBitmap::new(); - let mut right_start_key = None; - - while let Some(next) = iter.next() { - let (key, value) = next?; - if right_start_key.is_none() { - right_start_key = Some(key.left_bound); - } - values_right |= &value.bitmap; - } - - let key = FacetGroupKey { - field_id, - level, - left_bound: right_start_key.unwrap().to_vec(), - }; - let value = FacetGroupValue { size: size_right as u8, bitmap: values_right }; - (key, value) - }; - drop(iter); - - let _ = self.db.delete(txn, &insertion_key.as_ref())?; - - self.db.put(txn, &group_left.0.as_ref(), &group_left.1)?; - self.db.put(txn, &group_right.0.as_ref(), &group_right.1)?; - - Ok(InsertionResult::Insert) - } else { - let mut value = self.db.get(&txn, &insertion_key.as_ref())?.unwrap(); - value.bitmap |= new_values; - value.size += 1; - self.db.put(txn, &insertion_key.as_ref(), &value).unwrap(); - - Ok(InsertionResult::InPlace) + return Ok(InsertionResult::InPlace); } + + // We've increased the group size of the value and realised it has become greater than or equal to `max_group_size` + // Therefore it must be split into two nodes. + + let size_left = max_group_size / 2; + let size_right = max_group_size - size_left; + + let level_below = level - 1; + + let start_key = FacetGroupKey { + field_id, + level: level_below, + left_bound: insertion_key.left_bound.as_slice(), + }; + + let mut iter = self.db.range(&txn, &(start_key..))?.take(max_group_size as usize); + + let group_left = { + let mut values_left = RoaringBitmap::new(); + + let mut i = 0; + while let Some(next) = iter.next() { + let (_key, value) = next?; + i += 1; + values_left |= &value.bitmap; + if i == size_left { + break; + } + } + + let key = + FacetGroupKey { field_id, level, left_bound: insertion_key.left_bound.clone() }; + let value = FacetGroupValue { size: size_left as u8, bitmap: values_left }; + (key, value) + }; + + let group_right = { + let ( + FacetGroupKey { left_bound: right_left_bound, .. }, + FacetGroupValue { bitmap: mut values_right, .. 
}, + ) = iter.next().unwrap()?; + + while let Some(next) = iter.next() { + let (_, value) = next?; + values_right |= &value.bitmap; + } + + let key = FacetGroupKey { field_id, level, left_bound: right_left_bound.to_vec() }; + let value = FacetGroupValue { size: size_right as u8, bitmap: values_right }; + (key, value) + }; + drop(iter); + + let _ = self.db.delete(txn, &insertion_key.as_ref())?; + + self.db.put(txn, &group_left.0.as_ref(), &group_left.1)?; + self.db.put(txn, &group_right.0.as_ref(), &group_right.1)?; + + Ok(InsertionResult::Insert) } + /// Insert the given facet value and corresponding document ids in the database. pub fn insert<'a, 't>( &self, txn: &'t mut RwTxn, field_id: u16, - new_key: &[u8], - new_values: &RoaringBitmap, + facet_value: &[u8], + docids: &RoaringBitmap, ) -> Result<()> { - if new_values.is_empty() { + if docids.is_empty() { return Ok(()); } let group_size = self.group_size; @@ -330,12 +363,15 @@ impl FacetsUpdateIncrementalInner { let highest_level = get_highest_level(&txn, self.db, field_id)?; let result = - self.insert_in_level(txn, field_id, highest_level as u8, new_key, new_values)?; + self.insert_in_level(txn, field_id, highest_level as u8, facet_value, docids)?; match result { InsertionResult::InPlace => return Ok(()), InsertionResult::Insert => {} } + // Here we check whether the size of the highest level has exceeded `min_level_size` * `self.group_size`. + // If it has, we must build an additional level above it. + let mut highest_level_prefix = vec![]; highest_level_prefix.extend_from_slice(&field_id.to_be_bytes()); highest_level_prefix.push(highest_level); @@ -384,36 +420,61 @@ impl FacetsUpdateIncrementalInner { Ok(()) } + /// Delete the given document id from the given facet value in the database, from level 0 to the + /// given level. + /// + /// ## Return + /// Returns the effect of removing the document id from the database on the given `level`. + /// + /// - `DeletionResult::InPlace` means that deleting the document id did not have + /// an effect on the keys in that level. + /// + /// - `DeletionResult::Remove` means that deleting the document id resulted in a change in the + /// number of keys in the level. For example, removing a document id from the facet value `3` could + /// cause it to have no corresponding document in level 0 anymore, and therefore the key was deleted + /// entirely. In that case, `DeletionResult::Remove` is returned. The parent of the deleted key must + /// then adjust its group size. If its group size falls to 0, then it will need to be deleted as well. + /// + /// - `DeletionResult::Reduce` means that deleting the document id resulted in a change in the + /// bounds of the keys of the level. For example, removing a document id from the facet value + /// `3` might have caused the facet value `3` to have no corresponding document in level 0. Therefore, + /// in level 1, the key with the left bound `3` had to be changed to the next facet value (e.g. 4). + /// In that case `DeletionResult::Reduce` is returned. The parent of the reduced key may need to adjust + /// its left bound as well.
fn delete_in_level<'t>( &self, txn: &'t mut RwTxn, field_id: u16, level: u8, - key: &[u8], - value: u32, + facet_value: &[u8], + docid: u32, ) -> Result { if level == 0 { - return self.delete_in_level_0(txn, field_id, key, value); + return self.delete_in_level_0(txn, field_id, facet_value, docid); } let (deletion_key, mut bitmap) = - self.find_insertion_key_value(field_id, level, key, txn)?; + self.find_insertion_key_value(field_id, level, facet_value, txn)?; - let result = self.delete_in_level(txn, field_id, level - 1, key.clone(), value)?; + let result = self.delete_in_level(txn, field_id, level - 1, facet_value.clone(), docid)?; let mut decrease_size = false; - let (prev_key, next_key) = match result { + let next_key = match result { DeletionResult::InPlace => { - bitmap.bitmap.remove(value); + bitmap.bitmap.remove(docid); self.db.put(txn, &deletion_key.as_ref(), &bitmap)?; return Ok(DeletionResult::InPlace); } - DeletionResult::Reduce { prev, next } => (prev, next), - DeletionResult::Remove { prev, next } => { + DeletionResult::Reduce { next } => next, + DeletionResult::Remove { next } => { decrease_size = true; - (prev, next) + next } }; + // If either DeletionResult::Reduce or DeletionResult::Remove was returned, + // then we may need to adjust the left_bound of the deletion key. + // If DeletionResult::Remove was returned, then we need to decrease the group + // size of the deletion key. let mut updated_value = bitmap; if decrease_size { updated_value.size -= 1; @@ -421,17 +482,21 @@ impl FacetsUpdateIncrementalInner { if updated_value.size == 0 { self.db.delete(txn, &deletion_key.as_ref())?; - Ok(DeletionResult::Remove { prev: prev_key, next: next_key }) + Ok(DeletionResult::Remove { next: next_key }) } else { let mut updated_deletion_key = deletion_key.clone(); - if key == deletion_key.left_bound { + let reduced_range = facet_value == deletion_key.left_bound; + if reduced_range { updated_deletion_key.left_bound = next_key.clone().unwrap(); } - updated_value.bitmap.remove(value); + updated_value.bitmap.remove(docid); let _ = self.db.delete(txn, &deletion_key.as_ref())?; self.db.put(txn, &updated_deletion_key.as_ref(), &updated_value)?; - - Ok(DeletionResult::Reduce { prev: prev_key, next: next_key }) + if reduced_range { + Ok(DeletionResult::Reduce { next: next_key }) + } else { + Ok(DeletionResult::InPlace) + } } } @@ -439,27 +504,24 @@ impl FacetsUpdateIncrementalInner { &self, txn: &'t mut RwTxn, field_id: u16, - key: &[u8], - value: u32, + facet_value: &[u8], + docid: u32, ) -> Result { - let key = FacetGroupKey { field_id, level: 0, left_bound: key }; + let key = FacetGroupKey { field_id, level: 0, left_bound: facet_value }; let mut bitmap = self.db.get(&txn, &key)?.unwrap().bitmap; - bitmap.remove(value); + bitmap.remove(docid); if bitmap.is_empty() { - let mut prev_key = None; let mut next_key = None; - - if let Some(prev) = self.db.get_lower_than(&txn, &key)? { - prev_key = Some(prev.0.left_bound.to_vec()); - } - if let Some(next) = self.db.get_greater_than(&txn, &key)? { - if next.0.level == 0 { - next_key = Some(next.0.left_bound.to_vec()); + if let Some((next, _)) = + self.db.remap_data_type::().get_greater_than(&txn, &key)? 
+ { + if next.field_id == field_id && next.level == 0 { + next_key = Some(next.left_bound.to_vec()); } } self.db.delete(txn, &key)?; - Ok(DeletionResult::Remove { prev: prev_key, next: next_key }) + Ok(DeletionResult::Remove { next: next_key }) } else { self.db.put(txn, &key, &FacetGroupValue { size: 1, bitmap })?; Ok(DeletionResult::InPlace) @@ -470,22 +532,30 @@ impl FacetsUpdateIncrementalInner { &self, txn: &'t mut RwTxn, field_id: u16, - key: &[u8], - value: u32, + facet_value: &[u8], + docid: u32, ) -> Result<()> { - if self.db.get(txn, &FacetGroupKey { field_id, level: 0, left_bound: key })?.is_none() { + if self + .db + .remap_data_type::() + .get(txn, &FacetGroupKey { field_id, level: 0, left_bound: facet_value })? + .is_none() + { return Ok(()); } let highest_level = get_highest_level(&txn, self.db, field_id)?; - // let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); - - let result = self.delete_in_level(txn, field_id, highest_level as u8, key, value)?; + let result = + self.delete_in_level(txn, field_id, highest_level as u8, facet_value, docid)?; match result { DeletionResult::InPlace => return Ok(()), - DeletionResult::Reduce { .. } => {} + DeletionResult::Reduce { .. } => return Ok(()), DeletionResult::Remove { .. } => {} } + + // if we either removed a key from the highest level, its size may have fallen + // below `min_level_size`, in which case we need to remove the entire level + let mut highest_level_prefix = vec![]; highest_level_prefix.extend_from_slice(&field_id.to_be_bytes()); highest_level_prefix.push(highest_level); @@ -521,6 +591,26 @@ impl FacetsUpdateIncrementalInner { } } +impl<'a> FacetGroupKey<&'a [u8]> { + pub fn into_owned(self) -> FacetGroupKey> { + FacetGroupKey { + field_id: self.field_id, + level: self.level, + left_bound: self.left_bound.to_vec(), + } + } +} + +impl<'a> FacetGroupKey> { + pub fn as_ref(&self) -> FacetGroupKey<&[u8]> { + FacetGroupKey { + field_id: self.field_id, + level: self.level, + left_bound: self.left_bound.as_slice(), + } + } +} + #[cfg(test)] mod tests { use crate::heed_codec::facet::{OrderedF64Codec, StrRefCodec}; diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs index caf88671e..ea6468538 100644 --- a/milli/src/update/facet/mod.rs +++ b/milli/src/update/facet/mod.rs @@ -1,3 +1,79 @@ +/*! +This module implements two different algorithms for updating the `facet_id_string_docids` +and `facet_id_f64_docids` databases. The first algorithm is a "bulk" algorithm, meaning that +it recreates the database from scratch when new elements are added to it. The second algorithm +is incremental: it modifies the database as little as possible. + +The databases must be able to return results for queries such as: +1. Filter : find all the document ids that have a facet value greater than X and/or smaller than Y +2. Min/Max : find the minimum/maximum facet value among these document ids +3. Sort : sort these document ids by increasing/decreasing facet values +4. Distribution : given some document ids, make a list of each facet value + found in these documents along with the number of documents that contain it + +The algorithms that implement these queries are found in the `src/search/facet` folder. 
+ +To make these queries fast to compute, the database adopts a tree structure: +```ignore + ┌───────────────────────────────┬───────────────────────────────┬───────────────┐ +┌───────┐ │ "ab" (2) │ "gaf" (2) │ "woz" (1) │ +│Level 2│ │ │ │ │ +└───────┘ │ [a, b, d, f, z] │ [c, d, e, f, g] │ [u, y] │ + ├───────────────┬───────────────┼───────────────┬───────────────┼───────────────┤ +┌───────┐ │ "ab" (2) │ "ba" (2) │ "gaf" (2) │ "form" (2) │ "woz" (2) │ +│Level 1│ │ │ │ │ │ │ +└───────┘ │ [a, b, d, z] │ [a, b, f] │ [c, d, g] │ [e, f] │ [u, y] │ + ├───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┤ +┌───────┐ │ "ab" │ "ac" │ "ba" │ "bac" │ "gaf" │ "gal" │ "form"│ "wow" │ "woz" │ "zz" │ +│Level 0│ │ │ │ │ │ │ │ │ │ │ │ +└───────┘ │ [a, b]│ [d, z]│ [b, f]│ [a, f]│ [c, d]│ [g] │ [e] │ [e, f]│ [y] │ [u] │ + └───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┘ +``` +In the diagram above, each cell corresponds to a node in the tree. The first line of the cell +contains the left bound of the range of facet values as well as the number of children of the node. +The second line contains the document ids which have a facet value within the range of the node. +The nodes at level 0 are the leaf nodes. They have 0 children and a single facet value in their range. + +In the diagram above, the first cell of level 2 is `ab (2)`. Its range is `ab .. gaf` (because +`gaf` is the left bound of the next node) and it has two children. Its document ids are `[a,b,d,f,z]`. +These documents all contain a facet value that is contained within `ab .. gaf`. + +In the database, each node is represented by a key/value pair encoded as a [`FacetGroupKey`] and a +[`FacetGroupValue`], which have the following format: + +```ignore +FacetGroupKey: +- field id : u16 +- level : u8 +- left bound: [u8] // the facet value encoded using either OrderedF64Codec or Str + +FacetGroupValue: +- #children : u8 +- docids : RoaringBitmap +``` + +When the database is first created using the "bulk" method, each node has a fixed number of children +(except for possibly the last one) given by the `group_size` parameter (default to `FACET_GROUP_SIZE`). +The tree is also built such that the highest level has more than `min_level_size` +(default to `FACET_MIN_LEVEL_SIZE`) elements in it. + +When the database is incrementally updated, the number of children of a node can vary between +1 and `max_group_size`. This is done so that most incremental operations do not need to change +the structure of the tree. When the number of children of a node reaches `max_group_size`, +we split the node in two and update the number of children of its parent. + +When adding documents to the databases, it is important to determine which method to use to +minimise indexing time. The incremental method is faster when adding few new facet values, but the +bulk method is faster when a large part of the database is modified. Empirically, it seems that +it takes 50x more time to incrementally add N facet values to an existing database than it is to +construct a database of N facet values. This is the heuristic that is used to choose between the +two methods. 
+*/ + +pub const FACET_MAX_GROUP_SIZE: u8 = 8; +pub const FACET_GROUP_SIZE: u8 = 4; +pub const FACET_MIN_LEVEL_SIZE: u8 = 5; + use self::incremental::FacetsUpdateIncremental; use super::FacetsUpdateBulk; use crate::facet::FacetType; @@ -13,8 +89,8 @@ pub struct FacetsUpdate<'i> { database: heed::Database, FacetGroupValueCodec>, facet_type: FacetType, new_data: grenad::Reader, - level_group_size: u8, - max_level_group_size: u8, + group_size: u8, + max_group_size: u8, min_level_size: u8, } impl<'i> FacetsUpdate<'i> { @@ -30,57 +106,24 @@ impl<'i> FacetsUpdate<'i> { Self { index, database, - level_group_size: 4, - max_level_group_size: 8, - min_level_size: 5, + group_size: FACET_GROUP_SIZE, + max_group_size: FACET_MAX_GROUP_SIZE, + min_level_size: FACET_MIN_LEVEL_SIZE, facet_type, new_data, } } - // TODO: use the options below? - // but I don't actually see why they should be configurable - // /// The minimum number of elements that a level is allowed to have. - // pub fn level_max_group_size(mut self, value: u8) -> Self { - // self.max_level_group_size = std::cmp::max(value, 4); - // self - // } - - // /// The number of elements from the level below that are represented by a single element in the level above - // /// - // /// This setting is always greater than or equal to 2. - // pub fn level_group_size(mut self, value: u8) -> Self { - // self.level_group_size = std::cmp::max(value, 2); - // self - // } - - // /// The minimum number of elements that a level is allowed to have. - // pub fn min_level_size(mut self, value: u8) -> Self { - // self.min_level_size = std::cmp::max(value, 2); - // self - // } - pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { if self.new_data.is_empty() { return Ok(()); } - // here, come up with a better condition! - // ideally we'd choose which method to use for each field id individually - // but I dont' think it's worth the effort yet - // As a first requirement, we ask that the length of the new data is less - // than a 1/50th of the length of the database in order to use the incremental - // method. if self.new_data.len() >= (self.database.len(wtxn)? as u64 / 50) { - let bulk_update = FacetsUpdateBulk::new(self.index, self.facet_type, self.new_data) - .level_group_size(self.level_group_size) - .min_level_size(self.min_level_size); + let bulk_update = FacetsUpdateBulk::new(self.index, self.facet_type, self.new_data, self.group_size, self.min_level_size); bulk_update.execute(wtxn)?; } else { let incremental_update = - FacetsUpdateIncremental::new(self.index, self.facet_type, self.new_data) - .group_size(self.level_group_size) - .max_group_size(self.max_level_group_size) - .min_level_size(self.min_level_size); + FacetsUpdateIncremental::new(self.index, self.facet_type, self.new_data, self.group_size, self.min_level_size, self.max_group_size); incremental_update.execute(wtxn)?; } Ok(()) @@ -346,7 +389,7 @@ mod comparison_bench { // of the incremental vs. bulk indexer. // It appears that the incremental indexer is about 50 times slower than the // bulk indexer. 
- #[test] + // #[test] fn benchmark_facet_indexing() { // then we add 10_000 documents at a time and compare the speed of adding 1, 100, and 1000 documents to it diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 96bea9589..7b02fd1af 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -7,7 +7,7 @@ mod typed_chunk; use std::collections::HashSet; use std::io::{Cursor, Read, Seek}; use std::iter::FromIterator; -use std::num::{NonZeroU32, NonZeroUsize}; +use std::num::NonZeroU32; use std::result::Result as StdResult; use crossbeam_channel::{Receiver, Sender}; @@ -82,8 +82,6 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a, F> { #[derive(Default, Debug, Clone)] pub struct IndexDocumentsConfig { - pub facet_level_group_size: Option<NonZeroUsize>, - pub facet_min_level_size: Option<NonZeroUsize>, pub words_prefix_threshold: Option<u32>, pub max_prefix_length: Option<usize>, pub words_positions_level_group_size: Option<NonZeroU32>,
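
A minimal, dependency-free sketch of the grouping step described in the module documentation of `milli/src/update/facet/mod.rs` above. The names used here (`Node`, `build_level_above`, plain `Vec<u32>` document ids) are hypothetical stand-ins for the `FacetGroupKey`/`FacetGroupValue` pairs and `RoaringBitmap`s stored in LMDB; the sketch only illustrates how one level of the tree is derived from the level below under the bulk strategy, not the actual implementation.

```rust
// Simplified model of a tree node, for illustration only. The real code stores
// FacetGroupKey/FacetGroupValue pairs in LMDB and uses RoaringBitmap for docids.
#[derive(Debug, Clone)]
struct Node {
    left_bound: Vec<u8>, // facet value encoded as bytes
    size: u8,            // number of children in the level below
    docids: Vec<u32>,    // union of the children's document ids
}

/// Build level `n + 1` from level `n`: every `group_size` consecutive nodes are
/// merged into one parent whose left bound is its first child's left bound.
fn build_level_above(level_below: &[Node], group_size: usize) -> Vec<Node> {
    level_below
        .chunks(group_size)
        .map(|group| {
            let mut docids: Vec<u32> =
                group.iter().flat_map(|n| n.docids.iter().copied()).collect();
            docids.sort_unstable();
            docids.dedup();
            Node {
                left_bound: group[0].left_bound.clone(),
                size: group.len() as u8,
                docids,
            }
        })
        .collect()
}

fn main() {
    // Level 0: one node per facet value, each covering exactly one value.
    let level0: Vec<Node> = [("ab", vec![0u32, 1]), ("ac", vec![3]), ("ba", vec![1, 4]), ("bac", vec![0])]
        .into_iter()
        .map(|(v, docids)| Node { left_bound: v.as_bytes().to_vec(), size: 1, docids })
        .collect();

    // With group_size = 2, level 1 contains two nodes: "ab" (2) and "ba" (2).
    let level1 = build_level_above(&level0, 2);
    println!("{level1:?}");
}
```

In the real bulk indexer this step would be repeated level over level until the newly built level is small enough (on the order of `min_level_size` entries) to remain the highest level of the tree.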