diff --git a/.gitignore b/.gitignore index cef7b7b4c..edd3e675c 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,8 @@ /target /Cargo.lock +milli/target/ + # datasets *.csv *.mmdb @@ -11,6 +13,8 @@ # Snapshots ## ... large *.full.snap - -# ... unreviewed +## ... unreviewed *.snap.new + +# Fuzzcheck data for the facet indexing fuzz test +milli/fuzz/update::facet::incremental::fuzz::fuzz/ diff --git a/milli/Cargo.toml b/milli/Cargo.toml index f19d3781e..c7c780dd4 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -54,7 +54,10 @@ big_s = "1.0.2" insta = "1.21.0" maplit = "1.0.2" md5 = "0.7.0" -rand = "0.8.5" +rand = {version = "0.8.5", features = ["small_rng"] } + +[target.'cfg(fuzzing)'.dev-dependencies] +fuzzcheck = "0.12.1" [features] default = [ "charabia/default" ] diff --git a/milli/src/heed_codec/byte_slice_ref.rs b/milli/src/heed_codec/byte_slice_ref.rs new file mode 100644 index 000000000..48eda63c5 --- /dev/null +++ b/milli/src/heed_codec/byte_slice_ref.rs @@ -0,0 +1,23 @@ +use std::borrow::Cow; + +use heed::{BytesDecode, BytesEncode}; + +/// A codec for values of type `&[u8]`. Unlike `ByteSlice`, its `EItem` and `DItem` associated +/// types are equivalent (= `&'a [u8]`) and these values can reside within another structure. 
+pub struct ByteSliceRefCodec; + +impl<'a> BytesEncode<'a> for ByteSliceRefCodec { + type EItem = &'a [u8]; + + fn bytes_encode(item: &'a Self::EItem) -> Option> { + Some(Cow::Borrowed(item)) + } +} + +impl<'a> BytesDecode<'a> for ByteSliceRefCodec { + type DItem = &'a [u8]; + + fn bytes_decode(bytes: &'a [u8]) -> Option { + Some(bytes) + } +} diff --git a/milli/src/heed_codec/facet/facet_level_value_f64_codec.rs b/milli/src/heed_codec/facet/facet_level_value_f64_codec.rs deleted file mode 100644 index 1e66427ca..000000000 --- a/milli/src/heed_codec/facet/facet_level_value_f64_codec.rs +++ /dev/null @@ -1,89 +0,0 @@ -use std::borrow::Cow; -use std::convert::TryInto; - -use crate::facet::value_encoding::f64_into_bytes; -use crate::{try_split_array_at, FieldId}; - -// TODO do not de/serialize right bound when level = 0 -pub struct FacetLevelValueF64Codec; - -impl<'a> heed::BytesDecode<'a> for FacetLevelValueF64Codec { - type DItem = (FieldId, u8, f64, f64); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id_bytes, bytes) = try_split_array_at(bytes)?; - let field_id = u16::from_be_bytes(field_id_bytes); - let (level, bytes) = bytes.split_first()?; - - let (left, right) = if *level != 0 { - let left = bytes[16..24].try_into().ok().map(f64::from_be_bytes)?; - let right = bytes[24..].try_into().ok().map(f64::from_be_bytes)?; - (left, right) - } else { - let left = bytes[8..].try_into().ok().map(f64::from_be_bytes)?; - (left, left) - }; - - Some((field_id, *level, left, right)) - } -} - -impl heed::BytesEncode<'_> for FacetLevelValueF64Codec { - type EItem = (FieldId, u8, f64, f64); - - fn bytes_encode((field_id, level, left, right): &Self::EItem) -> Option> { - let mut buffer = [0u8; 32]; - - let len = if *level != 0 { - // Write the globally ordered floats. 
- let bytes = f64_into_bytes(*left)?; - buffer[..8].copy_from_slice(&bytes[..]); - - let bytes = f64_into_bytes(*right)?; - buffer[8..16].copy_from_slice(&bytes[..]); - - // Then the f64 values just to be able to read them back. - let bytes = left.to_be_bytes(); - buffer[16..24].copy_from_slice(&bytes[..]); - - let bytes = right.to_be_bytes(); - buffer[24..].copy_from_slice(&bytes[..]); - - 32 // length - } else { - // Write the globally ordered floats. - let bytes = f64_into_bytes(*left)?; - buffer[..8].copy_from_slice(&bytes[..]); - - // Then the f64 values just to be able to read them back. - let bytes = left.to_be_bytes(); - buffer[8..16].copy_from_slice(&bytes[..]); - - 16 // length - }; - - let mut bytes = Vec::with_capacity(len + 3); - bytes.extend_from_slice(&field_id.to_be_bytes()); - bytes.push(*level); - bytes.extend_from_slice(&buffer[..len]); - Some(Cow::Owned(bytes)) - } -} - -#[cfg(test)] -mod tests { - use heed::{BytesDecode, BytesEncode}; - - use super::*; - - #[test] - fn globally_ordered_f64() { - let bytes = FacetLevelValueF64Codec::bytes_encode(&(3, 0, 32.0, 0.0)).unwrap(); - let (name, level, left, right) = FacetLevelValueF64Codec::bytes_decode(&bytes).unwrap(); - assert_eq!((name, level, left, right), (3, 0, 32.0, 32.0)); - - let bytes = FacetLevelValueF64Codec::bytes_encode(&(3, 1, -32.0, 32.0)).unwrap(); - let (name, level, left, right) = FacetLevelValueF64Codec::bytes_decode(&bytes).unwrap(); - assert_eq!((name, level, left, right), (3, 1, -32.0, 32.0)); - } -} diff --git a/milli/src/heed_codec/facet/facet_level_value_u32_codec.rs b/milli/src/heed_codec/facet/facet_level_value_u32_codec.rs deleted file mode 100644 index 597335b6e..000000000 --- a/milli/src/heed_codec/facet/facet_level_value_u32_codec.rs +++ /dev/null @@ -1,53 +0,0 @@ -use std::borrow::Cow; -use std::convert::TryInto; -use std::num::NonZeroU8; - -use crate::{try_split_array_at, FieldId}; - -/// A codec that stores the field id, level 1 and higher and the groups ids. 
-/// -/// It can only be used to encode the facet string of the level 1 or higher. -pub struct FacetLevelValueU32Codec; - -impl<'a> heed::BytesDecode<'a> for FacetLevelValueU32Codec { - type DItem = (FieldId, NonZeroU8, u32, u32); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id_bytes, bytes) = try_split_array_at(bytes)?; - let field_id = u16::from_be_bytes(field_id_bytes); - let (level, bytes) = bytes.split_first()?; - let level = NonZeroU8::new(*level)?; - let left = bytes[8..12].try_into().ok().map(u32::from_be_bytes)?; - let right = bytes[12..].try_into().ok().map(u32::from_be_bytes)?; - Some((field_id, level, left, right)) - } -} - -impl heed::BytesEncode<'_> for FacetLevelValueU32Codec { - type EItem = (FieldId, NonZeroU8, u32, u32); - - fn bytes_encode((field_id, level, left, right): &Self::EItem) -> Option> { - let mut buffer = [0u8; 16]; - - // Write the big-endian integers. - let bytes = left.to_be_bytes(); - buffer[..4].copy_from_slice(&bytes[..]); - - let bytes = right.to_be_bytes(); - buffer[4..8].copy_from_slice(&bytes[..]); - - // Then the u32 values just to be able to read them back. - let bytes = left.to_be_bytes(); - buffer[8..12].copy_from_slice(&bytes[..]); - - let bytes = right.to_be_bytes(); - buffer[12..].copy_from_slice(&bytes[..]); - - let mut bytes = Vec::with_capacity(buffer.len() + 2 + 1); - bytes.extend_from_slice(&field_id.to_be_bytes()); - bytes.push(level.get()); - bytes.extend_from_slice(&buffer); - - Some(Cow::Owned(bytes)) - } -} diff --git a/milli/src/heed_codec/facet/facet_string_level_zero_codec.rs b/milli/src/heed_codec/facet/facet_string_level_zero_codec.rs deleted file mode 100644 index 009c6454a..000000000 --- a/milli/src/heed_codec/facet/facet_string_level_zero_codec.rs +++ /dev/null @@ -1,50 +0,0 @@ -use std::borrow::Cow; -use std::str; - -use crate::{try_split_array_at, FieldId}; - -/// A codec that stores the field id, level 0, and facet string. 
-/// -/// It can only be used to encode the facet string of the level 0, -/// as it hardcodes the level. -/// -/// We encode the level 0 to not break the lexicographical ordering of the LMDB keys, -/// and make sure that the levels are not mixed-up. The level 0 is special, the key -/// are strings, other levels represent groups and keys are simply two integers. -pub struct FacetStringLevelZeroCodec; - -impl FacetStringLevelZeroCodec { - pub fn serialize_into(field_id: FieldId, value: &str, out: &mut Vec) { - out.reserve(value.len() + 2); - out.extend_from_slice(&field_id.to_be_bytes()); - out.push(0); // the level zero (for LMDB ordering only) - out.extend_from_slice(value.as_bytes()); - } -} - -impl<'a> heed::BytesDecode<'a> for FacetStringLevelZeroCodec { - type DItem = (FieldId, &'a str); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id_bytes, bytes) = try_split_array_at(bytes)?; - let field_id = u16::from_be_bytes(field_id_bytes); - let (level, bytes) = bytes.split_first()?; - - if *level != 0 { - return None; - } - - let value = str::from_utf8(bytes).ok()?; - Some((field_id, value)) - } -} - -impl<'a> heed::BytesEncode<'a> for FacetStringLevelZeroCodec { - type EItem = (FieldId, &'a str); - - fn bytes_encode((field_id, value): &Self::EItem) -> Option> { - let mut bytes = Vec::new(); - FacetStringLevelZeroCodec::serialize_into(*field_id, value, &mut bytes); - Some(Cow::Owned(bytes)) - } -} diff --git a/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs b/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs index d1605e6ef..e69de29bb 100644 --- a/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs +++ b/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs @@ -1,90 +0,0 @@ -use std::borrow::Cow; -use std::convert::TryInto; -use std::{marker, str}; - -use crate::error::SerializationError; -use crate::heed_codec::RoaringBitmapCodec; -use crate::{try_split_array_at, try_split_at, Result}; - -pub 
type FacetStringLevelZeroValueCodec = StringValueCodec; - -/// A codec that encodes a string in front of a value. -/// -/// The usecase is for the facet string levels algorithm where we must know the -/// original string of a normalized facet value, the original values are stored -/// in the value to not break the lexicographical ordering of the LMDB keys. -pub struct StringValueCodec(marker::PhantomData); - -impl<'a, C> heed::BytesDecode<'a> for StringValueCodec -where - C: heed::BytesDecode<'a>, -{ - type DItem = (&'a str, C::DItem); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (string, bytes) = decode_prefix_string(bytes)?; - C::bytes_decode(bytes).map(|item| (string, item)) - } -} - -impl<'a, C> heed::BytesEncode<'a> for StringValueCodec -where - C: heed::BytesEncode<'a>, -{ - type EItem = (&'a str, C::EItem); - - fn bytes_encode((string, value): &'a Self::EItem) -> Option> { - let value_bytes = C::bytes_encode(value)?; - - let mut bytes = Vec::with_capacity(2 + string.len() + value_bytes.len()); - encode_prefix_string(string, &mut bytes).ok()?; - bytes.extend_from_slice(&value_bytes[..]); - - Some(Cow::Owned(bytes)) - } -} - -pub fn decode_prefix_string(value: &[u8]) -> Option<(&str, &[u8])> { - let (original_length_bytes, bytes) = try_split_array_at(value)?; - let original_length = u16::from_be_bytes(original_length_bytes) as usize; - let (string, bytes) = try_split_at(bytes, original_length)?; - let string = str::from_utf8(string).ok()?; - Some((string, bytes)) -} - -pub fn encode_prefix_string(string: &str, buffer: &mut Vec) -> Result<()> { - let string_len: u16 = - string.len().try_into().map_err(|_| SerializationError::InvalidNumberSerialization)?; - buffer.extend_from_slice(&string_len.to_be_bytes()); - buffer.extend_from_slice(string.as_bytes()); - Ok(()) -} - -#[cfg(test)] -mod tests { - use heed::types::Unit; - use heed::{BytesDecode, BytesEncode}; - use roaring::RoaringBitmap; - - use super::*; - - #[test] - fn deserialize_roaring_bitmaps() 
{ - let string = "abc"; - let docids: RoaringBitmap = (0..100).chain(3500..4398).collect(); - let key = (string, docids.clone()); - let bytes = StringValueCodec::::bytes_encode(&key).unwrap(); - let (out_string, out_docids) = - StringValueCodec::::bytes_decode(&bytes).unwrap(); - assert_eq!((out_string, out_docids), (string, docids)); - } - - #[test] - fn deserialize_unit() { - let string = "def"; - let key = (string, ()); - let bytes = StringValueCodec::::bytes_encode(&key).unwrap(); - let (out_string, out_unit) = StringValueCodec::::bytes_decode(&bytes).unwrap(); - assert_eq!((out_string, out_unit), (string, ())); - } -} diff --git a/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs b/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs index 90ba09ae2..e69de29bb 100644 --- a/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs +++ b/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs @@ -1,114 +0,0 @@ -use std::borrow::Cow; -use std::convert::TryInto; -use std::{marker, str}; - -use super::try_split_at; - -/// A codec that optionally encodes two strings in front of the value. -/// -/// The usecase is for the facet string levels algorithm where we must -/// know the origin of a group, the group left and right bounds are stored -/// in the value to not break the lexicographical ordering of the LMDB keys. 
-pub struct FacetStringZeroBoundsValueCodec(marker::PhantomData); - -impl<'a, C> heed::BytesDecode<'a> for FacetStringZeroBoundsValueCodec -where - C: heed::BytesDecode<'a>, -{ - type DItem = (Option<(&'a str, &'a str)>, C::DItem); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (contains_bounds, bytes) = bytes.split_first()?; - - if *contains_bounds != 0 { - let (left_len, bytes) = try_split_at(bytes, 2)?; - let (right_len, bytes) = try_split_at(bytes, 2)?; - - let left_len = left_len.try_into().ok().map(u16::from_be_bytes)?; - let right_len = right_len.try_into().ok().map(u16::from_be_bytes)?; - - let (left, bytes) = try_split_at(bytes, left_len as usize)?; - let (right, bytes) = try_split_at(bytes, right_len as usize)?; - - let left = str::from_utf8(left).ok()?; - let right = str::from_utf8(right).ok()?; - - C::bytes_decode(bytes).map(|item| (Some((left, right)), item)) - } else { - C::bytes_decode(bytes).map(|item| (None, item)) - } - } -} - -impl<'a, C> heed::BytesEncode<'a> for FacetStringZeroBoundsValueCodec -where - C: heed::BytesEncode<'a>, -{ - type EItem = (Option<(&'a str, &'a str)>, C::EItem); - - fn bytes_encode((bounds, value): &'a Self::EItem) -> Option> { - let mut bytes = Vec::new(); - - match bounds { - Some((left, right)) => { - bytes.push(u8::max_value()); - - if left.is_empty() || right.is_empty() { - return None; - } - - let left_len: u16 = left.len().try_into().ok()?; - let right_len: u16 = right.len().try_into().ok()?; - - bytes.extend_from_slice(&left_len.to_be_bytes()); - bytes.extend_from_slice(&right_len.to_be_bytes()); - - bytes.extend_from_slice(left.as_bytes()); - bytes.extend_from_slice(right.as_bytes()); - - let value_bytes = C::bytes_encode(value)?; - bytes.extend_from_slice(&value_bytes[..]); - - Some(Cow::Owned(bytes)) - } - None => { - bytes.push(0); - let value_bytes = C::bytes_encode(value)?; - bytes.extend_from_slice(&value_bytes[..]); - Some(Cow::Owned(bytes)) - } - } - } -} - -#[cfg(test)] -mod tests { - use 
heed::types::Unit; - use heed::{BytesDecode, BytesEncode}; - use roaring::RoaringBitmap; - - use super::*; - use crate::CboRoaringBitmapCodec; - - #[test] - fn deserialize_roaring_bitmaps() { - let bounds = Some(("abc", "def")); - let docids: RoaringBitmap = (0..100).chain(3500..4398).collect(); - let key = (bounds, docids.clone()); - let bytes = - FacetStringZeroBoundsValueCodec::::bytes_encode(&key).unwrap(); - let (out_bounds, out_docids) = - FacetStringZeroBoundsValueCodec::::bytes_decode(&bytes).unwrap(); - assert_eq!((out_bounds, out_docids), (bounds, docids)); - } - - #[test] - fn deserialize_unit() { - let bounds = Some(("abc", "def")); - let key = (bounds, ()); - let bytes = FacetStringZeroBoundsValueCodec::::bytes_encode(&key).unwrap(); - let (out_bounds, out_unit) = - FacetStringZeroBoundsValueCodec::::bytes_decode(&bytes).unwrap(); - assert_eq!((out_bounds, out_unit), (bounds, ())); - } -} diff --git a/milli/src/heed_codec/facet/facet_value_string_codec.rs b/milli/src/heed_codec/facet/facet_value_string_codec.rs deleted file mode 100644 index 54abb7886..000000000 --- a/milli/src/heed_codec/facet/facet_value_string_codec.rs +++ /dev/null @@ -1,35 +0,0 @@ -use std::borrow::Cow; -use std::str; - -use crate::{try_split_array_at, FieldId}; - -pub struct FacetValueStringCodec; - -impl FacetValueStringCodec { - pub fn serialize_into(field_id: FieldId, value: &str, out: &mut Vec) { - out.reserve(value.len() + 2); - out.extend_from_slice(&field_id.to_be_bytes()); - out.extend_from_slice(value.as_bytes()); - } -} - -impl<'a> heed::BytesDecode<'a> for FacetValueStringCodec { - type DItem = (FieldId, &'a str); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id_bytes, bytes) = try_split_array_at(bytes)?; - let field_id = u16::from_be_bytes(field_id_bytes); - let value = str::from_utf8(bytes).ok()?; - Some((field_id, value)) - } -} - -impl<'a> heed::BytesEncode<'a> for FacetValueStringCodec { - type EItem = (FieldId, &'a str); - - fn 
bytes_encode((field_id, value): &Self::EItem) -> Option> { - let mut bytes = Vec::new(); - FacetValueStringCodec::serialize_into(*field_id, value, &mut bytes); - Some(Cow::Owned(bytes)) - } -} diff --git a/milli/src/heed_codec/facet/field_doc_id_facet_codec.rs b/milli/src/heed_codec/facet/field_doc_id_facet_codec.rs new file mode 100644 index 000000000..cc9919ad2 --- /dev/null +++ b/milli/src/heed_codec/facet/field_doc_id_facet_codec.rs @@ -0,0 +1,44 @@ +use std::borrow::Cow; +use std::marker::PhantomData; + +use heed::{BytesDecode, BytesEncode}; + +use crate::{try_split_array_at, DocumentId, FieldId}; + +pub struct FieldDocIdFacetCodec(PhantomData); + +impl<'a, C> BytesDecode<'a> for FieldDocIdFacetCodec +where + C: BytesDecode<'a>, +{ + type DItem = (FieldId, DocumentId, C::DItem); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (field_id_bytes, bytes) = try_split_array_at(bytes)?; + let field_id = u16::from_be_bytes(field_id_bytes); + + let (document_id_bytes, bytes) = try_split_array_at(bytes)?; + let document_id = u32::from_be_bytes(document_id_bytes); + + let value = C::bytes_decode(bytes)?; + + Some((field_id, document_id, value)) + } +} + +impl<'a, C> BytesEncode<'a> for FieldDocIdFacetCodec +where + C: BytesEncode<'a>, +{ + type EItem = (FieldId, DocumentId, C::EItem); + + fn bytes_encode((field_id, document_id, value): &'a Self::EItem) -> Option> { + let mut bytes = Vec::with_capacity(32); + bytes.extend_from_slice(&field_id.to_be_bytes()); // 2 bytes + bytes.extend_from_slice(&document_id.to_be_bytes()); // 4 bytes + let value_bytes = C::bytes_encode(value)?; + // variable length, if f64 -> 16 bytes, if string -> large, potentially + bytes.extend_from_slice(&value_bytes); + Some(Cow::Owned(bytes)) + } +} diff --git a/milli/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs b/milli/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs deleted file mode 100644 index 22159601c..000000000 --- 
a/milli/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs +++ /dev/null @@ -1,37 +0,0 @@ -use std::borrow::Cow; -use std::convert::TryInto; - -use crate::facet::value_encoding::f64_into_bytes; -use crate::{try_split_array_at, DocumentId, FieldId}; - -pub struct FieldDocIdFacetF64Codec; - -impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetF64Codec { - type DItem = (FieldId, DocumentId, f64); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id_bytes, bytes) = try_split_array_at(bytes)?; - let field_id = u16::from_be_bytes(field_id_bytes); - - let (document_id_bytes, bytes) = try_split_array_at(bytes)?; - let document_id = u32::from_be_bytes(document_id_bytes); - - let value = bytes[8..16].try_into().map(f64::from_be_bytes).ok()?; - - Some((field_id, document_id, value)) - } -} - -impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetF64Codec { - type EItem = (FieldId, DocumentId, f64); - - fn bytes_encode((field_id, document_id, value): &Self::EItem) -> Option> { - let mut bytes = Vec::with_capacity(2 + 4 + 8 + 8); - bytes.extend_from_slice(&field_id.to_be_bytes()); - bytes.extend_from_slice(&document_id.to_be_bytes()); - let value_bytes = f64_into_bytes(*value)?; - bytes.extend_from_slice(&value_bytes); - bytes.extend_from_slice(&value.to_be_bytes()); - Some(Cow::Owned(bytes)) - } -} diff --git a/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs b/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs deleted file mode 100644 index 178bb21c1..000000000 --- a/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs +++ /dev/null @@ -1,50 +0,0 @@ -use std::borrow::Cow; -use std::str; - -use crate::{try_split_array_at, DocumentId, FieldId}; - -pub struct FieldDocIdFacetStringCodec; - -impl FieldDocIdFacetStringCodec { - pub fn serialize_into( - field_id: FieldId, - document_id: DocumentId, - normalized_value: &str, - out: &mut Vec, - ) { - out.reserve(2 + 4 + normalized_value.len()); - out.extend_from_slice(&field_id.to_be_bytes()); 
- out.extend_from_slice(&document_id.to_be_bytes()); - out.extend_from_slice(normalized_value.as_bytes()); - } -} - -impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetStringCodec { - type DItem = (FieldId, DocumentId, &'a str); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id_bytes, bytes) = try_split_array_at(bytes)?; - let field_id = u16::from_be_bytes(field_id_bytes); - - let (document_id_bytes, bytes) = try_split_array_at(bytes)?; - let document_id = u32::from_be_bytes(document_id_bytes); - - let normalized_value = str::from_utf8(bytes).ok()?; - Some((field_id, document_id, normalized_value)) - } -} - -impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetStringCodec { - type EItem = (FieldId, DocumentId, &'a str); - - fn bytes_encode((field_id, document_id, normalized_value): &Self::EItem) -> Option> { - let mut bytes = Vec::new(); - FieldDocIdFacetStringCodec::serialize_into( - *field_id, - *document_id, - normalized_value, - &mut bytes, - ); - Some(Cow::Owned(bytes)) - } -} diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 0b2d9186f..4609bfe7f 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -1,23 +1,22 @@ -mod facet_level_value_f64_codec; -mod facet_level_value_u32_codec; -mod facet_string_level_zero_codec; -mod facet_string_level_zero_value_codec; -mod facet_string_zero_bounds_value_codec; -mod field_doc_id_facet_f64_codec; -mod field_doc_id_facet_string_codec; +mod field_doc_id_facet_codec; +mod ordered_f64_codec; -use heed::types::OwnedType; +use std::borrow::Cow; +use std::convert::TryFrom; +use std::marker::PhantomData; -pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; -pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec; -pub use self::facet_string_level_zero_codec::FacetStringLevelZeroCodec; -pub use self::facet_string_level_zero_value_codec::{ - decode_prefix_string, encode_prefix_string, FacetStringLevelZeroValueCodec, -}; 
-pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec; -pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; -pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; -use crate::BEU16; +use heed::types::{DecodeIgnore, OwnedType}; +use heed::{BytesDecode, BytesEncode}; +use roaring::RoaringBitmap; + +pub use self::field_doc_id_facet_codec::FieldDocIdFacetCodec; +pub use self::ordered_f64_codec::OrderedF64Codec; +use super::StrRefCodec; +use crate::{CboRoaringBitmapCodec, BEU16}; + +pub type FieldDocIdFacetF64Codec = FieldDocIdFacetCodec; +pub type FieldDocIdFacetStringCodec = FieldDocIdFacetCodec; +pub type FieldDocIdFacetIgnoreCodec = FieldDocIdFacetCodec; pub type FieldIdCodec = OwnedType; @@ -30,3 +29,76 @@ pub fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { None } } + +/// The key in the [`facet_id_string_docids` and `facet_id_f64_docids`][`Index::facet_id_string_docids`] +/// databases. +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] // TODO: try removing PartialOrd and Ord +pub struct FacetGroupKey { + pub field_id: u16, + pub level: u8, + pub left_bound: T, +} + +/// The value in the [`facet_id_string_docids` and `facet_id_f64_docids`][`Index::facet_id_string_docids`] +/// databases. 
+#[derive(Debug)] +pub struct FacetGroupValue { + pub size: u8, + pub bitmap: RoaringBitmap, +} + +pub struct FacetGroupKeyCodec { + _phantom: PhantomData, +} + +impl<'a, T> heed::BytesEncode<'a> for FacetGroupKeyCodec +where + T: BytesEncode<'a>, + T::EItem: Sized, +{ + type EItem = FacetGroupKey; + + fn bytes_encode(value: &'a Self::EItem) -> Option> { + let mut v = vec![]; + v.extend_from_slice(&value.field_id.to_be_bytes()); + v.extend_from_slice(&[value.level]); + + let bound = T::bytes_encode(&value.left_bound)?; + v.extend_from_slice(&bound); + + Some(Cow::Owned(v)) + } +} +impl<'a, T> heed::BytesDecode<'a> for FacetGroupKeyCodec +where + T: BytesDecode<'a>, +{ + type DItem = FacetGroupKey; + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let fid = u16::from_be_bytes(<[u8; 2]>::try_from(&bytes[0..=1]).ok()?); + let level = bytes[2]; + let bound = T::bytes_decode(&bytes[3..])?; + Some(FacetGroupKey { field_id: fid, level, left_bound: bound }) + } +} + +pub struct FacetGroupValueCodec; +impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec { + type EItem = FacetGroupValue; + + fn bytes_encode(value: &'a Self::EItem) -> Option> { + let mut v = vec![]; + v.push(value.size); + CboRoaringBitmapCodec::serialize_into(&value.bitmap, &mut v); + Some(Cow::Owned(v)) + } +} +impl<'a> heed::BytesDecode<'a> for FacetGroupValueCodec { + type DItem = FacetGroupValue; + fn bytes_decode(bytes: &'a [u8]) -> Option { + let size = bytes[0]; + let bitmap = CboRoaringBitmapCodec::deserialize_from(&bytes[1..]).ok()?; + Some(FacetGroupValue { size, bitmap }) + } +} diff --git a/milli/src/heed_codec/facet/ordered_f64_codec.rs b/milli/src/heed_codec/facet/ordered_f64_codec.rs new file mode 100644 index 000000000..5ac9ffcfc --- /dev/null +++ b/milli/src/heed_codec/facet/ordered_f64_codec.rs @@ -0,0 +1,37 @@ +use std::borrow::Cow; +use std::convert::TryInto; + +use heed::BytesDecode; + +use crate::facet::value_encoding::f64_into_bytes; + +pub struct OrderedF64Codec; + +impl<'a> 
BytesDecode<'a> for OrderedF64Codec { + type DItem = f64; + + fn bytes_decode(bytes: &'a [u8]) -> Option { + if bytes.len() < 16 { + return None; + } + let f = bytes[8..].try_into().ok().map(f64::from_be_bytes)?; + Some(f) + } +} + +impl heed::BytesEncode<'_> for OrderedF64Codec { + type EItem = f64; + + fn bytes_encode(f: &Self::EItem) -> Option> { + let mut buffer = [0u8; 16]; + + // write the globally ordered float + let bytes = f64_into_bytes(*f)?; + buffer[..8].copy_from_slice(&bytes[..]); + // Then the f64 value just to be able to read it back + let bytes = f.to_be_bytes(); + buffer[8..16].copy_from_slice(&bytes[..]); + + Some(Cow::Owned(buffer.to_vec())) + } +} diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs index e07e47c79..702dcf661 100644 --- a/milli/src/heed_codec/mod.rs +++ b/milli/src/heed_codec/mod.rs @@ -1,12 +1,17 @@ mod beu32_str_codec; +mod byte_slice_ref; pub mod facet; mod field_id_word_count_codec; mod obkv_codec; mod roaring_bitmap; mod roaring_bitmap_length; mod str_beu32_codec; +mod str_ref; mod str_str_u8_codec; +pub use byte_slice_ref::ByteSliceRefCodec; +pub use str_ref::StrRefCodec; + pub use self::beu32_str_codec::BEU32StrCodec; pub use self::field_id_word_count_codec::FieldIdWordCountCodec; pub use self::obkv_codec::ObkvCodec; diff --git a/milli/src/heed_codec/str_ref.rs b/milli/src/heed_codec/str_ref.rs new file mode 100644 index 000000000..ced5cc65e --- /dev/null +++ b/milli/src/heed_codec/str_ref.rs @@ -0,0 +1,22 @@ +use std::borrow::Cow; + +use heed::{BytesDecode, BytesEncode}; + +/// A codec for values of type `&str`. Unlike `Str`, its `EItem` and `DItem` associated +/// types are equivalent (= `&'a str`) and these values can reside within another structure. 
+pub struct StrRefCodec; +impl<'a> BytesEncode<'a> for StrRefCodec { + type EItem = &'a str; + + fn bytes_encode(item: &'a &'a str) -> Option> { + Some(Cow::Borrowed(item.as_bytes())) + } +} +impl<'a> BytesDecode<'a> for StrRefCodec { + type DItem = &'a str; + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let s = std::str::from_utf8(bytes).ok()?; + Some(s) + } +} diff --git a/milli/src/index.rs b/milli/src/index.rs index 5a4bb57f4..8b1e4d8ff 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -12,11 +12,13 @@ use rstar::RTree; use time::OffsetDateTime; use crate::error::{InternalError, UserError}; +use crate::facet::FacetType; use crate::fields_ids_map::FieldsIdsMap; use crate::heed_codec::facet::{ - FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, - FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, FieldIdCodec, + FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, + FieldIdCodec, OrderedF64Codec, }; +use crate::heed_codec::StrRefCodec; use crate::{ default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, @@ -123,10 +125,10 @@ pub struct Index { /// Maps the facet field id and the docids for which this field exists pub facet_id_exists_docids: Database, - /// Maps the facet field id, level and the number with the docids that corresponds to it. - pub facet_id_f64_docids: Database, - /// Maps the facet field id and the string with the original string and docids that corresponds to it. - pub facet_id_string_docids: Database, + /// Maps the facet field id and ranges of numbers with the docids that corresponds to them. + pub facet_id_f64_docids: Database, FacetGroupValueCodec>, + /// Maps the facet field id and ranges of strings with the docids that corresponds to them. 
+ pub facet_id_string_docids: Database, FacetGroupValueCodec>, /// Maps the document id, the facet field id and the numbers. pub field_id_docid_facet_f64s: Database, @@ -775,68 +777,38 @@ impl Index { /* faceted documents ids */ - /// Writes the documents ids that are faceted with numbers under this field id. - pub(crate) fn put_number_faceted_documents_ids( + /// Writes the documents ids that are faceted under this field id for the given facet type. + pub fn put_faceted_documents_ids( &self, wtxn: &mut RwTxn, field_id: FieldId, + facet_type: FacetType, docids: &RoaringBitmap, ) -> heed::Result<()> { - let mut buffer = - [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + size_of::()]; - buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] - .copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); - buffer[main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()..] - .copy_from_slice(&field_id.to_be_bytes()); + let key = match facet_type { + FacetType::String => main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX, + FacetType::Number => main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX, + }; + let mut buffer = vec![0u8; key.len() + size_of::()]; + buffer[..key.len()].copy_from_slice(key.as_bytes()); + buffer[key.len()..].copy_from_slice(&field_id.to_be_bytes()); self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids) } - /// Retrieve all the documents ids that faceted with numbers under this field id. - pub fn number_faceted_documents_ids( + /// Retrieve all the documents ids that are faceted under this field id for the given facet type. 
+ pub fn faceted_documents_ids( &self, rtxn: &RoTxn, field_id: FieldId, + facet_type: FacetType, ) -> heed::Result { - let mut buffer = - [0u8; main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len() + size_of::()]; - buffer[..main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()] - .copy_from_slice(main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); - buffer[main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX.len()..] - .copy_from_slice(&field_id.to_be_bytes()); - match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? { - Some(docids) => Ok(docids), - None => Ok(RoaringBitmap::new()), - } - } - - /// Writes the documents ids that are faceted with strings under this field id. - pub(crate) fn put_string_faceted_documents_ids( - &self, - wtxn: &mut RwTxn, - field_id: FieldId, - docids: &RoaringBitmap, - ) -> heed::Result<()> { - let mut buffer = - [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + size_of::()]; - buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] - .copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); - buffer[main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()..] - .copy_from_slice(&field_id.to_be_bytes()); - self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids) - } - - /// Retrieve all the documents ids that faceted with strings under this field id. - pub fn string_faceted_documents_ids( - &self, - rtxn: &RoTxn, - field_id: FieldId, - ) -> heed::Result { - let mut buffer = - [0u8; main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len() + size_of::()]; - buffer[..main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()] - .copy_from_slice(main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.as_bytes()); - buffer[main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX.len()..] 
- .copy_from_slice(&field_id.to_be_bytes()); + let key = match facet_type { + FacetType::String => main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX, + FacetType::Number => main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX, + }; + let mut buffer = vec![0u8; key.len() + size_of::()]; + buffer[..key.len()].copy_from_slice(key.as_bytes()); + buffer[key.len()..].copy_from_slice(&field_id.to_be_bytes()); match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? { Some(docids) => Ok(docids), None => Ok(RoaringBitmap::new()), diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 6fb83922a..28f048b8a 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -1,3 +1,4 @@ +#![cfg_attr(all(test, fuzzing), feature(no_coverage))] #![allow(clippy::reversed_empty_ranges)] #![allow(clippy::too_many_arguments)] #[macro_use] diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index bf015c5fc..92c73709b 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -6,8 +6,11 @@ use ordered_float::OrderedFloat; use roaring::RoaringBitmap; use super::{Criterion, CriterionParameters, CriterionResult}; +use crate::facet::FacetType; +use crate::heed_codec::facet::FacetGroupKeyCodec; +use crate::heed_codec::ByteSliceRefCodec; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; -use crate::search::facet::{FacetNumberIter, FacetStringIter}; +use crate::search::facet::{ascending_facet_sort, descending_facet_sort}; use crate::search::query_tree::Operation; use crate::{FieldId, Index, Result}; @@ -59,8 +62,10 @@ impl<'t> AscDesc<'t> { let field_id = fields_ids_map.id(&field_name); let faceted_candidates = match field_id { Some(field_id) => { - let number_faceted = index.number_faceted_documents_ids(rtxn, field_id)?; - let string_faceted = index.string_faceted_documents_ids(rtxn, field_id)?; + let number_faceted = + index.faceted_documents_ids(rtxn, field_id, FacetType::Number)?; + let string_faceted 
= + index.faceted_documents_ids(rtxn, field_id, FacetType::String)?; number_faceted | string_faceted } None => RoaringBitmap::default(), @@ -186,21 +191,21 @@ fn facet_ordered<'t>( iterative_facet_string_ordered_iter(index, rtxn, field_id, is_ascending, candidates)?; Ok(Box::new(number_iter.chain(string_iter).map(Ok)) as Box>) } else { - let facet_number_fn = if is_ascending { - FacetNumberIter::new_reducing - } else { - FacetNumberIter::new_reverse_reducing - }; - let number_iter = facet_number_fn(rtxn, index, field_id, candidates.clone())? - .map(|res| res.map(|(_, docids)| docids)); + let make_iter = if is_ascending { ascending_facet_sort } else { descending_facet_sort }; - let facet_string_fn = if is_ascending { - FacetStringIter::new_reducing - } else { - FacetStringIter::new_reverse_reducing - }; - let string_iter = facet_string_fn(rtxn, index, field_id, candidates)? - .map(|res| res.map(|(_, _, docids)| docids)); + let number_iter = make_iter( + rtxn, + index.facet_id_f64_docids.remap_key_type::>(), + field_id, + candidates.clone(), + )?; + + let string_iter = make_iter( + rtxn, + index.facet_id_string_docids.remap_key_type::>(), + field_id, + candidates, + )?; Ok(Box::new(number_iter.chain(string_iter))) } diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index 33e7b4975..1725346be 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -6,7 +6,7 @@ use roaring::RoaringBitmap; use super::{Distinct, DocIter}; use crate::error::InternalError; -use crate::heed_codec::facet::*; +use crate::heed_codec::facet::{FacetGroupKey, *}; use crate::index::db_name; use crate::{DocumentId, FieldId, Index, Result}; @@ -47,13 +47,16 @@ impl<'a> FacetDistinctIter<'a> { fn facet_string_docids(&self, key: &str) -> heed::Result> { self.index .facet_id_string_docids - .get(self.txn, &(self.distinct, key)) - .map(|result| result.map(|(_original, docids)| docids)) + 
.get(self.txn, &FacetGroupKey { field_id: self.distinct, level: 0, left_bound: key }) + .map(|opt| opt.map(|v| v.bitmap)) } fn facet_number_docids(&self, key: f64) -> heed::Result> { // get facet docids on level 0 - self.index.facet_id_f64_docids.get(self.txn, &(self.distinct, 0, key, key)) + self.index + .facet_id_f64_docids + .get(self.txn, &FacetGroupKey { field_id: self.distinct, level: 0, left_bound: key }) + .map(|opt| opt.map(|v| v.bitmap)) } fn distinct_string(&mut self, id: DocumentId) -> Result<()> { diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 47e4088fe..43367abbb 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -1,16 +1,19 @@ use std::collections::{BTreeMap, HashSet}; -use std::ops::Bound::Unbounded; +use std::ops::ControlFlow; use std::{fmt, mem}; use heed::types::ByteSlice; +use heed::BytesDecode; use roaring::RoaringBitmap; use crate::error::UserError; use crate::facet::FacetType; use crate::heed_codec::facet::{ - FacetStringLevelZeroCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, + FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, + OrderedF64Codec, }; -use crate::search::facet::{FacetNumberIter, FacetNumberRange, FacetStringIter}; +use crate::heed_codec::{ByteSliceRefCodec, StrRefCodec}; +use crate::search::facet::facet_distribution_iter; use crate::{FieldId, Index, Result}; /// The default number of values by facets that will @@ -94,7 +97,7 @@ impl<'a> FacetDistribution<'a> { let mut key_buffer: Vec<_> = field_id.to_be_bytes().to_vec(); let db = self.index.field_id_docid_facet_strings; - for docid in candidates.into_iter() { + 'outer: for docid in candidates.into_iter() { key_buffer.truncate(mem::size_of::()); key_buffer.extend_from_slice(&docid.to_be_bytes()); let iter = db @@ -110,7 +113,7 @@ impl<'a> FacetDistribution<'a> { *count += 1; if 
normalized_distribution.len() == self.max_values_per_facet { - break; + break 'outer; } } } @@ -133,21 +136,23 @@ impl<'a> FacetDistribution<'a> { candidates: &RoaringBitmap, distribution: &mut BTreeMap, ) -> heed::Result<()> { - let iter = - FacetNumberIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?; - - for result in iter { - let (value, mut docids) = result?; - docids &= candidates; - if !docids.is_empty() { - distribution.insert(value.to_string(), docids.len()); - } - if distribution.len() == self.max_values_per_facet { - break; - } - } - - Ok(()) + facet_distribution_iter::iterate_over_facet_distribution( + self.rtxn, + self.index + .facet_id_f64_docids + .remap_key_type::>(), + field_id, + candidates, + |facet_key, nbr_docids, _| { + let facet_key = OrderedF64Codec::bytes_decode(facet_key).unwrap(); + distribution.insert(facet_key.to_string(), nbr_docids); + if distribution.len() == self.max_values_per_facet { + Ok(ControlFlow::Break(())) + } else { + Ok(ControlFlow::Continue(())) + } + }, + ) } fn facet_strings_distribution_from_facet_levels( @@ -156,21 +161,32 @@ impl<'a> FacetDistribution<'a> { candidates: &RoaringBitmap, distribution: &mut BTreeMap, ) -> heed::Result<()> { - let iter = - FacetStringIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?; + facet_distribution_iter::iterate_over_facet_distribution( + self.rtxn, + self.index + .facet_id_string_docids + .remap_key_type::>(), + field_id, + candidates, + |facet_key, nbr_docids, any_docid| { + let facet_key = StrRefCodec::bytes_decode(facet_key).unwrap(); - for result in iter { - let (_normalized, original, mut docids) = result?; - docids &= candidates; - if !docids.is_empty() { - distribution.insert(original.to_string(), docids.len()); - } - if distribution.len() == self.max_values_per_facet { - break; - } - } + let key: (FieldId, _, &str) = (field_id, any_docid, facet_key); + let original_string = self + .index + .field_id_docid_facet_strings + 
.get(self.rtxn, &key)? + .unwrap() + .to_owned(); - Ok(()) + distribution.insert(original_string, nbr_docids); + if distribution.len() == self.max_values_per_facet { + Ok(ControlFlow::Break(())) + } else { + Ok(ControlFlow::Continue(())) + } + }, + ) } /// Placeholder search, a.k.a. no candidates were specified. We iterate throught the @@ -182,11 +198,18 @@ impl<'a> FacetDistribution<'a> { let mut distribution = BTreeMap::new(); let db = self.index.facet_id_f64_docids; - let range = FacetNumberRange::new(self.rtxn, db, field_id, 0, Unbounded, Unbounded)?; + let mut prefix = vec![]; + prefix.extend_from_slice(&field_id.to_be_bytes()); + prefix.push(0); // read values from level 0 only - for result in range { - let ((_, _, value, _), docids) = result?; - distribution.insert(value.to_string(), docids.len()); + let iter = db + .as_polymorph() + .prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, prefix.as_slice())? + .remap_types::, FacetGroupValueCodec>(); + + for result in iter { + let (key, value) = result?; + distribution.insert(key.left_bound.to_string(), value.bitmap.len()); if distribution.len() == self.max_values_per_facet { break; } @@ -195,24 +218,24 @@ impl<'a> FacetDistribution<'a> { let iter = self .index .facet_id_string_docids - .remap_key_type::() - .prefix_iter(self.rtxn, &field_id.to_be_bytes())? - .remap_key_type::(); + .as_polymorph() + .prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, prefix.as_slice())? 
+ .remap_types::, FacetGroupValueCodec>(); - let mut normalized_distribution = BTreeMap::new(); for result in iter { - let ((_, normalized_value), (original_value, docids)) = result?; - normalized_distribution.insert(normalized_value, (original_value, docids.len())); - if normalized_distribution.len() == self.max_values_per_facet { + let (key, value) = result?; + + let docid = value.bitmap.iter().next().unwrap(); + let key: (FieldId, _, &'a str) = (field_id, docid, key.left_bound); + let original_string = + self.index.field_id_docid_facet_strings.get(self.rtxn, &key)?.unwrap().to_owned(); + + distribution.insert(original_string, value.bitmap.len()); + if distribution.len() == self.max_values_per_facet { break; } } - let iter = normalized_distribution - .into_iter() - .map(|(_normalized, (original, count))| (original.to_string(), count)); - distribution.extend(iter); - Ok(distribution) } @@ -301,3 +324,216 @@ impl fmt::Debug for FacetDistribution<'_> { .finish() } } + +#[cfg(test)] +mod tests { + use big_s::S; + use maplit::hashset; + + use crate::documents::documents_batch_reader_from_objects; + use crate::index::tests::TempIndex; + use crate::{milli_snap, FacetDistribution}; + + #[test] + fn few_candidates_few_facet_values() { + // All the tests here avoid using the code in `facet_distribution_iter` because there aren't + // enough candidates. + + let mut index = TempIndex::new(); + index.index_documents_config.autogenerate_docids = true; + + index + .update_settings(|settings| settings.set_filterable_fields(hashset! 
{ S("colour") })) + .unwrap(); + + let documents = documents!([ + { "colour": "Blue" }, + { "colour": " blue" }, + { "colour": "RED" } + ]); + + index.add_documents(documents).unwrap(); + + let txn = index.read_txn().unwrap(); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2, "RED": 1}}"###); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .candidates([0, 1, 2].iter().copied().collect()) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2, "RED": 1}}"###); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .candidates([1, 2].iter().copied().collect()) + .execute() + .unwrap(); + + // I think it would be fine if " blue" was "Blue" instead. + // We just need to get any non-normalised string I think, even if it's not in + // the candidates + milli_snap!(format!("{map:?}"), @r###"{"colour": {" blue": 1, "RED": 1}}"###); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .candidates([2].iter().copied().collect()) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), @r###"{"colour": {"RED": 1}}"###); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .candidates([0, 1, 2].iter().copied().collect()) + .max_values_per_facet(1) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 1}}"###); + } + + #[test] + fn many_candidates_few_facet_values() { + let mut index = TempIndex::new_with_map_size(4096 * 10_000); + index.index_documents_config.autogenerate_docids = true; + + index + .update_settings(|settings| settings.set_filterable_fields(hashset! 
{ S("colour") })) + .unwrap(); + + let facet_values = ["Red", "RED", " red ", "Blue", "BLUE"]; + + let mut documents = vec![]; + for i in 0..10_000 { + let document = serde_json::json!({ + "colour": facet_values[i % 5], + }) + .as_object() + .unwrap() + .clone(); + documents.push(document); + } + + let documents = documents_batch_reader_from_objects(documents); + + index.add_documents(documents).unwrap(); + + let txn = index.read_txn().unwrap(); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 4000, "Red": 6000}}"###); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .max_values_per_facet(1) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 4000}}"###); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .candidates((0..10_000).into_iter().collect()) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 4000, "Red": 6000}}"###); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .candidates((0..5_000).into_iter().collect()) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2000, "Red": 3000}}"###); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .candidates((0..5_000).into_iter().collect()) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2000, "Red": 3000}}"###); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .candidates((0..5_000).into_iter().collect()) + .max_values_per_facet(1) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), @r###"{"colour": {"Blue": 2000}}"###); + } + + #[test] + fn many_candidates_many_facet_values() { + let mut index = TempIndex::new_with_map_size(4096 * 
10_000); + index.index_documents_config.autogenerate_docids = true; + + index + .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") })) + .unwrap(); + + let facet_values = (0..1000).into_iter().map(|x| format!("{x:x}")).collect::>(); + + let mut documents = vec![]; + for i in 0..10_000 { + let document = serde_json::json!({ + "colour": facet_values[i % 1000], + }) + .as_object() + .unwrap() + .clone(); + documents.push(document); + } + + let documents = documents_batch_reader_from_objects(documents); + + index.add_documents(documents).unwrap(); + + let txn = index.read_txn().unwrap(); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), "no_candidates", @"ac9229ed5964d893af96a7076e2f8af5"); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .max_values_per_facet(2) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), "no_candidates_with_max_2", @r###"{"colour": {"0": 10, "1": 10}}"###); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .candidates((0..10_000).into_iter().collect()) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), "candidates_0_10_000", @"ac9229ed5964d893af96a7076e2f8af5"); + + let map = FacetDistribution::new(&txn, &index) + .facets(std::iter::once("colour")) + .candidates((0..5_000).into_iter().collect()) + .execute() + .unwrap(); + + milli_snap!(format!("{map:?}"), "candidates_0_5_000", @"825f23a4090d05756f46176987b7d992"); + } +} diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs new file mode 100644 index 000000000..9cd85b667 --- /dev/null +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -0,0 +1,196 @@ +use std::ops::ControlFlow; + +use heed::Result; +use roaring::RoaringBitmap; + +use super::{get_first_facet_value, get_highest_level}; +use 
crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::ByteSliceRefCodec; +use crate::DocumentId; + +/// Call the given closure on the facet distribution of the candidate documents. +/// +/// The arguments to the closure are: +/// - the facet value, as a byte slice +/// - the number of documents among the candidates that contain this facet value +/// - the id of a document which contains the facet value. Note that this document +/// is not necessarily from the list of candidates, it is simply *any* document which +/// contains this facet value. +/// +/// The return value of the closure is a `ControlFlow<()>` which indicates whether we should +/// keep iterating over the different facet values or stop. +pub fn iterate_over_facet_distribution<'t, CB>( + rtxn: &'t heed::RoTxn<'t>, + db: heed::Database, FacetGroupValueCodec>, + field_id: u16, + candidates: &RoaringBitmap, + callback: CB, +) -> Result<()> +where + CB: FnMut(&'t [u8], u64, DocumentId) -> Result>, +{ + let mut fd = FacetDistribution { rtxn, db, field_id, callback }; + let highest_level = get_highest_level( + rtxn, + db.remap_key_type::>(), + field_id, + )?; + + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? 
{ + fd.iterate(candidates, highest_level, first_bound, usize::MAX)?; + return Ok(()); + } else { + return Ok(()); + } +} + +struct FacetDistribution<'t, CB> +where + CB: FnMut(&'t [u8], u64, DocumentId) -> Result>, +{ + rtxn: &'t heed::RoTxn<'t>, + db: heed::Database, FacetGroupValueCodec>, + field_id: u16, + callback: CB, +} + +impl<'t, CB> FacetDistribution<'t, CB> +where + CB: FnMut(&'t [u8], u64, DocumentId) -> Result>, +{ + fn iterate_level_0( + &mut self, + candidates: &RoaringBitmap, + starting_bound: &'t [u8], + group_size: usize, + ) -> Result> { + let starting_key = + FacetGroupKey { field_id: self.field_id, level: 0, left_bound: starting_bound }; + let iter = self.db.range(self.rtxn, &(starting_key..))?.take(group_size); + for el in iter { + let (key, value) = el?; + // The range is unbounded on the right and the group size for the highest level is MAX, + // so we need to check that we are not iterating over the next field id + if key.field_id != self.field_id { + return Ok(ControlFlow::Break(())); + } + let docids_in_common = value.bitmap & candidates; + if !docids_in_common.is_empty() { + let any_docid_in_common = docids_in_common.min().unwrap(); + match (self.callback)(key.left_bound, docids_in_common.len(), any_docid_in_common)? 
+ { + ControlFlow::Continue(_) => (), + ControlFlow::Break(_) => return Ok(ControlFlow::Break(())), + } + } + } + return Ok(ControlFlow::Continue(())); + } + fn iterate( + &mut self, + candidates: &RoaringBitmap, + level: u8, + starting_bound: &'t [u8], + group_size: usize, + ) -> Result> { + if level == 0 { + return self.iterate_level_0(candidates, starting_bound, group_size); + } + let starting_key = + FacetGroupKey { field_id: self.field_id, level, left_bound: starting_bound }; + let iter = self.db.range(&self.rtxn, &(&starting_key..)).unwrap().take(group_size); + + for el in iter { + let (key, value) = el.unwrap(); + // The range is unbounded on the right and the group size for the highest level is MAX, + // so we need to check that we are not iterating over the next field id + if key.field_id != self.field_id { + return Ok(ControlFlow::Break(())); + } + let docids_in_common = value.bitmap & candidates; + if docids_in_common.len() > 0 { + let cf = self.iterate( + &docids_in_common, + level - 1, + key.left_bound, + value.size as usize, + )?; + match cf { + ControlFlow::Continue(_) => {} + ControlFlow::Break(_) => return Ok(ControlFlow::Break(())), + } + } + } + + return Ok(ControlFlow::Continue(())); + } +} + +#[cfg(test)] +mod tests { + use std::ops::ControlFlow; + + use heed::BytesDecode; + use roaring::RoaringBitmap; + + use super::iterate_over_facet_distribution; + use crate::heed_codec::facet::OrderedF64Codec; + use crate::milli_snap; + use crate::search::facet::tests::{get_random_looking_index, get_simple_index}; + + #[test] + fn filter_distribution_all() { + let indexes = [get_simple_index(), get_random_looking_index()]; + for (i, index) in indexes.iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = (0..=255).into_iter().collect::(); + let mut results = String::new(); + iterate_over_facet_distribution( + &txn, + index.content, + 0, + &candidates, + |facet, count, _| { + let facet = 
OrderedF64Codec::bytes_decode(facet).unwrap(); + results.push_str(&format!("{facet}: {count}\n")); + Ok(ControlFlow::Continue(())) + }, + ) + .unwrap(); + milli_snap!(results, i); + + txn.commit().unwrap(); + } + } + #[test] + fn filter_distribution_all_stop_early() { + let indexes = [get_simple_index(), get_random_looking_index()]; + for (i, index) in indexes.iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = (0..=255).into_iter().collect::(); + let mut results = String::new(); + let mut nbr_facets = 0; + iterate_over_facet_distribution( + &txn, + index.content, + 0, + &candidates, + |facet, count, _| { + let facet = OrderedF64Codec::bytes_decode(facet).unwrap(); + if nbr_facets == 100 { + return Ok(ControlFlow::Break(())); + } else { + nbr_facets += 1; + results.push_str(&format!("{facet}: {count}\n")); + + Ok(ControlFlow::Continue(())) + } + }, + ) + .unwrap(); + milli_snap!(results, i); + + txn.commit().unwrap(); + } + } +} diff --git a/milli/src/search/facet/facet_number.rs b/milli/src/search/facet/facet_number.rs deleted file mode 100644 index 02390aac1..000000000 --- a/milli/src/search/facet/facet_number.rs +++ /dev/null @@ -1,248 +0,0 @@ -use std::ops::Bound::{self, Excluded, Included, Unbounded}; - -use either::Either::{self, Left, Right}; -use heed::types::{ByteSlice, DecodeIgnore}; -use heed::{Database, LazyDecode, RoRange, RoRevRange}; -use roaring::RoaringBitmap; - -use crate::heed_codec::facet::FacetLevelValueF64Codec; -use crate::heed_codec::CboRoaringBitmapCodec; -use crate::{FieldId, Index}; - -pub struct FacetNumberRange<'t> { - iter: RoRange<'t, FacetLevelValueF64Codec, LazyDecode>, - end: Bound, -} - -impl<'t> FacetNumberRange<'t> { - pub fn new( - rtxn: &'t heed::RoTxn, - db: Database, - field_id: FieldId, - level: u8, - left: Bound, - right: Bound, - ) -> heed::Result> { - let left_bound = match left { - Included(left) => Included((field_id, level, left, f64::MIN)), - Excluded(left) => Excluded((field_id, level, 
left, f64::MIN)), - Unbounded => Included((field_id, level, f64::MIN, f64::MIN)), - }; - let right_bound = Included((field_id, level, f64::MAX, f64::MAX)); - let iter = db.lazily_decode_data().range(rtxn, &(left_bound, right_bound))?; - Ok(FacetNumberRange { iter, end: right }) - } -} - -impl<'t> Iterator for FacetNumberRange<'t> { - type Item = heed::Result<((FieldId, u8, f64, f64), RoaringBitmap)>; - - fn next(&mut self) -> Option { - match self.iter.next() { - Some(Ok(((fid, level, left, right), docids))) => { - let must_be_returned = match self.end { - Included(end) => right <= end, - Excluded(end) => right < end, - Unbounded => true, - }; - if must_be_returned { - match docids.decode() { - Ok(docids) => Some(Ok(((fid, level, left, right), docids))), - Err(e) => Some(Err(e)), - } - } else { - None - } - } - Some(Err(e)) => Some(Err(e)), - None => None, - } - } -} - -pub struct FacetNumberRevRange<'t> { - iter: RoRevRange<'t, FacetLevelValueF64Codec, LazyDecode>, - end: Bound, -} - -impl<'t> FacetNumberRevRange<'t> { - pub fn new( - rtxn: &'t heed::RoTxn, - db: Database, - field_id: FieldId, - level: u8, - left: Bound, - right: Bound, - ) -> heed::Result> { - let left_bound = match left { - Included(left) => Included((field_id, level, left, f64::MIN)), - Excluded(left) => Excluded((field_id, level, left, f64::MIN)), - Unbounded => Included((field_id, level, f64::MIN, f64::MIN)), - }; - let right_bound = Included((field_id, level, f64::MAX, f64::MAX)); - let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?; - Ok(FacetNumberRevRange { iter, end: right }) - } -} - -impl<'t> Iterator for FacetNumberRevRange<'t> { - type Item = heed::Result<((FieldId, u8, f64, f64), RoaringBitmap)>; - - fn next(&mut self) -> Option { - loop { - match self.iter.next() { - Some(Ok(((fid, level, left, right), docids))) => { - let must_be_returned = match self.end { - Included(end) => right <= end, - Excluded(end) => right < end, - Unbounded => true, - }; - if 
must_be_returned { - match docids.decode() { - Ok(docids) => return Some(Ok(((fid, level, left, right), docids))), - Err(e) => return Some(Err(e)), - } - } - continue; - } - Some(Err(e)) => return Some(Err(e)), - None => return None, - } - } - } -} - -pub struct FacetNumberIter<'t> { - rtxn: &'t heed::RoTxn<'t>, - db: Database, - field_id: FieldId, - level_iters: Vec<(RoaringBitmap, Either, FacetNumberRevRange<'t>>)>, - must_reduce: bool, -} - -impl<'t> FacetNumberIter<'t> { - /// Create a `FacetNumberIter` that will iterate on the different facet entries - /// (facet value + documents ids) and that will reduce the given documents ids - /// while iterating on the different facet levels. - pub fn new_reducing( - rtxn: &'t heed::RoTxn, - index: &'t Index, - field_id: FieldId, - documents_ids: RoaringBitmap, - ) -> heed::Result> { - let db = index.facet_id_f64_docids.remap_key_type::(); - let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - let highest_iter = - FacetNumberRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; - let level_iters = vec![(documents_ids, Left(highest_iter))]; - Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: true }) - } - - /// Create a `FacetNumberIter` that will iterate on the different facet entries in reverse - /// (facet value + documents ids) and that will reduce the given documents ids - /// while iterating on the different facet levels. 
- pub fn new_reverse_reducing( - rtxn: &'t heed::RoTxn, - index: &'t Index, - field_id: FieldId, - documents_ids: RoaringBitmap, - ) -> heed::Result> { - let db = index.facet_id_f64_docids; - let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - let highest_iter = - FacetNumberRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; - let level_iters = vec![(documents_ids, Right(highest_iter))]; - Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: true }) - } - - /// Create a `FacetNumberIter` that will iterate on the different facet entries - /// (facet value + documents ids) and that will not reduce the given documents ids - /// while iterating on the different facet levels, possibly returning multiple times - /// a document id associated with multiple facet values. - pub fn new_non_reducing( - rtxn: &'t heed::RoTxn, - index: &'t Index, - field_id: FieldId, - documents_ids: RoaringBitmap, - ) -> heed::Result> { - let db = index.facet_id_f64_docids.remap_key_type::(); - let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - let highest_iter = - FacetNumberRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; - let level_iters = vec![(documents_ids, Left(highest_iter))]; - Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: false }) - } - - fn highest_level( - rtxn: &'t heed::RoTxn, - db: Database, - fid: FieldId, - ) -> heed::Result> { - let level = db - .remap_types::() - .prefix_iter(rtxn, &fid.to_be_bytes())? - .remap_key_type::() - .last() - .transpose()? 
- .map(|((_, level, _, _), _)| level); - Ok(level) - } -} - -impl<'t> Iterator for FacetNumberIter<'t> { - type Item = heed::Result<(f64, RoaringBitmap)>; - - fn next(&mut self) -> Option { - 'outer: loop { - let (documents_ids, last) = self.level_iters.last_mut()?; - let is_ascending = last.is_left(); - for result in last { - // If the last iterator must find an empty set of documents it means - // that we found all the documents in the sub level iterations already, - // we can pop this level iterator. - if documents_ids.is_empty() { - break; - } - - match result { - Ok(((_fid, level, left, right), mut docids)) => { - docids &= &*documents_ids; - if !docids.is_empty() { - if self.must_reduce { - *documents_ids -= &docids; - } - - if level == 0 { - return Some(Ok((left, docids))); - } - - let rtxn = self.rtxn; - let db = self.db; - let fid = self.field_id; - let left = Included(left); - let right = Included(right); - - let result = if is_ascending { - FacetNumberRange::new(rtxn, db, fid, level - 1, left, right) - .map(Left) - } else { - FacetNumberRevRange::new(rtxn, db, fid, level - 1, left, right) - .map(Right) - }; - - match result { - Ok(iter) => { - self.level_iters.push((docids, iter)); - continue 'outer; - } - Err(e) => return Some(Err(e)), - } - } - } - Err(e) => return Some(Err(e)), - } - } - self.level_iters.pop(); - } - } -} diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs new file mode 100644 index 000000000..07300e920 --- /dev/null +++ b/milli/src/search/facet/facet_range_search.rs @@ -0,0 +1,487 @@ +use std::ops::{Bound, RangeBounds}; + +use heed::BytesEncode; +use roaring::RoaringBitmap; + +use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; +use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::ByteSliceRefCodec; +use crate::Result; + +/// Find all the document ids for which the given field contains a value 
contained within +/// the two bounds. +pub fn find_docids_of_facet_within_bounds<'t, BoundCodec>( + rtxn: &'t heed::RoTxn<'t>, + db: heed::Database, FacetGroupValueCodec>, + field_id: u16, + left: &'t Bound<>::EItem>, + right: &'t Bound<>::EItem>, + docids: &mut RoaringBitmap, +) -> Result<()> +where + BoundCodec: for<'a> BytesEncode<'a>, + for<'a> >::EItem: Sized, +{ + let inner; + let left = match left { + Bound::Included(left) => { + inner = BoundCodec::bytes_encode(left).ok_or(heed::Error::Encoding)?; + Bound::Included(inner.as_ref()) + } + Bound::Excluded(left) => { + inner = BoundCodec::bytes_encode(left).ok_or(heed::Error::Encoding)?; + Bound::Excluded(inner.as_ref()) + } + Bound::Unbounded => Bound::Unbounded, + }; + let inner; + let right = match right { + Bound::Included(right) => { + inner = BoundCodec::bytes_encode(right).ok_or(heed::Error::Encoding)?; + Bound::Included(inner.as_ref()) + } + Bound::Excluded(right) => { + inner = BoundCodec::bytes_encode(right).ok_or(heed::Error::Encoding)?; + Bound::Excluded(inner.as_ref()) + } + Bound::Unbounded => Bound::Unbounded, + }; + let db = db.remap_key_type::>(); + let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, docids }; + let highest_level = get_highest_level(rtxn, db, field_id)?; + + if let Some(starting_left_bound) = + get_first_facet_value::(rtxn, db, field_id)? 
+ { + let rightmost_bound = Bound::Included( + get_last_facet_value::(rtxn, db, field_id)?.unwrap(), + ); // will not fail because get_first_facet_value succeeded + let group_size = usize::MAX; + f.run(highest_level, starting_left_bound, rightmost_bound, group_size)?; + Ok(()) + } else { + return Ok(()); + } +} + +/// Fetch the document ids that have a facet with a value between the two given bounds +struct FacetRangeSearch<'t, 'b, 'bitmap> { + rtxn: &'t heed::RoTxn<'t>, + db: heed::Database, FacetGroupValueCodec>, + field_id: u16, + left: Bound<&'b [u8]>, + right: Bound<&'b [u8]>, + docids: &'bitmap mut RoaringBitmap, +} +impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { + fn run_level_0(&mut self, starting_left_bound: &'t [u8], group_size: usize) -> Result<()> { + let left_key = + FacetGroupKey { field_id: self.field_id, level: 0, left_bound: starting_left_bound }; + let iter = self.db.range(&self.rtxn, &(left_key..))?.take(group_size); + for el in iter { + let (key, value) = el?; + // the right side of the iter range is unbounded, so we need to make sure that we are not iterating + // on the next field id + if key.field_id != self.field_id { + return Ok(()); + } + let should_skip = { + match self.left { + Bound::Included(left) => left > key.left_bound, + Bound::Excluded(left) => left >= key.left_bound, + Bound::Unbounded => false, + } + }; + if should_skip { + continue; + } + let should_stop = { + match self.right { + Bound::Included(right) => right < key.left_bound, + Bound::Excluded(right) => right <= key.left_bound, + Bound::Unbounded => false, + } + }; + if should_stop { + break; + } + + if RangeBounds::<&[u8]>::contains(&(self.left, self.right), &key.left_bound) { + *self.docids |= value.bitmap; + } + } + Ok(()) + } + + /// Recursive part of the algorithm for level > 0. + /// + /// It works by visiting a slice of a level and checking whether the range asscociated + /// with each visited element is contained within the bounds. + /// + /// 1. 
So long as the element's range is less than the left bound, we do nothing and keep iterating + /// 2. If the element's range is fully contained by the bounds, then all of its docids are added to + /// the roaring bitmap. + /// 3. If the element's range merely intersects the bounds, then we call the algorithm recursively + /// on the children of the element from the level below. + /// 4. If the element's range is greater than the right bound, we do nothing and stop iterating. + /// Note that the right bound is found through either the `left_bound` of the *next* element, + /// or from the `rightmost_bound` argument + /// + /// ## Arguments + /// - `level`: the level being visited + /// - `starting_left_bound`: the left_bound of the first element to visit + /// - `rightmost_bound`: the right bound of the last element that should be visited + /// - `group_size`: the number of elements that should be visited + fn run( + &mut self, + level: u8, + starting_left_bound: &'t [u8], + rightmost_bound: Bound<&'t [u8]>, + group_size: usize, + ) -> Result<()> { + if level == 0 { + return self.run_level_0(starting_left_bound, group_size); + } + + let left_key = + FacetGroupKey { field_id: self.field_id, level, left_bound: starting_left_bound }; + let mut iter = self.db.range(&self.rtxn, &(left_key..))?.take(group_size); + + // We iterate over the range while keeping in memory the previous value + let (mut previous_key, mut previous_value) = iter.next().unwrap()?; + for el in iter { + let (next_key, next_value) = el?; + // the right of the iter range is potentially unbounded (e.g. if `group_size` is usize::MAX), + // so we need to make sure that we are not iterating on the next field id + if next_key.field_id != self.field_id { + break; + } + // now, do we skip, stop, or visit? 
+ let should_skip = { + match self.left { + Bound::Included(left) => left >= next_key.left_bound, + Bound::Excluded(left) => left >= next_key.left_bound, + Bound::Unbounded => false, + } + }; + if should_skip { + previous_key = next_key; + previous_value = next_value; + continue; + } + + // should we stop? + let should_stop = { + match self.right { + Bound::Included(right) => right < previous_key.left_bound, + Bound::Excluded(right) => right <= previous_key.left_bound, + Bound::Unbounded => false, + } + }; + if should_stop { + return Ok(()); + } + // should we take the whole thing, without recursing down? + let should_take_whole_group = { + let left_condition = match self.left { + Bound::Included(left) => previous_key.left_bound >= left, + Bound::Excluded(left) => previous_key.left_bound > left, + Bound::Unbounded => true, + }; + let right_condition = match self.right { + Bound::Included(right) => next_key.left_bound <= right, + Bound::Excluded(right) => next_key.left_bound <= right, + Bound::Unbounded => true, + }; + left_condition && right_condition + }; + if should_take_whole_group { + *self.docids |= &previous_value.bitmap; + previous_key = next_key; + previous_value = next_value; + continue; + } + // from here, we should visit the children of the previous element and + // call the function recursively + + let level = level - 1; + let starting_left_bound = previous_key.left_bound; + let rightmost_bound = Bound::Excluded(next_key.left_bound); + let group_size = previous_value.size as usize; + + self.run(level, starting_left_bound, rightmost_bound, group_size)?; + + previous_key = next_key; + previous_value = next_value; + } + // previous_key/previous_value are the last element's key/value + + // now, do we skip, stop, or visit? 
+ let should_skip = { + match (self.left, rightmost_bound) { + (Bound::Included(left), Bound::Included(right)) => left > right, + (Bound::Included(left), Bound::Excluded(right)) => left >= right, + (Bound::Excluded(left), Bound::Included(right) | Bound::Excluded(right)) => { + left >= right + } + (Bound::Unbounded, _) => false, + (_, Bound::Unbounded) => false, // should never run? + } + }; + if should_skip { + return Ok(()); + } + + // should we stop? + let should_stop = { + match self.right { + Bound::Included(right) => right <= previous_key.left_bound, + Bound::Excluded(right) => right < previous_key.left_bound, + Bound::Unbounded => false, + } + }; + if should_stop { + return Ok(()); + } + // should we take the whole thing, without recursing down? + let should_take_whole_group = { + let left_condition = match self.left { + Bound::Included(left) => previous_key.left_bound >= left, + Bound::Excluded(left) => previous_key.left_bound > left, + Bound::Unbounded => true, + }; + let right_condition = match (self.right, rightmost_bound) { + (Bound::Included(right), Bound::Included(rightmost)) => { + // we need to stay within the bound ..=right + // the element's range goes to ..=righmost + // so the element fits entirely within the bound if rightmost <= right + rightmost <= right + } + (Bound::Included(right), Bound::Excluded(rightmost)) => { + // we need to stay within the bound ..=right + // the element's range goes to ..righmost + // so the element fits entirely within the bound if rightmost <= right + rightmost <= right + } + (Bound::Excluded(right), Bound::Included(rightmost)) => { + // we need to stay within the bound ..right + // the element's range goes to ..=righmost + // so the element fits entirely within the bound if rightmost < right + rightmost < right + } + (Bound::Excluded(right), Bound::Excluded(rightmost)) => { + // we need to stay within the bound ..right + // the element's range goes to ..righmost + // so the element fits entirely within the bound 
if rightmost <= right + rightmost <= right + } + (Bound::Unbounded, _) => { + // we need to stay within the bound ..inf + // so the element always fits entirely within the bound + true + } + (_, Bound::Unbounded) => { + // we need to stay within a finite bound + // but the element's range goes to ..inf + // so the element never fits entirely within the bound + false + } + }; + left_condition && right_condition + }; + if should_take_whole_group { + *self.docids |= &previous_value.bitmap; + } else { + let level = level - 1; + let starting_left_bound = previous_key.left_bound; + let group_size = previous_value.size as usize; + + self.run(level, starting_left_bound, rightmost_bound, group_size)?; + } + + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use std::ops::Bound; + + use roaring::RoaringBitmap; + + use super::find_docids_of_facet_within_bounds; + use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec}; + use crate::milli_snap; + use crate::search::facet::tests::{ + get_random_looking_index, get_random_looking_index_with_multiple_field_ids, + get_simple_index, get_simple_index_with_multiple_field_ids, + }; + use crate::snapshot_tests::display_bitmap; + + #[test] + fn random_looking_index_snap() { + let index = get_random_looking_index(); + milli_snap!(format!("{index}")); + } + #[test] + fn filter_range_increasing() { + let indexes = [ + get_simple_index(), + get_random_looking_index(), + get_simple_index_with_multiple_field_ids(), + get_random_looking_index_with_multiple_field_ids(), + ]; + for (i, index) in indexes.iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let mut results = String::new(); + for i in 0..=255 { + let i = i as f64; + let start = Bound::Included(0.); + let end = Bound::Included(i); + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds::( + &txn, + index.content.remap_key_type::>(), + 0, + &start, + &end, + &mut docids, + ) + .unwrap(); + results.push_str(&format!("{}\n", 
display_bitmap(&docids))); + } + milli_snap!(results, format!("included_{i}")); + let mut results = String::new(); + for i in 0..=255 { + let i = i as f64; + let start = Bound::Excluded(0.); + let end = Bound::Excluded(i); + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds::( + &txn, + index.content.remap_key_type::>(), + 0, + &start, + &end, + &mut docids, + ) + .unwrap(); + results.push_str(&format!("{}\n", display_bitmap(&docids))); + } + milli_snap!(results, format!("excluded_{i}")); + txn.commit().unwrap(); + } + } + #[test] + fn filter_range_decreasing() { + let indexes = [ + get_simple_index(), + get_random_looking_index(), + get_simple_index_with_multiple_field_ids(), + get_random_looking_index_with_multiple_field_ids(), + ]; + for (i, index) in indexes.iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + + let mut results = String::new(); + + for i in (0..=255).into_iter().rev() { + let i = i as f64; + let start = Bound::Included(i); + let end = Bound::Included(255.); + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds::( + &txn, + index.content.remap_key_type::>(), + 0, + &start, + &end, + &mut docids, + ) + .unwrap(); + results.push_str(&format!("{}\n", display_bitmap(&docids))); + } + + milli_snap!(results, format!("included_{i}")); + + let mut results = String::new(); + + for i in (0..=255).into_iter().rev() { + let i = i as f64; + let start = Bound::Excluded(i); + let end = Bound::Excluded(255.); + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds::( + &txn, + index.content.remap_key_type::>(), + 0, + &start, + &end, + &mut docids, + ) + .unwrap(); + results.push_str(&format!("{}\n", display_bitmap(&docids))); + } + + milli_snap!(results, format!("excluded_{i}")); + + txn.commit().unwrap(); + } + } + #[test] + fn filter_range_pinch() { + let indexes = [ + get_simple_index(), + get_random_looking_index(), + get_simple_index_with_multiple_field_ids(), + 
get_random_looking_index_with_multiple_field_ids(), + ]; + for (i, index) in indexes.iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + + let mut results = String::new(); + + for i in (0..=128).into_iter().rev() { + let i = i as f64; + let start = Bound::Included(i); + let end = Bound::Included(255. - i); + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds::( + &txn, + index.content.remap_key_type::>(), + 0, + &start, + &end, + &mut docids, + ) + .unwrap(); + results.push_str(&format!("{}\n", display_bitmap(&docids))); + } + + milli_snap!(results, format!("included_{i}")); + + let mut results = String::new(); + + for i in (0..=128).into_iter().rev() { + let i = i as f64; + let start = Bound::Excluded(i); + let end = Bound::Excluded(255. - i); + let mut docids = RoaringBitmap::new(); + find_docids_of_facet_within_bounds::( + &txn, + index.content.remap_key_type::>(), + 0, + &start, + &end, + &mut docids, + ) + .unwrap(); + results.push_str(&format!("{}\n", display_bitmap(&docids))); + } + + milli_snap!(results, format!("excluded_{i}")); + + txn.commit().unwrap(); + } + } +} diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs new file mode 100644 index 000000000..2f1f73db3 --- /dev/null +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -0,0 +1,136 @@ +use heed::Result; +use roaring::RoaringBitmap; + +use super::{get_first_facet_value, get_highest_level}; +use crate::heed_codec::facet::{ + FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, +}; +use crate::heed_codec::ByteSliceRefCodec; + +/// Return an iterator which iterates over the given candidate documents in +/// ascending order of their facet value for the given field id. +/// +/// The documents returned by the iterator are grouped by the facet values that +/// determined their rank. 
For example, given the documents: +/// +/// ```ignore +/// 0: { "colour": ["blue", "green"] } +/// 1: { "colour": ["blue", "red"] } +/// 2: { "colour": ["orange", "red"] } +/// 3: { "colour": ["green", "red"] } +/// 4: { "colour": ["blue", "orange", "red"] } +/// ``` +/// Then calling the function on the candidates `[0, 2, 3, 4]` will return an iterator +/// over the following elements: +/// ```ignore +/// [0, 4] // corresponds to all the documents within the candidates that have the facet value "blue" +/// [3] // same for "green" +/// [2] // same for "orange" +/// END +/// ``` +/// Note that once a document id is returned by the iterator, it is never returned again. +pub fn ascending_facet_sort<'t>( + rtxn: &'t heed::RoTxn<'t>, + db: heed::Database, FacetGroupValueCodec>, + field_id: u16, + candidates: RoaringBitmap, +) -> Result> + 't>> { + let highest_level = get_highest_level(rtxn, db, field_id)?; + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; + let iter = db.range(rtxn, &(first_key..)).unwrap().take(usize::MAX); + + Ok(Box::new(AscendingFacetSort { rtxn, db, field_id, stack: vec![(candidates, iter)] })) + } else { + Ok(Box::new(std::iter::empty())) + } +} + +struct AscendingFacetSort<'t, 'e> { + rtxn: &'t heed::RoTxn<'e>, + db: heed::Database, FacetGroupValueCodec>, + field_id: u16, + stack: Vec<( + RoaringBitmap, + std::iter::Take< + heed::RoRange<'t, FacetGroupKeyCodec, FacetGroupValueCodec>, + >, + )>, +} + +impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> { + type Item = Result; + + fn next(&mut self) -> Option { + 'outer: loop { + let (documents_ids, deepest_iter) = self.stack.last_mut()?; + for result in deepest_iter { + let ( + FacetGroupKey { level, left_bound, field_id }, + FacetGroupValue { size: group_size, mut bitmap }, + ) = result.unwrap(); + // The range is unbounded on the right and the group size for the highest level 
is MAX, + // so we need to check that we are not iterating over the next field id + if field_id != self.field_id { + return None; + } + + // If the last iterator found an empty set of documents it means + // that we found all the documents in the sub level iterations already, + // we can pop this level iterator. + if documents_ids.is_empty() { + break; + } + + bitmap &= &*documents_ids; + if !bitmap.is_empty() { + *documents_ids -= &bitmap; + + if level == 0 { + return Some(Ok(bitmap)); + } + let starting_key_below = + FacetGroupKey { field_id: self.field_id, level: level - 1, left_bound }; + let iter = match self.db.range(&self.rtxn, &(starting_key_below..)) { + Ok(iter) => iter, + Err(e) => return Some(Err(e.into())), + } + .take(group_size as usize); + + self.stack.push((bitmap, iter)); + continue 'outer; + } + } + self.stack.pop(); + } + } +} + +#[cfg(test)] +mod tests { + use roaring::RoaringBitmap; + + use crate::milli_snap; + use crate::search::facet::facet_sort_ascending::ascending_facet_sort; + use crate::search::facet::tests::{get_random_looking_index, get_simple_index}; + use crate::snapshot_tests::display_bitmap; + + #[test] + fn filter_sort() { + let indexes = [get_simple_index(), get_random_looking_index()]; + for (i, index) in indexes.iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = (200..=300).into_iter().collect::(); + let mut results = String::new(); + let iter = ascending_facet_sort(&txn, index.content, 0, candidates).unwrap(); + for el in iter { + let docids = el.unwrap(); + results.push_str(&display_bitmap(&docids)); + results.push('\n'); + } + milli_snap!(results, i); + + txn.commit().unwrap(); + } + } +} diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs new file mode 100644 index 000000000..5f09d708b --- /dev/null +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -0,0 +1,151 @@ +use std::ops::Bound; + +use heed::Result; +use 
roaring::RoaringBitmap; + +use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; +use crate::heed_codec::facet::{ + FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, +}; +use crate::heed_codec::ByteSliceRefCodec; + +/// See documentation for [`ascending_facet_sort`](super::ascending_facet_sort). +/// +/// This function does the same thing, but in the opposite order. +pub fn descending_facet_sort<'t>( + rtxn: &'t heed::RoTxn<'t>, + db: heed::Database, FacetGroupValueCodec>, + field_id: u16, + candidates: RoaringBitmap, +) -> Result> + 't>> { + let highest_level = get_highest_level(rtxn, db, field_id)?; + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; + let last_bound = get_last_facet_value::(rtxn, db, field_id)?.unwrap(); + let last_key = FacetGroupKey { field_id, level: highest_level, left_bound: last_bound }; + let iter = db.rev_range(rtxn, &(first_key..=last_key))?.take(usize::MAX); + Ok(Box::new(DescendingFacetSort { + rtxn, + db, + field_id, + stack: vec![(candidates, iter, Bound::Included(last_bound))], + })) + } else { + Ok(Box::new(std::iter::empty())) + } +} + +struct DescendingFacetSort<'t> { + rtxn: &'t heed::RoTxn<'t>, + db: heed::Database, FacetGroupValueCodec>, + field_id: u16, + stack: Vec<( + RoaringBitmap, + std::iter::Take< + heed::RoRevRange<'t, FacetGroupKeyCodec, FacetGroupValueCodec>, + >, + Bound<&'t [u8]>, + )>, +} + +impl<'t> Iterator for DescendingFacetSort<'t> { + type Item = Result; + + fn next(&mut self) -> Option { + 'outer: loop { + let (documents_ids, deepest_iter, right_bound) = self.stack.last_mut()?; + while let Some(result) = deepest_iter.next() { + let ( + FacetGroupKey { level, left_bound, field_id }, + FacetGroupValue { size: group_size, mut bitmap }, + ) = result.unwrap(); + // The range is unbounded on the right and the group size for the highest level is MAX, 
+ // so we need to check that we are not iterating over the next field id + if field_id != self.field_id { + return None; + } + // If the last iterator found an empty set of documents it means + // that we found all the documents in the sub level iterations already, + // we can pop this level iterator. + if documents_ids.is_empty() { + break; + } + + bitmap &= &*documents_ids; + if !bitmap.is_empty() { + *documents_ids -= &bitmap; + + if level == 0 { + return Some(Ok(bitmap)); + } + let starting_key_below = + FacetGroupKey { field_id, level: level - 1, left_bound }; + + let end_key_kelow = match *right_bound { + Bound::Included(right) => Bound::Included(FacetGroupKey { + field_id, + level: level - 1, + left_bound: right, + }), + Bound::Excluded(right) => Bound::Excluded(FacetGroupKey { + field_id, + level: level - 1, + left_bound: right, + }), + Bound::Unbounded => Bound::Unbounded, + }; + let prev_right_bound = *right_bound; + *right_bound = Bound::Excluded(left_bound); + let iter = match self + .db + .remap_key_type::>() + .rev_range( + &self.rtxn, + &(Bound::Included(starting_key_below), end_key_kelow), + ) { + Ok(iter) => iter, + Err(e) => return Some(Err(e.into())), + } + .take(group_size as usize); + + self.stack.push((bitmap, iter, prev_right_bound)); + continue 'outer; + } + *right_bound = Bound::Excluded(left_bound); + } + self.stack.pop(); + } + } +} + +#[cfg(test)] +mod tests { + use roaring::RoaringBitmap; + + use crate::heed_codec::facet::FacetGroupKeyCodec; + use crate::heed_codec::ByteSliceRefCodec; + use crate::milli_snap; + use crate::search::facet::facet_sort_descending::descending_facet_sort; + use crate::search::facet::tests::{get_random_looking_index, get_simple_index}; + use crate::snapshot_tests::display_bitmap; + + #[test] + fn filter_sort_descending() { + let indexes = [get_simple_index(), get_random_looking_index()]; + for (i, index) in indexes.iter().enumerate() { + let txn = index.env.read_txn().unwrap(); + let candidates = 
(200..=300).into_iter().collect::(); + let mut results = String::new(); + let db = index.content.remap_key_type::>(); + let iter = descending_facet_sort(&txn, db, 0, candidates).unwrap(); + for el in iter { + let docids = el.unwrap(); + results.push_str(&display_bitmap(&docids)); + results.push('\n'); + } + milli_snap!(results, i); + + txn.commit().unwrap(); + } + } +} diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs deleted file mode 100644 index c55430cf1..000000000 --- a/milli/src/search/facet/facet_string.rs +++ /dev/null @@ -1,652 +0,0 @@ -//! This module contains helpers iterators for facet strings. -//! -//! The purpose is to help iterate over the quite complex system of facets strings. A simple -//! description of the system would be that every facet string value is stored into an LMDB database -//! and that every value is associated with the document ids which are associated with this facet -//! string value. -//! -//! In reality it is a little bit more complex as we have to create aggregations of runs of facet -//! string values, those aggregations helps in choosing the right groups of facets to follow. -//! -//! ## A typical algorithm run -//! -//! If a group of aggregated facets values contains one of the documents ids, we must continue -//! iterating over the sub-groups. -//! -//! If this group is the lowest level and contain at least one document id we yield the associated -//! facet documents ids. -//! -//! If the group doesn't contain one of our documents ids, we continue to the next group at this -//! same level. -//! -//! ## The complexity comes from the strings -//! -//! This algorithm is exactly the one that we use for facet numbers. It is quite easy to create -//! aggregated facet number, groups of facets are easy to define in the LMDB key, we just put the -//! two numbers bounds, the left and the right bound of the group, both inclusive. -//! -//! 
It is easy to make sure that the groups are ordered, LMDB sort its keys lexicographically and -//! puting two numbers big-endian encoded one after the other gives us ordered groups. The values -//! are simple unions of the documents ids coming from the groups below. -//! -//! ### Example of what a facet number LMDB database contain -//! -//! | level | left-bound | right-bound | documents ids | -//! |-------|------------|-------------|------------------| -//! | 0 | 0 | _skipped_ | 1, 2 | -//! | 0 | 1 | _skipped_ | 6, 7 | -//! | 0 | 3 | _skipped_ | 4, 7 | -//! | 0 | 5 | _skipped_ | 2, 3, 4 | -//! | 1 | 0 | 1 | 1, 2, 6, 7 | -//! | 1 | 3 | 5 | 2, 3, 4, 7 | -//! | 2 | 0 | 5 | 1, 2, 3, 4, 6, 7 | -//! -//! As you can see the level 0 have two equal bounds, therefore we skip serializing the second -//! bound, that's the base level where you can directly fetch the documents ids associated with an -//! exact number. -//! -//! The next levels have two different bounds and the associated documents ids are simply the result -//! of an union of all the documents ids associated with the aggregated groups above. -//! -//! ## The complexity of defining groups for facet strings -//! -//! As explained above, defining groups of facet numbers is easy, LMDB stores the keys in -//! lexicographical order, it means that whatever the key represent the bytes are read in their raw -//! form and a simple `strcmp` will define the order in which keys will be read from the store. -//! -//! That's easy for types with a known size, like floats or integers, they are 64 bytes long and -//! appending one after the other in big-endian is consistent. LMDB will simply sort the keys by the -//! first number then by the second if the the first number is equal on two keys. -//! -//! For strings it is a lot more complex as those types are unsized, it means that the size of facet -//! strings is different for each facet value. -//! -//! ### Basic approach: padding the keys -//! -//! 
A first approach would be to simply define the maximum size of a facet string and pad the keys -//! with zeroes. The big problem of this approach is that it: -//! 1. reduces the maximum size of facet strings by half, as we need to put two keys one after the -//! other. -//! 2. makes the keys of facet strings very big (approximately 250 bytes), impacting a lot LMDB -//! performances. -//! -//! ### Better approach: number the facet groups -//! -//! A better approach would be to number the groups, this way we don't have the downsides of the -//! previously described approach but we need to be able to describe the groups by using a number. -//! -//! #### Example of facet strings with numbered groups -//! -//! | level | left-bound | right-bound | left-string | right-string | documents ids | -//! |-------|------------|-------------|-------------|--------------|------------------| -//! | 0 | alpha | _skipped_ | _skipped_ | _skipped_ | 1, 2 | -//! | 0 | beta | _skipped_ | _skipped_ | _skipped_ | 6, 7 | -//! | 0 | gamma | _skipped_ | _skipped_ | _skipped_ | 4, 7 | -//! | 0 | omega | _skipped_ | _skipped_ | _skipped_ | 2, 3, 4 | -//! | 1 | 0 | 1 | alpha | beta | 1, 2, 6, 7 | -//! | 1 | 2 | 3 | gamma | omega | 2, 3, 4, 7 | -//! | 2 | 0 | 3 | _skipped_ | _skipped_ | 1, 2, 3, 4, 6, 7 | -//! -//! As you can see the level 0 doesn't actually change much, we skip nearly everything, we do not -//! need to store the facet string value two times. -//! -//! The number in the left-bound and right-bound columns are incremental numbers representing the -//! level 0 strings, .i.e. alpha is 0, beta is 1. Those numbers are just here to keep the ordering -//! of the LMDB keys. -//! -//! In the value, not in the key, you can see that we added two new values: the left-string and the -//! right-string, which defines the original facet strings associated with the given group. -//! -//! We put those two strings inside of the value, this way we do not limit the maximum size of the -//! 
facet string values, and the impact on performances is not important as, IIRC, LMDB put big -//! values on another page, this helps in iterating over keys fast enough and only fetch the page -//! with the values when required. -//! -//! The other little advantage with this solution is that there is no a big overhead, compared with -//! the facet number levels, we only duplicate the facet strings once for the level 1. -//! -//! #### A typical algorithm run -//! -//! Note that the algorithm is always moving from the highest level to the lowest one, one level -//! by one level, this is why it is ok to only store the facets string on the level 1. -//! -//! If a group of aggregated facets values, a group with numbers contains one of the documents ids, -//! we must continue iterating over the sub-groups. To do so: -//! - If we are at a level >= 2, we just do the same as with the facet numbers, get both bounds -//! and iterate over the facet groups defined by these numbers over the current level - 1. -//! - If we are at level 1, we retrieve both keys, the left-string and right-string, from the -//! value and just do the same as with the facet numbers but with strings: iterate over the -//! current level - 1 with both keys. -//! -//! If this group is the lowest level (level 0) and contain at least one document id we yield the -//! associated facet documents ids. -//! -//! If the group doesn't contain one of our documents ids, we continue to the next group at this -//! same level. -//! 
- -use std::num::NonZeroU8; -use std::ops::Bound; -use std::ops::Bound::{Excluded, Included, Unbounded}; - -use either::{Either, Left, Right}; -use heed::types::{ByteSlice, DecodeIgnore}; -use heed::{Database, LazyDecode, RoRange, RoRevRange}; -use roaring::RoaringBitmap; - -use crate::heed_codec::facet::{ - FacetLevelValueU32Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, - FacetStringZeroBoundsValueCodec, -}; -use crate::heed_codec::CboRoaringBitmapCodec; -use crate::{FieldId, Index}; - -/// An iterator that is used to explore the facets level strings -/// from the level 1 to infinity. -/// -/// It yields the level, group id that an entry covers, the optional group strings -/// that it covers of the level 0 only if it is an entry from the level 1 and -/// the roaring bitmap associated. -pub struct FacetStringGroupRange<'t> { - iter: RoRange< - 't, - FacetLevelValueU32Codec, - LazyDecode>, - >, - end: Bound, -} - -impl<'t> FacetStringGroupRange<'t> { - pub fn new( - rtxn: &'t heed::RoTxn, - db: Database, - field_id: FieldId, - level: NonZeroU8, - left: Bound, - right: Bound, - ) -> heed::Result> { - let db = db.remap_types::< - FacetLevelValueU32Codec, - FacetStringZeroBoundsValueCodec, - >(); - let left_bound = match left { - Included(left) => Included((field_id, level, left, u32::MIN)), - Excluded(left) => Excluded((field_id, level, left, u32::MIN)), - Unbounded => Included((field_id, level, u32::MIN, u32::MIN)), - }; - let right_bound = Included((field_id, level, u32::MAX, u32::MAX)); - let iter = db.lazily_decode_data().range(rtxn, &(left_bound, right_bound))?; - Ok(FacetStringGroupRange { iter, end: right }) - } -} - -impl<'t> Iterator for FacetStringGroupRange<'t> { - type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>; - - fn next(&mut self) -> Option { - match self.iter.next() { - Some(Ok(((_fid, level, left, right), docids))) => { - let must_be_returned = match self.end { - Included(end) => 
right <= end, - Excluded(end) => right < end, - Unbounded => true, - }; - if must_be_returned { - match docids.decode() { - Ok((bounds, docids)) => Some(Ok(((level, left, right), (bounds, docids)))), - Err(e) => Some(Err(e)), - } - } else { - None - } - } - Some(Err(e)) => Some(Err(e)), - None => None, - } - } -} - -pub struct FacetStringGroupRevRange<'t> { - iter: RoRevRange< - 't, - FacetLevelValueU32Codec, - LazyDecode>, - >, - end: Bound, -} - -impl<'t> FacetStringGroupRevRange<'t> { - pub fn new( - rtxn: &'t heed::RoTxn, - db: Database, - field_id: FieldId, - level: NonZeroU8, - left: Bound, - right: Bound, - ) -> heed::Result> { - let db = db.remap_types::< - FacetLevelValueU32Codec, - FacetStringZeroBoundsValueCodec, - >(); - let left_bound = match left { - Included(left) => Included((field_id, level, left, u32::MIN)), - Excluded(left) => Excluded((field_id, level, left, u32::MIN)), - Unbounded => Included((field_id, level, u32::MIN, u32::MIN)), - }; - let right_bound = Included((field_id, level, u32::MAX, u32::MAX)); - let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?; - Ok(FacetStringGroupRevRange { iter, end: right }) - } -} - -impl<'t> Iterator for FacetStringGroupRevRange<'t> { - type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>; - - fn next(&mut self) -> Option { - loop { - match self.iter.next() { - Some(Ok(((_fid, level, left, right), docids))) => { - let must_be_returned = match self.end { - Included(end) => right <= end, - Excluded(end) => right < end, - Unbounded => true, - }; - if must_be_returned { - match docids.decode() { - Ok((bounds, docids)) => { - return Some(Ok(((level, left, right), (bounds, docids)))) - } - Err(e) => return Some(Err(e)), - } - } - continue; - } - Some(Err(e)) => return Some(Err(e)), - None => return None, - } - } - } -} - -/// An iterator that is used to explore the level 0 of the facets string database. 
-/// -/// It yields the facet string and the roaring bitmap associated with it. -pub struct FacetStringLevelZeroRange<'t> { - iter: RoRange<'t, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>, -} - -impl<'t> FacetStringLevelZeroRange<'t> { - pub fn new( - rtxn: &'t heed::RoTxn, - db: Database, - field_id: FieldId, - left: Bound<&str>, - right: Bound<&str>, - ) -> heed::Result> { - fn encode_value<'a>(buffer: &'a mut Vec, field_id: FieldId, value: &str) -> &'a [u8] { - buffer.extend_from_slice(&field_id.to_be_bytes()); - buffer.push(0); - buffer.extend_from_slice(value.as_bytes()); - &buffer[..] - } - - let mut left_buffer = Vec::new(); - let left_bound = match left { - Included(value) => Included(encode_value(&mut left_buffer, field_id, value)), - Excluded(value) => Excluded(encode_value(&mut left_buffer, field_id, value)), - Unbounded => { - left_buffer.extend_from_slice(&field_id.to_be_bytes()); - left_buffer.push(0); - Included(&left_buffer[..]) - } - }; - - let mut right_buffer = Vec::new(); - let right_bound = match right { - Included(value) => Included(encode_value(&mut right_buffer, field_id, value)), - Excluded(value) => Excluded(encode_value(&mut right_buffer, field_id, value)), - Unbounded => { - right_buffer.extend_from_slice(&field_id.to_be_bytes()); - right_buffer.push(1); // we must only get the level 0 - Excluded(&right_buffer[..]) - } - }; - - let iter = db - .remap_key_type::() - .range(rtxn, &(left_bound, right_bound))? 
- .remap_types::(); - - Ok(FacetStringLevelZeroRange { iter }) - } -} - -impl<'t> Iterator for FacetStringLevelZeroRange<'t> { - type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; - - fn next(&mut self) -> Option { - match self.iter.next() { - Some(Ok(((_fid, normalized), (original, docids)))) => { - Some(Ok((normalized, original, docids))) - } - Some(Err(e)) => Some(Err(e)), - None => None, - } - } -} - -pub struct FacetStringLevelZeroRevRange<'t> { - iter: RoRevRange<'t, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>, -} - -impl<'t> FacetStringLevelZeroRevRange<'t> { - pub fn new( - rtxn: &'t heed::RoTxn, - db: Database, - field_id: FieldId, - left: Bound<&str>, - right: Bound<&str>, - ) -> heed::Result> { - fn encode_value<'a>(buffer: &'a mut Vec, field_id: FieldId, value: &str) -> &'a [u8] { - buffer.extend_from_slice(&field_id.to_be_bytes()); - buffer.push(0); - buffer.extend_from_slice(value.as_bytes()); - &buffer[..] - } - - let mut left_buffer = Vec::new(); - let left_bound = match left { - Included(value) => Included(encode_value(&mut left_buffer, field_id, value)), - Excluded(value) => Excluded(encode_value(&mut left_buffer, field_id, value)), - Unbounded => { - left_buffer.extend_from_slice(&field_id.to_be_bytes()); - left_buffer.push(0); - Included(&left_buffer[..]) - } - }; - - let mut right_buffer = Vec::new(); - let right_bound = match right { - Included(value) => Included(encode_value(&mut right_buffer, field_id, value)), - Excluded(value) => Excluded(encode_value(&mut right_buffer, field_id, value)), - Unbounded => { - right_buffer.extend_from_slice(&field_id.to_be_bytes()); - right_buffer.push(1); // we must only get the level 0 - Excluded(&right_buffer[..]) - } - }; - - let iter = db - .remap_key_type::() - .rev_range(rtxn, &(left_bound, right_bound))? 
- .remap_types::(); - - Ok(FacetStringLevelZeroRevRange { iter }) - } -} - -impl<'t> Iterator for FacetStringLevelZeroRevRange<'t> { - type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; - - fn next(&mut self) -> Option { - match self.iter.next() { - Some(Ok(((_fid, normalized), (original, docids)))) => { - Some(Ok((normalized, original, docids))) - } - Some(Err(e)) => Some(Err(e)), - None => None, - } - } -} - -type EitherStringRange<'t> = Either, FacetStringLevelZeroRange<'t>>; -type EitherStringRevRange<'t> = - Either, FacetStringLevelZeroRevRange<'t>>; - -/// An iterator that is used to explore the facet strings level by level, -/// it will only return facets strings that are associated with the -/// candidates documents ids given. -pub struct FacetStringIter<'t> { - rtxn: &'t heed::RoTxn<'t>, - db: Database, - field_id: FieldId, - level_iters: Vec<(RoaringBitmap, Either, EitherStringRevRange<'t>>)>, - must_reduce: bool, -} - -impl<'t> FacetStringIter<'t> { - pub fn new_reducing( - rtxn: &'t heed::RoTxn, - index: &'t Index, - field_id: FieldId, - documents_ids: RoaringBitmap, - ) -> heed::Result> { - let db = index.facet_id_string_docids.remap_types::(); - let highest_iter = Self::highest_iter(rtxn, index, db, field_id)?; - Ok(FacetStringIter { - rtxn, - db, - field_id, - level_iters: vec![(documents_ids, Left(highest_iter))], - must_reduce: true, - }) - } - - pub fn new_reverse_reducing( - rtxn: &'t heed::RoTxn, - index: &'t Index, - field_id: FieldId, - documents_ids: RoaringBitmap, - ) -> heed::Result> { - let db = index.facet_id_string_docids.remap_types::(); - let highest_reverse_iter = Self::highest_reverse_iter(rtxn, index, db, field_id)?; - Ok(FacetStringIter { - rtxn, - db, - field_id, - level_iters: vec![(documents_ids, Right(highest_reverse_iter))], - must_reduce: true, - }) - } - - pub fn new_non_reducing( - rtxn: &'t heed::RoTxn, - index: &'t Index, - field_id: FieldId, - documents_ids: RoaringBitmap, - ) -> heed::Result> { - let db = 
index.facet_id_string_docids.remap_types::(); - let highest_iter = Self::highest_iter(rtxn, index, db, field_id)?; - Ok(FacetStringIter { - rtxn, - db, - field_id, - level_iters: vec![(documents_ids, Left(highest_iter))], - must_reduce: false, - }) - } - - fn highest_level( - rtxn: &'t heed::RoTxn, - db: Database, - fid: FieldId, - ) -> heed::Result> { - Ok(db - .remap_types::() - .prefix_iter(rtxn, &fid.to_be_bytes())? // the field id is the first two bits - .last() - .transpose()? - .map(|(key_bytes, _)| key_bytes[2])) // the level is the third bit - } - - fn highest_iter( - rtxn: &'t heed::RoTxn, - index: &'t Index, - db: Database, - field_id: FieldId, - ) -> heed::Result, FacetStringLevelZeroRange<'t>>> { - let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - match NonZeroU8::new(highest_level) { - Some(highest_level) => FacetStringGroupRange::new( - rtxn, - index.facet_id_string_docids, - field_id, - highest_level, - Unbounded, - Unbounded, - ) - .map(Left), - None => FacetStringLevelZeroRange::new( - rtxn, - index.facet_id_string_docids, - field_id, - Unbounded, - Unbounded, - ) - .map(Right), - } - } - - fn highest_reverse_iter( - rtxn: &'t heed::RoTxn, - index: &'t Index, - db: Database, - field_id: FieldId, - ) -> heed::Result, FacetStringLevelZeroRevRange<'t>>> { - let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - match NonZeroU8::new(highest_level) { - Some(highest_level) => FacetStringGroupRevRange::new( - rtxn, - index.facet_id_string_docids, - field_id, - highest_level, - Unbounded, - Unbounded, - ) - .map(Left), - None => FacetStringLevelZeroRevRange::new( - rtxn, - index.facet_id_string_docids, - field_id, - Unbounded, - Unbounded, - ) - .map(Right), - } - } -} - -impl<'t> Iterator for FacetStringIter<'t> { - type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; - - fn next(&mut self) -> Option { - 'outer: loop { - let (documents_ids, last) = self.level_iters.last_mut()?; - let is_ascending = 
last.is_left(); - - // We remap the different iterator types to make - // the algorithm less complex to understand. - let last = match last { - Left(ascending) => match ascending { - Left(group) => Left(Left(group)), - Right(zero_level) => Right(Left(zero_level)), - }, - Right(descending) => match descending { - Left(group) => Left(Right(group)), - Right(zero_level) => Right(Right(zero_level)), - }, - }; - - match last { - Left(group) => { - for result in group { - match result { - Ok(((level, left, right), (string_bounds, mut docids))) => { - docids &= &*documents_ids; - if !docids.is_empty() { - if self.must_reduce { - *documents_ids -= &docids; - } - - let result = if is_ascending { - match string_bounds { - Some((left, right)) => FacetStringLevelZeroRange::new( - self.rtxn, - self.db, - self.field_id, - Included(left), - Included(right), - ) - .map(Right), - None => FacetStringGroupRange::new( - self.rtxn, - self.db, - self.field_id, - NonZeroU8::new(level.get() - 1).unwrap(), - Included(left), - Included(right), - ) - .map(Left), - } - .map(Left) - } else { - match string_bounds { - Some((left, right)) => { - FacetStringLevelZeroRevRange::new( - self.rtxn, - self.db, - self.field_id, - Included(left), - Included(right), - ) - .map(Right) - } - None => FacetStringGroupRevRange::new( - self.rtxn, - self.db, - self.field_id, - NonZeroU8::new(level.get() - 1).unwrap(), - Included(left), - Included(right), - ) - .map(Left), - } - .map(Right) - }; - - match result { - Ok(iter) => { - self.level_iters.push((docids, iter)); - continue 'outer; - } - Err(e) => return Some(Err(e)), - } - } - } - Err(e) => return Some(Err(e)), - } - } - } - Right(zero_level) => { - // level zero only - for result in zero_level { - match result { - Ok((normalized, original, mut docids)) => { - docids &= &*documents_ids; - if !docids.is_empty() { - if self.must_reduce { - *documents_ids -= &docids; - } - return Some(Ok((normalized, original, docids))); - } - } - Err(e) => return 
Some(Err(e)), - } - } - } - } - - self.level_iters.pop(); - } - } -} diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 1d8fcd389..1dc01566e 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -5,15 +5,14 @@ use std::ops::Bound::{self, Excluded, Included}; use either::Either; pub use filter_parser::{Condition, Error as FPError, FilterCondition, Span, Token}; use heed::types::DecodeIgnore; -use log::debug; use roaring::RoaringBitmap; -use super::FacetNumberRange; +use super::facet_range_search; use crate::error::{Error, UserError}; -use crate::heed_codec::facet::FacetLevelValueF64Codec; -use crate::{ - distance_between_two_points, lat_lng_to_xyz, CboRoaringBitmapCodec, FieldId, Index, Result, +use crate::heed_codec::facet::{ + FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec, OrderedF64Codec, }; +use crate::{distance_between_two_points, lat_lng_to_xyz, FieldId, Index, Result}; /// The maximum number of filters the filter AST can process. const MAX_FILTER_DEPTH: usize = 2000; @@ -145,112 +144,14 @@ impl<'a> Filter<'a> { } impl<'a> Filter<'a> { - /// Aggregates the documents ids that are part of the specified range automatically - /// going deeper through the levels. - fn explore_facet_number_levels( - rtxn: &heed::RoTxn, - db: heed::Database, - field_id: FieldId, - level: u8, - left: Bound, - right: Bound, - output: &mut RoaringBitmap, - ) -> Result<()> { - match (left, right) { - // If the request is an exact value we must go directly to the deepest level. 
- (Included(l), Included(r)) if l == r && level > 0 => { - return Self::explore_facet_number_levels( - rtxn, db, field_id, 0, left, right, output, - ); - } - // lower TO upper when lower > upper must return no result - (Included(l), Included(r)) if l > r => return Ok(()), - (Included(l), Excluded(r)) if l >= r => return Ok(()), - (Excluded(l), Excluded(r)) if l >= r => return Ok(()), - (Excluded(l), Included(r)) if l >= r => return Ok(()), - (_, _) => (), - } + pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result { + // to avoid doing this for each recursive call we're going to do it ONCE ahead of time + let soft_deleted_documents = index.soft_deleted_documents_ids(rtxn)?; + let filterable_fields = index.filterable_fields(rtxn)?; - let mut left_found = None; - let mut right_found = None; - - // We must create a custom iterator to be able to iterate over the - // requested range as the range iterator cannot express some conditions. - let iter = FacetNumberRange::new(rtxn, db, field_id, level, left, right)?; - - debug!("Iterating between {:?} and {:?} (level {})", left, right, level); - - for (i, result) in iter.enumerate() { - let ((_fid, level, l, r), docids) = result?; - debug!("{:?} to {:?} (level {}) found {} documents", l, r, level, docids.len()); - *output |= docids; - // We save the leftest and rightest bounds we actually found at this level. - if i == 0 { - left_found = Some(l); - } - right_found = Some(r); - } - - // Can we go deeper? - let deeper_level = match level.checked_sub(1) { - Some(level) => level, - None => return Ok(()), - }; - - // We must refine the left and right bounds of this range by retrieving the - // missing part in a deeper level. - match left_found.zip(right_found) { - Some((left_found, right_found)) => { - // If the bound is satisfied we avoid calling this function again. 
- if !matches!(left, Included(l) if l == left_found) { - let sub_right = Excluded(left_found); - debug!( - "calling left with {:?} to {:?} (level {})", - left, sub_right, deeper_level - ); - Self::explore_facet_number_levels( - rtxn, - db, - field_id, - deeper_level, - left, - sub_right, - output, - )?; - } - if !matches!(right, Included(r) if r == right_found) { - let sub_left = Excluded(right_found); - debug!( - "calling right with {:?} to {:?} (level {})", - sub_left, right, deeper_level - ); - Self::explore_facet_number_levels( - rtxn, - db, - field_id, - deeper_level, - sub_left, - right, - output, - )?; - } - } - None => { - // If we found nothing at this level it means that we must find - // the same bounds but at a deeper, more precise level. - Self::explore_facet_number_levels( - rtxn, - db, - field_id, - deeper_level, - left, - right, - output, - )?; - } - } - - Ok(()) + // and finally we delete all the soft_deleted_documents, again, only once at the very end + self.inner_evaluate(rtxn, index, &filterable_fields) + .map(|result| result - soft_deleted_documents) } fn evaluate_operator( @@ -277,8 +178,16 @@ impl<'a> Filter<'a> { return Ok(exist); } Condition::Equal(val) => { - let (_original_value, string_docids) = strings_db - .get(rtxn, &(field_id, &val.value().to_lowercase()))? + let string_docids = strings_db + .get( + rtxn, + &FacetGroupKey { + field_id, + level: 0, + left_bound: &val.value().to_lowercase(), + }, + )? + .map(|v| v.bitmap) .unwrap_or_default(); let number = val.parse::().ok(); let number_docids = match number { @@ -312,8 +221,19 @@ impl<'a> Filter<'a> { // that's fine if it don't, the value just before will be returned instead. let biggest_level = numbers_db .remap_data_type::() - .get_lower_than_or_equal_to(rtxn, &(field_id, u8::MAX, f64::MAX, f64::MAX))? 
- .and_then(|((id, level, _, _), _)| if id == field_id { Some(level) } else { None }); + .get_lower_than_or_equal_to( + rtxn, + &FacetGroupKey { field_id, level: u8::MAX, left_bound: f64::MAX }, + )? + .and_then( + |(FacetGroupKey { field_id: id, level, .. }, _)| { + if id == field_id { + Some(level) + } else { + None + } + }, + ); match biggest_level { Some(level) => { @@ -333,14 +253,36 @@ impl<'a> Filter<'a> { } } - pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result { - // to avoid doing this for each recursive call we're going to do it ONCE ahead of time - let soft_deleted_documents = index.soft_deleted_documents_ids(rtxn)?; - let filterable_fields = index.filterable_fields(rtxn)?; + /// Aggregates the documents ids that are part of the specified range automatically + /// going deeper through the levels. + fn explore_facet_number_levels( + rtxn: &heed::RoTxn, + db: heed::Database, FacetGroupValueCodec>, + field_id: FieldId, + level: u8, + left: Bound, + right: Bound, + output: &mut RoaringBitmap, + ) -> Result<()> { + match (left, right) { + // If the request is an exact value we must go directly to the deepest level. 
+ (Included(l), Included(r)) if l == r && level > 0 => { + return Self::explore_facet_number_levels( + rtxn, db, field_id, 0, left, right, output, + ); + } + // lower TO upper when lower > upper must return no result + (Included(l), Included(r)) if l > r => return Ok(()), + (Included(l), Excluded(r)) if l >= r => return Ok(()), + (Excluded(l), Excluded(r)) if l >= r => return Ok(()), + (Excluded(l), Included(r)) if l >= r => return Ok(()), + (_, _) => (), + } + facet_range_search::find_docids_of_facet_within_bounds::( + rtxn, db, field_id, &left, &right, output, + )?; - // and finally we delete all the soft_deleted_documents, again, only once at the very end - self.inner_evaluate(rtxn, index, &filterable_fields) - .map(|result| result - soft_deleted_documents) + Ok(()) } fn inner_evaluate( diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index e3ac95882..ccf40d6aa 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -1,9 +1,153 @@ -pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET}; -pub use self::facet_number::{FacetNumberIter, FacetNumberRange, FacetNumberRevRange}; -pub use self::facet_string::FacetStringIter; -pub use self::filter::Filter; +pub use facet_sort_ascending::ascending_facet_sort; +pub use facet_sort_descending::descending_facet_sort; +use heed::types::{ByteSlice, DecodeIgnore}; +use heed::{BytesDecode, RoTxn}; +pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET}; +pub use self::filter::Filter; +use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::ByteSliceRefCodec; mod facet_distribution; -mod facet_number; -mod facet_string; +mod facet_distribution_iter; +mod facet_range_search; +mod facet_sort_ascending; +mod facet_sort_descending; mod filter; + +/// Get the first facet value in the facet database +pub(crate) fn get_first_facet_value<'t, BoundCodec>( + txn: &'t RoTxn, + db: heed::Database, 
FacetGroupValueCodec>, + field_id: u16, +) -> heed::Result> +where + BoundCodec: BytesDecode<'t>, +{ + let mut level0prefix = vec![]; + level0prefix.extend_from_slice(&field_id.to_be_bytes()); + level0prefix.push(0); + let mut level0_iter_forward = db + .as_polymorph() + .prefix_iter::<_, ByteSlice, DecodeIgnore>(txn, level0prefix.as_slice())?; + if let Some(first) = level0_iter_forward.next() { + let (first_key, _) = first?; + let first_key = FacetGroupKeyCodec::::bytes_decode(first_key) + .ok_or(heed::Error::Encoding)?; + Ok(Some(first_key.left_bound)) + } else { + Ok(None) + } +} + +/// Get the last facet value in the facet database +pub(crate) fn get_last_facet_value<'t, BoundCodec>( + txn: &'t RoTxn, + db: heed::Database, FacetGroupValueCodec>, + field_id: u16, +) -> heed::Result> +where + BoundCodec: BytesDecode<'t>, +{ + let mut level0prefix = vec![]; + level0prefix.extend_from_slice(&field_id.to_be_bytes()); + level0prefix.push(0); + let mut level0_iter_backward = db + .as_polymorph() + .rev_prefix_iter::<_, ByteSlice, DecodeIgnore>(txn, level0prefix.as_slice())?; + if let Some(last) = level0_iter_backward.next() { + let (last_key, _) = last?; + let last_key = FacetGroupKeyCodec::::bytes_decode(last_key) + .ok_or(heed::Error::Encoding)?; + Ok(Some(last_key.left_bound)) + } else { + Ok(None) + } +} + +/// Get the height of the highest level in the facet database +pub(crate) fn get_highest_level<'t>( + txn: &'t RoTxn<'t>, + db: heed::Database, FacetGroupValueCodec>, + field_id: u16, +) -> heed::Result { + let field_id_prefix = &field_id.to_be_bytes(); + Ok(db + .as_polymorph() + .rev_prefix_iter::<_, ByteSlice, DecodeIgnore>(&txn, field_id_prefix)? 
+ .next() + .map(|el| { + let (key, _) = el.unwrap(); + let key = FacetGroupKeyCodec::::bytes_decode(key).unwrap(); + key.level + }) + .unwrap_or(0)) +} + +#[cfg(test)] +pub(crate) mod tests { + use rand::{Rng, SeedableRng}; + use roaring::RoaringBitmap; + + use crate::heed_codec::facet::OrderedF64Codec; + use crate::update::facet::tests::FacetIndex; + + pub fn get_simple_index() -> FacetIndex { + let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + for i in 0..256u16 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + index.insert(&mut txn, 0, &(i as f64), &bitmap); + } + txn.commit().unwrap(); + index + } + pub fn get_random_looking_index() -> FacetIndex { + let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); + let keys = + std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::>(); + + for (_i, key) in keys.into_iter().enumerate() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(key); + bitmap.insert(key + 100); + index.insert(&mut txn, 0, &(key as f64), &bitmap); + } + txn.commit().unwrap(); + index + } + pub fn get_simple_index_with_multiple_field_ids() -> FacetIndex { + let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + for fid in 0..2 { + for i in 0..256u16 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + index.insert(&mut txn, fid, &(i as f64), &bitmap); + } + } + txn.commit().unwrap(); + index + } + pub fn get_random_looking_index_with_multiple_field_ids() -> FacetIndex { + let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); + let keys = + std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::>(); + for fid in 0..2 { + for (_i, &key) in keys.iter().enumerate() { + let mut bitmap = RoaringBitmap::new(); + 
bitmap.insert(key); + bitmap.insert(key + 100); + index.insert(&mut txn, fid, &(key as f64), &bitmap); + } + } + txn.commit().unwrap(); + index + } +} diff --git a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/0.snap b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/0.snap new file mode 100644 index 000000000..2b6123289 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/0.snap @@ -0,0 +1,260 @@ +--- +source: milli/src/search/facet/facet_distribution_iter.rs +--- +0: 1 +1: 1 +2: 1 +3: 1 +4: 1 +5: 1 +6: 1 +7: 1 +8: 1 +9: 1 +10: 1 +11: 1 +12: 1 +13: 1 +14: 1 +15: 1 +16: 1 +17: 1 +18: 1 +19: 1 +20: 1 +21: 1 +22: 1 +23: 1 +24: 1 +25: 1 +26: 1 +27: 1 +28: 1 +29: 1 +30: 1 +31: 1 +32: 1 +33: 1 +34: 1 +35: 1 +36: 1 +37: 1 +38: 1 +39: 1 +40: 1 +41: 1 +42: 1 +43: 1 +44: 1 +45: 1 +46: 1 +47: 1 +48: 1 +49: 1 +50: 1 +51: 1 +52: 1 +53: 1 +54: 1 +55: 1 +56: 1 +57: 1 +58: 1 +59: 1 +60: 1 +61: 1 +62: 1 +63: 1 +64: 1 +65: 1 +66: 1 +67: 1 +68: 1 +69: 1 +70: 1 +71: 1 +72: 1 +73: 1 +74: 1 +75: 1 +76: 1 +77: 1 +78: 1 +79: 1 +80: 1 +81: 1 +82: 1 +83: 1 +84: 1 +85: 1 +86: 1 +87: 1 +88: 1 +89: 1 +90: 1 +91: 1 +92: 1 +93: 1 +94: 1 +95: 1 +96: 1 +97: 1 +98: 1 +99: 1 +100: 1 +101: 1 +102: 1 +103: 1 +104: 1 +105: 1 +106: 1 +107: 1 +108: 1 +109: 1 +110: 1 +111: 1 +112: 1 +113: 1 +114: 1 +115: 1 +116: 1 +117: 1 +118: 1 +119: 1 +120: 1 +121: 1 +122: 1 +123: 1 +124: 1 +125: 1 +126: 1 +127: 1 +128: 1 +129: 1 +130: 1 +131: 1 +132: 1 +133: 1 +134: 1 +135: 1 +136: 1 +137: 1 +138: 1 +139: 1 +140: 1 +141: 1 +142: 1 +143: 1 +144: 1 +145: 1 +146: 1 +147: 1 +148: 1 +149: 1 +150: 1 +151: 1 +152: 1 +153: 1 +154: 1 +155: 1 +156: 1 +157: 1 +158: 1 +159: 1 +160: 1 +161: 1 +162: 1 +163: 1 +164: 1 +165: 1 +166: 1 +167: 1 +168: 1 +169: 1 +170: 1 +171: 1 +172: 1 +173: 1 +174: 1 +175: 1 +176: 1 +177: 1 +178: 1 +179: 1 +180: 1 +181: 1 +182: 1 +183: 1 +184: 1 +185: 1 +186: 1 +187: 1 +188: 1 
+189: 1 +190: 1 +191: 1 +192: 1 +193: 1 +194: 1 +195: 1 +196: 1 +197: 1 +198: 1 +199: 1 +200: 1 +201: 1 +202: 1 +203: 1 +204: 1 +205: 1 +206: 1 +207: 1 +208: 1 +209: 1 +210: 1 +211: 1 +212: 1 +213: 1 +214: 1 +215: 1 +216: 1 +217: 1 +218: 1 +219: 1 +220: 1 +221: 1 +222: 1 +223: 1 +224: 1 +225: 1 +226: 1 +227: 1 +228: 1 +229: 1 +230: 1 +231: 1 +232: 1 +233: 1 +234: 1 +235: 1 +236: 1 +237: 1 +238: 1 +239: 1 +240: 1 +241: 1 +242: 1 +243: 1 +244: 1 +245: 1 +246: 1 +247: 1 +248: 1 +249: 1 +250: 1 +251: 1 +252: 1 +253: 1 +254: 1 +255: 1 + diff --git a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/1.snap b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/1.snap new file mode 100644 index 000000000..d0c0dd98d --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all/1.snap @@ -0,0 +1,105 @@ +--- +source: milli/src/search/facet/facet_distribution_iter.rs +--- +3: 2 +5: 2 +6: 2 +9: 2 +10: 2 +11: 2 +14: 2 +18: 2 +19: 2 +24: 2 +26: 2 +28: 2 +29: 2 +32: 2 +33: 2 +35: 2 +36: 2 +37: 2 +38: 2 +39: 2 +41: 2 +46: 2 +47: 2 +49: 2 +52: 2 +53: 2 +55: 2 +59: 2 +61: 2 +64: 2 +68: 2 +71: 2 +74: 2 +75: 2 +76: 2 +81: 2 +83: 2 +85: 2 +86: 2 +88: 2 +90: 2 +91: 2 +92: 2 +98: 2 +99: 2 +101: 2 +102: 2 +103: 2 +107: 2 +111: 2 +115: 2 +119: 2 +123: 2 +124: 2 +130: 2 +131: 2 +133: 2 +135: 2 +136: 2 +137: 2 +139: 2 +141: 2 +143: 2 +144: 2 +147: 2 +150: 2 +156: 1 +158: 1 +160: 1 +162: 1 +163: 1 +164: 1 +167: 1 +169: 1 +173: 1 +177: 1 +178: 1 +179: 1 +181: 1 +182: 1 +186: 1 +189: 1 +192: 1 +193: 1 +195: 1 +197: 1 +205: 1 +206: 1 +207: 1 +208: 1 +209: 1 +210: 1 +216: 1 +219: 1 +220: 1 +223: 1 +226: 1 +235: 1 +236: 1 +238: 1 +243: 1 + diff --git a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all_stop_early/0.snap b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all_stop_early/0.snap new file mode 100644 index 
000000000..7170dab89 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all_stop_early/0.snap @@ -0,0 +1,104 @@ +--- +source: milli/src/search/facet/facet_distribution_iter.rs +--- +0: 1 +1: 1 +2: 1 +3: 1 +4: 1 +5: 1 +6: 1 +7: 1 +8: 1 +9: 1 +10: 1 +11: 1 +12: 1 +13: 1 +14: 1 +15: 1 +16: 1 +17: 1 +18: 1 +19: 1 +20: 1 +21: 1 +22: 1 +23: 1 +24: 1 +25: 1 +26: 1 +27: 1 +28: 1 +29: 1 +30: 1 +31: 1 +32: 1 +33: 1 +34: 1 +35: 1 +36: 1 +37: 1 +38: 1 +39: 1 +40: 1 +41: 1 +42: 1 +43: 1 +44: 1 +45: 1 +46: 1 +47: 1 +48: 1 +49: 1 +50: 1 +51: 1 +52: 1 +53: 1 +54: 1 +55: 1 +56: 1 +57: 1 +58: 1 +59: 1 +60: 1 +61: 1 +62: 1 +63: 1 +64: 1 +65: 1 +66: 1 +67: 1 +68: 1 +69: 1 +70: 1 +71: 1 +72: 1 +73: 1 +74: 1 +75: 1 +76: 1 +77: 1 +78: 1 +79: 1 +80: 1 +81: 1 +82: 1 +83: 1 +84: 1 +85: 1 +86: 1 +87: 1 +88: 1 +89: 1 +90: 1 +91: 1 +92: 1 +93: 1 +94: 1 +95: 1 +96: 1 +97: 1 +98: 1 +99: 1 + diff --git a/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all_stop_early/1.snap b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all_stop_early/1.snap new file mode 100644 index 000000000..95c719bb0 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_distribution_iter.rs/filter_distribution_all_stop_early/1.snap @@ -0,0 +1,104 @@ +--- +source: milli/src/search/facet/facet_distribution_iter.rs +--- +3: 2 +5: 2 +6: 2 +9: 2 +10: 2 +11: 2 +14: 2 +18: 2 +19: 2 +24: 2 +26: 2 +28: 2 +29: 2 +32: 2 +33: 2 +35: 2 +36: 2 +37: 2 +38: 2 +39: 2 +41: 2 +46: 2 +47: 2 +49: 2 +52: 2 +53: 2 +55: 2 +59: 2 +61: 2 +64: 2 +68: 2 +71: 2 +74: 2 +75: 2 +76: 2 +81: 2 +83: 2 +85: 2 +86: 2 +88: 2 +90: 2 +91: 2 +92: 2 +98: 2 +99: 2 +101: 2 +102: 2 +103: 2 +107: 2 +111: 2 +115: 2 +119: 2 +123: 2 +124: 2 +130: 2 +131: 2 +133: 2 +135: 2 +136: 2 +137: 2 +139: 2 +141: 2 +143: 2 +144: 2 +147: 2 +150: 2 +156: 1 +158: 1 +160: 1 +162: 1 +163: 1 +164: 1 +167: 1 +169: 1 +173: 1 +177: 1 +178: 1 +179: 1 +181: 1 +182: 1 +186: 1 
+189: 1 +192: 1 +193: 1 +195: 1 +197: 1 +205: 1 +206: 1 +207: 1 +208: 1 +209: 1 +210: 1 +216: 1 +219: 1 +220: 1 +223: 1 +226: 1 +235: 1 +236: 1 +238: 1 + diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_0.hash.snap new file mode 100644 index 000000000..7bf13e05c --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_0.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +fcedc563a82c1c61f50174a5f3f982b6 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_1.hash.snap new file mode 100644 index 000000000..100b928d7 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_1.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +6cc26e77fc6bd9145deedf14cf422b03 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_2.hash.snap new file mode 100644 index 000000000..7bf13e05c --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_2.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +fcedc563a82c1c61f50174a5f3f982b6 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_3.hash.snap new file mode 100644 index 000000000..100b928d7 --- /dev/null +++ 
b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/excluded_3.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +6cc26e77fc6bd9145deedf14cf422b03 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_0.hash.snap new file mode 100644 index 000000000..be0b06ded --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_0.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +57d35cfa419a19a1a1f8d7c8ef096e0f diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_1.hash.snap new file mode 100644 index 000000000..93fe17b0c --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_1.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +3dbe0547b42759795e9b16989df72cee diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_2.hash.snap new file mode 100644 index 000000000..be0b06ded --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_2.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +57d35cfa419a19a1a1f8d7c8ef096e0f diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_3.hash.snap new file mode 100644 index 000000000..93fe17b0c --- /dev/null +++ 
b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_decreasing/included_3.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +3dbe0547b42759795e9b16989df72cee diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_0.hash.snap new file mode 100644 index 000000000..db11ce952 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_0.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +c1c7a0bb91d53d33724583b6d4a99f16 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_1.hash.snap new file mode 100644 index 000000000..f5a81c121 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_1.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +12213d3f1047a0c3d08e4670a7d688e7 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_2.hash.snap new file mode 100644 index 000000000..db11ce952 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_2.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +c1c7a0bb91d53d33724583b6d4a99f16 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_3.hash.snap new file mode 100644 index 000000000..f5a81c121 --- /dev/null +++ 
b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/excluded_3.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +12213d3f1047a0c3d08e4670a7d688e7 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_0.hash.snap new file mode 100644 index 000000000..fa7242056 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_0.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +ca59f20e043a4d52c49e15b10adf96bb diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_1.hash.snap new file mode 100644 index 000000000..a7611d8c1 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_1.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +cb69e0fe10fb299bafe77514204379cb diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_2.hash.snap new file mode 100644 index 000000000..fa7242056 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_2.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +ca59f20e043a4d52c49e15b10adf96bb diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_3.hash.snap new file mode 100644 index 000000000..a7611d8c1 --- /dev/null +++ 
b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_increasing/included_3.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +cb69e0fe10fb299bafe77514204379cb diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_0.hash.snap new file mode 100644 index 000000000..07664807e --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_0.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +3456db9a1bb94c33c1e9f656184ee711 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_1.hash.snap new file mode 100644 index 000000000..ef530faa1 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_1.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +2127cd818b457e0611e0c8e1a871602a diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_2.hash.snap new file mode 100644 index 000000000..07664807e --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_2.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +3456db9a1bb94c33c1e9f656184ee711 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_3.hash.snap new file mode 100644 index 000000000..ef530faa1 --- /dev/null +++ 
b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/excluded_3.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +2127cd818b457e0611e0c8e1a871602a diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_0.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_0.hash.snap new file mode 100644 index 000000000..db8a314b0 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_0.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +b976551ceff412bfb2ec9bfbda320bbb diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_1.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_1.hash.snap new file mode 100644 index 000000000..2b82e07e8 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_1.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +7620ca1a96882c7147d3fd996570f9b3 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_2.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_2.hash.snap new file mode 100644 index 000000000..db8a314b0 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_2.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +b976551ceff412bfb2ec9bfbda320bbb diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_3.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_3.hash.snap new file mode 100644 index 000000000..2b82e07e8 --- /dev/null +++ 
b/milli/src/search/facet/snapshots/facet_range_search.rs/filter_range_pinch/included_3.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +7620ca1a96882c7147d3fd996570f9b3 diff --git a/milli/src/search/facet/snapshots/facet_range_search.rs/random_looking_index_snap/random_looking_index_snap.hash.snap b/milli/src/search/facet/snapshots/facet_range_search.rs/random_looking_index_snap/random_looking_index_snap.hash.snap new file mode 100644 index 000000000..67a2f6bd9 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_range_search.rs/random_looking_index_snap/random_looking_index_snap.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/search/facet/facet_range_search.rs +--- +3256c76a7c1b768a013e78d5fa6e9ff9 diff --git a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/0.snap b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/0.snap new file mode 100644 index 000000000..2d0f6e213 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/0.snap @@ -0,0 +1,60 @@ +--- +source: milli/src/search/facet/facet_sort_ascending.rs +--- +[200, ] +[201, ] +[202, ] +[203, ] +[204, ] +[205, ] +[206, ] +[207, ] +[208, ] +[209, ] +[210, ] +[211, ] +[212, ] +[213, ] +[214, ] +[215, ] +[216, ] +[217, ] +[218, ] +[219, ] +[220, ] +[221, ] +[222, ] +[223, ] +[224, ] +[225, ] +[226, ] +[227, ] +[228, ] +[229, ] +[230, ] +[231, ] +[232, ] +[233, ] +[234, ] +[235, ] +[236, ] +[237, ] +[238, ] +[239, ] +[240, ] +[241, ] +[242, ] +[243, ] +[244, ] +[245, ] +[246, ] +[247, ] +[248, ] +[249, ] +[250, ] +[251, ] +[252, ] +[253, ] +[254, ] +[255, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/1.snap b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/1.snap new file mode 100644 index 000000000..20d666494 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_ascending.rs/filter_sort/1.snap @@ -0,0 +1,54 @@ +--- 
+source: milli/src/search/facet/facet_sort_ascending.rs +--- +[201, ] +[202, ] +[203, ] +[207, ] +[211, ] +[215, ] +[219, ] +[223, ] +[224, ] +[230, ] +[231, ] +[233, ] +[235, ] +[236, ] +[237, ] +[239, ] +[241, ] +[243, ] +[244, ] +[247, ] +[250, ] +[256, ] +[258, ] +[260, ] +[262, ] +[263, ] +[264, ] +[267, ] +[269, ] +[273, ] +[277, ] +[278, ] +[279, ] +[281, ] +[282, ] +[286, ] +[289, ] +[292, ] +[293, ] +[295, ] +[297, ] +[205, ] +[206, ] +[208, ] +[209, ] +[210, ] +[216, ] +[220, ] +[226, ] +[238, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/0.snap b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/0.snap new file mode 100644 index 000000000..032763c74 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/0.snap @@ -0,0 +1,60 @@ +--- +source: milli/src/search/facet/facet_sort_descending.rs +--- +[255, ] +[254, ] +[253, ] +[252, ] +[251, ] +[250, ] +[249, ] +[248, ] +[247, ] +[246, ] +[245, ] +[244, ] +[243, ] +[242, ] +[241, ] +[240, ] +[239, ] +[238, ] +[237, ] +[236, ] +[235, ] +[234, ] +[233, ] +[232, ] +[231, ] +[230, ] +[229, ] +[228, ] +[227, ] +[226, ] +[225, ] +[224, ] +[223, ] +[222, ] +[221, ] +[220, ] +[219, ] +[218, ] +[217, ] +[216, ] +[215, ] +[214, ] +[213, ] +[212, ] +[211, ] +[210, ] +[209, ] +[208, ] +[207, ] +[206, ] +[205, ] +[204, ] +[203, ] +[202, ] +[201, ] +[200, ] + diff --git a/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/1.snap b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/1.snap new file mode 100644 index 000000000..4c62cfee4 --- /dev/null +++ b/milli/src/search/facet/snapshots/facet_sort_descending.rs/filter_sort_descending/1.snap @@ -0,0 +1,54 @@ +--- +source: milli/src/search/facet/facet_sort_descending.rs +--- +[243, ] +[238, ] +[236, ] +[235, ] +[226, ] +[223, ] +[220, ] +[219, ] +[216, ] +[210, ] +[209, ] +[208, ] +[207, ] 
+[206, ] +[205, ] +[297, ] +[295, ] +[293, ] +[292, ] +[289, ] +[286, ] +[282, ] +[281, ] +[279, ] +[278, ] +[277, ] +[273, ] +[269, ] +[267, ] +[264, ] +[263, ] +[262, ] +[260, ] +[258, ] +[256, ] +[250, ] +[247, ] +[244, ] +[241, ] +[239, ] +[237, ] +[233, ] +[231, ] +[230, ] +[224, ] +[215, ] +[211, ] +[203, ] +[202, ] +[201, ] + diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 1b62a67c7..f62a37c1b 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -15,7 +15,7 @@ use log::debug; use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; -pub use self::facet::{FacetDistribution, FacetNumberIter, Filter, DEFAULT_VALUES_PER_FACET}; +pub use self::facet::{FacetDistribution, Filter, DEFAULT_VALUES_PER_FACET}; use self::fst_utils::{Complement, Intersection, StartsWith, Union}; pub use self::matches::{ FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWord, MatchingWords, @@ -32,7 +32,7 @@ static LEVDIST2: Lazy = Lazy::new(|| LevBuilder::new(2, true)); mod criteria; mod distinct; -mod facet; +pub mod facet; mod fst_utils; mod matches; mod query_tree; diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index e9c92a949..bcb9805ea 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -2,18 +2,14 @@ use std::borrow::Cow; use std::fmt::Write; use std::path::Path; -use heed::types::ByteSlice; -use heed::BytesDecode; use roaring::RoaringBitmap; -use crate::heed_codec::facet::{ - FacetLevelValueU32Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, - FacetStringZeroBoundsValueCodec, -}; -use crate::{make_db_snap_from_iter, CboRoaringBitmapCodec, ExternalDocumentsIds, Index}; +use crate::facet::FacetType; +use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue}; +use crate::{make_db_snap_from_iter, ExternalDocumentsIds, Index}; #[track_caller] -pub fn default_db_snapshot_settings_for_test(name: Option<&str>) -> insta::Settings { +pub fn 
default_db_snapshot_settings_for_test(name: Option<&str>) -> (insta::Settings, String) { let mut settings = insta::Settings::clone_current(); settings.set_prepend_module_to_snapshot(false); let path = Path::new(std::panic::Location::caller().file()); @@ -23,12 +19,63 @@ pub fn default_db_snapshot_settings_for_test(name: Option<&str>) -> insta::Setti if let Some(name) = name { settings - .set_snapshot_path(Path::new("snapshots").join(filename).join(test_name).join(name)); + .set_snapshot_path(Path::new("snapshots").join(filename).join(&test_name).join(name)); } else { - settings.set_snapshot_path(Path::new("snapshots").join(filename).join(test_name)); + settings.set_snapshot_path(Path::new("snapshots").join(filename).join(&test_name)); } - settings + (settings, test_name) +} +#[macro_export] +macro_rules! milli_snap { + ($value:expr, $name:expr) => { + let (settings, _) = $crate::snapshot_tests::default_db_snapshot_settings_for_test(None); + settings.bind(|| { + let snap = $value; + let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(&format!("{}", $name), &snap, false); + for (name, snap) in snaps { + insta::assert_snapshot!(name, snap); + } + }); + }; + ($value:expr) => { + let (settings, test_name) = $crate::snapshot_tests::default_db_snapshot_settings_for_test(None); + settings.bind(|| { + let snap = $value; + let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(&format!("{}", test_name), &snap, false); + for (name, snap) in snaps { + insta::assert_snapshot!(name, snap); + } + }); + }; + ($value:expr, @$inline:literal) => { + let (settings, test_name) = $crate::snapshot_tests::default_db_snapshot_settings_for_test(None); + settings.bind(|| { + let snap = $value; + let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(&format!("{}", test_name), &snap, true); + for (name, snap) in snaps { + if !name.ends_with(".full") { + insta::assert_snapshot!(snap, @$inline); + } else { + insta::assert_snapshot!(name, snap); + } + } + 
}); + }; + ($value:expr, $name:expr, @$inline:literal) => { + let (settings, _) = $crate::snapshot_tests::default_db_snapshot_settings_for_test(None); + settings.bind(|| { + let snap = $value; + let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(&format!("{}", $name), &snap, true); + for (name, snap) in snaps { + if !name.ends_with(".full") { + insta::assert_snapshot!(snap, @$inline); + } else { + insta::assert_snapshot!(name, snap); + } + } + }); + }; } /** @@ -99,7 +146,7 @@ db_snap!(index, word_docids, "some_identifier", @""); #[macro_export] macro_rules! db_snap { ($index:ident, $db_name:ident, $name:expr) => { - let settings = $crate::snapshot_tests::default_db_snapshot_settings_for_test(Some( + let (settings, _) = $crate::snapshot_tests::default_db_snapshot_settings_for_test(Some( &format!("{}", $name), )); settings.bind(|| { @@ -111,7 +158,7 @@ macro_rules! db_snap { }); }; ($index:ident, $db_name:ident) => { - let settings = $crate::snapshot_tests::default_db_snapshot_settings_for_test(None); + let (settings, _) = $crate::snapshot_tests::default_db_snapshot_settings_for_test(None); settings.bind(|| { let snap = $crate::full_snap_of_db!($index, $db_name); let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(stringify!($db_name), &snap, false); @@ -121,7 +168,7 @@ macro_rules! db_snap { }); }; ($index:ident, $db_name:ident, @$inline:literal) => { - let settings = $crate::snapshot_tests::default_db_snapshot_settings_for_test(None); + let (settings, _) = $crate::snapshot_tests::default_db_snapshot_settings_for_test(None); settings.bind(|| { let snap = $crate::full_snap_of_db!($index, $db_name); let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(stringify!($db_name), &snap, true); @@ -134,8 +181,8 @@ macro_rules! 
db_snap { } }); }; - ($index:ident, $db_name:ident, $name:literal, @$inline:literal) => { - let settings = $crate::snapshot_tests::default_db_snapshot_settings_for_test(Some(&format!("{}", $name))); + ($index:ident, $db_name:ident, $name:expr, @$inline:literal) => { + let (settings, _) = $crate::snapshot_tests::default_db_snapshot_settings_for_test(Some(&format!("{}", $name))); settings.bind(|| { let snap = $crate::full_snap_of_db!($index, $db_name); let snaps = $crate::snapshot_tests::convert_snap_to_hash_if_needed(stringify!($db_name), &snap, true); @@ -233,44 +280,35 @@ pub fn snap_word_prefix_position_docids(index: &Index) -> String { } pub fn snap_facet_id_f64_docids(index: &Index) -> String { let snap = make_db_snap_from_iter!(index, facet_id_f64_docids, |( - (facet_id, level, left, right), - b, + FacetGroupKey { field_id, level, left_bound }, + FacetGroupValue { size, bitmap }, )| { - &format!("{facet_id:<3} {level:<2} {left:<6} {right:<6} {}", display_bitmap(&b)) + &format!("{field_id:<3} {level:<2} {left_bound:<6} {size:<2} {}", display_bitmap(&bitmap)) + }); + snap +} +pub fn snap_facet_id_exists_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, facet_id_exists_docids, |(facet_id, docids)| { + &format!("{facet_id:<3} {}", display_bitmap(&docids)) }); snap } pub fn snap_facet_id_string_docids(index: &Index) -> String { - let rtxn = index.read_txn().unwrap(); - let bytes_db = index.facet_id_string_docids.remap_types::(); - let iter = bytes_db.iter(&rtxn).unwrap(); - let mut snap = String::new(); - - for x in iter { - let (key, value) = x.unwrap(); - if let Some((field_id, normalized_str)) = FacetStringLevelZeroCodec::bytes_decode(key) { - let (orig_string, docids) = - FacetStringLevelZeroValueCodec::bytes_decode(value).unwrap(); - snap.push_str(&format!( - "{field_id:<3} {normalized_str:<8} {orig_string:<8} {}\n", - display_bitmap(&docids) - )); - } else if let Some((field_id, level, left, right)) = - 
FacetLevelValueU32Codec::bytes_decode(key) - { - snap.push_str(&format!("{field_id:<3} {level:<2} {left:<6} {right:<6} ")); - let (bounds, docids) = - FacetStringZeroBoundsValueCodec::::bytes_decode(value) - .unwrap(); - if let Some((left, right)) = bounds { - snap.push_str(&format!("{left:<8} {right:<8} ")); - } - snap.push_str(&display_bitmap(&docids)); - snap.push('\n'); - } else { - panic!(); - } - } + let snap = make_db_snap_from_iter!(index, facet_id_string_docids, |( + FacetGroupKey { field_id, level, left_bound }, + FacetGroupValue { size, bitmap }, + )| { + &format!("{field_id:<3} {level:<2} {left_bound:<12} {size:<2} {}", display_bitmap(&bitmap)) + }); + snap +} +pub fn snap_field_id_docid_facet_strings(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, field_id_docid_facet_strings, |( + (field_id, doc_id, string), + other_string, + )| { + &format!("{field_id:<3} {doc_id:<4} {string:<12} {other_string}") + }); snap } pub fn snap_documents_ids(index: &Index) -> String { @@ -339,7 +377,7 @@ pub fn snap_number_faceted_documents_ids(index: &Index) -> String { let mut snap = String::new(); for field_id in fields_ids_map.ids() { let number_faceted_documents_ids = - index.number_faceted_documents_ids(&rtxn, field_id).unwrap(); + index.faceted_documents_ids(&rtxn, field_id, FacetType::Number).unwrap(); writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&number_faceted_documents_ids)) .unwrap(); } @@ -352,7 +390,7 @@ pub fn snap_string_faceted_documents_ids(index: &Index) -> String { let mut snap = String::new(); for field_id in fields_ids_map.ids() { let string_faceted_documents_ids = - index.string_faceted_documents_ids(&rtxn, field_id).unwrap(); + index.faceted_documents_ids(&rtxn, field_id, FacetType::String).unwrap(); writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&string_faceted_documents_ids)) .unwrap(); } @@ -454,6 +492,12 @@ macro_rules! 
full_snap_of_db { ($index:ident, facet_id_string_docids) => {{ $crate::snapshot_tests::snap_facet_id_string_docids(&$index) }}; + ($index:ident, field_id_docid_facet_strings) => {{ + $crate::snapshot_tests::snap_field_id_docid_facet_strings(&$index) + }}; + ($index:ident, facet_id_exists_docids) => {{ + $crate::snapshot_tests::snap_facet_id_exists_docids(&$index) + }}; ($index:ident, documents_ids) => {{ $crate::snapshot_tests::snap_documents_ids(&$index) }}; diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index ba59c14cf..adeea11fa 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -1,6 +1,7 @@ use roaring::RoaringBitmap; use time::OffsetDateTime; +use crate::facet::FacetType; use crate::{ExternalDocumentsIds, FieldDistribution, Index, Result}; pub struct ClearDocuments<'t, 'u, 'i> { @@ -55,8 +56,18 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { // We clean all the faceted documents ids. for field_id in faceted_fields { - self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &empty_roaring)?; - self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &empty_roaring)?; + self.index.put_faceted_documents_ids( + self.wtxn, + field_id, + FacetType::Number, + &empty_roaring, + )?; + self.index.put_faceted_documents_ids( + self.wtxn, + field_id, + FacetType::String, + &empty_roaring, + )?; } // Clear the other databases. 
diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 26340b9dd..f1341c48c 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -1,23 +1,24 @@ use std::collections::btree_map::Entry; +use std::collections::{HashMap, HashSet}; use fst::IntoStreamer; -use heed::types::{ByteSlice, Str}; -use heed::{BytesDecode, BytesEncode, Database}; +use heed::types::{ByteSlice, DecodeIgnore, Str}; +use heed::Database; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use serde_json::Value; use time::OffsetDateTime; +use super::facet::delete::FacetsDelete; use super::ClearDocuments; -use crate::error::{InternalError, SerializationError, UserError}; -use crate::heed_codec::facet::{ - FacetLevelValueU32Codec, FacetStringLevelZeroValueCodec, FacetStringZeroBoundsValueCodec, -}; +use crate::error::{InternalError, UserError}; +use crate::facet::FacetType; +use crate::heed_codec::facet::FieldDocIdFacetCodec; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; use crate::{ - DocumentId, ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, - RoaringBitmapCodec, SmallString32, BEU32, + ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, RoaringBitmapCodec, + SmallString32, BEU32, }; pub struct DeleteDocuments<'t, 'u, 'i> { @@ -25,6 +26,8 @@ pub struct DeleteDocuments<'t, 'u, 'i> { index: &'i Index, external_documents_ids: ExternalDocumentsIds<'static>, to_delete_docids: RoaringBitmap, + #[cfg(test)] + disable_soft_deletion: bool, } #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -45,9 +48,16 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { index, external_documents_ids, to_delete_docids: RoaringBitmap::new(), + #[cfg(test)] + disable_soft_deletion: false, }) } + #[cfg(test)] + pub fn disable_soft_deletion(&mut self, disable: bool) { + self.disable_soft_deletion = disable; + } + pub fn delete_document(&mut 
self, docid: u32) { self.to_delete_docids.insert(docid); } @@ -64,6 +74,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { pub fn execute(mut self) -> Result { self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; + // We retrieve the current documents ids that are in the database. let mut documents_ids = self.index.documents_ids(self.wtxn)?; let mut soft_deleted_docids = self.index.soft_deleted_documents_ids(self.wtxn)?; @@ -127,7 +138,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // the `soft_deleted_documents_ids` bitmap and early exit. let size_used = self.index.used_size()?; let map_size = self.index.env.map_size()? as u64; - let nb_documents = self.index.number_of_documents(self.wtxn)?; + let nb_documents = self.index.number_of_documents(&self.wtxn)?; let nb_soft_deleted = soft_deleted_docids.len(); let percentage_available = 100 - (size_used * 100 / map_size); @@ -145,7 +156,20 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // We run the deletion. // - With 100Go of disk and 50Go used including 15Go of soft-deleted documents // We run the deletion. 
- if percentage_available > 10 && percentage_used_by_soft_deleted_documents < 10 { + let disable_soft_deletion = { + #[cfg(not(test))] + { + false + } + #[cfg(test)] + { + self.disable_soft_deletion + } + }; + if !disable_soft_deletion + && percentage_available > 10 + && percentage_used_by_soft_deleted_documents < 10 + { self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?; return Ok(DocumentDeletionResult { deleted_documents: self.to_delete_docids.len(), @@ -185,11 +209,11 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { prefix_word_pair_proximity_docids, word_position_docids, word_prefix_position_docids, - facet_id_f64_docids, + facet_id_f64_docids: _, + facet_id_string_docids: _, + field_id_docid_facet_f64s: _, + field_id_docid_facet_strings: _, facet_id_exists_docids, - facet_id_string_docids, - field_id_docid_facet_f64s, - field_id_docid_facet_strings, documents, } = self.index; @@ -440,54 +464,42 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { self.index.put_geo_faceted_documents_ids(self.wtxn, &geo_faceted_doc_ids)?; } + for facet_type in [FacetType::Number, FacetType::String] { + let mut affected_facet_values = HashMap::new(); + for field_id in self.index.faceted_fields_ids(self.wtxn)? { + // Remove docids from the number faceted documents ids + let mut docids = + self.index.faceted_documents_ids(self.wtxn, field_id, facet_type)?; + docids -= &self.to_delete_docids; + self.index.put_faceted_documents_ids(self.wtxn, field_id, facet_type, &docids)?; + + let facet_values = remove_docids_from_field_id_docid_facet_value( + &self.index, + self.wtxn, + facet_type, + field_id, + &self.to_delete_docids, + )?; + if !facet_values.is_empty() { + affected_facet_values.insert(field_id, facet_values); + } + } + FacetsDelete::new( + self.index, + facet_type, + affected_facet_values, + &self.to_delete_docids, + ) + .execute(self.wtxn)?; + } + // We delete the documents ids that are under the facet field id values. 
- remove_docids_from_facet_field_id_docids( - self.wtxn, - facet_id_f64_docids, - &self.to_delete_docids, - )?; - // We delete the documents ids that are under the facet field id values. - remove_docids_from_facet_field_id_docids( + remove_docids_from_facet_id_exists_docids( self.wtxn, facet_id_exists_docids, &self.to_delete_docids, )?; - remove_docids_from_facet_field_id_string_docids( - self.wtxn, - facet_id_string_docids, - &self.to_delete_docids, - )?; - - // Remove the documents ids from the faceted documents ids. - for field_id in self.index.faceted_fields_ids(self.wtxn)? { - // Remove docids from the number faceted documents ids - let mut docids = self.index.number_faceted_documents_ids(self.wtxn, field_id)?; - docids -= &self.to_delete_docids; - self.index.put_number_faceted_documents_ids(self.wtxn, field_id, &docids)?; - - remove_docids_from_field_id_docid_facet_value( - self.wtxn, - field_id_docid_facet_f64s, - field_id, - &self.to_delete_docids, - |(_fid, docid, _value)| docid, - )?; - - // Remove docids from the string faceted documents ids - let mut docids = self.index.string_faceted_documents_ids(self.wtxn, field_id)?; - docids -= &self.to_delete_docids; - self.index.put_string_faceted_documents_ids(self.wtxn, field_id, &docids)?; - - remove_docids_from_field_id_docid_facet_value( - self.wtxn, - field_id_docid_facet_strings, - field_id, - &self.to_delete_docids, - |(_fid, docid, _value)| docid, - )?; - } - Ok(DocumentDeletionResult { deleted_documents: self.to_delete_docids.len(), remaining_documents: documents_ids.len(), @@ -553,95 +565,41 @@ fn remove_from_word_docids( Ok(()) } -fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F, DC, V>( +fn remove_docids_from_field_id_docid_facet_value<'i, 'a>( + index: &'i Index, wtxn: &'a mut heed::RwTxn, - db: &heed::Database, + facet_type: FacetType, field_id: FieldId, to_remove: &RoaringBitmap, - convert: F, -) -> heed::Result<()> -where - C: heed::BytesDecode<'a, DItem = K>, - DC: 
heed::BytesDecode<'a, DItem = V>, - F: Fn(K) -> DocumentId, -{ +) -> heed::Result>> { + let db = match facet_type { + FacetType::String => { + index.field_id_docid_facet_strings.remap_types::() + } + FacetType::Number => { + index.field_id_docid_facet_f64s.remap_types::() + } + }; + let mut all_affected_facet_values = HashSet::default(); let mut iter = db - .remap_key_type::() .prefix_iter_mut(wtxn, &field_id.to_be_bytes())? - .remap_key_type::(); + .remap_key_type::>(); while let Some(result) = iter.next() { - let (key, _) = result?; - if to_remove.contains(convert(key)) { + let ((_, docid, facet_value), _) = result?; + if to_remove.contains(docid) { + if !all_affected_facet_values.contains(facet_value) { + all_affected_facet_values.insert(facet_value.to_owned()); + } // safety: we don't keep references from inside the LMDB database. unsafe { iter.del_current()? }; } } - Ok(()) + Ok(all_affected_facet_values) } -fn remove_docids_from_facet_field_id_string_docids<'a, C, D>( - wtxn: &'a mut heed::RwTxn, - db: &heed::Database, - to_remove: &RoaringBitmap, -) -> crate::Result<()> { - let db_name = Some(crate::index::db_name::FACET_ID_STRING_DOCIDS); - let mut iter = db.remap_types::().iter_mut(wtxn)?; - while let Some(result) = iter.next() { - let (key, val) = result?; - match FacetLevelValueU32Codec::bytes_decode(key) { - Some(_) => { - // If we are able to parse this key it means it is a facet string group - // level key. We must then parse the value using the appropriate codec. - let (group, mut docids) = - FacetStringZeroBoundsValueCodec::::bytes_decode(val) - .ok_or(SerializationError::Decoding { db_name })?; - - let previous_len = docids.len(); - docids -= to_remove; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? 
}; - } else if docids.len() != previous_len { - let key = key.to_owned(); - let val = &(group, docids); - let value_bytes = - FacetStringZeroBoundsValueCodec::::bytes_encode(val) - .ok_or(SerializationError::Encoding { db_name })?; - - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&key, &value_bytes)? }; - } - } - None => { - // The key corresponds to a level zero facet string. - let (original_value, mut docids) = - FacetStringLevelZeroValueCodec::bytes_decode(val) - .ok_or(SerializationError::Decoding { db_name })?; - - let previous_len = docids.len(); - docids -= to_remove; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - } else if docids.len() != previous_len { - let key = key.to_owned(); - let val = &(original_value, docids); - let value_bytes = FacetStringLevelZeroValueCodec::bytes_encode(val) - .ok_or(SerializationError::Encoding { db_name })?; - - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&key, &value_bytes)? }; - } - } - } - } - - Ok(()) -} - -fn remove_docids_from_facet_field_id_docids<'a, C>( +fn remove_docids_from_facet_id_exists_docids<'a, C>( wtxn: &'a mut heed::RwTxn, db: &heed::Database, to_remove: &RoaringBitmap, @@ -675,12 +633,13 @@ mod tests { use super::*; use crate::index::tests::TempIndex; - use crate::Filter; + use crate::{db_snap, Filter}; fn delete_documents<'t>( wtxn: &mut RwTxn<'t, '_>, index: &'t Index, external_ids: &[&str], + disable_soft_deletion: bool, ) -> Vec { let external_document_ids = index.external_documents_ids(&wtxn).unwrap(); let ids_to_delete: Vec = external_ids @@ -690,14 +649,14 @@ mod tests { // Delete some documents. 
let mut builder = DeleteDocuments::new(wtxn, index).unwrap(); + builder.disable_soft_deletion(disable_soft_deletion); external_ids.iter().for_each(|id| drop(builder.delete_external_id(id))); builder.execute().unwrap(); ids_to_delete } - #[test] - fn delete_documents_with_numbers_as_primary_key() { + fn delete_documents_with_numbers_as_primary_key_(disable_soft_deletion: bool) { let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); @@ -717,19 +676,36 @@ mod tests { builder.delete_document(0); builder.delete_document(1); builder.delete_document(2); + builder.disable_soft_deletion(disable_soft_deletion); builder.execute().unwrap(); wtxn.commit().unwrap(); + // All these snapshots should be empty since the database was cleared + db_snap!(index, documents_ids, disable_soft_deletion); + db_snap!(index, word_docids, disable_soft_deletion); + db_snap!(index, word_pair_proximity_docids, disable_soft_deletion); + db_snap!(index, facet_id_exists_docids, disable_soft_deletion); + db_snap!(index, soft_deleted_documents_ids, disable_soft_deletion); + let rtxn = index.read_txn().unwrap(); assert!(index.field_distribution(&rtxn).unwrap().is_empty()); } #[test] - fn delete_documents_with_strange_primary_key() { + fn delete_documents_with_numbers_as_primary_key() { + delete_documents_with_numbers_as_primary_key_(true); + delete_documents_with_numbers_as_primary_key_(false); + } + + fn delete_documents_with_strange_primary_key_(disable_soft_deletion: bool) { let index = TempIndex::new(); + index + .update_settings(|settings| settings.set_searchable_fields(vec!["name".to_string()])) + .unwrap(); + let mut wtxn = index.write_txn().unwrap(); index .add_documents_using_wtxn( @@ -741,18 +717,33 @@ mod tests { ]), ) .unwrap(); + wtxn.commit().unwrap(); + + let mut wtxn = index.write_txn().unwrap(); // Delete not all of the documents but some of them. 
let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); builder.delete_external_id("0"); builder.delete_external_id("1"); + builder.disable_soft_deletion(disable_soft_deletion); builder.execute().unwrap(); - wtxn.commit().unwrap(); + + db_snap!(index, documents_ids, disable_soft_deletion); + db_snap!(index, word_docids, disable_soft_deletion); + db_snap!(index, word_pair_proximity_docids, disable_soft_deletion); + db_snap!(index, soft_deleted_documents_ids, disable_soft_deletion); } #[test] - fn filtered_placeholder_search_should_not_return_deleted_documents() { + fn delete_documents_with_strange_primary_key() { + delete_documents_with_strange_primary_key_(true); + delete_documents_with_strange_primary_key_(false); + } + + fn filtered_placeholder_search_should_not_return_deleted_documents_( + disable_soft_deletion: bool, + ) { let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); @@ -760,7 +751,7 @@ mod tests { index .update_settings_using_wtxn(&mut wtxn, |settings| { settings.set_primary_key(S("docid")); - settings.set_filterable_fields(hashset! { S("label") }); + settings.set_filterable_fields(hashset! 
{ S("label"), S("label2") }); }) .unwrap(); @@ -768,31 +759,34 @@ mod tests { .add_documents_using_wtxn( &mut wtxn, documents!([ - { "docid": "1_4", "label": "sign" }, - { "docid": "1_5", "label": "letter" }, - { "docid": "1_7", "label": "abstract,cartoon,design,pattern" }, - { "docid": "1_36", "label": "drawing,painting,pattern" }, - { "docid": "1_37", "label": "art,drawing,outdoor" }, - { "docid": "1_38", "label": "aquarium,art,drawing" }, - { "docid": "1_39", "label": "abstract" }, - { "docid": "1_40", "label": "cartoon" }, - { "docid": "1_41", "label": "art,drawing" }, - { "docid": "1_42", "label": "art,pattern" }, - { "docid": "1_43", "label": "abstract,art,drawing,pattern" }, - { "docid": "1_44", "label": "drawing" }, - { "docid": "1_45", "label": "art" }, - { "docid": "1_46", "label": "abstract,colorfulness,pattern" }, - { "docid": "1_47", "label": "abstract,pattern" }, - { "docid": "1_52", "label": "abstract,cartoon" }, - { "docid": "1_57", "label": "abstract,drawing,pattern" }, - { "docid": "1_58", "label": "abstract,art,cartoon" }, - { "docid": "1_68", "label": "design" }, - { "docid": "1_69", "label": "geometry" } + { "docid": "1_4", "label": ["sign"] }, + { "docid": "1_5", "label": ["letter"] }, + { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, + { "docid": "1_36", "label": ["drawing","painting","pattern"] }, + { "docid": "1_37", "label": ["art","drawing","outdoor"] }, + { "docid": "1_38", "label": ["aquarium","art","drawing"] }, + { "docid": "1_39", "label": ["abstract"] }, + { "docid": "1_40", "label": ["cartoon"] }, + { "docid": "1_41", "label": ["art","drawing"] }, + { "docid": "1_42", "label": ["art","pattern"] }, + { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, + { "docid": "1_44", "label": ["drawing"] }, + { "docid": "1_45", "label": ["art"] }, + { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, + { "docid": "1_47", "label": ["abstract","pattern"] }, + { "docid": "1_52", "label": 
["abstract","cartoon"] }, + { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, + { "docid": "1_58", "label": ["abstract","art","cartoon"] }, + { "docid": "1_68", "label": ["design"] }, + { "docid": "1_69", "label": ["geometry"] }, + { "docid": "1_70", "label2": ["geometry", 1.2] }, + { "docid": "1_71", "label2": ["design", 2.2] }, + { "docid": "1_72", "label2": ["geometry", 1.2] } ]), ) .unwrap(); - delete_documents(&mut wtxn, &index, &["1_4"]); + delete_documents(&mut wtxn, &index, &["1_4", "1_70", "1_72"], disable_soft_deletion); // Placeholder search with filter let filter = Filter::from_str("label = sign").unwrap().unwrap(); @@ -800,10 +794,22 @@ mod tests { assert!(results.documents_ids.is_empty()); wtxn.commit().unwrap(); + + db_snap!(index, soft_deleted_documents_ids, disable_soft_deletion); + db_snap!(index, word_docids, disable_soft_deletion); + db_snap!(index, facet_id_f64_docids, disable_soft_deletion); + db_snap!(index, word_pair_proximity_docids, disable_soft_deletion); + db_snap!(index, facet_id_exists_docids, disable_soft_deletion); + db_snap!(index, facet_id_string_docids, disable_soft_deletion); } #[test] - fn placeholder_search_should_not_return_deleted_documents() { + fn filtered_placeholder_search_should_not_return_deleted_documents() { + filtered_placeholder_search_should_not_return_deleted_documents_(true); + filtered_placeholder_search_should_not_return_deleted_documents_(false); + } + + fn placeholder_search_should_not_return_deleted_documents_(disable_soft_deletion: bool) { let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); @@ -817,31 +823,35 @@ mod tests { .add_documents_using_wtxn( &mut wtxn, documents!([ - { "docid": "1_4", "label": "sign" }, - { "docid": "1_5", "label": "letter" }, - { "docid": "1_7", "label": "abstract,cartoon,design,pattern" }, - { "docid": "1_36", "label": "drawing,painting,pattern" }, - { "docid": "1_37", "label": "art,drawing,outdoor" }, - { "docid": "1_38", "label": 
"aquarium,art,drawing" }, - { "docid": "1_39", "label": "abstract" }, - { "docid": "1_40", "label": "cartoon" }, - { "docid": "1_41", "label": "art,drawing" }, - { "docid": "1_42", "label": "art,pattern" }, - { "docid": "1_43", "label": "abstract,art,drawing,pattern" }, - { "docid": "1_44", "label": "drawing" }, - { "docid": "1_45", "label": "art" }, - { "docid": "1_46", "label": "abstract,colorfulness,pattern" }, - { "docid": "1_47", "label": "abstract,pattern" }, - { "docid": "1_52", "label": "abstract,cartoon" }, - { "docid": "1_57", "label": "abstract,drawing,pattern" }, - { "docid": "1_58", "label": "abstract,art,cartoon" }, - { "docid": "1_68", "label": "design" }, - { "docid": "1_69", "label": "geometry" } + { "docid": "1_4", "label": ["sign"] }, + { "docid": "1_5", "label": ["letter"] }, + { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, + { "docid": "1_36", "label": ["drawing","painting","pattern"] }, + { "docid": "1_37", "label": ["art","drawing","outdoor"] }, + { "docid": "1_38", "label": ["aquarium","art","drawing"] }, + { "docid": "1_39", "label": ["abstract"] }, + { "docid": "1_40", "label": ["cartoon"] }, + { "docid": "1_41", "label": ["art","drawing"] }, + { "docid": "1_42", "label": ["art","pattern"] }, + { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, + { "docid": "1_44", "label": ["drawing"] }, + { "docid": "1_45", "label": ["art"] }, + { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, + { "docid": "1_47", "label": ["abstract","pattern"] }, + { "docid": "1_52", "label": ["abstract","cartoon"] }, + { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, + { "docid": "1_58", "label": ["abstract","art","cartoon"] }, + { "docid": "1_68", "label": ["design"] }, + { "docid": "1_69", "label": ["geometry"] }, + { "docid": "1_70", "label2": ["geometry", 1.2] }, + { "docid": "1_71", "label2": ["design", 2.2] }, + { "docid": "1_72", "label2": ["geometry", 1.2] } ]), ) .unwrap(); - let 
deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_4"]); + let deleted_internal_ids = + delete_documents(&mut wtxn, &index, &["1_4"], disable_soft_deletion); // Placeholder search let results = index.search(&wtxn).execute().unwrap(); @@ -858,7 +868,12 @@ mod tests { } #[test] - fn search_should_not_return_deleted_documents() { + fn placeholder_search_should_not_return_deleted_documents() { + placeholder_search_should_not_return_deleted_documents_(true); + placeholder_search_should_not_return_deleted_documents_(false); + } + + fn search_should_not_return_deleted_documents_(disable_soft_deletion: bool) { let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); @@ -872,31 +887,35 @@ mod tests { .add_documents_using_wtxn( &mut wtxn, documents!([ - {"docid": "1_4", "label": "sign"}, - {"docid": "1_5", "label": "letter"}, - {"docid": "1_7", "label": "abstract,cartoon,design,pattern"}, - {"docid": "1_36","label": "drawing,painting,pattern"}, - {"docid": "1_37","label": "art,drawing,outdoor"}, - {"docid": "1_38","label": "aquarium,art,drawing"}, - {"docid": "1_39","label": "abstract"}, - {"docid": "1_40","label": "cartoon"}, - {"docid": "1_41","label": "art,drawing"}, - {"docid": "1_42","label": "art,pattern"}, - {"docid": "1_43","label": "abstract,art,drawing,pattern"}, - {"docid": "1_44","label": "drawing"}, - {"docid": "1_45","label": "art"}, - {"docid": "1_46","label": "abstract,colorfulness,pattern"}, - {"docid": "1_47","label": "abstract,pattern"}, - {"docid": "1_52","label": "abstract,cartoon"}, - {"docid": "1_57","label": "abstract,drawing,pattern"}, - {"docid": "1_58","label": "abstract,art,cartoon"}, - {"docid": "1_68","label": "design"}, - {"docid": "1_69","label": "geometry"} + { "docid": "1_4", "label": ["sign"] }, + { "docid": "1_5", "label": ["letter"] }, + { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, + { "docid": "1_36", "label": ["drawing","painting","pattern"] }, + { "docid": "1_37", "label": 
["art","drawing","outdoor"] }, + { "docid": "1_38", "label": ["aquarium","art","drawing"] }, + { "docid": "1_39", "label": ["abstract"] }, + { "docid": "1_40", "label": ["cartoon"] }, + { "docid": "1_41", "label": ["art","drawing"] }, + { "docid": "1_42", "label": ["art","pattern"] }, + { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, + { "docid": "1_44", "label": ["drawing"] }, + { "docid": "1_45", "label": ["art"] }, + { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, + { "docid": "1_47", "label": ["abstract","pattern"] }, + { "docid": "1_52", "label": ["abstract","cartoon"] }, + { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, + { "docid": "1_58", "label": ["abstract","art","cartoon"] }, + { "docid": "1_68", "label": ["design"] }, + { "docid": "1_69", "label": ["geometry"] }, + { "docid": "1_70", "label2": ["geometry", 1.2] }, + { "docid": "1_71", "label2": ["design", 2.2] }, + { "docid": "1_72", "label2": ["geometry", 1.2] } ]), ) .unwrap(); - let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_7", "1_52"]); + let deleted_internal_ids = + delete_documents(&mut wtxn, &index, &["1_7", "1_52"], disable_soft_deletion); // search for abstract let results = index.search(&wtxn).query("abstract").execute().unwrap(); @@ -910,10 +929,19 @@ mod tests { } wtxn.commit().unwrap(); + + db_snap!(index, soft_deleted_documents_ids, disable_soft_deletion); } #[test] - fn geo_filtered_placeholder_search_should_not_return_deleted_documents() { + fn search_should_not_return_deleted_documents() { + search_should_not_return_deleted_documents_(true); + search_should_not_return_deleted_documents_(false); + } + + fn geo_filtered_placeholder_search_should_not_return_deleted_documents_( + disable_soft_deletion: bool, + ) { let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); @@ -949,7 +977,8 @@ mod tests { ])).unwrap(); let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"]; - let 
deleted_internal_ids = delete_documents(&mut wtxn, &index, &external_ids_to_delete); + let deleted_internal_ids = + delete_documents(&mut wtxn, &index, &external_ids_to_delete, disable_soft_deletion); // Placeholder search with geo filter let filter = Filter::from_str("_geoRadius(50.6924, 3.1763, 20000)").unwrap().unwrap(); @@ -964,10 +993,19 @@ mod tests { } wtxn.commit().unwrap(); + + db_snap!(index, soft_deleted_documents_ids, disable_soft_deletion); + db_snap!(index, facet_id_f64_docids, disable_soft_deletion); + db_snap!(index, facet_id_string_docids, disable_soft_deletion); } #[test] - fn get_documents_should_not_return_deleted_documents() { + fn geo_filtered_placeholder_search_should_not_return_deleted_documents() { + geo_filtered_placeholder_search_should_not_return_deleted_documents_(true); + geo_filtered_placeholder_search_should_not_return_deleted_documents_(false); + } + + fn get_documents_should_not_return_deleted_documents_(disable_soft_deletion: bool) { let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); @@ -981,32 +1019,36 @@ mod tests { .add_documents_using_wtxn( &mut wtxn, documents!([ - { "docid": "1_4", "label": "sign" }, - { "docid": "1_5", "label": "letter" }, - { "docid": "1_7", "label": "abstract,cartoon,design,pattern" }, - { "docid": "1_36", "label": "drawing,painting,pattern" }, - { "docid": "1_37", "label": "art,drawing,outdoor" }, - { "docid": "1_38", "label": "aquarium,art,drawing" }, - { "docid": "1_39", "label": "abstract" }, - { "docid": "1_40", "label": "cartoon" }, - { "docid": "1_41", "label": "art,drawing" }, - { "docid": "1_42", "label": "art,pattern" }, - { "docid": "1_43", "label": "abstract,art,drawing,pattern" }, - { "docid": "1_44", "label": "drawing" }, - { "docid": "1_45", "label": "art" }, - { "docid": "1_46", "label": "abstract,colorfulness,pattern" }, - { "docid": "1_47", "label": "abstract,pattern" }, - { "docid": "1_52", "label": "abstract,cartoon" }, - { "docid": "1_57", "label": 
"abstract,drawing,pattern" }, - { "docid": "1_58", "label": "abstract,art,cartoon" }, - { "docid": "1_68", "label": "design" }, - { "docid": "1_69", "label": "geometry" } + { "docid": "1_4", "label": ["sign"] }, + { "docid": "1_5", "label": ["letter"] }, + { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"] }, + { "docid": "1_36", "label": ["drawing","painting","pattern"] }, + { "docid": "1_37", "label": ["art","drawing","outdoor"] }, + { "docid": "1_38", "label": ["aquarium","art","drawing"] }, + { "docid": "1_39", "label": ["abstract"] }, + { "docid": "1_40", "label": ["cartoon"] }, + { "docid": "1_41", "label": ["art","drawing"] }, + { "docid": "1_42", "label": ["art","pattern"] }, + { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] }, + { "docid": "1_44", "label": ["drawing"] }, + { "docid": "1_45", "label": ["art"] }, + { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] }, + { "docid": "1_47", "label": ["abstract","pattern"] }, + { "docid": "1_52", "label": ["abstract","cartoon"] }, + { "docid": "1_57", "label": ["abstract","drawing","pattern"] }, + { "docid": "1_58", "label": ["abstract","art","cartoon"] }, + { "docid": "1_68", "label": ["design"] }, + { "docid": "1_69", "label": ["geometry"] }, + { "docid": "1_70", "label2": ["geometry", 1.2] }, + { "docid": "1_71", "label2": ["design", 2.2] }, + { "docid": "1_72", "label2": ["geometry", 1.2] } ]), ) .unwrap(); let deleted_external_ids = ["1_7", "1_52"]; - let deleted_internal_ids = delete_documents(&mut wtxn, &index, &deleted_external_ids); + let deleted_internal_ids = + delete_documents(&mut wtxn, &index, &deleted_external_ids, disable_soft_deletion); // list all documents let results = index.all_documents(&wtxn).unwrap(); @@ -1036,10 +1078,17 @@ mod tests { } wtxn.commit().unwrap(); + + db_snap!(index, soft_deleted_documents_ids, disable_soft_deletion); } #[test] - fn stats_should_not_return_deleted_documents() { + fn 
get_documents_should_not_return_deleted_documents() { + get_documents_should_not_return_deleted_documents_(true); + get_documents_should_not_return_deleted_documents_(false); + } + + fn stats_should_not_return_deleted_documents_(disable_soft_deletion: bool) { let index = TempIndex::new(); let mut wtxn = index.write_txn().unwrap(); @@ -1051,29 +1100,29 @@ mod tests { .unwrap(); index.add_documents_using_wtxn(&mut wtxn, documents!([ - { "docid": "1_4", "label": "sign"}, - { "docid": "1_5", "label": "letter"}, - { "docid": "1_7", "label": "abstract,cartoon,design,pattern", "title": "Mickey Mouse"}, - { "docid": "1_36", "label": "drawing,painting,pattern"}, - { "docid": "1_37", "label": "art,drawing,outdoor"}, - { "docid": "1_38", "label": "aquarium,art,drawing", "title": "Nemo"}, - { "docid": "1_39", "label": "abstract"}, - { "docid": "1_40", "label": "cartoon"}, - { "docid": "1_41", "label": "art,drawing"}, - { "docid": "1_42", "label": "art,pattern"}, - { "docid": "1_43", "label": "abstract,art,drawing,pattern", "number": 32i32}, - { "docid": "1_44", "label": "drawing", "number": 44i32}, - { "docid": "1_45", "label": "art"}, - { "docid": "1_46", "label": "abstract,colorfulness,pattern"}, - { "docid": "1_47", "label": "abstract,pattern"}, - { "docid": "1_52", "label": "abstract,cartoon"}, - { "docid": "1_57", "label": "abstract,drawing,pattern"}, - { "docid": "1_58", "label": "abstract,art,cartoon"}, - { "docid": "1_68", "label": "design"}, - { "docid": "1_69", "label": "geometry"} + { "docid": "1_4", "label": ["sign"]}, + { "docid": "1_5", "label": ["letter"]}, + { "docid": "1_7", "label": ["abstract","cartoon","design","pattern"], "title": "Mickey Mouse"}, + { "docid": "1_36", "label": ["drawing","painting","pattern"]}, + { "docid": "1_37", "label": ["art","drawing","outdoor"]}, + { "docid": "1_38", "label": ["aquarium","art","drawing"], "title": "Nemo"}, + { "docid": "1_39", "label": ["abstract"]}, + { "docid": "1_40", "label": ["cartoon"]}, + { "docid": "1_41", 
"label": ["art","drawing"]}, + { "docid": "1_42", "label": ["art","pattern"]}, + { "docid": "1_43", "label": ["abstract","art","drawing","pattern"], "number": 32i32}, + { "docid": "1_44", "label": ["drawing"], "number": 44i32}, + { "docid": "1_45", "label": ["art"]}, + { "docid": "1_46", "label": ["abstract","colorfulness","pattern"]}, + { "docid": "1_47", "label": ["abstract","pattern"]}, + { "docid": "1_52", "label": ["abstract","cartoon"]}, + { "docid": "1_57", "label": ["abstract","drawing","pattern"]}, + { "docid": "1_58", "label": ["abstract","art","cartoon"]}, + { "docid": "1_68", "label": ["design"]}, + { "docid": "1_69", "label": ["geometry"]} ])).unwrap(); - delete_documents(&mut wtxn, &index, &["1_7", "1_52"]); + delete_documents(&mut wtxn, &index, &["1_7", "1_52"], disable_soft_deletion); // count internal documents let results = index.number_of_documents(&wtxn).unwrap(); @@ -1086,5 +1135,13 @@ mod tests { assert_eq!(Some(&2), results.get("number")); wtxn.commit().unwrap(); + + db_snap!(index, soft_deleted_documents_ids, disable_soft_deletion); + } + + #[test] + fn stats_should_not_return_deleted_documents() { + stats_should_not_return_deleted_documents_(true); + stats_should_not_return_deleted_documents_(false); } } diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs new file mode 100644 index 000000000..ea0a7d3d7 --- /dev/null +++ b/milli/src/update/facet/bulk.rs @@ -0,0 +1,438 @@ +use std::borrow::Cow; +use std::fs::File; + +use grenad::CompressionType; +use heed::types::ByteSlice; +use heed::{BytesEncode, Error, RoTxn, RwTxn}; +use roaring::RoaringBitmap; + +use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; +use crate::facet::FacetType; +use crate::heed_codec::facet::{ + FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, +}; +use crate::heed_codec::ByteSliceRefCodec; +use crate::update::index_documents::{create_writer, writer_into_reader}; +use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; 
+ +/// Algorithm to insert elememts into the `facet_id_(string/f64)_docids` databases +/// by rebuilding the database "from scratch". +/// +/// First, the new elements are inserted into the level 0 of the database. Then, the +/// higher levels are cleared and recomputed from the content of level 0. +/// +/// Finally, the `faceted_documents_ids` value in the main database of `Index` +/// is updated to contain the new set of faceted documents. +pub struct FacetsUpdateBulk<'i> { + index: &'i Index, + group_size: u8, + min_level_size: u8, + facet_type: FacetType, + field_ids: Vec, + // None if level 0 does not need to be updated + new_data: Option>, +} + +impl<'i> FacetsUpdateBulk<'i> { + pub fn new( + index: &'i Index, + field_ids: Vec, + facet_type: FacetType, + new_data: grenad::Reader, + group_size: u8, + min_level_size: u8, + ) -> FacetsUpdateBulk<'i> { + FacetsUpdateBulk { + index, + field_ids, + group_size, + min_level_size, + facet_type, + new_data: Some(new_data), + } + } + + pub fn new_not_updating_level_0( + index: &'i Index, + field_ids: Vec, + facet_type: FacetType, + ) -> FacetsUpdateBulk<'i> { + FacetsUpdateBulk { + index, + field_ids, + group_size: FACET_GROUP_SIZE, + min_level_size: FACET_MIN_LEVEL_SIZE, + facet_type, + new_data: None, + } + } + + #[logging_timer::time("FacetsUpdateBulk::{}")] + pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { + let Self { index, field_ids, group_size, min_level_size, facet_type, new_data } = self; + + let db = match facet_type { + FacetType::String => index + .facet_id_string_docids + .remap_key_type::>(), + FacetType::Number => { + index.facet_id_f64_docids.remap_key_type::>() + } + }; + + let inner = FacetsUpdateBulkInner { db, new_data, group_size, min_level_size }; + + inner.update(wtxn, &field_ids, |wtxn, field_id, all_docids| { + index.put_faceted_documents_ids(wtxn, field_id, facet_type, &all_docids)?; + Ok(()) + })?; + + Ok(()) + } +} + +/// Implementation of `FacetsUpdateBulk` that is independent 
of milli's `Index` type +pub(crate) struct FacetsUpdateBulkInner { + pub db: heed::Database, FacetGroupValueCodec>, + pub new_data: Option>, + pub group_size: u8, + pub min_level_size: u8, +} +impl FacetsUpdateBulkInner { + pub fn update( + mut self, + wtxn: &mut RwTxn, + field_ids: &[u16], + mut handle_all_docids: impl FnMut(&mut RwTxn, FieldId, RoaringBitmap) -> Result<()>, + ) -> Result<()> { + self.update_level0(wtxn)?; + for &field_id in field_ids.iter() { + self.clear_levels(wtxn, field_id)?; + } + + for &field_id in field_ids.iter() { + let (level_readers, all_docids) = self.compute_levels_for_field_id(field_id, &wtxn)?; + + handle_all_docids(wtxn, field_id, all_docids)?; + + for level_reader in level_readers { + let mut cursor = level_reader.into_cursor()?; + while let Some((k, v)) = cursor.move_on_next()? { + self.db.remap_types::().put(wtxn, k, v)?; + } + } + } + Ok(()) + } + + fn clear_levels(&self, wtxn: &mut heed::RwTxn, field_id: FieldId) -> Result<()> { + let left = FacetGroupKey::<&[u8]> { field_id, level: 1, left_bound: &[] }; + let right = FacetGroupKey::<&[u8]> { field_id, level: u8::MAX, left_bound: &[] }; + let range = left..=right; + self.db.delete_range(wtxn, &range).map(drop)?; + Ok(()) + } + fn update_level0(&mut self, wtxn: &mut RwTxn) -> Result<()> { + let new_data = match self.new_data.take() { + Some(x) => x, + None => return Ok(()), + }; + if self.db.is_empty(wtxn)? { + let mut buffer = Vec::new(); + let mut database = self.db.iter_mut(wtxn)?.remap_types::(); + let mut cursor = new_data.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? { + buffer.clear(); + // the group size for level 0 + buffer.push(1); + // then we extend the buffer with the docids bitmap + buffer.extend_from_slice(value); + unsafe { database.append(key, &buffer)? 
}; + } + } else { + let mut buffer = Vec::new(); + let database = self.db.remap_types::(); + + let mut cursor = new_data.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? { + // the value is a CboRoaringBitmap, but I still need to prepend the + // group size for level 0 (= 1) to it + buffer.clear(); + buffer.push(1); + // then we extend the buffer with the docids bitmap + match database.get(wtxn, key)? { + Some(prev_value) => { + let old_bitmap = &prev_value[1..]; + CboRoaringBitmapCodec::merge_into( + &[Cow::Borrowed(value), Cow::Borrowed(old_bitmap)], + &mut buffer, + )?; + } + None => { + buffer.extend_from_slice(value); + } + }; + database.put(wtxn, key, &buffer)?; + } + } + Ok(()) + } + fn compute_levels_for_field_id( + &self, + field_id: FieldId, + txn: &RoTxn, + ) -> Result<(Vec>, RoaringBitmap)> { + let mut all_docids = RoaringBitmap::new(); + let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |bitmaps, _| { + for bitmap in bitmaps { + all_docids |= bitmap; + } + Ok(()) + })?; + + Ok((subwriters, all_docids)) + } + fn read_level_0<'t>( + &self, + rtxn: &'t RoTxn, + field_id: u16, + handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>, + ) -> Result<()> { + // we read the elements one by one and + // 1. keep track of the left bound + // 2. fill the `bitmaps` vector to give it to level 1 once `level_group_size` elements were read + let mut bitmaps = vec![]; + + let mut level_0_prefix = vec![]; + level_0_prefix.extend_from_slice(&field_id.to_be_bytes()); + level_0_prefix.push(0); + + let level_0_iter = self + .db + .as_polymorph() + .prefix_iter::<_, ByteSlice, ByteSlice>(rtxn, level_0_prefix.as_slice())? 
+ .remap_types::, FacetGroupValueCodec>(); + + let mut left_bound: &[u8] = &[]; + let mut first_iteration_for_new_group = true; + for el in level_0_iter { + let (key, value) = el?; + let bound = key.left_bound; + let docids = value.bitmap; + + if first_iteration_for_new_group { + left_bound = bound; + first_iteration_for_new_group = false; + } + bitmaps.push(docids); + + if bitmaps.len() == self.group_size as usize { + handle_group(&bitmaps, left_bound)?; + first_iteration_for_new_group = true; + bitmaps.clear(); + } + } + // don't forget to give the leftover bitmaps as well + if !bitmaps.is_empty() { + handle_group(&bitmaps, left_bound)?; + bitmaps.clear(); + } + Ok(()) + } + + /// Compute the content of the database levels from its level 0 for the given field id. + /// + /// ## Returns: + /// A vector of grenad::Reader. The reader at index `i` corresponds to the elements of level `i + 1` + /// that must be inserted into the database. + fn compute_higher_levels<'t>( + &self, + rtxn: &'t RoTxn, + field_id: u16, + level: u8, + handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>, + ) -> Result>> { + if level == 0 { + self.read_level_0(rtxn, field_id, handle_group)?; + // Level 0 is already in the database + return Ok(vec![]); + } + // level >= 1 + // we compute each element of this level based on the elements of the level below it + // once we have computed `level_group_size` elements, we give the left bound + // of those elements, and their bitmaps, to the level above + + let mut cur_writer = create_writer(CompressionType::None, None, tempfile::tempfile()?); + let mut cur_writer_len: usize = 0; + + let mut group_sizes = vec![]; + let mut left_bounds = vec![]; + let mut bitmaps = vec![]; + + // compute the levels below + // in the callback, we fill `cur_writer` with the correct elements for this level + let mut sub_writers = self.compute_higher_levels( + rtxn, + field_id, + level - 1, + &mut |sub_bitmaps, left_bound| { + let mut combined_bitmap = 
RoaringBitmap::default(); + for bitmap in sub_bitmaps { + combined_bitmap |= bitmap; + } + group_sizes.push(sub_bitmaps.len() as u8); + left_bounds.push(left_bound); + + bitmaps.push(combined_bitmap); + if bitmaps.len() != self.group_size as usize { + return Ok(()); + } + let left_bound = left_bounds.first().unwrap(); + handle_group(&bitmaps, left_bound)?; + + for ((bitmap, left_bound), group_size) in + bitmaps.drain(..).zip(left_bounds.drain(..)).zip(group_sizes.drain(..)) + { + let key = FacetGroupKey { field_id, level, left_bound }; + let key = FacetGroupKeyCodec::::bytes_encode(&key) + .ok_or(Error::Encoding)?; + let value = FacetGroupValue { size: group_size, bitmap }; + let value = + FacetGroupValueCodec::bytes_encode(&value).ok_or(Error::Encoding)?; + cur_writer.insert(key, value)?; + cur_writer_len += 1; + } + Ok(()) + }, + )?; + // don't forget to insert the leftover elements into the writer as well + + // but only do so if the current number of elements to be inserted into this + // levelcould grow to the minimum level size + + if !bitmaps.is_empty() && (cur_writer_len >= self.min_level_size as usize - 1) { + // the length of bitmaps is between 0 and group_size + assert!(bitmaps.len() < self.group_size as usize); + assert!(cur_writer_len > 0); + + let left_bound = left_bounds.first().unwrap(); + handle_group(&bitmaps, left_bound)?; + + // Note: how many bitmaps are there here? 
+ for ((bitmap, left_bound), group_size) in + bitmaps.drain(..).zip(left_bounds.drain(..)).zip(group_sizes.drain(..)) + { + let key = FacetGroupKey { field_id, level, left_bound }; + let key = FacetGroupKeyCodec::::bytes_encode(&key) + .ok_or(Error::Encoding)?; + let value = FacetGroupValue { size: group_size, bitmap }; + let value = FacetGroupValueCodec::bytes_encode(&value).ok_or(Error::Encoding)?; + cur_writer.insert(key, value)?; + cur_writer_len += 1; + } + } + // if we inserted enough elements to reach the minimum level size, then we push the writer + if cur_writer_len as u8 >= self.min_level_size { + sub_writers.push(writer_into_reader(cur_writer)?); + } else { + // otherwise, if there are still leftover elements, we give them to the level above + // this is necessary in order to get the union of all docids + if !bitmaps.is_empty() { + handle_group(&bitmaps, left_bounds.first().unwrap())?; + } + } + return Ok(sub_writers); + } +} + +#[cfg(test)] +mod tests { + use std::iter::once; + + use roaring::RoaringBitmap; + + use crate::heed_codec::facet::OrderedF64Codec; + use crate::milli_snap; + use crate::update::facet::tests::FacetIndex; + + #[test] + fn insert() { + let test = |name: &str, group_size: u8, min_level_size: u8| { + let index = + FacetIndex::::new(group_size, 0 /*NA*/, min_level_size); + + let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new(); + for i in 0..1_000u32 { + // field id = 0, left_bound = i, docids = [i] + elements.push(((0, i as f64), once(i).collect())); + } + for i in 0..100u32 { + // field id = 1, left_bound = i, docids = [i] + elements.push(((1, i as f64), once(i).collect())); + } + let mut wtxn = index.env.write_txn().unwrap(); + index.bulk_insert(&mut wtxn, &[0, 1], elements.iter()); + + index.verify_structure_validity(&wtxn, 0); + index.verify_structure_validity(&wtxn, 1); + + wtxn.commit().unwrap(); + + milli_snap!(format!("{index}"), name); + }; + + test("default", 4, 5); + test("small_group_small_min_level", 2, 2); + 
test("small_group_large_min_level", 2, 128); + test("large_group_small_min_level", 16, 2); + test("odd_group_odd_min_level", 7, 3); + } + #[test] + fn insert_delete_field_insert() { + let test = |name: &str, group_size: u8, min_level_size: u8| { + let index = + FacetIndex::::new(group_size, 0 /*NA*/, min_level_size); + let mut wtxn = index.env.write_txn().unwrap(); + + let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new(); + for i in 0..100u32 { + // field id = 0, left_bound = i, docids = [i] + elements.push(((0, i as f64), once(i).collect())); + } + for i in 0..100u32 { + // field id = 1, left_bound = i, docids = [i] + elements.push(((1, i as f64), once(i).collect())); + } + index.bulk_insert(&mut wtxn, &[0, 1], elements.iter()); + + index.verify_structure_validity(&wtxn, 0); + index.verify_structure_validity(&wtxn, 1); + // delete all the elements for the facet id 0 + for i in 0..100u32 { + index.delete_single_docid(&mut wtxn, 0, &(i as f64), i); + } + index.verify_structure_validity(&wtxn, 0); + index.verify_structure_validity(&wtxn, 1); + + let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new(); + // then add some elements again for the facet id 1 + for i in 0..110u32 { + // field id = 1, left_bound = i, docids = [i] + elements.push(((1, i as f64), once(i).collect())); + } + index.verify_structure_validity(&wtxn, 0); + index.verify_structure_validity(&wtxn, 1); + index.bulk_insert(&mut wtxn, &[0, 1], elements.iter()); + + wtxn.commit().unwrap(); + + milli_snap!(format!("{index}"), name); + }; + + test("default", 4, 5); + test("small_group_small_min_level", 2, 2); + test("small_group_large_min_level", 2, 128); + test("large_group_small_min_level", 16, 2); + test("odd_group_odd_min_level", 7, 3); + } +} diff --git a/milli/src/update/facet/delete.rs b/milli/src/update/facet/delete.rs new file mode 100644 index 000000000..2bc54c7c1 --- /dev/null +++ b/milli/src/update/facet/delete.rs @@ -0,0 +1,239 @@ +use std::collections::{HashMap, HashSet}; + +use 
heed::RwTxn; +use log::debug; +use roaring::RoaringBitmap; +use time::OffsetDateTime; + +use super::{FACET_GROUP_SIZE, FACET_MAX_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; +use crate::facet::FacetType; +use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::ByteSliceRefCodec; +use crate::update::{FacetsUpdateBulk, FacetsUpdateIncrementalInner}; +use crate::{FieldId, Index, Result}; + +/// A builder used to remove elements from the `facet_id_string_docids` or `facet_id_f64_docids` databases. +/// +/// Depending on the number of removed elements and the existing size of the database, we use either +/// a bulk delete method or an incremental delete method. +pub struct FacetsDelete<'i, 'b> { + index: &'i Index, + database: heed::Database, FacetGroupValueCodec>, + facet_type: FacetType, + affected_facet_values: HashMap>>, + docids_to_delete: &'b RoaringBitmap, + group_size: u8, + max_group_size: u8, + min_level_size: u8, +} +impl<'i, 'b> FacetsDelete<'i, 'b> { + pub fn new( + index: &'i Index, + facet_type: FacetType, + affected_facet_values: HashMap>>, + docids_to_delete: &'b RoaringBitmap, + ) -> Self { + let database = match facet_type { + FacetType::String => index + .facet_id_string_docids + .remap_key_type::>(), + FacetType::Number => { + index.facet_id_f64_docids.remap_key_type::>() + } + }; + Self { + index, + database, + facet_type, + affected_facet_values, + docids_to_delete, + group_size: FACET_GROUP_SIZE, + max_group_size: FACET_MAX_GROUP_SIZE, + min_level_size: FACET_MIN_LEVEL_SIZE, + } + } + + pub fn execute(self, wtxn: &mut RwTxn) -> Result<()> { + debug!("Computing and writing the facet values levels docids into LMDB on disk..."); + self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; + + for (field_id, affected_facet_values) in self.affected_facet_values { + // This is an incorrect condition, since we assume that the length of the database is equal + // to the number of facet values for the 
given field_id. It means that in some cases, we might + // wrongly choose the incremental indexer over the bulk indexer. But the only case where that could + // really be a performance problem is when we fully delete a large ratio of all facet values for + // each field id. This would almost never happen. Still, to be overly cautious, I have added a + // 2x penalty to the incremental indexer. That is, instead of assuming a 70x worst-case performance + // penalty to the incremental indexer, we assume a 150x worst-case performance penalty instead. + if affected_facet_values.len() >= (self.database.len(wtxn)? / 150) { + // Bulk delete + let mut modified = false; + + for facet_value in affected_facet_values { + let key = + FacetGroupKey { field_id, level: 0, left_bound: facet_value.as_slice() }; + let mut old = self.database.get(wtxn, &key)?.unwrap(); + let previous_len = old.bitmap.len(); + old.bitmap -= self.docids_to_delete; + if old.bitmap.is_empty() { + modified = true; + self.database.delete(wtxn, &key)?; + } else if old.bitmap.len() != previous_len { + modified = true; + self.database.put(wtxn, &key, &old)?; + } + } + if modified { + let builder = FacetsUpdateBulk::new_not_updating_level_0( + self.index, + vec![field_id], + self.facet_type, + ); + builder.execute(wtxn)?; + } + } else { + // Incremental + let inc = FacetsUpdateIncrementalInner { + db: self.database, + group_size: self.group_size, + min_level_size: self.min_level_size, + max_group_size: self.max_group_size, + }; + for facet_value in affected_facet_values { + inc.delete(wtxn, field_id, facet_value.as_slice(), &self.docids_to_delete)?; + } + } + } + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use std::iter::FromIterator; + + use big_s::S; + use maplit::hashset; + use roaring::RoaringBitmap; + + use crate::db_snap; + use crate::documents::documents_batch_reader_from_objects; + use crate::index::tests::TempIndex; + use crate::update::DeleteDocuments; + + #[test] + fn 
delete_mixed_incremental_and_bulk() { + // The point of this test is to create an index populated with documents + // containing different filterable attributes. Then, we delete a bunch of documents + // such that a mix of the incremental and bulk indexer is used (depending on the field id) + let index = TempIndex::new_with_map_size(4096 * 1000 * 100); + + index + .update_settings(|settings| { + settings.set_filterable_fields( + hashset! { S("id"), S("label"), S("timestamp"), S("colour") }, + ); + }) + .unwrap(); + + let mut documents = vec![]; + for i in 0..1000 { + documents.push( + serde_json::json! { + { + "id": i, + "label": i / 10, + "colour": i / 100, + "timestamp": i / 2, + } + } + .as_object() + .unwrap() + .clone(), + ); + } + + let documents = documents_batch_reader_from_objects(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, facet_id_f64_docids, 1); + db_snap!(index, number_faceted_documents_ids, 1); + + let mut wtxn = index.env.write_txn().unwrap(); + + let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + builder.disable_soft_deletion(true); + builder.delete_documents(&RoaringBitmap::from_iter(0..100)); + // by deleting the first 100 documents, we expect that: + // - the "id" part of the DB will be updated in bulk, since #affected_facet_value = 100 which is > database_len / 150 (= 13) + // - the "label" part will be updated incrementally, since #affected_facet_value = 10 which is < 13 + // - the "colour" part will also be updated incrementally, since #affected_values = 1 which is < 13 + // - the "timestamp" part will be updated in bulk, since #affected_values = 50 which is > 13 + // This has to be verified manually by inserting breakpoint/adding print statements to the code when running the test + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, soft_deleted_documents_ids, @"[]"); + db_snap!(index, facet_id_f64_docids, 2); + db_snap!(index, number_faceted_documents_ids, 2); + } +} + 
+#[allow(unused)] +#[cfg(test)] +mod comparison_bench { + use std::iter::once; + + use rand::Rng; + use roaring::RoaringBitmap; + + use crate::heed_codec::facet::OrderedF64Codec; + use crate::update::facet::tests::FacetIndex; + + // This is a simple test to get an intuition on the relative speed + // of the incremental vs. bulk indexer. + // + // The benchmark shows the worst-case scenario for the incremental indexer, since + // each facet value contains only one document ID. + // + // In that scenario, it appears that the incremental indexer is about 70 times slower than the + // bulk indexer. + // #[test] + fn benchmark_facet_indexing_delete() { + let mut r = rand::thread_rng(); + + for i in 1..=20 { + let size = 50_000 * i; + let index = FacetIndex::::new(4, 8, 5); + + let mut txn = index.env.write_txn().unwrap(); + let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new(); + for i in 0..size { + // field id = 0, left_bound = i, docids = [i] + elements.push(((0, i as f64), once(i).collect())); + } + let timer = std::time::Instant::now(); + index.bulk_insert(&mut txn, &[0], elements.iter()); + let time_spent = timer.elapsed().as_millis(); + println!("bulk {size} : {time_spent}ms"); + + txn.commit().unwrap(); + + for nbr_doc in [1, 100, 1000, 10_000] { + let mut txn = index.env.write_txn().unwrap(); + let timer = std::time::Instant::now(); + // + // delete one document + // + for _ in 0..nbr_doc { + let deleted_u32 = r.gen::() % size; + let deleted_f64 = deleted_u32 as f64; + index.delete_single_docid(&mut txn, 0, &deleted_f64, deleted_u32) + } + let time_spent = timer.elapsed().as_millis(); + println!(" delete {nbr_doc} : {time_spent}ms"); + txn.abort().unwrap(); + } + } + } +} diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs new file mode 100644 index 000000000..2558c81a3 --- /dev/null +++ b/milli/src/update/facet/incremental.rs @@ -0,0 +1,1412 @@ +use std::collections::HashMap; +use std::fs::File; + +use 
heed::types::{ByteSlice, DecodeIgnore}; +use heed::{BytesDecode, Error, RoTxn, RwTxn}; +use roaring::RoaringBitmap; + +use crate::facet::FacetType; +use crate::heed_codec::facet::{ + FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, +}; +use crate::heed_codec::ByteSliceRefCodec; +use crate::search::facet::get_highest_level; +use crate::{CboRoaringBitmapCodec, FieldId, Index, Result}; + +enum InsertionResult { + InPlace, + Expand, + Insert, +} +enum DeletionResult { + InPlace, + Reduce { next: Option> }, + Remove { next: Option> }, +} + +/// Algorithm to incrementally insert and delete elements into the +/// `facet_id_(string/f64)_docids` databases. +/// +/// The `faceted_documents_ids` value in the main database of `Index` +/// is also updated to contain the new set of faceted documents. +pub struct FacetsUpdateIncremental<'i> { + index: &'i Index, + inner: FacetsUpdateIncrementalInner, + facet_type: FacetType, + new_data: grenad::Reader, +} + +impl<'i> FacetsUpdateIncremental<'i> { + pub fn new( + index: &'i Index, + facet_type: FacetType, + new_data: grenad::Reader, + group_size: u8, + min_level_size: u8, + max_group_size: u8, + ) -> Self { + FacetsUpdateIncremental { + index, + inner: FacetsUpdateIncrementalInner { + db: match facet_type { + FacetType::String => index + .facet_id_string_docids + .remap_key_type::>(), + FacetType::Number => index + .facet_id_f64_docids + .remap_key_type::>(), + }, + group_size, + max_group_size, + min_level_size, + }, + facet_type, + new_data, + } + } + + pub fn execute(self, wtxn: &'i mut RwTxn) -> crate::Result<()> { + let mut new_faceted_docids = HashMap::::default(); + + let mut cursor = self.new_data.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? 
{ + let key = FacetGroupKeyCodec::::bytes_decode(key) + .ok_or(heed::Error::Encoding)?; + let docids = CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?; + self.inner.insert(wtxn, key.field_id, key.left_bound, &docids)?; + *new_faceted_docids.entry(key.field_id).or_default() |= docids; + } + + for (field_id, new_docids) in new_faceted_docids { + let mut docids = self.index.faceted_documents_ids(wtxn, field_id, self.facet_type)?; + docids |= new_docids; + self.index.put_faceted_documents_ids(wtxn, field_id, self.facet_type, &docids)?; + } + Ok(()) + } +} + +/// Implementation of `FacetsUpdateIncremental` that is independent of milli's `Index` type +pub struct FacetsUpdateIncrementalInner { + pub db: heed::Database, FacetGroupValueCodec>, + pub group_size: u8, + pub min_level_size: u8, + pub max_group_size: u8, +} +impl FacetsUpdateIncrementalInner { + /// Find the `FacetGroupKey`/`FacetGroupValue` in the database that + /// should be used to insert the new `facet_value` for the given `field_id` and `level` + /// where `level` must be strictly greater than 0. + /// + /// For example, when inserting the facet value `4`, there are two possibilities: + /// + /// 1. We find a key whose lower bound is 3 followed by a key whose lower bound is 6. Therefore, + /// we know that the implicit range of the first key is 3..6, which contains 4. + /// So the new facet value belongs in that first key/value pair. + /// + /// 2. The first key of the level has a lower bound of `5`. We return this key/value pair + /// but will need to change the lowerbound of this key to `4` in order to insert this facet value. 
+ fn find_insertion_key_value( + &self, + field_id: u16, + level: u8, + facet_value: &[u8], + txn: &RoTxn, + ) -> Result<(FacetGroupKey>, FacetGroupValue)> { + assert!(level > 0); + + let mut prefix = vec![]; + prefix.extend_from_slice(&field_id.to_be_bytes()); + prefix.push(level); + prefix.extend_from_slice(facet_value); + + let mut prefix_iter = self + .db + .as_polymorph() + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(txn, prefix.as_slice())?; + if let Some(e) = prefix_iter.next() { + let (key_bytes, value) = e?; + Ok(( + FacetGroupKeyCodec::::bytes_decode(&key_bytes) + .ok_or(Error::Encoding)? + .into_owned(), + value, + )) + } else { + let key = FacetGroupKey { field_id, level, left_bound: facet_value }; + match self.db.get_lower_than(txn, &key)? { + Some((key, value)) => { + if key.level != level { + let mut prefix = vec![]; + prefix.extend_from_slice(&field_id.to_be_bytes()); + prefix.push(level); + + let mut iter = self + .db + .as_polymorph() + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>( + txn, + &prefix.as_slice(), + )?; + let (key_bytes, value) = iter.next().unwrap()?; + Ok(( + FacetGroupKeyCodec::::bytes_decode(&key_bytes) + .ok_or(Error::Encoding)? 
+ .into_owned(), + value, + )) + } else { + Ok((key.into_owned(), value)) + } + } + None => panic!(), + } + } + } + + /// Insert the given facet value and corresponding document ids in the level 0 of the database + /// + /// ## Return + /// See documentation of `insert_in_level` + fn insert_in_level_0<'t>( + &self, + txn: &'t mut RwTxn, + field_id: u16, + facet_value: &[u8], + docids: &RoaringBitmap, + ) -> Result { + let key = FacetGroupKey { field_id, level: 0, left_bound: facet_value }; + let value = FacetGroupValue { bitmap: docids.clone(), size: 1 }; + + let mut level0_prefix = vec![]; + level0_prefix.extend_from_slice(&field_id.to_be_bytes()); + level0_prefix.push(0); + + let mut iter = self + .db + .as_polymorph() + .prefix_iter::<_, ByteSlice, DecodeIgnore>(&txn, &level0_prefix)?; + + if iter.next().is_none() { + drop(iter); + self.db.put(txn, &key, &value)?; + return Ok(InsertionResult::Insert); + } else { + drop(iter); + let old_value = self.db.get(&txn, &key)?; + match old_value { + Some(mut updated_value) => { + // now merge the two + updated_value.bitmap |= value.bitmap; + self.db.put(txn, &key, &updated_value)?; + Ok(InsertionResult::InPlace) + } + None => { + self.db.put(txn, &key, &value)?; + Ok(InsertionResult::Insert) + } + } + } + } + + /// Insert the given facet value and corresponding document ids in all the levels of the database up to the given `level`. + /// This function works recursively. + /// + /// ## Return + /// Returns the effect of adding the facet value to the database on the given `level`. + /// + /// - `InsertionResult::InPlace` means that inserting the `facet_value` into the `level` did not have + /// an effect on the number of keys in that level. Therefore, it did not increase the number of children + /// of the parent node. 
+ /// + /// - `InsertionResult::Insert` means that inserting the `facet_value` into the `level` resulted + /// in the addition of a new key in that level, and that therefore the number of children + /// of the parent node should be incremented. + fn insert_in_level<'t>( + &self, + txn: &'t mut RwTxn, + field_id: u16, + level: u8, + facet_value: &[u8], + docids: &RoaringBitmap, + ) -> Result { + if level == 0 { + return self.insert_in_level_0(txn, field_id, facet_value, docids); + } + + let max_group_size = self.max_group_size; + + let result = self.insert_in_level(txn, field_id, level - 1, facet_value.clone(), docids)?; + // level below inserted an element + + let (insertion_key, insertion_value) = + self.find_insertion_key_value(field_id, level, facet_value, txn)?; + + match result { + // because we know that we inserted in place, the facet_value is not a new one + // thus it doesn't extend a group, and thus the insertion key computed above is + // still correct + InsertionResult::InPlace => { + let mut updated_value = insertion_value; + updated_value.bitmap |= docids; + self.db.put(txn, &insertion_key.as_ref(), &updated_value)?; + + return Ok(InsertionResult::InPlace); + } + InsertionResult::Expand => {} + InsertionResult::Insert => {} + } + + // Here we know that inserting the facet value in the level below resulted in the creation + // of a new key. Therefore, it may be the case that we need to modify the left bound of the + // insertion key (see documentation of `find_insertion_key_value` for an example of when that + // could happen). 
+ let (insertion_key, insertion_key_was_modified) = { + let mut new_insertion_key = insertion_key.clone(); + let mut key_should_be_modified = false; + + if facet_value < insertion_key.left_bound.as_slice() { + new_insertion_key.left_bound = facet_value.to_vec(); + key_should_be_modified = true; + } + if key_should_be_modified { + let is_deleted = self.db.delete(txn, &insertion_key.as_ref())?; + assert!(is_deleted); + self.db.put(txn, &new_insertion_key.as_ref(), &insertion_value)?; + } + (new_insertion_key, key_should_be_modified) + }; + // Now we know that the insertion key contains the `facet_value`. + + // We still need to update the insertion value by: + // 1. Incrementing the number of children (since the recursive call returned `InsertionResult::Insert`) + // 2. Merge the previous docids with the new one + let mut updated_value = insertion_value; + + if matches!(result, InsertionResult::Insert) { + updated_value.size += 1; + } + + if updated_value.size < max_group_size { + updated_value.bitmap |= docids; + self.db.put(txn, &insertion_key.as_ref(), &updated_value)?; + if insertion_key_was_modified { + return Ok(InsertionResult::Expand); + } else { + return Ok(InsertionResult::InPlace); + } + } + + // We've increased the group size of the value and realised it has become greater than or equal to `max_group_size` + // Therefore it must be split into two nodes. 
+ + let size_left = updated_value.size / 2; + let size_right = updated_value.size - size_left; + + let level_below = level - 1; + + let start_key = FacetGroupKey { + field_id, + level: level_below, + left_bound: insertion_key.left_bound.as_slice(), + }; + + let mut iter = + self.db.range(&txn, &(start_key..))?.take((size_left as usize) + (size_right as usize)); + + let group_left = { + let mut values_left = RoaringBitmap::new(); + + let mut i = 0; + while let Some(next) = iter.next() { + let (_key, value) = next?; + i += 1; + values_left |= &value.bitmap; + if i == size_left { + break; + } + } + + let key = + FacetGroupKey { field_id, level, left_bound: insertion_key.left_bound.clone() }; + let value = FacetGroupValue { size: size_left as u8, bitmap: values_left }; + (key, value) + }; + + let group_right = { + let ( + FacetGroupKey { left_bound: right_left_bound, .. }, + FacetGroupValue { bitmap: mut values_right, .. }, + ) = iter.next().unwrap()?; + + while let Some(next) = iter.next() { + let (_, value) = next?; + values_right |= &value.bitmap; + } + + let key = FacetGroupKey { field_id, level, left_bound: right_left_bound.to_vec() }; + let value = FacetGroupValue { size: size_right as u8, bitmap: values_right }; + (key, value) + }; + drop(iter); + + let _ = self.db.delete(txn, &insertion_key.as_ref())?; + + self.db.put(txn, &group_left.0.as_ref(), &group_left.1)?; + self.db.put(txn, &group_right.0.as_ref(), &group_right.1)?; + + Ok(InsertionResult::Insert) + } + + /// Insert the given facet value and corresponding document ids in the database. 
+ pub fn insert<'a, 't>( + &self, + txn: &'t mut RwTxn, + field_id: u16, + facet_value: &[u8], + docids: &RoaringBitmap, + ) -> Result<()> { + if docids.is_empty() { + return Ok(()); + } + let group_size = self.group_size; + + let highest_level = get_highest_level(&txn, self.db, field_id)?; + + let result = + self.insert_in_level(txn, field_id, highest_level as u8, facet_value, docids)?; + match result { + InsertionResult::InPlace => return Ok(()), + InsertionResult::Expand => return Ok(()), + InsertionResult::Insert => {} + } + + // Here we check whether the highest level has exceeded `min_level_size` * `self.group_size`. + // If it has, we must build an addition level above it. + + let mut highest_level_prefix = vec![]; + highest_level_prefix.extend_from_slice(&field_id.to_be_bytes()); + highest_level_prefix.push(highest_level); + + let size_highest_level = self + .db + .as_polymorph() + .prefix_iter::<_, ByteSlice, ByteSlice>(&txn, &highest_level_prefix)? + .count(); + + if size_highest_level < self.group_size as usize * self.min_level_size as usize { + return Ok(()); + } + + let mut groups_iter = self + .db + .as_polymorph() + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(&txn, &highest_level_prefix)?; + + let nbr_new_groups = size_highest_level / self.group_size as usize; + let nbr_leftover_elements = size_highest_level % self.group_size as usize; + + let mut to_add = vec![]; + for _ in 0..nbr_new_groups { + let mut first_key = None; + let mut values = RoaringBitmap::new(); + for _ in 0..group_size { + let (key_bytes, value_i) = groups_iter.next().unwrap()?; + let key_i = FacetGroupKeyCodec::::bytes_decode(&key_bytes) + .ok_or(Error::Encoding)?; + + if first_key.is_none() { + first_key = Some(key_i); + } + values |= value_i.bitmap; + } + let key = FacetGroupKey { + field_id, + level: highest_level + 1, + left_bound: first_key.unwrap().left_bound, + }; + let value = FacetGroupValue { size: group_size as u8, bitmap: values }; + 
to_add.push((key.into_owned(), value)); + } + // now we add the rest of the level, in case its size is > group_size * min_level_size + // this can indeed happen if the min_level_size parameter changes between two calls to `insert` + if nbr_leftover_elements > 0 { + let mut first_key = None; + let mut values = RoaringBitmap::new(); + for _ in 0..nbr_leftover_elements { + let (key_bytes, value_i) = groups_iter.next().unwrap()?; + let key_i = FacetGroupKeyCodec::::bytes_decode(&key_bytes) + .ok_or(Error::Encoding)?; + + if first_key.is_none() { + first_key = Some(key_i); + } + values |= value_i.bitmap; + } + let key = FacetGroupKey { + field_id, + level: highest_level + 1, + left_bound: first_key.unwrap().left_bound, + }; + let value = FacetGroupValue { size: nbr_leftover_elements as u8, bitmap: values }; + to_add.push((key.into_owned(), value)); + } + + drop(groups_iter); + for (key, value) in to_add { + self.db.put(txn, &key.as_ref(), &value)?; + } + Ok(()) + } + + /// Delete the given document id from the given facet value in the database, from level 0 to + /// the given level. + /// + /// ## Return + /// Returns the effect of removing the document id from the database on the given `level`. + /// + /// - `DeletionResult::InPlace` means that deleting the document id did not have + /// an effect on the keys in that level. + /// + /// - `DeletionResult::Remove` means that deleting the document id resulted in a change in the + /// number of keys in the level. For example, removing a document id from the facet value `3` could + /// cause it to have no corresponding document in level 0 anymore, and therefore the key was deleted + /// entirely. In that case, `DeletionResult::Remove` is returned. The parent of the deleted key must + /// then adjust its group size. If its group size falls to 0, then it will need to be deleted as well. 
+ /// + /// - `DeletionResult::Reduce` means that deleting the document id resulted in a change in the + /// bounds of the keys of the level. For example, removing a document id from the facet value + /// `3` might have caused the facet value `3` to have no corresponding document in level 0. Therefore, + /// in level 1, the key with the left bound `3` had to be changed to the next facet value (e.g. 4). + /// In that case `DeletionResult::Reduce` is returned. The parent of the reduced key may need to adjust + /// its left bound as well. + fn delete_in_level<'t>( + &self, + txn: &'t mut RwTxn, + field_id: u16, + level: u8, + facet_value: &[u8], + docids: &RoaringBitmap, + ) -> Result { + if level == 0 { + return self.delete_in_level_0(txn, field_id, facet_value, docids); + } + let (deletion_key, mut bitmap) = + self.find_insertion_key_value(field_id, level, facet_value, txn)?; + + let result = self.delete_in_level(txn, field_id, level - 1, facet_value.clone(), docids)?; + + let mut decrease_size = false; + let next_key = match result { + DeletionResult::InPlace => { + bitmap.bitmap -= docids; + self.db.put(txn, &deletion_key.as_ref(), &bitmap)?; + return Ok(DeletionResult::InPlace); + } + DeletionResult::Reduce { next } => next, + DeletionResult::Remove { next } => { + decrease_size = true; + next + } + }; + // If either DeletionResult::Reduce or DeletionResult::Remove was returned, + // then we may need to adjust the left_bound of the deletion key. + + // If DeletionResult::Remove was returned, then we need to decrease the group + // size of the deletion key. 
+ let mut updated_value = bitmap; + if decrease_size { + updated_value.size -= 1; + } + + if updated_value.size == 0 { + self.db.delete(txn, &deletion_key.as_ref())?; + Ok(DeletionResult::Remove { next: next_key }) + } else { + let mut updated_deletion_key = deletion_key.clone(); + let reduced_range = facet_value == deletion_key.left_bound; + if reduced_range { + updated_deletion_key.left_bound = next_key.clone().unwrap(); + } + updated_value.bitmap -= docids; + let _ = self.db.delete(txn, &deletion_key.as_ref())?; + self.db.put(txn, &updated_deletion_key.as_ref(), &updated_value)?; + if reduced_range { + Ok(DeletionResult::Reduce { next: next_key }) + } else { + Ok(DeletionResult::InPlace) + } + } + } + + fn delete_in_level_0<'t>( + &self, + txn: &'t mut RwTxn, + field_id: u16, + facet_value: &[u8], + docids: &RoaringBitmap, + ) -> Result { + let key = FacetGroupKey { field_id, level: 0, left_bound: facet_value }; + let mut bitmap = self.db.get(&txn, &key)?.unwrap().bitmap; + bitmap -= docids; + + if bitmap.is_empty() { + let mut next_key = None; + if let Some((next, _)) = + self.db.remap_data_type::().get_greater_than(&txn, &key)? + { + if next.field_id == field_id && next.level == 0 { + next_key = Some(next.left_bound.to_vec()); + } + } + self.db.delete(txn, &key)?; + Ok(DeletionResult::Remove { next: next_key }) + } else { + self.db.put(txn, &key, &FacetGroupValue { size: 1, bitmap })?; + Ok(DeletionResult::InPlace) + } + } + + pub fn delete<'a, 't>( + &self, + txn: &'t mut RwTxn, + field_id: u16, + facet_value: &[u8], + docids: &RoaringBitmap, + ) -> Result<()> { + if self + .db + .remap_data_type::() + .get(txn, &FacetGroupKey { field_id, level: 0, left_bound: facet_value })? 
+ .is_none() + { + return Ok(()); + } + let highest_level = get_highest_level(&txn, self.db, field_id)?; + + let result = + self.delete_in_level(txn, field_id, highest_level as u8, facet_value, docids)?; + match result { + DeletionResult::InPlace => return Ok(()), + DeletionResult::Reduce { .. } => return Ok(()), + DeletionResult::Remove { .. } => {} + } + + // if we either removed a key from the highest level, its size may have fallen + // below `min_level_size`, in which case we need to remove the entire level + + let mut highest_level_prefix = vec![]; + highest_level_prefix.extend_from_slice(&field_id.to_be_bytes()); + highest_level_prefix.push(highest_level); + + if highest_level == 0 + || self + .db + .as_polymorph() + .prefix_iter::<_, ByteSlice, ByteSlice>(&txn, &highest_level_prefix)? + .count() + >= self.min_level_size as usize + { + return Ok(()); + } + let mut to_delete = vec![]; + let mut iter = self + .db + .as_polymorph() + .prefix_iter::<_, ByteSlice, ByteSlice>(txn, &highest_level_prefix)?; + while let Some(el) = iter.next() { + let (k, _) = el?; + to_delete.push( + FacetGroupKeyCodec::::bytes_decode(k) + .ok_or(Error::Encoding)? 
+ .into_owned(), + ); + } + drop(iter); + for k in to_delete { + self.db.delete(txn, &k.as_ref())?; + } + Ok(()) + } +} + +impl<'a> FacetGroupKey<&'a [u8]> { + pub fn into_owned(self) -> FacetGroupKey> { + FacetGroupKey { + field_id: self.field_id, + level: self.level, + left_bound: self.left_bound.to_vec(), + } + } +} + +impl<'a> FacetGroupKey> { + pub fn as_ref(&self) -> FacetGroupKey<&[u8]> { + FacetGroupKey { + field_id: self.field_id, + level: self.level, + left_bound: self.left_bound.as_slice(), + } + } +} + +#[cfg(test)] +mod tests { + use rand::seq::SliceRandom; + use rand::{Rng, SeedableRng}; + use roaring::RoaringBitmap; + + use crate::heed_codec::facet::OrderedF64Codec; + use crate::heed_codec::StrRefCodec; + use crate::milli_snap; + use crate::update::facet::tests::FacetIndex; + + #[test] + fn append() { + let index = FacetIndex::::new(4, 8, 5); + for i in 0..256u16 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + let mut txn = index.env.write_txn().unwrap(); + index.insert(&mut txn, 0, &(i as f64), &bitmap); + txn.commit().unwrap(); + } + let txn = index.env.read_txn().unwrap(); + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}")); + } + #[test] + fn many_field_ids_append() { + let index = FacetIndex::::new(4, 8, 5); + for i in 0..256u16 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + let mut txn = index.env.write_txn().unwrap(); + index.insert(&mut txn, 0, &(i as f64), &bitmap); + txn.commit().unwrap(); + } + for i in 0..256u16 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + let mut txn = index.env.write_txn().unwrap(); + index.insert(&mut txn, 2, &(i as f64), &bitmap); + txn.commit().unwrap(); + } + for i in 0..256u16 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + let mut txn = index.env.write_txn().unwrap(); + index.insert(&mut txn, 1, &(i as f64), &bitmap); + txn.commit().unwrap(); + } + let txn = 
index.env.read_txn().unwrap(); + index.verify_structure_validity(&txn, 0); + index.verify_structure_validity(&txn, 1); + index.verify_structure_validity(&txn, 2); + txn.commit().unwrap(); + milli_snap!(format!("{index}")); + } + #[test] + fn many_field_ids_prepend() { + let index = FacetIndex::::new(4, 8, 5); + for i in (0..256).into_iter().rev() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + let mut txn = index.env.write_txn().unwrap(); + index.insert(&mut txn, 0, &(i as f64), &bitmap); + txn.commit().unwrap(); + } + for i in (0..256).into_iter().rev() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + let mut txn = index.env.write_txn().unwrap(); + index.insert(&mut txn, 2, &(i as f64), &bitmap); + txn.commit().unwrap(); + } + for i in (0..256).into_iter().rev() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i as u32); + let mut txn = index.env.write_txn().unwrap(); + index.insert(&mut txn, 1, &(i as f64), &bitmap); + txn.commit().unwrap(); + } + let txn = index.env.read_txn().unwrap(); + index.verify_structure_validity(&txn, 0); + index.verify_structure_validity(&txn, 1); + index.verify_structure_validity(&txn, 2); + txn.commit().unwrap(); + milli_snap!(format!("{index}")); + } + + #[test] + fn prepend() { + let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + + for i in (0..256).into_iter().rev() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i); + index.insert(&mut txn, 0, &(i as f64), &bitmap); + } + + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}")); + } + + #[test] + fn shuffled() { + let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + + let mut keys = (0..256).into_iter().collect::>(); + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); + keys.shuffle(&mut rng); + + for (_i, key) in keys.into_iter().enumerate() { + let mut bitmap = RoaringBitmap::new(); + 
bitmap.insert(key); + index.insert(&mut txn, 0, &(key as f64), &bitmap); + } + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}")); + } + + #[test] + fn merge_values() { + let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + + let mut keys = (0..256).into_iter().collect::>(); + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); + keys.shuffle(&mut rng); + + for (_i, key) in keys.into_iter().enumerate() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(key); + bitmap.insert(rng.gen_range(256..512)); + index.verify_structure_validity(&txn, 0); + index.insert(&mut txn, 0, &(key as f64), &bitmap); + } + + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}")); + } + + #[test] + fn delete_from_end() { + let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + for i in 0..256 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i); + index.verify_structure_validity(&txn, 0); + index.insert(&mut txn, 0, &(&(i as f64)), &bitmap); + } + + for i in (200..256).into_iter().rev() { + index.verify_structure_validity(&txn, 0); + index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); + } + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}"), 200); + let mut txn = index.env.write_txn().unwrap(); + + for i in (150..200).into_iter().rev() { + index.verify_structure_validity(&txn, 0); + index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); + } + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}"), 150); + let mut txn = index.env.write_txn().unwrap(); + for i in (100..150).into_iter().rev() { + index.verify_structure_validity(&txn, 0); + index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); + } + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + 
milli_snap!(format!("{index}"), 100); + let mut txn = index.env.write_txn().unwrap(); + for i in (17..100).into_iter().rev() { + index.verify_structure_validity(&txn, 0); + index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); + } + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}"), 17); + let mut txn = index.env.write_txn().unwrap(); + for i in (15..17).into_iter().rev() { + index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); + } + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}"), 15); + let mut txn = index.env.write_txn().unwrap(); + for i in (0..15).into_iter().rev() { + index.verify_structure_validity(&txn, 0); + index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); + } + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}"), 0); + } + + #[test] + fn delete_from_start() { + let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + + for i in 0..256 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i); + index.verify_structure_validity(&txn, 0); + index.insert(&mut txn, 0, &(i as f64), &bitmap); + } + + for i in 0..128 { + index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); + } + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}"), 127); + let mut txn = index.env.write_txn().unwrap(); + for i in 128..216 { + index.verify_structure_validity(&txn, 0); + index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); + } + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}"), 215); + let mut txn = index.env.write_txn().unwrap(); + for i in 216..256 { + index.verify_structure_validity(&txn, 0); + index.delete_single_docid(&mut txn, 0, &(i as f64), i as u32); + } + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + 
milli_snap!(format!("{index}"), 255); + } + + #[test] + fn delete_shuffled() { + let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + for i in 0..256 { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(i); + index.verify_structure_validity(&txn, 0); + index.insert(&mut txn, 0, &(i as f64), &bitmap); + } + + let mut keys = (0..256).into_iter().collect::>(); + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); + keys.shuffle(&mut rng); + + for i in 0..128 { + let key = keys[i]; + index.verify_structure_validity(&txn, 0); + index.delete_single_docid(&mut txn, 0, &(key as f64), key as u32); + } + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}"), 127); + let mut txn = index.env.write_txn().unwrap(); + for i in 128..216 { + let key = keys[i]; + index.verify_structure_validity(&txn, 0); + index.delete_single_docid(&mut txn, 0, &(key as f64), key as u32); + } + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + let mut txn = index.env.write_txn().unwrap(); + milli_snap!(format!("{index}"), 215); + for i in 216..256 { + let key = keys[i]; + index.verify_structure_validity(&txn, 0); + index.delete_single_docid(&mut txn, 0, &(key as f64), key as u32); + } + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}"), 255); + } + + #[test] + fn in_place_level0_insert() { + let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + + let mut keys = (0..16).into_iter().collect::>(); + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); + keys.shuffle(&mut rng); + for i in 0..4 { + for &key in keys.iter() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(rng.gen_range(i * 256..(i + 1) * 256)); + index.verify_structure_validity(&txn, 0); + index.insert(&mut txn, 0, &(key as f64), &bitmap); + } + } + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + 
milli_snap!(format!("{index}")); + } + + #[test] + fn in_place_level0_delete() { + let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + + let mut keys = (0..64).into_iter().collect::>(); + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); + keys.shuffle(&mut rng); + + for &key in keys.iter() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(key); + bitmap.insert(key + 100); + index.verify_structure_validity(&txn, 0); + + index.insert(&mut txn, 0, &(key as f64), &bitmap); + } + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}"), "before_delete"); + + let mut txn = index.env.write_txn().unwrap(); + + for &key in keys.iter() { + index.verify_structure_validity(&txn, 0); + index.delete_single_docid(&mut txn, 0, &(key as f64), key + 100); + } + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}"), "after_delete"); + } + + #[test] + fn shuffle_merge_string_and_delete() { + let index = FacetIndex::::new(4, 8, 5); + let mut txn = index.env.write_txn().unwrap(); + + let mut keys = (1000..1064).into_iter().collect::>(); + let mut rng = rand::rngs::SmallRng::from_seed([0; 32]); + keys.shuffle(&mut rng); + + for &key in keys.iter() { + let mut bitmap = RoaringBitmap::new(); + bitmap.insert(key); + bitmap.insert(key + 100); + index.verify_structure_validity(&txn, 0); + index.insert(&mut txn, 0, &format!("{key:x}").as_str(), &bitmap); + } + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}"), "before_delete"); + + let mut txn = index.env.write_txn().unwrap(); + + for &key in keys.iter() { + index.verify_structure_validity(&txn, 0); + index.delete_single_docid(&mut txn, 0, &format!("{key:x}").as_str(), key + 100); + } + index.verify_structure_validity(&txn, 0); + txn.commit().unwrap(); + milli_snap!(format!("{index}"), "after_delete"); + } +} + +// fuzz tests +#[cfg(all(test, fuzzing))] 
+/**
+Fuzz test for the incremental indexer.
+
+The fuzz test uses fuzzcheck, a coverage-guided fuzzer.
+See https://github.com/loiclec/fuzzcheck-rs and https://fuzzcheck.neocities.org
+for more information.
+
+It is only run when using the `cargo fuzzcheck` command line tool, which can be installed with:
+```sh
+cargo install cargo-fuzzcheck
+```
+To start the fuzz test, run (from the base folder or from milli/):
+```sh
+cargo fuzzcheck update::facet::incremental::fuzz::fuzz
+```
+and wait a couple minutes to make sure the code was thoroughly tested, then
+hit `Ctrl-C` to stop the fuzzer. The corpus generated by the fuzzer is located in milli/fuzz.
+
+To work on this module with rust-analyzer working properly, add the following to your .cargo/config.toml file:
+```toml
+[build]
+rustflags = ["--cfg", "fuzzing"]
+```
+
+The fuzz test generates sequences of additions and deletions to the facet database and
+ensures that:
+1. its structure is still internally valid
+2. its content is the same as a trivially correct implementation of the same database
+*/
+mod fuzz {
+    use std::borrow::Cow;
+    use std::collections::{BTreeMap, HashMap};
+    use std::convert::TryFrom;
+    use std::iter::FromIterator;
+    use std::rc::Rc;
+
+    use fuzzcheck::mutators::integer::U8Mutator;
+    use fuzzcheck::mutators::integer_within_range::{U16WithinRangeMutator, U8WithinRangeMutator};
+    use fuzzcheck::mutators::vector::VecMutator;
+    use fuzzcheck::DefaultMutator;
+    use heed::BytesEncode;
+    use roaring::RoaringBitmap;
+    use tempfile::TempDir;
+
+    use super::*;
+    use crate::update::facet::tests::FacetIndex;
+
+    struct NEU16Codec;
+    impl<'a> BytesEncode<'a> for NEU16Codec {
+        type EItem = u16;
+        #[no_coverage]
+        fn bytes_encode(item: &'a Self::EItem) -> Option<Cow<'a, [u8]>> {
+            Some(Cow::Owned(item.to_be_bytes().to_vec()))
+        }
+    }
+    impl<'a> BytesDecode<'a> for NEU16Codec {
+        type DItem = u16;
+        #[no_coverage]
+        fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+            let bytes = <[u8; 2]>::try_from(&bytes[0..=1]).unwrap();
+ Some(u16::from_be_bytes(bytes)) + } + } + + #[derive(Default)] + pub struct TrivialDatabase { + pub elements: BTreeMap>, + } + impl TrivialDatabase + where + T: Ord + Clone + Copy + Eq + std::fmt::Debug, + { + #[no_coverage] + pub fn insert(&mut self, field_id: u16, new_key: T, new_values: &RoaringBitmap) { + if new_values.is_empty() { + return; + } + let values_field_id = self.elements.entry(field_id).or_default(); + let values = values_field_id.entry(new_key).or_default(); + *values |= new_values; + } + #[no_coverage] + pub fn delete(&mut self, field_id: u16, key: T, values_to_remove: &RoaringBitmap) { + if let Some(values_field_id) = self.elements.get_mut(&field_id) { + if let Some(values) = values_field_id.get_mut(&key) { + *values -= values_to_remove; + if values.is_empty() { + values_field_id.remove(&key); + } + } + if values_field_id.is_empty() { + self.elements.remove(&field_id); + } + } + } + } + #[derive(Clone, DefaultMutator, serde::Serialize, serde::Deserialize)] + struct Operation { + key: Key, + #[field_mutator(U8WithinRangeMutator = { U8WithinRangeMutator::new(..32) })] + group_size: u8, + #[field_mutator(U8WithinRangeMutator = { U8WithinRangeMutator::new(..32) })] + max_group_size: u8, + #[field_mutator(U8WithinRangeMutator = { U8WithinRangeMutator::new(..32) })] + min_level_size: u8, + #[field_mutator(U16WithinRangeMutator = { U16WithinRangeMutator::new(..=3) })] + field_id: u16, + kind: OperationKind, + } + #[derive(Clone, DefaultMutator, serde::Serialize, serde::Deserialize)] + enum OperationKind { + Insert( + #[field_mutator(VecMutator = { VecMutator::new(U8Mutator::default(), 0 ..= 10) })] + Vec, + ), + Delete( + #[field_mutator(VecMutator = { VecMutator::new(U8Mutator::default(), 0 ..= 10) })] + Vec, + ), + } + + #[no_coverage] + fn compare_with_trivial_database(tempdir: Rc, operations: &[Operation]) { + let index = FacetIndex::::open_from_tempdir(tempdir, 4, 8, 5); // dummy params, they'll be overwritten + // let mut txn = 
index.env.write_txn().unwrap(); + let mut txn = index.env.write_txn().unwrap(); + + let mut trivial_db = TrivialDatabase::::default(); + let mut value_to_keys = HashMap::>::new(); + for Operation { key, group_size, max_group_size, min_level_size, field_id, kind } in + operations + { + index.set_group_size(*group_size); + index.set_max_group_size(*max_group_size); + index.set_min_level_size(*min_level_size); + match kind { + OperationKind::Insert(values) => { + let mut bitmap = RoaringBitmap::new(); + for value in values { + bitmap.insert(*value as u32); + value_to_keys.entry(*value).or_default().push(*key); + } + index.insert(&mut txn, *field_id, key, &bitmap); + trivial_db.insert(*field_id, *key, &bitmap); + } + OperationKind::Delete(values) => { + let values = RoaringBitmap::from_iter(values.iter().copied().map(|x| x as u32)); + let mut values_per_key = HashMap::new(); + + for value in values { + if let Some(keys) = value_to_keys.get(&(value as u8)) { + for key in keys { + let values: &mut RoaringBitmap = + values_per_key.entry(key).or_default(); + values.insert(value); + } + } + } + for (key, values) in values_per_key { + index.delete(&mut txn, *field_id, &key, &values); + trivial_db.delete(*field_id, *key, &values); + } + } + } + } + + for (field_id, values_field_id) in trivial_db.elements.iter() { + let level0iter = index + .content + .as_polymorph() + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>( + &mut txn, + &field_id.to_be_bytes(), + ) + .unwrap(); + + for ((key, values), group) in values_field_id.iter().zip(level0iter) { + let (group_key, group_values) = group.unwrap(); + let group_key = FacetGroupKeyCodec::::bytes_decode(group_key).unwrap(); + assert_eq!(key, &group_key.left_bound); + assert_eq!(values, &group_values.bitmap); + } + } + + for (field_id, values_field_id) in trivial_db.elements.iter() { + let level0iter = index + .content + .as_polymorph() + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(&txn, &field_id.to_be_bytes()) + .unwrap(); 
+ + for ((key, values), group) in values_field_id.iter().zip(level0iter) { + let (group_key, group_values) = group.unwrap(); + let group_key = FacetGroupKeyCodec::::bytes_decode(group_key).unwrap(); + assert_eq!(key, &group_key.left_bound); + assert_eq!(values, &group_values.bitmap); + } + index.verify_structure_validity(&txn, *field_id); + } + txn.abort().unwrap(); + } + + #[test] + #[no_coverage] + fn fuzz() { + let tempdir = Rc::new(TempDir::new().unwrap()); + let tempdir_cloned = tempdir.clone(); + let result = fuzzcheck::fuzz_test(move |operations: &[Operation]| { + compare_with_trivial_database(tempdir_cloned.clone(), operations) + }) + .default_mutator() + .serde_serializer() + .default_sensor_and_pool_with_custom_filter(|file, function| { + file == std::path::Path::new("milli/src/update/facet/incremental.rs") + && !function.contains("serde") + && !function.contains("tests::") + && !function.contains("fuzz::") + && !function.contains("display_bitmap") + }) + .arguments_from_cargo_fuzzcheck() + .launch(); + assert!(!result.found_test_failure); + } + + #[test] + #[no_coverage] + fn reproduce_bug1() { + let operations = r#" + [ + {"key":0, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[109]}}, + {"key":143, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[243]}}, + {"key":90, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[217]}}, + {"key":172, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[94]}}, + {"key":27, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[4]}}, + {"key":124, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[0]}}, + {"key":123, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[0]}}, + {"key":67, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, 
"kind":{"Insert":[109]}}, + {"key":13, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[0]}}, + {"key":162, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[213]}}, + {"key":235, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[67]}}, + {"key":251, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[50]}}, + {"key":218, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[164]}}, + {"key":166, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[67]}}, + {"key":64, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[61]}}, + {"key":183, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[210]}}, + {"key":250, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Delete":[50]}} + ] + "#; + let operations: Vec> = serde_json::from_str(operations).unwrap(); + let tempdir = TempDir::new().unwrap(); + compare_with_trivial_database(Rc::new(tempdir), &operations); + } + + #[test] + #[no_coverage] + fn reproduce_bug2() { + let operations = r#" + [ + {"key":102, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[122]}}, + {"key":73, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[132]}}, + {"key":20, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[215]}}, + {"key":39, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[152]}}, + {"key":151, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[226]}}, + {"key":17, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[101]}}, + {"key":74, "field_id": 0, "group_size":4, 
"max_group_size":8, "min_level_size":5, "kind":{"Insert":[210]}}, + {"key":2, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[130]}}, + {"key":64, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[180]}}, + {"key":83, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[250]}}, + {"key":80, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[210]}}, + {"key":113, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[63]}}, + {"key":201, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[210]}}, + {"key":200, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[5]}}, + {"key":93, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[98]}}, + {"key":162, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Insert":[5]}}, + {"key":80, "field_id": 0, "group_size":4, "max_group_size":8, "min_level_size":5, "kind":{"Delete":[210]}} + ] + "#; + let operations: Vec> = serde_json::from_str(operations).unwrap(); + let tempdir = TempDir::new().unwrap(); + compare_with_trivial_database(Rc::new(tempdir), &operations); + } + #[test] + #[no_coverage] + fn reproduce_bug3() { + let operations = r#" + [ + {"key":27488, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[206]}}, + {"key":64716, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[216]}}, + {"key":60886, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[206]}}, + {"key":59509, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[187,231]}}, + {"key":55057, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[37]}}, + 
{"key":45200, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[206]}}, + {"key":55056, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[37]}}, + {"key":63679, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[206]}}, + {"key":52155, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[74]}}, + {"key":20648, "field_id": 0, "group_size":0, "max_group_size":7, "min_level_size":0, "kind":{"Insert":[47,138,157]}} + ] + "#; + let operations: Vec> = serde_json::from_str(operations).unwrap(); + let tempdir = TempDir::new().unwrap(); + compare_with_trivial_database(Rc::new(tempdir), &operations); + } + + #[test] + #[no_coverage] + fn reproduce_bug4() { + let operations = r#"[ + {"key":63499, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[87]}}, + {"key":25374, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[14]}}, + {"key":64481, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Delete":[87]}}, + {"key":23038, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[173]}}, + {"key":14862, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[8]}}, + {"key":13145, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[5,64]}}, + {"key":23446, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[86,59]}}, + {"key":17972, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[58,137]}}, + {"key":21273, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[121,132,81,147]}}, + {"key":28264, "field_id": 0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[36]}}, + {"key":46659, "field_id": 
0, "group_size":2, "max_group_size":1, "min_level_size":0, "kind":{"Insert":[]}} + ] + "#; + let operations: Vec> = serde_json::from_str(operations).unwrap(); + let tempdir = TempDir::new().unwrap(); + compare_with_trivial_database(Rc::new(tempdir), &operations); + } + + #[test] + #[no_coverage] + fn reproduce_bug5() { + let input = r#" + [ + { + "key":3438, + "group_size":11, + "max_group_size":0, + "min_level_size":17, + "field_id":3, + "kind":{"Insert":[198]} + }, + + { + "key":47098, + "group_size":0, + "max_group_size":8, + "min_level_size":0, + "field_id":3, + "kind":{"Insert":[11]} + }, + { + "key":22453, + "group_size":0, + "max_group_size":0, + "min_level_size":0, + "field_id":3, + "kind":{"Insert":[145]} + }, + { + "key":14105, + "group_size":14, + "max_group_size":4, + "min_level_size":25, + "field_id":3, + "kind":{"Delete":[11]} + } + ] + "#; + let operations: Vec> = serde_json::from_str(input).unwrap(); + let tmpdir = TempDir::new().unwrap(); + compare_with_trivial_database(Rc::new(tmpdir), &operations); + } + + #[test] + #[no_coverage] + fn reproduce_bug6() { + let input = r#" + [ + {"key":45720,"group_size":1,"max_group_size":4,"min_level_size":0,"field_id":0,"kind":{"Insert":[120]}}, + {"key":37463,"group_size":1,"max_group_size":4,"min_level_size":0,"field_id":0,"kind":{"Insert":[187]}}, + {"key":21512,"group_size":23,"max_group_size":20,"min_level_size":23,"field_id":0,"kind":{"Insert":[181]}}, + {"key":21511,"group_size":23,"max_group_size":20,"min_level_size":23,"field_id":0,"kind":{"Insert":[181]}}, + {"key":37737,"group_size":12,"max_group_size":0,"min_level_size":6,"field_id":0,"kind":{"Insert":[181]}}, + {"key":53042,"group_size":23,"max_group_size":20,"min_level_size":23,"field_id":0,"kind":{"Insert":[181]}} + ] + "#; + let operations: Vec> = serde_json::from_str(input).unwrap(); + let tmpdir = TempDir::new().unwrap(); + compare_with_trivial_database(Rc::new(tmpdir), &operations); + } +} diff --git a/milli/src/update/facet/mod.rs 
b/milli/src/update/facet/mod.rs new file mode 100644 index 000000000..76e5514a1 --- /dev/null +++ b/milli/src/update/facet/mod.rs @@ -0,0 +1,499 @@ +/*! +This module implements two different algorithms for updating the `facet_id_string_docids` +and `facet_id_f64_docids` databases. The first algorithm is a "bulk" algorithm, meaning that +it recreates the database from scratch when new elements are added to it. The second algorithm +is incremental: it modifies the database as little as possible. + +The databases must be able to return results for queries such as: +1. Filter : find all the document ids that have a facet value greater than X and/or smaller than Y +2. Min/Max : find the minimum/maximum facet value among these document ids +3. Sort : sort these document ids by increasing/decreasing facet values +4. Distribution : given some document ids, make a list of each facet value + found in these documents along with the number of documents that contain it + +The algorithms that implement these queries are found in the `src/search/facet` folder. 
+ +To make these queries fast to compute, the database adopts a tree structure: +```ignore + ┌───────────────────────────────┬───────────────────────────────┬───────────────┐ +┌───────┐ │ "ab" (2) │ "gaf" (2) │ "woz" (1) │ +│Level 2│ │ │ │ │ +└───────┘ │ [a, b, d, f, z] │ [c, d, e, f, g] │ [u, y] │ + ├───────────────┬───────────────┼───────────────┬───────────────┼───────────────┤ +┌───────┐ │ "ab" (2) │ "ba" (2) │ "gaf" (2) │ "form" (2) │ "woz" (2) │ +│Level 1│ │ │ │ │ │ │ +└───────┘ │ [a, b, d, z] │ [a, b, f] │ [c, d, g] │ [e, f] │ [u, y] │ + ├───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┤ +┌───────┐ │ "ab" │ "ac" │ "ba" │ "bac" │ "gaf" │ "gal" │ "form"│ "wow" │ "woz" │ "zz" │ +│Level 0│ │ │ │ │ │ │ │ │ │ │ │ +└───────┘ │ [a, b]│ [d, z]│ [b, f]│ [a, f]│ [c, d]│ [g] │ [e] │ [e, f]│ [y] │ [u] │ + └───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┘ +``` +In the diagram above, each cell corresponds to a node in the tree. The first line of the cell +contains the left bound of the range of facet values as well as the number of children of the node. +The second line contains the document ids which have a facet value within the range of the node. +The nodes at level 0 are the leaf nodes. They have 0 children and a single facet value in their range. + +In the diagram above, the first cell of level 2 is `ab (2)`. Its range is `ab .. gaf` (because +`gaf` is the left bound of the next node) and it has two children. Its document ids are `[a,b,d,f,z]`. +These documents all contain a facet value that is contained within `ab .. gaf`. 
+
+In the database, each node is represented by a key/value pair encoded as a [`FacetGroupKey`] and a
+[`FacetGroupValue`], which have the following format:
+
+```ignore
+FacetGroupKey:
+- field id : u16
+- level : u8
+- left bound: [u8] // the facet value encoded using either OrderedF64Codec or Str
+
+FacetGroupValue:
+- #children : u8
+- docids : RoaringBitmap
+```
+
+When the database is first created using the "bulk" method, each node has a fixed number of children
+(except for possibly the last one) given by the `group_size` parameter (defaulting to `FACET_GROUP_SIZE`).
+The tree is also built such that the highest level has more than `min_level_size`
+(defaulting to `FACET_MIN_LEVEL_SIZE`) elements in it.
+
+When the database is incrementally updated, the number of children of a node can vary between
+1 and `max_group_size`. This is done so that most incremental operations do not need to change
+the structure of the tree. When the number of children of a node reaches `max_group_size`,
+we split the node in two and update the number of children of its parent.
+
+When adding documents to the databases, it is important to determine which method to use to
+minimise indexing time. The incremental method is faster when adding few new facet values, but the
+bulk method is faster when a large part of the database is modified. Empirically, it seems that
+it takes 50x more time to incrementally add N facet values to an existing database than it takes to
+construct a database of N facet values. This is the heuristic that is used to choose between the
+two methods.
+ +Related PR: https://github.com/meilisearch/milli/pull/619 +*/ + +pub const FACET_MAX_GROUP_SIZE: u8 = 8; +pub const FACET_GROUP_SIZE: u8 = 4; +pub const FACET_MIN_LEVEL_SIZE: u8 = 5; + +use std::fs::File; + +use log::debug; +use time::OffsetDateTime; + +use self::incremental::FacetsUpdateIncremental; +use super::FacetsUpdateBulk; +use crate::facet::FacetType; +use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::ByteSliceRefCodec; +use crate::{Index, Result}; + +pub mod bulk; +pub mod delete; +pub mod incremental; + +/// A builder used to add new elements to the `facet_id_string_docids` or `facet_id_f64_docids` databases. +/// +/// Depending on the number of new elements and the existing size of the database, we use either +/// a bulk update method or an incremental update method. +pub struct FacetsUpdate<'i> { + index: &'i Index, + database: heed::Database, FacetGroupValueCodec>, + facet_type: FacetType, + new_data: grenad::Reader, + group_size: u8, + max_group_size: u8, + min_level_size: u8, +} +impl<'i> FacetsUpdate<'i> { + pub fn new(index: &'i Index, facet_type: FacetType, new_data: grenad::Reader) -> Self { + let database = match facet_type { + FacetType::String => index + .facet_id_string_docids + .remap_key_type::>(), + FacetType::Number => { + index.facet_id_f64_docids.remap_key_type::>() + } + }; + Self { + index, + database, + group_size: FACET_GROUP_SIZE, + max_group_size: FACET_MAX_GROUP_SIZE, + min_level_size: FACET_MIN_LEVEL_SIZE, + facet_type, + new_data, + } + } + + pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> { + if self.new_data.is_empty() { + return Ok(()); + } + debug!("Computing and writing the facet values levels docids into LMDB on disk..."); + self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; + + // See self::comparison_bench::benchmark_facet_indexing + if self.new_data.len() >= (self.database.len(wtxn)? 
as u64 / 50) { + let field_ids = + self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::>(); + let bulk_update = FacetsUpdateBulk::new( + self.index, + field_ids, + self.facet_type, + self.new_data, + self.group_size, + self.min_level_size, + ); + bulk_update.execute(wtxn)?; + } else { + let incremental_update = FacetsUpdateIncremental::new( + self.index, + self.facet_type, + self.new_data, + self.group_size, + self.min_level_size, + self.max_group_size, + ); + incremental_update.execute(wtxn)?; + } + Ok(()) + } +} + +#[cfg(test)] +pub(crate) mod tests { + use std::cell::Cell; + use std::fmt::Display; + use std::iter::FromIterator; + use std::marker::PhantomData; + use std::rc::Rc; + + use heed::types::ByteSlice; + use heed::{BytesDecode, BytesEncode, Env, RoTxn, RwTxn}; + use roaring::RoaringBitmap; + + use super::bulk::FacetsUpdateBulkInner; + use crate::heed_codec::facet::{ + FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, + }; + use crate::heed_codec::ByteSliceRefCodec; + use crate::search::facet::get_highest_level; + use crate::snapshot_tests::display_bitmap; + use crate::update::FacetsUpdateIncrementalInner; + use crate::CboRoaringBitmapCodec; + + /// A dummy index that only contains the facet database, used for testing + pub struct FacetIndex + where + for<'a> BoundCodec: + BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, + { + pub env: Env, + pub content: heed::Database, FacetGroupValueCodec>, + pub group_size: Cell, + pub min_level_size: Cell, + pub max_group_size: Cell, + _tempdir: Rc, + _phantom: PhantomData, + } + + impl FacetIndex + where + for<'a> BoundCodec: + BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, + { + #[cfg(all(test, fuzzing))] + pub fn open_from_tempdir( + tempdir: Rc, + group_size: u8, + max_group_size: u8, + min_level_size: u8, + ) -> FacetIndex { + let group_size = std::cmp::min(16, std::cmp::max(group_size, 2)); // 2 <= x <= 16 + let max_group_size = std::cmp::min(16, 
std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 16 + let min_level_size = std::cmp::min(17, std::cmp::max(1, min_level_size)); // 1 <= x <= 17 + + let mut options = heed::EnvOpenOptions::new(); + let options = options.map_size(4096 * 4 * 10 * 1000); + unsafe { + options.flag(heed::flags::Flags::MdbAlwaysFreePages); + } + let env = options.open(tempdir.path()).unwrap(); + let content = env.open_database(None).unwrap().unwrap(); + + FacetIndex { + content, + group_size: Cell::new(group_size), + max_group_size: Cell::new(max_group_size), + min_level_size: Cell::new(min_level_size), + _tempdir: tempdir, + env, + _phantom: PhantomData, + } + } + pub fn new( + group_size: u8, + max_group_size: u8, + min_level_size: u8, + ) -> FacetIndex { + let group_size = std::cmp::min(127, std::cmp::max(group_size, 2)); // 2 <= x <= 127 + let max_group_size = std::cmp::min(127, std::cmp::max(group_size * 2, max_group_size)); // 2*group_size <= x <= 127 + let min_level_size = std::cmp::max(1, min_level_size); // 1 <= x <= inf + let mut options = heed::EnvOpenOptions::new(); + let options = options.map_size(4096 * 4 * 1000 * 100); + let tempdir = tempfile::TempDir::new().unwrap(); + let env = options.open(tempdir.path()).unwrap(); + let content = env.create_database(None).unwrap(); + + FacetIndex { + content, + group_size: Cell::new(group_size), + max_group_size: Cell::new(max_group_size), + min_level_size: Cell::new(min_level_size), + _tempdir: Rc::new(tempdir), + env, + _phantom: PhantomData, + } + } + + #[cfg(all(test, fuzzing))] + pub fn set_group_size(&self, group_size: u8) { + // 2 <= x <= 64 + self.group_size.set(std::cmp::min(64, std::cmp::max(group_size, 2))); + } + #[cfg(all(test, fuzzing))] + pub fn set_max_group_size(&self, max_group_size: u8) { + // 2*group_size <= x <= 128 + let max_group_size = std::cmp::max(4, std::cmp::min(128, max_group_size)); + self.max_group_size.set(max_group_size); + if self.group_size.get() < max_group_size / 2 { + 
self.group_size.set(max_group_size / 2); + } + } + #[cfg(all(test, fuzzing))] + pub fn set_min_level_size(&self, min_level_size: u8) { + // 1 <= x <= inf + self.min_level_size.set(std::cmp::max(1, min_level_size)); + } + + pub fn insert<'a>( + &self, + wtxn: &'a mut RwTxn, + field_id: u16, + key: &'a >::EItem, + docids: &RoaringBitmap, + ) { + let update = FacetsUpdateIncrementalInner { + db: self.content, + group_size: self.group_size.get(), + min_level_size: self.min_level_size.get(), + max_group_size: self.max_group_size.get(), + }; + let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); + update.insert(wtxn, field_id, &key_bytes, docids).unwrap(); + } + pub fn delete_single_docid<'a>( + &self, + wtxn: &'a mut RwTxn, + field_id: u16, + key: &'a >::EItem, + docid: u32, + ) { + self.delete(wtxn, field_id, key, &RoaringBitmap::from_iter(std::iter::once(docid))) + } + + pub fn delete<'a>( + &self, + wtxn: &'a mut RwTxn, + field_id: u16, + key: &'a >::EItem, + docids: &RoaringBitmap, + ) { + let update = FacetsUpdateIncrementalInner { + db: self.content, + group_size: self.group_size.get(), + min_level_size: self.min_level_size.get(), + max_group_size: self.max_group_size.get(), + }; + let key_bytes = BoundCodec::bytes_encode(&key).unwrap(); + update.delete(wtxn, field_id, &key_bytes, docids).unwrap(); + } + + pub fn bulk_insert<'a, 'b>( + &self, + wtxn: &'a mut RwTxn, + field_ids: &[u16], + els: impl IntoIterator< + Item = &'a ((u16, >::EItem), RoaringBitmap), + >, + ) where + for<'c> >::EItem: Sized, + { + let mut new_data = vec![]; + let mut writer = grenad::Writer::new(&mut new_data); + for ((field_id, left_bound), docids) in els { + let left_bound_bytes = BoundCodec::bytes_encode(left_bound).unwrap().into_owned(); + let key: FacetGroupKey<&[u8]> = + FacetGroupKey { field_id: *field_id, level: 0, left_bound: &left_bound_bytes }; + let key = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); + let value = CboRoaringBitmapCodec::bytes_encode(&docids).unwrap(); + 
writer.insert(&key, &value).unwrap(); + } + writer.finish().unwrap(); + let reader = grenad::Reader::new(std::io::Cursor::new(new_data)).unwrap(); + + let update = FacetsUpdateBulkInner { + db: self.content, + new_data: Some(reader), + group_size: self.group_size.get(), + min_level_size: self.min_level_size.get(), + }; + + update.update(wtxn, field_ids, |_, _, _| Ok(())).unwrap(); + } + + pub fn verify_structure_validity(&self, txn: &RoTxn, field_id: u16) { + let mut field_id_prefix = vec![]; + field_id_prefix.extend_from_slice(&field_id.to_be_bytes()); + + let highest_level = get_highest_level(txn, self.content, field_id).unwrap(); + + for level_no in (1..=highest_level).rev() { + let mut level_no_prefix = vec![]; + level_no_prefix.extend_from_slice(&field_id.to_be_bytes()); + level_no_prefix.push(level_no); + + let mut iter = self + .content + .as_polymorph() + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(txn, &level_no_prefix) + .unwrap(); + while let Some(el) = iter.next() { + let (key, value) = el.unwrap(); + let key = FacetGroupKeyCodec::::bytes_decode(&key).unwrap(); + + let mut prefix_start_below = vec![]; + prefix_start_below.extend_from_slice(&field_id.to_be_bytes()); + prefix_start_below.push(level_no - 1); + prefix_start_below.extend_from_slice(&key.left_bound); + + let start_below = { + let mut start_below_iter = self + .content + .as_polymorph() + .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>( + txn, + &prefix_start_below, + ) + .unwrap(); + let (key_bytes, _) = start_below_iter.next().unwrap().unwrap(); + FacetGroupKeyCodec::::bytes_decode(&key_bytes).unwrap() + }; + + assert!(value.size > 0); + + let mut actual_size = 0; + let mut values_below = RoaringBitmap::new(); + let mut iter_below = self + .content + .range(txn, &(start_below..)) + .unwrap() + .take(value.size as usize); + while let Some(el) = iter_below.next() { + let (_, value) = el.unwrap(); + actual_size += 1; + values_below |= value.bitmap; + } + assert_eq!(actual_size, 
value.size, "{key:?} start_below: {start_below:?}"); + + assert_eq!(value.bitmap, values_below); + } + } + } + } + + impl Display for FacetIndex + where + for<'a> >::EItem: Sized + Display, + for<'a> BoundCodec: + BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, + { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let txn = self.env.read_txn().unwrap(); + let mut iter = self.content.iter(&txn).unwrap(); + while let Some(el) = iter.next() { + let (key, value) = el.unwrap(); + let FacetGroupKey { field_id, level, left_bound: bound } = key; + let bound = BoundCodec::bytes_decode(bound).unwrap(); + let FacetGroupValue { size, bitmap } = value; + writeln!( + f, + "{field_id:<2} {level:<2} k{bound:<8} {size:<4} {values:?}", + values = display_bitmap(&bitmap) + )?; + } + Ok(()) + } + } +} + +#[allow(unused)] +#[cfg(test)] +mod comparison_bench { + use std::iter::once; + + use rand::Rng; + use roaring::RoaringBitmap; + + use super::tests::FacetIndex; + use crate::heed_codec::facet::OrderedF64Codec; + + // This is a simple test to get an intuition on the relative speed + // of the incremental vs. bulk indexer. + // + // The benchmark shows the worst-case scenario for the incremental indexer, since + // each facet value contains only one document ID. + // + // In that scenario, it appears that the incremental indexer is about 50 times slower than the + // bulk indexer. 
+ // #[test] + fn benchmark_facet_indexing() { + let mut facet_value = 0; + + let mut r = rand::thread_rng(); + + for i in 1..=20 { + let size = 50_000 * i; + let index = FacetIndex::::new(4, 8, 5); + + let mut txn = index.env.write_txn().unwrap(); + let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new(); + for i in 0..size { + // field id = 0, left_bound = i, docids = [i] + elements.push(((0, facet_value as f64), once(i).collect())); + facet_value += 1; + } + let timer = std::time::Instant::now(); + index.bulk_insert(&mut txn, &[0], elements.iter()); + let time_spent = timer.elapsed().as_millis(); + println!("bulk {size} : {time_spent}ms"); + + txn.commit().unwrap(); + + for nbr_doc in [1, 100, 1000, 10_000] { + let mut txn = index.env.write_txn().unwrap(); + let timer = std::time::Instant::now(); + // + // insert one document + // + for _ in 0..nbr_doc { + index.insert(&mut txn, 0, &r.gen(), &once(1).collect()); + } + let time_spent = timer.elapsed().as_millis(); + println!(" add {nbr_doc} : {time_spent}ms"); + txn.abort().unwrap(); + } + } + } +} diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert/default.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/default.hash.snap new file mode 100644 index 000000000..bef20823c --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert/default.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +b40dd31a65e033ffc6b35c027ce19506 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert/large_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/large_group_small_min_level.hash.snap new file mode 100644 index 000000000..74c40e6a3 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert/large_group_small_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +7ee22d8e9387e72758f00918eb67e4c6 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert/odd_group_odd_min_level.hash.snap 
b/milli/src/update/facet/snapshots/bulk.rs/insert/odd_group_odd_min_level.hash.snap new file mode 100644 index 000000000..6fb086d35 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert/odd_group_odd_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +60f567359382507afdaf45fb075740c3 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_large_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_large_min_level.hash.snap new file mode 100644 index 000000000..0271a6c6b --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_large_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +b986d6e6cbf425685f409a8b417010e1 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_small_min_level.hash.snap new file mode 100644 index 000000000..d801ef19f --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_small_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +ee10dd2ae2b5c6621a89a5d0a9aa8ccc diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/default.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/default.hash.snap new file mode 100644 index 000000000..e9988f527 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/default.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +fa877559eef78b383b496c15a364a2dc diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/large_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/large_group_small_min_level.hash.snap new file mode 100644 index 000000000..aa52901da --- /dev/null +++ 
b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/large_group_small_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +16a96353bc42f2ff3e91611ca4d5b184 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/odd_group_odd_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/odd_group_odd_min_level.hash.snap new file mode 100644 index 000000000..64f5012a4 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/odd_group_odd_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +be1b08073b9d9788d18080c1320151d7 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_large_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_large_min_level.hash.snap new file mode 100644 index 000000000..aa52901da --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_large_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +16a96353bc42f2ff3e91611ca4d5b184 diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_small_min_level.hash.snap new file mode 100644 index 000000000..bb0e9aa69 --- /dev/null +++ b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_small_min_level.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/bulk.rs +--- +32a45d555df2e001420fea149818d376 diff --git a/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/facet_id_f64_docids.hash.snap new file mode 100644 index 000000000..fee486bab --- /dev/null +++ 
b/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/facet_id_f64_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/delete.rs +--- +550cd138d6fe31ccdd42cd5392fbd576 diff --git a/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/number_faceted_documents_ids.hash.snap b/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/number_faceted_documents_ids.hash.snap new file mode 100644 index 000000000..fcf957004 --- /dev/null +++ b/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/1/number_faceted_documents_ids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/delete.rs +--- +9a0ea88e7c9dcf6dc0ef0b601736ffcf diff --git a/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/facet_id_f64_docids.hash.snap b/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/facet_id_f64_docids.hash.snap new file mode 100644 index 000000000..29ceb250e --- /dev/null +++ b/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/facet_id_f64_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/delete.rs +--- +d4d5f14e7f1e1f09b86821a0b6defcc6 diff --git a/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/number_faceted_documents_ids.hash.snap b/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/number_faceted_documents_ids.hash.snap new file mode 100644 index 000000000..bbaf6d2a2 --- /dev/null +++ b/milli/src/update/facet/snapshots/delete.rs/delete_mixed_incremental_and_bulk/2/number_faceted_documents_ids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/delete.rs +--- +3570e0ac0fdb21be9ebe433f59264b56 diff --git a/milli/src/update/facet/snapshots/incremental.rs/append/append.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/append/append.hash.snap new file mode 100644 index 
000000000..919f3fe7c --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/append/append.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +5dbfa134cc44abeb3ab6242fc182e48e diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/0.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/0.snap new file mode 100644 index 000000000..b006c11ab --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/0.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- + diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/100.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/100.hash.snap new file mode 100644 index 000000000..bdeeefc13 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/100.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +6ed7bf5d440599b3b10b37549a271fdf diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/15.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/15.snap new file mode 100644 index 000000000..e037c0295 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/15.snap @@ -0,0 +1,19 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +0 0 k0 1 "[0, ]" +0 0 k1 1 "[1, ]" +0 0 k2 1 "[2, ]" +0 0 k3 1 "[3, ]" +0 0 k4 1 "[4, ]" +0 0 k5 1 "[5, ]" +0 0 k6 1 "[6, ]" +0 0 k7 1 "[7, ]" +0 0 k8 1 "[8, ]" +0 0 k9 1 "[9, ]" +0 0 k10 1 "[10, ]" +0 0 k11 1 "[11, ]" +0 0 k12 1 "[12, ]" +0 0 k13 1 "[13, ]" +0 0 k14 1 "[14, ]" + diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/150.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/150.hash.snap new file mode 100644 index 000000000..e9ccc990f --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/150.hash.snap @@ -0,0 +1,4 
@@ +--- +source: milli/src/update/facet/incremental.rs +--- +b5203f0df0036ebaa133dd77d63a00eb diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/17.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/17.snap new file mode 100644 index 000000000..a98803604 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/17.snap @@ -0,0 +1,26 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +0 0 k0 1 "[0, ]" +0 0 k1 1 "[1, ]" +0 0 k2 1 "[2, ]" +0 0 k3 1 "[3, ]" +0 0 k4 1 "[4, ]" +0 0 k5 1 "[5, ]" +0 0 k6 1 "[6, ]" +0 0 k7 1 "[7, ]" +0 0 k8 1 "[8, ]" +0 0 k9 1 "[9, ]" +0 0 k10 1 "[10, ]" +0 0 k11 1 "[11, ]" +0 0 k12 1 "[12, ]" +0 0 k13 1 "[13, ]" +0 0 k14 1 "[14, ]" +0 0 k15 1 "[15, ]" +0 0 k16 1 "[16, ]" +0 1 k0 4 "[0, 1, 2, 3, ]" +0 1 k4 4 "[4, 5, 6, 7, ]" +0 1 k8 4 "[8, 9, 10, 11, ]" +0 1 k12 4 "[12, 13, 14, 15, ]" +0 1 k16 1 "[16, ]" + diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/200.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/200.hash.snap new file mode 100644 index 000000000..bb07123a9 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_end/200.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +95497d8579740868ee0bfc655b0bf782 diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_start/127.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_start/127.hash.snap new file mode 100644 index 000000000..8714af061 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_start/127.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +d565c2f7bbd9e13e12de40cfbbfba6bb diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_start/215.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_start/215.snap new file mode 100644 index 000000000..1bba99454 --- 
/dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_start/215.snap @@ -0,0 +1,54 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +0 0 k216 1 "[216, ]" +0 0 k217 1 "[217, ]" +0 0 k218 1 "[218, ]" +0 0 k219 1 "[219, ]" +0 0 k220 1 "[220, ]" +0 0 k221 1 "[221, ]" +0 0 k222 1 "[222, ]" +0 0 k223 1 "[223, ]" +0 0 k224 1 "[224, ]" +0 0 k225 1 "[225, ]" +0 0 k226 1 "[226, ]" +0 0 k227 1 "[227, ]" +0 0 k228 1 "[228, ]" +0 0 k229 1 "[229, ]" +0 0 k230 1 "[230, ]" +0 0 k231 1 "[231, ]" +0 0 k232 1 "[232, ]" +0 0 k233 1 "[233, ]" +0 0 k234 1 "[234, ]" +0 0 k235 1 "[235, ]" +0 0 k236 1 "[236, ]" +0 0 k237 1 "[237, ]" +0 0 k238 1 "[238, ]" +0 0 k239 1 "[239, ]" +0 0 k240 1 "[240, ]" +0 0 k241 1 "[241, ]" +0 0 k242 1 "[242, ]" +0 0 k243 1 "[243, ]" +0 0 k244 1 "[244, ]" +0 0 k245 1 "[245, ]" +0 0 k246 1 "[246, ]" +0 0 k247 1 "[247, ]" +0 0 k248 1 "[248, ]" +0 0 k249 1 "[249, ]" +0 0 k250 1 "[250, ]" +0 0 k251 1 "[251, ]" +0 0 k252 1 "[252, ]" +0 0 k253 1 "[253, ]" +0 0 k254 1 "[254, ]" +0 0 k255 1 "[255, ]" +0 1 k216 4 "[216, 217, 218, 219, ]" +0 1 k220 4 "[220, 221, 222, 223, ]" +0 1 k224 4 "[224, 225, 226, 227, ]" +0 1 k228 4 "[228, 229, 230, 231, ]" +0 1 k232 4 "[232, 233, 234, 235, ]" +0 1 k236 4 "[236, 237, 238, 239, ]" +0 1 k240 4 "[240, 241, 242, 243, ]" +0 1 k244 4 "[244, 245, 246, 247, ]" +0 1 k248 4 "[248, 249, 250, 251, ]" +0 1 k252 4 "[252, 253, 254, 255, ]" + diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_from_start/255.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_from_start/255.snap new file mode 100644 index 000000000..b006c11ab --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_from_start/255.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- + diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/127.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/127.hash.snap new file mode 100644 index 
000000000..6815ee609 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/127.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +7cb503827ba17e9670296cc9531a1380 diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/215.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/215.hash.snap new file mode 100644 index 000000000..6860385ee --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/215.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +b061f43e379e16f0617c05d3313d0078 diff --git a/milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/255.snap b/milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/255.snap new file mode 100644 index 000000000..b006c11ab --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/delete_shuffled/255.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- + diff --git a/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/after_delete.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/after_delete.hash.snap new file mode 100644 index 000000000..f96b42b27 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/after_delete.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +81fc9489d6b163935b97433477dea63b diff --git a/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/before_delete.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/before_delete.hash.snap new file mode 100644 index 000000000..c57ca72eb --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_delete/before_delete.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +b17b2c4ec87a778aae07854c96c08b48 diff --git 
a/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_insert/in_place_level0_insert.snap b/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_insert/in_place_level0_insert.snap new file mode 100644 index 000000000..82a7ce716 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/in_place_level0_insert/in_place_level0_insert.snap @@ -0,0 +1,20 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +0 0 k0 1 "[3, 435, 583, 849, ]" +0 0 k1 1 "[35, 494, 693, 796, ]" +0 0 k2 1 "[76, 420, 526, 909, ]" +0 0 k3 1 "[133, 451, 653, 806, ]" +0 0 k4 1 "[131, 464, 656, 853, ]" +0 0 k5 1 "[61, 308, 701, 903, ]" +0 0 k6 1 "[144, 449, 674, 794, ]" +0 0 k7 1 "[182, 451, 735, 941, ]" +0 0 k8 1 "[6, 359, 679, 1003, ]" +0 0 k9 1 "[197, 418, 659, 904, ]" +0 0 k10 1 "[88, 297, 567, 800, ]" +0 0 k11 1 "[150, 309, 530, 946, ]" +0 0 k12 1 "[156, 466, 567, 892, ]" +0 0 k13 1 "[46, 425, 610, 807, ]" +0 0 k14 1 "[236, 433, 549, 891, ]" +0 0 k15 1 "[207, 472, 603, 974, ]" + diff --git a/milli/src/update/facet/snapshots/incremental.rs/many_field_ids_append/many_field_ids_append.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/many_field_ids_append/many_field_ids_append.hash.snap new file mode 100644 index 000000000..fd4beeca8 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/many_field_ids_append/many_field_ids_append.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +7f8aa18d2b3a6422d55c03bede0563db diff --git a/milli/src/update/facet/snapshots/incremental.rs/many_field_ids_prepend/many_field_ids_prepend.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/many_field_ids_prepend/many_field_ids_prepend.hash.snap new file mode 100644 index 000000000..fd4beeca8 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/many_field_ids_prepend/many_field_ids_prepend.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- 
+7f8aa18d2b3a6422d55c03bede0563db diff --git a/milli/src/update/facet/snapshots/incremental.rs/merge_values/merge_values.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/merge_values/merge_values.hash.snap new file mode 100644 index 000000000..d055892f5 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/merge_values/merge_values.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +b3e2de9020d9e0f3941bc3a179c795ba diff --git a/milli/src/update/facet/snapshots/incremental.rs/prepend/prepend.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/prepend/prepend.hash.snap new file mode 100644 index 000000000..919f3fe7c --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/prepend/prepend.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +5dbfa134cc44abeb3ab6242fc182e48e diff --git a/milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string_and_delete/after_delete.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string_and_delete/after_delete.hash.snap new file mode 100644 index 000000000..1802eb952 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string_and_delete/after_delete.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +9343355bf535ed4a0c956df2b229d5e6 diff --git a/milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string_and_delete/before_delete.hash.snap b/milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string_and_delete/before_delete.hash.snap new file mode 100644 index 000000000..2b6805676 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/shuffle_merge_string_and_delete/before_delete.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +4fc800f49201a336295af0542fdf01ab diff --git a/milli/src/update/facet/snapshots/incremental.rs/shuffled/shuffled.hash.snap 
b/milli/src/update/facet/snapshots/incremental.rs/shuffled/shuffled.hash.snap new file mode 100644 index 000000000..5ef88bfb4 --- /dev/null +++ b/milli/src/update/facet/snapshots/incremental.rs/shuffled/shuffled.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/facet/incremental.rs +--- +fd65ce7d96a07aafb0ef6cfb5bf016b8 diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 50b34a714..8b1378917 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -1,713 +1 @@ -/*! -This module initialises the databases that are used to quickly get the list -of documents with a faceted field value falling within a certain range. For -example, they can be used to implement filters such as `x >= 3`. -These databases are `facet_id_string_docids` and `facet_id_f64_docids`. - -## Example with numbers - -In the case of numbers, we start with a sorted list whose keys are -`(field_id, number_value)` and whose value is a roaring bitmap of the document ids -which contain the value `number_value` for the faceted field `field_id`. - -From this list, we want to compute two things: - -1. the bitmap of all documents that contain **any** number for each faceted field -2. a structure that allows us to use a (sort of) binary search to find all documents -containing numbers inside a certain range for a faceted field - -To achieve goal (2), we recursively split the list into chunks. Every time we split it, we -create a new "level" that is several times smaller than the level below it. The base level, -level 0, is the starting list. Level 1 is composed of chunks of up to N elements. Each element -contains a range and a bitmap of docids. Level 2 is composed of chunks up to N^2 elements, etc. - -For example, let's say we have 26 documents which we identify through the letters a-z. -We will focus on a single faceted field. When there are multiple faceted fields, the structure -described below is simply repeated for each field. 
- -What we want to obtain is the following structure for each faceted field: -```text -┌───────┐ ┌───────────────────────────────────────────────────────────────────────────────┐ -│ all │ │ [a, b, c, d, e, f, g, u, y, z] │ -└───────┘ └───────────────────────────────────────────────────────────────────────────────┘ - ┌───────────────────────────────┬───────────────────────────────┬───────────────┐ -┌───────┐ │ 1.2 – 2 │ 3.4 – 100 │ 102 – 104 │ -│Level 2│ │ │ │ │ -└───────┘ │ [a, b, d, f, z] │ [c, d, e, f, g] │ [u, y] │ - ├───────────────┬───────────────┼───────────────┬───────────────┼───────────────┤ -┌───────┐ │ 1.2 – 1.3 │ 1.6 – 2 │ 3.4 – 12 │ 12.3 – 100 │ 102 – 104 │ -│Level 1│ │ │ │ │ │ │ -└───────┘ │ [a, b, d, z] │ [a, b, f] │ [c, d, g] │ [e, f] │ [u, y] │ - ├───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┤ -┌───────┐ │ 1.2 │ 1.3 │ 1.6 │ 2 │ 3.4 │ 12 │ 12.3 │ 100 │ 102 │ 104 │ -│Level 0│ │ │ │ │ │ │ │ │ │ │ │ -└───────┘ │ [a, b]│ [d, z]│ [b, f]│ [a, f]│ [c, d]│ [g] │ [e] │ [e, f]│ [y] │ [u] │ - └───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┘ -``` - -You can read more about this structure (for strings) in `[crate::search::facet::facet_strings]`. - -To create the levels, we use a recursive algorithm which makes sure that we only need to iterate -over the elements of level 0 once. It is implemented by [`recursive_compute_levels`]. - -## Encoding - -### Numbers -For numbers we use the same encoding for level 0 and the other levels. - -The key is given by `FacetLevelValueF64Codec`. It consists of: -1. The field id : u16 -2. The height of the level : u8 -3. The start bound : f64 -4. The end bound : f64 -Note that at level 0, we have start bound == end bound. - -The value is a serialised `RoaringBitmap`. - -### Strings - -For strings, we use a different encoding for level 0 and the other levels. - -At level 0, the key is given by `FacetStringLevelZeroCodec`. It consists of: -1. The field id : u16 -2. 
The height of the level : u8 <-- always == 0 -3. The normalised string value : &str - -And the value is given by `FacetStringLevelZeroValueCodec`. It consists of: -1. The original string -2. A serialised `RoaringBitmap` - -At level 1, the key is given by `FacetLevelValueU32Codec`. It consists of: -1. The field id : u16 -2. The height of the level : u8 <-- always >= 1 -3. The start bound : u32 -4. The end bound : u32 -where the bounds are indices inside level 0. - -The value is given by `FacetStringZeroBoundsValueCodec`. -If the level is 1, then it consists of: -1. The normalised string of the start bound -2. The normalised string of the end bound -3. A serialised `RoaringBitmap` - -If the level is higher, then it consists only of the serialised roaring bitmap. - -The distinction between the value encoding of level 1 and the levels above it -is to allow us to retrieve the value in level 0 quickly by reading the key of -level 1 (we obtain the string value of the bound and execute a prefix search -in the database). 
- -Therefore, for strings, the structure for a single faceted field looks more like this: -```text -┌───────┐ ┌───────────────────────────────────────────────────────────────────────────────┐ -│ all │ │ [a, b, c, d, e, f, g, u, y, z] │ -└───────┘ └───────────────────────────────────────────────────────────────────────────────┘ - - ┌───────────────────────────────┬───────────────────────────────┬───────────────┐ -┌───────┐ │ 0 – 3 │ 4 – 7 │ 8 – 9 │ -│Level 2│ │ │ │ │ -└───────┘ │ [a, b, d, f, z] │ [c, d, e, f, g] │ [u, y] │ - ├───────────────┬───────────────┼───────────────┬───────────────┼───────────────┤ -┌───────┐ │ 0 – 1 │ 2 – 3 │ 4 – 5 │ 6 – 7 │ 8 – 9 │ -│Level 1│ │ "ab" – "ac" │ "ba" – "bac" │ "gaf" – "gal" │"form" – "wow" │ "woz" – "zz" │ -└───────┘ │ [a, b, d, z] │ [a, b, f] │ [c, d, g] │ [e, f] │ [u, y] │ - ├───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┼───────┬───────┤ -┌───────┐ │ "ab" │ "ac" │ "ba" │ "bac" │ "gaf" │ "gal" │ "form"│ "wow" │ "woz" │ "zz" │ -│Level 0│ │ "AB" │ " Ac" │ "ba " │ "Bac" │ " GAF"│ "gal" │ "Form"│ " wow"│ "woz" │ "ZZ" │ -└───────┘ │ [a, b]│ [d, z]│ [b, f]│ [a, f]│ [c, d]│ [g] │ [e] │ [e, f]│ [y] │ [u] │ - └───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┴───────┘ - -The first line in a cell is its key (without the field id and level height) and the last two -lines are its values. 
-``` -*/ - -use std::cmp; -use std::fs::File; -use std::num::{NonZeroU8, NonZeroUsize}; -use std::ops::RangeFrom; - -use grenad::{CompressionType, Reader, Writer}; -use heed::types::{ByteSlice, DecodeIgnore}; -use heed::{BytesDecode, BytesEncode, Error}; -use log::debug; -use roaring::RoaringBitmap; -use time::OffsetDateTime; - -use crate::error::InternalError; -use crate::heed_codec::facet::{ - FacetLevelValueF64Codec, FacetLevelValueU32Codec, FacetStringLevelZeroCodec, - FacetStringLevelZeroValueCodec, FacetStringZeroBoundsValueCodec, -}; -use crate::heed_codec::CboRoaringBitmapCodec; -use crate::update::index_documents::{create_writer, write_into_lmdb_database, writer_into_reader}; -use crate::{FieldId, Index, Result}; - -pub struct Facets<'t, 'u, 'i> { - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - pub(crate) chunk_compression_type: CompressionType, - pub(crate) chunk_compression_level: Option, - level_group_size: NonZeroUsize, - min_level_size: NonZeroUsize, -} - -impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { - pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> Facets<'t, 'u, 'i> { - Facets { - wtxn, - index, - chunk_compression_type: CompressionType::None, - chunk_compression_level: None, - level_group_size: NonZeroUsize::new(4).unwrap(), - min_level_size: NonZeroUsize::new(5).unwrap(), - } - } - - /// The number of elements from the level below that are represented by a single element in the level above - /// - /// This setting is always greater than or equal to 2. - pub fn level_group_size(&mut self, value: NonZeroUsize) -> &mut Self { - self.level_group_size = NonZeroUsize::new(cmp::max(value.get(), 2)).unwrap(); - self - } - - /// The minimum number of elements that a level is allowed to have. 
- pub fn min_level_size(&mut self, value: NonZeroUsize) -> &mut Self { - self.min_level_size = value; - self - } - - #[logging_timer::time("Facets::{}")] - pub fn execute(self) -> Result<()> { - self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?; - // We get the faceted fields to be able to create the facet levels. - let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?; - - debug!("Computing and writing the facet values levels docids into LMDB on disk..."); - - for field_id in faceted_fields { - // Clear the facet string levels. - clear_field_string_levels( - self.wtxn, - self.index.facet_id_string_docids.remap_types::(), - field_id, - )?; - - let (facet_string_levels, string_documents_ids) = compute_facet_strings_levels( - self.wtxn, - self.index.facet_id_string_docids, - self.chunk_compression_type, - self.chunk_compression_level, - self.level_group_size, - self.min_level_size, - field_id, - )?; - - self.index.put_string_faceted_documents_ids( - self.wtxn, - field_id, - &string_documents_ids, - )?; - for facet_strings_level in facet_string_levels { - write_into_lmdb_database( - self.wtxn, - *self.index.facet_id_string_docids.as_polymorph(), - facet_strings_level, - |_, _| { - Err(InternalError::IndexingMergingKeys { process: "facet string levels" })? - }, - )?; - } - - // Clear the facet number levels. 
- clear_field_number_levels(self.wtxn, self.index.facet_id_f64_docids, field_id)?; - - let (facet_number_levels, number_documents_ids) = compute_facet_number_levels( - self.wtxn, - self.index.facet_id_f64_docids, - self.chunk_compression_type, - self.chunk_compression_level, - self.level_group_size, - self.min_level_size, - field_id, - )?; - - self.index.put_number_faceted_documents_ids( - self.wtxn, - field_id, - &number_documents_ids, - )?; - - for facet_number_level in facet_number_levels { - write_into_lmdb_database( - self.wtxn, - *self.index.facet_id_f64_docids.as_polymorph(), - facet_number_level, - |_, _| { - Err(InternalError::IndexingMergingKeys { process: "facet number levels" })? - }, - )?; - } - } - - Ok(()) - } -} - -/// Compute the content of the database levels from its level 0 for the given field id. -/// -/// ## Returns: -/// 1. a vector of grenad::Reader. The reader at index `i` corresponds to the elements of level `i + 1` -/// that must be inserted into the database. -/// 2. a roaring bitmap of all the document ids present in the database -fn compute_facet_number_levels( - rtxn: &heed::RoTxn, - db: heed::Database, - compression_type: CompressionType, - compression_level: Option, - level_group_size: NonZeroUsize, - min_level_size: NonZeroUsize, - field_id: FieldId, -) -> Result<(Vec>, RoaringBitmap)> { - let first_level_size = db - .remap_key_type::() - .prefix_iter(rtxn, &field_id.to_be_bytes())? - .remap_types::() - .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; - - let level_0_start = (field_id, 0, f64::MIN, f64::MIN); - - // Groups sizes are always a power of the original level_group_size and therefore a group - // always maps groups of the previous level and never splits previous levels groups in half. - let group_size_iter = (1u8..) 
- .map(|l| (l, level_group_size.get().pow(l as u32))) - .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()) - .collect::>(); - - let mut number_document_ids = RoaringBitmap::new(); - - if let Some((top_level, _)) = group_size_iter.last() { - let subwriters = - recursive_compute_levels::( - rtxn, - db, - compression_type, - compression_level, - *top_level, - level_0_start, - &(level_0_start..), - first_level_size, - level_group_size, - &mut |bitmaps, _, _| { - for bitmap in bitmaps { - number_document_ids |= bitmap; - } - Ok(()) - }, - &|_i, (_field_id, _level, left, _right)| *left, - &|bitmap| bitmap, - &|writer, level, left, right, docids| { - write_number_entry(writer, field_id, level.get(), left, right, &docids)?; - Ok(()) - }, - )?; - - Ok((subwriters, number_document_ids)) - } else { - let mut documents_ids = RoaringBitmap::new(); - for result in db.range(rtxn, &(level_0_start..))?.take(first_level_size) { - let (_key, docids) = result?; - documents_ids |= docids; - } - - Ok((vec![], documents_ids)) - } -} - -/// Compute the content of the database levels from its level 0 for the given field id. -/// -/// ## Returns: -/// 1. a vector of grenad::Reader. The reader at index `i` corresponds to the elements of level `i + 1` -/// that must be inserted into the database. -/// 2. a roaring bitmap of all the document ids present in the database -fn compute_facet_strings_levels( - rtxn: &heed::RoTxn, - db: heed::Database, - compression_type: CompressionType, - compression_level: Option, - level_group_size: NonZeroUsize, - min_level_size: NonZeroUsize, - field_id: FieldId, -) -> Result<(Vec>, RoaringBitmap)> { - let first_level_size = db - .remap_key_type::() - .prefix_iter(rtxn, &field_id.to_be_bytes())? 
- .remap_types::() - .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; - - let level_0_start = (field_id, ""); - - // Groups sizes are always a power of the original level_group_size and therefore a group - // always maps groups of the previous level and never splits previous levels groups in half. - let group_size_iter = (1u8..) - .map(|l| (l, level_group_size.get().pow(l as u32))) - .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()) - .collect::>(); - - let mut strings_document_ids = RoaringBitmap::new(); - - if let Some((top_level, _)) = group_size_iter.last() { - let subwriters = recursive_compute_levels::< - FacetStringLevelZeroCodec, - FacetStringLevelZeroValueCodec, - (u32, &str), - >( - rtxn, - db, - compression_type, - compression_level, - *top_level, - level_0_start, - &(level_0_start..), - first_level_size, - level_group_size, - &mut |bitmaps, _, _| { - for bitmap in bitmaps { - strings_document_ids |= bitmap; - } - Ok(()) - }, - &|i, (_field_id, value)| (i as u32, *value), - &|value| value.1, - &|writer, level, start_bound, end_bound, docids| { - write_string_entry(writer, field_id, level, start_bound, end_bound, docids)?; - Ok(()) - }, - )?; - - Ok((subwriters, strings_document_ids)) - } else { - let mut documents_ids = RoaringBitmap::new(); - for result in db.range(rtxn, &(level_0_start..))?.take(first_level_size) { - let (_key, (_original_value, docids)) = result?; - documents_ids |= docids; - } - - Ok((vec![], documents_ids)) - } -} - -/** -Compute a level from the levels below it, with the elements of level 0 already existing in the given `db`. - -This function is generic to work with both numbers and strings. The generic type parameters are: -* `KeyCodec`/`ValueCodec`: the codecs used to read the elements of the database. -* `Bound`: part of the range in the levels structure. For example, for numbers, the `Bound` is `f64` -because each chunk in a level contains a range such as (1.2 ..= 4.5). 
- -## Arguments -* `rtxn` : LMDB read transaction -* `db`: a database which already contains a `level 0` -* `compression_type`/`compression_level`: parameters used to create the `grenad::Writer` that -will contain the new levels -* `level` : the height of the level to create, or `0` to read elements from level 0. -* `level_0_start` : a key in the database that points to the beginning of its level 0 -* `level_0_range` : equivalent to `level_0_start..` -* `level_0_size` : the number of elements in level 0 -* `level_group_size` : the number of elements from the level below that are represented by a -single element of the new level -* `computed_group_bitmap` : a callback that is called whenever at most `level_group_size` elements -from the level below were read/created. Its arguments are: - 0. the list of bitmaps from each read/created element of the level below - 1. the start bound corresponding to the first element - 2. the end bound corresponding to the last element -* `bound_from_db_key` : finds the `Bound` from a key in the database -* `bitmap_from_db_value` : finds the `RoaringBitmap` from a value in the database -* `write_entry` : writes an element of a level into the writer. The arguments are: - 0. the writer - 1. the height of the level - 2. the start bound - 3. the end bound - 4. the docids of all elements between the start and end bound - -## Return -A vector of grenad::Reader. The reader at index `i` corresponds to the elements of level `i + 1` -that must be inserted into the database. 
-*/ -fn recursive_compute_levels<'t, KeyCodec, ValueCodec, Bound>( - rtxn: &'t heed::RoTxn, - db: heed::Database, - compression_type: CompressionType, - compression_level: Option, - level: u8, - level_0_start: >::DItem, - level_0_range: &'t RangeFrom<>::DItem>, - level_0_size: usize, - level_group_size: NonZeroUsize, - computed_group_bitmap: &mut dyn FnMut(&[RoaringBitmap], Bound, Bound) -> Result<()>, - bound_from_db_key: &dyn for<'a> Fn(usize, &'a >::DItem) -> Bound, - bitmap_from_db_value: &dyn Fn(>::DItem) -> RoaringBitmap, - write_entry: &dyn Fn(&mut Writer, NonZeroU8, Bound, Bound, RoaringBitmap) -> Result<()>, -) -> Result>> -where - KeyCodec: for<'a> BytesEncode<'a> - + for<'a> BytesDecode<'a, DItem = >::EItem>, - for<'a> >::EItem: Sized, - ValueCodec: for<'a> BytesEncode<'a> - + for<'a> BytesDecode<'a, DItem = >::EItem>, - for<'a> >::EItem: Sized, - Bound: Copy, -{ - if level == 0 { - // base case for the recursion - - // we read the elements one by one and - // 1. keep track of the start and end bounds - // 2. 
fill the `bitmaps` vector to give it to level 1 once `level_group_size` elements were read - let mut bitmaps = vec![]; - - let mut start_bound = bound_from_db_key(0, &level_0_start); - let mut end_bound = bound_from_db_key(0, &level_0_start); - let mut first_iteration_for_new_group = true; - for (i, db_result_item) in db.range(rtxn, level_0_range)?.take(level_0_size).enumerate() { - let (key, value) = db_result_item?; - - let bound = bound_from_db_key(i, &key); - let docids = bitmap_from_db_value(value); - - if first_iteration_for_new_group { - start_bound = bound; - first_iteration_for_new_group = false; - } - end_bound = bound; - bitmaps.push(docids); - - if bitmaps.len() == level_group_size.get() { - computed_group_bitmap(&bitmaps, start_bound, end_bound)?; - first_iteration_for_new_group = true; - bitmaps.clear(); - } - } - // don't forget to give the leftover bitmaps as well - if !bitmaps.is_empty() { - computed_group_bitmap(&bitmaps, start_bound, end_bound)?; - bitmaps.clear(); - } - // level 0 is already stored in the DB - Ok(vec![]) - } else { - // level >= 1 - // we compute each element of this level based on the elements of the level below it - // once we have computed `level_group_size` elements, we give the start and end bounds - // of those elements, and their bitmaps, to the level above - - let mut cur_writer = - create_writer(compression_type, compression_level, tempfile::tempfile()?); - - let mut range_for_bitmaps = vec![]; - let mut bitmaps = vec![]; - - // compute the levels below - // in the callback, we fill `cur_writer` with the correct elements for this level - let mut sub_writers = recursive_compute_levels( - rtxn, - db, - compression_type, - compression_level, - level - 1, - level_0_start, - level_0_range, - level_0_size, - level_group_size, - &mut |sub_bitmaps: &[RoaringBitmap], start_range, end_range| { - let mut combined_bitmap = RoaringBitmap::default(); - for bitmap in sub_bitmaps { - combined_bitmap |= bitmap; - } - 
range_for_bitmaps.push((start_range, end_range)); - - bitmaps.push(combined_bitmap); - if bitmaps.len() == level_group_size.get() { - let start_bound = range_for_bitmaps.first().unwrap().0; - let end_bound = range_for_bitmaps.last().unwrap().1; - computed_group_bitmap(&bitmaps, start_bound, end_bound)?; - for (bitmap, (start_bound, end_bound)) in - bitmaps.drain(..).zip(range_for_bitmaps.drain(..)) - { - write_entry( - &mut cur_writer, - NonZeroU8::new(level).unwrap(), - start_bound, - end_bound, - bitmap, - )?; - } - } - Ok(()) - }, - bound_from_db_key, - bitmap_from_db_value, - write_entry, - )?; - // don't forget to insert the leftover elements into the writer as well - if !bitmaps.is_empty() { - let start_range = range_for_bitmaps.first().unwrap().0; - let end_range = range_for_bitmaps.last().unwrap().1; - computed_group_bitmap(&bitmaps, start_range, end_range)?; - for (bitmap, (left, right)) in bitmaps.drain(..).zip(range_for_bitmaps.drain(..)) { - write_entry(&mut cur_writer, NonZeroU8::new(level).unwrap(), left, right, bitmap)?; - } - } - - sub_writers.push(writer_into_reader(cur_writer)?); - Ok(sub_writers) - } -} - -fn clear_field_number_levels<'t>( - wtxn: &'t mut heed::RwTxn, - db: heed::Database, - field_id: FieldId, -) -> heed::Result<()> { - let left = (field_id, 1, f64::MIN, f64::MIN); - let right = (field_id, u8::MAX, f64::MAX, f64::MAX); - let range = left..=right; - db.delete_range(wtxn, &range).map(drop) -} - -fn clear_field_string_levels<'t>( - wtxn: &'t mut heed::RwTxn, - db: heed::Database, - field_id: FieldId, -) -> heed::Result<()> { - let left = (field_id, NonZeroU8::new(1).unwrap(), u32::MIN, u32::MIN); - let right = (field_id, NonZeroU8::new(u8::MAX).unwrap(), u32::MAX, u32::MAX); - let range = left..=right; - db.remap_key_type::().delete_range(wtxn, &range).map(drop) -} - -fn write_number_entry( - writer: &mut Writer, - field_id: FieldId, - level: u8, - left: f64, - right: f64, - ids: &RoaringBitmap, -) -> Result<()> { - let key = 
(field_id, level, left, right); - let key = FacetLevelValueF64Codec::bytes_encode(&key).ok_or(Error::Encoding)?; - let data = CboRoaringBitmapCodec::bytes_encode(ids).ok_or(Error::Encoding)?; - writer.insert(&key, &data)?; - Ok(()) -} -fn write_string_entry( - writer: &mut Writer, - field_id: FieldId, - level: NonZeroU8, - (left_id, left_value): (u32, &str), - (right_id, right_value): (u32, &str), - docids: RoaringBitmap, -) -> Result<()> { - let key = (field_id, level, left_id, right_id); - let key = FacetLevelValueU32Codec::bytes_encode(&key).ok_or(Error::Encoding)?; - let data = match level.get() { - 1 => (Some((left_value, right_value)), docids), - _ => (None, docids), - }; - let data = FacetStringZeroBoundsValueCodec::::bytes_encode(&data) - .ok_or(Error::Encoding)?; - writer.insert(&key, &data)?; - Ok(()) -} - -#[cfg(test)] -mod tests { - use std::num::NonZeroUsize; - - use crate::db_snap; - use crate::documents::documents_batch_reader_from_objects; - use crate::index::tests::TempIndex; - - #[test] - fn test_facets_number() { - let test = - |name: &str, group_size: Option, min_level_size: Option| { - let mut index = TempIndex::new_with_map_size(4096 * 1000 * 10); // 40MB - index.index_documents_config.autogenerate_docids = true; - index.index_documents_config.facet_level_group_size = group_size; - index.index_documents_config.facet_min_level_size = min_level_size; - - index - .update_settings(|settings| { - settings.set_filterable_fields( - IntoIterator::into_iter(["facet".to_owned(), "facet2".to_owned()]) - .collect(), - ); - }) - .unwrap(); - - let mut documents = vec![]; - for i in 0..1_000 { - documents.push(serde_json::json!({ "facet": i }).as_object().unwrap().clone()); - } - for i in 0..100 { - documents.push(serde_json::json!({ "facet2": i }).as_object().unwrap().clone()); - } - let documents = documents_batch_reader_from_objects(documents); - - index.add_documents(documents).unwrap(); - - db_snap!(index, facet_id_f64_docids, name); - }; - - 
test("default", None, None); - test("tiny_groups_tiny_levels", NonZeroUsize::new(1), NonZeroUsize::new(1)); - test("small_groups_small_levels", NonZeroUsize::new(2), NonZeroUsize::new(2)); - test("small_groups_large_levels", NonZeroUsize::new(2), NonZeroUsize::new(128)); - test("large_groups_small_levels", NonZeroUsize::new(16), NonZeroUsize::new(2)); - test("large_groups_large_levels", NonZeroUsize::new(16), NonZeroUsize::new(256)); - } - - #[test] - fn test_facets_string() { - let test = |name: &str, - group_size: Option, - min_level_size: Option| { - let mut index = TempIndex::new_with_map_size(4096 * 1000 * 10); // 40MB - index.index_documents_config.autogenerate_docids = true; - index.index_documents_config.facet_level_group_size = group_size; - index.index_documents_config.facet_min_level_size = min_level_size; - - index - .update_settings(|settings| { - settings.set_filterable_fields( - IntoIterator::into_iter(["facet".to_owned(), "facet2".to_owned()]) - .collect(), - ); - }) - .unwrap(); - - let mut documents = vec![]; - for i in 0..100 { - documents.push( - serde_json::json!({ "facet": format!("s{i:X}") }).as_object().unwrap().clone(), - ); - } - for i in 0..10 { - documents.push( - serde_json::json!({ "facet2": format!("s{i:X}") }).as_object().unwrap().clone(), - ); - } - let documents = documents_batch_reader_from_objects(documents); - - index.add_documents(documents).unwrap(); - - db_snap!(index, facet_id_string_docids, name); - }; - - test("default", None, None); - test("tiny_groups_tiny_levels", NonZeroUsize::new(1), NonZeroUsize::new(1)); - } -} diff --git a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs index 61157fa35..1d415166d 100644 --- a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs @@ -6,7 +6,9 @@ use heed::{BytesDecode, BytesEncode}; use 
super::helpers::{ create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters, }; -use crate::heed_codec::facet::{FacetLevelValueF64Codec, FieldDocIdFacetF64Codec}; +use crate::heed_codec::facet::{ + FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec, +}; use crate::Result; /// Extracts the facet number and the documents ids where this facet number appear. @@ -34,8 +36,8 @@ pub fn extract_facet_number_docids( let (field_id, document_id, number) = FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap(); - let key = (field_id, 0, number, number); - let key_bytes = FacetLevelValueF64Codec::bytes_encode(&key).unwrap(); + let key = FacetGroupKey { field_id, level: 0, left_bound: number }; + let key_bytes = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; } diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index f7aa3730c..221356ba0 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -1,14 +1,12 @@ use std::fs::File; -use std::iter::FromIterator; -use std::{io, str}; +use std::io; -use roaring::RoaringBitmap; +use heed::BytesEncode; -use super::helpers::{ - create_sorter, keep_first_prefix_value_merge_roaring_bitmaps, sorter_into_reader, - try_split_array_at, GrenadParameters, -}; -use crate::heed_codec::facet::{encode_prefix_string, FacetStringLevelZeroCodec}; +use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; +use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; +use crate::heed_codec::StrRefCodec; +use crate::update::index_documents::merge_cbo_roaring_bitmaps; use crate::{FieldId, Result}; /// Extracts the facet string and the documents ids where this facet string appear. 
@@ -24,36 +22,28 @@ pub fn extract_facet_string_docids( let mut facet_string_docids_sorter = create_sorter( grenad::SortAlgorithm::Stable, - keep_first_prefix_value_merge_roaring_bitmaps, + merge_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, max_memory, ); - let mut key_buffer = Vec::new(); - let mut value_buffer = Vec::new(); let mut cursor = docid_fid_facet_string.into_cursor()?; - while let Some((key, original_value_bytes)) = cursor.move_on_next()? { + while let Some((key, _original_value_bytes)) = cursor.move_on_next()? { let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); let field_id = FieldId::from_be_bytes(field_id_bytes); - let (document_id_bytes, normalized_value_bytes) = try_split_array_at(bytes).unwrap(); + + let (document_id_bytes, normalized_value_bytes) = + try_split_array_at::<_, 4>(bytes).unwrap(); let document_id = u32::from_be_bytes(document_id_bytes); - let original_value = str::from_utf8(original_value_bytes)?; - key_buffer.clear(); - FacetStringLevelZeroCodec::serialize_into( - field_id, - str::from_utf8(normalized_value_bytes)?, - &mut key_buffer, - ); + let normalised_value = std::str::from_utf8(normalized_value_bytes)?; + let key = FacetGroupKey { field_id, level: 0, left_bound: normalised_value }; + let key_bytes = FacetGroupKeyCodec::::bytes_encode(&key).unwrap(); - value_buffer.clear(); - encode_prefix_string(original_value, &mut value_buffer)?; - let bitmap = RoaringBitmap::from_iter(Some(document_id)); - bitmap.serialize_into(&mut value_buffer)?; - - facet_string_docids_sorter.insert(&key_buffer, &value_buffer)?; + // document id is encoded in native-endian because of the CBO roaring bitmap codec + facet_string_docids_sorter.insert(&key_bytes, &document_id.to_ne_bytes())?; } sorter_into_reader(facet_string_docids_sorter, indexer) diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 
8e0e61175..c0f12e9ee 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -25,8 +25,8 @@ use self::extract_word_docids::extract_word_docids; use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids; use self::extract_word_position_docids::extract_word_position_docids; use super::helpers::{ - as_cloneable_grenad, keep_first_prefix_value_merge_roaring_bitmaps, merge_cbo_roaring_bitmaps, - merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn, MergeableReader, + as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap, + GrenadParameters, MergeFn, MergeableReader, }; use super::{helpers, TypedChunk}; use crate::{FieldId, Result}; @@ -142,7 +142,7 @@ pub(crate) fn data_from_obkv_documents( indexer, lmdb_writer_sx.clone(), extract_facet_string_docids, - keep_first_prefix_value_merge_roaring_bitmaps, + merge_cbo_roaring_bitmaps, TypedChunk::FieldIdFacetStringDocids, "field-id-facet-string-docids", ); diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index e18cb4e16..03f15945a 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -3,7 +3,7 @@ use std::fs::File; use std::io::{self, Seek, SeekFrom}; use std::time::Instant; -use grenad::{CompressionType, Reader, Sorter}; +use grenad::{CompressionType, Sorter}; use heed::types::ByteSlice; use log::debug; @@ -208,36 +208,6 @@ pub fn grenad_obkv_into_chunks( Ok(std::iter::from_fn(move || transposer().transpose())) } -pub fn write_into_lmdb_database( - wtxn: &mut heed::RwTxn, - database: heed::PolyDatabase, - reader: Reader, - merge: MergeFn, -) -> Result<()> { - debug!("Writing MTBL stores..."); - let before = Instant::now(); - - let mut cursor = reader.into_cursor()?; - while let Some((k, v)) = cursor.move_on_next()? 
{ - let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; - match iter.next().transpose()? { - Some((key, old_val)) if key == k => { - let vals = &[Cow::Borrowed(old_val), Cow::Borrowed(v)][..]; - let val = merge(k, vals)?; - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(k, &val)? }; - } - _ => { - drop(iter); - database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; - } - } - } - - debug!("MTBL stores merged in {:.02?}!", before.elapsed()); - Ok(()) -} - pub fn sorter_into_lmdb_database( wtxn: &mut heed::RwTxn, database: heed::PolyDatabase, diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs index dbe3c0344..37af7ab6a 100644 --- a/milli/src/update/index_documents/helpers/merge_functions.rs +++ b/milli/src/update/index_documents/helpers/merge_functions.rs @@ -5,7 +5,6 @@ use std::result::Result as StdResult; use roaring::RoaringBitmap; use super::read_u32_ne_bytes; -use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; use crate::heed_codec::CboRoaringBitmapCodec; use crate::Result; @@ -49,33 +48,6 @@ pub fn merge_roaring_bitmaps<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Resul } } -pub fn keep_first_prefix_value_merge_roaring_bitmaps<'a>( - _key: &[u8], - values: &[Cow<'a, [u8]>], -) -> Result> { - if values.len() == 1 { - Ok(values[0].clone()) - } else { - let original = decode_prefix_string(&values[0]).unwrap().0; - let merged_bitmaps = values - .iter() - .map(AsRef::as_ref) - .map(decode_prefix_string) - .map(Option::unwrap) - .map(|(_, bitmap_bytes)| bitmap_bytes) - .map(RoaringBitmap::deserialize_from) - .map(StdResult::unwrap) - .reduce(|a, b| a | b) - .unwrap(); - - let cap = std::mem::size_of::() + original.len() + merged_bitmaps.serialized_size(); - let mut buffer = Vec::with_capacity(cap); - encode_prefix_string(original, &mut buffer)?; - merged_bitmaps.serialize_into(&mut 
buffer)?; - Ok(Cow::Owned(buffer)) - } -} - pub fn keep_first<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result> { Ok(values[0].clone()) } diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index 6466a636b..8fb629cae 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -9,13 +9,13 @@ pub use clonable_mmap::{ClonableMmap, CursorClonableMmap}; use fst::{IntoStreamer, Streamer}; pub use grenad_helpers::{ as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, - merge_ignore_values, sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, - writer_into_reader, GrenadParameters, MergeableReader, + merge_ignore_values, sorter_into_lmdb_database, sorter_into_reader, writer_into_reader, + GrenadParameters, MergeableReader, }; pub use merge_functions::{ - concat_u32s_array, keep_first, keep_first_prefix_value_merge_roaring_bitmaps, keep_latest_obkv, - merge_cbo_roaring_bitmaps, merge_obkvs, merge_roaring_bitmaps, merge_two_obkvs, - roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, MergeFn, + concat_u32s_array, keep_first, keep_latest_obkv, merge_cbo_roaring_bitmaps, merge_obkvs, + merge_roaring_bitmaps, merge_two_obkvs, roaring_bitmap_from_u32s_array, + serialize_roaring_bitmap, MergeFn, }; /// The maximum length a word can be diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 6838b6651..a121d3ae0 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -7,7 +7,7 @@ mod typed_chunk; use std::collections::HashSet; use std::io::{Cursor, Read, Seek}; use std::iter::FromIterator; -use std::num::{NonZeroU32, NonZeroUsize}; +use std::num::NonZeroU32; use std::result::Result as StdResult; use crossbeam_channel::{Receiver, Sender}; @@ -27,8 +27,7 @@ pub use self::enrich::{ pub use self::helpers::{ as_cloneable_grenad, 
create_sorter, create_writer, fst_stream_into_hashset, fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, - sorter_into_lmdb_database, valid_lmdb_key, write_into_lmdb_database, writer_into_reader, - ClonableMmap, MergeFn, + sorter_into_lmdb_database, valid_lmdb_key, writer_into_reader, ClonableMmap, MergeFn, }; use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; @@ -36,8 +35,8 @@ use crate::documents::{obkv_to_object, DocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; pub use crate::update::index_documents::helpers::CursorClonableMmap; use crate::update::{ - self, Facets, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, - WordPrefixDocids, WordPrefixPositionDocids, WordsPrefixesFst, + self, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, WordPrefixDocids, + WordPrefixPositionDocids, WordsPrefixesFst, }; use crate::{Index, Result, RoaringBitmapCodec}; @@ -84,8 +83,6 @@ pub struct IndexDocuments<'t, 'u, 'i, 'a, FP, FA> { #[derive(Default, Debug, Clone)] pub struct IndexDocumentsConfig { - pub facet_level_group_size: Option, - pub facet_min_level_size: Option, pub words_prefix_threshold: Option, pub max_prefix_length: Option, pub words_positions_level_group_size: Option, @@ -445,18 +442,6 @@ where return Err(Error::InternalError(InternalError::AbortedIndexation)); } - // Run the facets update operation. 
- let mut builder = Facets::new(self.wtxn, self.index); - builder.chunk_compression_type = self.indexer_config.chunk_compression_type; - builder.chunk_compression_level = self.indexer_config.chunk_compression_level; - if let Some(value) = self.config.facet_level_group_size { - builder.level_group_size(value); - } - if let Some(value) = self.config.facet_min_level_size { - builder.min_level_size(value); - } - builder.execute()?; - databases_seen += 1; (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { databases_seen, @@ -643,7 +628,7 @@ mod tests { use crate::index::tests::TempIndex; use crate::search::TermsMatchingStrategy; use crate::update::DeleteDocuments; - use crate::BEU16; + use crate::{db_snap, BEU16}; #[test] fn simple_document_replacement() { @@ -1430,6 +1415,25 @@ mod tests { }) .unwrap(); + db_snap!(index, facet_id_string_docids, @r###" + 3 0 first 1 [1, ] + 3 0 second 1 [2, ] + 3 0 third 1 [3, ] + 3 0 zeroth 1 [0, ] + "###); + db_snap!(index, field_id_docid_facet_strings, @r###" + 3 0 zeroth zeroth + 3 1 first first + 3 2 second second + 3 3 third third + "###); + db_snap!(index, string_faceted_documents_ids, @r###" + 0 [] + 1 [] + 2 [] + 3 [0, 1, 2, 3, ] + "###); + let rtxn = index.read_txn().unwrap(); let hidden = index.faceted_fields(&rtxn).unwrap(); @@ -1450,6 +1454,15 @@ mod tests { }) .unwrap(); + db_snap!(index, facet_id_string_docids, @""); + db_snap!(index, field_id_docid_facet_strings, @""); + db_snap!(index, string_faceted_documents_ids, @r###" + 0 [] + 1 [] + 2 [] + 3 [0, 1, 2, 3, ] + "###); + let rtxn = index.read_txn().unwrap(); let facets = index.faceted_fields(&rtxn).unwrap(); @@ -1463,6 +1476,25 @@ mod tests { }) .unwrap(); + db_snap!(index, facet_id_string_docids, @r###" + 3 0 first 1 [1, ] + 3 0 second 1 [2, ] + 3 0 third 1 [3, ] + 3 0 zeroth 1 [0, ] + "###); + db_snap!(index, field_id_docid_facet_strings, @r###" + 3 0 zeroth zeroth + 3 1 first first + 3 2 second second + 3 3 third third + "###); + db_snap!(index, 
string_faceted_documents_ids, @r###" + 0 [] + 1 [] + 2 [] + 3 [0, 1, 2, 3, ] + "###); + let rtxn = index.read_txn().unwrap(); let facets = index.faceted_fields(&rtxn).unwrap(); diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 8464c98b6..16784bd92 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -13,7 +13,8 @@ use super::helpers::{ valid_lmdb_key, CursorClonableMmap, }; use super::{ClonableMmap, MergeFn}; -use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; +use crate::facet::FacetType; +use crate::update::facet::FacetsUpdate; use crate::update::index_documents::helpers::as_cloneable_grenad; use crate::{ lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, @@ -136,15 +137,14 @@ pub(crate) fn write_typed_chunk_into_index( )?; is_merged_database = true; } - TypedChunk::FieldIdFacetNumberDocids(facet_id_f64_docids_iter) => { - append_entries_into_database( - facet_id_f64_docids_iter, - &index.facet_id_f64_docids, - wtxn, - index_is_empty, - |value, _buffer| Ok(value), - merge_cbo_roaring_bitmaps, - )?; + TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids_iter) => { + let indexer = FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter); + indexer.execute(wtxn)?; + is_merged_database = true; + } + TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids_iter) => { + let indexer = FacetsUpdate::new(index, FacetType::String, facet_id_string_docids_iter); + indexer.execute(wtxn)?; is_merged_database = true; } TypedChunk::FieldIdFacetExistsDocids(facet_id_exists_docids) => { @@ -189,25 +189,6 @@ pub(crate) fn write_typed_chunk_into_index( } } } - TypedChunk::FieldIdFacetStringDocids(facet_id_string_docids) => { - append_entries_into_database( - facet_id_string_docids, - &index.facet_id_string_docids, - wtxn, - index_is_empty, - |value, _buffer| 
Ok(value), - |new_values, db_values, buffer| { - let (_, new_values) = decode_prefix_string(new_values).unwrap(); - let new_values = RoaringBitmap::deserialize_from(new_values)?; - let (db_original, db_values) = decode_prefix_string(db_values).unwrap(); - let db_values = RoaringBitmap::deserialize_from(db_values)?; - let values = new_values | db_values; - encode_prefix_string(db_original, buffer)?; - Ok(values.serialize_into(buffer)?) - }, - )?; - is_merged_database = true; - } TypedChunk::GeoPoints(geo_points) => { let mut rtree = index.geo_rtree(wtxn)?.unwrap_or_default(); let mut geo_faceted_docids = index.geo_faceted_documents_ids(wtxn)?; diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 3ddc01cef..952720725 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -1,7 +1,8 @@ pub use self::available_documents_ids::AvailableDocumentsIds; pub use self::clear_documents::ClearDocuments; pub use self::delete_documents::{DeleteDocuments, DocumentDeletionResult}; -pub use self::facets::Facets; +pub use self::facet::bulk::FacetsUpdateBulk; +pub use self::facet::incremental::FacetsUpdateIncrementalInner; pub use self::index_documents::{ DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, }; @@ -16,7 +17,7 @@ pub use self::words_prefixes_fst::WordsPrefixesFst; mod available_documents_ids; mod clear_documents; mod delete_documents; -mod facets; +pub(crate) mod facet; mod index_documents; mod indexer_config; mod prefix_word_pairs; diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: 
milli/src/update/delete_documents.rs +--- +[] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/facet_id_exists_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/facet_id_exists_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/word_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/word_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/word_pair_proximity_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ 
b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/false/word_pair_proximity_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/facet_id_exists_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/facet_id_exists_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/word_docids.snap 
b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/word_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/word_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/word_pair_proximity_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/true/word_pair_proximity_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/documents_ids.snap new file mode 100644 index 000000000..6d69b2ffb --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[2, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..9139b7a05 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[0, 1, ] diff --git 
a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/word_docids.snap new file mode 100644 index 000000000..15c881e87 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/word_docids.snap @@ -0,0 +1,7 @@ +--- +source: milli/src/update/delete_documents.rs +--- +benoit [2, ] +kevin [0, ] +kevina [1, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/word_pair_proximity_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/false/word_pair_proximity_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/documents_ids.snap new file mode 100644 index 000000000..6d69b2ffb --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[2, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ 
b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[] diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/word_docids.snap new file mode 100644 index 000000000..88d3a98aa --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/word_docids.snap @@ -0,0 +1,5 @@ +--- +source: milli/src/update/delete_documents.rs +--- +benoit [2, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/word_pair_proximity_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/true/word_pair_proximity_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_exists_docids.snap new file mode 100644 index 000000000..a7ee4348d --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_exists_docids.snap @@ -0,0 +1,6 @@ +--- +source: milli/src/update/delete_documents.rs +--- +1 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ] +2 [20, 21, 22, ] + diff --git 
a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap new file mode 100644 index 000000000..cfa649653 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap @@ -0,0 +1,6 @@ +--- +source: milli/src/update/delete_documents.rs +--- +2 0 1.2 1 [20, 22, ] +2 0 2.2 1 [21, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_string_docids.snap new file mode 100644 index 000000000..8336bd712 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_string_docids.snap @@ -0,0 +1,19 @@ +--- +source: milli/src/update/delete_documents.rs +--- +1 0 abstract 1 [2, 6, 10, 13, 14, 15, 16, 17, ] +1 0 aquarium 1 [5, ] +1 0 art 1 [4, 5, 8, 9, 10, 12, 17, ] +1 0 cartoon 1 [2, 7, 15, 17, ] +1 0 colorfulness 1 [13, ] +1 0 design 1 [2, 18, ] +1 0 drawing 1 [3, 4, 5, 8, 10, 11, 16, ] +1 0 geometry 1 [19, ] +1 0 letter 1 [1, ] +1 0 outdoor 1 [4, ] +1 0 painting 1 [3, ] +1 0 pattern 1 [2, 3, 9, 10, 13, 14, 16, ] +1 0 sign 1 [0, ] +2 0 design 1 [21, ] +2 0 geometry 1 [20, 22, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..dfac98e59 --- 
/dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[0, 20, 22, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/word_docids.snap new file mode 100644 index 000000000..7909d9b06 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/word_docids.snap @@ -0,0 +1,42 @@ +--- +source: milli/src/update/delete_documents.rs +--- +1.2 [20, 22, ] +1_36 [3, ] +1_37 [4, ] +1_38 [5, ] +1_39 [6, ] +1_4 [0, ] +1_40 [7, ] +1_41 [8, ] +1_42 [9, ] +1_43 [10, ] +1_44 [11, ] +1_45 [12, ] +1_46 [13, ] +1_47 [14, ] +1_5 [1, ] +1_52 [15, ] +1_57 [16, ] +1_58 [17, ] +1_68 [18, ] +1_69 [19, ] +1_7 [2, ] +1_70 [20, ] +1_71 [21, ] +1_72 [22, ] +2.2 [21, ] +abstract [2, 6, 10, 13, 14, 15, 16, 17, ] +aquarium [5, ] +art [4, 5, 8, 9, 10, 12, 17, ] +cartoon [2, 7, 15, 17, ] +colorfulness [13, ] +design [2, 18, 21, ] +drawing [3, 4, 5, 8, 10, 11, 16, ] +geometry [19, 20, 22, ] +letter [1, ] +outdoor [4, ] +painting [3, ] +pattern [2, 3, 9, 10, 13, 14, 16, ] +sign [0, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/word_pair_proximity_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/false/word_pair_proximity_docids.snap @@ -0,0 +1,4 @@ +--- 
+source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_exists_docids.snap new file mode 100644 index 000000000..7481b11c4 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_exists_docids.snap @@ -0,0 +1,6 @@ +--- +source: milli/src/update/delete_documents.rs +--- +1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ] +2 [21, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap new file mode 100644 index 000000000..87856f6dc --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap @@ -0,0 +1,5 @@ +--- +source: milli/src/update/delete_documents.rs +--- +2 0 2.2 1 [21, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_string_docids.snap new file mode 100644 index 000000000..ab1d2175f --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_string_docids.snap @@ -0,0 +1,17 @@ +--- +source: milli/src/update/delete_documents.rs +--- +1 0 abstract 1 [2, 6, 10, 13, 14, 15, 16, 17, ] +1 0 aquarium 1 [5, ] +1 0 art 1 [4, 5, 
8, 9, 10, 12, 17, ] +1 0 cartoon 1 [2, 7, 15, 17, ] +1 0 colorfulness 1 [13, ] +1 0 design 1 [2, 18, ] +1 0 drawing 1 [3, 4, 5, 8, 10, 11, 16, ] +1 0 geometry 1 [19, ] +1 0 letter 1 [1, ] +1 0 outdoor 1 [4, ] +1 0 painting 1 [3, ] +1 0 pattern 1 [2, 3, 9, 10, 13, 14, 16, ] +2 0 design 1 [21, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[] diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/word_docids.snap new file mode 100644 index 000000000..d8125dfcf --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/word_docids.snap @@ -0,0 +1,37 @@ +--- +source: milli/src/update/delete_documents.rs +--- +1_36 [3, ] +1_37 [4, ] +1_38 [5, ] +1_39 [6, ] +1_40 [7, ] +1_41 [8, ] +1_42 [9, ] +1_43 [10, ] +1_44 [11, ] +1_45 [12, ] +1_46 [13, ] +1_47 [14, ] +1_5 [1, ] +1_52 [15, ] +1_57 [16, ] +1_58 [17, ] +1_68 [18, ] +1_69 [19, ] +1_7 [2, ] +1_71 [21, ] +2.2 [21, ] +abstract [2, 6, 10, 13, 14, 15, 16, 17, ] +aquarium [5, ] +art [4, 5, 8, 9, 10, 12, 17, ] +cartoon [2, 7, 15, 17, ] +colorfulness [13, ] +design [2, 18, 21, ] +drawing [3, 4, 5, 8, 10, 11, 16, ] +geometry [19, ] +letter [1, ] +outdoor [4, ] +painting [3, ] +pattern [2, 3, 9, 10, 
13, 14, 16, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/word_pair_proximity_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/true/word_pair_proximity_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap new file mode 100644 index 000000000..c909a3cd8 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_f64_docids.snap @@ -0,0 +1,53 @@ +--- +source: milli/src/update/delete_documents.rs +--- +3 0 48.9021 1 [19, ] +3 0 49.4449 1 [18, ] +3 0 49.9314 1 [17, ] +3 0 50.1112 1 [16, ] +3 0 50.1793 1 [15, ] +3 0 50.2844 1 [14, ] +3 0 50.3518 1 [13, ] +3 0 50.4095 1 [11, ] +3 0 50.4502 1 [12, ] +3 0 50.6053 1 [8, ] +3 0 50.6224 1 [3, ] +3 0 50.6299 1 [0, ] +3 0 50.6312 1 [2, ] +3 0 50.6415 1 [1, ] +3 0 50.6552 1 [4, ] +3 0 50.6924 1 [5, ] +3 0 50.7263 1 [6, ] +3 0 50.7453 1 [7, ] +3 0 50.8466 1 [10, ] +3 0 51.0537 1 [9, ] +3 1 48.9021 4 [16, 17, 18, 19, ] +3 1 50.1793 4 [11, 13, 14, 15, ] +3 1 50.4502 4 [0, 3, 8, 12, ] +3 1 50.6312 4 [1, 2, 4, 5, ] +3 1 50.7263 4 [6, 7, 9, 10, ] +4 0 2.271 1 [17, ] +4 0 2.3708 1 [19, ] +4 0 2.7637 1 [14, ] +4 0 2.7913 1 [18, ] +4 0 2.8547 1 [16, ] +4 0 3.0569 1 [0, ] +4 0 3.1106 1 [1, 2, ] +4 0 3.1476 1 [3, ] +4 0 3.1541 1 [6, ] +4 0 3.1763 
1 [5, ] +4 0 3.1897 1 [4, ] +4 0 3.2189 1 [15, ] +4 0 3.2206 1 [7, ] +4 0 3.3758 1 [8, ] +4 0 3.5326 1 [13, ] +4 0 3.6957 1 [9, ] +4 0 3.9623 1 [12, ] +4 0 4.337 1 [10, ] +4 0 4.4347 1 [11, ] +4 1 2.271 4 [14, 17, 18, 19, ] +4 1 2.8547 4 [0, 1, 2, 3, 16, ] +4 1 3.1541 4 [4, 5, 6, 15, ] +4 1 3.2206 4 [7, 8, 9, 13, ] +4 1 3.9623 3 [10, 11, 12, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_string_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/facet_id_string_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..1260b12de --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[4, 5, 6, 11, 16, 18, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap new file mode 100644 index 000000000..18a9d9309 --- /dev/null +++ 
b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_f64_docids.snap @@ -0,0 +1,31 @@ +--- +source: milli/src/update/delete_documents.rs +--- +3 0 48.9021 1 [19, ] +3 0 49.9314 1 [17, ] +3 0 50.1793 1 [15, ] +3 0 50.2844 1 [14, ] +3 0 50.3518 1 [13, ] +3 0 50.4502 1 [12, ] +3 0 50.6053 1 [8, ] +3 0 50.6224 1 [3, ] +3 0 50.6299 1 [0, ] +3 0 50.6312 1 [2, ] +3 0 50.6415 1 [1, ] +3 0 50.7453 1 [7, ] +3 0 50.8466 1 [10, ] +3 0 51.0537 1 [9, ] +4 0 2.271 1 [17, ] +4 0 2.3708 1 [19, ] +4 0 2.7637 1 [14, ] +4 0 3.0569 1 [0, ] +4 0 3.1106 1 [1, 2, ] +4 0 3.1476 1 [3, ] +4 0 3.2189 1 [15, ] +4 0 3.2206 1 [7, ] +4 0 3.3758 1 [8, ] +4 0 3.5326 1 [13, ] +4 0 3.6957 1 [9, ] +4 0 3.9623 1 [12, ] +4 0 4.337 1 [10, ] + diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_string_docids.snap new file mode 100644 index 000000000..88031d24a --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/facet_id_string_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- + diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs 
+--- +[] diff --git a/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..efcd7af8c --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[2, 15, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[] diff --git a/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..efcd7af8c --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[2, 15, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap new file mode 100644 index 
000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[] diff --git a/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..efcd7af8c --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/false/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[2, 15, ] diff --git a/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap new file mode 100644 index 000000000..e87bce206 --- /dev/null +++ b/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/true/soft_deleted_documents_ids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/delete_documents.rs +--- +[] diff --git a/milli/src/update/snapshots/facets.rs/test_facets_number/default/facet_id_f64_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_number/default/facet_id_f64_docids.hash.snap deleted file mode 100644 index 373455db6..000000000 --- a/milli/src/update/snapshots/facets.rs/test_facets_number/default/facet_id_f64_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facets.rs ---- -587899707db2848da3f18399e14ed4d0 diff --git a/milli/src/update/snapshots/facets.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap 
b/milli/src/update/snapshots/facets.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap deleted file mode 100644 index c3415c320..000000000 --- a/milli/src/update/snapshots/facets.rs/test_facets_number/large_groups_large_levels/facet_id_f64_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facets.rs ---- -02bbf2ca1663cccea0e4c06d5ad06a45 diff --git a/milli/src/update/snapshots/facets.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap deleted file mode 100644 index 78dad29f1..000000000 --- a/milli/src/update/snapshots/facets.rs/test_facets_number/large_groups_small_levels/facet_id_f64_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facets.rs ---- -e68ea591e1af3e53e544dff9a1648e88 diff --git a/milli/src/update/snapshots/facets.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap deleted file mode 100644 index 61a5908f4..000000000 --- a/milli/src/update/snapshots/facets.rs/test_facets_number/small_groups_large_levels/facet_id_f64_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facets.rs ---- -12a4bb0f5b95d7629c2b9a915150c0cf diff --git a/milli/src/update/snapshots/facets.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap deleted file mode 100644 index 961346de5..000000000 --- a/milli/src/update/snapshots/facets.rs/test_facets_number/small_groups_small_levels/facet_id_f64_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facets.rs ---- -6438e94bc7fada13022e0efccdf294e0 diff --git 
a/milli/src/update/snapshots/facets.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap deleted file mode 100644 index 2b7c1ef9c..000000000 --- a/milli/src/update/snapshots/facets.rs/test_facets_number/tiny_groups_tiny_levels/facet_id_f64_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facets.rs ---- -5348bbc46b5384455b6a900666d2a502 diff --git a/milli/src/update/snapshots/facets.rs/test_facets_string/default/facet_id_string_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_string/default/facet_id_string_docids.hash.snap deleted file mode 100644 index 901b86255..000000000 --- a/milli/src/update/snapshots/facets.rs/test_facets_string/default/facet_id_string_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facets.rs ---- -faddef9eae5f2efacfec51f20f2e8cd6 diff --git a/milli/src/update/snapshots/facets.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap b/milli/src/update/snapshots/facets.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap deleted file mode 100644 index aa6c85461..000000000 --- a/milli/src/update/snapshots/facets.rs/test_facets_string/tiny_groups_tiny_levels/facet_id_string_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/facets.rs ---- -ddb8fc987c5dc892337682595043858e