diff --git a/infos/src/main.rs b/infos/src/main.rs new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/infos/src/main.rs @@ -0,0 +1 @@ + diff --git a/milli/src/heed_codec/facet/facet_level_value_f64_codec.rs b/milli/src/heed_codec/facet/facet_level_value_f64_codec.rs deleted file mode 100644 index 1e66427ca..000000000 --- a/milli/src/heed_codec/facet/facet_level_value_f64_codec.rs +++ /dev/null @@ -1,89 +0,0 @@ -use std::borrow::Cow; -use std::convert::TryInto; - -use crate::facet::value_encoding::f64_into_bytes; -use crate::{try_split_array_at, FieldId}; - -// TODO do not de/serialize right bound when level = 0 -pub struct FacetLevelValueF64Codec; - -impl<'a> heed::BytesDecode<'a> for FacetLevelValueF64Codec { - type DItem = (FieldId, u8, f64, f64); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id_bytes, bytes) = try_split_array_at(bytes)?; - let field_id = u16::from_be_bytes(field_id_bytes); - let (level, bytes) = bytes.split_first()?; - - let (left, right) = if *level != 0 { - let left = bytes[16..24].try_into().ok().map(f64::from_be_bytes)?; - let right = bytes[24..].try_into().ok().map(f64::from_be_bytes)?; - (left, right) - } else { - let left = bytes[8..].try_into().ok().map(f64::from_be_bytes)?; - (left, left) - }; - - Some((field_id, *level, left, right)) - } -} - -impl heed::BytesEncode<'_> for FacetLevelValueF64Codec { - type EItem = (FieldId, u8, f64, f64); - - fn bytes_encode((field_id, level, left, right): &Self::EItem) -> Option> { - let mut buffer = [0u8; 32]; - - let len = if *level != 0 { - // Write the globally ordered floats. - let bytes = f64_into_bytes(*left)?; - buffer[..8].copy_from_slice(&bytes[..]); - - let bytes = f64_into_bytes(*right)?; - buffer[8..16].copy_from_slice(&bytes[..]); - - // Then the f64 values just to be able to read them back. - let bytes = left.to_be_bytes(); - buffer[16..24].copy_from_slice(&bytes[..]); - - let bytes = right.to_be_bytes(); - buffer[24..].copy_from_slice(&bytes[..]); - - 32 // length - } else { - // Write the globally ordered floats. - let bytes = f64_into_bytes(*left)?; - buffer[..8].copy_from_slice(&bytes[..]); - - // Then the f64 values just to be able to read them back. - let bytes = left.to_be_bytes(); - buffer[8..16].copy_from_slice(&bytes[..]); - - 16 // length - }; - - let mut bytes = Vec::with_capacity(len + 3); - bytes.extend_from_slice(&field_id.to_be_bytes()); - bytes.push(*level); - bytes.extend_from_slice(&buffer[..len]); - Some(Cow::Owned(bytes)) - } -} - -#[cfg(test)] -mod tests { - use heed::{BytesDecode, BytesEncode}; - - use super::*; - - #[test] - fn globally_ordered_f64() { - let bytes = FacetLevelValueF64Codec::bytes_encode(&(3, 0, 32.0, 0.0)).unwrap(); - let (name, level, left, right) = FacetLevelValueF64Codec::bytes_decode(&bytes).unwrap(); - assert_eq!((name, level, left, right), (3, 0, 32.0, 32.0)); - - let bytes = FacetLevelValueF64Codec::bytes_encode(&(3, 1, -32.0, 32.0)).unwrap(); - let (name, level, left, right) = FacetLevelValueF64Codec::bytes_decode(&bytes).unwrap(); - assert_eq!((name, level, left, right), (3, 1, -32.0, 32.0)); - } -} diff --git a/milli/src/heed_codec/facet/facet_level_value_u32_codec.rs b/milli/src/heed_codec/facet/facet_level_value_u32_codec.rs deleted file mode 100644 index 597335b6e..000000000 --- a/milli/src/heed_codec/facet/facet_level_value_u32_codec.rs +++ /dev/null @@ -1,53 +0,0 @@ -use std::borrow::Cow; -use std::convert::TryInto; -use std::num::NonZeroU8; - -use crate::{try_split_array_at, FieldId}; - -/// A codec that stores the field id, level 1 and higher and the groups ids. -/// -/// It can only be used to encode the facet string of the level 1 or higher. -pub struct FacetLevelValueU32Codec; - -impl<'a> heed::BytesDecode<'a> for FacetLevelValueU32Codec { - type DItem = (FieldId, NonZeroU8, u32, u32); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id_bytes, bytes) = try_split_array_at(bytes)?; - let field_id = u16::from_be_bytes(field_id_bytes); - let (level, bytes) = bytes.split_first()?; - let level = NonZeroU8::new(*level)?; - let left = bytes[8..12].try_into().ok().map(u32::from_be_bytes)?; - let right = bytes[12..].try_into().ok().map(u32::from_be_bytes)?; - Some((field_id, level, left, right)) - } -} - -impl heed::BytesEncode<'_> for FacetLevelValueU32Codec { - type EItem = (FieldId, NonZeroU8, u32, u32); - - fn bytes_encode((field_id, level, left, right): &Self::EItem) -> Option> { - let mut buffer = [0u8; 16]; - - // Write the big-endian integers. - let bytes = left.to_be_bytes(); - buffer[..4].copy_from_slice(&bytes[..]); - - let bytes = right.to_be_bytes(); - buffer[4..8].copy_from_slice(&bytes[..]); - - // Then the u32 values just to be able to read them back. - let bytes = left.to_be_bytes(); - buffer[8..12].copy_from_slice(&bytes[..]); - - let bytes = right.to_be_bytes(); - buffer[12..].copy_from_slice(&bytes[..]); - - let mut bytes = Vec::with_capacity(buffer.len() + 2 + 1); - bytes.extend_from_slice(&field_id.to_be_bytes()); - bytes.push(level.get()); - bytes.extend_from_slice(&buffer); - - Some(Cow::Owned(bytes)) - } -} diff --git a/milli/src/heed_codec/facet/facet_string_level_zero_codec.rs b/milli/src/heed_codec/facet/facet_string_level_zero_codec.rs deleted file mode 100644 index 009c6454a..000000000 --- a/milli/src/heed_codec/facet/facet_string_level_zero_codec.rs +++ /dev/null @@ -1,50 +0,0 @@ -use std::borrow::Cow; -use std::str; - -use crate::{try_split_array_at, FieldId}; - -/// A codec that stores the field id, level 0, and facet string. -/// -/// It can only be used to encode the facet string of the level 0, -/// as it hardcodes the level. -/// -/// We encode the level 0 to not break the lexicographical ordering of the LMDB keys, -/// and make sure that the levels are not mixed-up. The level 0 is special, the key -/// are strings, other levels represent groups and keys are simply two integers. -pub struct FacetStringLevelZeroCodec; - -impl FacetStringLevelZeroCodec { - pub fn serialize_into(field_id: FieldId, value: &str, out: &mut Vec) { - out.reserve(value.len() + 2); - out.extend_from_slice(&field_id.to_be_bytes()); - out.push(0); // the level zero (for LMDB ordering only) - out.extend_from_slice(value.as_bytes()); - } -} - -impl<'a> heed::BytesDecode<'a> for FacetStringLevelZeroCodec { - type DItem = (FieldId, &'a str); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id_bytes, bytes) = try_split_array_at(bytes)?; - let field_id = u16::from_be_bytes(field_id_bytes); - let (level, bytes) = bytes.split_first()?; - - if *level != 0 { - return None; - } - - let value = str::from_utf8(bytes).ok()?; - Some((field_id, value)) - } -} - -impl<'a> heed::BytesEncode<'a> for FacetStringLevelZeroCodec { - type EItem = (FieldId, &'a str); - - fn bytes_encode((field_id, value): &Self::EItem) -> Option> { - let mut bytes = Vec::new(); - FacetStringLevelZeroCodec::serialize_into(*field_id, value, &mut bytes); - Some(Cow::Owned(bytes)) - } -} diff --git a/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs b/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs deleted file mode 100644 index 22031c474..000000000 --- a/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs +++ /dev/null @@ -1,90 +0,0 @@ -use std::borrow::Cow; -use std::convert::TryInto; -use std::{marker, str}; - -use crate::error::SerializationError; -use crate::heed_codec::RoaringBitmapCodec; -use crate::{try_split_array_at, try_split_at, Result}; - -pub type FacetStringLevelZeroValueCodec = StringValueCodec; - -/// A codec that encodes a string in front of a value. -/// -/// The usecase is for the facet string levels algorithm where we must know the -/// original string of a normalized facet value, the original values are stored -/// in the value to not break the lexicographical ordering of the LMDB keys. -pub struct StringValueCodec(marker::PhantomData); - -impl<'a, C> heed::BytesDecode<'a> for StringValueCodec -where - C: heed::BytesDecode<'a>, -{ - type DItem = (&'a str, C::DItem); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (string, bytes) = decode_prefix_string(bytes)?; - C::bytes_decode(bytes).map(|item| (string, item)) - } -} - -impl<'a, C> heed::BytesEncode<'a> for StringValueCodec -where - C: heed::BytesEncode<'a>, -{ - type EItem = (&'a str, C::EItem); - - fn bytes_encode((string, value): &'a Self::EItem) -> Option> { - let value_bytes = C::bytes_encode(&value)?; - - let mut bytes = Vec::with_capacity(2 + string.len() + value_bytes.len()); - encode_prefix_string(string, &mut bytes).ok()?; - bytes.extend_from_slice(&value_bytes[..]); - - Some(Cow::Owned(bytes)) - } -} - -pub fn decode_prefix_string(value: &[u8]) -> Option<(&str, &[u8])> { - let (original_length_bytes, bytes) = try_split_array_at(value)?; - let original_length = u16::from_be_bytes(original_length_bytes) as usize; - let (string, bytes) = try_split_at(bytes, original_length)?; - let string = str::from_utf8(string).ok()?; - Some((string, bytes)) -} - -pub fn encode_prefix_string(string: &str, buffer: &mut Vec) -> Result<()> { - let string_len: u16 = - string.len().try_into().map_err(|_| SerializationError::InvalidNumberSerialization)?; - buffer.extend_from_slice(&string_len.to_be_bytes()); - buffer.extend_from_slice(string.as_bytes()); - Ok(()) -} - -#[cfg(test)] -mod tests { - use heed::types::Unit; - use heed::{BytesDecode, BytesEncode}; - use roaring::RoaringBitmap; - - use super::*; - - #[test] - fn deserialize_roaring_bitmaps() { - let string = "abc"; - let docids: RoaringBitmap = (0..100).chain(3500..4398).collect(); - let key = (string, docids.clone()); - let bytes = StringValueCodec::::bytes_encode(&key).unwrap(); - let (out_string, out_docids) = - StringValueCodec::::bytes_decode(&bytes).unwrap(); - assert_eq!((out_string, out_docids), (string, docids)); - } - - #[test] - fn deserialize_unit() { - let string = "def"; - let key = (string, ()); - let bytes = StringValueCodec::::bytes_encode(&key).unwrap(); - let (out_string, out_unit) = StringValueCodec::::bytes_decode(&bytes).unwrap(); - assert_eq!((out_string, out_unit), (string, ())); - } -} diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 0b2d9186f..d23ab391e 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -1,19 +1,21 @@ -mod facet_level_value_f64_codec; -mod facet_level_value_u32_codec; -mod facet_string_level_zero_codec; -mod facet_string_level_zero_value_codec; +// mod facet_level_value_f64_codec; +// mod facet_level_value_u32_codec; +// mod facet_string_level_zero_codec; +// mod facet_string_level_zero_value_codec; mod facet_string_zero_bounds_value_codec; mod field_doc_id_facet_f64_codec; mod field_doc_id_facet_string_codec; +pub mod new; + use heed::types::OwnedType; -pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; -pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec; -pub use self::facet_string_level_zero_codec::FacetStringLevelZeroCodec; -pub use self::facet_string_level_zero_value_codec::{ - decode_prefix_string, encode_prefix_string, FacetStringLevelZeroValueCodec, -}; +// pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; +// pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec; +// pub use self::facet_string_level_zero_codec::FacetStringLevelZeroCodec; +// pub use self::facet_string_level_zero_value_codec::{ +// decode_prefix_string, encode_prefix_string, FacetStringLevelZeroValueCodec, +// }; pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec; pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; diff --git a/milli/src/heed_codec/facet/new/mod.rs b/milli/src/heed_codec/facet/new/mod.rs new file mode 100644 index 000000000..5ed6a61f6 --- /dev/null +++ b/milli/src/heed_codec/facet/new/mod.rs @@ -0,0 +1,148 @@ +use heed::{BytesDecode, BytesEncode}; +use roaring::RoaringBitmap; +use std::{borrow::Cow, convert::TryFrom, marker::PhantomData}; + +pub mod ordered_f64_codec; +pub mod str_ref; +// TODO: these codecs were quickly written and not fast/resilient enough + +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] +pub struct FacetKey { + pub field_id: u16, + pub level: u8, + pub left_bound: T, +} +impl<'a> FacetKey<&'a [u8]> { + pub fn into_owned(self) -> FacetKey> { + FacetKey { + field_id: self.field_id, + level: self.level, + left_bound: self.left_bound.to_vec(), + } + } +} + +impl<'a> FacetKey> { + pub fn as_ref(&self) -> FacetKey<&[u8]> { + FacetKey { + field_id: self.field_id, + level: self.level, + left_bound: self.left_bound.as_slice(), + } + } +} + +pub struct FacetGroupValue { + pub size: u8, + pub bitmap: RoaringBitmap, +} + +pub struct FacetKeyCodec { + _phantom: PhantomData, +} + +impl<'a, T> heed::BytesEncode<'a> for FacetKeyCodec +where + T: BytesEncode<'a>, + T::EItem: Sized, +{ + type EItem = FacetKey; + + fn bytes_encode(value: &'a Self::EItem) -> Option> { + let mut v = vec![]; + v.extend_from_slice(&value.field_id.to_be_bytes()); + v.extend_from_slice(&[value.level]); + + let bound = T::bytes_encode(&value.left_bound).unwrap(); + v.extend_from_slice(&bound); + + Some(Cow::Owned(v)) + } +} +impl<'a, T> heed::BytesDecode<'a> for FacetKeyCodec +where + T: BytesDecode<'a>, +{ + type DItem = FacetKey; + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let fid = u16::from_be_bytes(<[u8; 2]>::try_from(&bytes[0..=1]).unwrap()); + let level = bytes[2]; + let bound = T::bytes_decode(&bytes[3..]).unwrap(); + Some(FacetKey { field_id: fid, level, left_bound: bound }) + } +} + +pub struct FacetGroupValueCodec; +impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec { + type EItem = FacetGroupValue; + + fn bytes_encode(value: &'a Self::EItem) -> Option> { + let mut v = vec![]; + v.push(value.size); + value.bitmap.serialize_into(&mut v).unwrap(); + Some(Cow::Owned(v)) + } +} +impl<'a> heed::BytesDecode<'a> for FacetGroupValueCodec { + type DItem = FacetGroupValue; + fn bytes_decode(bytes: &'a [u8]) -> Option { + let size = bytes[0]; + let bitmap = RoaringBitmap::deserialize_from(&bytes[1..]).unwrap(); + Some(FacetGroupValue { size, bitmap }) + } +} + +// TODO: get rid of this codec as it is named confusingly + should really be part of heed +// or even replace the current ByteSlice codec +pub struct MyByteSlice; + +impl<'a> BytesEncode<'a> for MyByteSlice { + type EItem = &'a [u8]; + + fn bytes_encode(item: &'a Self::EItem) -> Option> { + Some(Cow::Borrowed(item)) + } +} + +impl<'a> BytesDecode<'a> for MyByteSlice { + type DItem = &'a [u8]; + + fn bytes_decode(bytes: &'a [u8]) -> Option { + Some(bytes) + } +} + +// I won't need these ones anymore +// pub struct U16Codec; +// impl<'a> BytesEncode<'a> for U16Codec { +// type EItem = u16; + +// fn bytes_encode(item: &'a Self::EItem) -> Option> { +// Some(Cow::Owned(item.to_be_bytes().to_vec())) +// } +// } +// impl<'a> BytesDecode<'a> for U16Codec { +// type DItem = u16; + +// fn bytes_decode(bytes: &'a [u8]) -> Option { +// Some(u16::from_be_bytes(bytes[0..=1].try_into().unwrap())) +// } +// } + +// pub struct StrCodec; +// impl<'a> BytesEncode<'a> for StrCodec { +// type EItem = &'a str; + +// fn bytes_encode(item: &'a &'a str) -> Option> { +// Some(Cow::Borrowed(item.as_bytes())) +// } +// } +// impl<'a> BytesDecode<'a> for StrCodec { +// type DItem = &'a str; + +// fn bytes_decode(bytes: &'a [u8]) -> Option { +// let s = std::str::from_utf8(bytes).unwrap(); +// Some(s) +// } +// } diff --git a/milli/src/heed_codec/facet/new/ordered_f64_codec.rs b/milli/src/heed_codec/facet/new/ordered_f64_codec.rs new file mode 100644 index 000000000..856a9c0d1 --- /dev/null +++ b/milli/src/heed_codec/facet/new/ordered_f64_codec.rs @@ -0,0 +1,36 @@ +use std::{borrow::Cow, convert::TryInto}; + +use heed::BytesDecode; + +use crate::facet::value_encoding::f64_into_bytes; + +pub struct OrderedF64Codec; + +impl<'a> BytesDecode<'a> for OrderedF64Codec { + type DItem = f64; + + fn bytes_decode(bytes: &'a [u8]) -> Option { + if bytes.len() < 16 { + return None; + } + let f = bytes[8..].try_into().ok().map(f64::from_be_bytes)?; + Some(f) + } +} + +impl heed::BytesEncode<'_> for OrderedF64Codec { + type EItem = f64; + + fn bytes_encode(f: &Self::EItem) -> Option> { + let mut buffer = [0u8; 16]; + + // write the globally ordered float + let bytes = f64_into_bytes(*f)?; + buffer[..8].copy_from_slice(&bytes[..]); + // Then the f64 value just to be able to read it back + let bytes = f.to_be_bytes(); + buffer[8..16].copy_from_slice(&bytes[..]); + + Some(Cow::Owned(buffer.to_vec())) + } +} diff --git a/milli/src/heed_codec/facet/new/str_ref.rs b/milli/src/heed_codec/facet/new/str_ref.rs new file mode 100644 index 000000000..80a51c803 --- /dev/null +++ b/milli/src/heed_codec/facet/new/str_ref.rs @@ -0,0 +1,20 @@ +use std::borrow::Cow; + +use heed::{BytesDecode, BytesEncode}; + +pub struct StrRefCodec; +impl<'a> BytesEncode<'a> for StrRefCodec { + type EItem = &'a str; + + fn bytes_encode(item: &'a &'a str) -> Option> { + Some(Cow::Borrowed(item.as_bytes())) + } +} +impl<'a> BytesDecode<'a> for StrRefCodec { + type DItem = &'a str; + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let s = std::str::from_utf8(bytes).unwrap(); + Some(s) + } +} diff --git a/milli/src/index.rs b/milli/src/index.rs index 94e2f538d..0561a77ac 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -13,9 +13,14 @@ use time::OffsetDateTime; use crate::error::{InternalError, UserError}; use crate::fields_ids_map::FieldsIdsMap; +use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; +use crate::heed_codec::facet::new::str_ref::StrRefCodec; +use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec}; use crate::heed_codec::facet::{ - FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, - FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, FieldIdCodec, + // FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, + FieldDocIdFacetF64Codec, + FieldDocIdFacetStringCodec, + FieldIdCodec, }; use crate::{ default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, @@ -123,10 +128,10 @@ pub struct Index { /// Maps the facet field id and the docids for which this field exists pub facet_id_exists_docids: Database, - /// Maps the facet field id, level and the number with the docids that corresponds to it. - pub facet_id_f64_docids: Database, - /// Maps the facet field id and the string with the original string and docids that corresponds to it. - pub facet_id_string_docids: Database, + /// Maps the facet field id and ranges of numbers with the docids that corresponds to them. + pub facet_id_f64_docids: Database, FacetGroupValueCodec>, + /// Maps the facet field id and ranges of strings with the docids that corresponds to them. + pub facet_id_string_docids: Database, FacetGroupValueCodec>, /// Maps the document id, the facet field id and the numbers. pub field_id_docid_facet_f64s: Database, diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 6d50c1bb5..bd08c54a5 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -7,7 +7,7 @@ use roaring::RoaringBitmap; use super::{Criterion, CriterionParameters, CriterionResult}; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; -use crate::search::facet::{FacetNumberIter, FacetStringIter}; +// use crate::search::facet::FacetStringIter; use crate::search::query_tree::Operation; use crate::{FieldId, Index, Result}; @@ -186,23 +186,24 @@ fn facet_ordered<'t>( iterative_facet_string_ordered_iter(index, rtxn, field_id, is_ascending, candidates)?; Ok(Box::new(number_iter.chain(string_iter).map(Ok)) as Box>) } else { - let facet_number_fn = if is_ascending { - FacetNumberIter::new_reducing - } else { - FacetNumberIter::new_reverse_reducing - }; - let number_iter = facet_number_fn(rtxn, index, field_id, candidates.clone())? - .map(|res| res.map(|(_, docids)| docids)); + todo!() + // let facet_number_fn = if is_ascending { + // FacetNumberIter::new_reducing + // } else { + // FacetNumberIter::new_reverse_reducing + // }; + // let number_iter = facet_number_fn(rtxn, index, field_id, candidates.clone())? + // .map(|res| res.map(|(_, docids)| docids)); - let facet_string_fn = if is_ascending { - FacetStringIter::new_reducing - } else { - FacetStringIter::new_reverse_reducing - }; - let string_iter = facet_string_fn(rtxn, index, field_id, candidates)? - .map(|res| res.map(|(_, _, docids)| docids)); + // let facet_string_fn = if is_ascending { + // FacetStringIter::new_reducing + // } else { + // FacetStringIter::new_reverse_reducing + // }; + // let string_iter = facet_string_fn(rtxn, index, field_id, candidates)? + // .map(|res| res.map(|(_, _, docids)| docids)); - Ok(Box::new(number_iter.chain(string_iter))) + // Ok(Box::new(number_iter.chain(string_iter))) } } diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index 33e7b4975..4a4815775 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -6,6 +6,7 @@ use roaring::RoaringBitmap; use super::{Distinct, DocIter}; use crate::error::InternalError; +use crate::heed_codec::facet::new::FacetKey; use crate::heed_codec::facet::*; use crate::index::db_name; use crate::{DocumentId, FieldId, Index, Result}; @@ -47,13 +48,16 @@ impl<'a> FacetDistinctIter<'a> { fn facet_string_docids(&self, key: &str) -> heed::Result> { self.index .facet_id_string_docids - .get(self.txn, &(self.distinct, key)) - .map(|result| result.map(|(_original, docids)| docids)) + .get(self.txn, &FacetKey { field_id: self.distinct, level: 0, left_bound: key }) + .map(|opt| opt.map(|v| v.bitmap)) } fn facet_number_docids(&self, key: f64) -> heed::Result> { // get facet docids on level 0 - self.index.facet_id_f64_docids.get(self.txn, &(self.distinct, 0, key, key)) + self.index + .facet_id_f64_docids + .get(self.txn, &FacetKey { field_id: self.distinct, level: 0, left_bound: key }) + .map(|opt| opt.map(|v| v.bitmap)) } fn distinct_string(&mut self, id: DocumentId) -> Result<()> { diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index b2718a490..fddf93d4b 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -7,10 +7,8 @@ use roaring::RoaringBitmap; use crate::error::UserError; use crate::facet::FacetType; -use crate::heed_codec::facet::{ - FacetStringLevelZeroCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, -}; -use crate::search::facet::{FacetNumberIter, FacetNumberRange, FacetStringIter}; +use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec}; +// use crate::search::facet::FacetStringIter; use crate::{FieldId, Index, Result}; /// The default number of values by facets that will @@ -133,21 +131,22 @@ impl<'a> FacetDistribution<'a> { candidates: &RoaringBitmap, distribution: &mut BTreeMap, ) -> heed::Result<()> { - let iter = - FacetNumberIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?; + todo!() + // let iter = + // FacetNumberIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?; - for result in iter { - let (value, mut docids) = result?; - docids &= candidates; - if !docids.is_empty() { - distribution.insert(value.to_string(), docids.len()); - } - if distribution.len() == self.max_values_per_facet { - break; - } - } + // for result in iter { + // let (value, mut docids) = result?; + // docids &= candidates; + // if !docids.is_empty() { + // distribution.insert(value.to_string(), docids.len()); + // } + // if distribution.len() == self.max_values_per_facet { + // break; + // } + // } - Ok(()) + // Ok(()) } fn facet_strings_distribution_from_facet_levels( @@ -156,21 +155,22 @@ impl<'a> FacetDistribution<'a> { candidates: &RoaringBitmap, distribution: &mut BTreeMap, ) -> heed::Result<()> { - let iter = - FacetStringIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?; + todo!() + // let iter = + // FacetStringIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?; - for result in iter { - let (_normalized, original, mut docids) = result?; - docids &= candidates; - if !docids.is_empty() { - distribution.insert(original.to_string(), docids.len()); - } - if distribution.len() == self.max_values_per_facet { - break; - } - } + // for result in iter { + // let (_normalized, original, mut docids) = result?; + // docids &= candidates; + // if !docids.is_empty() { + // distribution.insert(original.to_string(), docids.len()); + // } + // if distribution.len() == self.max_values_per_facet { + // break; + // } + // } - Ok(()) + // Ok(()) } /// Placeholder search, a.k.a. no candidates were specified. We iterate throught the @@ -179,41 +179,43 @@ impl<'a> FacetDistribution<'a> { &self, field_id: FieldId, ) -> heed::Result> { - let mut distribution = BTreeMap::new(); + todo!() + // let mut distribution = BTreeMap::new(); - let db = self.index.facet_id_f64_docids; - let range = FacetNumberRange::new(self.rtxn, db, field_id, 0, Unbounded, Unbounded)?; + // let db = self.index.facet_id_f64_docids; + // let range = FacetNumberRange::new(self.rtxn, db, field_id, 0, Unbounded, Unbounded)?; - for result in range { - let ((_, _, value, _), docids) = result?; - distribution.insert(value.to_string(), docids.len()); - if distribution.len() == self.max_values_per_facet { - break; - } - } + // for result in range { + // let ((_, _, value, _), docids) = result?; + // distribution.insert(value.to_string(), docids.len()); + // if distribution.len() == self.max_values_per_facet { + // break; + // } + // } - let iter = self - .index - .facet_id_string_docids - .remap_key_type::() - .prefix_iter(self.rtxn, &field_id.to_be_bytes())? - .remap_key_type::(); + // let iter = self + // .index + // .facet_id_string_docids + // .remap_key_type::() + // .prefix_iter(self.rtxn, &field_id.to_be_bytes())? + // .remap_key_type::(); - let mut normalized_distribution = BTreeMap::new(); - for result in iter { - let ((_, normalized_value), (original_value, docids)) = result?; - normalized_distribution.insert(normalized_value, (original_value, docids.len())); - if normalized_distribution.len() == self.max_values_per_facet { - break; - } - } + // let mut normalized_distribution = BTreeMap::new(); + // for result in iter { + // let ((_, normalized_value), group_value) = result?; + // normalized_distribution + // .insert(normalized_value, (normalized_value, group_value.bitmap.len())); + // if normalized_distribution.len() == self.max_values_per_facet { + // break; + // } + // } - let iter = normalized_distribution - .into_iter() - .map(|(_normalized, (original, count))| (original.to_string(), count)); - distribution.extend(iter); + // let iter = normalized_distribution + // .into_iter() + // .map(|(_normalized, (original, count))| (original.to_string(), count)); + // distribution.extend(iter); - Ok(distribution) + // Ok(distribution) } fn facet_values(&self, field_id: FieldId) -> heed::Result> { diff --git a/milli/src/search/facet/facet_number.rs b/milli/src/search/facet/facet_number.rs index 02390aac1..5f7bd5325 100644 --- a/milli/src/search/facet/facet_number.rs +++ b/milli/src/search/facet/facet_number.rs @@ -1,248 +1,335 @@ -use std::ops::Bound::{self, Excluded, Included, Unbounded}; +// use std::ops::Bound::{self, Excluded, Included, Unbounded}; -use either::Either::{self, Left, Right}; -use heed::types::{ByteSlice, DecodeIgnore}; -use heed::{Database, LazyDecode, RoRange, RoRevRange}; -use roaring::RoaringBitmap; +// use either::Either::{self, Left, Right}; +// use heed::types::{ByteSlice, DecodeIgnore}; +// use heed::{BytesDecode, BytesEncode, Database, Lazy, LazyDecode, RoRange, RoRevRange}; +// use obkv::Key; +// use roaring::RoaringBitmap; -use crate::heed_codec::facet::FacetLevelValueF64Codec; -use crate::heed_codec::CboRoaringBitmapCodec; -use crate::{FieldId, Index}; +// use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; +// use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec}; +// use crate::heed_codec::CboRoaringBitmapCodec; +// use crate::{FieldId, Index}; -pub struct FacetNumberRange<'t> { - iter: RoRange<'t, FacetLevelValueF64Codec, LazyDecode>, - end: Bound, -} +// pub struct FacetNumberRange<'t, 'e> { +// rtxn: &'t heed::RoTxn<'e>, +// db: Database, FacetGroupValueCodec>, +// iter: RoRange<'t, FacetKeyCodec, LazyDecode>, +// max_bound: f64, +// previous: Option<(FacetKey, Lazy<'t, FacetGroupValueCodec>)>, +// field_id: FieldId, +// end: Bound, +// } -impl<'t> FacetNumberRange<'t> { - pub fn new( - rtxn: &'t heed::RoTxn, - db: Database, - field_id: FieldId, - level: u8, - left: Bound, - right: Bound, - ) -> heed::Result> { - let left_bound = match left { - Included(left) => Included((field_id, level, left, f64::MIN)), - Excluded(left) => Excluded((field_id, level, left, f64::MIN)), - Unbounded => Included((field_id, level, f64::MIN, f64::MIN)), - }; - let right_bound = Included((field_id, level, f64::MAX, f64::MAX)); - let iter = db.lazily_decode_data().range(rtxn, &(left_bound, right_bound))?; - Ok(FacetNumberRange { iter, end: right }) - } -} +// impl<'t, 'e> FacetNumberRange<'t, 'e> { +// pub fn new( +// rtxn: &'t heed::RoTxn<'e>, +// db: Database, FacetGroupValueCodec>, +// field_id: FieldId, +// level: u8, +// left: Bound, +// right: Bound, +// ) -> heed::Result> { +// let left_bound = match left { +// Included(left_bound) => Included(FacetKey { field_id, level, left_bound }), +// Excluded(left_bound) => Excluded(FacetKey { field_id, level, left_bound }), +// Unbounded => Included(FacetKey { field_id, level, left_bound: f64::MIN }), +// }; -impl<'t> Iterator for FacetNumberRange<'t> { - type Item = heed::Result<((FieldId, u8, f64, f64), RoaringBitmap)>; +// let mut iter = db.lazily_decode_data().range(rtxn, &(left_bound, Unbounded))?; +// let mut previous = iter.next().transpose()?; - fn next(&mut self) -> Option { - match self.iter.next() { - Some(Ok(((fid, level, left, right), docids))) => { - let must_be_returned = match self.end { - Included(end) => right <= end, - Excluded(end) => right < end, - Unbounded => true, - }; - if must_be_returned { - match docids.decode() { - Ok(docids) => Some(Ok(((fid, level, left, right), docids))), - Err(e) => Some(Err(e)), - } - } else { - None - } - } - Some(Err(e)) => Some(Err(e)), - None => None, - } - } -} +// // Compute the maximum end bound by looking at the key of the last element in level 0 +// let mut prefix_level_0 = vec![]; +// prefix_level_0.extend_from_slice(&field_id.to_be_bytes()); +// prefix_level_0.push(level); -pub struct FacetNumberRevRange<'t> { - iter: RoRevRange<'t, FacetLevelValueF64Codec, LazyDecode>, - end: Bound, -} +// let mut rev_iter = +// db.as_polymorph().rev_prefix_iter::<_, ByteSlice, ByteSlice>(rtxn, &prefix_level_0)?; -impl<'t> FacetNumberRevRange<'t> { - pub fn new( - rtxn: &'t heed::RoTxn, - db: Database, - field_id: FieldId, - level: u8, - left: Bound, - right: Bound, - ) -> heed::Result> { - let left_bound = match left { - Included(left) => Included((field_id, level, left, f64::MIN)), - Excluded(left) => Excluded((field_id, level, left, f64::MIN)), - Unbounded => Included((field_id, level, f64::MIN, f64::MIN)), - }; - let right_bound = Included((field_id, level, f64::MAX, f64::MAX)); - let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?; - Ok(FacetNumberRevRange { iter, end: right }) - } -} +// let rev_iter_first = rev_iter.next().transpose()?; +// let max_bound = if let Some((max_bound_key, _)) = rev_iter_first { +// let max_bound_key = +// FacetKeyCodec::::bytes_decode(max_bound_key).unwrap(); +// max_bound_key.left_bound +// } else { +// // I can't imagine when that would happen, but let's handle it correctly anyway +// // by making the iterator empty +// previous = None; +// 0.0 // doesn't matter since previous = None so the iterator will always early exit +// // and return None itself +// }; -impl<'t> Iterator for FacetNumberRevRange<'t> { - type Item = heed::Result<((FieldId, u8, f64, f64), RoaringBitmap)>; +// Ok(FacetNumberRange { rtxn, db, iter, field_id, previous, max_bound, end: right }) +// } +// } - fn next(&mut self) -> Option { - loop { - match self.iter.next() { - Some(Ok(((fid, level, left, right), docids))) => { - let must_be_returned = match self.end { - Included(end) => right <= end, - Excluded(end) => right < end, - Unbounded => true, - }; - if must_be_returned { - match docids.decode() { - Ok(docids) => return Some(Ok(((fid, level, left, right), docids))), - Err(e) => return Some(Err(e)), - } - } - continue; - } - Some(Err(e)) => return Some(Err(e)), - None => return None, - } - } - } -} +// impl<'t, 'e> Iterator for FacetNumberRange<'t, 'e> { +// type Item = heed::Result<(FacetKey, RoaringBitmap)>; -pub struct FacetNumberIter<'t> { - rtxn: &'t heed::RoTxn<'t>, - db: Database, - field_id: FieldId, - level_iters: Vec<(RoaringBitmap, Either, FacetNumberRevRange<'t>>)>, - must_reduce: bool, -} +// fn next(&mut self) -> Option { +// // The idea here is to return the **previous** element only if the left +// // bound of the current key fits within the range given to the iter +// // if it doesn't, then there is still a chance that it must be returned, +// // but we need to check the actual right bound of the group by looking for +// // the key preceding the first key of the next group in level 0 -impl<'t> FacetNumberIter<'t> { - /// Create a `FacetNumberIter` that will iterate on the different facet entries - /// (facet value + documents ids) and that will reduce the given documents ids - /// while iterating on the different facet levels. - pub fn new_reducing( - rtxn: &'t heed::RoTxn, - index: &'t Index, - field_id: FieldId, - documents_ids: RoaringBitmap, - ) -> heed::Result> { - let db = index.facet_id_f64_docids.remap_key_type::(); - let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - let highest_iter = - FacetNumberRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; - let level_iters = vec![(documents_ids, Left(highest_iter))]; - Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: true }) - } +// let (prev_key, prev_value) = self.previous?; - /// Create a `FacetNumberIter` that will iterate on the different facet entries in reverse - /// (facet value + documents ids) and that will reduce the given documents ids - /// while iterating on the different facet levels. - pub fn new_reverse_reducing( - rtxn: &'t heed::RoTxn, - index: &'t Index, - field_id: FieldId, - documents_ids: RoaringBitmap, - ) -> heed::Result> { - let db = index.facet_id_f64_docids; - let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - let highest_iter = - FacetNumberRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; - let level_iters = vec![(documents_ids, Right(highest_iter))]; - Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: true }) - } +// let (next_left_bound, next_previous) = if let Some(next) = self.iter.next() { +// let (key, group_value) = match next { +// Ok(n) => n, +// Err(e) => return Some(Err(e)), +// }; +// (key.left_bound, Some((key, group_value))) +// } else { +// // we're at the end of the level iter, so we need to fetch the max bound instead +// (self.max_bound, None) +// }; +// let must_be_returned = match self.end { +// Included(end) => next_left_bound <= end, +// Excluded(end) => next_left_bound < end, +// Unbounded => true, +// }; +// if must_be_returned { +// match prev_value.decode() { +// Ok(group_value) => { +// self.previous = next_previous; +// Some(Ok((prev_key, group_value.bitmap))) +// } +// Err(e) => Some(Err(e)), +// } +// } else { +// // it still possible that we want to return the value (one last time) +// // but to do so, we need to fetch the right bound of the current group +// // this is done by getting the first element at level 0 of the next group +// // then iterating in reverse from it +// // once we have the right bound, we can compare it, and then return or not +// // then we still set self.previous to None so that no other element can return +// // from it? +// let mut level_0_key_prefix = vec![]; +// level_0_key_prefix.extend_from_slice(&self.field_id.to_be_bytes()); +// level_0_key_prefix.push(0); +// let key = +// FacetKey:: { field_id: self.field_id, level: 0, left_bound: next_left_bound }; +// let key_bytes = FacetKeyCodec::::bytes_encode(&key).unwrap(); +// level_0_key_prefix.extend_from_slice(&key_bytes); - /// Create a `FacetNumberIter` that will iterate on the different facet entries - /// (facet value + documents ids) and that will not reduce the given documents ids - /// while iterating on the different facet levels, possibly returning multiple times - /// a document id associated with multiple facet values. - pub fn new_non_reducing( - rtxn: &'t heed::RoTxn, - index: &'t Index, - field_id: FieldId, - documents_ids: RoaringBitmap, - ) -> heed::Result> { - let db = index.facet_id_f64_docids.remap_key_type::(); - let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - let highest_iter = - FacetNumberRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; - let level_iters = vec![(documents_ids, Left(highest_iter))]; - Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: false }) - } +// let mut rev_iter_next_group_level_0 = self +// .db +// .as_polymorph() +// .rev_prefix_iter::<_, ByteSlice, ByteSlice>(&self.rtxn, &level_0_key_prefix) +// .unwrap(); +// let (key_for_right_bound, _) = rev_iter_next_group_level_0.next().unwrap().unwrap(); +// let key_for_right_bound = +// FacetKeyCodec::::bytes_decode(key_for_right_bound).unwrap(); +// let right_bound = key_for_right_bound.left_bound; +// let must_be_returned = match self.end { +// Included(end) => right_bound <= end, +// Excluded(end) => right_bound < end, +// Unbounded => unreachable!(), +// }; +// self.previous = None; +// if must_be_returned { +// match prev_value.decode() { +// Ok(group_value) => Some(Ok((prev_key, group_value.bitmap))), +// Err(e) => Some(Err(e)), +// } +// } else { +// None +// } +// } +// } +// } - fn highest_level( - rtxn: &'t heed::RoTxn, - db: Database, - fid: FieldId, - ) -> heed::Result> { - let level = db - .remap_types::() - .prefix_iter(rtxn, &fid.to_be_bytes())? - .remap_key_type::() - .last() - .transpose()? - .map(|((_, level, _, _), _)| level); - Ok(level) - } -} +// pub struct FacetNumberRevRange<'t> { +// iter: RoRevRange<'t, FacetKeyCodec, LazyDecode>, +// end: Bound, +// } -impl<'t> Iterator for FacetNumberIter<'t> { - type Item = heed::Result<(f64, RoaringBitmap)>; +// impl<'t> FacetNumberRevRange<'t> { +// pub fn new( +// rtxn: &'t heed::RoTxn, +// db: Database, FacetGroupValueCodec>, +// field_id: FieldId, +// level: u8, +// left: Bound, +// right: Bound, +// ) -> heed::Result> { +// let left_bound = match left { +// Included(left) => Included(FacetKey { field_id, level, left_bound: left }), +// Excluded(left) => Excluded(FacetKey { field_id, level, left_bound: left }), +// Unbounded => Included(FacetKey { field_id, level, left_bound: f64::MIN }), +// }; +// let right_bound = Included(FacetKey { field_id, level, left_bound: f64::MAX }); +// let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?; +// Ok(FacetNumberRevRange { iter, end: right }) +// } +// } - fn next(&mut self) -> Option { - 'outer: loop { - let (documents_ids, last) = self.level_iters.last_mut()?; - let is_ascending = last.is_left(); - for result in last { - // If the last iterator must find an empty set of documents it means - // that we found all the documents in the sub level iterations already, - // we can pop this level iterator. - if documents_ids.is_empty() { - break; - } +// impl<'t> Iterator for FacetNumberRevRange<'t> { +// type Item = heed::Result<(FacetKey, RoaringBitmap)>; - match result { - Ok(((_fid, level, left, right), mut docids)) => { - docids &= &*documents_ids; - if !docids.is_empty() { - if self.must_reduce { - *documents_ids -= &docids; - } +// fn next(&mut self) -> Option { +// loop { +// match self.iter.next() { +// Some(Ok((FacetKey { field_id, level, left_bound }, docids))) => { +// let must_be_returned = match self.end { +// Included(end) => todo!(), //right <= end, +// Excluded(end) => todo!(), //right < end, +// Unbounded => true, +// }; +// if must_be_returned { +// match docids.decode() { +// Ok(docids) => { +// return Some(Ok(( +// FacetKey { field_id, level, left_bound }, +// docids.bitmap, +// ))) +// } +// Err(e) => return Some(Err(e)), +// } +// } +// continue; +// } +// Some(Err(e)) => return Some(Err(e)), +// None => return None, +// } +// } +// } +// } - if level == 0 { - return Some(Ok((left, docids))); - } +// pub struct FacetNumberIter<'t, 'e> { +// rtxn: &'t heed::RoTxn<'t>, +// db: Database, FacetGroupValueCodec>, +// field_id: FieldId, +// level_iters: Vec<(RoaringBitmap, Either, FacetNumberRevRange<'t>>)>, +// must_reduce: bool, +// } - let rtxn = self.rtxn; - let db = self.db; - let fid = self.field_id; - let left = Included(left); - let right = Included(right); +// impl<'t, 'e> FacetNumberIter<'t, 'e> { +// /// Create a `FacetNumberIter` that will iterate on the different facet entries +// /// (facet value + documents ids) and that will reduce the given documents ids +// /// while iterating on the different facet levels. +// pub fn new_reducing( +// rtxn: &'t heed::RoTxn<'e>, +// index: &'t Index, +// field_id: FieldId, +// documents_ids: RoaringBitmap, +// ) -> heed::Result> { +// let db = index.facet_id_f64_docids; +// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); +// let highest_iter = +// FacetNumberRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; +// let level_iters = vec![(documents_ids, Left(highest_iter))]; +// Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: true }) +// } - let result = if is_ascending { - FacetNumberRange::new(rtxn, db, fid, level - 1, left, right) - .map(Left) - } else { - FacetNumberRevRange::new(rtxn, db, fid, level - 1, left, right) - .map(Right) - }; +// /// Create a `FacetNumberIter` that will iterate on the different facet entries in reverse +// /// (facet value + documents ids) and that will reduce the given documents ids +// /// while iterating on the different facet levels. +// pub fn new_reverse_reducing( +// rtxn: &'t heed::RoTxn<'e>, +// index: &'t Index, +// field_id: FieldId, +// documents_ids: RoaringBitmap, +// ) -> heed::Result> { +// let db = index.facet_id_f64_docids; +// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); +// let highest_iter = +// FacetNumberRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; +// let level_iters = vec![(documents_ids, Right(highest_iter))]; +// Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: true }) +// } - match result { - Ok(iter) => { - self.level_iters.push((docids, iter)); - continue 'outer; - } - Err(e) => return Some(Err(e)), - } - } - } - Err(e) => return Some(Err(e)), - } - } - self.level_iters.pop(); - } - } -} +// /// Create a `FacetNumberIter` that will iterate on the different facet entries +// /// (facet value + documents ids) and that will not reduce the given documents ids +// /// while iterating on the different facet levels, possibly returning multiple times +// /// a document id associated with multiple facet values. +// pub fn new_non_reducing( +// rtxn: &'t heed::RoTxn<'e>, +// index: &'t Index, +// field_id: FieldId, +// documents_ids: RoaringBitmap, +// ) -> heed::Result> { +// let db = index.facet_id_f64_docids; +// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); +// let highest_iter = +// FacetNumberRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; +// let level_iters = vec![(documents_ids, Left(highest_iter))]; +// Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: false }) +// } + +// fn highest_level( +// rtxn: &'t heed::RoTxn, +// db: Database, X>, +// fid: FieldId, +// ) -> heed::Result> { +// let level = db +// .remap_types::() +// .prefix_iter(rtxn, &fid.to_be_bytes())? +// .remap_key_type::>() +// .last() +// .transpose()? +// .map(|(key, _)| key.level); +// Ok(level) +// } +// } + +// impl<'t, 'e> Iterator for FacetNumberIter<'t, 'e> { +// type Item = heed::Result<(f64, RoaringBitmap)>; + +// fn next(&mut self) -> Option { +// 'outer: loop { +// let (documents_ids, last) = self.level_iters.last_mut()?; +// let is_ascending = last.is_left(); +// for result in last { +// // If the last iterator must find an empty set of documents it means +// // that we found all the documents in the sub level iterations already, +// // we can pop this level iterator. +// if documents_ids.is_empty() { +// break; +// } + +// match result { +// Ok((key, mut docids)) => { +// docids &= &*documents_ids; +// if !docids.is_empty() { +// if self.must_reduce { +// *documents_ids -= &docids; +// } + +// if level == 0 { +// return Some(Ok((left, docids))); +// } + +// let rtxn = self.rtxn; +// let db = self.db; +// let fid = self.field_id; +// let left = Included(left); +// let right = Included(right); + +// let result = if is_ascending { +// FacetNumberRange::new(rtxn, db, fid, level - 1, left, right) +// .map(Left) +// } else { +// FacetNumberRevRange::new(rtxn, db, fid, level - 1, left, right) +// .map(Right) +// }; + +// match result { +// Ok(iter) => { +// self.level_iters.push((docids, iter)); +// continue 'outer; +// } +// Err(e) => return Some(Err(e)), +// } +// } +// } +// Err(e) => return Some(Err(e)), +// } +// } +// self.level_iters.pop(); +// } +// } +// } diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index c55430cf1..b01359503 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -1,652 +1,649 @@ -//! This module contains helpers iterators for facet strings. -//! -//! The purpose is to help iterate over the quite complex system of facets strings. A simple -//! description of the system would be that every facet string value is stored into an LMDB database -//! and that every value is associated with the document ids which are associated with this facet -//! string value. -//! -//! In reality it is a little bit more complex as we have to create aggregations of runs of facet -//! string values, those aggregations helps in choosing the right groups of facets to follow. -//! -//! ## A typical algorithm run -//! -//! If a group of aggregated facets values contains one of the documents ids, we must continue -//! iterating over the sub-groups. -//! -//! If this group is the lowest level and contain at least one document id we yield the associated -//! facet documents ids. -//! -//! If the group doesn't contain one of our documents ids, we continue to the next group at this -//! same level. -//! -//! ## The complexity comes from the strings -//! -//! This algorithm is exactly the one that we use for facet numbers. It is quite easy to create -//! aggregated facet number, groups of facets are easy to define in the LMDB key, we just put the -//! two numbers bounds, the left and the right bound of the group, both inclusive. -//! -//! It is easy to make sure that the groups are ordered, LMDB sort its keys lexicographically and -//! puting two numbers big-endian encoded one after the other gives us ordered groups. The values -//! are simple unions of the documents ids coming from the groups below. -//! -//! ### Example of what a facet number LMDB database contain -//! -//! | level | left-bound | right-bound | documents ids | -//! |-------|------------|-------------|------------------| -//! | 0 | 0 | _skipped_ | 1, 2 | -//! | 0 | 1 | _skipped_ | 6, 7 | -//! | 0 | 3 | _skipped_ | 4, 7 | -//! | 0 | 5 | _skipped_ | 2, 3, 4 | -//! | 1 | 0 | 1 | 1, 2, 6, 7 | -//! | 1 | 3 | 5 | 2, 3, 4, 7 | -//! | 2 | 0 | 5 | 1, 2, 3, 4, 6, 7 | -//! -//! As you can see the level 0 have two equal bounds, therefore we skip serializing the second -//! bound, that's the base level where you can directly fetch the documents ids associated with an -//! exact number. -//! -//! The next levels have two different bounds and the associated documents ids are simply the result -//! of an union of all the documents ids associated with the aggregated groups above. -//! -//! ## The complexity of defining groups for facet strings -//! -//! As explained above, defining groups of facet numbers is easy, LMDB stores the keys in -//! lexicographical order, it means that whatever the key represent the bytes are read in their raw -//! form and a simple `strcmp` will define the order in which keys will be read from the store. -//! -//! That's easy for types with a known size, like floats or integers, they are 64 bytes long and -//! appending one after the other in big-endian is consistent. LMDB will simply sort the keys by the -//! first number then by the second if the the first number is equal on two keys. -//! -//! For strings it is a lot more complex as those types are unsized, it means that the size of facet -//! strings is different for each facet value. -//! -//! ### Basic approach: padding the keys -//! -//! A first approach would be to simply define the maximum size of a facet string and pad the keys -//! with zeroes. The big problem of this approach is that it: -//! 1. reduces the maximum size of facet strings by half, as we need to put two keys one after the -//! other. -//! 2. makes the keys of facet strings very big (approximately 250 bytes), impacting a lot LMDB -//! performances. -//! -//! ### Better approach: number the facet groups -//! -//! A better approach would be to number the groups, this way we don't have the downsides of the -//! previously described approach but we need to be able to describe the groups by using a number. -//! -//! #### Example of facet strings with numbered groups -//! -//! | level | left-bound | right-bound | left-string | right-string | documents ids | -//! |-------|------------|-------------|-------------|--------------|------------------| -//! | 0 | alpha | _skipped_ | _skipped_ | _skipped_ | 1, 2 | -//! | 0 | beta | _skipped_ | _skipped_ | _skipped_ | 6, 7 | -//! | 0 | gamma | _skipped_ | _skipped_ | _skipped_ | 4, 7 | -//! | 0 | omega | _skipped_ | _skipped_ | _skipped_ | 2, 3, 4 | -//! | 1 | 0 | 1 | alpha | beta | 1, 2, 6, 7 | -//! | 1 | 2 | 3 | gamma | omega | 2, 3, 4, 7 | -//! | 2 | 0 | 3 | _skipped_ | _skipped_ | 1, 2, 3, 4, 6, 7 | -//! -//! As you can see the level 0 doesn't actually change much, we skip nearly everything, we do not -//! need to store the facet string value two times. -//! -//! The number in the left-bound and right-bound columns are incremental numbers representing the -//! level 0 strings, .i.e. alpha is 0, beta is 1. Those numbers are just here to keep the ordering -//! of the LMDB keys. -//! -//! In the value, not in the key, you can see that we added two new values: the left-string and the -//! right-string, which defines the original facet strings associated with the given group. -//! -//! We put those two strings inside of the value, this way we do not limit the maximum size of the -//! facet string values, and the impact on performances is not important as, IIRC, LMDB put big -//! values on another page, this helps in iterating over keys fast enough and only fetch the page -//! with the values when required. -//! -//! The other little advantage with this solution is that there is no a big overhead, compared with -//! the facet number levels, we only duplicate the facet strings once for the level 1. -//! -//! #### A typical algorithm run -//! -//! Note that the algorithm is always moving from the highest level to the lowest one, one level -//! by one level, this is why it is ok to only store the facets string on the level 1. -//! -//! If a group of aggregated facets values, a group with numbers contains one of the documents ids, -//! we must continue iterating over the sub-groups. To do so: -//! - If we are at a level >= 2, we just do the same as with the facet numbers, get both bounds -//! and iterate over the facet groups defined by these numbers over the current level - 1. -//! - If we are at level 1, we retrieve both keys, the left-string and right-string, from the -//! value and just do the same as with the facet numbers but with strings: iterate over the -//! current level - 1 with both keys. -//! -//! If this group is the lowest level (level 0) and contain at least one document id we yield the -//! associated facet documents ids. -//! -//! If the group doesn't contain one of our documents ids, we continue to the next group at this -//! same level. -//! +// //! This module contains helpers iterators for facet strings. +// //! +// //! The purpose is to help iterate over the quite complex system of facets strings. A simple +// //! description of the system would be that every facet string value is stored into an LMDB database +// //! and that every value is associated with the document ids which are associated with this facet +// //! string value. +// //! +// //! In reality it is a little bit more complex as we have to create aggregations of runs of facet +// //! string values, those aggregations helps in choosing the right groups of facets to follow. +// //! +// //! ## A typical algorithm run +// //! +// //! If a group of aggregated facets values contains one of the documents ids, we must continue +// //! iterating over the sub-groups. +// //! +// //! If this group is the lowest level and contain at least one document id we yield the associated +// //! facet documents ids. +// //! +// //! If the group doesn't contain one of our documents ids, we continue to the next group at this +// //! same level. +// //! +// //! ## The complexity comes from the strings +// //! +// //! This algorithm is exactly the one that we use for facet numbers. It is quite easy to create +// //! aggregated facet number, groups of facets are easy to define in the LMDB key, we just put the +// //! two numbers bounds, the left and the right bound of the group, both inclusive. +// //! +// //! It is easy to make sure that the groups are ordered, LMDB sort its keys lexicographically and +// //! puting two numbers big-endian encoded one after the other gives us ordered groups. The values +// //! are simple unions of the documents ids coming from the groups below. +// //! +// //! ### Example of what a facet number LMDB database contain +// //! +// //! | level | left-bound | right-bound | documents ids | +// //! |-------|------------|-------------|------------------| +// //! | 0 | 0 | _skipped_ | 1, 2 | +// //! | 0 | 1 | _skipped_ | 6, 7 | +// //! | 0 | 3 | _skipped_ | 4, 7 | +// //! | 0 | 5 | _skipped_ | 2, 3, 4 | +// //! | 1 | 0 | 1 | 1, 2, 6, 7 | +// //! | 1 | 3 | 5 | 2, 3, 4, 7 | +// //! | 2 | 0 | 5 | 1, 2, 3, 4, 6, 7 | +// //! +// //! As you can see the level 0 have two equal bounds, therefore we skip serializing the second +// //! bound, that's the base level where you can directly fetch the documents ids associated with an +// //! exact number. +// //! +// //! The next levels have two different bounds and the associated documents ids are simply the result +// //! of an union of all the documents ids associated with the aggregated groups above. +// //! +// //! ## The complexity of defining groups for facet strings +// //! +// //! As explained above, defining groups of facet numbers is easy, LMDB stores the keys in +// //! lexicographical order, it means that whatever the key represent the bytes are read in their raw +// //! form and a simple `strcmp` will define the order in which keys will be read from the store. +// //! +// //! That's easy for types with a known size, like floats or integers, they are 64 bytes long and +// //! appending one after the other in big-endian is consistent. LMDB will simply sort the keys by the +// //! first number then by the second if the the first number is equal on two keys. +// //! +// //! For strings it is a lot more complex as those types are unsized, it means that the size of facet +// //! strings is different for each facet value. +// //! +// //! ### Basic approach: padding the keys +// //! +// //! A first approach would be to simply define the maximum size of a facet string and pad the keys +// //! with zeroes. The big problem of this approach is that it: +// //! 1. reduces the maximum size of facet strings by half, as we need to put two keys one after the +// //! other. +// //! 2. makes the keys of facet strings very big (approximately 250 bytes), impacting a lot LMDB +// //! performances. +// //! +// //! ### Better approach: number the facet groups +// //! +// //! A better approach would be to number the groups, this way we don't have the downsides of the +// //! previously described approach but we need to be able to describe the groups by using a number. +// //! +// //! #### Example of facet strings with numbered groups +// //! +// //! | level | left-bound | right-bound | left-string | right-string | documents ids | +// //! |-------|------------|-------------|-------------|--------------|------------------| +// //! | 0 | alpha | _skipped_ | _skipped_ | _skipped_ | 1, 2 | +// //! | 0 | beta | _skipped_ | _skipped_ | _skipped_ | 6, 7 | +// //! | 0 | gamma | _skipped_ | _skipped_ | _skipped_ | 4, 7 | +// //! | 0 | omega | _skipped_ | _skipped_ | _skipped_ | 2, 3, 4 | +// //! | 1 | 0 | 1 | alpha | beta | 1, 2, 6, 7 | +// //! | 1 | 2 | 3 | gamma | omega | 2, 3, 4, 7 | +// //! | 2 | 0 | 3 | _skipped_ | _skipped_ | 1, 2, 3, 4, 6, 7 | +// //! +// //! As you can see the level 0 doesn't actually change much, we skip nearly everything, we do not +// //! need to store the facet string value two times. +// //! +// //! The number in the left-bound and right-bound columns are incremental numbers representing the +// //! level 0 strings, .i.e. alpha is 0, beta is 1. Those numbers are just here to keep the ordering +// //! of the LMDB keys. +// //! +// //! In the value, not in the key, you can see that we added two new values: the left-string and the +// //! right-string, which defines the original facet strings associated with the given group. +// //! +// //! We put those two strings inside of the value, this way we do not limit the maximum size of the +// //! facet string values, and the impact on performances is not important as, IIRC, LMDB put big +// //! values on another page, this helps in iterating over keys fast enough and only fetch the page +// //! with the values when required. +// //! +// //! The other little advantage with this solution is that there is no a big overhead, compared with +// //! the facet number levels, we only duplicate the facet strings once for the level 1. +// //! +// //! #### A typical algorithm run +// //! +// //! Note that the algorithm is always moving from the highest level to the lowest one, one level +// //! by one level, this is why it is ok to only store the facets string on the level 1. +// //! +// //! If a group of aggregated facets values, a group with numbers contains one of the documents ids, +// //! we must continue iterating over the sub-groups. To do so: +// //! - If we are at a level >= 2, we just do the same as with the facet numbers, get both bounds +// //! and iterate over the facet groups defined by these numbers over the current level - 1. +// //! - If we are at level 1, we retrieve both keys, the left-string and right-string, from the +// //! value and just do the same as with the facet numbers but with strings: iterate over the +// //! current level - 1 with both keys. +// //! +// //! If this group is the lowest level (level 0) and contain at least one document id we yield the +// //! associated facet documents ids. +// //! +// //! If the group doesn't contain one of our documents ids, we continue to the next group at this +// //! same level. +// //! -use std::num::NonZeroU8; -use std::ops::Bound; -use std::ops::Bound::{Excluded, Included, Unbounded}; +// use std::num::NonZeroU8; +// use std::ops::Bound; +// use std::ops::Bound::{Excluded, Included, Unbounded}; -use either::{Either, Left, Right}; -use heed::types::{ByteSlice, DecodeIgnore}; -use heed::{Database, LazyDecode, RoRange, RoRevRange}; -use roaring::RoaringBitmap; +// use either::{Either, Left, Right}; +// use heed::types::{ByteSlice, DecodeIgnore}; +// use heed::{Database, LazyDecode, RoRange, RoRevRange}; +// use roaring::RoaringBitmap; -use crate::heed_codec::facet::{ - FacetLevelValueU32Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, - FacetStringZeroBoundsValueCodec, -}; -use crate::heed_codec::CboRoaringBitmapCodec; -use crate::{FieldId, Index}; +// use crate::heed_codec::facet::FacetStringZeroBoundsValueCodec; +// use crate::heed_codec::CboRoaringBitmapCodec; +// use crate::{FieldId, Index}; -/// An iterator that is used to explore the facets level strings -/// from the level 1 to infinity. -/// -/// It yields the level, group id that an entry covers, the optional group strings -/// that it covers of the level 0 only if it is an entry from the level 1 and -/// the roaring bitmap associated. -pub struct FacetStringGroupRange<'t> { - iter: RoRange< - 't, - FacetLevelValueU32Codec, - LazyDecode>, - >, - end: Bound, -} +// /// An iterator that is used to explore the facets level strings +// /// from the level 1 to infinity. +// /// +// /// It yields the level, group id that an entry covers, the optional group strings +// /// that it covers of the level 0 only if it is an entry from the level 1 and +// /// the roaring bitmap associated. +// pub struct FacetStringGroupRange<'t> { +// iter: RoRange< +// 't, +// FacetLevelValueU32Codec, +// LazyDecode>, +// >, +// end: Bound, +// } -impl<'t> FacetStringGroupRange<'t> { - pub fn new( - rtxn: &'t heed::RoTxn, - db: Database, - field_id: FieldId, - level: NonZeroU8, - left: Bound, - right: Bound, - ) -> heed::Result> { - let db = db.remap_types::< - FacetLevelValueU32Codec, - FacetStringZeroBoundsValueCodec, - >(); - let left_bound = match left { - Included(left) => Included((field_id, level, left, u32::MIN)), - Excluded(left) => Excluded((field_id, level, left, u32::MIN)), - Unbounded => Included((field_id, level, u32::MIN, u32::MIN)), - }; - let right_bound = Included((field_id, level, u32::MAX, u32::MAX)); - let iter = db.lazily_decode_data().range(rtxn, &(left_bound, right_bound))?; - Ok(FacetStringGroupRange { iter, end: right }) - } -} +// impl<'t> FacetStringGroupRange<'t> { +// pub fn new( +// rtxn: &'t heed::RoTxn, +// db: Database, +// field_id: FieldId, +// level: NonZeroU8, +// left: Bound, +// right: Bound, +// ) -> heed::Result> { +// let db = db.remap_types::< +// FacetLevelValueU32Codec, +// FacetStringZeroBoundsValueCodec, +// >(); +// let left_bound = match left { +// Included(left) => Included((field_id, level, left, u32::MIN)), +// Excluded(left) => Excluded((field_id, level, left, u32::MIN)), +// Unbounded => Included((field_id, level, u32::MIN, u32::MIN)), +// }; +// let right_bound = Included((field_id, level, u32::MAX, u32::MAX)); +// let iter = db.lazily_decode_data().range(rtxn, &(left_bound, right_bound))?; +// Ok(FacetStringGroupRange { iter, end: right }) +// } +// } -impl<'t> Iterator for FacetStringGroupRange<'t> { - type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>; +// impl<'t> Iterator for FacetStringGroupRange<'t> { +// type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>; - fn next(&mut self) -> Option { - match self.iter.next() { - Some(Ok(((_fid, level, left, right), docids))) => { - let must_be_returned = match self.end { - Included(end) => right <= end, - Excluded(end) => right < end, - Unbounded => true, - }; - if must_be_returned { - match docids.decode() { - Ok((bounds, docids)) => Some(Ok(((level, left, right), (bounds, docids)))), - Err(e) => Some(Err(e)), - } - } else { - None - } - } - Some(Err(e)) => Some(Err(e)), - None => None, - } - } -} +// fn next(&mut self) -> Option { +// match self.iter.next() { +// Some(Ok(((_fid, level, left, right), docids))) => { +// let must_be_returned = match self.end { +// Included(end) => right <= end, +// Excluded(end) => right < end, +// Unbounded => true, +// }; +// if must_be_returned { +// match docids.decode() { +// Ok((bounds, docids)) => Some(Ok(((level, left, right), (bounds, docids)))), +// Err(e) => Some(Err(e)), +// } +// } else { +// None +// } +// } +// Some(Err(e)) => Some(Err(e)), +// None => None, +// } +// } +// } -pub struct FacetStringGroupRevRange<'t> { - iter: RoRevRange< - 't, - FacetLevelValueU32Codec, - LazyDecode>, - >, - end: Bound, -} +// pub struct FacetStringGroupRevRange<'t> { +// iter: RoRevRange< +// 't, +// FacetLevelValueU32Codec, +// LazyDecode>, +// >, +// end: Bound, +// } -impl<'t> FacetStringGroupRevRange<'t> { - pub fn new( - rtxn: &'t heed::RoTxn, - db: Database, - field_id: FieldId, - level: NonZeroU8, - left: Bound, - right: Bound, - ) -> heed::Result> { - let db = db.remap_types::< - FacetLevelValueU32Codec, - FacetStringZeroBoundsValueCodec, - >(); - let left_bound = match left { - Included(left) => Included((field_id, level, left, u32::MIN)), - Excluded(left) => Excluded((field_id, level, left, u32::MIN)), - Unbounded => Included((field_id, level, u32::MIN, u32::MIN)), - }; - let right_bound = Included((field_id, level, u32::MAX, u32::MAX)); - let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?; - Ok(FacetStringGroupRevRange { iter, end: right }) - } -} +// impl<'t> FacetStringGroupRevRange<'t> { +// pub fn new( +// rtxn: &'t heed::RoTxn, +// db: Database, +// field_id: FieldId, +// level: NonZeroU8, +// left: Bound, +// right: Bound, +// ) -> heed::Result> { +// let db = db.remap_types::< +// FacetLevelValueU32Codec, +// FacetStringZeroBoundsValueCodec, +// >(); +// let left_bound = match left { +// Included(left) => Included((field_id, level, left, u32::MIN)), +// Excluded(left) => Excluded((field_id, level, left, u32::MIN)), +// Unbounded => Included((field_id, level, u32::MIN, u32::MIN)), +// }; +// let right_bound = Included((field_id, level, u32::MAX, u32::MAX)); +// let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?; +// Ok(FacetStringGroupRevRange { iter, end: right }) +// } +// } -impl<'t> Iterator for FacetStringGroupRevRange<'t> { - type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>; +// impl<'t> Iterator for FacetStringGroupRevRange<'t> { +// type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>; - fn next(&mut self) -> Option { - loop { - match self.iter.next() { - Some(Ok(((_fid, level, left, right), docids))) => { - let must_be_returned = match self.end { - Included(end) => right <= end, - Excluded(end) => right < end, - Unbounded => true, - }; - if must_be_returned { - match docids.decode() { - Ok((bounds, docids)) => { - return Some(Ok(((level, left, right), (bounds, docids)))) - } - Err(e) => return Some(Err(e)), - } - } - continue; - } - Some(Err(e)) => return Some(Err(e)), - None => return None, - } - } - } -} +// fn next(&mut self) -> Option { +// loop { +// match self.iter.next() { +// Some(Ok(((_fid, level, left, right), docids))) => { +// let must_be_returned = match self.end { +// Included(end) => right <= end, +// Excluded(end) => right < end, +// Unbounded => true, +// }; +// if must_be_returned { +// match docids.decode() { +// Ok((bounds, docids)) => { +// return Some(Ok(((level, left, right), (bounds, docids)))) +// } +// Err(e) => return Some(Err(e)), +// } +// } +// continue; +// } +// Some(Err(e)) => return Some(Err(e)), +// None => return None, +// } +// } +// } +// } -/// An iterator that is used to explore the level 0 of the facets string database. -/// -/// It yields the facet string and the roaring bitmap associated with it. -pub struct FacetStringLevelZeroRange<'t> { - iter: RoRange<'t, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>, -} +// /// An iterator that is used to explore the level 0 of the facets string database. +// /// +// /// It yields the facet string and the roaring bitmap associated with it. +// pub struct FacetStringLevelZeroRange<'t> { +// iter: RoRange<'t, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>, +// } -impl<'t> FacetStringLevelZeroRange<'t> { - pub fn new( - rtxn: &'t heed::RoTxn, - db: Database, - field_id: FieldId, - left: Bound<&str>, - right: Bound<&str>, - ) -> heed::Result> { - fn encode_value<'a>(buffer: &'a mut Vec, field_id: FieldId, value: &str) -> &'a [u8] { - buffer.extend_from_slice(&field_id.to_be_bytes()); - buffer.push(0); - buffer.extend_from_slice(value.as_bytes()); - &buffer[..] - } +// impl<'t> FacetStringLevelZeroRange<'t> { +// pub fn new( +// rtxn: &'t heed::RoTxn, +// db: Database, +// field_id: FieldId, +// left: Bound<&str>, +// right: Bound<&str>, +// ) -> heed::Result> { +// fn encode_value<'a>(buffer: &'a mut Vec, field_id: FieldId, value: &str) -> &'a [u8] { +// buffer.extend_from_slice(&field_id.to_be_bytes()); +// buffer.push(0); +// buffer.extend_from_slice(value.as_bytes()); +// &buffer[..] +// } - let mut left_buffer = Vec::new(); - let left_bound = match left { - Included(value) => Included(encode_value(&mut left_buffer, field_id, value)), - Excluded(value) => Excluded(encode_value(&mut left_buffer, field_id, value)), - Unbounded => { - left_buffer.extend_from_slice(&field_id.to_be_bytes()); - left_buffer.push(0); - Included(&left_buffer[..]) - } - }; +// let mut left_buffer = Vec::new(); +// let left_bound = match left { +// Included(value) => Included(encode_value(&mut left_buffer, field_id, value)), +// Excluded(value) => Excluded(encode_value(&mut left_buffer, field_id, value)), +// Unbounded => { +// left_buffer.extend_from_slice(&field_id.to_be_bytes()); +// left_buffer.push(0); +// Included(&left_buffer[..]) +// } +// }; - let mut right_buffer = Vec::new(); - let right_bound = match right { - Included(value) => Included(encode_value(&mut right_buffer, field_id, value)), - Excluded(value) => Excluded(encode_value(&mut right_buffer, field_id, value)), - Unbounded => { - right_buffer.extend_from_slice(&field_id.to_be_bytes()); - right_buffer.push(1); // we must only get the level 0 - Excluded(&right_buffer[..]) - } - }; +// let mut right_buffer = Vec::new(); +// let right_bound = match right { +// Included(value) => Included(encode_value(&mut right_buffer, field_id, value)), +// Excluded(value) => Excluded(encode_value(&mut right_buffer, field_id, value)), +// Unbounded => { +// right_buffer.extend_from_slice(&field_id.to_be_bytes()); +// right_buffer.push(1); // we must only get the level 0 +// Excluded(&right_buffer[..]) +// } +// }; - let iter = db - .remap_key_type::() - .range(rtxn, &(left_bound, right_bound))? - .remap_types::(); +// let iter = db +// .remap_key_type::() +// .range(rtxn, &(left_bound, right_bound))? +// .remap_types::(); - Ok(FacetStringLevelZeroRange { iter }) - } -} +// Ok(FacetStringLevelZeroRange { iter }) +// } +// } -impl<'t> Iterator for FacetStringLevelZeroRange<'t> { - type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; +// impl<'t> Iterator for FacetStringLevelZeroRange<'t> { +// type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; - fn next(&mut self) -> Option { - match self.iter.next() { - Some(Ok(((_fid, normalized), (original, docids)))) => { - Some(Ok((normalized, original, docids))) - } - Some(Err(e)) => Some(Err(e)), - None => None, - } - } -} +// fn next(&mut self) -> Option { +// match self.iter.next() { +// Some(Ok(((_fid, normalized), (original, docids)))) => { +// Some(Ok((normalized, original, docids))) +// } +// Some(Err(e)) => Some(Err(e)), +// None => None, +// } +// } +// } -pub struct FacetStringLevelZeroRevRange<'t> { - iter: RoRevRange<'t, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>, -} +// pub struct FacetStringLevelZeroRevRange<'t> { +// iter: RoRevRange<'t, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>, +// } -impl<'t> FacetStringLevelZeroRevRange<'t> { - pub fn new( - rtxn: &'t heed::RoTxn, - db: Database, - field_id: FieldId, - left: Bound<&str>, - right: Bound<&str>, - ) -> heed::Result> { - fn encode_value<'a>(buffer: &'a mut Vec, field_id: FieldId, value: &str) -> &'a [u8] { - buffer.extend_from_slice(&field_id.to_be_bytes()); - buffer.push(0); - buffer.extend_from_slice(value.as_bytes()); - &buffer[..] - } +// impl<'t> FacetStringLevelZeroRevRange<'t> { +// pub fn new( +// rtxn: &'t heed::RoTxn, +// db: Database, +// field_id: FieldId, +// left: Bound<&str>, +// right: Bound<&str>, +// ) -> heed::Result> { +// fn encode_value<'a>(buffer: &'a mut Vec, field_id: FieldId, value: &str) -> &'a [u8] { +// buffer.extend_from_slice(&field_id.to_be_bytes()); +// buffer.push(0); +// buffer.extend_from_slice(value.as_bytes()); +// &buffer[..] +// } - let mut left_buffer = Vec::new(); - let left_bound = match left { - Included(value) => Included(encode_value(&mut left_buffer, field_id, value)), - Excluded(value) => Excluded(encode_value(&mut left_buffer, field_id, value)), - Unbounded => { - left_buffer.extend_from_slice(&field_id.to_be_bytes()); - left_buffer.push(0); - Included(&left_buffer[..]) - } - }; +// let mut left_buffer = Vec::new(); +// let left_bound = match left { +// Included(value) => Included(encode_value(&mut left_buffer, field_id, value)), +// Excluded(value) => Excluded(encode_value(&mut left_buffer, field_id, value)), +// Unbounded => { +// left_buffer.extend_from_slice(&field_id.to_be_bytes()); +// left_buffer.push(0); +// Included(&left_buffer[..]) +// } +// }; - let mut right_buffer = Vec::new(); - let right_bound = match right { - Included(value) => Included(encode_value(&mut right_buffer, field_id, value)), - Excluded(value) => Excluded(encode_value(&mut right_buffer, field_id, value)), - Unbounded => { - right_buffer.extend_from_slice(&field_id.to_be_bytes()); - right_buffer.push(1); // we must only get the level 0 - Excluded(&right_buffer[..]) - } - }; +// let mut right_buffer = Vec::new(); +// let right_bound = match right { +// Included(value) => Included(encode_value(&mut right_buffer, field_id, value)), +// Excluded(value) => Excluded(encode_value(&mut right_buffer, field_id, value)), +// Unbounded => { +// right_buffer.extend_from_slice(&field_id.to_be_bytes()); +// right_buffer.push(1); // we must only get the level 0 +// Excluded(&right_buffer[..]) +// } +// }; - let iter = db - .remap_key_type::() - .rev_range(rtxn, &(left_bound, right_bound))? - .remap_types::(); +// let iter = db +// .remap_key_type::() +// .rev_range(rtxn, &(left_bound, right_bound))? +// .remap_types::(); - Ok(FacetStringLevelZeroRevRange { iter }) - } -} +// Ok(FacetStringLevelZeroRevRange { iter }) +// } +// } -impl<'t> Iterator for FacetStringLevelZeroRevRange<'t> { - type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; +// impl<'t> Iterator for FacetStringLevelZeroRevRange<'t> { +// type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; - fn next(&mut self) -> Option { - match self.iter.next() { - Some(Ok(((_fid, normalized), (original, docids)))) => { - Some(Ok((normalized, original, docids))) - } - Some(Err(e)) => Some(Err(e)), - None => None, - } - } -} +// fn next(&mut self) -> Option { +// match self.iter.next() { +// Some(Ok(((_fid, normalized), (original, docids)))) => { +// Some(Ok((normalized, original, docids))) +// } +// Some(Err(e)) => Some(Err(e)), +// None => None, +// } +// } +// } -type EitherStringRange<'t> = Either, FacetStringLevelZeroRange<'t>>; -type EitherStringRevRange<'t> = - Either, FacetStringLevelZeroRevRange<'t>>; +// type EitherStringRange<'t> = Either, FacetStringLevelZeroRange<'t>>; +// type EitherStringRevRange<'t> = +// Either, FacetStringLevelZeroRevRange<'t>>; -/// An iterator that is used to explore the facet strings level by level, -/// it will only return facets strings that are associated with the -/// candidates documents ids given. -pub struct FacetStringIter<'t> { - rtxn: &'t heed::RoTxn<'t>, - db: Database, - field_id: FieldId, - level_iters: Vec<(RoaringBitmap, Either, EitherStringRevRange<'t>>)>, - must_reduce: bool, -} +// /// An iterator that is used to explore the facet strings level by level, +// /// it will only return facets strings that are associated with the +// /// candidates documents ids given. +// pub struct FacetStringIter<'t> { +// rtxn: &'t heed::RoTxn<'t>, +// db: Database, +// field_id: FieldId, +// level_iters: Vec<(RoaringBitmap, Either, EitherStringRevRange<'t>>)>, +// must_reduce: bool, +// } -impl<'t> FacetStringIter<'t> { - pub fn new_reducing( - rtxn: &'t heed::RoTxn, - index: &'t Index, - field_id: FieldId, - documents_ids: RoaringBitmap, - ) -> heed::Result> { - let db = index.facet_id_string_docids.remap_types::(); - let highest_iter = Self::highest_iter(rtxn, index, db, field_id)?; - Ok(FacetStringIter { - rtxn, - db, - field_id, - level_iters: vec![(documents_ids, Left(highest_iter))], - must_reduce: true, - }) - } +// impl<'t> FacetStringIter<'t> { +// pub fn new_reducing( +// rtxn: &'t heed::RoTxn, +// index: &'t Index, +// field_id: FieldId, +// documents_ids: RoaringBitmap, +// ) -> heed::Result> { +// let db = index.facet_id_string_docids.remap_types::(); +// let highest_iter = Self::highest_iter(rtxn, index, db, field_id)?; +// Ok(FacetStringIter { +// rtxn, +// db, +// field_id, +// level_iters: vec![(documents_ids, Left(highest_iter))], +// must_reduce: true, +// }) +// } - pub fn new_reverse_reducing( - rtxn: &'t heed::RoTxn, - index: &'t Index, - field_id: FieldId, - documents_ids: RoaringBitmap, - ) -> heed::Result> { - let db = index.facet_id_string_docids.remap_types::(); - let highest_reverse_iter = Self::highest_reverse_iter(rtxn, index, db, field_id)?; - Ok(FacetStringIter { - rtxn, - db, - field_id, - level_iters: vec![(documents_ids, Right(highest_reverse_iter))], - must_reduce: true, - }) - } +// pub fn new_reverse_reducing( +// rtxn: &'t heed::RoTxn, +// index: &'t Index, +// field_id: FieldId, +// documents_ids: RoaringBitmap, +// ) -> heed::Result> { +// let db = index.facet_id_string_docids.remap_types::(); +// let highest_reverse_iter = Self::highest_reverse_iter(rtxn, index, db, field_id)?; +// Ok(FacetStringIter { +// rtxn, +// db, +// field_id, +// level_iters: vec![(documents_ids, Right(highest_reverse_iter))], +// must_reduce: true, +// }) +// } - pub fn new_non_reducing( - rtxn: &'t heed::RoTxn, - index: &'t Index, - field_id: FieldId, - documents_ids: RoaringBitmap, - ) -> heed::Result> { - let db = index.facet_id_string_docids.remap_types::(); - let highest_iter = Self::highest_iter(rtxn, index, db, field_id)?; - Ok(FacetStringIter { - rtxn, - db, - field_id, - level_iters: vec![(documents_ids, Left(highest_iter))], - must_reduce: false, - }) - } +// pub fn new_non_reducing( +// rtxn: &'t heed::RoTxn, +// index: &'t Index, +// field_id: FieldId, +// documents_ids: RoaringBitmap, +// ) -> heed::Result> { +// let db = index.facet_id_string_docids.remap_types::(); +// let highest_iter = Self::highest_iter(rtxn, index, db, field_id)?; +// Ok(FacetStringIter { +// rtxn, +// db, +// field_id, +// level_iters: vec![(documents_ids, Left(highest_iter))], +// must_reduce: false, +// }) +// } - fn highest_level( - rtxn: &'t heed::RoTxn, - db: Database, - fid: FieldId, - ) -> heed::Result> { - Ok(db - .remap_types::() - .prefix_iter(rtxn, &fid.to_be_bytes())? // the field id is the first two bits - .last() - .transpose()? - .map(|(key_bytes, _)| key_bytes[2])) // the level is the third bit - } +// fn highest_level( +// rtxn: &'t heed::RoTxn, +// db: Database, +// fid: FieldId, +// ) -> heed::Result> { +// Ok(db +// .remap_types::() +// .prefix_iter(rtxn, &fid.to_be_bytes())? // the field id is the first two bits +// .last() +// .transpose()? +// .map(|(key_bytes, _)| key_bytes[2])) // the level is the third bit +// } - fn highest_iter( - rtxn: &'t heed::RoTxn, - index: &'t Index, - db: Database, - field_id: FieldId, - ) -> heed::Result, FacetStringLevelZeroRange<'t>>> { - let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - match NonZeroU8::new(highest_level) { - Some(highest_level) => FacetStringGroupRange::new( - rtxn, - index.facet_id_string_docids, - field_id, - highest_level, - Unbounded, - Unbounded, - ) - .map(Left), - None => FacetStringLevelZeroRange::new( - rtxn, - index.facet_id_string_docids, - field_id, - Unbounded, - Unbounded, - ) - .map(Right), - } - } +// fn highest_iter( +// rtxn: &'t heed::RoTxn, +// index: &'t Index, +// db: Database, +// field_id: FieldId, +// ) -> heed::Result, FacetStringLevelZeroRange<'t>>> { +// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); +// match NonZeroU8::new(highest_level) { +// Some(highest_level) => FacetStringGroupRange::new( +// rtxn, +// index.facet_id_string_docids, +// field_id, +// highest_level, +// Unbounded, +// Unbounded, +// ) +// .map(Left), +// None => FacetStringLevelZeroRange::new( +// rtxn, +// index.facet_id_string_docids, +// field_id, +// Unbounded, +// Unbounded, +// ) +// .map(Right), +// } +// } - fn highest_reverse_iter( - rtxn: &'t heed::RoTxn, - index: &'t Index, - db: Database, - field_id: FieldId, - ) -> heed::Result, FacetStringLevelZeroRevRange<'t>>> { - let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - match NonZeroU8::new(highest_level) { - Some(highest_level) => FacetStringGroupRevRange::new( - rtxn, - index.facet_id_string_docids, - field_id, - highest_level, - Unbounded, - Unbounded, - ) - .map(Left), - None => FacetStringLevelZeroRevRange::new( - rtxn, - index.facet_id_string_docids, - field_id, - Unbounded, - Unbounded, - ) - .map(Right), - } - } -} +// fn highest_reverse_iter( +// rtxn: &'t heed::RoTxn, +// index: &'t Index, +// db: Database, +// field_id: FieldId, +// ) -> heed::Result, FacetStringLevelZeroRevRange<'t>>> { +// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); +// match NonZeroU8::new(highest_level) { +// Some(highest_level) => FacetStringGroupRevRange::new( +// rtxn, +// index.facet_id_string_docids, +// field_id, +// highest_level, +// Unbounded, +// Unbounded, +// ) +// .map(Left), +// None => FacetStringLevelZeroRevRange::new( +// rtxn, +// index.facet_id_string_docids, +// field_id, +// Unbounded, +// Unbounded, +// ) +// .map(Right), +// } +// } +// } -impl<'t> Iterator for FacetStringIter<'t> { - type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; +// impl<'t> Iterator for FacetStringIter<'t> { +// type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; - fn next(&mut self) -> Option { - 'outer: loop { - let (documents_ids, last) = self.level_iters.last_mut()?; - let is_ascending = last.is_left(); +// fn next(&mut self) -> Option { +// 'outer: loop { +// let (documents_ids, last) = self.level_iters.last_mut()?; +// let is_ascending = last.is_left(); - // We remap the different iterator types to make - // the algorithm less complex to understand. - let last = match last { - Left(ascending) => match ascending { - Left(group) => Left(Left(group)), - Right(zero_level) => Right(Left(zero_level)), - }, - Right(descending) => match descending { - Left(group) => Left(Right(group)), - Right(zero_level) => Right(Right(zero_level)), - }, - }; +// // We remap the different iterator types to make +// // the algorithm less complex to understand. +// let last = match last { +// Left(ascending) => match ascending { +// Left(group) => Left(Left(group)), +// Right(zero_level) => Right(Left(zero_level)), +// }, +// Right(descending) => match descending { +// Left(group) => Left(Right(group)), +// Right(zero_level) => Right(Right(zero_level)), +// }, +// }; - match last { - Left(group) => { - for result in group { - match result { - Ok(((level, left, right), (string_bounds, mut docids))) => { - docids &= &*documents_ids; - if !docids.is_empty() { - if self.must_reduce { - *documents_ids -= &docids; - } +// match last { +// Left(group) => { +// for result in group { +// match result { +// Ok(((level, left, right), (string_bounds, mut docids))) => { +// docids &= &*documents_ids; +// if !docids.is_empty() { +// if self.must_reduce { +// *documents_ids -= &docids; +// } - let result = if is_ascending { - match string_bounds { - Some((left, right)) => FacetStringLevelZeroRange::new( - self.rtxn, - self.db, - self.field_id, - Included(left), - Included(right), - ) - .map(Right), - None => FacetStringGroupRange::new( - self.rtxn, - self.db, - self.field_id, - NonZeroU8::new(level.get() - 1).unwrap(), - Included(left), - Included(right), - ) - .map(Left), - } - .map(Left) - } else { - match string_bounds { - Some((left, right)) => { - FacetStringLevelZeroRevRange::new( - self.rtxn, - self.db, - self.field_id, - Included(left), - Included(right), - ) - .map(Right) - } - None => FacetStringGroupRevRange::new( - self.rtxn, - self.db, - self.field_id, - NonZeroU8::new(level.get() - 1).unwrap(), - Included(left), - Included(right), - ) - .map(Left), - } - .map(Right) - }; +// let result = if is_ascending { +// match string_bounds { +// Some((left, right)) => FacetStringLevelZeroRange::new( +// self.rtxn, +// self.db, +// self.field_id, +// Included(left), +// Included(right), +// ) +// .map(Right), +// None => FacetStringGroupRange::new( +// self.rtxn, +// self.db, +// self.field_id, +// NonZeroU8::new(level.get() - 1).unwrap(), +// Included(left), +// Included(right), +// ) +// .map(Left), +// } +// .map(Left) +// } else { +// match string_bounds { +// Some((left, right)) => { +// FacetStringLevelZeroRevRange::new( +// self.rtxn, +// self.db, +// self.field_id, +// Included(left), +// Included(right), +// ) +// .map(Right) +// } +// None => FacetStringGroupRevRange::new( +// self.rtxn, +// self.db, +// self.field_id, +// NonZeroU8::new(level.get() - 1).unwrap(), +// Included(left), +// Included(right), +// ) +// .map(Left), +// } +// .map(Right) +// }; - match result { - Ok(iter) => { - self.level_iters.push((docids, iter)); - continue 'outer; - } - Err(e) => return Some(Err(e)), - } - } - } - Err(e) => return Some(Err(e)), - } - } - } - Right(zero_level) => { - // level zero only - for result in zero_level { - match result { - Ok((normalized, original, mut docids)) => { - docids &= &*documents_ids; - if !docids.is_empty() { - if self.must_reduce { - *documents_ids -= &docids; - } - return Some(Ok((normalized, original, docids))); - } - } - Err(e) => return Some(Err(e)), - } - } - } - } +// match result { +// Ok(iter) => { +// self.level_iters.push((docids, iter)); +// continue 'outer; +// } +// Err(e) => return Some(Err(e)), +// } +// } +// } +// Err(e) => return Some(Err(e)), +// } +// } +// } +// Right(zero_level) => { +// // level zero only +// for result in zero_level { +// match result { +// Ok((normalized, original, mut docids)) => { +// docids &= &*documents_ids; +// if !docids.is_empty() { +// if self.must_reduce { +// *documents_ids -= &docids; +// } +// return Some(Ok((normalized, original, docids))); +// } +// } +// Err(e) => return Some(Err(e)), +// } +// } +// } +// } - self.level_iters.pop(); - } - } -} +// self.level_iters.pop(); +// } +// } +// } diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 7241dab2b..e911dfb15 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -1,16 +1,20 @@ use std::collections::HashSet; use std::fmt::{Debug, Display}; use std::ops::Bound::{self, Excluded, Included}; +use std::ops::RangeBounds; use either::Either; pub use filter_parser::{Condition, Error as FPError, FilterCondition, Span, Token}; use heed::types::DecodeIgnore; +use heed::LazyDecode; use log::debug; use roaring::RoaringBitmap; -use super::FacetNumberRange; +// use super::FacetNumberRange; use crate::error::{Error, UserError}; -use crate::heed_codec::facet::FacetLevelValueF64Codec; +use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; +use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec}; +// use crate::heed_codec::facet::FacetLevelValueF64Codec; use crate::{ distance_between_two_points, lat_lng_to_xyz, CboRoaringBitmapCodec, FieldId, Index, Result, }; @@ -144,18 +148,29 @@ impl<'a> Filter<'a> { } } +fn explore_facet_number_levels( + rtxn: &heed::RoTxn, + db: heed::Database, FacetGroupValueCodec>, + field_id: FieldId, +) { +} + impl<'a> Filter<'a> { /// Aggregates the documents ids that are part of the specified range automatically /// going deeper through the levels. fn explore_facet_number_levels( rtxn: &heed::RoTxn, - db: heed::Database, + db: heed::Database, CboRoaringBitmapCodec>, field_id: FieldId, level: u8, left: Bound, right: Bound, output: &mut RoaringBitmap, ) -> Result<()> { + // level must be > 0, I'll create a separate function for level 0 + // if level == 0 { + // call that function + //} match (left, right) { // If the request is an exact value we must go directly to the deepest level. (Included(l), Included(r)) if l == r && level > 0 => { @@ -170,87 +185,121 @@ impl<'a> Filter<'a> { (Excluded(l), Included(r)) if l >= r => return Ok(()), (_, _) => (), } - - let mut left_found = None; - let mut right_found = None; - - // We must create a custom iterator to be able to iterate over the - // requested range as the range iterator cannot express some conditions. - let iter = FacetNumberRange::new(rtxn, db, field_id, level, left, right)?; - - debug!("Iterating between {:?} and {:?} (level {})", left, right, level); - - for (i, result) in iter.enumerate() { - let ((_fid, level, l, r), docids) = result?; - debug!("{:?} to {:?} (level {}) found {} documents", l, r, level, docids.len()); - *output |= docids; - // We save the leftest and rightest bounds we actually found at this level. - if i == 0 { - left_found = Some(l); - } - right_found = Some(r); - } - - // Can we go deeper? - let deeper_level = match level.checked_sub(1) { - Some(level) => level, - None => return Ok(()), + let range_start_key = FacetKey { + field_id, + level, + left_bound: match left { + Included(l) => l, + Excluded(l) => l, + Bound::Unbounded => f64::MIN, + }, }; + let mut range_iter = db + .remap_data_type::>() + .range(rtxn, &(range_start_key..))?; - // We must refine the left and right bounds of this range by retrieving the - // missing part in a deeper level. - match left_found.zip(right_found) { - Some((left_found, right_found)) => { - // If the bound is satisfied we avoid calling this function again. - if !matches!(left, Included(l) if l == left_found) { - let sub_right = Excluded(left_found); - debug!( - "calling left with {:?} to {:?} (level {})", - left, sub_right, deeper_level - ); - Self::explore_facet_number_levels( - rtxn, - db, - field_id, - deeper_level, - left, - sub_right, - output, - )?; - } - if !matches!(right, Included(r) if r == right_found) { - let sub_left = Excluded(right_found); - debug!( - "calling right with {:?} to {:?} (level {})", - sub_left, right, deeper_level - ); - Self::explore_facet_number_levels( - rtxn, - db, - field_id, - deeper_level, - sub_left, - right, - output, - )?; - } - } - None => { - // If we found nothing at this level it means that we must find - // the same bounds but at a deeper, more precise level. - Self::explore_facet_number_levels( - rtxn, - db, - field_id, - deeper_level, - left, - right, - output, - )?; - } + let (mut previous_facet_key, mut previous_value) = range_iter.next().unwrap()?; + while let Some(el) = range_iter.next() { + let (facet_key, value) = el?; + let range = (Included(previous_facet_key.left_bound), Excluded(facet_key.left_bound)); + // if the current range intersects with the query range, then go deeper + // what does it mean for two ranges to intersect? + let gte_left = match left { + Included(l) => previous_facet_key.left_bound >= l, + Excluded(l) => previous_facet_key.left_bound > l, // TODO: not true? + Bound::Unbounded => true, + }; + let lte_right = match right { + Included(r) => facet_key.left_bound <= r, + Excluded(r) => facet_key.left_bound < r, + Bound::Unbounded => true, + }; } + // at this point, previous_facet_key and previous_value are the last groups in the level + // we must also check whether we should visit this group - Ok(()) + todo!(); + + // let mut left_found = None; + // let mut right_found = None; + + // // We must create a custom iterator to be able to iterate over the + // // requested range as the range iterator cannot express some conditions. + // let iter = FacetNumberRange::new(rtxn, db, field_id, level, left, right)?; + + // debug!("Iterating between {:?} and {:?} (level {})", left, right, level); + + // for (i, result) in iter.enumerate() { + // let ((_fid, level, l, r), docids) = result?; + // debug!("{:?} to {:?} (level {}) found {} documents", l, r, level, docids.len()); + // *output |= docids; + // // We save the leftest and rightest bounds we actually found at this level. + // if i == 0 { + // left_found = Some(l); + // } + // right_found = Some(r); + // } + + // // Can we go deeper? + // let deeper_level = match level.checked_sub(1) { + // Some(level) => level, + // None => return Ok(()), + // }; + + // // We must refine the left and right bounds of this range by retrieving the + // // missing part in a deeper level. + // match left_found.zip(right_found) { + // Some((left_found, right_found)) => { + // // If the bound is satisfied we avoid calling this function again. + // if !matches!(left, Included(l) if l == left_found) { + // let sub_right = Excluded(left_found); + // debug!( + // "calling left with {:?} to {:?} (level {})", + // left, sub_right, deeper_level + // ); + // Self::explore_facet_number_levels( + // rtxn, + // db, + // field_id, + // deeper_level, + // left, + // sub_right, + // output, + // )?; + // } + // if !matches!(right, Included(r) if r == right_found) { + // let sub_left = Excluded(right_found); + // debug!( + // "calling right with {:?} to {:?} (level {})", + // sub_left, right, deeper_level + // ); + // Self::explore_facet_number_levels( + // rtxn, + // db, + // field_id, + // deeper_level, + // sub_left, + // right, + // output, + // )?; + // } + // } + // None => { + // // If we found nothing at this level it means that we must find + // // the same bounds but at a deeper, more precise level. + // Self::explore_facet_number_levels( + // rtxn, + // db, + // field_id, + // deeper_level, + // left, + // right, + // output, + // )?; + // } + // } + + // Ok(()) } fn evaluate_operator( @@ -277,23 +326,27 @@ impl<'a> Filter<'a> { return Ok(exist); } Condition::Equal(val) => { - let (_original_value, string_docids) = strings_db - .get(rtxn, &(field_id, &val.value().to_lowercase()))? + let string_docids = strings_db + .get( + rtxn, + &FacetKey { field_id, level: 0, left_bound: &val.value().to_lowercase() }, + )? + .map(|v| v.bitmap) .unwrap_or_default(); let number = val.parse::().ok(); let number_docids = match number { Some(n) => { let n = Included(n); let mut output = RoaringBitmap::new(); - Self::explore_facet_number_levels( - rtxn, - numbers_db, - field_id, - 0, - n, - n, - &mut output, - )?; + // Self::explore_facet_number_levels( + // rtxn, + // numbers_db, + // field_id, + // 0, + // n, + // n, + // &mut output, + // )?; output } None => RoaringBitmap::new(), @@ -312,21 +365,32 @@ impl<'a> Filter<'a> { // that's fine if it don't, the value just before will be returned instead. let biggest_level = numbers_db .remap_data_type::() - .get_lower_than_or_equal_to(rtxn, &(field_id, u8::MAX, f64::MAX, f64::MAX))? - .and_then(|((id, level, _, _), _)| if id == field_id { Some(level) } else { None }); + .get_lower_than_or_equal_to( + rtxn, + &FacetKey { field_id, level: u8::MAX, left_bound: f64::MAX }, + )? + .and_then( + |(FacetKey { field_id: id, level, .. }, _)| { + if id == field_id { + Some(level) + } else { + None + } + }, + ); match biggest_level { Some(level) => { let mut output = RoaringBitmap::new(); - Self::explore_facet_number_levels( - rtxn, - numbers_db, - field_id, - level, - left, - right, - &mut output, - )?; + // Self::explore_facet_number_levels( + // rtxn, + // numbers_db, + // field_id, + // level, + // left, + // right, + // &mut output, + // )?; Ok(output) } None => Ok(RoaringBitmap::new()), diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index e3ac95882..13b00d2de 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -1,6 +1,6 @@ pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET}; -pub use self::facet_number::{FacetNumberIter, FacetNumberRange, FacetNumberRevRange}; -pub use self::facet_string::FacetStringIter; +// pub use self::facet_number::{FacetNumberIter, FacetNumberRange, FacetNumberRevRange}; +// pub use self::facet_string::FacetStringIter; pub use self::filter::Filter; mod facet_distribution; diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 1b62a67c7..d05e807df 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -15,7 +15,7 @@ use log::debug; use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; -pub use self::facet::{FacetDistribution, FacetNumberIter, Filter, DEFAULT_VALUES_PER_FACET}; +pub use self::facet::{FacetDistribution, /* FacetNumberIter,*/ Filter, DEFAULT_VALUES_PER_FACET,}; use self::fst_utils::{Complement, Intersection, StartsWith, Union}; pub use self::matches::{ FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWord, MatchingWords, diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index e9c92a949..4031c9b06 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -6,10 +6,7 @@ use heed::types::ByteSlice; use heed::BytesDecode; use roaring::RoaringBitmap; -use crate::heed_codec::facet::{ - FacetLevelValueU32Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, - FacetStringZeroBoundsValueCodec, -}; +use crate::heed_codec::facet::FacetStringZeroBoundsValueCodec; use crate::{make_db_snap_from_iter, CboRoaringBitmapCodec, ExternalDocumentsIds, Index}; #[track_caller] @@ -232,46 +229,48 @@ pub fn snap_word_prefix_position_docids(index: &Index) -> String { snap } pub fn snap_facet_id_f64_docids(index: &Index) -> String { - let snap = make_db_snap_from_iter!(index, facet_id_f64_docids, |( - (facet_id, level, left, right), - b, - )| { - &format!("{facet_id:<3} {level:<2} {left:<6} {right:<6} {}", display_bitmap(&b)) - }); - snap + todo!() + // let snap = make_db_snap_from_iter!(index, facet_id_f64_docids, |( + // (facet_id, level, left, right), + // b, + // )| { + // &format!("{facet_id:<3} {level:<2} {left:<6} {right:<6} {}", display_bitmap(&b)) + // }); + // snap } pub fn snap_facet_id_string_docids(index: &Index) -> String { - let rtxn = index.read_txn().unwrap(); - let bytes_db = index.facet_id_string_docids.remap_types::(); - let iter = bytes_db.iter(&rtxn).unwrap(); - let mut snap = String::new(); + todo!() + // let rtxn = index.read_txn().unwrap(); + // let bytes_db = index.facet_id_string_docids.remap_types::(); + // let iter = bytes_db.iter(&rtxn).unwrap(); + // let mut snap = String::new(); - for x in iter { - let (key, value) = x.unwrap(); - if let Some((field_id, normalized_str)) = FacetStringLevelZeroCodec::bytes_decode(key) { - let (orig_string, docids) = - FacetStringLevelZeroValueCodec::bytes_decode(value).unwrap(); - snap.push_str(&format!( - "{field_id:<3} {normalized_str:<8} {orig_string:<8} {}\n", - display_bitmap(&docids) - )); - } else if let Some((field_id, level, left, right)) = - FacetLevelValueU32Codec::bytes_decode(key) - { - snap.push_str(&format!("{field_id:<3} {level:<2} {left:<6} {right:<6} ")); - let (bounds, docids) = - FacetStringZeroBoundsValueCodec::::bytes_decode(value) - .unwrap(); - if let Some((left, right)) = bounds { - snap.push_str(&format!("{left:<8} {right:<8} ")); - } - snap.push_str(&display_bitmap(&docids)); - snap.push('\n'); - } else { - panic!(); - } - } - snap + // for x in iter { + // let (key, value) = x.unwrap(); + // if let Some((field_id, normalized_str)) = FacetStringLevelZeroCodec::bytes_decode(key) { + // let (orig_string, docids) = + // FacetStringLevelZeroValueCodec::bytes_decode(value).unwrap(); + // snap.push_str(&format!( + // "{field_id:<3} {normalized_str:<8} {orig_string:<8} {}\n", + // display_bitmap(&docids) + // )); + // } else if let Some((field_id, level, left, right)) = + // FacetLevelValueU32Codec::bytes_decode(key) + // { + // snap.push_str(&format!("{field_id:<3} {level:<2} {left:<6} {right:<6} ")); + // let (bounds, docids) = + // FacetStringZeroBoundsValueCodec::::bytes_decode(value) + // .unwrap(); + // if let Some((left, right)) = bounds { + // snap.push_str(&format!("{left:<8} {right:<8} ")); + // } + // snap.push_str(&display_bitmap(&docids)); + // snap.push('\n'); + // } else { + // panic!(); + // } + // } + // snap } pub fn snap_documents_ids(index: &Index) -> String { let rtxn = index.read_txn().unwrap(); diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 54328b50d..bb30f24c9 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -10,9 +10,7 @@ use time::OffsetDateTime; use super::ClearDocuments; use crate::error::{InternalError, SerializationError, UserError}; -use crate::heed_codec::facet::{ - FacetLevelValueU32Codec, FacetStringLevelZeroValueCodec, FacetStringZeroBoundsValueCodec, -}; +use crate::heed_codec::facet::FacetStringZeroBoundsValueCodec; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; use crate::{ @@ -442,11 +440,11 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } // We delete the documents ids that are under the facet field id values. - remove_docids_from_facet_field_id_docids( - self.wtxn, - facet_id_f64_docids, - &self.to_delete_docids, - )?; + // TODO: remove_docids_from_facet_field_id_docids( + // self.wtxn, + // facet_id_f64_docids, + // &self.to_delete_docids, + // )?; // We delete the documents ids that are under the facet field id values. remove_docids_from_facet_field_id_docids( self.wtxn, @@ -587,57 +585,57 @@ fn remove_docids_from_facet_field_id_string_docids<'a, C, D>( db: &heed::Database, to_remove: &RoaringBitmap, ) -> crate::Result<()> { - let db_name = Some(crate::index::db_name::FACET_ID_STRING_DOCIDS); - let mut iter = db.remap_types::().iter_mut(wtxn)?; - while let Some(result) = iter.next() { - let (key, val) = result?; - match FacetLevelValueU32Codec::bytes_decode(key) { - Some(_) => { - // If we are able to parse this key it means it is a facet string group - // level key. We must then parse the value using the appropriate codec. - let (group, mut docids) = - FacetStringZeroBoundsValueCodec::::bytes_decode(val) - .ok_or_else(|| SerializationError::Decoding { db_name })?; + // let db_name = Some(crate::index::db_name::FACET_ID_STRING_DOCIDS); + // let mut iter = db.remap_types::().iter_mut(wtxn)?; + // while let Some(result) = iter.next() { + // let (key, val) = result?; + // match FacetLevelValueU32Codec::bytes_decode(key) { + // Some(_) => { + // // If we are able to parse this key it means it is a facet string group + // // level key. We must then parse the value using the appropriate codec. + // let (group, mut docids) = + // FacetStringZeroBoundsValueCodec::::bytes_decode(val) + // .ok_or_else(|| SerializationError::Decoding { db_name })?; - let previous_len = docids.len(); - docids -= to_remove; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - } else if docids.len() != previous_len { - let key = key.to_owned(); - let val = &(group, docids); - let value_bytes = - FacetStringZeroBoundsValueCodec::::bytes_encode(val) - .ok_or_else(|| SerializationError::Encoding { db_name })?; + // let previous_len = docids.len(); + // docids -= to_remove; + // if docids.is_empty() { + // // safety: we don't keep references from inside the LMDB database. + // unsafe { iter.del_current()? }; + // } else if docids.len() != previous_len { + // let key = key.to_owned(); + // let val = &(group, docids); + // let value_bytes = + // FacetStringZeroBoundsValueCodec::::bytes_encode(val) + // .ok_or_else(|| SerializationError::Encoding { db_name })?; - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&key, &value_bytes)? }; - } - } - None => { - // The key corresponds to a level zero facet string. - let (original_value, mut docids) = - FacetStringLevelZeroValueCodec::bytes_decode(val) - .ok_or_else(|| SerializationError::Decoding { db_name })?; + // // safety: we don't keep references from inside the LMDB database. + // unsafe { iter.put_current(&key, &value_bytes)? }; + // } + // } + // None => { + // // The key corresponds to a level zero facet string. + // let (original_value, mut docids) = + // FacetStringLevelZeroValueCodec::bytes_decode(val) + // .ok_or_else(|| SerializationError::Decoding { db_name })?; - let previous_len = docids.len(); - docids -= to_remove; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - } else if docids.len() != previous_len { - let key = key.to_owned(); - let val = &(original_value, docids); - let value_bytes = FacetStringLevelZeroValueCodec::bytes_encode(val) - .ok_or_else(|| SerializationError::Encoding { db_name })?; + // let previous_len = docids.len(); + // docids -= to_remove; + // if docids.is_empty() { + // // safety: we don't keep references from inside the LMDB database. + // unsafe { iter.del_current()? }; + // } else if docids.len() != previous_len { + // let key = key.to_owned(); + // let val = &(original_value, docids); + // let value_bytes = FacetStringLevelZeroValueCodec::bytes_encode(val) + // .ok_or_else(|| SerializationError::Encoding { db_name })?; - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&key, &value_bytes)? }; - } - } - } - } + // // safety: we don't keep references from inside the LMDB database. + // unsafe { iter.put_current(&key, &value_bytes)? }; + // } + // } + // } + // } Ok(()) } diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 108acae4f..0926b63f4 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -136,11 +136,12 @@ use roaring::RoaringBitmap; use time::OffsetDateTime; use crate::error::InternalError; -use crate::heed_codec::facet::{ - FacetLevelValueF64Codec, FacetLevelValueU32Codec, FacetStringLevelZeroCodec, - FacetStringLevelZeroValueCodec, FacetStringZeroBoundsValueCodec, +use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; +use crate::heed_codec::facet::new::str_ref::StrRefCodec; +use crate::heed_codec::facet::new::{ + FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, }; -use crate::heed_codec::CboRoaringBitmapCodec; +// use crate::heed_codec::CboRoaringBitmapCodec; use crate::update::index_documents::{create_writer, write_into_lmdb_database, writer_into_reader}; use crate::{FieldId, Index, Result}; @@ -187,16 +188,18 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { debug!("Computing and writing the facet values levels docids into LMDB on disk..."); + let mut nested_wtxn = self.index.env.nested_write_txn(self.wtxn).unwrap(); + for field_id in faceted_fields { // Clear the facet string levels. - clear_field_string_levels( - self.wtxn, - self.index.facet_id_string_docids.remap_types::(), - field_id, - )?; + // clear_field_string_levels( + // &mut nested_wtxn, + // self.index.facet_id_string_docids.remap_types::(), + // field_id, + // )?; let (facet_string_levels, string_documents_ids) = compute_facet_strings_levels( - self.wtxn, + &mut nested_wtxn, self.index.facet_id_string_docids, self.chunk_compression_type, self.chunk_compression_level, @@ -206,13 +209,13 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { )?; self.index.put_string_faceted_documents_ids( - self.wtxn, + &mut nested_wtxn, field_id, &string_documents_ids, )?; for facet_strings_level in facet_string_levels { write_into_lmdb_database( - self.wtxn, + &mut nested_wtxn, *self.index.facet_id_string_docids.as_polymorph(), facet_strings_level, |_, _| { @@ -221,11 +224,11 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { )?; } - // Clear the facet number levels. - clear_field_number_levels(self.wtxn, self.index.facet_id_f64_docids, field_id)?; + // // Clear the facet number levels. + // clear_field_number_levels(&mut nested_wtxn, self.index.facet_id_f64_docids, field_id)?; let (facet_number_levels, number_documents_ids) = compute_facet_number_levels( - self.wtxn, + &mut nested_wtxn, self.index.facet_id_f64_docids, self.chunk_compression_type, self.chunk_compression_level, @@ -235,14 +238,14 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { )?; self.index.put_number_faceted_documents_ids( - self.wtxn, + &mut nested_wtxn, field_id, &number_documents_ids, )?; for facet_number_level in facet_number_levels { write_into_lmdb_database( - self.wtxn, + &mut nested_wtxn, *self.index.facet_id_f64_docids.as_polymorph(), facet_number_level, |_, _| { @@ -263,8 +266,8 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { /// that must be inserted into the database. /// 2. a roaring bitmap of all the document ids present in the database fn compute_facet_number_levels<'t>( - rtxn: &'t heed::RoTxn, - db: heed::Database, + rtxn: &'t mut heed::RwTxn, + db: heed::Database, FacetGroupValueCodec>, compression_type: CompressionType, compression_level: Option, level_group_size: NonZeroUsize, @@ -277,7 +280,7 @@ fn compute_facet_number_levels<'t>( .remap_types::() .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; - let level_0_start = (field_id, 0, f64::MIN, f64::MIN); + let level_0_start = FacetKey { field_id, level: 0, left_bound: f64::MIN }; // Groups sizes are always a power of the original level_group_size and therefore a group // always maps groups of the previous level and never splits previous levels groups in half. @@ -289,37 +292,31 @@ fn compute_facet_number_levels<'t>( let mut number_document_ids = RoaringBitmap::new(); if let Some((top_level, _)) = group_size_iter.last() { - let subwriters = - recursive_compute_levels::( - rtxn, - db, - compression_type, - compression_level, - *top_level, - level_0_start, - &(level_0_start..), - first_level_size, - level_group_size, - &mut |bitmaps, _, _| { - for bitmap in bitmaps { - number_document_ids |= bitmap; - } - Ok(()) - }, - &|_i, (_field_id, _level, left, _right)| *left, - &|bitmap| bitmap, - &|writer, level, left, right, docids| { - write_number_entry(writer, field_id, level.get(), left, right, &docids)?; - Ok(()) - }, - )?; + let subwriters = recursive_compute_levels::( + rtxn, + db, + compression_type, + compression_level, + field_id, + *top_level, + level_0_start, + &(level_0_start..), + first_level_size, + level_group_size, + &mut |bitmaps, _| { + for bitmap in bitmaps { + number_document_ids |= bitmap; + } + Ok(()) + }, + )?; Ok((subwriters, number_document_ids)) } else { let mut documents_ids = RoaringBitmap::new(); for result in db.range(rtxn, &(level_0_start..))?.take(first_level_size) { - let (_key, docids) = result?; - documents_ids |= docids; + let (_key, group_value) = result?; + documents_ids |= group_value.bitmap; } Ok((vec![], documents_ids)) @@ -333,8 +330,8 @@ fn compute_facet_number_levels<'t>( /// that must be inserted into the database. /// 2. a roaring bitmap of all the document ids present in the database fn compute_facet_strings_levels<'t>( - rtxn: &'t heed::RoTxn, - db: heed::Database, + rtxn: &'t mut heed::RwTxn, + db: heed::Database, FacetGroupValueCodec>, compression_type: CompressionType, compression_level: Option, level_group_size: NonZeroUsize, @@ -347,7 +344,7 @@ fn compute_facet_strings_levels<'t>( .remap_types::() .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; - let level_0_start = (field_id, ""); + let level_0_start = FacetKey { field_id, level: 0, left_bound: "" }; // Groups sizes are always a power of the original level_group_size and therefore a group // always maps groups of the previous level and never splits previous levels groups in half. @@ -359,40 +356,31 @@ fn compute_facet_strings_levels<'t>( let mut strings_document_ids = RoaringBitmap::new(); if let Some((top_level, _)) = group_size_iter.last() { - let subwriters = recursive_compute_levels::< - FacetStringLevelZeroCodec, - FacetStringLevelZeroValueCodec, - (u32, &str), - >( + let subwriters = recursive_compute_levels::( rtxn, db, compression_type, compression_level, + field_id, *top_level, level_0_start, &(level_0_start..), first_level_size, level_group_size, - &mut |bitmaps, _, _| { + &mut |bitmaps, _| { for bitmap in bitmaps { strings_document_ids |= bitmap; } Ok(()) }, - &|i, (_field_id, value)| (i as u32, *value), - &|value| value.1, - &|writer, level, start_bound, end_bound, docids| { - write_string_entry(writer, field_id, level, start_bound, end_bound, docids)?; - Ok(()) - }, )?; Ok((subwriters, strings_document_ids)) } else { let mut documents_ids = RoaringBitmap::new(); for result in db.range(rtxn, &(level_0_start..))?.take(first_level_size) { - let (_key, (_original_value, docids)) = result?; - documents_ids |= docids; + let (_key, group_value) = result?; + documents_ids |= group_value.bitmap; } Ok((vec![], documents_ids)) @@ -436,29 +424,26 @@ from the level below were read/created. Its arguments are: A vector of grenad::Reader. The reader at index `i` corresponds to the elements of level `i + 1` that must be inserted into the database. */ -fn recursive_compute_levels<'t, KeyCodec, ValueCodec, Bound>( - rtxn: &'t heed::RoTxn, - db: heed::Database, +fn recursive_compute_levels<'t, BoundCodec>( + rtxn: &'t mut heed::RwTxn, + db: heed::Database, FacetGroupValueCodec>, compression_type: CompressionType, compression_level: Option, + field_id: FieldId, level: u8, - level_0_start: >::DItem, - level_0_range: &'t RangeFrom<>::DItem>, + level_0_start: FacetKey<>::EItem>, + level_0_range: &'t RangeFrom>::EItem>>, level_0_size: usize, level_group_size: NonZeroUsize, - computed_group_bitmap: &mut dyn FnMut(&[RoaringBitmap], Bound, Bound) -> Result<()>, - bound_from_db_key: &dyn for<'a> Fn(usize, &'a >::DItem) -> Bound, - bitmap_from_db_value: &dyn Fn(>::DItem) -> RoaringBitmap, - write_entry: &dyn Fn(&mut Writer, NonZeroU8, Bound, Bound, RoaringBitmap) -> Result<()>, + computed_group_bitmap: &mut dyn FnMut( + &[RoaringBitmap], + >::EItem, + ) -> Result<()>, ) -> Result>> where - KeyCodec: for<'a> BytesEncode<'a> - + for<'a> BytesDecode<'a, DItem = >::EItem>, - for<'a> >::EItem: Sized, - ValueCodec: for<'a> BytesEncode<'a> - + for<'a> BytesDecode<'a, DItem = >::EItem>, - for<'a> >::EItem: Sized, - Bound: Copy, + for<'a> BoundCodec: + BytesEncode<'a> + BytesDecode<'a, DItem = >::EItem>, + for<'a> >::EItem: Copy + Sized, { if level == 0 { // base case for the recursion @@ -468,31 +453,32 @@ where // 2. fill the `bitmaps` vector to give it to level 1 once `level_group_size` elements were read let mut bitmaps = vec![]; - let mut start_bound = bound_from_db_key(0, &level_0_start); - let mut end_bound = bound_from_db_key(0, &level_0_start); + let mut start_bound = level_0_start.left_bound; + // let mut end_bound = level_0_start.bound; + let mut first_iteration_for_new_group = true; for (i, db_result_item) in db.range(rtxn, level_0_range)?.take(level_0_size).enumerate() { let (key, value) = db_result_item?; - let bound = bound_from_db_key(i, &key); - let docids = bitmap_from_db_value(value); + let bound = key.left_bound; + let docids = value.bitmap; if first_iteration_for_new_group { start_bound = bound; first_iteration_for_new_group = false; } - end_bound = bound; + // end_bound = bound; bitmaps.push(docids); if bitmaps.len() == level_group_size.get() { - computed_group_bitmap(&bitmaps, start_bound, end_bound)?; + computed_group_bitmap(&bitmaps, start_bound)?; first_iteration_for_new_group = true; bitmaps.clear(); } } // don't forget to give the leftover bitmaps as well if !bitmaps.is_empty() { - computed_group_bitmap(&bitmaps, start_bound, end_bound)?; + computed_group_bitmap(&bitmaps, start_bound)?; bitmaps.clear(); } // level 0 is already stored in the DB @@ -516,48 +502,52 @@ where db, compression_type, compression_level, + field_id, level - 1, level_0_start, level_0_range, level_0_size, level_group_size, - &mut |sub_bitmaps: &[RoaringBitmap], start_range, end_range| { + &mut |sub_bitmaps: &[RoaringBitmap], + start_range: >::EItem| { let mut combined_bitmap = RoaringBitmap::default(); for bitmap in sub_bitmaps { combined_bitmap |= bitmap; } - range_for_bitmaps.push((start_range, end_range)); + range_for_bitmaps.push(start_range); bitmaps.push(combined_bitmap); if bitmaps.len() == level_group_size.get() { - let start_bound = range_for_bitmaps.first().unwrap().0; - let end_bound = range_for_bitmaps.last().unwrap().1; - computed_group_bitmap(&bitmaps, start_bound, end_bound)?; - for (bitmap, (start_bound, end_bound)) in - bitmaps.drain(..).zip(range_for_bitmaps.drain(..)) + let start_bound = range_for_bitmaps.first().unwrap(); + computed_group_bitmap(&bitmaps, *start_bound)?; + for (bitmap, start_bound) in bitmaps.drain(..).zip(range_for_bitmaps.drain(..)) { - write_entry( + write_entry::( &mut cur_writer, + field_id, NonZeroU8::new(level).unwrap(), start_bound, - end_bound, bitmap, )?; } } Ok(()) }, - bound_from_db_key, - bitmap_from_db_value, - write_entry, )?; + // don't forget to insert the leftover elements into the writer as well if !bitmaps.is_empty() { - let start_range = range_for_bitmaps.first().unwrap().0; - let end_range = range_for_bitmaps.last().unwrap().1; - computed_group_bitmap(&bitmaps, start_range, end_range)?; - for (bitmap, (left, right)) in bitmaps.drain(..).zip(range_for_bitmaps.drain(..)) { - write_entry(&mut cur_writer, NonZeroU8::new(level).unwrap(), left, right, bitmap)?; + let start_range = range_for_bitmaps.first().unwrap(); + let end_range = range_for_bitmaps.last().unwrap(); + computed_group_bitmap(&bitmaps, *start_range)?; + for (bitmap, bound) in bitmaps.drain(..).zip(range_for_bitmaps.drain(..)) { + write_entry( + &mut cur_writer, + field_id, + NonZeroU8::new(level).unwrap(), + bound, + bitmap, + )?; } } @@ -566,60 +556,25 @@ where } } -fn clear_field_number_levels<'t>( - wtxn: &'t mut heed::RwTxn, - db: heed::Database, - field_id: FieldId, -) -> heed::Result<()> { - let left = (field_id, 1, f64::MIN, f64::MIN); - let right = (field_id, u8::MAX, f64::MAX, f64::MAX); - let range = left..=right; - db.delete_range(wtxn, &range).map(drop) -} - -fn clear_field_string_levels<'t>( - wtxn: &'t mut heed::RwTxn, - db: heed::Database, - field_id: FieldId, -) -> heed::Result<()> { - let left = (field_id, NonZeroU8::new(1).unwrap(), u32::MIN, u32::MIN); - let right = (field_id, NonZeroU8::new(u8::MAX).unwrap(), u32::MAX, u32::MAX); - let range = left..=right; - db.remap_key_type::().delete_range(wtxn, &range).map(drop) -} - -fn write_number_entry( - writer: &mut Writer, - field_id: FieldId, - level: u8, - left: f64, - right: f64, - ids: &RoaringBitmap, -) -> Result<()> { - let key = (field_id, level, left, right); - let key = FacetLevelValueF64Codec::bytes_encode(&key).ok_or(Error::Encoding)?; - let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; - writer.insert(&key, &data)?; - Ok(()) -} -fn write_string_entry( +fn write_entry( writer: &mut Writer, field_id: FieldId, level: NonZeroU8, - (left_id, left_value): (u32, &str), - (right_id, right_value): (u32, &str), + bound: >::EItem, docids: RoaringBitmap, -) -> Result<()> { - let key = (field_id, level, left_id, right_id); - let key = FacetLevelValueU32Codec::bytes_encode(&key).ok_or(Error::Encoding)?; - let data = match level.get() { - 1 => (Some((left_value, right_value)), docids), - _ => (None, docids), - }; - let data = FacetStringZeroBoundsValueCodec::::bytes_encode(&data) - .ok_or(Error::Encoding)?; - writer.insert(&key, &data)?; - Ok(()) +) -> Result<()> +where + for<'a> BoundCodec: BytesEncode<'a>, + for<'a> >::EItem: Copy + Sized, +{ + todo!() + // let key = FacetKey { field_id, level: level.get(), left_bound: bound }; + // let key_bytes = FacetKeyCodec::::bytes_encode(&key).ok_or(Error::Encoding)?; + // let value_bytes = + // FacetGroupValueCodec::bytes_encode(&FacetGroupValue { size: 4, bitmap: docids }) + // .ok_or(Error::Encoding)?; + // writer.insert(&key_bytes, &value_bytes)?; + // Ok(()) } #[cfg(test)] diff --git a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs index 61157fa35..c5424a346 100644 --- a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs @@ -6,7 +6,7 @@ use heed::{BytesDecode, BytesEncode}; use super::helpers::{ create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters, }; -use crate::heed_codec::facet::{FacetLevelValueF64Codec, FieldDocIdFacetF64Codec}; +use crate::heed_codec::facet::FieldDocIdFacetF64Codec; use crate::Result; /// Extracts the facet number and the documents ids where this facet number appear. @@ -31,13 +31,14 @@ pub fn extract_facet_number_docids( let mut cursor = docid_fid_facet_number.into_cursor()?; while let Some((key_bytes, _)) = cursor.move_on_next()? { - let (field_id, document_id, number) = - FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap(); + todo!() + // let (field_id, document_id, number) = + // FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap(); - let key = (field_id, 0, number, number); - let key_bytes = FacetLevelValueF64Codec::bytes_encode(&key).unwrap(); + // let key = (field_id, 0, number, number); + // // let key_bytes = FacetLevelValueF64Codec::bytes_encode(&key).unwrap(); - facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; + // facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; } sorter_into_reader(facet_number_docids_sorter, indexer) diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index f7aa3730c..4e655329e 100644 --- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -4,11 +4,9 @@ use std::{io, str}; use roaring::RoaringBitmap; -use super::helpers::{ - create_sorter, keep_first_prefix_value_merge_roaring_bitmaps, sorter_into_reader, - try_split_array_at, GrenadParameters, -}; -use crate::heed_codec::facet::{encode_prefix_string, FacetStringLevelZeroCodec}; +use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; +use crate::update::index_documents::merge_cbo_roaring_bitmaps; +// use crate::heed_codec::facet::{encode_prefix_string, FacetStringLevelZeroCodec}; use crate::{FieldId, Result}; /// Extracts the facet string and the documents ids where this facet string appear. @@ -24,7 +22,7 @@ pub fn extract_facet_string_docids( let mut facet_string_docids_sorter = create_sorter( grenad::SortAlgorithm::Stable, - keep_first_prefix_value_merge_roaring_bitmaps, + merge_cbo_roaring_bitmaps, // TODO: check indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, @@ -42,14 +40,16 @@ pub fn extract_facet_string_docids( let original_value = str::from_utf8(original_value_bytes)?; key_buffer.clear(); - FacetStringLevelZeroCodec::serialize_into( - field_id, - str::from_utf8(normalized_value_bytes)?, - &mut key_buffer, - ); + // TODO + // FacetStringLevelZeroCodec::serialize_into( + // field_id, + // str::from_utf8(normalized_value_bytes)?, + // &mut key_buffer, + // ); value_buffer.clear(); - encode_prefix_string(original_value, &mut value_buffer)?; + // TODO + // encode_prefix_string(original_value, &mut value_buffer)?; let bitmap = RoaringBitmap::from_iter(Some(document_id)); bitmap.serialize_into(&mut value_buffer)?; diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 50cc04610..1e414458f 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -25,8 +25,8 @@ use self::extract_word_docids::extract_word_docids; use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids; use self::extract_word_position_docids::extract_word_position_docids; use super::helpers::{ - as_cloneable_grenad, keep_first_prefix_value_merge_roaring_bitmaps, merge_cbo_roaring_bitmaps, - merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn, MergeableReader, + as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap, + GrenadParameters, MergeFn, MergeableReader, }; use super::{helpers, TypedChunk}; use crate::{FieldId, Result}; @@ -142,7 +142,7 @@ pub(crate) fn data_from_obkv_documents( indexer, lmdb_writer_sx.clone(), extract_facet_string_docids, - keep_first_prefix_value_merge_roaring_bitmaps, + merge_roaring_bitmaps, // TODO: check (cbo?) TypedChunk::FieldIdFacetStringDocids, "field-id-facet-string-docids", ); diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs index dbe3c0344..cef27ab30 100644 --- a/milli/src/update/index_documents/helpers/merge_functions.rs +++ b/milli/src/update/index_documents/helpers/merge_functions.rs @@ -5,7 +5,7 @@ use std::result::Result as StdResult; use roaring::RoaringBitmap; use super::read_u32_ne_bytes; -use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; +// use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; use crate::heed_codec::CboRoaringBitmapCodec; use crate::Result; @@ -49,32 +49,32 @@ pub fn merge_roaring_bitmaps<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Resul } } -pub fn keep_first_prefix_value_merge_roaring_bitmaps<'a>( - _key: &[u8], - values: &[Cow<'a, [u8]>], -) -> Result> { - if values.len() == 1 { - Ok(values[0].clone()) - } else { - let original = decode_prefix_string(&values[0]).unwrap().0; - let merged_bitmaps = values - .iter() - .map(AsRef::as_ref) - .map(decode_prefix_string) - .map(Option::unwrap) - .map(|(_, bitmap_bytes)| bitmap_bytes) - .map(RoaringBitmap::deserialize_from) - .map(StdResult::unwrap) - .reduce(|a, b| a | b) - .unwrap(); +// pub fn keep_first_prefix_value_merge_roaring_bitmaps<'a>( +// _key: &[u8], +// values: &[Cow<'a, [u8]>], +// ) -> Result> { +// if values.len() == 1 { +// Ok(values[0].clone()) +// } else { +// let original = decode_prefix_string(&values[0]).unwrap().0; +// let merged_bitmaps = values +// .iter() +// .map(AsRef::as_ref) +// .map(decode_prefix_string) +// .map(Option::unwrap) +// .map(|(_, bitmap_bytes)| bitmap_bytes) +// .map(RoaringBitmap::deserialize_from) +// .map(StdResult::unwrap) +// .reduce(|a, b| a | b) +// .unwrap(); - let cap = std::mem::size_of::() + original.len() + merged_bitmaps.serialized_size(); - let mut buffer = Vec::with_capacity(cap); - encode_prefix_string(original, &mut buffer)?; - merged_bitmaps.serialize_into(&mut buffer)?; - Ok(Cow::Owned(buffer)) - } -} +// let cap = std::mem::size_of::() + original.len() + merged_bitmaps.serialized_size(); +// let mut buffer = Vec::with_capacity(cap); +// encode_prefix_string(original, &mut buffer)?; +// merged_bitmaps.serialize_into(&mut buffer)?; +// Ok(Cow::Owned(buffer)) +// } +// } pub fn keep_first<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result> { Ok(values[0].clone()) diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index 6466a636b..7e2ebd2d3 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -13,9 +13,9 @@ pub use grenad_helpers::{ writer_into_reader, GrenadParameters, MergeableReader, }; pub use merge_functions::{ - concat_u32s_array, keep_first, keep_first_prefix_value_merge_roaring_bitmaps, keep_latest_obkv, - merge_cbo_roaring_bitmaps, merge_obkvs, merge_roaring_bitmaps, merge_two_obkvs, - roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, MergeFn, + concat_u32s_array, keep_first, keep_latest_obkv, merge_cbo_roaring_bitmaps, merge_obkvs, + merge_roaring_bitmaps, merge_two_obkvs, roaring_bitmap_from_u32s_array, + serialize_roaring_bitmap, MergeFn, }; /// The maximum length a word can be diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 8464c98b6..7a9787bdb 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -13,7 +13,6 @@ use super::helpers::{ valid_lmdb_key, CursorClonableMmap, }; use super::{ClonableMmap, MergeFn}; -use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; use crate::update::index_documents::helpers::as_cloneable_grenad; use crate::{ lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, @@ -197,13 +196,14 @@ pub(crate) fn write_typed_chunk_into_index( index_is_empty, |value, _buffer| Ok(value), |new_values, db_values, buffer| { - let (_, new_values) = decode_prefix_string(new_values).unwrap(); - let new_values = RoaringBitmap::deserialize_from(new_values)?; - let (db_original, db_values) = decode_prefix_string(db_values).unwrap(); - let db_values = RoaringBitmap::deserialize_from(db_values)?; - let values = new_values | db_values; - encode_prefix_string(db_original, buffer)?; - Ok(values.serialize_into(buffer)?) + todo!() + // let (_, new_values) = decode_prefix_string(new_values).unwrap(); + // let new_values = RoaringBitmap::deserialize_from(new_values)?; + // let (db_original, db_values) = decode_prefix_string(db_values).unwrap(); + // let db_values = RoaringBitmap::deserialize_from(db_values)?; + // let values = new_values | db_values; + // encode_prefix_string(db_original, buffer)?; + // Ok(values.serialize_into(buffer)?) }, )?; is_merged_database = true;