From 0227254a65f9627099a74039932236522cff278e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 17 Jul 2021 12:50:01 +0200 Subject: [PATCH] Return the original string values for the inverted facet index database --- infos/src/main.rs | 4 +- .../facet_string_level_zero_value_codec.rs | 80 +++++++++++++++++++ .../facet_string_zero_bounds_value_codec.rs | 14 +--- milli/src/heed_codec/facet/mod.rs | 12 +++ milli/src/index.rs | 9 ++- milli/src/search/distinct/facet_distinct.rs | 11 +-- milli/src/search/distinct/mod.rs | 3 +- milli/src/search/facet/facet_distribution.rs | 24 ++++-- milli/src/search/facet/facet_string.rs | 26 ++++-- milli/src/search/facet/filter_condition.rs | 12 ++- milli/src/update/delete_documents.rs | 33 +++++++- milli/src/update/facets.rs | 30 +++++-- .../update/index_documents/merge_function.rs | 23 ++++++ milli/src/update/index_documents/mod.rs | 3 +- milli/src/update/index_documents/store.rs | 16 ++-- 15 files changed, 242 insertions(+), 58 deletions(-) create mode 100644 milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs diff --git a/infos/src/main.rs b/infos/src/main.rs index d5d1ad0af..da15251b0 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -627,14 +627,14 @@ fn facet_values_docids( FacetType::String => { wtr.write_record(&["facet_string", "documents_count", "documents_ids"])?; for result in facet_values_iter(rtxn, index.facet_id_string_docids, field_id)? { - let ((_fid, value), docids) = result?; + let ((_fid, normalized), (_original, docids)) = result?; let count = docids.len(); let docids = if debug { format!("{:?}", docids) } else { format!("{:?}", docids.iter().collect::>()) }; - wtr.write_record(&[value.to_string(), count.to_string(), docids])?; + wtr.write_record(&[normalized.to_string(), count.to_string(), docids])?; } } } diff --git a/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs b/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs new file mode 100644 index 000000000..b2434d453 --- /dev/null +++ b/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs @@ -0,0 +1,80 @@ +use std::borrow::Cow; +use std::convert::TryInto; +use std::{marker, str}; + +use super::try_split_at; + +/// A codec that encodes a string in front of the value. +/// +/// The usecase is for the facet string levels algorithm where we must know the +/// original string of a normalized facet value, the original values are stored +/// in the value to not break the lexicographical ordering of the LMDB keys. +pub struct FacetStringLevelZeroValueCodec(marker::PhantomData); + +impl<'a, C> heed::BytesDecode<'a> for FacetStringLevelZeroValueCodec +where + C: heed::BytesDecode<'a>, +{ + type DItem = (&'a str, C::DItem); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (string_len, bytes) = try_split_at(bytes, 2)?; + let string_len = string_len.try_into().ok().map(u16::from_be_bytes)?; + + let (string, bytes) = try_split_at(bytes, string_len as usize)?; + let string = str::from_utf8(string).ok()?; + + C::bytes_decode(bytes).map(|item| (string, item)) + } +} + +impl<'a, C> heed::BytesEncode<'a> for FacetStringLevelZeroValueCodec +where + C: heed::BytesEncode<'a>, +{ + type EItem = (&'a str, C::EItem); + + fn bytes_encode((string, value): &'a Self::EItem) -> Option> { + let string_len: u16 = string.len().try_into().ok()?; + let value_bytes = C::bytes_encode(&value)?; + + let mut bytes = Vec::with_capacity(2 + string.len() + value_bytes.len()); + bytes.extend_from_slice(&string_len.to_be_bytes()); + bytes.extend_from_slice(string.as_bytes()); + bytes.extend_from_slice(&value_bytes[..]); + + Some(Cow::Owned(bytes)) + } +} + +#[cfg(test)] +mod tests { + use heed::types::Unit; + use heed::{BytesDecode, BytesEncode}; + use roaring::RoaringBitmap; + + use super::*; + use crate::CboRoaringBitmapCodec; + + #[test] + fn deserialize_roaring_bitmaps() { + let string = "abc"; + let docids: RoaringBitmap = (0..100).chain(3500..4398).collect(); + let key = (string, docids.clone()); + let bytes = + FacetStringLevelZeroValueCodec::::bytes_encode(&key).unwrap(); + let (out_string, out_docids) = + FacetStringLevelZeroValueCodec::::bytes_decode(&bytes).unwrap(); + assert_eq!((out_string, out_docids), (string, docids)); + } + + #[test] + fn deserialize_unit() { + let string = "def"; + let key = (string, ()); + let bytes = FacetStringLevelZeroValueCodec::::bytes_encode(&key).unwrap(); + let (out_string, out_unit) = + FacetStringLevelZeroValueCodec::::bytes_decode(&bytes).unwrap(); + assert_eq!((out_string, out_unit), (string, ())); + } +} diff --git a/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs b/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs index 6161118b6..337433c2b 100644 --- a/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs +++ b/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs @@ -2,7 +2,9 @@ use std::borrow::Cow; use std::convert::TryInto; use std::{marker, str}; -/// A codec that encodes two strings in front of the value. +use super::try_split_at; + +/// A codec that optionally encodes two strings in front of the value. /// /// The usecase is for the facet string levels algorithm where we must /// know the origin of a group, the group left and right bounds are stored @@ -79,16 +81,6 @@ where } } -/// Tries to split a slice in half at the given middle point, -/// `None` if the slice is too short. -fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { - if slice.len() >= mid { - Some(slice.split_at(mid)) - } else { - None - } -} - #[cfg(test)] mod tests { use heed::types::Unit; diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index ecab7eb7c..a6a805bf7 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -1,6 +1,7 @@ mod facet_level_value_f64_codec; mod facet_level_value_u32_codec; mod facet_string_level_zero_codec; +mod facet_string_level_zero_value_codec; mod facet_string_zero_bounds_value_codec; mod field_doc_id_facet_f64_codec; mod field_doc_id_facet_string_codec; @@ -8,6 +9,17 @@ mod field_doc_id_facet_string_codec; pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec; pub use self::facet_string_level_zero_codec::FacetStringLevelZeroCodec; +pub use self::facet_string_level_zero_value_codec::FacetStringLevelZeroValueCodec; pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec; pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; + +/// Tries to split a slice in half at the given middle point, +/// `None` if the slice is too short. +pub fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { + if slice.len() >= mid { + Some(slice.split_at(mid)) + } else { + None + } +} diff --git a/milli/src/index.rs b/milli/src/index.rs index efc31ab46..f26643de7 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -11,8 +11,8 @@ use roaring::RoaringBitmap; use crate::error::{FieldIdMapMissingEntry, InternalError, UserError}; use crate::fields_ids_map::FieldsIdsMap; use crate::heed_codec::facet::{ - FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FieldDocIdFacetF64Codec, - FieldDocIdFacetStringCodec, + FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, + FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, }; use crate::{ default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, @@ -90,8 +90,9 @@ pub struct Index { /// Maps the facet field id, level and the number with the docids that corresponds to it. pub facet_id_f64_docids: Database, - /// Maps the facet field id and the string with the docids that corresponds to it. - pub facet_id_string_docids: Database, + /// Maps the facet field id and the string with the original string and docids that corresponds to it. + pub facet_id_string_docids: + Database>, /// Maps the document id, the facet field id and the numbers. pub field_id_docid_facet_f64s: Database, diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index d81f20732..4436d4cda 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -1,5 +1,6 @@ use std::mem::size_of; +use concat_arrays::concat_arrays; use heed::types::{ByteSlice, Str, Unit}; use roaring::RoaringBitmap; @@ -43,7 +44,10 @@ pub struct FacetDistinctIter<'a> { impl<'a> FacetDistinctIter<'a> { fn facet_string_docids(&self, key: &str) -> heed::Result> { - self.index.facet_id_string_docids.get(self.txn, &(self.distinct, key)) + self.index + .facet_id_string_docids + .get(self.txn, &(self.distinct, key)) + .map(|result| result.map(|(_original, docids)| docids)) } fn facet_number_docids(&self, key: f64) -> heed::Result> { @@ -116,10 +120,7 @@ impl<'a> FacetDistinctIter<'a> { } fn facet_values_prefix_key(distinct: FieldId, id: DocumentId) -> [u8; FID_SIZE + DOCID_SIZE] { - let mut key = [0; FID_SIZE + DOCID_SIZE]; - key[0..FID_SIZE].copy_from_slice(&distinct.to_be_bytes()); - key[FID_SIZE..].copy_from_slice(&id.to_be_bytes()); - key + concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes()) } fn facet_number_values<'a>( diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index ae3fdb91e..e7dc52a82 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -47,7 +47,7 @@ mod test { let mut documents = Vec::new(); - let txts = ["toto", "titi", "tata"]; + let txts = ["Toto", "Titi", "Tata"]; let cats = (1..10).map(|i| i.to_string()).collect::>(); let cat_ints = (1..10).collect::>(); @@ -90,7 +90,6 @@ mod test { addition.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); addition.update_format(UpdateFormat::Json); - addition.execute(JSON.to_string().as_bytes(), |_, _| ()).unwrap(); let fields_map = index.fields_ids_map(&txn).unwrap(); diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 7c9acf276..94f875dfc 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -23,7 +23,7 @@ const MAX_VALUES_BY_FACET: usize = 1000; /// Threshold on the number of candidates that will make /// the system to choose between one algorithm or another. -const CANDIDATES_THRESHOLD: u64 = 35_000; +const CANDIDATES_THRESHOLD: u64 = 3000; pub struct FacetDistribution<'a> { facets: Option>, @@ -72,6 +72,7 @@ impl<'a> FacetDistribution<'a> { FacetType::Number => { let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect(); + let distribution_prelength = distribution.len(); let db = self.index.field_id_docid_facet_f64s; for docid in candidates.into_iter() { key_buffer.truncate(mem::size_of::()); @@ -84,6 +85,9 @@ impl<'a> FacetDistribution<'a> { for result in iter { let ((_, _, value), ()) = result?; *distribution.entry(value.to_string()).or_insert(0) += 1; + if distribution.len() - distribution_prelength == self.max_values_by_facet { + break; + } } } } @@ -106,6 +110,10 @@ impl<'a> FacetDistribution<'a> { .entry(normalized_value) .or_insert_with(|| (original_value, 0)); *count += 1; + + if normalized_distribution.len() == self.max_values_by_facet { + break; + } } } @@ -154,10 +162,10 @@ impl<'a> FacetDistribution<'a> { FacetStringIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?; for result in iter { - let (value, mut docids) = result?; + let (_normalized, original, mut docids) = result?; docids &= candidates; if !docids.is_empty() { - distribution.insert(value.to_string(), docids.len()); + distribution.insert(original.to_string(), docids.len()); } if distribution.len() == self.max_values_by_facet { break; @@ -193,14 +201,20 @@ impl<'a> FacetDistribution<'a> { .prefix_iter(self.rtxn, &field_id.to_be_bytes())? .remap_key_type::(); + let mut normalized_distribution = BTreeMap::new(); for result in iter { - let ((_, value), docids) = result?; - distribution.insert(value.to_string(), docids.len()); + let ((_, normalized_value), (original_value, docids)) = result?; + normalized_distribution.insert(normalized_value, (original_value, docids.len())); if distribution.len() == self.max_values_by_facet { break; } } + let iter = normalized_distribution + .into_iter() + .map(|(_normalized, (original, count))| (original.to_string(), count)); + distribution.extend(iter); + Ok(distribution) } diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index f0b527104..40ea8c04a 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -135,7 +135,8 @@ use heed::{Database, LazyDecode, RoRange}; use roaring::RoaringBitmap; use crate::heed_codec::facet::{ - FacetLevelValueU32Codec, FacetStringLevelZeroCodec, FacetStringZeroBoundsValueCodec, + FacetLevelValueU32Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, + FacetStringZeroBoundsValueCodec, }; use crate::heed_codec::CboRoaringBitmapCodec; use crate::{FieldId, Index}; @@ -209,7 +210,11 @@ impl<'t> Iterator for FacetStringGroupRange<'t> { /// /// It yields the facet string and the roaring bitmap associated with it. pub struct FacetStringLevelZeroRange<'t> { - iter: RoRange<'t, FacetStringLevelZeroCodec, CboRoaringBitmapCodec>, + iter: RoRange< + 't, + FacetStringLevelZeroCodec, + FacetStringLevelZeroValueCodec, + >, } impl<'t> FacetStringLevelZeroRange<'t> { @@ -252,18 +257,23 @@ impl<'t> FacetStringLevelZeroRange<'t> { let iter = db .remap_key_type::() .range(rtxn, &(left_bound, right_bound))? - .remap_types::(); + .remap_types::< + FacetStringLevelZeroCodec, + FacetStringLevelZeroValueCodec + >(); Ok(FacetStringLevelZeroRange { iter }) } } impl<'t> Iterator for FacetStringLevelZeroRange<'t> { - type Item = heed::Result<(&'t str, RoaringBitmap)>; + type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; fn next(&mut self) -> Option { match self.iter.next() { - Some(Ok(((_fid, value), docids))) => Some(Ok((value, docids))), + Some(Ok(((_fid, normalized), (original, docids)))) => { + Some(Ok((normalized, original, docids))) + } Some(Err(e)) => Some(Err(e)), None => None, } @@ -326,7 +336,7 @@ impl<'t> FacetStringIter<'t> { } impl<'t> Iterator for FacetStringIter<'t> { - type Item = heed::Result<(&'t str, RoaringBitmap)>; + type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; fn next(&mut self) -> Option { 'outer: loop { @@ -377,11 +387,11 @@ impl<'t> Iterator for FacetStringIter<'t> { // level zero only for result in last { match result { - Ok((value, mut docids)) => { + Ok((normalized, original, mut docids)) => { docids &= &*documents_ids; if !docids.is_empty() { *documents_ids -= &docids; - return Some(Ok((value, docids))); + return Some(Ok((normalized, original, docids))); } } Err(e) => return Some(Err(e)), diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index c5ecb5a79..cc108f855 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -17,7 +17,9 @@ use self::Operator::*; use super::parser::{FilterParser, Rule, PREC_CLIMBER}; use super::FacetNumberRange; use crate::error::UserError; -use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetStringLevelZeroCodec}; +use crate::heed_codec::facet::{ + FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, +}; use crate::{CboRoaringBitmapCodec, FieldId, FieldsIdsMap, Index, Result}; #[derive(Debug, Clone, PartialEq)] @@ -363,7 +365,10 @@ impl FilterCondition { rtxn: &heed::RoTxn, index: &Index, numbers_db: heed::Database, - strings_db: heed::Database, + strings_db: heed::Database< + FacetStringLevelZeroCodec, + FacetStringLevelZeroValueCodec, + >, field_id: FieldId, operator: &Operator, ) -> Result { @@ -374,7 +379,8 @@ impl FilterCondition { GreaterThan(val) => (Excluded(*val), Included(f64::MAX)), GreaterThanOrEqual(val) => (Included(*val), Included(f64::MAX)), Equal(number, string) => { - let string_docids = strings_db.get(rtxn, &(field_id, &string))?.unwrap_or_default(); + let (_original_value, string_docids) = + strings_db.get(rtxn, &(field_id, &string))?.unwrap_or_default(); let number_docids = match number { Some(n) => { let n = Included(*n); diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index e9c1e507a..bcb7d7580 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -9,6 +9,7 @@ use serde_json::Value; use super::ClearDocuments; use crate::error::{FieldIdMapMissingEntry, InternalError, UserError}; +use crate::heed_codec::facet::FacetStringLevelZeroValueCodec; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; use crate::{DocumentId, ExternalDocumentsIds, FieldId, Index, Result, SmallString32, BEU32}; @@ -374,13 +375,13 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { drop(iter); // We delete the documents ids that are under the facet field id values. - remove_docids_from_facet_field_id_value_docids( + remove_docids_from_facet_field_id_number_docids( self.wtxn, facet_id_f64_docids, &self.documents_ids, )?; - remove_docids_from_facet_field_id_value_docids( + remove_docids_from_facet_field_id_string_docids( self.wtxn, facet_id_string_docids, &self.documents_ids, @@ -447,7 +448,33 @@ where Ok(()) } -fn remove_docids_from_facet_field_id_value_docids<'a, C>( +fn remove_docids_from_facet_field_id_string_docids<'a, C>( + wtxn: &'a mut heed::RwTxn, + db: &heed::Database>, + to_remove: &RoaringBitmap, +) -> heed::Result<()> +where + C: heed::BytesDecode<'a> + heed::BytesEncode<'a>, +{ + let mut iter = db.remap_key_type::().iter_mut(wtxn)?; + while let Some(result) = iter.next() { + let (bytes, (original_value, mut docids)) = result?; + let previous_len = docids.len(); + docids -= to_remove; + if docids.is_empty() { + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; + } else if docids.len() != previous_len { + let bytes = bytes.to_owned(); + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&bytes, &(original_value, docids))? }; + } + } + + Ok(()) +} + +fn remove_docids_from_facet_field_id_number_docids<'a, C>( wtxn: &'a mut heed::RwTxn, db: &heed::Database, to_remove: &RoaringBitmap, diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index d3bba6d6e..cb9a90f7e 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -12,7 +12,7 @@ use roaring::RoaringBitmap; use crate::error::InternalError; use crate::heed_codec::facet::{ FacetLevelValueF64Codec, FacetLevelValueU32Codec, FacetStringLevelZeroCodec, - FacetStringZeroBoundsValueCodec, + FacetStringLevelZeroValueCodec, FacetStringZeroBoundsValueCodec, }; use crate::heed_codec::CboRoaringBitmapCodec; use crate::update::index_documents::{ @@ -75,7 +75,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { )?; // Compute and store the faceted strings documents ids. - let string_documents_ids = compute_faceted_documents_ids( + let string_documents_ids = compute_faceted_strings_documents_ids( self.wtxn, self.index.facet_id_string_docids.remap_key_type::(), field_id, @@ -96,7 +96,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { clear_field_number_levels(self.wtxn, self.index.facet_id_f64_docids, field_id)?; // Compute and store the faceted numbers documents ids. - let number_documents_ids = compute_faceted_documents_ids( + let number_documents_ids = compute_faceted_numbers_documents_ids( self.wtxn, self.index.facet_id_f64_docids.remap_key_type::(), field_id, @@ -237,13 +237,26 @@ fn write_number_entry( Ok(()) } -fn compute_faceted_documents_ids( +fn compute_faceted_strings_documents_ids( + rtxn: &heed::RoTxn, + db: heed::Database>, + field_id: FieldId, +) -> Result { + let mut documents_ids = RoaringBitmap::new(); + for result in db.prefix_iter(rtxn, &field_id.to_be_bytes())? { + let (_key, (_original_value, docids)) = result?; + documents_ids |= docids; + } + + Ok(documents_ids) +} + +fn compute_faceted_numbers_documents_ids( rtxn: &heed::RoTxn, db: heed::Database, field_id: FieldId, ) -> Result { let mut documents_ids = RoaringBitmap::new(); - for result in db.prefix_iter(rtxn, &field_id.to_be_bytes())? { let (_key, docids) = result?; documents_ids |= docids; @@ -265,7 +278,10 @@ fn clear_field_string_levels<'t>( fn compute_facet_string_levels<'t>( rtxn: &'t heed::RoTxn, - db: heed::Database, + db: heed::Database< + FacetStringLevelZeroCodec, + FacetStringLevelZeroValueCodec, + >, compression_type: CompressionType, compression_level: Option, shrink_size: Option, @@ -299,7 +315,7 @@ fn compute_facet_string_levels<'t>( // Because we know the size of the level 0 we can use a range iterator that starts // at the first value of the level and goes to the last by simply counting. for (i, result) in db.range(rtxn, &((field_id, "")..))?.take(first_level_size).enumerate() { - let ((_field_id, value), docids) = result?; + let ((_field_id, value), (_original_value, docids)) = result?; if i == 0 { left = (i as u32, value); diff --git a/milli/src/update/index_documents/merge_function.rs b/milli/src/update/index_documents/merge_function.rs index 8613a8824..7e5d0b581 100644 --- a/milli/src/update/index_documents/merge_function.rs +++ b/milli/src/update/index_documents/merge_function.rs @@ -2,8 +2,11 @@ use std::borrow::Cow; use std::result::Result as StdResult; use fst::IntoStreamer; +use heed::{BytesDecode, BytesEncode}; use roaring::RoaringBitmap; +use crate::error::SerializationError; +use crate::heed_codec::facet::FacetStringLevelZeroValueCodec; use crate::heed_codec::CboRoaringBitmapCodec; use crate::Result; @@ -69,6 +72,26 @@ pub fn roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result Ok(vec) } +/// Uses the FacetStringLevelZeroValueCodec to merge the values. +pub fn tuple_string_cbo_roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result> { + let (head, tail) = values.split_first().unwrap(); + let (head_string, mut head_rb) = + FacetStringLevelZeroValueCodec::::bytes_decode(&head[..]) + .ok_or(SerializationError::Decoding { db_name: None })?; + + for value in tail { + let (_string, rb) = + FacetStringLevelZeroValueCodec::::bytes_decode(&value[..]) + .ok_or(SerializationError::Decoding { db_name: None })?; + head_rb |= rb; + } + + FacetStringLevelZeroValueCodec::::bytes_encode(&(head_string, head_rb)) + .map(|cow| cow.into_owned()) + .ok_or(SerializationError::Encoding { db_name: None }) + .map_err(Into::into) +} + pub fn cbo_roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result> { let (head, tail) = values.split_first().unwrap(); let mut head = CboRoaringBitmapCodec::deserialize_from(&head[..])?; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 9ac05fe1a..efe16def7 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -20,6 +20,7 @@ use serde::{Deserialize, Serialize}; pub use self::merge_function::{ cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge, + tuple_string_cbo_roaring_bitmap_merge, }; use self::store::{Readers, Store}; pub use self::transform::{Transform, TransformOutput}; @@ -655,7 +656,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.wtxn, *self.index.facet_id_string_docids.as_polymorph(), facet_field_strings_docids_readers, - cbo_roaring_bitmap_merge, + tuple_string_cbo_roaring_bitmap_merge, write_method, )?; diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 1538295f9..444b11e31 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -22,12 +22,13 @@ use tempfile::tempfile; use super::merge_function::{ cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge, + tuple_string_cbo_roaring_bitmap_merge, }; use super::{create_sorter, create_writer, writer_into_reader, MergeFn}; use crate::error::{Error, InternalError, SerializationError}; use crate::heed_codec::facet::{ - FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FieldDocIdFacetF64Codec, - FieldDocIdFacetStringCodec, + FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, + FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, }; use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; use crate::update::UpdateIndexingStep; @@ -153,7 +154,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { max_memory, ); let facet_field_strings_docids_sorter = create_sorter( - cbo_roaring_bitmap_merge, + tuple_string_cbo_roaring_bitmap_merge, chunk_compression_type, chunk_compression_level, chunk_fusing_shrink_size, @@ -528,17 +529,18 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Error: From, { let mut key_buffer = Vec::new(); - let mut data_buffer = Vec::new(); for ((field_id, normalized_value), (original_value, docids)) in iter { key_buffer.clear(); - data_buffer.clear(); FacetStringLevelZeroCodec::serialize_into(field_id, &normalized_value, &mut key_buffer); - CboRoaringBitmapCodec::serialize_into(&docids, &mut data_buffer); + + let data = (original_value.as_str(), docids); + let data = FacetStringLevelZeroValueCodec::::bytes_encode(&data) + .ok_or(SerializationError::Encoding { db_name: Some("facet-id-string-docids") })?; if lmdb_key_valid_size(&key_buffer) { - sorter.insert(&key_buffer, &data_buffer)?; + sorter.insert(&key_buffer, &data)?; } else { warn!("facet value {:?} is too large to be saved", original_value); }