diff --git a/Cargo.lock b/Cargo.lock index b62a61f92..4417af63a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4377,12 +4377,6 @@ dependencies = [ "winreg", ] -[[package]] -name = "retain_mut" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c31b5c4033f8fdde8700e4657be2c497e7288f01515be52168c631e2e4d4086" - [[package]] name = "ring" version = "0.17.8" @@ -4400,13 +4394,12 @@ dependencies = [ [[package]] name = "roaring" -version = "0.10.2" +version = "0.10.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6106b5cf8587f5834158895e9715a3c6c9716c8aefab57f1f7680917191c7873" +checksum = "7699249cc2c7d71939f30868f47e9d7add0bdc030d90ee10bfd16887ff8bb1c8" dependencies = [ "bytemuck", "byteorder", - "retain_mut", "serde", ] diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 7bb874060..a8bb5055e 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -47,6 +47,12 @@ pub struct FacetGroupValue { pub bitmap: RoaringBitmap, } +#[derive(Debug)] +pub struct FacetGroupLazyValue<'b> { + pub size: u8, + pub bitmap_bytes: &'b [u8], +} + pub struct FacetGroupKeyCodec { _phantom: PhantomData, } @@ -69,6 +75,7 @@ where Ok(Cow::Owned(v)) } } + impl<'a, T> heed::BytesDecode<'a> for FacetGroupKeyCodec where T: BytesDecode<'a>, @@ -84,6 +91,7 @@ where } pub struct FacetGroupValueCodec; + impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec { type EItem = FacetGroupValue; @@ -93,11 +101,23 @@ impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec { Ok(Cow::Owned(v)) } } + impl<'a> heed::BytesDecode<'a> for FacetGroupValueCodec { type DItem = FacetGroupValue; + fn bytes_decode(bytes: &'a [u8]) -> Result { let size = bytes[0]; let bitmap = CboRoaringBitmapCodec::deserialize_from(&bytes[1..])?; Ok(FacetGroupValue { size, bitmap }) } } + +pub struct FacetGroupLazyValueCodec; + +impl<'a> heed::BytesDecode<'a> for FacetGroupLazyValueCodec { + type DItem = FacetGroupLazyValue<'a>; + + fn bytes_decode(bytes: &'a [u8]) -> Result { + Ok(FacetGroupLazyValue { size: bytes[0], bitmap_bytes: &bytes[1..] }) + } +} diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index 1db518c7d..a04698019 100644 --- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -1,5 +1,5 @@ use std::borrow::Cow; -use std::io; +use std::io::{self, Cursor}; use std::mem::size_of; use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt}; @@ -57,6 +57,24 @@ impl CboRoaringBitmapCodec { } } + pub fn intersection_with_serialized( + mut bytes: &[u8], + other: &RoaringBitmap, + ) -> io::Result { + // See above `deserialize_from` method for implementation details. + if bytes.len() <= THRESHOLD * size_of::() { + let mut bitmap = RoaringBitmap::new(); + while let Ok(integer) = bytes.read_u32::() { + if other.contains(integer) { + bitmap.insert(integer); + } + } + Ok(bitmap) + } else { + other.intersection_with_serialized_unchecked(Cursor::new(bytes)) + } + } + /// Merge serialized CboRoaringBitmaps in a buffer. /// /// if the merged values length is under the threshold, values are directly diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index d993ef2dc..a8aa1a006 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -38,7 +38,7 @@ where field_id, )?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { fd.iterate(candidates, highest_level, first_bound, usize::MAX)?; Ok(()) } else { @@ -81,7 +81,7 @@ where field_id, )?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { // We first fill the heap with values from the highest level let starting_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index e340fbac5..0f8f58771 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -4,9 +4,11 @@ use heed::BytesEncode; use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; -use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::facet::{ + FacetGroupKey, FacetGroupKeyCodec, FacetGroupLazyValueCodec, FacetGroupValueCodec, +}; use crate::heed_codec::BytesRefCodec; -use crate::Result; +use crate::{CboRoaringBitmapCodec, Result}; /// Find all the document ids for which the given field contains a value contained within /// the two bounds. @@ -16,6 +18,7 @@ pub fn find_docids_of_facet_within_bounds<'t, BoundCodec>( field_id: u16, left: &'t Bound<>::EItem>, right: &'t Bound<>::EItem>, + universe: Option<&RoaringBitmap>, docids: &mut RoaringBitmap, ) -> Result<()> where @@ -46,13 +49,15 @@ where } Bound::Unbounded => Bound::Unbounded, }; - let db = db.remap_key_type::>(); - let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, docids }; + let db = db.remap_types::, FacetGroupLazyValueCodec>(); + let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, universe, docids }; let highest_level = get_highest_level(rtxn, db, field_id)?; - if let Some(starting_left_bound) = get_first_facet_value::(rtxn, db, field_id)? { + if let Some(starting_left_bound) = + get_first_facet_value::(rtxn, db, field_id)? + { let rightmost_bound = - Bound::Included(get_last_facet_value::(rtxn, db, field_id)?.unwrap()); // will not fail because get_first_facet_value succeeded + Bound::Included(get_last_facet_value::(rtxn, db, field_id)?.unwrap()); // will not fail because get_first_facet_value succeeded let group_size = usize::MAX; f.run(highest_level, starting_left_bound, rightmost_bound, group_size)?; Ok(()) @@ -64,12 +69,16 @@ where /// Fetch the document ids that have a facet with a value between the two given bounds struct FacetRangeSearch<'t, 'b, 'bitmap> { rtxn: &'t heed::RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupLazyValueCodec>, field_id: u16, left: Bound<&'b [u8]>, right: Bound<&'b [u8]>, + /// The subset of documents ids that are useful for this search. + /// Great performance optimizations can be achieved by only fetching values matching this subset. + universe: Option<&'bitmap RoaringBitmap>, docids: &'bitmap mut RoaringBitmap, } + impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { fn run_level_0(&mut self, starting_left_bound: &'t [u8], group_size: usize) -> Result<()> { let left_key = @@ -104,7 +113,13 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { } if RangeBounds::<&[u8]>::contains(&(self.left, self.right), &key.left_bound) { - *self.docids |= value.bitmap; + *self.docids |= match self.universe { + Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized( + value.bitmap_bytes, + universe, + )?, + None => CboRoaringBitmapCodec::deserialize_from(value.bitmap_bytes)?, + }; } } Ok(()) @@ -195,7 +210,13 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { left_condition && right_condition }; if should_take_whole_group { - *self.docids |= &previous_value.bitmap; + *self.docids |= match self.universe { + Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized( + previous_value.bitmap_bytes, + universe, + )?, + None => CboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?, + }; previous_key = next_key; previous_value = next_value; continue; @@ -291,7 +312,13 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { left_condition && right_condition }; if should_take_whole_group { - *self.docids |= &previous_value.bitmap; + *self.docids |= match self.universe { + Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized( + previous_value.bitmap_bytes, + universe, + )?, + None => CboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?, + }; } else { let level = level - 1; let starting_left_bound = previous_key.left_bound; @@ -365,6 +392,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -384,6 +412,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -418,6 +447,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -439,6 +469,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -474,6 +505,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -499,6 +531,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -537,6 +570,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -556,6 +590,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -571,6 +606,7 @@ mod tests { 0, &Bound::Unbounded, &Bound::Unbounded, + None, &mut docids, ) .unwrap(); @@ -586,6 +622,7 @@ mod tests { 1, &Bound::Unbounded, &Bound::Unbounded, + None, &mut docids, ) .unwrap(); @@ -621,6 +658,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -634,6 +672,7 @@ mod tests { 1, &start, &end, + None, &mut docids, ) .unwrap(); diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index 07fe64510..59a95e5bd 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -36,7 +36,7 @@ pub fn ascending_facet_sort<'t>( candidates: RoaringBitmap, ) -> Result> + 't> { let highest_level = get_highest_level(rtxn, db, field_id)?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; let iter = db.range(rtxn, &(first_key..)).unwrap().take(usize::MAX); diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs index dd2692012..29586e4e4 100644 --- a/milli/src/search/facet/facet_sort_descending.rs +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -19,9 +19,9 @@ pub fn descending_facet_sort<'t>( candidates: RoaringBitmap, ) -> Result> + 't> { let highest_level = get_highest_level(rtxn, db, field_id)?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; - let last_bound = get_last_facet_value::(rtxn, db, field_id)?.unwrap(); + let last_bound = get_last_facet_value::(rtxn, db, field_id)?.unwrap(); let last_key = FacetGroupKey { field_id, level: highest_level, left_bound: last_bound }; let iter = db.rev_range(rtxn, &(first_key..=last_key))?.take(usize::MAX); Ok(itertools::Either::Left(DescendingFacetSort { diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index dbd9538a5..c08abc8e0 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -4,7 +4,7 @@ use std::ops::Bound::{self, Excluded, Included}; use either::Either; pub use filter_parser::{Condition, Error as FPError, FilterCondition, Token}; -use roaring::RoaringBitmap; +use roaring::{MultiOps, RoaringBitmap}; use serde_json::Value; use super::facet_range_search; @@ -224,14 +224,14 @@ impl<'a> Filter<'a> { pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result { // to avoid doing this for each recursive call we're going to do it ONCE ahead of time let filterable_fields = index.filterable_fields(rtxn)?; - - self.inner_evaluate(rtxn, index, &filterable_fields) + self.inner_evaluate(rtxn, index, &filterable_fields, None) } fn evaluate_operator( rtxn: &heed::RoTxn, index: &Index, field_id: FieldId, + universe: Option<&RoaringBitmap>, operator: &Condition<'a>, ) -> Result { let numbers_db = index.facet_id_f64_docids; @@ -291,14 +291,22 @@ impl<'a> Filter<'a> { } Condition::NotEqual(val) => { let operator = Condition::Equal(val.clone()); - let docids = Self::evaluate_operator(rtxn, index, field_id, &operator)?; + let docids = Self::evaluate_operator(rtxn, index, field_id, None, &operator)?; let all_ids = index.documents_ids(rtxn)?; return Ok(all_ids - docids); } }; let mut output = RoaringBitmap::new(); - Self::explore_facet_number_levels(rtxn, numbers_db, field_id, left, right, &mut output)?; + Self::explore_facet_number_levels( + rtxn, + numbers_db, + field_id, + left, + right, + universe, + &mut output, + )?; Ok(output) } @@ -310,6 +318,7 @@ impl<'a> Filter<'a> { field_id: FieldId, left: Bound, right: Bound, + universe: Option<&RoaringBitmap>, output: &mut RoaringBitmap, ) -> Result<()> { match (left, right) { @@ -321,7 +330,7 @@ impl<'a> Filter<'a> { (_, _) => (), } facet_range_search::find_docids_of_facet_within_bounds::( - rtxn, db, field_id, &left, &right, output, + rtxn, db, field_id, &left, &right, universe, output, )?; Ok(()) @@ -332,31 +341,37 @@ impl<'a> Filter<'a> { rtxn: &heed::RoTxn, index: &Index, filterable_fields: &HashSet, + universe: Option<&RoaringBitmap>, ) -> Result { + if universe.map_or(false, |u| u.is_empty()) { + return Ok(RoaringBitmap::new()); + } + match &self.condition { FilterCondition::Not(f) => { - let all_ids = index.documents_ids(rtxn)?; let selected = Self::inner_evaluate( &(f.as_ref().clone()).into(), rtxn, index, filterable_fields, + universe, )?; - Ok(all_ids - selected) + match universe { + Some(universe) => Ok(universe - selected), + None => { + let all_ids = index.documents_ids(rtxn)?; + Ok(all_ids - selected) + } + } } FilterCondition::In { fid, els } => { if crate::is_faceted(fid.value(), filterable_fields) { let field_ids_map = index.fields_ids_map(rtxn)?; - if let Some(fid) = field_ids_map.id(fid.value()) { - let mut bitmap = RoaringBitmap::new(); - - for el in els { - let op = Condition::Equal(el.clone()); - let el_bitmap = Self::evaluate_operator(rtxn, index, fid, &op)?; - bitmap |= el_bitmap; - } - Ok(bitmap) + els.iter() + .map(|el| Condition::Equal(el.clone())) + .map(|op| Self::evaluate_operator(rtxn, index, fid, universe, &op)) + .union() } else { Ok(RoaringBitmap::new()) } @@ -371,7 +386,7 @@ impl<'a> Filter<'a> { if crate::is_faceted(fid.value(), filterable_fields) { let field_ids_map = index.fields_ids_map(rtxn)?; if let Some(fid) = field_ids_map.id(fid.value()) { - Self::evaluate_operator(rtxn, index, fid, op) + Self::evaluate_operator(rtxn, index, fid, universe, op) } else { Ok(RoaringBitmap::new()) } @@ -382,14 +397,11 @@ impl<'a> Filter<'a> { }))? } } - FilterCondition::Or(subfilters) => { - let mut bitmap = RoaringBitmap::new(); - for f in subfilters { - bitmap |= - Self::inner_evaluate(&(f.clone()).into(), rtxn, index, filterable_fields)?; - } - Ok(bitmap) - } + FilterCondition::Or(subfilters) => subfilters + .iter() + .cloned() + .map(|f| Self::inner_evaluate(&f.into(), rtxn, index, filterable_fields, universe)) + .union(), FilterCondition::And(subfilters) => { let mut subfilters_iter = subfilters.iter(); if let Some(first_subfilter) = subfilters_iter.next() { @@ -398,16 +410,21 @@ impl<'a> Filter<'a> { rtxn, index, filterable_fields, + universe, )?; for f in subfilters_iter { if bitmap.is_empty() { return Ok(bitmap); } + // TODO We are doing the intersections two times, + // it could be more efficient + // Can't I just replace this `&=` by an `=`? bitmap &= Self::inner_evaluate( &(f.clone()).into(), rtxn, index, filterable_fields, + Some(&bitmap), )?; } Ok(bitmap) @@ -507,6 +524,7 @@ impl<'a> Filter<'a> { rtxn, index, filterable_fields, + universe, )?; let geo_lng_token = Token::new( @@ -539,6 +557,7 @@ impl<'a> Filter<'a> { rtxn, index, filterable_fields, + universe, )?; let condition_right = FilterCondition::Condition { @@ -552,6 +571,7 @@ impl<'a> Filter<'a> { rtxn, index, filterable_fields, + universe, )?; left | right @@ -567,6 +587,7 @@ impl<'a> Filter<'a> { rtxn, index, filterable_fields, + universe, )? }; diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 34a9cdcb8..858028bb5 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -7,7 +7,7 @@ use roaring::RoaringBitmap; pub use self::facet_distribution::{FacetDistribution, OrderBy, DEFAULT_VALUES_PER_FACET}; pub use self::filter::{BadGeoError, Filter}; pub use self::search::{FacetValueHit, SearchForFacetValues}; -use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec, OrderedF64Codec}; +use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec}; use crate::heed_codec::BytesRefCodec; use crate::{Index, Result}; @@ -54,9 +54,9 @@ pub fn facet_max_value<'t>( } /// Get the first facet value in the facet database -pub(crate) fn get_first_facet_value<'t, BoundCodec>( +pub(crate) fn get_first_facet_value<'t, BoundCodec, DC>( txn: &'t RoTxn, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, DC>, field_id: u16, ) -> heed::Result> where @@ -78,9 +78,9 @@ where } /// Get the last facet value in the facet database -pub(crate) fn get_last_facet_value<'t, BoundCodec>( +pub(crate) fn get_last_facet_value<'t, BoundCodec, DC>( txn: &'t RoTxn, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, DC>, field_id: u16, ) -> heed::Result> where @@ -102,9 +102,9 @@ where } /// Get the height of the highest level in the facet database -pub(crate) fn get_highest_level<'t>( +pub(crate) fn get_highest_level<'t, DC>( txn: &'t RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, DC>, field_id: u16, ) -> heed::Result { let field_id_prefix = &field_id.to_be_bytes(); diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 623c72567..52eb7ffea 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -548,6 +548,7 @@ fn resolve_sort_criteria<'ctx, Query: RankingRuleQueryTrait>( Ok(()) } +#[tracing::instrument(level = "trace", skip_all, target = "search")] pub fn filtered_universe( index: &Index, txn: &RoTxn<'_>,