From 9452fabfb2ed590db1a7bde089c87e9b41f5a561 Mon Sep 17 00:00:00 2001 From: many Date: Thu, 26 Aug 2021 15:56:24 +0200 Subject: [PATCH] Optimize cbo roaring bitmaps merge --- .../cbo_roaring_bitmap_codec.rs | 76 +++++++++++++++++++ .../helpers/merge_functions.rs | 53 ++----------- .../src/update/index_documents/typed_chunk.rs | 15 +++- 3 files changed, 93 insertions(+), 51 deletions(-) diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index 53f64d648..c0e984d44 100644 --- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -52,6 +52,46 @@ impl CboRoaringBitmapCodec { RoaringBitmap::deserialize_from(bytes) } } + + /// Merge serialized CboRoaringBitmaps in a buffer. + /// + /// if the merged values len is under the threshold, + /// values are directly serialized in the buffer; + /// else a RoaringBitmap is created from the values and is serialized in the buffer. + pub fn merge_into(slices: &[Cow<[u8]>], buffer: &mut Vec) -> io::Result<()> { + let mut roaring = RoaringBitmap::new(); + let mut vec = Vec::new(); + + for bytes in slices { + if bytes.len() <= THRESHOLD * size_of::() { + let mut reader = bytes.as_ref(); + while let Ok(integer) = reader.read_u32::() { + vec.push(integer); + } + } else { + roaring |= RoaringBitmap::deserialize_from(bytes.as_ref())?; + } + } + + if roaring.is_empty() { + vec.sort_unstable(); + vec.dedup(); + + if vec.len() <= THRESHOLD { + for integer in vec { + buffer.extend_from_slice(&integer.to_ne_bytes()); + } + } else { + let roaring = RoaringBitmap::from_sorted_iter(vec.into_iter()); + roaring.serialize_into(buffer)?; + } + } else { + roaring.extend(vec); + roaring.serialize_into(buffer)?; + } + + Ok(()) + } } impl heed::BytesDecode<'_> for CboRoaringBitmapCodec { @@ -106,4 +146,40 @@ mod tests { assert!(roaring_size > bo_size); } + + #[test] + fn merge_cbo_roaring_bitmaps() { + let mut buffer = Vec::new(); + + let small_data = vec![ + RoaringBitmap::from_sorted_iter(1..4), + RoaringBitmap::from_sorted_iter(2..5), + RoaringBitmap::from_sorted_iter(4..6), + RoaringBitmap::from_sorted_iter(1..3), + ]; + + let small_data: Vec<_> = + small_data.iter().map(|b| CboRoaringBitmapCodec::bytes_encode(b).unwrap()).collect(); + CboRoaringBitmapCodec::merge_into(small_data.as_slice(), &mut buffer).unwrap(); + let bitmap = CboRoaringBitmapCodec::deserialize_from(&buffer).unwrap(); + let expected = RoaringBitmap::from_sorted_iter(1..6); + assert_eq!(bitmap, expected); + + let medium_data = vec![ + RoaringBitmap::from_sorted_iter(1..4), + RoaringBitmap::from_sorted_iter(2..5), + RoaringBitmap::from_sorted_iter(4..8), + RoaringBitmap::from_sorted_iter(0..3), + RoaringBitmap::from_sorted_iter(7..23), + ]; + + let medium_data: Vec<_> = + medium_data.iter().map(|b| CboRoaringBitmapCodec::bytes_encode(b).unwrap()).collect(); + buffer.clear(); + CboRoaringBitmapCodec::merge_into(medium_data.as_slice(), &mut buffer).unwrap(); + + let bitmap = CboRoaringBitmapCodec::deserialize_from(&buffer).unwrap(); + let expected = RoaringBitmap::from_sorted_iter(0..23); + assert_eq!(bitmap, expected); + } } diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs index 6a592e54d..c5385e347 100644 --- a/milli/src/update/index_documents/helpers/merge_functions.rs +++ b/milli/src/update/index_documents/helpers/merge_functions.rs @@ -120,52 +120,11 @@ pub fn merge_cbo_roaring_bitmaps<'a>( _key: &[u8], values: &[Cow<'a, [u8]>], ) -> Result> { - match values.split_first().unwrap() { - (head, []) => Ok(head.clone()), - (head, tail) => { - let mut head = CboRoaringBitmapCodec::deserialize_from(&head[..])?; - - for value in tail { - head |= CboRoaringBitmapCodec::deserialize_from(&value[..])?; - } - - let mut vec = Vec::new(); - CboRoaringBitmapCodec::serialize_into(&head, &mut vec); - Ok(Cow::from(vec)) - } + if values.len() == 1 { + Ok(values[0].clone()) + } else { + let mut vec = Vec::new(); + CboRoaringBitmapCodec::merge_into(values, &mut vec)?; + Ok(Cow::from(vec)) } } - -// /// Uses the FacetStringLevelZeroValueCodec to merge the values. -// pub fn tuple_string_cbo_roaring_bitmap_merge<'a>( -// _key: &[u8], -// values: &[Cow<[u8]>], -// ) -> Result> { -// let (head, tail) = values.split_first().unwrap(); -// let (head_string, mut head_rb) = FacetStringLevelZeroValueCodec::bytes_decode(&head[..]) -// .ok_or(SerializationError::Decoding { db_name: None })?; - -// for value in tail { -// let (_string, rb) = FacetStringLevelZeroValueCodec::bytes_decode(&value[..]) -// .ok_or(SerializationError::Decoding { db_name: None })?; -// head_rb |= rb; -// } - -// FacetStringLevelZeroValueCodec::bytes_encode(&(head_string, head_rb)) -// .map(|cow| cow.into_owned()) -// .ok_or(SerializationError::Encoding { db_name: None }) -// .map_err(Into::into) -// } - -// pub fn cbo_roaring_bitmap_merge<'a>(_key: &[u8], values: &[Cow<[u8]>]) -> Result> { -// let (head, tail) = values.split_first().unwrap(); -// let mut head = CboRoaringBitmapCodec::deserialize_from(&head[..])?; - -// for value in tail { -// head |= CboRoaringBitmapCodec::deserialize_from(&value[..])?; -// } - -// let mut vec = Vec::new(); -// CboRoaringBitmapCodec::serialize_into(&head, &mut vec); -// Ok(vec) -// } diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 84333addb..c3c71bbf4 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -188,15 +188,22 @@ fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec Ok(serialize_roaring_bitmap(&value, buffer)?) } +use std::borrow::Cow; + fn merge_cbo_roaring_bitmaps( new_value: &[u8], db_value: &[u8], buffer: &mut Vec, ) -> Result<()> { - let new_value = CboRoaringBitmapCodec::deserialize_from(new_value)?; - let db_value = CboRoaringBitmapCodec::deserialize_from(db_value)?; - let value = new_value | db_value; - Ok(CboRoaringBitmapCodec::serialize_into(&value, buffer)) + Ok(CboRoaringBitmapCodec::merge_into( + &[Cow::Borrowed(db_value), Cow::Borrowed(new_value)], + buffer, + )?) + + // let new_value = CboRoaringBitmapCodec::deserialize_from(new_value)?; + // let db_value = CboRoaringBitmapCodec::deserialize_from(db_value)?; + // let value = new_value | db_value; + // Ok(CboRoaringBitmapCodec::serialize_into(&value, buffer)) } /// Write provided entries in database using serialize_value function.