From 81fe3eaa0971e0d655554c348bb9d9802608c996 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 19 Oct 2023 10:38:58 +0200 Subject: [PATCH 1/3] Rename FieldIdWordCountDocids correctly --- milli/src/update/index_documents/extract/mod.rs | 2 +- milli/src/update/index_documents/typed_chunk.rs | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 3eb46e4a7..f2f2a33d4 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -166,7 +166,7 @@ pub(crate) fn data_from_obkv_documents( lmdb_writer_sx.clone(), extract_fid_word_count_docids, merge_cbo_roaring_bitmaps, - TypedChunk::FieldIdWordcountDocids, + TypedChunk::FieldIdWordCountDocids, "field-id-wordcount-docids", ); diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index c83769fb5..96fbc4b46 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -27,7 +27,7 @@ pub(crate) enum TypedChunk { FieldIdDocidFacetStrings(grenad::Reader), FieldIdDocidFacetNumbers(grenad::Reader), Documents(grenad::Reader), - FieldIdWordcountDocids(grenad::Reader), + FieldIdWordCountDocids(grenad::Reader), NewDocumentsIds(RoaringBitmap), WordDocids { word_docids_reader: grenad::Reader, @@ -58,7 +58,7 @@ impl TypedChunk { TypedChunk::Documents(grenad) => { format!("Documents {{ number_of_entries: {} }}", grenad.len()) } - TypedChunk::FieldIdWordcountDocids(grenad) => { + TypedChunk::FieldIdWordCountDocids(grenad) => { format!("FieldIdWordcountDocids {{ number_of_entries: {} }}", grenad.len()) } TypedChunk::NewDocumentsIds(grenad) => { @@ -126,7 +126,7 @@ pub(crate) fn write_typed_chunk_into_index( index.documents.remap_types::().put(wtxn, key, value)?; } } - TypedChunk::FieldIdWordcountDocids(fid_word_count_docids_iter) => { + TypedChunk::FieldIdWordCountDocids(fid_word_count_docids_iter) => { append_entries_into_database( fid_word_count_docids_iter, &index.field_id_word_count_docids, @@ -478,7 +478,7 @@ where while let Some((key, value)) = cursor.move_on_next()? { if valid_lmdb_key(key) { debug_assert!( - K::bytes_decode(&key).is_some(), + K::bytes_decode(key).is_some(), "Couldn't decode key with the database decoder, key length: {} - key bytes: {:x?}", key.len(), &key From 87f8cb8b628d3c24d17483387a3e61916286ffef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 19 Oct 2023 10:47:00 +0200 Subject: [PATCH 2/3] Introduce a function to only serialize the Add side of a DelAdd obkv --- .../src/update/index_documents/typed_chunk.rs | 31 +++++++++++++------ 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 96fbc4b46..8ce1c1d8c 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -19,6 +19,7 @@ use crate::distance::NDotProductPoint; use crate::error::UserError; use crate::facet::FacetType; use crate::index::Hnsw; +use crate::update::del_add::{DelAdd, KvReaderDelAdd}; use crate::update::facet::FacetsUpdate; use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at}; use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, BEU32}; @@ -132,7 +133,7 @@ pub(crate) fn write_typed_chunk_into_index( &index.field_id_word_count_docids, wtxn, index_is_empty, - |value, _buffer| Ok(value), + deladd_serialize_add_side, merge_cbo_roaring_bitmaps, )?; is_merged_database = true; @@ -151,7 +152,7 @@ pub(crate) fn write_typed_chunk_into_index( &index.word_docids, wtxn, index_is_empty, - |value, _buffer| Ok(value), + deladd_serialize_add_side, merge_cbo_roaring_bitmaps, )?; @@ -161,7 +162,7 @@ pub(crate) fn write_typed_chunk_into_index( &index.exact_word_docids, wtxn, index_is_empty, - |value, _buffer| Ok(value), + deladd_serialize_add_side, merge_cbo_roaring_bitmaps, )?; @@ -171,7 +172,7 @@ pub(crate) fn write_typed_chunk_into_index( &index.word_fid_docids, wtxn, index_is_empty, - |value, _buffer| Ok(value), + deladd_serialize_add_side, merge_cbo_roaring_bitmaps, )?; @@ -193,7 +194,7 @@ pub(crate) fn write_typed_chunk_into_index( &index.word_position_docids, wtxn, index_is_empty, - |value, _buffer| Ok(value), + deladd_serialize_add_side, merge_cbo_roaring_bitmaps, )?; is_merged_database = true; @@ -214,7 +215,7 @@ pub(crate) fn write_typed_chunk_into_index( &index.facet_id_exists_docids, wtxn, index_is_empty, - |value, _buffer| Ok(value), + deladd_serialize_add_side, merge_cbo_roaring_bitmaps, )?; is_merged_database = true; @@ -225,7 +226,7 @@ pub(crate) fn write_typed_chunk_into_index( &index.facet_id_is_null_docids, wtxn, index_is_empty, - |value, _buffer| Ok(value), + deladd_serialize_add_side, merge_cbo_roaring_bitmaps, )?; is_merged_database = true; @@ -236,7 +237,7 @@ pub(crate) fn write_typed_chunk_into_index( &index.facet_id_is_empty_docids, wtxn, index_is_empty, - |value, _buffer| Ok(value), + deladd_serialize_add_side, merge_cbo_roaring_bitmaps, )?; is_merged_database = true; @@ -247,7 +248,7 @@ pub(crate) fn write_typed_chunk_into_index( &index.word_pair_proximity_docids, wtxn, index_is_empty, - |value, _buffer| Ok(value), + deladd_serialize_add_side, merge_cbo_roaring_bitmaps, )?; is_merged_database = true; @@ -320,7 +321,7 @@ pub(crate) fn write_typed_chunk_into_index( let found = vector.len(); let expected = *expected_dimensions.get_or_insert(found); if expected != found { - return Err(UserError::InvalidVectorDimensions { expected, found })?; + return Err(UserError::InvalidVectorDimensions { expected, found }.into()); } points.push(NDotProductPoint::new(vector)); @@ -398,6 +399,16 @@ fn merge_cbo_roaring_bitmaps( )?) } +/// A function that extracts and returns the Add side of a DelAdd obkv. +/// This is useful when there are no previous value in the database and +/// therefore we don't need to do a diff with what's already there. +/// +/// If there is no Add side we currently write an empty buffer +/// which is a valid CboRoaringBitmap. +fn deladd_serialize_add_side<'a>(obkv: &'a [u8], _buffer: &mut Vec) -> Result<&'a [u8]> { + Ok(KvReaderDelAdd::new(obkv).get(DelAdd::Addition).unwrap_or_default()) +} + /// Write provided entries in database using serialize_value function. /// merge_values function is used if an entry already exist in the database. fn write_entries_into_database( From 4941543dfe8e4da87e5e18a8e83cc8fd164ce48d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 19 Oct 2023 11:18:30 +0200 Subject: [PATCH 3/3] Introduce the CboRoaringBitmapCodec merge_deladd_into and use it --- .../cbo_roaring_bitmap_codec.rs | 23 ++++++++++ .../src/update/index_documents/typed_chunk.rs | 45 ++++++++++--------- 2 files changed, 48 insertions(+), 20 deletions(-) diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index 79b52695e..117da1308 100644 --- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -6,6 +6,7 @@ use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt}; use roaring::RoaringBitmap; use crate::heed_codec::BytesDecodeOwned; +use crate::update::del_add::{DelAdd, KvReaderDelAdd}; /// This is the limit where using a byteorder became less size efficient /// than using a direct roaring encoding, it is also the point where we are able @@ -99,6 +100,28 @@ impl CboRoaringBitmapCodec { Ok(()) } + + /// Merges a DelAdd delta into a CboRoaringBitmap. + pub fn merge_deladd_into( + deladd: KvReaderDelAdd<'_>, + previous: &[u8], + buffer: &mut Vec, + ) -> io::Result<()> { + // Deserialize the bitmap that is already there + let mut previous = Self::deserialize_from(previous)?; + + // Remove integers we no more want in the previous bitmap + if let Some(value) = deladd.get(DelAdd::Deletion) { + previous -= Self::deserialize_from(value)?; + } + + // Insert the new integers we want in the previous bitmap + if let Some(value) = deladd.get(DelAdd::Addition) { + previous |= Self::deserialize_from(value)?; + } + + previous.serialize_into(buffer) + } } impl heed::BytesDecode<'_> for CboRoaringBitmapCodec { diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 8ce1c1d8c..dc679a174 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -134,7 +134,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, deladd_serialize_add_side, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, )?; is_merged_database = true; } @@ -153,7 +153,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, deladd_serialize_add_side, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, )?; let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?; @@ -163,7 +163,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, deladd_serialize_add_side, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, )?; let word_fid_docids_iter = unsafe { as_cloneable_grenad(&word_fid_docids_reader) }?; @@ -173,7 +173,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, deladd_serialize_add_side, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, )?; // create fst from word docids @@ -195,7 +195,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, deladd_serialize_add_side, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, )?; is_merged_database = true; } @@ -216,7 +216,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, deladd_serialize_add_side, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, )?; is_merged_database = true; } @@ -227,7 +227,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, deladd_serialize_add_side, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, )?; is_merged_database = true; } @@ -238,7 +238,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, deladd_serialize_add_side, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, )?; is_merged_database = true; } @@ -249,7 +249,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, deladd_serialize_add_side, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, )?; is_merged_database = true; } @@ -388,17 +388,6 @@ fn merge_word_docids_reader_into_fst( Ok(builder.into_set()) } -fn merge_cbo_roaring_bitmaps( - new_value: &[u8], - db_value: &[u8], - buffer: &mut Vec, -) -> Result<()> { - Ok(CboRoaringBitmapCodec::merge_into( - &[Cow::Borrowed(db_value), Cow::Borrowed(new_value)], - buffer, - )?) -} - /// A function that extracts and returns the Add side of a DelAdd obkv. /// This is useful when there are no previous value in the database and /// therefore we don't need to do a diff with what's already there. @@ -409,6 +398,22 @@ fn deladd_serialize_add_side<'a>(obkv: &'a [u8], _buffer: &mut Vec) -> Resul Ok(KvReaderDelAdd::new(obkv).get(DelAdd::Addition).unwrap_or_default()) } +/// A function that merges a DelAdd of bitmao into an already existing bitmap. +/// +/// The first argument is the DelAdd obkv of CboRoaringBitmaps and +/// the second one is the CboRoaringBitmap to merge into. +fn merge_deladd_cbo_roaring_bitmaps( + deladd_obkv: &[u8], + previous: &[u8], + buffer: &mut Vec, +) -> Result<()> { + Ok(CboRoaringBitmapCodec::merge_deladd_into( + KvReaderDelAdd::new(deladd_obkv), + previous, + buffer, + )?) +} + /// Write provided entries in database using serialize_value function. /// merge_values function is used if an entry already exist in the database. fn write_entries_into_database(