From 8628a0c85694c97b407fe08c49f6a3280cd22d21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 7 Jun 2023 10:02:21 +0200 Subject: [PATCH] Remove docid_word_positions_db + fix deletion bug That would happen when a word was deleted from all exact attributes but not all regular attributes. --- milli/src/index.rs | 12 +- milli/src/lib.rs | 46 ------ milli/src/snapshot_tests.rs | 9 -- milli/src/update/clear_documents.rs | 3 - milli/src/update/delete_documents.rs | 146 ++++++++++-------- .../src/update/index_documents/extract/mod.rs | 2 - .../helpers/merge_functions.rs | 5 - .../src/update/index_documents/helpers/mod.rs | 4 +- milli/src/update/index_documents/mod.rs | 11 +- .../src/update/index_documents/typed_chunk.rs | 34 +--- 10 files changed, 95 insertions(+), 177 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index 9ea7b628c..0d74e0732 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -21,10 +21,9 @@ use crate::heed_codec::facet::{ }; use crate::heed_codec::{ScriptLanguageCodec, StrBEU16Codec, StrRefCodec}; use crate::{ - default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, - DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, - FieldIdWordCountCodec, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, - Search, U8StrStrCodec, BEU16, BEU32, + default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, + FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, GeoPoint, ObkvCodec, + Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, BEU16, BEU32, }; pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5; @@ -111,9 +110,6 @@ pub struct Index { /// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed. pub exact_word_prefix_docids: Database, - /// Maps a word and a document id (u32) to all the positions where the given word appears. - pub docid_word_positions: Database, - /// Maps the proximity between a pair of words with all the docids where this relation appears. pub word_pair_proximity_docids: Database, /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears. @@ -177,7 +173,6 @@ impl Index { let word_prefix_docids = env.create_database(&mut wtxn, Some(WORD_PREFIX_DOCIDS))?; let exact_word_prefix_docids = env.create_database(&mut wtxn, Some(EXACT_WORD_PREFIX_DOCIDS))?; - let docid_word_positions = env.create_database(&mut wtxn, Some(DOCID_WORD_POSITIONS))?; let word_pair_proximity_docids = env.create_database(&mut wtxn, Some(WORD_PAIR_PROXIMITY_DOCIDS))?; let script_language_docids = @@ -220,7 +215,6 @@ impl Index { exact_word_docids, word_prefix_docids, exact_word_prefix_docids, - docid_word_positions, word_pair_proximity_docids, script_language_docids, word_prefix_pair_proximity_docids, diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 48699e76f..e7acdde2c 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -5,52 +5,6 @@ #[global_allocator] pub static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; -// #[cfg(test)] -// pub mod allocator { -// use std::alloc::{GlobalAlloc, System}; -// use std::sync::atomic::{self, AtomicI64}; - -// #[global_allocator] -// pub static ALLOC: CountingAlloc = CountingAlloc { -// max_resident: AtomicI64::new(0), -// resident: AtomicI64::new(0), -// allocated: AtomicI64::new(0), -// }; - -// pub struct CountingAlloc { -// pub max_resident: AtomicI64, -// pub resident: AtomicI64, -// pub allocated: AtomicI64, -// } -// unsafe impl GlobalAlloc for CountingAlloc { -// unsafe fn alloc(&self, layout: std::alloc::Layout) -> *mut u8 { -// self.allocated.fetch_add(layout.size() as i64, atomic::Ordering::SeqCst); -// let old_resident = -// self.resident.fetch_add(layout.size() as i64, atomic::Ordering::SeqCst); - -// let resident = old_resident + layout.size() as i64; -// self.max_resident.fetch_max(resident, atomic::Ordering::SeqCst); - -// // if layout.size() > 1_000_000 { -// // eprintln!( -// // "allocating {} with new resident size: {resident}", -// // layout.size() / 1_000_000 -// // ); -// // // let trace = std::backtrace::Backtrace::capture(); -// // // let t = trace.to_string(); -// // // eprintln!("{t}"); -// // } - -// System.alloc(layout) -// } - -// unsafe fn dealloc(&self, ptr: *mut u8, layout: std::alloc::Layout) { -// self.resident.fetch_sub(layout.size() as i64, atomic::Ordering::Relaxed); -// System.dealloc(ptr, layout) -// } -// } -// } - #[macro_use] pub mod documents; diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index b70bea496..25c1088b9 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -89,7 +89,6 @@ Create a snapshot test of the given database. - `exact_word_docids` - `word_prefix_docids` - `exact_word_prefix_docids` - - `docid_word_positions` - `word_pair_proximity_docids` - `word_prefix_pair_proximity_docids` - `word_position_docids` @@ -217,11 +216,6 @@ pub fn snap_exact_word_prefix_docids(index: &Index) -> String { &format!("{s:<16} {}", display_bitmap(&b)) }) } -pub fn snap_docid_word_positions(index: &Index) -> String { - make_db_snap_from_iter!(index, docid_word_positions, |((idx, s), b)| { - &format!("{idx:<6} {s:<16} {}", display_bitmap(&b)) - }) -} pub fn snap_word_pair_proximity_docids(index: &Index) -> String { make_db_snap_from_iter!(index, word_pair_proximity_docids, |((proximity, word1, word2), b)| { &format!("{proximity:<2} {word1:<16} {word2:<16} {}", display_bitmap(&b)) @@ -477,9 +471,6 @@ macro_rules! full_snap_of_db { ($index:ident, exact_word_prefix_docids) => {{ $crate::snapshot_tests::snap_exact_word_prefix_docids(&$index) }}; - ($index:ident, docid_word_positions) => {{ - $crate::snapshot_tests::snap_docid_word_positions(&$index) - }}; ($index:ident, word_pair_proximity_docids) => {{ $crate::snapshot_tests::snap_word_pair_proximity_docids(&$index) }}; diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 147643bad..04119c641 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -23,7 +23,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { exact_word_docids, word_prefix_docids, exact_word_prefix_docids, - docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, prefix_word_pair_proximity_docids, @@ -80,7 +79,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { exact_word_docids.clear(self.wtxn)?; word_prefix_docids.clear(self.wtxn)?; exact_word_prefix_docids.clear(self.wtxn)?; - docid_word_positions.clear(self.wtxn)?; word_pair_proximity_docids.clear(self.wtxn)?; word_prefix_pair_proximity_docids.clear(self.wtxn)?; prefix_word_pair_proximity_docids.clear(self.wtxn)?; @@ -141,7 +139,6 @@ mod tests { assert!(index.word_docids.is_empty(&rtxn).unwrap()); assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap()); - assert!(index.docid_word_positions.is_empty(&rtxn).unwrap()); assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap()); assert!(index.field_id_word_count_docids.is_empty(&rtxn).unwrap()); assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap()); diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 311f93f8f..60cd41e8a 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -1,5 +1,5 @@ use std::collections::btree_map::Entry; -use std::collections::{HashMap, HashSet}; +use std::collections::{BTreeSet, HashMap, HashSet}; use fst::IntoStreamer; use heed::types::{ByteSlice, DecodeIgnore, Str, UnalignedSlice}; @@ -15,8 +15,7 @@ use crate::facet::FacetType; use crate::heed_codec::facet::FieldDocIdFacetCodec; use crate::heed_codec::CboRoaringBitmapCodec; use crate::{ - ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, RoaringBitmapCodec, - SmallString32, BEU32, + ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, RoaringBitmapCodec, BEU32, }; pub struct DeleteDocuments<'t, 'u, 'i> { @@ -232,7 +231,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { exact_word_docids, word_prefix_docids, exact_word_prefix_docids, - docid_word_positions, word_pair_proximity_docids, field_id_word_count_docids, word_prefix_pair_proximity_docids, @@ -251,23 +249,9 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { facet_id_is_empty_docids, documents, } = self.index; - - // Retrieve the words contained in the documents. - let mut words = Vec::new(); + // Remove from the documents database for docid in &self.to_delete_docids { documents.delete(self.wtxn, &BEU32::new(docid))?; - - // We iterate through the words positions of the document id, retrieve the word and delete the positions. - // We create an iterator to be able to get the content and delete the key-value itself. - // It's faster to acquire a cursor to get and delete, as we avoid traversing the LMDB B-Tree two times but only once. - let mut iter = docid_word_positions.prefix_iter_mut(self.wtxn, &(docid, ""))?; - while let Some(result) = iter.next() { - let ((_docid, word), _positions) = result?; - // This boolean will indicate if we must remove this word from the words FST. - words.push((SmallString32::from(word), false)); - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - } } // We acquire the current external documents ids map... // Note that its soft-deleted document ids field will be equal to the `to_delete_docids` @@ -278,42 +262,27 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { let new_external_documents_ids = new_external_documents_ids.into_static(); self.index.put_external_documents_ids(self.wtxn, &new_external_documents_ids)?; - // Maybe we can improve the get performance of the words - // if we sort the words first, keeping the LMDB pages in cache. - words.sort_unstable(); - + let mut words_to_keep = BTreeSet::default(); + let mut words_to_delete = BTreeSet::default(); // We iterate over the words and delete the documents ids // from the word docids database. - for (word, must_remove) in &mut words { - remove_from_word_docids( - self.wtxn, - word_docids, - word.as_str(), - must_remove, - &self.to_delete_docids, - )?; - - remove_from_word_docids( - self.wtxn, - exact_word_docids, - word.as_str(), - must_remove, - &self.to_delete_docids, - )?; - } + remove_from_word_docids( + self.wtxn, + word_docids, + &self.to_delete_docids, + &mut words_to_keep, + &mut words_to_delete, + )?; + remove_from_word_docids( + self.wtxn, + exact_word_docids, + &self.to_delete_docids, + &mut words_to_keep, + &mut words_to_delete, + )?; // We construct an FST set that contains the words to delete from the words FST. - let words_to_delete = - words.iter().filter_map( - |(word, must_remove)| { - if *must_remove { - Some(word.as_str()) - } else { - None - } - }, - ); - let words_to_delete = fst::Set::from_iter(words_to_delete)?; + let words_to_delete = fst::Set::from_iter(words_to_delete.difference(&words_to_keep))?; let new_words_fst = { // We retrieve the current words FST from the database. @@ -532,23 +501,24 @@ fn remove_from_word_prefix_docids( fn remove_from_word_docids( txn: &mut heed::RwTxn, db: &heed::Database, - word: &str, - must_remove: &mut bool, to_remove: &RoaringBitmap, + words_to_keep: &mut BTreeSet, + words_to_remove: &mut BTreeSet, ) -> Result<()> { // We create an iterator to be able to get the content and delete the word docids. // It's faster to acquire a cursor to get and delete or put, as we avoid traversing // the LMDB B-Tree two times but only once. - let mut iter = db.prefix_iter_mut(txn, word)?; - if let Some((key, mut docids)) = iter.next().transpose()? { - if key == word { - let previous_len = docids.len(); - docids -= to_remove; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - *must_remove = true; - } else if docids.len() != previous_len { + let mut iter = db.iter_mut(txn)?; + while let Some((key, mut docids)) = iter.next().transpose()? { + let previous_len = docids.len(); + docids -= to_remove; + if docids.is_empty() { + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; + words_to_remove.insert(key.to_owned()); + } else { + words_to_keep.insert(key.to_owned()); + if docids.len() != previous_len { let key = key.to_owned(); // safety: we don't keep references from inside the LMDB database. unsafe { iter.put_current(&key, &docids)? }; @@ -627,7 +597,7 @@ mod tests { use super::*; use crate::index::tests::TempIndex; - use crate::{db_snap, Filter}; + use crate::{db_snap, Filter, Search}; fn delete_documents<'t>( wtxn: &mut RwTxn<'t, '_>, @@ -1199,4 +1169,52 @@ mod tests { DeletionStrategy::AlwaysSoft, ); } + + #[test] + fn delete_words_exact_attributes() { + let index = TempIndex::new(); + + index + .update_settings(|settings| { + settings.set_primary_key(S("id")); + settings.set_searchable_fields(vec![S("text"), S("exact")]); + settings.set_exact_attributes(vec![S("exact")].into_iter().collect()); + }) + .unwrap(); + + index + .add_documents(documents!([ + { "id": 0, "text": "hello" }, + { "id": 1, "exact": "hello"} + ])) + .unwrap(); + db_snap!(index, word_docids, 1, @r###" + hello [0, ] + "###); + db_snap!(index, exact_word_docids, 1, @r###" + hello [1, ] + "###); + db_snap!(index, words_fst, 1, @"300000000000000001084cfcfc2ce1000000016000000090ea47f"); + + let mut wtxn = index.write_txn().unwrap(); + let deleted_internal_ids = + delete_documents(&mut wtxn, &index, &["1"], DeletionStrategy::AlwaysHard); + wtxn.commit().unwrap(); + + db_snap!(index, word_docids, 2, @r###" + hello [0, ] + "###); + db_snap!(index, exact_word_docids, 2, @""); + db_snap!(index, words_fst, 2, @"300000000000000001084cfcfc2ce1000000016000000090ea47f"); + + insta::assert_snapshot!(format!("{deleted_internal_ids:?}"), @"[1]"); + let txn = index.read_txn().unwrap(); + let words = index.words_fst(&txn).unwrap().into_stream().into_strs().unwrap(); + insta::assert_snapshot!(format!("{words:?}"), @r###"["hello"]"###); + + let mut s = Search::new(&txn, &index); + s.query("hello"); + let crate::SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]"); + } } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 3df8321bc..632f568ab 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -325,8 +325,6 @@ fn send_and_extract_flattened_documents_data( // send docid_word_positions_chunk to DB writer let docid_word_positions_chunk = unsafe { as_cloneable_grenad(&docid_word_positions_chunk)? }; - let _ = lmdb_writer_sx - .send(Ok(TypedChunk::DocidWordPositions(docid_word_positions_chunk.clone()))); let _ = lmdb_writer_sx.send(Ok(TypedChunk::ScriptLanguageDocids(script_language_pair))); diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs index 7b8891a7a..64bee95df 100644 --- a/milli/src/update/index_documents/helpers/merge_functions.rs +++ b/milli/src/update/index_documents/helpers/merge_functions.rs @@ -4,7 +4,6 @@ use std::result::Result as StdResult; use roaring::RoaringBitmap; -use super::read_u32_ne_bytes; use crate::heed_codec::CboRoaringBitmapCodec; use crate::update::index_documents::transform::Operation; use crate::Result; @@ -22,10 +21,6 @@ pub fn concat_u32s_array<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result RoaringBitmap { - read_u32_ne_bytes(slice).collect() -} - pub fn serialize_roaring_bitmap(bitmap: &RoaringBitmap, buffer: &mut Vec) -> io::Result<()> { buffer.clear(); buffer.reserve(bitmap.serialized_size()); diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index ce6a2abe9..95e497af4 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -14,8 +14,8 @@ pub use grenad_helpers::{ }; pub use merge_functions::{ concat_u32s_array, keep_first, keep_latest_obkv, merge_cbo_roaring_bitmaps, - merge_obkvs_and_operations, merge_roaring_bitmaps, merge_two_obkvs, - roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, MergeFn, + merge_obkvs_and_operations, merge_roaring_bitmaps, merge_two_obkvs, serialize_roaring_bitmap, + MergeFn, }; use crate::MAX_WORD_LENGTH; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index bbfa1d00c..e45d927c8 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -2471,11 +2471,11 @@ mod tests { { "id": 3, "text": "a a a a a a a a a a a a a a a a a - a a a a a a a a a a a a a a a a a a a a a a a a a a - a a a a a a a a a a a a a a a a a a a a a a a a a a - a a a a a a a a a a a a a a a a a a a a a a a a a a - a a a a a a a a a a a a a a a a a a a a a a a a a a - a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a a " } ])) @@ -2513,6 +2513,5 @@ mod tests { db_snap!(index, word_fid_docids, 3, @"4c2e2a1832e5802796edc1638136d933"); db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f"); - db_snap!(index, docid_word_positions, 3, @"5287245332627675740b28bd46e1cde1"); } } diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 53f6d807a..89b10bffe 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -7,24 +7,19 @@ use std::io; use charabia::{Language, Script}; use grenad::MergerBuilder; use heed::types::ByteSlice; -use heed::{BytesDecode, RwTxn}; +use heed::RwTxn; use roaring::RoaringBitmap; use super::helpers::{ - self, merge_ignore_values, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, - valid_lmdb_key, CursorClonableMmap, + self, merge_ignore_values, serialize_roaring_bitmap, valid_lmdb_key, CursorClonableMmap, }; use super::{ClonableMmap, MergeFn}; use crate::facet::FacetType; use crate::update::facet::FacetsUpdate; use crate::update::index_documents::helpers::as_cloneable_grenad; -use crate::{ - lat_lng_to_xyz, BoRoaringBitmapCodec, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, - Result, -}; +use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result}; pub(crate) enum TypedChunk { - DocidWordPositions(grenad::Reader), FieldIdDocidFacetStrings(grenad::Reader), FieldIdDocidFacetNumbers(grenad::Reader), Documents(grenad::Reader), @@ -56,29 +51,6 @@ pub(crate) fn write_typed_chunk_into_index( ) -> Result<(RoaringBitmap, bool)> { let mut is_merged_database = false; match typed_chunk { - TypedChunk::DocidWordPositions(docid_word_positions_iter) => { - write_entries_into_database( - docid_word_positions_iter, - &index.docid_word_positions, - wtxn, - index_is_empty, - |value, buffer| { - // ensure that values are unique and ordered - let positions = roaring_bitmap_from_u32s_array(value); - BoRoaringBitmapCodec::serialize_into(&positions, buffer); - Ok(buffer) - }, - |new_values, db_values, buffer| { - let new_values = roaring_bitmap_from_u32s_array(new_values); - let positions = match BoRoaringBitmapCodec::bytes_decode(db_values) { - Some(db_values) => new_values | db_values, - None => new_values, // should not happen - }; - BoRoaringBitmapCodec::serialize_into(&positions, buffer); - Ok(()) - }, - )?; - } TypedChunk::Documents(obkv_documents_iter) => { let mut cursor = obkv_documents_iter.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? {