diff --git a/milli/src/index.rs b/milli/src/index.rs index 5b705e0b2..f8cceb0ef 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -84,8 +84,6 @@ pub mod db_name { pub const EXTERNAL_DOCUMENTS_IDS: &str = "external-documents-ids"; pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions"; pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids"; - pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids"; - pub const PREFIX_WORD_PAIR_PROXIMITY_DOCIDS: &str = "prefix-word-pair-proximity-docids"; pub const WORD_POSITION_DOCIDS: &str = "word-position-docids"; pub const WORD_FIELD_ID_DOCIDS: &str = "word-field-id-docids"; pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids"; @@ -130,10 +128,6 @@ pub struct Index { /// Maps the proximity between a pair of words with all the docids where this relation appears. pub word_pair_proximity_docids: Database, - /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears. - pub word_prefix_pair_proximity_docids: Database, - /// Maps the proximity between a pair of prefix and word with all the docids where this relation appears. - pub prefix_word_pair_proximity_docids: Database, /// Maps the word and the position with the docids that corresponds to it. 
pub word_position_docids: Database, @@ -187,7 +181,7 @@ impl Index { ) -> Result { use db_name::*; - options.max_dbs(26); + options.max_dbs(24); unsafe { options.flag(Flags::MdbAlwaysFreePages) }; let env = options.open(path)?; @@ -204,10 +198,6 @@ impl Index { env.create_database(&mut wtxn, Some(WORD_PAIR_PROXIMITY_DOCIDS))?; let script_language_docids = env.create_database(&mut wtxn, Some(SCRIPT_LANGUAGE_DOCIDS))?; - let word_prefix_pair_proximity_docids = - env.create_database(&mut wtxn, Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?; - let prefix_word_pair_proximity_docids = - env.create_database(&mut wtxn, Some(PREFIX_WORD_PAIR_PROXIMITY_DOCIDS))?; let word_position_docids = env.create_database(&mut wtxn, Some(WORD_POSITION_DOCIDS))?; let word_fid_docids = env.create_database(&mut wtxn, Some(WORD_FIELD_ID_DOCIDS))?; let field_id_word_count_docids = @@ -248,8 +238,6 @@ impl Index { exact_word_prefix_docids, word_pair_proximity_docids, script_language_docids, - word_prefix_pair_proximity_docids, - prefix_word_pair_proximity_docids, word_position_docids, word_fid_docids, word_prefix_position_docids, diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index 3376cebb2..2c670658d 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -11,7 +11,9 @@ use super::interner::Interned; use super::Word; use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec}; use crate::update::{merge_cbo_roaring_bitmaps, MergeFn}; -use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext}; +use crate::{ + CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext, U8StrStrCodec, +}; /// A cache storing pointers to values in the LMDB databases. 
/// @@ -23,7 +25,7 @@ pub struct DatabaseCache<'ctx> { pub word_pair_proximity_docids: FxHashMap<(u8, Interned, Interned), Option>>, pub word_prefix_pair_proximity_docids: - FxHashMap<(u8, Interned, Interned), Option>>, + FxHashMap<(u8, Interned, Interned), Option>, pub prefix_word_pair_proximity_docids: FxHashMap<(u8, Interned, Interned), Option>>, pub word_docids: FxHashMap, Option>>, @@ -295,35 +297,47 @@ impl<'ctx> SearchContext<'ctx> { prefix2: Interned, proximity: u8, ) -> Result> { - DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( - self.txn, - (proximity, word1, prefix2), - &( - proximity, - self.word_interner.get(word1).as_str(), - self.word_interner.get(prefix2).as_str(), - ), - &mut self.db_cache.word_prefix_pair_proximity_docids, - self.index.word_prefix_pair_proximity_docids.remap_data_type::(), - ) + let docids = match self + .db_cache + .word_prefix_pair_proximity_docids + .entry((proximity, word1, prefix2)) + { + Entry::Occupied(docids) => docids.get().clone(), + Entry::Vacant(entry) => { + // compute docids using prefix iter and store the result in the cache. 
+ let key = U8StrStrCodec::bytes_encode(&( + proximity, + self.word_interner.get(word1).as_str(), + self.word_interner.get(prefix2).as_str(), + )) + .unwrap() + .into_owned(); + let mut prefix_docids = RoaringBitmap::new(); + let remap_key_type = self + .index + .word_pair_proximity_docids + .remap_key_type::() + .prefix_iter(self.txn, &key)?; + for result in remap_key_type { + let (_, docids) = result?; + + prefix_docids |= docids; + } + entry.insert(Some(prefix_docids.clone())); + Some(prefix_docids) + } + }; + Ok(docids) } + pub fn get_db_prefix_word_pair_proximity_docids( &mut self, left_prefix: Interned, right: Interned, proximity: u8, ) -> Result> { - DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( - self.txn, - (proximity, left_prefix, right), - &( - proximity, - self.word_interner.get(left_prefix).as_str(), - self.word_interner.get(right).as_str(), - ), - &mut self.db_cache.prefix_word_pair_proximity_docids, - self.index.prefix_word_pair_proximity_docids.remap_data_type::(), - ) + // only accept exact matches on reverted positions + self.get_db_word_pair_proximity_docids(left_prefix, right, proximity) } pub fn get_db_word_fid_docids( diff --git a/milli/src/search/new/tests/proximity.rs b/milli/src/search/new/tests/proximity.rs index 217ebe9b3..2d181a537 100644 --- a/milli/src/search/new/tests/proximity.rs +++ b/milli/src/search/new/tests/proximity.rs @@ -371,7 +371,7 @@ fn test_proximity_prefix_db() { s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed); s.query("best s"); let SearchResult { documents_ids, document_scores, .. 
} = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 6, 7, 8, 11, 15]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 9, 6, 7, 8, 11, 12, 13, 15]"); insta::assert_snapshot!(format!("{document_scores:#?}")); let texts = collect_field_values(&index, &txn, "text", &documents_ids); @@ -379,13 +379,13 @@ fn test_proximity_prefix_db() { insta::assert_debug_snapshot!(texts, @r###" [ "\"this is the best summer meal\"", - "\"summer best\"", "\"this is the best meal of summer\"", - "\"summer x best\"", "\"this is the best meal I have ever had in such a beautiful summer day\"", "\"this is the best cooked meal of the summer\"", "\"this is the best meal of the summer\"", "\"summer x y best\"", + "\"summer x best\"", + "\"summer best\"", "\"this is the best meal I have ever had in such a beautiful winter day\"", ] "###); @@ -423,20 +423,20 @@ fn test_proximity_prefix_db() { s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed); s.query("best win"); let SearchResult { documents_ids, document_scores, .. 
} = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 15, 16, 17, 20]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 18, 15, 16, 17, 20, 21, 22]"); insta::assert_snapshot!(format!("{document_scores:#?}")); let texts = collect_field_values(&index, &txn, "text", &documents_ids); insta::assert_debug_snapshot!(texts, @r###" [ "\"this is the best winter meal\"", - "\"winter best\"", "\"this is the best meal of winter\"", - "\"winter x best\"", "\"this is the best meal I have ever had in such a beautiful winter day\"", "\"this is the best cooked meal of the winter\"", "\"this is the best meal of the winter\"", "\"winter x y best\"", + "\"winter x best\"", + "\"winter best\"", ] "###); @@ -471,20 +471,20 @@ fn test_proximity_prefix_db() { s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed); s.query("best wi"); let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 15, 16, 17, 20]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 18, 15, 16, 17, 20, 21, 22]"); insta::assert_snapshot!(format!("{document_scores:#?}")); let texts = collect_field_values(&index, &txn, "text", &documents_ids); insta::assert_debug_snapshot!(texts, @r###" [ "\"this is the best winter meal\"", - "\"winter best\"", "\"this is the best meal of winter\"", - "\"winter x best\"", "\"this is the best meal I have ever had in such a beautiful winter day\"", "\"this is the best cooked meal of the winter\"", "\"this is the best meal of the winter\"", "\"winter x y best\"", + "\"winter x best\"", + "\"winter best\"", ] "###); } diff --git a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-14.snap b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-14.snap index 8f3b964c1..efcfef7f1 100644 --- 
a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-14.snap +++ b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-14.snap @@ -11,14 +11,6 @@ expression: "format!(\"{document_scores:#?}\")" }, ), ], - [ - Proximity( - Rank { - rank: 3, - max_rank: 4, - }, - ), - ], [ Proximity( Rank { @@ -30,7 +22,15 @@ expression: "format!(\"{document_scores:#?}\")" [ Proximity( Rank { - rank: 2, + rank: 1, + max_rank: 4, + }, + ), + ], + [ + Proximity( + Rank { + rank: 1, max_rank: 4, }, ), diff --git a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-2.snap b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-2.snap index 1ee6bfc91..242bc3424 100644 --- a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-2.snap +++ b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-2.snap @@ -11,14 +11,6 @@ expression: "format!(\"{document_scores:#?}\")" }, ), ], - [ - Proximity( - Rank { - rank: 3, - max_rank: 4, - }, - ), - ], [ Proximity( Rank { @@ -30,7 +22,15 @@ expression: "format!(\"{document_scores:#?}\")" [ Proximity( Rank { - rank: 2, + rank: 1, + max_rank: 4, + }, + ), + ], + [ + Proximity( + Rank { + rank: 1, max_rank: 4, }, ), diff --git a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-8.snap b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-8.snap index 8f3b964c1..efcfef7f1 100644 --- a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-8.snap +++ b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-8.snap @@ -11,14 +11,6 @@ expression: "format!(\"{document_scores:#?}\")" }, ), ], - [ - Proximity( - Rank { - rank: 3, - max_rank: 4, - 
}, - ), - ], [ Proximity( Rank { @@ -30,7 +22,15 @@ expression: "format!(\"{document_scores:#?}\")" [ Proximity( Rank { - rank: 2, + rank: 1, + max_rank: 4, + }, + ), + ], + [ + Proximity( + Rank { + rank: 1, max_rank: 4, }, ), diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index f3f1eb5a5..28c4cb45c 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -219,22 +219,6 @@ pub fn snap_word_pair_proximity_docids(index: &Index) -> String { &format!("{proximity:<2} {word1:<16} {word2:<16} {}", display_bitmap(&b)) }) } -pub fn snap_word_prefix_pair_proximity_docids(index: &Index) -> String { - make_db_snap_from_iter!(index, word_prefix_pair_proximity_docids, |( - (proximity, word1, prefix), - b, - )| { - &format!("{proximity:<2} {word1:<16} {prefix:<4} {}", display_bitmap(&b)) - }) -} -pub fn snap_prefix_word_pair_proximity_docids(index: &Index) -> String { - make_db_snap_from_iter!(index, prefix_word_pair_proximity_docids, |( - (proximity, prefix, word2), - b, - )| { - &format!("{proximity:<2} {prefix:<4} {word2:<16} {}", display_bitmap(&b)) - }) -} pub fn snap_word_position_docids(index: &Index) -> String { make_db_snap_from_iter!(index, word_position_docids, |((word, position), b)| { &format!("{word:<16} {position:<6} {}", display_bitmap(&b)) diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 265c6f15a..afe0191b1 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -26,8 +26,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_prefix_docids, exact_word_prefix_docids, word_pair_proximity_docids, - word_prefix_pair_proximity_docids, - prefix_word_pair_proximity_docids, word_position_docids, word_fid_docids, field_id_word_count_docids, @@ -68,8 +66,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { word_prefix_docids.clear(self.wtxn)?; exact_word_prefix_docids.clear(self.wtxn)?; word_pair_proximity_docids.clear(self.wtxn)?; - 
word_prefix_pair_proximity_docids.clear(self.wtxn)?; - prefix_word_pair_proximity_docids.clear(self.wtxn)?; word_position_docids.clear(self.wtxn)?; word_fid_docids.clear(self.wtxn)?; field_id_word_count_docids.clear(self.wtxn)?; @@ -132,7 +128,6 @@ mod tests { assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap()); assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap()); assert!(index.field_id_word_count_docids.is_empty(&rtxn).unwrap()); - assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap()); assert!(index.facet_id_f64_docids.is_empty(&rtxn).unwrap()); assert!(index.facet_id_string_docids.is_empty(&rtxn).unwrap()); assert!(index.field_id_docid_facet_f64s.is_empty(&rtxn).unwrap()); diff --git a/milli/src/update/del_add.rs b/milli/src/update/del_add.rs index c8b7f0f6a..dc7c0409a 100644 --- a/milli/src/update/del_add.rs +++ b/milli/src/update/del_add.rs @@ -102,3 +102,17 @@ pub fn del_add_from_two_obkvs( pub fn is_noop_del_add_obkv(del_add: KvReaderDelAdd) -> bool { del_add.get(DelAdd::Deletion) == del_add.get(DelAdd::Addition) } + +/// A function that extracts and returns the Add side of a DelAdd obkv. +/// This is useful when there is no previous value in the database and +/// therefore we don't need to do a diff with what's already there. +/// +/// If there is no Add side we currently write an empty buffer +/// which is a valid CboRoaringBitmap.
+#[allow(clippy::ptr_arg)] // required to avoid signature mismatch +pub fn deladd_serialize_add_side<'a>( + obkv: &'a [u8], + _buffer: &mut Vec, +) -> crate::Result<&'a [u8]> { + Ok(KvReaderDelAdd::new(obkv).get(DelAdd::Addition).unwrap_or_default()) +} diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index 4f764ab95..061cbe5a0 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -1,14 +1,12 @@ use std::borrow::Cow; use std::fs::File; use std::io::{self, BufReader, BufWriter, Seek}; -use std::time::Instant; use grenad::{CompressionType, Sorter}; use heed::types::ByteSlice; -use log::debug; use super::{ClonableMmap, MergeFn}; -use crate::error::InternalError; +use crate::update::index_documents::valid_lmdb_key; use crate::Result; pub type CursorClonableMmap = io::Cursor; @@ -240,45 +238,46 @@ pub fn grenad_obkv_into_chunks( Ok(std::iter::from_fn(move || transposer().transpose())) } -pub fn sorter_into_lmdb_database( - wtxn: &mut heed::RwTxn, - database: heed::PolyDatabase, +/// Write provided sorter in database using serialize_value function. +/// merge_values function is used if an entry already exists in the database. +pub fn write_sorter_into_database( sorter: Sorter, - merge: MergeFn, -) -> Result<()> { + database: &heed::Database, + wtxn: &mut heed::RwTxn, + index_is_empty: bool, + serialize_value: FS, + merge_values: FM, +) -> Result<()> +where + FS: for<'a> Fn(&'a [u8], &'a mut Vec) -> Result<&'a [u8]>, + FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec) -> Result>, +{ puffin::profile_function!(); - debug!("Writing MTBL sorter..."); - let before = Instant::now(); + + let mut buffer = Vec::new(); + let database = database.remap_types::(); let mut merger_iter = sorter.into_stream_merger_iter()?; - if database.is_empty(wtxn)?
{ - let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; - while let Some((k, v)) = merger_iter.next()? { - // safety: we don't keep references from inside the LMDB database. - unsafe { out_iter.append(k, v)? }; - } - } else { - while let Some((k, v)) = merger_iter.next()? { - let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?; - match iter.next().transpose()? { - Some((key, old_val)) if key == k => { - let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)]; - let val = merge(k, &vals).map_err(|_| { - // TODO just wrap this error? - InternalError::IndexingMergingKeys { process: "get-put-merge" } - })?; - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(k, &val)? }; + while let Some((key, value)) = merger_iter.next()? { + if valid_lmdb_key(key) { + buffer.clear(); + let value = if index_is_empty { + Some(serialize_value(value, &mut buffer)?) + } else { + match database.get(wtxn, key)? { + Some(prev_value) => merge_values(value, prev_value, &mut buffer)?, + None => Some(serialize_value(value, &mut buffer)?), } - _ => { - drop(iter); - database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; + }; + match value { + Some(value) => database.put(wtxn, key, value)?, + None => { + database.delete(wtxn, key)?; } } } } - debug!("MTBL sorter writen in {:.02?}!", before.elapsed()); Ok(()) } diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs index 98c1c1a04..5d9ca7ef2 100644 --- a/milli/src/update/index_documents/helpers/merge_functions.rs +++ b/milli/src/update/index_documents/helpers/merge_functions.rs @@ -239,3 +239,19 @@ pub fn merge_deladd_cbo_roaring_bitmaps<'a>( output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into) } } + +/// A function that merges a DelAdd of bitmaps into an already existing bitmap.
+/// +/// The first argument is the DelAdd obkv of CboRoaringBitmaps and +/// the second one is the CboRoaringBitmap to merge into. +pub fn merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap<'a>( + deladd_obkv: &[u8], + previous: &[u8], + buffer: &'a mut Vec, +) -> Result> { + Ok(CboRoaringBitmapCodec::merge_deladd_into( + KvReaderDelAdd::new(deladd_obkv), + previous, + buffer, + )?) +} diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index 1f2f8e6ef..841c09543 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -9,12 +9,13 @@ pub use clonable_mmap::{ClonableMmap, CursorClonableMmap}; use fst::{IntoStreamer, Streamer}; pub use grenad_helpers::{ as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, - merge_ignore_values, sorter_into_lmdb_database, sorter_into_reader, writer_into_reader, + merge_ignore_values, sorter_into_reader, write_sorter_into_database, writer_into_reader, GrenadParameters, MergeableReader, }; pub use merge_functions::{ concat_u32s_array, keep_first, keep_latest_obkv, merge_btreeset_string, - merge_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps, merge_roaring_bitmaps, + merge_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_roaring_bitmaps, obkvs_keep_last_addition_merge_deletions, obkvs_merge_additions_and_deletions, serialize_roaring_bitmap, MergeFn, }; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index de40e0b9b..2289666ed 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -23,8 +23,10 @@ use self::enrich::enrich_documents_batch; pub use self::enrich::{extract_finite_float_from_value, validate_geo_from_json, DocumentId}; pub use self::helpers::{ as_cloneable_grenad, create_sorter, create_writer, 
fst_stream_into_hashset, - fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, - sorter_into_lmdb_database, valid_lmdb_key, writer_into_reader, ClonableMmap, MergeFn, + fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, + merge_roaring_bitmaps, valid_lmdb_key, write_sorter_into_database, writer_into_reader, + ClonableMmap, MergeFn, }; use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; @@ -32,13 +34,12 @@ use crate::documents::{obkv_to_object, DocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; pub use crate::update::index_documents::helpers::CursorClonableMmap; use crate::update::{ - IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, WordPrefixDocids, - WordPrefixIntegerDocids, WordsPrefixesFst, + IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst, }; use crate::{CboRoaringBitmapCodec, Index, Result}; static MERGED_DATABASE_COUNT: usize = 7; -static PREFIX_DATABASE_COUNT: usize = 5; +static PREFIX_DATABASE_COUNT: usize = 4; static TOTAL_POSTING_DATABASE_COUNT: usize = MERGED_DATABASE_COUNT + PREFIX_DATABASE_COUNT; #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -411,12 +412,42 @@ where total_databases: TOTAL_POSTING_DATABASE_COUNT, }); + let mut word_position_docids = None; + let mut word_fid_docids = None; + let mut word_docids = None; + let mut exact_word_docids = None; + for result in lmdb_writer_rx { if (self.should_abort)() { return Err(Error::InternalError(InternalError::AbortedIndexation)); } - let typed_chunk = result?; + let typed_chunk = match result? 
{ + TypedChunk::WordDocids { + word_docids_reader, + exact_word_docids_reader, + word_fid_docids_reader, + } => { + let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? }; + word_docids = Some(cloneable_chunk); + let cloneable_chunk = + unsafe { as_cloneable_grenad(&exact_word_docids_reader)? }; + exact_word_docids = Some(cloneable_chunk); + let cloneable_chunk = unsafe { as_cloneable_grenad(&word_fid_docids_reader)? }; + word_fid_docids = Some(cloneable_chunk); + TypedChunk::WordDocids { + word_docids_reader, + exact_word_docids_reader, + word_fid_docids_reader, + } + } + TypedChunk::WordPositionDocids(chunk) => { + let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; + word_position_docids = Some(cloneable_chunk); + TypedChunk::WordPositionDocids(chunk) + } + otherwise => otherwise, + }; // FIXME: return newly added as well as newly deleted documents let (docids, is_merged_database) = @@ -447,17 +478,16 @@ where // We write the primary key field id into the main database self.index.put_primary_key(self.wtxn, &primary_key)?; + let number_of_documents = self.index.number_of_documents(self.wtxn)?; - // TODO: reactivate prefix DB with diff-indexing - // self.execute_prefix_databases( - // word_docids, - // exact_word_docids, - // word_pair_proximity_docids, - // word_position_docids, - // word_fid_docids, - // )?; + self.execute_prefix_databases( + word_docids, + exact_word_docids, + word_position_docids, + word_fid_docids, + )?; - self.index.number_of_documents(self.wtxn) + Ok(number_of_documents) } #[logging_timer::time("IndexDocuments::{}")] @@ -465,7 +495,6 @@ where self, word_docids: Option>, exact_word_docids: Option>, - word_pair_proximity_docids: Option>, word_position_docids: Option>, word_fid_docids: Option>, ) -> Result<()> @@ -586,32 +615,6 @@ where total_databases: TOTAL_POSTING_DATABASE_COUNT, }); - if let Some(word_pair_proximity_docids) = word_pair_proximity_docids { - // Run the word prefix pair proximity docids update 
operation. - PrefixWordPairsProximityDocids::new( - self.wtxn, - self.index, - self.indexer_config.chunk_compression_type, - self.indexer_config.chunk_compression_level, - ) - .execute( - word_pair_proximity_docids, - &new_prefix_fst_words, - &common_prefix_fst_words, - &del_prefix_fst_words, - )?; - } - - if (self.should_abort)() { - return Err(Error::InternalError(InternalError::AbortedIndexation)); - } - - databases_seen += 1; - (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { - databases_seen, - total_databases: TOTAL_POSTING_DATABASE_COUNT, - }); - if let Some(word_position_docids) = word_position_docids { // Run the words prefix position docids update operation. let mut builder = WordPrefixIntegerDocids::new( diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index b53d859cd..90f9b7739 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -13,7 +13,10 @@ use obkv::{KvReader, KvWriter}; use ordered_float::OrderedFloat; use roaring::RoaringBitmap; -use super::helpers::{self, merge_ignore_values, valid_lmdb_key, CursorClonableMmap}; +use super::helpers::{ + self, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_ignore_values, + valid_lmdb_key, CursorClonableMmap, +}; use super::{ClonableMmap, MergeFn}; use crate::distance::NDotProductPoint; use crate::error::UserError; @@ -21,12 +24,11 @@ use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind}; use crate::facet::FacetType; use crate::index::db_name::DOCUMENTS; use crate::index::Hnsw; -use crate::update::del_add::{DelAdd, KvReaderDelAdd}; +use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd}; use crate::update::facet::FacetsUpdate; use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at}; use crate::{ - lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, 
Result, - SerializationError, BEU32, + lat_lng_to_xyz, DocumentId, FieldId, GeoPoint, Index, Result, SerializationError, BEU32, }; pub(crate) enum TypedChunk { @@ -186,7 +188,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, deladd_serialize_add_side, - merge_deladd_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; is_merged_database = true; } @@ -202,7 +204,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, deladd_serialize_add_side, - merge_deladd_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?; @@ -212,7 +214,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, deladd_serialize_add_side, - merge_deladd_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; let word_fid_docids_iter = unsafe { as_cloneable_grenad(&word_fid_docids_reader) }?; @@ -222,7 +224,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, deladd_serialize_add_side, - merge_deladd_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; // create fst from word docids @@ -244,7 +246,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, deladd_serialize_add_side, - merge_deladd_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; is_merged_database = true; } @@ -265,7 +267,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, deladd_serialize_add_side, - merge_deladd_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; is_merged_database = true; } @@ -276,7 +278,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, deladd_serialize_add_side, - merge_deladd_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; is_merged_database = true; } @@ -287,7 +289,7 @@ 
pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, deladd_serialize_add_side, - merge_deladd_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; is_merged_database = true; } @@ -298,7 +300,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, deladd_serialize_add_side, - merge_deladd_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; is_merged_database = true; } @@ -495,33 +497,6 @@ fn merge_word_docids_reader_into_fst( Ok(builder.into_set()) } -/// A function that extracts and returns the Add side of a DelAdd obkv. -/// This is useful when there are no previous value in the database and -/// therefore we don't need to do a diff with what's already there. -/// -/// If there is no Add side we currently write an empty buffer -/// which is a valid CboRoaringBitmap. -#[allow(clippy::ptr_arg)] // required to avoid signature mismatch -fn deladd_serialize_add_side<'a>(obkv: &'a [u8], _buffer: &mut Vec) -> Result<&'a [u8]> { - Ok(KvReaderDelAdd::new(obkv).get(DelAdd::Addition).unwrap_or_default()) -} - -/// A function that merges a DelAdd of bitmao into an already existing bitmap. -/// -/// The first argument is the DelAdd obkv of CboRoaringBitmaps and -/// the second one is the CboRoaringBitmap to merge into. -fn merge_deladd_cbo_roaring_bitmaps<'a>( - deladd_obkv: &[u8], - previous: &[u8], - buffer: &'a mut Vec, -) -> Result> { - Ok(CboRoaringBitmapCodec::merge_deladd_into( - KvReaderDelAdd::new(deladd_obkv), - previous, - buffer, - )?) -} - /// Write provided entries in database using serialize_value function. /// merge_values function is used if an entry already exist in the database. 
fn write_entries_into_database( diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index dd8851ccb..eb2b6e69a 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -8,10 +8,6 @@ pub use self::index_documents::{ MergeFn, }; pub use self::indexer_config::IndexerConfig; -pub use self::prefix_word_pairs::{ - PrefixWordPairsProximityDocids, MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB, - MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB, -}; pub use self::settings::{Setting, Settings}; pub use self::update_step::UpdateIndexingStep; pub use self::word_prefix_docids::WordPrefixDocids; @@ -24,7 +20,6 @@ pub(crate) mod del_add; pub(crate) mod facet; mod index_documents; mod indexer_config; -mod prefix_word_pairs; mod settings; mod update_step; mod word_prefix_docids; diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs deleted file mode 100644 index 320c01461..000000000 --- a/milli/src/update/prefix_word_pairs/mod.rs +++ /dev/null @@ -1,422 +0,0 @@ -use std::borrow::Cow; -use std::collections::HashSet; -use std::io::{BufReader, BufWriter}; - -use grenad::CompressionType; -use heed::types::ByteSlice; - -use super::index_documents::{merge_cbo_roaring_bitmaps, CursorClonableMmap}; -use crate::{Index, Result}; - -mod prefix_word; -mod word_prefix; - -pub use prefix_word::index_prefix_word_database; -pub use word_prefix::index_word_prefix_database; - -pub const MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB: u8 = 4; -pub const MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB: usize = 2; - -pub struct PrefixWordPairsProximityDocids<'t, 'u, 'i> { - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - max_proximity: u8, - max_prefix_length: usize, - chunk_compression_type: CompressionType, - chunk_compression_level: Option, -} -impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> { - pub fn new( - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - chunk_compression_type: CompressionType, - chunk_compression_level: Option, - ) -> Self { 
- Self { - wtxn, - index, - max_proximity: MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB, - max_prefix_length: MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB, - chunk_compression_type, - chunk_compression_level, - } - } - - #[logging_timer::time("WordPrefixPairProximityDocids::{}")] - pub fn execute<'a>( - self, - new_word_pair_proximity_docids: grenad::Reader, - new_prefix_fst_words: &'a [String], - common_prefix_fst_words: &[&'a [String]], - del_prefix_fst_words: &HashSet>, - ) -> Result<()> { - puffin::profile_function!(); - - index_word_prefix_database( - self.wtxn, - self.index.word_pair_proximity_docids, - self.index.word_prefix_pair_proximity_docids, - self.max_proximity, - self.max_prefix_length, - new_word_pair_proximity_docids.clone(), - new_prefix_fst_words, - common_prefix_fst_words, - del_prefix_fst_words, - self.chunk_compression_type, - self.chunk_compression_level, - )?; - - index_prefix_word_database( - self.wtxn, - self.index.word_pair_proximity_docids, - self.index.prefix_word_pair_proximity_docids, - self.max_proximity, - self.max_prefix_length, - new_word_pair_proximity_docids, - new_prefix_fst_words, - common_prefix_fst_words, - del_prefix_fst_words, - self.chunk_compression_type, - self.chunk_compression_level, - )?; - - Ok(()) - } -} - -// This is adapted from `sorter_into_lmdb_database` -pub fn insert_into_database( - wtxn: &mut heed::RwTxn, - database: heed::PolyDatabase, - new_key: &[u8], - new_value: &[u8], -) -> Result<()> { - let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, new_key)?; - match iter.next().transpose()? { - Some((key, old_val)) if new_key == key => { - let val = - merge_cbo_roaring_bitmaps(key, &[Cow::Borrowed(old_val), Cow::Borrowed(new_value)]) - .map_err(|_| { - // TODO just wrap this error? 
- crate::error::InternalError::IndexingMergingKeys { - process: "get-put-merge", - } - })?; - // safety: we use the new_key, not the one from the database iterator, to avoid undefined behaviour - unsafe { iter.put_current(new_key, &val)? }; - } - _ => { - drop(iter); - database.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, new_value)?; - } - } - Ok(()) -} - -// This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`, -// but it uses `append` if the database is empty, and it assumes that the values in the -// writer don't conflict with values in the database. -pub fn write_into_lmdb_database_without_merging( - wtxn: &mut heed::RwTxn, - database: heed::PolyDatabase, - writer: grenad::Writer>, -) -> Result<()> { - let file = writer.into_inner()?.into_inner().map_err(|err| err.into_error())?; - let reader = grenad::Reader::new(BufReader::new(file))?; - if database.is_empty(wtxn)? { - let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; - let mut cursor = reader.into_cursor()?; - while let Some((k, v)) = cursor.move_on_next()? { - // safety: the key comes from the grenad reader, not the database - unsafe { out_iter.append(k, v)? }; - } - } else { - let mut cursor = reader.into_cursor()?; - while let Some((k, v)) = cursor.move_on_next()? 
{ - database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; - } - } - Ok(()) -} - -#[cfg(test)] -mod tests { - use std::io::Cursor; - - use crate::db_snap; - use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; - use crate::index::tests::TempIndex; - use crate::update::IndexDocumentsMethod; - - fn documents_with_enough_different_words_for_prefixes( - prefixes: &[&str], - start_id: usize, - ) -> Vec { - let mut documents = Vec::new(); - let mut id = start_id; - for prefix in prefixes { - for i in 0..50 { - documents.push( - serde_json::json!({ - "id": id, - "text": format!("{prefix}{i:x}"), - }) - .as_object() - .unwrap() - .clone(), - ); - id += 1; - } - } - documents - } - - #[ignore] - #[test] - fn add_new_documents() { - let mut index = TempIndex::new(); - index.index_documents_config.words_prefix_threshold = Some(50); - index.index_documents_config.autogenerate_docids = true; - - index - .update_settings(|settings| { - settings.set_searchable_fields(vec!["text".to_owned()]); - }) - .unwrap(); - - let batch_reader_from_documents = |documents| { - let mut builder = DocumentsBatchBuilder::new(Vec::new()); - for object in documents { - builder.append_json_object(&object).unwrap(); - } - DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() - }; - - let mut documents = documents_with_enough_different_words_for_prefixes(&["a", "be"], 0); - // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database - documents.push( - serde_json::json!({ - "id": "9000", - "text": "At an amazing and beautiful house" - }) - .as_object() - .unwrap() - .clone(), - ); - documents.push( - serde_json::json!({ - "id": "9001", - "text": "The bell rings at 5 am" - }) - .as_object() - .unwrap() - .clone(), - ); - - let documents = batch_reader_from_documents(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, word_prefix_pair_proximity_docids, "initial"); - db_snap!(index, 
prefix_word_pair_proximity_docids, "initial"); - - let mut documents = documents_with_enough_different_words_for_prefixes(&["am", "an"], 100); - documents.push( - serde_json::json!({ - "id": "9002", - "text": "At an extraordinary house" - }) - .as_object() - .unwrap() - .clone(), - ); - let documents = batch_reader_from_documents(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, word_pair_proximity_docids, "update"); - db_snap!(index, word_prefix_pair_proximity_docids, "update"); - db_snap!(index, prefix_word_pair_proximity_docids, "update"); - } - #[ignore] - #[test] - fn batch_bug_3043() { - // https://github.com/meilisearch/meilisearch/issues/3043 - let mut index = TempIndex::new(); - index.index_documents_config.words_prefix_threshold = Some(50); - index.index_documents_config.autogenerate_docids = true; - - index - .update_settings(|settings| { - settings.set_searchable_fields(vec!["text".to_owned()]); - }) - .unwrap(); - - let batch_reader_from_documents = |documents| { - let mut builder = DocumentsBatchBuilder::new(Vec::new()); - for object in documents { - builder.append_json_object(&object).unwrap(); - } - DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() - }; - - let mut documents = documents_with_enough_different_words_for_prefixes(&["y"], 0); - // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database - documents.push( - serde_json::json!({ - "text": "x y" - }) - .as_object() - .unwrap() - .clone(), - ); - documents.push( - serde_json::json!({ - "text": "x a y" - }) - .as_object() - .unwrap() - .clone(), - ); - - let documents = batch_reader_from_documents(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, word_pair_proximity_docids); - db_snap!(index, word_prefix_pair_proximity_docids); - db_snap!(index, prefix_word_pair_proximity_docids); - } - - #[ignore] - #[test] - fn hard_delete_and_reupdate() { - let mut index = 
TempIndex::new(); - index.index_documents_config.words_prefix_threshold = Some(50); - - index - .update_settings(|settings| { - settings.set_primary_key("id".to_owned()); - settings.set_searchable_fields(vec!["text".to_owned()]); - }) - .unwrap(); - - let batch_reader_from_documents = |documents| { - let mut builder = DocumentsBatchBuilder::new(Vec::new()); - for object in documents { - builder.append_json_object(&object).unwrap(); - } - DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() - }; - - let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0); - // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database - documents.push( - serde_json::json!({ - "id": 9000, - "text": "At an amazing and beautiful house" - }) - .as_object() - .unwrap() - .clone(), - ); - documents.push( - serde_json::json!({ - "id": 9001, - "text": "The bell rings at 5 am" - }) - .as_object() - .unwrap() - .clone(), - ); - - let documents = batch_reader_from_documents(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, documents_ids, "initial"); - db_snap!(index, word_docids, "initial"); - db_snap!(index, word_prefix_pair_proximity_docids, "initial"); - db_snap!(index, prefix_word_pair_proximity_docids, "initial"); - - index.delete_document("9000"); - - db_snap!(index, documents_ids, "first_delete"); - db_snap!(index, word_docids, "first_delete"); - db_snap!(index, word_prefix_pair_proximity_docids, "first_delete"); - db_snap!(index, prefix_word_pair_proximity_docids, "first_delete"); - - index.delete_documents((0..50).map(|id| id.to_string()).collect()); - - db_snap!(index, documents_ids, "second_delete"); - db_snap!(index, word_docids, "second_delete"); - db_snap!(index, word_prefix_pair_proximity_docids, "second_delete"); - db_snap!(index, prefix_word_pair_proximity_docids, "second_delete"); - - let documents = 
documents_with_enough_different_words_for_prefixes(&["b"], 1000); - // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database - - index.add_documents(batch_reader_from_documents(documents)).unwrap(); - - db_snap!(index, documents_ids, "reupdate"); - db_snap!(index, word_docids, "reupdate"); - db_snap!(index, word_prefix_pair_proximity_docids, "reupdate"); - db_snap!(index, prefix_word_pair_proximity_docids, "reupdate"); - } - - #[ignore] - #[test] - fn replace_hard_deletion() { - let mut index = TempIndex::new(); - index.index_documents_config.words_prefix_threshold = Some(50); - index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments; - - index - .update_settings(|settings| { - settings.set_primary_key("id".to_owned()); - settings.set_searchable_fields(vec!["text".to_owned()]); - }) - .unwrap(); - - let batch_reader_from_documents = |documents| { - let mut builder = DocumentsBatchBuilder::new(Vec::new()); - for object in documents { - builder.append_json_object(&object).unwrap(); - } - DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() - }; - - let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0); - // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database - documents.push( - serde_json::json!({ - "id": 9000, - "text": "At an amazing house" - }) - .as_object() - .unwrap() - .clone(), - ); - documents.push( - serde_json::json!({ - "id": 9001, - "text": "The bell rings" - }) - .as_object() - .unwrap() - .clone(), - ); - - let documents = batch_reader_from_documents(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, documents_ids, "initial"); - db_snap!(index, word_docids, "initial"); - db_snap!(index, word_prefix_pair_proximity_docids, "initial"); - db_snap!(index, prefix_word_pair_proximity_docids, "initial"); - - let documents = 
documents_with_enough_different_words_for_prefixes(&["b"], 0); - index.add_documents(batch_reader_from_documents(documents)).unwrap(); - - db_snap!(index, documents_ids, "replaced"); - db_snap!(index, word_docids, "replaced"); - db_snap!(index, word_prefix_pair_proximity_docids, "replaced"); - db_snap!(index, prefix_word_pair_proximity_docids, "replaced"); - } -} diff --git a/milli/src/update/prefix_word_pairs/prefix_word.rs b/milli/src/update/prefix_word_pairs/prefix_word.rs deleted file mode 100644 index 1ec66d010..000000000 --- a/milli/src/update/prefix_word_pairs/prefix_word.rs +++ /dev/null @@ -1,182 +0,0 @@ -use std::borrow::Cow; -use std::collections::{BTreeMap, HashSet}; - -use grenad::CompressionType; -use heed::types::ByteSlice; -use heed::BytesDecode; -use log::debug; - -use crate::update::index_documents::{create_writer, CursorClonableMmap}; -use crate::update::prefix_word_pairs::{ - insert_into_database, write_into_lmdb_database_without_merging, -}; -use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec}; - -#[allow(clippy::too_many_arguments)] -#[logging_timer::time] -pub fn index_prefix_word_database( - wtxn: &mut heed::RwTxn, - word_pair_proximity_docids: heed::Database, - prefix_word_pair_proximity_docids: heed::Database, - max_proximity: u8, - max_prefix_length: usize, - new_word_pair_proximity_docids: grenad::Reader, - new_prefix_fst_words: &[String], - common_prefix_fst_words: &[&[String]], - del_prefix_fst_words: &HashSet>, - chunk_compression_type: CompressionType, - chunk_compression_level: Option, -) -> Result<()> { - puffin::profile_function!(); - - let max_proximity = max_proximity - 1; - debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); - - let common_prefixes: Vec<_> = common_prefix_fst_words - .iter() - .flat_map(|s| s.iter()) - .map(|s| s.as_str()) - .filter(|s| s.len() <= max_prefix_length) - .collect(); - - for proximity in 1..max_proximity { - for prefix in 
common_prefixes.iter() { - let mut prefix_key = vec![proximity]; - prefix_key.extend_from_slice(prefix.as_bytes()); - let mut cursor = new_word_pair_proximity_docids.clone().into_prefix_iter(prefix_key)?; - // This is the core of the algorithm - execute_on_word_pairs_and_prefixes( - proximity, - prefix.as_bytes(), - // the next two arguments tell how to iterate over the new word pairs - &mut cursor, - |cursor| { - if let Some((key, value)) = cursor.next()? { - let (_, _, word2) = UncheckedU8StrStrCodec::bytes_decode(key) - .ok_or(heed::Error::Decoding)?; - Ok(Some((word2, value))) - } else { - Ok(None) - } - }, - // and this argument tells what to do with each new key (proximity, prefix, word2) and value (roaring bitmap) - |key, value| { - insert_into_database( - wtxn, - *prefix_word_pair_proximity_docids.as_polymorph(), - key, - value, - ) - }, - )?; - } - } - - // Now we do the same thing with the new prefixes and all word pairs in the DB - let new_prefixes: Vec<_> = new_prefix_fst_words - .iter() - .map(|s| s.as_str()) - .filter(|s| s.len() <= max_prefix_length) - .collect(); - - // Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity) - // element in an intermediary grenad - let mut writer = - create_writer(chunk_compression_type, chunk_compression_level, tempfile::tempfile()?); - - for proximity in 1..max_proximity { - for prefix in new_prefixes.iter() { - let mut prefix_key = vec![proximity]; - prefix_key.extend_from_slice(prefix.as_bytes()); - let mut db_iter = word_pair_proximity_docids - .as_polymorph() - .prefix_iter::<_, ByteSlice, ByteSlice>(wtxn, prefix_key.as_slice())? 
- .remap_key_type::(); - execute_on_word_pairs_and_prefixes( - proximity, - prefix.as_bytes(), - &mut db_iter, - |db_iter| { - db_iter - .next() - .transpose() - .map(|x| x.map(|((_, _, word2), value)| (word2, value))) - .map_err(|e| e.into()) - }, - |key, value| writer.insert(key, value).map_err(|e| e.into()), - )?; - drop(db_iter); - } - } - - // and then we write the grenad into the DB - // Since the grenad contains only new prefixes, we know in advance that none - // of its elements already exist in the DB, thus there is no need to specify - // how to merge conflicting elements - write_into_lmdb_database_without_merging( - wtxn, - *prefix_word_pair_proximity_docids.as_polymorph(), - writer, - )?; - - // All of the word prefix pairs in the database that have a w2 - // that is contained in the `suppr_pw` set must be removed as well. - if !del_prefix_fst_words.is_empty() { - let mut iter = - prefix_word_pair_proximity_docids.remap_data_type::().iter_mut(wtxn)?; - while let Some(((_, prefix, _), _)) = iter.next().transpose()? { - if del_prefix_fst_words.contains(prefix.as_bytes()) { - // Delete this entry as the w2 prefix is no more in the words prefix fst. - unsafe { iter.del_current()? }; - } - } - } - - Ok(()) -} - -/// This is the core of the algorithm to initialise the Prefix Word Pair Proximity Docids database. 
-/// -/// Its arguments are: -/// - an iterator over the words following the given `prefix` with the given `proximity` -/// - a closure to describe how to handle the new computed (proximity, prefix, word2) elements -fn execute_on_word_pairs_and_prefixes( - proximity: u8, - prefix: &[u8], - iter: &mut I, - mut next_word2_and_docids: impl for<'a> FnMut(&'a mut I) -> Result>, - mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>, -) -> Result<()> { - let mut batch: BTreeMap, Vec>> = BTreeMap::default(); - - // Memory usage check: - // The content of the loop will be called for each `word2` that follows a word beginning - // with `prefix` with the given proximity. - // In practice, I don't think the batch can ever get too big. - while let Some((word2, docids)) = next_word2_and_docids(iter)? { - let entry = batch.entry(word2.to_owned()).or_default(); - entry.push(Cow::Owned(docids.to_owned())); - } - - let mut key_buffer = Vec::with_capacity(512); - key_buffer.push(proximity); - key_buffer.extend_from_slice(prefix); - key_buffer.push(0); - - let mut value_buffer = Vec::with_capacity(65_536); - - for (word2, docids) in batch { - key_buffer.truncate(prefix.len() + 2); - value_buffer.clear(); - - key_buffer.extend_from_slice(&word2); - let data = if docids.len() > 1 { - CboRoaringBitmapCodec::merge_into(&docids, &mut value_buffer)?; - value_buffer.as_slice() - } else { - &docids[0] - }; - insert(key_buffer.as_slice(), data)?; - } - Ok(()) -} diff --git a/milli/src/update/prefix_word_pairs/word_prefix.rs b/milli/src/update/prefix_word_pairs/word_prefix.rs deleted file mode 100644 index 570adece9..000000000 --- a/milli/src/update/prefix_word_pairs/word_prefix.rs +++ /dev/null @@ -1,728 +0,0 @@ -/*! 
-The word-prefix-pair-proximity-docids database is a database whose keys are of -the form `(proximity, word, prefix)` and the values are roaring bitmaps of -the documents which contain `word` followed by another word starting with -`prefix` at a distance of `proximity`. - -The prefixes present in this database are only those that correspond to many -different words in the documents. - -## How is it created/updated? (simplified version) -To compute it, we have access to (mainly) two inputs: - -* a list of sorted prefixes, such as: -```text -c -ca -cat -d -do -dog -``` -Note that only prefixes which correspond to more than a certain number of -different words from the database are included in this list. - -* a sorted list of proximities and word pairs (the proximity is the distance between the two words), -associated with a roaring bitmap, such as: -```text -1 good doggo -> docids1: [8] -1 good door -> docids2: [7, 19, 20] -1 good ghost -> docids3: [1] -2 good dog -> docids4: [2, 5, 6] -2 horror cathedral -> docids5: [1, 2] -``` - -I illustrate a simplified version of the algorithm to create the word-prefix -pair-proximity database below: - -1. **Outer loop:** First, we iterate over each proximity and word pair: -```text -proximity: 1 -word1 : good -word2 : doggo -``` -2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are -in the list of sorted prefixes. And we insert the key `prefix` -and the value (`docids`) to a sorted map which we call the “batch”. For example, -at the end of the first outer loop, we may have: -```text -Outer loop 1: ------------------------------- -proximity: 1 -word1 : good -word2 : doggo -docids : docids1 - -prefixes: [d, do, dog] - -batch: [ - d, -> [docids1] - do -> [docids1] - dog -> [docids1] -] -``` -3. 
For illustration purpose, let's run through a second iteration of the outer loop: -```text -Outer loop 2: ------------------------------- -proximity: 1 -word1 : good -word2 : door -docids : docids2 - -prefixes: [d, do, doo] - -batch: [ - d -> [docids1, docids2] - do -> [docids1, docids2] - dog -> [docids1] - doo -> [docids2] -] -``` -Notice that there were some conflicts which were resolved by merging the -conflicting values together. Also, an additional prefix was added at the -end of the batch. - -4. On the third iteration of the outer loop, we have: -```text -Outer loop 3: ------------------------------- -proximity: 1 -word1 : good -word2 : ghost -``` -Because `word2` begins with a different letter than the previous `word2`, -we know that all the prefixes of `word2` are greater than the prefixes of the previous word2 - -Therefore, we know that we can insert every element from the batch into the -database before proceeding any further. This operation is called -“flushing the batch”. Flushing the batch should also be done whenever: -* `proximity` is different than the previous `proximity`. -* `word1` is different than the previous `word1`. -* `word2` starts with a different letter than the previous word2 - -6. **Flushing the batch:** to flush the batch, we iterate over its elements: -```text -Flushing Batch loop 1: ------------------------------- -proximity : 1 -word1 : good -prefix : d - -docids : [docids2, docids3] -``` -We then merge the array of `docids` (of type `Vec>`) using -`merge_cbo_roaring_bitmap` in order to get a single byte vector representing a -roaring bitmap of all the document ids where `word1` is followed by `prefix` -at a distance of `proximity`. -Once we have done that, we insert `(proximity, word1, prefix) -> merged_docids` -into the database. - -7. That's it! ... except... - -## How is it created/updated (continued) - -I lied a little bit about the input data. 
In reality, we get two sets of the -inputs described above, which come from different places: - -* For the list of sorted prefixes, we have: - 1. `new_prefixes`, which are all the prefixes that were not present in the - database before the insertion of the new documents - - 2. `common_prefixes` which are the prefixes that are present both in the - database and in the newly added documents - -* For the list of word pairs and proximities, we have: - 1. `new_word_pairs`, which is the list of word pairs and their proximities - present in the newly added documents - - 2. `word_pairs_db`, which is the list of word pairs from the database. - This list includes all elements in `new_word_pairs` since `new_word_pairs` - was added to the database prior to calling the `WordPrefix::execute` - function. - -To update the prefix database correctly, we call the algorithm described earlier first -on (`common_prefixes`, `new_word_pairs`) and then on (`new_prefixes`, `word_pairs_db`). -Thus: - -1. For all the word pairs that were already present in the DB, we insert them -again with the `new_prefixes`. Calling the algorithm on them with the -`common_prefixes` would not result in any new data. - -2. For all the new word pairs, we insert them twice: first with the `common_prefixes`, -and then, because they are part of `word_pairs_db`, with the `new_prefixes`. - -Note, also, that since we read data from the database when iterating over -`word_pairs_db`, we cannot insert the computed word-prefix-pair-proximity- -docids from the batch directly into the database (we would have a concurrent -reader and writer). Therefore, when calling the algorithm on -`(new_prefixes, word_pairs_db)`, we insert the computed -`((proximity, word, prefix), docids)` elements in an intermediary grenad -Writer instead of the DB. At the end of the outer loop, we finally read from -the grenad and insert its elements in the database. 
-*/ - -use std::borrow::Cow; -use std::collections::HashSet; - -use grenad::CompressionType; -use heed::types::ByteSlice; -use heed::BytesDecode; -use log::debug; - -use crate::update::index_documents::{create_writer, CursorClonableMmap}; -use crate::update::prefix_word_pairs::{ - insert_into_database, write_into_lmdb_database_without_merging, -}; -use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec}; - -#[allow(clippy::too_many_arguments)] -#[logging_timer::time] -pub fn index_word_prefix_database( - wtxn: &mut heed::RwTxn, - word_pair_proximity_docids: heed::Database, - word_prefix_pair_proximity_docids: heed::Database, - max_proximity: u8, - max_prefix_length: usize, - new_word_pair_proximity_docids: grenad::Reader, - new_prefix_fst_words: &[String], - common_prefix_fst_words: &[&[String]], - del_prefix_fst_words: &HashSet>, - chunk_compression_type: CompressionType, - chunk_compression_level: Option, -) -> Result<()> { - puffin::profile_function!(); - debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); - - // Make a prefix trie from the common prefixes that are shorter than self.max_prefix_length - let prefixes = PrefixTrieNode::from_sorted_prefixes( - common_prefix_fst_words - .iter() - .flat_map(|s| s.iter()) - .map(|s| s.as_str()) - .filter(|s| s.len() <= max_prefix_length), - ); - - // If the prefix trie is not empty, then we can iterate over all new - // word pairs to look for new (proximity, word1, common_prefix) elements - // to insert in the DB - if !prefixes.is_empty() { - let mut cursor = new_word_pair_proximity_docids.into_cursor()?; - // This is the core of the algorithm - execute_on_word_pairs_and_prefixes( - // the first two arguments tell how to iterate over the new word pairs - &mut cursor, - |cursor| { - if let Some((key, value)) = cursor.move_on_next()? 
{ - let (proximity, word1, word2) = - UncheckedU8StrStrCodec::bytes_decode(key).ok_or(heed::Error::Decoding)?; - Ok(Some(((proximity, word1, word2), value))) - } else { - Ok(None) - } - }, - &prefixes, - max_proximity, - // and this argument tells what to do with each new key (proximity, word1, prefix) and value (roaring bitmap) - |key, value| { - insert_into_database( - wtxn, - *word_prefix_pair_proximity_docids.as_polymorph(), - key, - value, - ) - }, - )?; - } - - // Now we do the same thing with the new prefixes and all word pairs in the DB - - let prefixes = PrefixTrieNode::from_sorted_prefixes( - new_prefix_fst_words.iter().map(|s| s.as_str()).filter(|s| s.len() <= max_prefix_length), - ); - - if !prefixes.is_empty() { - let mut db_iter = word_pair_proximity_docids - .remap_key_type::() - .remap_data_type::() - .iter(wtxn)?; - - // Since we read the DB, we can't write to it directly, so we add each new (proximity, word1, prefix) - // element in an intermediary grenad - let mut writer = - create_writer(chunk_compression_type, chunk_compression_level, tempfile::tempfile()?); - - execute_on_word_pairs_and_prefixes( - &mut db_iter, - |db_iter| db_iter.next().transpose().map_err(|e| e.into()), - &prefixes, - max_proximity, - |key, value| writer.insert(key, value).map_err(|e| e.into()), - )?; - drop(db_iter); - - // and then we write the grenad into the DB - // Since the grenad contains only new prefixes, we know in advance that none - // of its elements already exist in the DB, thus there is no need to specify - // how to merge conflicting elements - write_into_lmdb_database_without_merging( - wtxn, - *word_prefix_pair_proximity_docids.as_polymorph(), - writer, - )?; - } - - // All of the word prefix pairs in the database that have a w2 - // that is contained in the `suppr_pw` set must be removed as well. 
- if !del_prefix_fst_words.is_empty() { - let mut iter = - word_prefix_pair_proximity_docids.remap_data_type::().iter_mut(wtxn)?; - while let Some(((_, _, prefix), _)) = iter.next().transpose()? { - if del_prefix_fst_words.contains(prefix.as_bytes()) { - // Delete this entry as the w2 prefix is no more in the words prefix fst. - unsafe { iter.del_current()? }; - } - } - } - - Ok(()) -} - -/// This is the core of the algorithm to initialise the Word Prefix Pair Proximity Docids database. -/// -/// Its main arguments are: -/// 1. a sorted iterator over ((proximity, word1, word2), docids) elements -/// 2. a prefix trie -/// 3. a closure to describe how to handle the new computed (proximity, word1, prefix) elements -/// -/// For more information about what this function does, read the module documentation. -fn execute_on_word_pairs_and_prefixes( - iter: &mut I, - mut next_word_pair_proximity: impl for<'a> FnMut( - &'a mut I, - ) -> Result< - Option<((u8, &'a [u8], &'a [u8]), &'a [u8])>, - >, - prefixes: &PrefixTrieNode, - max_proximity: u8, - mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>, -) -> Result<()> { - let mut batch = PrefixAndProximityBatch::default(); - let mut prev_word2_start = 0; - - // Optimisation: the index at the root of the prefix trie where to search for - let mut prefix_search_start = PrefixTrieNodeSearchStart(0); - - // Optimisation: true if there are no potential prefixes for the current word2 based on its first letter - let mut empty_prefixes = false; - - let mut prefix_buffer = Vec::with_capacity(8); - let mut merge_buffer = Vec::with_capacity(65_536); - - while let Some(((proximity, word1, word2), data)) = next_word_pair_proximity(iter)? 
{ - // stop indexing if the proximity is over the threshold - if proximity > max_proximity { - break; - }; - let word2_start_different_than_prev = word2[0] != prev_word2_start; - // if there were no potential prefixes for the previous word2 based on its first letter, - // and if the current word2 starts with the same letter, then there is also no potential - // prefixes for the current word2, and we can skip to the next iteration - if empty_prefixes && !word2_start_different_than_prev { - continue; - } - - // if the proximity is different to the previous one, OR - // if word1 is different than the previous word1, OR - // if the start of word2 is different than the previous start of word2, - // THEN we'll need to flush the batch - let prox_different_than_prev = proximity != batch.proximity; - let word1_different_than_prev = word1 != batch.word1; - if prox_different_than_prev || word1_different_than_prev || word2_start_different_than_prev - { - batch.flush(&mut merge_buffer, &mut insert)?; - batch.proximity = proximity; - // don't forget to reset the value of batch.word1 and prev_word2_start - if word1_different_than_prev { - batch.word1.clear(); - batch.word1.extend_from_slice(word1); - } - if word2_start_different_than_prev { - prev_word2_start = word2[0]; - } - prefix_search_start.0 = 0; - // Optimisation: find the search start in the prefix trie to iterate over the prefixes of word2 - empty_prefixes = !prefixes.set_search_start(word2, &mut prefix_search_start); - } - - if !empty_prefixes { - // All conditions are satisfied, we can now insert each new prefix of word2 into the batch - prefix_buffer.clear(); - prefixes.for_each_prefix_of( - word2, - &mut prefix_buffer, - &prefix_search_start, - |prefix_buffer| { - batch.insert(prefix_buffer, data.to_vec()); - }, - ); - } - } - batch.flush(&mut merge_buffer, &mut insert)?; - Ok(()) -} -/** -A map structure whose keys are prefixes and whose values are vectors of bitstrings (serialized roaring bitmaps). 
-The keys are sorted and conflicts are resolved by merging the vectors of bitstrings together. - -It is used to ensure that all ((proximity, word1, prefix), docids) are inserted into the database in sorted order and efficiently. - -The batch is flushed as often as possible, when we are sure that every (proximity, word1, prefix) key derived from its content -can be inserted into the database in sorted order. When it is flushed, it calls a user-provided closure with the following arguments: -- key : (proximity, word1, prefix) as bytes -- value : merged roaring bitmaps from all values associated with prefix in the batch, serialised to bytes -*/ -#[derive(Default)] -struct PrefixAndProximityBatch { - proximity: u8, - word1: Vec, - #[allow(clippy::type_complexity)] - batch: Vec<(Vec, Vec>)>, -} - -impl PrefixAndProximityBatch { - /// Insert the new key and value into the batch - /// - /// The key must either exist in the batch or be greater than all existing keys - fn insert(&mut self, new_key: &[u8], new_value: Vec) { - match self.batch.iter_mut().find(|el| el.0 == new_key) { - Some((_prefix, docids)) => docids.push(Cow::Owned(new_value)), - None => self.batch.push((new_key.to_vec(), vec![Cow::Owned(new_value)])), - } - } - - /// Empties the batch, calling `insert` on each element. - /// - /// The key given to `insert` is `(proximity, word1, prefix)` and the value is the associated merged roaring bitmap. - fn flush( - &mut self, - merge_buffer: &mut Vec, - insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>, - ) -> Result<()> { - let PrefixAndProximityBatch { proximity, word1, batch } = self; - if batch.is_empty() { - return Ok(()); - } - merge_buffer.clear(); - - let mut buffer = Vec::with_capacity(word1.len() + 1 + 6); - buffer.push(*proximity); - buffer.extend_from_slice(word1); - buffer.push(0); - - for (key, mergeable_data) in batch.drain(..) 
{ - buffer.truncate(1 + word1.len() + 1); - buffer.extend_from_slice(key.as_slice()); - - let data = if mergeable_data.len() > 1 { - CboRoaringBitmapCodec::merge_into(&mergeable_data, merge_buffer)?; - merge_buffer.as_slice() - } else { - &mergeable_data[0] - }; - insert(buffer.as_slice(), data)?; - merge_buffer.clear(); - } - - Ok(()) - } -} - -/** A prefix trie. Used to iterate quickly over the prefixes of a word that are -within a set. - -## Structure -The trie is made of nodes composed of: -1. a byte character (e.g. 'a') -2. whether the node is an end node or not -3. a list of children nodes, sorted by their byte character - -For example, the trie that stores the strings `[ac, ae, ar, ch, cei, cel, ch, r, rel, ri]` -is drawn below. Nodes with a double border are "end nodes". - -┌──────────────────────┐ ┌──────────────────────┐ ╔══════════════════════╗ -│ a │ │ c │ ║ r ║ -└──────────────────────┘ └──────────────────────┘ ╚══════════════════════╝ -╔══════╗╔══════╗╔══════╗ ┌─────────┐ ╔═════════╗ ┌─────────┐ ╔══════════╗ -║ c ║║ e ║║ r ║ │ e │ ║ h ║ │ e │ ║ i ║ -╚══════╝╚══════╝╚══════╝ └─────────┘ ╚═════════╝ └─────────┘ ╚══════════╝ - ╔═══╗ ╔═══╗ ╔═══╗ - ║ i ║ ║ l ║ ║ l ║ - ╚═══╝ ╚═══╝ ╚═══╝ -*/ -#[derive(Default, Debug)] -struct PrefixTrieNode { - children: Vec<(PrefixTrieNode, u8)>, - is_end_node: bool, -} - -#[derive(Debug)] -struct PrefixTrieNodeSearchStart(usize); - -impl PrefixTrieNode { - fn is_empty(&self) -> bool { - self.children.is_empty() - } - - /// Returns false if the trie does not contain a prefix of the given word. - /// Returns true if the trie *may* contain a prefix of the given word. - /// - /// Moves the search start to the first node equal to the first letter of the word, - /// or to 0 otherwise. 
- fn set_search_start(&self, word: &[u8], search_start: &mut PrefixTrieNodeSearchStart) -> bool { - let byte = word[0]; - if self.children[search_start.0].1 == byte { - true - } else { - match self.children[search_start.0..].binary_search_by_key(&byte, |x| x.1) { - Ok(position) => { - search_start.0 += position; - true - } - Err(_) => { - search_start.0 = 0; - false - } - } - } - } - - fn from_sorted_prefixes<'a>(prefixes: impl Iterator) -> Self { - let mut node = PrefixTrieNode::default(); - for prefix in prefixes { - node.insert_sorted_prefix(prefix.as_bytes().iter()); - } - node - } - fn insert_sorted_prefix(&mut self, mut prefix: std::slice::Iter) { - if let Some(&c) = prefix.next() { - if let Some((node, byte)) = self.children.last_mut() { - if *byte == c { - node.insert_sorted_prefix(prefix); - return; - } - } - let mut new_node = PrefixTrieNode::default(); - new_node.insert_sorted_prefix(prefix); - self.children.push((new_node, c)); - } else { - self.is_end_node = true; - } - } - - /// Call the given closure on each prefix of the word contained in the prefix trie. - /// - /// The search starts from the given `search_start`. - fn for_each_prefix_of( - &self, - word: &[u8], - buffer: &mut Vec, - search_start: &PrefixTrieNodeSearchStart, - mut do_fn: impl FnMut(&mut Vec), - ) { - let first_byte = word[0]; - let mut cur_node = self; - buffer.push(first_byte); - if let Some((child_node, c)) = - cur_node.children[search_start.0..].iter().find(|(_, c)| *c >= first_byte) - { - if *c == first_byte { - cur_node = child_node; - if cur_node.is_end_node { - do_fn(buffer); - } - for &byte in &word[1..] 
{ - buffer.push(byte); - if let Some((child_node, c)) = - cur_node.children.iter().find(|(_, c)| *c >= byte) - { - if *c == byte { - cur_node = child_node; - if cur_node.is_end_node { - do_fn(buffer); - } - } else { - break; - } - } else { - break; - } - } - } - } - } -} -#[cfg(test)] -mod tests { - use roaring::RoaringBitmap; - - use super::*; - use crate::{CboRoaringBitmapCodec, U8StrStrCodec}; - - fn check_prefixes( - trie: &PrefixTrieNode, - search_start: &PrefixTrieNodeSearchStart, - word: &str, - expected_prefixes: &[&str], - ) { - let mut actual_prefixes = vec![]; - trie.for_each_prefix_of(word.as_bytes(), &mut Vec::new(), search_start, |x| { - let s = String::from_utf8(x.to_owned()).unwrap(); - actual_prefixes.push(s); - }); - assert_eq!(actual_prefixes, expected_prefixes); - } - - #[test] - fn test_trie() { - let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([ - "1", "19", "2", "a", "ab", "ac", "ad", "al", "am", "an", "ap", "ar", "as", "at", "au", - "b", "ba", "bar", "be", "bi", "bl", "bla", "bo", "br", "bra", "bri", "bro", "bu", "c", - "ca", "car", "ce", "ch", "cha", "che", "chi", "ci", "cl", "cla", "co", "col", "com", - "comp", "con", "cons", "cont", "cor", "cou", "cr", "cu", "d", "da", "de", "dec", "des", - "di", "dis", "do", "dr", "du", "e", "el", "em", "en", "es", "ev", "ex", "exp", "f", - "fa", "fe", "fi", "fl", "fo", "for", "fr", "fra", "fre", "fu", "g", "ga", "ge", "gi", - "gl", "go", "gr", "gra", "gu", "h", "ha", "har", "he", "hea", "hi", "ho", "hu", "i", - "im", "imp", "in", "ind", "ins", "int", "inte", "j", "ja", "je", "jo", "ju", "k", "ka", - "ke", "ki", "ko", "l", "la", "le", "li", "lo", "lu", "m", "ma", "mal", "man", "mar", - "mat", "mc", "me", "mi", "min", "mis", "mo", "mon", "mor", "mu", "n", "na", "ne", "ni", - "no", "o", "or", "ou", "ov", "ove", "over", "p", "pa", "par", "pe", "per", "ph", "pi", - "pl", "po", "pr", "pre", "pro", "pu", "q", "qu", "r", "ra", "re", "rec", "rep", "res", - "ri", "ro", "ru", "s", "sa", 
"san", "sc", "sch", "se", "sh", "sha", "shi", "sho", "si", - "sk", "sl", "sn", "so", "sp", "st", "sta", "ste", "sto", "str", "su", "sup", "sw", "t", - "ta", "te", "th", "ti", "to", "tr", "tra", "tri", "tu", "u", "un", "v", "va", "ve", - "vi", "vo", "w", "wa", "we", "wh", "wi", "wo", "y", "yo", "z", - ])); - - let mut search_start = PrefixTrieNodeSearchStart(0); - - let is_empty = !trie.set_search_start("affair".as_bytes(), &mut search_start); - assert!(!is_empty); - assert_eq!(search_start.0, 2); - - check_prefixes(&trie, &search_start, "affair", &["a"]); - check_prefixes(&trie, &search_start, "shampoo", &["s", "sh", "sha"]); - - let is_empty = !trie.set_search_start("unique".as_bytes(), &mut search_start); - assert!(!is_empty); - assert_eq!(trie.children[search_start.0].1, b'u'); - - check_prefixes(&trie, &search_start, "unique", &["u", "un"]); - - // NOTE: this should fail, because the search start is already beyong 'a' - let is_empty = trie.set_search_start("abba".as_bytes(), &mut search_start); - assert!(!is_empty); - // search start is reset - assert_eq!(search_start.0, 0); - - let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([ - "arb", "arbre", "cat", "catto", - ])); - check_prefixes(&trie, &search_start, "arbres", &["arb", "arbre"]); - check_prefixes(&trie, &search_start, "cattos", &["cat", "catto"]); - } - - #[test] - fn test_execute_on_word_pairs_and_prefixes() { - let prefixes = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([ - "arb", "arbre", "cat", "catto", - ])); - - let mut serialised_bitmap123 = vec![]; - let mut bitmap123 = RoaringBitmap::new(); - bitmap123.insert(1); - bitmap123.insert(2); - bitmap123.insert(3); - CboRoaringBitmapCodec::serialize_into(&bitmap123, &mut serialised_bitmap123); - - let mut serialised_bitmap456 = vec![]; - let mut bitmap456 = RoaringBitmap::new(); - bitmap456.insert(4); - bitmap456.insert(5); - bitmap456.insert(6); - CboRoaringBitmapCodec::serialize_into(&bitmap456, &mut 
serialised_bitmap456); - - let mut serialised_bitmap789 = vec![]; - let mut bitmap789 = RoaringBitmap::new(); - bitmap789.insert(7); - bitmap789.insert(8); - bitmap789.insert(9); - CboRoaringBitmapCodec::serialize_into(&bitmap789, &mut serialised_bitmap789); - - let mut serialised_bitmap_ranges = vec![]; - let mut bitmap_ranges = RoaringBitmap::new(); - bitmap_ranges.insert_range(63_000..65_000); - bitmap_ranges.insert_range(123_000..128_000); - CboRoaringBitmapCodec::serialize_into(&bitmap_ranges, &mut serialised_bitmap_ranges); - - let word_pairs = [ - ((1, "healthy", "arbres"), &serialised_bitmap123), - ((1, "healthy", "boat"), &serialised_bitmap123), - ((1, "healthy", "ca"), &serialised_bitmap123), - ((1, "healthy", "cats"), &serialised_bitmap456), - ((1, "healthy", "cattos"), &serialised_bitmap123), - ((1, "jittery", "cat"), &serialised_bitmap123), - ((1, "jittery", "cata"), &serialised_bitmap456), - ((1, "jittery", "catb"), &serialised_bitmap789), - ((1, "jittery", "catc"), &serialised_bitmap_ranges), - ((2, "healthy", "arbre"), &serialised_bitmap123), - ((2, "healthy", "arbres"), &serialised_bitmap456), - ((2, "healthy", "cats"), &serialised_bitmap789), - ((2, "healthy", "cattos"), &serialised_bitmap_ranges), - ((3, "healthy", "arbre"), &serialised_bitmap456), - ((3, "healthy", "arbres"), &serialised_bitmap789), - ]; - - let expected_result = [ - ((1, "healthy", "arb"), bitmap123.clone()), - ((1, "healthy", "arbre"), bitmap123.clone()), - ((1, "healthy", "cat"), &bitmap456 | &bitmap123), - ((1, "healthy", "catto"), bitmap123.clone()), - ((1, "jittery", "cat"), (&bitmap123 | &bitmap456 | &bitmap789 | &bitmap_ranges)), - ((2, "healthy", "arb"), &bitmap123 | &bitmap456), - ((2, "healthy", "arbre"), &bitmap123 | &bitmap456), - ((2, "healthy", "cat"), &bitmap789 | &bitmap_ranges), - ((2, "healthy", "catto"), bitmap_ranges.clone()), - ]; - - let mut result = vec![]; - - let mut iter = - IntoIterator::into_iter(word_pairs).map(|((proximity, word1, word2), data)| { 
- ((proximity, word1.as_bytes(), word2.as_bytes()), data.as_slice()) - }); - execute_on_word_pairs_and_prefixes( - &mut iter, - |iter| Ok(iter.next()), - &prefixes, - 2, - |k, v| { - let (proximity, word1, prefix) = U8StrStrCodec::bytes_decode(k).unwrap(); - let bitmap = CboRoaringBitmapCodec::bytes_decode(v).unwrap(); - result.push(((proximity.to_owned(), word1.to_owned(), prefix.to_owned()), bitmap)); - Ok(()) - }, - ) - .unwrap(); - - for (x, y) in result.into_iter().zip(IntoIterator::into_iter(expected_result)) { - let ((actual_proximity, actual_word1, actual_prefix), actual_bitmap) = x; - let ((expected_proximity, expected_word1, expected_prefix), expected_bitmap) = y; - - assert_eq!(actual_word1, expected_word1); - assert_eq!(actual_prefix, expected_prefix); - assert_eq!(actual_proximity, expected_proximity); - assert_eq!(actual_bitmap, expected_bitmap); - } - } -} diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index 8220aa777..618f451dc 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -4,9 +4,11 @@ use grenad::CompressionType; use heed::types::{ByteSlice, Str}; use heed::Database; +use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvWriterDelAdd}; use crate::update::index_documents::{ - create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, - CursorClonableMmap, MergeFn, + create_sorter, merge_deladd_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key, + write_sorter_into_database, CursorClonableMmap, MergeFn, }; use crate::{CboRoaringBitmapCodec, Result}; @@ -51,7 +53,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { // and write into it at the same time, therefore we write into another file. 
let mut prefix_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, self.chunk_compression_type, self.chunk_compression_level, self.max_nb_chunks, @@ -92,11 +94,16 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { // We fetch the docids associated to the newly added word prefix fst only. let db = self.word_docids.remap_data_type::(); + let mut buffer = Vec::new(); for prefix in new_prefix_fst_words { let prefix = std::str::from_utf8(prefix.as_bytes())?; for result in db.prefix_iter(self.wtxn, prefix)? { let (_word, data) = result?; - prefix_docids_sorter.insert(prefix, data)?; + buffer.clear(); + let mut writer = KvWriterDelAdd::new(&mut buffer); + writer.insert(DelAdd::Addition, data)?; + + prefix_docids_sorter.insert(prefix, writer.into_inner()?)?; } } @@ -110,12 +117,16 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { drop(iter); + let database_is_empty = self.word_prefix_docids.is_empty(self.wtxn)?; + // We finally write the word prefix docids into the LMDB database. 
- sorter_into_lmdb_database( - self.wtxn, - *self.word_prefix_docids.as_polymorph(), + write_sorter_into_database( prefix_docids_sorter, - merge_cbo_roaring_bitmaps, + &self.word_prefix_docids, + self.wtxn, + database_is_empty, + deladd_serialize_add_side, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; Ok(()) diff --git a/milli/src/update/words_prefix_integer_docids.rs b/milli/src/update/words_prefix_integer_docids.rs index c65438928..e083f510a 100644 --- a/milli/src/update/words_prefix_integer_docids.rs +++ b/milli/src/update/words_prefix_integer_docids.rs @@ -9,9 +9,11 @@ use log::debug; use crate::error::SerializationError; use crate::heed_codec::StrBEU16Codec; use crate::index::main_key::WORDS_PREFIXES_FST_KEY; +use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvWriterDelAdd}; use crate::update::index_documents::{ - create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, - CursorClonableMmap, MergeFn, + create_sorter, merge_deladd_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key, + write_sorter_into_database, CursorClonableMmap, MergeFn, }; use crate::{CboRoaringBitmapCodec, Result}; @@ -55,7 +57,7 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> { let mut prefix_integer_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, - merge_cbo_roaring_bitmaps, + merge_deladd_cbo_roaring_bitmaps, self.chunk_compression_type, self.chunk_compression_level, self.max_nb_chunks, @@ -108,6 +110,7 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> { // We fetch the docids associated to the newly added word prefix fst only. 
let db = self.word_database.remap_data_type::(); + let mut buffer = Vec::new(); for prefix_bytes in new_prefix_fst_words { let prefix = str::from_utf8(prefix_bytes.as_bytes()).map_err(|_| { SerializationError::Decoding { db_name: Some(WORDS_PREFIXES_FST_KEY) } @@ -123,7 +126,11 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> { if word.starts_with(prefix) { let key = (prefix, pos); let bytes = StrBEU16Codec::bytes_encode(&key).unwrap(); - prefix_integer_docids_sorter.insert(bytes, data)?; + + buffer.clear(); + let mut writer = KvWriterDelAdd::new(&mut buffer); + writer.insert(DelAdd::Addition, data)?; + prefix_integer_docids_sorter.insert(bytes, writer.into_inner()?)?; } } } @@ -143,12 +150,16 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> { drop(iter); } + let database_is_empty = self.prefix_database.is_empty(self.wtxn)?; + // We finally write all the word prefix integer docids into the LMDB database. - sorter_into_lmdb_database( - self.wtxn, - *self.prefix_database.as_polymorph(), + write_sorter_into_database( prefix_integer_docids_sorter, - merge_cbo_roaring_bitmaps, + &self.prefix_database, + self.wtxn, + database_is_empty, + deladd_serialize_add_side, + merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, )?; Ok(()) @@ -159,6 +170,7 @@ fn write_prefixes_in_sorter( prefixes: &mut HashMap, Vec>>, sorter: &mut grenad::Sorter, ) -> Result<()> { + // TODO: Merge before insertion. for (key, data_slices) in prefixes.drain() { for data in data_slices { if valid_lmdb_key(&key) {