From 264a04922dfb16b54903ce16d0dd1c846060fbd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 14 Sep 2022 15:33:13 +0200 Subject: [PATCH] Add prefix_word_pair_proximity database Similar to the word_prefix_pair_proximity one but instead the keys are: (proximity, prefix, word2) --- milli/src/index.rs | 8 +- milli/src/snapshot_tests.rs | 12 + milli/src/update/clear_documents.rs | 2 + milli/src/update/delete_documents.rs | 35 +- milli/src/update/index_documents/mod.rs | 11 +- milli/src/update/mod.rs | 4 +- milli/src/update/prefix_word_pairs/mod.rs | 216 +++++++++ .../update/prefix_word_pairs/prefix_word.rs | 178 ++++++++ .../word_prefix_pair_proximity_docids.snap | 46 ++ ...refix_word_pair_proximity_docids.hash.snap | 4 + .../prefix_word_pair_proximity_docids.snap | 41 ++ .../word_pair_proximity_docids.hash.snap | 4 + ...ord_prefix_pair_proximity_docids.hash.snap | 4 + .../word_prefix.rs} | 427 +++++------------- 14 files changed, 653 insertions(+), 339 deletions(-) create mode 100644 milli/src/update/prefix_word_pairs/mod.rs create mode 100644 milli/src/update/prefix_word_pairs/prefix_word.rs create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/initial/word_prefix_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.hash.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_pair_proximity_docids.hash.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap rename milli/src/update/{word_prefix_pair_proximity_docids.rs => prefix_word_pairs/word_prefix.rs} (67%) diff --git a/milli/src/index.rs b/milli/src/index.rs index f1bc2fa10..3bb668b43 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -71,6 +71,7 @@ pub mod db_name { pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions"; pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids"; pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids"; + pub const PREFIX_WORD_PAIR_PROXIMITY_DOCIDS: &str = "prefix-word-pair-proximity-docids"; pub const WORD_POSITION_DOCIDS: &str = "word-position-docids"; pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids"; pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids"; @@ -109,6 +110,8 @@ pub struct Index { pub word_pair_proximity_docids: Database, /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears. pub word_prefix_pair_proximity_docids: Database, + /// Maps the proximity between a pair of prefix and word with all the docids where this relation appears. + pub prefix_word_pair_proximity_docids: Database, /// Maps the word and the position with the docids that corresponds to it. 
pub word_position_docids: Database, @@ -138,7 +141,7 @@ impl Index { pub fn new>(mut options: heed::EnvOpenOptions, path: P) -> Result { use db_name::*; - options.max_dbs(17); + options.max_dbs(18); unsafe { options.flag(Flags::MdbAlwaysFreePages) }; let env = options.open(path)?; @@ -151,6 +154,8 @@ impl Index { let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?; let word_prefix_pair_proximity_docids = env.create_database(Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?; + let prefix_word_pair_proximity_docids = + env.create_database(Some(PREFIX_WORD_PAIR_PROXIMITY_DOCIDS))?; let word_position_docids = env.create_database(Some(WORD_POSITION_DOCIDS))?; let field_id_word_count_docids = env.create_database(Some(FIELD_ID_WORD_COUNT_DOCIDS))?; let word_prefix_position_docids = env.create_database(Some(WORD_PREFIX_POSITION_DOCIDS))?; @@ -175,6 +180,7 @@ impl Index { docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, + prefix_word_pair_proximity_docids, word_position_docids, word_prefix_position_docids, field_id_word_count_docids, diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index b4eee7dfe..e9c92a949 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -198,6 +198,15 @@ pub fn snap_word_prefix_pair_proximity_docids(index: &Index) -> String { }); snap } +pub fn snap_prefix_word_pair_proximity_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, prefix_word_pair_proximity_docids, |( + (proximity, prefix, word2), + b, + )| { + &format!("{proximity:<2} {prefix:<4} {word2:<16} {}", display_bitmap(&b)) + }); + snap +} pub fn snap_word_position_docids(index: &Index) -> String { let snap = make_db_snap_from_iter!(index, word_position_docids, |((word, position), b)| { &format!("{word:<16} {position:<6} {}", display_bitmap(&b)) @@ -427,6 +436,9 @@ macro_rules! 
full_snap_of_db { ($index:ident, word_prefix_pair_proximity_docids) => {{ $crate::snapshot_tests::snap_word_prefix_pair_proximity_docids(&$index) }}; + ($index:ident, prefix_word_pair_proximity_docids) => {{ + $crate::snapshot_tests::snap_prefix_word_pair_proximity_docids(&$index) + }}; ($index:ident, word_position_docids) => {{ $crate::snapshot_tests::snap_word_position_docids(&$index) }}; diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 5b7dbc57c..ba59c14cf 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -25,6 +25,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, + prefix_word_pair_proximity_docids, word_position_docids, field_id_word_count_docids, word_prefix_position_docids, @@ -66,6 +67,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { docid_word_positions.clear(self.wtxn)?; word_pair_proximity_docids.clear(self.wtxn)?; word_prefix_pair_proximity_docids.clear(self.wtxn)?; + prefix_word_pair_proximity_docids.clear(self.wtxn)?; word_position_docids.clear(self.wtxn)?; field_id_word_count_docids.clear(self.wtxn)?; word_prefix_position_docids.clear(self.wtxn)?; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index eae473f51..54328b50d 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -183,6 +183,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { word_pair_proximity_docids, field_id_word_count_docids, word_prefix_pair_proximity_docids, + prefix_word_pair_proximity_docids, word_position_docids, word_prefix_position_docids, facet_id_f64_docids, @@ -327,26 +328,26 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { self.index.put_words_prefixes_fst(self.wtxn, &new_words_prefixes_fst)?; } - // We delete the documents ids from the word prefix pair proximity database docids - // and remove the empty pairs too. - let db = word_prefix_pair_proximity_docids.remap_key_type::(); - let mut iter = db.iter_mut(self.wtxn)?; - while let Some(result) = iter.next() { - let (key, mut docids) = result?; - let previous_len = docids.len(); - docids -= &self.to_delete_docids; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - } else if docids.len() != previous_len { - let key = key.to_owned(); - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&key, &docids)? }; + for db in [word_prefix_pair_proximity_docids, prefix_word_pair_proximity_docids] { + // We delete the documents ids from the word prefix pair proximity database docids + // and remove the empty pairs too. + let db = db.remap_key_type::(); + let mut iter = db.iter_mut(self.wtxn)?; + while let Some(result) = iter.next() { + let (key, mut docids) = result?; + let previous_len = docids.len(); + docids -= &self.to_delete_docids; + if docids.is_empty() { + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; + } else if docids.len() != previous_len { + let key = key.to_owned(); + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&key, &docids)? }; + } } } - drop(iter); - // We delete the documents ids that are under the pairs of words, // it is faster and use no memory to iterate over all the words pairs than // to compute the cartesian product of every words of the deleted documents. 
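The deletion loop above can treat word_prefix_pair_proximity_docids and prefix_word_pair_proximity_docids uniformly because both databases store U8StrStrCodec keys; only the order of the two string components differs, as the commit message notes. Below is a minimal sketch of that layout, assuming the byte encoding mirrors the key_buffer construction used later in prefix_word.rs (raw_key is a hypothetical helper, not a milli function):

// Hypothetical illustration of the two key layouts, assuming one proximity
// byte, the first string, a 0 separator, then the second string.
fn raw_key(proximity: u8, first: &str, second: &str) -> Vec<u8> {
    let mut key = Vec::with_capacity(2 + first.len() + second.len());
    key.push(proximity);
    key.extend_from_slice(first.as_bytes());
    key.push(0);
    key.extend_from_slice(second.as_bytes());
    key
}

fn main() {
    // word-prefix-pair-proximity-docids keys are (proximity, word1, prefix)...
    let word_prefix = raw_key(2, "amazing", "a");
    // ...while prefix-word-pair-proximity-docids keys are (proximity, prefix, word2),
    // matching snapshot rows such as `2 a amazing [100, ]`.
    let prefix_word = raw_key(2, "a", "amazing");
    assert_ne!(word_prefix, prefix_word);
}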
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index e0eefe07b..897f2f8f8 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -36,8 +36,8 @@ use crate::documents::{obkv_to_object, DocumentsBatchReader}; use crate::error::UserError; pub use crate::update::index_documents::helpers::CursorClonableMmap; use crate::update::{ - self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids, - WordPrefixPairProximityDocids, WordPrefixPositionDocids, WordsPrefixesFst, + self, Facets, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, + WordPrefixDocids, WordPrefixPositionDocids, WordsPrefixesFst, }; use crate::{Index, Result, RoaringBitmapCodec}; @@ -528,12 +528,7 @@ where if let Some(word_pair_proximity_docids) = word_pair_proximity_docids { // Run the word prefix pair proximity docids update operation. - let mut builder = WordPrefixPairProximityDocids::new(self.wtxn, self.index); - builder.chunk_compression_type = self.indexer_config.chunk_compression_type; - builder.chunk_compression_level = self.indexer_config.chunk_compression_level; - builder.max_nb_chunks = self.indexer_config.max_nb_chunks; - builder.max_memory = self.indexer_config.max_memory; - builder.execute( + PrefixWordPairsProximityDocids::new(self.wtxn, self.index).execute( word_pair_proximity_docids, &new_prefix_fst_words, &common_prefix_fst_words, diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 1bf27a5f0..3ddc01cef 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -6,10 +6,10 @@ pub use self::index_documents::{ DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, }; pub use self::indexer_config::IndexerConfig; +pub use self::prefix_word_pairs::PrefixWordPairsProximityDocids; pub use self::settings::{Setting, Settings}; pub use self::update_step::UpdateIndexingStep; pub use self::word_prefix_docids::WordPrefixDocids; -pub use self::word_prefix_pair_proximity_docids::WordPrefixPairProximityDocids; pub use self::words_prefix_position_docids::WordPrefixPositionDocids; pub use self::words_prefixes_fst::WordsPrefixesFst; @@ -19,9 +19,9 @@ mod delete_documents; mod facets; mod index_documents; mod indexer_config; +mod prefix_word_pairs; mod settings; mod update_step; mod word_prefix_docids; -mod word_prefix_pair_proximity_docids; mod words_prefix_position_docids; mod words_prefixes_fst; diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs new file mode 100644 index 000000000..63286f8da --- /dev/null +++ b/milli/src/update/prefix_word_pairs/mod.rs @@ -0,0 +1,216 @@ +use super::index_documents::{merge_cbo_roaring_bitmaps, CursorClonableMmap}; +use crate::{Index, Result}; +use heed::types::ByteSlice; +use std::{borrow::Cow, collections::HashSet, io::BufReader}; + +mod prefix_word; +mod word_prefix; + +pub use prefix_word::index_prefix_word_database; +pub use word_prefix::index_word_prefix_database; + +pub struct PrefixWordPairsProximityDocids<'t, 'u, 'i> { + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + max_proximity: u8, + max_prefix_length: usize, +} +impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> { + pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> Self { + Self { wtxn, index, max_proximity: 4, max_prefix_length: 2 } + } + /// Set the maximum proximity required to make a prefix be part of the words prefixes + /// database. 
If two words are too far from the threshold the associated documents will + /// not be part of the prefix database. + /// + /// Default value is 4. This value must be lower or equal than 7 and will be clamped + /// to this bound otherwise. + pub fn max_proximity(&mut self, value: u8) -> &mut Self { + self.max_proximity = value.max(7); + self + } + /// Set the maximum length the prefix of a word pair is allowed to have to be part of the words + /// prefixes database. If the prefix length is higher than the threshold, the associated documents + /// will not be part of the prefix database. + /// + /// Default value is 2. + pub fn max_prefix_length(&mut self, value: usize) -> &mut Self { + self.max_prefix_length = value; + self + } + #[logging_timer::time("WordPrefixPairProximityDocids::{}")] + pub fn execute<'a>( + self, + new_word_pair_proximity_docids: grenad::Reader, + new_prefix_fst_words: &'a [String], + common_prefix_fst_words: &[&'a [String]], + del_prefix_fst_words: &HashSet>, + ) -> Result<()> { + index_word_prefix_database( + self.wtxn, + self.index.word_pair_proximity_docids, + self.index.word_prefix_pair_proximity_docids, + self.max_proximity, + self.max_prefix_length, + new_word_pair_proximity_docids.clone(), + new_prefix_fst_words, + common_prefix_fst_words, + del_prefix_fst_words, + )?; + + index_prefix_word_database( + self.wtxn, + self.index.word_pair_proximity_docids, + self.index.prefix_word_pair_proximity_docids, + self.max_proximity, + self.max_prefix_length, + new_word_pair_proximity_docids, + new_prefix_fst_words, + common_prefix_fst_words, + del_prefix_fst_words, + )?; + + Ok(()) + } +} + +// This is adapted from `sorter_into_lmdb_database` +pub fn insert_into_database( + wtxn: &mut heed::RwTxn, + database: heed::PolyDatabase, + new_key: &[u8], + new_value: &[u8], +) -> Result<()> { + let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, new_key)?; + match iter.next().transpose()? { + Some((key, old_val)) if new_key == key => { + let val = + merge_cbo_roaring_bitmaps(key, &[Cow::Borrowed(old_val), Cow::Borrowed(new_value)]) + .map_err(|_| { + // TODO just wrap this error? + crate::error::InternalError::IndexingMergingKeys { + process: "get-put-merge", + } + })?; + // safety: we use the new_key, not the one from the database iterator, to avoid undefined behaviour + unsafe { iter.put_current(new_key, &val)? }; + } + _ => { + drop(iter); + database.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, new_value)?; + } + } + Ok(()) +} + +// This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`, +// but it uses `append` if the database is empty, and it assumes that the values in the +// writer don't conflict with values in the database. +pub fn write_into_lmdb_database_without_merging( + wtxn: &mut heed::RwTxn, + database: heed::PolyDatabase, + writer: grenad::Writer, +) -> Result<()> { + let file = writer.into_inner()?; + let reader = grenad::Reader::new(BufReader::new(file))?; + if database.is_empty(wtxn)? { + let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; + let mut cursor = reader.into_cursor()?; + while let Some((k, v)) = cursor.move_on_next()? { + // safety: the key comes from the grenad reader, not the database + unsafe { out_iter.append(k, v)? }; + } + } else { + let mut cursor = reader.into_cursor()?; + while let Some((k, v)) = cursor.move_on_next()? 
{ + database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; + } + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use crate::db_snap; + use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; + use crate::index::tests::TempIndex; + use std::io::Cursor; + + fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec { + let mut documents = Vec::new(); + for prefix in prefixes { + for i in 0..50 { + documents.push( + serde_json::json!({ + "text": format!("{prefix}{i:x}"), + }) + .as_object() + .unwrap() + .clone(), + ) + } + } + documents + } + + #[test] + fn test_update() { + let mut index = TempIndex::new(); + index.index_documents_config.words_prefix_threshold = Some(50); + index.index_documents_config.autogenerate_docids = true; + + index + .update_settings(|settings| { + settings.set_searchable_fields(vec!["text".to_owned()]); + }) + .unwrap(); + + let batch_reader_from_documents = |documents| { + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + for object in documents { + builder.append_json_object(&object).unwrap(); + } + DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() + }; + + let mut documents = documents_with_enough_different_words_for_prefixes(&["a", "be"]); + // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database + documents.push( + serde_json::json!({ + "text": "At an amazing and beautiful house" + }) + .as_object() + .unwrap() + .clone(), + ); + documents.push( + serde_json::json!({ + "text": "The bell rings at 5 am" + }) + .as_object() + .unwrap() + .clone(), + ); + + let documents = batch_reader_from_documents(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, word_prefix_pair_proximity_docids, "initial"); + + let mut documents = documents_with_enough_different_words_for_prefixes(&["am", "an"]); + documents.push( + serde_json::json!({ + "text": "At an extraordinary house" + }) + .as_object() + .unwrap() + .clone(), + ); + let documents = batch_reader_from_documents(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, word_pair_proximity_docids, "update"); + db_snap!(index, word_prefix_pair_proximity_docids, "update"); + db_snap!(index, prefix_word_pair_proximity_docids, "update"); + } +} diff --git a/milli/src/update/prefix_word_pairs/prefix_word.rs b/milli/src/update/prefix_word_pairs/prefix_word.rs new file mode 100644 index 000000000..cbc9ac0b2 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/prefix_word.rs @@ -0,0 +1,178 @@ +use crate::update::index_documents::{create_writer, CursorClonableMmap}; +use crate::update::prefix_word_pairs::{ + insert_into_database, write_into_lmdb_database_without_merging, +}; +use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec}; +use grenad::CompressionType; +use heed::types::ByteSlice; +use heed::BytesDecode; +use log::debug; +use std::borrow::Cow; +use std::collections::{BTreeMap, HashSet}; + +#[logging_timer::time] +pub fn index_prefix_word_database( + wtxn: &mut heed::RwTxn, + word_pair_proximity_docids: heed::Database, + prefix_word_pair_proximity_docids: heed::Database, + max_proximity: u8, + max_prefix_length: usize, + new_word_pair_proximity_docids: grenad::Reader, + new_prefix_fst_words: &[String], + common_prefix_fst_words: &[&[String]], + del_prefix_fst_words: &HashSet>, +) -> Result<()> { + let max_proximity = max_proximity - 1; + debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); + 
+ let common_prefixes: Vec<_> = common_prefix_fst_words + .into_iter() + .map(|s| s.into_iter()) + .flatten() + .map(|s| s.as_str()) + .filter(|s| s.len() <= max_prefix_length) + .collect(); + + // If the prefix trie is not empty, then we can iterate over all new + // word pairs to look for new (word1, common_prefix, proximity) elements + // to insert in the DB + for proximity in 1..=max_proximity - 1 { + for prefix in common_prefixes.iter() { + let mut prefix_key = vec![]; + prefix_key.push(proximity); + prefix_key.extend_from_slice(prefix.as_bytes()); + let mut cursor = new_word_pair_proximity_docids.clone().into_prefix_iter(prefix_key)?; + // This is the core of the algorithm + execute_on_word_pairs_and_prefixes( + proximity + 1, + prefix.as_bytes(), + // the next two arguments tell how to iterate over the new word pairs + &mut cursor, + |cursor| { + if let Some((key, value)) = cursor.next()? { + let (_, _, word2) = UncheckedU8StrStrCodec::bytes_decode(key) + .ok_or(heed::Error::Decoding)?; + Ok(Some((word2, value))) + } else { + Ok(None) + } + }, + // and this argument tells what to do with each new key (proximity, prefix, word2) and value (roaring bitmap) + |key, value| { + insert_into_database( + wtxn, + *prefix_word_pair_proximity_docids.as_polymorph(), + key, + value, + ) + }, + )?; + } + } + + // Now we do the same thing with the new prefixes and all word pairs in the DB + let new_prefixes: Vec<_> = new_prefix_fst_words + .into_iter() + .map(|s| s.as_str()) + .filter(|s| s.len() <= max_prefix_length) + .collect(); + + // Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity) + // element in an intermediary grenad + let mut writer = create_writer(CompressionType::None, None, tempfile::tempfile()?); + + for proximity in 1..=max_proximity - 1 { + for prefix in new_prefixes.iter() { + let mut prefix_key = vec![]; + prefix_key.push(proximity); + prefix_key.extend_from_slice(prefix.as_bytes()); + let mut db_iter = word_pair_proximity_docids + .as_polymorph() + .prefix_iter::<_, ByteSlice, ByteSlice>(wtxn, prefix_key.as_slice())? + .remap_key_type::(); + execute_on_word_pairs_and_prefixes( + proximity + 1, + prefix.as_bytes(), + &mut db_iter, + |db_iter| { + db_iter + .next() + .transpose() + .map(|x| x.map(|((_, _, word2), value)| (word2, value))) + .map_err(|e| e.into()) + }, + |key, value| writer.insert(key, value).map_err(|e| e.into()), + )?; + drop(db_iter); + } + } + + // and then we write the grenad into the DB + // Since the grenad contains only new prefixes, we know in advance that none + // of its elements already exist in the DB, thus there is no need to specify + // how to merge conflicting elements + write_into_lmdb_database_without_merging( + wtxn, + *prefix_word_pair_proximity_docids.as_polymorph(), + writer, + )?; + + // All of the word prefix pairs in the database that have a w2 + // that is contained in the `suppr_pw` set must be removed as well. + if !del_prefix_fst_words.is_empty() { + let mut iter = + prefix_word_pair_proximity_docids.remap_data_type::().iter_mut(wtxn)?; + while let Some(((_, prefix, _), _)) = iter.next().transpose()? { + if del_prefix_fst_words.contains(prefix.as_bytes()) { + // Delete this entry as the w2 prefix is no more in the words prefix fst. + unsafe { iter.del_current()? }; + } + } + } + + Ok(()) +} + +/// This is the core of the algorithm to initialise the Word Prefix Pair Proximity Docids database. +/// +/// Its main arguments are: +/// 1. 
a sorted prefix iterator over ((word1, word2, proximity), docids) elements +/// 2. a closure to describe how to handle the new computed (word1, prefix, proximity) elements +/// +/// For more information about what this function does, read the module documentation. +fn execute_on_word_pairs_and_prefixes( + proximity: u8, + prefix: &[u8], + iter: &mut I, + mut next_word2_and_docids: impl for<'a> FnMut(&'a mut I) -> Result>, + mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>, +) -> Result<()> { + let mut batch: BTreeMap, Vec>> = <_>::default(); + + while let Some((word2, data)) = next_word2_and_docids(iter)? { + let entry = batch.entry(word2.to_owned()).or_default(); + entry.push(Cow::Owned(data.to_owned())); + } + + let mut key_buffer = Vec::with_capacity(8); + key_buffer.push(proximity); + key_buffer.extend_from_slice(prefix); + key_buffer.push(0); + + let mut value_buffer = Vec::with_capacity(65_536); + + for (key, values) in batch { + key_buffer.truncate(prefix.len() + 2); + value_buffer.clear(); + + key_buffer.extend_from_slice(&key); + let data = if values.len() > 1 { + CboRoaringBitmapCodec::merge_into(&values, &mut value_buffer)?; + value_buffer.as_slice() + } else { + &values[0] + }; + insert(key_buffer.as_slice(), data)?; + } + Ok(()) +} diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/initial/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..9a6ffaec9 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/initial/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,46 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 5 a [101, ] +1 amazing a [100, ] +1 an a [100, ] +1 and b [100, ] +1 and be [100, ] +1 at a [100, ] +1 rings a [101, ] +1 the b [101, ] +1 the be [101, ] +2 5 a [101, ] +2 amazing a [100, ] +2 amazing b [100, ] +2 amazing be [100, ] +2 an a [100, ] +2 and a [100, ] +2 at a [100, 101, ] +2 beautiful a [100, ] +2 bell a [101, ] +2 house b [100, ] +2 house be [100, ] +2 rings b [101, ] +2 rings be [101, ] +3 am a [101, ] +3 amazing a [100, ] +3 an b [100, ] +3 an be [100, ] +3 and a [100, ] +3 at a [100, ] +3 at b [101, ] +3 at be [101, ] +3 beautiful a [100, ] +3 house a [100, ] +3 rings a [101, ] +3 the a [101, ] +4 5 b [101, ] +4 5 be [101, ] +4 and a [100, ] +4 at b [100, ] +4 at be [100, ] +4 beautiful a [100, ] +4 bell a [101, ] +4 house a [100, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.hash.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.hash.snap new file mode 100644 index 000000000..e460be400 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +fb88e49fd666886731b62baef8f44995 diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..b94c5d52e --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,41 @@ +--- +source: 
milli/src/update/prefix_word_pairs/mod.rs +--- +2 a 5 [101, ] +2 a amazing [100, ] +2 a an [100, 202, ] +2 a and [100, ] +2 a beautiful [100, ] +2 a extraordinary [202, ] +2 am and [100, ] +2 an amazing [100, ] +2 an beautiful [100, ] +2 an extraordinary [202, ] +2 b house [100, ] +2 b rings [101, ] +2 be house [100, ] +2 be rings [101, ] +3 a 5 [101, ] +3 a am [101, ] +3 a amazing [100, ] +3 a an [100, ] +3 a and [100, ] +3 a at [100, 202, ] +3 a beautiful [100, ] +3 a extraordinary [202, ] +3 a house [100, 202, ] +3 a rings [101, ] +3 am 5 [101, ] +3 am an [100, ] +3 am beautiful [100, ] +3 an amazing [100, ] +3 an and [100, ] +3 an at [100, 202, ] +3 an house [100, 202, ] +3 b and [100, ] +3 b at [101, ] +3 b the [101, ] +3 be and [100, ] +3 be at [101, ] +3 be the [101, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_pair_proximity_docids.hash.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_pair_proximity_docids.hash.snap new file mode 100644 index 000000000..015ef8c14 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_pair_proximity_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +6965ecd1bf821f1cf921c2ab751b36cf diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap new file mode 100644 index 000000000..e460be400 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +fb88e49fd666886731b62baef8f44995 diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/prefix_word_pairs/word_prefix.rs similarity index 67% rename from milli/src/update/word_prefix_pair_proximity_docids.rs rename to milli/src/update/prefix_word_pairs/word_prefix.rs index 77294296f..bd1bea2a3 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/prefix_word_pairs/word_prefix.rs @@ -1,5 +1,5 @@ /*! - ## What is WordPrefixPairProximityDocids? + ## What is WordPrefix? The word-prefix-pair-proximity-docids database is a database whose keys are of the form `(proximity, word, prefix)` and the values are roaring bitmaps of the documents which contain `word` followed by another word starting with @@ -139,7 +139,7 @@ inputs described above, which come from different places: 2. `word_pairs_db`, which is the list of word pairs from the database. This list includes all elements in `new_word_pairs` since `new_word_pairs` - was added to the database prior to calling the `WordPrefixPairProximityDocIds::execute` + was added to the database prior to calling the `WordPrefix::execute` function. To update the prefix database correctly, we call the algorithm described earlier first @@ -161,196 +161,137 @@ reader and writer). Therefore, when calling the algorithm on `((proximity, word, prefix), docids)` elements in an intermediary grenad Writer instead of the DB. At the end of the outer loop, we finally read from the grenad and insert its elements in the database. 
- - - */ -use std::borrow::Cow; -use std::collections::HashSet; -use std::io::BufReader; +use crate::update::index_documents::{create_writer, CursorClonableMmap}; +use crate::update::prefix_word_pairs::{ + insert_into_database, write_into_lmdb_database_without_merging, +}; +use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec}; use grenad::CompressionType; use heed::types::ByteSlice; use heed::BytesDecode; use log::debug; +use std::borrow::Cow; +use std::collections::HashSet; -use crate::update::index_documents::{ - create_writer, merge_cbo_roaring_bitmaps, CursorClonableMmap, -}; -use crate::{CboRoaringBitmapCodec, Index, Result, UncheckedU8StrStrCodec}; - -pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - pub(crate) chunk_compression_type: CompressionType, - pub(crate) chunk_compression_level: Option, - pub(crate) max_nb_chunks: Option, - pub(crate) max_memory: Option, +#[logging_timer::time] +pub fn index_word_prefix_database( + wtxn: &mut heed::RwTxn, + word_pair_proximity_docids: heed::Database, + word_prefix_pair_proximity_docids: heed::Database, max_proximity: u8, max_prefix_length: usize, -} + new_word_pair_proximity_docids: grenad::Reader, + new_prefix_fst_words: &[String], + common_prefix_fst_words: &[&[String]], + del_prefix_fst_words: &HashSet>, +) -> Result<()> { + debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); -impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { - pub fn new( - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - ) -> WordPrefixPairProximityDocids<'t, 'u, 'i> { - WordPrefixPairProximityDocids { - wtxn, - index, - chunk_compression_type: CompressionType::None, - chunk_compression_level: None, - max_nb_chunks: None, - max_memory: None, - max_proximity: 4, - max_prefix_length: 2, - } - } + // Make a prefix trie from the common prefixes that are shorter than self.max_prefix_length + let prefixes = PrefixTrieNode::from_sorted_prefixes( + common_prefix_fst_words + .into_iter() + .map(|s| s.into_iter()) + .flatten() + .map(|s| s.as_str()) + .filter(|s| s.len() <= max_prefix_length), + ); - /// Set the maximum proximity required to make a prefix be part of the words prefixes - /// database. If two words are too far from the threshold the associated documents will - /// not be part of the prefix database. - /// - /// Default value is 4. This value must be lower or equal than 7 and will be clamped - /// to this bound otherwise. - pub fn max_proximity(&mut self, value: u8) -> &mut Self { - self.max_proximity = value.max(7); - self - } - - /// Set the maximum length the prefix of a word pair is allowed to have to be part of the words - /// prefixes database. If the prefix length is higher than the threshold, the associated documents - /// will not be part of the prefix database. - /// - /// Default value is 2. 
- pub fn max_prefix_length(&mut self, value: usize) -> &mut Self { - self.max_prefix_length = value; - self - } - - #[logging_timer::time("WordPrefixPairProximityDocids::{}")] - pub fn execute<'a>( - mut self, - new_word_pair_proximity_docids: grenad::Reader, - new_prefix_fst_words: &'a [String], - common_prefix_fst_words: &[&'a [String]], - del_prefix_fst_words: &HashSet>, - ) -> Result<()> { - debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); - - // Make a prefix trie from the common prefixes that are shorter than self.max_prefix_length - let prefixes = PrefixTrieNode::from_sorted_prefixes( - common_prefix_fst_words - .into_iter() - .map(|s| s.into_iter()) - .flatten() - .map(|s| s.as_str()) - .filter(|s| s.len() <= self.max_prefix_length), - ); - - // If the prefix trie is not empty, then we can iterate over all new - // word pairs to look for new (word1, common_prefix, proximity) elements - // to insert in the DB - if !prefixes.is_empty() { - let mut cursor = new_word_pair_proximity_docids.into_cursor()?; - // This is the core of the algorithm - execute_on_word_pairs_and_prefixes( - // the first two arguments tell how to iterate over the new word pairs - &mut cursor, - |cursor| { - if let Some((key, value)) = cursor.move_on_next()? { - let (proximity, word1, word2) = UncheckedU8StrStrCodec::bytes_decode(key) - .ok_or(heed::Error::Decoding)?; - Ok(Some(((proximity, word1, word2), value))) - } else { - Ok(None) - } - }, - &prefixes, - self.max_proximity, - // and this argument tells what to do with each new key (word1, prefix, proximity) and value (roaring bitmap) - |key, value| { - insert_into_database( - &mut self.wtxn, - *self.index.word_prefix_pair_proximity_docids.as_polymorph(), - key, - value, - ) - }, - )?; - } - - // Now we do the same thing with the new prefixes and all word pairs in the DB - - let prefixes = PrefixTrieNode::from_sorted_prefixes( - new_prefix_fst_words - .into_iter() - .map(|s| s.as_str()) - .filter(|s| s.len() <= self.max_prefix_length), - ); - - if !prefixes.is_empty() { - let mut db_iter = self - .index - .word_pair_proximity_docids - .remap_key_type::() - .remap_data_type::() - .iter(self.wtxn)?; - - // Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity) - // element in an intermediary grenad - let mut writer = create_writer( - self.chunk_compression_type, - self.chunk_compression_level, - tempfile::tempfile()?, - ); - - execute_on_word_pairs_and_prefixes( - &mut db_iter, - |db_iter| db_iter.next().transpose().map_err(|e| e.into()), - &prefixes, - self.max_proximity, - |key, value| writer.insert(key, value).map_err(|e| e.into()), - )?; - drop(db_iter); - - // and then we write the grenad into the DB - // Since the grenad contains only new prefixes, we know in advance that none - // of its elements already exist in the DB, thus there is no need to specify - // how to merge conflicting elements - write_into_lmdb_database_without_merging( - self.wtxn, - *self.index.word_prefix_pair_proximity_docids.as_polymorph(), - writer, - )?; - } - - // All of the word prefix pairs in the database that have a w2 - // that is contained in the `suppr_pw` set must be removed as well. - if !del_prefix_fst_words.is_empty() { - let mut iter = self - .index - .word_prefix_pair_proximity_docids - .remap_data_type::() - .iter_mut(self.wtxn)?; - while let Some(((_, w2, _), _)) = iter.next().transpose()? 
{ - if del_prefix_fst_words.contains(w2.as_bytes()) { - // Delete this entry as the w2 prefix is no more in the words prefix fst. - unsafe { iter.del_current()? }; + // If the prefix trie is not empty, then we can iterate over all new + // word pairs to look for new (proximity, word1, common_prefix) elements + // to insert in the DB + if !prefixes.is_empty() { + let mut cursor = new_word_pair_proximity_docids.into_cursor()?; + // This is the core of the algorithm + execute_on_word_pairs_and_prefixes( + // the first two arguments tell how to iterate over the new word pairs + &mut cursor, + |cursor| { + if let Some((key, value)) = cursor.move_on_next()? { + let (proximity, word1, word2) = + UncheckedU8StrStrCodec::bytes_decode(key).ok_or(heed::Error::Decoding)?; + Ok(Some(((proximity, word1, word2), value))) + } else { + Ok(None) } + }, + &prefixes, + max_proximity, + // and this argument tells what to do with each new key (proximity, word1, prefix) and value (roaring bitmap) + |key, value| { + insert_into_database( + wtxn, + *word_prefix_pair_proximity_docids.as_polymorph(), + key, + value, + ) + }, + )?; + } + + // Now we do the same thing with the new prefixes and all word pairs in the DB + + let prefixes = PrefixTrieNode::from_sorted_prefixes( + new_prefix_fst_words + .into_iter() + .map(|s| s.as_str()) + .filter(|s| s.len() <= max_prefix_length), + ); + + if !prefixes.is_empty() { + let mut db_iter = word_pair_proximity_docids + .remap_key_type::() + .remap_data_type::() + .iter(wtxn)?; + + // Since we read the DB, we can't write to it directly, so we add each new (proximity, word1, prefix) + // element in an intermediary grenad + let mut writer = create_writer(CompressionType::None, None, tempfile::tempfile()?); + + execute_on_word_pairs_and_prefixes( + &mut db_iter, + |db_iter| db_iter.next().transpose().map_err(|e| e.into()), + &prefixes, + max_proximity, + |key, value| writer.insert(key, value).map_err(|e| e.into()), + )?; + drop(db_iter); + + // and then we write the grenad into the DB + // Since the grenad contains only new prefixes, we know in advance that none + // of its elements already exist in the DB, thus there is no need to specify + // how to merge conflicting elements + write_into_lmdb_database_without_merging( + wtxn, + *word_prefix_pair_proximity_docids.as_polymorph(), + writer, + )?; + } + + // All of the word prefix pairs in the database that have a w2 + // that is contained in the `suppr_pw` set must be removed as well. + if !del_prefix_fst_words.is_empty() { + let mut iter = + word_prefix_pair_proximity_docids.remap_data_type::().iter_mut(wtxn)?; + while let Some(((_, _, prefix), _)) = iter.next().transpose()? { + if del_prefix_fst_words.contains(prefix.as_bytes()) { + // Delete this entry as the w2 prefix is no more in the words prefix fst. + unsafe { iter.del_current()? }; } } - - Ok(()) } + + Ok(()) } /// This is the core of the algorithm to initialise the Word Prefix Pair Proximity Docids database. /// /// Its main arguments are: -/// 1. a sorted iterator over ((word1, word2, proximity), docids) elements +/// 1. a sorted iterator over ((proximity, word1, word2), docids) elements /// 2. a prefix trie -/// 3. a closure to describe how to handle the new computed (word1, prefix, proximity) elements +/// 3. a closure to describe how to handle the new computed (proximity, word1, prefix) elements /// /// For more information about what this function does, read the module documentation. 
fn execute_on_word_pairs_and_prefixes( @@ -495,61 +436,6 @@ impl PrefixAndProximityBatch { } } -// This is adapted from `sorter_into_lmdb_database` -fn insert_into_database( - wtxn: &mut heed::RwTxn, - database: heed::PolyDatabase, - new_key: &[u8], - new_value: &[u8], -) -> Result<()> { - let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, new_key)?; - match iter.next().transpose()? { - Some((key, old_val)) if new_key == key => { - let val = - merge_cbo_roaring_bitmaps(key, &[Cow::Borrowed(old_val), Cow::Borrowed(new_value)]) - .map_err(|_| { - // TODO just wrap this error? - crate::error::InternalError::IndexingMergingKeys { - process: "get-put-merge", - } - })?; - // safety: we use the new_key, not the one from the database iterator, to avoid undefined behaviour - unsafe { iter.put_current(new_key, &val)? }; - } - _ => { - drop(iter); - database.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, new_value)?; - } - } - Ok(()) -} - -// This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`, -// but it uses `append` if the database is empty, and it assumes that the values in the -// writer don't conflict with values in the database. -pub fn write_into_lmdb_database_without_merging( - wtxn: &mut heed::RwTxn, - database: heed::PolyDatabase, - writer: grenad::Writer, -) -> Result<()> { - let file = writer.into_inner()?; - let reader = grenad::Reader::new(BufReader::new(file))?; - if database.is_empty(wtxn)? { - let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; - let mut cursor = reader.into_cursor()?; - while let Some((k, v)) = cursor.move_on_next()? { - // safety: the key comes from the grenad reader, not the database - unsafe { out_iter.append(k, v)? }; - } - } else { - let mut cursor = reader.into_cursor()?; - while let Some((k, v)) = cursor.move_on_next()? { - database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; - } - } - Ok(()) -} - /** A prefix trie. Used to iterate quickly over the prefixes of a word that are within a set. 
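PrefixTrieNode itself is mostly untouched by this patch (only its doc comment appears as context above). As a rough, hypothetical sketch of the idea it implements, rather than the real API: a byte-level trie that, given a word, visits every stored prefix of that word.

// Simplified, hypothetical trie: not the actual PrefixTrieNode implementation.
#[derive(Default)]
struct TrieSketch {
    // children kept sorted by byte so lookups can use binary search
    children: Vec<(u8, TrieSketch)>,
    is_end_of_prefix: bool,
}

impl TrieSketch {
    fn insert(&mut self, prefix: &[u8]) {
        match prefix.split_first() {
            None => self.is_end_of_prefix = true,
            Some((byte, rest)) => {
                let pos = match self.children.binary_search_by_key(byte, |(b, _)| *b) {
                    Ok(pos) => pos,
                    Err(pos) => {
                        self.children.insert(pos, (*byte, TrieSketch::default()));
                        pos
                    }
                };
                self.children[pos].1.insert(rest);
            }
        }
    }

    // Calls `f` with the byte length of every stored prefix of `word`.
    fn for_each_prefix_of(&self, word: &[u8], depth: usize, f: &mut impl FnMut(usize)) {
        if self.is_end_of_prefix {
            f(depth);
        }
        if let Some((byte, rest)) = word.split_first() {
            if let Ok(pos) = self.children.binary_search_by_key(byte, |(b, _)| *b) {
                self.children[pos].1.for_each_prefix_of(rest, depth + 1, f);
            }
        }
    }
}

fn main() {
    let mut trie = TrieSketch::default();
    for prefix in ["a", "am", "be"] {
        trie.insert(prefix.as_bytes());
    }
    let mut found = Vec::new();
    trie.for_each_prefix_of(b"amazing", 0, &mut |len| found.push(len));
    assert_eq!(found, vec![1, 2]); // "a" and "am" are prefixes of "amazing"
}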
@@ -676,90 +562,9 @@ impl PrefixTrieNode { } #[cfg(test)] mod tests { - use std::io::Cursor; - - use roaring::RoaringBitmap; - use super::*; - use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; - use crate::index::tests::TempIndex; - use crate::{db_snap, CboRoaringBitmapCodec, U8StrStrCodec}; - - fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec { - let mut documents = Vec::new(); - for prefix in prefixes { - for i in 0..50 { - documents.push( - serde_json::json!({ - "text": format!("{prefix}{i:x}"), - }) - .as_object() - .unwrap() - .clone(), - ) - } - } - documents - } - - #[test] - fn test_update() { - let mut index = TempIndex::new(); - index.index_documents_config.words_prefix_threshold = Some(50); - index.index_documents_config.autogenerate_docids = true; - - index - .update_settings(|settings| { - settings.set_searchable_fields(vec!["text".to_owned()]); - }) - .unwrap(); - - let batch_reader_from_documents = |documents| { - let mut builder = DocumentsBatchBuilder::new(Vec::new()); - for object in documents { - builder.append_json_object(&object).unwrap(); - } - DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() - }; - - let mut documents = documents_with_enough_different_words_for_prefixes(&["a", "be"]); - // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database - documents.push( - serde_json::json!({ - "text": "At an amazing and beautiful house" - }) - .as_object() - .unwrap() - .clone(), - ); - documents.push( - serde_json::json!({ - "text": "The bell rings at 5 am" - }) - .as_object() - .unwrap() - .clone(), - ); - - let documents = batch_reader_from_documents(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, word_prefix_pair_proximity_docids, "initial"); - - let mut documents = documents_with_enough_different_words_for_prefixes(&["am", "an"]); - documents.push( - serde_json::json!({ - "text": "At an extraordinary house" - }) - .as_object() - .unwrap() - .clone(), - ); - let documents = batch_reader_from_documents(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, word_prefix_pair_proximity_docids, "update"); - } + use crate::{CboRoaringBitmapCodec, U8StrStrCodec}; + use roaring::RoaringBitmap; fn check_prefixes( trie: &PrefixTrieNode, @@ -899,9 +704,9 @@ mod tests { &prefixes, 2, |k, v| { - let (word1, prefix, proximity) = U8StrStrCodec::bytes_decode(k).unwrap(); + let (proximity, word1, prefix) = U8StrStrCodec::bytes_decode(k).unwrap(); let bitmap = CboRoaringBitmapCodec::bytes_decode(v).unwrap(); - result.push(((word1.to_owned(), prefix.to_owned(), proximity.to_owned()), bitmap)); + result.push(((proximity.to_owned(), word1.to_owned(), prefix.to_owned()), bitmap)); Ok(()) }, )
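One detail that is easy to miss when reading the two sibling functions: with the default max_proximity of 4, they do not cover the same proximity range. index_word_prefix_database keeps pairs up to proximity 4 (the initial word_prefix snapshot has rows for proximities 1 through 4), while index_prefix_word_database subtracts one up front and then inserts keys with proximity + 1, which is why the prefix_word snapshot only contains proximities 2 and 3. A self-contained check of that arithmetic, using illustrative names rather than milli APIs:

fn main() {
    // Default set in PrefixWordPairsProximityDocids::new.
    let max_proximity: u8 = 4;

    // index_word_prefix_database: proximities observed in the word_prefix snapshot.
    let word_prefix_proximities: Vec<u8> = (1..=max_proximity).collect();

    // index_prefix_word_database: `max_proximity - 1` up front, loop over
    // 1..=max_proximity - 1, then insert with `proximity + 1`.
    let max_proximity = max_proximity - 1;
    let prefix_word_proximities: Vec<u8> = (1..=max_proximity - 1).map(|p| p + 1).collect();

    assert_eq!(word_prefix_proximities, vec![1, 2, 3, 4]);
    assert_eq!(prefix_word_proximities, vec![2, 3]);
}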