From 9078e60024fda7e17fab1c35e4c3caf96b60e181 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 25 Sep 2023 16:39:32 +0200 Subject: [PATCH] Generalize usage of CboRoaringBitmap codec to ease the use --- milli/src/index.rs | 8 ++--- milli/src/search/new/db_cache.rs | 8 ++--- milli/src/update/delete_documents.rs | 4 +-- .../extract/extract_docid_word_positions.rs | 6 ++++ .../extract/extract_word_docids.rs | 35 +++++-------------- .../extract_word_pair_proximity_docids.rs | 4 ++- .../extract/extract_word_position_docids.rs | 5 +-- .../src/update/index_documents/extract/mod.rs | 2 +- milli/src/update/index_documents/mod.rs | 6 ++-- .../src/update/index_documents/typed_chunk.rs | 4 +-- milli/src/update/word_prefix_docids.rs | 16 ++++----- 11 files changed, 44 insertions(+), 54 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index d563f852b..288223a95 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -119,16 +119,16 @@ pub struct Index { pub(crate) main: PolyDatabase, /// A word and all the documents ids containing the word. - pub word_docids: Database, + pub word_docids: Database, /// A word and all the documents ids containing the word, from attributes for which typos are not allowed. - pub exact_word_docids: Database, + pub exact_word_docids: Database, /// A prefix of word and all the documents ids containing this prefix. - pub word_prefix_docids: Database, + pub word_prefix_docids: Database, /// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed. - pub exact_word_prefix_docids: Database, + pub exact_word_prefix_docids: Database, /// Maps the proximity between a pair of words with all the docids where this relation appears. pub word_pair_proximity_docids: Database, diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index e0a2ba3cf..3f4751185 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -168,7 +168,7 @@ impl<'ctx> SearchContext<'ctx> { merge_cbo_roaring_bitmaps, ) } - None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>( + None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( self.txn, word, self.word_interner.get(word).as_str(), @@ -182,7 +182,7 @@ impl<'ctx> SearchContext<'ctx> { &mut self, word: Interned, ) -> Result> { - DatabaseCache::get_value::<_, _, RoaringBitmapCodec>( + DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( self.txn, word, self.word_interner.get(word).as_str(), @@ -230,7 +230,7 @@ impl<'ctx> SearchContext<'ctx> { merge_cbo_roaring_bitmaps, ) } - None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>( + None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( self.txn, prefix, self.word_interner.get(prefix).as_str(), @@ -244,7 +244,7 @@ impl<'ctx> SearchContext<'ctx> { &mut self, prefix: Interned, ) -> Result> { - DatabaseCache::get_value::<_, _, RoaringBitmapCodec>( + DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( self.txn, prefix, self.word_interner.get(prefix).as_str(), diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index e23714530..a680bdf3f 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -495,7 +495,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { fn remove_from_word_prefix_docids( txn: &mut heed::RwTxn, - db: &Database, + db: &Database, to_remove: &RoaringBitmap, ) -> Result>> { let mut prefixes_to_delete = fst::SetBuilder::memory(); @@ -523,7 +523,7 @@ fn remove_from_word_prefix_docids( fn remove_from_word_docids( txn: &mut heed::RwTxn, - db: &heed::Database, + db: &heed::Database, to_remove: &RoaringBitmap, words_to_keep: &mut BTreeSet, words_to_remove: &mut BTreeSet, diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index ff25caa1a..2fb162f58 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -106,6 +106,12 @@ pub fn extract_docid_word_positions( if let Some(stop_words) = stop_words { tokenizer_builder.stop_words(stop_words); } + if let Some(dictionary) = dictionary { + tokenizer_builder.words_dict(dictionary); + } + if let Some(separators) = allowed_separators { + tokenizer_builder.separators(separators); + } tokenizer_builder.allow_list(&script_language); let tokenizer = tokenizer_builder.build(); diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index e5cf918dd..2e70b4bf5 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -8,7 +8,7 @@ use obkv::KvReaderU16; use roaring::RoaringBitmap; use super::helpers::{ - create_sorter, create_writer, merge_roaring_bitmaps, serialize_roaring_bitmap, + create_sorter, create_writer, merge_cbo_roaring_bitmaps, serialize_roaring_bitmap, sorter_into_reader, try_split_array_at, writer_into_reader, GrenadParameters, }; use crate::error::SerializationError; @@ -36,15 +36,12 @@ pub fn extract_word_docids( let mut word_fid_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, - merge_roaring_bitmaps, + merge_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, max_memory.map(|x| x / 3), ); - - let mut current_document_id = None; - let mut fid = 0; let mut key_buffer = Vec::new(); let mut value_buffer = Vec::new(); let mut words = BTreeSet::new(); @@ -55,28 +52,12 @@ pub fn extract_word_docids( let (fid_bytes, _) = try_split_array_at(fid_bytes) .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; let document_id = u32::from_be_bytes(document_id_bytes); - fid = u16::from_be_bytes(fid_bytes); + let fid = u16::from_be_bytes(fid_bytes); - // drain the btreemaps when we change document. - if current_document_id.map_or(false, |id| id != document_id) { - words_into_sorter( - document_id, - fid, - &mut key_buffer, - &mut value_buffer, - &mut words, - &mut word_fid_docids_sorter, - )?; - } - - current_document_id = Some(document_id); for (_pos, word) in KvReaderU16::new(&value).iter() { words.insert(word.to_vec()); } - } - // We must make sure that don't lose the current document field id - if let Some(document_id) = current_document_id { words_into_sorter( document_id, fid, @@ -85,11 +66,13 @@ pub fn extract_word_docids( &mut words, &mut word_fid_docids_sorter, )?; + + words.clear(); } let mut word_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, - merge_roaring_bitmaps, + merge_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, @@ -98,7 +81,7 @@ pub fn extract_word_docids( let mut exact_word_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, - merge_roaring_bitmaps, + merge_cbo_roaring_bitmaps, indexer.chunk_compression_type, indexer.chunk_compression_level, indexer.max_nb_chunks, @@ -142,15 +125,13 @@ fn words_into_sorter( word_fid_docids_sorter: &mut grenad::Sorter, ) -> Result<()> { puffin::profile_function!(); - let bitmap = RoaringBitmap::from_iter(Some(document_id)); - serialize_roaring_bitmap(&bitmap, value_buffer)?; for word_bytes in words.iter() { key_buffer.clear(); key_buffer.extend_from_slice(&word_bytes); key_buffer.push(0); key_buffer.extend_from_slice(&fid.to_be_bytes()); - word_fid_docids_sorter.insert(&key_buffer, &value_buffer)?; + word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; } words.clear(); diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index 9fd8c0dd9..7c5155320 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -56,7 +56,7 @@ pub fn extract_word_pair_proximity_docids( } document_word_positions_into_sorter( - document_id, + current_document_id.unwrap(), &word_pair_proximity, &mut word_pair_proximity_docids_sorter, )?; @@ -64,6 +64,8 @@ pub fn extract_word_pair_proximity_docids( word_positions.clear(); } + current_document_id = Some(document_id); + for (position, word) in KvReaderU16::new(&value).iter() { // drain the proximity window until the head word is considered close to the word we are inserting. while word_positions.get(0).map_or(false, |(_w, p)| { diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index 638052b37..7e336a150 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -35,7 +35,7 @@ pub fn extract_word_position_docids( ); let mut word_positions: HashSet<(u16, Vec)> = HashSet::new(); - let mut current_document_id = None; + let mut current_document_id: Option = None; let mut key_buffer = Vec::new(); let mut cursor = docid_word_positions.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { @@ -49,7 +49,8 @@ pub fn extract_word_position_docids( key_buffer.extend_from_slice(word_bytes); key_buffer.push(0); key_buffer.extend_from_slice(&position.to_be_bytes()); - word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; + word_position_docids_sorter + .insert(&key_buffer, current_document_id.unwrap().to_ne_bytes())?; } word_positions.clear(); } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 1e1b3d3e2..aa69ac8e3 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -181,7 +181,7 @@ pub(crate) fn data_from_obkv_documents( indexer, lmdb_writer_sx.clone(), move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes), - merge_roaring_bitmaps, + merge_cbo_roaring_bitmaps, |(word_docids_reader, exact_word_docids_reader, word_fid_docids_reader)| { TypedChunk::WordDocids { word_docids_reader, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 58219f28c..22e42937f 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -38,7 +38,7 @@ use crate::update::{ self, DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst, }; -use crate::{Index, Result, RoaringBitmapCodec}; +use crate::{CboRoaringBitmapCodec, Index, Result, RoaringBitmapCodec}; static MERGED_DATABASE_COUNT: usize = 7; static PREFIX_DATABASE_COUNT: usize = 5; @@ -700,8 +700,8 @@ where fn execute_word_prefix_docids( txn: &mut heed::RwTxn, reader: grenad::Reader>, - word_docids_db: Database, - word_prefix_docids_db: Database, + word_docids_db: Database, + word_prefix_docids_db: Database, indexer_config: &IndexerConfig, new_prefix_fst_words: &[String], common_prefix_fst_words: &[&[String]], diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index c3f7680e8..0da577eda 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -156,7 +156,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, |value, _buffer| Ok(value), - merge_roaring_bitmaps, + merge_cbo_roaring_bitmaps, )?; let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?; @@ -166,7 +166,7 @@ pub(crate) fn write_typed_chunk_into_index( wtxn, index_is_empty, |value, _buffer| Ok(value), - merge_roaring_bitmaps, + merge_cbo_roaring_bitmaps, )?; let word_fid_docids_iter = unsafe { as_cloneable_grenad(&word_fid_docids_reader) }?; diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index a30254994..980bab01a 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -5,15 +5,15 @@ use heed::types::{ByteSlice, Str}; use heed::Database; use crate::update::index_documents::{ - create_sorter, merge_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, + create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, CursorClonableMmap, MergeFn, }; -use crate::{Result, RoaringBitmapCodec}; +use crate::{CboRoaringBitmapCodec, Result, RoaringBitmapCodec}; pub struct WordPrefixDocids<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, - word_docids: Database, - word_prefix_docids: Database, + word_docids: Database, + word_prefix_docids: Database, pub(crate) chunk_compression_type: CompressionType, pub(crate) chunk_compression_level: Option, pub(crate) max_nb_chunks: Option, @@ -23,8 +23,8 @@ pub struct WordPrefixDocids<'t, 'u, 'i> { impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { pub fn new( wtxn: &'t mut heed::RwTxn<'i, 'u>, - word_docids: Database, - word_prefix_docids: Database, + word_docids: Database, + word_prefix_docids: Database, ) -> WordPrefixDocids<'t, 'u, 'i> { WordPrefixDocids { wtxn, @@ -51,7 +51,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { // and write into it at the same time, therefore we write into another file. let mut prefix_docids_sorter = create_sorter( grenad::SortAlgorithm::Unstable, - merge_roaring_bitmaps, + merge_cbo_roaring_bitmaps, self.chunk_compression_type, self.chunk_compression_level, self.max_nb_chunks, @@ -115,7 +115,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { self.wtxn, *self.word_prefix_docids.as_polymorph(), prefix_docids_sorter, - merge_roaring_bitmaps, + merge_cbo_roaring_bitmaps, )?; Ok(())