From 0a77be4ec02f29df26242a6ffa7a94ddcb3b0724 Mon Sep 17 00:00:00 2001 From: ad hoc Date: Thu, 24 Mar 2022 15:22:57 +0100 Subject: [PATCH] introduce exact_word_docids db --- milli/src/index.rs | 9 ++- milli/src/update/clear_documents.rs | 2 + milli/src/update/delete_documents.rs | 71 +++++++++++++------ .../extract/extract_word_docids.rs | 12 +++- .../src/update/index_documents/extract/mod.rs | 7 +- .../index_documents/helpers/grenad_helpers.rs | 5 ++ .../src/update/index_documents/helpers/mod.rs | 2 +- milli/src/update/index_documents/mod.rs | 18 +++-- .../src/update/index_documents/typed_chunk.rs | 49 +++++++++---- milli/src/update/word_prefix_docids.rs | 5 +- 10 files changed, 133 insertions(+), 47 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index f4e17d93c..8f9c9beb7 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -59,6 +59,7 @@ pub mod main_key { pub mod db_name { pub const MAIN: &str = "main"; pub const WORD_DOCIDS: &str = "word-docids"; + pub const EXACT_WORD_DOCIDS: &str = "exact-word-docids"; pub const WORD_PREFIX_DOCIDS: &str = "word-prefix-docids"; pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions"; pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids"; @@ -83,6 +84,10 @@ pub struct Index { /// A word and all the documents ids containing the word. pub word_docids: Database, + + /// A word and all the documents ids containing the word, from attributes for which typos are not allowed. + pub exact_word_docids: Database, + /// A prefix of word and all the documents ids containing this prefix. 
pub word_prefix_docids: Database, @@ -119,12 +124,13 @@ impl Index { pub fn new>(mut options: heed::EnvOpenOptions, path: P) -> Result { use db_name::*; - options.max_dbs(14); + options.max_dbs(15); unsafe { options.flag(Flags::MdbAlwaysFreePages) }; let env = options.open(path)?; let main = env.create_poly_database(Some(MAIN))?; let word_docids = env.create_database(Some(WORD_DOCIDS))?; + let exact_word_docids = env.create_database(Some(EXACT_WORD_DOCIDS))?; let word_prefix_docids = env.create_database(Some(WORD_PREFIX_DOCIDS))?; let docid_word_positions = env.create_database(Some(DOCID_WORD_POSITIONS))?; let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?; @@ -146,6 +152,7 @@ impl Index { env, main, word_docids, + exact_word_docids, word_prefix_docids, docid_word_positions, word_pair_proximity_docids, diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 644547b91..57c0969c7 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -19,6 +19,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { env: _env, main: _main, word_docids, + exact_word_docids, word_prefix_docids, docid_word_positions, word_pair_proximity_docids, @@ -55,6 +56,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { // Clear the other databases. 
word_docids.clear(self.wtxn)?; + exact_word_docids.clear(self.wtxn)?; word_prefix_docids.clear(self.wtxn)?; docid_word_positions.clear(self.wtxn)?; word_pair_proximity_docids.clear(self.wtxn)?; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 402cc61dd..46a4721c0 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -2,7 +2,7 @@ use std::collections::btree_map::Entry; use std::collections::HashMap; use fst::IntoStreamer; -use heed::types::ByteSlice; +use heed::types::{ByteSlice, Str}; use heed::{BytesDecode, BytesEncode}; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; @@ -16,7 +16,10 @@ use crate::heed_codec::facet::{ }; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; -use crate::{DocumentId, ExternalDocumentsIds, FieldId, Index, Result, SmallString32, BEU32}; +use crate::{ + DocumentId, ExternalDocumentsIds, FieldId, Index, Result, RoaringBitmapCodec, SmallString32, + BEU32, +}; pub struct DeleteDocuments<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -108,6 +111,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { env: _env, main: _main, word_docids, + exact_word_docids, word_prefix_docids, docid_word_positions, word_pair_proximity_docids, @@ -204,25 +208,21 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // We iterate over the words and delete the documents ids // from the word docids database. for (word, must_remove) in &mut words { - // We create an iterator to be able to get the content and delete the word docids. - // It's faster to acquire a cursor to get and delete or put, as we avoid traversing - // the LMDB B-Tree two times but only once. - let mut iter = word_docids.prefix_iter_mut(self.wtxn, &word)?; - if let Some((key, mut docids)) = iter.next().transpose()? 
{ - if key == word.as_str() { - let previous_len = docids.len(); - docids -= &self.documents_ids; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - *must_remove = true; - } else if docids.len() != previous_len { - let key = key.to_owned(); - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&key, &docids)? }; - } - } - } + remove_from_word_docids( + self.wtxn, + word_docids, + word.as_str(), + must_remove, + &self.documents_ids, + )?; + + remove_from_word_docids( + self.wtxn, + exact_word_docids, + word.as_str(), + must_remove, + &self.documents_ids, + )?; } // We construct an FST set that contains the words to delete from the words FST. @@ -457,6 +457,35 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } } +fn remove_from_word_docids( + txn: &mut heed::RwTxn, + db: &heed::Database, + word: &str, + must_remove: &mut bool, + to_remove: &RoaringBitmap, +) -> Result<()> { + // We create an iterator to be able to get the content and delete the word docids. + // It's faster to acquire a cursor to get and delete or put, as we avoid traversing + // the LMDB B-Tree two times but only once. + let mut iter = db.prefix_iter_mut(txn, &word)?; + if let Some((key, mut docids)) = iter.next().transpose()? { + if key == word { + let previous_len = docids.len(); + docids -= to_remove; + if docids.is_empty() { + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; + *must_remove = true; + } else if docids.len() != previous_len { + let key = key.to_owned(); + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&key, &docids)? 
}; + } + } + } + Ok(()) +} + fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F, DC, V>( wtxn: &'a mut heed::RwTxn, db: &heed::Database, diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs index 80d68298a..03bfada21 100644 --- a/milli/src/update/index_documents/extract/extract_word_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_docids.rs @@ -10,17 +10,21 @@ use super::helpers::{ }; use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; +use crate::update::index_documents::MergeFn; use crate::Result; /// Extracts the word and the documents ids where this word appear. /// /// Returns a grenad reader with the list of extracted words and /// documents ids from the given chunk of docid word positions. +/// +/// The first returned reader is the one for normal word_docids, and the second one is for +/// exact_word_docids. #[logging_timer::time] pub fn extract_word_docids( docid_word_positions: grenad::Reader, indexer: GrenadParameters, -) -> Result> { +) -> Result<(grenad::Reader, grenad::Reader)> { let max_memory = indexer.max_memory_by_thread(); let mut word_docids_sorter = create_sorter( @@ -43,5 +47,9 @@ pub fn extract_word_docids( word_docids_sorter.insert(word_bytes, &value_buffer)?; } - sorter_into_reader(word_docids_sorter, indexer) + let empty_sorter = grenad::Sorter::new(merge_roaring_bitmaps as MergeFn); + Ok(( + sorter_into_reader(word_docids_sorter, indexer)?, + sorter_into_reader(empty_sorter, indexer)?, + )) } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 100431237..4e7f211ce 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -86,13 +86,16 @@ pub(crate) fn data_from_obkv_documents( "field-id-wordcount-docids", ); - spawn_extraction_task::<_, _, Vec>>( + 
spawn_extraction_task::<_, _, Vec<(grenad::Reader, grenad::Reader)>>( docid_word_positions_chunks.clone(), indexer.clone(), lmdb_writer_sx.clone(), extract_word_docids, merge_roaring_bitmaps, - TypedChunk::WordDocids, + |(word_docids_reader, exact_word_docids_reader)| TypedChunk::WordDocids { + word_docids_reader, + exact_word_docids_reader, + }, "word-docids", ); diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index fc28860b2..fb5242910 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -277,3 +277,8 @@ pub fn sorter_into_lmdb_database( debug!("MTBL sorter writen in {:.02?}!", before.elapsed()); Ok(()) } + +/// Used when trying to merge readers, but you don't actually care about the values. +pub fn merge_nothing<'a>(_key: &[u8], _values: &[Cow<'a, [u8]>]) -> Result> { + Ok(Cow::Owned(Vec::new())) +} diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index f4940af1d..4642bcf14 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -8,7 +8,7 @@ use std::convert::{TryFrom, TryInto}; pub use clonable_mmap::{ClonableMmap, CursorClonableMmap}; use fst::{IntoStreamer, Streamer}; pub use grenad_helpers::{ - as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, + as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, merge_nothing, sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, writer_into_reader, GrenadParameters, MergeableReader, }; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 2d3004444..633b72cc9 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -20,7 +20,7 @@ pub use self::helpers::{ fst_stream_into_vec, 
merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, sorter_into_lmdb_database, write_into_lmdb_database, writer_into_reader, ClonableMmap, MergeFn, }; -use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; +use self::helpers::{grenad_obkv_into_chunks, merge_nothing, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; use crate::documents::DocumentBatchReader; pub use crate::update::index_documents::helpers::CursorClonableMmap; @@ -282,6 +282,7 @@ where let mut word_pair_proximity_docids = None; let mut word_position_docids = None; let mut word_docids = None; + let mut _exact_word_docids = None; let mut databases_seen = 0; (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase { @@ -291,10 +292,13 @@ where for result in lmdb_writer_rx { let typed_chunk = match result? { - TypedChunk::WordDocids(chunk) => { - let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; + TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => { + let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? }; word_docids = Some(cloneable_chunk); - TypedChunk::WordDocids(chunk) + let cloneable_chunk = + unsafe { as_cloneable_grenad(&exact_word_docids_reader)? }; + _exact_word_docids = Some(cloneable_chunk); + TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } } TypedChunk::WordPairProximityDocids(chunk) => { let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? }; @@ -425,6 +429,10 @@ where }); if let Some(word_docids) = word_docids { + let mut word_docids_builder = grenad::MergerBuilder::new(merge_nothing as MergeFn); + word_docids_builder.push(word_docids.into_cursor()?); + // TODO: push exact_word_docids + let word_docids_iter = word_docids_builder.build().into_stream_merger_iter()?; // Run the word prefix docids update operation. 
let mut builder = WordPrefixDocids::new(self.wtxn, self.index); builder.chunk_compression_type = self.indexer_config.chunk_compression_type; @@ -432,7 +440,7 @@ where builder.max_nb_chunks = self.indexer_config.max_nb_chunks; builder.max_memory = self.indexer_config.max_memory; builder.execute( - word_docids, + word_docids_iter, &new_prefix_fst_words, &common_prefix_fst_words, &del_prefix_fst_words, diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 77ea31138..be440114f 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -3,14 +3,16 @@ use std::convert::TryInto; use std::fs::File; use std::io; +use grenad::MergerBuilder; use heed::types::ByteSlice; use heed::{BytesDecode, RwTxn}; use roaring::RoaringBitmap; use super::helpers::{ - self, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, valid_lmdb_key, + self, merge_nothing, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, valid_lmdb_key, CursorClonableMmap, }; +use super::{ClonableMmap, MergeFn}; use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string}; use crate::update::index_documents::helpers::as_cloneable_grenad; use crate::{ @@ -25,7 +27,10 @@ pub(crate) enum TypedChunk { Documents(grenad::Reader), FieldIdWordcountDocids(grenad::Reader), NewDocumentsIds(RoaringBitmap), - WordDocids(grenad::Reader), + WordDocids { + word_docids_reader: grenad::Reader, + exact_word_docids_reader: grenad::Reader, + }, WordPositionDocids(grenad::Reader), WordPairProximityDocids(grenad::Reader), FieldIdFacetStringDocids(grenad::Reader), @@ -86,8 +91,8 @@ pub(crate) fn write_typed_chunk_into_index( TypedChunk::NewDocumentsIds(documents_ids) => { return Ok((documents_ids, is_merged_database)) } - TypedChunk::WordDocids(word_docids_iter) => { - let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_iter) }?; + TypedChunk::WordDocids { word_docids_reader, 
exact_word_docids_reader } => { + let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_reader) }?; append_entries_into_database( word_docids_iter.clone(), &index.word_docids, @@ -97,15 +102,18 @@ pub(crate) fn write_typed_chunk_into_index( merge_roaring_bitmaps, )?; + let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?; + append_entries_into_database( + exact_word_docids_iter.clone(), + &index.exact_word_docids, + wtxn, + index_is_empty, + |value, _buffer| Ok(value), + merge_roaring_bitmaps, + )?; + // create fst from word docids - let mut builder = fst::SetBuilder::memory(); - let mut cursor = word_docids_iter.into_cursor()?; - while let Some((word, _value)) = cursor.move_on_next()? { - // This is a lexicographically ordered word position - // we use the key to construct the words fst. - builder.insert(word)?; - } - let fst = builder.into_set().map_data(std::borrow::Cow::Owned)?; + let fst = merge_word_docids_reader_into_fst(word_docids_iter, exact_word_docids_iter)?; let db_fst = index.words_fst(wtxn)?; // merge new fst with database fst @@ -214,6 +222,23 @@ pub(crate) fn write_typed_chunk_into_index( Ok((RoaringBitmap::new(), is_merged_database)) } +fn merge_word_docids_reader_into_fst( + word_docids_iter: grenad::Reader>, + exact_word_docids_iter: grenad::Reader>, +) -> Result>> { + let mut merger_builder = MergerBuilder::new(merge_nothing as MergeFn); + merger_builder.push(word_docids_iter.into_cursor()?); + merger_builder.push(exact_word_docids_iter.into_cursor()?); + let mut iter = merger_builder.build().into_stream_merger_iter()?; + let mut builder = fst::SetBuilder::memory(); + + while let Some((k, _)) = iter.next()? 
{ + builder.insert(k)?; + } + + Ok(builder.into_set()) +} + fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec) -> Result<()> { let new_value = RoaringBitmap::deserialize_from(new_value)?; let db_value = RoaringBitmap::deserialize_from(db_value)?; diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs index 076816f09..4114f8baf 100644 --- a/milli/src/update/word_prefix_docids.rs +++ b/milli/src/update/word_prefix_docids.rs @@ -35,7 +35,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { #[logging_timer::time("WordPrefixDocids::{}")] pub fn execute( self, - new_word_docids: grenad::Reader, + mut new_word_docids_iter: grenad::MergerIter, new_prefix_fst_words: &[String], common_prefix_fst_words: &[&[String]], del_prefix_fst_words: &HashSet>, @@ -51,10 +51,9 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> { ); if !common_prefix_fst_words.is_empty() { - let mut new_word_docids_iter = new_word_docids.into_cursor()?; let mut current_prefixes: Option<&&[String]> = None; let mut prefixes_cache = HashMap::new(); - while let Some((word, data)) = new_word_docids_iter.move_on_next()? { + while let Some((word, data)) = new_word_docids_iter.next()? { current_prefixes = match current_prefixes.take() { Some(prefixes) if word.starts_with(&prefixes[0].as_bytes()) => Some(prefixes), _otherwise => {