introduce exact_word_docids db

ad hoc 2022-03-24 15:22:57 +01:00
parent 5f9f82757d
commit 0a77be4ec0
10 changed files with 133 additions and 47 deletions

View File

@@ -59,6 +59,7 @@ pub mod main_key {
 pub mod db_name {
     pub const MAIN: &str = "main";
     pub const WORD_DOCIDS: &str = "word-docids";
+    pub const EXACT_WORD_DOCIDS: &str = "exact-word-docids";
     pub const WORD_PREFIX_DOCIDS: &str = "word-prefix-docids";
     pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions";
     pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids";
@@ -83,6 +84,10 @@ pub struct Index {
     /// A word and all the documents ids containing the word.
     pub word_docids: Database<Str, RoaringBitmapCodec>,
+
+    /// A word and all the documents ids containing the word, from attributes for which typos are not allowed.
+    pub exact_word_docids: Database<Str, RoaringBitmapCodec>,

     /// A prefix of word and all the documents ids containing this prefix.
     pub word_prefix_docids: Database<Str, RoaringBitmapCodec>,
@@ -119,12 +124,13 @@ impl Index {
     pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> Result<Index> {
         use db_name::*;
-        options.max_dbs(14);
+        options.max_dbs(15);
         unsafe { options.flag(Flags::MdbAlwaysFreePages) };
         let env = options.open(path)?;
         let main = env.create_poly_database(Some(MAIN))?;
         let word_docids = env.create_database(Some(WORD_DOCIDS))?;
+        let exact_word_docids = env.create_database(Some(EXACT_WORD_DOCIDS))?;
         let word_prefix_docids = env.create_database(Some(WORD_PREFIX_DOCIDS))?;
         let docid_word_positions = env.create_database(Some(DOCID_WORD_POSITIONS))?;
         let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
@@ -146,6 +152,7 @@ impl Index {
             env,
             main,
             word_docids,
+            exact_word_docids,
             word_prefix_docids,
             docid_word_positions,
             word_pair_proximity_docids,
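
Note: the new database is a twin of `word_docids`, so reads compose naturally. A minimal usage sketch, assuming heed's `Database::get` and a read transaction (the `word_documents` helper below is hypothetical, not part of this commit):

use roaring::RoaringBitmap;

// Hypothetical helper: all documents containing `word`, whether it was
// indexed in word_docids or in exact_word_docids.
fn word_documents(index: &Index, rtxn: &heed::RoTxn, word: &str) -> heed::Result<RoaringBitmap> {
    let regular = index.word_docids.get(rtxn, word)?.unwrap_or_default();
    let exact = index.exact_word_docids.get(rtxn, word)?.unwrap_or_default();
    Ok(regular | exact) // union of the two posting lists
}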

View File

@@ -19,6 +19,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
             env: _env,
             main: _main,
             word_docids,
+            exact_word_docids,
             word_prefix_docids,
             docid_word_positions,
             word_pair_proximity_docids,
@@ -55,6 +56,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
         // Clear the other databases.
         word_docids.clear(self.wtxn)?;
+        exact_word_docids.clear(self.wtxn)?;
         word_prefix_docids.clear(self.wtxn)?;
         docid_word_positions.clear(self.wtxn)?;
         word_pair_proximity_docids.clear(self.wtxn)?;

View File

@@ -2,7 +2,7 @@ use std::collections::btree_map::Entry;
 use std::collections::HashMap;

 use fst::IntoStreamer;
-use heed::types::ByteSlice;
+use heed::types::{ByteSlice, Str};
 use heed::{BytesDecode, BytesEncode};
 use roaring::RoaringBitmap;
 use serde::{Deserialize, Serialize};
@@ -16,7 +16,10 @@ use crate::heed_codec::facet::{
 };
 use crate::heed_codec::CboRoaringBitmapCodec;
 use crate::index::{db_name, main_key};
-use crate::{DocumentId, ExternalDocumentsIds, FieldId, Index, Result, SmallString32, BEU32};
+use crate::{
+    DocumentId, ExternalDocumentsIds, FieldId, Index, Result, RoaringBitmapCodec, SmallString32,
+    BEU32,
+};

 pub struct DeleteDocuments<'t, 'u, 'i> {
     wtxn: &'t mut heed::RwTxn<'i, 'u>,
@@ -108,6 +111,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             env: _env,
             main: _main,
             word_docids,
+            exact_word_docids,
             word_prefix_docids,
             docid_word_positions,
             word_pair_proximity_docids,
@@ -204,25 +208,21 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
         // We iterate over the words and delete the documents ids
        // from the word docids database.
         for (word, must_remove) in &mut words {
-            // We create an iterator to be able to get the content and delete the word docids.
-            // It's faster to acquire a cursor to get and delete or put, as we avoid traversing
-            // the LMDB B-Tree two times but only once.
-            let mut iter = word_docids.prefix_iter_mut(self.wtxn, &word)?;
-            if let Some((key, mut docids)) = iter.next().transpose()? {
-                if key == word.as_str() {
-                    let previous_len = docids.len();
-                    docids -= &self.documents_ids;
-                    if docids.is_empty() {
-                        // safety: we don't keep references from inside the LMDB database.
-                        unsafe { iter.del_current()? };
-                        *must_remove = true;
-                    } else if docids.len() != previous_len {
-                        let key = key.to_owned();
-                        // safety: we don't keep references from inside the LMDB database.
-                        unsafe { iter.put_current(&key, &docids)? };
-                    }
-                }
-            }
+            remove_from_word_docids(
+                self.wtxn,
+                word_docids,
+                word.as_str(),
+                must_remove,
+                &self.documents_ids,
+            )?;
+
+            remove_from_word_docids(
+                self.wtxn,
+                exact_word_docids,
+                word.as_str(),
+                must_remove,
+                &self.documents_ids,
+            )?;
         }

         // We construct an FST set that contains the words to delete from the words FST.
@@ -457,6 +457,35 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
     }
 }

+fn remove_from_word_docids(
+    txn: &mut heed::RwTxn,
+    db: &heed::Database<Str, RoaringBitmapCodec>,
+    word: &str,
+    must_remove: &mut bool,
+    to_remove: &RoaringBitmap,
+) -> Result<()> {
+    // We create an iterator to be able to get the content and delete the word docids.
+    // It's faster to acquire a cursor to get and delete or put, as we avoid traversing
+    // the LMDB B-Tree two times but only once.
+    let mut iter = db.prefix_iter_mut(txn, &word)?;
+    if let Some((key, mut docids)) = iter.next().transpose()? {
+        if key == word {
+            let previous_len = docids.len();
+            docids -= to_remove;
+            if docids.is_empty() {
+                // safety: we don't keep references from inside the LMDB database.
+                unsafe { iter.del_current()? };
+                *must_remove = true;
+            } else if docids.len() != previous_len {
+                let key = key.to_owned();
+                // safety: we don't keep references from inside the LMDB database.
+                unsafe { iter.put_current(&key, &docids)? };
+            }
+        }
+    }
+
+    Ok(())
+}
+
 fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F, DC, V>(
     wtxn: &'a mut heed::RwTxn,
     db: &heed::Database<C, DC>,
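
Note: the cursor in `remove_from_word_docids` exists to avoid a second B-tree traversal. For contrast, a naive variant written against heed's plain `get`/`delete`/`put` API would look like this (a sketch, not code from this commit; it assumes the crate's usual `Result`):

// Naive variant, for illustration only: `get` walks the B-tree once and
// `delete`/`put` walks it again, which the cursor-based version avoids.
fn remove_from_word_docids_naive(
    txn: &mut heed::RwTxn,
    db: &heed::Database<Str, RoaringBitmapCodec>,
    word: &str,
    must_remove: &mut bool,
    to_remove: &RoaringBitmap,
) -> Result<()> {
    if let Some(mut docids) = db.get(txn, word)? {
        let previous_len = docids.len();
        docids -= to_remove;
        if docids.is_empty() {
            db.delete(txn, word)?; // second traversal
            *must_remove = true;
        } else if docids.len() != previous_len {
            db.put(txn, word, &docids)?; // second traversal
        }
    }
    Ok(())
}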

View File

@@ -10,17 +10,21 @@ use super::helpers::{
 };
 use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
+use crate::update::index_documents::MergeFn;
 use crate::Result;

 /// Extracts the word and the documents ids where this word appear.
 ///
 /// Returns a grenad reader with the list of extracted words and
 /// documents ids from the given chunk of docid word positions.
+///
+/// The first returned reader is the one for the normal word_docids, and the second one is for
+/// the exact_word_docids.
 #[logging_timer::time]
 pub fn extract_word_docids<R: io::Read + io::Seek>(
     docid_word_positions: grenad::Reader<R>,
     indexer: GrenadParameters,
-) -> Result<grenad::Reader<File>> {
+) -> Result<(grenad::Reader<File>, grenad::Reader<File>)> {
     let max_memory = indexer.max_memory_by_thread();

     let mut word_docids_sorter = create_sorter(
@@ -43,5 +47,9 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
         word_docids_sorter.insert(word_bytes, &value_buffer)?;
     }

-    sorter_into_reader(word_docids_sorter, indexer)
+    let empty_sorter = grenad::Sorter::new(merge_roaring_bitmaps as MergeFn);
+    Ok((
+        sorter_into_reader(word_docids_sorter, indexer)?,
+        sorter_into_reader(empty_sorter, indexer)?,
+    ))
 }
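
Note: the second reader is built from an empty sorter here, so `exact_word_docids` receives no entries yet. Both sorters rely on the `MergeFn` calling convention; for reference, a roaring-bitmap union written against that signature could look like the sketch below (an illustration of the contract, not necessarily the crate's exact `merge_roaring_bitmaps`):

use std::borrow::Cow;
use roaring::RoaringBitmap;

// Sketch of a MergeFn: union every serialized bitmap sharing the same key.
fn merge_roaring_bitmaps_sketch<'a>(
    _key: &[u8],
    values: &[Cow<'a, [u8]>],
) -> Result<Cow<'a, [u8]>> {
    let mut merged = RoaringBitmap::new();
    for value in values {
        merged |= RoaringBitmap::deserialize_from(&value[..])?;
    }
    let mut buffer = Vec::with_capacity(merged.serialized_size());
    merged.serialize_into(&mut buffer)?;
    Ok(Cow::Owned(buffer))
}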

View File

@@ -86,13 +86,16 @@ pub(crate) fn data_from_obkv_documents(
         "field-id-wordcount-docids",
     );

-    spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
+    spawn_extraction_task::<_, _, Vec<(grenad::Reader<File>, grenad::Reader<File>)>>(
         docid_word_positions_chunks.clone(),
         indexer.clone(),
         lmdb_writer_sx.clone(),
         extract_word_docids,
         merge_roaring_bitmaps,
-        TypedChunk::WordDocids,
+        |(word_docids_reader, exact_word_docids_reader)| TypedChunk::WordDocids {
+            word_docids_reader,
+            exact_word_docids_reader,
+        },
         "word-docids",
     );

View File

@@ -277,3 +277,8 @@ pub fn sorter_into_lmdb_database(
     debug!("MTBL sorter writen in {:.02?}!", before.elapsed());
     Ok(())
 }
+
+/// Used when trying to merge readers, but you don't actually care about the values.
+pub fn merge_nothing<'a>(_key: &[u8], _values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
+    Ok(Cow::Owned(Vec::new()))
+}
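
Note: `merge_nothing` pairs with grenad's merger whenever only the keys matter, as in the fst construction later in this commit. A self-contained sketch (the `for_each_key` helper is hypothetical, assuming the crate's usual `Result` and `MergeFn`):

use std::io;

use grenad::MergerBuilder;

// Sketch: visit the sorted union of the keys of two readers, ignoring values.
fn for_each_key<R: io::Read + io::Seek>(
    first: grenad::Reader<R>,
    second: grenad::Reader<R>,
    mut f: impl FnMut(&[u8]),
) -> Result<()> {
    let mut builder = MergerBuilder::new(merge_nothing as MergeFn);
    builder.push(first.into_cursor()?);
    builder.push(second.into_cursor()?);
    let mut iter = builder.build().into_stream_merger_iter()?;
    while let Some((key, _value)) = iter.next()? {
        f(key);
    }
    Ok(())
}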

View File

@@ -8,7 +8,7 @@ use std::convert::{TryFrom, TryInto};
 pub use clonable_mmap::{ClonableMmap, CursorClonableMmap};
 use fst::{IntoStreamer, Streamer};
 pub use grenad_helpers::{
-    as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks,
+    as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, merge_nothing,
     sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, writer_into_reader,
     GrenadParameters, MergeableReader,
 };

View File

@@ -20,7 +20,7 @@ pub use self::helpers::{
     fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
     sorter_into_lmdb_database, write_into_lmdb_database, writer_into_reader, ClonableMmap, MergeFn,
 };
-use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
+use self::helpers::{grenad_obkv_into_chunks, merge_nothing, GrenadParameters};
 pub use self::transform::{Transform, TransformOutput};
 use crate::documents::DocumentBatchReader;
 pub use crate::update::index_documents::helpers::CursorClonableMmap;
@@ -282,6 +282,7 @@ where
         let mut word_pair_proximity_docids = None;
         let mut word_position_docids = None;
         let mut word_docids = None;
+        let mut _exact_word_docids = None;

         let mut databases_seen = 0;
         (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
@@ -291,10 +292,13 @@ where
         for result in lmdb_writer_rx {
             let typed_chunk = match result? {
-                TypedChunk::WordDocids(chunk) => {
-                    let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
+                TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => {
+                    let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? };
                     word_docids = Some(cloneable_chunk);
-                    TypedChunk::WordDocids(chunk)
+                    let cloneable_chunk =
+                        unsafe { as_cloneable_grenad(&exact_word_docids_reader)? };
+                    _exact_word_docids = Some(cloneable_chunk);
+                    TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader }
                 }
                 TypedChunk::WordPairProximityDocids(chunk) => {
                     let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
@@ -425,6 +429,10 @@ where
         });

         if let Some(word_docids) = word_docids {
+            let mut word_docids_builder = grenad::MergerBuilder::new(merge_nothing as MergeFn);
+            word_docids_builder.push(word_docids.into_cursor()?);
+            // TODO: push exact_word_docids
+            let word_docids_iter = word_docids_builder.build().into_stream_merger_iter()?;
             // Run the word prefix docids update operation.
             let mut builder = WordPrefixDocids::new(self.wtxn, self.index);
             builder.chunk_compression_type = self.indexer_config.chunk_compression_type;
@@ -432,7 +440,7 @@ where
             builder.max_nb_chunks = self.indexer_config.max_nb_chunks;
             builder.max_memory = self.indexer_config.max_memory;
             builder.execute(
-                word_docids,
+                word_docids_iter,
                 &new_prefix_fst_words,
                 &common_prefix_fst_words,
                 &del_prefix_fst_words,

View File

@@ -3,14 +3,16 @@ use std::convert::TryInto;
 use std::fs::File;
 use std::io;

+use grenad::MergerBuilder;
 use heed::types::ByteSlice;
 use heed::{BytesDecode, RwTxn};
 use roaring::RoaringBitmap;

 use super::helpers::{
-    self, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, valid_lmdb_key,
+    self, merge_nothing, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, valid_lmdb_key,
     CursorClonableMmap,
 };
+use super::{ClonableMmap, MergeFn};
 use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string};
 use crate::update::index_documents::helpers::as_cloneable_grenad;
 use crate::{
@@ -25,7 +27,10 @@ pub(crate) enum TypedChunk {
     Documents(grenad::Reader<CursorClonableMmap>),
     FieldIdWordcountDocids(grenad::Reader<File>),
     NewDocumentsIds(RoaringBitmap),
-    WordDocids(grenad::Reader<File>),
+    WordDocids {
+        word_docids_reader: grenad::Reader<File>,
+        exact_word_docids_reader: grenad::Reader<File>,
+    },
     WordPositionDocids(grenad::Reader<File>),
     WordPairProximityDocids(grenad::Reader<File>),
     FieldIdFacetStringDocids(grenad::Reader<File>),
@@ -86,8 +91,8 @@ pub(crate) fn write_typed_chunk_into_index(
         TypedChunk::NewDocumentsIds(documents_ids) => {
             return Ok((documents_ids, is_merged_database))
         }
-        TypedChunk::WordDocids(word_docids_iter) => {
-            let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_iter) }?;
+        TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => {
+            let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_reader) }?;
             append_entries_into_database(
                 word_docids_iter.clone(),
                 &index.word_docids,
@@ -97,15 +102,18 @@ pub(crate) fn write_typed_chunk_into_index(
                 merge_roaring_bitmaps,
             )?;

+            let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
+            append_entries_into_database(
+                exact_word_docids_iter.clone(),
+                &index.exact_word_docids,
+                wtxn,
+                index_is_empty,
+                |value, _buffer| Ok(value),
+                merge_roaring_bitmaps,
+            )?;
+
             // create fst from word docids
-            let mut builder = fst::SetBuilder::memory();
-            let mut cursor = word_docids_iter.into_cursor()?;
-            while let Some((word, _value)) = cursor.move_on_next()? {
-                // This is a lexicographically ordered word position
-                // we use the key to construct the words fst.
-                builder.insert(word)?;
-            }
-            let fst = builder.into_set().map_data(std::borrow::Cow::Owned)?;
+            let fst = merge_word_docids_reader_into_fst(word_docids_iter, exact_word_docids_iter)?;
             let db_fst = index.words_fst(wtxn)?;

             // merge new fst with database fst
@@ -214,6 +222,23 @@ pub(crate) fn write_typed_chunk_into_index(
     Ok((RoaringBitmap::new(), is_merged_database))
 }

+fn merge_word_docids_reader_into_fst(
+    word_docids_iter: grenad::Reader<io::Cursor<ClonableMmap>>,
+    exact_word_docids_iter: grenad::Reader<io::Cursor<ClonableMmap>>,
+) -> Result<fst::Set<Vec<u8>>> {
+    let mut merger_builder = MergerBuilder::new(merge_nothing as MergeFn);
+    merger_builder.push(word_docids_iter.into_cursor()?);
+    merger_builder.push(exact_word_docids_iter.into_cursor()?);
+    let mut iter = merger_builder.build().into_stream_merger_iter()?;
+
+    let mut builder = fst::SetBuilder::memory();
+    while let Some((k, _)) = iter.next()? {
+        builder.insert(k)?;
+    }
+
+    Ok(builder.into_set())
+}
+
 fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec<u8>) -> Result<()> {
     let new_value = RoaringBitmap::deserialize_from(new_value)?;
     let db_value = RoaringBitmap::deserialize_from(db_value)?;
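
Note: the "merge new fst with database fst" step above relies on fst's streaming set operations; schematically it behaves like this sketch (assuming the `fst` crate's `OpBuilder` union, not the exact code of this file):

use fst::{Set, SetBuilder, Streamer};

// Sketch: union a freshly-built words set with the set already stored in LMDB.
fn union_words_fsts(new: &Set<Vec<u8>>, db: &Set<Vec<u8>>) -> Result<Set<Vec<u8>>, fst::Error> {
    let mut stream = new.op().add(db).union();
    let mut builder = SetBuilder::memory();
    while let Some(word) = stream.next() {
        builder.insert(word)?; // the union streams keys in lexicographic order
    }
    Ok(builder.into_set())
}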

View File

@@ -35,7 +35,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
     #[logging_timer::time("WordPrefixDocids::{}")]
     pub fn execute(
         self,
-        new_word_docids: grenad::Reader<CursorClonableMmap>,
+        mut new_word_docids_iter: grenad::MergerIter<CursorClonableMmap, MergeFn>,
         new_prefix_fst_words: &[String],
         common_prefix_fst_words: &[&[String]],
         del_prefix_fst_words: &HashSet<Vec<u8>>,
@@ -51,10 +51,9 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
         );

         if !common_prefix_fst_words.is_empty() {
-            let mut new_word_docids_iter = new_word_docids.into_cursor()?;
             let mut current_prefixes: Option<&&[String]> = None;
             let mut prefixes_cache = HashMap::new();
-            while let Some((word, data)) = new_word_docids_iter.move_on_next()? {
+            while let Some((word, data)) = new_word_docids_iter.next()? {
                 current_prefixes = match current_prefixes.take() {
                     Some(prefixes) if word.starts_with(&prefixes[0].as_bytes()) => Some(prefixes),
                     _otherwise => {
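
Note: the loop above exploits the fact that the merged words and the prefix groups both arrive in lexicographic order: `current_prefixes` is kept as long as the next word still starts with the group's first prefix. A simplified sketch of that matching step (hypothetical helper, not the commit's code):

// Sketch: pick the prefix group for `word`, reusing the current group while
// the word still starts with its first prefix.
fn matching_group<'a>(
    current: Option<&'a [String]>,
    groups: &'a [&'a [String]],
    word: &[u8],
) -> Option<&'a [String]> {
    match current {
        Some(prefixes) if word.starts_with(prefixes[0].as_bytes()) => Some(prefixes),
        _ => groups.iter().copied().find(|p| word.starts_with(p[0].as_bytes())),
    }
}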