Merge pull request #4207 from meilisearch/diff-indexing-prefix-databases

Diff indexing prefix databases
Many the fish 2023-11-14 16:04:05 +01:00 committed by GitHub
commit b0adc73ce6
20 changed files with 238 additions and 1563 deletions

View File

@@ -84,8 +84,6 @@ pub mod db_name {
     pub const EXTERNAL_DOCUMENTS_IDS: &str = "external-documents-ids";
     pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions";
     pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids";
-    pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids";
-    pub const PREFIX_WORD_PAIR_PROXIMITY_DOCIDS: &str = "prefix-word-pair-proximity-docids";
    pub const WORD_POSITION_DOCIDS: &str = "word-position-docids";
     pub const WORD_FIELD_ID_DOCIDS: &str = "word-field-id-docids";
     pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids";
@@ -130,10 +128,6 @@ pub struct Index {
     /// Maps the proximity between a pair of words with all the docids where this relation appears.
     pub word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
-    /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears.
-    pub word_prefix_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
-    /// Maps the proximity between a pair of prefix and word with all the docids where this relation appears.
-    pub prefix_word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
     /// Maps the word and the position with the docids that corresponds to it.
     pub word_position_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
@@ -187,7 +181,7 @@ impl Index {
     ) -> Result<Index> {
         use db_name::*;
-        options.max_dbs(26);
+        options.max_dbs(24);
         unsafe { options.flag(Flags::MdbAlwaysFreePages) };
         let env = options.open(path)?;
@@ -204,10 +198,6 @@ impl Index {
         env.create_database(&mut wtxn, Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
         let script_language_docids =
             env.create_database(&mut wtxn, Some(SCRIPT_LANGUAGE_DOCIDS))?;
-        let word_prefix_pair_proximity_docids =
-            env.create_database(&mut wtxn, Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?;
-        let prefix_word_pair_proximity_docids =
-            env.create_database(&mut wtxn, Some(PREFIX_WORD_PAIR_PROXIMITY_DOCIDS))?;
         let word_position_docids = env.create_database(&mut wtxn, Some(WORD_POSITION_DOCIDS))?;
         let word_fid_docids = env.create_database(&mut wtxn, Some(WORD_FIELD_ID_DOCIDS))?;
         let field_id_word_count_docids =
@@ -248,8 +238,6 @@ impl Index {
             exact_word_prefix_docids,
             word_pair_proximity_docids,
             script_language_docids,
-            word_prefix_pair_proximity_docids,
-            prefix_word_pair_proximity_docids,
             word_position_docids,
             word_fid_docids,
             word_prefix_position_docids,

View File

@@ -11,7 +11,9 @@ use super::interner::Interned;
 use super::Word;
 use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec};
 use crate::update::{merge_cbo_roaring_bitmaps, MergeFn};
-use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext};
+use crate::{
+    CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext, U8StrStrCodec,
+};

 /// A cache storing pointers to values in the LMDB databases.
 ///
@@ -23,7 +25,7 @@ pub struct DatabaseCache<'ctx> {
     pub word_pair_proximity_docids:
         FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>,
     pub word_prefix_pair_proximity_docids:
-        FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>,
+        FxHashMap<(u8, Interned<String>, Interned<String>), Option<RoaringBitmap>>,
     pub prefix_word_pair_proximity_docids:
         FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>,
     pub word_docids: FxHashMap<Interned<String>, Option<Cow<'ctx, [u8]>>>,
@@ -295,35 +297,47 @@ impl<'ctx> SearchContext<'ctx> {
         prefix2: Interned<String>,
         proximity: u8,
     ) -> Result<Option<RoaringBitmap>> {
-        DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
-            self.txn,
-            (proximity, word1, prefix2),
-            &(
-                proximity,
-                self.word_interner.get(word1).as_str(),
-                self.word_interner.get(prefix2).as_str(),
-            ),
-            &mut self.db_cache.word_prefix_pair_proximity_docids,
-            self.index.word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>(),
-        )
+        let docids = match self
+            .db_cache
+            .word_prefix_pair_proximity_docids
+            .entry((proximity, word1, prefix2))
+        {
+            Entry::Occupied(docids) => docids.get().clone(),
+            Entry::Vacant(entry) => {
+                // compute docids using prefix iter and store the result in the cache.
+                let key = U8StrStrCodec::bytes_encode(&(
+                    proximity,
+                    self.word_interner.get(word1).as_str(),
+                    self.word_interner.get(prefix2).as_str(),
+                ))
+                .unwrap()
+                .into_owned();
+                let mut prefix_docids = RoaringBitmap::new();
+                let remap_key_type = self
+                    .index
+                    .word_pair_proximity_docids
+                    .remap_key_type::<ByteSlice>()
+                    .prefix_iter(self.txn, &key)?;
+                for result in remap_key_type {
+                    let (_, docids) = result?;
+                    prefix_docids |= docids;
+                }
+                entry.insert(Some(prefix_docids.clone()));
+                Some(prefix_docids)
+            }
+        };
+        Ok(docids)
     }

     pub fn get_db_prefix_word_pair_proximity_docids(
         &mut self,
         left_prefix: Interned<String>,
         right: Interned<String>,
         proximity: u8,
     ) -> Result<Option<RoaringBitmap>> {
-        DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
-            self.txn,
-            (proximity, left_prefix, right),
-            &(
-                proximity,
-                self.word_interner.get(left_prefix).as_str(),
-                self.word_interner.get(right).as_str(),
-            ),
-            &mut self.db_cache.prefix_word_pair_proximity_docids,
-            self.index.prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
-        )
+        // only accept exact matches on reverted positions
+        self.get_db_word_pair_proximity_docids(left_prefix, right, proximity)
     }

     pub fn get_db_word_fid_docids(
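The rewrite above replaces a lookup in a dedicated prefix database with a range scan: the `(proximity, word1, prefix2)` triple is encoded as a key prefix, and every matching `word_pair_proximity_docids` entry is unioned on the fly, with the result memoized in the cache. Below is a minimal sketch of that range-union idea, using a `BTreeMap` as a stand-in for the LMDB database; `union_over_prefix` is a hypothetical name, not a milli API:

```rust
use std::collections::BTreeMap;
use std::ops::Bound;

use roaring::RoaringBitmap;

/// Union the bitmaps of every key that starts with `key_prefix`.
/// The sorted map plays the role of LMDB's `prefix_iter`: keys sharing
/// the encoded prefix are contiguous, so we scan until they stop matching.
fn union_over_prefix(
    db: &BTreeMap<Vec<u8>, RoaringBitmap>,
    key_prefix: &[u8],
) -> RoaringBitmap {
    let mut acc = RoaringBitmap::new();
    for (key, docids) in db.range::<[u8], _>((Bound::Included(key_prefix), Bound::Unbounded)) {
        if !key.starts_with(key_prefix) {
            break; // keys are sorted: once past the prefix range, we are done
        }
        acc |= docids;
    }
    acc
}
```

The trade-off is a slightly more expensive first lookup per `(proximity, word, prefix)` triple at search time, in exchange for dropping two whole databases and their maintenance cost at indexing time.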

View File

@@ -371,7 +371,7 @@ fn test_proximity_prefix_db() {
     s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
     s.query("best s");
     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
-    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 6, 7, 8, 11, 15]");
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 9, 6, 7, 8, 11, 12, 13, 15]");
     insta::assert_snapshot!(format!("{document_scores:#?}"));
     let texts = collect_field_values(&index, &txn, "text", &documents_ids);
@@ -379,13 +379,13 @@ fn test_proximity_prefix_db() {
     insta::assert_debug_snapshot!(texts, @r###"
     [
         "\"this is the best summer meal\"",
-        "\"summer best\"",
         "\"this is the best meal of summer\"",
-        "\"summer x best\"",
         "\"this is the best meal I have ever had in such a beautiful summer day\"",
         "\"this is the best cooked meal of the summer\"",
         "\"this is the best meal of the summer\"",
         "\"summer x y best\"",
+        "\"summer x best\"",
+        "\"summer best\"",
         "\"this is the best meal I have ever had in such a beautiful winter day\"",
     ]
     "###);
@@ -423,20 +423,20 @@ fn test_proximity_prefix_db() {
     s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
     s.query("best win");
     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
-    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 15, 16, 17, 20]");
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 18, 15, 16, 17, 20, 21, 22]");
     insta::assert_snapshot!(format!("{document_scores:#?}"));
     let texts = collect_field_values(&index, &txn, "text", &documents_ids);

     insta::assert_debug_snapshot!(texts, @r###"
     [
         "\"this is the best winter meal\"",
-        "\"winter best\"",
         "\"this is the best meal of winter\"",
-        "\"winter x best\"",
         "\"this is the best meal I have ever had in such a beautiful winter day\"",
         "\"this is the best cooked meal of the winter\"",
         "\"this is the best meal of the winter\"",
         "\"winter x y best\"",
+        "\"winter x best\"",
+        "\"winter best\"",
     ]
     "###);
@@ -471,20 +471,20 @@ fn test_proximity_prefix_db() {
     s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
     s.query("best wi");
     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
-    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 15, 16, 17, 20]");
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 18, 15, 16, 17, 20, 21, 22]");
     insta::assert_snapshot!(format!("{document_scores:#?}"));
     let texts = collect_field_values(&index, &txn, "text", &documents_ids);

     insta::assert_debug_snapshot!(texts, @r###"
     [
         "\"this is the best winter meal\"",
-        "\"winter best\"",
         "\"this is the best meal of winter\"",
-        "\"winter x best\"",
         "\"this is the best meal I have ever had in such a beautiful winter day\"",
         "\"this is the best cooked meal of the winter\"",
         "\"this is the best meal of the winter\"",
         "\"winter x y best\"",
+        "\"winter x best\"",
+        "\"winter best\"",
     ]
     "###);
 }

View File

@@ -11,14 +11,6 @@ expression: "format!(\"{document_scores:#?}\")"
         },
     ),
 ],
-[
-    Proximity(
-        Rank {
-            rank: 3,
-            max_rank: 4,
-        },
-    ),
-],
 [
     Proximity(
         Rank {
@@ -30,7 +22,15 @@ expression: "format!(\"{document_scores:#?}\")"
 [
     Proximity(
         Rank {
-            rank: 2,
+            rank: 1,
+            max_rank: 4,
+        },
+    ),
+],
+[
+    Proximity(
+        Rank {
+            rank: 1,
             max_rank: 4,
         },
     ),

View File

@@ -11,14 +11,6 @@ expression: "format!(\"{document_scores:#?}\")"
         },
     ),
 ],
-[
-    Proximity(
-        Rank {
-            rank: 3,
-            max_rank: 4,
-        },
-    ),
-],
 [
     Proximity(
         Rank {
@@ -30,7 +22,15 @@ expression: "format!(\"{document_scores:#?}\")"
 [
     Proximity(
         Rank {
-            rank: 2,
+            rank: 1,
+            max_rank: 4,
+        },
+    ),
+],
+[
+    Proximity(
+        Rank {
+            rank: 1,
             max_rank: 4,
         },
     ),

View File

@@ -11,14 +11,6 @@ expression: "format!(\"{document_scores:#?}\")"
         },
     ),
 ],
-[
-    Proximity(
-        Rank {
-            rank: 3,
-            max_rank: 4,
-        },
-    ),
-],
 [
     Proximity(
         Rank {
@@ -30,7 +22,15 @@ expression: "format!(\"{document_scores:#?}\")"
 [
     Proximity(
         Rank {
-            rank: 2,
+            rank: 1,
+            max_rank: 4,
+        },
+    ),
+],
+[
+    Proximity(
+        Rank {
+            rank: 1,
             max_rank: 4,
         },
     ),

View File

@@ -219,22 +219,6 @@ pub fn snap_word_pair_proximity_docids(index: &Index) -> String {
         &format!("{proximity:<2} {word1:<16} {word2:<16} {}", display_bitmap(&b))
     })
 }
-pub fn snap_word_prefix_pair_proximity_docids(index: &Index) -> String {
-    make_db_snap_from_iter!(index, word_prefix_pair_proximity_docids, |(
-        (proximity, word1, prefix),
-        b,
-    )| {
-        &format!("{proximity:<2} {word1:<16} {prefix:<4} {}", display_bitmap(&b))
-    })
-}
-pub fn snap_prefix_word_pair_proximity_docids(index: &Index) -> String {
-    make_db_snap_from_iter!(index, prefix_word_pair_proximity_docids, |(
-        (proximity, prefix, word2),
-        b,
-    )| {
-        &format!("{proximity:<2} {prefix:<4} {word2:<16} {}", display_bitmap(&b))
-    })
-}
 pub fn snap_word_position_docids(index: &Index) -> String {
     make_db_snap_from_iter!(index, word_position_docids, |((word, position), b)| {
         &format!("{word:<16} {position:<6} {}", display_bitmap(&b))

View File

@@ -26,8 +26,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
             word_prefix_docids,
             exact_word_prefix_docids,
             word_pair_proximity_docids,
-            word_prefix_pair_proximity_docids,
-            prefix_word_pair_proximity_docids,
             word_position_docids,
             word_fid_docids,
             field_id_word_count_docids,
@@ -68,8 +66,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
         word_prefix_docids.clear(self.wtxn)?;
         exact_word_prefix_docids.clear(self.wtxn)?;
         word_pair_proximity_docids.clear(self.wtxn)?;
-        word_prefix_pair_proximity_docids.clear(self.wtxn)?;
-        prefix_word_pair_proximity_docids.clear(self.wtxn)?;
         word_position_docids.clear(self.wtxn)?;
         word_fid_docids.clear(self.wtxn)?;
         field_id_word_count_docids.clear(self.wtxn)?;
@@ -132,7 +128,6 @@ mod tests {
         assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap());
         assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap());
         assert!(index.field_id_word_count_docids.is_empty(&rtxn).unwrap());
-        assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap());
         assert!(index.facet_id_f64_docids.is_empty(&rtxn).unwrap());
         assert!(index.facet_id_string_docids.is_empty(&rtxn).unwrap());
         assert!(index.field_id_docid_facet_f64s.is_empty(&rtxn).unwrap());

View File

@@ -102,3 +102,17 @@ pub fn del_add_from_two_obkvs<K: obkv::Key + PartialOrd + Ord>(
 pub fn is_noop_del_add_obkv(del_add: KvReaderDelAdd) -> bool {
     del_add.get(DelAdd::Deletion) == del_add.get(DelAdd::Addition)
 }
+
+/// A function that extracts and returns the Add side of a DelAdd obkv.
+/// This is useful when there is no previous value in the database, and
+/// therefore we don't need to do a diff with what's already there.
+///
+/// If there is no Add side we currently write an empty buffer,
+/// which is a valid CboRoaringBitmap.
+#[allow(clippy::ptr_arg)] // required to avoid signature mismatch
+pub fn deladd_serialize_add_side<'a>(
+    obkv: &'a [u8],
+    _buffer: &mut Vec<u8>,
+) -> crate::Result<&'a [u8]> {
+    Ok(KvReaderDelAdd::new(obkv).get(DelAdd::Addition).unwrap_or_default())
+}
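To make the DelAdd vocabulary concrete: each value written by the diff-indexing extractors is a tiny two-entry obkv holding the old value (Deletion side) and the new one (Addition side) for a key. A toy model of those semantics, with the obkv byte layout abstracted away (the types and names here are illustrative, not milli's):

```rust
/// A toy model of a DelAdd entry: the value to remove (deletion side)
/// and the value to insert (addition side) for one key.
struct DelAddEntry<'a> {
    del: Option<&'a [u8]>,
    add: Option<&'a [u8]>,
}

/// On an empty index there is nothing to diff against, so only the Add
/// side matters. An absent Add side degenerates to the empty slice,
/// which decodes as an empty CboRoaringBitmap.
fn add_side<'a>(entry: &DelAddEntry<'a>) -> &'a [u8] {
    entry.add.unwrap_or_default()
}

fn main() {
    let entry = DelAddEntry { del: Some(b"old"), add: Some(b"new") };
    assert_eq!(add_side(&entry), b"new");

    let deletion_only = DelAddEntry { del: Some(b"old"), add: None };
    assert!(add_side(&deletion_only).is_empty());
}
```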

View File

@@ -1,14 +1,12 @@
 use std::borrow::Cow;
 use std::fs::File;
 use std::io::{self, BufReader, BufWriter, Seek};
-use std::time::Instant;

 use grenad::{CompressionType, Sorter};
 use heed::types::ByteSlice;
-use log::debug;

 use super::{ClonableMmap, MergeFn};
-use crate::error::InternalError;
+use crate::update::index_documents::valid_lmdb_key;
 use crate::Result;

 pub type CursorClonableMmap = io::Cursor<ClonableMmap>;
@@ -240,45 +238,46 @@ pub fn grenad_obkv_into_chunks<R: io::Read + io::Seek>(
     Ok(std::iter::from_fn(move || transposer().transpose()))
 }

-pub fn sorter_into_lmdb_database(
-    wtxn: &mut heed::RwTxn,
-    database: heed::PolyDatabase,
+/// Write provided sorter in database using serialize_value function.
+/// merge_values function is used if an entry already exists in the database.
+pub fn write_sorter_into_database<K, V, FS, FM>(
     sorter: Sorter<MergeFn>,
-    merge: MergeFn,
-) -> Result<()> {
+    database: &heed::Database<K, V>,
+    wtxn: &mut heed::RwTxn,
+    index_is_empty: bool,
+    serialize_value: FS,
+    merge_values: FM,
+) -> Result<()>
+where
+    FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
+    FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>,
+{
     puffin::profile_function!();
-    debug!("Writing MTBL sorter...");
-    let before = Instant::now();

+    let mut buffer = Vec::new();
+    let database = database.remap_types::<ByteSlice, ByteSlice>();
+
     let mut merger_iter = sorter.into_stream_merger_iter()?;
-    if database.is_empty(wtxn)? {
-        let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;
-        while let Some((k, v)) = merger_iter.next()? {
-            // safety: we don't keep references from inside the LMDB database.
-            unsafe { out_iter.append(k, v)? };
-        }
-    } else {
-        while let Some((k, v)) = merger_iter.next()? {
-            let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?;
-            match iter.next().transpose()? {
-                Some((key, old_val)) if key == k => {
-                    let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)];
-                    let val = merge(k, &vals).map_err(|_| {
-                        // TODO just wrap this error?
-                        InternalError::IndexingMergingKeys { process: "get-put-merge" }
-                    })?;
-                    // safety: we don't keep references from inside the LMDB database.
-                    unsafe { iter.put_current(k, &val)? };
-                }
-                _ => {
-                    drop(iter);
-                    database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
+    while let Some((key, value)) = merger_iter.next()? {
+        if valid_lmdb_key(key) {
+            buffer.clear();
+            let value = if index_is_empty {
+                Some(serialize_value(value, &mut buffer)?)
+            } else {
+                match database.get(wtxn, key)? {
+                    Some(prev_value) => merge_values(value, prev_value, &mut buffer)?,
+                    None => Some(serialize_value(value, &mut buffer)?),
+                }
+            };
+            match value {
+                Some(value) => database.put(wtxn, key, value)?,
+                None => {
+                    database.delete(wtxn, key)?;
                 }
             }
         }
     }
-    debug!("MTBL sorter writen in {:.02?}!", before.elapsed());

     Ok(())
 }
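The new `write_sorter_into_database` is a classic get-merge-put loop. A condensed model of just that control flow, with a `BTreeMap` in place of the LMDB database and plain closures in place of milli's serialize/merge functions (illustrative only, assuming owned buffers instead of milli's reusable one):

```rust
use std::collections::BTreeMap;

/// One iteration of the write loop: on a fresh index, serialize the Add
/// side directly; otherwise merge with the previous value. A `None` merge
/// result means the entry became empty and must be deleted.
fn apply_entry(
    store: &mut BTreeMap<Vec<u8>, Vec<u8>>,
    key: &[u8],
    deladd_value: &[u8],
    index_is_empty: bool,
    serialize_add: impl Fn(&[u8]) -> Vec<u8>,
    merge: impl Fn(&[u8], &[u8]) -> Option<Vec<u8>>,
) {
    let new_value = if index_is_empty {
        Some(serialize_add(deladd_value))
    } else {
        match store.get(key) {
            Some(prev) => merge(deladd_value, prev),
            None => Some(serialize_add(deladd_value)),
        }
    };
    match new_value {
        Some(value) => {
            store.insert(key.to_vec(), value);
        }
        None => {
            store.remove(key);
        }
    }
}
```

Compared with the deleted `sorter_into_lmdb_database`, the fast append path for an empty database is replaced by the `index_is_empty` flag, and deletions become possible because a merge can now report an empty result.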

View File

@@ -239,3 +239,19 @@ pub fn merge_deladd_cbo_roaring_bitmaps<'a>(
         output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into)
     }
 }
+
+/// A function that merges a DelAdd of bitmaps into an already existing bitmap.
+///
+/// The first argument is the DelAdd obkv of CboRoaringBitmaps and
+/// the second one is the CboRoaringBitmap to merge into.
+pub fn merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap<'a>(
+    deladd_obkv: &[u8],
+    previous: &[u8],
+    buffer: &'a mut Vec<u8>,
+) -> Result<Option<&'a [u8]>> {
+    Ok(CboRoaringBitmapCodec::merge_deladd_into(
+        KvReaderDelAdd::new(deladd_obkv),
+        previous,
+        buffer,
+    )?)
+}
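In bitmap terms, this merge computes "previous minus deletions, plus additions". A sketch of that semantics on plain `RoaringBitmap`s, leaving out the Cbo byte encoding (`merge_deladd` is a hypothetical helper, not the milli signature):

```rust
use roaring::RoaringBitmap;

/// previous − del + add; `None` signals an empty result, letting the
/// caller delete the database entry instead of storing an empty bitmap.
fn merge_deladd(
    previous: &RoaringBitmap,
    del: &RoaringBitmap,
    add: &RoaringBitmap,
) -> Option<RoaringBitmap> {
    let result = (previous - del) | add;
    if result.is_empty() {
        None
    } else {
        Some(result)
    }
}

fn main() {
    let previous: RoaringBitmap = (1u32..=5).collect();
    let del: RoaringBitmap = [2u32, 3].into_iter().collect();
    let add: RoaringBitmap = [9u32].into_iter().collect();
    let merged = merge_deladd(&previous, &del, &add).unwrap();
    assert_eq!(merged.iter().collect::<Vec<_>>(), vec![1, 4, 5, 9]);
}
```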

View File

@@ -9,12 +9,13 @@ pub use clonable_mmap::{ClonableMmap, CursorClonableMmap};
 use fst::{IntoStreamer, Streamer};
 pub use grenad_helpers::{
     as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks,
-    merge_ignore_values, sorter_into_lmdb_database, sorter_into_reader, writer_into_reader,
+    merge_ignore_values, sorter_into_reader, write_sorter_into_database, writer_into_reader,
     GrenadParameters, MergeableReader,
 };
 pub use merge_functions::{
     concat_u32s_array, keep_first, keep_latest_obkv, merge_btreeset_string,
-    merge_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps, merge_roaring_bitmaps,
+    merge_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps,
+    merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_roaring_bitmaps,
     obkvs_keep_last_addition_merge_deletions, obkvs_merge_additions_and_deletions,
     serialize_roaring_bitmap, MergeFn,
 };

View File

@@ -23,8 +23,10 @@ use self::enrich::enrich_documents_batch;
 pub use self::enrich::{extract_finite_float_from_value, validate_geo_from_json, DocumentId};
 pub use self::helpers::{
     as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset,
-    fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
-    sorter_into_lmdb_database, valid_lmdb_key, writer_into_reader, ClonableMmap, MergeFn,
+    fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps,
+    merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
+    merge_roaring_bitmaps, valid_lmdb_key, write_sorter_into_database, writer_into_reader,
+    ClonableMmap, MergeFn,
 };
 use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
 pub use self::transform::{Transform, TransformOutput};
@@ -32,13 +34,12 @@ use crate::documents::{obkv_to_object, DocumentsBatchReader};
 use crate::error::{Error, InternalError, UserError};
 pub use crate::update::index_documents::helpers::CursorClonableMmap;
 use crate::update::{
-    IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, WordPrefixDocids,
-    WordPrefixIntegerDocids, WordsPrefixesFst,
+    IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
 };
 use crate::{CboRoaringBitmapCodec, Index, Result};

 static MERGED_DATABASE_COUNT: usize = 7;
-static PREFIX_DATABASE_COUNT: usize = 5;
+static PREFIX_DATABASE_COUNT: usize = 4;
 static TOTAL_POSTING_DATABASE_COUNT: usize = MERGED_DATABASE_COUNT + PREFIX_DATABASE_COUNT;

 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -411,12 +412,42 @@ where
             total_databases: TOTAL_POSTING_DATABASE_COUNT,
         });

+        let mut word_position_docids = None;
+        let mut word_fid_docids = None;
+        let mut word_docids = None;
+        let mut exact_word_docids = None;
+
         for result in lmdb_writer_rx {
             if (self.should_abort)() {
                 return Err(Error::InternalError(InternalError::AbortedIndexation));
             }

-            let typed_chunk = result?;
+            let typed_chunk = match result? {
+                TypedChunk::WordDocids {
+                    word_docids_reader,
+                    exact_word_docids_reader,
+                    word_fid_docids_reader,
+                } => {
+                    let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? };
+                    word_docids = Some(cloneable_chunk);
+                    let cloneable_chunk =
+                        unsafe { as_cloneable_grenad(&exact_word_docids_reader)? };
+                    exact_word_docids = Some(cloneable_chunk);
+                    let cloneable_chunk = unsafe { as_cloneable_grenad(&word_fid_docids_reader)? };
+                    word_fid_docids = Some(cloneable_chunk);
+                    TypedChunk::WordDocids {
+                        word_docids_reader,
+                        exact_word_docids_reader,
+                        word_fid_docids_reader,
+                    }
+                }
+                TypedChunk::WordPositionDocids(chunk) => {
+                    let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
+                    word_position_docids = Some(cloneable_chunk);
+                    TypedChunk::WordPositionDocids(chunk)
+                }
+                otherwise => otherwise,
+            };

             // FIXME: return newly added as well as newly deleted documents
             let (docids, is_merged_database) =
@@ -447,17 +478,16 @@ where
         // We write the primary key field id into the main database
         self.index.put_primary_key(self.wtxn, &primary_key)?;
+        let number_of_documents = self.index.number_of_documents(self.wtxn)?;

-        // TODO: reactivate prefix DB with diff-indexing
-        // self.execute_prefix_databases(
-        //     word_docids,
-        //     exact_word_docids,
-        //     word_pair_proximity_docids,
-        //     word_position_docids,
-        //     word_fid_docids,
-        // )?;
+        self.execute_prefix_databases(
+            word_docids,
+            exact_word_docids,
+            word_position_docids,
+            word_fid_docids,
+        )?;

-        self.index.number_of_documents(self.wtxn)
+        Ok(number_of_documents)
     }

     #[logging_timer::time("IndexDocuments::{}")]
@@ -465,7 +495,6 @@ where
         self,
         word_docids: Option<grenad::Reader<CursorClonableMmap>>,
         exact_word_docids: Option<grenad::Reader<CursorClonableMmap>>,
-        word_pair_proximity_docids: Option<grenad::Reader<CursorClonableMmap>>,
         word_position_docids: Option<grenad::Reader<CursorClonableMmap>>,
         word_fid_docids: Option<grenad::Reader<CursorClonableMmap>>,
     ) -> Result<()>
@@ -586,32 +615,6 @@ where
             total_databases: TOTAL_POSTING_DATABASE_COUNT,
         });

-        if let Some(word_pair_proximity_docids) = word_pair_proximity_docids {
-            // Run the word prefix pair proximity docids update operation.
-            PrefixWordPairsProximityDocids::new(
-                self.wtxn,
-                self.index,
-                self.indexer_config.chunk_compression_type,
-                self.indexer_config.chunk_compression_level,
-            )
-            .execute(
-                word_pair_proximity_docids,
-                &new_prefix_fst_words,
-                &common_prefix_fst_words,
-                &del_prefix_fst_words,
-            )?;
-        }
-
-        if (self.should_abort)() {
-            return Err(Error::InternalError(InternalError::AbortedIndexation));
-        }
-
-        databases_seen += 1;
-        (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
-            databases_seen,
-            total_databases: TOTAL_POSTING_DATABASE_COUNT,
-        });
-
         if let Some(word_position_docids) = word_position_docids {
             // Run the words prefix position docids update operation.
             let mut builder = WordPrefixIntegerDocids::new(
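The match on `result?` added above implements a capture-and-forward pattern: the loop peeks at each chunk coming over the channel, keeps a cheap cloneable copy of the readers that `execute_prefix_databases` will need after the loop, and sends the chunk on to be written unchanged. Reduced to its shape (a hypothetical enum, not milli's `TypedChunk`):

```rust
/// A stand-in for the typed chunks flowing out of the extractors.
enum Chunk {
    WordDocids(Vec<u8>),
    Other(Vec<u8>),
}

/// Forward every chunk to the writer while keeping a copy of the
/// word-docids payload for the prefix-database step that runs afterwards.
fn drain(chunks: impl Iterator<Item = Chunk>, write: impl Fn(&Chunk)) -> Option<Vec<u8>> {
    let mut word_docids = None;
    for chunk in chunks {
        let chunk = match chunk {
            Chunk::WordDocids(data) => {
                word_docids = Some(data.clone()); // kept for the later step
                Chunk::WordDocids(data)
            }
            other => other,
        };
        write(&chunk);
    }
    word_docids
}
```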

View File

@@ -13,7 +13,10 @@ use obkv::{KvReader, KvWriter};
 use ordered_float::OrderedFloat;
 use roaring::RoaringBitmap;

-use super::helpers::{self, merge_ignore_values, valid_lmdb_key, CursorClonableMmap};
+use super::helpers::{
+    self, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_ignore_values,
+    valid_lmdb_key, CursorClonableMmap,
+};
 use super::{ClonableMmap, MergeFn};
 use crate::distance::NDotProductPoint;
 use crate::error::UserError;
@@ -21,12 +24,11 @@ use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind};
 use crate::facet::FacetType;
 use crate::index::db_name::DOCUMENTS;
 use crate::index::Hnsw;
-use crate::update::del_add::{DelAdd, KvReaderDelAdd};
+use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd};
 use crate::update::facet::FacetsUpdate;
 use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at};
 use crate::{
-    lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, Result,
-    SerializationError, BEU32,
+    lat_lng_to_xyz, DocumentId, FieldId, GeoPoint, Index, Result, SerializationError, BEU32,
 };

 pub(crate) enum TypedChunk {
@@ -186,7 +188,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 deladd_serialize_add_side,
-                merge_deladd_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
             )?;
             is_merged_database = true;
         }
@@ -202,7 +204,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 deladd_serialize_add_side,
-                merge_deladd_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
             )?;

             let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
@@ -212,7 +214,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 deladd_serialize_add_side,
-                merge_deladd_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
             )?;

             let word_fid_docids_iter = unsafe { as_cloneable_grenad(&word_fid_docids_reader) }?;
@@ -222,7 +224,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 deladd_serialize_add_side,
-                merge_deladd_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
             )?;

             // create fst from word docids
@@ -244,7 +246,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 deladd_serialize_add_side,
-                merge_deladd_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
             )?;
             is_merged_database = true;
         }
@@ -265,7 +267,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 deladd_serialize_add_side,
-                merge_deladd_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
             )?;
             is_merged_database = true;
         }
@@ -276,7 +278,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 deladd_serialize_add_side,
-                merge_deladd_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
             )?;
             is_merged_database = true;
         }
@@ -287,7 +289,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 deladd_serialize_add_side,
-                merge_deladd_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
             )?;
             is_merged_database = true;
         }
@@ -298,7 +300,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 deladd_serialize_add_side,
-                merge_deladd_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
             )?;
             is_merged_database = true;
         }
@@ -495,33 +497,6 @@ fn merge_word_docids_reader_into_fst(
     Ok(builder.into_set())
 }

-/// A function that extracts and returns the Add side of a DelAdd obkv.
-/// This is useful when there is no previous value in the database, and
-/// therefore we don't need to do a diff with what's already there.
-///
-/// If there is no Add side we currently write an empty buffer
-/// which is a valid CboRoaringBitmap.
-#[allow(clippy::ptr_arg)] // required to avoid signature mismatch
-fn deladd_serialize_add_side<'a>(obkv: &'a [u8], _buffer: &mut Vec<u8>) -> Result<&'a [u8]> {
-    Ok(KvReaderDelAdd::new(obkv).get(DelAdd::Addition).unwrap_or_default())
-}
-
-/// A function that merges a DelAdd of bitmaps into an already existing bitmap.
-///
-/// The first argument is the DelAdd obkv of CboRoaringBitmaps and
-/// the second one is the CboRoaringBitmap to merge into.
-fn merge_deladd_cbo_roaring_bitmaps<'a>(
-    deladd_obkv: &[u8],
-    previous: &[u8],
-    buffer: &'a mut Vec<u8>,
-) -> Result<Option<&'a [u8]>> {
-    Ok(CboRoaringBitmapCodec::merge_deladd_into(
-        KvReaderDelAdd::new(deladd_obkv),
-        previous,
-        buffer,
-    )?)
-}
-
 /// Write provided entries in database using serialize_value function.
 /// merge_values function is used if an entry already exists in the database.
 fn write_entries_into_database<R, K, V, FS, FM>(

View File

@@ -8,10 +8,6 @@ pub use self::index_documents::{
     MergeFn,
 };
 pub use self::indexer_config::IndexerConfig;
-pub use self::prefix_word_pairs::{
-    PrefixWordPairsProximityDocids, MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB,
-    MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB,
-};
 pub use self::settings::{Setting, Settings};
 pub use self::update_step::UpdateIndexingStep;
 pub use self::word_prefix_docids::WordPrefixDocids;
@@ -24,7 +20,6 @@ pub(crate) mod del_add;
 pub(crate) mod facet;
 mod index_documents;
 mod indexer_config;
-mod prefix_word_pairs;
 mod settings;
 mod update_step;
 mod word_prefix_docids;

View File

@@ -1,422 +0,0 @@
use std::borrow::Cow;
use std::collections::HashSet;
use std::io::{BufReader, BufWriter};
use grenad::CompressionType;
use heed::types::ByteSlice;
use super::index_documents::{merge_cbo_roaring_bitmaps, CursorClonableMmap};
use crate::{Index, Result};
mod prefix_word;
mod word_prefix;
pub use prefix_word::index_prefix_word_database;
pub use word_prefix::index_word_prefix_database;
pub const MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB: u8 = 4;
pub const MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB: usize = 2;
pub struct PrefixWordPairsProximityDocids<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
max_proximity: u8,
max_prefix_length: usize,
chunk_compression_type: CompressionType,
chunk_compression_level: Option<u32>,
}
impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> {
pub fn new(
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
chunk_compression_type: CompressionType,
chunk_compression_level: Option<u32>,
) -> Self {
Self {
wtxn,
index,
max_proximity: MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB,
max_prefix_length: MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB,
chunk_compression_type,
chunk_compression_level,
}
}
#[logging_timer::time("WordPrefixPairProximityDocids::{}")]
pub fn execute<'a>(
self,
new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>,
new_prefix_fst_words: &'a [String],
common_prefix_fst_words: &[&'a [String]],
del_prefix_fst_words: &HashSet<Vec<u8>>,
) -> Result<()> {
puffin::profile_function!();
index_word_prefix_database(
self.wtxn,
self.index.word_pair_proximity_docids,
self.index.word_prefix_pair_proximity_docids,
self.max_proximity,
self.max_prefix_length,
new_word_pair_proximity_docids.clone(),
new_prefix_fst_words,
common_prefix_fst_words,
del_prefix_fst_words,
self.chunk_compression_type,
self.chunk_compression_level,
)?;
index_prefix_word_database(
self.wtxn,
self.index.word_pair_proximity_docids,
self.index.prefix_word_pair_proximity_docids,
self.max_proximity,
self.max_prefix_length,
new_word_pair_proximity_docids,
new_prefix_fst_words,
common_prefix_fst_words,
del_prefix_fst_words,
self.chunk_compression_type,
self.chunk_compression_level,
)?;
Ok(())
}
}
// This is adapted from `sorter_into_lmdb_database`
pub fn insert_into_database(
wtxn: &mut heed::RwTxn,
database: heed::PolyDatabase,
new_key: &[u8],
new_value: &[u8],
) -> Result<()> {
let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, new_key)?;
match iter.next().transpose()? {
Some((key, old_val)) if new_key == key => {
let val =
merge_cbo_roaring_bitmaps(key, &[Cow::Borrowed(old_val), Cow::Borrowed(new_value)])
.map_err(|_| {
// TODO just wrap this error?
crate::error::InternalError::IndexingMergingKeys {
process: "get-put-merge",
}
})?;
// safety: we use the new_key, not the one from the database iterator, to avoid undefined behaviour
unsafe { iter.put_current(new_key, &val)? };
}
_ => {
drop(iter);
database.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, new_value)?;
}
}
Ok(())
}
// This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`,
// but it uses `append` if the database is empty, and it assumes that the values in the
// writer don't conflict with values in the database.
pub fn write_into_lmdb_database_without_merging(
wtxn: &mut heed::RwTxn,
database: heed::PolyDatabase,
writer: grenad::Writer<BufWriter<std::fs::File>>,
) -> Result<()> {
let file = writer.into_inner()?.into_inner().map_err(|err| err.into_error())?;
let reader = grenad::Reader::new(BufReader::new(file))?;
if database.is_empty(wtxn)? {
let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;
let mut cursor = reader.into_cursor()?;
while let Some((k, v)) = cursor.move_on_next()? {
// safety: the key comes from the grenad reader, not the database
unsafe { out_iter.append(k, v)? };
}
} else {
let mut cursor = reader.into_cursor()?;
while let Some((k, v)) = cursor.move_on_next()? {
database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
}
}
Ok(())
}
#[cfg(test)]
mod tests {
use std::io::Cursor;
use crate::db_snap;
use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
use crate::index::tests::TempIndex;
use crate::update::IndexDocumentsMethod;
fn documents_with_enough_different_words_for_prefixes(
prefixes: &[&str],
start_id: usize,
) -> Vec<crate::Object> {
let mut documents = Vec::new();
let mut id = start_id;
for prefix in prefixes {
for i in 0..50 {
documents.push(
serde_json::json!({
"id": id,
"text": format!("{prefix}{i:x}"),
})
.as_object()
.unwrap()
.clone(),
);
id += 1;
}
}
documents
}
#[ignore]
#[test]
fn add_new_documents() {
let mut index = TempIndex::new();
index.index_documents_config.words_prefix_threshold = Some(50);
index.index_documents_config.autogenerate_docids = true;
index
.update_settings(|settings| {
settings.set_searchable_fields(vec!["text".to_owned()]);
})
.unwrap();
let batch_reader_from_documents = |documents| {
let mut builder = DocumentsBatchBuilder::new(Vec::new());
for object in documents {
builder.append_json_object(&object).unwrap();
}
DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
};
let mut documents = documents_with_enough_different_words_for_prefixes(&["a", "be"], 0);
// now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
documents.push(
serde_json::json!({
"id": "9000",
"text": "At an amazing and beautiful house"
})
.as_object()
.unwrap()
.clone(),
);
documents.push(
serde_json::json!({
"id": "9001",
"text": "The bell rings at 5 am"
})
.as_object()
.unwrap()
.clone(),
);
let documents = batch_reader_from_documents(documents);
index.add_documents(documents).unwrap();
db_snap!(index, word_prefix_pair_proximity_docids, "initial");
db_snap!(index, prefix_word_pair_proximity_docids, "initial");
let mut documents = documents_with_enough_different_words_for_prefixes(&["am", "an"], 100);
documents.push(
serde_json::json!({
"id": "9002",
"text": "At an extraordinary house"
})
.as_object()
.unwrap()
.clone(),
);
let documents = batch_reader_from_documents(documents);
index.add_documents(documents).unwrap();
db_snap!(index, word_pair_proximity_docids, "update");
db_snap!(index, word_prefix_pair_proximity_docids, "update");
db_snap!(index, prefix_word_pair_proximity_docids, "update");
}
#[ignore]
#[test]
fn batch_bug_3043() {
// https://github.com/meilisearch/meilisearch/issues/3043
let mut index = TempIndex::new();
index.index_documents_config.words_prefix_threshold = Some(50);
index.index_documents_config.autogenerate_docids = true;
index
.update_settings(|settings| {
settings.set_searchable_fields(vec!["text".to_owned()]);
})
.unwrap();
let batch_reader_from_documents = |documents| {
let mut builder = DocumentsBatchBuilder::new(Vec::new());
for object in documents {
builder.append_json_object(&object).unwrap();
}
DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
};
let mut documents = documents_with_enough_different_words_for_prefixes(&["y"], 0);
// now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
documents.push(
serde_json::json!({
"text": "x y"
})
.as_object()
.unwrap()
.clone(),
);
documents.push(
serde_json::json!({
"text": "x a y"
})
.as_object()
.unwrap()
.clone(),
);
let documents = batch_reader_from_documents(documents);
index.add_documents(documents).unwrap();
db_snap!(index, word_pair_proximity_docids);
db_snap!(index, word_prefix_pair_proximity_docids);
db_snap!(index, prefix_word_pair_proximity_docids);
}
#[ignore]
#[test]
fn hard_delete_and_reupdate() {
let mut index = TempIndex::new();
index.index_documents_config.words_prefix_threshold = Some(50);
index
.update_settings(|settings| {
settings.set_primary_key("id".to_owned());
settings.set_searchable_fields(vec!["text".to_owned()]);
})
.unwrap();
let batch_reader_from_documents = |documents| {
let mut builder = DocumentsBatchBuilder::new(Vec::new());
for object in documents {
builder.append_json_object(&object).unwrap();
}
DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
};
let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0);
// now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
documents.push(
serde_json::json!({
"id": 9000,
"text": "At an amazing and beautiful house"
})
.as_object()
.unwrap()
.clone(),
);
documents.push(
serde_json::json!({
"id": 9001,
"text": "The bell rings at 5 am"
})
.as_object()
.unwrap()
.clone(),
);
let documents = batch_reader_from_documents(documents);
index.add_documents(documents).unwrap();
db_snap!(index, documents_ids, "initial");
db_snap!(index, word_docids, "initial");
db_snap!(index, word_prefix_pair_proximity_docids, "initial");
db_snap!(index, prefix_word_pair_proximity_docids, "initial");
index.delete_document("9000");
db_snap!(index, documents_ids, "first_delete");
db_snap!(index, word_docids, "first_delete");
db_snap!(index, word_prefix_pair_proximity_docids, "first_delete");
db_snap!(index, prefix_word_pair_proximity_docids, "first_delete");
index.delete_documents((0..50).map(|id| id.to_string()).collect());
db_snap!(index, documents_ids, "second_delete");
db_snap!(index, word_docids, "second_delete");
db_snap!(index, word_prefix_pair_proximity_docids, "second_delete");
db_snap!(index, prefix_word_pair_proximity_docids, "second_delete");
let documents = documents_with_enough_different_words_for_prefixes(&["b"], 1000);
// now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
index.add_documents(batch_reader_from_documents(documents)).unwrap();
db_snap!(index, documents_ids, "reupdate");
db_snap!(index, word_docids, "reupdate");
db_snap!(index, word_prefix_pair_proximity_docids, "reupdate");
db_snap!(index, prefix_word_pair_proximity_docids, "reupdate");
}
#[ignore]
#[test]
fn replace_hard_deletion() {
let mut index = TempIndex::new();
index.index_documents_config.words_prefix_threshold = Some(50);
index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments;
index
.update_settings(|settings| {
settings.set_primary_key("id".to_owned());
settings.set_searchable_fields(vec!["text".to_owned()]);
})
.unwrap();
let batch_reader_from_documents = |documents| {
let mut builder = DocumentsBatchBuilder::new(Vec::new());
for object in documents {
builder.append_json_object(&object).unwrap();
}
DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
};
let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0);
// now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
documents.push(
serde_json::json!({
"id": 9000,
"text": "At an amazing house"
})
.as_object()
.unwrap()
.clone(),
);
documents.push(
serde_json::json!({
"id": 9001,
"text": "The bell rings"
})
.as_object()
.unwrap()
.clone(),
);
let documents = batch_reader_from_documents(documents);
index.add_documents(documents).unwrap();
db_snap!(index, documents_ids, "initial");
db_snap!(index, word_docids, "initial");
db_snap!(index, word_prefix_pair_proximity_docids, "initial");
db_snap!(index, prefix_word_pair_proximity_docids, "initial");
let documents = documents_with_enough_different_words_for_prefixes(&["b"], 0);
index.add_documents(batch_reader_from_documents(documents)).unwrap();
db_snap!(index, documents_ids, "replaced");
db_snap!(index, word_docids, "replaced");
db_snap!(index, word_prefix_pair_proximity_docids, "replaced");
db_snap!(index, prefix_word_pair_proximity_docids, "replaced");
}
}

View File

@@ -1,182 +0,0 @@
use std::borrow::Cow;
use std::collections::{BTreeMap, HashSet};
use grenad::CompressionType;
use heed::types::ByteSlice;
use heed::BytesDecode;
use log::debug;
use crate::update::index_documents::{create_writer, CursorClonableMmap};
use crate::update::prefix_word_pairs::{
insert_into_database, write_into_lmdb_database_without_merging,
};
use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec};
#[allow(clippy::too_many_arguments)]
#[logging_timer::time]
pub fn index_prefix_word_database(
wtxn: &mut heed::RwTxn,
word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
prefix_word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
max_proximity: u8,
max_prefix_length: usize,
new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>,
new_prefix_fst_words: &[String],
common_prefix_fst_words: &[&[String]],
del_prefix_fst_words: &HashSet<Vec<u8>>,
chunk_compression_type: CompressionType,
chunk_compression_level: Option<u32>,
) -> Result<()> {
puffin::profile_function!();
let max_proximity = max_proximity - 1;
debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
let common_prefixes: Vec<_> = common_prefix_fst_words
.iter()
.flat_map(|s| s.iter())
.map(|s| s.as_str())
.filter(|s| s.len() <= max_prefix_length)
.collect();
for proximity in 1..max_proximity {
for prefix in common_prefixes.iter() {
let mut prefix_key = vec![proximity];
prefix_key.extend_from_slice(prefix.as_bytes());
let mut cursor = new_word_pair_proximity_docids.clone().into_prefix_iter(prefix_key)?;
// This is the core of the algorithm
execute_on_word_pairs_and_prefixes(
proximity,
prefix.as_bytes(),
// the next two arguments tell how to iterate over the new word pairs
&mut cursor,
|cursor| {
if let Some((key, value)) = cursor.next()? {
let (_, _, word2) = UncheckedU8StrStrCodec::bytes_decode(key)
.ok_or(heed::Error::Decoding)?;
Ok(Some((word2, value)))
} else {
Ok(None)
}
},
// and this argument tells what to do with each new key (proximity, prefix, word2) and value (roaring bitmap)
|key, value| {
insert_into_database(
wtxn,
*prefix_word_pair_proximity_docids.as_polymorph(),
key,
value,
)
},
)?;
}
}
// Now we do the same thing with the new prefixes and all word pairs in the DB
let new_prefixes: Vec<_> = new_prefix_fst_words
.iter()
.map(|s| s.as_str())
.filter(|s| s.len() <= max_prefix_length)
.collect();
// Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity)
// element in an intermediary grenad
let mut writer =
create_writer(chunk_compression_type, chunk_compression_level, tempfile::tempfile()?);
for proximity in 1..max_proximity {
for prefix in new_prefixes.iter() {
let mut prefix_key = vec![proximity];
prefix_key.extend_from_slice(prefix.as_bytes());
let mut db_iter = word_pair_proximity_docids
.as_polymorph()
.prefix_iter::<_, ByteSlice, ByteSlice>(wtxn, prefix_key.as_slice())?
.remap_key_type::<UncheckedU8StrStrCodec>();
execute_on_word_pairs_and_prefixes(
proximity,
prefix.as_bytes(),
&mut db_iter,
|db_iter| {
db_iter
.next()
.transpose()
.map(|x| x.map(|((_, _, word2), value)| (word2, value)))
.map_err(|e| e.into())
},
|key, value| writer.insert(key, value).map_err(|e| e.into()),
)?;
drop(db_iter);
}
}
// and then we write the grenad into the DB
// Since the grenad contains only new prefixes, we know in advance that none
// of its elements already exist in the DB, thus there is no need to specify
// how to merge conflicting elements
write_into_lmdb_database_without_merging(
wtxn,
*prefix_word_pair_proximity_docids.as_polymorph(),
writer,
)?;
// All of the word prefix pairs in the database that have a w2
// that is contained in the `suppr_pw` set must be removed as well.
if !del_prefix_fst_words.is_empty() {
let mut iter =
prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter_mut(wtxn)?;
while let Some(((_, prefix, _), _)) = iter.next().transpose()? {
if del_prefix_fst_words.contains(prefix.as_bytes()) {
// Delete this entry as the w2 prefix is no more in the words prefix fst.
unsafe { iter.del_current()? };
}
}
}
Ok(())
}
/// This is the core of the algorithm to initialise the Prefix Word Pair Proximity Docids database.
///
/// Its arguments are:
/// - an iterator over the words following the given `prefix` with the given `proximity`
/// - a closure to describe how to handle the new computed (proximity, prefix, word2) elements
fn execute_on_word_pairs_and_prefixes<I>(
proximity: u8,
prefix: &[u8],
iter: &mut I,
mut next_word2_and_docids: impl for<'a> FnMut(&'a mut I) -> Result<Option<(&'a [u8], &'a [u8])>>,
mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>,
) -> Result<()> {
let mut batch: BTreeMap<Vec<u8>, Vec<Cow<'static, [u8]>>> = BTreeMap::default();
// Memory usage check:
// The content of the loop will be called for each `word2` that follows a word beginning
// with `prefix` with the given proximity.
// In practice, I don't think the batch can ever get too big.
while let Some((word2, docids)) = next_word2_and_docids(iter)? {
let entry = batch.entry(word2.to_owned()).or_default();
entry.push(Cow::Owned(docids.to_owned()));
}
let mut key_buffer = Vec::with_capacity(512);
key_buffer.push(proximity);
key_buffer.extend_from_slice(prefix);
key_buffer.push(0);
let mut value_buffer = Vec::with_capacity(65_536);
for (word2, docids) in batch {
key_buffer.truncate(prefix.len() + 2);
value_buffer.clear();
key_buffer.extend_from_slice(&word2);
let data = if docids.len() > 1 {
CboRoaringBitmapCodec::merge_into(&docids, &mut value_buffer)?;
value_buffer.as_slice()
} else {
&docids[0]
};
insert(key_buffer.as_slice(), data)?;
}
Ok(())
}

View File

@@ -1,728 +0,0 @@
/*!
The word-prefix-pair-proximity-docids database is a database whose keys are of
the form `(proximity, word, prefix)` and the values are roaring bitmaps of
the documents which contain `word` followed by another word starting with
`prefix` at a distance of `proximity`.
The prefixes present in this database are only those that correspond to many
different words in the documents.
## How is it created/updated? (simplified version)
To compute it, we have access to (mainly) two inputs:
* a list of sorted prefixes, such as:
```text
c
ca
cat
d
do
dog
```
Note that only prefixes which correspond to more than a certain number of
different words from the database are included in this list.
* a sorted list of proximities and word pairs (the proximity is the distance between the two words),
associated with a roaring bitmap, such as:
```text
1 good doggo -> docids1: [8]
1 good door -> docids2: [7, 19, 20]
1 good ghost -> docids3: [1]
2 good dog -> docids4: [2, 5, 6]
2 horror cathedral -> docids5: [1, 2]
```
I illustrate a simplified version of the algorithm to create the word-prefix
pair-proximity database below:
1. **Outer loop:** First, we iterate over each proximity and word pair:
```text
proximity: 1
word1 : good
word2 : doggo
```
2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are
in the list of sorted prefixes. And we insert the key `prefix`
and the value (`docids`) to a sorted map which we call the batch. For example,
at the end of the first outer loop, we may have:
```text
Outer loop 1:
------------------------------
proximity: 1
word1 : good
word2 : doggo
docids : docids1
prefixes: [d, do, dog]
batch: [
d, -> [docids1]
do -> [docids1]
dog -> [docids1]
]
```
3. For illustration purposes, let's run through a second iteration of the outer loop:
```text
Outer loop 2:
------------------------------
proximity: 1
word1 : good
word2 : door
docids : docids2
prefixes: [d, do, doo]
batch: [
d -> [docids1, docids2]
do -> [docids1, docids2]
dog -> [docids1]
doo -> [docids2]
]
```
Notice that there were some conflicts which were resolved by merging the
conflicting values together. Also, an additional prefix was added at the
end of the batch.
4. On the third iteration of the outer loop, we have:
```text
Outer loop 3:
------------------------------
proximity: 1
word1 : good
word2 : ghost
```
5. Because `word2` begins with a different letter than the previous `word2`,
we know that all the prefixes of `word2` are greater than the prefixes of the
previous `word2`. Therefore, we know that we can insert every element from the
batch into the database before proceeding any further. This operation is called
flushing the batch. Flushing the batch should also be done whenever:
* `proximity` is different from the previous `proximity`.
* `word1` is different from the previous `word1`.
* `word2` starts with a different letter than the previous `word2`.
6. **Flushing the batch:** to flush the batch, we iterate over its elements:
```text
Flushing Batch loop 1:
------------------------------
proximity : 1
word1 : good
prefix : d
docids : [docids1, docids2]
```
We then merge the array of `docids` (of type `Vec<Vec<u8>>`) using
`merge_cbo_roaring_bitmap` in order to get a single byte vector representing a
roaring bitmap of all the document ids where `word1` is followed by `prefix`
at a distance of `proximity`. A sketch of this merging step is given after the list.
Once we have done that, we insert `(proximity, word1, prefix) -> merged_docids`
into the database.
7. That's it! ... except...
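To make that merging step concrete, here is a minimal sketch using the plain
`roaring` serialization format. The helper name `merge_serialized_bitmaps` is
hypothetical, and the real code goes through `CboRoaringBitmapCodec::merge_into`,
which also handles milli's compact integer-list encoding; only the
union-then-serialize shape is illustrated here:
```rust
use roaring::RoaringBitmap;

/// Hypothetical helper illustrating the flush-time merge: deserialize every
/// serialized bitmap gathered for a prefix, union them, then re-serialize.
fn merge_serialized_bitmaps(serialized: &[Vec<u8>]) -> std::io::Result<Vec<u8>> {
    let mut merged = RoaringBitmap::new();
    for bytes in serialized {
        merged |= RoaringBitmap::deserialize_from(&bytes[..])?;
    }
    let mut out = Vec::new();
    merged.serialize_into(&mut out)?;
    Ok(out)
}
```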
## How is it created/updated (continued)
I lied a little bit about the input data. In reality, we get two sets of the
inputs described above, which come from different places:
* For the list of sorted prefixes, we have:
1. `new_prefixes`, which are all the prefixes that were not present in the
database before the insertion of the new documents
2. `common_prefixes` which are the prefixes that are present both in the
database and in the newly added documents
* For the list of word pairs and proximities, we have:
1. `new_word_pairs`, which is the list of word pairs and their proximities
present in the newly added documents
2. `word_pairs_db`, which is the list of word pairs from the database.
This list includes all elements in `new_word_pairs` since `new_word_pairs`
was added to the database prior to calling the `WordPrefix::execute`
function.
To update the prefix database correctly, we call the algorithm described earlier first
on (`common_prefixes`, `new_word_pairs`) and then on (`new_prefixes`, `word_pairs_db`).
Thus:
1. For all the word pairs that were already present in the DB, we insert them
again with the `new_prefixes`. Calling the algorithm on them with the
`common_prefixes` would not result in any new data.
2. For all the new word pairs, we insert them twice: first with the `common_prefixes`,
and then, because they are part of `word_pairs_db`, with the `new_prefixes`.
Note, also, that since we read data from the database when iterating over
`word_pairs_db`, we cannot insert the computed word-prefix-pair-proximity-
docids from the batch directly into the database (we would have a concurrent
reader and writer). Therefore, when calling the algorithm on
`(new_prefixes, word_pairs_db)`, we insert the computed
`((proximity, word, prefix), docids)` elements in an intermediary grenad
Writer instead of the DB. At the end of the outer loop, we finally read from
the grenad and insert its elements in the database.
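In short, the two passes described above can be summarised as:
```text
pass 1: (common_prefixes, new_word_pairs) -> inserted directly into the DB
pass 2: (new_prefixes, word_pairs_db)     -> written to an intermediary grenad
                                             Writer, then copied into the DB
```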
*/
use std::borrow::Cow;
use std::collections::HashSet;
use grenad::CompressionType;
use heed::types::ByteSlice;
use heed::BytesDecode;
use log::debug;
use crate::update::index_documents::{create_writer, CursorClonableMmap};
use crate::update::prefix_word_pairs::{
insert_into_database, write_into_lmdb_database_without_merging,
};
use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec};
#[allow(clippy::too_many_arguments)]
#[logging_timer::time]
pub fn index_word_prefix_database(
wtxn: &mut heed::RwTxn,
word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
word_prefix_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
max_proximity: u8,
max_prefix_length: usize,
new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>,
new_prefix_fst_words: &[String],
common_prefix_fst_words: &[&[String]],
del_prefix_fst_words: &HashSet<Vec<u8>>,
chunk_compression_type: CompressionType,
chunk_compression_level: Option<u32>,
) -> Result<()> {
puffin::profile_function!();
debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
// Make a prefix trie from the common prefixes that are no longer than max_prefix_length
let prefixes = PrefixTrieNode::from_sorted_prefixes(
common_prefix_fst_words
.iter()
.flat_map(|s| s.iter())
.map(|s| s.as_str())
.filter(|s| s.len() <= max_prefix_length),
);
// If the prefix trie is not empty, then we can iterate over all new
// word pairs to look for new (proximity, word1, common_prefix) elements
// to insert in the DB
if !prefixes.is_empty() {
let mut cursor = new_word_pair_proximity_docids.into_cursor()?;
// This is the core of the algorithm
execute_on_word_pairs_and_prefixes(
// the first two arguments tell how to iterate over the new word pairs
&mut cursor,
|cursor| {
if let Some((key, value)) = cursor.move_on_next()? {
let (proximity, word1, word2) =
UncheckedU8StrStrCodec::bytes_decode(key).ok_or(heed::Error::Decoding)?;
Ok(Some(((proximity, word1, word2), value)))
} else {
Ok(None)
}
},
&prefixes,
max_proximity,
// and this argument tells what to do with each new key (proximity, word1, prefix) and value (roaring bitmap)
|key, value| {
insert_into_database(
wtxn,
*word_prefix_pair_proximity_docids.as_polymorph(),
key,
value,
)
},
)?;
}
// Now we do the same thing with the new prefixes and all word pairs in the DB
let prefixes = PrefixTrieNode::from_sorted_prefixes(
new_prefix_fst_words.iter().map(|s| s.as_str()).filter(|s| s.len() <= max_prefix_length),
);
if !prefixes.is_empty() {
let mut db_iter = word_pair_proximity_docids
.remap_key_type::<UncheckedU8StrStrCodec>()
.remap_data_type::<ByteSlice>()
.iter(wtxn)?;
// Since we read the DB, we can't write to it directly, so we add each new (proximity, word1, prefix)
// element in an intermediary grenad
let mut writer =
create_writer(chunk_compression_type, chunk_compression_level, tempfile::tempfile()?);
execute_on_word_pairs_and_prefixes(
&mut db_iter,
|db_iter| db_iter.next().transpose().map_err(|e| e.into()),
&prefixes,
max_proximity,
|key, value| writer.insert(key, value).map_err(|e| e.into()),
)?;
drop(db_iter);
// and then we write the grenad into the DB
// Since the grenad contains only new prefixes, we know in advance that none
// of its elements already exist in the DB, thus there is no need to specify
// how to merge conflicting elements
write_into_lmdb_database_without_merging(
wtxn,
*word_prefix_pair_proximity_docids.as_polymorph(),
writer,
)?;
}
// All of the word prefix pairs in the database that have a word2 prefix
// contained in the `del_prefix_fst_words` set must be removed as well.
if !del_prefix_fst_words.is_empty() {
let mut iter =
word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>().iter_mut(wtxn)?;
while let Some(((_, _, prefix), _)) = iter.next().transpose()? {
if del_prefix_fst_words.contains(prefix.as_bytes()) {
// Delete this entry, as the word2 prefix is no longer in the words prefix FST.
unsafe { iter.del_current()? };
}
}
}
Ok(())
}
/// This is the core of the algorithm to initialise the Word Prefix Pair Proximity Docids database.
///
/// Its main arguments are:
/// 1. a sorted iterator over ((proximity, word1, word2), docids) elements
/// 2. a prefix trie
/// 3. a closure to describe how to handle the new computed (proximity, word1, prefix) elements
///
/// For more information about what this function does, read the module documentation.
fn execute_on_word_pairs_and_prefixes<I>(
iter: &mut I,
mut next_word_pair_proximity: impl for<'a> FnMut(
&'a mut I,
) -> Result<
Option<((u8, &'a [u8], &'a [u8]), &'a [u8])>,
>,
prefixes: &PrefixTrieNode,
max_proximity: u8,
mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>,
) -> Result<()> {
let mut batch = PrefixAndProximityBatch::default();
let mut prev_word2_start = 0;
// Optimisation: the child index at the root of the prefix trie from which to start searching
let mut prefix_search_start = PrefixTrieNodeSearchStart(0);
// Optimisation: true if there are no potential prefixes for the current word2 based on its first letter
let mut empty_prefixes = false;
let mut prefix_buffer = Vec::with_capacity(8);
let mut merge_buffer = Vec::with_capacity(65_536);
while let Some(((proximity, word1, word2), data)) = next_word_pair_proximity(iter)? {
// stop indexing if the proximity is over the threshold
if proximity > max_proximity {
break;
};
let word2_start_different_than_prev = word2[0] != prev_word2_start;
// if there were no potential prefixes for the previous word2 based on its first letter,
// and if the current word2 starts with the same letter, then there is also no potential
// prefixes for the current word2, and we can skip to the next iteration
if empty_prefixes && !word2_start_different_than_prev {
continue;
}
// if the proximity is different from the previous one, OR
// if word1 is different from the previous word1, OR
// if the start of word2 is different from the previous start of word2,
// THEN we'll need to flush the batch
let prox_different_than_prev = proximity != batch.proximity;
let word1_different_than_prev = word1 != batch.word1;
if prox_different_than_prev || word1_different_than_prev || word2_start_different_than_prev
{
batch.flush(&mut merge_buffer, &mut insert)?;
batch.proximity = proximity;
// don't forget to reset the value of batch.word1 and prev_word2_start
if word1_different_than_prev {
batch.word1.clear();
batch.word1.extend_from_slice(word1);
}
if word2_start_different_than_prev {
prev_word2_start = word2[0];
}
prefix_search_start.0 = 0;
// Optimisation: find the search start in the prefix trie to iterate over the prefixes of word2
empty_prefixes = !prefixes.set_search_start(word2, &mut prefix_search_start);
}
if !empty_prefixes {
// All conditions are satisfied, we can now insert each new prefix of word2 into the batch
prefix_buffer.clear();
prefixes.for_each_prefix_of(
word2,
&mut prefix_buffer,
&prefix_search_start,
|prefix_buffer| {
batch.insert(prefix_buffer, data.to_vec());
},
);
}
}
batch.flush(&mut merge_buffer, &mut insert)?;
Ok(())
}
/**
A map structure whose keys are prefixes and whose values are vectors of bitstrings (serialized roaring bitmaps).
The keys are sorted and conflicts are resolved by merging the vectors of bitstrings together.
It is used to ensure that all `((proximity, word1, prefix), docids)` entries are inserted into the database efficiently and in sorted order.
The batch is flushed as often as possible, that is, whenever we are sure that every `(proximity, word1, prefix)` key derived from its content
can be inserted into the database in sorted order. When it is flushed, it calls a user-provided closure with the following arguments:
- key : (proximity, word1, prefix) as bytes
- value : merged roaring bitmaps from all values associated with prefix in the batch, serialised to bytes
*/
#[derive(Default)]
struct PrefixAndProximityBatch {
proximity: u8,
word1: Vec<u8>,
#[allow(clippy::type_complexity)]
batch: Vec<(Vec<u8>, Vec<Cow<'static, [u8]>>)>,
}
impl PrefixAndProximityBatch {
/// Insert the new key and value into the batch
///
/// The key must either exist in the batch or be greater than all existing keys
fn insert(&mut self, new_key: &[u8], new_value: Vec<u8>) {
match self.batch.iter_mut().find(|el| el.0 == new_key) {
Some((_prefix, docids)) => docids.push(Cow::Owned(new_value)),
None => self.batch.push((new_key.to_vec(), vec![Cow::Owned(new_value)])),
}
}
/// Empties the batch, calling `insert` on each element.
///
/// The key given to `insert` is `(proximity, word1, prefix)` and the value is the associated merged roaring bitmap.
fn flush(
&mut self,
merge_buffer: &mut Vec<u8>,
insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>,
) -> Result<()> {
let PrefixAndProximityBatch { proximity, word1, batch } = self;
if batch.is_empty() {
return Ok(());
}
merge_buffer.clear();
let mut buffer = Vec::with_capacity(word1.len() + 1 + 6);
buffer.push(*proximity);
buffer.extend_from_slice(word1);
buffer.push(0);
for (key, mergeable_data) in batch.drain(..) {
buffer.truncate(1 + word1.len() + 1);
buffer.extend_from_slice(key.as_slice());
let data = if mergeable_data.len() > 1 {
CboRoaringBitmapCodec::merge_into(&mergeable_data, merge_buffer)?;
merge_buffer.as_slice()
} else {
&mergeable_data[0]
};
insert(buffer.as_slice(), data)?;
merge_buffer.clear();
}
Ok(())
}
}
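For illustration, a hypothetical flow through this structure could look like the sketch below (`demo_batch` and `serialized_docids` are made up for the example; the real call sites are in `execute_on_word_pairs_and_prefixes` above):
```rust
// Illustrative only: feed two sorted prefixes into the batch, then flush.
fn demo_batch(serialized_docids: Vec<u8>) -> Result<()> {
    let mut batch = PrefixAndProximityBatch::default();
    batch.proximity = 1;
    batch.word1.extend_from_slice(b"good");
    // Keys must arrive in sorted order: "do" before "dog".
    batch.insert(b"do", serialized_docids.clone());
    batch.insert(b"dog", serialized_docids);
    let mut merge_buffer = Vec::with_capacity(1024);
    batch.flush(&mut merge_buffer, &mut |key, _value| {
        // The key layout is [proximity][word1]\0[prefix].
        debug_assert_eq!(key[0], 1);
        Ok(())
    })
}
```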
/** A prefix trie. Used to iterate quickly over the prefixes of a word that are
within a set.
## Structure
The trie is made of nodes composed of:
1. a byte character (e.g. 'a')
2. whether the node is an end node or not
3. a list of children nodes, sorted by their byte character
For example, the trie that stores the strings `[ac, ae, ar, cei, cel, ch, r, rel, ri]`
is drawn below; the original diagram marked end nodes with a double border, here
they are marked with `*`:
(root)
├── a
│   ├── c*
│   ├── e*
│   └── r*
├── c
│   ├── e
│   │   ├── i*
│   │   └── l*
│   └── h*
└── r*
    ├── e
    │   └── l*
    └── i*
*/
#[derive(Default, Debug)]
struct PrefixTrieNode {
children: Vec<(PrefixTrieNode, u8)>,
is_end_node: bool,
}
#[derive(Debug)]
struct PrefixTrieNodeSearchStart(usize);
impl PrefixTrieNode {
fn is_empty(&self) -> bool {
self.children.is_empty()
}
/// Returns false if the trie does not contain a prefix of the given word.
/// Returns true if the trie *may* contain a prefix of the given word.
///
/// Moves the search start to the first node equal to the first letter of the word,
/// or to 0 otherwise.
fn set_search_start(&self, word: &[u8], search_start: &mut PrefixTrieNodeSearchStart) -> bool {
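// Assumes a non-empty trie: callers check `is_empty()` before searching.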
let byte = word[0];
if self.children[search_start.0].1 == byte {
true
} else {
match self.children[search_start.0..].binary_search_by_key(&byte, |x| x.1) {
Ok(position) => {
search_start.0 += position;
true
}
Err(_) => {
search_start.0 = 0;
false
}
}
}
}
fn from_sorted_prefixes<'a>(prefixes: impl Iterator<Item = &'a str>) -> Self {
let mut node = PrefixTrieNode::default();
for prefix in prefixes {
node.insert_sorted_prefix(prefix.as_bytes().iter());
}
node
}
fn insert_sorted_prefix(&mut self, mut prefix: std::slice::Iter<u8>) {
if let Some(&c) = prefix.next() {
if let Some((node, byte)) = self.children.last_mut() {
if *byte == c {
node.insert_sorted_prefix(prefix);
return;
}
}
let mut new_node = PrefixTrieNode::default();
new_node.insert_sorted_prefix(prefix);
self.children.push((new_node, c));
} else {
self.is_end_node = true;
}
}
/// Call the given closure on each prefix of the word contained in the prefix trie.
///
/// The search starts from the given `search_start`.
fn for_each_prefix_of(
&self,
word: &[u8],
buffer: &mut Vec<u8>,
search_start: &PrefixTrieNodeSearchStart,
mut do_fn: impl FnMut(&mut Vec<u8>),
) {
let first_byte = word[0];
let mut cur_node = self;
buffer.push(first_byte);
if let Some((child_node, c)) =
cur_node.children[search_start.0..].iter().find(|(_, c)| *c >= first_byte)
{
if *c == first_byte {
cur_node = child_node;
if cur_node.is_end_node {
do_fn(buffer);
}
for &byte in &word[1..] {
buffer.push(byte);
if let Some((child_node, c)) =
cur_node.children.iter().find(|(_, c)| *c >= byte)
{
if *c == byte {
cur_node = child_node;
if cur_node.is_end_node {
do_fn(buffer);
}
} else {
break;
}
} else {
break;
}
}
}
}
}
}
#[cfg(test)]
mod tests {
use roaring::RoaringBitmap;
use super::*;
use crate::{CboRoaringBitmapCodec, U8StrStrCodec};
fn check_prefixes(
trie: &PrefixTrieNode,
search_start: &PrefixTrieNodeSearchStart,
word: &str,
expected_prefixes: &[&str],
) {
let mut actual_prefixes = vec![];
trie.for_each_prefix_of(word.as_bytes(), &mut Vec::new(), search_start, |x| {
let s = String::from_utf8(x.to_owned()).unwrap();
actual_prefixes.push(s);
});
assert_eq!(actual_prefixes, expected_prefixes);
}
#[test]
fn test_trie() {
let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([
"1", "19", "2", "a", "ab", "ac", "ad", "al", "am", "an", "ap", "ar", "as", "at", "au",
"b", "ba", "bar", "be", "bi", "bl", "bla", "bo", "br", "bra", "bri", "bro", "bu", "c",
"ca", "car", "ce", "ch", "cha", "che", "chi", "ci", "cl", "cla", "co", "col", "com",
"comp", "con", "cons", "cont", "cor", "cou", "cr", "cu", "d", "da", "de", "dec", "des",
"di", "dis", "do", "dr", "du", "e", "el", "em", "en", "es", "ev", "ex", "exp", "f",
"fa", "fe", "fi", "fl", "fo", "for", "fr", "fra", "fre", "fu", "g", "ga", "ge", "gi",
"gl", "go", "gr", "gra", "gu", "h", "ha", "har", "he", "hea", "hi", "ho", "hu", "i",
"im", "imp", "in", "ind", "ins", "int", "inte", "j", "ja", "je", "jo", "ju", "k", "ka",
"ke", "ki", "ko", "l", "la", "le", "li", "lo", "lu", "m", "ma", "mal", "man", "mar",
"mat", "mc", "me", "mi", "min", "mis", "mo", "mon", "mor", "mu", "n", "na", "ne", "ni",
"no", "o", "or", "ou", "ov", "ove", "over", "p", "pa", "par", "pe", "per", "ph", "pi",
"pl", "po", "pr", "pre", "pro", "pu", "q", "qu", "r", "ra", "re", "rec", "rep", "res",
"ri", "ro", "ru", "s", "sa", "san", "sc", "sch", "se", "sh", "sha", "shi", "sho", "si",
"sk", "sl", "sn", "so", "sp", "st", "sta", "ste", "sto", "str", "su", "sup", "sw", "t",
"ta", "te", "th", "ti", "to", "tr", "tra", "tri", "tu", "u", "un", "v", "va", "ve",
"vi", "vo", "w", "wa", "we", "wh", "wi", "wo", "y", "yo", "z",
]));
let mut search_start = PrefixTrieNodeSearchStart(0);
let is_empty = !trie.set_search_start("affair".as_bytes(), &mut search_start);
assert!(!is_empty);
assert_eq!(search_start.0, 2);
check_prefixes(&trie, &search_start, "affair", &["a"]);
check_prefixes(&trie, &search_start, "shampoo", &["s", "sh", "sha"]);
let is_empty = !trie.set_search_start("unique".as_bytes(), &mut search_start);
assert!(!is_empty);
assert_eq!(trie.children[search_start.0].1, b'u');
check_prefixes(&trie, &search_start, "unique", &["u", "un"]);
// NOTE: this should fail, because the search start is already beyond 'a'
let is_empty = trie.set_search_start("abba".as_bytes(), &mut search_start);
assert!(!is_empty);
// search start is reset
assert_eq!(search_start.0, 0);
let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([
"arb", "arbre", "cat", "catto",
]));
check_prefixes(&trie, &search_start, "arbres", &["arb", "arbre"]);
check_prefixes(&trie, &search_start, "cattos", &["cat", "catto"]);
}
#[test]
fn test_execute_on_word_pairs_and_prefixes() {
let prefixes = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([
"arb", "arbre", "cat", "catto",
]));
let mut serialised_bitmap123 = vec![];
let mut bitmap123 = RoaringBitmap::new();
bitmap123.insert(1);
bitmap123.insert(2);
bitmap123.insert(3);
CboRoaringBitmapCodec::serialize_into(&bitmap123, &mut serialised_bitmap123);
let mut serialised_bitmap456 = vec![];
let mut bitmap456 = RoaringBitmap::new();
bitmap456.insert(4);
bitmap456.insert(5);
bitmap456.insert(6);
CboRoaringBitmapCodec::serialize_into(&bitmap456, &mut serialised_bitmap456);
let mut serialised_bitmap789 = vec![];
let mut bitmap789 = RoaringBitmap::new();
bitmap789.insert(7);
bitmap789.insert(8);
bitmap789.insert(9);
CboRoaringBitmapCodec::serialize_into(&bitmap789, &mut serialised_bitmap789);
let mut serialised_bitmap_ranges = vec![];
let mut bitmap_ranges = RoaringBitmap::new();
bitmap_ranges.insert_range(63_000..65_000);
bitmap_ranges.insert_range(123_000..128_000);
CboRoaringBitmapCodec::serialize_into(&bitmap_ranges, &mut serialised_bitmap_ranges);
let word_pairs = [
((1, "healthy", "arbres"), &serialised_bitmap123),
((1, "healthy", "boat"), &serialised_bitmap123),
((1, "healthy", "ca"), &serialised_bitmap123),
((1, "healthy", "cats"), &serialised_bitmap456),
((1, "healthy", "cattos"), &serialised_bitmap123),
((1, "jittery", "cat"), &serialised_bitmap123),
((1, "jittery", "cata"), &serialised_bitmap456),
((1, "jittery", "catb"), &serialised_bitmap789),
((1, "jittery", "catc"), &serialised_bitmap_ranges),
((2, "healthy", "arbre"), &serialised_bitmap123),
((2, "healthy", "arbres"), &serialised_bitmap456),
((2, "healthy", "cats"), &serialised_bitmap789),
((2, "healthy", "cattos"), &serialised_bitmap_ranges),
((3, "healthy", "arbre"), &serialised_bitmap456),
((3, "healthy", "arbres"), &serialised_bitmap789),
];
let expected_result = [
((1, "healthy", "arb"), bitmap123.clone()),
((1, "healthy", "arbre"), bitmap123.clone()),
((1, "healthy", "cat"), &bitmap456 | &bitmap123),
((1, "healthy", "catto"), bitmap123.clone()),
((1, "jittery", "cat"), (&bitmap123 | &bitmap456 | &bitmap789 | &bitmap_ranges)),
((2, "healthy", "arb"), &bitmap123 | &bitmap456),
((2, "healthy", "arbre"), &bitmap123 | &bitmap456),
((2, "healthy", "cat"), &bitmap789 | &bitmap_ranges),
((2, "healthy", "catto"), bitmap_ranges.clone()),
];
let mut result = vec![];
let mut iter =
IntoIterator::into_iter(word_pairs).map(|((proximity, word1, word2), data)| {
((proximity, word1.as_bytes(), word2.as_bytes()), data.as_slice())
});
execute_on_word_pairs_and_prefixes(
&mut iter,
|iter| Ok(iter.next()),
&prefixes,
2,
|k, v| {
let (proximity, word1, prefix) = U8StrStrCodec::bytes_decode(k).unwrap();
let bitmap = CboRoaringBitmapCodec::bytes_decode(v).unwrap();
result.push(((proximity.to_owned(), word1.to_owned(), prefix.to_owned()), bitmap));
Ok(())
},
)
.unwrap();
for (x, y) in result.into_iter().zip(IntoIterator::into_iter(expected_result)) {
let ((actual_proximity, actual_word1, actual_prefix), actual_bitmap) = x;
let ((expected_proximity, expected_word1, expected_prefix), expected_bitmap) = y;
assert_eq!(actual_word1, expected_word1);
assert_eq!(actual_prefix, expected_prefix);
assert_eq!(actual_proximity, expected_proximity);
assert_eq!(actual_bitmap, expected_bitmap);
}
}
}

View File

@ -4,9 +4,11 @@ use grenad::CompressionType;
 use heed::types::{ByteSlice, Str};
 use heed::Database;
+use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvWriterDelAdd};
 use crate::update::index_documents::{
-    create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key,
-    CursorClonableMmap, MergeFn,
+    create_sorter, merge_deladd_cbo_roaring_bitmaps,
+    merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key,
+    write_sorter_into_database, CursorClonableMmap, MergeFn,
 };
 use crate::{CboRoaringBitmapCodec, Result};
@ -51,7 +53,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
         // and write into it at the same time, therefore we write into another file.
         let mut prefix_docids_sorter = create_sorter(
             grenad::SortAlgorithm::Unstable,
-            merge_cbo_roaring_bitmaps,
+            merge_deladd_cbo_roaring_bitmaps,
             self.chunk_compression_type,
             self.chunk_compression_level,
             self.max_nb_chunks,
@ -92,11 +94,16 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
         // We fetch the docids associated to the newly added word prefix fst only.
         let db = self.word_docids.remap_data_type::<ByteSlice>();
+        let mut buffer = Vec::new();
         for prefix in new_prefix_fst_words {
             let prefix = std::str::from_utf8(prefix.as_bytes())?;
             for result in db.prefix_iter(self.wtxn, prefix)? {
                 let (_word, data) = result?;
-                prefix_docids_sorter.insert(prefix, data)?;
+                buffer.clear();
+                let mut writer = KvWriterDelAdd::new(&mut buffer);
+                writer.insert(DelAdd::Addition, data)?;
+                prefix_docids_sorter.insert(prefix, writer.into_inner()?)?;
             }
         }
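The pattern above recurs throughout this PR: instead of inserting raw bitmap bytes into the sorter, each value is first wrapped in a del/add payload, with only the `Addition` side populated here. A minimal sketch of that wrapping, reusing only the calls visible in the diff:
```rust
// Sketch of the del/add wrapping introduced by this PR; `buffer`, `data`,
// `prefix` and `prefix_docids_sorter` are the surrounding locals shown above.
buffer.clear();
let mut writer = KvWriterDelAdd::new(&mut buffer);
writer.insert(DelAdd::Addition, data)?; // record the value on the "add" side
prefix_docids_sorter.insert(prefix, writer.into_inner()?)?;
```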
@ -110,12 +117,16 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
         drop(iter);
+        let database_is_empty = self.word_prefix_docids.is_empty(self.wtxn)?;
         // We finally write the word prefix docids into the LMDB database.
-        sorter_into_lmdb_database(
-            self.wtxn,
-            *self.word_prefix_docids.as_polymorph(),
-            prefix_docids_sorter,
-            merge_cbo_roaring_bitmaps,
+        write_sorter_into_database(
+            prefix_docids_sorter,
+            &self.word_prefix_docids,
+            self.wtxn,
+            database_is_empty,
+            deladd_serialize_add_side,
+            merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
         )?;
         Ok(())

View File

@ -9,9 +9,11 @@ use log::debug;
 use crate::error::SerializationError;
 use crate::heed_codec::StrBEU16Codec;
 use crate::index::main_key::WORDS_PREFIXES_FST_KEY;
+use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvWriterDelAdd};
 use crate::update::index_documents::{
-    create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key,
-    CursorClonableMmap, MergeFn,
+    create_sorter, merge_deladd_cbo_roaring_bitmaps,
+    merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key,
+    write_sorter_into_database, CursorClonableMmap, MergeFn,
 };
 use crate::{CboRoaringBitmapCodec, Result};
@ -55,7 +57,7 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> {
         let mut prefix_integer_docids_sorter = create_sorter(
             grenad::SortAlgorithm::Unstable,
-            merge_cbo_roaring_bitmaps,
+            merge_deladd_cbo_roaring_bitmaps,
             self.chunk_compression_type,
             self.chunk_compression_level,
             self.max_nb_chunks,
@ -108,6 +110,7 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> {
         // We fetch the docids associated to the newly added word prefix fst only.
         let db = self.word_database.remap_data_type::<ByteSlice>();
+        let mut buffer = Vec::new();
         for prefix_bytes in new_prefix_fst_words {
             let prefix = str::from_utf8(prefix_bytes.as_bytes()).map_err(|_| {
                 SerializationError::Decoding { db_name: Some(WORDS_PREFIXES_FST_KEY) }
@ -123,7 +126,11 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> {
                 if word.starts_with(prefix) {
                     let key = (prefix, pos);
                     let bytes = StrBEU16Codec::bytes_encode(&key).unwrap();
-                    prefix_integer_docids_sorter.insert(bytes, data)?;
+                    buffer.clear();
+                    let mut writer = KvWriterDelAdd::new(&mut buffer);
+                    writer.insert(DelAdd::Addition, data)?;
+                    prefix_integer_docids_sorter.insert(bytes, writer.into_inner()?)?;
                 }
             }
         }
@ -143,12 +150,16 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> {
             drop(iter);
         }
+        let database_is_empty = self.prefix_database.is_empty(self.wtxn)?;
         // We finally write all the word prefix integer docids into the LMDB database.
-        sorter_into_lmdb_database(
-            self.wtxn,
-            *self.prefix_database.as_polymorph(),
-            prefix_integer_docids_sorter,
-            merge_cbo_roaring_bitmaps,
+        write_sorter_into_database(
+            prefix_integer_docids_sorter,
+            &self.prefix_database,
+            self.wtxn,
+            database_is_empty,
+            deladd_serialize_add_side,
+            merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
         )?;
         Ok(())
@ -159,6 +170,7 @@ fn write_prefixes_in_sorter(
     prefixes: &mut HashMap<Vec<u8>, Vec<Vec<u8>>>,
     sorter: &mut grenad::Sorter<MergeFn>,
 ) -> Result<()> {
+    // TODO: Merge before insertion.
     for (key, data_slices) in prefixes.drain() {
         for data in data_slices {
             if valid_lmdb_key(&key) {