Add prefix_word_pair_proximity database

Similar to the word_prefix_pair_proximity database, except that its keys are:
(proximity, prefix, word2)
Loïc Lecrenier 2022-09-14 15:33:13 +02:00 committed by Loïc Lecrenier
parent 1dbbd8694f
commit 264a04922d
14 changed files with 653 additions and 339 deletions
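
For context (not part of the commit), a minimal sketch of the two key shapes side by side, assuming the `[proximity][str1]\0[str2]` byte layout that the key-building code later in this diff suggests for `U8StrStrCodec`:

// Illustrative only: hypothetical helpers showing the shape of the keys.
fn word_prefix_key(proximity: u8, word1: &str, prefix: &str) -> Vec<u8> {
    // word_prefix_pair_proximity_docids key: (proximity, word1, prefix)
    let mut key = vec![proximity];
    key.extend_from_slice(word1.as_bytes());
    key.push(0);
    key.extend_from_slice(prefix.as_bytes());
    key
}

fn prefix_word_key(proximity: u8, prefix: &str, word2: &str) -> Vec<u8> {
    // prefix_word_pair_proximity_docids key: (proximity, prefix, word2)
    let mut key = vec![proximity];
    key.extend_from_slice(prefix.as_bytes());
    key.push(0);
    key.extend_from_slice(word2.as_bytes());
    key
}
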

View File

@ -71,6 +71,7 @@ pub mod db_name {
    pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions";
    pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids";
    pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids";
+   pub const PREFIX_WORD_PAIR_PROXIMITY_DOCIDS: &str = "prefix-word-pair-proximity-docids";
    pub const WORD_POSITION_DOCIDS: &str = "word-position-docids";
    pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids";
    pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids";
@ -109,6 +110,8 @@ pub struct Index {
    pub word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
    /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears.
    pub word_prefix_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
+   /// Maps the proximity between a pair of prefix and word with all the docids where this relation appears.
+   pub prefix_word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
    /// Maps the word and the position with the docids that corresponds to it.
    pub word_position_docids: Database<StrBEU32Codec, CboRoaringBitmapCodec>,
@ -138,7 +141,7 @@ impl Index {
    pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> Result<Index> {
        use db_name::*;
-       options.max_dbs(17);
+       options.max_dbs(18);
        unsafe { options.flag(Flags::MdbAlwaysFreePages) };
        let env = options.open(path)?;
@ -151,6 +154,8 @@ impl Index {
        let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
        let word_prefix_pair_proximity_docids =
            env.create_database(Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?;
+       let prefix_word_pair_proximity_docids =
+           env.create_database(Some(PREFIX_WORD_PAIR_PROXIMITY_DOCIDS))?;
        let word_position_docids = env.create_database(Some(WORD_POSITION_DOCIDS))?;
        let field_id_word_count_docids = env.create_database(Some(FIELD_ID_WORD_COUNT_DOCIDS))?;
        let word_prefix_position_docids = env.create_database(Some(WORD_PREFIX_POSITION_DOCIDS))?;
@ -175,6 +180,7 @@ impl Index {
            docid_word_positions,
            word_pair_proximity_docids,
            word_prefix_pair_proximity_docids,
+           prefix_word_pair_proximity_docids,
            word_position_docids,
            word_prefix_position_docids,
            field_id_word_count_docids,

View File

@ -198,6 +198,15 @@ pub fn snap_word_prefix_pair_proximity_docids(index: &Index) -> String {
    });
    snap
}
+ pub fn snap_prefix_word_pair_proximity_docids(index: &Index) -> String {
+     let snap = make_db_snap_from_iter!(index, prefix_word_pair_proximity_docids, |(
+         (proximity, prefix, word2),
+         b,
+     )| {
+         &format!("{proximity:<2} {prefix:<4} {word2:<16} {}", display_bitmap(&b))
+     });
+     snap
+ }
pub fn snap_word_position_docids(index: &Index) -> String {
    let snap = make_db_snap_from_iter!(index, word_position_docids, |((word, position), b)| {
        &format!("{word:<16} {position:<6} {}", display_bitmap(&b))
@ -427,6 +436,9 @@ macro_rules! full_snap_of_db {
    ($index:ident, word_prefix_pair_proximity_docids) => {{
        $crate::snapshot_tests::snap_word_prefix_pair_proximity_docids(&$index)
    }};
+   ($index:ident, prefix_word_pair_proximity_docids) => {{
+       $crate::snapshot_tests::snap_prefix_word_pair_proximity_docids(&$index)
+   }};
    ($index:ident, word_position_docids) => {{
        $crate::snapshot_tests::snap_word_position_docids(&$index)
    }};

View File

@ -25,6 +25,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
            docid_word_positions,
            word_pair_proximity_docids,
            word_prefix_pair_proximity_docids,
+           prefix_word_pair_proximity_docids,
            word_position_docids,
            field_id_word_count_docids,
            word_prefix_position_docids,
@ -66,6 +67,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
        docid_word_positions.clear(self.wtxn)?;
        word_pair_proximity_docids.clear(self.wtxn)?;
        word_prefix_pair_proximity_docids.clear(self.wtxn)?;
+       prefix_word_pair_proximity_docids.clear(self.wtxn)?;
        word_position_docids.clear(self.wtxn)?;
        field_id_word_count_docids.clear(self.wtxn)?;
        word_prefix_position_docids.clear(self.wtxn)?;

View File

@ -183,6 +183,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
            word_pair_proximity_docids,
            field_id_word_count_docids,
            word_prefix_pair_proximity_docids,
+           prefix_word_pair_proximity_docids,
            word_position_docids,
            word_prefix_position_docids,
            facet_id_f64_docids,
@ -327,26 +328,26 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
            self.index.put_words_prefixes_fst(self.wtxn, &new_words_prefixes_fst)?;
        }

-       // We delete the documents ids from the word prefix pair proximity database docids
-       // and remove the empty pairs too.
-       let db = word_prefix_pair_proximity_docids.remap_key_type::<ByteSlice>();
-       let mut iter = db.iter_mut(self.wtxn)?;
-       while let Some(result) = iter.next() {
-           let (key, mut docids) = result?;
-           let previous_len = docids.len();
-           docids -= &self.to_delete_docids;
-           if docids.is_empty() {
-               // safety: we don't keep references from inside the LMDB database.
-               unsafe { iter.del_current()? };
-           } else if docids.len() != previous_len {
-               let key = key.to_owned();
-               // safety: we don't keep references from inside the LMDB database.
-               unsafe { iter.put_current(&key, &docids)? };
-           }
-       }
-       drop(iter);
+       for db in [word_prefix_pair_proximity_docids, prefix_word_pair_proximity_docids] {
+           // We delete the documents ids from the word prefix pair proximity database docids
+           // and remove the empty pairs too.
+           let db = db.remap_key_type::<ByteSlice>();
+           let mut iter = db.iter_mut(self.wtxn)?;
+           while let Some(result) = iter.next() {
+               let (key, mut docids) = result?;
+               let previous_len = docids.len();
+               docids -= &self.to_delete_docids;
+               if docids.is_empty() {
+                   // safety: we don't keep references from inside the LMDB database.
+                   unsafe { iter.del_current()? };
+               } else if docids.len() != previous_len {
+                   let key = key.to_owned();
+                   // safety: we don't keep references from inside the LMDB database.
+                   unsafe { iter.put_current(&key, &docids)? };
+               }
+           }
+       }

        // We delete the documents ids that are under the pairs of words,
        // it is faster and use no memory to iterate over all the words pairs than
        // to compute the cartesian product of every words of the deleted documents.
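
A minimal standalone sketch (not part of the diff) of the subtract-and-prune step performed by the loop above, with plain `roaring::RoaringBitmap` values standing in for the CBO-encoded ones:

use roaring::RoaringBitmap;

// Mirrors the loop body: remove the deleted documents from one entry's bitmap,
// then decide whether the entry must be dropped, rewritten, or left untouched.
enum EntryUpdate {
    Delete,
    Rewrite(RoaringBitmap),
    Keep,
}

fn prune_entry(docids: &RoaringBitmap, to_delete: &RoaringBitmap) -> EntryUpdate {
    let remaining = docids - to_delete;
    if remaining.is_empty() {
        EntryUpdate::Delete
    } else if remaining.len() != docids.len() {
        EntryUpdate::Rewrite(remaining)
    } else {
        EntryUpdate::Keep
    }
}
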

View File

@ -36,8 +36,8 @@ use crate::documents::{obkv_to_object, DocumentsBatchReader};
use crate::error::UserError;
pub use crate::update::index_documents::helpers::CursorClonableMmap;
use crate::update::{
-   self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids,
-   WordPrefixPairProximityDocids, WordPrefixPositionDocids, WordsPrefixesFst,
+   self, Facets, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep,
+   WordPrefixDocids, WordPrefixPositionDocids, WordsPrefixesFst,
};
use crate::{Index, Result, RoaringBitmapCodec};
@ -528,12 +528,7 @@ where
        if let Some(word_pair_proximity_docids) = word_pair_proximity_docids {
            // Run the word prefix pair proximity docids update operation.
-           let mut builder = WordPrefixPairProximityDocids::new(self.wtxn, self.index);
-           builder.chunk_compression_type = self.indexer_config.chunk_compression_type;
-           builder.chunk_compression_level = self.indexer_config.chunk_compression_level;
-           builder.max_nb_chunks = self.indexer_config.max_nb_chunks;
-           builder.max_memory = self.indexer_config.max_memory;
-           builder.execute(
+           PrefixWordPairsProximityDocids::new(self.wtxn, self.index).execute(
                word_pair_proximity_docids,
                &new_prefix_fst_words,
                &common_prefix_fst_words,

View File

@ -6,10 +6,10 @@ pub use self::index_documents::{
    DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod,
};
pub use self::indexer_config::IndexerConfig;
+ pub use self::prefix_word_pairs::PrefixWordPairsProximityDocids;
pub use self::settings::{Setting, Settings};
pub use self::update_step::UpdateIndexingStep;
pub use self::word_prefix_docids::WordPrefixDocids;
- pub use self::word_prefix_pair_proximity_docids::WordPrefixPairProximityDocids;
pub use self::words_prefix_position_docids::WordPrefixPositionDocids;
pub use self::words_prefixes_fst::WordsPrefixesFst;
@ -19,9 +19,9 @@ mod delete_documents;
mod facets;
mod index_documents;
mod indexer_config;
+ mod prefix_word_pairs;
mod settings;
mod update_step;
mod word_prefix_docids;
- mod word_prefix_pair_proximity_docids;
mod words_prefix_position_docids;
mod words_prefixes_fst;

View File

@ -0,0 +1,216 @@
use super::index_documents::{merge_cbo_roaring_bitmaps, CursorClonableMmap};
use crate::{Index, Result};
use heed::types::ByteSlice;
use std::{borrow::Cow, collections::HashSet, io::BufReader};
mod prefix_word;
mod word_prefix;
pub use prefix_word::index_prefix_word_database;
pub use word_prefix::index_word_prefix_database;
pub struct PrefixWordPairsProximityDocids<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
max_proximity: u8,
max_prefix_length: usize,
}
impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> {
pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> Self {
Self { wtxn, index, max_proximity: 4, max_prefix_length: 2 }
}
/// Set the maximum proximity required to make a prefix be part of the words prefixes
/// database. If two words are too far from the threshold the associated documents will
/// not be part of the prefix database.
///
/// Default value is 4. This value must be lower or equal than 7 and will be clamped
/// to this bound otherwise.
pub fn max_proximity(&mut self, value: u8) -> &mut Self {
self.max_proximity = value.max(7);
self
}
/// Set the maximum length the prefix of a word pair is allowed to have to be part of the words
/// prefixes database. If the prefix length is higher than the threshold, the associated documents
/// will not be part of the prefix database.
///
/// Default value is 2.
pub fn max_prefix_length(&mut self, value: usize) -> &mut Self {
self.max_prefix_length = value;
self
}
#[logging_timer::time("WordPrefixPairProximityDocids::{}")]
pub fn execute<'a>(
self,
new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>,
new_prefix_fst_words: &'a [String],
common_prefix_fst_words: &[&'a [String]],
del_prefix_fst_words: &HashSet<Vec<u8>>,
) -> Result<()> {
index_word_prefix_database(
self.wtxn,
self.index.word_pair_proximity_docids,
self.index.word_prefix_pair_proximity_docids,
self.max_proximity,
self.max_prefix_length,
new_word_pair_proximity_docids.clone(),
new_prefix_fst_words,
common_prefix_fst_words,
del_prefix_fst_words,
)?;
index_prefix_word_database(
self.wtxn,
self.index.word_pair_proximity_docids,
self.index.prefix_word_pair_proximity_docids,
self.max_proximity,
self.max_prefix_length,
new_word_pair_proximity_docids,
new_prefix_fst_words,
common_prefix_fst_words,
del_prefix_fst_words,
)?;
Ok(())
}
}
// This is adapted from `sorter_into_lmdb_database`
pub fn insert_into_database(
wtxn: &mut heed::RwTxn,
database: heed::PolyDatabase,
new_key: &[u8],
new_value: &[u8],
) -> Result<()> {
let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, new_key)?;
match iter.next().transpose()? {
Some((key, old_val)) if new_key == key => {
let val =
merge_cbo_roaring_bitmaps(key, &[Cow::Borrowed(old_val), Cow::Borrowed(new_value)])
.map_err(|_| {
// TODO just wrap this error?
crate::error::InternalError::IndexingMergingKeys {
process: "get-put-merge",
}
})?;
// safety: we use the new_key, not the one from the database iterator, to avoid undefined behaviour
unsafe { iter.put_current(new_key, &val)? };
}
_ => {
drop(iter);
database.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, new_value)?;
}
}
Ok(())
}
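
As an aside (not from the diff), the get-merge-put pattern of `insert_into_database` boils down to the following toy model, where a `BTreeMap` stands in for the LMDB database and a bitmap union stands in for `merge_cbo_roaring_bitmaps`:

use std::collections::BTreeMap;
use roaring::RoaringBitmap;

// Toy model of `insert_into_database`: merge with the existing value if the key
// is already present, otherwise insert the new value as-is.
fn upsert(db: &mut BTreeMap<Vec<u8>, RoaringBitmap>, key: &[u8], value: &RoaringBitmap) {
    match db.get_mut(key) {
        Some(existing) => *existing |= value,
        None => {
            db.insert(key.to_vec(), value.clone());
        }
    }
}
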
// This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`,
// but it uses `append` if the database is empty, and it assumes that the values in the
// writer don't conflict with values in the database.
pub fn write_into_lmdb_database_without_merging(
wtxn: &mut heed::RwTxn,
database: heed::PolyDatabase,
writer: grenad::Writer<std::fs::File>,
) -> Result<()> {
let file = writer.into_inner()?;
let reader = grenad::Reader::new(BufReader::new(file))?;
if database.is_empty(wtxn)? {
let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;
let mut cursor = reader.into_cursor()?;
while let Some((k, v)) = cursor.move_on_next()? {
// safety: the key comes from the grenad reader, not the database
unsafe { out_iter.append(k, v)? };
}
} else {
let mut cursor = reader.into_cursor()?;
while let Some((k, v)) = cursor.move_on_next()? {
database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
}
}
Ok(())
}
#[cfg(test)]
mod tests {
use crate::db_snap;
use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
use crate::index::tests::TempIndex;
use std::io::Cursor;
fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec<crate::Object> {
let mut documents = Vec::new();
for prefix in prefixes {
for i in 0..50 {
documents.push(
serde_json::json!({
"text": format!("{prefix}{i:x}"),
})
.as_object()
.unwrap()
.clone(),
)
}
}
documents
}
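
To make the helper's output concrete, a small illustrative test (not part of the commit) could assert its shape; the counter is rendered in lowercase hex, so every generated word starts with the given prefix:

#[test]
fn documents_helper_shape() {
    // Illustration: 50 distinct words per prefix, enough to reach the
    // words_prefix_threshold of 50 configured in test_update below.
    let docs = documents_with_enough_different_words_for_prefixes(&["be"]);
    assert_eq!(docs.len(), 50);
    assert_eq!(docs[0]["text"], "be0");
    assert_eq!(docs[49]["text"], "be31"); // 49 rendered in lowercase hex
}
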
#[test]
fn test_update() {
let mut index = TempIndex::new();
index.index_documents_config.words_prefix_threshold = Some(50);
index.index_documents_config.autogenerate_docids = true;
index
.update_settings(|settings| {
settings.set_searchable_fields(vec!["text".to_owned()]);
})
.unwrap();
let batch_reader_from_documents = |documents| {
let mut builder = DocumentsBatchBuilder::new(Vec::new());
for object in documents {
builder.append_json_object(&object).unwrap();
}
DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
};
let mut documents = documents_with_enough_different_words_for_prefixes(&["a", "be"]);
// now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
documents.push(
serde_json::json!({
"text": "At an amazing and beautiful house"
})
.as_object()
.unwrap()
.clone(),
);
documents.push(
serde_json::json!({
"text": "The bell rings at 5 am"
})
.as_object()
.unwrap()
.clone(),
);
let documents = batch_reader_from_documents(documents);
index.add_documents(documents).unwrap();
db_snap!(index, word_prefix_pair_proximity_docids, "initial");
let mut documents = documents_with_enough_different_words_for_prefixes(&["am", "an"]);
documents.push(
serde_json::json!({
"text": "At an extraordinary house"
})
.as_object()
.unwrap()
.clone(),
);
let documents = batch_reader_from_documents(documents);
index.add_documents(documents).unwrap();
db_snap!(index, word_pair_proximity_docids, "update");
db_snap!(index, word_prefix_pair_proximity_docids, "update");
db_snap!(index, prefix_word_pair_proximity_docids, "update");
}
}

View File

@ -0,0 +1,178 @@
use crate::update::index_documents::{create_writer, CursorClonableMmap};
use crate::update::prefix_word_pairs::{
insert_into_database, write_into_lmdb_database_without_merging,
};
use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec};
use grenad::CompressionType;
use heed::types::ByteSlice;
use heed::BytesDecode;
use log::debug;
use std::borrow::Cow;
use std::collections::{BTreeMap, HashSet};
#[logging_timer::time]
pub fn index_prefix_word_database(
wtxn: &mut heed::RwTxn,
word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
prefix_word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
max_proximity: u8,
max_prefix_length: usize,
new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>,
new_prefix_fst_words: &[String],
common_prefix_fst_words: &[&[String]],
del_prefix_fst_words: &HashSet<Vec<u8>>,
) -> Result<()> {
let max_proximity = max_proximity - 1;
debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
let common_prefixes: Vec<_> = common_prefix_fst_words
.into_iter()
.map(|s| s.into_iter())
.flatten()
.map(|s| s.as_str())
.filter(|s| s.len() <= max_prefix_length)
.collect();
// If the prefix trie is not empty, then we can iterate over all new
// word pairs to look for new (word1, common_prefix, proximity) elements
// to insert in the DB
for proximity in 1..=max_proximity - 1 {
for prefix in common_prefixes.iter() {
let mut prefix_key = vec![];
prefix_key.push(proximity);
prefix_key.extend_from_slice(prefix.as_bytes());
let mut cursor = new_word_pair_proximity_docids.clone().into_prefix_iter(prefix_key)?;
// This is the core of the algorithm
execute_on_word_pairs_and_prefixes(
proximity + 1,
prefix.as_bytes(),
// the next two arguments tell how to iterate over the new word pairs
&mut cursor,
|cursor| {
if let Some((key, value)) = cursor.next()? {
let (_, _, word2) = UncheckedU8StrStrCodec::bytes_decode(key)
.ok_or(heed::Error::Decoding)?;
Ok(Some((word2, value)))
} else {
Ok(None)
}
},
// and this argument tells what to do with each new key (proximity, prefix, word2) and value (roaring bitmap)
|key, value| {
insert_into_database(
wtxn,
*prefix_word_pair_proximity_docids.as_polymorph(),
key,
value,
)
},
)?;
}
}
// Now we do the same thing with the new prefixes and all word pairs in the DB
let new_prefixes: Vec<_> = new_prefix_fst_words
.into_iter()
.map(|s| s.as_str())
.filter(|s| s.len() <= max_prefix_length)
.collect();
// Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity)
// element in an intermediary grenad
let mut writer = create_writer(CompressionType::None, None, tempfile::tempfile()?);
for proximity in 1..=max_proximity - 1 {
for prefix in new_prefixes.iter() {
let mut prefix_key = vec![];
prefix_key.push(proximity);
prefix_key.extend_from_slice(prefix.as_bytes());
let mut db_iter = word_pair_proximity_docids
.as_polymorph()
.prefix_iter::<_, ByteSlice, ByteSlice>(wtxn, prefix_key.as_slice())?
.remap_key_type::<UncheckedU8StrStrCodec>();
execute_on_word_pairs_and_prefixes(
proximity + 1,
prefix.as_bytes(),
&mut db_iter,
|db_iter| {
db_iter
.next()
.transpose()
.map(|x| x.map(|((_, _, word2), value)| (word2, value)))
.map_err(|e| e.into())
},
|key, value| writer.insert(key, value).map_err(|e| e.into()),
)?;
drop(db_iter);
}
}
// and then we write the grenad into the DB
// Since the grenad contains only new prefixes, we know in advance that none
// of its elements already exist in the DB, thus there is no need to specify
// how to merge conflicting elements
write_into_lmdb_database_without_merging(
wtxn,
*prefix_word_pair_proximity_docids.as_polymorph(),
writer,
)?;
// All of the word prefix pairs in the database that have a w2
// that is contained in the `suppr_pw` set must be removed as well.
if !del_prefix_fst_words.is_empty() {
let mut iter =
prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter_mut(wtxn)?;
while let Some(((_, prefix, _), _)) = iter.next().transpose()? {
if del_prefix_fst_words.contains(prefix.as_bytes()) {
// Delete this entry as the w2 prefix is no more in the words prefix fst.
unsafe { iter.del_current()? };
}
}
}
Ok(())
}
/// This is the core of the algorithm to initialise the Word Prefix Pair Proximity Docids database.
///
/// Its main arguments are:
/// 1. a sorted prefix iterator over ((word1, word2, proximity), docids) elements
/// 2. a closure to describe how to handle the new computed (word1, prefix, proximity) elements
///
/// For more information about what this function does, read the module documentation.
fn execute_on_word_pairs_and_prefixes<I>(
proximity: u8,
prefix: &[u8],
iter: &mut I,
mut next_word2_and_docids: impl for<'a> FnMut(&'a mut I) -> Result<Option<(&'a [u8], &'a [u8])>>,
mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>,
) -> Result<()> {
let mut batch: BTreeMap<Vec<u8>, Vec<Cow<'static, [u8]>>> = <_>::default();
while let Some((word2, data)) = next_word2_and_docids(iter)? {
let entry = batch.entry(word2.to_owned()).or_default();
entry.push(Cow::Owned(data.to_owned()));
}
let mut key_buffer = Vec::with_capacity(8);
key_buffer.push(proximity);
key_buffer.extend_from_slice(prefix);
key_buffer.push(0);
let mut value_buffer = Vec::with_capacity(65_536);
for (key, values) in batch {
key_buffer.truncate(prefix.len() + 2);
value_buffer.clear();
key_buffer.extend_from_slice(&key);
let data = if values.len() > 1 {
CboRoaringBitmapCodec::merge_into(&values, &mut value_buffer)?;
value_buffer.as_slice()
} else {
&values[0]
};
insert(key_buffer.as_slice(), data)?;
}
Ok(())
}
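
A self-contained sketch (not part of the diff) of the batching idea in `execute_on_word_pairs_and_prefixes`: every bitmap seen for the same word2 is grouped under one key and merged before a single (proximity, prefix, word2) entry is written; plain RoaringBitmaps stand in for the CBO-encoded values:

use std::collections::BTreeMap;
use roaring::RoaringBitmap;

// Group all bitmaps observed for the same word2, then merge each group into the
// single value that will be stored under (proximity, prefix, word2).
fn batch_and_merge(pairs: Vec<(&str, RoaringBitmap)>) -> BTreeMap<String, RoaringBitmap> {
    let mut batch: BTreeMap<String, Vec<RoaringBitmap>> = BTreeMap::new();
    for (word2, docids) in pairs {
        batch.entry(word2.to_owned()).or_default().push(docids);
    }
    batch
        .into_iter()
        .map(|(word2, bitmaps)| {
            let merged = bitmaps.into_iter().fold(RoaringBitmap::new(), |acc, b| acc | b);
            (word2, merged)
        })
        .collect()
}
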

View File

@ -0,0 +1,46 @@
---
source: milli/src/update/prefix_word_pairs/mod.rs
---
1 5 a [101, ]
1 amazing a [100, ]
1 an a [100, ]
1 and b [100, ]
1 and be [100, ]
1 at a [100, ]
1 rings a [101, ]
1 the b [101, ]
1 the be [101, ]
2 5 a [101, ]
2 amazing a [100, ]
2 amazing b [100, ]
2 amazing be [100, ]
2 an a [100, ]
2 and a [100, ]
2 at a [100, 101, ]
2 beautiful a [100, ]
2 bell a [101, ]
2 house b [100, ]
2 house be [100, ]
2 rings b [101, ]
2 rings be [101, ]
3 am a [101, ]
3 amazing a [100, ]
3 an b [100, ]
3 an be [100, ]
3 and a [100, ]
3 at a [100, ]
3 at b [101, ]
3 at be [101, ]
3 beautiful a [100, ]
3 house a [100, ]
3 rings a [101, ]
3 the a [101, ]
4 5 b [101, ]
4 5 be [101, ]
4 and a [100, ]
4 at b [100, ]
4 at be [100, ]
4 beautiful a [100, ]
4 bell a [101, ]
4 house a [100, ]

View File

@ -0,0 +1,4 @@
---
source: milli/src/update/prefix_word_pairs/mod.rs
---
fb88e49fd666886731b62baef8f44995

View File

@ -0,0 +1,41 @@
---
source: milli/src/update/prefix_word_pairs/mod.rs
---
2 a 5 [101, ]
2 a amazing [100, ]
2 a an [100, 202, ]
2 a and [100, ]
2 a beautiful [100, ]
2 a extraordinary [202, ]
2 am and [100, ]
2 an amazing [100, ]
2 an beautiful [100, ]
2 an extraordinary [202, ]
2 b house [100, ]
2 b rings [101, ]
2 be house [100, ]
2 be rings [101, ]
3 a 5 [101, ]
3 a am [101, ]
3 a amazing [100, ]
3 a an [100, ]
3 a and [100, ]
3 a at [100, 202, ]
3 a beautiful [100, ]
3 a extraordinary [202, ]
3 a house [100, 202, ]
3 a rings [101, ]
3 am 5 [101, ]
3 am an [100, ]
3 am beautiful [100, ]
3 an amazing [100, ]
3 an and [100, ]
3 an at [100, 202, ]
3 an house [100, 202, ]
3 b and [100, ]
3 b at [101, ]
3 b the [101, ]
3 be and [100, ]
3 be at [101, ]
3 be the [101, ]

View File

@ -0,0 +1,4 @@
---
source: milli/src/update/prefix_word_pairs/mod.rs
---
6965ecd1bf821f1cf921c2ab751b36cf

View File

@ -0,0 +1,4 @@
---
source: milli/src/update/prefix_word_pairs/mod.rs
---
fb88e49fd666886731b62baef8f44995

View File

@ -1,5 +1,5 @@
/*!
-## What is WordPrefixPairProximityDocids?
+## What is WordPrefix?
The word-prefix-pair-proximity-docids database is a database whose keys are of
the form `(proximity, word, prefix)` and the values are roaring bitmaps of
the documents which contain `word` followed by another word starting with
@ -139,7 +139,7 @@ inputs described above, which come from different places:
2. `word_pairs_db`, which is the list of word pairs from the database.
This list includes all elements in `new_word_pairs` since `new_word_pairs`
-was added to the database prior to calling the `WordPrefixPairProximityDocIds::execute`
+was added to the database prior to calling the `WordPrefix::execute`
function.
To update the prefix database correctly, we call the algorithm described earlier first
@ -161,196 +161,137 @@ reader and writer). Therefore, when calling the algorithm on
`((proximity, word, prefix), docids)` elements in an intermediary grenad
Writer instead of the DB. At the end of the outer loop, we finally read from
the grenad and insert its elements in the database.
*/
- use std::borrow::Cow;
- use std::collections::HashSet;
- use std::io::BufReader;
- use grenad::CompressionType;
- use heed::types::ByteSlice;
- use heed::BytesDecode;
- use log::debug;
- use crate::update::index_documents::{
-     create_writer, merge_cbo_roaring_bitmaps, CursorClonableMmap,
- };
- use crate::{CboRoaringBitmapCodec, Index, Result, UncheckedU8StrStrCodec};
- pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> {
-     wtxn: &'t mut heed::RwTxn<'i, 'u>,
-     index: &'i Index,
-     pub(crate) chunk_compression_type: CompressionType,
-     pub(crate) chunk_compression_level: Option<u32>,
-     pub(crate) max_nb_chunks: Option<usize>,
-     pub(crate) max_memory: Option<usize>,
-     max_proximity: u8,
-     max_prefix_length: usize,
- }
- impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
-     pub fn new(
-         wtxn: &'t mut heed::RwTxn<'i, 'u>,
-         index: &'i Index,
-     ) -> WordPrefixPairProximityDocids<'t, 'u, 'i> {
-         WordPrefixPairProximityDocids {
-             wtxn,
-             index,
-             chunk_compression_type: CompressionType::None,
-             chunk_compression_level: None,
-             max_nb_chunks: None,
-             max_memory: None,
-             max_proximity: 4,
-             max_prefix_length: 2,
-         }
-     }
-     /// Set the maximum proximity required to make a prefix be part of the words prefixes
-     /// database. If two words are too far from the threshold the associated documents will
-     /// not be part of the prefix database.
-     ///
-     /// Default value is 4. This value must be lower or equal than 7 and will be clamped
-     /// to this bound otherwise.
-     pub fn max_proximity(&mut self, value: u8) -> &mut Self {
-         self.max_proximity = value.max(7);
-         self
-     }
-     /// Set the maximum length the prefix of a word pair is allowed to have to be part of the words
-     /// prefixes database. If the prefix length is higher than the threshold, the associated documents
-     /// will not be part of the prefix database.
-     ///
-     /// Default value is 2.
-     pub fn max_prefix_length(&mut self, value: usize) -> &mut Self {
-         self.max_prefix_length = value;
-         self
-     }
-     #[logging_timer::time("WordPrefixPairProximityDocids::{}")]
-     pub fn execute<'a>(
-         mut self,
-         new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>,
-         new_prefix_fst_words: &'a [String],
-         common_prefix_fst_words: &[&'a [String]],
-         del_prefix_fst_words: &HashSet<Vec<u8>>,
-     ) -> Result<()> {
-         debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
-         // Make a prefix trie from the common prefixes that are shorter than self.max_prefix_length
-         let prefixes = PrefixTrieNode::from_sorted_prefixes(
-             common_prefix_fst_words
-                 .into_iter()
-                 .map(|s| s.into_iter())
-                 .flatten()
-                 .map(|s| s.as_str())
-                 .filter(|s| s.len() <= self.max_prefix_length),
-         );
-         // If the prefix trie is not empty, then we can iterate over all new
-         // word pairs to look for new (word1, common_prefix, proximity) elements
-         // to insert in the DB
-         if !prefixes.is_empty() {
-             let mut cursor = new_word_pair_proximity_docids.into_cursor()?;
-             // This is the core of the algorithm
-             execute_on_word_pairs_and_prefixes(
-                 // the first two arguments tell how to iterate over the new word pairs
-                 &mut cursor,
-                 |cursor| {
-                     if let Some((key, value)) = cursor.move_on_next()? {
-                         let (proximity, word1, word2) = UncheckedU8StrStrCodec::bytes_decode(key)
-                             .ok_or(heed::Error::Decoding)?;
-                         Ok(Some(((proximity, word1, word2), value)))
-                     } else {
-                         Ok(None)
-                     }
-                 },
-                 &prefixes,
-                 self.max_proximity,
-                 // and this argument tells what to do with each new key (word1, prefix, proximity) and value (roaring bitmap)
-                 |key, value| {
-                     insert_into_database(
-                         &mut self.wtxn,
-                         *self.index.word_prefix_pair_proximity_docids.as_polymorph(),
-                         key,
-                         value,
-                     )
-                 },
-             )?;
-         }
-         // Now we do the same thing with the new prefixes and all word pairs in the DB
-         let prefixes = PrefixTrieNode::from_sorted_prefixes(
-             new_prefix_fst_words
-                 .into_iter()
-                 .map(|s| s.as_str())
-                 .filter(|s| s.len() <= self.max_prefix_length),
-         );
-         if !prefixes.is_empty() {
-             let mut db_iter = self
-                 .index
-                 .word_pair_proximity_docids
-                 .remap_key_type::<UncheckedU8StrStrCodec>()
-                 .remap_data_type::<ByteSlice>()
-                 .iter(self.wtxn)?;
-             // Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity)
-             // element in an intermediary grenad
-             let mut writer = create_writer(
-                 self.chunk_compression_type,
-                 self.chunk_compression_level,
-                 tempfile::tempfile()?,
-             );
-             execute_on_word_pairs_and_prefixes(
-                 &mut db_iter,
-                 |db_iter| db_iter.next().transpose().map_err(|e| e.into()),
-                 &prefixes,
-                 self.max_proximity,
-                 |key, value| writer.insert(key, value).map_err(|e| e.into()),
-             )?;
-             drop(db_iter);
-             // and then we write the grenad into the DB
-             // Since the grenad contains only new prefixes, we know in advance that none
-             // of its elements already exist in the DB, thus there is no need to specify
-             // how to merge conflicting elements
-             write_into_lmdb_database_without_merging(
-                 self.wtxn,
-                 *self.index.word_prefix_pair_proximity_docids.as_polymorph(),
-                 writer,
-             )?;
-         }
-         // All of the word prefix pairs in the database that have a w2
-         // that is contained in the `suppr_pw` set must be removed as well.
-         if !del_prefix_fst_words.is_empty() {
-             let mut iter = self
-                 .index
-                 .word_prefix_pair_proximity_docids
-                 .remap_data_type::<ByteSlice>()
-                 .iter_mut(self.wtxn)?;
-             while let Some(((_, w2, _), _)) = iter.next().transpose()? {
-                 if del_prefix_fst_words.contains(w2.as_bytes()) {
-                     // Delete this entry as the w2 prefix is no more in the words prefix fst.
-                     unsafe { iter.del_current()? };
-                 }
-             }
-         }
-         Ok(())
-     }
- }

+ use crate::update::index_documents::{create_writer, CursorClonableMmap};
+ use crate::update::prefix_word_pairs::{
+     insert_into_database, write_into_lmdb_database_without_merging,
+ };
+ use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec};
+ use grenad::CompressionType;
+ use heed::types::ByteSlice;
+ use heed::BytesDecode;
+ use log::debug;
+ use std::borrow::Cow;
+ use std::collections::HashSet;
+ #[logging_timer::time]
+ pub fn index_word_prefix_database(
+     wtxn: &mut heed::RwTxn,
+     word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
+     word_prefix_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
+     max_proximity: u8,
+     max_prefix_length: usize,
+     new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>,
+     new_prefix_fst_words: &[String],
+     common_prefix_fst_words: &[&[String]],
+     del_prefix_fst_words: &HashSet<Vec<u8>>,
+ ) -> Result<()> {
+     debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
+     // Make a prefix trie from the common prefixes that are shorter than self.max_prefix_length
+     let prefixes = PrefixTrieNode::from_sorted_prefixes(
+         common_prefix_fst_words
+             .into_iter()
+             .map(|s| s.into_iter())
+             .flatten()
+             .map(|s| s.as_str())
+             .filter(|s| s.len() <= max_prefix_length),
+     );
+     // If the prefix trie is not empty, then we can iterate over all new
+     // word pairs to look for new (proximity, word1, common_prefix) elements
+     // to insert in the DB
+     if !prefixes.is_empty() {
+         let mut cursor = new_word_pair_proximity_docids.into_cursor()?;
+         // This is the core of the algorithm
+         execute_on_word_pairs_and_prefixes(
+             // the first two arguments tell how to iterate over the new word pairs
+             &mut cursor,
+             |cursor| {
+                 if let Some((key, value)) = cursor.move_on_next()? {
+                     let (proximity, word1, word2) =
+                         UncheckedU8StrStrCodec::bytes_decode(key).ok_or(heed::Error::Decoding)?;
+                     Ok(Some(((proximity, word1, word2), value)))
+                 } else {
+                     Ok(None)
+                 }
+             },
+             &prefixes,
+             max_proximity,
+             // and this argument tells what to do with each new key (proximity, word1, prefix) and value (roaring bitmap)
+             |key, value| {
+                 insert_into_database(
+                     wtxn,
+                     *word_prefix_pair_proximity_docids.as_polymorph(),
+                     key,
+                     value,
+                 )
+             },
+         )?;
+     }
+     // Now we do the same thing with the new prefixes and all word pairs in the DB
+     let prefixes = PrefixTrieNode::from_sorted_prefixes(
+         new_prefix_fst_words
+             .into_iter()
+             .map(|s| s.as_str())
+             .filter(|s| s.len() <= max_prefix_length),
+     );
+     if !prefixes.is_empty() {
+         let mut db_iter = word_pair_proximity_docids
+             .remap_key_type::<UncheckedU8StrStrCodec>()
+             .remap_data_type::<ByteSlice>()
+             .iter(wtxn)?;
+         // Since we read the DB, we can't write to it directly, so we add each new (proximity, word1, prefix)
+         // element in an intermediary grenad
+         let mut writer = create_writer(CompressionType::None, None, tempfile::tempfile()?);
+         execute_on_word_pairs_and_prefixes(
+             &mut db_iter,
+             |db_iter| db_iter.next().transpose().map_err(|e| e.into()),
+             &prefixes,
+             max_proximity,
+             |key, value| writer.insert(key, value).map_err(|e| e.into()),
+         )?;
+         drop(db_iter);
+         // and then we write the grenad into the DB
+         // Since the grenad contains only new prefixes, we know in advance that none
+         // of its elements already exist in the DB, thus there is no need to specify
+         // how to merge conflicting elements
+         write_into_lmdb_database_without_merging(
+             wtxn,
+             *word_prefix_pair_proximity_docids.as_polymorph(),
+             writer,
+         )?;
+     }
+     // All of the word prefix pairs in the database that have a w2
+     // that is contained in the `suppr_pw` set must be removed as well.
+     if !del_prefix_fst_words.is_empty() {
+         let mut iter =
+             word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>().iter_mut(wtxn)?;
+         while let Some(((_, _, prefix), _)) = iter.next().transpose()? {
+             if del_prefix_fst_words.contains(prefix.as_bytes()) {
+                 // Delete this entry as the w2 prefix is no more in the words prefix fst.
+                 unsafe { iter.del_current()? };
+             }
+         }
+     }
+     Ok(())
+ }
/// This is the core of the algorithm to initialise the Word Prefix Pair Proximity Docids database.
///
/// Its main arguments are:
-/// 1. a sorted iterator over ((word1, word2, proximity), docids) elements
+/// 1. a sorted iterator over ((proximity, word1, word2), docids) elements
/// 2. a prefix trie
-/// 3. a closure to describe how to handle the new computed (word1, prefix, proximity) elements
+/// 3. a closure to describe how to handle the new computed (proximity, word1, prefix) elements
///
/// For more information about what this function does, read the module documentation.
fn execute_on_word_pairs_and_prefixes<I>(
@ -495,61 +436,6 @@ impl PrefixAndProximityBatch {
    }
}
// This is adapted from `sorter_into_lmdb_database`
fn insert_into_database(
wtxn: &mut heed::RwTxn,
database: heed::PolyDatabase,
new_key: &[u8],
new_value: &[u8],
) -> Result<()> {
let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, new_key)?;
match iter.next().transpose()? {
Some((key, old_val)) if new_key == key => {
let val =
merge_cbo_roaring_bitmaps(key, &[Cow::Borrowed(old_val), Cow::Borrowed(new_value)])
.map_err(|_| {
// TODO just wrap this error?
crate::error::InternalError::IndexingMergingKeys {
process: "get-put-merge",
}
})?;
// safety: we use the new_key, not the one from the database iterator, to avoid undefined behaviour
unsafe { iter.put_current(new_key, &val)? };
}
_ => {
drop(iter);
database.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, new_value)?;
}
}
Ok(())
}
// This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`,
// but it uses `append` if the database is empty, and it assumes that the values in the
// writer don't conflict with values in the database.
pub fn write_into_lmdb_database_without_merging(
wtxn: &mut heed::RwTxn,
database: heed::PolyDatabase,
writer: grenad::Writer<std::fs::File>,
) -> Result<()> {
let file = writer.into_inner()?;
let reader = grenad::Reader::new(BufReader::new(file))?;
if database.is_empty(wtxn)? {
let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;
let mut cursor = reader.into_cursor()?;
while let Some((k, v)) = cursor.move_on_next()? {
// safety: the key comes from the grenad reader, not the database
unsafe { out_iter.append(k, v)? };
}
} else {
let mut cursor = reader.into_cursor()?;
while let Some((k, v)) = cursor.move_on_next()? {
database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
}
}
Ok(())
}
/** A prefix trie. Used to iterate quickly over the prefixes of a word that are
within a set.
@ -676,90 +562,9 @@ impl PrefixTrieNode {
}
#[cfg(test)]
mod tests {
-   use std::io::Cursor;
-   use roaring::RoaringBitmap;
    use super::*;
-   use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
-   use crate::index::tests::TempIndex;
-   use crate::{db_snap, CboRoaringBitmapCodec, U8StrStrCodec};
+   use crate::{CboRoaringBitmapCodec, U8StrStrCodec};
+   use roaring::RoaringBitmap;
fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec<crate::Object> {
let mut documents = Vec::new();
for prefix in prefixes {
for i in 0..50 {
documents.push(
serde_json::json!({
"text": format!("{prefix}{i:x}"),
})
.as_object()
.unwrap()
.clone(),
)
}
}
documents
}
#[test]
fn test_update() {
let mut index = TempIndex::new();
index.index_documents_config.words_prefix_threshold = Some(50);
index.index_documents_config.autogenerate_docids = true;
index
.update_settings(|settings| {
settings.set_searchable_fields(vec!["text".to_owned()]);
})
.unwrap();
let batch_reader_from_documents = |documents| {
let mut builder = DocumentsBatchBuilder::new(Vec::new());
for object in documents {
builder.append_json_object(&object).unwrap();
}
DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
};
let mut documents = documents_with_enough_different_words_for_prefixes(&["a", "be"]);
// now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
documents.push(
serde_json::json!({
"text": "At an amazing and beautiful house"
})
.as_object()
.unwrap()
.clone(),
);
documents.push(
serde_json::json!({
"text": "The bell rings at 5 am"
})
.as_object()
.unwrap()
.clone(),
);
let documents = batch_reader_from_documents(documents);
index.add_documents(documents).unwrap();
db_snap!(index, word_prefix_pair_proximity_docids, "initial");
let mut documents = documents_with_enough_different_words_for_prefixes(&["am", "an"]);
documents.push(
serde_json::json!({
"text": "At an extraordinary house"
})
.as_object()
.unwrap()
.clone(),
);
let documents = batch_reader_from_documents(documents);
index.add_documents(documents).unwrap();
db_snap!(index, word_prefix_pair_proximity_docids, "update");
}
    fn check_prefixes(
        trie: &PrefixTrieNode,
@ -899,9 +704,9 @@ mod tests {
            &prefixes,
            2,
            |k, v| {
-               let (word1, prefix, proximity) = U8StrStrCodec::bytes_decode(k).unwrap();
+               let (proximity, word1, prefix) = U8StrStrCodec::bytes_decode(k).unwrap();
                let bitmap = CboRoaringBitmapCodec::bytes_decode(v).unwrap();
-               result.push(((word1.to_owned(), prefix.to_owned(), proximity.to_owned()), bitmap));
+               result.push(((proximity.to_owned(), word1.to_owned(), prefix.to_owned()), bitmap));
                Ok(())
            },
        )