mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-01-18 17:11:15 +08:00
Add prefix_word_pair_proximity database
Similar to the word_prefix_pair_proximity one but instead the keys are: (proximity, prefix, word2)
This commit is contained in:
parent
1dbbd8694f
commit
264a04922d
@ -71,6 +71,7 @@ pub mod db_name {
|
|||||||
pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions";
|
pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions";
|
||||||
pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids";
|
pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids";
|
||||||
pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids";
|
pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids";
|
||||||
|
pub const PREFIX_WORD_PAIR_PROXIMITY_DOCIDS: &str = "prefix-word-pair-proximity-docids";
|
||||||
pub const WORD_POSITION_DOCIDS: &str = "word-position-docids";
|
pub const WORD_POSITION_DOCIDS: &str = "word-position-docids";
|
||||||
pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids";
|
pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids";
|
||||||
pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids";
|
pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids";
|
||||||
@ -109,6 +110,8 @@ pub struct Index {
|
|||||||
pub word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
|
pub word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
|
||||||
/// Maps the proximity between a pair of word and prefix with all the docids where this relation appears.
|
/// Maps the proximity between a pair of word and prefix with all the docids where this relation appears.
|
||||||
pub word_prefix_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
|
pub word_prefix_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
|
||||||
|
/// Maps the proximity between a pair of prefix and word with all the docids where this relation appears.
|
||||||
|
pub prefix_word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
|
||||||
|
|
||||||
/// Maps the word and the position with the docids that corresponds to it.
|
/// Maps the word and the position with the docids that corresponds to it.
|
||||||
pub word_position_docids: Database<StrBEU32Codec, CboRoaringBitmapCodec>,
|
pub word_position_docids: Database<StrBEU32Codec, CboRoaringBitmapCodec>,
|
||||||
@ -138,7 +141,7 @@ impl Index {
|
|||||||
pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> Result<Index> {
|
pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> Result<Index> {
|
||||||
use db_name::*;
|
use db_name::*;
|
||||||
|
|
||||||
options.max_dbs(17);
|
options.max_dbs(18);
|
||||||
unsafe { options.flag(Flags::MdbAlwaysFreePages) };
|
unsafe { options.flag(Flags::MdbAlwaysFreePages) };
|
||||||
|
|
||||||
let env = options.open(path)?;
|
let env = options.open(path)?;
|
||||||
@ -151,6 +154,8 @@ impl Index {
|
|||||||
let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
|
let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
|
||||||
let word_prefix_pair_proximity_docids =
|
let word_prefix_pair_proximity_docids =
|
||||||
env.create_database(Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?;
|
env.create_database(Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?;
|
||||||
|
let prefix_word_pair_proximity_docids =
|
||||||
|
env.create_database(Some(PREFIX_WORD_PAIR_PROXIMITY_DOCIDS))?;
|
||||||
let word_position_docids = env.create_database(Some(WORD_POSITION_DOCIDS))?;
|
let word_position_docids = env.create_database(Some(WORD_POSITION_DOCIDS))?;
|
||||||
let field_id_word_count_docids = env.create_database(Some(FIELD_ID_WORD_COUNT_DOCIDS))?;
|
let field_id_word_count_docids = env.create_database(Some(FIELD_ID_WORD_COUNT_DOCIDS))?;
|
||||||
let word_prefix_position_docids = env.create_database(Some(WORD_PREFIX_POSITION_DOCIDS))?;
|
let word_prefix_position_docids = env.create_database(Some(WORD_PREFIX_POSITION_DOCIDS))?;
|
||||||
@ -175,6 +180,7 @@ impl Index {
|
|||||||
docid_word_positions,
|
docid_word_positions,
|
||||||
word_pair_proximity_docids,
|
word_pair_proximity_docids,
|
||||||
word_prefix_pair_proximity_docids,
|
word_prefix_pair_proximity_docids,
|
||||||
|
prefix_word_pair_proximity_docids,
|
||||||
word_position_docids,
|
word_position_docids,
|
||||||
word_prefix_position_docids,
|
word_prefix_position_docids,
|
||||||
field_id_word_count_docids,
|
field_id_word_count_docids,
|
||||||
|
@ -198,6 +198,15 @@ pub fn snap_word_prefix_pair_proximity_docids(index: &Index) -> String {
|
|||||||
});
|
});
|
||||||
snap
|
snap
|
||||||
}
|
}
|
||||||
|
pub fn snap_prefix_word_pair_proximity_docids(index: &Index) -> String {
|
||||||
|
let snap = make_db_snap_from_iter!(index, prefix_word_pair_proximity_docids, |(
|
||||||
|
(proximity, prefix, word2),
|
||||||
|
b,
|
||||||
|
)| {
|
||||||
|
&format!("{proximity:<2} {prefix:<4} {word2:<16} {}", display_bitmap(&b))
|
||||||
|
});
|
||||||
|
snap
|
||||||
|
}
|
||||||
pub fn snap_word_position_docids(index: &Index) -> String {
|
pub fn snap_word_position_docids(index: &Index) -> String {
|
||||||
let snap = make_db_snap_from_iter!(index, word_position_docids, |((word, position), b)| {
|
let snap = make_db_snap_from_iter!(index, word_position_docids, |((word, position), b)| {
|
||||||
&format!("{word:<16} {position:<6} {}", display_bitmap(&b))
|
&format!("{word:<16} {position:<6} {}", display_bitmap(&b))
|
||||||
@ -427,6 +436,9 @@ macro_rules! full_snap_of_db {
|
|||||||
($index:ident, word_prefix_pair_proximity_docids) => {{
|
($index:ident, word_prefix_pair_proximity_docids) => {{
|
||||||
$crate::snapshot_tests::snap_word_prefix_pair_proximity_docids(&$index)
|
$crate::snapshot_tests::snap_word_prefix_pair_proximity_docids(&$index)
|
||||||
}};
|
}};
|
||||||
|
($index:ident, prefix_word_pair_proximity_docids) => {{
|
||||||
|
$crate::snapshot_tests::snap_prefix_word_pair_proximity_docids(&$index)
|
||||||
|
}};
|
||||||
($index:ident, word_position_docids) => {{
|
($index:ident, word_position_docids) => {{
|
||||||
$crate::snapshot_tests::snap_word_position_docids(&$index)
|
$crate::snapshot_tests::snap_word_position_docids(&$index)
|
||||||
}};
|
}};
|
||||||
|
@ -25,6 +25,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
|||||||
docid_word_positions,
|
docid_word_positions,
|
||||||
word_pair_proximity_docids,
|
word_pair_proximity_docids,
|
||||||
word_prefix_pair_proximity_docids,
|
word_prefix_pair_proximity_docids,
|
||||||
|
prefix_word_pair_proximity_docids,
|
||||||
word_position_docids,
|
word_position_docids,
|
||||||
field_id_word_count_docids,
|
field_id_word_count_docids,
|
||||||
word_prefix_position_docids,
|
word_prefix_position_docids,
|
||||||
@ -66,6 +67,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
|||||||
docid_word_positions.clear(self.wtxn)?;
|
docid_word_positions.clear(self.wtxn)?;
|
||||||
word_pair_proximity_docids.clear(self.wtxn)?;
|
word_pair_proximity_docids.clear(self.wtxn)?;
|
||||||
word_prefix_pair_proximity_docids.clear(self.wtxn)?;
|
word_prefix_pair_proximity_docids.clear(self.wtxn)?;
|
||||||
|
prefix_word_pair_proximity_docids.clear(self.wtxn)?;
|
||||||
word_position_docids.clear(self.wtxn)?;
|
word_position_docids.clear(self.wtxn)?;
|
||||||
field_id_word_count_docids.clear(self.wtxn)?;
|
field_id_word_count_docids.clear(self.wtxn)?;
|
||||||
word_prefix_position_docids.clear(self.wtxn)?;
|
word_prefix_position_docids.clear(self.wtxn)?;
|
||||||
|
@ -183,6 +183,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
word_pair_proximity_docids,
|
word_pair_proximity_docids,
|
||||||
field_id_word_count_docids,
|
field_id_word_count_docids,
|
||||||
word_prefix_pair_proximity_docids,
|
word_prefix_pair_proximity_docids,
|
||||||
|
prefix_word_pair_proximity_docids,
|
||||||
word_position_docids,
|
word_position_docids,
|
||||||
word_prefix_position_docids,
|
word_prefix_position_docids,
|
||||||
facet_id_f64_docids,
|
facet_id_f64_docids,
|
||||||
@ -327,9 +328,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
self.index.put_words_prefixes_fst(self.wtxn, &new_words_prefixes_fst)?;
|
self.index.put_words_prefixes_fst(self.wtxn, &new_words_prefixes_fst)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
for db in [word_prefix_pair_proximity_docids, prefix_word_pair_proximity_docids] {
|
||||||
// We delete the documents ids from the word prefix pair proximity database docids
|
// We delete the documents ids from the word prefix pair proximity database docids
|
||||||
// and remove the empty pairs too.
|
// and remove the empty pairs too.
|
||||||
let db = word_prefix_pair_proximity_docids.remap_key_type::<ByteSlice>();
|
let db = db.remap_key_type::<ByteSlice>();
|
||||||
let mut iter = db.iter_mut(self.wtxn)?;
|
let mut iter = db.iter_mut(self.wtxn)?;
|
||||||
while let Some(result) = iter.next() {
|
while let Some(result) = iter.next() {
|
||||||
let (key, mut docids) = result?;
|
let (key, mut docids) = result?;
|
||||||
@ -344,8 +346,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
unsafe { iter.put_current(&key, &docids)? };
|
unsafe { iter.put_current(&key, &docids)? };
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
}
|
||||||
drop(iter);
|
|
||||||
|
|
||||||
// We delete the documents ids that are under the pairs of words,
|
// We delete the documents ids that are under the pairs of words,
|
||||||
// it is faster and use no memory to iterate over all the words pairs than
|
// it is faster and use no memory to iterate over all the words pairs than
|
||||||
|
@ -36,8 +36,8 @@ use crate::documents::{obkv_to_object, DocumentsBatchReader};
|
|||||||
use crate::error::UserError;
|
use crate::error::UserError;
|
||||||
pub use crate::update::index_documents::helpers::CursorClonableMmap;
|
pub use crate::update::index_documents::helpers::CursorClonableMmap;
|
||||||
use crate::update::{
|
use crate::update::{
|
||||||
self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids,
|
self, Facets, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep,
|
||||||
WordPrefixPairProximityDocids, WordPrefixPositionDocids, WordsPrefixesFst,
|
WordPrefixDocids, WordPrefixPositionDocids, WordsPrefixesFst,
|
||||||
};
|
};
|
||||||
use crate::{Index, Result, RoaringBitmapCodec};
|
use crate::{Index, Result, RoaringBitmapCodec};
|
||||||
|
|
||||||
@ -528,12 +528,7 @@ where
|
|||||||
|
|
||||||
if let Some(word_pair_proximity_docids) = word_pair_proximity_docids {
|
if let Some(word_pair_proximity_docids) = word_pair_proximity_docids {
|
||||||
// Run the word prefix pair proximity docids update operation.
|
// Run the word prefix pair proximity docids update operation.
|
||||||
let mut builder = WordPrefixPairProximityDocids::new(self.wtxn, self.index);
|
PrefixWordPairsProximityDocids::new(self.wtxn, self.index).execute(
|
||||||
builder.chunk_compression_type = self.indexer_config.chunk_compression_type;
|
|
||||||
builder.chunk_compression_level = self.indexer_config.chunk_compression_level;
|
|
||||||
builder.max_nb_chunks = self.indexer_config.max_nb_chunks;
|
|
||||||
builder.max_memory = self.indexer_config.max_memory;
|
|
||||||
builder.execute(
|
|
||||||
word_pair_proximity_docids,
|
word_pair_proximity_docids,
|
||||||
&new_prefix_fst_words,
|
&new_prefix_fst_words,
|
||||||
&common_prefix_fst_words,
|
&common_prefix_fst_words,
|
||||||
|
@ -6,10 +6,10 @@ pub use self::index_documents::{
|
|||||||
DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod,
|
DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod,
|
||||||
};
|
};
|
||||||
pub use self::indexer_config::IndexerConfig;
|
pub use self::indexer_config::IndexerConfig;
|
||||||
|
pub use self::prefix_word_pairs::PrefixWordPairsProximityDocids;
|
||||||
pub use self::settings::{Setting, Settings};
|
pub use self::settings::{Setting, Settings};
|
||||||
pub use self::update_step::UpdateIndexingStep;
|
pub use self::update_step::UpdateIndexingStep;
|
||||||
pub use self::word_prefix_docids::WordPrefixDocids;
|
pub use self::word_prefix_docids::WordPrefixDocids;
|
||||||
pub use self::word_prefix_pair_proximity_docids::WordPrefixPairProximityDocids;
|
|
||||||
pub use self::words_prefix_position_docids::WordPrefixPositionDocids;
|
pub use self::words_prefix_position_docids::WordPrefixPositionDocids;
|
||||||
pub use self::words_prefixes_fst::WordsPrefixesFst;
|
pub use self::words_prefixes_fst::WordsPrefixesFst;
|
||||||
|
|
||||||
@ -19,9 +19,9 @@ mod delete_documents;
|
|||||||
mod facets;
|
mod facets;
|
||||||
mod index_documents;
|
mod index_documents;
|
||||||
mod indexer_config;
|
mod indexer_config;
|
||||||
|
mod prefix_word_pairs;
|
||||||
mod settings;
|
mod settings;
|
||||||
mod update_step;
|
mod update_step;
|
||||||
mod word_prefix_docids;
|
mod word_prefix_docids;
|
||||||
mod word_prefix_pair_proximity_docids;
|
|
||||||
mod words_prefix_position_docids;
|
mod words_prefix_position_docids;
|
||||||
mod words_prefixes_fst;
|
mod words_prefixes_fst;
|
||||||
|
216
milli/src/update/prefix_word_pairs/mod.rs
Normal file
216
milli/src/update/prefix_word_pairs/mod.rs
Normal file
@ -0,0 +1,216 @@
|
|||||||
|
use super::index_documents::{merge_cbo_roaring_bitmaps, CursorClonableMmap};
|
||||||
|
use crate::{Index, Result};
|
||||||
|
use heed::types::ByteSlice;
|
||||||
|
use std::{borrow::Cow, collections::HashSet, io::BufReader};
|
||||||
|
|
||||||
|
mod prefix_word;
|
||||||
|
mod word_prefix;
|
||||||
|
|
||||||
|
pub use prefix_word::index_prefix_word_database;
|
||||||
|
pub use word_prefix::index_word_prefix_database;
|
||||||
|
|
||||||
|
pub struct PrefixWordPairsProximityDocids<'t, 'u, 'i> {
|
||||||
|
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||||
|
index: &'i Index,
|
||||||
|
max_proximity: u8,
|
||||||
|
max_prefix_length: usize,
|
||||||
|
}
|
||||||
|
impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> {
|
||||||
|
pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> Self {
|
||||||
|
Self { wtxn, index, max_proximity: 4, max_prefix_length: 2 }
|
||||||
|
}
|
||||||
|
/// Set the maximum proximity required to make a prefix be part of the words prefixes
|
||||||
|
/// database. If two words are too far from the threshold the associated documents will
|
||||||
|
/// not be part of the prefix database.
|
||||||
|
///
|
||||||
|
/// Default value is 4. This value must be lower or equal than 7 and will be clamped
|
||||||
|
/// to this bound otherwise.
|
||||||
|
pub fn max_proximity(&mut self, value: u8) -> &mut Self {
|
||||||
|
self.max_proximity = value.max(7);
|
||||||
|
self
|
||||||
|
}
|
||||||
|
/// Set the maximum length the prefix of a word pair is allowed to have to be part of the words
|
||||||
|
/// prefixes database. If the prefix length is higher than the threshold, the associated documents
|
||||||
|
/// will not be part of the prefix database.
|
||||||
|
///
|
||||||
|
/// Default value is 2.
|
||||||
|
pub fn max_prefix_length(&mut self, value: usize) -> &mut Self {
|
||||||
|
self.max_prefix_length = value;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
#[logging_timer::time("WordPrefixPairProximityDocids::{}")]
|
||||||
|
pub fn execute<'a>(
|
||||||
|
self,
|
||||||
|
new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>,
|
||||||
|
new_prefix_fst_words: &'a [String],
|
||||||
|
common_prefix_fst_words: &[&'a [String]],
|
||||||
|
del_prefix_fst_words: &HashSet<Vec<u8>>,
|
||||||
|
) -> Result<()> {
|
||||||
|
index_word_prefix_database(
|
||||||
|
self.wtxn,
|
||||||
|
self.index.word_pair_proximity_docids,
|
||||||
|
self.index.word_prefix_pair_proximity_docids,
|
||||||
|
self.max_proximity,
|
||||||
|
self.max_prefix_length,
|
||||||
|
new_word_pair_proximity_docids.clone(),
|
||||||
|
new_prefix_fst_words,
|
||||||
|
common_prefix_fst_words,
|
||||||
|
del_prefix_fst_words,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
index_prefix_word_database(
|
||||||
|
self.wtxn,
|
||||||
|
self.index.word_pair_proximity_docids,
|
||||||
|
self.index.prefix_word_pair_proximity_docids,
|
||||||
|
self.max_proximity,
|
||||||
|
self.max_prefix_length,
|
||||||
|
new_word_pair_proximity_docids,
|
||||||
|
new_prefix_fst_words,
|
||||||
|
common_prefix_fst_words,
|
||||||
|
del_prefix_fst_words,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// This is adapted from `sorter_into_lmdb_database`
|
||||||
|
pub fn insert_into_database(
|
||||||
|
wtxn: &mut heed::RwTxn,
|
||||||
|
database: heed::PolyDatabase,
|
||||||
|
new_key: &[u8],
|
||||||
|
new_value: &[u8],
|
||||||
|
) -> Result<()> {
|
||||||
|
let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, new_key)?;
|
||||||
|
match iter.next().transpose()? {
|
||||||
|
Some((key, old_val)) if new_key == key => {
|
||||||
|
let val =
|
||||||
|
merge_cbo_roaring_bitmaps(key, &[Cow::Borrowed(old_val), Cow::Borrowed(new_value)])
|
||||||
|
.map_err(|_| {
|
||||||
|
// TODO just wrap this error?
|
||||||
|
crate::error::InternalError::IndexingMergingKeys {
|
||||||
|
process: "get-put-merge",
|
||||||
|
}
|
||||||
|
})?;
|
||||||
|
// safety: we use the new_key, not the one from the database iterator, to avoid undefined behaviour
|
||||||
|
unsafe { iter.put_current(new_key, &val)? };
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
|
drop(iter);
|
||||||
|
database.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, new_value)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
// This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`,
|
||||||
|
// but it uses `append` if the database is empty, and it assumes that the values in the
|
||||||
|
// writer don't conflict with values in the database.
|
||||||
|
pub fn write_into_lmdb_database_without_merging(
|
||||||
|
wtxn: &mut heed::RwTxn,
|
||||||
|
database: heed::PolyDatabase,
|
||||||
|
writer: grenad::Writer<std::fs::File>,
|
||||||
|
) -> Result<()> {
|
||||||
|
let file = writer.into_inner()?;
|
||||||
|
let reader = grenad::Reader::new(BufReader::new(file))?;
|
||||||
|
if database.is_empty(wtxn)? {
|
||||||
|
let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;
|
||||||
|
let mut cursor = reader.into_cursor()?;
|
||||||
|
while let Some((k, v)) = cursor.move_on_next()? {
|
||||||
|
// safety: the key comes from the grenad reader, not the database
|
||||||
|
unsafe { out_iter.append(k, v)? };
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
let mut cursor = reader.into_cursor()?;
|
||||||
|
while let Some((k, v)) = cursor.move_on_next()? {
|
||||||
|
database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use crate::db_snap;
|
||||||
|
use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
||||||
|
use crate::index::tests::TempIndex;
|
||||||
|
use std::io::Cursor;
|
||||||
|
|
||||||
|
fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec<crate::Object> {
|
||||||
|
let mut documents = Vec::new();
|
||||||
|
for prefix in prefixes {
|
||||||
|
for i in 0..50 {
|
||||||
|
documents.push(
|
||||||
|
serde_json::json!({
|
||||||
|
"text": format!("{prefix}{i:x}"),
|
||||||
|
})
|
||||||
|
.as_object()
|
||||||
|
.unwrap()
|
||||||
|
.clone(),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
documents
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_update() {
|
||||||
|
let mut index = TempIndex::new();
|
||||||
|
index.index_documents_config.words_prefix_threshold = Some(50);
|
||||||
|
index.index_documents_config.autogenerate_docids = true;
|
||||||
|
|
||||||
|
index
|
||||||
|
.update_settings(|settings| {
|
||||||
|
settings.set_searchable_fields(vec!["text".to_owned()]);
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let batch_reader_from_documents = |documents| {
|
||||||
|
let mut builder = DocumentsBatchBuilder::new(Vec::new());
|
||||||
|
for object in documents {
|
||||||
|
builder.append_json_object(&object).unwrap();
|
||||||
|
}
|
||||||
|
DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut documents = documents_with_enough_different_words_for_prefixes(&["a", "be"]);
|
||||||
|
// now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
|
||||||
|
documents.push(
|
||||||
|
serde_json::json!({
|
||||||
|
"text": "At an amazing and beautiful house"
|
||||||
|
})
|
||||||
|
.as_object()
|
||||||
|
.unwrap()
|
||||||
|
.clone(),
|
||||||
|
);
|
||||||
|
documents.push(
|
||||||
|
serde_json::json!({
|
||||||
|
"text": "The bell rings at 5 am"
|
||||||
|
})
|
||||||
|
.as_object()
|
||||||
|
.unwrap()
|
||||||
|
.clone(),
|
||||||
|
);
|
||||||
|
|
||||||
|
let documents = batch_reader_from_documents(documents);
|
||||||
|
index.add_documents(documents).unwrap();
|
||||||
|
|
||||||
|
db_snap!(index, word_prefix_pair_proximity_docids, "initial");
|
||||||
|
|
||||||
|
let mut documents = documents_with_enough_different_words_for_prefixes(&["am", "an"]);
|
||||||
|
documents.push(
|
||||||
|
serde_json::json!({
|
||||||
|
"text": "At an extraordinary house"
|
||||||
|
})
|
||||||
|
.as_object()
|
||||||
|
.unwrap()
|
||||||
|
.clone(),
|
||||||
|
);
|
||||||
|
let documents = batch_reader_from_documents(documents);
|
||||||
|
index.add_documents(documents).unwrap();
|
||||||
|
|
||||||
|
db_snap!(index, word_pair_proximity_docids, "update");
|
||||||
|
db_snap!(index, word_prefix_pair_proximity_docids, "update");
|
||||||
|
db_snap!(index, prefix_word_pair_proximity_docids, "update");
|
||||||
|
}
|
||||||
|
}
|
178
milli/src/update/prefix_word_pairs/prefix_word.rs
Normal file
178
milli/src/update/prefix_word_pairs/prefix_word.rs
Normal file
@ -0,0 +1,178 @@
|
|||||||
|
use crate::update::index_documents::{create_writer, CursorClonableMmap};
|
||||||
|
use crate::update::prefix_word_pairs::{
|
||||||
|
insert_into_database, write_into_lmdb_database_without_merging,
|
||||||
|
};
|
||||||
|
use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec};
|
||||||
|
use grenad::CompressionType;
|
||||||
|
use heed::types::ByteSlice;
|
||||||
|
use heed::BytesDecode;
|
||||||
|
use log::debug;
|
||||||
|
use std::borrow::Cow;
|
||||||
|
use std::collections::{BTreeMap, HashSet};
|
||||||
|
|
||||||
|
#[logging_timer::time]
|
||||||
|
pub fn index_prefix_word_database(
|
||||||
|
wtxn: &mut heed::RwTxn,
|
||||||
|
word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
|
||||||
|
prefix_word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
|
||||||
|
max_proximity: u8,
|
||||||
|
max_prefix_length: usize,
|
||||||
|
new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>,
|
||||||
|
new_prefix_fst_words: &[String],
|
||||||
|
common_prefix_fst_words: &[&[String]],
|
||||||
|
del_prefix_fst_words: &HashSet<Vec<u8>>,
|
||||||
|
) -> Result<()> {
|
||||||
|
let max_proximity = max_proximity - 1;
|
||||||
|
debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
|
||||||
|
|
||||||
|
let common_prefixes: Vec<_> = common_prefix_fst_words
|
||||||
|
.into_iter()
|
||||||
|
.map(|s| s.into_iter())
|
||||||
|
.flatten()
|
||||||
|
.map(|s| s.as_str())
|
||||||
|
.filter(|s| s.len() <= max_prefix_length)
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
// If the prefix trie is not empty, then we can iterate over all new
|
||||||
|
// word pairs to look for new (word1, common_prefix, proximity) elements
|
||||||
|
// to insert in the DB
|
||||||
|
for proximity in 1..=max_proximity - 1 {
|
||||||
|
for prefix in common_prefixes.iter() {
|
||||||
|
let mut prefix_key = vec![];
|
||||||
|
prefix_key.push(proximity);
|
||||||
|
prefix_key.extend_from_slice(prefix.as_bytes());
|
||||||
|
let mut cursor = new_word_pair_proximity_docids.clone().into_prefix_iter(prefix_key)?;
|
||||||
|
// This is the core of the algorithm
|
||||||
|
execute_on_word_pairs_and_prefixes(
|
||||||
|
proximity + 1,
|
||||||
|
prefix.as_bytes(),
|
||||||
|
// the next two arguments tell how to iterate over the new word pairs
|
||||||
|
&mut cursor,
|
||||||
|
|cursor| {
|
||||||
|
if let Some((key, value)) = cursor.next()? {
|
||||||
|
let (_, _, word2) = UncheckedU8StrStrCodec::bytes_decode(key)
|
||||||
|
.ok_or(heed::Error::Decoding)?;
|
||||||
|
Ok(Some((word2, value)))
|
||||||
|
} else {
|
||||||
|
Ok(None)
|
||||||
|
}
|
||||||
|
},
|
||||||
|
// and this argument tells what to do with each new key (proximity, prefix, word2) and value (roaring bitmap)
|
||||||
|
|key, value| {
|
||||||
|
insert_into_database(
|
||||||
|
wtxn,
|
||||||
|
*prefix_word_pair_proximity_docids.as_polymorph(),
|
||||||
|
key,
|
||||||
|
value,
|
||||||
|
)
|
||||||
|
},
|
||||||
|
)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Now we do the same thing with the new prefixes and all word pairs in the DB
|
||||||
|
let new_prefixes: Vec<_> = new_prefix_fst_words
|
||||||
|
.into_iter()
|
||||||
|
.map(|s| s.as_str())
|
||||||
|
.filter(|s| s.len() <= max_prefix_length)
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
// Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity)
|
||||||
|
// element in an intermediary grenad
|
||||||
|
let mut writer = create_writer(CompressionType::None, None, tempfile::tempfile()?);
|
||||||
|
|
||||||
|
for proximity in 1..=max_proximity - 1 {
|
||||||
|
for prefix in new_prefixes.iter() {
|
||||||
|
let mut prefix_key = vec![];
|
||||||
|
prefix_key.push(proximity);
|
||||||
|
prefix_key.extend_from_slice(prefix.as_bytes());
|
||||||
|
let mut db_iter = word_pair_proximity_docids
|
||||||
|
.as_polymorph()
|
||||||
|
.prefix_iter::<_, ByteSlice, ByteSlice>(wtxn, prefix_key.as_slice())?
|
||||||
|
.remap_key_type::<UncheckedU8StrStrCodec>();
|
||||||
|
execute_on_word_pairs_and_prefixes(
|
||||||
|
proximity + 1,
|
||||||
|
prefix.as_bytes(),
|
||||||
|
&mut db_iter,
|
||||||
|
|db_iter| {
|
||||||
|
db_iter
|
||||||
|
.next()
|
||||||
|
.transpose()
|
||||||
|
.map(|x| x.map(|((_, _, word2), value)| (word2, value)))
|
||||||
|
.map_err(|e| e.into())
|
||||||
|
},
|
||||||
|
|key, value| writer.insert(key, value).map_err(|e| e.into()),
|
||||||
|
)?;
|
||||||
|
drop(db_iter);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// and then we write the grenad into the DB
|
||||||
|
// Since the grenad contains only new prefixes, we know in advance that none
|
||||||
|
// of its elements already exist in the DB, thus there is no need to specify
|
||||||
|
// how to merge conflicting elements
|
||||||
|
write_into_lmdb_database_without_merging(
|
||||||
|
wtxn,
|
||||||
|
*prefix_word_pair_proximity_docids.as_polymorph(),
|
||||||
|
writer,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
// All of the word prefix pairs in the database that have a w2
|
||||||
|
// that is contained in the `suppr_pw` set must be removed as well.
|
||||||
|
if !del_prefix_fst_words.is_empty() {
|
||||||
|
let mut iter =
|
||||||
|
prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter_mut(wtxn)?;
|
||||||
|
while let Some(((_, prefix, _), _)) = iter.next().transpose()? {
|
||||||
|
if del_prefix_fst_words.contains(prefix.as_bytes()) {
|
||||||
|
// Delete this entry as the w2 prefix is no more in the words prefix fst.
|
||||||
|
unsafe { iter.del_current()? };
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// This is the core of the algorithm to initialise the Prefix Word Pair Proximity Docids database.
|
||||||
|
///
|
||||||
|
/// Its main arguments are:
|
||||||
|
/// 1. a sorted prefix iterator over ((proximity, word1, word2), docids) elements
|
||||||
|
/// 2. a closure to describe how to handle the new computed (proximity, prefix, word2) elements
|
||||||
|
///
|
||||||
|
/// For more information about what this function does, read the module documentation.
|
||||||
|
fn execute_on_word_pairs_and_prefixes<I>(
|
||||||
|
proximity: u8,
|
||||||
|
prefix: &[u8],
|
||||||
|
iter: &mut I,
|
||||||
|
mut next_word2_and_docids: impl for<'a> FnMut(&'a mut I) -> Result<Option<(&'a [u8], &'a [u8])>>,
|
||||||
|
mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>,
|
||||||
|
) -> Result<()> {
|
||||||
|
let mut batch: BTreeMap<Vec<u8>, Vec<Cow<'static, [u8]>>> = <_>::default();
|
||||||
|
|
||||||
|
while let Some((word2, data)) = next_word2_and_docids(iter)? {
|
||||||
|
let entry = batch.entry(word2.to_owned()).or_default();
|
||||||
|
entry.push(Cow::Owned(data.to_owned()));
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut key_buffer = Vec::with_capacity(8);
|
||||||
|
key_buffer.push(proximity);
|
||||||
|
key_buffer.extend_from_slice(prefix);
|
||||||
|
key_buffer.push(0);
|
||||||
|
|
||||||
|
let mut value_buffer = Vec::with_capacity(65_536);
|
||||||
|
|
||||||
|
for (key, values) in batch {
|
||||||
|
key_buffer.truncate(prefix.len() + 2);
|
||||||
|
value_buffer.clear();
|
||||||
|
|
||||||
|
key_buffer.extend_from_slice(&key);
|
||||||
|
let data = if values.len() > 1 {
|
||||||
|
CboRoaringBitmapCodec::merge_into(&values, &mut value_buffer)?;
|
||||||
|
value_buffer.as_slice()
|
||||||
|
} else {
|
||||||
|
&values[0]
|
||||||
|
};
|
||||||
|
insert(key_buffer.as_slice(), data)?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
@ -0,0 +1,46 @@
|
|||||||
|
---
|
||||||
|
source: milli/src/update/prefix_word_pairs/mod.rs
|
||||||
|
---
|
||||||
|
1 5 a [101, ]
|
||||||
|
1 amazing a [100, ]
|
||||||
|
1 an a [100, ]
|
||||||
|
1 and b [100, ]
|
||||||
|
1 and be [100, ]
|
||||||
|
1 at a [100, ]
|
||||||
|
1 rings a [101, ]
|
||||||
|
1 the b [101, ]
|
||||||
|
1 the be [101, ]
|
||||||
|
2 5 a [101, ]
|
||||||
|
2 amazing a [100, ]
|
||||||
|
2 amazing b [100, ]
|
||||||
|
2 amazing be [100, ]
|
||||||
|
2 an a [100, ]
|
||||||
|
2 and a [100, ]
|
||||||
|
2 at a [100, 101, ]
|
||||||
|
2 beautiful a [100, ]
|
||||||
|
2 bell a [101, ]
|
||||||
|
2 house b [100, ]
|
||||||
|
2 house be [100, ]
|
||||||
|
2 rings b [101, ]
|
||||||
|
2 rings be [101, ]
|
||||||
|
3 am a [101, ]
|
||||||
|
3 amazing a [100, ]
|
||||||
|
3 an b [100, ]
|
||||||
|
3 an be [100, ]
|
||||||
|
3 and a [100, ]
|
||||||
|
3 at a [100, ]
|
||||||
|
3 at b [101, ]
|
||||||
|
3 at be [101, ]
|
||||||
|
3 beautiful a [100, ]
|
||||||
|
3 house a [100, ]
|
||||||
|
3 rings a [101, ]
|
||||||
|
3 the a [101, ]
|
||||||
|
4 5 b [101, ]
|
||||||
|
4 5 be [101, ]
|
||||||
|
4 and a [100, ]
|
||||||
|
4 at b [100, ]
|
||||||
|
4 at be [100, ]
|
||||||
|
4 beautiful a [100, ]
|
||||||
|
4 bell a [101, ]
|
||||||
|
4 house a [100, ]
|
||||||
|
|
@ -0,0 +1,4 @@
|
|||||||
|
---
|
||||||
|
source: milli/src/update/prefix_word_pairs/mod.rs
|
||||||
|
---
|
||||||
|
fb88e49fd666886731b62baef8f44995
|
@ -0,0 +1,41 @@
|
|||||||
|
---
|
||||||
|
source: milli/src/update/prefix_word_pairs/mod.rs
|
||||||
|
---
|
||||||
|
2 a 5 [101, ]
|
||||||
|
2 a amazing [100, ]
|
||||||
|
2 a an [100, 202, ]
|
||||||
|
2 a and [100, ]
|
||||||
|
2 a beautiful [100, ]
|
||||||
|
2 a extraordinary [202, ]
|
||||||
|
2 am and [100, ]
|
||||||
|
2 an amazing [100, ]
|
||||||
|
2 an beautiful [100, ]
|
||||||
|
2 an extraordinary [202, ]
|
||||||
|
2 b house [100, ]
|
||||||
|
2 b rings [101, ]
|
||||||
|
2 be house [100, ]
|
||||||
|
2 be rings [101, ]
|
||||||
|
3 a 5 [101, ]
|
||||||
|
3 a am [101, ]
|
||||||
|
3 a amazing [100, ]
|
||||||
|
3 a an [100, ]
|
||||||
|
3 a and [100, ]
|
||||||
|
3 a at [100, 202, ]
|
||||||
|
3 a beautiful [100, ]
|
||||||
|
3 a extraordinary [202, ]
|
||||||
|
3 a house [100, 202, ]
|
||||||
|
3 a rings [101, ]
|
||||||
|
3 am 5 [101, ]
|
||||||
|
3 am an [100, ]
|
||||||
|
3 am beautiful [100, ]
|
||||||
|
3 an amazing [100, ]
|
||||||
|
3 an and [100, ]
|
||||||
|
3 an at [100, 202, ]
|
||||||
|
3 an house [100, 202, ]
|
||||||
|
3 b and [100, ]
|
||||||
|
3 b at [101, ]
|
||||||
|
3 b the [101, ]
|
||||||
|
3 be and [100, ]
|
||||||
|
3 be at [101, ]
|
||||||
|
3 be the [101, ]
|
||||||
|
|
@ -0,0 +1,4 @@
|
|||||||
|
---
|
||||||
|
source: milli/src/update/prefix_word_pairs/mod.rs
|
||||||
|
---
|
||||||
|
6965ecd1bf821f1cf921c2ab751b36cf
|
@ -0,0 +1,4 @@
|
|||||||
|
---
|
||||||
|
source: milli/src/update/prefix_word_pairs/mod.rs
|
||||||
|
---
|
||||||
|
fb88e49fd666886731b62baef8f44995
|
@ -1,5 +1,5 @@
|
|||||||
/*!
|
/*!
|
||||||
## What is WordPrefixPairProximityDocids?
|
## What is WordPrefixPairProximityDocids?
|
||||||
The word-prefix-pair-proximity-docids database is a database whose keys are of
|
The word-prefix-pair-proximity-docids database is a database whose keys are of
|
||||||
the form `(proximity, word, prefix)` and the values are roaring bitmaps of
|
the form `(proximity, word, prefix)` and the values are roaring bitmaps of
|
||||||
the documents which contain `word` followed by another word starting with
|
the documents which contain `word` followed by another word starting with
|
||||||
@ -139,7 +139,7 @@ inputs described above, which come from different places:
|
|||||||
|
|
||||||
2. `word_pairs_db`, which is the list of word pairs from the database.
|
2. `word_pairs_db`, which is the list of word pairs from the database.
|
||||||
This list includes all elements in `new_word_pairs` since `new_word_pairs`
|
This list includes all elements in `new_word_pairs` since `new_word_pairs`
|
||||||
was added to the database prior to calling the `WordPrefixPairProximityDocIds::execute`
|
was added to the database prior to calling the `index_word_prefix_database`
|
||||||
function.
|
function.
|
||||||
|
|
||||||
To update the prefix database correctly, we call the algorithm described earlier first
|
To update the prefix database correctly, we call the algorithm described earlier first
|
||||||
@ -161,81 +161,32 @@ reader and writer). Therefore, when calling the algorithm on
|
|||||||
`((proximity, word, prefix), docids)` elements in an intermediary grenad
|
`((proximity, word, prefix), docids)` elements in an intermediary grenad
|
||||||
Writer instead of the DB. At the end of the outer loop, we finally read from
|
Writer instead of the DB. At the end of the outer loop, we finally read from
|
||||||
the grenad and insert its elements in the database.
|
the grenad and insert its elements in the database.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
*/
|
*/
|
||||||
use std::borrow::Cow;
|
|
||||||
use std::collections::HashSet;
|
|
||||||
use std::io::BufReader;
|
|
||||||
|
|
||||||
|
use crate::update::index_documents::{create_writer, CursorClonableMmap};
|
||||||
|
use crate::update::prefix_word_pairs::{
|
||||||
|
insert_into_database, write_into_lmdb_database_without_merging,
|
||||||
|
};
|
||||||
|
use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec};
|
||||||
use grenad::CompressionType;
|
use grenad::CompressionType;
|
||||||
use heed::types::ByteSlice;
|
use heed::types::ByteSlice;
|
||||||
use heed::BytesDecode;
|
use heed::BytesDecode;
|
||||||
use log::debug;
|
use log::debug;
|
||||||
|
use std::borrow::Cow;
|
||||||
|
use std::collections::HashSet;
|
||||||
|
|
||||||
use crate::update::index_documents::{
|
#[logging_timer::time]
|
||||||
create_writer, merge_cbo_roaring_bitmaps, CursorClonableMmap,
|
pub fn index_word_prefix_database(
|
||||||
};
|
wtxn: &mut heed::RwTxn,
|
||||||
use crate::{CboRoaringBitmapCodec, Index, Result, UncheckedU8StrStrCodec};
|
word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
|
||||||
|
word_prefix_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
|
||||||
pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
|
||||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
|
||||||
index: &'i Index,
|
|
||||||
pub(crate) chunk_compression_type: CompressionType,
|
|
||||||
pub(crate) chunk_compression_level: Option<u32>,
|
|
||||||
pub(crate) max_nb_chunks: Option<usize>,
|
|
||||||
pub(crate) max_memory: Option<usize>,
|
|
||||||
max_proximity: u8,
|
max_proximity: u8,
|
||||||
max_prefix_length: usize,
|
max_prefix_length: usize,
|
||||||
}
|
|
||||||
|
|
||||||
impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
|
||||||
pub fn new(
|
|
||||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
|
||||||
index: &'i Index,
|
|
||||||
) -> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
|
||||||
WordPrefixPairProximityDocids {
|
|
||||||
wtxn,
|
|
||||||
index,
|
|
||||||
chunk_compression_type: CompressionType::None,
|
|
||||||
chunk_compression_level: None,
|
|
||||||
max_nb_chunks: None,
|
|
||||||
max_memory: None,
|
|
||||||
max_proximity: 4,
|
|
||||||
max_prefix_length: 2,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Set the maximum proximity required to make a prefix be part of the words prefixes
|
|
||||||
/// database. If two words are too far from the threshold the associated documents will
|
|
||||||
/// not be part of the prefix database.
|
|
||||||
///
|
|
||||||
/// Default value is 4. This value must be lower than or equal to 7 and will be clamped
|
|
||||||
/// to this bound otherwise.
|
|
||||||
pub fn max_proximity(&mut self, value: u8) -> &mut Self {
|
|
||||||
self.max_proximity = value.max(7);
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Set the maximum length the prefix of a word pair is allowed to have to be part of the words
|
|
||||||
/// prefixes database. If the prefix length is higher than the threshold, the associated documents
|
|
||||||
/// will not be part of the prefix database.
|
|
||||||
///
|
|
||||||
/// Default value is 2.
|
|
||||||
pub fn max_prefix_length(&mut self, value: usize) -> &mut Self {
|
|
||||||
self.max_prefix_length = value;
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
#[logging_timer::time("WordPrefixPairProximityDocids::{}")]
|
|
||||||
pub fn execute<'a>(
|
|
||||||
mut self,
|
|
||||||
new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>,
|
new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>,
|
||||||
new_prefix_fst_words: &'a [String],
|
new_prefix_fst_words: &[String],
|
||||||
common_prefix_fst_words: &[&'a [String]],
|
common_prefix_fst_words: &[&[String]],
|
||||||
del_prefix_fst_words: &HashSet<Vec<u8>>,
|
del_prefix_fst_words: &HashSet<Vec<u8>>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
|
debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
|
||||||
|
|
||||||
// Make a prefix trie from the common prefixes whose length is at most `max_prefix_length`
|
// Make a prefix trie from the common prefixes whose length is at most `max_prefix_length`
|
||||||
@ -245,11 +196,11 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
|||||||
.map(|s| s.into_iter())
|
.map(|s| s.into_iter())
|
||||||
.flatten()
|
.flatten()
|
||||||
.map(|s| s.as_str())
|
.map(|s| s.as_str())
|
||||||
.filter(|s| s.len() <= self.max_prefix_length),
|
.filter(|s| s.len() <= max_prefix_length),
|
||||||
);
|
);
|
||||||
|
|
||||||
// If the prefix trie is not empty, then we can iterate over all new
|
// If the prefix trie is not empty, then we can iterate over all new
|
||||||
// word pairs to look for new (word1, common_prefix, proximity) elements
|
// word pairs to look for new (proximity, word1, common_prefix) elements
|
||||||
// to insert in the DB
|
// to insert in the DB
|
||||||
if !prefixes.is_empty() {
|
if !prefixes.is_empty() {
|
||||||
let mut cursor = new_word_pair_proximity_docids.into_cursor()?;
|
let mut cursor = new_word_pair_proximity_docids.into_cursor()?;
|
||||||
@ -259,20 +210,20 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
|||||||
&mut cursor,
|
&mut cursor,
|
||||||
|cursor| {
|
|cursor| {
|
||||||
if let Some((key, value)) = cursor.move_on_next()? {
|
if let Some((key, value)) = cursor.move_on_next()? {
|
||||||
let (proximity, word1, word2) = UncheckedU8StrStrCodec::bytes_decode(key)
|
let (proximity, word1, word2) =
|
||||||
.ok_or(heed::Error::Decoding)?;
|
UncheckedU8StrStrCodec::bytes_decode(key).ok_or(heed::Error::Decoding)?;
|
||||||
Ok(Some(((proximity, word1, word2), value)))
|
Ok(Some(((proximity, word1, word2), value)))
|
||||||
} else {
|
} else {
|
||||||
Ok(None)
|
Ok(None)
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
&prefixes,
|
&prefixes,
|
||||||
self.max_proximity,
|
max_proximity,
|
||||||
// and this argument tells what to do with each new key (word1, prefix, proximity) and value (roaring bitmap)
|
// and this argument tells what to do with each new key (proximity, word1, prefix) and value (roaring bitmap)
|
||||||
|key, value| {
|
|key, value| {
|
||||||
insert_into_database(
|
insert_into_database(
|
||||||
&mut self.wtxn,
|
wtxn,
|
||||||
*self.index.word_prefix_pair_proximity_docids.as_polymorph(),
|
*word_prefix_pair_proximity_docids.as_polymorph(),
|
||||||
key,
|
key,
|
||||||
value,
|
value,
|
||||||
)
|
)
|
||||||
@ -286,30 +237,24 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
|||||||
new_prefix_fst_words
|
new_prefix_fst_words
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|s| s.as_str())
|
.map(|s| s.as_str())
|
||||||
.filter(|s| s.len() <= self.max_prefix_length),
|
.filter(|s| s.len() <= max_prefix_length),
|
||||||
);
|
);
|
||||||
|
|
||||||
if !prefixes.is_empty() {
|
if !prefixes.is_empty() {
|
||||||
let mut db_iter = self
|
let mut db_iter = word_pair_proximity_docids
|
||||||
.index
|
|
||||||
.word_pair_proximity_docids
|
|
||||||
.remap_key_type::<UncheckedU8StrStrCodec>()
|
.remap_key_type::<UncheckedU8StrStrCodec>()
|
||||||
.remap_data_type::<ByteSlice>()
|
.remap_data_type::<ByteSlice>()
|
||||||
.iter(self.wtxn)?;
|
.iter(wtxn)?;
|
||||||
|
|
||||||
// Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity)
|
// Since we read the DB, we can't write to it directly, so we add each new (proximity, word1, prefix)
|
||||||
// element in an intermediary grenad
|
// element in an intermediary grenad
|
||||||
let mut writer = create_writer(
|
let mut writer = create_writer(CompressionType::None, None, tempfile::tempfile()?);
|
||||||
self.chunk_compression_type,
|
|
||||||
self.chunk_compression_level,
|
|
||||||
tempfile::tempfile()?,
|
|
||||||
);
|
|
||||||
|
|
||||||
execute_on_word_pairs_and_prefixes(
|
execute_on_word_pairs_and_prefixes(
|
||||||
&mut db_iter,
|
&mut db_iter,
|
||||||
|db_iter| db_iter.next().transpose().map_err(|e| e.into()),
|
|db_iter| db_iter.next().transpose().map_err(|e| e.into()),
|
||||||
&prefixes,
|
&prefixes,
|
||||||
self.max_proximity,
|
max_proximity,
|
||||||
|key, value| writer.insert(key, value).map_err(|e| e.into()),
|
|key, value| writer.insert(key, value).map_err(|e| e.into()),
|
||||||
)?;
|
)?;
|
||||||
drop(db_iter);
|
drop(db_iter);
|
||||||
@ -319,8 +264,8 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
|||||||
// of its elements already exist in the DB, thus there is no need to specify
|
// of its elements already exist in the DB, thus there is no need to specify
|
||||||
// how to merge conflicting elements
|
// how to merge conflicting elements
|
||||||
write_into_lmdb_database_without_merging(
|
write_into_lmdb_database_without_merging(
|
||||||
self.wtxn,
|
wtxn,
|
||||||
*self.index.word_prefix_pair_proximity_docids.as_polymorph(),
|
*word_prefix_pair_proximity_docids.as_polymorph(),
|
||||||
writer,
|
writer,
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
@ -328,13 +273,10 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
|||||||
// All of the word prefix pairs in the database whose prefix (w2)
|
// All of the word prefix pairs in the database whose prefix (w2)
|
||||||
// is contained in the `del_prefix_fst_words` set must be removed as well.
|
// is contained in the `del_prefix_fst_words` set must be removed as well.
|
||||||
if !del_prefix_fst_words.is_empty() {
|
if !del_prefix_fst_words.is_empty() {
|
||||||
let mut iter = self
|
let mut iter =
|
||||||
.index
|
word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>().iter_mut(wtxn)?;
|
||||||
.word_prefix_pair_proximity_docids
|
while let Some(((_, _, prefix), _)) = iter.next().transpose()? {
|
||||||
.remap_data_type::<ByteSlice>()
|
if del_prefix_fst_words.contains(prefix.as_bytes()) {
|
||||||
.iter_mut(self.wtxn)?;
|
|
||||||
while let Some(((_, w2, _), _)) = iter.next().transpose()? {
|
|
||||||
if del_prefix_fst_words.contains(w2.as_bytes()) {
|
|
||||||
// Delete this entry as the w2 prefix is no longer in the words prefix fst.
|
// Delete this entry as the w2 prefix is no longer in the words prefix fst.
|
||||||
unsafe { iter.del_current()? };
|
unsafe { iter.del_current()? };
|
||||||
}
|
}
|
||||||
@ -342,15 +284,14 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// This is the core of the algorithm to initialise the Word Prefix Pair Proximity Docids database.
|
/// This is the core of the algorithm to initialise the Word Prefix Pair Proximity Docids database.
|
||||||
///
|
///
|
||||||
/// Its main arguments are:
|
/// Its main arguments are:
|
||||||
/// 1. a sorted iterator over ((word1, word2, proximity), docids) elements
|
/// 1. a sorted iterator over ((proximity, word1, word2), docids) elements
|
||||||
/// 2. a prefix trie
|
/// 2. a prefix trie
|
||||||
/// 3. a closure to describe how to handle the new computed (word1, prefix, proximity) elements
|
/// 3. a closure to describe how to handle the new computed (proximity, word1, prefix) elements
|
||||||
///
|
///
|
||||||
/// For more information about what this function does, read the module documentation.
|
/// For more information about what this function does, read the module documentation.
|
||||||
fn execute_on_word_pairs_and_prefixes<I>(
|
fn execute_on_word_pairs_and_prefixes<I>(
|
||||||
@ -495,61 +436,6 @@ impl PrefixAndProximityBatch {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// This is adapted from `sorter_into_lmdb_database`
|
|
||||||
fn insert_into_database(
|
|
||||||
wtxn: &mut heed::RwTxn,
|
|
||||||
database: heed::PolyDatabase,
|
|
||||||
new_key: &[u8],
|
|
||||||
new_value: &[u8],
|
|
||||||
) -> Result<()> {
|
|
||||||
let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, new_key)?;
|
|
||||||
match iter.next().transpose()? {
|
|
||||||
Some((key, old_val)) if new_key == key => {
|
|
||||||
let val =
|
|
||||||
merge_cbo_roaring_bitmaps(key, &[Cow::Borrowed(old_val), Cow::Borrowed(new_value)])
|
|
||||||
.map_err(|_| {
|
|
||||||
// TODO just wrap this error?
|
|
||||||
crate::error::InternalError::IndexingMergingKeys {
|
|
||||||
process: "get-put-merge",
|
|
||||||
}
|
|
||||||
})?;
|
|
||||||
// safety: we use the new_key, not the one from the database iterator, to avoid undefined behaviour
|
|
||||||
unsafe { iter.put_current(new_key, &val)? };
|
|
||||||
}
|
|
||||||
_ => {
|
|
||||||
drop(iter);
|
|
||||||
database.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, new_value)?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
// This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`,
|
|
||||||
// but it uses `append` if the database is empty, and it assumes that the values in the
|
|
||||||
// writer don't conflict with values in the database.
|
|
||||||
pub fn write_into_lmdb_database_without_merging(
|
|
||||||
wtxn: &mut heed::RwTxn,
|
|
||||||
database: heed::PolyDatabase,
|
|
||||||
writer: grenad::Writer<std::fs::File>,
|
|
||||||
) -> Result<()> {
|
|
||||||
let file = writer.into_inner()?;
|
|
||||||
let reader = grenad::Reader::new(BufReader::new(file))?;
|
|
||||||
if database.is_empty(wtxn)? {
|
|
||||||
let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;
|
|
||||||
let mut cursor = reader.into_cursor()?;
|
|
||||||
while let Some((k, v)) = cursor.move_on_next()? {
|
|
||||||
// safety: the key comes from the grenad reader, not the database
|
|
||||||
unsafe { out_iter.append(k, v)? };
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
let mut cursor = reader.into_cursor()?;
|
|
||||||
while let Some((k, v)) = cursor.move_on_next()? {
|
|
||||||
database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
/** A prefix trie. Used to iterate quickly over the prefixes of a word that are
|
/** A prefix trie. Used to iterate quickly over the prefixes of a word that are
|
||||||
within a set.
|
within a set.
|
||||||
|
|
||||||
@ -676,90 +562,9 @@ impl PrefixTrieNode {
|
|||||||
}
|
}
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use std::io::Cursor;
|
|
||||||
|
|
||||||
use roaring::RoaringBitmap;
|
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
|
use crate::{CboRoaringBitmapCodec, U8StrStrCodec};
|
||||||
use crate::index::tests::TempIndex;
|
use roaring::RoaringBitmap;
|
||||||
use crate::{db_snap, CboRoaringBitmapCodec, U8StrStrCodec};
|
|
||||||
|
|
||||||
fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec<crate::Object> {
|
|
||||||
let mut documents = Vec::new();
|
|
||||||
for prefix in prefixes {
|
|
||||||
for i in 0..50 {
|
|
||||||
documents.push(
|
|
||||||
serde_json::json!({
|
|
||||||
"text": format!("{prefix}{i:x}"),
|
|
||||||
})
|
|
||||||
.as_object()
|
|
||||||
.unwrap()
|
|
||||||
.clone(),
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
documents
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_update() {
|
|
||||||
let mut index = TempIndex::new();
|
|
||||||
index.index_documents_config.words_prefix_threshold = Some(50);
|
|
||||||
index.index_documents_config.autogenerate_docids = true;
|
|
||||||
|
|
||||||
index
|
|
||||||
.update_settings(|settings| {
|
|
||||||
settings.set_searchable_fields(vec!["text".to_owned()]);
|
|
||||||
})
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
let batch_reader_from_documents = |documents| {
|
|
||||||
let mut builder = DocumentsBatchBuilder::new(Vec::new());
|
|
||||||
for object in documents {
|
|
||||||
builder.append_json_object(&object).unwrap();
|
|
||||||
}
|
|
||||||
DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut documents = documents_with_enough_different_words_for_prefixes(&["a", "be"]);
|
|
||||||
// now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
|
|
||||||
documents.push(
|
|
||||||
serde_json::json!({
|
|
||||||
"text": "At an amazing and beautiful house"
|
|
||||||
})
|
|
||||||
.as_object()
|
|
||||||
.unwrap()
|
|
||||||
.clone(),
|
|
||||||
);
|
|
||||||
documents.push(
|
|
||||||
serde_json::json!({
|
|
||||||
"text": "The bell rings at 5 am"
|
|
||||||
})
|
|
||||||
.as_object()
|
|
||||||
.unwrap()
|
|
||||||
.clone(),
|
|
||||||
);
|
|
||||||
|
|
||||||
let documents = batch_reader_from_documents(documents);
|
|
||||||
index.add_documents(documents).unwrap();
|
|
||||||
|
|
||||||
db_snap!(index, word_prefix_pair_proximity_docids, "initial");
|
|
||||||
|
|
||||||
let mut documents = documents_with_enough_different_words_for_prefixes(&["am", "an"]);
|
|
||||||
documents.push(
|
|
||||||
serde_json::json!({
|
|
||||||
"text": "At an extraordinary house"
|
|
||||||
})
|
|
||||||
.as_object()
|
|
||||||
.unwrap()
|
|
||||||
.clone(),
|
|
||||||
);
|
|
||||||
let documents = batch_reader_from_documents(documents);
|
|
||||||
index.add_documents(documents).unwrap();
|
|
||||||
|
|
||||||
db_snap!(index, word_prefix_pair_proximity_docids, "update");
|
|
||||||
}
|
|
||||||
|
|
||||||
fn check_prefixes(
|
fn check_prefixes(
|
||||||
trie: &PrefixTrieNode,
|
trie: &PrefixTrieNode,
|
||||||
@ -899,9 +704,9 @@ mod tests {
|
|||||||
&prefixes,
|
&prefixes,
|
||||||
2,
|
2,
|
||||||
|k, v| {
|
|k, v| {
|
||||||
let (word1, prefix, proximity) = U8StrStrCodec::bytes_decode(k).unwrap();
|
let (proximity, word1, prefix) = U8StrStrCodec::bytes_decode(k).unwrap();
|
||||||
let bitmap = CboRoaringBitmapCodec::bytes_decode(v).unwrap();
|
let bitmap = CboRoaringBitmapCodec::bytes_decode(v).unwrap();
|
||||||
result.push(((word1.to_owned(), prefix.to_owned(), proximity.to_owned()), bitmap));
|
result.push(((proximity.to_owned(), word1.to_owned(), prefix.to_owned()), bitmap));
|
||||||
Ok(())
|
Ok(())
|
||||||
},
|
},
|
||||||
)
|
)
|
Loading…
Reference in New Issue
Block a user