mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-01-18 08:48:32 +08:00
Simplify and document WordPrefixPairProximityDocIds::execute
This commit is contained in:
parent
044356d221
commit
220921628b
@ -1,18 +1,14 @@
|
|||||||
use grenad::CompressionType;
|
|
||||||
use heed::types::ByteSlice;
|
|
||||||
|
|
||||||
use heed::BytesDecode;
|
|
||||||
use log::debug;
|
|
||||||
|
|
||||||
use std::borrow::Cow;
|
|
||||||
use std::cmp::Ordering;
|
|
||||||
use std::collections::HashSet;
|
|
||||||
use std::io::BufReader;
|
|
||||||
|
|
||||||
use crate::update::index_documents::{
|
use crate::update::index_documents::{
|
||||||
create_writer, merge_cbo_roaring_bitmaps, CursorClonableMmap,
|
create_writer, merge_cbo_roaring_bitmaps, CursorClonableMmap,
|
||||||
};
|
};
|
||||||
use crate::{CboRoaringBitmapCodec, Index, Result, UncheckedStrStrU8Codec};
|
use crate::{CboRoaringBitmapCodec, Index, Result, UncheckedStrStrU8Codec};
|
||||||
|
use grenad::CompressionType;
|
||||||
|
use heed::types::ByteSlice;
|
||||||
|
use heed::BytesDecode;
|
||||||
|
use log::debug;
|
||||||
|
use std::borrow::Cow;
|
||||||
|
use std::collections::HashSet;
|
||||||
|
use std::io::BufReader;
|
||||||
|
|
||||||
pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
||||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||||
@ -72,10 +68,11 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
|||||||
del_prefix_fst_words: &HashSet<Vec<u8>>,
|
del_prefix_fst_words: &HashSet<Vec<u8>>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
|
debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
|
||||||
|
|
||||||
|
// This is an optimisation, to reuse allocations between loop iterations
|
||||||
let mut allocations = Allocations::default();
|
let mut allocations = Allocations::default();
|
||||||
|
|
||||||
let mut count = 0;
|
// Make a prefix trie from the common prefixes whose length is at most self.max_prefix_length
|
||||||
|
|
||||||
let prefixes = PrefixTrieNode::from_sorted_prefixes(
|
let prefixes = PrefixTrieNode::from_sorted_prefixes(
|
||||||
common_prefix_fst_words
|
common_prefix_fst_words
|
||||||
.into_iter()
|
.into_iter()
|
||||||
@ -85,9 +82,14 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
|||||||
.filter(|s| s.len() <= self.max_prefix_length),
|
.filter(|s| s.len() <= self.max_prefix_length),
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// If the prefix trie is not empty, then we can iterate over all new
|
||||||
|
// word pairs to look for new (word1, common_prefix, proximity) elements
|
||||||
|
// to insert in the DB
|
||||||
if !prefixes.is_empty() {
|
if !prefixes.is_empty() {
|
||||||
let mut cursor = new_word_pair_proximity_docids.into_cursor()?;
|
let mut cursor = new_word_pair_proximity_docids.into_cursor()?;
|
||||||
|
// This is the core of the algorithm
|
||||||
execute_on_word_pairs_and_prefixes(
|
execute_on_word_pairs_and_prefixes(
|
||||||
|
// the first two arguments tell how to iterate over the new word pairs
|
||||||
&mut cursor,
|
&mut cursor,
|
||||||
|cursor| {
|
|cursor| {
|
||||||
if let Some((key, value)) = cursor.move_on_next()? {
|
if let Some((key, value)) = cursor.move_on_next()? {
|
||||||
@ -101,8 +103,8 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
|||||||
&prefixes,
|
&prefixes,
|
||||||
&mut allocations,
|
&mut allocations,
|
||||||
self.max_proximity,
|
self.max_proximity,
|
||||||
|
// and this argument tells what to do with each new key (word1, prefix, proximity) and value (roaring bitmap)
|
||||||
|key, value| {
|
|key, value| {
|
||||||
count += 1;
|
|
||||||
insert_into_database(
|
insert_into_database(
|
||||||
&mut self.wtxn,
|
&mut self.wtxn,
|
||||||
*self.index.word_prefix_pair_proximity_docids.as_polymorph(),
|
*self.index.word_prefix_pair_proximity_docids.as_polymorph(),
|
||||||
@ -113,6 +115,8 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
|||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Now we do the same thing with the new prefixes and all word pairs in the DB
|
||||||
|
|
||||||
let prefixes = PrefixTrieNode::from_sorted_prefixes(
|
let prefixes = PrefixTrieNode::from_sorted_prefixes(
|
||||||
new_prefix_fst_words
|
new_prefix_fst_words
|
||||||
.into_iter()
|
.into_iter()
|
||||||
@ -128,6 +132,8 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
|||||||
.remap_data_type::<ByteSlice>()
|
.remap_data_type::<ByteSlice>()
|
||||||
.iter(self.wtxn)?;
|
.iter(self.wtxn)?;
|
||||||
|
|
||||||
|
// Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity)
|
||||||
|
// element in an intermediary grenad
|
||||||
let mut writer = create_writer(
|
let mut writer = create_writer(
|
||||||
self.chunk_compression_type,
|
self.chunk_compression_type,
|
||||||
self.chunk_compression_level,
|
self.chunk_compression_level,
|
||||||
@ -143,7 +149,12 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
|||||||
|key, value| writer.insert(key, value).map_err(|e| e.into()),
|
|key, value| writer.insert(key, value).map_err(|e| e.into()),
|
||||||
)?;
|
)?;
|
||||||
drop(db_iter);
|
drop(db_iter);
|
||||||
writer_of_new_elements_into_lmdb_database(
|
|
||||||
|
// and then we write the grenad into the DB
|
||||||
|
// Since the grenad contains only new prefixes, we know in advance that none
|
||||||
|
// of its elements already exist in the DB, thus there is no need to specify
|
||||||
|
// how to merge conflicting elements
|
||||||
|
write_into_lmdb_database_without_merging(
|
||||||
self.wtxn,
|
self.wtxn,
|
||||||
*self.index.word_prefix_pair_proximity_docids.as_polymorph(),
|
*self.index.word_prefix_pair_proximity_docids.as_polymorph(),
|
||||||
writer,
|
writer,
|
||||||
@ -169,6 +180,15 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// This is the core of the algorithm to initialise the Word Prefix Pair Proximity Docids database.
|
||||||
|
///
|
||||||
|
/// Its main arguments are:
|
||||||
|
/// 1. a sorted iterator over ((word1, word2, proximity), docids) elements
|
||||||
|
/// 2. a prefix trie
|
||||||
|
/// 3. a closure to describe how to handle the new computed (word1, prefix, proximity) elements
|
||||||
|
///
|
||||||
|
/// For more information about the algorithm, see the accompanying "What is WordPrefixPairProximityDocids?" documentation.
|
||||||
fn execute_on_word_pairs_and_prefixes<Iter>(
|
fn execute_on_word_pairs_and_prefixes<Iter>(
|
||||||
iter: &mut Iter,
|
iter: &mut Iter,
|
||||||
mut next_word_pair_proximity: impl for<'a> FnMut(
|
mut next_word_pair_proximity: impl for<'a> FnMut(
|
||||||
@ -252,52 +272,19 @@ struct PrefixAndProximityBatch {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl PrefixAndProximityBatch {
|
impl PrefixAndProximityBatch {
|
||||||
|
/// Insert the new key and value into the batch
|
||||||
fn insert(&mut self, new_key: &[u8], new_value: Vec<u8>, allocations: &mut Allocations) {
|
fn insert(&mut self, new_key: &[u8], new_value: Vec<u8>, allocations: &mut Allocations) {
|
||||||
// this is a macro instead of a closure because the borrow checker will complain
|
match self.batch.binary_search_by_key(&new_key, |(k, _)| k.as_slice()) {
|
||||||
// about the closure moving `new_value`
|
Ok(position) => {
|
||||||
macro_rules! insert_new_key_value {
|
self.batch[position].1.push(Cow::Owned(new_value));
|
||||||
() => {
|
}
|
||||||
|
Err(position) => {
|
||||||
let mut key = allocations.take_byte_vector();
|
let mut key = allocations.take_byte_vector();
|
||||||
key.extend_from_slice(new_key);
|
key.extend_from_slice(new_key);
|
||||||
let mut mergeable_data = allocations.take_mergeable_data_vector();
|
let mut mergeable_data = allocations.take_mergeable_data_vector();
|
||||||
mergeable_data.push(Cow::Owned(new_value));
|
mergeable_data.push(Cow::Owned(new_value));
|
||||||
self.batch.push((key, mergeable_data));
|
self.batch.insert(position, (key, mergeable_data));
|
||||||
};
|
|
||||||
($idx:expr) => {
|
|
||||||
let mut key = allocations.take_byte_vector();
|
|
||||||
key.extend_from_slice(new_key);
|
|
||||||
let mut mergeable_data = allocations.take_mergeable_data_vector();
|
|
||||||
mergeable_data.push(Cow::Owned(new_value));
|
|
||||||
self.batch.insert($idx, (key, mergeable_data));
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
match self.batch.len() {
|
|
||||||
0 => {
|
|
||||||
insert_new_key_value!();
|
|
||||||
}
|
}
|
||||||
1 => {
|
|
||||||
let (existing_key, existing_data) = &mut self.batch[0];
|
|
||||||
match new_key.cmp(&existing_key) {
|
|
||||||
Ordering::Less => {
|
|
||||||
insert_new_key_value!(0);
|
|
||||||
}
|
|
||||||
Ordering::Equal => {
|
|
||||||
existing_data.push(Cow::Owned(new_value));
|
|
||||||
}
|
|
||||||
Ordering::Greater => {
|
|
||||||
insert_new_key_value!();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
_ => match self.batch.binary_search_by_key(&new_key, |(k, _)| k.as_slice()) {
|
|
||||||
Ok(position) => {
|
|
||||||
self.batch[position].1.push(Cow::Owned(new_value));
|
|
||||||
}
|
|
||||||
Err(position) => {
|
|
||||||
insert_new_key_value!(position);
|
|
||||||
}
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -369,8 +356,10 @@ fn insert_into_database(
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
// This is adapted from `sorter_into_lmdb_database`
|
// This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`,
|
||||||
pub fn writer_of_new_elements_into_lmdb_database(
|
// but it uses `append` if the database is empty, and it assumes that the values in the
|
||||||
|
// writer don't conflict with values in the database.
|
||||||
|
pub fn write_into_lmdb_database_without_merging(
|
||||||
wtxn: &mut heed::RwTxn,
|
wtxn: &mut heed::RwTxn,
|
||||||
database: heed::PolyDatabase,
|
database: heed::PolyDatabase,
|
||||||
writer: grenad::Writer<std::fs::File>,
|
writer: grenad::Writer<std::fs::File>,
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
## What is WordPrefixPairProximityDocids?
|
## What is WordPrefixPairProximityDocids?
|
||||||
The word-prefix-pair-proximity-docids database is a database whose keys are of the form (`word`, `prefix`, `proximity`) and the values are roaring bitmaps of the documents which contain `word` followed by another word starting with `prefix` at a distance of `proximity`.
|
The word-prefix-pair-proximity-docids database is a database whose keys are of the form (`word`, `prefix`, `proximity`) and the values are roaring bitmaps of the documents which contain `word` followed by another word starting with `prefix` at a distance of `proximity`.
|
||||||
|
|
||||||
The prefixes present in this database are only those that correspond to many different words present in the documents.
|
The prefixes present in this database are only those that correspond to many different words in the documents.
|
||||||
|
|
||||||
## How is it created/updated? (simplified version)
|
## How is it created/updated? (simplified version)
|
||||||
To compute it, we have access to (mainly) two inputs:
|
To compute it, we have access to (mainly) two inputs:
|
||||||
@ -28,13 +28,13 @@ horror cathedral 4 -> docids5: [1, 2]
|
|||||||
|
|
||||||
I illustrate a simplified version of the algorithm to create the word-prefix-pair-proximity database below:
|
I illustrate a simplified version of the algorithm to create the word-prefix-pair-proximity database below:
|
||||||
|
|
||||||
1. ==Outer loop:== First, we iterate over each word pair and its proximity:
|
1. **Outer loop:** First, we iterate over each word pair and its proximity:
|
||||||
```
|
```
|
||||||
word1 : good
|
word1 : good
|
||||||
word2 : dog
|
word2 : dog
|
||||||
proximity: 3
|
proximity: 3
|
||||||
```
|
```
|
||||||
2. ==Inner loop:== Then, we iterate over all the prefixes of `word2` that are in the list of sorted prefixes. And we insert the key (`prefix`, `proximity`) and the value (`docids`) to a sorted map which we call the “batch”. For example, at the end of the first inner loop, we may have:
|
2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are in the list of sorted prefixes, and we insert the key (`prefix`, `proximity`) and the value (`docids`) into a sorted map which we call the “batch”. For example, at the end of the first inner loop, we may have:
|
||||||
```
|
```
|
||||||
Outer loop 1:
|
Outer loop 1:
|
||||||
------------------------------
|
------------------------------
|
||||||
@ -108,7 +108,7 @@ Because `word2` begins with a different letter than the previous `word2`, we kno
|
|||||||
2. And therefore, every instance of (`word2`, `prefix`) will be greater than any element in the batch.
|
2. And therefore, every instance of (`word2`, `prefix`) will be greater than any element in the batch.
|
||||||
Therefore, we know that we can insert every element from the batch into the database before proceeding any further. This operation is called “flushing the batch”. Flushing the batch should also be done whenever `word1` is different than the previous `word1`.
|
Therefore, we know that we can insert every element from the batch into the database before proceeding any further. This operation is called “flushing the batch”. Flushing the batch should also be done whenever `word1` is different than the previous `word1`.
|
||||||
|
|
||||||
6. ==Flushing the batch==: to flush the batch, we look at the `word1` and iterate over the elements of the batch in sorted order:
|
6. **Flushing the batch:** to flush the batch, we look at the `word1` and iterate over the elements of the batch in sorted order:
|
||||||
```
|
```
|
||||||
Flushing Batch loop 1:
|
Flushing Batch loop 1:
|
||||||
------------------------------
|
------------------------------
|
||||||
|
Loading…
Reference in New Issue
Block a user