Change encoding of word_pair_proximity DB to (proximity, word1, word2)

Same for word_prefix_pair_proximity
This commit is contained in:
Loïc Lecrenier 2022-09-14 13:54:12 +02:00 committed by Loïc Lecrenier
parent 19b2326f3d
commit bdeb47305e
6 changed files with 130 additions and 179 deletions

View File

@ -7,12 +7,11 @@ impl<'a> heed::BytesDecode<'a> for StrStrU8Codec {
type DItem = (&'a str, &'a str, u8);
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let (n, bytes) = bytes.split_last()?;
let (n, bytes) = bytes.split_first()?;
let s1_end = bytes.iter().position(|b| *b == 0)?;
let (s1_bytes, rest) = bytes.split_at(s1_end);
let rest = &rest[1..];
let s2_bytes = &rest[1..];
let s1 = str::from_utf8(s1_bytes).ok()?;
let (_, s2_bytes) = rest.split_last()?;
let s2 = str::from_utf8(s2_bytes).ok()?;
Some((s1, s2, *n))
}
@ -22,12 +21,11 @@ impl<'a> heed::BytesEncode<'a> for StrStrU8Codec {
type EItem = (&'a str, &'a str, u8);
fn bytes_encode((s1, s2, n): &Self::EItem) -> Option<Cow<[u8]>> {
let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1 + 1);
let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1);
bytes.push(*n);
bytes.extend_from_slice(s1.as_bytes());
bytes.push(0);
bytes.extend_from_slice(s2.as_bytes());
bytes.push(0);
bytes.push(*n);
Some(Cow::Owned(bytes))
}
}
@ -37,11 +35,10 @@ impl<'a> heed::BytesDecode<'a> for UncheckedStrStrU8Codec {
type DItem = (&'a [u8], &'a [u8], u8);
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let (n, bytes) = bytes.split_last()?;
let (n, bytes) = bytes.split_first()?;
let s1_end = bytes.iter().position(|b| *b == 0)?;
let (s1_bytes, rest) = bytes.split_at(s1_end);
let rest = &rest[1..];
let (_, s2_bytes) = rest.split_last()?;
let s2_bytes = &rest[1..];
Some((s1_bytes, s2_bytes, *n))
}
}
@ -50,12 +47,11 @@ impl<'a> heed::BytesEncode<'a> for UncheckedStrStrU8Codec {
type EItem = (&'a [u8], &'a [u8], u8);
fn bytes_encode((s1, s2, n): &Self::EItem) -> Option<Cow<[u8]>> {
let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1 + 1);
let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1);
bytes.push(*n);
bytes.extend_from_slice(s1);
bytes.push(0);
bytes.extend_from_slice(s2);
bytes.push(0);
bytes.push(*n);
Some(Cow::Owned(bytes))
}
}

View File

@ -194,7 +194,7 @@ pub fn snap_word_prefix_pair_proximity_docids(index: &Index) -> String {
(word1, prefix, proximity),
b,
)| {
&format!("{word1:<16} {prefix:<4} {proximity:<2} {}", display_bitmap(&b))
&format!("{proximity:<2} {word1:<16} {prefix:<4} {}", display_bitmap(&b))
});
snap
}

View File

@ -151,11 +151,10 @@ fn document_word_positions_into_sorter<'b>(
let mut key_buffer = Vec::new();
for ((w1, w2), prox) in word_pair_proximity {
key_buffer.clear();
key_buffer.push(prox as u8);
key_buffer.extend_from_slice(w1.as_bytes());
key_buffer.push(0);
key_buffer.extend_from_slice(w2.as_bytes());
key_buffer.push(0);
key_buffer.push(prox as u8);
word_pair_proximity_docids_sorter.insert(&key_buffer, &document_id.to_ne_bytes())?;
}

View File

@ -1,46 +1,46 @@
---
source: milli/src/update/word_prefix_pair_proximity_docids.rs
---
5 a 1 [101, ]
5 a 2 [101, ]
5 b 4 [101, ]
5 be 4 [101, ]
am a 3 [101, ]
amazing a 1 [100, ]
amazing a 2 [100, ]
amazing a 3 [100, ]
amazing b 2 [100, ]
amazing be 2 [100, ]
an a 1 [100, ]
an a 2 [100, ]
an b 3 [100, ]
an be 3 [100, ]
and a 2 [100, ]
and a 3 [100, ]
and a 4 [100, ]
and b 1 [100, ]
and be 1 [100, ]
at a 1 [100, ]
at a 2 [100, 101, ]
at a 3 [100, ]
at b 3 [101, ]
at b 4 [100, ]
at be 3 [101, ]
at be 4 [100, ]
beautiful a 2 [100, ]
beautiful a 3 [100, ]
beautiful a 4 [100, ]
bell a 2 [101, ]
bell a 4 [101, ]
house a 3 [100, ]
house a 4 [100, ]
house b 2 [100, ]
house be 2 [100, ]
rings a 1 [101, ]
rings a 3 [101, ]
rings b 2 [101, ]
rings be 2 [101, ]
the a 3 [101, ]
the b 1 [101, ]
the be 1 [101, ]
1 5 a [101, ]
1 amazing a [100, ]
1 an a [100, ]
1 and b [100, ]
1 and be [100, ]
1 at a [100, ]
1 rings a [101, ]
1 the b [101, ]
1 the be [101, ]
2 5 a [101, ]
2 amazing a [100, ]
2 amazing b [100, ]
2 amazing be [100, ]
2 an a [100, ]
2 and a [100, ]
2 at a [100, 101, ]
2 beautiful a [100, ]
2 bell a [101, ]
2 house b [100, ]
2 house be [100, ]
2 rings b [101, ]
2 rings be [101, ]
3 am a [101, ]
3 amazing a [100, ]
3 an b [100, ]
3 an be [100, ]
3 and a [100, ]
3 at a [100, ]
3 at b [101, ]
3 at be [101, ]
3 beautiful a [100, ]
3 house a [100, ]
3 rings a [101, ]
3 the a [101, ]
4 5 b [101, ]
4 5 be [101, ]
4 and a [100, ]
4 at b [100, ]
4 at be [100, ]
4 beautiful a [100, ]
4 bell a [101, ]
4 house a [100, ]

View File

@ -1,4 +1,4 @@
---
source: milli/src/update/word_prefix_pair_proximity_docids.rs
---
5ed4bf83317b10962a55ade353427bdd
fb88e49fd666886731b62baef8f44995

View File

@ -1,7 +1,7 @@
/*!
## What is WordPrefixPairProximityDocids?
The word-prefix-pair-proximity-docids database is a database whose keys are of
the form (`word`, `prefix`, `proximity`) and the values are roaring bitmaps of
the form `(proximity, word, prefix)` and the values are roaring bitmaps of
the documents which contain `word` followed by another word starting with
`prefix` at a distance of `proximity`.
@ -23,127 +23,100 @@ dog
Note that only prefixes which correspond to more than a certain number of
different words from the database are included in this list.
* a sorted list of word pairs and the distance between them (i.e. proximity),
* associated with a roaring bitmap, such as:
* a sorted list of proximities and word pairs (the proximity is the distance between the two words),
associated with a roaring bitmap, such as:
```text
good dog 3 -> docids1: [2, 5, 6]
good doggo 1 -> docids2: [8]
good dogma 1 -> docids3: [7, 19, 20]
good ghost 2 -> docids4: [1]
horror cathedral 4 -> docids5: [1, 2]
1 good doggo -> docids1: [8]
1 good door -> docids2: [7, 19, 20]
1 good ghost -> docids3: [1]
2 good dog -> docids4: [2, 5, 6]
2 horror cathedral -> docids5: [1, 2]
```
I illustrate a simplified version of the algorithm to create the word-prefix
pair-proximity database below:
1. **Outer loop:** First, we iterate over each word pair and its proximity:
1. **Outer loop:** First, we iterate over each proximity and word pair:
```text
proximity: 1
word1 : good
word2 : dog
proximity: 3
word2 : doggo
```
2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are
in the list of sorted prefixes. And we insert the key (`prefix`, `proximity`)
in the list of sorted prefixes. And we insert the key `prefix`
and the value (`docids`) to a sorted map which we call the batch. For example,
at the end of the first inner loop, we may have:
```text
Outer loop 1:
------------------------------
proximity: 1
word1 : good
word2 : dog
proximity: 3
word2 : doggo
docids : docids1
prefixes: [d, do, dog]
batch: [
(d, 3) -> [docids1]
(do, 3) -> [docids1]
(dog, 3) -> [docids1]
d, -> [docids1]
do -> [docids1]
dog -> [docids1]
]
```
3. For illustration purpose, let's run through a second iteration of the outer loop:
```text
Outer loop 2:
------------------------------
word1 : good
word2 : doggo
proximity: 1
word1 : good
word2 : door
docids : docids2
prefixes: [d, do, dog]
prefixes: [d, do, doo]
batch: [
(d, 1) -> [docids2]
(d, 3) -> [docids1]
(do, 1) -> [docids2]
(do, 3) -> [docids1]
(dog, 1) -> [docids2]
(dog, 3) -> [docids1]
]
```
Notice that the batch had to re-order some (`prefix`, `proximity`) keys: some
of the elements inserted in the second iteration of the outer loop appear
*before* elements from the first iteration.
4. And a third:
```text
Outer loop 3:
------------------------------
word1 : good
word2 : dogma
proximity: 1
docids : docids3
prefixes: [d, do, dog]
batch: [
(d, 1) -> [docids2, docids3]
(d, 3) -> [docids1]
(do, 1) -> [docids2, docids3]
(do, 3) -> [docids1]
(dog, 1) -> [docids2, docids3]
(dog, 3) -> [docids1]
d -> [docids1, docids2]
do -> [docids1, docids2]
dog -> [docids1]
doo -> [docids2]
]
```
Notice that there were some conflicts which were resolved by merging the
conflicting values together.
conflicting values together. Also, an additional prefix was added at the
end of the batch.
5. On the fourth iteration of the outer loop, we have:
4. On the third iteration of the outer loop, we have:
```text
Outer loop 4:
------------------------------
proximity: 1
word1 : good
word2 : ghost
proximity: 2
```
Because `word2` begins with a different letter than the previous `word2`,
we know that:
1. All the prefixes of `word2` are greater than the prefixes of the previous word2
2. And therefore, every instance of (`word2`, `prefix`) will be greater than
any element in the batch.
we know that all the prefixes of `word2` are greater than the prefixes of the previous word2
Therefore, we know that we can insert every element from the batch into the
database before proceeding any further. This operation is called
flushing the batch. Flushing the batch should also be done whenever `word1`
is different than the previous `word1`.
flushing the batch. Flushing the batch should also be done whenever:
* `proximity` is different than the previous `proximity`.
* `word1` is different than the previous `word1`.
* `word2` starts with a different letter than the previous word2
6. **Flushing the batch:** to flush the batch, we look at the `word1` and
iterate over the elements of the batch in sorted order:
6. **Flushing the batch:** to flush the batch, we iterate over its elements:
```text
Flushing Batch loop 1:
------------------------------
word1 : good
word2 : d
proximity : 1
word1 : good
prefix : d
docids : [docids2, docids3]
```
We then merge the array of `docids` (of type `Vec<Vec<u8>>`) using
`merge_cbo_roaring_bitmap` in order to get a single byte vector representing a
roaring bitmap of all the document ids where `word1` is followed by `prefix`
at a distance of `proximity`.
Once we have done that, we insert (`word1`, `prefix`, `proximity`) -> `merged_docids`
Once we have done that, we insert `(proximity, word1, prefix) -> merged_docids`
into the database.
7. That's it! ... except...
@ -184,8 +157,8 @@ Note, also, that since we read data from the database when iterating over
`word_pairs_db`, we cannot insert the computed word-prefix-pair-proximity-
docids from the batch directly into the database (we would have a concurrent
reader and writer). Therefore, when calling the algorithm on
(`new_prefixes`, `word_pairs_db`), we insert the computed
((`word`, `prefix`, `proximity`), `docids`) elements in an intermediary grenad
`(new_prefixes, word_pairs_db)`, we insert the computed
`((proximity, word, prefix), docids)` elements in an intermediary grenad
Writer instead of the DB. At the end of the outer loop, we finally read from
the grenad and insert its elements in the database.
@ -406,7 +379,7 @@ fn execute_on_word_pairs_and_prefixes<I>(
while let Some(((word1, word2, proximity), data)) = next_word_pair_proximity(iter)? {
// skip this iteration if the proximity is over the threshold
if proximity > max_proximity {
continue;
break;
};
let word2_start_different_than_prev = word2[0] != prev_word2_start;
// if there were no potential prefixes for the previous word2 based on its first letter,
@ -416,16 +389,21 @@ fn execute_on_word_pairs_and_prefixes<I>(
continue;
}
// if word1 is different than the previous word1 OR if the start of word2 is different
// than the previous start of word2, then we'll need to flush the batch
// if the proximity is different to the previous one, OR
// if word1 is different than the previous word1, OR
// if the start of word2 is different than the previous start of word2,
// THEN we'll need to flush the batch
let prox_different_than_prev = proximity != batch.proximity;
let word1_different_than_prev = word1 != batch.word1;
if word1_different_than_prev || word2_start_different_than_prev {
if prox_different_than_prev || word1_different_than_prev || word2_start_different_than_prev
{
batch.flush(&mut merge_buffer, &mut insert)?;
// don't forget to reset the value of batch.word1 and prev_word2_start
if word1_different_than_prev {
prefix_search_start.0 = 0;
batch.word1.clear();
batch.word1.extend_from_slice(word1);
batch.proximity = proximity;
}
if word2_start_different_than_prev {
// word2_start_different_than_prev == true
@ -437,74 +415,70 @@ fn execute_on_word_pairs_and_prefixes<I>(
if !empty_prefixes {
// All conditions are satisfied, we can now insert each new prefix of word2 into the batch
prefix_buffer.clear();
prefixes.for_each_prefix_of(
word2,
&mut prefix_buffer,
&prefix_search_start,
|prefix_buffer| {
let prefix_len = prefix_buffer.len();
prefix_buffer.push(0);
prefix_buffer.push(proximity);
batch.insert(&prefix_buffer, data.to_vec());
prefix_buffer.truncate(prefix_len);
},
);
prefix_buffer.clear();
}
}
batch.flush(&mut merge_buffer, &mut insert)?;
Ok(())
}
/**
A map structure whose keys are (prefix, proximity) and whose values are vectors of bitstrings (serialized roaring bitmaps).
A map structure whose keys are prefixes and whose values are vectors of bitstrings (serialized roaring bitmaps).
The keys are sorted and conflicts are resolved by merging the vectors of bitstrings together.
It is used to ensure that all ((word1, prefix, proximity), docids) are inserted into the database in sorted order and efficiently.
It is used to ensure that all ((proximity, word1, prefix), docids) are inserted into the database in sorted order and efficiently.
The batch is flushed as often as possible, when we are sure that every (word1, prefix, proximity) key derived from its content
The batch is flushed as often as possible, when we are sure that every (proximity, word1, prefix) key derived from its content
can be inserted into the database in sorted order. When it is flushed, it calls a user-provided closure with the following arguments:
- key : (word1, prefix, proximity) as bytes
- value : merged roaring bitmaps from all values associated with (prefix, proximity) in the batch, serialised to bytes
- key : (proximity, word1, prefix) as bytes
- value : merged roaring bitmaps from all values associated with prefix in the batch, serialised to bytes
*/
#[derive(Default)]
struct PrefixAndProximityBatch {
proximity: u8,
word1: Vec<u8>,
batch: Vec<(Vec<u8>, Vec<Cow<'static, [u8]>>)>,
}
impl PrefixAndProximityBatch {
/// Insert the new key and value into the batch
///
/// The key must either exist in the batch or be greater than all existing keys
fn insert(&mut self, new_key: &[u8], new_value: Vec<u8>) {
match self.batch.binary_search_by_key(&new_key, |(k, _)| k.as_slice()) {
Ok(position) => {
self.batch[position].1.push(Cow::Owned(new_value));
}
Err(position) => {
self.batch.insert(position, (new_key.to_vec(), vec![Cow::Owned(new_value)]));
}
match self.batch.iter_mut().find(|el| el.0 == new_key) {
Some((_prefix, docids)) => docids.push(Cow::Owned(new_value)),
None => self.batch.push((new_key.to_vec(), vec![Cow::Owned(new_value)])),
}
}
/// Empties the batch, calling `insert` on each element.
///
/// The key given to `insert` is `(word1, prefix, proximity)` and the value is the associated merged roaring bitmap.
/// The key given to `insert` is `(proximity, word1, prefix)` and the value is the associated merged roaring bitmap.
fn flush(
&mut self,
merge_buffer: &mut Vec<u8>,
insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>,
) -> Result<()> {
let PrefixAndProximityBatch { word1, batch } = self;
let PrefixAndProximityBatch { proximity, word1, batch } = self;
if batch.is_empty() {
return Ok(());
}
merge_buffer.clear();
let mut buffer = Vec::with_capacity(word1.len() + 1 + 6 + 1);
let mut buffer = Vec::with_capacity(word1.len() + 1 + 6);
buffer.push(*proximity);
buffer.extend_from_slice(word1);
buffer.push(0);
for (key, mergeable_data) in batch.drain(..) {
buffer.truncate(word1.len() + 1);
buffer.truncate(1 + word1.len() + 1);
buffer.extend_from_slice(key.as_slice());
let data = if mergeable_data.len() > 1 {
@ -884,51 +858,33 @@ mod tests {
CboRoaringBitmapCodec::serialize_into(&bitmap_ranges, &mut serialised_bitmap_ranges);
let word_pairs = [
// 1, 3: (healthy arb 2) and (healthy arbre 2) with (bitmap123 | bitmap456)
(("healthy", "arbre", 2), &serialised_bitmap123),
// not inserted because 3 > max_proximity
(("healthy", "arbre", 3), &serialised_bitmap456),
// 0, 2: (healthy arb 1) and (healthy arbre 1) with (bitmap123)
(("healthy", "arbres", 1), &serialised_bitmap123),
// 1, 3:
(("healthy", "arbres", 2), &serialised_bitmap456),
// not be inserted because 3 > max_proximity
(("healthy", "arbres", 3), &serialised_bitmap789),
// not inserted because no prefixes for boat
(("healthy", "boat", 1), &serialised_bitmap123),
// not inserted because no prefixes for ca
(("healthy", "ca", 1), &serialised_bitmap123),
// 4: (healthy cat 1) with (bitmap456 + bitmap123)
(("healthy", "cats", 1), &serialised_bitmap456),
// 5: (healthy cat 2) with (bitmap789 + bitmap_ranges)
(("healthy", "cats", 2), &serialised_bitmap789),
// 4 + 6: (healthy catto 1) with (bitmap123)
(("healthy", "cattos", 1), &serialised_bitmap123),
// 5 + 7: (healthy catto 2) with (bitmap_ranges)
(("healthy", "cattos", 2), &serialised_bitmap_ranges),
// 8: (jittery cat 1) with (bitmap123 | bitmap456 | bitmap789 | bitmap_ranges)
(("jittery", "cat", 1), &serialised_bitmap123),
// 8:
(("jittery", "cata", 1), &serialised_bitmap456),
// 8:
(("jittery", "catb", 1), &serialised_bitmap789),
// 8:
(("jittery", "catc", 1), &serialised_bitmap_ranges),
(("healthy", "arbre", 2), &serialised_bitmap123),
(("healthy", "arbres", 2), &serialised_bitmap456),
(("healthy", "cats", 2), &serialised_bitmap789),
(("healthy", "cattos", 2), &serialised_bitmap_ranges),
(("healthy", "arbre", 3), &serialised_bitmap456),
(("healthy", "arbres", 3), &serialised_bitmap789),
];
let expected_result = [
// first batch:
(("healthy", "arb", 1), bitmap123.clone()),
(("healthy", "arb", 2), &bitmap123 | &bitmap456),
(("healthy", "arbre", 1), bitmap123.clone()),
(("healthy", "arbre", 2), &bitmap123 | &bitmap456),
// second batch:
(("healthy", "cat", 1), &bitmap456 | &bitmap123),
(("healthy", "cat", 2), &bitmap789 | &bitmap_ranges),
(("healthy", "catto", 1), bitmap123.clone()),
(("healthy", "catto", 2), bitmap_ranges.clone()),
// third batch
(("jittery", "cat", 1), (&bitmap123 | &bitmap456 | &bitmap789 | &bitmap_ranges)),
(("healthy", "arb", 2), &bitmap123 | &bitmap456),
(("healthy", "arbre", 2), &bitmap123 | &bitmap456),
(("healthy", "cat", 2), &bitmap789 | &bitmap_ranges),
(("healthy", "catto", 2), bitmap_ranges.clone()),
];
let mut result = vec![];