From bdeb47305e52dfbeccc5cabc10ffccdd94054759 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 14 Sep 2022 13:54:12 +0200 Subject: [PATCH 01/15] Change encoding of word_pair_proximity DB to (proximity, word1, word2) Same for word_prefix_pair_proximity --- milli/src/heed_codec/str_str_u8_codec.rs | 20 +- milli/src/snapshot_tests.rs | 2 +- .../extract_word_pair_proximity_docids.rs | 3 +- .../word_prefix_pair_proximity_docids.snap | 84 ++++---- ...ord_prefix_pair_proximity_docids.hash.snap | 2 +- .../word_prefix_pair_proximity_docids.rs | 198 +++++++----------- 6 files changed, 130 insertions(+), 179 deletions(-) diff --git a/milli/src/heed_codec/str_str_u8_codec.rs b/milli/src/heed_codec/str_str_u8_codec.rs index 888e08752..6cfff3ecf 100644 --- a/milli/src/heed_codec/str_str_u8_codec.rs +++ b/milli/src/heed_codec/str_str_u8_codec.rs @@ -7,12 +7,11 @@ impl<'a> heed::BytesDecode<'a> for StrStrU8Codec { type DItem = (&'a str, &'a str, u8); fn bytes_decode(bytes: &'a [u8]) -> Option { - let (n, bytes) = bytes.split_last()?; + let (n, bytes) = bytes.split_first()?; let s1_end = bytes.iter().position(|b| *b == 0)?; let (s1_bytes, rest) = bytes.split_at(s1_end); - let rest = &rest[1..]; + let s2_bytes = &rest[1..]; let s1 = str::from_utf8(s1_bytes).ok()?; - let (_, s2_bytes) = rest.split_last()?; let s2 = str::from_utf8(s2_bytes).ok()?; Some((s1, s2, *n)) } @@ -22,12 +21,11 @@ impl<'a> heed::BytesEncode<'a> for StrStrU8Codec { type EItem = (&'a str, &'a str, u8); fn bytes_encode((s1, s2, n): &Self::EItem) -> Option> { - let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1 + 1); + let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1); + bytes.push(*n); bytes.extend_from_slice(s1.as_bytes()); bytes.push(0); bytes.extend_from_slice(s2.as_bytes()); - bytes.push(0); - bytes.push(*n); Some(Cow::Owned(bytes)) } } @@ -37,11 +35,10 @@ impl<'a> heed::BytesDecode<'a> for UncheckedStrStrU8Codec { type DItem = (&'a [u8], &'a [u8], u8); fn 
bytes_decode(bytes: &'a [u8]) -> Option { - let (n, bytes) = bytes.split_last()?; + let (n, bytes) = bytes.split_first()?; let s1_end = bytes.iter().position(|b| *b == 0)?; let (s1_bytes, rest) = bytes.split_at(s1_end); - let rest = &rest[1..]; - let (_, s2_bytes) = rest.split_last()?; + let s2_bytes = &rest[1..]; Some((s1_bytes, s2_bytes, *n)) } } @@ -50,12 +47,11 @@ impl<'a> heed::BytesEncode<'a> for UncheckedStrStrU8Codec { type EItem = (&'a [u8], &'a [u8], u8); fn bytes_encode((s1, s2, n): &Self::EItem) -> Option> { - let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1 + 1); + let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1); + bytes.push(*n); bytes.extend_from_slice(s1); bytes.push(0); bytes.extend_from_slice(s2); - bytes.push(0); - bytes.push(*n); Some(Cow::Owned(bytes)) } } diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index eac3340fd..17f490758 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -194,7 +194,7 @@ pub fn snap_word_prefix_pair_proximity_docids(index: &Index) -> String { (word1, prefix, proximity), b, )| { - &format!("{word1:<16} {prefix:<4} {proximity:<2} {}", display_bitmap(&b)) + &format!("{proximity:<2} {word1:<16} {prefix:<4} {}", display_bitmap(&b)) }); snap } diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index 9448f0e23..3837c1bbe 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -151,11 +151,10 @@ fn document_word_positions_into_sorter<'b>( let mut key_buffer = Vec::new(); for ((w1, w2), prox) in word_pair_proximity { key_buffer.clear(); + key_buffer.push(prox as u8); key_buffer.extend_from_slice(w1.as_bytes()); key_buffer.push(0); key_buffer.extend_from_slice(w2.as_bytes()); - key_buffer.push(0); - 
key_buffer.push(prox as u8); word_pair_proximity_docids_sorter.insert(&key_buffer, &document_id.to_ne_bytes())?; } diff --git a/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_prefix_pair_proximity_docids.snap index 0a61cf4e8..47a6df343 100644 --- a/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_prefix_pair_proximity_docids.snap +++ b/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_prefix_pair_proximity_docids.snap @@ -1,46 +1,46 @@ --- source: milli/src/update/word_prefix_pair_proximity_docids.rs --- -5 a 1 [101, ] -5 a 2 [101, ] -5 b 4 [101, ] -5 be 4 [101, ] -am a 3 [101, ] -amazing a 1 [100, ] -amazing a 2 [100, ] -amazing a 3 [100, ] -amazing b 2 [100, ] -amazing be 2 [100, ] -an a 1 [100, ] -an a 2 [100, ] -an b 3 [100, ] -an be 3 [100, ] -and a 2 [100, ] -and a 3 [100, ] -and a 4 [100, ] -and b 1 [100, ] -and be 1 [100, ] -at a 1 [100, ] -at a 2 [100, 101, ] -at a 3 [100, ] -at b 3 [101, ] -at b 4 [100, ] -at be 3 [101, ] -at be 4 [100, ] -beautiful a 2 [100, ] -beautiful a 3 [100, ] -beautiful a 4 [100, ] -bell a 2 [101, ] -bell a 4 [101, ] -house a 3 [100, ] -house a 4 [100, ] -house b 2 [100, ] -house be 2 [100, ] -rings a 1 [101, ] -rings a 3 [101, ] -rings b 2 [101, ] -rings be 2 [101, ] -the a 3 [101, ] -the b 1 [101, ] -the be 1 [101, ] +1 5 a [101, ] +1 amazing a [100, ] +1 an a [100, ] +1 and b [100, ] +1 and be [100, ] +1 at a [100, ] +1 rings a [101, ] +1 the b [101, ] +1 the be [101, ] +2 5 a [101, ] +2 amazing a [100, ] +2 amazing b [100, ] +2 amazing be [100, ] +2 an a [100, ] +2 and a [100, ] +2 at a [100, 101, ] +2 beautiful a [100, ] +2 bell a [101, ] +2 house b [100, ] +2 house be [100, ] +2 rings b [101, ] +2 rings be [101, ] +3 am a [101, ] +3 amazing a [100, ] +3 an b [100, ] +3 an be [100, ] 
+3 and a [100, ] +3 at a [100, ] +3 at b [101, ] +3 at be [101, ] +3 beautiful a [100, ] +3 house a [100, ] +3 rings a [101, ] +3 the a [101, ] +4 5 b [101, ] +4 5 be [101, ] +4 and a [100, ] +4 at b [100, ] +4 at be [100, ] +4 beautiful a [100, ] +4 bell a [101, ] +4 house a [100, ] diff --git a/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap b/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap index a39ee07b5..bb2cc3b84 100644 --- a/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap +++ b/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap @@ -1,4 +1,4 @@ --- source: milli/src/update/word_prefix_pair_proximity_docids.rs --- -5ed4bf83317b10962a55ade353427bdd +fb88e49fd666886731b62baef8f44995 diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index 724858e4f..f919aecc7 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -1,7 +1,7 @@ /*! ## What is WordPrefixPairProximityDocids? The word-prefix-pair-proximity-docids database is a database whose keys are of -the form (`word`, `prefix`, `proximity`) and the values are roaring bitmaps of +the form `(proximity, word, prefix)` and the values are roaring bitmaps of the documents which contain `word` followed by another word starting with `prefix` at a distance of `proximity`. @@ -23,127 +23,100 @@ dog Note that only prefixes which correspond to more than a certain number of different words from the database are included in this list. -* a sorted list of word pairs and the distance between them (i.e. 
proximity), -* associated with a roaring bitmap, such as: +* a sorted list of proximities and word pairs (the proximity is the distance between the two words), +associated with a roaring bitmap, such as: ```text -good dog 3 -> docids1: [2, 5, 6] -good doggo 1 -> docids2: [8] -good dogma 1 -> docids3: [7, 19, 20] -good ghost 2 -> docids4: [1] -horror cathedral 4 -> docids5: [1, 2] +1 good doggo -> docids1: [8] +1 good door -> docids2: [7, 19, 20] +1 good ghost -> docids3: [1] +2 good dog -> docids4: [2, 5, 6] +2 horror cathedral -> docids5: [1, 2] ``` I illustrate a simplified version of the algorithm to create the word-prefix pair-proximity database below: -1. **Outer loop:** First, we iterate over each word pair and its proximity: +1. **Outer loop:** First, we iterate over each proximity and word pair: ```text +proximity: 1 word1 : good -word2 : dog -proximity: 3 +word2 : doggo ``` 2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are -in the list of sorted prefixes. And we insert the key (`prefix`, `proximity`) +in the list of sorted prefixes. And we insert the key `prefix` and the value (`docids`) to a sorted map which we call the “batch”. For example, at the end of the first inner loop, we may have: ```text Outer loop 1: ------------------------------ +proximity: 1 word1 : good -word2 : dog -proximity: 3 +word2 : doggo docids : docids1 prefixes: [d, do, dog] batch: [ - (d, 3) -> [docids1] - (do, 3) -> [docids1] - (dog, 3) -> [docids1] + d -> [docids1] + do -> [docids1] + dog -> [docids1] ] ``` 3. 
For illustration purpose, let's run through a second iteration of the outer loop: ```text Outer loop 2: ------------------------------ -word1 : good -word2 : doggo proximity: 1 +word1 : good +word2 : door docids : docids2 -prefixes: [d, do, dog] +prefixes: [d, do, doo] batch: [ - (d, 1) -> [docids2] - (d, 3) -> [docids1] - (do, 1) -> [docids2] - (do, 3) -> [docids1] - (dog, 1) -> [docids2] - (dog, 3) -> [docids1] -] -``` -Notice that the batch had to re-order some (`prefix`, `proximity`) keys: some -of the elements inserted in the second iteration of the outer loop appear -*before* elements from the first iteration. - -4. And a third: -```text -Outer loop 3: ------------------------------- -word1 : good -word2 : dogma -proximity: 1 -docids : docids3 - -prefixes: [d, do, dog] - -batch: [ - (d, 1) -> [docids2, docids3] - (d, 3) -> [docids1] - (do, 1) -> [docids2, docids3] - (do, 3) -> [docids1] - (dog, 1) -> [docids2, docids3] - (dog, 3) -> [docids1] + d -> [docids1, docids2] + do -> [docids1, docids2] + dog -> [docids1] + doo -> [docids2] ] ``` Notice that there were some conflicts which were resolved by merging the -conflicting values together. +conflicting values together. Also, an additional prefix was added at the +end of the batch. -5. On the fourth iteration of the outer loop, we have: +4. On the third iteration of the outer loop, we have: ```text Outer loop 4: ------------------------------ +proximity: 1 word1 : good word2 : ghost -proximity: 2 ``` Because `word2` begins with a different letter than the previous `word2`, -we know that: - -1. All the prefixes of `word2` are greater than the prefixes of the previous word2 -2. And therefore, every instance of (`word2`, `prefix`) will be greater than -any element in the batch. +we know that all the prefixes of `word2` are greater than the prefixes of the previous word2 Therefore, we know that we can insert every element from the batch into the database before proceeding any further. 
This operation is called -“flushing the batch”. Flushing the batch should also be done whenever `word1` -is different than the previous `word1`. +“flushing the batch”. Flushing the batch should also be done whenever: +* `proximity` is different than the previous `proximity`. +* `word1` is different than the previous `word1`. +* `word2` starts with a different letter than the previous word2 -6. **Flushing the batch:** to flush the batch, we look at the `word1` and -iterate over the elements of the batch in sorted order: +6. **Flushing the batch:** to flush the batch, we iterate over its elements: ```text Flushing Batch loop 1: ------------------------------ -word1 : good -word2 : d -proximity: 1 +proximity : 1 +word1 : good +prefix : d + docids : [docids2, docids3] ``` We then merge the array of `docids` (of type `Vec>`) using `merge_cbo_roaring_bitmap` in order to get a single byte vector representing a roaring bitmap of all the document ids where `word1` is followed by `prefix` at a distance of `proximity`. -Once we have done that, we insert (`word1`, `prefix`, `proximity`) -> `merged_docids` +Once we have done that, we insert `(proximity, word1, prefix) -> merged_docids` into the database. 7. That's it! ... except... @@ -184,8 +157,8 @@ Note, also, that since we read data from the database when iterating over `word_pairs_db`, we cannot insert the computed word-prefix-pair-proximity- docids from the batch directly into the database (we would have a concurrent reader and writer). Therefore, when calling the algorithm on -(`new_prefixes`, `word_pairs_db`), we insert the computed -((`word`, `prefix`, `proximity`), `docids`) elements in an intermediary grenad +`(new_prefixes, word_pairs_db)`, we insert the computed +`((proximity, word, prefix), docids)` elements in an intermediary grenad Writer instead of the DB. At the end of the outer loop, we finally read from the grenad and insert its elements in the database. 
@@ -406,7 +379,7 @@ fn execute_on_word_pairs_and_prefixes( while let Some(((word1, word2, proximity), data)) = next_word_pair_proximity(iter)? { // skip this iteration if the proximity is over the threshold if proximity > max_proximity { - continue; + break; }; let word2_start_different_than_prev = word2[0] != prev_word2_start; // if there were no potential prefixes for the previous word2 based on its first letter, @@ -416,16 +389,21 @@ fn execute_on_word_pairs_and_prefixes( continue; } - // if word1 is different than the previous word1 OR if the start of word2 is different - // than the previous start of word2, then we'll need to flush the batch + // if the proximity is different to the previous one, OR + // if word1 is different than the previous word1, OR + // if the start of word2 is different than the previous start of word2, + // THEN we'll need to flush the batch + let prox_different_than_prev = proximity != batch.proximity; let word1_different_than_prev = word1 != batch.word1; - if word1_different_than_prev || word2_start_different_than_prev { + if prox_different_than_prev || word1_different_than_prev || word2_start_different_than_prev + { batch.flush(&mut merge_buffer, &mut insert)?; // don't forget to reset the value of batch.word1 and prev_word2_start if word1_different_than_prev { prefix_search_start.0 = 0; batch.word1.clear(); batch.word1.extend_from_slice(word1); + batch.proximity = proximity; } if word2_start_different_than_prev { // word2_start_different_than_prev == true @@ -437,74 +415,70 @@ fn execute_on_word_pairs_and_prefixes( if !empty_prefixes { // All conditions are satisfied, we can now insert each new prefix of word2 into the batch + prefix_buffer.clear(); prefixes.for_each_prefix_of( word2, &mut prefix_buffer, &prefix_search_start, |prefix_buffer| { - let prefix_len = prefix_buffer.len(); - prefix_buffer.push(0); - prefix_buffer.push(proximity); batch.insert(&prefix_buffer, data.to_vec()); - prefix_buffer.truncate(prefix_len); }, ); - 
prefix_buffer.clear(); } } batch.flush(&mut merge_buffer, &mut insert)?; Ok(()) } /** -A map structure whose keys are (prefix, proximity) and whose values are vectors of bitstrings (serialized roaring bitmaps). +A map structure whose keys are prefixes and whose values are vectors of bitstrings (serialized roaring bitmaps). The keys are sorted and conflicts are resolved by merging the vectors of bitstrings together. -It is used to ensure that all ((word1, prefix, proximity), docids) are inserted into the database in sorted order and efficiently. +It is used to ensure that all ((proximity, word1, prefix), docids) are inserted into the database in sorted order and efficiently. -The batch is flushed as often as possible, when we are sure that every (word1, prefix, proximity) key derived from its content +The batch is flushed as often as possible, when we are sure that every (proximity, word1, prefix) key derived from its content can be inserted into the database in sorted order. When it is flushed, it calls a user-provided closure with the following arguments: -- key : (word1, prefix, proximity) as bytes -- value : merged roaring bitmaps from all values associated with (prefix, proximity) in the batch, serialised to bytes +- key : (proximity, word1, prefix) as bytes +- value : merged roaring bitmaps from all values associated with prefix in the batch, serialised to bytes */ #[derive(Default)] struct PrefixAndProximityBatch { + proximity: u8, word1: Vec, batch: Vec<(Vec, Vec>)>, } impl PrefixAndProximityBatch { /// Insert the new key and value into the batch + /// + /// The key must either exist in the batch or be greater than all existing keys fn insert(&mut self, new_key: &[u8], new_value: Vec) { - match self.batch.binary_search_by_key(&new_key, |(k, _)| k.as_slice()) { - Ok(position) => { - self.batch[position].1.push(Cow::Owned(new_value)); - } - Err(position) => { - self.batch.insert(position, (new_key.to_vec(), vec![Cow::Owned(new_value)])); - } + match 
self.batch.iter_mut().find(|el| el.0 == new_key) { + Some((_prefix, docids)) => docids.push(Cow::Owned(new_value)), + None => self.batch.push((new_key.to_vec(), vec![Cow::Owned(new_value)])), } } /// Empties the batch, calling `insert` on each element. /// - /// The key given to `insert` is `(word1, prefix, proximity)` and the value is the associated merged roaring bitmap. + /// The key given to `insert` is `(proximity, word1, prefix)` and the value is the associated merged roaring bitmap. fn flush( &mut self, merge_buffer: &mut Vec, insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>, ) -> Result<()> { - let PrefixAndProximityBatch { word1, batch } = self; + let PrefixAndProximityBatch { proximity, word1, batch } = self; if batch.is_empty() { return Ok(()); } merge_buffer.clear(); - let mut buffer = Vec::with_capacity(word1.len() + 1 + 6 + 1); + let mut buffer = Vec::with_capacity(word1.len() + 1 + 6); + buffer.push(*proximity); buffer.extend_from_slice(word1); buffer.push(0); for (key, mergeable_data) in batch.drain(..) 
{ - buffer.truncate(word1.len() + 1); + buffer.truncate(1 + word1.len() + 1); buffer.extend_from_slice(key.as_slice()); let data = if mergeable_data.len() > 1 { @@ -884,51 +858,33 @@ mod tests { CboRoaringBitmapCodec::serialize_into(&bitmap_ranges, &mut serialised_bitmap_ranges); let word_pairs = [ - // 1, 3: (healthy arb 2) and (healthy arbre 2) with (bitmap123 | bitmap456) - (("healthy", "arbre", 2), &serialised_bitmap123), - // not inserted because 3 > max_proximity - (("healthy", "arbre", 3), &serialised_bitmap456), - // 0, 2: (healthy arb 1) and (healthy arbre 1) with (bitmap123) (("healthy", "arbres", 1), &serialised_bitmap123), - // 1, 3: - (("healthy", "arbres", 2), &serialised_bitmap456), - // not be inserted because 3 > max_proximity - (("healthy", "arbres", 3), &serialised_bitmap789), - // not inserted because no prefixes for boat (("healthy", "boat", 1), &serialised_bitmap123), - // not inserted because no prefixes for ca (("healthy", "ca", 1), &serialised_bitmap123), - // 4: (healthy cat 1) with (bitmap456 + bitmap123) (("healthy", "cats", 1), &serialised_bitmap456), - // 5: (healthy cat 2) with (bitmap789 + bitmap_ranges) - (("healthy", "cats", 2), &serialised_bitmap789), - // 4 + 6: (healthy catto 1) with (bitmap123) (("healthy", "cattos", 1), &serialised_bitmap123), - // 5 + 7: (healthy catto 2) with (bitmap_ranges) - (("healthy", "cattos", 2), &serialised_bitmap_ranges), - // 8: (jittery cat 1) with (bitmap123 | bitmap456 | bitmap789 | bitmap_ranges) (("jittery", "cat", 1), &serialised_bitmap123), - // 8: (("jittery", "cata", 1), &serialised_bitmap456), - // 8: (("jittery", "catb", 1), &serialised_bitmap789), - // 8: (("jittery", "catc", 1), &serialised_bitmap_ranges), + (("healthy", "arbre", 2), &serialised_bitmap123), + (("healthy", "arbres", 2), &serialised_bitmap456), + (("healthy", "cats", 2), &serialised_bitmap789), + (("healthy", "cattos", 2), &serialised_bitmap_ranges), + (("healthy", "arbre", 3), &serialised_bitmap456), + (("healthy", 
"arbres", 3), &serialised_bitmap789), ]; let expected_result = [ - // first batch: (("healthy", "arb", 1), bitmap123.clone()), - (("healthy", "arb", 2), &bitmap123 | &bitmap456), (("healthy", "arbre", 1), bitmap123.clone()), - (("healthy", "arbre", 2), &bitmap123 | &bitmap456), - // second batch: (("healthy", "cat", 1), &bitmap456 | &bitmap123), - (("healthy", "cat", 2), &bitmap789 | &bitmap_ranges), (("healthy", "catto", 1), bitmap123.clone()), - (("healthy", "catto", 2), bitmap_ranges.clone()), - // third batch (("jittery", "cat", 1), (&bitmap123 | &bitmap456 | &bitmap789 | &bitmap_ranges)), + (("healthy", "arb", 2), &bitmap123 | &bitmap456), + (("healthy", "arbre", 2), &bitmap123 | &bitmap456), + (("healthy", "cat", 2), &bitmap789 | &bitmap_ranges), + (("healthy", "catto", 2), bitmap_ranges.clone()), ]; let mut result = vec![]; From 1dbbd8694feb66c07cb2eef2144ff785fba16604 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 14 Sep 2022 14:01:53 +0200 Subject: [PATCH 02/15] Rename StrStrU8Codec to U8StrStrCodec and reorder its fields --- milli/src/heed_codec/mod.rs | 2 +- milli/src/heed_codec/str_str_u8_codec.rs | 28 +++---- milli/src/index.rs | 6 +- milli/src/lib.rs | 2 +- milli/src/search/criteria/mod.rs | 4 +- milli/src/snapshot_tests.rs | 6 +- .../word_prefix_pair_proximity_docids.rs | 74 +++++++++---------- 7 files changed, 61 insertions(+), 61 deletions(-) diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs index f3691b7d8..e07e47c79 100644 --- a/milli/src/heed_codec/mod.rs +++ b/milli/src/heed_codec/mod.rs @@ -15,4 +15,4 @@ pub use self::roaring_bitmap_length::{ BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec, }; pub use self::str_beu32_codec::StrBEU32Codec; -pub use self::str_str_u8_codec::{StrStrU8Codec, UncheckedStrStrU8Codec}; +pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec}; diff --git a/milli/src/heed_codec/str_str_u8_codec.rs 
b/milli/src/heed_codec/str_str_u8_codec.rs index 6cfff3ecf..60be8ddc7 100644 --- a/milli/src/heed_codec/str_str_u8_codec.rs +++ b/milli/src/heed_codec/str_str_u8_codec.rs @@ -1,10 +1,10 @@ use std::borrow::Cow; use std::str; -pub struct StrStrU8Codec; +pub struct U8StrStrCodec; -impl<'a> heed::BytesDecode<'a> for StrStrU8Codec { - type DItem = (&'a str, &'a str, u8); +impl<'a> heed::BytesDecode<'a> for U8StrStrCodec { + type DItem = (u8, &'a str, &'a str); fn bytes_decode(bytes: &'a [u8]) -> Option { let (n, bytes) = bytes.split_first()?; @@ -13,14 +13,14 @@ impl<'a> heed::BytesDecode<'a> for StrStrU8Codec { let s2_bytes = &rest[1..]; let s1 = str::from_utf8(s1_bytes).ok()?; let s2 = str::from_utf8(s2_bytes).ok()?; - Some((s1, s2, *n)) + Some((*n, s1, s2)) } } -impl<'a> heed::BytesEncode<'a> for StrStrU8Codec { - type EItem = (&'a str, &'a str, u8); +impl<'a> heed::BytesEncode<'a> for U8StrStrCodec { + type EItem = (u8, &'a str, &'a str); - fn bytes_encode((s1, s2, n): &Self::EItem) -> Option> { + fn bytes_encode((n, s1, s2): &Self::EItem) -> Option> { let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1); bytes.push(*n); bytes.extend_from_slice(s1.as_bytes()); @@ -29,24 +29,24 @@ impl<'a> heed::BytesEncode<'a> for StrStrU8Codec { Some(Cow::Owned(bytes)) } } -pub struct UncheckedStrStrU8Codec; +pub struct UncheckedU8StrStrCodec; -impl<'a> heed::BytesDecode<'a> for UncheckedStrStrU8Codec { - type DItem = (&'a [u8], &'a [u8], u8); +impl<'a> heed::BytesDecode<'a> for UncheckedU8StrStrCodec { + type DItem = (u8, &'a [u8], &'a [u8]); fn bytes_decode(bytes: &'a [u8]) -> Option { let (n, bytes) = bytes.split_first()?; let s1_end = bytes.iter().position(|b| *b == 0)?; let (s1_bytes, rest) = bytes.split_at(s1_end); let s2_bytes = &rest[1..]; - Some((s1_bytes, s2_bytes, *n)) + Some((*n, s1_bytes, s2_bytes)) } } -impl<'a> heed::BytesEncode<'a> for UncheckedStrStrU8Codec { - type EItem = (&'a [u8], &'a [u8], u8); +impl<'a> heed::BytesEncode<'a> for UncheckedU8StrStrCodec 
{ + type EItem = (u8, &'a [u8], &'a [u8]); - fn bytes_encode((s1, s2, n): &Self::EItem) -> Option> { + fn bytes_encode((n, s1, s2): &Self::EItem) -> Option> { let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1); bytes.push(*n); bytes.extend_from_slice(s1); diff --git a/milli/src/index.rs b/milli/src/index.rs index 0dccabf03..f1bc2fa10 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -21,7 +21,7 @@ use crate::{ default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, - Search, StrBEU32Codec, StrStrU8Codec, BEU16, BEU32, + Search, StrBEU32Codec, U8StrStrCodec, BEU16, BEU32, }; pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5; @@ -106,9 +106,9 @@ pub struct Index { pub docid_word_positions: Database, /// Maps the proximity between a pair of words with all the docids where this relation appears. - pub word_pair_proximity_docids: Database, + pub word_pair_proximity_docids: Database, /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears. - pub word_prefix_pair_proximity_docids: Database, + pub word_prefix_pair_proximity_docids: Database, /// Maps the word and the position with the docids that corresponds to it. 
pub word_position_docids: Database, diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 517d28ccc..b5671b33b 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -37,7 +37,7 @@ pub use self::fields_ids_map::FieldsIdsMap; pub use self::heed_codec::{ BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldIdWordCountCodec, ObkvCodec, RoaringBitmapCodec, - RoaringBitmapLenCodec, StrBEU32Codec, StrStrU8Codec, UncheckedStrStrU8Codec, + RoaringBitmapLenCodec, StrBEU32Codec, U8StrStrCodec, UncheckedU8StrStrCodec, }; pub use self::index::Index; pub use self::search::{ diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 866eaefde..86cec1ddc 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -138,7 +138,7 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { right: &str, proximity: u8, ) -> heed::Result> { - let key = (left, right, proximity); + let key = (proximity, left, right); self.index.word_pair_proximity_docids.get(self.rtxn, &key) } @@ -148,7 +148,7 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { right: &str, proximity: u8, ) -> heed::Result> { - let key = (left, right, proximity); + let key = (proximity, left, right); self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key) } diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index 17f490758..b4eee7dfe 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -182,16 +182,16 @@ pub fn snap_docid_word_positions(index: &Index) -> String { } pub fn snap_word_pair_proximity_docids(index: &Index) -> String { let snap = make_db_snap_from_iter!(index, word_pair_proximity_docids, |( - (word1, word2, proximity), + (proximity, word1, word2), b, )| { - &format!("{word1:<16} {word2:<16} {proximity:<2} {}", display_bitmap(&b)) + &format!("{proximity:<2} {word1:<16} {word2:<16} {}", display_bitmap(&b)) }); snap } pub fn 
snap_word_prefix_pair_proximity_docids(index: &Index) -> String { let snap = make_db_snap_from_iter!(index, word_prefix_pair_proximity_docids, |( - (word1, prefix, proximity), + (proximity, word1, prefix), b, )| { &format!("{proximity:<2} {word1:<16} {prefix:<4} {}", display_bitmap(&b)) diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/word_prefix_pair_proximity_docids.rs index f919aecc7..77294296f 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/word_prefix_pair_proximity_docids.rs @@ -177,7 +177,7 @@ use log::debug; use crate::update::index_documents::{ create_writer, merge_cbo_roaring_bitmaps, CursorClonableMmap, }; -use crate::{CboRoaringBitmapCodec, Index, Result, UncheckedStrStrU8Codec}; +use crate::{CboRoaringBitmapCodec, Index, Result, UncheckedU8StrStrCodec}; pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, @@ -259,9 +259,9 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { &mut cursor, |cursor| { if let Some((key, value)) = cursor.move_on_next()? 
{ - let (word1, word2, proximity) = UncheckedStrStrU8Codec::bytes_decode(key) + let (proximity, word1, word2) = UncheckedU8StrStrCodec::bytes_decode(key) .ok_or(heed::Error::Decoding)?; - Ok(Some(((word1, word2, proximity), value))) + Ok(Some(((proximity, word1, word2), value))) } else { Ok(None) } @@ -293,7 +293,7 @@ impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { let mut db_iter = self .index .word_pair_proximity_docids - .remap_key_type::() + .remap_key_type::() .remap_data_type::() .iter(self.wtxn)?; @@ -358,7 +358,7 @@ fn execute_on_word_pairs_and_prefixes( mut next_word_pair_proximity: impl for<'a> FnMut( &'a mut I, ) -> Result< - Option<((&'a [u8], &'a [u8], u8), &'a [u8])>, + Option<((u8, &'a [u8], &'a [u8]), &'a [u8])>, >, prefixes: &PrefixTrieNode, max_proximity: u8, @@ -376,14 +376,14 @@ fn execute_on_word_pairs_and_prefixes( let mut prefix_buffer = Vec::with_capacity(8); let mut merge_buffer = Vec::with_capacity(65_536); - while let Some(((word1, word2, proximity), data)) = next_word_pair_proximity(iter)? { + while let Some(((proximity, word1, word2), data)) = next_word_pair_proximity(iter)? 
{ // skip this iteration if the proximity is over the threshold if proximity > max_proximity { break; }; let word2_start_different_than_prev = word2[0] != prev_word2_start; // if there were no potential prefixes for the previous word2 based on its first letter, - // and if the current word2 starts with the same letter, then there is also no potential + // and if the current word2 starts with the same letter, then there is also no potential // prefixes for the current word2, and we can skip to the next iteration if empty_prefixes && !word2_start_different_than_prev { continue; @@ -683,7 +683,7 @@ mod tests { use super::*; use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use crate::index::tests::TempIndex; - use crate::{db_snap, CboRoaringBitmapCodec, StrStrU8Codec}; + use crate::{db_snap, CboRoaringBitmapCodec, U8StrStrCodec}; fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec { let mut documents = Vec::new(); @@ -858,40 +858,40 @@ mod tests { CboRoaringBitmapCodec::serialize_into(&bitmap_ranges, &mut serialised_bitmap_ranges); let word_pairs = [ - (("healthy", "arbres", 1), &serialised_bitmap123), - (("healthy", "boat", 1), &serialised_bitmap123), - (("healthy", "ca", 1), &serialised_bitmap123), - (("healthy", "cats", 1), &serialised_bitmap456), - (("healthy", "cattos", 1), &serialised_bitmap123), - (("jittery", "cat", 1), &serialised_bitmap123), - (("jittery", "cata", 1), &serialised_bitmap456), - (("jittery", "catb", 1), &serialised_bitmap789), - (("jittery", "catc", 1), &serialised_bitmap_ranges), - (("healthy", "arbre", 2), &serialised_bitmap123), - (("healthy", "arbres", 2), &serialised_bitmap456), - (("healthy", "cats", 2), &serialised_bitmap789), - (("healthy", "cattos", 2), &serialised_bitmap_ranges), - (("healthy", "arbre", 3), &serialised_bitmap456), - (("healthy", "arbres", 3), &serialised_bitmap789), + ((1, "healthy", "arbres"), &serialised_bitmap123), + ((1, "healthy", "boat"), &serialised_bitmap123), + 
((1, "healthy", "ca"), &serialised_bitmap123), + ((1, "healthy", "cats"), &serialised_bitmap456), + ((1, "healthy", "cattos"), &serialised_bitmap123), + ((1, "jittery", "cat"), &serialised_bitmap123), + ((1, "jittery", "cata"), &serialised_bitmap456), + ((1, "jittery", "catb"), &serialised_bitmap789), + ((1, "jittery", "catc"), &serialised_bitmap_ranges), + ((2, "healthy", "arbre"), &serialised_bitmap123), + ((2, "healthy", "arbres"), &serialised_bitmap456), + ((2, "healthy", "cats"), &serialised_bitmap789), + ((2, "healthy", "cattos"), &serialised_bitmap_ranges), + ((3, "healthy", "arbre"), &serialised_bitmap456), + ((3, "healthy", "arbres"), &serialised_bitmap789), ]; let expected_result = [ - (("healthy", "arb", 1), bitmap123.clone()), - (("healthy", "arbre", 1), bitmap123.clone()), - (("healthy", "cat", 1), &bitmap456 | &bitmap123), - (("healthy", "catto", 1), bitmap123.clone()), - (("jittery", "cat", 1), (&bitmap123 | &bitmap456 | &bitmap789 | &bitmap_ranges)), - (("healthy", "arb", 2), &bitmap123 | &bitmap456), - (("healthy", "arbre", 2), &bitmap123 | &bitmap456), - (("healthy", "cat", 2), &bitmap789 | &bitmap_ranges), - (("healthy", "catto", 2), bitmap_ranges.clone()), + ((1, "healthy", "arb"), bitmap123.clone()), + ((1, "healthy", "arbre"), bitmap123.clone()), + ((1, "healthy", "cat"), &bitmap456 | &bitmap123), + ((1, "healthy", "catto"), bitmap123.clone()), + ((1, "jittery", "cat"), (&bitmap123 | &bitmap456 | &bitmap789 | &bitmap_ranges)), + ((2, "healthy", "arb"), &bitmap123 | &bitmap456), + ((2, "healthy", "arbre"), &bitmap123 | &bitmap456), + ((2, "healthy", "cat"), &bitmap789 | &bitmap_ranges), + ((2, "healthy", "catto"), bitmap_ranges.clone()), ]; let mut result = vec![]; let mut iter = - IntoIterator::into_iter(word_pairs).map(|((word1, word2, proximity), data)| { - ((word1.as_bytes(), word2.as_bytes(), proximity), data.as_slice()) + IntoIterator::into_iter(word_pairs).map(|((proximity, word1, word2), data)| { + ((proximity, word1.as_bytes(), 
word2.as_bytes()), data.as_slice()) }); execute_on_word_pairs_and_prefixes( &mut iter, @@ -899,7 +899,7 @@ mod tests { &prefixes, 2, |k, v| { - let (word1, prefix, proximity) = StrStrU8Codec::bytes_decode(k).unwrap(); + let (word1, prefix, proximity) = U8StrStrCodec::bytes_decode(k).unwrap(); let bitmap = CboRoaringBitmapCodec::bytes_decode(v).unwrap(); result.push(((word1.to_owned(), prefix.to_owned(), proximity.to_owned()), bitmap)); Ok(()) @@ -908,8 +908,8 @@ mod tests { .unwrap(); for (x, y) in result.into_iter().zip(IntoIterator::into_iter(expected_result)) { - let ((actual_word1, actual_prefix, actual_proximity), actual_bitmap) = x; - let ((expected_word1, expected_prefix, expected_proximity), expected_bitmap) = y; + let ((actual_proximity, actual_word1, actual_prefix), actual_bitmap) = x; + let ((expected_proximity, expected_word1, expected_prefix), expected_bitmap) = y; assert_eq!(actual_word1, expected_word1); assert_eq!(actual_prefix, expected_prefix); From 264a04922dfb16b54903ce16d0dd1c846060fbd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 14 Sep 2022 15:33:13 +0200 Subject: [PATCH 03/15] Add prefix_word_pair_proximity database Similar to the word_prefix_pair_proximity one but instead the keys are: (proximity, prefix, word2) --- milli/src/index.rs | 8 +- milli/src/snapshot_tests.rs | 12 + milli/src/update/clear_documents.rs | 2 + milli/src/update/delete_documents.rs | 35 +- milli/src/update/index_documents/mod.rs | 11 +- milli/src/update/mod.rs | 4 +- milli/src/update/prefix_word_pairs/mod.rs | 216 +++++++++ .../update/prefix_word_pairs/prefix_word.rs | 178 ++++++++ .../word_prefix_pair_proximity_docids.snap | 46 ++ ...refix_word_pair_proximity_docids.hash.snap | 4 + .../prefix_word_pair_proximity_docids.snap | 41 ++ .../word_pair_proximity_docids.hash.snap | 4 + ...ord_prefix_pair_proximity_docids.hash.snap | 4 + .../word_prefix.rs} | 427 +++++------------- 14 files changed, 653 insertions(+), 339 deletions(-) create mode 
100644 milli/src/update/prefix_word_pairs/mod.rs create mode 100644 milli/src/update/prefix_word_pairs/prefix_word.rs create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/initial/word_prefix_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.hash.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_pair_proximity_docids.hash.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap rename milli/src/update/{word_prefix_pair_proximity_docids.rs => prefix_word_pairs/word_prefix.rs} (67%) diff --git a/milli/src/index.rs b/milli/src/index.rs index f1bc2fa10..3bb668b43 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -71,6 +71,7 @@ pub mod db_name { pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions"; pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids"; pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids"; + pub const PREFIX_WORD_PAIR_PROXIMITY_DOCIDS: &str = "prefix-word-pair-proximity-docids"; pub const WORD_POSITION_DOCIDS: &str = "word-position-docids"; pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids"; pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids"; @@ -109,6 +110,8 @@ pub struct Index { pub word_pair_proximity_docids: Database, /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears. pub word_prefix_pair_proximity_docids: Database, + /// Maps the proximity between a pair of prefix and word with all the docids where this relation appears. 
+ pub prefix_word_pair_proximity_docids: Database, /// Maps the word and the position with the docids that corresponds to it. pub word_position_docids: Database, @@ -138,7 +141,7 @@ impl Index { pub fn new>(mut options: heed::EnvOpenOptions, path: P) -> Result { use db_name::*; - options.max_dbs(17); + options.max_dbs(18); unsafe { options.flag(Flags::MdbAlwaysFreePages) }; let env = options.open(path)?; @@ -151,6 +154,8 @@ impl Index { let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?; let word_prefix_pair_proximity_docids = env.create_database(Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?; + let prefix_word_pair_proximity_docids = + env.create_database(Some(PREFIX_WORD_PAIR_PROXIMITY_DOCIDS))?; let word_position_docids = env.create_database(Some(WORD_POSITION_DOCIDS))?; let field_id_word_count_docids = env.create_database(Some(FIELD_ID_WORD_COUNT_DOCIDS))?; let word_prefix_position_docids = env.create_database(Some(WORD_PREFIX_POSITION_DOCIDS))?; @@ -175,6 +180,7 @@ impl Index { docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, + prefix_word_pair_proximity_docids, word_position_docids, word_prefix_position_docids, field_id_word_count_docids, diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index b4eee7dfe..e9c92a949 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -198,6 +198,15 @@ pub fn snap_word_prefix_pair_proximity_docids(index: &Index) -> String { }); snap } +pub fn snap_prefix_word_pair_proximity_docids(index: &Index) -> String { + let snap = make_db_snap_from_iter!(index, prefix_word_pair_proximity_docids, |( + (proximity, prefix, word2), + b, + )| { + &format!("{proximity:<2} {prefix:<4} {word2:<16} {}", display_bitmap(&b)) + }); + snap +} pub fn snap_word_position_docids(index: &Index) -> String { let snap = make_db_snap_from_iter!(index, word_position_docids, |((word, position), b)| { &format!("{word:<16} {position:<6} {}", 
display_bitmap(&b)) @@ -427,6 +436,9 @@ macro_rules! full_snap_of_db { ($index:ident, word_prefix_pair_proximity_docids) => {{ $crate::snapshot_tests::snap_word_prefix_pair_proximity_docids(&$index) }}; + ($index:ident, prefix_word_pair_proximity_docids) => {{ + $crate::snapshot_tests::snap_prefix_word_pair_proximity_docids(&$index) + }}; ($index:ident, word_position_docids) => {{ $crate::snapshot_tests::snap_word_position_docids(&$index) }}; diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 5b7dbc57c..ba59c14cf 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -25,6 +25,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { docid_word_positions, word_pair_proximity_docids, word_prefix_pair_proximity_docids, + prefix_word_pair_proximity_docids, word_position_docids, field_id_word_count_docids, word_prefix_position_docids, @@ -66,6 +67,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { docid_word_positions.clear(self.wtxn)?; word_pair_proximity_docids.clear(self.wtxn)?; word_prefix_pair_proximity_docids.clear(self.wtxn)?; + prefix_word_pair_proximity_docids.clear(self.wtxn)?; word_position_docids.clear(self.wtxn)?; field_id_word_count_docids.clear(self.wtxn)?; word_prefix_position_docids.clear(self.wtxn)?; diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index eae473f51..54328b50d 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -183,6 +183,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { word_pair_proximity_docids, field_id_word_count_docids, word_prefix_pair_proximity_docids, + prefix_word_pair_proximity_docids, word_position_docids, word_prefix_position_docids, facet_id_f64_docids, @@ -327,26 +328,26 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { self.index.put_words_prefixes_fst(self.wtxn, &new_words_prefixes_fst)?; } - // We delete the documents ids from the word prefix pair proximity 
database docids - // and remove the empty pairs too. - let db = word_prefix_pair_proximity_docids.remap_key_type::(); - let mut iter = db.iter_mut(self.wtxn)?; - while let Some(result) = iter.next() { - let (key, mut docids) = result?; - let previous_len = docids.len(); - docids -= &self.to_delete_docids; - if docids.is_empty() { - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.del_current()? }; - } else if docids.len() != previous_len { - let key = key.to_owned(); - // safety: we don't keep references from inside the LMDB database. - unsafe { iter.put_current(&key, &docids)? }; + for db in [word_prefix_pair_proximity_docids, prefix_word_pair_proximity_docids] { + // We delete the documents ids from the word prefix pair proximity database docids + // and remove the empty pairs too. + let db = db.remap_key_type::(); + let mut iter = db.iter_mut(self.wtxn)?; + while let Some(result) = iter.next() { + let (key, mut docids) = result?; + let previous_len = docids.len(); + docids -= &self.to_delete_docids; + if docids.is_empty() { + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; + } else if docids.len() != previous_len { + let key = key.to_owned(); + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&key, &docids)? }; + } } } - drop(iter); - // We delete the documents ids that are under the pairs of words, // it is faster and use no memory to iterate over all the words pairs than // to compute the cartesian product of every words of the deleted documents. 
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index e0eefe07b..897f2f8f8 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -36,8 +36,8 @@ use crate::documents::{obkv_to_object, DocumentsBatchReader}; use crate::error::UserError; pub use crate::update::index_documents::helpers::CursorClonableMmap; use crate::update::{ - self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids, - WordPrefixPairProximityDocids, WordPrefixPositionDocids, WordsPrefixesFst, + self, Facets, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, + WordPrefixDocids, WordPrefixPositionDocids, WordsPrefixesFst, }; use crate::{Index, Result, RoaringBitmapCodec}; @@ -528,12 +528,7 @@ where if let Some(word_pair_proximity_docids) = word_pair_proximity_docids { // Run the word prefix pair proximity docids update operation. - let mut builder = WordPrefixPairProximityDocids::new(self.wtxn, self.index); - builder.chunk_compression_type = self.indexer_config.chunk_compression_type; - builder.chunk_compression_level = self.indexer_config.chunk_compression_level; - builder.max_nb_chunks = self.indexer_config.max_nb_chunks; - builder.max_memory = self.indexer_config.max_memory; - builder.execute( + PrefixWordPairsProximityDocids::new(self.wtxn, self.index).execute( word_pair_proximity_docids, &new_prefix_fst_words, &common_prefix_fst_words, diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 1bf27a5f0..3ddc01cef 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -6,10 +6,10 @@ pub use self::index_documents::{ DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, }; pub use self::indexer_config::IndexerConfig; +pub use self::prefix_word_pairs::PrefixWordPairsProximityDocids; pub use self::settings::{Setting, Settings}; pub use self::update_step::UpdateIndexingStep; pub use 
self::word_prefix_docids::WordPrefixDocids; -pub use self::word_prefix_pair_proximity_docids::WordPrefixPairProximityDocids; pub use self::words_prefix_position_docids::WordPrefixPositionDocids; pub use self::words_prefixes_fst::WordsPrefixesFst; @@ -19,9 +19,9 @@ mod delete_documents; mod facets; mod index_documents; mod indexer_config; +mod prefix_word_pairs; mod settings; mod update_step; mod word_prefix_docids; -mod word_prefix_pair_proximity_docids; mod words_prefix_position_docids; mod words_prefixes_fst; diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs new file mode 100644 index 000000000..63286f8da --- /dev/null +++ b/milli/src/update/prefix_word_pairs/mod.rs @@ -0,0 +1,216 @@ +use super::index_documents::{merge_cbo_roaring_bitmaps, CursorClonableMmap}; +use crate::{Index, Result}; +use heed::types::ByteSlice; +use std::{borrow::Cow, collections::HashSet, io::BufReader}; + +mod prefix_word; +mod word_prefix; + +pub use prefix_word::index_prefix_word_database; +pub use word_prefix::index_word_prefix_database; + +pub struct PrefixWordPairsProximityDocids<'t, 'u, 'i> { + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + max_proximity: u8, + max_prefix_length: usize, +} +impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> { + pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> Self { + Self { wtxn, index, max_proximity: 4, max_prefix_length: 2 } + } + /// Set the maximum proximity required to make a prefix be part of the words prefixes + /// database. If two words are too far from the threshold the associated documents will + /// not be part of the prefix database. + /// + /// Default value is 4. This value must be lower or equal than 7 and will be clamped + /// to this bound otherwise. 
+ pub fn max_proximity(&mut self, value: u8) -> &mut Self { + self.max_proximity = value.max(7); + self + } + /// Set the maximum length the prefix of a word pair is allowed to have to be part of the words + /// prefixes database. If the prefix length is higher than the threshold, the associated documents + /// will not be part of the prefix database. + /// + /// Default value is 2. + pub fn max_prefix_length(&mut self, value: usize) -> &mut Self { + self.max_prefix_length = value; + self + } + #[logging_timer::time("WordPrefixPairProximityDocids::{}")] + pub fn execute<'a>( + self, + new_word_pair_proximity_docids: grenad::Reader, + new_prefix_fst_words: &'a [String], + common_prefix_fst_words: &[&'a [String]], + del_prefix_fst_words: &HashSet>, + ) -> Result<()> { + index_word_prefix_database( + self.wtxn, + self.index.word_pair_proximity_docids, + self.index.word_prefix_pair_proximity_docids, + self.max_proximity, + self.max_prefix_length, + new_word_pair_proximity_docids.clone(), + new_prefix_fst_words, + common_prefix_fst_words, + del_prefix_fst_words, + )?; + + index_prefix_word_database( + self.wtxn, + self.index.word_pair_proximity_docids, + self.index.prefix_word_pair_proximity_docids, + self.max_proximity, + self.max_prefix_length, + new_word_pair_proximity_docids, + new_prefix_fst_words, + common_prefix_fst_words, + del_prefix_fst_words, + )?; + + Ok(()) + } +} + +// This is adapted from `sorter_into_lmdb_database` +pub fn insert_into_database( + wtxn: &mut heed::RwTxn, + database: heed::PolyDatabase, + new_key: &[u8], + new_value: &[u8], +) -> Result<()> { + let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, new_key)?; + match iter.next().transpose()? { + Some((key, old_val)) if new_key == key => { + let val = + merge_cbo_roaring_bitmaps(key, &[Cow::Borrowed(old_val), Cow::Borrowed(new_value)]) + .map_err(|_| { + // TODO just wrap this error? 
+ crate::error::InternalError::IndexingMergingKeys { + process: "get-put-merge", + } + })?; + // safety: we use the new_key, not the one from the database iterator, to avoid undefined behaviour + unsafe { iter.put_current(new_key, &val)? }; + } + _ => { + drop(iter); + database.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, new_value)?; + } + } + Ok(()) +} + +// This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`, +// but it uses `append` if the database is empty, and it assumes that the values in the +// writer don't conflict with values in the database. +pub fn write_into_lmdb_database_without_merging( + wtxn: &mut heed::RwTxn, + database: heed::PolyDatabase, + writer: grenad::Writer, +) -> Result<()> { + let file = writer.into_inner()?; + let reader = grenad::Reader::new(BufReader::new(file))?; + if database.is_empty(wtxn)? { + let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; + let mut cursor = reader.into_cursor()?; + while let Some((k, v)) = cursor.move_on_next()? { + // safety: the key comes from the grenad reader, not the database + unsafe { out_iter.append(k, v)? }; + } + } else { + let mut cursor = reader.into_cursor()?; + while let Some((k, v)) = cursor.move_on_next()? 
{ + database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; + } + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use crate::db_snap; + use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; + use crate::index::tests::TempIndex; + use std::io::Cursor; + + fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec { + let mut documents = Vec::new(); + for prefix in prefixes { + for i in 0..50 { + documents.push( + serde_json::json!({ + "text": format!("{prefix}{i:x}"), + }) + .as_object() + .unwrap() + .clone(), + ) + } + } + documents + } + + #[test] + fn test_update() { + let mut index = TempIndex::new(); + index.index_documents_config.words_prefix_threshold = Some(50); + index.index_documents_config.autogenerate_docids = true; + + index + .update_settings(|settings| { + settings.set_searchable_fields(vec!["text".to_owned()]); + }) + .unwrap(); + + let batch_reader_from_documents = |documents| { + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + for object in documents { + builder.append_json_object(&object).unwrap(); + } + DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() + }; + + let mut documents = documents_with_enough_different_words_for_prefixes(&["a", "be"]); + // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database + documents.push( + serde_json::json!({ + "text": "At an amazing and beautiful house" + }) + .as_object() + .unwrap() + .clone(), + ); + documents.push( + serde_json::json!({ + "text": "The bell rings at 5 am" + }) + .as_object() + .unwrap() + .clone(), + ); + + let documents = batch_reader_from_documents(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, word_prefix_pair_proximity_docids, "initial"); + + let mut documents = documents_with_enough_different_words_for_prefixes(&["am", "an"]); + documents.push( + serde_json::json!({ + "text": "At an extraordinary house" + }) + .as_object() + 
.unwrap() + .clone(), + ); + let documents = batch_reader_from_documents(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, word_pair_proximity_docids, "update"); + db_snap!(index, word_prefix_pair_proximity_docids, "update"); + db_snap!(index, prefix_word_pair_proximity_docids, "update"); + } +} diff --git a/milli/src/update/prefix_word_pairs/prefix_word.rs b/milli/src/update/prefix_word_pairs/prefix_word.rs new file mode 100644 index 000000000..cbc9ac0b2 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/prefix_word.rs @@ -0,0 +1,178 @@ +use crate::update::index_documents::{create_writer, CursorClonableMmap}; +use crate::update::prefix_word_pairs::{ + insert_into_database, write_into_lmdb_database_without_merging, +}; +use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec}; +use grenad::CompressionType; +use heed::types::ByteSlice; +use heed::BytesDecode; +use log::debug; +use std::borrow::Cow; +use std::collections::{BTreeMap, HashSet}; + +#[logging_timer::time] +pub fn index_prefix_word_database( + wtxn: &mut heed::RwTxn, + word_pair_proximity_docids: heed::Database, + prefix_word_pair_proximity_docids: heed::Database, + max_proximity: u8, + max_prefix_length: usize, + new_word_pair_proximity_docids: grenad::Reader, + new_prefix_fst_words: &[String], + common_prefix_fst_words: &[&[String]], + del_prefix_fst_words: &HashSet>, +) -> Result<()> { + let max_proximity = max_proximity - 1; + debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); + + let common_prefixes: Vec<_> = common_prefix_fst_words + .into_iter() + .map(|s| s.into_iter()) + .flatten() + .map(|s| s.as_str()) + .filter(|s| s.len() <= max_prefix_length) + .collect(); + + // If the prefix trie is not empty, then we can iterate over all new + // word pairs to look for new (word1, common_prefix, proximity) elements + // to insert in the DB + for proximity in 1..=max_proximity - 1 { + for prefix in 
common_prefixes.iter() { + let mut prefix_key = vec![]; + prefix_key.push(proximity); + prefix_key.extend_from_slice(prefix.as_bytes()); + let mut cursor = new_word_pair_proximity_docids.clone().into_prefix_iter(prefix_key)?; + // This is the core of the algorithm + execute_on_word_pairs_and_prefixes( + proximity + 1, + prefix.as_bytes(), + // the next two arguments tell how to iterate over the new word pairs + &mut cursor, + |cursor| { + if let Some((key, value)) = cursor.next()? { + let (_, _, word2) = UncheckedU8StrStrCodec::bytes_decode(key) + .ok_or(heed::Error::Decoding)?; + Ok(Some((word2, value))) + } else { + Ok(None) + } + }, + // and this argument tells what to do with each new key (proximity, prefix, word2) and value (roaring bitmap) + |key, value| { + insert_into_database( + wtxn, + *prefix_word_pair_proximity_docids.as_polymorph(), + key, + value, + ) + }, + )?; + } + } + + // Now we do the same thing with the new prefixes and all word pairs in the DB + let new_prefixes: Vec<_> = new_prefix_fst_words + .into_iter() + .map(|s| s.as_str()) + .filter(|s| s.len() <= max_prefix_length) + .collect(); + + // Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity) + // element in an intermediary grenad + let mut writer = create_writer(CompressionType::None, None, tempfile::tempfile()?); + + for proximity in 1..=max_proximity - 1 { + for prefix in new_prefixes.iter() { + let mut prefix_key = vec![]; + prefix_key.push(proximity); + prefix_key.extend_from_slice(prefix.as_bytes()); + let mut db_iter = word_pair_proximity_docids + .as_polymorph() + .prefix_iter::<_, ByteSlice, ByteSlice>(wtxn, prefix_key.as_slice())? 
+ .remap_key_type::(); + execute_on_word_pairs_and_prefixes( + proximity + 1, + prefix.as_bytes(), + &mut db_iter, + |db_iter| { + db_iter + .next() + .transpose() + .map(|x| x.map(|((_, _, word2), value)| (word2, value))) + .map_err(|e| e.into()) + }, + |key, value| writer.insert(key, value).map_err(|e| e.into()), + )?; + drop(db_iter); + } + } + + // and then we write the grenad into the DB + // Since the grenad contains only new prefixes, we know in advance that none + // of its elements already exist in the DB, thus there is no need to specify + // how to merge conflicting elements + write_into_lmdb_database_without_merging( + wtxn, + *prefix_word_pair_proximity_docids.as_polymorph(), + writer, + )?; + + // All of the word prefix pairs in the database that have a w2 + // that is contained in the `suppr_pw` set must be removed as well. + if !del_prefix_fst_words.is_empty() { + let mut iter = + prefix_word_pair_proximity_docids.remap_data_type::().iter_mut(wtxn)?; + while let Some(((_, prefix, _), _)) = iter.next().transpose()? { + if del_prefix_fst_words.contains(prefix.as_bytes()) { + // Delete this entry as the w2 prefix is no more in the words prefix fst. + unsafe { iter.del_current()? }; + } + } + } + + Ok(()) +} + +/// This is the core of the algorithm to initialise the Word Prefix Pair Proximity Docids database. +/// +/// Its main arguments are: +/// 1. a sorted prefix iterator over ((word1, word2, proximity), docids) elements +/// 2. a closure to describe how to handle the new computed (word1, prefix, proximity) elements +/// +/// For more information about what this function does, read the module documentation. 
+fn execute_on_word_pairs_and_prefixes( + proximity: u8, + prefix: &[u8], + iter: &mut I, + mut next_word2_and_docids: impl for<'a> FnMut(&'a mut I) -> Result>, + mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>, +) -> Result<()> { + let mut batch: BTreeMap, Vec>> = <_>::default(); + + while let Some((word2, data)) = next_word2_and_docids(iter)? { + let entry = batch.entry(word2.to_owned()).or_default(); + entry.push(Cow::Owned(data.to_owned())); + } + + let mut key_buffer = Vec::with_capacity(8); + key_buffer.push(proximity); + key_buffer.extend_from_slice(prefix); + key_buffer.push(0); + + let mut value_buffer = Vec::with_capacity(65_536); + + for (key, values) in batch { + key_buffer.truncate(prefix.len() + 2); + value_buffer.clear(); + + key_buffer.extend_from_slice(&key); + let data = if values.len() > 1 { + CboRoaringBitmapCodec::merge_into(&values, &mut value_buffer)?; + value_buffer.as_slice() + } else { + &values[0] + }; + insert(key_buffer.as_slice(), data)?; + } + Ok(()) +} diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/initial/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..9a6ffaec9 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/initial/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,46 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 5 a [101, ] +1 amazing a [100, ] +1 an a [100, ] +1 and b [100, ] +1 and be [100, ] +1 at a [100, ] +1 rings a [101, ] +1 the b [101, ] +1 the be [101, ] +2 5 a [101, ] +2 amazing a [100, ] +2 amazing b [100, ] +2 amazing be [100, ] +2 an a [100, ] +2 and a [100, ] +2 at a [100, 101, ] +2 beautiful a [100, ] +2 bell a [101, ] +2 house b [100, ] +2 house be [100, ] +2 rings b [101, ] +2 rings be [101, ] +3 am a [101, ] +3 amazing a [100, ] +3 an b [100, ] +3 an be [100, ] +3 and a 
[100, ] +3 at a [100, ] +3 at b [101, ] +3 at be [101, ] +3 beautiful a [100, ] +3 house a [100, ] +3 rings a [101, ] +3 the a [101, ] +4 5 b [101, ] +4 5 be [101, ] +4 and a [100, ] +4 at b [100, ] +4 at be [100, ] +4 beautiful a [100, ] +4 bell a [101, ] +4 house a [100, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.hash.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.hash.snap new file mode 100644 index 000000000..e460be400 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +fb88e49fd666886731b62baef8f44995 diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..b94c5d52e --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,41 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +2 a 5 [101, ] +2 a amazing [100, ] +2 a an [100, 202, ] +2 a and [100, ] +2 a beautiful [100, ] +2 a extraordinary [202, ] +2 am and [100, ] +2 an amazing [100, ] +2 an beautiful [100, ] +2 an extraordinary [202, ] +2 b house [100, ] +2 b rings [101, ] +2 be house [100, ] +2 be rings [101, ] +3 a 5 [101, ] +3 a am [101, ] +3 a amazing [100, ] +3 a an [100, ] +3 a and [100, ] +3 a at [100, 202, ] +3 a beautiful [100, ] +3 a extraordinary [202, ] +3 a house [100, 202, ] +3 a rings [101, ] +3 am 5 [101, ] +3 am an [100, ] +3 am beautiful [100, ] +3 an amazing [100, ] +3 an and [100, ] +3 an at [100, 202, ] +3 an house [100, 202, ] +3 b and [100, ] +3 b at [101, ] +3 b the [101, ] +3 
be and [100, ] +3 be at [101, ] +3 be the [101, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_pair_proximity_docids.hash.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_pair_proximity_docids.hash.snap new file mode 100644 index 000000000..015ef8c14 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_pair_proximity_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +6965ecd1bf821f1cf921c2ab751b36cf diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap new file mode 100644 index 000000000..e460be400 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +fb88e49fd666886731b62baef8f44995 diff --git a/milli/src/update/word_prefix_pair_proximity_docids.rs b/milli/src/update/prefix_word_pairs/word_prefix.rs similarity index 67% rename from milli/src/update/word_prefix_pair_proximity_docids.rs rename to milli/src/update/prefix_word_pairs/word_prefix.rs index 77294296f..bd1bea2a3 100644 --- a/milli/src/update/word_prefix_pair_proximity_docids.rs +++ b/milli/src/update/prefix_word_pairs/word_prefix.rs @@ -1,5 +1,5 @@ /*! - ## What is WordPrefixPairProximityDocids? + ## What is WordPrefix? The word-prefix-pair-proximity-docids database is a database whose keys are of the form `(proximity, word, prefix)` and the values are roaring bitmaps of the documents which contain `word` followed by another word starting with @@ -139,7 +139,7 @@ inputs described above, which come from different places: 2. `word_pairs_db`, which is the list of word pairs from the database. 
This list includes all elements in `new_word_pairs` since `new_word_pairs` - was added to the database prior to calling the `WordPrefixPairProximityDocIds::execute` + was added to the database prior to calling the `WordPrefix::execute` function. To update the prefix database correctly, we call the algorithm described earlier first @@ -161,196 +161,137 @@ reader and writer). Therefore, when calling the algorithm on `((proximity, word, prefix), docids)` elements in an intermediary grenad Writer instead of the DB. At the end of the outer loop, we finally read from the grenad and insert its elements in the database. - - - */ -use std::borrow::Cow; -use std::collections::HashSet; -use std::io::BufReader; +use crate::update::index_documents::{create_writer, CursorClonableMmap}; +use crate::update::prefix_word_pairs::{ + insert_into_database, write_into_lmdb_database_without_merging, +}; +use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec}; use grenad::CompressionType; use heed::types::ByteSlice; use heed::BytesDecode; use log::debug; +use std::borrow::Cow; +use std::collections::HashSet; -use crate::update::index_documents::{ - create_writer, merge_cbo_roaring_bitmaps, CursorClonableMmap, -}; -use crate::{CboRoaringBitmapCodec, Index, Result, UncheckedU8StrStrCodec}; - -pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> { - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - pub(crate) chunk_compression_type: CompressionType, - pub(crate) chunk_compression_level: Option, - pub(crate) max_nb_chunks: Option, - pub(crate) max_memory: Option, +#[logging_timer::time] +pub fn index_word_prefix_database( + wtxn: &mut heed::RwTxn, + word_pair_proximity_docids: heed::Database, + word_prefix_pair_proximity_docids: heed::Database, max_proximity: u8, max_prefix_length: usize, -} + new_word_pair_proximity_docids: grenad::Reader, + new_prefix_fst_words: &[String], + common_prefix_fst_words: &[&[String]], + del_prefix_fst_words: &HashSet>, +) -> 
Result<()> { + debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); -impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> { - pub fn new( - wtxn: &'t mut heed::RwTxn<'i, 'u>, - index: &'i Index, - ) -> WordPrefixPairProximityDocids<'t, 'u, 'i> { - WordPrefixPairProximityDocids { - wtxn, - index, - chunk_compression_type: CompressionType::None, - chunk_compression_level: None, - max_nb_chunks: None, - max_memory: None, - max_proximity: 4, - max_prefix_length: 2, - } - } + // Make a prefix trie from the common prefixes that are shorter than self.max_prefix_length + let prefixes = PrefixTrieNode::from_sorted_prefixes( + common_prefix_fst_words + .into_iter() + .map(|s| s.into_iter()) + .flatten() + .map(|s| s.as_str()) + .filter(|s| s.len() <= max_prefix_length), + ); - /// Set the maximum proximity required to make a prefix be part of the words prefixes - /// database. If two words are too far from the threshold the associated documents will - /// not be part of the prefix database. - /// - /// Default value is 4. This value must be lower or equal than 7 and will be clamped - /// to this bound otherwise. - pub fn max_proximity(&mut self, value: u8) -> &mut Self { - self.max_proximity = value.max(7); - self - } - - /// Set the maximum length the prefix of a word pair is allowed to have to be part of the words - /// prefixes database. If the prefix length is higher than the threshold, the associated documents - /// will not be part of the prefix database. - /// - /// Default value is 2. 
- pub fn max_prefix_length(&mut self, value: usize) -> &mut Self { - self.max_prefix_length = value; - self - } - - #[logging_timer::time("WordPrefixPairProximityDocids::{}")] - pub fn execute<'a>( - mut self, - new_word_pair_proximity_docids: grenad::Reader, - new_prefix_fst_words: &'a [String], - common_prefix_fst_words: &[&'a [String]], - del_prefix_fst_words: &HashSet>, - ) -> Result<()> { - debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); - - // Make a prefix trie from the common prefixes that are shorter than self.max_prefix_length - let prefixes = PrefixTrieNode::from_sorted_prefixes( - common_prefix_fst_words - .into_iter() - .map(|s| s.into_iter()) - .flatten() - .map(|s| s.as_str()) - .filter(|s| s.len() <= self.max_prefix_length), - ); - - // If the prefix trie is not empty, then we can iterate over all new - // word pairs to look for new (word1, common_prefix, proximity) elements - // to insert in the DB - if !prefixes.is_empty() { - let mut cursor = new_word_pair_proximity_docids.into_cursor()?; - // This is the core of the algorithm - execute_on_word_pairs_and_prefixes( - // the first two arguments tell how to iterate over the new word pairs - &mut cursor, - |cursor| { - if let Some((key, value)) = cursor.move_on_next()? 
{ - let (proximity, word1, word2) = UncheckedU8StrStrCodec::bytes_decode(key) - .ok_or(heed::Error::Decoding)?; - Ok(Some(((proximity, word1, word2), value))) - } else { - Ok(None) - } - }, - &prefixes, - self.max_proximity, - // and this argument tells what to do with each new key (word1, prefix, proximity) and value (roaring bitmap) - |key, value| { - insert_into_database( - &mut self.wtxn, - *self.index.word_prefix_pair_proximity_docids.as_polymorph(), - key, - value, - ) - }, - )?; - } - - // Now we do the same thing with the new prefixes and all word pairs in the DB - - let prefixes = PrefixTrieNode::from_sorted_prefixes( - new_prefix_fst_words - .into_iter() - .map(|s| s.as_str()) - .filter(|s| s.len() <= self.max_prefix_length), - ); - - if !prefixes.is_empty() { - let mut db_iter = self - .index - .word_pair_proximity_docids - .remap_key_type::() - .remap_data_type::() - .iter(self.wtxn)?; - - // Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity) - // element in an intermediary grenad - let mut writer = create_writer( - self.chunk_compression_type, - self.chunk_compression_level, - tempfile::tempfile()?, - ); - - execute_on_word_pairs_and_prefixes( - &mut db_iter, - |db_iter| db_iter.next().transpose().map_err(|e| e.into()), - &prefixes, - self.max_proximity, - |key, value| writer.insert(key, value).map_err(|e| e.into()), - )?; - drop(db_iter); - - // and then we write the grenad into the DB - // Since the grenad contains only new prefixes, we know in advance that none - // of its elements already exist in the DB, thus there is no need to specify - // how to merge conflicting elements - write_into_lmdb_database_without_merging( - self.wtxn, - *self.index.word_prefix_pair_proximity_docids.as_polymorph(), - writer, - )?; - } - - // All of the word prefix pairs in the database that have a w2 - // that is contained in the `suppr_pw` set must be removed as well. 
- if !del_prefix_fst_words.is_empty() { - let mut iter = self - .index - .word_prefix_pair_proximity_docids - .remap_data_type::() - .iter_mut(self.wtxn)?; - while let Some(((_, w2, _), _)) = iter.next().transpose()? { - if del_prefix_fst_words.contains(w2.as_bytes()) { - // Delete this entry as the w2 prefix is no more in the words prefix fst. - unsafe { iter.del_current()? }; + // If the prefix trie is not empty, then we can iterate over all new + // word pairs to look for new (proximity, word1, common_prefix) elements + // to insert in the DB + if !prefixes.is_empty() { + let mut cursor = new_word_pair_proximity_docids.into_cursor()?; + // This is the core of the algorithm + execute_on_word_pairs_and_prefixes( + // the first two arguments tell how to iterate over the new word pairs + &mut cursor, + |cursor| { + if let Some((key, value)) = cursor.move_on_next()? { + let (proximity, word1, word2) = + UncheckedU8StrStrCodec::bytes_decode(key).ok_or(heed::Error::Decoding)?; + Ok(Some(((proximity, word1, word2), value))) + } else { + Ok(None) } + }, + &prefixes, + max_proximity, + // and this argument tells what to do with each new key (proximity, word1, prefix) and value (roaring bitmap) + |key, value| { + insert_into_database( + wtxn, + *word_prefix_pair_proximity_docids.as_polymorph(), + key, + value, + ) + }, + )?; + } + + // Now we do the same thing with the new prefixes and all word pairs in the DB + + let prefixes = PrefixTrieNode::from_sorted_prefixes( + new_prefix_fst_words + .into_iter() + .map(|s| s.as_str()) + .filter(|s| s.len() <= max_prefix_length), + ); + + if !prefixes.is_empty() { + let mut db_iter = word_pair_proximity_docids + .remap_key_type::() + .remap_data_type::() + .iter(wtxn)?; + + // Since we read the DB, we can't write to it directly, so we add each new (proximity, word1, prefix) + // element in an intermediary grenad + let mut writer = create_writer(CompressionType::None, None, tempfile::tempfile()?); + + 
execute_on_word_pairs_and_prefixes( + &mut db_iter, + |db_iter| db_iter.next().transpose().map_err(|e| e.into()), + &prefixes, + max_proximity, + |key, value| writer.insert(key, value).map_err(|e| e.into()), + )?; + drop(db_iter); + + // and then we write the grenad into the DB + // Since the grenad contains only new prefixes, we know in advance that none + // of its elements already exist in the DB, thus there is no need to specify + // how to merge conflicting elements + write_into_lmdb_database_without_merging( + wtxn, + *word_prefix_pair_proximity_docids.as_polymorph(), + writer, + )?; + } + + // All of the word prefix pairs in the database that have a w2 + // that is contained in the `suppr_pw` set must be removed as well. + if !del_prefix_fst_words.is_empty() { + let mut iter = + word_prefix_pair_proximity_docids.remap_data_type::().iter_mut(wtxn)?; + while let Some(((_, _, prefix), _)) = iter.next().transpose()? { + if del_prefix_fst_words.contains(prefix.as_bytes()) { + // Delete this entry as the w2 prefix is no more in the words prefix fst. + unsafe { iter.del_current()? }; } } - - Ok(()) } + + Ok(()) } /// This is the core of the algorithm to initialise the Word Prefix Pair Proximity Docids database. /// /// Its main arguments are: -/// 1. a sorted iterator over ((word1, word2, proximity), docids) elements +/// 1. a sorted iterator over ((proximity, word1, word2), docids) elements /// 2. a prefix trie -/// 3. a closure to describe how to handle the new computed (word1, prefix, proximity) elements +/// 3. a closure to describe how to handle the new computed (proximity, word1, prefix) elements /// /// For more information about what this function does, read the module documentation. 
fn execute_on_word_pairs_and_prefixes( @@ -495,61 +436,6 @@ impl PrefixAndProximityBatch { } } -// This is adapted from `sorter_into_lmdb_database` -fn insert_into_database( - wtxn: &mut heed::RwTxn, - database: heed::PolyDatabase, - new_key: &[u8], - new_value: &[u8], -) -> Result<()> { - let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, new_key)?; - match iter.next().transpose()? { - Some((key, old_val)) if new_key == key => { - let val = - merge_cbo_roaring_bitmaps(key, &[Cow::Borrowed(old_val), Cow::Borrowed(new_value)]) - .map_err(|_| { - // TODO just wrap this error? - crate::error::InternalError::IndexingMergingKeys { - process: "get-put-merge", - } - })?; - // safety: we use the new_key, not the one from the database iterator, to avoid undefined behaviour - unsafe { iter.put_current(new_key, &val)? }; - } - _ => { - drop(iter); - database.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, new_value)?; - } - } - Ok(()) -} - -// This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`, -// but it uses `append` if the database is empty, and it assumes that the values in the -// writer don't conflict with values in the database. -pub fn write_into_lmdb_database_without_merging( - wtxn: &mut heed::RwTxn, - database: heed::PolyDatabase, - writer: grenad::Writer, -) -> Result<()> { - let file = writer.into_inner()?; - let reader = grenad::Reader::new(BufReader::new(file))?; - if database.is_empty(wtxn)? { - let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?; - let mut cursor = reader.into_cursor()?; - while let Some((k, v)) = cursor.move_on_next()? { - // safety: the key comes from the grenad reader, not the database - unsafe { out_iter.append(k, v)? }; - } - } else { - let mut cursor = reader.into_cursor()?; - while let Some((k, v)) = cursor.move_on_next()? { - database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?; - } - } - Ok(()) -} - /** A prefix trie. 
Used to iterate quickly over the prefixes of a word that are within a set. @@ -676,90 +562,9 @@ impl PrefixTrieNode { } #[cfg(test)] mod tests { - use std::io::Cursor; - - use roaring::RoaringBitmap; - use super::*; - use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; - use crate::index::tests::TempIndex; - use crate::{db_snap, CboRoaringBitmapCodec, U8StrStrCodec}; - - fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec { - let mut documents = Vec::new(); - for prefix in prefixes { - for i in 0..50 { - documents.push( - serde_json::json!({ - "text": format!("{prefix}{i:x}"), - }) - .as_object() - .unwrap() - .clone(), - ) - } - } - documents - } - - #[test] - fn test_update() { - let mut index = TempIndex::new(); - index.index_documents_config.words_prefix_threshold = Some(50); - index.index_documents_config.autogenerate_docids = true; - - index - .update_settings(|settings| { - settings.set_searchable_fields(vec!["text".to_owned()]); - }) - .unwrap(); - - let batch_reader_from_documents = |documents| { - let mut builder = DocumentsBatchBuilder::new(Vec::new()); - for object in documents { - builder.append_json_object(&object).unwrap(); - } - DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() - }; - - let mut documents = documents_with_enough_different_words_for_prefixes(&["a", "be"]); - // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database - documents.push( - serde_json::json!({ - "text": "At an amazing and beautiful house" - }) - .as_object() - .unwrap() - .clone(), - ); - documents.push( - serde_json::json!({ - "text": "The bell rings at 5 am" - }) - .as_object() - .unwrap() - .clone(), - ); - - let documents = batch_reader_from_documents(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, word_prefix_pair_proximity_docids, "initial"); - - let mut documents = 
documents_with_enough_different_words_for_prefixes(&["am", "an"]); - documents.push( - serde_json::json!({ - "text": "At an extraordinary house" - }) - .as_object() - .unwrap() - .clone(), - ); - let documents = batch_reader_from_documents(documents); - index.add_documents(documents).unwrap(); - - db_snap!(index, word_prefix_pair_proximity_docids, "update"); - } + use crate::{CboRoaringBitmapCodec, U8StrStrCodec}; + use roaring::RoaringBitmap; fn check_prefixes( trie: &PrefixTrieNode, @@ -899,9 +704,9 @@ mod tests { &prefixes, 2, |k, v| { - let (word1, prefix, proximity) = U8StrStrCodec::bytes_decode(k).unwrap(); + let (proximity, word1, prefix) = U8StrStrCodec::bytes_decode(k).unwrap(); let bitmap = CboRoaringBitmapCodec::bytes_decode(v).unwrap(); - result.push(((word1.to_owned(), prefix.to_owned(), proximity.to_owned()), bitmap)); + result.push(((proximity.to_owned(), word1.to_owned(), prefix.to_owned()), bitmap)); Ok(()) }, ) From a7de4f5b854715198c1c537f8bb7010d7f614b75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 14 Sep 2022 15:35:15 +0200 Subject: [PATCH 04/15] Don't add swapped word pairs to the word_pair_proximity_docids db --- .../extract/extract_word_pair_proximity_docids.rs | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs index 3837c1bbe..25117c706 100644 --- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs @@ -106,17 +106,6 @@ fn document_word_positions_into_sorter<'b>( *p = cmp::min(*p, prox); }) .or_insert(prox); - - // We also compute the inverse proximity. 
- let prox = prox + 1; - if prox < MAX_DISTANCE { - word_pair_proximity - .entry((word.clone(), head.word.clone())) - .and_modify(|p| { - *p = cmp::min(*p, prox); - }) - .or_insert(prox); - } } } From 6c3a5d69e1ff8465c9e2e2a78a6d880c0f730250 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 14 Sep 2022 15:43:10 +0200 Subject: [PATCH 05/15] Update snapshots --- .../word_prefix_pair_proximity_docids.snap | 20 -------- ...refix_word_pair_proximity_docids.hash.snap | 4 -- .../prefix_word_pair_proximity_docids.snap | 12 ----- .../word_pair_proximity_docids.hash.snap | 4 -- .../update/word_pair_proximity_docids.snap | 39 ++++++++++++++++ ...ord_prefix_pair_proximity_docids.hash.snap | 4 -- .../word_prefix_pair_proximity_docids.snap | 35 ++++++++++++++ .../word_prefix_pair_proximity_docids.snap | 46 ------------------- ...ord_prefix_pair_proximity_docids.hash.snap | 4 -- 9 files changed, 74 insertions(+), 94 deletions(-) delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.hash.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_pair_proximity_docids.hash.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_pair_proximity_docids.snap delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_prefix_pair_proximity_docids.snap delete mode 100644 milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_prefix_pair_proximity_docids.snap delete mode 100644 milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap diff --git 
a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/initial/word_prefix_pair_proximity_docids.snap index 9a6ffaec9..c760ae440 100644 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/initial/word_prefix_pair_proximity_docids.snap +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/initial/word_prefix_pair_proximity_docids.snap @@ -10,37 +10,17 @@ source: milli/src/update/prefix_word_pairs/mod.rs 1 rings a [101, ] 1 the b [101, ] 1 the be [101, ] -2 5 a [101, ] -2 amazing a [100, ] 2 amazing b [100, ] 2 amazing be [100, ] 2 an a [100, ] -2 and a [100, ] 2 at a [100, 101, ] -2 beautiful a [100, ] 2 bell a [101, ] -2 house b [100, ] -2 house be [100, ] -2 rings b [101, ] -2 rings be [101, ] -3 am a [101, ] -3 amazing a [100, ] 3 an b [100, ] 3 an be [100, ] -3 and a [100, ] 3 at a [100, ] -3 at b [101, ] -3 at be [101, ] -3 beautiful a [100, ] -3 house a [100, ] 3 rings a [101, ] 3 the a [101, ] -4 5 b [101, ] -4 5 be [101, ] -4 and a [100, ] 4 at b [100, ] 4 at be [100, ] -4 beautiful a [100, ] 4 bell a [101, ] -4 house a [100, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.hash.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.hash.snap deleted file mode 100644 index e460be400..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -fb88e49fd666886731b62baef8f44995 diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.snap index 
b94c5d52e..c5f45a9eb 100644 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.snap +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.snap @@ -15,27 +15,15 @@ source: milli/src/update/prefix_word_pairs/mod.rs 2 b rings [101, ] 2 be house [100, ] 2 be rings [101, ] -3 a 5 [101, ] 3 a am [101, ] 3 a amazing [100, ] -3 a an [100, ] 3 a and [100, ] -3 a at [100, 202, ] 3 a beautiful [100, ] 3 a extraordinary [202, ] 3 a house [100, 202, ] -3 a rings [101, ] -3 am 5 [101, ] -3 am an [100, ] 3 am beautiful [100, ] -3 an amazing [100, ] 3 an and [100, ] -3 an at [100, 202, ] 3 an house [100, 202, ] -3 b and [100, ] 3 b at [101, ] -3 b the [101, ] -3 be and [100, ] 3 be at [101, ] -3 be the [101, ] diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_pair_proximity_docids.hash.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_pair_proximity_docids.hash.snap deleted file mode 100644 index 015ef8c14..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_pair_proximity_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -6965ecd1bf821f1cf921c2ab751b36cf diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_pair_proximity_docids.snap new file mode 100644 index 000000000..4fcd0fbd2 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_pair_proximity_docids.snap @@ -0,0 +1,39 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 5 am [101, ] +1 amazing and [100, ] +1 an amazing [100, ] +1 an extraordinary [202, ] +1 and beautiful [100, ] +1 at 5 [101, ] +1 at an [100, 202, ] +1 beautiful house [100, ] +1 bell 
rings [101, ] +1 extraordinary house [202, ] +1 rings at [101, ] +1 the bell [101, ] +2 amazing beautiful [100, ] +2 an and [100, ] +2 an house [202, ] +2 and house [100, ] +2 at am [101, ] +2 at amazing [100, ] +2 at extraordinary [202, ] +2 bell at [101, ] +2 rings 5 [101, ] +2 the rings [101, ] +3 amazing house [100, ] +3 an beautiful [100, ] +3 at and [100, ] +3 at house [202, ] +3 bell 5 [101, ] +3 rings am [101, ] +3 the at [101, ] +4 an house [100, ] +4 at beautiful [100, ] +4 bell am [101, ] +4 the 5 [101, ] +5 at house [100, ] +5 the am [101, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap deleted file mode 100644 index e460be400..000000000 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: milli/src/update/prefix_word_pairs/mod.rs ---- -fb88e49fd666886731b62baef8f44995 diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..0f2e458a8 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,35 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 5 a [101, ] +1 5 am [101, ] +1 amazing a [100, ] +1 amazing an [100, ] +1 an a [100, ] +1 an am [100, ] +1 and b [100, ] +1 and be [100, ] +1 at a [100, 202, ] +1 at an [100, 202, ] +1 rings a [101, ] +1 the b [101, ] +1 the be [101, ] +2 amazing b [100, ] +2 amazing be [100, ] +2 an a [100, ] +2 an an [100, ] +2 at a [100, 101, ] +2 at am [100, 101, ] +2 bell a [101, ] +3 an b [100, ] +3 an be 
[100, ] +3 at a [100, ] +3 at an [100, ] +3 rings a [101, ] +3 rings am [101, ] +3 the a [101, ] +4 at b [100, ] +4 at be [100, ] +4 bell a [101, ] +4 bell am [101, ] + diff --git a/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_prefix_pair_proximity_docids.snap deleted file mode 100644 index 47a6df343..000000000 --- a/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/initial/word_prefix_pair_proximity_docids.snap +++ /dev/null @@ -1,46 +0,0 @@ ---- -source: milli/src/update/word_prefix_pair_proximity_docids.rs ---- -1 5 a [101, ] -1 amazing a [100, ] -1 an a [100, ] -1 and b [100, ] -1 and be [100, ] -1 at a [100, ] -1 rings a [101, ] -1 the b [101, ] -1 the be [101, ] -2 5 a [101, ] -2 amazing a [100, ] -2 amazing b [100, ] -2 amazing be [100, ] -2 an a [100, ] -2 and a [100, ] -2 at a [100, 101, ] -2 beautiful a [100, ] -2 bell a [101, ] -2 house b [100, ] -2 house be [100, ] -2 rings b [101, ] -2 rings be [101, ] -3 am a [101, ] -3 amazing a [100, ] -3 an b [100, ] -3 an be [100, ] -3 and a [100, ] -3 at a [100, ] -3 at b [101, ] -3 at be [101, ] -3 beautiful a [100, ] -3 house a [100, ] -3 rings a [101, ] -3 the a [101, ] -4 5 b [101, ] -4 5 be [101, ] -4 and a [100, ] -4 at b [100, ] -4 at be [100, ] -4 beautiful a [100, ] -4 bell a [101, ] -4 house a [100, ] - diff --git a/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap b/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap deleted file mode 100644 index bb2cc3b84..000000000 --- a/milli/src/update/snapshots/word_prefix_pair_proximity_docids.rs/test_update/update/word_prefix_pair_proximity_docids.hash.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: 
milli/src/update/word_prefix_pair_proximity_docids.rs ---- -fb88e49fd666886731b62baef8f44995 From 072b57651407a4c0d1e81f3d385a07776b9bb7a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 14 Sep 2022 16:00:56 +0200 Subject: [PATCH 06/15] Fix proximity value in keys of prefix_word_pair_proximity_docids --- .../update/prefix_word_pairs/prefix_word.rs | 4 +- .../prefix_word_pair_proximity_docids.snap | 42 +++++++++---------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/milli/src/update/prefix_word_pairs/prefix_word.rs b/milli/src/update/prefix_word_pairs/prefix_word.rs index cbc9ac0b2..18f5bdc5a 100644 --- a/milli/src/update/prefix_word_pairs/prefix_word.rs +++ b/milli/src/update/prefix_word_pairs/prefix_word.rs @@ -44,7 +44,7 @@ pub fn index_prefix_word_database( let mut cursor = new_word_pair_proximity_docids.clone().into_prefix_iter(prefix_key)?; // This is the core of the algorithm execute_on_word_pairs_and_prefixes( - proximity + 1, + proximity, prefix.as_bytes(), // the next two arguments tell how to iterate over the new word pairs &mut cursor, @@ -91,7 +91,7 @@ pub fn index_prefix_word_database( .prefix_iter::<_, ByteSlice, ByteSlice>(wtxn, prefix_key.as_slice())? 
.remap_key_type::(); execute_on_word_pairs_and_prefixes( - proximity + 1, + proximity, prefix.as_bytes(), &mut db_iter, |db_iter| { diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.snap index c5f45a9eb..7644c433d 100644 --- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.snap +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_update/update/prefix_word_pair_proximity_docids.snap @@ -1,29 +1,29 @@ --- source: milli/src/update/prefix_word_pairs/mod.rs --- -2 a 5 [101, ] +1 a 5 [101, ] +1 a amazing [100, ] +1 a an [100, 202, ] +1 a and [100, ] +1 a beautiful [100, ] +1 a extraordinary [202, ] +1 am and [100, ] +1 an amazing [100, ] +1 an beautiful [100, ] +1 an extraordinary [202, ] +1 b house [100, ] +1 b rings [101, ] +1 be house [100, ] +1 be rings [101, ] +2 a am [101, ] 2 a amazing [100, ] -2 a an [100, 202, ] 2 a and [100, ] 2 a beautiful [100, ] 2 a extraordinary [202, ] -2 am and [100, ] -2 an amazing [100, ] -2 an beautiful [100, ] -2 an extraordinary [202, ] -2 b house [100, ] -2 b rings [101, ] -2 be house [100, ] -2 be rings [101, ] -3 a am [101, ] -3 a amazing [100, ] -3 a and [100, ] -3 a beautiful [100, ] -3 a extraordinary [202, ] -3 a house [100, 202, ] -3 am beautiful [100, ] -3 an and [100, ] -3 an house [100, 202, ] -3 b at [101, ] -3 be at [101, ] +2 a house [100, 202, ] +2 am beautiful [100, ] +2 an and [100, ] +2 an house [100, 202, ] +2 b at [101, ] +2 be at [101, ] From 18d578dfc439db736892d58ccda6c60d323dca26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 15 Sep 2022 09:34:35 +0200 Subject: [PATCH 07/15] Adjust some algorithms using DBs of word pair proximities --- milli/src/search/criteria/exactness.rs | 1 + milli/src/search/criteria/mod.rs | 183 
++++++++++++++++++++----- 2 files changed, 153 insertions(+), 31 deletions(-) diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index e7775423c..5327f13e4 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -226,6 +226,7 @@ fn resolve_state( } // compute intersection on pair of words with a proximity of 0. Phrase(phrase) => { + // TODO: use resolve_phrase here let mut bitmaps = Vec::with_capacity(phrase.len().saturating_sub(1)); for words in phrase.windows(2) { if let [left, right] = words { diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 86cec1ddc..cefc071ee 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -71,6 +71,7 @@ pub trait Context<'c> { fn exact_word_docids(&self, word: &str) -> heed::Result>; fn word_prefix_docids(&self, word: &str) -> heed::Result>; fn exact_word_prefix_docids(&self, word: &str) -> heed::Result>; + fn word_pair_proximity_docids( &self, left: &str, @@ -83,6 +84,12 @@ pub trait Context<'c> { right: &str, proximity: u8, ) -> heed::Result>; + fn prefix_word_pair_proximity_docids( + &self, + prefix: &str, + right: &str, + proximity: u8, + ) -> heed::Result>; fn words_fst<'t>(&self) -> &'t fst::Set>; fn in_prefix_cache(&self, word: &str) -> bool; fn docid_words_positions( @@ -111,6 +118,68 @@ pub struct CriteriaBuilder<'t> { words_prefixes_fst: fst::Set>, } +/// Return the docids for the following word pairs and proximities using [`Context::word_pair_proximity_docids`]. 
+/// * `left, right, prox` (leftward proximity) +/// * `right, left, prox-1` (rightward proximity) +/// +/// ## Example +/// For a document with the text `the good fox eats the apple`, we have: +/// * `rightward_proximity(the, eats) = 3` +/// * `leftward_proximity(eats, the) = 1` +/// +/// So both the expressions `word_pair_overall_proximity_docids(ctx, the, eats, 3)` +/// and `word_pair_overall_proximity_docids(ctx, the, eats, 2)` would return a bitmap containing +/// the id of this document. +fn word_pair_overall_proximity_docids( + ctx: &dyn Context, + left: &str, + right: &str, + prox: u8, +) -> heed::Result> { + let rightward = ctx.word_pair_proximity_docids(left, right, prox)?; + let leftward = + if prox > 1 { ctx.word_pair_proximity_docids(right, left, prox - 1)? } else { None }; + if let Some(mut all) = rightward { + if let Some(leftward) = leftward { + all |= leftward; + } + Ok(Some(all)) + } else { + Ok(leftward) + } +} + +/// This function works identically to [`word_pair_overall_proximity_docids`] except that the +/// right word is replaced by a prefix string. +/// +/// It will return None if no documents were found or if the prefix does not exist in the +/// `word_prefix_pair_proximity_docids` database. +fn word_prefix_pair_overall_proximity_docids( + ctx: &dyn Context, + left: &str, + prefix: &str, + proximity: u8, +) -> heed::Result> { + // We retrieve the docids for the original and swapped word pairs: + // A: word1 prefix2 proximity + // B: prefix2 word1 proximity-1 + let rightward = ctx.word_prefix_pair_proximity_docids(left, prefix, proximity)?; + + let leftward = if proximity > 1 { + ctx.prefix_word_pair_proximity_docids(prefix, left, proximity - 1)? 
+ } else { + None + }; + if let Some(mut all) = rightward { + if let Some(leftward) = leftward { + all |= leftward; + } + Ok(Some(all)) + } else { + Ok(leftward) + } +} + impl<'c> Context<'c> for CriteriaBuilder<'c> { fn documents_ids(&self) -> heed::Result { self.index.documents_ids(self.rtxn) @@ -138,18 +207,24 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { right: &str, proximity: u8, ) -> heed::Result> { - let key = (proximity, left, right); - self.index.word_pair_proximity_docids.get(self.rtxn, &key) + self.index.word_pair_proximity_docids.get(self.rtxn, &(proximity, left, right)) } fn word_prefix_pair_proximity_docids( &self, left: &str, + prefix: &str, + proximity: u8, + ) -> heed::Result> { + self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &(proximity, left, prefix)) + } + fn prefix_word_pair_proximity_docids( + &self, + prefix: &str, right: &str, proximity: u8, ) -> heed::Result> { - let key = (proximity, left, right); - self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key) + self.index.prefix_word_pair_proximity_docids.get(self.rtxn, &(proximity, prefix, right)) } fn words_fst<'t>(&self) -> &'t fst::Set> { @@ -353,17 +428,34 @@ pub fn resolve_phrase(ctx: &dyn Context, phrase: &[String]) -> Result bitmaps.push(m), - // If there are no document for this distance, there will be no - // results for the phrase query. - None => return Ok(RoaringBitmap::new()), + if s1 == s2 { + continue; + } + if dist == 0 { + match ctx.word_pair_proximity_docids(s1, s2, 1)? { + Some(m) => bitmaps.push(m), + // If there are no document for this pair, there will be no + // results for the phrase query. + None => return Ok(RoaringBitmap::new()), + } + } else { + let mut bitmap = RoaringBitmap::new(); + for dist in 0..=dist { + match ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? 
{ + Some(m) => bitmap |= m, + None => {} + } + } + if bitmap.is_empty() { + return Ok(bitmap); + } else { + bitmaps.push(bitmap); + } } } } @@ -387,7 +479,7 @@ pub fn resolve_phrase(ctx: &dyn Context, phrase: &[String]) -> Result, U: AsRef>( +fn all_word_pair_overall_proximity_docids, U: AsRef>( ctx: &dyn Context, left_words: &[(T, u8)], right_words: &[(U, u8)], @@ -396,9 +488,9 @@ fn all_word_pair_proximity_docids, U: AsRef>( let mut docids = RoaringBitmap::new(); for (left, _l_typo) in left_words { for (right, _r_typo) in right_words { - let current_docids = ctx - .word_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity)? - .unwrap_or_default(); + let current_docids = + word_pair_overall_proximity_docids(ctx, left.as_ref(), right.as_ref(), proximity)? + .unwrap_or_default(); docids |= current_docids; } } @@ -472,7 +564,8 @@ fn query_pair_proximity_docids( match (&left.kind, &right.kind) { (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => { if prefix { - match ctx.word_prefix_pair_proximity_docids( + match word_prefix_pair_overall_proximity_docids( + ctx, left.as_str(), right.as_str(), proximity, @@ -480,7 +573,12 @@ fn query_pair_proximity_docids( Some(docids) => Ok(docids), None => { let r_words = word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?; - all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) + all_word_pair_overall_proximity_docids( + ctx, + &[(left, 0)], + &r_words, + proximity, + ) } } } else { @@ -495,7 +593,8 @@ fn query_pair_proximity_docids( if prefix { let mut docids = RoaringBitmap::new(); for (left, _) in l_words { - let current_docids = match ctx.word_prefix_pair_proximity_docids( + let current_docids = match word_prefix_pair_overall_proximity_docids( + ctx, left.as_str(), right.as_str(), proximity, @@ -504,19 +603,24 @@ fn query_pair_proximity_docids( None => { let r_words = word_derivations(&right, true, 0, ctx.words_fst(), wdcache)?; - all_word_pair_proximity_docids(ctx, 
&[(left, 0)], &r_words, proximity) + all_word_pair_overall_proximity_docids( + ctx, + &[(left, 0)], + &r_words, + proximity, + ) } }?; docids |= current_docids; } Ok(docids) } else { - all_word_pair_proximity_docids(ctx, &l_words, &[(right, 0)], proximity) + all_word_pair_overall_proximity_docids(ctx, &l_words, &[(right, 0)], proximity) } } (QueryKind::Exact { word: left, .. }, QueryKind::Tolerant { typo, word: right }) => { let r_words = word_derivations(&right, prefix, *typo, ctx.words_fst(), wdcache)?; - all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) + all_word_pair_overall_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) } ( QueryKind::Tolerant { typo: l_typo, word: left }, @@ -525,7 +629,7 @@ fn query_pair_proximity_docids( let l_words = word_derivations(&left, false, *l_typo, ctx.words_fst(), wdcache)?.to_owned(); let r_words = word_derivations(&right, prefix, *r_typo, ctx.words_fst(), wdcache)?; - all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity) + all_word_pair_overall_proximity_docids(ctx, &l_words, &r_words, proximity) } } } @@ -552,6 +656,7 @@ pub mod test { exact_word_prefix_docids: HashMap, word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, word_prefix_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, + prefix_word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, docid_words: HashMap>, } @@ -588,13 +693,22 @@ pub mod test { fn word_prefix_pair_proximity_docids( &self, - left: &str, - right: &str, + word: &str, + prefix: &str, proximity: u8, ) -> heed::Result> { - let key = (left.to_string(), right.to_string(), proximity.into()); + let key = (word.to_string(), prefix.to_string(), proximity.into()); Ok(self.word_prefix_pair_proximity_docids.get(&key).cloned()) } + fn prefix_word_pair_proximity_docids( + &self, + prefix: &str, + word: &str, + proximity: u8, + ) -> heed::Result> { + let key = (prefix.to_string(), word.to_string(), 
proximity.into()); + Ok(self.prefix_word_pair_proximity_docids.get(&key).cloned()) + } fn words_fst<'t>(&self) -> &'t fst::Set> { &self.words_fst @@ -708,6 +822,8 @@ pub mod test { let mut word_pair_proximity_docids = HashMap::new(); let mut word_prefix_pair_proximity_docids = HashMap::new(); + let mut prefix_word_pair_proximity_docids = HashMap::new(); + for (lword, lcandidates) in &word_docids { for (rword, rcandidates) in &word_docids { if lword == rword { @@ -740,15 +856,19 @@ pub mod test { let lposition = docid_words.iter().position(|w| w == lword).unwrap(); let rposition = docid_words.iter().position(|w| w.starts_with(pword)).unwrap(); - let key = if lposition < rposition { - (s(lword), s(pword), (rposition - lposition) as i32) + if lposition < rposition { + let key = (s(lword), s(pword), (rposition - lposition) as i32); + let docids = word_prefix_pair_proximity_docids + .entry(key) + .or_insert(RoaringBitmap::new()); + docids.push(candidate); } else { - (s(lword), s(pword), (lposition - rposition + 1) as i32) + let key = (s(lword), s(pword), (lposition - rposition) as i32); + let docids = prefix_word_pair_proximity_docids + .entry(key) + .or_insert(RoaringBitmap::new()); + docids.push(candidate); }; - let docids = word_prefix_pair_proximity_docids - .entry(key) - .or_insert(RoaringBitmap::new()); - docids.push(candidate); } } } @@ -766,6 +886,7 @@ pub mod test { exact_word_prefix_docids, word_pair_proximity_docids, word_prefix_pair_proximity_docids, + prefix_word_pair_proximity_docids, docid_words, } } From 830a7c0c7ab8a39f3444d767d802fdac86ea2c6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 15 Sep 2022 13:34:52 +0200 Subject: [PATCH 08/15] Use `resolve_phrase` function for exactness criteria as well --- milli/src/search/criteria/exactness.rs | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs index 
5327f13e4..d5b2ff0ee 100644 --- a/milli/src/search/criteria/exactness.rs +++ b/milli/src/search/criteria/exactness.rs @@ -7,7 +7,7 @@ use log::debug; use roaring::RoaringBitmap; use crate::search::criteria::{ - resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult, + resolve_phrase, resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult, }; use crate::search::query_tree::{Operation, PrimitiveQueryPart}; use crate::{absolute_from_relative_position, FieldId, Result}; @@ -226,20 +226,7 @@ fn resolve_state( } // compute intersection on pair of words with a proximity of 0. Phrase(phrase) => { - // TODO: use resolve_phrase here - let mut bitmaps = Vec::with_capacity(phrase.len().saturating_sub(1)); - for words in phrase.windows(2) { - if let [left, right] = words { - match ctx.word_pair_proximity_docids(left, right, 0)? { - Some(docids) => bitmaps.push(docids), - None => { - bitmaps.clear(); - break; - } - } - } - } - candidates |= intersection_of(bitmaps.iter().collect()); + candidates |= resolve_phrase(ctx, phrase)?; } } parts_candidates_array.push(candidates); From 178d00f93aebe3a688b32d548006d8e0e0d34393 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 19 Sep 2022 11:03:52 +0200 Subject: [PATCH 09/15] Cargo fmt --- milli/src/update/prefix_word_pairs/mod.rs | 11 ++++++++--- .../src/update/prefix_word_pairs/prefix_word.rs | 14 ++++++++------ .../src/update/prefix_word_pairs/word_prefix.rs | 17 ++++++++++------- 3 files changed, 26 insertions(+), 16 deletions(-) diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs index 63286f8da..1549acf40 100644 --- a/milli/src/update/prefix_word_pairs/mod.rs +++ b/milli/src/update/prefix_word_pairs/mod.rs @@ -1,7 +1,11 @@ +use std::borrow::Cow; +use std::collections::HashSet; +use std::io::BufReader; + +use heed::types::ByteSlice; + use super::index_documents::{merge_cbo_roaring_bitmaps, CursorClonableMmap}; use 
crate::{Index, Result}; -use heed::types::ByteSlice; -use std::{borrow::Cow, collections::HashSet, io::BufReader}; mod prefix_word; mod word_prefix; @@ -131,10 +135,11 @@ pub fn write_into_lmdb_database_without_merging( #[cfg(test)] mod tests { + use std::io::Cursor; + use crate::db_snap; use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use crate::index::tests::TempIndex; - use std::io::Cursor; fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec { let mut documents = Vec::new(); diff --git a/milli/src/update/prefix_word_pairs/prefix_word.rs b/milli/src/update/prefix_word_pairs/prefix_word.rs index 18f5bdc5a..0cd55c929 100644 --- a/milli/src/update/prefix_word_pairs/prefix_word.rs +++ b/milli/src/update/prefix_word_pairs/prefix_word.rs @@ -1,14 +1,16 @@ +use std::borrow::Cow; +use std::collections::{BTreeMap, HashSet}; + +use grenad::CompressionType; +use heed::types::ByteSlice; +use heed::BytesDecode; +use log::debug; + use crate::update::index_documents::{create_writer, CursorClonableMmap}; use crate::update::prefix_word_pairs::{ insert_into_database, write_into_lmdb_database_without_merging, }; use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec}; -use grenad::CompressionType; -use heed::types::ByteSlice; -use heed::BytesDecode; -use log::debug; -use std::borrow::Cow; -use std::collections::{BTreeMap, HashSet}; #[logging_timer::time] pub fn index_prefix_word_database( diff --git a/milli/src/update/prefix_word_pairs/word_prefix.rs b/milli/src/update/prefix_word_pairs/word_prefix.rs index bd1bea2a3..1c7a4fffe 100644 --- a/milli/src/update/prefix_word_pairs/word_prefix.rs +++ b/milli/src/update/prefix_word_pairs/word_prefix.rs @@ -163,17 +163,19 @@ Writer instead of the DB. At the end of the outer loop, we finally read from the grenad and insert its elements in the database. 
*/ +use std::borrow::Cow; +use std::collections::HashSet; + +use grenad::CompressionType; +use heed::types::ByteSlice; +use heed::BytesDecode; +use log::debug; + use crate::update::index_documents::{create_writer, CursorClonableMmap}; use crate::update::prefix_word_pairs::{ insert_into_database, write_into_lmdb_database_without_merging, }; use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec}; -use grenad::CompressionType; -use heed::types::ByteSlice; -use heed::BytesDecode; -use log::debug; -use std::borrow::Cow; -use std::collections::HashSet; #[logging_timer::time] pub fn index_word_prefix_database( @@ -562,9 +564,10 @@ impl PrefixTrieNode { } #[cfg(test)] mod tests { + use roaring::RoaringBitmap; + use super::*; use crate::{CboRoaringBitmapCodec, U8StrStrCodec}; - use roaring::RoaringBitmap; fn check_prefixes( trie: &PrefixTrieNode, From e6e76fbefecd0a727c0a1c32492604a65d6dc0de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 19 Sep 2022 15:59:05 +0200 Subject: [PATCH 10/15] Improve performance of resolve_phrase at the cost of some relevancy --- milli/src/search/criteria/mod.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index cefc071ee..234252ff2 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -427,12 +427,14 @@ pub fn resolve_query_tree( pub fn resolve_phrase(ctx: &dyn Context, phrase: &[String]) -> Result { let mut candidates = RoaringBitmap::new(); let mut first_iter = true; - let winsize = phrase.len().min(7); + let winsize = phrase.len().min(3); for win in phrase.windows(winsize) { // Get all the documents with the matching distance for each word pairs. 
let mut bitmaps = Vec::with_capacity(winsize.pow(2)); for (offset, s1) in win.iter().enumerate() { for (dist, s2) in win.iter().skip(offset + 1).enumerate() { + // TODO: add proximity between identical words to the word + // pair proximity database if s1 == s2 { continue; } From ab2f6f3aa4f489d4ea69f065a417a91b3efc6796 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 19 Sep 2022 16:22:07 +0200 Subject: [PATCH 11/15] Refine some details in word_prefix_pair_proximity indexing code --- .../update/prefix_word_pairs/prefix_word.rs | 33 +++++++++---------- .../update/prefix_word_pairs/word_prefix.rs | 3 +- 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/milli/src/update/prefix_word_pairs/prefix_word.rs b/milli/src/update/prefix_word_pairs/prefix_word.rs index 0cd55c929..8883cc451 100644 --- a/milli/src/update/prefix_word_pairs/prefix_word.rs +++ b/milli/src/update/prefix_word_pairs/prefix_word.rs @@ -35,9 +35,6 @@ pub fn index_prefix_word_database( .filter(|s| s.len() <= max_prefix_length) .collect(); - // If the prefix trie is not empty, then we can iterate over all new - // word pairs to look for new (word1, common_prefix, proximity) elements - // to insert in the DB for proximity in 1..=max_proximity - 1 { for prefix in common_prefixes.iter() { let mut prefix_key = vec![]; @@ -135,13 +132,11 @@ pub fn index_prefix_word_database( Ok(()) } -/// This is the core of the algorithm to initialise the Word Prefix Pair Proximity Docids database. +/// This is the core of the algorithm to initialise the Prefix Word Pair Proximity Docids database. /// -/// Its main arguments are: -/// 1. a sorted prefix iterator over ((word1, word2, proximity), docids) elements -/// 2. a closure to describe how to handle the new computed (word1, prefix, proximity) elements -/// -/// For more information about what this function does, read the module documentation. 
+/// Its arguments are: +/// - an iterator over the words following the given `prefix` with the given `proximity` +/// - a closure to describe how to handle the new computed (proximity, prefix, word2) elements fn execute_on_word_pairs_and_prefixes( proximity: u8, prefix: &[u8], @@ -151,28 +146,32 @@ fn execute_on_word_pairs_and_prefixes( ) -> Result<()> { let mut batch: BTreeMap, Vec>> = <_>::default(); - while let Some((word2, data)) = next_word2_and_docids(iter)? { + // Memory usage check: + // The content of the loop will be called for each `word2` that follows a word beginning + // with `prefix` with the given proximity. + // In practice, I don't think the batch can ever get too big. + while let Some((word2, docids)) = next_word2_and_docids(iter)? { let entry = batch.entry(word2.to_owned()).or_default(); - entry.push(Cow::Owned(data.to_owned())); + entry.push(Cow::Owned(docids.to_owned())); } - let mut key_buffer = Vec::with_capacity(8); + let mut key_buffer = Vec::with_capacity(512); key_buffer.push(proximity); key_buffer.extend_from_slice(prefix); key_buffer.push(0); let mut value_buffer = Vec::with_capacity(65_536); - for (key, values) in batch { + for (word2, docids) in batch { key_buffer.truncate(prefix.len() + 2); value_buffer.clear(); - key_buffer.extend_from_slice(&key); - let data = if values.len() > 1 { - CboRoaringBitmapCodec::merge_into(&values, &mut value_buffer)?; + key_buffer.extend_from_slice(&word2); + let data = if docids.len() > 1 { + CboRoaringBitmapCodec::merge_into(&docids, &mut value_buffer)?; value_buffer.as_slice() } else { - &values[0] + &docids[0] }; insert(key_buffer.as_slice(), data)?; } diff --git a/milli/src/update/prefix_word_pairs/word_prefix.rs b/milli/src/update/prefix_word_pairs/word_prefix.rs index 1c7a4fffe..eb0b05d89 100644 --- a/milli/src/update/prefix_word_pairs/word_prefix.rs +++ b/milli/src/update/prefix_word_pairs/word_prefix.rs @@ -1,5 +1,4 @@ /*! - ## What is WordPrefix? 
The word-prefix-pair-proximity-docids database is a database whose keys are of the form `(proximity, word, prefix)` and the values are roaring bitmaps of the documents which contain `word` followed by another word starting with @@ -320,7 +319,7 @@ fn execute_on_word_pairs_and_prefixes( let mut merge_buffer = Vec::with_capacity(65_536); while let Some(((proximity, word1, word2), data)) = next_word_pair_proximity(iter)? { - // skip this iteration if the proximity is over the threshold + // stop indexing if the proximity is over the threshold if proximity > max_proximity { break; }; From 176ffd23f554ea1535454b6392387b130652409a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 18 Oct 2022 10:40:26 +0200 Subject: [PATCH 12/15] Fix compile error after rebasing wppd-refactor --- milli/src/search/query_tree.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 080f89080..034b9123b 100755 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -203,7 +203,7 @@ impl<'a> Context for QueryTreeBuilder<'a> { right_word: &str, proximity: u8, ) -> heed::Result> { - let key = (left_word, right_word, proximity); + let key = (proximity, left_word, right_word); self.index .word_pair_proximity_docids .remap_data_type::() From a983129613d1f9340484e648536fcce4c4f4303d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 20 Oct 2022 09:49:37 +0200 Subject: [PATCH 13/15] Apply suggestions from code review --- milli/src/search/criteria/mod.rs | 5 ++-- milli/src/update/index_documents/mod.rs | 8 ++++++- milli/src/update/prefix_word_pairs/mod.rs | 24 +++++++++++++++++-- .../update/prefix_word_pairs/prefix_word.rs | 9 ++++--- .../update/prefix_word_pairs/word_prefix.rs | 7 ++++-- 5 files changed, 42 insertions(+), 11 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 234252ff2..4069306b3 
100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -448,9 +448,8 @@ pub fn resolve_phrase(ctx: &dyn Context, phrase: &[String]) -> Result bitmap |= m, - None => {} + if let Some(m) = ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? { + bitmap |= m } } if bitmap.is_empty() { diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 897f2f8f8..5550c8725 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -528,7 +528,13 @@ where if let Some(word_pair_proximity_docids) = word_pair_proximity_docids { // Run the word prefix pair proximity docids update operation. - PrefixWordPairsProximityDocids::new(self.wtxn, self.index).execute( + PrefixWordPairsProximityDocids::new( + self.wtxn, + self.index, + self.indexer_config.chunk_compression_type, + self.indexer_config.chunk_compression_level, + ) + .execute( word_pair_proximity_docids, &new_prefix_fst_words, &common_prefix_fst_words, diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs index 1549acf40..03abdbb6e 100644 --- a/milli/src/update/prefix_word_pairs/mod.rs +++ b/milli/src/update/prefix_word_pairs/mod.rs @@ -2,6 +2,7 @@ use std::borrow::Cow; use std::collections::HashSet; use std::io::BufReader; +use grenad::CompressionType; use heed::types::ByteSlice; use super::index_documents::{merge_cbo_roaring_bitmaps, CursorClonableMmap}; @@ -18,10 +19,24 @@ pub struct PrefixWordPairsProximityDocids<'t, 'u, 'i> { index: &'i Index, max_proximity: u8, max_prefix_length: usize, + chunk_compression_type: CompressionType, + chunk_compression_level: Option, } impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> { - pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> Self { - Self { wtxn, index, max_proximity: 4, max_prefix_length: 2 } + pub fn new( + wtxn: &'t mut heed::RwTxn<'i, 'u>, + index: &'i Index, + chunk_compression_type: 
CompressionType, + chunk_compression_level: Option, + ) -> Self { + Self { + wtxn, + index, + max_proximity: 4, + max_prefix_length: 2, + chunk_compression_type, + chunk_compression_level, + } } /// Set the maximum proximity required to make a prefix be part of the words prefixes /// database. If two words are too far from the threshold the associated documents will @@ -42,6 +57,7 @@ impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> { self.max_prefix_length = value; self } + #[logging_timer::time("WordPrefixPairProximityDocids::{}")] pub fn execute<'a>( self, @@ -60,6 +76,8 @@ impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> { new_prefix_fst_words, common_prefix_fst_words, del_prefix_fst_words, + self.chunk_compression_type, + self.chunk_compression_level, )?; index_prefix_word_database( @@ -72,6 +90,8 @@ impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> { new_prefix_fst_words, common_prefix_fst_words, del_prefix_fst_words, + self.chunk_compression_type, + self.chunk_compression_level, )?; Ok(()) diff --git a/milli/src/update/prefix_word_pairs/prefix_word.rs b/milli/src/update/prefix_word_pairs/prefix_word.rs index 8883cc451..9bc184825 100644 --- a/milli/src/update/prefix_word_pairs/prefix_word.rs +++ b/milli/src/update/prefix_word_pairs/prefix_word.rs @@ -23,6 +23,8 @@ pub fn index_prefix_word_database( new_prefix_fst_words: &[String], common_prefix_fst_words: &[&[String]], del_prefix_fst_words: &HashSet>, + chunk_compression_type: CompressionType, + chunk_compression_level: Option, ) -> Result<()> { let max_proximity = max_proximity - 1; debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); @@ -35,7 +37,7 @@ pub fn index_prefix_word_database( .filter(|s| s.len() <= max_prefix_length) .collect(); - for proximity in 1..=max_proximity - 1 { + for proximity in 1..max_proximity { for prefix in common_prefixes.iter() { let mut prefix_key = vec![]; prefix_key.push(proximity); @@ -78,7 +80,8 @@ pub fn 
index_prefix_word_database( // Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity) // element in an intermediary grenad - let mut writer = create_writer(CompressionType::None, None, tempfile::tempfile()?); + let mut writer = + create_writer(chunk_compression_type, chunk_compression_level, tempfile::tempfile()?); for proximity in 1..=max_proximity - 1 { for prefix in new_prefixes.iter() { @@ -144,7 +147,7 @@ fn execute_on_word_pairs_and_prefixes( mut next_word2_and_docids: impl for<'a> FnMut(&'a mut I) -> Result>, mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>, ) -> Result<()> { - let mut batch: BTreeMap, Vec>> = <_>::default(); + let mut batch: BTreeMap, Vec>> = BTreeMap::default(); // Memory usage check: // The content of the loop will be called for each `word2` that follows a word beginning diff --git a/milli/src/update/prefix_word_pairs/word_prefix.rs b/milli/src/update/prefix_word_pairs/word_prefix.rs index eb0b05d89..5895cdc46 100644 --- a/milli/src/update/prefix_word_pairs/word_prefix.rs +++ b/milli/src/update/prefix_word_pairs/word_prefix.rs @@ -187,6 +187,8 @@ pub fn index_word_prefix_database( new_prefix_fst_words: &[String], common_prefix_fst_words: &[&[String]], del_prefix_fst_words: &HashSet>, + chunk_compression_type: CompressionType, + chunk_compression_level: Option, ) -> Result<()> { debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk..."); @@ -249,7 +251,8 @@ pub fn index_word_prefix_database( // Since we read the DB, we can't write to it directly, so we add each new (proximity, word1, prefix) // element in an intermediary grenad - let mut writer = create_writer(CompressionType::None, None, tempfile::tempfile()?); + let mut writer = + create_writer(chunk_compression_type, chunk_compression_level, tempfile::tempfile()?); execute_on_word_pairs_and_prefixes( &mut db_iter, @@ -325,7 +328,7 @@ fn execute_on_word_pairs_and_prefixes( }; let 
word2_start_different_than_prev = word2[0] != prev_word2_start; // if there were no potential prefixes for the previous word2 based on its first letter, - // and if the current word2 starts with the s`ame letter, then there is also no potential + // and if the current word2 starts with the same letter, then there is also no potential // prefixes for the current word2, and we can skip to the next iteration if empty_prefixes && !word2_start_different_than_prev { continue; From be302fd25038ab92ed8c8e62ad59f3462962b76c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 24 Oct 2022 15:27:06 +0200 Subject: [PATCH 14/15] Remove outdated workaround for duplicate words in phrase search --- milli/src/search/criteria/mod.rs | 5 ----- 1 file changed, 5 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 4069306b3..3159afb9e 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -433,11 +433,6 @@ pub fn resolve_phrase(ctx: &dyn Context, phrase: &[String]) -> Result bitmaps.push(m), From 9a569d73d18427b65d009c7314f8234ba90bafc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 24 Oct 2022 15:30:43 +0200 Subject: [PATCH 15/15] Minor code style change --- milli/src/update/prefix_word_pairs/prefix_word.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/prefix_word_pairs/prefix_word.rs b/milli/src/update/prefix_word_pairs/prefix_word.rs index 9bc184825..26fe0105e 100644 --- a/milli/src/update/prefix_word_pairs/prefix_word.rs +++ b/milli/src/update/prefix_word_pairs/prefix_word.rs @@ -83,7 +83,7 @@ pub fn index_prefix_word_database( let mut writer = create_writer(chunk_compression_type, chunk_compression_level, tempfile::tempfile()?); - for proximity in 1..=max_proximity - 1 { + for proximity in 1..max_proximity { for prefix in new_prefixes.iter() { let mut prefix_key = vec![]; prefix_key.push(proximity);