Merge #697

697: Fix bug in prefix DB indexing r=loiclec a=loiclec

Fixes a bug where the batch's information was not properly updated in cases where only the proximity changed between two consecutive word-pair-proximity entries.

Closes partially https://github.com/meilisearch/meilisearch/issues/3043

Co-authored-by: Loïc Lecrenier <loic.lecrenier@me.com>
2025-03-03 12:24:40 +08:00 · 2022-11-17 15:22:01 +00:00 · 2022-11-17 15:22:01 +00:00 · 57c9f03e51
commit 57c9f03e51
parent 467e742bd1 777eb3fa00
5 changed files with 70 additions and 5 deletions
--- a/milli/src/update/prefix_word_pairs/mod.rs
+++ b/milli/src/update/prefix_word_pairs/mod.rs
@@ -238,4 +238,51 @@ mod tests {
        db_snap!(index, word_prefix_pair_proximity_docids, "update");
        db_snap!(index, prefix_word_pair_proximity_docids, "update");
    }
+    #[test]
+    fn test_batch_bug_3043() {
+        // https://github.com/meilisearch/meilisearch/issues/3043
+        let mut index = TempIndex::new();
+        index.index_documents_config.words_prefix_threshold = Some(50);
+        index.index_documents_config.autogenerate_docids = true;
+
+        index
+            .update_settings(|settings| {
+                settings.set_searchable_fields(vec!["text".to_owned()]);
+            })
+            .unwrap();
+
+        let batch_reader_from_documents = |documents| {
+            let mut builder = DocumentsBatchBuilder::new(Vec::new());
+            for object in documents {
+                builder.append_json_object(&object).unwrap();
+            }
+            DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
+        };
+
+        let mut documents = documents_with_enough_different_words_for_prefixes(&["y"]);
+        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
+        documents.push(
+            serde_json::json!({
+                "text": "x y"
+            })
+            .as_object()
+            .unwrap()
+            .clone(),
+        );
+        documents.push(
+            serde_json::json!({
+                "text": "x a y"
+            })
+            .as_object()
+            .unwrap()
+            .clone(),
+        );
+
+        let documents = batch_reader_from_documents(documents);
+        index.add_documents(documents).unwrap();
+
+        db_snap!(index, word_pair_proximity_docids);
+        db_snap!(index, word_prefix_pair_proximity_docids);
+        db_snap!(index, prefix_word_pair_proximity_docids);
+    }
 }
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/prefix_word_pair_proximity_docids.snap
+++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/prefix_word_pair_proximity_docids.snap
@@ -0,0 +1,4 @@
+---
+source: milli/src/update/prefix_word_pairs/mod.rs
+---
+
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/word_pair_proximity_docids.snap
+++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/word_pair_proximity_docids.snap
@@ -0,0 +1,8 @@
+---
+source: milli/src/update/prefix_word_pairs/mod.rs
+---
+1  a                y                [51, ]
+1  x                a                [51, ]
+1  x                y                [50, ]
+2  x                y                [51, ]
+
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/word_prefix_pair_proximity_docids.snap
+++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/word_prefix_pair_proximity_docids.snap
@@ -0,0 +1,7 @@
+---
+source: milli/src/update/prefix_word_pairs/mod.rs
+---
+1  a                y    [51, ]
+1  x                y    [50, ]
+2  x                y    [51, ]
+
--- a/milli/src/update/prefix_word_pairs/word_prefix.rs
+++ b/milli/src/update/prefix_word_pairs/word_prefix.rs
@@ -44,7 +44,7 @@ word2    : doggo
 2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are
 in the list of sorted prefixes. And we insert the key `prefix`
 and the value (`docids`) to a sorted map which we call the “batch”. For example,
-at the end of the first inner loop, we may have:
+at the end of the first outer loop, we may have:
 ```text
 Outer loop 1:
 ------------------------------
@@ -85,7 +85,7 @@ end of the batch.

 4. On the third iteration of the outer loop, we have:
 ```text
-Outer loop 4:
+Outer loop 3:
 ------------------------------
 proximity: 1
 word1    : good
@@ -340,17 +340,16 @@ fn execute_on_word_pairs_and_prefixes<I>(
        if prox_different_than_prev || word1_different_than_prev || word2_start_different_than_prev
        {
            batch.flush(&mut merge_buffer, &mut insert)?;
+            batch.proximity = proximity;
            // don't forget to reset the value of batch.word1 and prev_word2_start
            if word1_different_than_prev {
-                prefix_search_start.0 = 0;
                batch.word1.clear();
                batch.word1.extend_from_slice(word1);
-                batch.proximity = proximity;
            }
            if word2_start_different_than_prev {
-                // word2_start_different_than_prev == true
                prev_word2_start = word2[0];
            }
+            prefix_search_start.0 = 0;
            // Optimisation: find the search start in the prefix trie to iterate over the prefixes of word2
            empty_prefixes = !prefixes.set_search_start(word2, &mut prefix_search_start);
        }