From f7c8730d0984f3e19f6ed8e915a1abb9f453025e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= <loic.lecrenier@me.com>
Date: Mon, 14 Nov 2022 15:19:00 +0100
Subject: [PATCH] Fix bug in prefix DB indexing

Where the batch's information was not properly updated in cases
where only the proximity changed between two consecutive word pair
proximities.

Closes https://github.com/meilisearch/meilisearch/issues/3043
---
 milli/src/update/prefix_word_pairs/mod.rs     | 47 +++++++++++++++++++
 .../update/prefix_word_pairs/word_prefix.rs   |  9 ++--
 2 files changed, 51 insertions(+), 5 deletions(-)
diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs
index 03abdbb6e..6030a82f2 100644
--- a/milli/src/update/prefix_word_pairs/mod.rs
+++ b/milli/src/update/prefix_word_pairs/mod.rs
@@ -238,4 +238,51 @@ mod tests {
         db_snap!(index, word_prefix_pair_proximity_docids, "update");
         db_snap!(index, prefix_word_pair_proximity_docids, "update");
     }
+    #[test]
+    fn test_batch_bug_3034() {
+        // https://github.com/meilisearch/meilisearch/issues/3043
+        let mut index = TempIndex::new();
+        index.index_documents_config.words_prefix_threshold = Some(50);
+        index.index_documents_config.autogenerate_docids = true;
+
+        index
+            .update_settings(|settings| {
+                settings.set_searchable_fields(vec!["text".to_owned()]);
+            })
+            .unwrap();
+
+        let batch_reader_from_documents = |documents| {
+            let mut builder = DocumentsBatchBuilder::new(Vec::new());
+            for object in documents {
+                builder.append_json_object(&object).unwrap();
+            }
+            DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
+        };
+
+        let mut documents = documents_with_enough_different_words_for_prefixes(&["y"]);
+        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
+        documents.push(
+            serde_json::json!({
+                "text": "x y"
+            })
+            .as_object()
+            .unwrap()
+            .clone(),
+        );
+        documents.push(
+            serde_json::json!({
+                "text": "x a y"
+            })
+            .as_object()
+            .unwrap()
+            .clone(),
+        );
+
+        let documents = batch_reader_from_documents(documents);
+        index.add_documents(documents).unwrap();
+
+        db_snap!(index, word_pair_proximity_docids);
+        db_snap!(index, word_prefix_pair_proximity_docids);
+        db_snap!(index, prefix_word_pair_proximity_docids);
+    }
 }
diff --git a/milli/src/update/prefix_word_pairs/word_prefix.rs b/milli/src/update/prefix_word_pairs/word_prefix.rs
index 71a2a2915..db607e56c 100644
--- a/milli/src/update/prefix_word_pairs/word_prefix.rs
+++ b/milli/src/update/prefix_word_pairs/word_prefix.rs
@@ -44,7 +44,7 @@ word2    : doggo
 2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are
 in the list of sorted prefixes. And we insert the key `prefix`
 and the value (`docids`) to a sorted map which we call the “batch”. For example,
-at the end of the first inner loop, we may have:
+at the end of the first outer loop, we may have:
 ```text
 Outer loop 1:
 ------------------------------
@@ -85,7 +85,7 @@ end of the batch.
 
 4. On the third iteration of the outer loop, we have:
 ```text
-Outer loop 4:
+Outer loop 3:
 ------------------------------
 proximity: 1
 word1    : good
@@ -340,17 +340,16 @@ fn execute_on_word_pairs_and_prefixes<I>(
         if prox_different_than_prev || word1_different_than_prev || word2_start_different_than_prev
         {
             batch.flush(&mut merge_buffer, &mut insert)?;
+            batch.proximity = proximity;
             // don't forget to reset the value of batch.word1 and prev_word2_start
             if word1_different_than_prev {
-                prefix_search_start.0 = 0;
                 batch.word1.clear();
                 batch.word1.extend_from_slice(word1);
-                batch.proximity = proximity;
             }
             if word2_start_different_than_prev {
-                // word2_start_different_than_prev == true
                 prev_word2_start = word2[0];
             }
+            prefix_search_start.0 = 0;
             // Optimisation: find the search start in the prefix trie to iterate over the prefixes of word2
             empty_prefixes = !prefixes.set_search_start(word2, &mut prefix_search_start);
         }