From f7c8730d0984f3e19f6ed8e915a1abb9f453025e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 14 Nov 2022 15:19:00 +0100 Subject: [PATCH 1/3] Fix bug in prefix DB indexing Where the batch's information was not properly updated in cases where only the proximity changed between two consecutive word pair proximities. Closes https://github.com/meilisearch/meilisearch/issues/3043 --- milli/src/update/prefix_word_pairs/mod.rs | 47 +++++++++++++++++++ .../update/prefix_word_pairs/word_prefix.rs | 9 ++-- 2 files changed, 51 insertions(+), 5 deletions(-) diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs index 03abdbb6e..6030a82f2 100644 --- a/milli/src/update/prefix_word_pairs/mod.rs +++ b/milli/src/update/prefix_word_pairs/mod.rs @@ -238,4 +238,51 @@ mod tests { db_snap!(index, word_prefix_pair_proximity_docids, "update"); db_snap!(index, prefix_word_pair_proximity_docids, "update"); } + #[test] + fn test_batch_bug_3034() { + // https://github.com/meilisearch/meilisearch/issues/3043 + let mut index = TempIndex::new(); + index.index_documents_config.words_prefix_threshold = Some(50); + index.index_documents_config.autogenerate_docids = true; + + index + .update_settings(|settings| { + settings.set_searchable_fields(vec!["text".to_owned()]); + }) + .unwrap(); + + let batch_reader_from_documents = |documents| { + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + for object in documents { + builder.append_json_object(&object).unwrap(); + } + DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() + }; + + let mut documents = documents_with_enough_different_words_for_prefixes(&["y"]); + // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database + documents.push( + serde_json::json!({ + "text": "x y" + }) + .as_object() + .unwrap() + .clone(), + ); + documents.push( + serde_json::json!({ + "text": "x a y" + }) + .as_object() + .unwrap() + .clone(), + ); + + let documents = batch_reader_from_documents(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, word_pair_proximity_docids); + db_snap!(index, word_prefix_pair_proximity_docids); + db_snap!(index, prefix_word_pair_proximity_docids); + } } diff --git a/milli/src/update/prefix_word_pairs/word_prefix.rs b/milli/src/update/prefix_word_pairs/word_prefix.rs index 71a2a2915..db607e56c 100644 --- a/milli/src/update/prefix_word_pairs/word_prefix.rs +++ b/milli/src/update/prefix_word_pairs/word_prefix.rs @@ -44,7 +44,7 @@ word2 : doggo 2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are in the list of sorted prefixes. And we insert the key `prefix` and the value (`docids`) to a sorted map which we call the “batch”. For example, -at the end of the first inner loop, we may have: +at the end of the first outer loop, we may have: ```text Outer loop 1: ------------------------------ @@ -85,7 +85,7 @@ end of the batch. 4. On the third iteration of the outer loop, we have: ```text -Outer loop 4: +Outer loop 3: ------------------------------ proximity: 1 word1 : good @@ -340,17 +340,16 @@ fn execute_on_word_pairs_and_prefixes( if prox_different_than_prev || word1_different_than_prev || word2_start_different_than_prev { batch.flush(&mut merge_buffer, &mut insert)?; + batch.proximity = proximity; // don't forget to reset the value of batch.word1 and prev_word2_start if word1_different_than_prev { - prefix_search_start.0 = 0; batch.word1.clear(); batch.word1.extend_from_slice(word1); - batch.proximity = proximity; } if word2_start_different_than_prev { - // word2_start_different_than_prev == true prev_word2_start = word2[0]; } + prefix_search_start.0 = 0; // Optimisation: find the search start in the prefix trie to iterate over the prefixes of word2 empty_prefixes = !prefixes.set_search_start(word2, &mut prefix_search_start); } From f00108d2ec665681db49315c98f1b1c66c47e91b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 16 Nov 2022 12:12:49 +0100 Subject: [PATCH 2/3] Fix name of bug in reproduction test --- milli/src/update/prefix_word_pairs/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs index 6030a82f2..10ea850af 100644 --- a/milli/src/update/prefix_word_pairs/mod.rs +++ b/milli/src/update/prefix_word_pairs/mod.rs @@ -239,7 +239,7 @@ mod tests { db_snap!(index, prefix_word_pair_proximity_docids, "update"); } #[test] - fn test_batch_bug_3034() { + fn test_batch_bug_3043() { // https://github.com/meilisearch/meilisearch/issues/3043 let mut index = TempIndex::new(); index.index_documents_config.words_prefix_threshold = Some(50); From 777eb3fa006443b787cab58ce6705128b8219bac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 17 Nov 2022 12:21:27 +0100 Subject: [PATCH 3/3] Add insta-snaps for test of bug 3043 --- .../prefix_word_pair_proximity_docids.snap | 4 ++++ .../test_batch_bug_3043/word_pair_proximity_docids.snap | 8 ++++++++ .../word_prefix_pair_proximity_docids.snap | 7 +++++++ 3 files changed, 19 insertions(+) create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/prefix_word_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/word_pair_proximity_docids.snap create mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/word_prefix_pair_proximity_docids.snap diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/prefix_word_pair_proximity_docids.snap new file mode 100644 index 000000000..d212999bb --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/prefix_word_pair_proximity_docids.snap @@ -0,0 +1,4 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/word_pair_proximity_docids.snap new file mode 100644 index 000000000..816895dcf --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/word_pair_proximity_docids.snap @@ -0,0 +1,8 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a y [51, ] +1 x a [51, ] +1 x y [50, ] +2 x y [51, ] + diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/word_prefix_pair_proximity_docids.snap new file mode 100644 index 000000000..03530a2f1 --- /dev/null +++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/word_prefix_pair_proximity_docids.snap @@ -0,0 +1,7 @@ +--- +source: milli/src/update/prefix_word_pairs/mod.rs +--- +1 a y [51, ] +1 x y [50, ] +2 x y [51, ] +