From f7c8730d0984f3e19f6ed8e915a1abb9f453025e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 14 Nov 2022 15:19:00 +0100 Subject: [PATCH] Fix bug in prefix DB indexing Where the batch's information was not properly updated in cases where only the proximity changed between two consecutive word pair proximities. Closes https://github.com/meilisearch/meilisearch/issues/3043 --- milli/src/update/prefix_word_pairs/mod.rs | 47 +++++++++++++++++++ .../update/prefix_word_pairs/word_prefix.rs | 9 ++-- 2 files changed, 51 insertions(+), 5 deletions(-) diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs index 03abdbb6e..6030a82f2 100644 --- a/milli/src/update/prefix_word_pairs/mod.rs +++ b/milli/src/update/prefix_word_pairs/mod.rs @@ -238,4 +238,51 @@ mod tests { db_snap!(index, word_prefix_pair_proximity_docids, "update"); db_snap!(index, prefix_word_pair_proximity_docids, "update"); } + #[test] + fn test_batch_bug_3043() { + // https://github.com/meilisearch/meilisearch/issues/3043 + let mut index = TempIndex::new(); + index.index_documents_config.words_prefix_threshold = Some(50); + index.index_documents_config.autogenerate_docids = true; + + index + .update_settings(|settings| { + settings.set_searchable_fields(vec!["text".to_owned()]); + }) + .unwrap(); + + let batch_reader_from_documents = |documents| { + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + for object in documents { + builder.append_json_object(&object).unwrap(); + } + DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap() + }; + + let mut documents = documents_with_enough_different_words_for_prefixes(&["y"]); + // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database + documents.push( + serde_json::json!({ + "text": "x y" + }) + .as_object() + .unwrap() + .clone(), + ); + documents.push( + serde_json::json!({ + "text": "x a y" + }) + 
.as_object() + .unwrap() + .clone(), + ); + + let documents = batch_reader_from_documents(documents); + index.add_documents(documents).unwrap(); + + db_snap!(index, word_pair_proximity_docids); + db_snap!(index, word_prefix_pair_proximity_docids); + db_snap!(index, prefix_word_pair_proximity_docids); + } } diff --git a/milli/src/update/prefix_word_pairs/word_prefix.rs b/milli/src/update/prefix_word_pairs/word_prefix.rs index 71a2a2915..db607e56c 100644 --- a/milli/src/update/prefix_word_pairs/word_prefix.rs +++ b/milli/src/update/prefix_word_pairs/word_prefix.rs @@ -44,7 +44,7 @@ word2 : doggo 2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are in the list of sorted prefixes. And we insert the key `prefix` and the value (`docids`) to a sorted map which we call the “batch”. For example, -at the end of the first inner loop, we may have: +at the end of the first outer loop, we may have: ```text Outer loop 1: ------------------------------ @@ -85,7 +85,7 @@ end of the batch. 4. On the third iteration of the outer loop, we have: ```text -Outer loop 4: +Outer loop 3: ------------------------------ proximity: 1 word1 : good @@ -340,17 +340,16 @@ fn execute_on_word_pairs_and_prefixes( if prox_different_than_prev || word1_different_than_prev || word2_start_different_than_prev { batch.flush(&mut merge_buffer, &mut insert)?; + batch.proximity = proximity; // don't forget to reset the value of batch.word1 and prev_word2_start if word1_different_than_prev { - prefix_search_start.0 = 0; batch.word1.clear(); batch.word1.extend_from_slice(word1); - batch.proximity = proximity; } if word2_start_different_than_prev { - // word2_start_different_than_prev == true prev_word2_start = word2[0]; } + prefix_search_start.0 = 0; // Optimisation: find the search start in the prefix trie to iterate over the prefixes of word2 empty_prefixes = !prefixes.set_search_start(word2, &mut prefix_search_start); }