Fix bug in prefix DB indexing

The batch's information was not properly updated when only the proximity changed between two consecutive word pair proximity entries. Closes https://github.com/meilisearch/meilisearch/issues/3043
2024-11-30 09:04:59 +08:00 · 2022-11-14 15:19:00 +01:00 · 2022-11-14 15:19:00 +01:00 · f7c8730d09
commit f7c8730d09
parent a651397afc
2 changed files with 51 additions and 5 deletions
--- a/milli/src/update/prefix_word_pairs/mod.rs
+++ b/milli/src/update/prefix_word_pairs/mod.rs
@ -238,4 +238,51 @@ mod tests {
        db_snap!(index, word_prefix_pair_proximity_docids, "update");
        db_snap!(index, prefix_word_pair_proximity_docids, "update");
    }
    /// Regression test for https://github.com/meilisearch/meilisearch/issues/3043
    ///
    /// Indexes two documents ("x y" and "x a y") on top of a set of filler
    /// documents generated for the "y" prefix, then snapshots the three
    /// word-pair proximity databases.
    ///
    /// NOTE(review): the function name says 3034 while the linked issue is
    /// #3043; the name is left untouched because the snapshot files may be
    /// keyed by it — verify before renaming.
    #[test]
    fn test_batch_bug_3034() {
        let mut index = TempIndex::new();
        index.index_documents_config.words_prefix_threshold = Some(50);
        index.index_documents_config.autogenerate_docids = true;
        index
            .update_settings(|settings| {
                settings.set_searchable_fields(vec!["text".to_owned()]);
            })
            .unwrap();

        let mut documents = documents_with_enough_different_words_for_prefixes(&["y"]);

        // These documents should populate the word_prefix_pair_proximity_docids
        // database: the same word pair appears in both texts, with a different
        // number of words in between.
        for text in ["x y", "x a y"] {
            let document = serde_json::json!({ "text": text });
            documents.push(document.as_object().unwrap().clone());
        }

        // Serialize every JSON object into a single in-memory documents batch.
        let mut builder = DocumentsBatchBuilder::new(Vec::new());
        for object in documents {
            builder.append_json_object(&object).unwrap();
        }
        let payload = builder.into_inner().unwrap();
        let reader = DocumentsBatchReader::from_reader(Cursor::new(payload)).unwrap();

        index.add_documents(reader).unwrap();

        db_snap!(index, word_pair_proximity_docids);
        db_snap!(index, word_prefix_pair_proximity_docids);
        db_snap!(index, prefix_word_pair_proximity_docids);
    }
 }
--- a/milli/src/update/prefix_word_pairs/word_prefix.rs
+++ b/milli/src/update/prefix_word_pairs/word_prefix.rs
@ -44,7 +44,7 @@ word2    : doggo
 2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are
 in the list of sorted prefixes. And we insert the key `prefix`
 and the value (`docids`) to a sorted map which we call the “batch”. For example,
-at the end of the first inner loop, we may have:
+at the end of the first outer loop, we may have:
 ```text
 Outer loop 1:
 ------------------------------
@ -85,7 +85,7 @@ end of the batch.
 4. On the third iteration of the outer loop, we have:
 ```text
-Outer loop 4:
+Outer loop 3:
 ------------------------------
 proximity: 1
 word1    : good
@ -340,17 +340,16 @@ fn execute_on_word_pairs_and_prefixes<I>(
        if prox_different_than_prev || word1_different_than_prev || word2_start_different_than_prev
        {
            batch.flush(&mut merge_buffer, &mut insert)?;
            batch.proximity = proximity;
            // don't forget to reset the value of batch.word1 and prev_word2_start
            if word1_different_than_prev {
                prefix_search_start.0 = 0;
                batch.word1.clear();
                batch.word1.extend_from_slice(word1);
                batch.proximity = proximity;
            }
            if word2_start_different_than_prev {
                // word2_start_different_than_prev == true
                prev_word2_start = word2[0];
            }
            prefix_search_start.0 = 0;
            // Optimisation: find the search start in the prefix trie to iterate over the prefixes of word2
            empty_prefixes = !prefixes.set_search_start(word2, &mut prefix_search_start);
        }