Merge #697

697: Fix bug in prefix DB indexing r=loiclec a=loiclec Fixes a bug where the batch's information was not properly updated when only the proximity changed between two consecutive word pair proximities. Partially closes https://github.com/meilisearch/meilisearch/issues/3043 Co-authored-by: Loïc Lecrenier <loic.lecrenier@me.com>
2024-11-23 02:27:40 +08:00 · 2022-11-17 15:22:01 +00:00 · 2022-11-17 15:22:01 +00:00 · 57c9f03e51
commit 57c9f03e51
parent 467e742bd1 777eb3fa00
5 changed files with 70 additions and 5 deletions
--- a/milli/src/update/prefix_word_pairs/mod.rs
+++ b/milli/src/update/prefix_word_pairs/mod.rs
@ -238,4 +238,51 @@ mod tests {
        db_snap!(index, word_prefix_pair_proximity_docids, "update");
        db_snap!(index, prefix_word_pair_proximity_docids, "update");
    }
    /// Regression test for https://github.com/meilisearch/meilisearch/issues/3043
    ///
    /// Indexes a set of documents that makes "y" a prefix, plus two documents
    /// ("x y" and "x a y") in which the pair (x, y) appears at two different
    /// proximities, then snapshots the three pair-proximity databases.
    #[test]
    fn test_batch_bug_3043() {
        let mut index = TempIndex::new();
        index.index_documents_config.autogenerate_docids = true;
        index.index_documents_config.words_prefix_threshold = Some(50);
        index
            .update_settings(|settings| {
                settings.set_searchable_fields(vec!["text".to_owned()]);
            })
            .unwrap();

        // Serialises a collection of JSON objects into an in-memory documents batch
        // and wraps it in a reader that `add_documents` accepts.
        let to_batch_reader = |objects| {
            let mut batch = DocumentsBatchBuilder::new(Vec::new());
            for object in objects {
                batch.append_json_object(&object).unwrap();
            }
            let bytes = batch.into_inner().unwrap();
            DocumentsBatchReader::from_reader(Cursor::new(bytes)).unwrap()
        };

        let mut docs = documents_with_enough_different_words_for_prefixes(&["y"]);
        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
        for text in ["x y", "x a y"] {
            let doc = serde_json::json!({ "text": text });
            docs.push(doc.as_object().unwrap().clone());
        }

        index.add_documents(to_batch_reader(docs)).unwrap();

        db_snap!(index, word_pair_proximity_docids);
        db_snap!(index, word_prefix_pair_proximity_docids);
        db_snap!(index, prefix_word_pair_proximity_docids);
    }
 }
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/prefix_word_pair_proximity_docids.snap
+++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/prefix_word_pair_proximity_docids.snap
@ -0,0 +1,4 @@
 ---
 source: milli/src/update/prefix_word_pairs/mod.rs
 ---
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/word_pair_proximity_docids.snap
+++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/word_pair_proximity_docids.snap
@ -0,0 +1,8 @@
 ---
 source: milli/src/update/prefix_word_pairs/mod.rs
 ---
 1  a                y                [51, ]
 1  x                a                [51, ]
 1  x                y                [50, ]
 2  x                y                [51, ]
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/word_prefix_pair_proximity_docids.snap
+++ b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/test_batch_bug_3043/word_prefix_pair_proximity_docids.snap
@ -0,0 +1,7 @@
 ---
 source: milli/src/update/prefix_word_pairs/mod.rs
 ---
 1  a                y    [51, ]
 1  x                y    [50, ]
 2  x                y    [51, ]
--- a/milli/src/update/prefix_word_pairs/word_prefix.rs
+++ b/milli/src/update/prefix_word_pairs/word_prefix.rs
@ -44,7 +44,7 @@ word2    : doggo
 2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are
 in the list of sorted prefixes. And we insert the key `prefix`
 and the value (`docids`) to a sorted map which we call the “batch”. For example,
-at the end of the first inner loop, we may have:
+at the end of the first outer loop, we may have:
 ```text
 Outer loop 1:
 ------------------------------
@ -85,7 +85,7 @@ end of the batch.
 4. On the third iteration of the outer loop, we have:
 ```text
-Outer loop 4:
+Outer loop 3:
 ------------------------------
 proximity: 1
 word1    : good
@ -340,17 +340,16 @@ fn execute_on_word_pairs_and_prefixes<I>(
        if prox_different_than_prev || word1_different_than_prev || word2_start_different_than_prev
        {
            batch.flush(&mut merge_buffer, &mut insert)?;
            batch.proximity = proximity;
            // don't forget to reset the value of batch.word1 and prev_word2_start
            if word1_different_than_prev {
                prefix_search_start.0 = 0;
                batch.word1.clear();
                batch.word1.extend_from_slice(word1);
                batch.proximity = proximity;
            }
            if word2_start_different_than_prev {
                // word2_start_different_than_prev == true
                prev_word2_start = word2[0];
            }
            prefix_search_start.0 = 0;
            // Optimisation: find the search start in the prefix trie to iterate over the prefixes of word2
            empty_prefixes = !prefixes.set_search_start(word2, &mut prefix_search_start);
        }