697: Fix bug in prefix DB indexing r=loiclec a=loiclec

Fixes a bug where the batch's information was not properly updated when only the proximity changed between two consecutive word pair proximity entries.

Closes partially https://github.com/meilisearch/meilisearch/issues/3043



Co-authored-by: Loïc Lecrenier <loic.lecrenier@me.com>
This commit is contained in:
bors[bot] 2022-11-17 15:22:01 +00:00 committed by GitHub
commit 57c9f03e51
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 70 additions and 5 deletions

View File

@ -238,4 +238,51 @@ mod tests {
db_snap!(index, word_prefix_pair_proximity_docids, "update");
db_snap!(index, prefix_word_pair_proximity_docids, "update");
}
#[test]
fn test_batch_bug_3043() {
    // Regression test for https://github.com/meilisearch/meilisearch/issues/3043
    //
    // The two hand-written documents below contain the same word pair
    // (`x`, `y`) at two different distances, so consecutive entries for that
    // pair differ only by their proximity.
    let mut index = TempIndex::new();
    index.index_documents_config.words_prefix_threshold = Some(50);
    index.index_documents_config.autogenerate_docids = true;
    index
        .update_settings(|settings| {
            settings.set_searchable_fields(vec!["text".to_owned()]);
        })
        .unwrap();

    // NOTE(review): presumably this helper generates enough distinct words
    // starting with "y" for it to be treated as a prefix — confirm against
    // its definition.
    let mut documents = documents_with_enough_different_words_for_prefixes(&["y"]);

    // Now add the documents whose text should populate the
    // word_prefix_pair_proximity_docids database. Order matters: the
    // auto-generated document ids follow insertion order.
    for text in ["x y", "x a y"] {
        let document = serde_json::json!({ "text": text });
        documents.push(document.as_object().unwrap().clone());
    }

    // Serialise every JSON object into an in-memory documents batch.
    let mut builder = DocumentsBatchBuilder::new(Vec::new());
    for object in documents {
        builder.append_json_object(&object).unwrap();
    }
    let payload = builder.into_inner().unwrap();
    let reader = DocumentsBatchReader::from_reader(Cursor::new(payload)).unwrap();

    index.add_documents(reader).unwrap();

    // Snapshot the three proximity databases touched by the indexing step.
    db_snap!(index, word_pair_proximity_docids);
    db_snap!(index, word_prefix_pair_proximity_docids);
    db_snap!(index, prefix_word_pair_proximity_docids);
}
}

View File

@ -0,0 +1,4 @@
---
source: milli/src/update/prefix_word_pairs/mod.rs
---

View File

@ -0,0 +1,8 @@
---
source: milli/src/update/prefix_word_pairs/mod.rs
---
1 a y [51, ]
1 x a [51, ]
1 x y [50, ]
2 x y [51, ]

View File

@ -0,0 +1,7 @@
---
source: milli/src/update/prefix_word_pairs/mod.rs
---
1 a y [51, ]
1 x y [50, ]
2 x y [51, ]

View File

@ -44,7 +44,7 @@ word2 : doggo
2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are
in the list of sorted prefixes. And we insert the key `prefix`
and the value (`docids`) to a sorted map which we call the batch. For example,
at the end of the first inner loop, we may have:
at the end of the first outer loop, we may have:
```text
Outer loop 1:
------------------------------
@ -85,7 +85,7 @@ end of the batch.
4. On the third iteration of the outer loop, we have:
```text
Outer loop 4:
Outer loop 3:
------------------------------
proximity: 1
word1 : good
@ -340,17 +340,16 @@ fn execute_on_word_pairs_and_prefixes<I>(
if prox_different_than_prev || word1_different_than_prev || word2_start_different_than_prev
{
batch.flush(&mut merge_buffer, &mut insert)?;
batch.proximity = proximity;
// don't forget to reset the value of batch.word1 and prev_word2_start
if word1_different_than_prev {
prefix_search_start.0 = 0;
batch.word1.clear();
batch.word1.extend_from_slice(word1);
batch.proximity = proximity;
}
if word2_start_different_than_prev {
// word2_start_different_than_prev == true
prev_word2_start = word2[0];
}
prefix_search_start.0 = 0;
// Optimisation: find the search start in the prefix trie to iterate over the prefixes of word2
empty_prefixes = !prefixes.set_search_start(word2, &mut prefix_search_start);
}