723: Fix bug in handling of soft deleted documents when updating settings r=Kerollmops a=loiclec

# Pull Request

## Related issue
Fixes (partially, until merged into meilisearch) https://github.com/meilisearch/meilisearch/issues/3021

## What does this PR do?
This PR fixes the bug where a `missing key in documents database` internal error message could appear when indexing documents.

When updating the settings, before clearing the database and before creating the transform output, we now modify the `ExternalDocumentsIds` structure to get rid of all references to soft deleted document ids in its FSTs.

It used to be that updating the settings would clear the soft-deleted document ids, but keep the original `ExternalDocumentsIds` structure. As a consequence of this, when processing a future document addition, we could wrongly believe that a document was being replaced when, in fact, it was a completely new document. See the tests `bug_3021_first`, `bug_3021_second`, and `bug_3021` for a minimal test case that would have reproduced the issue.
 
We need to take special care to:
- evaluate how users should update to v0.30.1 (containing this fix): dump? reimporting all documents from scratch?
- understand IF/HOW this bug could have caused duplicate documents to be returned 
- and evaluate the correctness of the fix, of course :)


Co-authored-by: Loïc Lecrenier <loic.lecrenier@me.com>
This commit is contained in:
bors[bot] 2022-12-06 14:37:38 +00:00 committed by GitHub
commit 0a301b5f88
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 311 additions and 183 deletions

View File

@ -71,6 +71,30 @@ impl<'a> ExternalDocumentsIds<'a> {
self.merge_soft_into_hard()
}
/// Rebuild the internal FSTs in the ExternalDocumentsIds structure such that they
/// don't contain any soft deleted document id.
pub fn delete_soft_deleted_documents_ids_from_fsts(&mut self) -> fst::Result<()> {
let mut new_hard_builder = fst::MapBuilder::memory();
let union_op = self.hard.op().add(&self.soft).r#union();
let mut iter = union_op.into_stream();
while let Some((external_id, docids)) = iter.next() {
// prefer selecting the ids from soft, always
let id = indexed_last_value(docids).unwrap();
if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) {
new_hard_builder.insert(external_id, id)?;
}
}
drop(iter);
// Delete soft map completely
self.soft = fst::Map::default().map_data(Cow::Owned)?;
// We save the new map as the new hard map.
self.hard = new_hard_builder.into_map().map_data(Cow::Owned)?;
Ok(())
}
pub fn insert_ids<A: AsRef<[u8]>>(&mut self, other: &fst::Map<A>) -> fst::Result<()> {
let union_op = self.soft.op().add(other).r#union();

View File

@ -1185,13 +1185,15 @@ pub(crate) mod tests {
use big_s::S;
use heed::{EnvOpenOptions, RwTxn};
use maplit::hashset;
use tempfile::TempDir;
use crate::documents::DocumentsBatchReader;
use crate::error::{Error, InternalError};
use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
use crate::update::{
self, DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings,
self, DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod,
IndexerConfig, Settings,
};
use crate::{db_snap, obkv_to_json, Index};
@ -1485,7 +1487,7 @@ pub(crate) mod tests {
use big_s::S;
use maplit::hashset;
let mut index = TempIndex::new();
let index = TempIndex::new();
index
.update_settings(|settings| {
@ -1544,7 +1546,6 @@ pub(crate) mod tests {
1 0 3 1 [3, 6, ]
"###);
index.index_documents_config.disable_soft_deletion = false;
index
.add_documents(documents!([{ "id": 3, "doggo": 4 }, { "id": 3, "doggo": 5 },{ "id": 3, "doggo": 4 }]))
.unwrap();
@ -1568,7 +1569,6 @@ pub(crate) mod tests {
1 0 4 1 [7, ]
"###);
index.index_documents_config.disable_soft_deletion = false;
index
.update_settings(|settings| {
settings.set_distinct_field("id".to_owned());
@ -1578,12 +1578,11 @@ pub(crate) mod tests {
db_snap!(index, documents_ids, @"[4, 5, 6, 7, ]");
db_snap!(index, external_documents_ids, 3, @r###"
soft:
3 7
hard:
0 4
1 5
2 6
3 3
3 7
"###);
db_snap!(index, soft_deleted_documents_ids, 3, @"[]");
db_snap!(index, facet_id_f64_docids, 3, @r###"
@ -1596,163 +1595,6 @@ pub(crate) mod tests {
1 0 3 1 [6, ]
1 0 4 1 [7, ]
"###);
index.index_documents_config.disable_soft_deletion = true;
index.add_documents(documents!([{ "id": 3, "doggo": 4 }])).unwrap();
db_snap!(index, external_documents_ids, 3, @r###"
soft:
3 7
hard:
0 4
1 5
2 6
3 3
"###);
db_snap!(index, soft_deleted_documents_ids, 3, @"[]");
db_snap!(index, facet_id_f64_docids, 3, @r###"
0 0 0 1 [4, ]
0 0 1 1 [5, ]
0 0 2 1 [6, ]
0 0 3 1 [7, ]
1 0 1 1 [4, ]
1 0 2 1 [5, ]
1 0 3 1 [6, ]
1 0 4 1 [7, ]
"###);
let mut wtxn = index.write_txn().unwrap();
let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap();
let docid = delete.delete_external_id("3").unwrap();
insta::assert_snapshot!(format!("{docid}"), @"7");
delete.execute().unwrap();
wtxn.commit().unwrap();
db_snap!(index, documents_ids, @"[4, 5, 6, ]");
db_snap!(index, external_documents_ids, 4, @r###"
soft:
3 7
hard:
0 4
1 5
2 6
3 3
"###);
db_snap!(index, soft_deleted_documents_ids, 4, @"[7, ]");
db_snap!(index, facet_id_f64_docids, 4, @r###"
0 0 0 1 [4, ]
0 0 1 1 [5, ]
0 0 2 1 [6, ]
0 0 3 1 [7, ]
1 0 1 1 [4, ]
1 0 2 1 [5, ]
1 0 3 1 [6, ]
1 0 4 1 [7, ]
"###);
index.index_documents_config.disable_soft_deletion = false;
index.add_documents(documents!([{ "id": 3, "doggo": 4 }])).unwrap();
db_snap!(index, external_documents_ids, 4, @r###"
soft:
3 0
hard:
0 4
1 5
2 6
3 3
"###);
db_snap!(index, soft_deleted_documents_ids, 4, @"[7, ]");
db_snap!(index, facet_id_f64_docids, 4, @r###"
0 0 0 1 [4, ]
0 0 1 1 [5, ]
0 0 2 1 [6, ]
0 0 3 1 [0, 7, ]
1 0 1 1 [4, ]
1 0 2 1 [5, ]
1 0 3 1 [6, ]
1 0 4 1 [0, 7, ]
"###);
index.index_documents_config.disable_soft_deletion = false;
index.add_documents(documents!([{ "id": 3, "doggo": 5 }])).unwrap();
db_snap!(index, external_documents_ids, 4, @r###"
soft:
3 1
hard:
0 4
1 5
2 6
3 3
"###);
db_snap!(index, soft_deleted_documents_ids, 4, @"[0, 7, ]");
db_snap!(index, facet_id_f64_docids, 4, @r###"
0 0 0 1 [4, ]
0 0 1 1 [5, ]
0 0 2 1 [6, ]
0 0 3 1 [0, 1, 7, ]
1 0 1 1 [4, ]
1 0 2 1 [5, ]
1 0 3 1 [6, ]
1 0 4 1 [0, 7, ]
1 0 5 1 [1, ]
"###);
index.index_documents_config.disable_soft_deletion = false;
index.add_documents(documents!([{ "id": 3, "doggo": 5, "id": 2, "doggo": 4 }])).unwrap();
db_snap!(index, external_documents_ids, 4, @r###"
soft:
hard:
0 4
1 5
2 2
3 1
"###);
db_snap!(index, soft_deleted_documents_ids, 4, @"[0, 6, 7, ]");
db_snap!(index, facet_id_f64_docids, 4, @r###"
0 0 0 1 [4, ]
0 0 1 1 [5, ]
0 0 2 1 [2, 6, ]
0 0 3 1 [0, 1, 7, ]
1 0 1 1 [4, ]
1 0 2 1 [5, ]
1 0 3 1 [6, ]
1 0 4 1 [0, 2, 7, ]
1 0 5 1 [1, ]
"###);
index.index_documents_config.disable_soft_deletion = false;
index
.add_documents(documents!([{ "id": 4, "doggo": 5 }, { "id": 3, "doggo": 5 }]))
.unwrap();
db_snap!(index, external_documents_ids, 4, @r###"
soft:
4 3
hard:
0 4
1 5
2 2
3 1
"###);
db_snap!(index, soft_deleted_documents_ids, 4, @"[0, 6, 7, ]");
db_snap!(index, facet_id_f64_docids, 4, @r###"
0 0 0 1 [4, ]
0 0 1 1 [5, ]
0 0 2 1 [2, 6, ]
0 0 3 1 [0, 1, 7, ]
0 0 4 1 [3, ]
1 0 1 1 [4, ]
1 0 2 1 [5, ]
1 0 3 1 [6, ]
1 0 4 1 [0, 2, 7, ]
1 0 5 1 [1, 3, ]
"###);
}
#[test]
@ -2020,4 +1862,253 @@ pub(crate) mod tests {
drop(rtxn);
}
}
#[test]
fn bug_3021_first() {
// https://github.com/meilisearch/meilisearch/issues/3021
let mut index = TempIndex::new();
index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments;
index
.update_settings(|settings| {
settings.set_primary_key("primary_key".to_owned());
})
.unwrap();
index
.add_documents(documents!([
{ "primary_key": 38 },
{ "primary_key": 34 }
]))
.unwrap();
db_snap!(index, documents_ids, @"[0, 1, ]");
db_snap!(index, external_documents_ids, 1, @r###"
soft:
hard:
34 1
38 0
"###);
db_snap!(index, soft_deleted_documents_ids, 1, @"[]");
let mut wtxn = index.write_txn().unwrap();
let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap();
delete.delete_external_id("34");
delete.execute().unwrap();
wtxn.commit().unwrap();
db_snap!(index, documents_ids, @"[0, ]");
db_snap!(index, external_documents_ids, 2, @r###"
soft:
hard:
34 1
38 0
"###);
db_snap!(index, soft_deleted_documents_ids, 2, @"[1, ]");
index
.update_settings(|s| {
s.set_searchable_fields(vec![]);
})
.unwrap();
// The key point of the test is to verify that the external documents ids
// do not contain any entry for previously soft-deleted document ids
db_snap!(index, documents_ids, @"[0, ]");
db_snap!(index, external_documents_ids, 3, @r###"
soft:
hard:
38 0
"###);
db_snap!(index, soft_deleted_documents_ids, 3, @"[]");
// So that this document addition works correctly now.
// It would be wrongly interpreted as a replacement before
index.add_documents(documents!({ "primary_key": 34 })).unwrap();
db_snap!(index, documents_ids, @"[0, 1, ]");
db_snap!(index, external_documents_ids, 4, @r###"
soft:
hard:
34 1
38 0
"###);
db_snap!(index, soft_deleted_documents_ids, 4, @"[]");
// We do the test again, but deleting the document with id 0 instead of id 1 now
let mut wtxn = index.write_txn().unwrap();
let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap();
delete.delete_external_id("38");
delete.execute().unwrap();
wtxn.commit().unwrap();
db_snap!(index, documents_ids, @"[1, ]");
db_snap!(index, external_documents_ids, 5, @r###"
soft:
hard:
34 1
38 0
"###);
db_snap!(index, soft_deleted_documents_ids, 5, @"[0, ]");
index
.update_settings(|s| {
s.set_searchable_fields(vec!["primary_key".to_owned()]);
})
.unwrap();
db_snap!(index, documents_ids, @"[1, ]");
db_snap!(index, external_documents_ids, 6, @r###"
soft:
hard:
34 1
"###);
db_snap!(index, soft_deleted_documents_ids, 6, @"[]");
// And adding lots of documents afterwards instead of just one.
// These extra subtests don't add much, but it's better than nothing.
index.add_documents(documents!([{ "primary_key": 38 }, { "primary_key": 39 }, { "primary_key": 41 }, { "primary_key": 40 }, { "primary_key": 41 }, { "primary_key": 42 }])).unwrap();
db_snap!(index, documents_ids, @"[0, 1, 2, 3, 4, 5, ]");
db_snap!(index, external_documents_ids, 7, @r###"
soft:
hard:
34 1
38 0
39 2
40 4
41 3
42 5
"###);
db_snap!(index, soft_deleted_documents_ids, 7, @"[]");
}
#[test]
fn bug_3021_second() {
// https://github.com/meilisearch/meilisearch/issues/3021
let mut index = TempIndex::new();
index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
index
.update_settings(|settings| {
settings.set_primary_key("primary_key".to_owned());
})
.unwrap();
index
.add_documents(documents!([
{ "primary_key": 30 },
{ "primary_key": 34 }
]))
.unwrap();
db_snap!(index, documents_ids, @"[0, 1, ]");
db_snap!(index, external_documents_ids, 1, @r###"
soft:
hard:
30 0
34 1
"###);
db_snap!(index, soft_deleted_documents_ids, 1, @"[]");
let mut wtxn = index.write_txn().unwrap();
let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap();
delete.delete_external_id("34");
delete.execute().unwrap();
wtxn.commit().unwrap();
db_snap!(index, documents_ids, @"[0, ]");
db_snap!(index, external_documents_ids, 2, @r###"
soft:
hard:
30 0
34 1
"###);
db_snap!(index, soft_deleted_documents_ids, 2, @"[1, ]");
index
.update_settings(|s| {
s.set_searchable_fields(vec![]);
})
.unwrap();
// The key point of the test is to verify that the external documents ids
// do not contain any entry for previously soft-deleted document ids
db_snap!(index, documents_ids, @"[0, ]");
db_snap!(index, external_documents_ids, 3, @r###"
soft:
hard:
30 0
"###);
db_snap!(index, soft_deleted_documents_ids, 3, @"[]");
// So that when we add a new document
index.add_documents(documents!({ "primary_key": 35, "b": 2 })).unwrap();
db_snap!(index, documents_ids, @"[0, 1, ]");
// The external documents ids don't have several external ids pointing to the same
// internal document id
db_snap!(index, external_documents_ids, 4, @r###"
soft:
hard:
30 0
35 1
"###);
db_snap!(index, soft_deleted_documents_ids, 4, @"[]");
// And when we add 34 again, we don't replace document 35
index.add_documents(documents!({ "primary_key": 34, "a": 1 })).unwrap();
// And document 35 still exists, is not deleted
db_snap!(index, documents_ids, @"[0, 1, 2, ]");
db_snap!(index, external_documents_ids, 5, @r###"
soft:
hard:
30 0
34 2
35 1
"###);
db_snap!(index, soft_deleted_documents_ids, 5, @"[]");
let rtxn = index.read_txn().unwrap();
let (_docid, obkv) = index.documents(&rtxn, [0]).unwrap()[0];
let json = obkv_to_json(&[0, 1, 2], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap();
insta::assert_debug_snapshot!(json, @r###"
{
"primary_key": Number(30),
}
"###);
// Furthermore, when we retrieve document 34, it is not the result of merging 35 with 34
let (_docid, obkv) = index.documents(&rtxn, [2]).unwrap()[0];
let json = obkv_to_json(&[0, 1, 2], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap();
insta::assert_debug_snapshot!(json, @r###"
{
"primary_key": Number(34),
"a": Number(1),
}
"###);
drop(rtxn);
// Add new documents again
index
.add_documents(
documents!([{ "primary_key": 37 }, { "primary_key": 38 }, { "primary_key": 39 }]),
)
.unwrap();
db_snap!(index, documents_ids, @"[0, 1, 2, 3, 4, 5, ]");
db_snap!(index, external_documents_ids, 6, @r###"
soft:
hard:
30 0
34 2
35 1
37 3
38 4
39 5
"###);
db_snap!(index, soft_deleted_documents_ids, 6, @"[]");
}
}

View File

@ -17,7 +17,7 @@ use super::{IndexDocumentsMethod, IndexerConfig};
use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
use crate::error::{Error, InternalError, UserError};
use crate::index::db_name;
use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep};
use crate::{
ExternalDocumentsIds, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index,
Result, BEU32,
@ -546,12 +546,13 @@ impl<'a, 'i> Transform<'a, 'i> {
})
}
/// Returns a `TransformOutput` with a file that contains the documents of the index
/// with the attributes reordered accordingly to the `FieldsIdsMap` given as argument.
/// Clear all databases. Returns a `TransformOutput` with a file that contains the documents
/// of the index with the attributes reordered accordingly to the `FieldsIdsMap` given as argument.
///
// TODO this can be done in parallel by using the rayon `ThreadPool`.
pub fn remap_index_documents(
pub fn prepare_for_documents_reindexing(
self,
wtxn: &mut heed::RwTxn,
wtxn: &mut heed::RwTxn<'i, '_>,
old_fields_ids_map: FieldsIdsMap,
mut new_fields_ids_map: FieldsIdsMap,
) -> Result<TransformOutput> {
@ -559,7 +560,14 @@ impl<'a, 'i> Transform<'a, 'i> {
let primary_key =
self.index.primary_key(wtxn)?.ok_or(UserError::MissingPrimaryKey)?.to_string();
let field_distribution = self.index.field_distribution(wtxn)?;
let external_documents_ids = self.index.external_documents_ids(wtxn)?;
// Delete the soft deleted document ids from the maps inside the external_document_ids structure
let new_external_documents_ids = {
let mut external_documents_ids = self.index.external_documents_ids(wtxn)?;
external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?;
external_documents_ids
};
let documents_ids = self.index.documents_ids(wtxn)?;
let documents_count = documents_ids.len() as usize;
@ -638,17 +646,25 @@ impl<'a, 'i> Transform<'a, 'i> {
let mut flattened_documents = flattened_writer.into_inner()?;
flattened_documents.seek(SeekFrom::Start(0))?;
Ok(TransformOutput {
let output = TransformOutput {
primary_key,
fields_ids_map: new_fields_ids_map,
field_distribution,
external_documents_ids: external_documents_ids.into_static(),
external_documents_ids: new_external_documents_ids.into_static(),
new_documents_ids: documents_ids,
replaced_documents_ids: RoaringBitmap::default(),
documents_count,
original_documents,
flattened_documents,
})
};
let new_facets = output.compute_real_facets(wtxn, self.index)?;
self.index.put_faceted_fields(wtxn, &new_facets)?;
// We clear the full database (words-fst, documents ids and documents content).
ClearDocuments::new(wtxn, self.index).execute()?;
Ok(output)
}
}

View File

@ -12,7 +12,7 @@ use crate::criterion::Criterion;
use crate::error::UserError;
use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
use crate::update::index_documents::IndexDocumentsMethod;
use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep};
use crate::update::{IndexDocuments, UpdateIndexingStep};
use crate::{FieldsIdsMap, Index, Result};
#[derive(Debug, Clone, PartialEq, Eq, Copy)]
@ -291,15 +291,12 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
false,
)?;
// We remap the documents fields based on the new `FieldsIdsMap`.
let output =
transform.remap_index_documents(self.wtxn, old_fields_ids_map, fields_ids_map)?;
let new_facets = output.compute_real_facets(self.wtxn, self.index)?;
self.index.put_faceted_fields(self.wtxn, &new_facets)?;
// We clear the full database (words-fst, documents ids and documents content).
ClearDocuments::new(self.wtxn, self.index).execute()?;
// We clear the databases and remap the documents fields based on the new `FieldsIdsMap`.
let output = transform.prepare_for_documents_reindexing(
self.wtxn,
old_fields_ids_map,
fields_ids_map,
)?;
// We index the generated `TransformOutput` which must contain
// all the documents with fields in the newly defined searchable order.
@ -736,7 +733,7 @@ mod tests {
use super::*;
use crate::error::Error;
use crate::index::tests::TempIndex;
use crate::update::DeleteDocuments;
use crate::update::{ClearDocuments, DeleteDocuments};
use crate::{Criterion, Filter, SearchResult};
#[test]