734: Fix bug 2945/3021 (missing key in documents database) r=Kerollmops a=loiclec

# Pull Request

## Related issue
Fixes (partially, until merged into meilisearch) https://github.com/meilisearch/meilisearch/issues/2945 (until we integrate the new milli bump into meilisearch).

**Note that a dump will not be sufficient to upgrade from meilisearch v0.30.2 to meilisearch v0.30.3 due to this fix** because the bug could have caused the `documents` database to be corrupted. Instead, a full manual reimport of the documents will be necessary.

## What does this PR do?
There was a bug happening when:
1. A few documents are added to the index
2. Some of these documents are soft-deleted
3. New documents are added, replacing existing ones and triggering a hard-deletion

The `IndexDocuments::execute` method would then perform the hard-deletion but forget to change the `external_document_ids` structure appropriately. As a result, the `external_document_ids` would contain keys corresponding to documents that do no exist anymore.

To fix this bug, I split the `DeleteDocuments::execute` method into two: `execute_inner` and `execute`. 
- `execute_inner` returns a `DetailedDocumentDeletionResult` which says whether soft-deletion was used or not
- `execute` keeps the exact same signature and behaviour

Then, when deleting replaced documents inside `IndexDocuments::execute`, we call `DeleteDocuments::execute_inner` instead of `DeleteDocuments::execute`. If soft-deletion was used, nothing more is done. But if hard-deletion was used, we remove every reference to soft-deleted documents in the new `external_documents_ids` structure.

## Correctness

- Every other test still passes
- The reproduction test case now passes
- In a different branch ([`update-fuzz-test`](https://github.com/meilisearch/milli/pull/735)), I created a fuzz-test that reproduces the past two bugs. This fuzz test cannot find this bug through any combination of some hand-selected `DocumentAddition / DocumentDeletion / DocumentClear / SettingsUpdate` operations. In that test, each relevant operations can be executed with or without soft-deletion, and document additions can be done in batches, replacing or updating existing documents.



Co-authored-by: Loïc Lecrenier <loic.lecrenier@me.com>
This commit is contained in:
bors[bot] 2022-12-13 09:45:57 +00:00 committed by GitHub
commit e0a8f8cb5a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 114 additions and 8 deletions

View File

@ -2111,4 +2111,80 @@ pub(crate) mod tests {
"###);
db_snap!(index, soft_deleted_documents_ids, 6, @"[]");
}
#[test]
fn bug_3021_third() {
// https://github.com/meilisearch/meilisearch/issues/3021
let mut index = TempIndex::new();
index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
index
.update_settings(|settings| {
settings.set_primary_key("primary_key".to_owned());
})
.unwrap();
index
.add_documents(documents!([
{ "primary_key": 3 },
{ "primary_key": 4 },
{ "primary_key": 5 }
]))
.unwrap();
db_snap!(index, documents_ids, @"[0, 1, 2, ]");
db_snap!(index, external_documents_ids, 1, @r###"
soft:
hard:
3 0
4 1
5 2
"###);
db_snap!(index, soft_deleted_documents_ids, 1, @"[]");
let mut wtxn = index.write_txn().unwrap();
let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap();
delete.delete_external_id("3");
delete.execute().unwrap();
wtxn.commit().unwrap();
db_snap!(index, documents_ids, @"[1, 2, ]");
db_snap!(index, external_documents_ids, 2, @r###"
soft:
hard:
3 0
4 1
5 2
"###);
db_snap!(index, soft_deleted_documents_ids, 2, @"[0, ]");
index.index_documents_config.disable_soft_deletion = true;
index.add_documents(documents!([{ "primary_key": "4", "a": 2 }])).unwrap();
db_snap!(index, documents_ids, @"[2, 3, ]");
db_snap!(index, external_documents_ids, 2, @r###"
soft:
hard:
4 3
5 2
"###);
db_snap!(index, soft_deleted_documents_ids, 2, @"[]");
index
.add_documents(documents!([
{ "primary_key": "3" },
]))
.unwrap();
db_snap!(index, documents_ids, @"[0, 2, 3, ]");
db_snap!(index, external_documents_ids, 2, @r###"
soft:
hard:
3 0
4 3
5 2
"###);
db_snap!(index, soft_deleted_documents_ids, 2, @"[]");
}
}

View File

@ -29,12 +29,24 @@ pub struct DeleteDocuments<'t, 'u, 'i> {
disable_soft_deletion: bool,
}
/// Result of a [`DeleteDocuments`] operation.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct DocumentDeletionResult {
pub deleted_documents: u64,
pub remaining_documents: u64,
}
/// Result of a [`DeleteDocuments`] operation, used for internal purposes.
///
/// It is a superset of the [`DocumentDeletionResult`] structure, giving
/// additional information about the algorithm used to delete the documents.
#[derive(Debug)]
pub(crate) struct DetailedDocumentDeletionResult {
pub deleted_documents: u64,
pub remaining_documents: u64,
pub soft_deletion_used: bool,
}
impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
pub fn new(
wtxn: &'t mut heed::RwTxn<'i, 'u>,
@ -68,8 +80,16 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
self.delete_document(docid);
Some(docid)
}
pub fn execute(self) -> Result<DocumentDeletionResult> {
let DetailedDocumentDeletionResult {
deleted_documents,
remaining_documents,
soft_deletion_used: _,
} = self.execute_inner()?;
pub fn execute(mut self) -> Result<DocumentDeletionResult> {
Ok(DocumentDeletionResult { deleted_documents, remaining_documents })
}
pub(crate) fn execute_inner(mut self) -> Result<DetailedDocumentDeletionResult> {
self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?;
// We retrieve the current documents ids that are in the database.
@ -83,7 +103,11 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
if !soft_deleted_docids.is_empty() {
ClearDocuments::new(self.wtxn, self.index).execute()?;
}
return Ok(DocumentDeletionResult { deleted_documents: 0, remaining_documents: 0 });
return Ok(DetailedDocumentDeletionResult {
deleted_documents: 0,
remaining_documents: 0,
soft_deletion_used: false,
});
}
// We remove the documents ids that we want to delete
@ -95,9 +119,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
// to delete is exactly the number of documents in the database.
if current_documents_ids_len == self.to_delete_docids.len() {
let remaining_documents = ClearDocuments::new(self.wtxn, self.index).execute()?;
return Ok(DocumentDeletionResult {
return Ok(DetailedDocumentDeletionResult {
deleted_documents: current_documents_ids_len,
remaining_documents,
soft_deletion_used: false,
});
}
@ -159,9 +184,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
&& percentage_used_by_soft_deleted_documents < 10
{
self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?;
return Ok(DocumentDeletionResult {
return Ok(DetailedDocumentDeletionResult {
deleted_documents: self.to_delete_docids.len(),
remaining_documents: documents_ids.len(),
soft_deletion_used: true,
});
}
@ -488,9 +514,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
&self.to_delete_docids,
)?;
Ok(DocumentDeletionResult {
Ok(DetailedDocumentDeletionResult {
deleted_documents: self.to_delete_docids.len(),
remaining_documents: documents_ids.len(),
soft_deletion_used: false,
})
}
}

View File

@ -210,7 +210,7 @@ where
primary_key,
fields_ids_map,
field_distribution,
external_documents_ids,
mut external_documents_ids,
new_documents_ids,
replaced_documents_ids,
documents_count,
@ -335,8 +335,11 @@ where
deletion_builder.disable_soft_deletion(self.config.disable_soft_deletion);
debug!("documents to delete {:?}", replaced_documents_ids);
deletion_builder.delete_documents(&replaced_documents_ids);
let deleted_documents_count = deletion_builder.execute()?;
debug!("{} documents actually deleted", deleted_documents_count.deleted_documents);
let deleted_documents_result = deletion_builder.execute_inner()?;
debug!("{} documents actually deleted", deleted_documents_result.deleted_documents);
if !deleted_documents_result.soft_deletion_used {
external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?;
}
}
let index_documents_ids = self.index.documents_ids(self.wtxn)?;