From 67d8cec20903a37c339fc1b460a32a672932dfcd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 6 Dec 2022 11:38:15 +0100 Subject: [PATCH] Fix bug in handling of soft deleted documents when updating settings --- milli/src/external_documents_ids.rs | 24 + milli/src/index.rs | 417 +++++++++++------- milli/src/update/index_documents/transform.rs | 34 +- milli/src/update/settings.rs | 17 +- 4 files changed, 310 insertions(+), 182 deletions(-) diff --git a/milli/src/external_documents_ids.rs b/milli/src/external_documents_ids.rs index 6029722af..64b294541 100644 --- a/milli/src/external_documents_ids.rs +++ b/milli/src/external_documents_ids.rs @@ -71,6 +71,30 @@ impl<'a> ExternalDocumentsIds<'a> { self.merge_soft_into_hard() } + /// Rebuild the internal FSTs in the ExternalDocumentsIds structure such that they + /// don't contain any soft deleted document id. + pub fn delete_soft_deleted_documents_ids_from_fsts(&mut self) -> fst::Result<()> { + let mut new_hard_builder = fst::MapBuilder::memory(); + + let union_op = self.hard.op().add(&self.soft).r#union(); + let mut iter = union_op.into_stream(); + while let Some((external_id, docids)) = iter.next() { + // prefer selecting the ids from soft, always + let id = indexed_last_value(docids).unwrap(); + if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) { + new_hard_builder.insert(external_id, id)?; + } + } + drop(iter); + + // Delete soft map completely + self.soft = fst::Map::default().map_data(Cow::Owned)?; + // We save the new map as the new hard map. + self.hard = new_hard_builder.into_map().map_data(Cow::Owned)?; + + Ok(()) + } + pub fn insert_ids>(&mut self, other: &fst::Map) -> fst::Result<()> { let union_op = self.soft.op().add(other).r#union(); diff --git a/milli/src/index.rs b/milli/src/index.rs index a98247b6e..e9d66a3ae 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1185,13 +1185,15 @@ pub(crate) mod tests { use big_s::S; use heed::{EnvOpenOptions, RwTxn}; + use maplit::hashset; use tempfile::TempDir; use crate::documents::DocumentsBatchReader; use crate::error::{Error, InternalError}; use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; use crate::update::{ - self, DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings, + self, DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, + IndexerConfig, Settings, }; use crate::{db_snap, obkv_to_json, Index}; @@ -1485,7 +1487,7 @@ pub(crate) mod tests { use big_s::S; use maplit::hashset; - let mut index = TempIndex::new(); + let index = TempIndex::new(); index .update_settings(|settings| { @@ -1544,7 +1546,6 @@ pub(crate) mod tests { 1 0 3 1 [3, 6, ] "###); - index.index_documents_config.disable_soft_deletion = false; index .add_documents(documents!([{ "id": 3, "doggo": 4 }, { "id": 3, "doggo": 5 },{ "id": 3, "doggo": 4 }])) .unwrap(); @@ -1568,7 +1569,6 @@ pub(crate) mod tests { 1 0 4 1 [7, ] "###); - index.index_documents_config.disable_soft_deletion = false; index .update_settings(|settings| { settings.set_distinct_field("id".to_owned()); @@ -1576,37 +1576,13 @@ pub(crate) mod tests { .unwrap(); db_snap!(index, documents_ids, @"[4, 5, 6, 7, ]"); - db_snap!(index, external_documents_ids, 3, @r###" - soft: - 3 7 - hard: - 0 4 - 1 5 - 2 6 - 3 3 - "###); - db_snap!(index, soft_deleted_documents_ids, 3, @"[]"); - db_snap!(index, facet_id_f64_docids, 3, @r###" - 0 0 0 1 [4, ] - 0 0 1 1 [5, ] - 0 0 2 1 [6, ] - 0 0 3 1 [7, ] - 1 0 1 1 [4, ] - 1 0 2 1 [5, ] - 1 0 3 1 [6, ] - 1 0 4 1 [7, ] - "###); - - index.index_documents_config.disable_soft_deletion = true; - index.add_documents(documents!([{ "id": 3, "doggo": 4 }])).unwrap(); db_snap!(index, external_documents_ids, 3, @r###" soft: - 3 7 hard: 0 4 1 5 2 6 - 3 3 + 3 7 "###); db_snap!(index, soft_deleted_documents_ids, 3, @"[]"); db_snap!(index, facet_id_f64_docids, 3, @r###" @@ -1619,140 +1595,6 @@ pub(crate) mod tests { 1 0 3 1 [6, ] 1 0 4 1 [7, ] "###); - - let mut wtxn = index.write_txn().unwrap(); - let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); - let docid = delete.delete_external_id("3").unwrap(); - insta::assert_snapshot!(format!("{docid}"), @"7"); - delete.execute().unwrap(); - wtxn.commit().unwrap(); - - db_snap!(index, documents_ids, @"[4, 5, 6, ]"); - db_snap!(index, external_documents_ids, 4, @r###" - soft: - 3 7 - hard: - 0 4 - 1 5 - 2 6 - 3 3 - "###); - - db_snap!(index, soft_deleted_documents_ids, 4, @"[7, ]"); - db_snap!(index, facet_id_f64_docids, 4, @r###" - 0 0 0 1 [4, ] - 0 0 1 1 [5, ] - 0 0 2 1 [6, ] - 0 0 3 1 [7, ] - 1 0 1 1 [4, ] - 1 0 2 1 [5, ] - 1 0 3 1 [6, ] - 1 0 4 1 [7, ] - "###); - - index.index_documents_config.disable_soft_deletion = false; - index.add_documents(documents!([{ "id": 3, "doggo": 4 }])).unwrap(); - - db_snap!(index, external_documents_ids, 4, @r###" - soft: - 3 0 - hard: - 0 4 - 1 5 - 2 6 - 3 3 - "###); - - db_snap!(index, soft_deleted_documents_ids, 4, @"[7, ]"); - db_snap!(index, facet_id_f64_docids, 4, @r###" - 0 0 0 1 [4, ] - 0 0 1 1 [5, ] - 0 0 2 1 [6, ] - 0 0 3 1 [0, 7, ] - 1 0 1 1 [4, ] - 1 0 2 1 [5, ] - 1 0 3 1 [6, ] - 1 0 4 1 [0, 7, ] - "###); - - index.index_documents_config.disable_soft_deletion = false; - index.add_documents(documents!([{ "id": 3, "doggo": 5 }])).unwrap(); - - db_snap!(index, external_documents_ids, 4, @r###" - soft: - 3 1 - hard: - 0 4 - 1 5 - 2 6 - 3 3 - "###); - - db_snap!(index, soft_deleted_documents_ids, 4, @"[0, 7, ]"); - db_snap!(index, facet_id_f64_docids, 4, @r###" - 0 0 0 1 [4, ] - 0 0 1 1 [5, ] - 0 0 2 1 [6, ] - 0 0 3 1 [0, 1, 7, ] - 1 0 1 1 [4, ] - 1 0 2 1 [5, ] - 1 0 3 1 [6, ] - 1 0 4 1 [0, 7, ] - 1 0 5 1 [1, ] - "###); - - index.index_documents_config.disable_soft_deletion = false; - index.add_documents(documents!([{ "id": 3, "doggo": 5, "id": 2, "doggo": 4 }])).unwrap(); - db_snap!(index, external_documents_ids, 4, @r###" - soft: - hard: - 0 4 - 1 5 - 2 2 - 3 1 - "###); - - db_snap!(index, soft_deleted_documents_ids, 4, @"[0, 6, 7, ]"); - db_snap!(index, facet_id_f64_docids, 4, @r###" - 0 0 0 1 [4, ] - 0 0 1 1 [5, ] - 0 0 2 1 [2, 6, ] - 0 0 3 1 [0, 1, 7, ] - 1 0 1 1 [4, ] - 1 0 2 1 [5, ] - 1 0 3 1 [6, ] - 1 0 4 1 [0, 2, 7, ] - 1 0 5 1 [1, ] - "###); - - index.index_documents_config.disable_soft_deletion = false; - index - .add_documents(documents!([{ "id": 4, "doggo": 5 }, { "id": 3, "doggo": 5 }])) - .unwrap(); - - db_snap!(index, external_documents_ids, 4, @r###" - soft: - 4 3 - hard: - 0 4 - 1 5 - 2 2 - 3 1 - "###); - - db_snap!(index, soft_deleted_documents_ids, 4, @"[0, 6, 7, ]"); - db_snap!(index, facet_id_f64_docids, 4, @r###" - 0 0 0 1 [4, ] - 0 0 1 1 [5, ] - 0 0 2 1 [2, 6, ] - 0 0 3 1 [0, 1, 7, ] - 0 0 4 1 [3, ] - 1 0 1 1 [4, ] - 1 0 2 1 [5, ] - 1 0 3 1 [6, ] - 1 0 4 1 [0, 2, 7, ] - 1 0 5 1 [1, 3, ] - "###); } #[test] @@ -2020,4 +1862,253 @@ pub(crate) mod tests { drop(rtxn); } } + + #[test] + fn bug_3021_first() { + // https://github.com/meilisearch/meilisearch/issues/3021 + let mut index = TempIndex::new(); + index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments; + + index + .update_settings(|settings| { + settings.set_primary_key("primary_key".to_owned()); + }) + .unwrap(); + + index + .add_documents(documents!([ + { "primary_key": 38 }, + { "primary_key": 34 } + ])) + .unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, ]"); + db_snap!(index, external_documents_ids, 1, @r###" + soft: + hard: + 34 1 + 38 0 + "###); + db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); + + let mut wtxn = index.write_txn().unwrap(); + let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + delete.delete_external_id("34"); + delete.execute().unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, documents_ids, @"[0, ]"); + db_snap!(index, external_documents_ids, 2, @r###" + soft: + hard: + 34 1 + 38 0 + "###); + db_snap!(index, soft_deleted_documents_ids, 2, @"[1, ]"); + + index + .update_settings(|s| { + s.set_searchable_fields(vec![]); + }) + .unwrap(); + + // The key point of the test is to verify that the external documents ids + // do not contain any entry for previously soft-deleted document ids + db_snap!(index, documents_ids, @"[0, ]"); + db_snap!(index, external_documents_ids, 3, @r###" + soft: + hard: + 38 0 + "###); + db_snap!(index, soft_deleted_documents_ids, 3, @"[]"); + + // So that this document addition works correctly now. + // It would be wrongly interpreted as a replacement before + index.add_documents(documents!({ "primary_key": 34 })).unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, ]"); + db_snap!(index, external_documents_ids, 4, @r###" + soft: + hard: + 34 1 + 38 0 + "###); + db_snap!(index, soft_deleted_documents_ids, 4, @"[]"); + + // We do the test again, but deleting the document with id 0 instead of id 1 now + let mut wtxn = index.write_txn().unwrap(); + let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + delete.delete_external_id("38"); + delete.execute().unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, documents_ids, @"[1, ]"); + db_snap!(index, external_documents_ids, 5, @r###" + soft: + hard: + 34 1 + 38 0 + "###); + db_snap!(index, soft_deleted_documents_ids, 5, @"[0, ]"); + + index + .update_settings(|s| { + s.set_searchable_fields(vec!["primary_key".to_owned()]); + }) + .unwrap(); + + db_snap!(index, documents_ids, @"[1, ]"); + db_snap!(index, external_documents_ids, 6, @r###" + soft: + hard: + 34 1 + "###); + db_snap!(index, soft_deleted_documents_ids, 6, @"[]"); + + // And adding lots of documents afterwards instead of just one. + // These extra subtests don't add much, but it's better than nothing. + index.add_documents(documents!([{ "primary_key": 38 }, { "primary_key": 39 }, { "primary_key": 41 }, { "primary_key": 40 }, { "primary_key": 41 }, { "primary_key": 42 }])).unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, 2, 3, 4, 5, ]"); + db_snap!(index, external_documents_ids, 7, @r###" + soft: + hard: + 34 1 + 38 0 + 39 2 + 40 4 + 41 3 + 42 5 + "###); + db_snap!(index, soft_deleted_documents_ids, 7, @"[]"); + } + + #[test] + fn bug_3021_second() { + // https://github.com/meilisearch/meilisearch/issues/3021 + let mut index = TempIndex::new(); + index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments; + + index + .update_settings(|settings| { + settings.set_primary_key("primary_key".to_owned()); + }) + .unwrap(); + + index + .add_documents(documents!([ + { "primary_key": 30 }, + { "primary_key": 34 } + ])) + .unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, ]"); + db_snap!(index, external_documents_ids, 1, @r###" + soft: + hard: + 30 0 + 34 1 + "###); + db_snap!(index, soft_deleted_documents_ids, 1, @"[]"); + + let mut wtxn = index.write_txn().unwrap(); + let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + delete.delete_external_id("34"); + delete.execute().unwrap(); + wtxn.commit().unwrap(); + + db_snap!(index, documents_ids, @"[0, ]"); + db_snap!(index, external_documents_ids, 2, @r###" + soft: + hard: + 30 0 + 34 1 + "###); + db_snap!(index, soft_deleted_documents_ids, 2, @"[1, ]"); + + index + .update_settings(|s| { + s.set_searchable_fields(vec![]); + }) + .unwrap(); + + // The key point of the test is to verify that the external documents ids + // do not contain any entry for previously soft-deleted document ids + db_snap!(index, documents_ids, @"[0, ]"); + db_snap!(index, external_documents_ids, 3, @r###" + soft: + hard: + 30 0 + "###); + db_snap!(index, soft_deleted_documents_ids, 3, @"[]"); + + // So that when we add a new document + index.add_documents(documents!({ "primary_key": 35, "b": 2 })).unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, ]"); + // The external documents ids don't have several external ids pointing to the same + // internal document id + db_snap!(index, external_documents_ids, 4, @r###" + soft: + hard: + 30 0 + 35 1 + "###); + db_snap!(index, soft_deleted_documents_ids, 4, @"[]"); + + // And when we add 34 again, we don't replace document 35 + index.add_documents(documents!({ "primary_key": 34, "a": 1 })).unwrap(); + + // And document 35 still exists, is not deleted + db_snap!(index, documents_ids, @"[0, 1, 2, ]"); + db_snap!(index, external_documents_ids, 5, @r###" + soft: + hard: + 30 0 + 34 2 + 35 1 + "###); + db_snap!(index, soft_deleted_documents_ids, 5, @"[]"); + + let rtxn = index.read_txn().unwrap(); + let (_docid, obkv) = index.documents(&rtxn, [0]).unwrap()[0]; + let json = obkv_to_json(&[0, 1, 2], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "primary_key": Number(30), + } + "###); + + // Furthermore, when we retrieve document 34, it is not the result of merging 35 with 34 + let (_docid, obkv) = index.documents(&rtxn, [2]).unwrap()[0]; + let json = obkv_to_json(&[0, 1, 2], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap(); + insta::assert_debug_snapshot!(json, @r###" + { + "primary_key": Number(34), + "a": Number(1), + } + "###); + + drop(rtxn); + + // Add new documents again + index + .add_documents( + documents!([{ "primary_key": 37 }, { "primary_key": 38 }, { "primary_key": 39 }]), + ) + .unwrap(); + + db_snap!(index, documents_ids, @"[0, 1, 2, 3, 4, 5, ]"); + db_snap!(index, external_documents_ids, 6, @r###" + soft: + hard: + 30 0 + 34 2 + 35 1 + 37 3 + 38 4 + 39 5 + "###); + db_snap!(index, soft_deleted_documents_ids, 6, @"[]"); + } } diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 57aa02e04..f414569b9 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -17,7 +17,7 @@ use super::{IndexDocumentsMethod, IndexerConfig}; use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::index::db_name; -use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; +use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep}; use crate::{ ExternalDocumentsIds, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, BEU32, @@ -546,12 +546,13 @@ impl<'a, 'i> Transform<'a, 'i> { }) } - /// Returns a `TransformOutput` with a file that contains the documents of the index - /// with the attributes reordered accordingly to the `FieldsIdsMap` given as argument. + /// Clear all databases. Returns a `TransformOutput` with a file that contains the documents + /// of the index with the attributes reordered accordingly to the `FieldsIdsMap` given as argument. + /// // TODO this can be done in parallel by using the rayon `ThreadPool`. - pub fn remap_index_documents( + pub fn prepare_for_documents_reindexing( self, - wtxn: &mut heed::RwTxn, + wtxn: &mut heed::RwTxn<'i, '_>, old_fields_ids_map: FieldsIdsMap, mut new_fields_ids_map: FieldsIdsMap, ) -> Result { @@ -559,7 +560,14 @@ impl<'a, 'i> Transform<'a, 'i> { let primary_key = self.index.primary_key(wtxn)?.ok_or(UserError::MissingPrimaryKey)?.to_string(); let field_distribution = self.index.field_distribution(wtxn)?; - let external_documents_ids = self.index.external_documents_ids(wtxn)?; + + // Delete the soft deleted document ids from the maps inside the external_document_ids structure + let new_external_documents_ids = { + let mut external_documents_ids = self.index.external_documents_ids(wtxn)?; + external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?; + external_documents_ids + }; + let documents_ids = self.index.documents_ids(wtxn)?; let documents_count = documents_ids.len() as usize; @@ -638,17 +646,25 @@ impl<'a, 'i> Transform<'a, 'i> { let mut flattened_documents = flattened_writer.into_inner()?; flattened_documents.seek(SeekFrom::Start(0))?; - Ok(TransformOutput { + let output = TransformOutput { primary_key, fields_ids_map: new_fields_ids_map, field_distribution, - external_documents_ids: external_documents_ids.into_static(), + external_documents_ids: new_external_documents_ids.into_static(), new_documents_ids: documents_ids, replaced_documents_ids: RoaringBitmap::default(), documents_count, original_documents, flattened_documents, - }) + }; + + let new_facets = output.compute_real_facets(wtxn, self.index)?; + self.index.put_faceted_fields(wtxn, &new_facets)?; + + // We clear the full database (words-fst, documents ids and documents content). + ClearDocuments::new(wtxn, self.index).execute()?; + + Ok(output) } } diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index fc7e6bc03..b66893ee3 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -12,7 +12,7 @@ use crate::criterion::Criterion; use crate::error::UserError; use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; use crate::update::index_documents::IndexDocumentsMethod; -use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep}; +use crate::update::{IndexDocuments, UpdateIndexingStep}; use crate::{FieldsIdsMap, Index, Result}; #[derive(Debug, Clone, PartialEq, Eq, Copy)] @@ -291,15 +291,12 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { false, )?; - // We remap the documents fields based on the new `FieldsIdsMap`. - let output = - transform.remap_index_documents(self.wtxn, old_fields_ids_map, fields_ids_map)?; - - let new_facets = output.compute_real_facets(self.wtxn, self.index)?; - self.index.put_faceted_fields(self.wtxn, &new_facets)?; - - // We clear the full database (words-fst, documents ids and documents content). - ClearDocuments::new(self.wtxn, self.index).execute()?; + // We clear the databases and remap the documents fields based on the new `FieldsIdsMap`. + let output = transform.prepare_for_documents_reindexing( + self.wtxn, + old_fields_ids_map, + fields_ids_map, + )?; // We index the generated `TransformOutput` which must contain // all the documents with fields in the newly defined searchable order.