From 946c762d289f4ca468f243226ca2a61f718599ec Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 24 Oct 2023 14:26:49 +0200 Subject: [PATCH] WIP: reset documents in TypedChunk::Documents --- milli/src/update/index_documents/mod.rs | 17 +++--------- .../src/update/index_documents/typed_chunk.rs | 26 +++++++++++++++---- 2 files changed, 25 insertions(+), 18 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 27021c3fb..d1fa28826 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -35,7 +35,7 @@ use crate::documents::{obkv_to_object, DocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; pub use crate::update::index_documents::helpers::CursorClonableMmap; use crate::update::{ - self, DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, + DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst, }; use crate::{CboRoaringBitmapCodec, Index, Result}; @@ -374,17 +374,6 @@ where drop(lmdb_writer_sx) }); - // We delete the documents that this document addition replaces. This way we are - // able to simply insert all the documents even if they already exist in the database. 
- if !replaced_documents_ids.is_empty() { - let mut deletion_builder = update::DeleteDocuments::new(self.wtxn, self.index)?; - deletion_builder.strategy(self.config.deletion_strategy); - debug!("documents to delete {:?}", replaced_documents_ids); - deletion_builder.delete_documents(&replaced_documents_ids); - let deleted_documents_result = deletion_builder.execute_inner()?; - debug!("{} documents actually deleted", deleted_documents_result.deleted_documents); - } - let index_documents_ids = self.index.documents_ids(self.wtxn)?; let index_is_empty = index_documents_ids.is_empty(); let mut final_documents_ids = RoaringBitmap::new(); @@ -437,6 +426,7 @@ where otherwise => otherwise, }; + // FIXME: return newly added as well as newly deleted documents let (docids, is_merged_database) = write_typed_chunk_into_index(typed_chunk, self.index, self.wtxn, index_is_empty)?; if !docids.is_empty() { @@ -472,8 +462,9 @@ where let external_documents_ids = external_documents_ids.into_static(); self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?; + // FIXME: remove `new_documents_ids` entirely and `replaced_documents_ids` let all_documents_ids = index_documents_ids | new_documents_ids; - self.index.put_documents_ids(self.wtxn, &all_documents_ids)?; + //self.index.put_documents_ids(self.wtxn, &all_documents_ids)?; // TODO: reactivate prefix DB with diff-indexing // self.execute_prefix_databases( diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index aebfca151..39537cce7 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -118,22 +118,38 @@ pub(crate) fn write_typed_chunk_into_index( let mut is_merged_database = false; match typed_chunk { TypedChunk::Documents(obkv_documents_iter) => { + let mut docids = index.documents_ids(wtxn)?; + let mut cursor = obkv_documents_iter.into_cursor()?; while let Some((docid, reader)) = 
cursor.move_on_next()? { let mut writer: KvWriter<_, FieldId> = KvWriter::memory(); let reader: KvReader<FieldId> = KvReader::new(reader); + let mut written = false; for (field_id, value) in reader.iter() { let Some(value) = KvReaderDelAdd::new(value).get(DelAdd::Addition) else { continue; }; + // TODO: writer.is_empty + written = true; writer.insert(field_id, value)?; } - index.documents.remap_types::<ByteSlice, ByteSlice>().put( - wtxn, - docid, - &writer.into_inner().unwrap(), - )?; + + let db = index.documents.remap_data_type::<ByteSlice>(); + let docid = docid.try_into().map(DocumentId::from_be_bytes).unwrap(); + + if written { + db.put(wtxn, &BEU32::new(docid), &writer.into_inner().unwrap())?; + docids.insert(docid); + } else { + db.delete(wtxn, &BEU32::new(docid))?; + // FIXME: unwrap + if !docids.remove(docid) { + panic!("Attempt to remove a document id that doesn't exist") + } + } } + + index.put_documents_ids(wtxn, &docids)?; } TypedChunk::FieldIdWordCountDocids(fid_word_count_docids_iter) => { append_entries_into_database(