Merge #4203

4203: Extract external document docids from docs on deletion by filter r=Kerollmops a=dureuill This fixes some of the performance regression observed on `diff-indexing` when doing delete-by-filter with a filter matching many documents. To delete 19 768 771 documents (hackernews dataset, all documents matching `type = comment`), here are the observed time: |branch (commit sha1sum)|time|speed-down factor (lower is better)| |--|--|--| |`main` (48865470d7)|1212.885536s (~20min)|x1.0 (baseline)| |`diff-indexing` (523519fdbf)|5385.550543s (90min)|x4.44| |**`diff-indexing-extract-primary-key`**(f8289cd974)|2582.323324s (43min) | x2.13| So we're still suffering a speed-down of x2.13, but that's much better than x4.44. --- Changes: - Refactor the logic of PrimaryKey extraction to a struct - Add a trait to abstract the extraction of field id from a name between `DocumentBatch` and `FieldIdMap`. - Add `Index::external_id_of` to get the external ids of a bitmap of internal ids. - Use this new method to add new Transform and Batch methods to remove documents that are known to be from the DB. - Modify delete-by-filter to use the new method Co-authored-by: Louis Dureuil <louis@meilisearch.com>
2024-11-26 12:05:05 +08:00 · 2023-11-13 13:02:10 +00:00 · 2023-11-13 13:02:10 +00:00 · 72d3fa4898
commit 72d3fa4898
parent 523519fdbf 772964125d
9 changed files with 382 additions and 249 deletions
--- a/benchmarks/benches/indexing.rs
+++ b/benchmarks/benches/indexing.rs
@ -864,22 +864,12 @@ fn delete_documents_from_ids(index: Index, document_ids_to_delete: Vec<RoaringBi
    let indexer_config = IndexerConfig::default();
    for ids in document_ids_to_delete {
        let external_documents_ids = index.external_documents_ids();
        // FIXME: for filters matching a lot of documents, this will allocate a huge vec of external docids (strings).
        // Since what we have is an iterator, it would be better to delete in chunks
        let external_to_internal: std::result::Result<Vec<_>, RoaringBitmap> =
            external_documents_ids
                .find_external_id_of(&wtxn, ids)
                .unwrap()
                .only_external_ids()
                .collect();
        let ids = external_to_internal.unwrap();
        let config = IndexDocumentsConfig::default();
        let mut builder =
            IndexDocuments::new(&mut wtxn, &index, &indexer_config, config, |_| (), || false)
                .unwrap();
-        (builder, _) = builder.remove_documents(ids).unwrap();
+        (builder, _) = builder.remove_documents_from_db_no_batch(&ids).unwrap();
        builder.execute().unwrap();
    }
--- a/index-scheduler/src/batch.rs
+++ b/index-scheduler/src/batch.rs
@ -1534,18 +1534,6 @@ fn delete_document_by_filter<'a>(
            }
            e => e.into(),
        })?;
        let external_documents_ids = index.external_documents_ids();
        // FIXME: for filters matching a lot of documents, this will allocate a huge vec of external docids (strings).
        // Since what we have is an iterator, it would be better to delete in chunks
        let external_to_internal: std::result::Result<Vec<_>, RoaringBitmap> =
            external_documents_ids
                .find_external_id_of(wtxn, candidates)?
                .only_external_ids()
                .collect();
        let document_ids = match external_to_internal {
            Ok(external_ids) => external_ids,
            Err(remaining_ids) => panic!("Couldn't find some external ids {:?}", remaining_ids),
        };
        let config = IndexDocumentsConfig {
            update_method: IndexDocumentsMethod::ReplaceDocuments,
@ -1561,13 +1549,10 @@ fn delete_document_by_filter<'a>(
            || must_stop_processing.get(),
        )?;
-        let (new_builder, user_result) = builder.remove_documents(document_ids)?;
+        let (new_builder, count) = builder.remove_documents_from_db_no_batch(&candidates)?;
        builder = new_builder;
        // Uses Invariant: remove documents actually always returns Ok for the inner result
        let count = user_result.unwrap();
        let _ = builder.execute()?;
        count
    } else {
        0
--- a/milli/src/documents/mod.rs
+++ b/milli/src/documents/mod.rs
@ -1,5 +1,6 @@
 mod builder;
 mod enriched;
 mod primary_key;
 mod reader;
 mod serde_impl;
@ -11,6 +12,7 @@ use bimap::BiHashMap;
 pub use builder::DocumentsBatchBuilder;
 pub use enriched::{EnrichedDocument, EnrichedDocumentsBatchCursor, EnrichedDocumentsBatchReader};
 use obkv::KvReader;
 pub use primary_key::{DocumentIdExtractionError, FieldIdMapper, PrimaryKey, DEFAULT_PRIMARY_KEY};
 pub use reader::{DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchReader};
 use serde::{Deserialize, Serialize};
@ -87,6 +89,12 @@ impl DocumentsBatchIndex {
    }
 }
 impl FieldIdMapper for DocumentsBatchIndex {
    fn id(&self, name: &str) -> Option<FieldId> {
        self.id(name)
    }
 }
 #[derive(Debug, thiserror::Error)]
 pub enum Error {
    #[error("Error parsing number {value:?} at line {line}: {error}")]
--- a/milli/src/documents/primary_key.rs
+++ b/milli/src/documents/primary_key.rs
@ -0,0 +1,172 @@
 use std::iter;
 use std::result::Result as StdResult;
 use serde_json::Value;
 use crate::{FieldId, InternalError, Object, Result, UserError};
 /// The symbol used to define levels in a nested primary key.
 const PRIMARY_KEY_SPLIT_SYMBOL: char = '.';
 /// The default primary that is used when not specified.
 pub const DEFAULT_PRIMARY_KEY: &str = "id";
 /// Trait for objects that can map the name of a field to its [`FieldId`].
 pub trait FieldIdMapper {
    /// Attempts to map the passed name to its [`FieldId`].
    ///
    /// `None` if the field with this name was not found.
    fn id(&self, name: &str) -> Option<FieldId>;
 }
 /// A type that represent the type of primary key that has been set
 /// for this index, a classic flat one or a nested one.
 #[derive(Debug, Clone, Copy)]
 pub enum PrimaryKey<'a> {
    Flat { name: &'a str, field_id: FieldId },
    Nested { name: &'a str },
 }
 pub enum DocumentIdExtractionError {
    InvalidDocumentId(UserError),
    MissingDocumentId,
    TooManyDocumentIds(usize),
 }
 impl<'a> PrimaryKey<'a> {
    pub fn new(path: &'a str, fields: &impl FieldIdMapper) -> Option<Self> {
        Some(if path.contains(PRIMARY_KEY_SPLIT_SYMBOL) {
            Self::Nested { name: path }
        } else {
            let field_id = fields.id(path)?;
            Self::Flat { name: path, field_id }
        })
    }
    pub fn name(&self) -> &str {
        match self {
            PrimaryKey::Flat { name, .. } => name,
            PrimaryKey::Nested { name } => name,
        }
    }
    pub fn document_id(
        &self,
        document: &obkv::KvReader<FieldId>,
        fields: &impl FieldIdMapper,
    ) -> Result<StdResult<String, DocumentIdExtractionError>> {
        match self {
            PrimaryKey::Flat { name: _, field_id } => match document.get(*field_id) {
                Some(document_id_bytes) => {
                    let document_id = serde_json::from_slice(document_id_bytes)
                        .map_err(InternalError::SerdeJson)?;
                    match validate_document_id_value(document_id)? {
                        Ok(document_id) => Ok(Ok(document_id)),
                        Err(user_error) => {
                            Ok(Err(DocumentIdExtractionError::InvalidDocumentId(user_error)))
                        }
                    }
                }
                None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)),
            },
            nested @ PrimaryKey::Nested { .. } => {
                let mut matching_documents_ids = Vec::new();
                for (first_level_name, right) in nested.possible_level_names() {
                    if let Some(field_id) = fields.id(first_level_name) {
                        if let Some(value_bytes) = document.get(field_id) {
                            let object = serde_json::from_slice(value_bytes)
                                .map_err(InternalError::SerdeJson)?;
                            fetch_matching_values(object, right, &mut matching_documents_ids);
                            if matching_documents_ids.len() >= 2 {
                                return Ok(Err(DocumentIdExtractionError::TooManyDocumentIds(
                                    matching_documents_ids.len(),
                                )));
                            }
                        }
                    }
                }
                match matching_documents_ids.pop() {
                    Some(document_id) => match validate_document_id_value(document_id)? {
                        Ok(document_id) => Ok(Ok(document_id)),
                        Err(user_error) => {
                            Ok(Err(DocumentIdExtractionError::InvalidDocumentId(user_error)))
                        }
                    },
                    None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)),
                }
            }
        }
    }
    /// Returns an `Iterator` that gives all the possible fields names the primary key
    /// can have depending of the first level name and depth of the objects.
    pub fn possible_level_names(&self) -> impl Iterator<Item = (&str, &str)> + '_ {
        let name = self.name();
        name.match_indices(PRIMARY_KEY_SPLIT_SYMBOL)
            .map(move |(i, _)| (&name[..i], &name[i + PRIMARY_KEY_SPLIT_SYMBOL.len_utf8()..]))
            .chain(iter::once((name, "")))
    }
 }
 fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec<Value>) {
    match value {
        Value::Object(object) => fetch_matching_values_in_object(object, selector, "", output),
        otherwise => output.push(otherwise),
    }
 }
 fn fetch_matching_values_in_object(
    object: Object,
    selector: &str,
    base_key: &str,
    output: &mut Vec<Value>,
 ) {
    for (key, value) in object {
        let base_key = if base_key.is_empty() {
            key.to_string()
        } else {
            format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, key)
        };
        if starts_with(selector, &base_key) {
            match value {
                Value::Object(object) => {
                    fetch_matching_values_in_object(object, selector, &base_key, output)
                }
                value => output.push(value),
            }
        }
    }
 }
 fn starts_with(selector: &str, key: &str) -> bool {
    selector.strip_prefix(key).map_or(false, |tail| {
        tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true)
    })
 }
 // FIXME: move to a DocumentId struct
 fn validate_document_id(document_id: &str) -> Option<&str> {
    if !document_id.is_empty()
        && document_id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_'))
    {
        Some(document_id)
    } else {
        None
    }
 }
 pub fn validate_document_id_value(document_id: Value) -> Result<StdResult<String, UserError>> {
    match document_id {
        Value::String(string) => match validate_document_id(&string) {
            Some(s) if s.len() == string.len() => Ok(Ok(string)),
            Some(s) => Ok(Ok(s.to_string())),
            None => Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })),
        },
        Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())),
        content => Ok(Err(UserError::InvalidDocumentId { document_id: content })),
    }
 }
--- a/milli/src/fields_ids_map.rs
+++ b/milli/src/fields_ids_map.rs
@ -81,6 +81,12 @@ impl Default for FieldsIdsMap {
    }
 }
 impl crate::documents::FieldIdMapper for FieldsIdsMap {
    fn id(&self, name: &str) -> Option<FieldId> {
        self.id(name)
    }
 }
 #[cfg(test)]
 mod tests {
    use super::*;
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@ -12,6 +12,7 @@ use rstar::RTree;
 use time::OffsetDateTime;
 use crate::distance::NDotProductPoint;
 use crate::documents::PrimaryKey;
 use crate::error::{InternalError, UserError};
 use crate::fields_ids_map::FieldsIdsMap;
 use crate::heed_codec::facet::{
@ -1176,6 +1177,36 @@ impl Index {
        self.iter_documents(rtxn, self.documents_ids(rtxn)?)
    }
    pub fn external_id_of<'a, 't: 'a>(
        &'a self,
        rtxn: &'t RoTxn,
        ids: impl IntoIterator<Item = DocumentId> + 'a,
    ) -> Result<impl IntoIterator<Item = Result<String>> + 'a> {
        let fields = self.fields_ids_map(rtxn)?;
        // uses precondition "never called on an empty index"
        let primary_key = self.primary_key(rtxn)?.ok_or(InternalError::DatabaseMissingEntry {
            db_name: db_name::MAIN,
            key: Some(main_key::PRIMARY_KEY_KEY),
        })?;
        let primary_key = PrimaryKey::new(primary_key, &fields).ok_or_else(|| {
            InternalError::FieldIdMapMissingEntry(crate::FieldIdMapMissingEntry::FieldName {
                field_name: primary_key.to_owned(),
                process: "external_id_of",
            })
        })?;
        Ok(self.iter_documents(rtxn, ids)?.map(move |entry| -> Result<_> {
            let (_docid, obkv) = entry?;
            match primary_key.document_id(&obkv, &fields)? {
                Ok(document_id) => Ok(document_id),
                Err(_) => Err(InternalError::DocumentsError(
                    crate::documents::Error::InvalidDocumentFormat,
                )
                .into()),
            }
        }))
    }
    pub fn facets_distribution<'a>(&'a self, rtxn: &'a RoTxn) -> FacetDistribution<'a> {
        FacetDistribution::new(rtxn, self)
    }
--- a/milli/src/update/index_documents/enrich.rs
+++ b/milli/src/update/index_documents/enrich.rs
@ -1,20 +1,17 @@
 use std::fmt;
 use std::io::{BufWriter, Read, Seek};
 use std::result::Result as StdResult;
 use std::{fmt, iter};
 use serde::{Deserialize, Serialize};
 use serde_json::Value;
-use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader, EnrichedDocumentsBatchReader};
+use crate::documents::{
    DocumentIdExtractionError, DocumentsBatchIndex, DocumentsBatchReader,
    EnrichedDocumentsBatchReader, PrimaryKey, DEFAULT_PRIMARY_KEY,
 };
 use crate::error::{GeoError, InternalError, UserError};
 use crate::update::index_documents::{obkv_to_object, writer_into_reader};
-use crate::{FieldId, Index, Object, Result};
+use crate::{FieldId, Index, Result};
 /// The symbol used to define levels in a nested primary key.
 const PRIMARY_KEY_SPLIT_SYMBOL: char = '.';
 /// The default primary that is used when not specified.
 const DEFAULT_PRIMARY_KEY: &str = "id";
 /// This function validates and enrich the documents by checking that:
 ///  - we can infer a primary key,
@ -41,14 +38,12 @@ pub fn enrich_documents_batch<R: Read + Seek>(
    // The primary key *field id* that has already been set for this index or the one
    // we will guess by searching for the first key that contains "id" as a substring.
    let primary_key = match index.primary_key(rtxn)? {
-        Some(primary_key) if primary_key.contains(PRIMARY_KEY_SPLIT_SYMBOL) => {
+        Some(primary_key) => match PrimaryKey::new(primary_key, &documents_batch_index) {
-            PrimaryKey::nested(primary_key)
+            Some(primary_key) => primary_key,
-        }
+            None if autogenerate_docids => PrimaryKey::Flat {
-        Some(primary_key) => match documents_batch_index.id(primary_key) {
+                name: primary_key,
-            Some(id) => PrimaryKey::flat(primary_key, id),
+                field_id: documents_batch_index.insert(primary_key),
-            None if autogenerate_docids => {
+            },
                PrimaryKey::flat(primary_key, documents_batch_index.insert(primary_key))
            }
            None => {
                return match cursor.next_document()? {
                    Some(first_document) => Ok(Err(UserError::MissingDocumentId {
@ -76,14 +71,14 @@ pub fn enrich_documents_batch<R: Read + Seek>(
            });
            match guesses.as_slice() {
-                [] if autogenerate_docids => PrimaryKey::flat(
+                [] if autogenerate_docids => PrimaryKey::Flat {
-                    DEFAULT_PRIMARY_KEY,
+                    name: DEFAULT_PRIMARY_KEY,
-                    documents_batch_index.insert(DEFAULT_PRIMARY_KEY),
+                    field_id: documents_batch_index.insert(DEFAULT_PRIMARY_KEY),
-                ),
+                },
                [] => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)),
                [(field_id, name)] => {
                    log::info!("Primary key was not specified in index. Inferred to '{name}'");
-                    PrimaryKey::flat(name, *field_id)
+                    PrimaryKey::Flat { name, field_id: *field_id }
                }
                multiple => {
                    return Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound {
@ -156,92 +151,24 @@ fn fetch_or_generate_document_id(
    uuid_buffer: &mut [u8; uuid::fmt::Hyphenated::LENGTH],
    count: u32,
 ) -> Result<StdResult<DocumentId, UserError>> {
-    match primary_key {
+    Ok(match primary_key.document_id(document, documents_batch_index)? {
-        PrimaryKey::Flat { name: primary_key, field_id: primary_key_id } => {
+        Ok(document_id) => Ok(DocumentId::Retrieved { value: document_id }),
-            match document.get(primary_key_id) {
+        Err(DocumentIdExtractionError::InvalidDocumentId(user_error)) => Err(user_error),
-                Some(document_id_bytes) => {
+        Err(DocumentIdExtractionError::MissingDocumentId) if autogenerate_docids => {
                    let document_id = serde_json::from_slice(document_id_bytes)
                        .map_err(InternalError::SerdeJson)?;
                    match validate_document_id_value(document_id)? {
                        Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))),
                        Err(user_error) => Ok(Err(user_error)),
                    }
                }
                None if autogenerate_docids => {
            let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer);
-                    Ok(Ok(DocumentId::generated(uuid.to_string(), count)))
+            Ok(DocumentId::Generated { value: uuid.to_string(), document_nth: count })
        }
-                None => Ok(Err(UserError::MissingDocumentId {
+        Err(DocumentIdExtractionError::MissingDocumentId) => Err(UserError::MissingDocumentId {
-                    primary_key: primary_key.to_string(),
+            primary_key: primary_key.name().to_string(),
            document: obkv_to_object(document, documents_batch_index)?,
-                })),
+        }),
-            }
+        Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => {
-        }
+            Err(UserError::TooManyDocumentIds {
-        nested @ PrimaryKey::Nested { .. } => {
+                primary_key: primary_key.name().to_string(),
            let mut matching_documents_ids = Vec::new();
            for (first_level_name, right) in nested.possible_level_names() {
                if let Some(field_id) = documents_batch_index.id(first_level_name) {
                    if let Some(value_bytes) = document.get(field_id) {
                        let object = serde_json::from_slice(value_bytes)
                            .map_err(InternalError::SerdeJson)?;
                        fetch_matching_values(object, right, &mut matching_documents_ids);
                        if matching_documents_ids.len() >= 2 {
                            return Ok(Err(UserError::TooManyDocumentIds {
                                primary_key: nested.name().to_string(),
                document: obkv_to_object(document, documents_batch_index)?,
-                            }));
+            })
                        }
                    }
                }
            }
            match matching_documents_ids.pop() {
                Some(document_id) => match validate_document_id_value(document_id)? {
                    Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))),
                    Err(user_error) => Ok(Err(user_error)),
                },
                None => Ok(Err(UserError::MissingDocumentId {
                    primary_key: nested.name().to_string(),
                    document: obkv_to_object(document, documents_batch_index)?,
                })),
            }
        }
    }
 }
 /// A type that represent the type of primary key that has been set
 /// for this index, a classic flat one or a nested one.
 #[derive(Debug, Clone, Copy)]
 enum PrimaryKey<'a> {
    Flat { name: &'a str, field_id: FieldId },
    Nested { name: &'a str },
 }
 impl PrimaryKey<'_> {
    fn flat(name: &str, field_id: FieldId) -> PrimaryKey {
        PrimaryKey::Flat { name, field_id }
    }
    fn nested(name: &str) -> PrimaryKey {
        PrimaryKey::Nested { name }
    }
    fn name(&self) -> &str {
        match self {
            PrimaryKey::Flat { name, .. } => name,
            PrimaryKey::Nested { name } => name,
        }
    }
    /// Returns an `Iterator` that gives all the possible fields names the primary key
    /// can have depending of the first level name and deepnes of the objects.
    fn possible_level_names(&self) -> impl Iterator<Item = (&str, &str)> + '_ {
        let name = self.name();
        name.match_indices(PRIMARY_KEY_SPLIT_SYMBOL)
            .map(move |(i, _)| (&name[..i], &name[i + PRIMARY_KEY_SPLIT_SYMBOL.len_utf8()..]))
            .chain(iter::once((name, "")))
        }
    })
 }
 /// A type that represents a document id that has been retrieved from a document or auto-generated.
@ -255,14 +182,6 @@ pub enum DocumentId {
 }
 impl DocumentId {
    fn retrieved(value: String) -> DocumentId {
        DocumentId::Retrieved { value }
    }
    fn generated(value: String, document_nth: u32) -> DocumentId {
        DocumentId::Generated { value, document_nth }
    }
    fn debug(&self) -> String {
        format!("{:?}", self)
    }
@ -290,66 +209,6 @@ impl fmt::Debug for DocumentId {
    }
 }
 fn starts_with(selector: &str, key: &str) -> bool {
    selector.strip_prefix(key).map_or(false, |tail| {
        tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true)
    })
 }
 pub fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec<Value>) {
    match value {
        Value::Object(object) => fetch_matching_values_in_object(object, selector, "", output),
        otherwise => output.push(otherwise),
    }
 }
 pub fn fetch_matching_values_in_object(
    object: Object,
    selector: &str,
    base_key: &str,
    output: &mut Vec<Value>,
 ) {
    for (key, value) in object {
        let base_key = if base_key.is_empty() {
            key.to_string()
        } else {
            format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, key)
        };
        if starts_with(selector, &base_key) {
            match value {
                Value::Object(object) => {
                    fetch_matching_values_in_object(object, selector, &base_key, output)
                }
                value => output.push(value),
            }
        }
    }
 }
 pub fn validate_document_id(document_id: &str) -> Option<&str> {
    if !document_id.is_empty()
        && document_id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_'))
    {
        Some(document_id)
    } else {
        None
    }
 }
 /// Parses a Json encoded document id and validate it, returning a user error when it is one.
 pub fn validate_document_id_value(document_id: Value) -> Result<StdResult<String, UserError>> {
    match document_id {
        Value::String(string) => match validate_document_id(&string) {
            Some(s) if s.len() == string.len() => Ok(Ok(string)),
            Some(s) => Ok(Ok(s.to_string())),
            None => Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })),
        },
        Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())),
        content => Ok(Err(UserError::InvalidDocumentId { document_id: content })),
    }
 }
 /// Try to extract an `f64` from a JSON `Value` and return the `Value`
 /// in the `Err` variant if it failed.
 pub fn extract_finite_float_from_value(value: Value) -> StdResult<f64, Value> {
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@ -20,10 +20,7 @@ use slice_group_by::GroupBy;
 use typed_chunk::{write_typed_chunk_into_index, TypedChunk};
 use self::enrich::enrich_documents_batch;
-pub use self::enrich::{
+pub use self::enrich::{extract_finite_float_from_value, validate_geo_from_json, DocumentId};
    extract_finite_float_from_value, validate_document_id, validate_document_id_value,
    validate_geo_from_json, DocumentId,
 };
 pub use self::helpers::{
    as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset,
    fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
@ -197,6 +194,39 @@ where
        Ok((self, Ok(deleted_documents)))
    }
    /// Removes documents from db using their internal document ids.
    ///
    /// # Warning
    ///
    /// This function is dangerous and will only work correctly if:
    ///
    /// - All the passed ids currently exist in the database
    /// - No batching using the standards `remove_documents` and `add_documents` took place
    ///
    /// TODO: make it impossible to call `remove_documents` or `add_documents` on an instance that calls this function.
    pub fn remove_documents_from_db_no_batch(
        mut self,
        to_delete: &RoaringBitmap,
    ) -> Result<(Self, u64)> {
        puffin::profile_function!();
        // Early return when there is no document to add
        if to_delete.is_empty() {
            return Ok((self, 0));
        }
        let deleted_documents = self
            .transform
            .as_mut()
            .expect("Invalid document deletion state")
            .remove_documents_from_db_no_batch(to_delete, self.wtxn, &self.should_abort)?
            as u64;
        self.deleted_documents += deleted_documents;
        Ok((self, deleted_documents))
    }
    #[logging_timer::time("IndexDocuments::{}")]
    pub fn execute(mut self) -> Result<DocumentAdditionResult> {
        puffin::profile_function!();
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@ -421,52 +421,13 @@ impl<'a, 'i> Transform<'a, 'i> {
            // Then we push the document in sorters in deletion mode.
            let deleted_from_db = match external_documents_ids.get(wtxn, &to_remove)? {
                Some(docid) => {
-                    self.replaced_documents_ids.insert(docid);
+                    self.remove_document_from_db(
-
+                        docid,
-                    // fetch the obkv document
+                        to_remove,
-                    let original_key = BEU32::new(docid);
+                        wtxn,
-                    let base_obkv = self
+                        &mut document_sorter_key_buffer,
                        .index
                        .documents
                        .remap_data_type::<heed::types::ByteSlice>()
                        .get(wtxn, &original_key)?
                        .ok_or(InternalError::DatabaseMissingEntry {
                            db_name: db_name::DOCUMENTS,
                            key: None,
                        })?;
                    // Key is the concatenation of the internal docid and the external one.
                    document_sorter_key_buffer.clear();
                    document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes());
                    document_sorter_key_buffer.extend_from_slice(to_remove.as_bytes());
                    // push it as to delete in the original_sorter
                    document_sorter_value_buffer.clear();
                    document_sorter_value_buffer.push(Operation::Deletion as u8);
                    into_del_add_obkv(
                        KvReaderU16::new(base_obkv),
                        true,
                        false,
                        &mut document_sorter_value_buffer,
                    )?;
                    self.original_sorter
                        .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
                    // flatten it and push it as to delete in the flattened_sorter
                    let flattened_obkv = KvReader::new(base_obkv);
                    if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? {
                        // we recreate our buffer with the flattened documents
                        document_sorter_value_buffer.clear();
                        document_sorter_value_buffer.push(Operation::Deletion as u8);
                        into_del_add_obkv(
                            KvReaderU16::new(&obkv),
                            true,
                            false,
                            &mut document_sorter_value_buffer,
                        )?;
                    }
                    self.flattened_sorter
                        .insert(docid.to_be_bytes(), &document_sorter_value_buffer)?;
                    true
                }
                None => false,
@ -481,6 +442,97 @@ impl<'a, 'i> Transform<'a, 'i> {
        Ok(documents_deleted)
    }
    /// Removes documents from db using their internal document ids.
    ///
    /// # Warning
    ///
    /// This function is dangerous and will only work correctly if:
    ///
    /// - All the passed ids currently exist in the database
    /// - No batching using the standards `remove_documents` and `add_documents` took place
    ///
    /// TODO: make it impossible to call `remove_documents` or `add_documents` on an instance that calls this function.
    #[logging_timer::time]
    pub fn remove_documents_from_db_no_batch<FA>(
        &mut self,
        to_remove: &RoaringBitmap,
        wtxn: &mut heed::RwTxn,
        should_abort: FA,
    ) -> Result<usize>
    where
        FA: Fn() -> bool + Sync,
    {
        puffin::profile_function!();
        let mut documents_deleted = 0;
        let mut document_sorter_value_buffer = Vec::new();
        let mut document_sorter_key_buffer = Vec::new();
        let external_ids = self.index.external_id_of(wtxn, to_remove.iter())?;
        for (internal_docid, external_docid) in to_remove.iter().zip(external_ids) {
            let external_docid = external_docid?;
            if should_abort() {
                return Err(Error::InternalError(InternalError::AbortedIndexation));
            }
            self.remove_document_from_db(
                internal_docid,
                external_docid,
                wtxn,
                &mut document_sorter_key_buffer,
                &mut document_sorter_value_buffer,
            )?;
            documents_deleted += 1;
        }
        Ok(documents_deleted)
    }
    fn remove_document_from_db(
        &mut self,
        internal_docid: u32,
        external_docid: String,
        txn: &heed::RoTxn,
        document_sorter_key_buffer: &mut Vec<u8>,
        document_sorter_value_buffer: &mut Vec<u8>,
    ) -> Result<()> {
        self.replaced_documents_ids.insert(internal_docid);
        // fetch the obkv document
        let original_key = BEU32::new(internal_docid);
        let base_obkv = self
            .index
            .documents
            .remap_data_type::<heed::types::ByteSlice>()
            .get(txn, &original_key)?
            .ok_or(InternalError::DatabaseMissingEntry {
                db_name: db_name::DOCUMENTS,
                key: None,
            })?;
        // Key is the concatenation of the internal docid and the external one.
        document_sorter_key_buffer.clear();
        document_sorter_key_buffer.extend_from_slice(&internal_docid.to_be_bytes());
        document_sorter_key_buffer.extend_from_slice(external_docid.as_bytes());
        // push it as to delete in the original_sorter
        document_sorter_value_buffer.clear();
        document_sorter_value_buffer.push(Operation::Deletion as u8);
        into_del_add_obkv(KvReaderU16::new(base_obkv), true, false, document_sorter_value_buffer)?;
        self.original_sorter.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
        // flatten it and push it as to delete in the flattened_sorter
        let flattened_obkv = KvReader::new(base_obkv);
        if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? {
            // we recreate our buffer with the flattened documents
            document_sorter_value_buffer.clear();
            document_sorter_value_buffer.push(Operation::Deletion as u8);
            into_del_add_obkv(KvReaderU16::new(&obkv), true, false, document_sorter_value_buffer)?;
        }
        self.flattened_sorter
            .insert(internal_docid.to_be_bytes(), &document_sorter_value_buffer)?;
        Ok(())
    }
    // Flatten a document from the fields ids map contained in self and insert the new
    // created fields. Returns `None` if the document doesn't need to be flattened.
    fn flatten_from_fields_ids_map(&mut self, obkv: KvReader<FieldId>) -> Result<Option<Vec<u8>>> {