From 17b647dfe58da2453ae345bfd0a75e66055e5591 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Mon, 18 Sep 2023 09:59:38 +0200
Subject: [PATCH 001/127] Wip

---
 Cargo.lock                                    |   1 +
 milli/Cargo.toml                              |   1 +
 milli/src/search/new/tests/sort.rs            |   1 +
 .../extract/extract_docid_word_positions.rs   |  19 +--
 .../extract/extract_fid_word_count_docids.rs  |  67 ++------
 .../extract/extract_word_docids.rs            | 138 +++++++++++++----
 .../extract/extract_word_fid_docids.rs        |   2 +
 .../extract_word_pair_proximity_docids.rs     | 145 ++++++++----------
 .../extract/extract_word_position_docids.rs   |  13 +-
 .../src/update/index_documents/extract/mod.rs |  33 ++--
 .../index_documents/helpers/grenad_helpers.rs |  26 ++++
 milli/src/update/index_documents/mod.rs       |  14 +-
 .../src/update/index_documents/typed_chunk.rs |  28 +++-
 13 files changed, 288 insertions(+), 200 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index b3991d130..d8cd12cc2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2703,6 +2703,7 @@ dependencies = [
  "logging_timer",
  "maplit",
  "md5",
+ "meili-snap",
  "memmap2",
  "mimalloc",
  "obkv",
diff --git a/milli/Cargo.toml b/milli/Cargo.toml
index b19b40e85..68bc2d2b5 100644
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -79,6 +79,7 @@ big_s = "1.0.2"
 insta = "1.29.0"
 maplit = "1.0.2"
 md5 = "0.7.0"
+meili-snap = { path = "../meili-snap" }
 rand = { version = "0.8.5", features = ["small_rng"] }
 
 [features]
diff --git a/milli/src/search/new/tests/sort.rs b/milli/src/search/new/tests/sort.rs
index aa6aa971f..8fdf52d44 100644
--- a/milli/src/search/new/tests/sort.rs
+++ b/milli/src/search/new/tests/sort.rs
@@ -13,6 +13,7 @@ This module tests the `sort` ranking rule:
 
 use big_s::S;
 use maplit::hashset;
+use meili_snap::insta;
 
 use crate::index::tests::TempIndex;
 use crate::search::new::tests::collect_field_values;
diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
index 643d16354..6aa66c92a 100644
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -5,11 +5,11 @@ use std::io::BufReader;
 use std::{io, mem, str};
 
 use charabia::{Language, Script, SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
-use obkv::KvReader;
+use obkv::{KvReader, KvWriterU16};
 use roaring::RoaringBitmap;
 use serde_json::Value;
 
-use super::helpers::{concat_u32s_array, create_sorter, sorter_into_reader, GrenadParameters};
+use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters};
 use crate::error::{InternalError, SerializationError};
 use crate::update::index_documents::MergeFn;
 use crate::{
@@ -43,7 +43,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
     let mut script_language_docids = HashMap::new();
     let mut docid_word_positions_sorter = create_sorter(
         grenad::SortAlgorithm::Stable,
-        concat_u32s_array,
+        keep_latest_obkv,
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
         indexer.max_nb_chunks,
@@ -156,6 +156,7 @@ fn extract_tokens_from_document(
                 let tokens = process_tokens(tokenizer.tokenize(field))
                     .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
 
+                let mut writer = KvWriterU16::memory();
                 for (index, token) in tokens {
                     // if a language has been detected for the token, we update the counter.
                     if let Some(language) = token.language {
@@ -169,17 +170,17 @@ fn extract_tokens_from_document(
                     }
                     let token = token.lemma().trim();
                     if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
-                        buffers.key_buffer.truncate(mem::size_of::<u32>());
-                        buffers.key_buffer.extend_from_slice(token.as_bytes());
-
                         let position: u16 = index
                             .try_into()
                             .map_err(|_| SerializationError::InvalidNumberSerialization)?;
-                        let position = absolute_from_relative_position(field_id, position);
-                        docid_word_positions_sorter
-                            .insert(&buffers.key_buffer, position.to_ne_bytes())?;
+                        writer.insert(position, token.as_bytes())?;
                     }
                 }
+
+                let positions = writer.into_inner()?;
+                buffers.key_buffer.truncate(mem::size_of::<u32>());
+                buffers.key_buffer.extend_from_slice(&field_id.to_be_bytes());
+                docid_word_positions_sorter.insert(&buffers.key_buffer, positions)?;
             }
         }
     }
diff --git a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs
index 92564b4cd..289a744da 100644
--- a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs
@@ -1,16 +1,17 @@
-use std::collections::HashMap;
 use std::fs::File;
 use std::io::{self, BufReader};
 
-use grenad::Sorter;
+use obkv::KvReaderU16;
 
 use super::helpers::{
-    create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
-    try_split_array_at, GrenadParameters, MergeFn,
+    create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at,
+    GrenadParameters,
 };
 use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
-use crate::{relative_from_absolute_position, DocumentId, FieldId, Result};
+use crate::Result;
+
+const MAX_COUNTED_WORDS: usize = 30;
 
 /// Extracts the field id word count and the documents ids where
 /// this field id with this amount of words appear.
@@ -35,63 +36,21 @@ pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
         max_memory,
     );
 
-    // This map is assumed to not consume a lot of memory.
-    let mut document_fid_wordcount = HashMap::new();
-    let mut current_document_id = None;
-
+    let mut key_buffer = Vec::new();
     let mut cursor = docid_word_positions.into_cursor()?;
     while let Some((key, value)) = cursor.move_on_next()? {
-        let (document_id_bytes, _word_bytes) = try_split_array_at(key)
+        let (document_id_bytes, fid_bytes) = try_split_array_at(key)
             .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
         let document_id = u32::from_be_bytes(document_id_bytes);
 
-        let curr_document_id = *current_document_id.get_or_insert(document_id);
-        if curr_document_id != document_id {
-            drain_document_fid_wordcount_into_sorter(
-                &mut fid_word_count_docids_sorter,
-                &mut document_fid_wordcount,
-                curr_document_id,
-            )?;
-            current_document_id = Some(document_id);
-        }
-
-        for position in read_u32_ne_bytes(value) {
-            let (field_id, _) = relative_from_absolute_position(position);
-
-            let value = document_fid_wordcount.entry(field_id as FieldId).or_insert(0);
-            *value += 1;
-        }
-    }
-
-    if let Some(document_id) = current_document_id {
-        // We must make sure that don't lose the current document field id
-        // word count map if we break because we reached the end of the chunk.
-        drain_document_fid_wordcount_into_sorter(
-            &mut fid_word_count_docids_sorter,
-            &mut document_fid_wordcount,
-            document_id,
-        )?;
-    }
-
-    sorter_into_reader(fid_word_count_docids_sorter, indexer)
-}
-
-fn drain_document_fid_wordcount_into_sorter(
-    fid_word_count_docids_sorter: &mut Sorter<MergeFn>,
-    document_fid_wordcount: &mut HashMap<FieldId, u32>,
-    document_id: DocumentId,
-) -> Result<()> {
-    let mut key_buffer = Vec::new();
-
-    for (fid, count) in document_fid_wordcount.drain() {
-        if count <= 30 {
+        let word_count = KvReaderU16::new(&value).iter().take(MAX_COUNTED_WORDS + 1).count();
+        if word_count <= MAX_COUNTED_WORDS {
             key_buffer.clear();
-            key_buffer.extend_from_slice(&fid.to_be_bytes());
-            key_buffer.push(count as u8);
-
+            key_buffer.extend_from_slice(fid_bytes);
+            key_buffer.push(word_count as u8);
             fid_word_count_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
         }
     }
 
-    Ok(())
+    sorter_into_reader(fid_word_count_docids_sorter, indexer)
 }
diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs
index f211f7023..8b93ea23c 100644
--- a/milli/src/update/index_documents/extract/extract_word_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_docids.rs
@@ -1,18 +1,19 @@
-use std::collections::HashSet;
+use std::collections::{BTreeSet, HashSet};
 use std::fs::File;
 use std::io::{self, BufReader};
 use std::iter::FromIterator;
 
+use obkv::KvReaderU16;
 use roaring::RoaringBitmap;
 
 use super::helpers::{
-    create_sorter, merge_roaring_bitmaps, serialize_roaring_bitmap, sorter_into_reader,
-    try_split_array_at, GrenadParameters,
+    create_sorter, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, serialize_roaring_bitmap,
+    sorter_into_reader, try_split_array_at, GrenadParameters,
 };
 use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
-use crate::update::index_documents::helpers::read_u32_ne_bytes;
-use crate::{relative_from_absolute_position, FieldId, Result};
+use crate::update::MergeFn;
+use crate::{DocumentId, FieldId, Result};
 
 /// Extracts the word and the documents ids where this word appear.
 ///
@@ -26,7 +27,11 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
     docid_word_positions: grenad::Reader<R>,
     indexer: GrenadParameters,
     exact_attributes: &HashSet<FieldId>,
-) -> Result<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)> {
+) -> Result<(
+    grenad::Reader<BufReader<File>>,
+    grenad::Reader<BufReader<File>>,
+    grenad::Reader<BufReader<File>>,
+)> {
     puffin::profile_function!();
 
     let max_memory = indexer.max_memory_by_thread();
@@ -37,7 +42,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
         indexer.max_nb_chunks,
-        max_memory.map(|x| x / 2),
+        max_memory.map(|x| x / 3),
     );
 
     let mut exact_word_docids_sorter = create_sorter(
@@ -46,45 +51,116 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
         indexer.max_nb_chunks,
-        max_memory.map(|x| x / 2),
+        max_memory.map(|x| x / 3),
     );
 
+    let mut word_fid_docids_sorter = create_sorter(
+        grenad::SortAlgorithm::Unstable,
+        merge_roaring_bitmaps,
+        indexer.chunk_compression_type,
+        indexer.chunk_compression_level,
+        indexer.max_nb_chunks,
+        max_memory.map(|x| x / 3),
+    );
+
+    let mut current_document_id = None;
+    let mut fid = 0;
+    let mut key_buffer = Vec::new();
     let mut value_buffer = Vec::new();
+    let mut words = BTreeSet::new();
+    let mut exact_words = BTreeSet::new();
     let mut cursor = docid_word_positions.into_cursor()?;
-    while let Some((key, positions)) = cursor.move_on_next()? {
-        let (document_id_bytes, word_bytes) = try_split_array_at(key)
+    while let Some((key, value)) = cursor.move_on_next()? {
+        let (document_id_bytes, fid_bytes) = try_split_array_at(key)
+            .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
+        let (fid_bytes, _) = try_split_array_at(fid_bytes)
             .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
         let document_id = u32::from_be_bytes(document_id_bytes);
+        let current_fid = u16::from_be_bytes(fid_bytes);
 
-        let bitmap = RoaringBitmap::from_iter(Some(document_id));
-        serialize_roaring_bitmap(&bitmap, &mut value_buffer)?;
+        // when the document id changes, flush the words of the previous document.
+        if let Some(previous_id) = current_document_id.filter(|id| *id != document_id) {
+            words_into_sorters(
+                previous_id,
+                fid,
+                &mut key_buffer,
+                &mut value_buffer,
+                &mut exact_words,
+                &mut words,
+                &mut exact_word_docids_sorter,
+                &mut word_docids_sorter,
+                &mut word_fid_docids_sorter,
+            )?;
+        }
 
-        // If there are no exact attributes, we do not need to iterate over positions.
-        if exact_attributes.is_empty() {
-            word_docids_sorter.insert(word_bytes, &value_buffer)?;
+        current_document_id = Some(document_id);
+        fid = current_fid;
+        // every word contained in an attribute set to exact must be pushed in the exact_words set.
+        if exact_attributes.contains(&fid) {
+            for (_pos, word) in KvReaderU16::new(&value).iter() {
+                exact_words.insert(word.to_vec());
+            }
         } else {
-            let mut added_to_exact = false;
-            let mut added_to_word_docids = false;
-            for position in read_u32_ne_bytes(positions) {
-                // as soon as we know that this word had been to both readers, we don't need to
-                // iterate over the positions.
-                if added_to_exact && added_to_word_docids {
-                    break;
-                }
-                let (fid, _) = relative_from_absolute_position(position);
-                if exact_attributes.contains(&fid) && !added_to_exact {
-                    exact_word_docids_sorter.insert(word_bytes, &value_buffer)?;
-                    added_to_exact = true;
-                } else if !added_to_word_docids {
-                    word_docids_sorter.insert(word_bytes, &value_buffer)?;
-                    added_to_word_docids = true;
-                }
+            for (_pos, word) in KvReaderU16::new(&value).iter() {
+                words.insert(word.to_vec());
             }
         }
     }
 
+    // We must make sure that we don't lose the words of the last document.
+    if let Some(document_id) = current_document_id {
+        words_into_sorters(
+            document_id,
+            fid,
+            &mut key_buffer,
+            &mut value_buffer,
+            &mut exact_words,
+            &mut words,
+            &mut exact_word_docids_sorter,
+            &mut word_docids_sorter,
+            &mut word_fid_docids_sorter,
+        )?;
+    }
+
     Ok((
         sorter_into_reader(word_docids_sorter, indexer)?,
         sorter_into_reader(exact_word_docids_sorter, indexer)?,
+        sorter_into_reader(word_fid_docids_sorter, indexer)?,
     ))
 }
+
+/// Writes `document_id` under every collected word into the three sorters, then empties both
+/// sets. Words of `exact_words` go to the exact sorter, words of `words` to the regular one,
+/// and their union to the word-fid sorter under the key `word`, `0`, big-endian `fid`.
+fn words_into_sorters(
+    document_id: DocumentId,
+    fid: FieldId,
+    key_buffer: &mut Vec<u8>,
+    value_buffer: &mut Vec<u8>,
+    exact_words: &mut BTreeSet<Vec<u8>>,
+    words: &mut BTreeSet<Vec<u8>>,
+    exact_word_docids_sorter: &mut grenad::Sorter<MergeFn>,
+    word_docids_sorter: &mut grenad::Sorter<MergeFn>,
+    word_fid_docids_sorter: &mut grenad::Sorter<MergeFn>,
+) -> Result<()> {
+    puffin::profile_function!();
+    let bitmap = RoaringBitmap::from_iter(Some(document_id));
+    serialize_roaring_bitmap(&bitmap, value_buffer)?;
+    for word_bytes in exact_words.iter() {
+        exact_word_docids_sorter.insert(word_bytes, &*value_buffer)?;
+    }
+    for word_bytes in words.iter() {
+        word_docids_sorter.insert(word_bytes, &*value_buffer)?;
+    }
+    for word_bytes in (&*words | &*exact_words).iter() {
+        key_buffer.clear();
+        key_buffer.extend_from_slice(word_bytes);
+        key_buffer.push(0);
+        key_buffer.extend_from_slice(&fid.to_be_bytes());
+        // insert the composite key, not the bare word, otherwise the field id is lost.
+        word_fid_docids_sorter.insert(&*key_buffer, &*value_buffer)?;
+    }
+    exact_words.clear();
+    words.clear();
+    Ok(())
+}
diff --git a/milli/src/update/index_documents/extract/extract_word_fid_docids.rs b/milli/src/update/index_documents/extract/extract_word_fid_docids.rs
index 09f571038..dd4d42431 100644
--- a/milli/src/update/index_documents/extract/extract_word_fid_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_fid_docids.rs
@@ -17,6 +17,8 @@ pub fn extract_word_fid_docids<R: io::Read + io::Seek>(
 ) -> Result<grenad::Reader<BufReader<File>>> {
     puffin::profile_function!();
 
+    todo!("remove me");
+
     let max_memory = indexer.max_memory_by_thread();
 
     let mut word_fid_docids_sorter = create_sorter(
diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
index 9ddd5ff4c..41604ff4a 100644
--- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
@@ -1,12 +1,14 @@
 use std::cmp::Ordering;
-use std::collections::{BinaryHeap, HashMap};
+use std::collections::HashMap;
 use std::fs::File;
 use std::io::BufReader;
-use std::{cmp, io, mem, str, vec};
+use std::{cmp, io};
+
+use obkv::KvReaderU16;
 
 use super::helpers::{
-    create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
-    try_split_array_at, GrenadParameters, MergeFn,
+    create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at,
+    GrenadParameters, MergeFn,
 };
 use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
@@ -35,44 +37,59 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
         max_memory.map(|m| m / 2),
     );
 
-    // This map is assumed to not consume a lot of memory.
-    let mut document_word_positions_heap = BinaryHeap::new();
+    let mut word_positions: Vec<(String, u16)> = Vec::with_capacity(MAX_DISTANCE as usize);
+    let mut word_pair_proximity = HashMap::new();
     let mut current_document_id = None;
 
     let mut cursor = docid_word_positions.into_cursor()?;
     while let Some((key, value)) = cursor.move_on_next()? {
-        let (document_id_bytes, word_bytes) = try_split_array_at(key)
+        let (document_id_bytes, _fid_bytes) = try_split_array_at(key)
             .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
         let document_id = u32::from_be_bytes(document_id_bytes);
-        let word = str::from_utf8(word_bytes)?;
 
-        let curr_document_id = *current_document_id.get_or_insert(document_id);
-        if curr_document_id != document_id {
-            let document_word_positions_heap = mem::take(&mut document_word_positions_heap);
-            document_word_positions_into_sorter(
-                curr_document_id,
-                document_word_positions_heap,
-                &mut word_pair_proximity_docids_sorter,
-            )?;
-            current_document_id = Some(document_id);
-        }
+        for (position, word) in KvReaderU16::new(&value).iter() {
+            // when the document changes, send the pairs of the previous one to the sorter.
+            if let Some(previous_id) = current_document_id.filter(|id| *id != document_id) {
+                while !word_positions.is_empty() {
+                    word_positions_into_word_pair_proximity(
+                        &mut word_positions,
+                        &mut word_pair_proximity,
+                    )?;
+                }
 
-        let word = word.to_string();
-        let mut positions: Vec<_> = read_u32_ne_bytes(value).collect();
-        positions.sort_unstable();
-        let mut iter = positions.into_iter();
-        if let Some(position) = iter.next() {
-            document_word_positions_heap.push(PeekedWordPosition { word, position, iter });
+                document_word_positions_into_sorter(
+                    previous_id,
+                    &word_pair_proximity,
+                    &mut word_pair_proximity_docids_sorter,
+                )?;
+                word_pair_proximity.clear();
+                word_positions.clear();
+            }
+            current_document_id = Some(document_id);
+            // drain the window while its head is too far (>= MAX_DISTANCE) from the new word.
+            while word_positions.get(0).map_or(false, |(_w, p)| {
+                positions_proximity(*p as u32, position as u32) >= MAX_DISTANCE
+            }) {
+                word_positions_into_word_pair_proximity(
+                    &mut word_positions,
+                    &mut word_pair_proximity,
+                )?;
+            }
+
+            // insert the new word.
+            let word = std::str::from_utf8(word)?;
+            word_positions.push((word.to_string(), position));
         }
     }
 
     if let Some(document_id) = current_document_id {
-        // We must make sure that don't lose the current document field id
-        // word count map if we break because we reached the end of the chunk.
-        let document_word_positions_heap = mem::take(&mut document_word_positions_heap);
+        while !word_positions.is_empty() {
+            word_positions_into_word_pair_proximity(&mut word_positions, &mut word_pair_proximity)?;
+        }
+
         document_word_positions_into_sorter(
             document_id,
-            document_word_positions_heap,
+            &word_pair_proximity,
             &mut word_pair_proximity_docids_sorter,
         )?;
     }
@@ -86,64 +103,13 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
 /// close to each other.
 fn document_word_positions_into_sorter(
     document_id: DocumentId,
-    mut word_positions_heap: BinaryHeap<PeekedWordPosition<vec::IntoIter<u32>>>,
+    word_pair_proximity: &HashMap<(String, String), u8>,
     word_pair_proximity_docids_sorter: &mut grenad::Sorter<MergeFn>,
 ) -> Result<()> {
-    let mut word_pair_proximity = HashMap::new();
-    let mut ordered_peeked_word_positions = Vec::new();
-    while !word_positions_heap.is_empty() {
-        while let Some(peeked_word_position) = word_positions_heap.pop() {
-            ordered_peeked_word_positions.push(peeked_word_position);
-            if ordered_peeked_word_positions.len() == 7 {
-                break;
-            }
-        }
-
-        if let Some((head, tail)) = ordered_peeked_word_positions.split_first() {
-            for PeekedWordPosition { word, position, .. } in tail {
-                let prox = positions_proximity(head.position, *position);
-                if prox > 0 && prox < MAX_DISTANCE {
-                    word_pair_proximity
-                        .entry((head.word.clone(), word.clone()))
-                        .and_modify(|p| {
-                            *p = cmp::min(*p, prox);
-                        })
-                        .or_insert(prox);
-                }
-            }
-
-            // Push the tail in the heap.
-            let tail_iter = ordered_peeked_word_positions.drain(1..);
-            word_positions_heap.extend(tail_iter);
-
-            // Advance the head and push it in the heap.
-            if let Some(mut head) = ordered_peeked_word_positions.pop() {
-                if let Some(next_position) = head.iter.next() {
-                    let prox = positions_proximity(head.position, next_position);
-
-                    if prox > 0 && prox < MAX_DISTANCE {
-                        word_pair_proximity
-                            .entry((head.word.clone(), head.word.clone()))
-                            .and_modify(|p| {
-                                *p = cmp::min(*p, prox);
-                            })
-                            .or_insert(prox);
-                    }
-
-                    word_positions_heap.push(PeekedWordPosition {
-                        word: head.word,
-                        position: next_position,
-                        iter: head.iter,
-                    });
-                }
-            }
-        }
-    }
-
     let mut key_buffer = Vec::new();
     for ((w1, w2), prox) in word_pair_proximity {
         key_buffer.clear();
-        key_buffer.push(prox as u8);
+        key_buffer.push(*prox as u8);
         key_buffer.extend_from_slice(w1.as_bytes());
         key_buffer.push(0);
         key_buffer.extend_from_slice(w2.as_bytes());
@@ -154,6 +120,23 @@ fn document_word_positions_into_sorter(
     Ok(())
 }
 
+/// Removes the head of `word_positions` (panics if empty) and records, for each remaining word,
+/// the smallest proximity of the pair, ignoring proximities outside `1..MAX_DISTANCE`.
+fn word_positions_into_word_pair_proximity(
+    word_positions: &mut Vec<(String, u16)>,
+    word_pair_proximity: &mut HashMap<(String, String), u8>,
+) -> Result<()> {
+    let (head_word, head_position) = word_positions.remove(0);
+    for (word, position) in word_positions.iter() {
+        let prox = positions_proximity(head_position as u32, *position as u32);
+        if prox > 0 && prox < MAX_DISTANCE {
+            let entry = word_pair_proximity.entry((head_word.clone(), word.clone()));
+            entry.and_modify(|p| *p = cmp::min(*p, prox as u8)).or_insert(prox as u8);
+        }
+    }
+    Ok(())
+}
+
 struct PeekedWordPosition<I> {
     word: String,
     position: u32,
diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs
index 94139ddf8..db2f6217f 100644
--- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs
@@ -1,13 +1,15 @@
 use std::fs::File;
 use std::io::{self, BufReader};
 
+use obkv::KvReaderU16;
+
 use super::helpers::{
-    create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
-    try_split_array_at, GrenadParameters,
+    create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at,
+    GrenadParameters,
 };
 use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
-use crate::{bucketed_position, relative_from_absolute_position, DocumentId, Result};
+use crate::{bucketed_position, DocumentId, Result};
 
 /// Extracts the word positions and the documents ids where this word appear.
 ///
@@ -34,15 +36,14 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
     let mut key_buffer = Vec::new();
     let mut cursor = docid_word_positions.into_cursor()?;
     while let Some((key, value)) = cursor.move_on_next()? {
-        let (document_id_bytes, word_bytes) = try_split_array_at(key)
+        let (document_id_bytes, fid_bytes) = try_split_array_at(key)
             .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
         let document_id = DocumentId::from_be_bytes(document_id_bytes);
 
-        for position in read_u32_ne_bytes(value) {
+        for (position, word_bytes) in KvReaderU16::new(&value).iter() {
             key_buffer.clear();
             key_buffer.extend_from_slice(word_bytes);
             key_buffer.push(0);
-            let (_, position) = relative_from_absolute_position(position);
             let position = bucketed_position(position);
             key_buffer.extend_from_slice(&position.to_be_bytes());
             word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
index f44eac8f5..a6cc04111 100644
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -176,16 +176,23 @@ pub(crate) fn data_from_obkv_documents(
     spawn_extraction_task::<
         _,
         _,
-        Vec<(grenad::Reader<BufReader<File>>, grenad::Reader<BufReader<File>>)>,
+        Vec<(
+            grenad::Reader<BufReader<File>>,
+            grenad::Reader<BufReader<File>>,
+            grenad::Reader<BufReader<File>>,
+        )>,
     >(
         docid_word_positions_chunks.clone(),
         indexer,
         lmdb_writer_sx.clone(),
         move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes),
         merge_roaring_bitmaps,
-        |(word_docids_reader, exact_word_docids_reader)| TypedChunk::WordDocids {
-            word_docids_reader,
-            exact_word_docids_reader,
+        |(word_docids_reader, exact_word_docids_reader, word_fid_docids_reader)| {
+            TypedChunk::WordDocids {
+                word_docids_reader,
+                exact_word_docids_reader,
+                word_fid_docids_reader,
+            }
         },
         "word-docids",
     );
@@ -199,15 +206,15 @@ pub(crate) fn data_from_obkv_documents(
         TypedChunk::WordPositionDocids,
         "word-position-docids",
     );
-    spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
-        docid_word_positions_chunks,
-        indexer,
-        lmdb_writer_sx.clone(),
-        extract_word_fid_docids,
-        merge_cbo_roaring_bitmaps,
-        TypedChunk::WordFidDocids,
-        "word-fid-docids",
-    );
+    // spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
+    //     docid_word_positions_chunks,
+    //     indexer,
+    //     lmdb_writer_sx.clone(),
+    //     extract_word_fid_docids,
+    //     merge_cbo_roaring_bitmaps,
+    //     TypedChunk::WordFidDocids,
+    //     "word-fid-docids",
+    // );
 
     spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
         docid_fid_facet_strings_chunks,
diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs
index 582bf2a5b..6c3a81a0e 100644
--- a/milli/src/update/index_documents/helpers/grenad_helpers.rs
+++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs
@@ -115,6 +115,32 @@ impl MergeableReader for Vec<(grenad::Reader<BufReader<File>>, grenad::Reader<Bu
     }
 }
 
+impl MergeableReader
+    for Vec<(
+        grenad::Reader<BufReader<File>>,
+        grenad::Reader<BufReader<File>>,
+        grenad::Reader<BufReader<File>>,
+    )>
+{
+    type Output = (
+        grenad::Reader<BufReader<File>>,
+        grenad::Reader<BufReader<File>>,
+        grenad::Reader<BufReader<File>>,
+    );
+    /// Feeds the n-th reader of every tuple to the n-th builder and finishes the three builders.
+    fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
+        let mut first = MergerBuilder::new(merge_fn);
+        let mut second = MergerBuilder::new(merge_fn);
+        let mut third = MergerBuilder::new(merge_fn);
+        for (reader1, reader2, reader3) in self {
+            first.push(reader1)?;
+            second.push(reader2)?;
+            third.push(reader3)?;
+        }
+        Ok((first.finish(params)?, second.finish(params)?, third.finish(params)?))
+    }
+}
+
 struct MergerBuilder<R>(grenad::MergerBuilder<R, MergeFn>);
 
 impl<R: io::Read + io::Seek> MergerBuilder<R> {
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 52aa1113e..58219f28c 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -406,13 +406,23 @@ where
             }
 
             let typed_chunk = match result? {
-                TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => {
+                TypedChunk::WordDocids {
+                    word_docids_reader,
+                    exact_word_docids_reader,
+                    word_fid_docids_reader,
+                } => {
                     let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? };
                     word_docids = Some(cloneable_chunk);
                     let cloneable_chunk =
                         unsafe { as_cloneable_grenad(&exact_word_docids_reader)? };
                     exact_word_docids = Some(cloneable_chunk);
-                    TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader }
+                    let cloneable_chunk = unsafe { as_cloneable_grenad(&word_fid_docids_reader)? };
+                    word_fid_docids = Some(cloneable_chunk);
+                    TypedChunk::WordDocids {
+                        word_docids_reader,
+                        exact_word_docids_reader,
+                        word_fid_docids_reader,
+                    }
                 }
                 TypedChunk::WordPairProximityDocids(chunk) => {
                     let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index 5895a69c5..d57484cab 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -32,6 +32,7 @@ pub(crate) enum TypedChunk {
     WordDocids {
         word_docids_reader: grenad::Reader<BufReader<File>>,
         exact_word_docids_reader: grenad::Reader<BufReader<File>>,
+        word_fid_docids_reader: grenad::Reader<BufReader<File>>,
     },
     WordPositionDocids(grenad::Reader<BufReader<File>>),
     WordFidDocids(grenad::Reader<BufReader<File>>),
@@ -64,10 +65,15 @@ impl TypedChunk {
             TypedChunk::NewDocumentsIds(grenad) => {
                 format!("NewDocumentsIds {{ number_of_entries: {} }}", grenad.len())
             }
-            TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => format!(
-                "WordDocids {{ word_docids_reader: {}, exact_word_docids_reader: {} }}",
+            TypedChunk::WordDocids {
+                word_docids_reader,
+                exact_word_docids_reader,
+                word_fid_docids_reader,
+            } => format!(
+                "WordDocids {{ word_docids_reader: {}, exact_word_docids_reader: {}, word_fid_docids_reader: {} }}",
                 word_docids_reader.len(),
-                exact_word_docids_reader.len()
+                exact_word_docids_reader.len(),
+                word_fid_docids_reader.len()
             ),
             TypedChunk::WordPositionDocids(grenad) => {
                 format!("WordPositionDocids {{ number_of_entries: {} }}", grenad.len())
@@ -138,7 +144,11 @@ pub(crate) fn write_typed_chunk_into_index(
         TypedChunk::NewDocumentsIds(documents_ids) => {
             return Ok((documents_ids, is_merged_database))
         }
-        TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => {
+        TypedChunk::WordDocids {
+            word_docids_reader,
+            exact_word_docids_reader,
+            word_fid_docids_reader,
+        } => {
             let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_reader) }?;
             append_entries_into_database(
                 word_docids_iter.clone(),
@@ -159,6 +169,16 @@ pub(crate) fn write_typed_chunk_into_index(
                 merge_roaring_bitmaps,
             )?;
 
+            let word_fid_docids_iter = unsafe { as_cloneable_grenad(&word_fid_docids_reader) }?;
+            append_entries_into_database(
+                word_fid_docids_iter,
+                &index.word_fid_docids,
+                wtxn,
+                index_is_empty,
+                |value, _buffer| Ok(value),
+                merge_cbo_roaring_bitmaps,
+            )?;
+
             // create fst from word docids
             let fst = merge_word_docids_reader_into_fst(word_docids_iter, exact_word_docids_iter)?;
             let db_fst = index.words_fst(wtxn)?;

From 748b333161729c63a5611b2b56dbe99544648b1d Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Tue, 19 Sep 2023 14:12:43 +0200
Subject: [PATCH 002/127] Add useful debug assert before key insertion in
 database

---
 milli/src/update/index_documents/typed_chunk.rs | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index d57484cab..a450b5f34 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -475,6 +475,7 @@ where
     R: io::Read + io::Seek,
     FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
     FM: Fn(&[u8], &[u8], &mut Vec<u8>) -> Result<()>,
+    K: for<'a> heed::BytesDecode<'a>,
 {
     puffin::profile_function!(format!("number of entries: {}", data.len()));
 
@@ -495,6 +496,12 @@ where
     let mut cursor = data.into_cursor()?;
     while let Some((key, value)) = cursor.move_on_next()? {
         if valid_lmdb_key(key) {
+            debug_assert!(
+                K::bytes_decode(&key).is_some(),
+                "Couldn't decode key with the database decoder, key length: {} - key bytes: {:x?}",
+                key.len(),
+                &key
+            );
             buffer.clear();
             let value = serialize_value(value, &mut buffer)?;
             unsafe { database.append(key, value)? };

From 8d77736a6795cba0e2eff9727015928f0aa13c3b Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Tue, 19 Sep 2023 14:20:57 +0200
Subject: [PATCH 003/127] Fix fid_word_docids

---
 milli/src/update/index_documents/extract/extract_word_docids.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs
index 8b93ea23c..8c72ba48a 100644
--- a/milli/src/update/index_documents/extract/extract_word_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_docids.rs
@@ -156,7 +156,7 @@ fn words_into_sorters(
         key_buffer.extend_from_slice(&word_bytes);
         key_buffer.push(0);
         key_buffer.extend_from_slice(&fid.to_be_bytes());
-        word_fid_docids_sorter.insert(word_bytes, &value_buffer)?;
+        word_fid_docids_sorter.insert(&key_buffer, &value_buffer)?;
     }
 
     exact_words.clear();

From 11ea5acff94b70ab0181010a277c663630b5d1d4 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Tue, 19 Sep 2023 18:29:21 +0200
Subject: [PATCH 004/127] Fix

---
 .../extract/extract_word_docids.rs            |  2 +-
 .../extract_word_pair_proximity_docids.rs     | 32 +++++++++----------
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs
index 8c72ba48a..84c6f8635 100644
--- a/milli/src/update/index_documents/extract/extract_word_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_docids.rs
@@ -73,7 +73,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
     while let Some((key, value)) = cursor.move_on_next()? {
         let (document_id_bytes, fid_bytes) = try_split_array_at(key)
             .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
-        let (fid_bytes, _) = try_split_array_at(key)
+        let (fid_bytes, _) = try_split_array_at(fid_bytes)
             .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
         let document_id = u32::from_be_bytes(document_id_bytes);
         fid = u16::from_be_bytes(fid_bytes);
diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
index 41604ff4a..6373d5822 100644
--- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
@@ -47,25 +47,25 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
             .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
         let document_id = u32::from_be_bytes(document_id_bytes);
 
-        for (position, word) in KvReaderU16::new(&value).iter() {
-            // if we change document, we fill the sorter
-            if current_document_id.map_or(false, |id| id != document_id) {
-                while !word_positions.is_empty() {
-                    word_positions_into_word_pair_proximity(
-                        &mut word_positions,
-                        &mut word_pair_proximity,
-                    )?;
-                }
-
-                document_word_positions_into_sorter(
-                    document_id,
-                    &word_pair_proximity,
-                    &mut word_pair_proximity_docids_sorter,
+        // if we change document, we fill the sorter
+        if current_document_id.map_or(false, |id| id != document_id) {
+            while !word_positions.is_empty() {
+                word_positions_into_word_pair_proximity(
+                    &mut word_positions,
+                    &mut word_pair_proximity,
                 )?;
-                word_pair_proximity.clear();
-                word_positions.clear();
             }
 
+            document_word_positions_into_sorter(
+                document_id,
+                &word_pair_proximity,
+                &mut word_pair_proximity_docids_sorter,
+            )?;
+            word_pair_proximity.clear();
+            word_positions.clear();
+        }
+
+        for (position, word) in KvReaderU16::new(&value).iter() {
             // drain the proximity window until the head word is considered close to the word we are inserting.
             while word_positions.get(0).map_or(false, |(_w, p)| {
                 positions_proximity(*p as u32, position as u32) > MAX_DISTANCE

From db1ca2123103f77589d28868d76bff87c3fd567c Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Tue, 19 Sep 2023 18:40:35 +0200
Subject: [PATCH 005/127] Add puffin profiling in the sorter_into_reader function

---
 milli/src/update/index_documents/helpers/grenad_helpers.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs
index 6c3a81a0e..cc0ccb609 100644
--- a/milli/src/update/index_documents/helpers/grenad_helpers.rs
+++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs
@@ -54,6 +54,7 @@ pub fn sorter_into_reader(
     sorter: grenad::Sorter<MergeFn>,
     indexer: GrenadParameters,
 ) -> Result<grenad::Reader<BufReader<File>>> {
+    puffin::profile_function!();
     let mut writer = create_writer(
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,

From 8ccf32d1a06f4225c752be2554b28791704a5254 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Tue, 19 Sep 2023 19:29:06 +0200
Subject: [PATCH 006/127] Compute word_fid_docids before word_docids and
 exact_word_docids

---
 .../extract/extract_word_docids.rs            | 139 +++++++++---------
 1 file changed, 68 insertions(+), 71 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs
index 84c6f8635..8409f2836 100644
--- a/milli/src/update/index_documents/extract/extract_word_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_docids.rs
@@ -3,14 +3,16 @@ use std::fs::File;
 use std::io::{self, BufReader};
 use std::iter::FromIterator;
 
+use heed::BytesDecode;
 use obkv::KvReaderU16;
 use roaring::RoaringBitmap;
 
 use super::helpers::{
-    create_sorter, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, serialize_roaring_bitmap,
-    sorter_into_reader, try_split_array_at, GrenadParameters,
+    create_sorter, create_writer, merge_roaring_bitmaps, serialize_roaring_bitmap,
+    sorter_into_reader, try_split_array_at, writer_into_reader, GrenadParameters,
 };
 use crate::error::SerializationError;
+use crate::heed_codec::StrBEU16Codec;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
 use crate::update::MergeFn;
 use crate::{DocumentId, FieldId, Result};
@@ -36,6 +38,59 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
 
     let max_memory = indexer.max_memory_by_thread();
 
+    let mut word_fid_docids_sorter = create_sorter(
+        grenad::SortAlgorithm::Unstable,
+        merge_roaring_bitmaps,
+        indexer.chunk_compression_type,
+        indexer.chunk_compression_level,
+        indexer.max_nb_chunks,
+        max_memory.map(|x| x / 3),
+    );
+
+    let mut current_document_id = None;
+    let mut fid = 0;
+    let mut key_buffer = Vec::new();
+    let mut value_buffer = Vec::new();
+    let mut words = BTreeSet::new();
+    let mut cursor = docid_word_positions.into_cursor()?;
+    while let Some((key, value)) = cursor.move_on_next()? {
+        let (document_id_bytes, fid_bytes) = try_split_array_at(key)
+            .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
+        let (fid_bytes, _) = try_split_array_at(fid_bytes)
+            .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
+        let document_id = u32::from_be_bytes(document_id_bytes);
+        fid = u16::from_be_bytes(fid_bytes);
+
+        // drain the btreemaps when we change document.
+        if current_document_id.map_or(false, |id| id != document_id) {
+            words_into_sorter(
+                document_id,
+                fid,
+                &mut key_buffer,
+                &mut value_buffer,
+                &mut words,
+                &mut word_fid_docids_sorter,
+            )?;
+        }
+
+        current_document_id = Some(document_id);
+        for (_pos, word) in KvReaderU16::new(&value).iter() {
+            words.insert(word.to_vec());
+        }
+    }
+
+    // We must make sure that don't lose the current document field id
+    if let Some(document_id) = current_document_id {
+        words_into_sorter(
+            document_id,
+            fid,
+            &mut key_buffer,
+            &mut value_buffer,
+            &mut words,
+            &mut word_fid_docids_sorter,
+        )?;
+    }
+
     let mut word_docids_sorter = create_sorter(
         grenad::SortAlgorithm::Unstable,
         merge_roaring_bitmaps,
@@ -54,104 +109,47 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
         max_memory.map(|x| x / 3),
     );
 
-    let mut word_fid_docids_sorter = create_sorter(
-        grenad::SortAlgorithm::Unstable,
-        merge_roaring_bitmaps,
+    let mut word_fid_docids_writer = create_writer(
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
-        indexer.max_nb_chunks,
-        max_memory.map(|x| x / 3),
+        tempfile::tempfile()?,
     );
 
-    let mut current_document_id = None;
-    let mut fid = 0;
-    let mut key_buffer = Vec::new();
-    let mut value_buffer = Vec::new();
-    let mut words = BTreeSet::new();
-    let mut exact_words = BTreeSet::new();
-    let mut cursor = docid_word_positions.into_cursor()?;
-    while let Some((key, value)) = cursor.move_on_next()? {
-        let (document_id_bytes, fid_bytes) = try_split_array_at(key)
-            .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
-        let (fid_bytes, _) = try_split_array_at(fid_bytes)
-            .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
-        let document_id = u32::from_be_bytes(document_id_bytes);
-        fid = u16::from_be_bytes(fid_bytes);
+    let mut iter = word_fid_docids_sorter.into_stream_merger_iter()?;
+    while let Some((key, value)) = iter.next()? {
+        word_fid_docids_writer.insert(key, value)?;
 
-        // drain the btreemaps when we change document.
-        if current_document_id.map_or(false, |id| id != document_id) {
-            words_into_sorters(
-                document_id,
-                fid,
-                &mut key_buffer,
-                &mut value_buffer,
-                &mut exact_words,
-                &mut words,
-                &mut exact_word_docids_sorter,
-                &mut word_docids_sorter,
-                &mut word_fid_docids_sorter,
-            )?;
-        }
-
-        current_document_id = Some(document_id);
+        let (word, fid) = StrBEU16Codec::bytes_decode(key)
+            .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
 
         // every words contained in an attribute set to exact must be pushed in the exact_words list.
         if exact_attributes.contains(&fid) {
-            for (_pos, word) in KvReaderU16::new(&value).iter() {
-                exact_words.insert(word.to_vec());
-            }
+            exact_word_docids_sorter.insert(word.as_bytes(), &value)?;
         } else {
-            for (_pos, word) in KvReaderU16::new(&value).iter() {
-                words.insert(word.to_vec());
-            }
+            word_docids_sorter.insert(word.as_bytes(), &value)?;
         }
     }
 
-    // We must make sure that don't lose the current document field id
-    if let Some(document_id) = current_document_id {
-        words_into_sorters(
-            document_id,
-            fid,
-            &mut key_buffer,
-            &mut value_buffer,
-            &mut exact_words,
-            &mut words,
-            &mut exact_word_docids_sorter,
-            &mut word_docids_sorter,
-            &mut word_fid_docids_sorter,
-        )?;
-    }
-
     Ok((
         sorter_into_reader(word_docids_sorter, indexer)?,
         sorter_into_reader(exact_word_docids_sorter, indexer)?,
-        sorter_into_reader(word_fid_docids_sorter, indexer)?,
+        writer_into_reader(word_fid_docids_writer)?,
     ))
 }
 
-fn words_into_sorters(
+fn words_into_sorter(
     document_id: DocumentId,
     fid: FieldId,
     key_buffer: &mut Vec<u8>,
     value_buffer: &mut Vec<u8>,
-    exact_words: &mut BTreeSet<Vec<u8>>,
     words: &mut BTreeSet<Vec<u8>>,
-    exact_word_docids_sorter: &mut grenad::Sorter<MergeFn>,
-    word_docids_sorter: &mut grenad::Sorter<MergeFn>,
     word_fid_docids_sorter: &mut grenad::Sorter<MergeFn>,
 ) -> Result<()> {
     puffin::profile_function!();
     let bitmap = RoaringBitmap::from_iter(Some(document_id));
     serialize_roaring_bitmap(&bitmap, value_buffer)?;
-    for word_bytes in exact_words.iter() {
-        exact_word_docids_sorter.insert(word_bytes, &mut *value_buffer)?;
-    }
 
     for word_bytes in words.iter() {
-        word_docids_sorter.insert(word_bytes, &value_buffer)?;
-    }
-
-    for word_bytes in (&*words | &*exact_words).iter() {
         key_buffer.clear();
         key_buffer.extend_from_slice(&word_bytes);
         key_buffer.push(0);
@@ -159,7 +157,6 @@ fn words_into_sorters(
         word_fid_docids_sorter.insert(&key_buffer, &value_buffer)?;
     }
 
-    exact_words.clear();
     words.clear();
 
     Ok(())

From b541d48847ff81382a69a925eac702356aa85287 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Thu, 21 Sep 2023 10:02:08 +0200
Subject: [PATCH 007/127] Add buffer to the obkv writer

---
 .../extract/extract_docid_word_positions.rs   |  5 +++-
 .../extract/extract_word_position_docids.rs   | 30 ++++++++++++++++---
 2 files changed, 30 insertions(+), 5 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
index 6aa66c92a..ea329b212 100644
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -156,7 +156,8 @@ fn extract_tokens_from_document(
                 let tokens = process_tokens(tokenizer.tokenize(field))
                     .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
 
-                let mut writer = KvWriterU16::memory();
+                buffers.obkv_buffer.clear();
+                let mut writer = KvWriterU16::new(&mut buffers.obkv_buffer);
                 for (index, token) in tokens {
                     // if a language has been detected for the token, we update the counter.
                     if let Some(language) = token.language {
@@ -294,4 +295,6 @@ struct Buffers {
     key_buffer: Vec<u8>,
     // the field buffer for each fields desserialization, and must be cleared between each field.
     field_buffer: String,
+    // buffer used to store the value data containing an obkv.
+    obkv_buffer: Vec<u8>,
 }
diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs
index db2f6217f..0b07f63b5 100644
--- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs
@@ -1,3 +1,4 @@
+use std::collections::HashSet;
 use std::fs::File;
 use std::io::{self, BufReader};
 
@@ -33,18 +34,39 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
         max_memory,
     );
 
+    let mut word_positions: HashSet<(u16, Vec<u8>)> = HashSet::new();
+    let mut current_document_id = None;
     let mut key_buffer = Vec::new();
     let mut cursor = docid_word_positions.into_cursor()?;
     while let Some((key, value)) = cursor.move_on_next()? {
-        let (document_id_bytes, fid_bytes) = try_split_array_at(key)
+        let (document_id_bytes, _fid_bytes) = try_split_array_at(key)
             .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
         let document_id = DocumentId::from_be_bytes(document_id_bytes);
 
+        if current_document_id.map_or(false, |id| document_id != id) {
+            for (position, word_bytes) in word_positions.iter() {
+                key_buffer.clear();
+                key_buffer.extend_from_slice(word_bytes);
+                key_buffer.push(0);
+                key_buffer.extend_from_slice(&position.to_be_bytes());
+                word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
+            }
+            word_positions.clear();
+        }
+
+        current_document_id = Some(document_id);
+
         for (position, word_bytes) in KvReaderU16::new(&value).iter() {
-            key_buffer.clear();
-            key_buffer.extend_from_slice(word_bytes);
-            key_buffer.push(0);
             let position = bucketed_position(position);
+            word_positions.insert((position, word_bytes.to_vec()));
+        }
+    }
+
+    if let Some(document_id) = current_document_id {
+        for (position, word_bytes) in word_positions {
+            key_buffer.clear();
+            key_buffer.extend_from_slice(&word_bytes);
+            key_buffer.push(0);
             key_buffer.extend_from_slice(&position.to_be_bytes());
             word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
         }

From df9e5c8651d6043d522d425457b2e9559ddb8224 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Mon, 25 Sep 2023 16:39:32 +0200
Subject: [PATCH 008/127] Generalize usage of CboRoaringBitmap codec to ease
 the use

---
 milli/src/index.rs                            |  8 ++---
 milli/src/search/new/db_cache.rs              |  8 ++---
 milli/src/update/delete_documents.rs          |  4 +--
 .../extract/extract_docid_word_positions.rs   |  6 ++++
 .../extract/extract_word_docids.rs            | 35 +++++--------------
 .../extract_word_pair_proximity_docids.rs     |  4 ++-
 .../extract/extract_word_position_docids.rs   |  5 +--
 .../src/update/index_documents/extract/mod.rs |  2 +-
 milli/src/update/index_documents/mod.rs       |  6 ++--
 .../src/update/index_documents/typed_chunk.rs |  4 +--
 milli/src/update/word_prefix_docids.rs        | 16 ++++-----
 11 files changed, 44 insertions(+), 54 deletions(-)

diff --git a/milli/src/index.rs b/milli/src/index.rs
index d563f852b..288223a95 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -119,16 +119,16 @@ pub struct Index {
     pub(crate) main: PolyDatabase,
 
     /// A word and all the documents ids containing the word.
-    pub word_docids: Database<Str, RoaringBitmapCodec>,
+    pub word_docids: Database<Str, CboRoaringBitmapCodec>,
 
     /// A word and all the documents ids containing the word, from attributes for which typos are not allowed.
-    pub exact_word_docids: Database<Str, RoaringBitmapCodec>,
+    pub exact_word_docids: Database<Str, CboRoaringBitmapCodec>,
 
     /// A prefix of word and all the documents ids containing this prefix.
-    pub word_prefix_docids: Database<Str, RoaringBitmapCodec>,
+    pub word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
 
     /// A prefix of word and all the documents ids containing this prefix, from attributes for which typos are not allowed.
-    pub exact_word_prefix_docids: Database<Str, RoaringBitmapCodec>,
+    pub exact_word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
 
     /// Maps the proximity between a pair of words with all the docids where this relation appears.
     pub word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs
index e0a2ba3cf..3f4751185 100644
--- a/milli/src/search/new/db_cache.rs
+++ b/milli/src/search/new/db_cache.rs
@@ -168,7 +168,7 @@ impl<'ctx> SearchContext<'ctx> {
                     merge_cbo_roaring_bitmaps,
                 )
             }
-            None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
+            None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
                 self.txn,
                 word,
                 self.word_interner.get(word).as_str(),
@@ -182,7 +182,7 @@ impl<'ctx> SearchContext<'ctx> {
         &mut self,
         word: Interned<String>,
     ) -> Result<Option<RoaringBitmap>> {
-        DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
+        DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
             self.txn,
             word,
             self.word_interner.get(word).as_str(),
@@ -230,7 +230,7 @@ impl<'ctx> SearchContext<'ctx> {
                     merge_cbo_roaring_bitmaps,
                 )
             }
-            None => DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
+            None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
                 self.txn,
                 prefix,
                 self.word_interner.get(prefix).as_str(),
@@ -244,7 +244,7 @@ impl<'ctx> SearchContext<'ctx> {
         &mut self,
         prefix: Interned<String>,
     ) -> Result<Option<RoaringBitmap>> {
-        DatabaseCache::get_value::<_, _, RoaringBitmapCodec>(
+        DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
             self.txn,
             prefix,
             self.word_interner.get(prefix).as_str(),
diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs
index 164ad0c7e..c3b2cf1a3 100644
--- a/milli/src/update/delete_documents.rs
+++ b/milli/src/update/delete_documents.rs
@@ -499,7 +499,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
 
 fn remove_from_word_prefix_docids(
     txn: &mut heed::RwTxn,
-    db: &Database<Str, RoaringBitmapCodec>,
+    db: &Database<Str, CboRoaringBitmapCodec>,
     to_remove: &RoaringBitmap,
 ) -> Result<fst::Set<Vec<u8>>> {
     puffin::profile_function!();
@@ -529,7 +529,7 @@ fn remove_from_word_prefix_docids(
 
 fn remove_from_word_docids(
     txn: &mut heed::RwTxn,
-    db: &heed::Database<Str, RoaringBitmapCodec>,
+    db: &heed::Database<Str, CboRoaringBitmapCodec>,
     to_remove: &RoaringBitmap,
     words_to_keep: &mut BTreeSet<String>,
     words_to_remove: &mut BTreeSet<String>,
diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
index ea329b212..a45d488e4 100644
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -107,6 +107,12 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
                 if let Some(stop_words) = stop_words {
                     tokenizer_builder.stop_words(stop_words);
                 }
+                if let Some(dictionary) = dictionary {
+                    tokenizer_builder.words_dict(dictionary);
+                }
+                if let Some(separators) = allowed_separators {
+                    tokenizer_builder.separators(separators);
+                }
                 tokenizer_builder.allow_list(&script_language);
                 let tokenizer = tokenizer_builder.build();
 
diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs
index 8409f2836..d9fb72cc2 100644
--- a/milli/src/update/index_documents/extract/extract_word_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_docids.rs
@@ -8,7 +8,7 @@ use obkv::KvReaderU16;
 use roaring::RoaringBitmap;
 
 use super::helpers::{
-    create_sorter, create_writer, merge_roaring_bitmaps, serialize_roaring_bitmap,
+    create_sorter, create_writer, merge_cbo_roaring_bitmaps, serialize_roaring_bitmap,
     sorter_into_reader, try_split_array_at, writer_into_reader, GrenadParameters,
 };
 use crate::error::SerializationError;
@@ -40,15 +40,12 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
 
     let mut word_fid_docids_sorter = create_sorter(
         grenad::SortAlgorithm::Unstable,
-        merge_roaring_bitmaps,
+        merge_cbo_roaring_bitmaps,
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
         indexer.max_nb_chunks,
         max_memory.map(|x| x / 3),
     );
-
-    let mut current_document_id = None;
-    let mut fid = 0;
     let mut key_buffer = Vec::new();
     let mut value_buffer = Vec::new();
     let mut words = BTreeSet::new();
@@ -59,28 +56,12 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
         let (fid_bytes, _) = try_split_array_at(fid_bytes)
             .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
         let document_id = u32::from_be_bytes(document_id_bytes);
-        fid = u16::from_be_bytes(fid_bytes);
+        let fid = u16::from_be_bytes(fid_bytes);
 
-        // drain the btreemaps when we change document.
-        if current_document_id.map_or(false, |id| id != document_id) {
-            words_into_sorter(
-                document_id,
-                fid,
-                &mut key_buffer,
-                &mut value_buffer,
-                &mut words,
-                &mut word_fid_docids_sorter,
-            )?;
-        }
-
-        current_document_id = Some(document_id);
         for (_pos, word) in KvReaderU16::new(&value).iter() {
             words.insert(word.to_vec());
         }
-    }
 
-    // We must make sure that don't lose the current document field id
-    if let Some(document_id) = current_document_id {
         words_into_sorter(
             document_id,
             fid,
@@ -89,11 +70,13 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
             &mut words,
             &mut word_fid_docids_sorter,
         )?;
+
+        words.clear();
     }
 
     let mut word_docids_sorter = create_sorter(
         grenad::SortAlgorithm::Unstable,
-        merge_roaring_bitmaps,
+        merge_cbo_roaring_bitmaps,
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
         indexer.max_nb_chunks,
@@ -102,7 +85,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
 
     let mut exact_word_docids_sorter = create_sorter(
         grenad::SortAlgorithm::Unstable,
-        merge_roaring_bitmaps,
+        merge_cbo_roaring_bitmaps,
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
         indexer.max_nb_chunks,
@@ -146,15 +129,13 @@ fn words_into_sorter(
     word_fid_docids_sorter: &mut grenad::Sorter<MergeFn>,
 ) -> Result<()> {
     puffin::profile_function!();
-    let bitmap = RoaringBitmap::from_iter(Some(document_id));
-    serialize_roaring_bitmap(&bitmap, value_buffer)?;
 
     for word_bytes in words.iter() {
         key_buffer.clear();
         key_buffer.extend_from_slice(&word_bytes);
         key_buffer.push(0);
         key_buffer.extend_from_slice(&fid.to_be_bytes());
-        word_fid_docids_sorter.insert(&key_buffer, &value_buffer)?;
+        word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
     }
 
     words.clear();
diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
index 6373d5822..d54513786 100644
--- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
@@ -57,7 +57,7 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
             }
 
             document_word_positions_into_sorter(
-                document_id,
+                current_document_id.unwrap(),
                 &word_pair_proximity,
                 &mut word_pair_proximity_docids_sorter,
             )?;
@@ -65,6 +65,8 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
             word_positions.clear();
         }
 
+        current_document_id = Some(document_id);
+
         for (position, word) in KvReaderU16::new(&value).iter() {
             // drain the proximity window until the head word is considered close to the word we are inserting.
             while word_positions.get(0).map_or(false, |(_w, p)| {
diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs
index 0b07f63b5..220dca960 100644
--- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs
@@ -35,7 +35,7 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
     );
 
     let mut word_positions: HashSet<(u16, Vec<u8>)> = HashSet::new();
-    let mut current_document_id = None;
+    let mut current_document_id: Option<u32> = None;
     let mut key_buffer = Vec::new();
     let mut cursor = docid_word_positions.into_cursor()?;
     while let Some((key, value)) = cursor.move_on_next()? {
@@ -49,7 +49,8 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
                 key_buffer.extend_from_slice(word_bytes);
                 key_buffer.push(0);
                 key_buffer.extend_from_slice(&position.to_be_bytes());
-                word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
+                word_position_docids_sorter
+                    .insert(&key_buffer, current_document_id.unwrap().to_ne_bytes())?;
             }
             word_positions.clear();
         }
diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
index a6cc04111..32ec6fe5c 100644
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -186,7 +186,7 @@ pub(crate) fn data_from_obkv_documents(
         indexer,
         lmdb_writer_sx.clone(),
         move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes),
-        merge_roaring_bitmaps,
+        merge_cbo_roaring_bitmaps,
         |(word_docids_reader, exact_word_docids_reader, word_fid_docids_reader)| {
             TypedChunk::WordDocids {
                 word_docids_reader,
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 58219f28c..22e42937f 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -38,7 +38,7 @@ use crate::update::{
     self, DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep,
     WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
 };
-use crate::{Index, Result, RoaringBitmapCodec};
+use crate::{CboRoaringBitmapCodec, Index, Result, RoaringBitmapCodec};
 
 static MERGED_DATABASE_COUNT: usize = 7;
 static PREFIX_DATABASE_COUNT: usize = 5;
@@ -700,8 +700,8 @@ where
 fn execute_word_prefix_docids(
     txn: &mut heed::RwTxn,
     reader: grenad::Reader<Cursor<ClonableMmap>>,
-    word_docids_db: Database<Str, RoaringBitmapCodec>,
-    word_prefix_docids_db: Database<Str, RoaringBitmapCodec>,
+    word_docids_db: Database<Str, CboRoaringBitmapCodec>,
+    word_prefix_docids_db: Database<Str, CboRoaringBitmapCodec>,
     indexer_config: &IndexerConfig,
     new_prefix_fst_words: &[String],
     common_prefix_fst_words: &[&[String]],
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index a450b5f34..cf3194255 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -156,7 +156,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 |value, _buffer| Ok(value),
-                merge_roaring_bitmaps,
+                merge_cbo_roaring_bitmaps,
             )?;
 
             let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
@@ -166,7 +166,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 |value, _buffer| Ok(value),
-                merge_roaring_bitmaps,
+                merge_cbo_roaring_bitmaps,
             )?;
 
             let word_fid_docids_iter = unsafe { as_cloneable_grenad(&word_fid_docids_reader) }?;
diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs
index a30254994..980bab01a 100644
--- a/milli/src/update/word_prefix_docids.rs
+++ b/milli/src/update/word_prefix_docids.rs
@@ -5,15 +5,15 @@ use heed::types::{ByteSlice, Str};
 use heed::Database;
 
 use crate::update::index_documents::{
-    create_sorter, merge_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key,
+    create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key,
     CursorClonableMmap, MergeFn,
 };
-use crate::{Result, RoaringBitmapCodec};
+use crate::{CboRoaringBitmapCodec, Result, RoaringBitmapCodec};
 
 pub struct WordPrefixDocids<'t, 'u, 'i> {
     wtxn: &'t mut heed::RwTxn<'i, 'u>,
-    word_docids: Database<Str, RoaringBitmapCodec>,
-    word_prefix_docids: Database<Str, RoaringBitmapCodec>,
+    word_docids: Database<Str, CboRoaringBitmapCodec>,
+    word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
     pub(crate) chunk_compression_type: CompressionType,
     pub(crate) chunk_compression_level: Option<u32>,
     pub(crate) max_nb_chunks: Option<usize>,
@@ -23,8 +23,8 @@ pub struct WordPrefixDocids<'t, 'u, 'i> {
 impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
     pub fn new(
         wtxn: &'t mut heed::RwTxn<'i, 'u>,
-        word_docids: Database<Str, RoaringBitmapCodec>,
-        word_prefix_docids: Database<Str, RoaringBitmapCodec>,
+        word_docids: Database<Str, CboRoaringBitmapCodec>,
+        word_prefix_docids: Database<Str, CboRoaringBitmapCodec>,
     ) -> WordPrefixDocids<'t, 'u, 'i> {
         WordPrefixDocids {
             wtxn,
@@ -51,7 +51,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
         // and write into it at the same time, therefore we write into another file.
         let mut prefix_docids_sorter = create_sorter(
             grenad::SortAlgorithm::Unstable,
-            merge_roaring_bitmaps,
+            merge_cbo_roaring_bitmaps,
             self.chunk_compression_type,
             self.chunk_compression_level,
             self.max_nb_chunks,
@@ -115,7 +115,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
             self.wtxn,
             *self.word_prefix_docids.as_polymorph(),
             prefix_docids_sorter,
-            merge_roaring_bitmaps,
+            merge_cbo_roaring_bitmaps,
         )?;
 
         Ok(())

From 96be85396d4fd0199683729b27091d6069b5efcf Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Mon, 25 Sep 2023 18:55:20 +0200
Subject: [PATCH 009/127] Use a vecDeque in wpp database

---
 .../extract_word_pair_proximity_docids.rs     | 37 +++----------------
 1 file changed, 6 insertions(+), 31 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
index d54513786..fb0ea1ca8 100644
--- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
@@ -1,5 +1,5 @@
 use std::cmp::Ordering;
-use std::collections::HashMap;
+use std::collections::{HashMap, VecDeque};
 use std::fs::File;
 use std::io::BufReader;
 use std::{cmp, io};
@@ -37,7 +37,8 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
         max_memory.map(|m| m / 2),
     );
 
-    let mut word_positions: Vec<(String, u16)> = Vec::with_capacity(MAX_DISTANCE as usize);
+    let mut word_positions: VecDeque<(String, u16)> =
+        VecDeque::with_capacity(MAX_DISTANCE as usize);
     let mut word_pair_proximity = HashMap::new();
     let mut current_document_id = None;
 
@@ -80,7 +81,7 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
 
             // insert the new word.
             let word = std::str::from_utf8(word)?;
-            word_positions.push((word.to_string(), position));
+            word_positions.push_back((word.to_string(), position));
         }
     }
 
@@ -123,10 +124,10 @@ fn document_word_positions_into_sorter(
 }
 
 fn word_positions_into_word_pair_proximity(
-    word_positions: &mut Vec<(String, u16)>,
+    word_positions: &mut VecDeque<(String, u16)>,
     word_pair_proximity: &mut HashMap<(String, String), u8>,
 ) -> Result<()> {
-    let (head_word, head_position) = word_positions.remove(0);
+    let (head_word, head_position) = word_positions.pop_front().unwrap();
     for (word, position) in word_positions.iter() {
         let prox = positions_proximity(head_position as u32, *position as u32) as u8;
         word_pair_proximity
@@ -138,29 +139,3 @@ fn word_positions_into_word_pair_proximity(
     }
     Ok(())
 }
-
-struct PeekedWordPosition<I> {
-    word: String,
-    position: u32,
-    iter: I,
-}
-
-impl<I> Ord for PeekedWordPosition<I> {
-    fn cmp(&self, other: &Self) -> Ordering {
-        self.position.cmp(&other.position).reverse()
-    }
-}
-
-impl<I> PartialOrd for PeekedWordPosition<I> {
-    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
-        Some(self.cmp(other))
-    }
-}
-
-impl<I> Eq for PeekedWordPosition<I> {}
-
-impl<I> PartialEq for PeekedWordPosition<I> {
-    fn eq(&self, other: &Self) -> bool {
-        self.position == other.position
-    }
-}

From 28a8d0ccdac9a2ae3e0ed4bb91127b11df5c36b0 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Tue, 26 Sep 2023 10:08:36 +0200
Subject: [PATCH 010/127] Fix word pair proximity

---
 .../extract/extract_word_pair_proximity_docids.rs        | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
index fb0ea1ca8..847da01c5 100644
--- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
@@ -1,4 +1,3 @@
-use std::cmp::Ordering;
 use std::collections::{HashMap, VecDeque};
 use std::fs::File;
 use std::io::BufReader;
@@ -12,7 +11,7 @@ use super::helpers::{
 };
 use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
-use crate::proximity::{positions_proximity, MAX_DISTANCE};
+use crate::proximity::{index_proximity, MAX_DISTANCE};
 use crate::{DocumentId, Result};
 
 /// Extracts the best proximity between pairs of words and the documents ids where this pair appear.
@@ -71,7 +70,7 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
         for (position, word) in KvReaderU16::new(&value).iter() {
             // drain the proximity window until the head word is considered close to the word we are inserting.
             while word_positions.get(0).map_or(false, |(_w, p)| {
-                positions_proximity(*p as u32, position as u32) > MAX_DISTANCE
+                index_proximity(*p as u32, position as u32) >= MAX_DISTANCE
             }) {
                 word_positions_into_word_pair_proximity(
                     &mut word_positions,
@@ -109,6 +108,7 @@ fn document_word_positions_into_sorter(
     word_pair_proximity: &HashMap<(String, String), u8>,
     word_pair_proximity_docids_sorter: &mut grenad::Sorter<MergeFn>,
 ) -> Result<()> {
+    puffin::profile_function!();
     let mut key_buffer = Vec::new();
     for ((w1, w2), prox) in word_pair_proximity {
         key_buffer.clear();
@@ -127,9 +127,10 @@ fn word_positions_into_word_pair_proximity(
     word_positions: &mut VecDeque<(String, u16)>,
     word_pair_proximity: &mut HashMap<(String, String), u8>,
 ) -> Result<()> {
+    puffin::profile_function!();
     let (head_word, head_position) = word_positions.pop_front().unwrap();
     for (word, position) in word_positions.iter() {
-        let prox = positions_proximity(head_position as u32, *position as u32) as u8;
+        let prox = index_proximity(head_position as u32, *position as u32) as u8;
         word_pair_proximity
             .entry((head_word.clone(), word.clone()))
             .and_modify(|p| {

From 66c2c82a18614208b5b47f9597aa8a8509d1e697 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Thu, 28 Sep 2023 10:45:25 +0200
Subject: [PATCH 011/127] Split wpp in several sorters

---
 .../extract_word_pair_proximity_docids.rs     | 66 ++++++++++++-------
 1 file changed, 43 insertions(+), 23 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
index 847da01c5..70865acbe 100644
--- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
@@ -6,8 +6,8 @@ use std::{cmp, io};
 use obkv::KvReaderU16;
 
 use super::helpers::{
-    create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at,
-    GrenadParameters, MergeFn,
+    create_sorter, create_writer, merge_cbo_roaring_bitmaps, sorter_into_reader,
+    try_split_array_at, writer_into_reader, GrenadParameters, MergeFn,
 };
 use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
@@ -27,14 +27,19 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
 
     let max_memory = indexer.max_memory_by_thread();
 
-    let mut word_pair_proximity_docids_sorter = create_sorter(
-        grenad::SortAlgorithm::Unstable,
-        merge_cbo_roaring_bitmaps,
-        indexer.chunk_compression_type,
-        indexer.chunk_compression_level,
-        indexer.max_nb_chunks,
-        max_memory.map(|m| m / 2),
-    );
+    let mut word_pair_proximity_docids_sorters: Vec<_> = (1..MAX_DISTANCE)
+        .into_iter()
+        .map(|_| {
+            create_sorter(
+                grenad::SortAlgorithm::Unstable,
+                merge_cbo_roaring_bitmaps,
+                indexer.chunk_compression_type,
+                indexer.chunk_compression_level,
+                indexer.max_nb_chunks,
+                max_memory.map(|m| m / MAX_DISTANCE as usize),
+            )
+        })
+        .collect();
 
     let mut word_positions: VecDeque<(String, u16)> =
         VecDeque::with_capacity(MAX_DISTANCE as usize);
@@ -49,6 +54,7 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
 
         // if we change document, we fill the sorter
         if current_document_id.map_or(false, |id| id != document_id) {
+            puffin::profile_scope!("Document into sorter");
             while !word_positions.is_empty() {
                 word_positions_into_word_pair_proximity(
                     &mut word_positions,
@@ -59,7 +65,7 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
             document_word_positions_into_sorter(
                 current_document_id.unwrap(),
                 &word_pair_proximity,
-                &mut word_pair_proximity_docids_sorter,
+                &mut word_pair_proximity_docids_sorters,
             )?;
             word_pair_proximity.clear();
             word_positions.clear();
@@ -85,6 +91,7 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
     }
 
     if let Some(document_id) = current_document_id {
+        puffin::profile_scope!("Final document into sorter");
         while !word_positions.is_empty() {
             word_positions_into_word_pair_proximity(&mut word_positions, &mut word_pair_proximity)?;
         }
@@ -92,11 +99,23 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
         document_word_positions_into_sorter(
             document_id,
             &word_pair_proximity,
-            &mut word_pair_proximity_docids_sorter,
+            &mut word_pair_proximity_docids_sorters,
         )?;
     }
+    {
+        puffin::profile_scope!("sorter_into_reader");
+        let mut writer = create_writer(
+            indexer.chunk_compression_type,
+            indexer.chunk_compression_level,
+            tempfile::tempfile()?,
+        );
 
-    sorter_into_reader(word_pair_proximity_docids_sorter, indexer)
+        for sorter in word_pair_proximity_docids_sorters {
+            sorter.write_into_stream_writer(&mut writer)?;
+        }
+
+        writer_into_reader(writer)
+    }
 }
 
 /// Fills the list of all pairs of words with the shortest proximity between 1 and 7 inclusive.
@@ -106,9 +125,8 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
 fn document_word_positions_into_sorter(
     document_id: DocumentId,
     word_pair_proximity: &HashMap<(String, String), u8>,
-    word_pair_proximity_docids_sorter: &mut grenad::Sorter<MergeFn>,
+    word_pair_proximity_docids_sorters: &mut Vec<grenad::Sorter<MergeFn>>,
 ) -> Result<()> {
-    puffin::profile_function!();
     let mut key_buffer = Vec::new();
     for ((w1, w2), prox) in word_pair_proximity {
         key_buffer.clear();
@@ -117,7 +135,8 @@ fn document_word_positions_into_sorter(
         key_buffer.push(0);
         key_buffer.extend_from_slice(w2.as_bytes());
 
-        word_pair_proximity_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
+        word_pair_proximity_docids_sorters[*prox as usize - 1]
+            .insert(&key_buffer, document_id.to_ne_bytes())?;
     }
 
     Ok(())
@@ -127,16 +146,17 @@ fn word_positions_into_word_pair_proximity(
     word_positions: &mut VecDeque<(String, u16)>,
     word_pair_proximity: &mut HashMap<(String, String), u8>,
 ) -> Result<()> {
-    puffin::profile_function!();
     let (head_word, head_position) = word_positions.pop_front().unwrap();
     for (word, position) in word_positions.iter() {
         let prox = index_proximity(head_position as u32, *position as u32) as u8;
-        word_pair_proximity
-            .entry((head_word.clone(), word.clone()))
-            .and_modify(|p| {
-                *p = cmp::min(*p, prox);
-            })
-            .or_insert(prox);
+        if prox > 0 && prox < MAX_DISTANCE as u8 {
+            word_pair_proximity
+                .entry((head_word.clone(), word.clone()))
+                .and_modify(|p| {
+                    *p = cmp::min(*p, prox);
+                })
+                .or_insert(prox);
+        }
     }
     Ok(())
 }

From 1c5705c164bfe0771a11d611233be1e8790f0b6e Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Tue, 10 Oct 2023 11:23:16 +0200
Subject: [PATCH 012/127] clean PR warnings

---
 milli/src/search/new/db_cache.rs              |  4 +-
 milli/src/update/delete_documents.rs          |  4 +-
 .../extract/extract_docid_word_positions.rs   |  4 +-
 .../extract/extract_word_docids.rs            |  8 +--
 .../extract/extract_word_fid_docids.rs        | 53 -------------------
 .../src/update/index_documents/extract/mod.rs | 15 +-----
 .../helpers/merge_functions.rs                |  1 +
 .../src/update/index_documents/helpers/mod.rs |  1 +
 milli/src/update/index_documents/mod.rs       |  7 +--
 .../src/update/index_documents/typed_chunk.rs | 15 ------
 milli/src/update/word_prefix_docids.rs        |  2 +-
 11 files changed, 11 insertions(+), 103 deletions(-)
 delete mode 100644 milli/src/update/index_documents/extract/extract_word_fid_docids.rs

diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs
index 3f4751185..3376cebb2 100644
--- a/milli/src/search/new/db_cache.rs
+++ b/milli/src/search/new/db_cache.rs
@@ -11,9 +11,7 @@ use super::interner::Interned;
 use super::Word;
 use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec};
 use crate::update::{merge_cbo_roaring_bitmaps, MergeFn};
-use crate::{
-    CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, RoaringBitmapCodec, SearchContext,
-};
+use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext};
 
 /// A cache storing pointers to values in the LMDB databases.
 ///
diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs
index c3b2cf1a3..1fef922cd 100644
--- a/milli/src/update/delete_documents.rs
+++ b/milli/src/update/delete_documents.rs
@@ -16,9 +16,7 @@ use crate::facet::FacetType;
 use crate::heed_codec::facet::FieldDocIdFacetCodec;
 use crate::heed_codec::CboRoaringBitmapCodec;
 use crate::index::Hnsw;
-use crate::{
-    ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, RoaringBitmapCodec, BEU32,
-};
+use crate::{ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, BEU32};
 
 pub struct DeleteDocuments<'t, 'u, 'i> {
     wtxn: &'t mut heed::RwTxn<'i, 'u>,
diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
index a45d488e4..0c7c5cf46 100644
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -12,9 +12,7 @@ use serde_json::Value;
 use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters};
 use crate::error::{InternalError, SerializationError};
 use crate::update::index_documents::MergeFn;
-use crate::{
-    absolute_from_relative_position, FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH,
-};
+use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH};
 
 pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), RoaringBitmap>;
 
diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs
index d9fb72cc2..3df962585 100644
--- a/milli/src/update/index_documents/extract/extract_word_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_docids.rs
@@ -5,11 +5,10 @@ use std::iter::FromIterator;
 
 use heed::BytesDecode;
 use obkv::KvReaderU16;
-use roaring::RoaringBitmap;
 
 use super::helpers::{
-    create_sorter, create_writer, merge_cbo_roaring_bitmaps, serialize_roaring_bitmap,
-    sorter_into_reader, try_split_array_at, writer_into_reader, GrenadParameters,
+    create_sorter, create_writer, merge_cbo_roaring_bitmaps, sorter_into_reader,
+    try_split_array_at, writer_into_reader, GrenadParameters,
 };
 use crate::error::SerializationError;
 use crate::heed_codec::StrBEU16Codec;
@@ -47,7 +46,6 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
         max_memory.map(|x| x / 3),
     );
     let mut key_buffer = Vec::new();
-    let mut value_buffer = Vec::new();
     let mut words = BTreeSet::new();
     let mut cursor = docid_word_positions.into_cursor()?;
     while let Some((key, value)) = cursor.move_on_next()? {
@@ -66,7 +64,6 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
             document_id,
             fid,
             &mut key_buffer,
-            &mut value_buffer,
             &mut words,
             &mut word_fid_docids_sorter,
         )?;
@@ -124,7 +121,6 @@ fn words_into_sorter(
     document_id: DocumentId,
     fid: FieldId,
     key_buffer: &mut Vec<u8>,
-    value_buffer: &mut Vec<u8>,
     words: &mut BTreeSet<Vec<u8>>,
     word_fid_docids_sorter: &mut grenad::Sorter<MergeFn>,
 ) -> Result<()> {
diff --git a/milli/src/update/index_documents/extract/extract_word_fid_docids.rs b/milli/src/update/index_documents/extract/extract_word_fid_docids.rs
deleted file mode 100644
index dd4d42431..000000000
--- a/milli/src/update/index_documents/extract/extract_word_fid_docids.rs
+++ /dev/null
@@ -1,53 +0,0 @@
-use std::fs::File;
-use std::io::{self, BufReader};
-
-use super::helpers::{
-    create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
-    try_split_array_at, GrenadParameters,
-};
-use crate::error::SerializationError;
-use crate::index::db_name::DOCID_WORD_POSITIONS;
-use crate::{relative_from_absolute_position, DocumentId, Result};
-
-/// Extracts the word, field id, and the documents ids where this word appear at this field id.
-#[logging_timer::time]
-pub fn extract_word_fid_docids<R: io::Read + io::Seek>(
-    docid_word_positions: grenad::Reader<R>,
-    indexer: GrenadParameters,
-) -> Result<grenad::Reader<BufReader<File>>> {
-    puffin::profile_function!();
-
-    todo!("remove me");
-
-    let max_memory = indexer.max_memory_by_thread();
-
-    let mut word_fid_docids_sorter = create_sorter(
-        grenad::SortAlgorithm::Unstable,
-        merge_cbo_roaring_bitmaps,
-        indexer.chunk_compression_type,
-        indexer.chunk_compression_level,
-        indexer.max_nb_chunks,
-        max_memory,
-    );
-
-    let mut key_buffer = Vec::new();
-    let mut cursor = docid_word_positions.into_cursor()?;
-    while let Some((key, value)) = cursor.move_on_next()? {
-        let (document_id_bytes, word_bytes) = try_split_array_at(key)
-            .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
-        let document_id = DocumentId::from_be_bytes(document_id_bytes);
-
-        for position in read_u32_ne_bytes(value) {
-            key_buffer.clear();
-            key_buffer.extend_from_slice(word_bytes);
-            key_buffer.push(0);
-            let (fid, _) = relative_from_absolute_position(position);
-            key_buffer.extend_from_slice(&fid.to_be_bytes());
-            word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
-        }
-    }
-
-    let word_fid_docids_reader = sorter_into_reader(word_fid_docids_sorter, indexer)?;
-
-    Ok(word_fid_docids_reader)
-}
diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
index 32ec6fe5c..164f95452 100644
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -6,7 +6,6 @@ mod extract_fid_word_count_docids;
 mod extract_geo_points;
 mod extract_vector_points;
 mod extract_word_docids;
-mod extract_word_fid_docids;
 mod extract_word_pair_proximity_docids;
 mod extract_word_position_docids;
 
@@ -26,12 +25,11 @@ use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
 use self::extract_geo_points::extract_geo_points;
 use self::extract_vector_points::extract_vector_points;
 use self::extract_word_docids::extract_word_docids;
-use self::extract_word_fid_docids::extract_word_fid_docids;
 use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
 use self::extract_word_position_docids::extract_word_position_docids;
 use super::helpers::{
-    as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap,
-    GrenadParameters, MergeFn, MergeableReader,
+    as_cloneable_grenad, merge_cbo_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn,
+    MergeableReader,
 };
 use super::{helpers, TypedChunk};
 use crate::{FieldId, Result};
@@ -206,15 +204,6 @@ pub(crate) fn data_from_obkv_documents(
         TypedChunk::WordPositionDocids,
         "word-position-docids",
     );
-    // spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
-    //     docid_word_positions_chunks,
-    //     indexer,
-    //     lmdb_writer_sx.clone(),
-    //     extract_word_fid_docids,
-    //     merge_cbo_roaring_bitmaps,
-    //     TypedChunk::WordFidDocids,
-    //     "word-fid-docids",
-    // );
 
     spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
         docid_fid_facet_strings_chunks,
diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs
index 5d111067a..90cfa0f60 100644
--- a/milli/src/update/index_documents/helpers/merge_functions.rs
+++ b/milli/src/update/index_documents/helpers/merge_functions.rs
@@ -11,6 +11,7 @@ use crate::Result;
 
 pub type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>>;
 
+#[allow(unused)]
 pub fn concat_u32s_array<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
     if values.len() == 1 {
         Ok(values[0].clone())
diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs
index d59a3bc08..3dc9f8172 100644
--- a/milli/src/update/index_documents/helpers/mod.rs
+++ b/milli/src/update/index_documents/helpers/mod.rs
@@ -44,6 +44,7 @@ where
     Some((head, tail))
 }
 
+#[allow(unused)]
 pub fn read_u32_ne_bytes(bytes: &[u8]) -> impl Iterator<Item = u32> + '_ {
     bytes.chunks_exact(4).flat_map(TryInto::try_into).map(u32::from_ne_bytes)
 }
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 22e42937f..e4385de70 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -38,7 +38,7 @@ use crate::update::{
     self, DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep,
     WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
 };
-use crate::{CboRoaringBitmapCodec, Index, Result, RoaringBitmapCodec};
+use crate::{CboRoaringBitmapCodec, Index, Result};
 
 static MERGED_DATABASE_COUNT: usize = 7;
 static PREFIX_DATABASE_COUNT: usize = 5;
@@ -434,11 +434,6 @@ where
                     word_position_docids = Some(cloneable_chunk);
                     TypedChunk::WordPositionDocids(chunk)
                 }
-                TypedChunk::WordFidDocids(chunk) => {
-                    let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
-                    word_fid_docids = Some(cloneable_chunk);
-                    TypedChunk::WordFidDocids(chunk)
-                }
                 otherwise => otherwise,
             };
 
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index cf3194255..a94bcf581 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -35,7 +35,6 @@ pub(crate) enum TypedChunk {
         word_fid_docids_reader: grenad::Reader<BufReader<File>>,
     },
     WordPositionDocids(grenad::Reader<BufReader<File>>),
-    WordFidDocids(grenad::Reader<BufReader<File>>),
     WordPairProximityDocids(grenad::Reader<BufReader<File>>),
     FieldIdFacetStringDocids(grenad::Reader<BufReader<File>>),
     FieldIdFacetNumberDocids(grenad::Reader<BufReader<File>>),
@@ -78,9 +77,6 @@ impl TypedChunk {
             TypedChunk::WordPositionDocids(grenad) => {
                 format!("WordPositionDocids {{ number_of_entries: {} }}", grenad.len())
             }
-            TypedChunk::WordFidDocids(grenad) => {
-                format!("WordFidDocids {{ number_of_entries: {} }}", grenad.len())
-            }
             TypedChunk::WordPairProximityDocids(grenad) => {
                 format!("WordPairProximityDocids {{ number_of_entries: {} }}", grenad.len())
             }
@@ -202,17 +198,6 @@ pub(crate) fn write_typed_chunk_into_index(
             )?;
             is_merged_database = true;
         }
-        TypedChunk::WordFidDocids(word_fid_docids_iter) => {
-            append_entries_into_database(
-                word_fid_docids_iter,
-                &index.word_fid_docids,
-                wtxn,
-                index_is_empty,
-                |value, _buffer| Ok(value),
-                merge_cbo_roaring_bitmaps,
-            )?;
-            is_merged_database = true;
-        }
         TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids_iter) => {
             let indexer = FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter);
             indexer.execute(wtxn)?;
diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs
index 980bab01a..8220aa777 100644
--- a/milli/src/update/word_prefix_docids.rs
+++ b/milli/src/update/word_prefix_docids.rs
@@ -8,7 +8,7 @@ use crate::update::index_documents::{
     create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key,
     CursorClonableMmap, MergeFn,
 };
-use crate::{CboRoaringBitmapCodec, Result, RoaringBitmapCodec};
+use crate::{CboRoaringBitmapCodec, Result};
 
 pub struct WordPrefixDocids<'t, 'u, 'i> {
     wtxn: &'t mut heed::RwTxn<'i, 'u>,

From f5ef69293bcf1ab643fc7c40d8543ddd4596a225 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Tue, 10 Oct 2023 16:17:03 +0200
Subject: [PATCH 013/127] deactivate prefix dbs

---
 milli/src/update/index_documents/mod.rs | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index e4385de70..703d7ee29 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -475,13 +475,14 @@ where
         let all_documents_ids = index_documents_ids | new_documents_ids;
         self.index.put_documents_ids(self.wtxn, &all_documents_ids)?;
 
-        self.execute_prefix_databases(
-            word_docids,
-            exact_word_docids,
-            word_pair_proximity_docids,
-            word_position_docids,
-            word_fid_docids,
-        )?;
+        // TODO: reactivate prefix DB with diff-indexing
+        // self.execute_prefix_databases(
+        //     word_docids,
+        //     exact_word_docids,
+        //     word_pair_proximity_docids,
+        //     word_position_docids,
+        //     word_fid_docids,
+        // )?;
 
         Ok(all_documents_ids.len())
     }

From 1dd97578a821a6dcf6ffd4eac752fcab36c2c44b Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Thu, 12 Oct 2023 11:46:56 +0200
Subject: [PATCH 014/127] Make the transform struct return diff-based documents
 obkvs

---
 milli/src/update/del_add.rs                   |  60 +++++
 .../helpers/merge_functions.rs                | 126 ++++++---
 .../src/update/index_documents/helpers/mod.rs |   4 +-
 milli/src/update/index_documents/transform.rs | 253 +++++++++++++-----
 milli/src/update/mod.rs                       |   1 +
 5 files changed, 349 insertions(+), 95 deletions(-)
 create mode 100644 milli/src/update/del_add.rs

diff --git a/milli/src/update/del_add.rs b/milli/src/update/del_add.rs
new file mode 100644
index 000000000..e8e595837
--- /dev/null
+++ b/milli/src/update/del_add.rs
@@ -0,0 +1,60 @@
+use obkv::Key;
+
+pub type KvWriterDelAdd<W> = obkv::KvWriter<W, DelAdd>;
+pub type KvReaderDelAdd<'a> = obkv::KvReader<'a, DelAdd>;
+
+/// DelAdd defines the new value to add in the database and old value to delete from the database.
+///
+/// It's used in an OBKV to be serialized in grenad files.
+#[repr(u8)]
+#[derive(Clone, Copy, PartialOrd, PartialEq, Debug)]
+pub enum DelAdd {
+    Deletion = 0,
+    Addition = 1,
+}
+
+impl Key for DelAdd {
+    const BYTES_SIZE: usize = std::mem::size_of::<DelAdd>();
+    type BYTES = [u8; Self::BYTES_SIZE];
+
+    fn to_be_bytes(&self) -> Self::BYTES {
+        u8::to_be_bytes(*self as u8)
+    }
+
+    fn from_be_bytes(array: Self::BYTES) -> Self {
+        match u8::from_be_bytes(array) {
+            0 => Self::Deletion,
+            1 => Self::Addition,
+            otherwise => unreachable!("DelAdd has only 2 variants, unknown variant: {}", otherwise),
+        }
+    }
+}
+
+/// Creates a Kv<K, Kv<DelAdd, value>> from Kv<K, value>
+///
+/// If deletion is `true`, the value will be inserted behind a DelAdd::Deletion key.
+/// If addition is `true`, the value will be inserted behind a DelAdd::Addition key.
+/// If both deletion and addition are `true`, the value will be inserted in both keys.
+pub fn into_del_add_obkv<K: obkv::Key + PartialOrd>(
+    reader: obkv::KvReader<K>,
+    deletion: bool,
+    addition: bool,
+    buffer: &mut Vec<u8>,
+) -> Result<(), std::io::Error> {
+    let mut writer = obkv::KvWriter::new(buffer);
+    let mut value_buffer = Vec::new();
+    for (key, value) in reader.iter() {
+        value_buffer.clear();
+        let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
+        if deletion {
+            value_writer.insert(DelAdd::Deletion, value)?;
+        }
+        if addition {
+            value_writer.insert(DelAdd::Addition, value)?;
+        }
+        value_writer.finish()?;
+        writer.insert(key, &value_buffer)?;
+    }
+
+    writer.finish()
+}
diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs
index 90cfa0f60..6317b5610 100644
--- a/milli/src/update/index_documents/helpers/merge_functions.rs
+++ b/milli/src/update/index_documents/helpers/merge_functions.rs
@@ -6,6 +6,7 @@ use std::result::Result as StdResult;
 use roaring::RoaringBitmap;
 
 use crate::heed_codec::CboRoaringBitmapCodec;
+use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
 use crate::update::index_documents::transform::Operation;
 use crate::Result;
 
@@ -76,55 +77,118 @@ pub fn keep_latest_obkv<'a>(_key: &[u8], obkvs: &[Cow<'a, [u8]>]) -> Result<Cow<
     Ok(obkvs.last().unwrap().clone())
 }
 
-pub fn merge_two_obkvs(base: obkv::KvReaderU16, update: obkv::KvReaderU16, buffer: &mut Vec<u8>) {
+pub fn merge_two_del_add_obkvs(
+    base: obkv::KvReaderU16,
+    update: obkv::KvReaderU16,
+    merge_additions: bool,
+    buffer: &mut Vec<u8>,
+) {
     use itertools::merge_join_by;
     use itertools::EitherOrBoth::{Both, Left, Right};
 
     buffer.clear();
 
     let mut writer = obkv::KvWriter::new(buffer);
+    let mut value_buffer = Vec::new();
     for eob in merge_join_by(base.iter(), update.iter(), |(b, _), (u, _)| b.cmp(u)) {
         match eob {
-            Both(_, (k, v)) | Left((k, v)) | Right((k, v)) => writer.insert(k, v).unwrap(),
+            Left((k, v)) => {
+                if merge_additions {
+                    writer.insert(k, v).unwrap()
+                } else {
+                    // If merge_additions is false, recreate an obkv keeping the deletions only.
+                    value_buffer.clear();
+                    let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
+                    let base_reader = KvReaderDelAdd::new(v);
+
+                    if let Some(deletion) = base_reader.get(DelAdd::Deletion) {
+                        value_writer.insert(DelAdd::Deletion, deletion).unwrap();
+                        value_writer.finish().unwrap();
+                        writer.insert(k, &value_buffer).unwrap()
+                    }
+                }
+            }
+            Right((k, v)) => writer.insert(k, v).unwrap(),
+            Both((k, base), (_, update)) => {
+                // merge deletions and additions.
+                value_buffer.clear();
+                let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
+                let base_reader = KvReaderDelAdd::new(base);
+                let update_reader = KvReaderDelAdd::new(update);
+
+                // keep newest deletion.
+                if let Some(deletion) =
+                    update_reader.get(DelAdd::Deletion).or(base_reader.get(DelAdd::Deletion))
+                {
+                    value_writer.insert(DelAdd::Deletion, deletion).unwrap();
+                }
+
+                // keep base addition only if merge_additions is true.
+                let base_addition =
+                    merge_additions.then(|| base_reader.get(DelAdd::Addition)).flatten();
+                // keep newest addition.
+                if let Some(addition) = update_reader.get(DelAdd::Addition).or(base_addition) {
+                    value_writer.insert(DelAdd::Addition, addition).unwrap();
+                }
+
+                value_writer.finish().unwrap();
+                writer.insert(k, &value_buffer).unwrap()
+            }
         }
     }
 
     writer.finish().unwrap();
 }
 
-/// Merge all the obks in the order we see them.
-pub fn merge_obkvs_and_operations<'a>(
+/// Merge all the obkvs from the newest to the oldest.
+fn inner_merge_del_add_obkvs<'a>(
+    obkvs: &[Cow<'a, [u8]>],
+    merge_additions: bool,
+) -> Result<Cow<'a, [u8]>> {
+    // pop the newest operation from the list.
+    let (newest, obkvs) = obkvs.split_last().unwrap();
+    // keep the operation type for the returned value.
+    let newest_operation_type = newest[0];
+
+    // treat the newest obkv as the starting point of the merge.
+    let mut acc_operation_type = newest_operation_type;
+    let mut acc = newest[1..].to_vec();
+    let mut buffer = Vec::new();
+    // reverse iter from the most recent to the oldest.
+    for current in obkvs.into_iter().rev() {
+        // if in the previous iteration there was a complete deletion,
+        // stop the merge process.
+        if acc_operation_type == Operation::Deletion as u8 {
+            break;
+        }
+
+        let newest = obkv::KvReader::new(&acc);
+        let oldest = obkv::KvReader::new(&current[1..]);
+        merge_two_del_add_obkvs(oldest, newest, merge_additions, &mut buffer);
+
+        // we want the result of the merge into our accumulator.
+        std::mem::swap(&mut acc, &mut buffer);
+        acc_operation_type = current[0];
+    }
+
+    acc.insert(0, newest_operation_type);
+    Ok(Cow::from(acc))
+}
+
+/// Merge all the obkvs from the newest to the oldest.
+pub fn obkvs_merge_additions_and_deletions<'a>(
     _key: &[u8],
     obkvs: &[Cow<'a, [u8]>],
 ) -> Result<Cow<'a, [u8]>> {
-    // [add, add, delete, add, add]
-    // we can ignore everything that happened before the last delete.
-    let starting_position =
-        obkvs.iter().rposition(|obkv| obkv[0] == Operation::Deletion as u8).unwrap_or(0);
+    inner_merge_del_add_obkvs(obkvs, true)
+}
 
-    // [add, add, delete]
-    // if the last operation was a deletion then we simply return the deletion
-    if starting_position == obkvs.len() - 1 && obkvs.last().unwrap()[0] == Operation::Deletion as u8
-    {
-        return Ok(obkvs[obkvs.len() - 1].clone());
-    }
-    let mut buffer = Vec::new();
-
-    // (add, add, delete) [add, add]
-    // in the other case, no deletion will be encountered during the merge
-    let mut ret =
-        obkvs[starting_position..].iter().cloned().fold(Vec::new(), |mut acc, current| {
-            let first = obkv::KvReader::new(&acc);
-            let second = obkv::KvReader::new(&current[1..]);
-            merge_two_obkvs(first, second, &mut buffer);
-
-            // we want the result of the merge into our accumulator
-            std::mem::swap(&mut acc, &mut buffer);
-            acc
-        });
-
-    ret.insert(0, Operation::Addition as u8);
-    Ok(Cow::from(ret))
+/// Merge all the obkvs deletions from the newest to the oldest and keep only the newest additions.
+pub fn obkvs_keep_last_addition_merge_deletions<'a>(
+    _key: &[u8],
+    obkvs: &[Cow<'a, [u8]>],
+) -> Result<Cow<'a, [u8]>> {
+    inner_merge_del_add_obkvs(obkvs, false)
 }
 
 pub fn merge_cbo_roaring_bitmaps<'a>(
diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs
index 3dc9f8172..8f70a2de2 100644
--- a/milli/src/update/index_documents/helpers/mod.rs
+++ b/milli/src/update/index_documents/helpers/mod.rs
@@ -14,8 +14,8 @@ pub use grenad_helpers::{
 };
 pub use merge_functions::{
     concat_u32s_array, keep_first, keep_latest_obkv, merge_btreeset_string,
-    merge_cbo_roaring_bitmaps, merge_obkvs_and_operations, merge_roaring_bitmaps, merge_two_obkvs,
-    serialize_roaring_bitmap, MergeFn,
+    merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, obkvs_keep_last_addition_merge_deletions,
+    obkvs_merge_additions_and_deletions, serialize_roaring_bitmap, MergeFn,
 };
 
 use crate::MAX_WORD_LENGTH;
diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index f0e3bbbf0..a45a6ee3c 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -7,18 +7,20 @@ use std::io::{Read, Seek};
 use fxhash::FxHashMap;
 use heed::RoTxn;
 use itertools::Itertools;
-use obkv::{KvReader, KvWriter};
+use obkv::{KvReader, KvReaderU16, KvWriter};
 use roaring::RoaringBitmap;
 use serde_json::Value;
 use smartstring::SmartString;
 
 use super::helpers::{
-    create_sorter, create_writer, keep_latest_obkv, merge_obkvs_and_operations, MergeFn,
+    create_sorter, create_writer, obkvs_keep_last_addition_merge_deletions,
+    obkvs_merge_additions_and_deletions, MergeFn,
 };
 use super::{IndexDocumentsMethod, IndexerConfig};
 use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
 use crate::error::{Error, InternalError, UserError};
 use crate::index::{db_name, main_key};
+use crate::update::del_add::into_del_add_obkv;
 use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep};
 use crate::{
     FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, BEU32,
@@ -106,8 +108,8 @@ impl<'a, 'i> Transform<'a, 'i> {
         // We must choose the appropriate merge function for when two or more documents
         // with the same user id must be merged or fully replaced in the same batch.
         let merge_function = match index_documents_method {
-            IndexDocumentsMethod::ReplaceDocuments => keep_latest_obkv,
-            IndexDocumentsMethod::UpdateDocuments => merge_obkvs_and_operations,
+            IndexDocumentsMethod::ReplaceDocuments => obkvs_keep_last_addition_merge_deletions,
+            IndexDocumentsMethod::UpdateDocuments => obkvs_merge_additions_and_deletions,
         };
 
         // We initialize the sorter with the user indexing settings.
@@ -223,19 +225,21 @@ impl<'a, 'i> Transform<'a, 'i> {
             let docid = match self.new_external_documents_ids_builder.entry((*external_id).into()) {
                 Entry::Occupied(entry) => *entry.get() as u32,
                 Entry::Vacant(entry) => {
-                    // If the document was already in the db we mark it as a replaced document.
-                    // It'll be deleted later.
-                    if let Some(docid) = external_documents_ids.get(entry.key()) {
-                        // If it was already in the list of replaced documents it means it was deleted
-                        // by the remove_document method. We should starts as if it never existed.
-                        if self.replaced_documents_ids.insert(docid) {
-                            original_docid = Some(docid);
+                    let docid = match external_documents_ids.get(entry.key()) {
+                        Some(docid) => {
+                            // If it was already in the list of replaced documents it means it was deleted
+                            // by the remove_document method. We should start as if it never existed.
+                            if self.replaced_documents_ids.insert(docid) {
+                                original_docid = Some(docid);
+                            }
+
+                            docid
                         }
-                    }
-                    let docid = self
-                        .available_documents_ids
-                        .next()
-                        .ok_or(UserError::DocumentLimitReached)?;
+                        None => self
+                            .available_documents_ids
+                            .next()
+                            .ok_or(UserError::DocumentLimitReached)?,
+                    };
                     entry.insert(docid as u64);
                     docid
                 }
@@ -263,16 +267,28 @@ impl<'a, 'i> Transform<'a, 'i> {
                     skip_insertion = true;
                 } else {
                     // we associate the base document with the new key, everything will get merged later.
+                    let keep_original_version =
+                        self.index_documents_method == IndexDocumentsMethod::UpdateDocuments;
                     document_sorter_buffer.clear();
                     document_sorter_buffer.push(Operation::Addition as u8);
-                    document_sorter_buffer.extend_from_slice(base_obkv);
+                    into_del_add_obkv(
+                        KvReaderU16::new(base_obkv),
+                        true,
+                        keep_original_version,
+                        &mut document_sorter_buffer,
+                    )?;
                     self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
                     match self.flatten_from_fields_ids_map(KvReader::new(base_obkv))? {
                         Some(flattened_obkv) => {
                             // we recreate our buffer with the flattened documents
                             document_sorter_buffer.clear();
                             document_sorter_buffer.push(Operation::Addition as u8);
-                            document_sorter_buffer.extend_from_slice(&flattened_obkv);
+                            into_del_add_obkv(
+                                KvReaderU16::new(&flattened_obkv),
+                                true,
+                                keep_original_version,
+                                &mut document_sorter_buffer,
+                            )?;
                             self.flattened_sorter
                                 .insert(docid.to_be_bytes(), &document_sorter_buffer)?
                         }
@@ -288,7 +304,12 @@ impl<'a, 'i> Transform<'a, 'i> {
 
                 document_sorter_buffer.clear();
                 document_sorter_buffer.push(Operation::Addition as u8);
-                document_sorter_buffer.extend_from_slice(&obkv_buffer);
+                into_del_add_obkv(
+                    KvReaderU16::new(&obkv_buffer),
+                    false,
+                    true,
+                    &mut document_sorter_buffer,
+                )?;
                 // We use the extracted/generated user id as the key for this document.
                 self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
 
@@ -296,7 +317,12 @@ impl<'a, 'i> Transform<'a, 'i> {
                     Some(flattened_obkv) => {
                         document_sorter_buffer.clear();
                         document_sorter_buffer.push(Operation::Addition as u8);
-                        document_sorter_buffer.extend_from_slice(&flattened_obkv);
+                        into_del_add_obkv(
+                            KvReaderU16::new(&flattened_obkv),
+                            false,
+                            true,
+                            &mut document_sorter_buffer,
+                        )?;
                         self.flattened_sorter
                             .insert(docid.to_be_bytes(), &document_sorter_buffer)?
                     }
@@ -354,19 +380,25 @@ impl<'a, 'i> Transform<'a, 'i> {
         let external_documents_ids = self.index.external_documents_ids(wtxn)?;
 
         let mut documents_deleted = 0;
+        let mut document_sorter_buffer = Vec::new();
         for to_remove in to_remove {
             if should_abort() {
                 return Err(Error::InternalError(InternalError::AbortedIndexation));
             }
 
-            match self.new_external_documents_ids_builder.entry((*to_remove).into()) {
+            // Check if the document has been added in the current indexing process.
+            let deleted_from_current = match self
+                .new_external_documents_ids_builder
+                .entry((*to_remove).into())
+            {
                 // if the document was added in a previous iteration of the transform we make it as deleted in the sorters.
                 Entry::Occupied(entry) => {
                     let doc_id = *entry.get() as u32;
-                    self.original_sorter
-                        .insert(doc_id.to_be_bytes(), [Operation::Deletion as u8])?;
-                    self.flattened_sorter
-                        .insert(doc_id.to_be_bytes(), [Operation::Deletion as u8])?;
+                    document_sorter_buffer.clear();
+                    document_sorter_buffer.push(Operation::Deletion as u8);
+                    obkv::KvWriterU16::new(&mut document_sorter_buffer).finish().unwrap();
+                    self.original_sorter.insert(doc_id.to_be_bytes(), &document_sorter_buffer)?;
+                    self.flattened_sorter.insert(doc_id.to_be_bytes(), &document_sorter_buffer)?;
 
                     // we must NOT update the list of replaced_documents_ids
                     // Either:
@@ -375,21 +407,69 @@ impl<'a, 'i> Transform<'a, 'i> {
                     //    we're removing it there is nothing to do.
                     self.new_documents_ids.remove(doc_id);
                     entry.remove_entry();
+                    true
                 }
-                Entry::Vacant(entry) => {
-                    // If the document was already in the db we mark it as a `to_delete` document.
-                    // It'll be deleted later. We don't need to push anything to the sorters.
-                    if let Some(docid) = external_documents_ids.get(entry.key()) {
-                        self.replaced_documents_ids.insert(docid);
-                    } else {
-                        // if the document is nowehere to be found, there is nothing to do and we must NOT
-                        // increment the count of documents_deleted
-                        continue;
-                    }
-                }
+                Entry::Vacant(_) => false,
             };
 
-            documents_deleted += 1;
+            // If the document was already in the db we mark it as a `to_delete` document.
+            // Then we push the document in sorters in deletion mode.
+            let deleted_from_db = match external_documents_ids.get(&to_remove) {
+                Some(docid) => {
+                    self.replaced_documents_ids.insert(docid);
+
+                    // fetch the obkv document
+                    let original_key = BEU32::new(docid);
+                    let base_obkv = self
+                        .index
+                        .documents
+                        .remap_data_type::<heed::types::ByteSlice>()
+                        .get(wtxn, &original_key)?
+                        .ok_or(InternalError::DatabaseMissingEntry {
+                            db_name: db_name::DOCUMENTS,
+                            key: None,
+                        })?;
+
+                    // push it as to delete in the original_sorter
+                    document_sorter_buffer.clear();
+                    document_sorter_buffer.push(Operation::Deletion as u8);
+                    into_del_add_obkv(
+                        KvReaderU16::new(base_obkv),
+                        true,
+                        false,
+                        &mut document_sorter_buffer,
+                    )?;
+                    self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
+
+                    // flatten it and push it as to delete in the flattened_sorter
+                    match self.flatten_from_fields_ids_map(KvReader::new(base_obkv))? {
+                        Some(flattened_obkv) => {
+                            // we recreate our buffer with the flattened documents
+                            document_sorter_buffer.clear();
+                            document_sorter_buffer.push(Operation::Deletion as u8);
+                            into_del_add_obkv(
+                                KvReaderU16::new(&flattened_obkv),
+                                true,
+                                false,
+                                &mut document_sorter_buffer,
+                            )?;
+                            self.flattened_sorter
+                                .insert(docid.to_be_bytes(), &document_sorter_buffer)?
+                        }
+                        None => self
+                            .flattened_sorter
+                            .insert(docid.to_be_bytes(), &document_sorter_buffer)?,
+                    }
+
+                    true
+                }
+                None => false,
+            };
+
+            // increase counter only if the document existed somewhere before.
+            if deleted_from_current || deleted_from_db {
+                documents_deleted += 1;
+            }
         }
 
         Ok(documents_deleted)
@@ -589,9 +669,7 @@ impl<'a, 'i> Transform<'a, 'i> {
         let mut documents_count = 0;
 
         while let Some((key, val)) = iter.next()? {
-            if val[0] == Operation::Deletion as u8 {
-                continue;
-            }
+            // skip first byte corresponding to the operation type (Deletion or Addition).
             let val = &val[1..];
 
             // send a callback to show at which step we are
@@ -631,9 +709,7 @@ impl<'a, 'i> Transform<'a, 'i> {
         // We get rids of the `Operation` byte and skip the deleted documents as well.
         let mut iter = self.flattened_sorter.into_stream_merger_iter()?;
         while let Some((key, val)) = iter.next()? {
-            if val[0] == Operation::Deletion as u8 {
-                continue;
-            }
+            // skip first byte corresponding to the operation type (Deletion or Addition).
             let val = &val[1..];
             writer.insert(key, val)?;
         }
@@ -713,6 +789,7 @@ impl<'a, 'i> Transform<'a, 'i> {
         );
 
         let mut obkv_buffer = Vec::new();
+        let mut document_sorter_buffer = Vec::new();
         for result in self.index.all_documents(wtxn)? {
             let (docid, obkv) = result?;
 
@@ -727,7 +804,9 @@ impl<'a, 'i> Transform<'a, 'i> {
             }
 
             let buffer = obkv_writer.into_inner()?;
-            original_writer.insert(docid.to_be_bytes(), &buffer)?;
+            document_sorter_buffer.clear();
+            into_del_add_obkv(KvReaderU16::new(buffer), true, true, &mut document_sorter_buffer)?;
+            original_writer.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
 
             // Once we have the document. We're going to flatten it
             // and insert it in the flattened sorter.
@@ -762,7 +841,9 @@ impl<'a, 'i> Transform<'a, 'i> {
                 let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?;
                 writer.insert(fid, &value)?;
             }
-            flattened_writer.insert(docid.to_be_bytes(), &buffer)?;
+            document_sorter_buffer.clear();
+            into_del_add_obkv(KvReaderU16::new(&buffer), true, true, &mut document_sorter_buffer)?;
+            flattened_writer.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
         }
 
         // Once we have written all the documents, we extract
@@ -828,38 +909,86 @@ mod test {
 
     #[test]
     fn merge_obkvs() {
-        let mut doc_0 = Vec::new();
-        let mut kv_writer = KvWriter::new(&mut doc_0);
+        let mut additive_doc_0 = Vec::new();
+        let mut deletive_doc_0 = Vec::new();
+        let mut del_add_doc_0 = Vec::new();
+        let mut kv_writer = KvWriter::memory();
         kv_writer.insert(0_u8, [0]).unwrap();
-        kv_writer.finish().unwrap();
-        doc_0.insert(0, Operation::Addition as u8);
+        let buffer = kv_writer.into_inner().unwrap();
+        into_del_add_obkv(KvReaderU16::new(&buffer), false, true, &mut additive_doc_0).unwrap();
+        additive_doc_0.insert(0, Operation::Addition as u8);
+        into_del_add_obkv(KvReaderU16::new(&buffer), true, false, &mut deletive_doc_0).unwrap();
+        deletive_doc_0.insert(0, Operation::Deletion as u8);
+        into_del_add_obkv(KvReaderU16::new(&buffer), true, true, &mut del_add_doc_0).unwrap();
+        del_add_doc_0.insert(0, Operation::Addition as u8);
 
-        let ret = merge_obkvs_and_operations(&[], &[Cow::from(doc_0.as_slice())]).unwrap();
-        assert_eq!(*ret, doc_0);
+        let mut additive_doc_1 = Vec::new();
+        let mut kv_writer = KvWriter::memory();
+        kv_writer.insert(1_u8, [1]).unwrap();
+        let buffer = kv_writer.into_inner().unwrap();
+        into_del_add_obkv(KvReaderU16::new(&buffer), false, true, &mut additive_doc_1).unwrap();
+        additive_doc_1.insert(0, Operation::Addition as u8);
 
-        let ret = merge_obkvs_and_operations(
+        let mut additive_doc_0_1 = Vec::new();
+        let mut kv_writer = KvWriter::memory();
+        kv_writer.insert(0_u8, [0]).unwrap();
+        kv_writer.insert(1_u8, [1]).unwrap();
+        let buffer = kv_writer.into_inner().unwrap();
+        into_del_add_obkv(KvReaderU16::new(&buffer), false, true, &mut additive_doc_0_1).unwrap();
+        additive_doc_0_1.insert(0, Operation::Addition as u8);
+
+        let ret = obkvs_merge_additions_and_deletions(&[], &[Cow::from(additive_doc_0.as_slice())])
+            .unwrap();
+        assert_eq!(*ret, additive_doc_0);
+
+        let ret = obkvs_merge_additions_and_deletions(
             &[],
-            &[Cow::from([Operation::Deletion as u8].as_slice()), Cow::from(doc_0.as_slice())],
+            &[Cow::from(deletive_doc_0.as_slice()), Cow::from(additive_doc_0.as_slice())],
         )
         .unwrap();
-        assert_eq!(*ret, doc_0);
+        assert_eq!(*ret, del_add_doc_0);
 
-        let ret = merge_obkvs_and_operations(
+        let ret = obkvs_merge_additions_and_deletions(
             &[],
-            &[Cow::from(doc_0.as_slice()), Cow::from([Operation::Deletion as u8].as_slice())],
+            &[Cow::from(additive_doc_0.as_slice()), Cow::from(deletive_doc_0.as_slice())],
         )
         .unwrap();
-        assert_eq!(*ret, [Operation::Deletion as u8]);
+        assert_eq!(*ret, deletive_doc_0);
 
-        let ret = merge_obkvs_and_operations(
+        let ret = obkvs_merge_additions_and_deletions(
             &[],
             &[
-                Cow::from([Operation::Addition as u8, 1].as_slice()),
-                Cow::from([Operation::Deletion as u8].as_slice()),
-                Cow::from(doc_0.as_slice()),
+                Cow::from(additive_doc_1.as_slice()),
+                Cow::from(deletive_doc_0.as_slice()),
+                Cow::from(additive_doc_0.as_slice()),
             ],
         )
         .unwrap();
-        assert_eq!(*ret, doc_0);
+        assert_eq!(*ret, del_add_doc_0);
+
+        let ret = obkvs_merge_additions_and_deletions(
+            &[],
+            &[Cow::from(additive_doc_1.as_slice()), Cow::from(additive_doc_0.as_slice())],
+        )
+        .unwrap();
+        assert_eq!(*ret, additive_doc_0_1);
+
+        let ret = obkvs_keep_last_addition_merge_deletions(
+            &[],
+            &[Cow::from(additive_doc_1.as_slice()), Cow::from(additive_doc_0.as_slice())],
+        )
+        .unwrap();
+        assert_eq!(*ret, additive_doc_0);
+
+        let ret = obkvs_keep_last_addition_merge_deletions(
+            &[],
+            &[
+                Cow::from(deletive_doc_0.as_slice()),
+                Cow::from(additive_doc_1.as_slice()),
+                Cow::from(additive_doc_0.as_slice()),
+            ],
+        )
+        .unwrap();
+        assert_eq!(*ret, del_add_doc_0);
     }
 }
diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs
index 9982957e5..6224995a3 100644
--- a/milli/src/update/mod.rs
+++ b/milli/src/update/mod.rs
@@ -21,6 +21,7 @@ pub use self::words_prefixes_fst::WordsPrefixesFst;
 
 mod available_documents_ids;
 mod clear_documents;
+pub(crate) mod del_add;
 mod delete_documents;
 pub(crate) mod facet;
 mod index_documents;

From 313b16bec28835ef1e921fb967ec881e335f5192 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Mon, 16 Oct 2023 14:58:11 +0200
Subject: [PATCH 015/127] Support diff indexing on extract_docid_word_positions

---
 milli/src/update/del_add.rs                   |  40 ++
 .../extract/extract_docid_word_positions.rs   | 369 ++++++++++++------
 .../helpers/merge_functions.rs                |   6 +-
 milli/src/update/index_documents/transform.rs |   4 +-
 .../src/update/index_documents/typed_chunk.rs |  34 +-
 5 files changed, 322 insertions(+), 131 deletions(-)

diff --git a/milli/src/update/del_add.rs b/milli/src/update/del_add.rs
index e8e595837..346ae0afa 100644
--- a/milli/src/update/del_add.rs
+++ b/milli/src/update/del_add.rs
@@ -58,3 +58,43 @@ pub fn into_del_add_obkv<K: obkv::Key + PartialOrd>(
 
     writer.finish()
 }
+
+/// Creates a Kv<K, Kv<DelAdd, value>> from two Kv<K, value>
+///
+/// putting each deletion obkv's keys under a DelAdd::Deletion
+/// and putting each addition obkv's keys under a DelAdd::Addition
+pub fn del_add_from_two_obkvs<K: obkv::Key + PartialOrd + Ord>(
+    deletion: obkv::KvReader<K>,
+    addition: obkv::KvReader<K>,
+    buffer: &mut Vec<u8>,
+) -> Result<(), std::io::Error> {
+    use itertools::merge_join_by;
+    use itertools::EitherOrBoth::{Both, Left, Right};
+
+    let mut writer = obkv::KvWriter::new(buffer);
+    let mut value_buffer = Vec::new();
+
+    for eob in merge_join_by(deletion.iter(), addition.iter(), |(b, _), (u, _)| b.cmp(u)) {
+        value_buffer.clear();
+        match eob {
+            Left((k, v)) => {
+                let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
+                value_writer.insert(DelAdd::Deletion, v).unwrap();
+                writer.insert(k, value_writer.into_inner()?).unwrap();
+            }
+            Right((k, v)) => {
+                let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
+                value_writer.insert(DelAdd::Addition, v).unwrap();
+                writer.insert(k, value_writer.into_inner()?).unwrap();
+            }
+            Both((k, deletion), (_, addition)) => {
+                let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
+                value_writer.insert(DelAdd::Deletion, deletion).unwrap();
+                value_writer.insert(DelAdd::Addition, addition).unwrap();
+                writer.insert(k, value_writer.into_inner()?).unwrap();
+            }
+        }
+    }
+
+    writer.finish()
+}
diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
index 0c7c5cf46..e02e492d2 100644
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -11,7 +11,7 @@ use serde_json::Value;
 
 use super::helpers::{create_sorter, keep_latest_obkv, sorter_into_reader, GrenadParameters};
 use crate::error::{InternalError, SerializationError};
-use crate::update::index_documents::MergeFn;
+use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd};
 use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH};
 
 pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), RoaringBitmap>;
@@ -30,15 +30,21 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
     allowed_separators: Option<&[&str]>,
     dictionary: Option<&[&str]>,
     max_positions_per_attributes: Option<u32>,
-) -> Result<(RoaringBitmap, grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> {
+) -> Result<(
+    RoaringBitmap,
+    grenad::Reader<BufReader<File>>,
+    (ScriptLanguageDocidsMap, ScriptLanguageDocidsMap),
+)> {
     puffin::profile_function!();
 
     let max_positions_per_attributes = max_positions_per_attributes
         .map_or(MAX_POSITION_PER_ATTRIBUTE, |max| max.min(MAX_POSITION_PER_ATTRIBUTE));
     let max_memory = indexer.max_memory_by_thread();
 
+    // initialize destination values.
     let mut documents_ids = RoaringBitmap::new();
-    let mut script_language_docids = HashMap::new();
+    let mut del_script_language_docids = HashMap::new();
+    let mut add_script_language_docids = HashMap::new();
     let mut docid_word_positions_sorter = create_sorter(
         grenad::SortAlgorithm::Stable,
         keep_latest_obkv,
@@ -48,7 +54,142 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
         max_memory,
     );
 
-    let mut buffers = Buffers::default();
+    // initialize buffers.
+    let mut del_buffers = Buffers::default();
+    let mut add_buffers = Buffers::default();
+    let mut key_buffer = Vec::new();
+    let mut value_buffer = Vec::new();
+
+    // initialize tokenizer.
+    let mut builder = tokenizer_builder(stop_words, dictionary, allowed_separators, None);
+    let tokenizer = builder.build();
+
+    // iterate over documents.
+    let mut cursor = obkv_documents.into_cursor()?;
+    while let Some((key, value)) = cursor.move_on_next()? {
+        let document_id = key
+            .try_into()
+            .map(u32::from_be_bytes)
+            .map_err(|_| SerializationError::InvalidNumberSerialization)?;
+        let obkv = KvReader::<FieldId>::new(value);
+
+        // if the searchable fields didn't change, skip the searchable indexing for this document.
+        if !searchable_fields_changed(&KvReader::<FieldId>::new(value), searchable_fields) {
+            continue;
+        }
+
+        documents_ids.push(document_id);
+
+        // Update key buffer prefix.
+        key_buffer.clear();
+        key_buffer.extend_from_slice(&document_id.to_be_bytes());
+
+        // Tokenize deletions and additions in 2 different threads.
+        let (del, add): (Result<_>, Result<_>) = rayon::join(
+            || {
+                // deletions
+                lang_safe_tokens_from_document(
+                    &obkv,
+                    searchable_fields,
+                    &tokenizer,
+                    stop_words,
+                    allowed_separators,
+                    dictionary,
+                    max_positions_per_attributes,
+                    DelAdd::Deletion,
+                    &mut del_buffers,
+                )
+            },
+            || {
+                // additions
+                lang_safe_tokens_from_document(
+                    &obkv,
+                    searchable_fields,
+                    &tokenizer,
+                    stop_words,
+                    allowed_separators,
+                    dictionary,
+                    max_positions_per_attributes,
+                    DelAdd::Addition,
+                    &mut add_buffers,
+                )
+            },
+        );
+
+        let (del_obkv, del_script_language_word_count) = del?;
+        let (add_obkv, add_script_language_word_count) = add?;
+
+        // merge deletions and additions.
+        value_buffer.clear();
+        del_add_from_two_obkvs(
+            KvReader::<FieldId>::new(del_obkv),
+            KvReader::<FieldId>::new(add_obkv),
+            &mut value_buffer,
+        )?;
+
+        // write them into the sorter.
+        let obkv = KvReader::<FieldId>::new(value);
+        for (field_id, value) in obkv.iter() {
+            key_buffer.truncate(mem::size_of::<u32>());
+            key_buffer.extend_from_slice(&field_id.to_be_bytes());
+            docid_word_positions_sorter.insert(&key_buffer, value)?;
+        }
+
+        // update script_language_docids deletions.
+        for (script, languages_frequency) in del_script_language_word_count {
+            for (language, _) in languages_frequency {
+                let entry = del_script_language_docids
+                    .entry((script, language))
+                    .or_insert_with(RoaringBitmap::new);
+                entry.push(document_id);
+            }
+        }
+
+        // update script_language_docids additions.
+        for (script, languages_frequency) in add_script_language_word_count {
+            for (language, _) in languages_frequency {
+                let entry = add_script_language_docids
+                    .entry((script, language))
+                    .or_insert_with(RoaringBitmap::new);
+                entry.push(document_id);
+            }
+        }
+    }
+
+    let script_language_docids = (del_script_language_docids, add_script_language_docids);
+    sorter_into_reader(docid_word_positions_sorter, indexer)
+        .map(|reader| (documents_ids, reader, script_language_docids))
+}
+
+/// Check if any searchable fields of a document changed.
+fn searchable_fields_changed(
+    obkv: &KvReader<FieldId>,
+    searchable_fields: &Option<HashSet<FieldId>>,
+) -> bool {
+    for (field_id, field_bytes) in obkv.iter() {
+        if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
+            let del_add = KvReaderDelAdd::new(field_bytes);
+            match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) {
+                // if both fields are None, check the next field.
+                (None, None) => (),
+                // if both contain a value and the values are the same, check the next field.
+                (Some(del), Some(add)) if del == add => (),
+                // otherwise the fields are different, return true.
+                _otherwise => return true,
+            }
+        }
+    }
+
+    false
+}
+
+/// Factorize tokenizer building.
+fn tokenizer_builder<'a>(
+    stop_words: Option<&'a fst::Set<&[u8]>>,
+    allowed_separators: Option<&'a [&str]>,
+    dictionary: Option<&'a [&str]>,
+    script_language: Option<&'a HashMap<Script, Vec<Language>>>,
+) -> TokenizerBuilder<'a, &'a [u8]> {
     let mut tokenizer_builder = TokenizerBuilder::new();
     if let Some(stop_words) = stop_words {
         tokenizer_builder.stop_words(stop_words);
@@ -59,138 +200,144 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
     if let Some(separators) = allowed_separators {
         tokenizer_builder.separators(separators);
     }
-    let tokenizer = tokenizer_builder.build();
 
-    let mut cursor = obkv_documents.into_cursor()?;
-    while let Some((key, value)) = cursor.move_on_next()? {
-        let document_id = key
-            .try_into()
-            .map(u32::from_be_bytes)
-            .map_err(|_| SerializationError::InvalidNumberSerialization)?;
-        let obkv = KvReader::<FieldId>::new(value);
+    if let Some(script_language) = script_language {
+        tokenizer_builder.allow_list(&script_language);
+    }
 
-        documents_ids.push(document_id);
-        buffers.key_buffer.clear();
-        buffers.key_buffer.extend_from_slice(&document_id.to_be_bytes());
+    tokenizer_builder
+}
 
-        let mut script_language_word_count = HashMap::new();
+/// Extract words mapped to their positions in a document,
+/// ensuring no language detection mistake was made.
+fn lang_safe_tokens_from_document<'a>(
+    obkv: &KvReader<FieldId>,
+    searchable_fields: &Option<HashSet<FieldId>>,
+    tokenizer: &Tokenizer,
+    stop_words: Option<&fst::Set<&[u8]>>,
+    allowed_separators: Option<&[&str]>,
+    dictionary: Option<&[&str]>,
+    max_positions_per_attributes: u32,
+    del_add: DelAdd,
+    buffers: &'a mut Buffers,
+) -> Result<(&'a [u8], HashMap<Script, Vec<(Language, usize)>>)> {
+    let mut script_language_word_count = HashMap::new();
 
-        extract_tokens_from_document(
-            &obkv,
-            searchable_fields,
-            &tokenizer,
-            max_positions_per_attributes,
-            &mut buffers,
-            &mut script_language_word_count,
-            &mut docid_word_positions_sorter,
-        )?;
+    tokens_from_document(
+        &obkv,
+        searchable_fields,
+        &tokenizer,
+        max_positions_per_attributes,
+        del_add,
+        buffers,
+        &mut script_language_word_count,
+    )?;
 
-        // if we detect a potetial mistake in the language detection,
-        // we rerun the extraction forcing the tokenizer to detect the most frequently detected Languages.
-        // context: https://github.com/meilisearch/meilisearch/issues/3565
-        if script_language_word_count
-            .values()
-            .map(Vec::as_slice)
-            .any(potential_language_detection_error)
-        {
-            // build an allow list with the most frequent detected languages in the document.
-            let script_language: HashMap<_, _> =
-                script_language_word_count.iter().filter_map(most_frequent_languages).collect();
+    // if we detect a potential mistake in the language detection,
+    // we rerun the extraction forcing the tokenizer to detect the most frequently detected Languages.
+    // context: https://github.com/meilisearch/meilisearch/issues/3565
+    if script_language_word_count
+        .values()
+        .map(Vec::as_slice)
+        .any(potential_language_detection_error)
+    {
+        // build an allow list with the most frequent detected languages in the document.
+        let script_language: HashMap<_, _> =
+            script_language_word_count.iter().filter_map(most_frequent_languages).collect();
 
-            // if the allow list is empty, meaning that no Language is considered frequent,
-            // then we don't rerun the extraction.
-            if !script_language.is_empty() {
-                // build a new temporary tokenizer including the allow list.
-                let mut tokenizer_builder = TokenizerBuilder::new();
-                if let Some(stop_words) = stop_words {
-                    tokenizer_builder.stop_words(stop_words);
-                }
-                if let Some(dictionary) = dictionary {
-                    tokenizer_builder.words_dict(dictionary);
-                }
-                if let Some(separators) = allowed_separators {
-                    tokenizer_builder.separators(separators);
-                }
-                tokenizer_builder.allow_list(&script_language);
-                let tokenizer = tokenizer_builder.build();
+        // if the allow list is empty, meaning that no Language is considered frequent,
+        // then we don't rerun the extraction.
+        if !script_language.is_empty() {
+            // build a new temporary tokenizer including the allow list.
+            let mut builder = tokenizer_builder(
+                stop_words,
+                dictionary,
+                allowed_separators,
+                Some(&script_language),
+            );
+            let tokenizer = builder.build();
 
-                script_language_word_count.clear();
+            script_language_word_count.clear();
 
-                // rerun the extraction.
-                extract_tokens_from_document(
-                    &obkv,
-                    searchable_fields,
-                    &tokenizer,
-                    max_positions_per_attributes,
-                    &mut buffers,
-                    &mut script_language_word_count,
-                    &mut docid_word_positions_sorter,
-                )?;
-            }
-        }
-
-        for (script, languages_frequency) in script_language_word_count {
-            for (language, _) in languages_frequency {
-                let entry = script_language_docids
-                    .entry((script, language))
-                    .or_insert_with(RoaringBitmap::new);
-                entry.push(document_id);
-            }
+            // rerun the extraction.
+            tokens_from_document(
+                &obkv,
+                searchable_fields,
+                &tokenizer,
+                max_positions_per_attributes,
+                del_add,
+                buffers,
+                &mut script_language_word_count,
+            )?;
         }
     }
 
-    sorter_into_reader(docid_word_positions_sorter, indexer)
-        .map(|reader| (documents_ids, reader, script_language_docids))
+    Ok((&buffers.obkv_buffer, script_language_word_count))
 }
 
-fn extract_tokens_from_document(
+/// Extract words mapped to their positions in a document.
+fn tokens_from_document<'a>(
     obkv: &KvReader<FieldId>,
     searchable_fields: &Option<HashSet<FieldId>>,
     tokenizer: &Tokenizer,
     max_positions_per_attributes: u32,
-    buffers: &mut Buffers,
+    del_add: DelAdd,
+    buffers: &'a mut Buffers,
     script_language_word_count: &mut HashMap<Script, Vec<(Language, usize)>>,
-    docid_word_positions_sorter: &mut grenad::Sorter<MergeFn>,
-) -> Result<()> {
+) -> Result<&'a [u8]> {
+    buffers.obkv_buffer.clear();
+    let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer);
     for (field_id, field_bytes) in obkv.iter() {
+        // if field is searchable.
         if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
-            let value = serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
-            buffers.field_buffer.clear();
-            if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) {
-                let tokens = process_tokens(tokenizer.tokenize(field))
-                    .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
+            // extract deletion or addition only.
+            if let Some(field_bytes) = KvReaderDelAdd::new(field_bytes).get(del_add) {
+                // parse json.
+                let value =
+                    serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
 
-                buffers.obkv_buffer.clear();
-                let mut writer = KvWriterU16::new(&mut buffers.obkv_buffer);
-                for (index, token) in tokens {
-                    // if a language has been detected for the token, we update the counter.
-                    if let Some(language) = token.language {
-                        let script = token.script;
-                        let entry =
-                            script_language_word_count.entry(script).or_insert_with(Vec::new);
-                        match entry.iter_mut().find(|(l, _)| *l == language) {
-                            Some((_, n)) => *n += 1,
-                            None => entry.push((language, 1)),
+                // prepare the writing destination.
+                buffers.obkv_positions_buffer.clear();
+                let mut writer = KvWriterU16::new(&mut buffers.obkv_positions_buffer);
+
+                // convert json into a unique string.
+                buffers.field_buffer.clear();
+                if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) {
+                    // create an iterator of token with their positions.
+                    let tokens = process_tokens(tokenizer.tokenize(field))
+                        .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
+
+                    for (index, token) in tokens {
+                        // if a language has been detected for the token, we update the counter.
+                        if let Some(language) = token.language {
+                            let script = token.script;
+                            let entry =
+                                script_language_word_count.entry(script).or_insert_with(Vec::new);
+                            match entry.iter_mut().find(|(l, _)| *l == language) {
+                                Some((_, n)) => *n += 1,
+                                None => entry.push((language, 1)),
+                            }
+                        }
+
+                        // keep a word only if it is not empty and fits in an LMDB key.
+                        let token = token.lemma().trim();
+                        if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
+                            let position: u16 = index
+                                .try_into()
+                                .map_err(|_| SerializationError::InvalidNumberSerialization)?;
+                            writer.insert(position, token.as_bytes())?;
                         }
                     }
-                    let token = token.lemma().trim();
-                    if !token.is_empty() && token.len() <= MAX_WORD_LENGTH {
-                        let position: u16 = index
-                            .try_into()
-                            .map_err(|_| SerializationError::InvalidNumberSerialization)?;
-                        writer.insert(position, token.as_bytes())?;
-                    }
-                }
 
-                let positions = writer.into_inner()?;
-                buffers.key_buffer.truncate(mem::size_of::<u32>());
-                buffers.key_buffer.extend_from_slice(&field_id.to_be_bytes());
-                docid_word_positions_sorter.insert(&buffers.key_buffer, positions)?;
+                    // write positions into document.
+                    let positions = writer.into_inner()?;
+                    document_writer.insert(field_id, positions)?;
+                }
             }
         }
     }
 
-    Ok(())
+    Ok(document_writer.into_inner().map(|v| v.as_slice())?)
 }
 
 /// Transform a JSON value into a string that can be indexed.
@@ -293,12 +440,10 @@ fn compute_language_frequency_threshold(languages_frequency: &[(Language, usize)
 
 #[derive(Default)]
 struct Buffers {
-    // the key buffer is the concatenation of the internal document id with the field id.
-    // The buffer has to be completelly cleared between documents,
-    // and the field id part must be cleared between each field.
-    key_buffer: Vec<u8>,
     // the field buffer for each fields desserialization, and must be cleared between each field.
     field_buffer: String,
     // buffer used to store the value data containing an obkv.
     obkv_buffer: Vec<u8>,
+    // buffer used to store the value data containing an obkv of tokens with their positions.
+    obkv_positions_buffer: Vec<u8>,
 }
diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs
index 6317b5610..dee200b21 100644
--- a/milli/src/update/index_documents/helpers/merge_functions.rs
+++ b/milli/src/update/index_documents/helpers/merge_functions.rs
@@ -117,8 +117,9 @@ pub fn merge_two_del_add_obkvs(
                 let update_reader = KvReaderDelAdd::new(update);
 
                 // keep newest deletion.
-                if let Some(deletion) =
-                    update_reader.get(DelAdd::Deletion).or(base_reader.get(DelAdd::Deletion))
+                if let Some(deletion) = update_reader
+                    .get(DelAdd::Deletion)
+                    .or_else(|| base_reader.get(DelAdd::Deletion))
                 {
                     value_writer.insert(DelAdd::Deletion, deletion).unwrap();
                 }
@@ -127,6 +128,7 @@ pub fn merge_two_del_add_obkvs(
                 let base_addition =
                     merge_additions.then(|| base_reader.get(DelAdd::Addition)).flatten();
                 // keep newest addition.
+                // TODO use or_else
                 if let Some(addition) = update_reader.get(DelAdd::Addition).or(base_addition) {
                     value_writer.insert(DelAdd::Addition, addition).unwrap();
                 }
diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index a45a6ee3c..2b77768cb 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -805,7 +805,7 @@ impl<'a, 'i> Transform<'a, 'i> {
 
             let buffer = obkv_writer.into_inner()?;
             document_sorter_buffer.clear();
-            into_del_add_obkv(KvReaderU16::new(buffer), true, true, &mut document_sorter_buffer)?;
+            into_del_add_obkv(KvReaderU16::new(buffer), false, true, &mut document_sorter_buffer)?;
             original_writer.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
 
             // Once we have the document. We're going to flatten it
@@ -842,7 +842,7 @@ impl<'a, 'i> Transform<'a, 'i> {
                 writer.insert(fid, &value)?;
             }
             document_sorter_buffer.clear();
-            into_del_add_obkv(KvReaderU16::new(&buffer), true, true, &mut document_sorter_buffer)?;
+            into_del_add_obkv(KvReaderU16::new(&buffer), false, true, &mut document_sorter_buffer)?;
             flattened_writer.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
         }
 
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index a94bcf581..f2dc7d336 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -43,7 +43,9 @@ pub(crate) enum TypedChunk {
     FieldIdFacetIsEmptyDocids(grenad::Reader<BufReader<File>>),
     GeoPoints(grenad::Reader<BufReader<File>>),
     VectorPoints(grenad::Reader<BufReader<File>>),
-    ScriptLanguageDocids(HashMap<(Script, Language), RoaringBitmap>),
+    ScriptLanguageDocids(
+        (HashMap<(Script, Language), RoaringBitmap>, HashMap<(Script, Language), RoaringBitmap>),
+    ),
 }
 
 impl TypedChunk {
@@ -101,8 +103,8 @@ impl TypedChunk {
             TypedChunk::VectorPoints(grenad) => {
                 format!("VectorPoints {{ number_of_entries: {} }}", grenad.len())
             }
-            TypedChunk::ScriptLanguageDocids(grenad) => {
-                format!("ScriptLanguageDocids {{ number_of_entries: {} }}", grenad.len())
+            TypedChunk::ScriptLanguageDocids((_, addition)) => {
+                format!("ScriptLanguageDocids {{ number_of_entries: {} }}", addition.len())
             }
         }
     }
@@ -344,19 +346,21 @@ pub(crate) fn write_typed_chunk_into_index(
             log::debug!("There are {} entries in the HNSW so far", hnsw_length);
             index.put_vector_hnsw(wtxn, &new_hnsw)?;
         }
-        TypedChunk::ScriptLanguageDocids(hash_pair) => {
-            let mut buffer = Vec::new();
-            for (key, value) in hash_pair {
-                buffer.clear();
-                let final_value = match index.script_language_docids.get(wtxn, &key)? {
-                    Some(db_values) => {
-                        let mut db_value_buffer = Vec::new();
-                        serialize_roaring_bitmap(&db_values, &mut db_value_buffer)?;
-                        let mut new_value_buffer = Vec::new();
-                        serialize_roaring_bitmap(&value, &mut new_value_buffer)?;
-                        merge_roaring_bitmaps(&new_value_buffer, &db_value_buffer, &mut buffer)?;
-                        RoaringBitmap::deserialize_from(&buffer[..])?
+        TypedChunk::ScriptLanguageDocids((deletion, addition)) => {
+            for (key, value) in deletion {
+                if let Some(mut db_values) = index.script_language_docids.get(wtxn, &key)? {
+                    db_values -= value;
+                    if db_values.is_empty() {
+                        index.script_language_docids.delete(wtxn, &key)?;
+                    } else {
+                        index.script_language_docids.put(wtxn, &key, &db_values)?;
                     }
+                }
+            }
+
+            for (key, value) in addition {
+                let final_value = match index.script_language_docids.get(wtxn, &key)? {
+                    Some(mut db_values) => db_values | value,
                     None => value,
                 };
                 index.script_language_docids.put(wtxn, &key, &final_value)?;

From 0c47defeee739ec7b2528c4993d8e11bad08d34b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Tue, 17 Oct 2023 18:09:41 +0200
Subject: [PATCH 016/127] Work on fid docid facet values rewrite

---
 milli/src/update/facet/bulk.rs                |   2 +
 milli/src/update/facet/mod.rs                 |   1 +
 .../extract/extract_facet_number_docids.rs    |   4 +
 .../extract/extract_facet_string_docids.rs    |   4 +
 .../extract/extract_fid_docid_facet_values.rs | 276 +++++++++++++++---
 5 files changed, 249 insertions(+), 38 deletions(-)

diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs
index a3f0c8f71..a2b1c9dcd 100644
--- a/milli/src/update/facet/bulk.rs
+++ b/milli/src/update/facet/bulk.rs
@@ -133,6 +133,8 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
         self.db.delete_range(wtxn, &range).map(drop)?;
         Ok(())
     }
+
+    // TODO the new_data is a Reader<Obkv<Key, Obkv<DelAdd, RoaringBitmap>>>
     fn update_level0(&mut self, wtxn: &mut RwTxn) -> Result<()> {
         let new_data = match self.new_data.take() {
             Some(x) => x,
diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs
index bbd25f91e..decb6a9ac 100644
--- a/milli/src/update/facet/mod.rs
+++ b/milli/src/update/facet/mod.rs
@@ -115,6 +115,7 @@ pub struct FacetsUpdate<'i> {
     min_level_size: u8,
 }
 impl<'i> FacetsUpdate<'i> {
+    // TODO grenad::Reader<Key, Obkv<DelAdd, RoaringBitmap>>
     pub fn new(
         index: &'i Index,
         facet_type: FacetType,
diff --git a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs
index d557e0b6c..76dc6d3c6 100644
--- a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs
@@ -17,6 +17,7 @@ use crate::Result;
 /// documents ids from the given chunk of docid facet number positions.
 #[logging_timer::time]
 pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
+    // TODO Reader<Key, Obkv<DelAdd, ()>>
     docid_fid_facet_number: grenad::Reader<R>,
     indexer: GrenadParameters,
 ) -> Result<grenad::Reader<BufReader<File>>> {
@@ -26,6 +27,7 @@ pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
 
     let mut facet_number_docids_sorter = create_sorter(
         grenad::SortAlgorithm::Unstable,
+        // TODO We must modify the merger to do unions of Del and Add separately
         merge_cbo_roaring_bitmaps,
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
@@ -34,12 +36,14 @@ pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
     );
 
     let mut cursor = docid_fid_facet_number.into_cursor()?;
+    // TODO the value is a Obkv<DelAdd, ()> and must be taken into account
     while let Some((key_bytes, _)) = cursor.move_on_next()? {
         let (field_id, document_id, number) =
             FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap();
 
         let key = FacetGroupKey { field_id, level: 0, left_bound: number };
         let key_bytes = FacetGroupKeyCodec::<OrderedF64Codec>::bytes_encode(&key).unwrap();
+        // TODO We must put a Obkv<DelAdd, RoaringBitmap>
         facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?;
     }
 
diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
index b1b27449e..b861c04e4 100644
--- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
@@ -15,6 +15,7 @@ use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
 /// documents ids from the given chunk of docid facet string positions.
 #[logging_timer::time]
 pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
+    // TODO Reader<Key, Obkv<DelAdd, OriginalString>>
     docid_fid_facet_string: grenad::Reader<R>,
     indexer: GrenadParameters,
 ) -> Result<grenad::Reader<BufReader<File>>> {
@@ -24,6 +25,7 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
 
     let mut facet_string_docids_sorter = create_sorter(
         grenad::SortAlgorithm::Stable,
+        // TODO We must modify the merger to do unions of Del and Add separately
         merge_cbo_roaring_bitmaps,
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
@@ -33,6 +35,7 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
 
     let mut cursor = docid_fid_facet_string.into_cursor()?;
     while let Some((key, _original_value_bytes)) = cursor.move_on_next()? {
+        // TODO the value is a Obkv<DelAdd, OriginalString> and must be taken into account
         let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
         let field_id = FieldId::from_be_bytes(field_id_bytes);
 
@@ -54,6 +57,7 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
         let key = FacetGroupKey { field_id, level: 0, left_bound: normalised_value };
         let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
         // document id is encoded in native-endian because of the CBO roaring bitmap codec
+        // TODO Reader<KeyBytes, Obkv<DelAdd, RoaringBitmap>>
         facet_string_docids_sorter.insert(&key_bytes, document_id.to_ne_bytes())?;
     }
 
diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
index 42c355323..0340fb709 100644
--- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
+++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
@@ -6,17 +6,21 @@ use std::mem::size_of;
 
 use heed::zerocopy::AsBytes;
 use heed::BytesEncode;
+use itertools::EitherOrBoth;
+use ordered_float::OrderedFloat;
 use roaring::RoaringBitmap;
 use serde_json::{from_slice, Value};
 
 use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters};
 use crate::error::InternalError;
 use crate::facet::value_encoding::f64_into_bytes;
+use crate::update::del_add::{DelAdd, KvWriterDelAdd};
 use crate::update::index_documents::{create_writer, writer_into_reader};
 use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH};
 
 /// The extracted facet values stored in grenad files by type.
 pub struct ExtractedFacetValues {
+    // TODO rename into `fid_docid_*`
     pub docid_fid_facet_numbers_chunk: grenad::Reader<BufReader<File>>,
     pub docid_fid_facet_strings_chunk: grenad::Reader<BufReader<File>>,
     pub fid_facet_is_null_docids_chunk: grenad::Reader<BufReader<File>>,
@@ -31,6 +35,7 @@ pub struct ExtractedFacetValues {
 /// We need the fid of the geofields to correctly parse them as numbers if they were sent as strings initially.
 #[logging_timer::time]
 pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
+    // TODO Reader<Obkv<FieldId, Obkv<DelAdd, serde_json::Value>>>
     obkv_documents: grenad::Reader<R>,
     indexer: GrenadParameters,
     faceted_fields: &HashSet<FieldId>,
@@ -58,13 +63,15 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
         max_memory.map(|m| m / 2),
     );
 
-    let mut facet_exists_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
-    let mut facet_is_null_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
-    let mut facet_is_empty_docids = BTreeMap::<FieldId, RoaringBitmap>::new();
+    // Each tuple represents the Del and Add sides of a bitmap
+    let mut facet_exists_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
+    let mut facet_is_null_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
+    let mut facet_is_empty_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
 
     let mut key_buffer = Vec::new();
     let mut cursor = obkv_documents.into_cursor()?;
     while let Some((docid_bytes, value)) = cursor.move_on_next()? {
+        // TODO Obkv<FieldId, Obkv<DelAdd, serde_json::Value>>
         let obkv = obkv::KvReader::new(value);
 
         for (field_id, field_bytes) in obkv.iter() {
@@ -79,50 +86,233 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
                 let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap();
                 let document = BEU32::from(document).get();
 
-                facet_exists_docids.entry(field_id).or_default().insert(document);
-
                 // For the other extraction tasks, prefix the key with the field_id and the document_id
                 key_buffer.extend_from_slice(docid_bytes);
 
-                let value = from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
+                let del_add_obkv = obkv::KvReader::new(field_bytes);
+                let del_value = match del_add_obkv.get(DelAdd::Deletion) {
+                    Some(bytes) => from_slice(bytes).map_err(InternalError::SerdeJson)?,
+                    None => None,
+                };
+                let add_value = match del_add_obkv.get(DelAdd::Addition) {
+                    Some(bytes) => from_slice(bytes).map_err(InternalError::SerdeJson)?,
+                    None => None,
+                };
 
-                match extract_facet_values(
-                    &value,
-                    geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng),
-                ) {
-                    FilterableValues::Null => {
-                        facet_is_null_docids.entry(field_id).or_default().insert(document);
-                    }
-                    FilterableValues::Empty => {
-                        facet_is_empty_docids.entry(field_id).or_default().insert(document);
-                    }
-                    FilterableValues::Values { numbers, strings } => {
-                        // insert facet numbers in sorter
-                        for number in numbers {
-                            key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
-                            if let Some(value_bytes) = f64_into_bytes(number) {
-                                key_buffer.extend_from_slice(&value_bytes);
-                                key_buffer.extend_from_slice(&number.to_be_bytes());
+                // We insert the document id on the Del and the Add side if the field exists.
+                let (mut del_exists, mut add_exists) =
+                    facet_exists_docids.entry(field_id).or_default();
+                if del_value.is_some() {
+                    del_exists.insert(document);
+                }
+                if add_value.is_some() {
+                    add_exists.insert(document);
+                }
 
-                                fid_docid_facet_numbers_sorter
-                                    .insert(&key_buffer, ().as_bytes())?;
+                // TODO extract both Del and Add numbers an strings (dedup)
+                // TODO use the `itertools::merge_join_by` method to sort and diff both sides (Del and Add)
+                // TODO if there is a Left generate a Del
+                // TODO if there is a Right generate an Add
+                // TODO if there is a Both don't insert
+                // TODO compare numbers using OrderedFloat and strings using both normalized and original values.
+
+                let geo_support =
+                    geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
+
+                let del_filterable_values =
+                    del_value.map(|value| extract_facet_values(&value, geo_support));
+                let add_filterable_values =
+                    add_value.map(|value| extract_facet_values(&value, geo_support));
+
+                use FilterableValues::{Empty, Null, Values};
+
+                match (del_filterable_values, add_filterable_values) {
+                    (None, None) => (),
+                    (Some(del_filterable_values), None) => match del_filterable_values {
+                        Null => {
+                            let (mut del_is_null, _) =
+                                facet_is_null_docids.entry(field_id).or_default();
+                            del_is_null.insert(document);
+                        }
+                        Empty => {
+                            let (mut del_is_empty, _) =
+                                facet_is_empty_docids.entry(field_id).or_default();
+                            del_is_empty.insert(document);
+                        }
+                        Values { numbers, strings } => {
+                            // insert facet numbers in sorter
+                            for number in numbers {
+                                key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
+                                if let Some(value_bytes) = f64_into_bytes(number) {
+                                    key_buffer.extend_from_slice(&value_bytes);
+                                    key_buffer.extend_from_slice(&number.to_be_bytes());
+
+                                    // We insert only the Del part of the Obkv to inform
+                                    // that we only want to remove all those numbers.
+                                    let mut obkv = KvWriterDelAdd::memory();
+                                    obkv.insert(DelAdd::Deletion, ().as_bytes())?;
+                                    let bytes = obkv.into_inner()?;
+                                    fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?;
+                                }
+                            }
+
+                            // insert normalized and original facet string in sorter
+                            for (normalized, original) in
+                                strings.into_iter().filter(|(n, _)| !n.is_empty())
+                            {
+                                let normalized_truncated_value: String = normalized
+                                    .char_indices()
+                                    .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
+                                    .map(|(_, c)| c)
+                                    .collect();
+
+                                key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
+                                key_buffer.extend_from_slice(normalized_truncated_value.as_bytes());
+
+                                // We insert only the Del part of the Obkv to inform
+                                // that we only want to remove all those strings.
+                                let mut obkv = KvWriterDelAdd::memory();
+                                obkv.insert(DelAdd::Deletion, original.as_bytes())?;
+                                let bytes = obkv.into_inner()?;
+                                fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
                             }
                         }
+                    },
+                    (None, Some(add_filterable_values)) => {
+                        todo!()
+                    }
+                    (Some(del_filterable_values), Some(add_filterable_values)) => {
+                        let (mut del_is_null, mut add_is_null) =
+                            facet_is_null_docids.entry(field_id).or_default();
+                        let (mut del_is_empty, mut add_is_empty) =
+                            facet_is_empty_docids.entry(field_id).or_default();
 
-                        // insert normalized and original facet string in sorter
-                        for (normalized, original) in
-                            strings.into_iter().filter(|(n, _)| !n.is_empty())
-                        {
-                            let normalized_truncated_value: String = normalized
-                                .char_indices()
-                                .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
-                                .map(|(_, c)| c)
-                                .collect();
+                        match (del_filterable_values, add_filterable_values) {
+                            (Null, Null) | (Empty, Empty) => (),
+                            (Null, Empty) => {
+                                del_is_null.insert(document);
+                                add_is_empty.insert(document);
+                            }
+                            (Empty, Null) => {
+                                del_is_empty.insert(document);
+                                add_is_null.insert(document);
+                            }
+                            (Null, Values { numbers, strings }) => {
+                                del_is_null.insert(document);
+                                todo!()
+                            }
+                            (Empty, Values { numbers, strings }) => {
+                                del_is_empty.insert(document);
+                                todo!()
+                            }
+                            (Values { numbers, strings }, Null) => {
+                                todo!();
+                                add_is_null.insert(document);
+                            }
+                            (Values { numbers, strings }, Empty) => {
+                                todo!();
+                                add_is_empty.insert(document);
+                            }
+                            (
+                                Values { numbers: mut del_numbers, strings: mut del_strings },
+                                Values { numbers: mut add_numbers, strings: mut add_strings },
+                            ) => {
+                                // We sort and dedup the float numbers
+                                del_numbers.sort_unstable_by_key(|f| OrderedFloat(*f));
+                                add_numbers.sort_unstable_by_key(|f| OrderedFloat(*f));
+                                del_numbers.dedup_by_key(|f| OrderedFloat(*f));
+                                add_numbers.dedup_by_key(|f| OrderedFloat(*f));
 
-                            key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
-                            key_buffer.extend_from_slice(normalized_truncated_value.as_bytes());
-                            fid_docid_facet_strings_sorter
-                                .insert(&key_buffer, original.as_bytes())?;
+                                let merged_numbers_iter = itertools::merge_join_by(
+                                    del_numbers.into_iter().map(OrderedFloat),
+                                    add_numbers.into_iter().map(OrderedFloat),
+                                    |del, add| del.cmp(&add),
+                                );
+
+                                // insert facet numbers in sorter
+                                for eob in merged_numbers_iter {
+                                    key_buffer
+                                        .truncate(size_of::<FieldId>() + size_of::<DocumentId>());
+                                    match eob {
+                                        EitherOrBoth::Both(_, _) => (), // no need to touch anything
+                                        EitherOrBoth::Left(OrderedFloat(number)) => {
+                                            if let Some(value_bytes) = f64_into_bytes(number) {
+                                                key_buffer.extend_from_slice(&value_bytes);
+                                                key_buffer.extend_from_slice(&number.to_be_bytes());
+
+                                                // We insert only the Del part of the Obkv to inform
+                                                // that we only want to remove all those numbers.
+                                                let mut obkv = KvWriterDelAdd::memory();
+                                                obkv.insert(DelAdd::Deletion, ().as_bytes())?;
+                                                let bytes = obkv.into_inner()?;
+                                                fid_docid_facet_numbers_sorter
+                                                    .insert(&key_buffer, bytes)?;
+                                            }
+                                        }
+                                        EitherOrBoth::Right(OrderedFloat(number)) => {
+                                            if let Some(value_bytes) = f64_into_bytes(number) {
+                                                key_buffer.extend_from_slice(&value_bytes);
+                                                key_buffer.extend_from_slice(&number.to_be_bytes());
+
+                                                // We insert only the Del part of the Obkv to inform
+                                                // that we only want to remove all those numbers.
+                                                let mut obkv = KvWriterDelAdd::memory();
+                                                obkv.insert(DelAdd::Addition, ().as_bytes())?;
+                                                let bytes = obkv.into_inner()?;
+                                                fid_docid_facet_numbers_sorter
+                                                    .insert(&key_buffer, bytes)?;
+                                            }
+                                        }
+                                    }
+                                }
+
+                                // We sort and dedup the normalized and original strings
+                                del_strings.sort_unstable();
+                                add_strings.sort_unstable();
+                                del_strings.dedup();
+                                add_strings.dedup();
+
+                                let merged_strings_iter = itertools::merge_join_by(
+                                    del_strings.into_iter().filter(|(n, _)| !n.is_empty()),
+                                    add_strings.into_iter().filter(|(n, _)| !n.is_empty()),
+                                    |del, add| del.cmp(&add),
+                                );
+
+                                // insert normalized and original facet string in sorter
+                                for eob in merged_strings_iter {
+                                    match eob {
+                                        EitherOrBoth::Both(_, _) => (), // no need to touch anything
+                                        EitherOrBoth::Left((normalized, original)) => {
+                                            let truncated = truncate_string(normalized);
+
+                                            key_buffer.truncate(
+                                                size_of::<FieldId>() + size_of::<DocumentId>(),
+                                            );
+                                            key_buffer.extend_from_slice(truncated.as_bytes());
+
+                                            let mut obkv = KvWriterDelAdd::memory();
+                                            obkv.insert(DelAdd::Deletion, original)?;
+                                            let bytes = obkv.into_inner()?;
+                                            fid_docid_facet_strings_sorter
+                                                .insert(&key_buffer, bytes)?;
+                                        }
+                                        EitherOrBoth::Right((normalized, original)) => {
+                                            let truncated = truncate_string(normalized);
+
+                                            key_buffer.truncate(
+                                                size_of::<FieldId>() + size_of::<DocumentId>(),
+                                            );
+                                            key_buffer.extend_from_slice(truncated.as_bytes());
+
+                                            let mut obkv = KvWriterDelAdd::memory();
+                                            obkv.insert(DelAdd::Addition, original)?;
+                                            let bytes = obkv.into_inner()?;
+                                            fid_docid_facet_strings_sorter
+                                                .insert(&key_buffer, bytes)?;
+                                        }
+                                    }
+                                }
+                            }
                         }
                     }
                 }
@@ -135,6 +325,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
         indexer.chunk_compression_level,
         tempfile::tempfile()?,
     );
+    // TODO generate an Obkv<DelAdd, Bitmap>
     for (fid, bitmap) in facet_exists_docids.into_iter() {
         let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
         facet_exists_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
@@ -146,12 +337,14 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
         indexer.chunk_compression_level,
         tempfile::tempfile()?,
     );
+    // TODO generate an Obkv<DelAdd, Bitmap>
     for (fid, bitmap) in facet_is_null_docids.into_iter() {
         let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
         facet_is_null_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
     }
     let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?;
 
+    // TODO generate an Obkv<DelAdd, Bitmap>
     let mut facet_is_empty_docids_writer = create_writer(
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
@@ -243,3 +436,10 @@ fn extract_facet_values(value: &Value, geo_field: bool) -> FilterableValues {
         }
     }
 }
+
+fn truncate_string(mut s: String) -> String {
+    s.char_indices()
+        .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
+        .map(|(_, c)| c)
+        .collect()
+}

From 6ae4100f0720ec1973ed73ad53719f3e74c88aac Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Tue, 17 Oct 2023 18:15:14 +0200
Subject: [PATCH 017/127] Generate the DelAdd for is_null, is_empty, and exists

---
 .../extract/extract_fid_docid_facet_values.rs | 36 ++++++++++++-------
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
index 0340fb709..e8d70bf0d 100644
--- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
+++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
@@ -325,10 +325,14 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
         indexer.chunk_compression_level,
         tempfile::tempfile()?,
     );
-    // TODO generate an Obkv<DelAdd, Bitmap>
-    for (fid, bitmap) in facet_exists_docids.into_iter() {
-        let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
-        facet_exists_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
+    for (fid, (del_bitmap, add_bitmap)) in facet_exists_docids.into_iter() {
+        let mut obkv = KvWriterDelAdd::memory();
+        let del_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&del_bitmap).unwrap();
+        let add_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&add_bitmap).unwrap();
+        obkv.insert(DelAdd::Deletion, del_bitmap_bytes)?;
+        obkv.insert(DelAdd::Addition, add_bitmap_bytes)?;
+        let bytes = obkv.into_inner()?;
+        facet_exists_docids_writer.insert(fid.to_be_bytes(), &bytes)?;
     }
     let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?;
 
@@ -337,22 +341,30 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
         indexer.chunk_compression_level,
         tempfile::tempfile()?,
     );
-    // TODO generate an Obkv<DelAdd, Bitmap>
-    for (fid, bitmap) in facet_is_null_docids.into_iter() {
-        let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
-        facet_is_null_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
+    for (fid, (del_bitmap, add_bitmap)) in facet_is_null_docids.into_iter() {
+        let mut obkv = KvWriterDelAdd::memory();
+        let del_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&del_bitmap).unwrap();
+        let add_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&add_bitmap).unwrap();
+        obkv.insert(DelAdd::Deletion, del_bitmap_bytes)?;
+        obkv.insert(DelAdd::Addition, add_bitmap_bytes)?;
+        let bytes = obkv.into_inner()?;
+        facet_is_null_docids_writer.insert(fid.to_be_bytes(), &bytes)?;
     }
     let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?;
 
-    // TODO generate an Obkv<DelAdd, Bitmap>
     let mut facet_is_empty_docids_writer = create_writer(
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
         tempfile::tempfile()?,
     );
-    for (fid, bitmap) in facet_is_empty_docids.into_iter() {
-        let bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
-        facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &bitmap_bytes)?;
+    for (fid, (del_bitmap, add_bitmap)) in facet_is_empty_docids.into_iter() {
+        let mut obkv = KvWriterDelAdd::memory();
+        let del_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&del_bitmap).unwrap();
+        let add_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&add_bitmap).unwrap();
+        obkv.insert(DelAdd::Deletion, del_bitmap_bytes)?;
+        obkv.insert(DelAdd::Addition, add_bitmap_bytes)?;
+        let bytes = obkv.into_inner()?;
+        facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &bytes)?;
     }
     let facet_is_empty_docids_reader = writer_into_reader(facet_is_empty_docids_writer)?;
 

From bc45c1206d01654f6272ffac5cdbfa76aeaa7930 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Wed, 18 Oct 2023 11:01:02 +0200
Subject: [PATCH 018/127] Implement all the facet extraction paths and simplify
 them

---
 .../extract/extract_fid_docid_facet_values.rs | 404 +++++++++---------
 1 file changed, 212 insertions(+), 192 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
index e8d70bf0d..ec0960b86 100644
--- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
+++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
@@ -1,22 +1,31 @@
+use std::borrow::Cow;
 use std::collections::{BTreeMap, HashSet};
 use std::convert::TryInto;
 use std::fs::File;
 use std::io::{self, BufReader};
 use std::mem::size_of;
+use std::result::Result as StdResult;
 
+use grenad::Sorter;
 use heed::zerocopy::AsBytes;
 use heed::BytesEncode;
 use itertools::EitherOrBoth;
 use ordered_float::OrderedFloat;
 use roaring::RoaringBitmap;
 use serde_json::{from_slice, Value};
+use FilterableValues::{Empty, Null, Values};
 
 use super::helpers::{create_sorter, keep_first, sorter_into_reader, GrenadParameters};
 use crate::error::InternalError;
 use crate::facet::value_encoding::f64_into_bytes;
 use crate::update::del_add::{DelAdd, KvWriterDelAdd};
 use crate::update::index_documents::{create_writer, writer_into_reader};
-use crate::{CboRoaringBitmapCodec, DocumentId, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH};
+use crate::{
+    CboRoaringBitmapCodec, DocumentId, Error, FieldId, Result, BEU32, MAX_FACET_VALUE_LENGTH,
+};
+
+/// The length of the elements that are always in the buffer when inserting new values.
+const TRUNCATE_SIZE: usize = size_of::<FieldId>() + size_of::<DocumentId>();
 
 /// The extracted facet values stored in grenad files by type.
 pub struct ExtractedFacetValues {
@@ -68,7 +77,10 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
     let mut facet_is_null_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
     let mut facet_is_empty_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
 
-    let mut key_buffer = Vec::new();
+    // We create two buffers to avoid mutable borrow conflicts between the closures.
+    let mut numbers_key_buffer = Vec::new();
+    let mut strings_key_buffer = Vec::new();
+
     let mut cursor = obkv_documents.into_cursor()?;
     while let Some((docid_bytes, value)) = cursor.move_on_next()? {
         // TODO Obkv<FieldId, Obkv<DelAdd, serde_json::Value>>
@@ -76,18 +88,21 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
 
         for (field_id, field_bytes) in obkv.iter() {
             if faceted_fields.contains(&field_id) {
-                key_buffer.clear();
+                numbers_key_buffer.clear();
+                strings_key_buffer.clear();
 
                 // Set key to the field_id
                 // Note: this encoding is consistent with FieldIdCodec
-                key_buffer.extend_from_slice(&field_id.to_be_bytes());
+                numbers_key_buffer.extend_from_slice(&field_id.to_be_bytes());
+                strings_key_buffer.extend_from_slice(&field_id.to_be_bytes());
 
                 // Here, we know already that the document must be added to the “field id exists” database
                 let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap();
                 let document = BEU32::from(document).get();
 
                 // For the other extraction tasks, prefix the key with the field_id and the document_id
-                key_buffer.extend_from_slice(docid_bytes);
+                numbers_key_buffer.extend_from_slice(docid_bytes);
+                strings_key_buffer.extend_from_slice(docid_bytes);
 
                 let del_add_obkv = obkv::KvReader::new(field_bytes);
                 let del_value = match del_add_obkv.get(DelAdd::Deletion) {
@@ -100,8 +115,13 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
                 };
 
                 // We insert the document id on the Del and the Add side if the field exists.
-                let (mut del_exists, mut add_exists) =
+                let (ref mut del_exists, ref mut add_exists) =
                     facet_exists_docids.entry(field_id).or_default();
+                let (ref mut del_is_null, ref mut add_is_null) =
+                    facet_is_null_docids.entry(field_id).or_default();
+                let (ref mut del_is_empty, ref mut add_is_empty) =
+                    facet_is_empty_docids.entry(field_id).or_default();
+
                 if del_value.is_some() {
                     del_exists.insert(document);
                 }
@@ -109,84 +129,58 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
                     add_exists.insert(document);
                 }
 
-                // TODO extract both Del and Add numbers an strings (dedup)
-                // TODO use the `itertools::merge_join_by` method to sort and diff both sides (Del and Add)
-                // TODO if there is a Left generate a Del
-                // TODO if there is a Right generate an Add
-                // TODO if there is a Both don't insert
-                // TODO compare numbers using OrderedFloat and strings using both normalized and original values.
-
                 let geo_support =
                     geo_fields_ids.map_or(false, |(lat, lng)| field_id == lat || field_id == lng);
-
                 let del_filterable_values =
                     del_value.map(|value| extract_facet_values(&value, geo_support));
                 let add_filterable_values =
                     add_value.map(|value| extract_facet_values(&value, geo_support));
 
-                use FilterableValues::{Empty, Null, Values};
+                // Those closures are just here to simplify things a bit.
+                let mut insert_numbers_diff = |del_numbers, add_numbers| {
+                    insert_numbers_diff(
+                        &mut fid_docid_facet_numbers_sorter,
+                        &mut numbers_key_buffer,
+                        del_numbers,
+                        add_numbers,
+                    )
+                };
+                let mut insert_strings_diff = |del_strings, add_strings| {
+                    insert_strings_diff(
+                        &mut fid_docid_facet_strings_sorter,
+                        &mut strings_key_buffer,
+                        del_strings,
+                        add_strings,
+                    )
+                };
 
                 match (del_filterable_values, add_filterable_values) {
                     (None, None) => (),
                     (Some(del_filterable_values), None) => match del_filterable_values {
                         Null => {
-                            let (mut del_is_null, _) =
-                                facet_is_null_docids.entry(field_id).or_default();
                             del_is_null.insert(document);
                         }
                         Empty => {
-                            let (mut del_is_empty, _) =
-                                facet_is_empty_docids.entry(field_id).or_default();
                             del_is_empty.insert(document);
                         }
                         Values { numbers, strings } => {
-                            // insert facet numbers in sorter
-                            for number in numbers {
-                                key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
-                                if let Some(value_bytes) = f64_into_bytes(number) {
-                                    key_buffer.extend_from_slice(&value_bytes);
-                                    key_buffer.extend_from_slice(&number.to_be_bytes());
-
-                                    // We insert only the Del part of the Obkv to inform
-                                    // that we only want to remove all those numbers.
-                                    let mut obkv = KvWriterDelAdd::memory();
-                                    obkv.insert(DelAdd::Deletion, ().as_bytes())?;
-                                    let bytes = obkv.into_inner()?;
-                                    fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?;
-                                }
-                            }
-
-                            // insert normalized and original facet string in sorter
-                            for (normalized, original) in
-                                strings.into_iter().filter(|(n, _)| !n.is_empty())
-                            {
-                                let normalized_truncated_value: String = normalized
-                                    .char_indices()
-                                    .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
-                                    .map(|(_, c)| c)
-                                    .collect();
-
-                                key_buffer.truncate(size_of::<FieldId>() + size_of::<DocumentId>());
-                                key_buffer.extend_from_slice(normalized_truncated_value.as_bytes());
-
-                                // We insert only the Del part of the Obkv to inform
-                                // that we only want to remove all those strings.
-                                let mut obkv = KvWriterDelAdd::memory();
-                                obkv.insert(DelAdd::Deletion, original.as_bytes())?;
-                                let bytes = obkv.into_inner()?;
-                                fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
-                            }
+                            insert_numbers_diff(numbers, vec![])?;
+                            insert_strings_diff(strings, vec![])?;
+                        }
+                    },
+                    (None, Some(add_filterable_values)) => match add_filterable_values {
+                        Null => {
+                            add_is_null.insert(document);
+                        }
+                        Empty => {
+                            add_is_empty.insert(document);
+                        }
+                        Values { numbers, strings } => {
+                            insert_numbers_diff(vec![], numbers)?;
+                            insert_strings_diff(vec![], strings)?;
                         }
                     },
-                    (None, Some(add_filterable_values)) => {
-                        todo!()
-                    }
                     (Some(del_filterable_values), Some(add_filterable_values)) => {
-                        let (mut del_is_null, mut add_is_null) =
-                            facet_is_null_docids.entry(field_id).or_default();
-                        let (mut del_is_empty, mut add_is_empty) =
-                            facet_is_empty_docids.entry(field_id).or_default();
-
                         match (del_filterable_values, add_filterable_values) {
                             (Null, Null) | (Empty, Empty) => (),
                             (Null, Empty) => {
@@ -198,120 +192,31 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
                                 add_is_null.insert(document);
                             }
                             (Null, Values { numbers, strings }) => {
+                                insert_numbers_diff(vec![], numbers)?;
+                                insert_strings_diff(vec![], strings)?;
                                 del_is_null.insert(document);
-                                todo!()
                             }
                             (Empty, Values { numbers, strings }) => {
+                                insert_numbers_diff(vec![], numbers)?;
+                                insert_strings_diff(vec![], strings)?;
                                 del_is_empty.insert(document);
-                                todo!()
                             }
                             (Values { numbers, strings }, Null) => {
-                                todo!();
                                 add_is_null.insert(document);
+                                insert_numbers_diff(numbers, vec![])?;
+                                insert_strings_diff(strings, vec![])?;
                             }
                             (Values { numbers, strings }, Empty) => {
-                                todo!();
                                 add_is_empty.insert(document);
+                                insert_numbers_diff(numbers, vec![])?;
+                                insert_strings_diff(strings, vec![])?;
                             }
                             (
-                                Values { numbers: mut del_numbers, strings: mut del_strings },
-                                Values { numbers: mut add_numbers, strings: mut add_strings },
+                                Values { numbers: del_numbers, strings: del_strings },
+                                Values { numbers: add_numbers, strings: add_strings },
                             ) => {
-                                // We sort and dedup the float numbers
-                                del_numbers.sort_unstable_by_key(|f| OrderedFloat(*f));
-                                add_numbers.sort_unstable_by_key(|f| OrderedFloat(*f));
-                                del_numbers.dedup_by_key(|f| OrderedFloat(*f));
-                                add_numbers.dedup_by_key(|f| OrderedFloat(*f));
-
-                                let merged_numbers_iter = itertools::merge_join_by(
-                                    del_numbers.into_iter().map(OrderedFloat),
-                                    add_numbers.into_iter().map(OrderedFloat),
-                                    |del, add| del.cmp(&add),
-                                );
-
-                                // insert facet numbers in sorter
-                                for eob in merged_numbers_iter {
-                                    key_buffer
-                                        .truncate(size_of::<FieldId>() + size_of::<DocumentId>());
-                                    match eob {
-                                        EitherOrBoth::Both(_, _) => (), // no need to touch anything
-                                        EitherOrBoth::Left(OrderedFloat(number)) => {
-                                            if let Some(value_bytes) = f64_into_bytes(number) {
-                                                key_buffer.extend_from_slice(&value_bytes);
-                                                key_buffer.extend_from_slice(&number.to_be_bytes());
-
-                                                // We insert only the Del part of the Obkv to inform
-                                                // that we only want to remove all those numbers.
-                                                let mut obkv = KvWriterDelAdd::memory();
-                                                obkv.insert(DelAdd::Deletion, ().as_bytes())?;
-                                                let bytes = obkv.into_inner()?;
-                                                fid_docid_facet_numbers_sorter
-                                                    .insert(&key_buffer, bytes)?;
-                                            }
-                                        }
-                                        EitherOrBoth::Right(OrderedFloat(number)) => {
-                                            if let Some(value_bytes) = f64_into_bytes(number) {
-                                                key_buffer.extend_from_slice(&value_bytes);
-                                                key_buffer.extend_from_slice(&number.to_be_bytes());
-
-                                                // We insert only the Del part of the Obkv to inform
-                                                // that we only want to remove all those numbers.
-                                                let mut obkv = KvWriterDelAdd::memory();
-                                                obkv.insert(DelAdd::Addition, ().as_bytes())?;
-                                                let bytes = obkv.into_inner()?;
-                                                fid_docid_facet_numbers_sorter
-                                                    .insert(&key_buffer, bytes)?;
-                                            }
-                                        }
-                                    }
-                                }
-
-                                // We sort and dedup the normalized and original strings
-                                del_strings.sort_unstable();
-                                add_strings.sort_unstable();
-                                del_strings.dedup();
-                                add_strings.dedup();
-
-                                let merged_strings_iter = itertools::merge_join_by(
-                                    del_strings.into_iter().filter(|(n, _)| !n.is_empty()),
-                                    add_strings.into_iter().filter(|(n, _)| !n.is_empty()),
-                                    |del, add| del.cmp(&add),
-                                );
-
-                                // insert normalized and original facet string in sorter
-                                for eob in merged_strings_iter {
-                                    match eob {
-                                        EitherOrBoth::Both(_, _) => (), // no need to touch anything
-                                        EitherOrBoth::Left((normalized, original)) => {
-                                            let truncated = truncate_string(normalized);
-
-                                            key_buffer.truncate(
-                                                size_of::<FieldId>() + size_of::<DocumentId>(),
-                                            );
-                                            key_buffer.extend_from_slice(truncated.as_bytes());
-
-                                            let mut obkv = KvWriterDelAdd::memory();
-                                            obkv.insert(DelAdd::Deletion, original)?;
-                                            let bytes = obkv.into_inner()?;
-                                            fid_docid_facet_strings_sorter
-                                                .insert(&key_buffer, bytes)?;
-                                        }
-                                        EitherOrBoth::Right((normalized, original)) => {
-                                            let truncated = truncate_string(normalized);
-
-                                            key_buffer.truncate(
-                                                size_of::<FieldId>() + size_of::<DocumentId>(),
-                                            );
-                                            key_buffer.extend_from_slice(truncated.as_bytes());
-
-                                            let mut obkv = KvWriterDelAdd::memory();
-                                            obkv.insert(DelAdd::Addition, original)?;
-                                            let bytes = obkv.into_inner()?;
-                                            fid_docid_facet_strings_sorter
-                                                .insert(&key_buffer, bytes)?;
-                                        }
-                                    }
-                                }
+                                insert_numbers_diff(del_numbers, add_numbers)?;
+                                insert_strings_diff(del_strings, add_strings)?;
                             }
                         }
                     }
@@ -320,19 +225,15 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
         }
     }
 
+    let mut buffer = Vec::new();
     let mut facet_exists_docids_writer = create_writer(
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
         tempfile::tempfile()?,
     );
     for (fid, (del_bitmap, add_bitmap)) in facet_exists_docids.into_iter() {
-        let mut obkv = KvWriterDelAdd::memory();
-        let del_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&del_bitmap).unwrap();
-        let add_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&add_bitmap).unwrap();
-        obkv.insert(DelAdd::Deletion, del_bitmap_bytes)?;
-        obkv.insert(DelAdd::Addition, add_bitmap_bytes)?;
-        let bytes = obkv.into_inner()?;
-        facet_exists_docids_writer.insert(fid.to_be_bytes(), &bytes)?;
+        deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
+        facet_exists_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
     }
     let facet_exists_docids_reader = writer_into_reader(facet_exists_docids_writer)?;
 
@@ -342,13 +243,8 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
         tempfile::tempfile()?,
     );
     for (fid, (del_bitmap, add_bitmap)) in facet_is_null_docids.into_iter() {
-        let mut obkv = KvWriterDelAdd::memory();
-        let del_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&del_bitmap).unwrap();
-        let add_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&add_bitmap).unwrap();
-        obkv.insert(DelAdd::Deletion, del_bitmap_bytes)?;
-        obkv.insert(DelAdd::Addition, add_bitmap_bytes)?;
-        let bytes = obkv.into_inner()?;
-        facet_is_null_docids_writer.insert(fid.to_be_bytes(), &bytes)?;
+        deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
+        facet_is_null_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
     }
     let facet_is_null_docids_reader = writer_into_reader(facet_is_null_docids_writer)?;
 
@@ -358,13 +254,8 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
         tempfile::tempfile()?,
     );
     for (fid, (del_bitmap, add_bitmap)) in facet_is_empty_docids.into_iter() {
-        let mut obkv = KvWriterDelAdd::memory();
-        let del_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&del_bitmap).unwrap();
-        let add_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(&add_bitmap).unwrap();
-        obkv.insert(DelAdd::Deletion, del_bitmap_bytes)?;
-        obkv.insert(DelAdd::Addition, add_bitmap_bytes)?;
-        let bytes = obkv.into_inner()?;
-        facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &bytes)?;
+        deladd_obkv_cbo_roaring_bitmaps(&mut buffer, &del_bitmap, &add_bitmap)?;
+        facet_is_empty_docids_writer.insert(fid.to_be_bytes(), &buffer)?;
     }
     let facet_is_empty_docids_reader = writer_into_reader(facet_is_empty_docids_writer)?;
 
@@ -377,6 +268,141 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
     })
 }
 
+/// Clears `buffer` and writes into it a DelAdd obkv containing the two bitmaps.
+fn deladd_obkv_cbo_roaring_bitmaps(
+    buffer: &mut Vec<u8>,
+    del_bitmap: &RoaringBitmap,
+    add_bitmap: &RoaringBitmap,
+) -> io::Result<()> {
+    buffer.clear();
+    let mut obkv = KvWriterDelAdd::new(buffer);
+    let del_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(del_bitmap).unwrap();
+    let add_bitmap_bytes = CboRoaringBitmapCodec::bytes_encode(add_bitmap).unwrap();
+    obkv.insert(DelAdd::Deletion, del_bitmap_bytes)?;
+    obkv.insert(DelAdd::Addition, add_bitmap_bytes)?;
+    obkv.finish()
+}
+
+/// Truncates a string to the biggest valid LMDB key size.
+fn truncate_string(s: String) -> String {
+    s.char_indices()
+        .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
+        .map(|(_, c)| c)
+        .collect()
+}
+
+/// Computes the diff between both Del and Add numbers and
+/// only inserts the parts that differ in the sorter.
+fn insert_numbers_diff<MF>(
+    fid_docid_facet_numbers_sorter: &mut Sorter<MF>,
+    key_buffer: &mut Vec<u8>,
+    mut del_numbers: Vec<f64>,
+    mut add_numbers: Vec<f64>,
+) -> Result<()>
+where
+    MF: for<'a> Fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Cow<'a, [u8]>, Error>,
+{
+    // We sort and dedup the float numbers
+    del_numbers.sort_unstable_by_key(|f| OrderedFloat(*f));
+    add_numbers.sort_unstable_by_key(|f| OrderedFloat(*f));
+    del_numbers.dedup_by_key(|f| OrderedFloat(*f));
+    add_numbers.dedup_by_key(|f| OrderedFloat(*f));
+
+    let merged_numbers_iter = itertools::merge_join_by(
+        del_numbers.into_iter().map(OrderedFloat),
+        add_numbers.into_iter().map(OrderedFloat),
+        |del, add| del.cmp(add),
+    );
+
+    // insert facet numbers in sorter
+    for eob in merged_numbers_iter {
+        key_buffer.truncate(TRUNCATE_SIZE);
+        match eob {
+            EitherOrBoth::Both(_, _) => (), // no need to touch anything
+            EitherOrBoth::Left(OrderedFloat(number)) => {
+                if let Some(value_bytes) = f64_into_bytes(number) {
+                    key_buffer.extend_from_slice(&value_bytes);
+                    key_buffer.extend_from_slice(&number.to_be_bytes());
+
+                    // We insert only the Del part of the Obkv to inform
+                    // that we only want to remove all those numbers.
+                    let mut obkv = KvWriterDelAdd::memory();
+                    obkv.insert(DelAdd::Deletion, ().as_bytes())?;
+                    let bytes = obkv.into_inner()?;
+                    fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?;
+                }
+            }
+            EitherOrBoth::Right(OrderedFloat(number)) => {
+                if let Some(value_bytes) = f64_into_bytes(number) {
+                    key_buffer.extend_from_slice(&value_bytes);
+                    key_buffer.extend_from_slice(&number.to_be_bytes());
+
+                    // We insert only the Add part of the Obkv to inform
+                    // that we only want to add all those numbers.
+                    let mut obkv = KvWriterDelAdd::memory();
+                    obkv.insert(DelAdd::Addition, ().as_bytes())?;
+                    let bytes = obkv.into_inner()?;
+                    fid_docid_facet_numbers_sorter.insert(&key_buffer, bytes)?;
+                }
+            }
+        }
+    }
+
+    Ok(())
+}
+
+/// Computes the diff between both Del and Add strings and
+/// only inserts the parts that differ in the sorter.
+fn insert_strings_diff<MF>(
+    fid_docid_facet_strings_sorter: &mut Sorter<MF>,
+    key_buffer: &mut Vec<u8>,
+    mut del_strings: Vec<(String, String)>,
+    mut add_strings: Vec<(String, String)>,
+) -> Result<()>
+where
+    MF: for<'a> Fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Cow<'a, [u8]>, Error>,
+{
+    // We sort and dedup the normalized and original strings
+    del_strings.sort_unstable();
+    add_strings.sort_unstable();
+    del_strings.dedup();
+    add_strings.dedup();
+
+    let merged_strings_iter = itertools::merge_join_by(
+        del_strings.into_iter().filter(|(n, _)| !n.is_empty()),
+        add_strings.into_iter().filter(|(n, _)| !n.is_empty()),
+        |del, add| del.cmp(add),
+    );
+
+    // insert normalized and original facet string in sorter
+    for eob in merged_strings_iter {
+        key_buffer.truncate(TRUNCATE_SIZE);
+        match eob {
+            EitherOrBoth::Both(_, _) => (), // no need to touch anything
+            EitherOrBoth::Left((normalized, original)) => {
+                let truncated = truncate_string(normalized);
+                key_buffer.extend_from_slice(truncated.as_bytes());
+
+                let mut obkv = KvWriterDelAdd::memory();
+                obkv.insert(DelAdd::Deletion, original)?;
+                let bytes = obkv.into_inner()?;
+                fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
+            }
+            EitherOrBoth::Right((normalized, original)) => {
+                let truncated = truncate_string(normalized);
+                key_buffer.extend_from_slice(truncated.as_bytes());
+
+                let mut obkv = KvWriterDelAdd::memory();
+                obkv.insert(DelAdd::Addition, original)?;
+                let bytes = obkv.into_inner()?;
+                fid_docid_facet_strings_sorter.insert(&key_buffer, bytes)?;
+            }
+        }
+    }
+
+    Ok(())
+}
+
 /// Represent what a document field contains.
 enum FilterableValues {
     /// Corresponds to the JSON `null` value.
@@ -387,6 +413,7 @@ enum FilterableValues {
     Values { numbers: Vec<f64>, strings: Vec<(String, String)> },
 }
 
+/// Extracts the facet values of a JSON field.
 fn extract_facet_values(value: &Value, geo_field: bool) -> FilterableValues {
     fn inner_extract_facet_values(
         value: &Value,
@@ -448,10 +475,3 @@ fn extract_facet_values(value: &Value, geo_field: bool) -> FilterableValues {
         }
     }
 }
-
-fn truncate_string(mut s: String) -> String {
-    s.char_indices()
-        .take_while(|(idx, _)| idx + 4 < MAX_FACET_VALUE_LENGTH)
-        .map(|(_, c)| c)
-        .collect()
-}

From a82dee21e09dcf4d55ed604478bfa4aa4e7e6da3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Wed, 18 Oct 2023 13:53:58 +0200
Subject: [PATCH 019/127] Rename docid_fid into fid_docid

---
 .../extract/extract_fid_docid_facet_values.rs | 12 +++----
 .../src/update/index_documents/extract/mod.rs | 36 +++++++++----------
 2 files changed, 22 insertions(+), 26 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
index ec0960b86..87320a675 100644
--- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
+++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
@@ -29,9 +29,8 @@ const TRUNCATE_SIZE: usize = size_of::<FieldId>() + size_of::<DocumentId>();
 
 /// The extracted facet values stored in grenad files by type.
 pub struct ExtractedFacetValues {
-    // TOOD rename into `fid_docid_*`
-    pub docid_fid_facet_numbers_chunk: grenad::Reader<BufReader<File>>,
-    pub docid_fid_facet_strings_chunk: grenad::Reader<BufReader<File>>,
+    pub fid_docid_facet_numbers_chunk: grenad::Reader<BufReader<File>>,
+    pub fid_docid_facet_strings_chunk: grenad::Reader<BufReader<File>>,
     pub fid_facet_is_null_docids_chunk: grenad::Reader<BufReader<File>>,
     pub fid_facet_is_empty_docids_chunk: grenad::Reader<BufReader<File>>,
     pub fid_facet_exists_docids_chunk: grenad::Reader<BufReader<File>>,
@@ -44,7 +43,6 @@ pub struct ExtractedFacetValues {
 /// We need the fid of the geofields to correctly parse them as numbers if they were sent as strings initially.
 #[logging_timer::time]
 pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
-    // TODO Reader<Obkv<FieldId, Obkv<DelAdd, serde_json::Value>>>
     obkv_documents: grenad::Reader<R>,
     indexer: GrenadParameters,
     faceted_fields: &HashSet<FieldId>,
@@ -83,7 +81,6 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
 
     let mut cursor = obkv_documents.into_cursor()?;
     while let Some((docid_bytes, value)) = cursor.move_on_next()? {
-        // TODO Obkv<FieldId, Obkv<DelAdd, serde_json::Value>>
         let obkv = obkv::KvReader::new(value);
 
         for (field_id, field_bytes) in obkv.iter() {
@@ -96,7 +93,6 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
                 numbers_key_buffer.extend_from_slice(&field_id.to_be_bytes());
                 strings_key_buffer.extend_from_slice(&field_id.to_be_bytes());
 
-                // Here, we know already that the document must be added to the “field id exists” database
                 let document: [u8; 4] = docid_bytes[..4].try_into().ok().unwrap();
                 let document = BEU32::from(document).get();
 
@@ -260,8 +256,8 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
     let facet_is_empty_docids_reader = writer_into_reader(facet_is_empty_docids_writer)?;
 
     Ok(ExtractedFacetValues {
-        docid_fid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?,
-        docid_fid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
+        fid_docid_facet_numbers_chunk: sorter_into_reader(fid_docid_facet_numbers_sorter, indexer)?,
+        fid_docid_facet_strings_chunk: sorter_into_reader(fid_docid_facet_strings_sorter, indexer)?,
         fid_facet_is_null_docids_chunk: facet_is_null_docids_reader,
         fid_facet_is_empty_docids_chunk: facet_is_empty_docids_reader,
         fid_facet_exists_docids_chunk: facet_exists_docids_reader,
diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
index 164f95452..0522fc93c 100644
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -92,9 +92,9 @@ pub(crate) fn data_from_obkv_documents(
     let (
         docid_word_positions_chunks,
         (
-            docid_fid_facet_numbers_chunks,
+            fid_docid_facet_numbers_chunks,
             (
-                docid_fid_facet_strings_chunks,
+                fid_docid_facet_strings_chunks,
                 (
                     facet_is_null_docids_chunks,
                     (facet_is_empty_docids_chunks, facet_exists_docids_chunks),
@@ -206,7 +206,7 @@ pub(crate) fn data_from_obkv_documents(
     );
 
     spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
-        docid_fid_facet_strings_chunks,
+        fid_docid_facet_strings_chunks,
         indexer,
         lmdb_writer_sx.clone(),
         extract_facet_string_docids,
@@ -216,7 +216,7 @@ pub(crate) fn data_from_obkv_documents(
     );
 
     spawn_extraction_task::<_, _, Vec<grenad::Reader<BufReader<File>>>>(
-        docid_fid_facet_numbers_chunks,
+        fid_docid_facet_numbers_chunks,
         indexer,
         lmdb_writer_sx,
         extract_facet_number_docids,
@@ -352,7 +352,7 @@ fn send_and_extract_flattened_documents_data(
         });
     }
 
-    let (docid_word_positions_chunk, docid_fid_facet_values_chunks): (Result<_>, Result<_>) =
+    let (docid_word_positions_chunk, fid_docid_facet_values_chunks): (Result<_>, Result<_>) =
         rayon::join(
             || {
                 let (documents_ids, docid_word_positions_chunk, script_language_pair) =
@@ -380,8 +380,8 @@ fn send_and_extract_flattened_documents_data(
             },
             || {
                 let ExtractedFacetValues {
-                    docid_fid_facet_numbers_chunk,
-                    docid_fid_facet_strings_chunk,
+                    fid_docid_facet_numbers_chunk,
+                    fid_docid_facet_strings_chunk,
                     fid_facet_is_null_docids_chunk,
                     fid_facet_is_empty_docids_chunk,
                     fid_facet_exists_docids_chunk,
@@ -392,26 +392,26 @@ fn send_and_extract_flattened_documents_data(
                     geo_fields_ids,
                 )?;
 
-                // send docid_fid_facet_numbers_chunk to DB writer
-                let docid_fid_facet_numbers_chunk =
-                    unsafe { as_cloneable_grenad(&docid_fid_facet_numbers_chunk)? };
+                // send fid_docid_facet_numbers_chunk to DB writer
+                let fid_docid_facet_numbers_chunk =
+                    unsafe { as_cloneable_grenad(&fid_docid_facet_numbers_chunk)? };
 
                 let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetNumbers(
-                    docid_fid_facet_numbers_chunk.clone(),
+                    fid_docid_facet_numbers_chunk.clone(),
                 )));
 
-                // send docid_fid_facet_strings_chunk to DB writer
-                let docid_fid_facet_strings_chunk =
-                    unsafe { as_cloneable_grenad(&docid_fid_facet_strings_chunk)? };
+                // send fid_docid_facet_strings_chunk to DB writer
+                let fid_docid_facet_strings_chunk =
+                    unsafe { as_cloneable_grenad(&fid_docid_facet_strings_chunk)? };
 
                 let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdDocidFacetStrings(
-                    docid_fid_facet_strings_chunk.clone(),
+                    fid_docid_facet_strings_chunk.clone(),
                 )));
 
                 Ok((
-                    docid_fid_facet_numbers_chunk,
+                    fid_docid_facet_numbers_chunk,
                     (
-                        docid_fid_facet_strings_chunk,
+                        fid_docid_facet_strings_chunk,
                         (
                             fid_facet_is_null_docids_chunk,
                             (fid_facet_is_empty_docids_chunk, fid_facet_exists_docids_chunk),
@@ -421,5 +421,5 @@ fn send_and_extract_flattened_documents_data(
             },
         );
 
-    Ok((docid_word_positions_chunk?, docid_fid_facet_values_chunks?))
+    Ok((docid_word_positions_chunk?, fid_docid_facet_values_chunks?))
 }

From fcd3a1434d2a8e6da49a5a86d0591bd872d3de29 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Wed, 18 Oct 2023 17:40:13 +0200
Subject: [PATCH 020/127] Update extract_facet_number_docids to support deladd
 obkvs

---
 .../cbo_roaring_bitmap_codec.rs               | 10 ++++--
 .../extract/extract_facet_number_docids.rs    | 26 ++++++++++------
 .../helpers/merge_functions.rs                | 31 +++++++++++++++++++
 .../src/update/index_documents/helpers/mod.rs |  5 +--
 4 files changed, 57 insertions(+), 15 deletions(-)

diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs
index bf76287d8..79b52695e 100644
--- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs
+++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs
@@ -60,12 +60,16 @@ impl CboRoaringBitmapCodec {
     /// if the merged values length is under the threshold, values are directly
     /// serialized in the buffer else a RoaringBitmap is created from the
     /// values and is serialized in the buffer.
-    pub fn merge_into(slices: &[Cow<[u8]>], buffer: &mut Vec<u8>) -> io::Result<()> {
+    pub fn merge_into<I, A>(slices: I, buffer: &mut Vec<u8>) -> io::Result<()>
+    where
+        I: IntoIterator<Item = A>,
+        A: AsRef<[u8]>,
+    {
         let mut roaring = RoaringBitmap::new();
         let mut vec = Vec::new();
 
         for bytes in slices {
-            if bytes.len() <= THRESHOLD * size_of::<u32>() {
+            if bytes.as_ref().len() <= THRESHOLD * size_of::<u32>() {
                 let mut reader = bytes.as_ref();
                 while let Ok(integer) = reader.read_u32::<NativeEndian>() {
                     vec.push(integer);
@@ -85,7 +89,7 @@ impl CboRoaringBitmapCodec {
                 }
             } else {
                 // We can unwrap safely because the vector is sorted upper.
-                let roaring = RoaringBitmap::from_sorted_iter(vec.into_iter()).unwrap();
+                let roaring = RoaringBitmap::from_sorted_iter(vec).unwrap();
                 roaring.serialize_into(buffer)?;
             }
         } else {
diff --git a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs
index 76dc6d3c6..f860aacba 100644
--- a/milli/src/update/index_documents/extract/extract_facet_number_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_facet_number_docids.rs
@@ -4,11 +4,12 @@ use std::io::{self, BufReader};
 use heed::{BytesDecode, BytesEncode};
 
 use super::helpers::{
-    create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters,
+    create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters,
 };
 use crate::heed_codec::facet::{
     FacetGroupKey, FacetGroupKeyCodec, FieldDocIdFacetF64Codec, OrderedF64Codec,
 };
+use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd};
 use crate::Result;
 
 /// Extracts the facet number and the documents ids where this facet number appear.
@@ -17,8 +18,7 @@ use crate::Result;
 /// documents ids from the given chunk of docid facet number positions.
 #[logging_timer::time]
 pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
-    // TODO Reader<Key, Obkv<DelAdd, ()>>
-    docid_fid_facet_number: grenad::Reader<R>,
+    fid_docid_facet_number: grenad::Reader<R>,
     indexer: GrenadParameters,
 ) -> Result<grenad::Reader<BufReader<File>>> {
     puffin::profile_function!();
@@ -27,24 +27,30 @@ pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
 
     let mut facet_number_docids_sorter = create_sorter(
         grenad::SortAlgorithm::Unstable,
-        // TODO We must modify the merger to do unions of Del and Add separately
-        merge_cbo_roaring_bitmaps,
+        merge_deladd_cbo_roaring_bitmaps,
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
         indexer.max_nb_chunks,
         max_memory,
     );
 
-    let mut cursor = docid_fid_facet_number.into_cursor()?;
-    // TODO the value is a Obkv<DelAdd, ()> and must be taken into account
-    while let Some((key_bytes, _)) = cursor.move_on_next()? {
+    let mut buffer = Vec::new();
+    let mut cursor = fid_docid_facet_number.into_cursor()?;
+    while let Some((key_bytes, deladd_obkv_bytes)) = cursor.move_on_next()? {
         let (field_id, document_id, number) =
             FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap();
 
         let key = FacetGroupKey { field_id, level: 0, left_bound: number };
         let key_bytes = FacetGroupKeyCodec::<OrderedF64Codec>::bytes_encode(&key).unwrap();
-        // TODO We must put a Obkv<DelAdd, RoaringBitmap>
-        facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?;
+
+        buffer.clear();
+        let mut obkv = KvWriterDelAdd::new(&mut buffer);
+        for (deladd_key, _) in KvReaderDelAdd::new(deladd_obkv_bytes).iter() {
+            obkv.insert(deladd_key, document_id.to_ne_bytes())?;
+        }
+        obkv.finish()?;
+
+        facet_number_docids_sorter.insert(key_bytes, &buffer)?;
     }
 
     sorter_into_reader(facet_number_docids_sorter, indexer)
diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs
index dee200b21..a418f8786 100644
--- a/milli/src/update/index_documents/helpers/merge_functions.rs
+++ b/milli/src/update/index_documents/helpers/merge_functions.rs
@@ -205,3 +205,34 @@ pub fn merge_cbo_roaring_bitmaps<'a>(
         Ok(Cow::from(vec))
     }
 }
+
+pub fn merge_deladd_cbo_roaring_bitmaps<'a>(
+    _key: &[u8],
+    values: &[Cow<'a, [u8]>],
+) -> Result<Cow<'a, [u8]>> {
+    if values.len() == 1 {
+        Ok(values[0].clone())
+    } else {
+        // Retrieve the bitmaps from both sides
+        let mut del_bitmaps_bytes = Vec::new();
+        let mut add_bitmaps_bytes = Vec::new();
+        for value in values {
+            let obkv = KvReaderDelAdd::new(value);
+            if let Some(bitmap_bytes) = obkv.get(DelAdd::Deletion) {
+                del_bitmaps_bytes.push(bitmap_bytes);
+            }
+            if let Some(bitmap_bytes) = obkv.get(DelAdd::Addition) {
+                add_bitmaps_bytes.push(bitmap_bytes);
+            }
+        }
+
+        let mut output_deladd_obkv = KvWriterDelAdd::memory();
+        let mut buffer = Vec::new();
+        CboRoaringBitmapCodec::merge_into(del_bitmaps_bytes, &mut buffer)?;
+        output_deladd_obkv.insert(DelAdd::Deletion, &buffer)?;
+        buffer.clear();
+        CboRoaringBitmapCodec::merge_into(add_bitmaps_bytes, &mut buffer)?;
+        output_deladd_obkv.insert(DelAdd::Addition, &buffer)?;
+        output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into)
+    }
+}
diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs
index 8f70a2de2..1f2f8e6ef 100644
--- a/milli/src/update/index_documents/helpers/mod.rs
+++ b/milli/src/update/index_documents/helpers/mod.rs
@@ -14,8 +14,9 @@ pub use grenad_helpers::{
 };
 pub use merge_functions::{
     concat_u32s_array, keep_first, keep_latest_obkv, merge_btreeset_string,
-    merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, obkvs_keep_last_addition_merge_deletions,
-    obkvs_merge_additions_and_deletions, serialize_roaring_bitmap, MergeFn,
+    merge_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps, merge_roaring_bitmaps,
+    obkvs_keep_last_addition_merge_deletions, obkvs_merge_additions_and_deletions,
+    serialize_roaring_bitmap, MergeFn,
 };
 
 use crate::MAX_WORD_LENGTH;

From e2bc054604c96f9fefc036a1dcec4aa9ec9ae4b4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Wed, 18 Oct 2023 18:06:41 +0200
Subject: [PATCH 021/127] Update extract_facet_string_docids to support deladd
 obkvs

---
 .../extract/extract_facet_string_docids.rs    | 41 ++++++++-----------
 .../helpers/merge_functions.rs                |  3 ++
 2 files changed, 21 insertions(+), 23 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
index b861c04e4..2ade776c3 100644
--- a/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_facet_string_docids.rs
@@ -1,13 +1,15 @@
 use std::fs::File;
-use std::io::{self, BufReader};
+use std::io::BufReader;
+use std::{io, str};
 
 use heed::BytesEncode;
 
 use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters};
 use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec};
 use crate::heed_codec::StrRefCodec;
-use crate::update::index_documents::merge_cbo_roaring_bitmaps;
-use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
+use crate::update::del_add::{KvReaderDelAdd, KvWriterDelAdd};
+use crate::update::index_documents::helpers::merge_deladd_cbo_roaring_bitmaps;
+use crate::{FieldId, Result};
 
 /// Extracts the facet string and the documents ids where this facet string appear.
 ///
@@ -15,7 +17,6 @@ use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH};
 /// documents ids from the given chunk of docid facet string positions.
 #[logging_timer::time]
 pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
-    // TODO Reader<Key, Obkv<DelAdd, OriginalString>>
     docid_fid_facet_string: grenad::Reader<R>,
     indexer: GrenadParameters,
 ) -> Result<grenad::Reader<BufReader<File>>> {
@@ -25,17 +26,16 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
 
     let mut facet_string_docids_sorter = create_sorter(
         grenad::SortAlgorithm::Stable,
-        // TODO We must modify the merger to do unions of Del and Add separately
-        merge_cbo_roaring_bitmaps,
+        merge_deladd_cbo_roaring_bitmaps,
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
         indexer.max_nb_chunks,
         max_memory,
     );
 
+    let mut buffer = Vec::new();
     let mut cursor = docid_fid_facet_string.into_cursor()?;
-    while let Some((key, _original_value_bytes)) = cursor.move_on_next()? {
-        // TODO the value is a Obkv<DelAdd, OriginalString> and must be taken into account
+    while let Some((key, deladd_original_value_bytes)) = cursor.move_on_next()? {
         let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
         let field_id = FieldId::from_be_bytes(field_id_bytes);
 
@@ -43,22 +43,17 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
             try_split_array_at::<_, 4>(bytes).unwrap();
         let document_id = u32::from_be_bytes(document_id_bytes);
 
-        let mut normalised_value = std::str::from_utf8(normalized_value_bytes)?;
-
-        let normalised_truncated_value: String;
-        if normalised_value.len() > MAX_FACET_VALUE_LENGTH {
-            normalised_truncated_value = normalised_value
-                .char_indices()
-                .take_while(|(idx, _)| *idx < MAX_FACET_VALUE_LENGTH)
-                .map(|(_, c)| c)
-                .collect();
-            normalised_value = normalised_truncated_value.as_str();
-        }
-        let key = FacetGroupKey { field_id, level: 0, left_bound: normalised_value };
+        let normalized_value = str::from_utf8(normalized_value_bytes)?;
+        let key = FacetGroupKey { field_id, level: 0, left_bound: normalized_value };
         let key_bytes = FacetGroupKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
-        // document id is encoded in native-endian because of the CBO roaring bitmap codec
-        // TODO Reader<KeyBytes, Obkv<DelAdd, RoaringBitmap>>
-        facet_string_docids_sorter.insert(&key_bytes, document_id.to_ne_bytes())?;
+
+        buffer.clear();
+        let mut obkv = KvWriterDelAdd::new(&mut buffer);
+        for (deladd_key, _) in KvReaderDelAdd::new(deladd_original_value_bytes).iter() {
+            obkv.insert(deladd_key, document_id.to_ne_bytes())?;
+        }
+        obkv.finish()?;
+        facet_string_docids_sorter.insert(&key_bytes, &buffer)?;
     }
 
     sorter_into_reader(facet_string_docids_sorter, indexer)
diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs
index a418f8786..770629c8e 100644
--- a/milli/src/update/index_documents/helpers/merge_functions.rs
+++ b/milli/src/update/index_documents/helpers/merge_functions.rs
@@ -193,6 +193,7 @@ pub fn obkvs_keep_last_addition_merge_deletions<'a>(
     inner_merge_del_add_obkvs(obkvs, false)
 }
 
+/// Do a union of all the CboRoaringBitmaps in the values.
 pub fn merge_cbo_roaring_bitmaps<'a>(
     _key: &[u8],
     values: &[Cow<'a, [u8]>],
@@ -206,6 +207,8 @@ pub fn merge_cbo_roaring_bitmaps<'a>(
     }
 }
 
+/// Do a union of CboRoaringBitmaps on both sides of a DelAdd obkv
+/// separately and outputs a new DelAdd with both unions.
 pub fn merge_deladd_cbo_roaring_bitmaps<'a>(
     _key: &[u8],
     values: &[Cow<'a, [u8]>],

From 2597bbd107215938b6c6dd9e0c4176e8a564e8ad Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Thu, 19 Oct 2023 10:22:39 +0200
Subject: [PATCH 022/127] Make the script language docids map take a tuple of
 roaring bitmaps expressing the deletions and the additions

---
 .../extract/extract_docid_word_positions.rs   | 24 ++++------
 .../src/update/index_documents/typed_chunk.rs | 48 ++++++++-----------
 2 files changed, 29 insertions(+), 43 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
index e02e492d2..36258b275 100644
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -14,7 +14,7 @@ use crate::error::{InternalError, SerializationError};
 use crate::update::del_add::{del_add_from_two_obkvs, DelAdd, KvReaderDelAdd};
 use crate::{FieldId, Result, MAX_POSITION_PER_ATTRIBUTE, MAX_WORD_LENGTH};
 
-pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), RoaringBitmap>;
+pub type ScriptLanguageDocidsMap = HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>;
 
 /// Extracts the word and positions where this word appear and
 /// prefixes it by the document id.
@@ -30,11 +30,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
     allowed_separators: Option<&[&str]>,
     dictionary: Option<&[&str]>,
     max_positions_per_attributes: Option<u32>,
-) -> Result<(
-    RoaringBitmap,
-    grenad::Reader<BufReader<File>>,
-    (ScriptLanguageDocidsMap, ScriptLanguageDocidsMap),
-)> {
+) -> Result<(RoaringBitmap, grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> {
     puffin::profile_function!();
 
     let max_positions_per_attributes = max_positions_per_attributes
@@ -43,8 +39,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
 
     // initialize destination values.
     let mut documents_ids = RoaringBitmap::new();
-    let mut del_script_language_docids = HashMap::new();
-    let mut add_script_language_docids = HashMap::new();
+    let mut script_language_docids = HashMap::new();
     let mut docid_word_positions_sorter = create_sorter(
         grenad::SortAlgorithm::Stable,
         keep_latest_obkv,
@@ -138,25 +133,24 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
         // update script_language_docids deletions.
         for (script, languages_frequency) in del_script_language_word_count {
             for (language, _) in languages_frequency {
-                let entry = del_script_language_docids
+                let entry = script_language_docids
                     .entry((script, language))
-                    .or_insert_with(RoaringBitmap::new);
-                entry.push(document_id);
+                    .or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new()));
+                entry.0.push(document_id);
             }
         }
 
         // update script_language_docids additions.
         for (script, languages_frequency) in add_script_language_word_count {
             for (language, _) in languages_frequency {
-                let entry = add_script_language_docids
+                let entry = script_language_docids
                     .entry((script, language))
-                    .or_insert_with(RoaringBitmap::new);
-                entry.push(document_id);
+                    .or_insert_with(|| (RoaringBitmap::new(), RoaringBitmap::new()));
+                entry.1.push(document_id);
             }
         }
     }
 
-    let script_language_docids = (del_script_language_docids, add_script_language_docids);
     sorter_into_reader(docid_word_positions_sorter, indexer)
         .map(|reader| (documents_ids, reader, script_language_docids))
 }
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index f2dc7d336..e3ff9b253 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -43,9 +43,7 @@ pub(crate) enum TypedChunk {
     FieldIdFacetIsEmptyDocids(grenad::Reader<BufReader<File>>),
     GeoPoints(grenad::Reader<BufReader<File>>),
     VectorPoints(grenad::Reader<BufReader<File>>),
-    ScriptLanguageDocids(
-        (HashMap<(Script, Language), RoaringBitmap>, HashMap<(Script, Language), RoaringBitmap>),
-    ),
+    ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>),
 }
 
 impl TypedChunk {
@@ -103,8 +101,8 @@ impl TypedChunk {
             TypedChunk::VectorPoints(grenad) => {
                 format!("VectorPoints {{ number_of_entries: {} }}", grenad.len())
             }
-            TypedChunk::ScriptLanguageDocids((_, addition)) => {
-                format!("ScriptLanguageDocids {{ number_of_entries: {} }}", addition.len())
+            TypedChunk::ScriptLanguageDocids(sl_map) => {
+                format!("ScriptLanguageDocids {{ number_of_entries: {} }}", sl_map.len())
             }
         }
     }
@@ -346,24 +344,25 @@ pub(crate) fn write_typed_chunk_into_index(
             log::debug!("There are {} entries in the HNSW so far", hnsw_length);
             index.put_vector_hnsw(wtxn, &new_hnsw)?;
         }
-        TypedChunk::ScriptLanguageDocids((deletion, addition)) => {
-            for (key, value) in deletion {
-                if let Some(mut db_values) = index.script_language_docids.get(wtxn, &key)? {
-                    db_values -= value;
-                    if db_values.is_empty() {
-                        index.script_language_docids.delete(wtxn, &key)?;
-                    } else {
-                        index.script_language_docids.put(wtxn, &key, &db_values)?;
-                    }
-                }
-            }
-
-            for (key, value) in addition {
+        TypedChunk::ScriptLanguageDocids(sl_map) => {
+            for (key, (deletion, addition)) in sl_map {
+                let mut db_key_exists = false;
                 let final_value = match index.script_language_docids.get(wtxn, &key)? {
-                    Some(mut db_values) => db_values | value,
-                    None => value,
+                    Some(db_values) => {
+                        db_key_exists = true;
+                        (db_values - deletion) | addition
+                    }
+                    None => addition,
                 };
-                index.script_language_docids.put(wtxn, &key, &final_value)?;
+
+                if final_value.is_empty() {
+                    // If the database entry exists, delete it.
+                    if db_key_exists == true {
+                        index.script_language_docids.delete(wtxn, &key)?;
+                    }
+                } else {
+                    index.script_language_docids.put(wtxn, &key, &final_value)?;
+                }
             }
         }
     }
@@ -388,13 +387,6 @@ fn merge_word_docids_reader_into_fst(
     Ok(builder.into_set())
 }
 
-fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec<u8>) -> Result<()> {
-    let new_value = RoaringBitmap::deserialize_from(new_value)?;
-    let db_value = RoaringBitmap::deserialize_from(db_value)?;
-    let value = new_value | db_value;
-    Ok(serialize_roaring_bitmap(&value, buffer)?)
-}
-
 fn merge_cbo_roaring_bitmaps(
     new_value: &[u8],
     db_value: &[u8],

From 46aa75abdb5fd1f25965aa8511344f06692776eb Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Thu, 19 Oct 2023 11:58:31 +0200
Subject: [PATCH 023/127] update extract word docids

---
 milli/src/update/del_add.rs                   |  4 ++
 .../extract/extract_word_docids.rs            | 70 +++++++++++++++----
 2 files changed, 59 insertions(+), 15 deletions(-)

diff --git a/milli/src/update/del_add.rs b/milli/src/update/del_add.rs
index 346ae0afa..c8b7f0f6a 100644
--- a/milli/src/update/del_add.rs
+++ b/milli/src/update/del_add.rs
@@ -98,3 +98,7 @@ pub fn del_add_from_two_obkvs<K: obkv::Key + PartialOrd + Ord>(
 
     writer.finish()
 }
+
+pub fn is_noop_del_add_obkv(del_add: KvReaderDelAdd) -> bool {
+    del_add.get(DelAdd::Deletion) == del_add.get(DelAdd::Addition)
+}
diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs
index 3df962585..a95162236 100644
--- a/milli/src/update/index_documents/extract/extract_word_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_docids.rs
@@ -7,12 +7,13 @@ use heed::BytesDecode;
 use obkv::KvReaderU16;
 
 use super::helpers::{
-    create_sorter, create_writer, merge_cbo_roaring_bitmaps, sorter_into_reader,
+    create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader,
     try_split_array_at, writer_into_reader, GrenadParameters,
 };
 use crate::error::SerializationError;
 use crate::heed_codec::StrBEU16Codec;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
+use crate::update::del_add::{is_noop_del_add_obkv, DelAdd, KvReaderDelAdd, KvWriterDelAdd};
 use crate::update::MergeFn;
 use crate::{DocumentId, FieldId, Result};
 
@@ -39,14 +40,15 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
 
     let mut word_fid_docids_sorter = create_sorter(
         grenad::SortAlgorithm::Unstable,
-        merge_cbo_roaring_bitmaps,
+        merge_deladd_cbo_roaring_bitmaps,
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
         indexer.max_nb_chunks,
         max_memory.map(|x| x / 3),
     );
     let mut key_buffer = Vec::new();
-    let mut words = BTreeSet::new();
+    let mut del_words = BTreeSet::new();
+    let mut add_words = BTreeSet::new();
     let mut cursor = docid_word_positions.into_cursor()?;
     while let Some((key, value)) = cursor.move_on_next()? {
         let (document_id_bytes, fid_bytes) = try_split_array_at(key)
@@ -56,24 +58,37 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
         let document_id = u32::from_be_bytes(document_id_bytes);
         let fid = u16::from_be_bytes(fid_bytes);
 
-        for (_pos, word) in KvReaderU16::new(&value).iter() {
-            words.insert(word.to_vec());
+        let del_add_reader = KvReaderDelAdd::new(&value);
+        // extract all unique words to remove.
+        if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) {
+            for (_pos, word) in KvReaderU16::new(&deletion).iter() {
+                del_words.insert(word.to_vec());
+            }
+        }
+
+        // extract all unique additional words.
+        if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
+            for (_pos, word) in KvReaderU16::new(&addition).iter() {
+                add_words.insert(word.to_vec());
+            }
         }
 
         words_into_sorter(
             document_id,
             fid,
             &mut key_buffer,
-            &mut words,
+            &del_words,
+            &add_words,
             &mut word_fid_docids_sorter,
         )?;
 
-        words.clear();
+        del_words.clear();
+        add_words.clear();
     }
 
     let mut word_docids_sorter = create_sorter(
         grenad::SortAlgorithm::Unstable,
-        merge_cbo_roaring_bitmaps,
+        merge_deladd_cbo_roaring_bitmaps,
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
         indexer.max_nb_chunks,
@@ -82,7 +97,7 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
 
     let mut exact_word_docids_sorter = create_sorter(
         grenad::SortAlgorithm::Unstable,
-        merge_cbo_roaring_bitmaps,
+        merge_deladd_cbo_roaring_bitmaps,
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
         indexer.max_nb_chunks,
@@ -96,8 +111,12 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
     );
 
     let mut iter = word_fid_docids_sorter.into_stream_merger_iter()?;
+    // TODO: replace sorters by writers by accumulating values into a buffer before inserting them.
     while let Some((key, value)) = iter.next()? {
-        word_fid_docids_writer.insert(key, value)?;
+        // only keep the value if their is a change to apply in the DB.
+        if !is_noop_del_add_obkv(KvReaderDelAdd::new(value)) {
+            word_fid_docids_writer.insert(key, value)?;
+        }
 
         let (word, fid) = StrBEU16Codec::bytes_decode(key)
             .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
@@ -121,20 +140,41 @@ fn words_into_sorter(
     document_id: DocumentId,
     fid: FieldId,
     key_buffer: &mut Vec<u8>,
-    words: &mut BTreeSet<Vec<u8>>,
+    del_words: &BTreeSet<Vec<u8>>,
+    add_words: &BTreeSet<Vec<u8>>,
     word_fid_docids_sorter: &mut grenad::Sorter<MergeFn>,
 ) -> Result<()> {
     puffin::profile_function!();
 
-    for word_bytes in words.iter() {
+    use itertools::merge_join_by;
+    use itertools::EitherOrBoth::{Both, Left, Right};
+
+    let mut buffer = Vec::new();
+    for eob in merge_join_by(del_words.iter(), add_words.iter(), |d, a| d.cmp(a)) {
+        buffer.clear();
+        let mut value_writer = KvWriterDelAdd::new(&mut buffer);
+        let word_bytes = match eob {
+            Left(word_bytes) => {
+                value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
+                word_bytes
+            }
+            Right(word_bytes) => {
+                value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
+                word_bytes
+            }
+            Both(word_bytes, _) => {
+                value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
+                value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
+                word_bytes
+            }
+        };
+
         key_buffer.clear();
         key_buffer.extend_from_slice(&word_bytes);
         key_buffer.push(0);
         key_buffer.extend_from_slice(&fid.to_be_bytes());
-        word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
+        word_fid_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?;
     }
 
-    words.clear();
-
     Ok(())
 }

From 6bcf8b4f8cab1b58be1a96b76c14c8fa056ef17e Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Thu, 19 Oct 2023 13:27:07 +0200
Subject: [PATCH 024/127] update extract word position docids

---
 .../extract/extract_word_position_docids.rs   | 105 ++++++++++++++----
 1 file changed, 82 insertions(+), 23 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs
index 220dca960..2ff2f2ad5 100644
--- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs
@@ -1,15 +1,17 @@
-use std::collections::HashSet;
+use std::collections::BTreeSet;
 use std::fs::File;
 use std::io::{self, BufReader};
 
 use obkv::KvReaderU16;
 
 use super::helpers::{
-    create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at,
+    create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at,
     GrenadParameters,
 };
 use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
+use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
+use crate::update::MergeFn;
 use crate::{bucketed_position, DocumentId, Result};
 
 /// Extracts the word positions and the documents ids where this word appear.
@@ -27,14 +29,15 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
 
     let mut word_position_docids_sorter = create_sorter(
         grenad::SortAlgorithm::Unstable,
-        merge_cbo_roaring_bitmaps,
+        merge_deladd_cbo_roaring_bitmaps,
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
         indexer.max_nb_chunks,
         max_memory,
     );
 
-    let mut word_positions: HashSet<(u16, Vec<u8>)> = HashSet::new();
+    let mut del_word_positions: BTreeSet<(u16, Vec<u8>)> = BTreeSet::new();
+    let mut add_word_positions: BTreeSet<(u16, Vec<u8>)> = BTreeSet::new();
     let mut current_document_id: Option<u32> = None;
     let mut key_buffer = Vec::new();
     let mut cursor = docid_word_positions.into_cursor()?;
@@ -44,36 +47,92 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
         let document_id = DocumentId::from_be_bytes(document_id_bytes);
 
         if current_document_id.map_or(false, |id| document_id != id) {
-            for (position, word_bytes) in word_positions.iter() {
-                key_buffer.clear();
-                key_buffer.extend_from_slice(word_bytes);
-                key_buffer.push(0);
-                key_buffer.extend_from_slice(&position.to_be_bytes());
-                word_position_docids_sorter
-                    .insert(&key_buffer, current_document_id.unwrap().to_ne_bytes())?;
-            }
-            word_positions.clear();
+            words_position_into_sorter(
+                current_document_id.unwrap(),
+                &mut key_buffer,
+                &del_word_positions,
+                &add_word_positions,
+                &mut word_position_docids_sorter,
+            )?;
+            del_word_positions.clear();
+            add_word_positions.clear();
         }
 
         current_document_id = Some(document_id);
 
-        for (position, word_bytes) in KvReaderU16::new(&value).iter() {
-            let position = bucketed_position(position);
-            word_positions.insert((position, word_bytes.to_vec()));
+        let del_add_reader = KvReaderDelAdd::new(&value);
+        // extract all unique words to remove.
+        if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) {
+            for (position, word_bytes) in KvReaderU16::new(deletion).iter() {
+                let position = bucketed_position(position);
+                del_word_positions.insert((position, word_bytes.to_vec()));
+            }
+        }
+
+        // extract all unique additional words.
+        if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
+            for (position, word_bytes) in KvReaderU16::new(addition).iter() {
+                let position = bucketed_position(position);
+                add_word_positions.insert((position, word_bytes.to_vec()));
+            }
         }
     }
 
     if let Some(document_id) = current_document_id {
-        for (position, word_bytes) in word_positions {
-            key_buffer.clear();
-            key_buffer.extend_from_slice(&word_bytes);
-            key_buffer.push(0);
-            key_buffer.extend_from_slice(&position.to_be_bytes());
-            word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
-        }
+        words_position_into_sorter(
+            document_id,
+            &mut key_buffer,
+            &del_word_positions,
+            &add_word_positions,
+            &mut word_position_docids_sorter,
+        )?;
     }
 
+    // TODO remove noop DelAdd OBKV
     let word_position_docids_reader = sorter_into_reader(word_position_docids_sorter, indexer)?;
 
     Ok(word_position_docids_reader)
 }
+
+fn words_position_into_sorter(
+    document_id: DocumentId,
+    key_buffer: &mut Vec<u8>,
+    del_word_positions: &BTreeSet<(u16, Vec<u8>)>,
+    add_word_positions: &BTreeSet<(u16, Vec<u8>)>,
+    word_position_docids_sorter: &mut grenad::Sorter<MergeFn>,
+) -> Result<()> {
+    puffin::profile_function!();
+
+    use itertools::merge_join_by;
+    use itertools::EitherOrBoth::{Both, Left, Right};
+
+    let mut buffer = Vec::new();
+    for eob in merge_join_by(del_word_positions.iter(), add_word_positions.iter(), |d, a| d.cmp(a))
+    {
+        buffer.clear();
+        let mut value_writer = KvWriterDelAdd::new(&mut buffer);
+        let (position, word_bytes) = match eob {
+            Left(key) => {
+                value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
+                key
+            }
+            Right(key) => {
+                value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
+                key
+            }
+            Both(key, _) => {
+                value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
+                value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
+                key
+            }
+        };
+
+        key_buffer.clear();
+        key_buffer.extend_from_slice(word_bytes);
+        key_buffer.push(0);
+        key_buffer.extend_from_slice(&position.to_be_bytes());
+        word_position_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?;
+    }
+
+    Ok(())
+}

From 87e3d278786ad90f60014bc4e92d9a24adc14afd Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Thu, 19 Oct 2023 14:18:14 +0200
Subject: [PATCH 025/127] update extract word pair proximity to support deladd
 obkvs

---
 .../extract_word_pair_proximity_docids.rs     | 147 +++++++++++++-----
 1 file changed, 109 insertions(+), 38 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
index 70865acbe..76a1d1d68 100644
--- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
@@ -1,4 +1,4 @@
-use std::collections::{HashMap, VecDeque};
+use std::collections::{BTreeMap, VecDeque};
 use std::fs::File;
 use std::io::BufReader;
 use std::{cmp, io};
@@ -6,12 +6,13 @@ use std::{cmp, io};
 use obkv::KvReaderU16;
 
 use super::helpers::{
-    create_sorter, create_writer, merge_cbo_roaring_bitmaps, sorter_into_reader,
-    try_split_array_at, writer_into_reader, GrenadParameters, MergeFn,
+    create_sorter, create_writer, merge_deladd_cbo_roaring_bitmaps, try_split_array_at,
+    writer_into_reader, GrenadParameters, MergeFn,
 };
 use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
 use crate::proximity::{index_proximity, MAX_DISTANCE};
+use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
 use crate::{DocumentId, Result};
 
 /// Extracts the best proximity between pairs of words and the documents ids where this pair appear.
@@ -32,7 +33,7 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
         .map(|_| {
             create_sorter(
                 grenad::SortAlgorithm::Unstable,
-                merge_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps,
                 indexer.chunk_compression_type,
                 indexer.chunk_compression_level,
                 indexer.max_nb_chunks,
@@ -41,9 +42,12 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
         })
         .collect();
 
-    let mut word_positions: VecDeque<(String, u16)> =
+    let mut del_word_positions: VecDeque<(String, u16)> =
         VecDeque::with_capacity(MAX_DISTANCE as usize);
-    let mut word_pair_proximity = HashMap::new();
+    let mut add_word_positions: VecDeque<(String, u16)> =
+        VecDeque::with_capacity(MAX_DISTANCE as usize);
+    let mut del_word_pair_proximity = BTreeMap::new();
+    let mut add_word_pair_proximity = BTreeMap::new();
     let mut current_document_id = None;
 
     let mut cursor = docid_word_positions.into_cursor()?;
@@ -55,50 +59,90 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
         // if we change document, we fill the sorter
         if current_document_id.map_or(false, |id| id != document_id) {
             puffin::profile_scope!("Document into sorter");
-            while !word_positions.is_empty() {
-                word_positions_into_word_pair_proximity(
-                    &mut word_positions,
-                    &mut word_pair_proximity,
-                )?;
-            }
 
             document_word_positions_into_sorter(
                 current_document_id.unwrap(),
-                &word_pair_proximity,
+                &del_word_pair_proximity,
+                &add_word_pair_proximity,
                 &mut word_pair_proximity_docids_sorters,
             )?;
-            word_pair_proximity.clear();
-            word_positions.clear();
+            del_word_pair_proximity.clear();
+            add_word_pair_proximity.clear();
         }
 
         current_document_id = Some(document_id);
 
-        for (position, word) in KvReaderU16::new(&value).iter() {
-            // drain the proximity window until the head word is considered close to the word we are inserting.
-            while word_positions.get(0).map_or(false, |(_w, p)| {
-                index_proximity(*p as u32, position as u32) >= MAX_DISTANCE
-            }) {
-                word_positions_into_word_pair_proximity(
-                    &mut word_positions,
-                    &mut word_pair_proximity,
-                )?;
-            }
+        let (del, add): (Result<_>, Result<_>) = rayon::join(
+            || {
+                // deletions
+                if let Some(deletion) = KvReaderDelAdd::new(&value).get(DelAdd::Deletion) {
+                    for (position, word) in KvReaderU16::new(deletion).iter() {
+                        // drain the proximity window until the head word is considered close to the word we are inserting.
+                        while del_word_positions.get(0).map_or(false, |(_w, p)| {
+                            index_proximity(*p as u32, position as u32) >= MAX_DISTANCE
+                        }) {
+                            word_positions_into_word_pair_proximity(
+                                &mut del_word_positions,
+                                &mut del_word_pair_proximity,
+                            )?;
+                        }
 
-            // insert the new word.
-            let word = std::str::from_utf8(word)?;
-            word_positions.push_back((word.to_string(), position));
-        }
+                        // insert the new word.
+                        let word = std::str::from_utf8(word)?;
+                        del_word_positions.push_back((word.to_string(), position));
+                    }
+
+                    while !del_word_positions.is_empty() {
+                        word_positions_into_word_pair_proximity(
+                            &mut del_word_positions,
+                            &mut del_word_pair_proximity,
+                        )?;
+                    }
+                }
+
+                Ok(())
+            },
+            || {
+                // additions
+                if let Some(addition) = KvReaderDelAdd::new(&value).get(DelAdd::Addition) {
+                    for (position, word) in KvReaderU16::new(addition).iter() {
+                        // drain the proximity window until the head word is considered close to the word we are inserting.
+                        while add_word_positions.get(0).map_or(false, |(_w, p)| {
+                            index_proximity(*p as u32, position as u32) >= MAX_DISTANCE
+                        }) {
+                            word_positions_into_word_pair_proximity(
+                                &mut add_word_positions,
+                                &mut add_word_pair_proximity,
+                            )?;
+                        }
+
+                        // insert the new word.
+                        let word = std::str::from_utf8(word)?;
+                        add_word_positions.push_back((word.to_string(), position));
+                    }
+
+                    while !add_word_positions.is_empty() {
+                        word_positions_into_word_pair_proximity(
+                            &mut add_word_positions,
+                            &mut add_word_pair_proximity,
+                        )?;
+                    }
+                }
+
+                Ok(())
+            },
+        );
+
+        del?;
+        add?;
     }
 
     if let Some(document_id) = current_document_id {
         puffin::profile_scope!("Final document into sorter");
-        while !word_positions.is_empty() {
-            word_positions_into_word_pair_proximity(&mut word_positions, &mut word_pair_proximity)?;
-        }
-
         document_word_positions_into_sorter(
             document_id,
-            &word_pair_proximity,
+            &del_word_pair_proximity,
+            &add_word_pair_proximity,
             &mut word_pair_proximity_docids_sorters,
         )?;
     }
@@ -124,11 +168,38 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
 /// close to each other.
 fn document_word_positions_into_sorter(
     document_id: DocumentId,
-    word_pair_proximity: &HashMap<(String, String), u8>,
+    del_word_pair_proximity: &BTreeMap<(String, String), u8>,
+    add_word_pair_proximity: &BTreeMap<(String, String), u8>,
     word_pair_proximity_docids_sorters: &mut Vec<grenad::Sorter<MergeFn>>,
 ) -> Result<()> {
+    use itertools::merge_join_by;
+    use itertools::EitherOrBoth::{Both, Left, Right};
+
+    let mut buffer = Vec::new();
     let mut key_buffer = Vec::new();
-    for ((w1, w2), prox) in word_pair_proximity {
+    for eob in
+        merge_join_by(del_word_pair_proximity.iter(), add_word_pair_proximity.iter(), |d, a| {
+            d.cmp(a)
+        })
+    {
+        buffer.clear();
+        let mut value_writer = KvWriterDelAdd::new(&mut buffer);
+        let ((w1, w2), prox) = match eob {
+            Left(key_value) => {
+                value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
+                key_value
+            }
+            Right(key_value) => {
+                value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
+                key_value
+            }
+            Both(key_value, _) => {
+                value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
+                value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
+                key_value
+            }
+        };
+
         key_buffer.clear();
         key_buffer.push(*prox as u8);
         key_buffer.extend_from_slice(w1.as_bytes());
@@ -136,7 +207,7 @@ fn document_word_positions_into_sorter(
         key_buffer.extend_from_slice(w2.as_bytes());
 
         word_pair_proximity_docids_sorters[*prox as usize - 1]
-            .insert(&key_buffer, document_id.to_ne_bytes())?;
+            .insert(&key_buffer, value_writer.into_inner().unwrap())?;
     }
 
     Ok(())
@@ -144,7 +215,7 @@ fn document_word_positions_into_sorter(
 
 fn word_positions_into_word_pair_proximity(
     word_positions: &mut VecDeque<(String, u16)>,
-    word_pair_proximity: &mut HashMap<(String, String), u8>,
+    word_pair_proximity: &mut BTreeMap<(String, String), u8>,
 ) -> Result<()> {
     let (head_word, head_position) = word_positions.pop_front().unwrap();
     for (word, position) in word_positions.iter() {

From 40186bf4033383ed1acc287b93ee01b7d7162d0c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Thu, 19 Oct 2023 10:38:58 +0200
Subject: [PATCH 026/127] Rename FieldIdWordCountDocids correctly

---
 milli/src/update/index_documents/extract/mod.rs | 2 +-
 milli/src/update/index_documents/typed_chunk.rs | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
index 0522fc93c..7d643d61f 100644
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -167,7 +167,7 @@ pub(crate) fn data_from_obkv_documents(
         lmdb_writer_sx.clone(),
         extract_fid_word_count_docids,
         merge_cbo_roaring_bitmaps,
-        TypedChunk::FieldIdWordcountDocids,
+        TypedChunk::FieldIdWordCountDocids,
         "field-id-wordcount-docids",
     );
 
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index e3ff9b253..2e7266db0 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -27,7 +27,7 @@ pub(crate) enum TypedChunk {
     FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>),
     FieldIdDocidFacetNumbers(grenad::Reader<CursorClonableMmap>),
     Documents(grenad::Reader<CursorClonableMmap>),
-    FieldIdWordcountDocids(grenad::Reader<BufReader<File>>),
+    FieldIdWordCountDocids(grenad::Reader<BufReader<File>>),
     NewDocumentsIds(RoaringBitmap),
     WordDocids {
         word_docids_reader: grenad::Reader<BufReader<File>>,
@@ -58,7 +58,7 @@ impl TypedChunk {
             TypedChunk::Documents(grenad) => {
                 format!("Documents {{ number_of_entries: {} }}", grenad.len())
             }
-            TypedChunk::FieldIdWordcountDocids(grenad) => {
+            TypedChunk::FieldIdWordCountDocids(grenad) => {
                 format!("FieldIdWordcountDocids {{ number_of_entries: {} }}", grenad.len())
             }
             TypedChunk::NewDocumentsIds(grenad) => {
@@ -126,7 +126,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 index.documents.remap_types::<ByteSlice, ByteSlice>().put(wtxn, key, value)?;
             }
         }
-        TypedChunk::FieldIdWordcountDocids(fid_word_count_docids_iter) => {
+        TypedChunk::FieldIdWordCountDocids(fid_word_count_docids_iter) => {
             append_entries_into_database(
                 fid_word_count_docids_iter,
                 &index.field_id_word_count_docids,
@@ -478,7 +478,7 @@ where
     while let Some((key, value)) = cursor.move_on_next()? {
         if valid_lmdb_key(key) {
             debug_assert!(
-                K::bytes_decode(&key).is_some(),
+                K::bytes_decode(key).is_some(),
                 "Couldn't decode key with the database decoder, key length: {} - key bytes: {:x?}",
                 key.len(),
                 &key

From 2d3f15f82c4f6104aeba9199b8a71f6924de45fb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Thu, 19 Oct 2023 10:47:00 +0200
Subject: [PATCH 027/127] Introduce a function to only serialize the Add side
 of a DelAdd obkv

---
 .../src/update/index_documents/typed_chunk.rs | 31 +++++++++++++------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index 2e7266db0..e0e2ff1ec 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -19,6 +19,7 @@ use crate::distance::NDotProductPoint;
 use crate::error::UserError;
 use crate::facet::FacetType;
 use crate::index::Hnsw;
+use crate::update::del_add::{DelAdd, KvReaderDelAdd};
 use crate::update::facet::FacetsUpdate;
 use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at};
 use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, BEU32};
@@ -132,7 +133,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 &index.field_id_word_count_docids,
                 wtxn,
                 index_is_empty,
-                |value, _buffer| Ok(value),
+                deladd_serialize_add_side,
                 merge_cbo_roaring_bitmaps,
             )?;
             is_merged_database = true;
@@ -151,7 +152,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 &index.word_docids,
                 wtxn,
                 index_is_empty,
-                |value, _buffer| Ok(value),
+                deladd_serialize_add_side,
                 merge_cbo_roaring_bitmaps,
             )?;
 
@@ -161,7 +162,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 &index.exact_word_docids,
                 wtxn,
                 index_is_empty,
-                |value, _buffer| Ok(value),
+                deladd_serialize_add_side,
                 merge_cbo_roaring_bitmaps,
             )?;
 
@@ -171,7 +172,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 &index.word_fid_docids,
                 wtxn,
                 index_is_empty,
-                |value, _buffer| Ok(value),
+                deladd_serialize_add_side,
                 merge_cbo_roaring_bitmaps,
             )?;
 
@@ -193,7 +194,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 &index.word_position_docids,
                 wtxn,
                 index_is_empty,
-                |value, _buffer| Ok(value),
+                deladd_serialize_add_side,
                 merge_cbo_roaring_bitmaps,
             )?;
             is_merged_database = true;
@@ -214,7 +215,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 &index.facet_id_exists_docids,
                 wtxn,
                 index_is_empty,
-                |value, _buffer| Ok(value),
+                deladd_serialize_add_side,
                 merge_cbo_roaring_bitmaps,
             )?;
             is_merged_database = true;
@@ -225,7 +226,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 &index.facet_id_is_null_docids,
                 wtxn,
                 index_is_empty,
-                |value, _buffer| Ok(value),
+                deladd_serialize_add_side,
                 merge_cbo_roaring_bitmaps,
             )?;
             is_merged_database = true;
@@ -236,7 +237,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 &index.facet_id_is_empty_docids,
                 wtxn,
                 index_is_empty,
-                |value, _buffer| Ok(value),
+                deladd_serialize_add_side,
                 merge_cbo_roaring_bitmaps,
             )?;
             is_merged_database = true;
@@ -247,7 +248,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 &index.word_pair_proximity_docids,
                 wtxn,
                 index_is_empty,
-                |value, _buffer| Ok(value),
+                deladd_serialize_add_side,
                 merge_cbo_roaring_bitmaps,
             )?;
             is_merged_database = true;
@@ -320,7 +321,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 let found = vector.len();
                 let expected = *expected_dimensions.get_or_insert(found);
                 if expected != found {
-                    return Err(UserError::InvalidVectorDimensions { expected, found })?;
+                    return Err(UserError::InvalidVectorDimensions { expected, found }.into());
                 }
 
                 points.push(NDotProductPoint::new(vector));
@@ -398,6 +399,16 @@ fn merge_cbo_roaring_bitmaps(
     )?)
 }
 
+/// A function that extracts and returns the Add side of a DelAdd obkv.
+/// This is useful when there are no previous value in the database and
+/// therefore we don't need to do a diff with what's already there.
+///
+/// If there is no Add side we currently write an empty buffer
+/// which is a valid CboRoaringBitmap.
+fn deladd_serialize_add_side<'a>(obkv: &'a [u8], _buffer: &mut Vec<u8>) -> Result<&'a [u8]> {
+    Ok(KvReaderDelAdd::new(obkv).get(DelAdd::Addition).unwrap_or_default())
+}
+
 /// Write provided entries in database using serialize_value function.
 /// merge_values function is used if an entry already exist in the database.
 fn write_entries_into_database<R, K, V, FS, FM>(

From 560e8f56135f14e3a0be3bafccc917647bb87c98 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Thu, 19 Oct 2023 11:18:30 +0200
Subject: [PATCH 028/127] Introduce the CboRoaringBitmapCodec merge_deladd_into
 and use it

---
 .../cbo_roaring_bitmap_codec.rs               | 23 ++++++++++
 .../src/update/index_documents/typed_chunk.rs | 45 ++++++++++---------
 2 files changed, 48 insertions(+), 20 deletions(-)

diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs
index 79b52695e..117da1308 100644
--- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs
+++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs
@@ -6,6 +6,7 @@ use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
 use roaring::RoaringBitmap;
 
 use crate::heed_codec::BytesDecodeOwned;
+use crate::update::del_add::{DelAdd, KvReaderDelAdd};
 
 /// This is the limit where using a byteorder became less size efficient
 /// than using a direct roaring encoding, it is also the point where we are able
@@ -99,6 +100,28 @@ impl CboRoaringBitmapCodec {
 
         Ok(())
     }
+
+    /// Merges a DelAdd delta into a CboRoaringBitmap.
+    pub fn merge_deladd_into(
+        deladd: KvReaderDelAdd<'_>,
+        previous: &[u8],
+        buffer: &mut Vec<u8>,
+    ) -> io::Result<()> {
+        // Deserialize the bitmap that is already there
+        let mut previous = Self::deserialize_from(previous)?;
+
+        // Remove the integers we no longer want from the previous bitmap
+        if let Some(value) = deladd.get(DelAdd::Deletion) {
+            previous -= Self::deserialize_from(value)?;
+        }
+
+        // Insert the new integers we want in the previous bitmap
+        if let Some(value) = deladd.get(DelAdd::Addition) {
+            previous |= Self::deserialize_from(value)?;
+        }
+
+        previous.serialize_into(buffer)
+    }
 }
 
 impl heed::BytesDecode<'_> for CboRoaringBitmapCodec {
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index e0e2ff1ec..faeee944f 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -134,7 +134,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 deladd_serialize_add_side,
-                merge_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps,
             )?;
             is_merged_database = true;
         }
@@ -153,7 +153,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 deladd_serialize_add_side,
-                merge_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps,
             )?;
 
             let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
@@ -163,7 +163,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 deladd_serialize_add_side,
-                merge_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps,
             )?;
 
             let word_fid_docids_iter = unsafe { as_cloneable_grenad(&word_fid_docids_reader) }?;
@@ -173,7 +173,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 deladd_serialize_add_side,
-                merge_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps,
             )?;
 
             // create fst from word docids
@@ -195,7 +195,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 deladd_serialize_add_side,
-                merge_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps,
             )?;
             is_merged_database = true;
         }
@@ -216,7 +216,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 deladd_serialize_add_side,
-                merge_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps,
             )?;
             is_merged_database = true;
         }
@@ -227,7 +227,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 deladd_serialize_add_side,
-                merge_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps,
             )?;
             is_merged_database = true;
         }
@@ -238,7 +238,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 deladd_serialize_add_side,
-                merge_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps,
             )?;
             is_merged_database = true;
         }
@@ -249,7 +249,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 deladd_serialize_add_side,
-                merge_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps,
             )?;
             is_merged_database = true;
         }
@@ -388,17 +388,6 @@ fn merge_word_docids_reader_into_fst(
     Ok(builder.into_set())
 }
 
-fn merge_cbo_roaring_bitmaps(
-    new_value: &[u8],
-    db_value: &[u8],
-    buffer: &mut Vec<u8>,
-) -> Result<()> {
-    Ok(CboRoaringBitmapCodec::merge_into(
-        &[Cow::Borrowed(db_value), Cow::Borrowed(new_value)],
-        buffer,
-    )?)
-}
-
 /// A function that extracts and returns the Add side of a DelAdd obkv.
 /// This is useful when there are no previous value in the database and
 /// therefore we don't need to do a diff with what's already there.
@@ -409,6 +398,22 @@ fn deladd_serialize_add_side<'a>(obkv: &'a [u8], _buffer: &mut Vec<u8>) -> Resul
     Ok(KvReaderDelAdd::new(obkv).get(DelAdd::Addition).unwrap_or_default())
 }
 
+/// A function that merges a DelAdd of bitmaps into an already existing bitmap.
+///
+/// The first argument is the DelAdd obkv of CboRoaringBitmaps and
+/// the second one is the CboRoaringBitmap to merge into.
+fn merge_deladd_cbo_roaring_bitmaps(
+    deladd_obkv: &[u8],
+    previous: &[u8],
+    buffer: &mut Vec<u8>,
+) -> Result<()> {
+    Ok(CboRoaringBitmapCodec::merge_deladd_into(
+        KvReaderDelAdd::new(deladd_obkv),
+        previous,
+        buffer,
+    )?)
+}
+
 /// Write provided entries in database using serialize_value function.
 /// merge_values function is used if an entry already exist in the database.
 fn write_entries_into_database<R, K, V, FS, FM>(

From f67ff3a738374dc957a676707cb9f5214cb64629 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Thu, 19 Oct 2023 11:56:42 +0200
Subject: [PATCH 029/127] Facets Bulk update

---
 milli/src/update/facet/bulk.rs | 46 ++++++++++++++++++++++------------
 milli/src/update/facet/mod.rs  | 16 ++++++------
 2 files changed, 38 insertions(+), 24 deletions(-)

diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs
index a2b1c9dcd..40b64fc25 100644
--- a/milli/src/update/facet/bulk.rs
+++ b/milli/src/update/facet/bulk.rs
@@ -2,9 +2,10 @@ use std::borrow::Cow;
 use std::fs::File;
 use std::io::BufReader;
 
-use grenad::CompressionType;
+use grenad::{CompressionType, Reader};
 use heed::types::ByteSlice;
 use heed::{BytesEncode, Error, RoTxn, RwTxn};
+use obkv::KvReader;
 use roaring::RoaringBitmap;
 
 use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
@@ -13,6 +14,7 @@ use crate::heed_codec::facet::{
     FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
 };
 use crate::heed_codec::ByteSliceRefCodec;
+use crate::update::del_add::DelAdd;
 use crate::update::index_documents::{create_writer, valid_lmdb_key, writer_into_reader};
 use crate::{CboRoaringBitmapCodec, FieldId, Index, Result};
 
@@ -31,7 +33,7 @@ pub struct FacetsUpdateBulk<'i> {
     facet_type: FacetType,
     field_ids: Vec<FieldId>,
     // None if level 0 does not need to be updated
-    new_data: Option<grenad::Reader<BufReader<File>>>,
+    delta_data: Option<grenad::Reader<BufReader<File>>>,
 }
 
 impl<'i> FacetsUpdateBulk<'i> {
@@ -39,7 +41,7 @@ impl<'i> FacetsUpdateBulk<'i> {
         index: &'i Index,
         field_ids: Vec<FieldId>,
         facet_type: FacetType,
-        new_data: grenad::Reader<BufReader<File>>,
+        delta_data: grenad::Reader<BufReader<File>>,
         group_size: u8,
         min_level_size: u8,
     ) -> FacetsUpdateBulk<'i> {
@@ -49,7 +51,7 @@ impl<'i> FacetsUpdateBulk<'i> {
             group_size,
             min_level_size,
             facet_type,
-            new_data: Some(new_data),
+            delta_data: Some(delta_data),
         }
     }
 
@@ -64,13 +66,13 @@ impl<'i> FacetsUpdateBulk<'i> {
             group_size: FACET_GROUP_SIZE,
             min_level_size: FACET_MIN_LEVEL_SIZE,
             facet_type,
-            new_data: None,
+            delta_data: None,
         }
     }
 
     #[logging_timer::time("FacetsUpdateBulk::{}")]
     pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
-        let Self { index, field_ids, group_size, min_level_size, facet_type, new_data } = self;
+        let Self { index, field_ids, group_size, min_level_size, facet_type, delta_data } = self;
 
         let db = match facet_type {
             FacetType::String => index
@@ -81,7 +83,7 @@ impl<'i> FacetsUpdateBulk<'i> {
             }
         };
 
-        let inner = FacetsUpdateBulkInner { db, new_data, group_size, min_level_size };
+        let inner = FacetsUpdateBulkInner { db, delta_data, group_size, min_level_size };
 
         inner.update(wtxn, &field_ids, |wtxn, field_id, all_docids| {
             index.put_faceted_documents_ids(wtxn, field_id, facet_type, &all_docids)?;
@@ -95,7 +97,7 @@ impl<'i> FacetsUpdateBulk<'i> {
 /// Implementation of `FacetsUpdateBulk` that is independent of milli's `Index` type
 pub(crate) struct FacetsUpdateBulkInner<R: std::io::Read + std::io::Seek> {
     pub db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
-    pub new_data: Option<grenad::Reader<R>>,
+    pub delta_data: Option<grenad::Reader<R>>,
     pub group_size: u8,
     pub min_level_size: u8,
 }
@@ -134,20 +136,26 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
         Ok(())
     }
 
-    // TODO the new_data is an Reader<Obkv<Key, Obkv<DelAdd, RoaringBitmap>>>
     fn update_level0(&mut self, wtxn: &mut RwTxn) -> Result<()> {
-        let new_data = match self.new_data.take() {
+        let delta_data = match self.delta_data.take() {
             Some(x) => x,
             None => return Ok(()),
         };
         if self.db.is_empty(wtxn)? {
             let mut buffer = Vec::new();
             let mut database = self.db.iter_mut(wtxn)?.remap_types::<ByteSlice, ByteSlice>();
-            let mut cursor = new_data.into_cursor()?;
+            let mut cursor = delta_data.into_cursor()?;
             while let Some((key, value)) = cursor.move_on_next()? {
                 if !valid_lmdb_key(key) {
                     continue;
                 }
+                let value: KvReader<DelAdd> = KvReader::new(value);
+
+                // DB is empty, it is safe to ignore Del operations
+                let Some(value) = value.get(DelAdd::Addition) else {
+                    continue;
+                };
+
                 buffer.clear();
                 // the group size for level 0
                 buffer.push(1);
@@ -159,11 +167,14 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
             let mut buffer = Vec::new();
             let database = self.db.remap_types::<ByteSlice, ByteSlice>();
 
-            let mut cursor = new_data.into_cursor()?;
+            let mut cursor = delta_data.into_cursor()?;
             while let Some((key, value)) = cursor.move_on_next()? {
                 if !valid_lmdb_key(key) {
                     continue;
                 }
+
+                let value: KvReader<DelAdd> = KvReader::new(value);
+
                 // the value is a CboRoaringBitmap, but I still need to prepend the
                 // group size for level 0 (= 1) to it
                 buffer.clear();
@@ -172,12 +183,15 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
                 match database.get(wtxn, key)? {
                     Some(prev_value) => {
                         let old_bitmap = &prev_value[1..];
-                        CboRoaringBitmapCodec::merge_into(
-                            &[Cow::Borrowed(value), Cow::Borrowed(old_bitmap)],
-                            &mut buffer,
-                        )?;
+                        CboRoaringBitmapCodec::merge_deladd_into(value, old_bitmap, &mut buffer)?;
                     }
                     None => {
+                        // it is safe to ignore the del in that case.
+                        let Some(value) = value.get(DelAdd::Addition) else {
+                            // won't put the key in DB as the value would be empty
+                            continue;
+                        };
+
                         buffer.extend_from_slice(value);
                     }
                 };
diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs
index decb6a9ac..c016af354 100644
--- a/milli/src/update/facet/mod.rs
+++ b/milli/src/update/facet/mod.rs
@@ -109,7 +109,7 @@ pub struct FacetsUpdate<'i> {
     index: &'i Index,
     database: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
     facet_type: FacetType,
-    new_data: grenad::Reader<BufReader<File>>,
+    delta_data: grenad::Reader<BufReader<File>>,
     group_size: u8,
     max_group_size: u8,
     min_level_size: u8,
@@ -119,7 +119,7 @@ impl<'i> FacetsUpdate<'i> {
     pub fn new(
         index: &'i Index,
         facet_type: FacetType,
-        new_data: grenad::Reader<BufReader<File>>,
+        delta_data: grenad::Reader<BufReader<File>>,
     ) -> Self {
         let database = match facet_type {
             FacetType::String => index
@@ -136,26 +136,26 @@ impl<'i> FacetsUpdate<'i> {
             max_group_size: FACET_MAX_GROUP_SIZE,
             min_level_size: FACET_MIN_LEVEL_SIZE,
             facet_type,
-            new_data,
+            delta_data,
         }
     }
 
     pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
-        if self.new_data.is_empty() {
+        if self.delta_data.is_empty() {
             return Ok(());
         }
         debug!("Computing and writing the facet values levels docids into LMDB on disk...");
         self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
 
         // See self::comparison_bench::benchmark_facet_indexing
-        if self.new_data.len() >= (self.database.len(wtxn)? as u64 / 50) {
+        if self.delta_data.len() >= (self.database.len(wtxn)? as u64 / 50) {
             let field_ids =
                 self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::<Vec<_>>();
             let bulk_update = FacetsUpdateBulk::new(
                 self.index,
                 field_ids,
                 self.facet_type,
-                self.new_data,
+                self.delta_data,
                 self.group_size,
                 self.min_level_size,
             );
@@ -164,7 +164,7 @@ impl<'i> FacetsUpdate<'i> {
             let incremental_update = FacetsUpdateIncremental::new(
                 self.index,
                 self.facet_type,
-                self.new_data,
+                self.delta_data,
                 self.group_size,
                 self.min_level_size,
                 self.max_group_size,
@@ -464,7 +464,7 @@ pub(crate) mod test_helpers {
 
             let update = FacetsUpdateBulkInner {
                 db: self.content,
-                new_data: Some(reader),
+                delta_data: Some(reader),
                 group_size: self.group_size.get(),
                 min_level_size: self.min_level_size.get(),
             };

From 04ec293024191deabe4725427230f78f01af72af Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Thu, 19 Oct 2023 12:01:12 +0200
Subject: [PATCH 030/127] Facet Incremental update

---
 milli/src/update/facet/incremental.rs | 77 +++++++++++++++++++++++----
 1 file changed, 66 insertions(+), 11 deletions(-)

diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs
index 743c0b038..802c02b85 100644
--- a/milli/src/update/facet/incremental.rs
+++ b/milli/src/update/facet/incremental.rs
@@ -4,6 +4,7 @@ use std::io::BufReader;
 
 use heed::types::{ByteSlice, DecodeIgnore};
 use heed::{BytesDecode, Error, RoTxn, RwTxn};
+use obkv::KvReader;
 use roaring::RoaringBitmap;
 
 use crate::facet::FacetType;
@@ -12,6 +13,7 @@ use crate::heed_codec::facet::{
 };
 use crate::heed_codec::ByteSliceRefCodec;
 use crate::search::facet::get_highest_level;
+use crate::update::del_add::DelAdd;
 use crate::update::index_documents::valid_lmdb_key;
 use crate::{CboRoaringBitmapCodec, FieldId, Index, Result};
 
@@ -35,14 +37,14 @@ pub struct FacetsUpdateIncremental<'i> {
     index: &'i Index,
     inner: FacetsUpdateIncrementalInner,
     facet_type: FacetType,
-    new_data: grenad::Reader<BufReader<File>>,
+    delta_data: grenad::Reader<BufReader<File>>,
 }
 
 impl<'i> FacetsUpdateIncremental<'i> {
     pub fn new(
         index: &'i Index,
         facet_type: FacetType,
-        new_data: grenad::Reader<BufReader<File>>,
+        delta_data: grenad::Reader<BufReader<File>>,
         group_size: u8,
         min_level_size: u8,
         max_group_size: u8,
@@ -63,29 +65,82 @@ impl<'i> FacetsUpdateIncremental<'i> {
                 min_level_size,
             },
             facet_type,
-            new_data,
+            delta_data,
         }
     }
 
     pub fn execute(self, wtxn: &'i mut RwTxn) -> crate::Result<()> {
-        let mut new_faceted_docids = HashMap::<FieldId, RoaringBitmap>::default();
+        #[derive(Default)]
+        struct DeltaDocids {
+            deleted: RoaringBitmap,
+            added: RoaringBitmap,
+        }
+        impl DeltaDocids {
+            fn add(&mut self, added: &RoaringBitmap) {
+                self.deleted -= added;
+                self.added |= added;
+            }
+            fn delete(&mut self, deleted: &RoaringBitmap) {
+                self.deleted |= deleted;
+                self.added -= deleted;
+            }
+            fn applied(self, mut docids: RoaringBitmap) -> RoaringBitmap {
+                docids -= self.deleted;
+                docids |= self.added;
+                docids
+            }
+        }
 
-        let mut cursor = self.new_data.into_cursor()?;
+        let mut new_faceted_docids = HashMap::<FieldId, DeltaDocids>::default();
+
+        let mut cursor = self.delta_data.into_cursor()?;
         while let Some((key, value)) = cursor.move_on_next()? {
             if !valid_lmdb_key(key) {
                 continue;
             }
             let key = FacetGroupKeyCodec::<ByteSliceRefCodec>::bytes_decode(key)
                 .ok_or(heed::Error::Encoding)?;
-            let docids = CboRoaringBitmapCodec::bytes_decode(value).ok_or(heed::Error::Encoding)?;
-            self.inner.insert(wtxn, key.field_id, key.left_bound, &docids)?;
-            *new_faceted_docids.entry(key.field_id).or_default() |= docids;
+            let value = KvReader::new(value);
+
+            let entry = new_faceted_docids.entry(key.field_id).or_default();
+
+            let docids_to_delete = value
+                .get(DelAdd::Deletion)
+                .map(CboRoaringBitmapCodec::bytes_decode)
+                .map(|o| o.ok_or(heed::Error::Encoding));
+
+            let docids_to_add = value
+                .get(DelAdd::Addition)
+                .map(CboRoaringBitmapCodec::bytes_decode)
+                .map(|o| o.ok_or(heed::Error::Encoding));
+
+            if let Some(docids_to_delete) = docids_to_delete {
+                let docids_to_delete = docids_to_delete?;
+                self.inner.delete(wtxn, key.field_id, key.left_bound, &docids_to_delete)?;
+                entry.delete(&docids_to_delete);
+            }
+
+            if let Some(docids_to_add) = docids_to_add {
+                let docids_to_add = docids_to_add?;
+                self.inner.insert(wtxn, key.field_id, key.left_bound, &docids_to_add)?;
+                entry.add(&docids_to_add);
+            }
         }
 
+        // FIXME: broken for multi-value facets?
+        //
+        // Consider an incremental update: `facet="tags", facet_value="Action", {Del: Some([0, 1]), Add: None }`
+        // The current code will inconditionally remove docs 0 and 1 from faceted docs for "tags".
+        // Now for doc 0: `"tags": "Action"`, it's correct behavior
+        // for doc 1: `"tags": "Action, Adventure"`, it's incorrect behavior
         for (field_id, new_docids) in new_faceted_docids {
-            let mut docids = self.index.faceted_documents_ids(wtxn, field_id, self.facet_type)?;
-            docids |= new_docids;
-            self.index.put_faceted_documents_ids(wtxn, field_id, self.facet_type, &docids)?;
+            let old_docids = self.index.faceted_documents_ids(wtxn, field_id, self.facet_type)?;
+            self.index.put_faceted_documents_ids(
+                wtxn,
+                field_id,
+                self.facet_type,
+                &new_docids.applied(old_docids),
+            )?;
         }
         Ok(())
     }

From 14832cb32414b28b10ac778bcf4794fccc748b67 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Mon, 23 Oct 2023 14:50:11 +0200
Subject: [PATCH 031/127] Remove Index::faceted_documents_ids

---
 milli/src/index.rs                      | 40 ----------------------
 milli/src/snapshot_tests.rs             | 30 -----------------
 milli/src/update/clear_documents.rs     | 16 ---------
 milli/src/update/delete_documents.rs    |  6 ----
 milli/src/update/facet/bulk.rs          |  6 +---
 milli/src/update/facet/delete.rs        | 11 ------
 milli/src/update/facet/incremental.rs   | 45 -------------------------
 milli/src/update/facet/mod.rs           |  3 --
 milli/src/update/index_documents/mod.rs | 18 ----------
 9 files changed, 1 insertion(+), 174 deletions(-)

diff --git a/milli/src/index.rs b/milli/src/index.rs
index 288223a95..f8be55545 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -55,7 +55,6 @@ pub mod main_key {
     /// e.g. vector-hnsw0x0032.
     pub const VECTOR_HNSW_KEY_PREFIX: &str = "vector-hnsw";
     pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids";
-    pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids";
     pub const PRIMARY_KEY_KEY: &str = "primary-key";
     pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields";
     pub const USER_DEFINED_SEARCHABLE_FIELDS_KEY: &str = "user-defined-searchable-fields";
@@ -64,7 +63,6 @@ pub mod main_key {
     pub const NON_SEPARATOR_TOKENS_KEY: &str = "non-separator-tokens";
     pub const SEPARATOR_TOKENS_KEY: &str = "separator-tokens";
     pub const DICTIONARY_KEY: &str = "dictionary";
-    pub const STRING_FACETED_DOCUMENTS_IDS_PREFIX: &str = "string-faceted-documents-ids";
     pub const SYNONYMS_KEY: &str = "synonyms";
     pub const USER_DEFINED_SYNONYMS_KEY: &str = "user-defined-synonyms";
     pub const WORDS_FST_KEY: &str = "words-fst";
@@ -926,44 +924,6 @@ impl Index {
 
     /* faceted documents ids */
 
-    /// Writes the documents ids that are faceted under this field id for the given facet type.
-    pub fn put_faceted_documents_ids(
-        &self,
-        wtxn: &mut RwTxn,
-        field_id: FieldId,
-        facet_type: FacetType,
-        docids: &RoaringBitmap,
-    ) -> heed::Result<()> {
-        let key = match facet_type {
-            FacetType::String => main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX,
-            FacetType::Number => main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX,
-        };
-        let mut buffer = vec![0u8; key.len() + size_of::<FieldId>()];
-        buffer[..key.len()].copy_from_slice(key.as_bytes());
-        buffer[key.len()..].copy_from_slice(&field_id.to_be_bytes());
-        self.main.put::<_, ByteSlice, RoaringBitmapCodec>(wtxn, &buffer, docids)
-    }
-
-    /// Retrieve all the documents ids that are faceted under this field id for the given facet type.
-    pub fn faceted_documents_ids(
-        &self,
-        rtxn: &RoTxn,
-        field_id: FieldId,
-        facet_type: FacetType,
-    ) -> heed::Result<RoaringBitmap> {
-        let key = match facet_type {
-            FacetType::String => main_key::STRING_FACETED_DOCUMENTS_IDS_PREFIX,
-            FacetType::Number => main_key::NUMBER_FACETED_DOCUMENTS_IDS_PREFIX,
-        };
-        let mut buffer = vec![0u8; key.len() + size_of::<FieldId>()];
-        buffer[..key.len()].copy_from_slice(key.as_bytes());
-        buffer[key.len()..].copy_from_slice(&field_id.to_be_bytes());
-        match self.main.get::<_, ByteSlice, RoaringBitmapCodec>(rtxn, &buffer)? {
-            Some(docids) => Ok(docids),
-            None => Ok(RoaringBitmap::new()),
-        }
-    }
-
     /// Retrieve all the documents which contain this field id set as null
     pub fn null_faceted_documents_ids(
         &self,
diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs
index 158f515b8..4b21cc175 100644
--- a/milli/src/snapshot_tests.rs
+++ b/milli/src/snapshot_tests.rs
@@ -359,31 +359,7 @@ pub fn snap_external_documents_ids(index: &Index) -> String {
 
     snap
 }
-pub fn snap_number_faceted_documents_ids(index: &Index) -> String {
-    let rtxn = index.read_txn().unwrap();
-    let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
-    let mut snap = String::new();
-    for field_id in fields_ids_map.ids() {
-        let number_faceted_documents_ids =
-            index.faceted_documents_ids(&rtxn, field_id, FacetType::Number).unwrap();
-        writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&number_faceted_documents_ids))
-            .unwrap();
-    }
-    snap
-}
-pub fn snap_string_faceted_documents_ids(index: &Index) -> String {
-    let rtxn = index.read_txn().unwrap();
-    let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
 
-    let mut snap = String::new();
-    for field_id in fields_ids_map.ids() {
-        let string_faceted_documents_ids =
-            index.faceted_documents_ids(&rtxn, field_id, FacetType::String).unwrap();
-        writeln!(&mut snap, "{field_id:<3} {}", display_bitmap(&string_faceted_documents_ids))
-            .unwrap();
-    }
-    snap
-}
 pub fn snap_words_fst(index: &Index) -> String {
     let rtxn = index.read_txn().unwrap();
     let words_fst = index.words_fst(&rtxn).unwrap();
@@ -531,12 +507,6 @@ macro_rules! full_snap_of_db {
     ($index:ident, external_documents_ids) => {{
         $crate::snapshot_tests::snap_external_documents_ids(&$index)
     }};
-    ($index:ident, number_faceted_documents_ids) => {{
-        $crate::snapshot_tests::snap_number_faceted_documents_ids(&$index)
-    }};
-    ($index:ident, string_faceted_documents_ids) => {{
-        $crate::snapshot_tests::snap_string_faceted_documents_ids(&$index)
-    }};
     ($index:ident, words_fst) => {{
         $crate::snapshot_tests::snap_words_fst(&$index)
     }};
diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs
index ab42fd854..52f3e80db 100644
--- a/milli/src/update/clear_documents.rs
+++ b/milli/src/update/clear_documents.rs
@@ -64,22 +64,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
         self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
         self.index.delete_vector_hnsw(self.wtxn)?;
 
-        // We clean all the faceted documents ids.
-        for field_id in faceted_fields {
-            self.index.put_faceted_documents_ids(
-                self.wtxn,
-                field_id,
-                FacetType::Number,
-                &empty_roaring,
-            )?;
-            self.index.put_faceted_documents_ids(
-                self.wtxn,
-                field_id,
-                FacetType::String,
-                &empty_roaring,
-            )?;
-        }
-
         // Clear the other databases.
         word_docids.clear(self.wtxn)?;
         exact_word_docids.clear(self.wtxn)?;
diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs
index 1fef922cd..9044f03be 100644
--- a/milli/src/update/delete_documents.rs
+++ b/milli/src/update/delete_documents.rs
@@ -384,12 +384,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
         for facet_type in [FacetType::Number, FacetType::String] {
             let mut affected_facet_values = HashMap::new();
             for field_id in self.index.faceted_fields_ids(self.wtxn)? {
-                // Remove docids from the number faceted documents ids
-                let mut docids =
-                    self.index.faceted_documents_ids(self.wtxn, field_id, facet_type)?;
-                docids -= &self.to_delete_docids;
-                self.index.put_faceted_documents_ids(self.wtxn, field_id, facet_type, &docids)?;
-
                 let facet_values = remove_docids_from_field_id_docid_facet_value(
                     self.index,
                     self.wtxn,
diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs
index 40b64fc25..5247298a4 100644
--- a/milli/src/update/facet/bulk.rs
+++ b/milli/src/update/facet/bulk.rs
@@ -23,9 +23,6 @@ use crate::{CboRoaringBitmapCodec, FieldId, Index, Result};
 ///
 /// First, the new elements are inserted into the level 0 of the database. Then, the
 /// higher levels are cleared and recomputed from the content of level 0.
-///
-/// Finally, the `faceted_documents_ids` value in the main database of `Index`
-/// is updated to contain the new set of faceted documents.
 pub struct FacetsUpdateBulk<'i> {
     index: &'i Index,
     group_size: u8,
@@ -86,7 +83,7 @@ impl<'i> FacetsUpdateBulk<'i> {
         let inner = FacetsUpdateBulkInner { db, delta_data, group_size, min_level_size };
 
         inner.update(wtxn, &field_ids, |wtxn, field_id, all_docids| {
-            index.put_faceted_documents_ids(wtxn, field_id, facet_type, &all_docids)?;
+            // TODO: remove the lambda altogether
             Ok(())
         })?;
 
@@ -507,7 +504,6 @@ mod tests {
         index.add_documents(documents).unwrap();
 
         db_snap!(index, facet_id_f64_docids, "initial", @"c34f499261f3510d862fa0283bbe843a");
-        db_snap!(index, number_faceted_documents_ids, "initial", @"01594fecbb316798ce3651d6730a4521");
     }
 
     #[test]
diff --git a/milli/src/update/facet/delete.rs b/milli/src/update/facet/delete.rs
index 883abc8ca..8bd3f196b 100644
--- a/milli/src/update/facet/delete.rs
+++ b/milli/src/update/facet/delete.rs
@@ -160,7 +160,6 @@ mod tests {
         index.add_documents(documents).unwrap();
 
         db_snap!(index, facet_id_f64_docids, 1, @"550cd138d6fe31ccdd42cd5392fbd576");
-        db_snap!(index, number_faceted_documents_ids, 1, @"9a0ea88e7c9dcf6dc0ef0b601736ffcf");
 
         let mut wtxn = index.env.write_txn().unwrap();
 
@@ -178,7 +177,6 @@ mod tests {
 
         db_snap!(index, soft_deleted_documents_ids, @"[]");
         db_snap!(index, facet_id_f64_docids, 2, @"d4d5f14e7f1e1f09b86821a0b6defcc6");
-        db_snap!(index, number_faceted_documents_ids, 2, @"3570e0ac0fdb21be9ebe433f59264b56");
     }
 
     // Same test as above but working with string values for the facets
@@ -219,7 +217,6 @@ mod tests {
 
         // Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022)
         db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503");
-        db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5");
 
         let mut wtxn = index.env.write_txn().unwrap();
 
@@ -237,7 +234,6 @@ mod tests {
 
         db_snap!(index, soft_deleted_documents_ids, @"[]");
         db_snap!(index, facet_id_string_docids, 2, @"7f9c00b29e04d58c1821202a5dda0ebc");
-        db_snap!(index, string_faceted_documents_ids, 2, @"504152afa5c94fd4e515dcdfa4c7161f");
     }
 
     #[test]
@@ -274,7 +270,6 @@ mod tests {
 
         // Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022)
         db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503");
-        db_snap!(index, string_faceted_documents_ids, 1, @"54bc15494fa81d93339f43c08fd9d8f5");
 
         let mut rng = rand::rngs::SmallRng::from_seed([0; 32]);
 
@@ -291,12 +286,6 @@ mod tests {
 
         db_snap!(index, soft_deleted_documents_ids, @"[]");
         db_snap!(index, facet_id_string_docids, 2, @"ece56086e76d50e661fb2b58475b9f7d");
-        db_snap!(index, string_faceted_documents_ids, 2, @r###"
-        0   []
-        1   [11, 20, 73, 292, 324, 358, 381, 493, 839, 852, ]
-        2   [292, 324, 358, 381, 493, 839, 852, ]
-        3   [11, 20, 73, 292, 324, 358, 381, 493, 839, 852, ]
-        "###);
     }
 }
 
diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs
index 802c02b85..77e9874f6 100644
--- a/milli/src/update/facet/incremental.rs
+++ b/milli/src/update/facet/incremental.rs
@@ -30,9 +30,6 @@ enum DeletionResult {
 
 /// Algorithm to incrementally insert and delete elememts into the
 /// `facet_id_(string/f64)_docids` databases.
-///
-/// Rhe `faceted_documents_ids` value in the main database of `Index`
-/// is also updated to contain the new set of faceted documents.
 pub struct FacetsUpdateIncremental<'i> {
     index: &'i Index,
     inner: FacetsUpdateIncrementalInner,
@@ -70,29 +67,6 @@ impl<'i> FacetsUpdateIncremental<'i> {
     }
 
     pub fn execute(self, wtxn: &'i mut RwTxn) -> crate::Result<()> {
-        #[derive(Default)]
-        struct DeltaDocids {
-            deleted: RoaringBitmap,
-            added: RoaringBitmap,
-        }
-        impl DeltaDocids {
-            fn add(&mut self, added: &RoaringBitmap) {
-                self.deleted -= added;
-                self.added |= added;
-            }
-            fn delete(&mut self, deleted: &RoaringBitmap) {
-                self.deleted |= deleted;
-                self.added -= deleted;
-            }
-            fn applied(self, mut docids: RoaringBitmap) -> RoaringBitmap {
-                docids -= self.deleted;
-                docids |= self.added;
-                docids
-            }
-        }
-
-        let mut new_faceted_docids = HashMap::<FieldId, DeltaDocids>::default();
-
         let mut cursor = self.delta_data.into_cursor()?;
         while let Some((key, value)) = cursor.move_on_next()? {
             if !valid_lmdb_key(key) {
@@ -102,8 +76,6 @@ impl<'i> FacetsUpdateIncremental<'i> {
                 .ok_or(heed::Error::Encoding)?;
             let value = KvReader::new(value);
 
-            let entry = new_faceted_docids.entry(key.field_id).or_default();
-
             let docids_to_delete = value
                 .get(DelAdd::Deletion)
                 .map(CboRoaringBitmapCodec::bytes_decode)
@@ -117,31 +89,14 @@ impl<'i> FacetsUpdateIncremental<'i> {
             if let Some(docids_to_delete) = docids_to_delete {
                 let docids_to_delete = docids_to_delete?;
                 self.inner.delete(wtxn, key.field_id, key.left_bound, &docids_to_delete)?;
-                entry.delete(&docids_to_delete);
             }
 
             if let Some(docids_to_add) = docids_to_add {
                 let docids_to_add = docids_to_add?;
                 self.inner.insert(wtxn, key.field_id, key.left_bound, &docids_to_add)?;
-                entry.add(&docids_to_add);
             }
         }
 
-        // FIXME: broken for multi-value facets?
-        //
-        // Consider an incremental update: `facet="tags", facet_value="Action", {Del: Some([0, 1]), Add: None }`
-        // The current code will inconditionally remove docs 0 and 1 from faceted docs for "tags".
-        // Now for doc 0: `"tags": "Action"`, it's correct behavior
-        // for doc 1: `"tags": "Action, Adventure"`, it's incorrect behavior
-        for (field_id, new_docids) in new_faceted_docids {
-            let old_docids = self.index.faceted_documents_ids(wtxn, field_id, self.facet_type)?;
-            self.index.put_faceted_documents_ids(
-                wtxn,
-                field_id,
-                self.facet_type,
-                &new_docids.applied(old_docids),
-            )?;
-        }
         Ok(())
     }
 }
diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs
index c016af354..e3c632983 100644
--- a/milli/src/update/facet/mod.rs
+++ b/milli/src/update/facet/mod.rs
@@ -599,7 +599,6 @@ mod tests {
         index.add_documents(documents).unwrap();
 
         db_snap!(index, facet_id_f64_docids, "initial", @"777e0e221d778764b472c512617eeb3b");
-        db_snap!(index, number_faceted_documents_ids, "initial", @"bd916ef32b05fd5c3c4c518708f431a9");
         db_snap!(index, soft_deleted_documents_ids, "initial", @"[]");
 
         let mut documents = vec![];
@@ -622,7 +621,6 @@ mod tests {
         index.add_documents(documents).unwrap();
 
         db_snap!(index, facet_id_f64_docids, "replaced_1_soft", @"abba175d7bed727d0efadaef85a4388f");
-        db_snap!(index, number_faceted_documents_ids, "replaced_1_soft", @"de76488bd05ad94c6452d725acf1bd06");
         db_snap!(index, soft_deleted_documents_ids, "replaced_1_soft", @"6c975deb900f286d2f6456d2d5c3a123");
 
         // Then replace the last document while disabling soft_deletion
@@ -647,7 +645,6 @@ mod tests {
         index.add_documents(documents).unwrap();
 
         db_snap!(index, facet_id_f64_docids, "replaced_2_hard", @"029e27a46d09c574ae949aa4289b45e6");
-        db_snap!(index, number_faceted_documents_ids, "replaced_2_hard", @"60b19824f136affe6b240a7200779028");
         db_snap!(index, soft_deleted_documents_ids, "replaced_2_hard", @"[]");
     }
 }
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 703d7ee29..27021c3fb 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -1499,12 +1499,6 @@ mod tests {
         3   2    second       second
         3   3    third        third
         "###);
-        db_snap!(index, string_faceted_documents_ids, @r###"
-        0   []
-        1   []
-        2   []
-        3   [0, 1, 2, 3, ]
-        "###);
 
         let rtxn = index.read_txn().unwrap();
 
@@ -1528,12 +1522,6 @@ mod tests {
 
         db_snap!(index, facet_id_string_docids, @"");
         db_snap!(index, field_id_docid_facet_strings, @"");
-        db_snap!(index, string_faceted_documents_ids, @r###"
-        0   []
-        1   []
-        2   []
-        3   [0, 1, 2, 3, ]
-        "###);
 
         let rtxn = index.read_txn().unwrap();
 
@@ -1560,12 +1548,6 @@ mod tests {
         3   2    second       second
         3   3    third        third
         "###);
-        db_snap!(index, string_faceted_documents_ids, @r###"
-        0   []
-        1   []
-        2   []
-        3   [0, 1, 2, 3, ]
-        "###);
 
         let rtxn = index.read_txn().unwrap();
 

From 59f88c14b3087eb78324d36e679bb4f64799f277 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Mon, 23 Oct 2023 15:19:33 +0200
Subject: [PATCH 032/127] Simplify facet update after removing
 `Index::faceted_documents_ids`

---
 milli/src/index.rs                            |  2 --
 milli/src/update/clear_documents.rs           |  2 --
 milli/src/update/facet/bulk.rs                | 31 +++++--------------
 milli/src/update/facet/incremental.rs         | 15 +++------
 milli/src/update/facet/mod.rs                 |  1 -
 .../src/update/index_documents/typed_chunk.rs |  5 +--
 6 files changed, 13 insertions(+), 43 deletions(-)

diff --git a/milli/src/index.rs b/milli/src/index.rs
index f8be55545..eb9e153ec 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -1,7 +1,6 @@
 use std::borrow::Cow;
 use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
 use std::fs::File;
-use std::mem::size_of;
 use std::path::Path;
 
 use charabia::{Language, Script};
@@ -14,7 +13,6 @@ use time::OffsetDateTime;
 
 use crate::distance::NDotProductPoint;
 use crate::error::{InternalError, UserError};
-use crate::facet::FacetType;
 use crate::fields_ids_map::FieldsIdsMap;
 use crate::heed_codec::facet::{
     FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs
index 52f3e80db..3eb7e0910 100644
--- a/milli/src/update/clear_documents.rs
+++ b/milli/src/update/clear_documents.rs
@@ -1,7 +1,6 @@
 use roaring::RoaringBitmap;
 use time::OffsetDateTime;
 
-use crate::facet::FacetType;
 use crate::{ExternalDocumentsIds, FieldDistribution, Index, Result};
 
 pub struct ClearDocuments<'t, 'u, 'i> {
@@ -51,7 +50,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
 
         // We retrieve the number of documents ids that we are deleting.
         let number_of_documents = self.index.number_of_documents(self.wtxn)?;
-        let faceted_fields = self.index.faceted_fields_ids(self.wtxn)?;
 
         // We clean some of the main engine datastructures.
         self.index.put_words_fst(self.wtxn, &fst::Set::default())?;
diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs
index 5247298a4..d2205f9d6 100644
--- a/milli/src/update/facet/bulk.rs
+++ b/milli/src/update/facet/bulk.rs
@@ -1,8 +1,7 @@
-use std::borrow::Cow;
 use std::fs::File;
 use std::io::BufReader;
 
-use grenad::{CompressionType, Reader};
+use grenad::CompressionType;
 use heed::types::ByteSlice;
 use heed::{BytesEncode, Error, RoTxn, RwTxn};
 use obkv::KvReader;
@@ -82,10 +81,7 @@ impl<'i> FacetsUpdateBulk<'i> {
 
         let inner = FacetsUpdateBulkInner { db, delta_data, group_size, min_level_size };
 
-        inner.update(wtxn, &field_ids, |wtxn, field_id, all_docids| {
-            // TODO: remove the lambda altogether
-            Ok(())
-        })?;
+        inner.update(wtxn, &field_ids)?;
 
         Ok(())
     }
@@ -99,21 +95,14 @@ pub(crate) struct FacetsUpdateBulkInner<R: std::io::Read + std::io::Seek> {
     pub min_level_size: u8,
 }
 impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
-    pub fn update(
-        mut self,
-        wtxn: &mut RwTxn,
-        field_ids: &[u16],
-        mut handle_all_docids: impl FnMut(&mut RwTxn, FieldId, RoaringBitmap) -> Result<()>,
-    ) -> Result<()> {
+    pub fn update(mut self, wtxn: &mut RwTxn, field_ids: &[u16]) -> Result<()> {
         self.update_level0(wtxn)?;
         for &field_id in field_ids.iter() {
             self.clear_levels(wtxn, field_id)?;
         }
 
         for &field_id in field_ids.iter() {
-            let (level_readers, all_docids) = self.compute_levels_for_field_id(field_id, wtxn)?;
-
-            handle_all_docids(wtxn, field_id, all_docids)?;
+            let level_readers = self.compute_levels_for_field_id(field_id, wtxn)?;
 
             for level_reader in level_readers {
                 let mut cursor = level_reader.into_cursor()?;
@@ -201,16 +190,10 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
         &self,
         field_id: FieldId,
         txn: &RoTxn,
-    ) -> Result<(Vec<grenad::Reader<BufReader<File>>>, RoaringBitmap)> {
-        let mut all_docids = RoaringBitmap::new();
-        let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |bitmaps, _| {
-            for bitmap in bitmaps {
-                all_docids |= bitmap;
-            }
-            Ok(())
-        })?;
+    ) -> Result<Vec<grenad::Reader<BufReader<File>>>> {
+        let subwriters = self.compute_higher_levels(txn, field_id, 32, &mut |_, _| Ok(()))?;
 
-        Ok((subwriters, all_docids))
+        Ok(subwriters)
     }
     #[allow(clippy::type_complexity)]
     fn read_level_0<'t>(
diff --git a/milli/src/update/facet/incremental.rs b/milli/src/update/facet/incremental.rs
index 77e9874f6..e241c499c 100644
--- a/milli/src/update/facet/incremental.rs
+++ b/milli/src/update/facet/incremental.rs
@@ -1,4 +1,3 @@
-use std::collections::HashMap;
 use std::fs::File;
 use std::io::BufReader;
 
@@ -15,7 +14,7 @@ use crate::heed_codec::ByteSliceRefCodec;
 use crate::search::facet::get_highest_level;
 use crate::update::del_add::DelAdd;
 use crate::update::index_documents::valid_lmdb_key;
-use crate::{CboRoaringBitmapCodec, FieldId, Index, Result};
+use crate::{CboRoaringBitmapCodec, Index, Result};
 
 enum InsertionResult {
     InPlace,
@@ -30,16 +29,14 @@ enum DeletionResult {
 
 /// Algorithm to incrementally insert and delete elememts into the
 /// `facet_id_(string/f64)_docids` databases.
-pub struct FacetsUpdateIncremental<'i> {
-    index: &'i Index,
+pub struct FacetsUpdateIncremental {
     inner: FacetsUpdateIncrementalInner,
-    facet_type: FacetType,
     delta_data: grenad::Reader<BufReader<File>>,
 }
 
-impl<'i> FacetsUpdateIncremental<'i> {
+impl FacetsUpdateIncremental {
     pub fn new(
-        index: &'i Index,
+        index: &Index,
         facet_type: FacetType,
         delta_data: grenad::Reader<BufReader<File>>,
         group_size: u8,
@@ -47,7 +44,6 @@ impl<'i> FacetsUpdateIncremental<'i> {
         max_group_size: u8,
     ) -> Self {
         FacetsUpdateIncremental {
-            index,
             inner: FacetsUpdateIncrementalInner {
                 db: match facet_type {
                     FacetType::String => index
@@ -61,12 +57,11 @@ impl<'i> FacetsUpdateIncremental<'i> {
                 max_group_size,
                 min_level_size,
             },
-            facet_type,
             delta_data,
         }
     }
 
-    pub fn execute(self, wtxn: &'i mut RwTxn) -> crate::Result<()> {
+    pub fn execute(self, wtxn: &mut RwTxn) -> crate::Result<()> {
         let mut cursor = self.delta_data.into_cursor()?;
         while let Some((key, value)) = cursor.move_on_next()? {
             if !valid_lmdb_key(key) {
diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs
index e3c632983..3465e5437 100644
--- a/milli/src/update/facet/mod.rs
+++ b/milli/src/update/facet/mod.rs
@@ -115,7 +115,6 @@ pub struct FacetsUpdate<'i> {
     min_level_size: u8,
 }
 impl<'i> FacetsUpdate<'i> {
-    // TODO grenad::Reader<Key, Obkv<DelAdd, RoaringBitmap>>
     pub fn new(
         index: &'i Index,
         facet_type: FacetType,
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index faeee944f..0d618ad28 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -1,4 +1,3 @@
-use std::borrow::Cow;
 use std::collections::HashMap;
 use std::convert::TryInto;
 use std::fs::File;
@@ -11,9 +10,7 @@ use heed::types::ByteSlice;
 use heed::RwTxn;
 use roaring::RoaringBitmap;
 
-use super::helpers::{
-    self, merge_ignore_values, serialize_roaring_bitmap, valid_lmdb_key, CursorClonableMmap,
-};
+use super::helpers::{self, merge_ignore_values, valid_lmdb_key, CursorClonableMmap};
 use super::{ClonableMmap, MergeFn};
 use crate::distance::NDotProductPoint;
 use crate::error::UserError;

From 66abac9364265da9896a33c24bc76249bb063bcb Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Mon, 23 Oct 2023 15:55:35 +0200
Subject: [PATCH 033/127] Use specialized `KvReaderDelAdd` type
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Clément Renault <clement@meilisearch.com>
---
 milli/src/update/facet/bulk.rs | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs
index d2205f9d6..297d189cd 100644
--- a/milli/src/update/facet/bulk.rs
+++ b/milli/src/update/facet/bulk.rs
@@ -4,7 +4,6 @@ use std::io::BufReader;
 use grenad::CompressionType;
 use heed::types::ByteSlice;
 use heed::{BytesEncode, Error, RoTxn, RwTxn};
-use obkv::KvReader;
 use roaring::RoaringBitmap;
 
 use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
@@ -13,7 +12,7 @@ use crate::heed_codec::facet::{
     FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
 };
 use crate::heed_codec::ByteSliceRefCodec;
-use crate::update::del_add::DelAdd;
+use crate::update::del_add::{DelAdd, KvReaderDelAdd};
 use crate::update::index_documents::{create_writer, valid_lmdb_key, writer_into_reader};
 use crate::{CboRoaringBitmapCodec, FieldId, Index, Result};
 
@@ -135,7 +134,7 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
                 if !valid_lmdb_key(key) {
                     continue;
                 }
-                let value: KvReader<DelAdd> = KvReader::new(value);
+                let value = KvReaderDelAdd::new(value);
 
                 // DB is empty, it is safe to ignore Del operations
                 let Some(value) = value.get(DelAdd::Addition) else {
@@ -159,7 +158,7 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
                     continue;
                 }
 
-                let value: KvReader<DelAdd> = KvReader::new(value);
+                let value = KvReaderDelAdd::new(value);
 
                 // the value is a CboRoaringBitmap, but I still need to prepend the
                 // group size for level 0 (= 1) to it

From b26dc9aabe774812dedffb96c3efeb0dfd4252dc Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Mon, 23 Oct 2023 16:06:06 +0200
Subject: [PATCH 034/127] Explanatory code comment

---
 milli/src/update/facet/bulk.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs
index 297d189cd..c0b159e57 100644
--- a/milli/src/update/facet/bulk.rs
+++ b/milli/src/update/facet/bulk.rs
@@ -167,6 +167,7 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
                 // then we extend the buffer with the docids bitmap
                 match database.get(wtxn, key)? {
                     Some(prev_value) => {
+                        // prev_value is the group size for level 0, followed by the previous bitmap.
                         let old_bitmap = &prev_value[1..];
                         CboRoaringBitmapCodec::merge_deladd_into(value, old_bitmap, &mut buffer)?;
                     }

From ba90a5ec0eb11ab99ab933f7fb65930bec93cc6d Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Mon, 23 Oct 2023 16:34:49 +0200
Subject: [PATCH 035/127] update extract fid word count docids

---
 .../extract/extract_fid_word_count_docids.rs  | 51 ++++++++++++++++---
 1 file changed, 43 insertions(+), 8 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs
index 289a744da..accf4a510 100644
--- a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs
@@ -4,11 +4,12 @@ use std::io::{self, BufReader};
 use obkv::KvReaderU16;
 
 use super::helpers::{
-    create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at,
+    create_sorter, merge_deladd_cbo_roaring_bitmaps, sorter_into_reader, try_split_array_at,
     GrenadParameters,
 };
 use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
+use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
 use crate::Result;
 
 const MAX_COUNTED_WORDS: usize = 30;
@@ -29,7 +30,7 @@ pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
 
     let mut fid_word_count_docids_sorter = create_sorter(
         grenad::SortAlgorithm::Unstable,
-        merge_cbo_roaring_bitmaps,
+        merge_deladd_cbo_roaring_bitmaps,
         indexer.chunk_compression_type,
         indexer.chunk_compression_level,
         indexer.max_nb_chunks,
@@ -37,18 +38,52 @@ pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
     );
 
     let mut key_buffer = Vec::new();
+    let mut value_buffer = Vec::new();
     let mut cursor = docid_word_positions.into_cursor()?;
     while let Some((key, value)) = cursor.move_on_next()? {
         let (document_id_bytes, fid_bytes) = try_split_array_at(key)
             .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
         let document_id = u32::from_be_bytes(document_id_bytes);
 
-        let word_count = KvReaderU16::new(&value).iter().take(MAX_COUNTED_WORDS + 1).count();
-        if word_count <= MAX_COUNTED_WORDS {
-            key_buffer.clear();
-            key_buffer.extend_from_slice(fid_bytes);
-            key_buffer.push(word_count as u8);
-            fid_word_count_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
+        let del_add_reader = KvReaderDelAdd::new(&value);
+        let deletion = del_add_reader
+            // get deleted words
+            .get(DelAdd::Deletion)
+            // count deleted words
+            .map(|deletion| KvReaderU16::new(deletion).iter().take(MAX_COUNTED_WORDS + 1).count())
+            // keep the count if under or equal to MAX_COUNTED_WORDS
+            .filter(|&word_count| word_count <= MAX_COUNTED_WORDS);
+        let addition = del_add_reader
+            // get added words
+            .get(DelAdd::Addition)
+            // count added words
+            .map(|addition| KvReaderU16::new(addition).iter().take(MAX_COUNTED_WORDS + 1).count())
+            // keep the count if under or equal to MAX_COUNTED_WORDS
+            .filter(|&word_count| word_count <= MAX_COUNTED_WORDS);
+
+        if deletion != addition {
+            // Insert deleted word count in sorter if exist.
+            if let Some(word_count) = deletion {
+                value_buffer.clear();
+                let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
+                value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
+                key_buffer.clear();
+                key_buffer.extend_from_slice(fid_bytes);
+                key_buffer.push(word_count as u8);
+                fid_word_count_docids_sorter
+                    .insert(&key_buffer, value_writer.into_inner().unwrap())?;
+            }
+            // Insert added word count in sorter if exist.
+            if let Some(word_count) = addition {
+                value_buffer.clear();
+                let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
+                value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
+                key_buffer.clear();
+                key_buffer.extend_from_slice(fid_bytes);
+                key_buffer.push(word_count as u8);
+                fid_word_count_docids_sorter
+                    .insert(&key_buffer, value_writer.into_inner().unwrap())?;
+            }
         }
     }
 

From a3dae4db9beb546517160aab8d896f8f2d12d2a1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Thu, 19 Oct 2023 15:55:48 +0200
Subject: [PATCH 036/127] Extract the geo fields DelAdd and generate a new
 DelAdd obkv with it

---
 .../extract/extract_geo_points.rs             | 81 +++++++++++++------
 1 file changed, 58 insertions(+), 23 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs
index 285a4bdba..36be9b5b6 100644
--- a/milli/src/update/index_documents/extract/extract_geo_points.rs
+++ b/milli/src/update/index_documents/extract/extract_geo_points.rs
@@ -6,6 +6,7 @@ use serde_json::Value;
 
 use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
 use crate::error::GeoError;
+use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
 use crate::update::index_documents::extract_finite_float_from_value;
 use crate::{FieldId, InternalError, Result};
 
@@ -14,6 +15,7 @@ use crate::{FieldId, InternalError, Result};
 /// Returns the generated grenad reader containing the docid as key associated to the (latitude, longitude)
 #[logging_timer::time]
 pub fn extract_geo_points<R: io::Read + io::Seek>(
+    // TODO grenad::Reader<Obkv<FieldId, Obkv<DelAdd, JsonValue>>>
     obkv_documents: grenad::Reader<R>,
     indexer: GrenadParameters,
     primary_key_id: FieldId,
@@ -30,39 +32,72 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
     let mut cursor = obkv_documents.into_cursor()?;
     while let Some((docid_bytes, value)) = cursor.move_on_next()? {
         let obkv = obkv::KvReader::new(value);
-        // since we only needs the primary key when we throw an error we create this getter to
-        // lazily get it when needed
+        // since we only need the primary key when we throw an error
+        // we create this getter to lazily get it when needed
         let document_id = || -> Value {
             let document_id = obkv.get(primary_key_id).unwrap();
             serde_json::from_slice(document_id).unwrap()
         };
 
+        // HELP we will receive two DelAdds here, one for the lat and one for the lng
+        //      what happens if there is a missing Del or Add for one of them?
+
         // first we get the two fields
-        let lat = obkv.get(lat_fid);
-        let lng = obkv.get(lng_fid);
+        match (obkv.get(lat_fid), obkv.get(lng_fid)) {
+            (Some(lat), Some(lng)) => {
+                let deladd_lat_obkv = KvReaderDelAdd::new(lat);
+                let deladd_lng_obkv = KvReaderDelAdd::new(lng);
 
-        if let Some((lat, lng)) = lat.zip(lng) {
-            // then we extract the values
-            let lat = extract_finite_float_from_value(
-                serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?,
-            )
-            .map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?;
+                // then we extract the values
+                let del_lat_lng = deladd_lat_obkv
+                    .get(DelAdd::Deletion)
+                    .zip(deladd_lng_obkv.get(DelAdd::Deletion))
+                    .map(|(lat, lng)| extract_lat_lng(lat, lng, document_id))
+                    .transpose()?;
+                let add_lat_lng = deladd_lat_obkv
+                    .get(DelAdd::Addition)
+                    .zip(deladd_lng_obkv.get(DelAdd::Addition))
+                    .map(|(lat, lng)| extract_lat_lng(lat, lng, document_id))
+                    .transpose()?;
 
-            let lng = extract_finite_float_from_value(
-                serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?,
-            )
-            .map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?;
-
-            #[allow(clippy::drop_non_drop)]
-            let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
-            writer.insert(docid_bytes, bytes)?;
-        } else if lat.is_none() && lng.is_some() {
-            return Err(GeoError::MissingLatitude { document_id: document_id() })?;
-        } else if lat.is_some() && lng.is_none() {
-            return Err(GeoError::MissingLongitude { document_id: document_id() })?;
+                let mut obkv = KvWriterDelAdd::memory();
+                if let Some([lat, lng]) = del_lat_lng {
+                    #[allow(clippy::drop_non_drop)]
+                    let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
+                    obkv.insert(DelAdd::Deletion, bytes)?;
+                }
+                if let Some([lat, lng]) = add_lat_lng {
+                    #[allow(clippy::drop_non_drop)]
+                    let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
+                    obkv.insert(DelAdd::Addition, bytes)?;
+                }
+                let bytes = obkv.into_inner()?;
+                writer.insert(docid_bytes, bytes)?;
+            }
+            (None, Some(_)) => {
+                return Err(GeoError::MissingLatitude { document_id: document_id() }.into())
+            }
+            (Some(_), None) => {
+                return Err(GeoError::MissingLongitude { document_id: document_id() }.into())
+            }
+            (None, None) => (),
         }
-        // else => the _geo object was `null`, there is nothing to do
     }
 
     writer_into_reader(writer)
 }
+
+/// Extract the finite floats lat and lng from two bytes slices.
+fn extract_lat_lng(lat: &[u8], lng: &[u8], document_id: impl Fn() -> Value) -> Result<[f64; 2]> {
+    let lat = extract_finite_float_from_value(
+        serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?,
+    )
+    .map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?;
+
+    let lng = extract_finite_float_from_value(
+        serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?,
+    )
+    .map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?;
+
+    Ok([lat, lng])
+}

From 544440c363c843da5eb2832a30185480f28b5b07 Mon Sep 17 00:00:00 2001
From: Kerollmops <clement@meilisearch.com>
Date: Mon, 23 Oct 2023 11:54:45 +0200
Subject: [PATCH 037/127] Ignore geo fields when the Del and Add content is the
 same

---
 .../extract/extract_geo_points.rs             | 26 ++++++++++---------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs
index 36be9b5b6..a818bb91c 100644
--- a/milli/src/update/index_documents/extract/extract_geo_points.rs
+++ b/milli/src/update/index_documents/extract/extract_geo_points.rs
@@ -60,19 +60,21 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
                     .map(|(lat, lng)| extract_lat_lng(lat, lng, document_id))
                     .transpose()?;
 
-                let mut obkv = KvWriterDelAdd::memory();
-                if let Some([lat, lng]) = del_lat_lng {
-                    #[allow(clippy::drop_non_drop)]
-                    let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
-                    obkv.insert(DelAdd::Deletion, bytes)?;
+                if del_lat_lng != add_lat_lng {
+                    let mut obkv = KvWriterDelAdd::memory();
+                    if let Some([lat, lng]) = del_lat_lng {
+                        #[allow(clippy::drop_non_drop)]
+                        let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
+                        obkv.insert(DelAdd::Deletion, bytes)?;
+                    }
+                    if let Some([lat, lng]) = add_lat_lng {
+                        #[allow(clippy::drop_non_drop)]
+                        let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
+                        obkv.insert(DelAdd::Addition, bytes)?;
+                    }
+                    let bytes = obkv.into_inner()?;
+                    writer.insert(docid_bytes, bytes)?;
                 }
-                if let Some([lat, lng]) = add_lat_lng {
-                    #[allow(clippy::drop_non_drop)]
-                    let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()];
-                    obkv.insert(DelAdd::Addition, bytes)?;
-                }
-                let bytes = obkv.into_inner()?;
-                writer.insert(docid_bytes, bytes)?;
             }
             (None, Some(_)) => {
                 return Err(GeoError::MissingLatitude { document_id: document_id() }.into())

From 77dcbff6b2355b6a72837c29cec4dc7c355dea22 Mon Sep 17 00:00:00 2001
From: Kerollmops <clement@meilisearch.com>
Date: Mon, 23 Oct 2023 13:49:54 +0200
Subject: [PATCH 038/127] Remove and Insert the DelAdd geo points

---
 .../extract/extract_geo_points.rs             |  1 -
 .../src/update/index_documents/typed_chunk.rs | 28 +++++++++++++------
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs
index a818bb91c..cc283121e 100644
--- a/milli/src/update/index_documents/extract/extract_geo_points.rs
+++ b/milli/src/update/index_documents/extract/extract_geo_points.rs
@@ -15,7 +15,6 @@ use crate::{FieldId, InternalError, Result};
 /// Returns the generated grenad reader containing the docid as key associated to the (latitude, longitude)
 #[logging_timer::time]
 pub fn extract_geo_points<R: io::Read + io::Seek>(
-    // TODO grenad::Reader<Obkv<FieldId, Obkv<DelAdd, JsonValue>>>
     obkv_documents: grenad::Reader<R>,
     indexer: GrenadParameters,
     primary_key_id: FieldId,
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index 0d618ad28..9d4d63f90 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -279,14 +279,17 @@ pub(crate) fn write_typed_chunk_into_index(
                 // convert the key back to a u32 (4 bytes)
                 let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap();
 
-                // convert the latitude and longitude back to a f64 (8 bytes)
-                let (lat, tail) = helpers::try_split_array_at::<u8, 8>(value).unwrap();
-                let (lng, _) = helpers::try_split_array_at::<u8, 8>(tail).unwrap();
-                let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)];
-                let xyz_point = lat_lng_to_xyz(&point);
-
-                rtree.insert(GeoPoint::new(xyz_point, (docid, point)));
-                geo_faceted_docids.insert(docid);
+                let deladd_obkv = KvReaderDelAdd::new(value);
+                if let Some(value) = deladd_obkv.get(DelAdd::Deletion) {
+                    let geopoint = extract_geo_point(value, docid);
+                    rtree.remove(&geopoint);
+                    geo_faceted_docids.remove(docid);
+                }
+                if let Some(value) = deladd_obkv.get(DelAdd::Addition) {
+                    let geopoint = extract_geo_point(value, docid);
+                    rtree.insert(geopoint);
+                    geo_faceted_docids.insert(docid);
+                }
             }
             index.put_geo_rtree(wtxn, &rtree)?;
             index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
@@ -368,6 +371,15 @@ pub(crate) fn write_typed_chunk_into_index(
     Ok((RoaringBitmap::new(), is_merged_database))
 }
 
+/// Converts the latitude and longitude back to an xyz GeoPoint.
+fn extract_geo_point(value: &[u8], docid: DocumentId) -> GeoPoint {
+    let (lat, tail) = helpers::try_split_array_at::<u8, 8>(value).unwrap();
+    let (lng, _) = helpers::try_split_array_at::<u8, 8>(tail).unwrap();
+    let point = [f64::from_ne_bytes(lat), f64::from_ne_bytes(lng)];
+    let xyz_point = lat_lng_to_xyz(&point);
+    GeoPoint::new(xyz_point, (docid, point))
+}
+
 fn merge_word_docids_reader_into_fst(
     word_docids_iter: grenad::Reader<io::Cursor<ClonableMmap>>,
     exact_word_docids_iter: grenad::Reader<io::Cursor<ClonableMmap>>,

From 576fa9c6da0567e73b9598b5ca51f76a1bfd2c6a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Tue, 24 Oct 2023 10:21:47 +0200
Subject: [PATCH 039/127] Remove useless comment

---
 milli/src/update/index_documents/extract/extract_geo_points.rs | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs
index cc283121e..5ee7967d2 100644
--- a/milli/src/update/index_documents/extract/extract_geo_points.rs
+++ b/milli/src/update/index_documents/extract/extract_geo_points.rs
@@ -38,9 +38,6 @@ pub fn extract_geo_points<R: io::Read + io::Seek>(
             serde_json::from_slice(document_id).unwrap()
         };
 
-        // HELP we will receive two DelAdds here, one for the lat and one for the lng
-        //      what happens if there is a missing Del or Add for one of them?
-
         // first we get the two fields
         match (obkv.get(lat_fid), obkv.get(lng_fid)) {
             (Some(lat), Some(lng)) => {

From 476e4d3dbed3ccf91c3bb95249a557b92f035562 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Tue, 24 Oct 2023 10:19:32 +0200
Subject: [PATCH 040/127] Use value buffer instead of the initial value when
 writing the final result in the sorter

---
 .../extract/extract_docid_word_positions.rs               | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
index 36258b275..e5d95cbdb 100644
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -115,6 +115,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
         let (add_obkv, add_script_language_word_count) = add?;
 
         // merge deletions and additions.
+        // transforming two KV<FieldId, KV<u16, String>> into one KV<FieldId, KV<DelAdd, KV<u16, String>>>
         value_buffer.clear();
         del_add_from_two_obkvs(
             KvReader::<FieldId>::new(del_obkv),
@@ -122,8 +123,8 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
             &mut value_buffer,
         )?;
 
-        // write them into the sorter.
-        let obkv = KvReader::<FieldId>::new(value);
+        // write each KV<DelAdd, KV<u16, String>> into the sorter, field by field.
+        let obkv = KvReader::<FieldId>::new(&value_buffer);
         for (field_id, value) in obkv.iter() {
             key_buffer.truncate(mem::size_of::<u32>());
             key_buffer.extend_from_slice(&field_id.to_be_bytes());
@@ -151,6 +152,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
         }
     }
 
+    // the returned sorter is serialized as: key: (DocId, FieldId), value: KV<DelAdd, KV<u16, String>>.
     sorter_into_reader(docid_word_positions_sorter, indexer)
         .map(|reader| (documents_ids, reader, script_language_docids))
 }
@@ -266,6 +268,7 @@ fn lang_safe_tokens_from_document<'a>(
         }
     }
 
+    // returns a (KV<FieldId, KV<u16, String>>, HashMap<Script, Vec<(Language, usize)>>)
     Ok((&buffers.obkv_buffer, script_language_word_count))
 }
 
@@ -331,6 +334,7 @@ fn tokens_from_document<'a>(
         }
     }
 
+    // returns a KV<FieldId, KV<u16, String>>
     Ok(document_writer.into_inner().map(|v| v.as_slice())?)
 }
 

From 696fcf4d185793f2ffaa2274dc45700128e06dd2 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Tue, 24 Oct 2023 11:03:35 +0200
Subject: [PATCH 041/127] Fix document insertion into LMDB

---
 .../src/update/index_documents/typed_chunk.rs | 21 ++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index 9d4d63f90..6a2ea8486 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -8,6 +8,7 @@ use charabia::{Language, Script};
 use grenad::MergerBuilder;
 use heed::types::ByteSlice;
 use heed::RwTxn;
+use obkv::{KvReader, KvWriter};
 use roaring::RoaringBitmap;
 
 use super::helpers::{self, merge_ignore_values, valid_lmdb_key, CursorClonableMmap};
@@ -19,7 +20,9 @@ use crate::index::Hnsw;
 use crate::update::del_add::{DelAdd, KvReaderDelAdd};
 use crate::update::facet::FacetsUpdate;
 use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at};
-use crate::{lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, GeoPoint, Index, Result, BEU32};
+use crate::{
+    lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, Result, BEU32,
+};
 
 pub(crate) enum TypedChunk {
     FieldIdDocidFacetStrings(grenad::Reader<CursorClonableMmap>),
@@ -120,8 +123,20 @@ pub(crate) fn write_typed_chunk_into_index(
     match typed_chunk {
         TypedChunk::Documents(obkv_documents_iter) => {
             let mut cursor = obkv_documents_iter.into_cursor()?;
-            while let Some((key, value)) = cursor.move_on_next()? {
-                index.documents.remap_types::<ByteSlice, ByteSlice>().put(wtxn, key, value)?;
+            while let Some((docid, reader)) = cursor.move_on_next()? {
+                let mut writer: KvWriter<_, FieldId> = KvWriter::memory();
+                let reader: KvReader<FieldId> = KvReader::new(reader);
+                for (field_id, value) in reader.iter() {
+                    let Some(value) = KvReaderDelAdd::new(value).get(DelAdd::Addition) else {
+                        continue;
+                    };
+                    writer.insert(field_id, value)?;
+                }
+                index.documents.remap_types::<ByteSlice, ByteSlice>().put(
+                    wtxn,
+                    docid,
+                    &writer.into_inner().unwrap(),
+                )?;
             }
         }
         TypedChunk::FieldIdWordCountDocids(fid_word_count_docids_iter) => {

From cda6ca1ee6880ebfaaf53a4c969e6a950b1d56c4 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Tue, 24 Oct 2023 14:26:14 +0200
Subject: [PATCH 042/127] Remove TypedChunk::NewDocumentsIds

---
 milli/src/update/index_documents/extract/mod.rs | 3 ---
 milli/src/update/index_documents/typed_chunk.rs | 7 -------
 2 files changed, 10 deletions(-)

diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
index 7d643d61f..20ee38c4f 100644
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -366,9 +366,6 @@ fn send_and_extract_flattened_documents_data(
                         max_positions_per_attributes,
                     )?;
 
-                // send documents_ids to DB writer
-                let _ = lmdb_writer_sx.send(Ok(TypedChunk::NewDocumentsIds(documents_ids)));
-
                 // send docid_word_positions_chunk to DB writer
                 let docid_word_positions_chunk =
                     unsafe { as_cloneable_grenad(&docid_word_positions_chunk)? };
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index 6a2ea8486..aebfca151 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -29,7 +29,6 @@ pub(crate) enum TypedChunk {
     FieldIdDocidFacetNumbers(grenad::Reader<CursorClonableMmap>),
     Documents(grenad::Reader<CursorClonableMmap>),
     FieldIdWordCountDocids(grenad::Reader<BufReader<File>>),
-    NewDocumentsIds(RoaringBitmap),
     WordDocids {
         word_docids_reader: grenad::Reader<BufReader<File>>,
         exact_word_docids_reader: grenad::Reader<BufReader<File>>,
@@ -62,9 +61,6 @@ impl TypedChunk {
             TypedChunk::FieldIdWordCountDocids(grenad) => {
                 format!("FieldIdWordcountDocids {{ number_of_entries: {} }}", grenad.len())
             }
-            TypedChunk::NewDocumentsIds(grenad) => {
-                format!("NewDocumentsIds {{ number_of_entries: {} }}", grenad.len())
-            }
             TypedChunk::WordDocids {
                 word_docids_reader,
                 exact_word_docids_reader,
@@ -150,9 +146,6 @@ pub(crate) fn write_typed_chunk_into_index(
             )?;
             is_merged_database = true;
         }
-        TypedChunk::NewDocumentsIds(documents_ids) => {
-            return Ok((documents_ids, is_merged_database))
-        }
         TypedChunk::WordDocids {
             word_docids_reader,
             exact_word_docids_reader,

From 946c762d289f4ca468f243226ca2a61f718599ec Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Tue, 24 Oct 2023 14:26:49 +0200
Subject: [PATCH 043/127] WIP: reset documents in TypedChunk::Documents

---
 milli/src/update/index_documents/mod.rs       | 17 +++---------
 .../src/update/index_documents/typed_chunk.rs | 26 +++++++++++++++----
 2 files changed, 25 insertions(+), 18 deletions(-)

diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 27021c3fb..d1fa28826 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -35,7 +35,7 @@ use crate::documents::{obkv_to_object, DocumentsBatchReader};
 use crate::error::{Error, InternalError, UserError};
 pub use crate::update::index_documents::helpers::CursorClonableMmap;
 use crate::update::{
-    self, DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep,
+    DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep,
     WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
 };
 use crate::{CboRoaringBitmapCodec, Index, Result};
@@ -374,17 +374,6 @@ where
             drop(lmdb_writer_sx)
         });
 
-        // We delete the documents that this document addition replaces. This way we are
-        // able to simply insert all the documents even if they already exist in the database.
-        if !replaced_documents_ids.is_empty() {
-            let mut deletion_builder = update::DeleteDocuments::new(self.wtxn, self.index)?;
-            deletion_builder.strategy(self.config.deletion_strategy);
-            debug!("documents to delete {:?}", replaced_documents_ids);
-            deletion_builder.delete_documents(&replaced_documents_ids);
-            let deleted_documents_result = deletion_builder.execute_inner()?;
-            debug!("{} documents actually deleted", deleted_documents_result.deleted_documents);
-        }
-
         let index_documents_ids = self.index.documents_ids(self.wtxn)?;
         let index_is_empty = index_documents_ids.is_empty();
         let mut final_documents_ids = RoaringBitmap::new();
@@ -437,6 +426,7 @@ where
                 otherwise => otherwise,
             };
 
+            // FIXME: return newly added as well as newly deleted documents
             let (docids, is_merged_database) =
                 write_typed_chunk_into_index(typed_chunk, self.index, self.wtxn, index_is_empty)?;
             if !docids.is_empty() {
@@ -472,8 +462,9 @@ where
         let external_documents_ids = external_documents_ids.into_static();
         self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?;
 
+        // FIXME: remove `new_documents_ids` entirely and `replaced_documents_ids`
         let all_documents_ids = index_documents_ids | new_documents_ids;
-        self.index.put_documents_ids(self.wtxn, &all_documents_ids)?;
+        //self.index.put_documents_ids(self.wtxn, &all_documents_ids)?;
 
         // TODO: reactivate prefix DB with diff-indexing
         // self.execute_prefix_databases(
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index aebfca151..39537cce7 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -118,22 +118,38 @@ pub(crate) fn write_typed_chunk_into_index(
     let mut is_merged_database = false;
     match typed_chunk {
         TypedChunk::Documents(obkv_documents_iter) => {
+            let mut docids = index.documents_ids(wtxn)?;
+
             let mut cursor = obkv_documents_iter.into_cursor()?;
             while let Some((docid, reader)) = cursor.move_on_next()? {
                 let mut writer: KvWriter<_, FieldId> = KvWriter::memory();
                 let reader: KvReader<FieldId> = KvReader::new(reader);
+                let mut written = false;
                 for (field_id, value) in reader.iter() {
                     let Some(value) = KvReaderDelAdd::new(value).get(DelAdd::Addition) else {
                         continue;
                     };
+                    // TODO: writer.is_empty
+                    written = true;
                     writer.insert(field_id, value)?;
                 }
-                index.documents.remap_types::<ByteSlice, ByteSlice>().put(
-                    wtxn,
-                    docid,
-                    &writer.into_inner().unwrap(),
-                )?;
+
+                let db = index.documents.remap_data_type::<ByteSlice>();
+                let docid = docid.try_into().map(DocumentId::from_be_bytes).unwrap();
+
+                if written {
+                    db.put(wtxn, &BEU32::new(docid), &writer.into_inner().unwrap())?;
+                    docids.insert(docid);
+                } else {
+                    db.delete(wtxn, &BEU32::new(docid))?;
+                    // FIXME: unwrap
+                    if !docids.remove(docid) {
+                        panic!("Attempt to remove a document id that doesn't exist")
+                    }
+                }
             }
+
+            index.put_documents_ids(wtxn, &docids)?;
         }
         TypedChunk::FieldIdWordCountDocids(fid_word_count_docids_iter) => {
             append_entries_into_database(

From 5be569e3e2799721a83df877921fde972848e933 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Tue, 24 Oct 2023 17:01:30 +0200
Subject: [PATCH 044/127] Update obkv

---
 Cargo.lock | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index d8cd12cc2..2ab2f706a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2866,9 +2866,9 @@ dependencies = [
 
 [[package]]
 name = "obkv"
-version = "0.2.0"
+version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f69e48cd7c8e5bb52a1da1287fdbfd877c32673176583ce664cd63b201aba385"
+checksum = "6c459142426056c639ff88d053ebaaaeca0ee1411c94362892398ef4ccd81080"
 
 [[package]]
 name = "once_cell"

From 8fb221dae36ed73475a83fb511775f9b8729e36d Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Tue, 24 Oct 2023 17:01:45 +0200
Subject: [PATCH 045/127] Refactor ExternalDocumentsIds

- Remove soft-deleted handling (the `soft` FST and the `soft_deleted_docids` bitmap)
- Add apply method that takes a list of operations to encapsulate modifications to the external -> internal mapping
---
 milli/src/external_documents_ids.rs | 189 +++++++++++++---------------
 1 file changed, 85 insertions(+), 104 deletions(-)

diff --git a/milli/src/external_documents_ids.rs b/milli/src/external_documents_ids.rs
index 36b147336..cd6a7e729 100644
--- a/milli/src/external_documents_ids.rs
+++ b/milli/src/external_documents_ids.rs
@@ -7,133 +7,118 @@ use fst::map::IndexedValue;
 use fst::{IntoStreamer, Streamer};
 use roaring::RoaringBitmap;
 
+use crate::DocumentId;
+
 const DELETED_ID: u64 = u64::MAX;
 
-pub struct ExternalDocumentsIds<'a> {
-    pub(crate) hard: fst::Map<Cow<'a, [u8]>>,
-    pub(crate) soft: fst::Map<Cow<'a, [u8]>>,
-    soft_deleted_docids: RoaringBitmap,
+pub enum DocumentOperationKind {
+    Create,
+    Delete,
 }
 
+pub struct DocumentOperation {
+    pub external_id: String,
+    pub internal_id: DocumentId,
+    pub kind: DocumentOperationKind,
+}
+
+pub struct ExternalDocumentsIds<'a>(fst::Map<Cow<'a, [u8]>>);
+
 impl<'a> ExternalDocumentsIds<'a> {
-    pub fn new(
-        hard: fst::Map<Cow<'a, [u8]>>,
-        soft: fst::Map<Cow<'a, [u8]>>,
-        soft_deleted_docids: RoaringBitmap,
-    ) -> ExternalDocumentsIds<'a> {
-        ExternalDocumentsIds { hard, soft, soft_deleted_docids }
+    pub fn new(fst: fst::Map<Cow<'a, [u8]>>) -> ExternalDocumentsIds<'a> {
+        ExternalDocumentsIds(fst)
     }
 
     pub fn into_static(self) -> ExternalDocumentsIds<'static> {
-        ExternalDocumentsIds {
-            hard: self.hard.map_data(|c| Cow::Owned(c.into_owned())).unwrap(),
-            soft: self.soft.map_data(|c| Cow::Owned(c.into_owned())).unwrap(),
-            soft_deleted_docids: self.soft_deleted_docids,
-        }
+        ExternalDocumentsIds(self.0.map_data(|c| Cow::Owned(c.into_owned())).unwrap())
     }
 
     /// Returns `true` if hard and soft external documents lists are empty.
     pub fn is_empty(&self) -> bool {
-        self.hard.is_empty() && self.soft.is_empty()
+        self.0.is_empty()
     }
 
     pub fn get<A: AsRef<[u8]>>(&self, external_id: A) -> Option<u32> {
         let external_id = external_id.as_ref();
-        match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) {
-            Some(id) if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) => {
-                Some(id.try_into().unwrap())
-            }
-            _otherwise => None,
-        }
-    }
-
-    /// Rebuild the internal FSTs in the ExternalDocumentsIds structure such that they
-    /// don't contain any soft deleted document id.
-    pub fn delete_soft_deleted_documents_ids_from_fsts(&mut self) -> fst::Result<()> {
-        let mut new_hard_builder = fst::MapBuilder::memory();
-
-        let union_op = self.hard.op().add(&self.soft).r#union();
-        let mut iter = union_op.into_stream();
-        while let Some((external_id, docids)) = iter.next() {
-            // prefer selecting the ids from soft, always
-            let id = indexed_last_value(docids).unwrap();
-            if id != DELETED_ID && !self.soft_deleted_docids.contains(id as u32) {
-                new_hard_builder.insert(external_id, id)?;
-            }
-        }
-        drop(iter);
-
-        // Delete soft map completely
-        self.soft = fst::Map::default().map_data(Cow::Owned)?;
-        // We save the new map as the new hard map.
-        self.hard = new_hard_builder.into_map().map_data(Cow::Owned)?;
-
-        Ok(())
-    }
-
-    pub fn insert_ids<A: AsRef<[u8]>>(&mut self, other: &fst::Map<A>) -> fst::Result<()> {
-        let union_op = self.soft.op().add(other).r#union();
-
-        let mut new_soft_builder = fst::MapBuilder::memory();
-        let mut iter = union_op.into_stream();
-        while let Some((external_id, marked_docids)) = iter.next() {
-            let id = indexed_last_value(marked_docids).unwrap();
-            new_soft_builder.insert(external_id, id)?;
-        }
-
-        drop(iter);
-
-        // We save the new map as the new soft map.
-        self.soft = new_soft_builder.into_map().map_data(Cow::Owned)?;
-        self.merge_soft_into_hard()
+        self.0.get(external_id).map(|x| x.try_into().unwrap())
     }
 
     /// An helper function to debug this type, returns an `HashMap` of both,
     /// soft and hard fst maps, combined.
     pub fn to_hash_map(&self) -> HashMap<String, u32> {
-        let mut map = HashMap::new();
-
-        let union_op = self.hard.op().add(&self.soft).r#union();
-        let mut iter = union_op.into_stream();
-        while let Some((external_id, marked_docids)) = iter.next() {
-            let id = indexed_last_value(marked_docids).unwrap();
-            if id != DELETED_ID {
-                let external_id = str::from_utf8(external_id).unwrap();
-                map.insert(external_id.to_owned(), id.try_into().unwrap());
-            }
+        let mut map = HashMap::default();
+        let mut stream = self.0.stream();
+        while let Some((k, v)) = stream.next() {
+            let k = String::from_utf8(k.to_vec()).unwrap();
+            map.insert(k, v.try_into().unwrap());
         }
-
         map
     }
 
-    /// Return an fst of the combined hard and soft deleted ID.
-    pub fn to_fst<'b>(&'b self) -> fst::Result<Cow<'b, fst::Map<Cow<'a, [u8]>>>> {
-        if self.soft.is_empty() {
-            return Ok(Cow::Borrowed(&self.hard));
-        }
-        let union_op = self.hard.op().add(&self.soft).r#union();
-
-        let mut iter = union_op.into_stream();
-        let mut new_hard_builder = fst::MapBuilder::memory();
-        while let Some((external_id, marked_docids)) = iter.next() {
-            let value = indexed_last_value(marked_docids).unwrap();
-            if value != DELETED_ID {
-                new_hard_builder.insert(external_id, value)?;
-            }
-        }
-
-        drop(iter);
-
-        Ok(Cow::Owned(new_hard_builder.into_map().map_data(Cow::Owned)?))
+    pub fn as_bytes(&self) -> &[u8] {
+        self.0.as_fst().as_bytes()
     }
 
-    fn merge_soft_into_hard(&mut self) -> fst::Result<()> {
-        if self.soft.len() >= self.hard.len() / 2 {
-            self.hard = self.to_fst()?.into_owned();
-            self.soft = fst::Map::default().map_data(Cow::Owned)?;
-        }
+    /// Apply the list of operations passed as argument, modifying the current external to internal id mapping.
+    ///
+    /// If the list contains multiple operations on the same external id, then the result is unspecified.
+    ///
+    /// # Panics
+    ///
+    /// - If attempting to delete a document that doesn't exist
+    /// - If attempting to create a document that already exists
+    pub fn apply(&mut self, mut operations: Vec<DocumentOperation>) {
+        operations.sort_unstable_by(|left, right| left.external_id.cmp(&right.external_id));
+        operations.dedup_by(|left, right| left.external_id == right.external_id);
 
-        Ok(())
+        let mut builder = fst::MapBuilder::memory();
+
+        let mut stream = self.0.stream();
+        let mut next_stream = stream.next();
+        let mut operations = operations.iter();
+        let mut next_operation = operations.next();
+
+        loop {
+            (next_stream, next_operation) = match (next_stream.take(), next_operation.take()) {
+                (None, None) => break,
+                (None, Some(DocumentOperation { external_id, internal_id, kind })) => {
+                    if matches!(kind, DocumentOperationKind::Delete) {
+                        panic!("Attempting to delete a non-existing document")
+                    }
+                    builder.insert(external_id, (*internal_id).into()).unwrap();
+                    (None, operations.next())
+                }
+                (Some((k, v)), None) => {
+                    builder.insert(k, v).unwrap();
+                    (stream.next(), None)
+                }
+                (
+                    current_stream @ Some((left_external_id, left_internal_id)),
+                    current_operation @ Some(DocumentOperation {
+                        external_id: right_external_id,
+                        internal_id: right_internal_id,
+                        kind,
+                    }),
+                ) => match left_external_id.cmp(right_external_id.as_bytes()) {
+                    std::cmp::Ordering::Less => {
+                        builder.insert(left_external_id, left_internal_id).unwrap();
+                        (stream.next(), current_operation)
+                    }
+                    std::cmp::Ordering::Greater => {
+                        builder.insert(right_external_id, (*right_internal_id).into()).unwrap();
+                        (current_stream, operations.next())
+                    }
+                    std::cmp::Ordering::Equal => {
+                        if matches!(kind, DocumentOperationKind::Create) {
+                            panic!("Attempting to create an already-existing document");
+                        }
+                        // we delete the document, so we just advance both iterators to skip in stream
+                        (stream.next(), operations.next())
+                    }
+                },
+            }
+        }
+        self.0 = builder.into_map().map_data(Cow::Owned).unwrap();
     }
 }
 
@@ -145,11 +130,7 @@ impl fmt::Debug for ExternalDocumentsIds<'_> {
 
 impl Default for ExternalDocumentsIds<'static> {
     fn default() -> Self {
-        ExternalDocumentsIds {
-            hard: fst::Map::default().map_data(Cow::Owned).unwrap(),
-            soft: fst::Map::default().map_data(Cow::Owned).unwrap(),
-            soft_deleted_docids: RoaringBitmap::new(),
-        }
+        ExternalDocumentsIds(fst::Map::default().map_data(Cow::Owned).unwrap())
     }
 }
 

From bafeb892a770fc6d5482044610705ce65b9174bf Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Tue, 24 Oct 2023 17:02:55 +0200
Subject: [PATCH 046/127] Modify Index after changes to ExternalDocumentsIds

---
 milli/src/index.rs | 31 +++++++------------------------
 1 file changed, 7 insertions(+), 24 deletions(-)

diff --git a/milli/src/index.rs b/milli/src/index.rs
index eb9e153ec..61ec41788 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -52,11 +52,10 @@ pub mod main_key {
     /// It is concatenated with a big-endian encoded number (non-human readable).
     /// e.g. vector-hnsw0x0032.
     pub const VECTOR_HNSW_KEY_PREFIX: &str = "vector-hnsw";
-    pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids";
+    pub const EXTERNAL_DOCUMENTS_IDS_KEY: &str = "external-documents-ids";
     pub const PRIMARY_KEY_KEY: &str = "primary-key";
     pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields";
     pub const USER_DEFINED_SEARCHABLE_FIELDS_KEY: &str = "user-defined-searchable-fields";
-    pub const SOFT_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "soft-external-documents-ids";
     pub const STOP_WORDS_KEY: &str = "stop-words";
     pub const NON_SEPARATOR_TOKENS_KEY: &str = "non-separator-tokens";
     pub const SEPARATOR_TOKENS_KEY: &str = "separator-tokens";
@@ -417,18 +416,10 @@ impl Index {
         wtxn: &mut RwTxn,
         external_documents_ids: &ExternalDocumentsIds<'_>,
     ) -> heed::Result<()> {
-        let ExternalDocumentsIds { hard, soft, .. } = external_documents_ids;
-        let hard = hard.as_fst().as_bytes();
-        let soft = soft.as_fst().as_bytes();
         self.main.put::<_, Str, ByteSlice>(
             wtxn,
-            main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY,
-            hard,
-        )?;
-        self.main.put::<_, Str, ByteSlice>(
-            wtxn,
-            main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY,
-            soft,
+            main_key::EXTERNAL_DOCUMENTS_IDS_KEY,
+            external_documents_ids.as_bytes(),
         )?;
         Ok(())
     }
@@ -436,20 +427,12 @@ impl Index {
     /// Returns the external documents ids map which associate the external ids
     /// with the internal ids (i.e. `u32`).
     pub fn external_documents_ids<'t>(&self, rtxn: &'t RoTxn) -> Result<ExternalDocumentsIds<'t>> {
-        let hard =
-            self.main.get::<_, Str, ByteSlice>(rtxn, main_key::HARD_EXTERNAL_DOCUMENTS_IDS_KEY)?;
-        let soft =
-            self.main.get::<_, Str, ByteSlice>(rtxn, main_key::SOFT_EXTERNAL_DOCUMENTS_IDS_KEY)?;
-        let hard = match hard {
-            Some(hard) => fst::Map::new(hard)?.map_data(Cow::Borrowed)?,
+        let fst = self.main.get::<_, Str, ByteSlice>(rtxn, main_key::EXTERNAL_DOCUMENTS_IDS_KEY)?;
+        let fst = match fst {
+            Some(fst) => fst::Map::new(fst)?.map_data(Cow::Borrowed)?,
             None => fst::Map::default().map_data(Cow::Owned)?,
         };
-        let soft = match soft {
-            Some(soft) => fst::Map::new(soft)?.map_data(Cow::Borrowed)?,
-            None => fst::Map::default().map_data(Cow::Owned)?,
-        };
-        let soft_deleted_docids = self.soft_deleted_documents_ids(rtxn)?;
-        Ok(ExternalDocumentsIds::new(hard, soft, soft_deleted_docids))
+        Ok(ExternalDocumentsIds::new(fst))
     }
 
     /* fields ids map */

From c6b3c18c85e234929f517a2dbfd7dfcd01e71c36 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Tue, 24 Oct 2023 17:04:02 +0200
Subject: [PATCH 047/127] WIP: Comment out document deletion in other pipelines
 than update

TODO: fix calls to DELETE route
---
 milli/src/update/delete_documents.rs          | 10 ++++++----
 milli/src/update/index_documents/mod.rs       |  8 ++++----
 milli/src/update/index_documents/transform.rs | 12 +++---------
 3 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs
index 9044f03be..0299e1e4f 100644
--- a/milli/src/update/delete_documents.rs
+++ b/milli/src/update/delete_documents.rs
@@ -255,12 +255,14 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
         }
         // We acquire the current external documents ids map...
         // Note that its soft-deleted document ids field will be equal to the `to_delete_docids`
-        let mut new_external_documents_ids = self.index.external_documents_ids(self.wtxn)?;
+        //let mut new_external_documents_ids = self.index.external_documents_ids(self.wtxn)?;
         // We then remove the soft-deleted docids from it
-        new_external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?;
+        //new_external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?;
         // and write it back to the main database.
-        let new_external_documents_ids = new_external_documents_ids.into_static();
-        self.index.put_external_documents_ids(self.wtxn, &new_external_documents_ids)?;
+        //let new_external_documents_ids = new_external_documents_ids.into_static();
+        //self.index.put_external_documents_ids(self.wtxn, &new_external_documents_ids)?;
+
+        todo!("please autobatch deletions for now");
 
         let mut words_to_keep = BTreeSet::default();
         let mut words_to_delete = BTreeSet::default();
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index d1fa28826..8d187a89d 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -457,10 +457,10 @@ where
         self.index.put_primary_key(self.wtxn, &primary_key)?;
 
         // We write the external documents ids into the main database.
-        let mut external_documents_ids = self.index.external_documents_ids(self.wtxn)?;
-        external_documents_ids.insert_ids(&new_external_documents_ids)?;
-        let external_documents_ids = external_documents_ids.into_static();
-        self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?;
+        //let mut external_documents_ids = self.index.external_documents_ids(self.wtxn)?;
+        //external_documents_ids.insert_ids(&new_external_documents_ids)?;
+        //let external_documents_ids = external_documents_ids.into_static();
+        //self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?;
 
         // FIXME: remove `new_documents_ids` entirely and `replaced_documents_ids`
         let all_documents_ids = index_documents_ids | new_documents_ids;
diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index 2b77768cb..e02da8cb5 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -763,14 +763,6 @@ impl<'a, 'i> Transform<'a, 'i> {
             .to_string();
         let field_distribution = self.index.field_distribution(wtxn)?;
 
-        // Delete the soft deleted document ids from the maps inside the external_document_ids structure
-        let new_external_documents_ids = {
-            let mut external_documents_ids = self.index.external_documents_ids(wtxn)?;
-            external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?;
-            // This call should be free and can't fail since the previous method merged both fsts.
-            external_documents_ids.into_static().to_fst()?.into_owned()
-        };
-
         let documents_ids = self.index.documents_ids(wtxn)?;
         let documents_count = documents_ids.len() as usize;
 
@@ -858,8 +850,10 @@ impl<'a, 'i> Transform<'a, 'i> {
             primary_key,
             fields_ids_map: new_fields_ids_map,
             field_distribution,
-            new_external_documents_ids,
+            // FIXME: remove this now unused field
+            new_external_documents_ids: fst::Map::default().map_data(Cow::Owned).unwrap(),
             new_documents_ids: documents_ids,
+            // FIXME: remove this now unused field
             replaced_documents_ids: RoaringBitmap::default(),
             documents_count,
             original_documents: original_documents.into_inner().map_err(|err| err.into_error())?,

From 85f42fbc036e850cf55b044eb948de72abbf5ebe Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Tue, 24 Oct 2023 17:04:48 +0200
Subject: [PATCH 048/127] Handle external to internal id mapping from
 TypedChunk::Documents

---
 .../src/update/index_documents/typed_chunk.rs | 81 +++++++++++++++----
 1 file changed, 65 insertions(+), 16 deletions(-)

diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index 39537cce7..1f1ac4adf 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -15,13 +15,16 @@ use super::helpers::{self, merge_ignore_values, valid_lmdb_key, CursorClonableMm
 use super::{ClonableMmap, MergeFn};
 use crate::distance::NDotProductPoint;
 use crate::error::UserError;
+use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind};
 use crate::facet::FacetType;
 use crate::index::Hnsw;
 use crate::update::del_add::{DelAdd, KvReaderDelAdd};
 use crate::update::facet::FacetsUpdate;
 use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at};
+use crate::update::index_documents::validate_document_id_value;
 use crate::{
-    lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, Result, BEU32,
+    lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, InternalError,
+    Result, BEU32,
 };
 
 pub(crate) enum TypedChunk {
@@ -118,36 +121,82 @@ pub(crate) fn write_typed_chunk_into_index(
     let mut is_merged_database = false;
     match typed_chunk {
         TypedChunk::Documents(obkv_documents_iter) => {
-            let mut docids = index.documents_ids(wtxn)?;
+            let mut operations: Vec<DocumentOperation> = Default::default();
 
+            let mut docids = index.documents_ids(wtxn)?;
+            let primary_key = index.primary_key(wtxn)?.unwrap();
+            let primary_key = index.fields_ids_map(wtxn)?.id(primary_key).unwrap();
             let mut cursor = obkv_documents_iter.into_cursor()?;
             while let Some((docid, reader)) = cursor.move_on_next()? {
                 let mut writer: KvWriter<_, FieldId> = KvWriter::memory();
                 let reader: KvReader<FieldId> = KvReader::new(reader);
-                let mut written = false;
+                let docid = docid.try_into().map(DocumentId::from_be_bytes).unwrap();
+
                 for (field_id, value) in reader.iter() {
-                    let Some(value) = KvReaderDelAdd::new(value).get(DelAdd::Addition) else {
-                        continue;
-                    };
-                    // TODO: writer.is_empty
-                    written = true;
-                    writer.insert(field_id, value)?;
+                    let del_add_reader = KvReaderDelAdd::new(value);
+                    match (
+                        del_add_reader.get(DelAdd::Deletion),
+                        del_add_reader.get(DelAdd::Addition),
+                    ) {
+                        (None, None) => {}
+                        (None, Some(value)) => {
+                            // if primary key, new document
+                            if field_id == primary_key {
+                                // FIXME: we already extracted the external docid before. We should retrieve it in the typed chunk
+                                // rather than re-extract it here
+                                // FIXME: unwraps
+                                let document_id = serde_json::from_slice(value)
+                                    .map_err(InternalError::SerdeJson)
+                                    .unwrap();
+                                let external_id =
+                                    validate_document_id_value(document_id).unwrap().unwrap();
+                                operations.push(DocumentOperation {
+                                    external_id,
+                                    internal_id: docid,
+                                    kind: DocumentOperationKind::Create,
+                                });
+                                docids.insert(docid);
+                            }
+                            // anyway, write
+                            writer.insert(field_id, value)?;
+                        }
+                        (Some(value), None) => {
+                            // if primary key, deleted document
+                            if field_id == primary_key {
+                                // FIXME: we already extracted the external docid before. We should retrieve it in the typed chunk
+                                // rather than re-extract it here
+                                // FIXME: unwraps
+                                let document_id = serde_json::from_slice(value)
+                                    .map_err(InternalError::SerdeJson)
+                                    .unwrap();
+                                let external_id =
+                                    validate_document_id_value(document_id).unwrap().unwrap();
+                                operations.push(DocumentOperation {
+                                    external_id,
+                                    internal_id: docid,
+                                    kind: DocumentOperationKind::Delete,
+                                });
+                                docids.remove(docid);
+                            }
+                        }
+                        (Some(_), Some(value)) => {
+                            // updated field, write
+                            writer.insert(field_id, value)?;
+                        }
+                    }
                 }
 
                 let db = index.documents.remap_data_type::<ByteSlice>();
-                let docid = docid.try_into().map(DocumentId::from_be_bytes).unwrap();
 
-                if written {
+                if !writer.is_empty() {
                     db.put(wtxn, &BEU32::new(docid), &writer.into_inner().unwrap())?;
-                    docids.insert(docid);
                 } else {
                     db.delete(wtxn, &BEU32::new(docid))?;
-                    // FIXME: unwrap
-                    if !docids.remove(docid) {
-                        panic!("Attempt to remove a document id that doesn't exist")
-                    }
                 }
             }
+            let mut external_documents_docids = index.external_documents_ids(wtxn)?.into_static();
+            external_documents_docids.apply(operations);
+            index.put_external_documents_ids(wtxn, &external_documents_docids)?;
 
             index.put_documents_ids(wtxn, &docids)?;
         }

From 8370fbc92b488fc026e27907eddc66c9d1edc63f Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Wed, 25 Oct 2023 11:20:01 +0200
Subject: [PATCH 049/127] Fix snaps

---
 milli/src/snapshot_tests.rs | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs
index 4b21cc175..77d9f41ec 100644
--- a/milli/src/snapshot_tests.rs
+++ b/milli/src/snapshot_tests.rs
@@ -340,20 +340,12 @@ pub fn snap_geo_faceted_documents_ids(index: &Index) -> String {
 }
 pub fn snap_external_documents_ids(index: &Index) -> String {
     let rtxn = index.read_txn().unwrap();
-    let ExternalDocumentsIds { soft, hard, .. } = index.external_documents_ids(&rtxn).unwrap();
+    let external_ids = index.external_documents_ids(&rtxn).unwrap().to_hash_map();
 
     let mut snap = String::new();
 
-    writeln!(&mut snap, "soft:").unwrap();
-    let stream_soft = soft.stream();
-    let soft_external_ids = stream_soft.into_str_vec().unwrap();
-    for (key, id) in soft_external_ids {
-        writeln!(&mut snap, "{key:<24} {id}").unwrap();
-    }
-    writeln!(&mut snap, "hard:").unwrap();
-    let stream_hard = hard.stream();
-    let hard_external_ids = stream_hard.into_str_vec().unwrap();
-    for (key, id) in hard_external_ids {
+    writeln!(&mut snap, "docids:").unwrap();
+    for (key, id) in external_ids {
         writeln!(&mut snap, "{key:<24} {id}").unwrap();
     }
 

From 073f89db790c66483733d7a00695f692c55a457b Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Wed, 25 Oct 2023 11:22:30 +0200
Subject: [PATCH 050/127] Fix facet tests

---
 milli/src/update/facet/mod.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs
index 3465e5437..2b671e5cb 100644
--- a/milli/src/update/facet/mod.rs
+++ b/milli/src/update/facet/mod.rs
@@ -468,7 +468,7 @@ pub(crate) mod test_helpers {
                 min_level_size: self.min_level_size.get(),
             };
 
-            update.update(wtxn, field_ids, |_, _, _| Ok(())).unwrap();
+            update.update(wtxn, field_ids).unwrap();
         }
 
         pub fn verify_structure_validity(&self, txn: &RoTxn, field_id: u16) {

From 01d5eedf2f86958c4f70c35ed2f4ed8be8fa916b Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Wed, 25 Oct 2023 13:37:42 +0200
Subject: [PATCH 051/127] Remove some warnings

---
 milli/src/external_documents_ids.rs     | 13 ++--------
 milli/src/update/index_documents/mod.rs | 32 +++++--------------------
 2 files changed, 8 insertions(+), 37 deletions(-)

diff --git a/milli/src/external_documents_ids.rs b/milli/src/external_documents_ids.rs
index cd6a7e729..12db4eb1d 100644
--- a/milli/src/external_documents_ids.rs
+++ b/milli/src/external_documents_ids.rs
@@ -1,16 +1,12 @@
 use std::borrow::Cow;
 use std::collections::HashMap;
 use std::convert::TryInto;
-use std::{fmt, str};
+use std::fmt;
 
-use fst::map::IndexedValue;
-use fst::{IntoStreamer, Streamer};
-use roaring::RoaringBitmap;
+use fst::Streamer;
 
 use crate::DocumentId;
 
-const DELETED_ID: u64 = u64::MAX;
-
 pub enum DocumentOperationKind {
     Create,
     Delete,
@@ -133,8 +129,3 @@ impl Default for ExternalDocumentsIds<'static> {
         ExternalDocumentsIds(fst::Map::default().map_data(Cow::Owned).unwrap())
     }
 }
-
-/// Returns the value of the `IndexedValue` with the highest _index_.
-fn indexed_last_value(indexed_values: &[IndexedValue]) -> Option<u64> {
-    indexed_values.iter().copied().max_by_key(|iv| iv.index).map(|iv| iv.value)
-}
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 8d187a89d..7a77f3a96 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -377,11 +377,6 @@ where
         let index_documents_ids = self.index.documents_ids(self.wtxn)?;
         let index_is_empty = index_documents_ids.is_empty();
         let mut final_documents_ids = RoaringBitmap::new();
-        let mut word_pair_proximity_docids = None;
-        let mut word_position_docids = None;
-        let mut word_fid_docids = None;
-        let mut word_docids = None;
-        let mut exact_word_docids = None;
 
         let mut databases_seen = 0;
         (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
@@ -399,30 +394,15 @@ where
                     word_docids_reader,
                     exact_word_docids_reader,
                     word_fid_docids_reader,
-                } => {
-                    let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? };
-                    word_docids = Some(cloneable_chunk);
-                    let cloneable_chunk =
-                        unsafe { as_cloneable_grenad(&exact_word_docids_reader)? };
-                    exact_word_docids = Some(cloneable_chunk);
-                    let cloneable_chunk = unsafe { as_cloneable_grenad(&word_fid_docids_reader)? };
-                    word_fid_docids = Some(cloneable_chunk);
-                    TypedChunk::WordDocids {
-                        word_docids_reader,
-                        exact_word_docids_reader,
-                        word_fid_docids_reader,
-                    }
-                }
+                } => TypedChunk::WordDocids {
+                    word_docids_reader,
+                    exact_word_docids_reader,
+                    word_fid_docids_reader,
+                },
                 TypedChunk::WordPairProximityDocids(chunk) => {
-                    let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
-                    word_pair_proximity_docids = Some(cloneable_chunk);
                     TypedChunk::WordPairProximityDocids(chunk)
                 }
-                TypedChunk::WordPositionDocids(chunk) => {
-                    let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
-                    word_position_docids = Some(cloneable_chunk);
-                    TypedChunk::WordPositionDocids(chunk)
-                }
+                TypedChunk::WordPositionDocids(chunk) => TypedChunk::WordPositionDocids(chunk),
                 otherwise => otherwise,
             };
 

From 762b0b47e6275ac50f5903a923afb4c0ee7d63d9 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Wed, 25 Oct 2023 14:15:06 +0200
Subject: [PATCH 052/127] Use deladd merging function in chunks mergers

---
 .../src/update/index_documents/extract/mod.rs | 22 +++++++++----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
index 20ee38c4f..41722a53e 100644
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -28,8 +28,8 @@ use self::extract_word_docids::extract_word_docids;
 use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
 use self::extract_word_position_docids::extract_word_position_docids;
 use super::helpers::{
-    as_cloneable_grenad, merge_cbo_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn,
-    MergeableReader,
+    as_cloneable_grenad, merge_deladd_cbo_roaring_bitmaps, CursorClonableMmap, GrenadParameters,
+    MergeFn, MergeableReader,
 };
 use super::{helpers, TypedChunk};
 use crate::{FieldId, Result};
@@ -108,7 +108,7 @@ pub(crate) fn data_from_obkv_documents(
         let lmdb_writer_sx = lmdb_writer_sx.clone();
         rayon::spawn(move || {
             debug!("merge {} database", "facet-id-exists-docids");
-            match facet_exists_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) {
+            match facet_exists_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) {
                 Ok(reader) => {
                     let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetExistsDocids(reader)));
                 }
@@ -124,7 +124,7 @@ pub(crate) fn data_from_obkv_documents(
         let lmdb_writer_sx = lmdb_writer_sx.clone();
         rayon::spawn(move || {
             debug!("merge {} database", "facet-id-is-null-docids");
-            match facet_is_null_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) {
+            match facet_is_null_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) {
                 Ok(reader) => {
                     let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsNullDocids(reader)));
                 }
@@ -140,7 +140,7 @@ pub(crate) fn data_from_obkv_documents(
         let lmdb_writer_sx = lmdb_writer_sx.clone();
         rayon::spawn(move || {
             debug!("merge {} database", "facet-id-is-empty-docids");
-            match facet_is_empty_docids_chunks.merge(merge_cbo_roaring_bitmaps, &indexer) {
+            match facet_is_empty_docids_chunks.merge(merge_deladd_cbo_roaring_bitmaps, &indexer) {
                 Ok(reader) => {
                     let _ = lmdb_writer_sx.send(Ok(TypedChunk::FieldIdFacetIsEmptyDocids(reader)));
                 }
@@ -156,7 +156,7 @@ pub(crate) fn data_from_obkv_documents(
         indexer,
         lmdb_writer_sx.clone(),
         extract_word_pair_proximity_docids,
-        merge_cbo_roaring_bitmaps,
+        merge_deladd_cbo_roaring_bitmaps,
         TypedChunk::WordPairProximityDocids,
         "word-pair-proximity-docids",
     );
@@ -166,7 +166,7 @@ pub(crate) fn data_from_obkv_documents(
         indexer,
         lmdb_writer_sx.clone(),
         extract_fid_word_count_docids,
-        merge_cbo_roaring_bitmaps,
+        merge_deladd_cbo_roaring_bitmaps,
         TypedChunk::FieldIdWordCountDocids,
         "field-id-wordcount-docids",
     );
@@ -184,7 +184,7 @@ pub(crate) fn data_from_obkv_documents(
         indexer,
         lmdb_writer_sx.clone(),
         move |doc_word_pos, indexer| extract_word_docids(doc_word_pos, indexer, &exact_attributes),
-        merge_cbo_roaring_bitmaps,
+        merge_deladd_cbo_roaring_bitmaps,
         |(word_docids_reader, exact_word_docids_reader, word_fid_docids_reader)| {
             TypedChunk::WordDocids {
                 word_docids_reader,
@@ -200,7 +200,7 @@ pub(crate) fn data_from_obkv_documents(
         indexer,
         lmdb_writer_sx.clone(),
         extract_word_position_docids,
-        merge_cbo_roaring_bitmaps,
+        merge_deladd_cbo_roaring_bitmaps,
         TypedChunk::WordPositionDocids,
         "word-position-docids",
     );
@@ -210,7 +210,7 @@ pub(crate) fn data_from_obkv_documents(
         indexer,
         lmdb_writer_sx.clone(),
         extract_facet_string_docids,
-        merge_cbo_roaring_bitmaps,
+        merge_deladd_cbo_roaring_bitmaps,
         TypedChunk::FieldIdFacetStringDocids,
         "field-id-facet-string-docids",
     );
@@ -220,7 +220,7 @@ pub(crate) fn data_from_obkv_documents(
         indexer,
         lmdb_writer_sx,
         extract_facet_number_docids,
-        merge_cbo_roaring_bitmaps,
+        merge_deladd_cbo_roaring_bitmaps,
         TypedChunk::FieldIdFacetNumberDocids,
         "field-id-facet-number-docids",
     );

From d651b3ef01f69c9365ccf87a49de3c5f435c01f3 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Wed, 25 Oct 2023 13:38:56 +0200
Subject: [PATCH 053/127] Remove delete documents files

---
 milli/src/update/delete_documents.rs | 1249 --------------------------
 milli/src/update/facet/delete.rs     |  349 -------
 milli/src/update/facet/mod.rs        |    1 -
 milli/src/update/mod.rs              |    1 -
 4 files changed, 1600 deletions(-)
 delete mode 100644 milli/src/update/delete_documents.rs
 delete mode 100644 milli/src/update/facet/delete.rs

diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs
deleted file mode 100644
index 0299e1e4f..000000000
--- a/milli/src/update/delete_documents.rs
+++ /dev/null
@@ -1,1249 +0,0 @@
-use std::collections::btree_map::Entry;
-use std::collections::{BTreeSet, HashMap, HashSet};
-
-use fst::IntoStreamer;
-use heed::types::{ByteSlice, DecodeIgnore, Str, UnalignedSlice};
-use heed::{BytesDecode, BytesEncode, Database, RwIter};
-use instant_distance::PointId;
-use roaring::RoaringBitmap;
-use serde::{Deserialize, Serialize};
-use time::OffsetDateTime;
-
-use super::facet::delete::FacetsDelete;
-use super::ClearDocuments;
-use crate::error::InternalError;
-use crate::facet::FacetType;
-use crate::heed_codec::facet::FieldDocIdFacetCodec;
-use crate::heed_codec::CboRoaringBitmapCodec;
-use crate::index::Hnsw;
-use crate::{ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, Index, Result, BEU32};
-
-pub struct DeleteDocuments<'t, 'u, 'i> {
-    wtxn: &'t mut heed::RwTxn<'i, 'u>,
-    index: &'i Index,
-    external_documents_ids: ExternalDocumentsIds<'static>,
-    to_delete_docids: RoaringBitmap,
-    strategy: DeletionStrategy,
-}
-
-/// Result of a [`DeleteDocuments`] operation.
-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
-pub struct DocumentDeletionResult {
-    pub deleted_documents: u64,
-    pub remaining_documents: u64,
-}
-
-/// Strategy for deleting documents.
-///
-/// - Soft-deleted documents are simply marked as deleted without being actually removed from DB.
-/// - Hard-deleted documents are definitely suppressed from the DB.
-///
-/// Soft-deleted documents trade disk space for runtime performance.
-///
-/// Note that any of these variants can be used at any given moment for any indexation in a database.
-/// For instance, you can use an [`AlwaysSoft`] followed by an [`AlwaysHard`] option without issue.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Default)]
-pub enum DeletionStrategy {
-    #[default]
-    /// Definitely suppress documents according to the number or size of soft-deleted documents
-    Dynamic,
-    /// Never definitely suppress documents
-    AlwaysSoft,
-    /// Always definitely suppress documents
-    AlwaysHard,
-}
-
-impl std::fmt::Display for DeletionStrategy {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
-            DeletionStrategy::Dynamic => write!(f, "dynamic"),
-            DeletionStrategy::AlwaysSoft => write!(f, "always_soft"),
-            DeletionStrategy::AlwaysHard => write!(f, "always_hard"),
-        }
-    }
-}
-
-/// Result of a [`DeleteDocuments`] operation, used for internal purposes.
-///
-/// It is a superset of the [`DocumentDeletionResult`] structure, giving
-/// additional information about the algorithm used to delete the documents.
-#[derive(Debug)]
-pub(crate) struct DetailedDocumentDeletionResult {
-    pub deleted_documents: u64,
-    pub remaining_documents: u64,
-}
-
-impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
-    pub fn new(
-        wtxn: &'t mut heed::RwTxn<'i, 'u>,
-        index: &'i Index,
-    ) -> Result<DeleteDocuments<'t, 'u, 'i>> {
-        let external_documents_ids = index.external_documents_ids(wtxn)?.into_static();
-
-        Ok(DeleteDocuments {
-            wtxn,
-            index,
-            external_documents_ids,
-            to_delete_docids: RoaringBitmap::new(),
-            strategy: Default::default(),
-        })
-    }
-
-    pub fn strategy(&mut self, strategy: DeletionStrategy) {
-        self.strategy = strategy;
-    }
-
-    pub fn delete_document(&mut self, docid: u32) {
-        self.to_delete_docids.insert(docid);
-    }
-
-    pub fn delete_documents(&mut self, docids: &RoaringBitmap) {
-        self.to_delete_docids |= docids;
-    }
-
-    pub fn delete_external_id(&mut self, external_id: &str) -> Option<u32> {
-        let docid = self.external_documents_ids.get(external_id)?;
-        self.delete_document(docid);
-        Some(docid)
-    }
-
-    pub fn execute(self) -> Result<DocumentDeletionResult> {
-        let DetailedDocumentDeletionResult { deleted_documents, remaining_documents } =
-            self.execute_inner()?;
-
-        Ok(DocumentDeletionResult { deleted_documents, remaining_documents })
-    }
-
-    pub(crate) fn execute_inner(mut self) -> Result<DetailedDocumentDeletionResult> {
-        puffin::profile_function!();
-
-        self.index.set_updated_at(self.wtxn, &OffsetDateTime::now_utc())?;
-
-        // We retrieve the current documents ids that are in the database.
-        let mut documents_ids = self.index.documents_ids(self.wtxn)?;
-        let mut soft_deleted_docids = self.index.soft_deleted_documents_ids(self.wtxn)?;
-        let current_documents_ids_len = documents_ids.len();
-
-        // We can and must stop removing documents in a database that is empty.
-        if documents_ids.is_empty() {
-            // but if there was still documents to delete we clear the database entirely
-            if !soft_deleted_docids.is_empty() {
-                ClearDocuments::new(self.wtxn, self.index).execute()?;
-            }
-            return Ok(DetailedDocumentDeletionResult {
-                deleted_documents: 0,
-                remaining_documents: 0,
-            });
-        }
-
-        // We remove the documents ids that we want to delete
-        // from the documents in the database and write them back.
-        documents_ids -= &self.to_delete_docids;
-        self.index.put_documents_ids(self.wtxn, &documents_ids)?;
-
-        // We can execute a ClearDocuments operation when the number of documents
-        // to delete is exactly the number of documents in the database.
-        if current_documents_ids_len == self.to_delete_docids.len() {
-            let remaining_documents = ClearDocuments::new(self.wtxn, self.index).execute()?;
-            return Ok(DetailedDocumentDeletionResult {
-                deleted_documents: current_documents_ids_len,
-                remaining_documents,
-            });
-        }
-
-        let fields_ids_map = self.index.fields_ids_map(self.wtxn)?;
-        let mut field_distribution = self.index.field_distribution(self.wtxn)?;
-
-        // we update the field distribution
-        for docid in self.to_delete_docids.iter() {
-            let key = BEU32::new(docid);
-            let document =
-                self.index.documents.get(self.wtxn, &key)?.ok_or(
-                    InternalError::DatabaseMissingEntry { db_name: "documents", key: None },
-                )?;
-            for (fid, _value) in document.iter() {
-                let field_name =
-                    fields_ids_map.name(fid).ok_or(FieldIdMapMissingEntry::FieldId {
-                        field_id: fid,
-                        process: "delete documents",
-                    })?;
-                if let Entry::Occupied(mut entry) = field_distribution.entry(field_name.to_string())
-                {
-                    match entry.get().checked_sub(1) {
-                        Some(0) | None => entry.remove(),
-                        Some(count) => entry.insert(count),
-                    };
-                }
-            }
-        }
-
-        self.index.put_field_distribution(self.wtxn, &field_distribution)?;
-
-        soft_deleted_docids |= &self.to_delete_docids;
-
-        // We always soft-delete the documents, even if they will be permanently
-        // deleted immediately after.
-        self.index.put_soft_deleted_documents_ids(self.wtxn, &soft_deleted_docids)?;
-
-        // decide for a hard or soft deletion depending on the strategy
-        let soft_deletion = match self.strategy {
-            DeletionStrategy::Dynamic => {
-                // decide to keep the soft deleted in the DB for now if they meet 2 criteria:
-                // 1. There is less than a fixed rate of 50% of soft-deleted to actual documents, *and*
-                // 2. Soft-deleted occupy an average of less than a fixed size on disk
-
-                let size_used = self.index.used_size()?;
-                let nb_documents = self.index.number_of_documents(self.wtxn)?;
-                let nb_soft_deleted = soft_deleted_docids.len();
-
-                (nb_soft_deleted < nb_documents) && {
-                    const SOFT_DELETED_SIZE_BYTE_THRESHOLD: u64 = 1_073_741_824; // 1GiB
-
-                    // nb_documents + nb_soft_deleted !=0 because if nb_documents is 0 we short-circuit earlier, and then we moved the documents to delete
-                    // from the documents_docids to the soft_deleted_docids.
-                    let estimated_document_size = size_used / (nb_documents + nb_soft_deleted);
-                    let estimated_size_used_by_soft_deleted =
-                        estimated_document_size * nb_soft_deleted;
-                    estimated_size_used_by_soft_deleted < SOFT_DELETED_SIZE_BYTE_THRESHOLD
-                }
-            }
-            DeletionStrategy::AlwaysSoft => true,
-            DeletionStrategy::AlwaysHard => false,
-        };
-
-        if soft_deletion {
-            // Keep the soft-deleted in the DB
-            return Ok(DetailedDocumentDeletionResult {
-                deleted_documents: self.to_delete_docids.len(),
-                remaining_documents: documents_ids.len(),
-            });
-        }
-
-        self.to_delete_docids = soft_deleted_docids;
-
-        let Index {
-            env: _env,
-            main: _main,
-            word_docids,
-            exact_word_docids,
-            word_prefix_docids,
-            exact_word_prefix_docids,
-            word_pair_proximity_docids,
-            field_id_word_count_docids,
-            word_prefix_pair_proximity_docids,
-            prefix_word_pair_proximity_docids,
-            word_position_docids,
-            word_prefix_position_docids,
-            word_fid_docids,
-            word_prefix_fid_docids,
-            facet_id_f64_docids: _,
-            facet_id_string_docids: _,
-            facet_id_normalized_string_strings: _,
-            facet_id_string_fst: _,
-            field_id_docid_facet_f64s: _,
-            field_id_docid_facet_strings: _,
-            script_language_docids,
-            facet_id_exists_docids,
-            facet_id_is_null_docids,
-            facet_id_is_empty_docids,
-            vector_id_docid,
-            documents,
-        } = self.index;
-        // Remove from the documents database
-        for docid in &self.to_delete_docids {
-            documents.delete(self.wtxn, &BEU32::new(docid))?;
-        }
-        // We acquire the current external documents ids map...
-        // Note that its soft-deleted document ids field will be equal to the `to_delete_docids`
-        //let mut new_external_documents_ids = self.index.external_documents_ids(self.wtxn)?;
-        // We then remove the soft-deleted docids from it
-        //new_external_documents_ids.delete_soft_deleted_documents_ids_from_fsts()?;
-        // and write it back to the main database.
-        //let new_external_documents_ids = new_external_documents_ids.into_static();
-        //self.index.put_external_documents_ids(self.wtxn, &new_external_documents_ids)?;
-
-        todo!("please autobatch deletions for now");
-
-        let mut words_to_keep = BTreeSet::default();
-        let mut words_to_delete = BTreeSet::default();
-        // We iterate over the words and delete the documents ids
-        // from the word docids database.
-        remove_from_word_docids(
-            self.wtxn,
-            word_docids,
-            &self.to_delete_docids,
-            &mut words_to_keep,
-            &mut words_to_delete,
-        )?;
-        remove_from_word_docids(
-            self.wtxn,
-            exact_word_docids,
-            &self.to_delete_docids,
-            &mut words_to_keep,
-            &mut words_to_delete,
-        )?;
-
-        // We construct an FST set that contains the words to delete from the words FST.
-        let words_to_delete = fst::Set::from_iter(words_to_delete.difference(&words_to_keep))?;
-
-        let new_words_fst = {
-            // We retrieve the current words FST from the database.
-            let words_fst = self.index.words_fst(self.wtxn)?;
-            let difference = words_fst.op().add(&words_to_delete).difference();
-
-            // We stream the new external ids that does no more contains the to-delete external ids.
-            let mut new_words_fst_builder = fst::SetBuilder::memory();
-            new_words_fst_builder.extend_stream(difference.into_stream())?;
-
-            // We create an words FST set from the above builder.
-            new_words_fst_builder.into_set()
-        };
-
-        // We write the new words FST into the main database.
-        self.index.put_words_fst(self.wtxn, &new_words_fst)?;
-
-        let prefixes_to_delete =
-            remove_from_word_prefix_docids(self.wtxn, word_prefix_docids, &self.to_delete_docids)?;
-
-        let exact_prefix_to_delete = remove_from_word_prefix_docids(
-            self.wtxn,
-            exact_word_prefix_docids,
-            &self.to_delete_docids,
-        )?;
-
-        let all_prefixes_to_delete = prefixes_to_delete.op().add(&exact_prefix_to_delete).union();
-
-        // We compute the new prefix FST and write it only if there is a change.
-        if !prefixes_to_delete.is_empty() || !exact_prefix_to_delete.is_empty() {
-            let new_words_prefixes_fst = {
-                // We retrieve the current words prefixes FST from the database.
-                let words_prefixes_fst = self.index.words_prefixes_fst(self.wtxn)?;
-                let difference =
-                    words_prefixes_fst.op().add(all_prefixes_to_delete.into_stream()).difference();
-
-                // We stream the new external ids that does no more contains the to-delete external ids.
-                let mut new_words_prefixes_fst_builder = fst::SetBuilder::memory();
-                new_words_prefixes_fst_builder.extend_stream(difference.into_stream())?;
-
-                // We create an words FST set from the above builder.
-                new_words_prefixes_fst_builder.into_set()
-            };
-
-            // We write the new words prefixes FST into the main database.
-            self.index.put_words_prefixes_fst(self.wtxn, &new_words_prefixes_fst)?;
-        }
-
-        for db in [word_prefix_pair_proximity_docids, prefix_word_pair_proximity_docids] {
-            // We delete the documents ids from the word prefix pair proximity database docids
-            // and remove the empty pairs too.
-            Self::delete_from_db(db.iter_mut(self.wtxn)?.remap_key_type(), &self.to_delete_docids)?;
-        }
-        Self::delete_from_db(
-            word_pair_proximity_docids.iter_mut(self.wtxn)?.remap_key_type(),
-            &self.to_delete_docids,
-        )?;
-        Self::delete_from_db(
-            word_position_docids.iter_mut(self.wtxn)?.remap_key_type(),
-            &self.to_delete_docids,
-        )?;
-        Self::delete_from_db(
-            word_prefix_position_docids.iter_mut(self.wtxn)?.remap_key_type(),
-            &self.to_delete_docids,
-        )?;
-        Self::delete_from_db(
-            word_fid_docids.iter_mut(self.wtxn)?.remap_key_type(),
-            &self.to_delete_docids,
-        )?;
-        Self::delete_from_db(
-            word_prefix_fid_docids.iter_mut(self.wtxn)?.remap_key_type(),
-            &self.to_delete_docids,
-        )?;
-
-        // Remove the documents ids from the field id word count database.
-        Self::delete_from_db(
-            field_id_word_count_docids.iter_mut(self.wtxn)?.remap_key_type(),
-            &self.to_delete_docids,
-        )?;
-
-        if let Some(mut rtree) = self.index.geo_rtree(self.wtxn)? {
-            let mut geo_faceted_doc_ids = self.index.geo_faceted_documents_ids(self.wtxn)?;
-
-            let (points_to_remove, docids_to_remove): (Vec<_>, RoaringBitmap) = rtree
-                .iter()
-                .filter(|&point| self.to_delete_docids.contains(point.data.0))
-                .cloned()
-                .map(|point| (point, point.data.0))
-                .unzip();
-            points_to_remove.iter().for_each(|point| {
-                rtree.remove(point);
-            });
-            geo_faceted_doc_ids -= docids_to_remove;
-
-            self.index.put_geo_rtree(self.wtxn, &rtree)?;
-            self.index.put_geo_faceted_documents_ids(self.wtxn, &geo_faceted_doc_ids)?;
-        }
-
-        for facet_type in [FacetType::Number, FacetType::String] {
-            let mut affected_facet_values = HashMap::new();
-            for field_id in self.index.faceted_fields_ids(self.wtxn)? {
-                let facet_values = remove_docids_from_field_id_docid_facet_value(
-                    self.index,
-                    self.wtxn,
-                    facet_type,
-                    field_id,
-                    &self.to_delete_docids,
-                )?;
-                if !facet_values.is_empty() {
-                    affected_facet_values.insert(field_id, facet_values);
-                }
-            }
-            FacetsDelete::new(
-                self.index,
-                facet_type,
-                affected_facet_values,
-                &self.to_delete_docids,
-            )
-            .execute(self.wtxn)?;
-        }
-
-        // Remove the documents ids from the script language database.
-        Self::delete_from_db(
-            script_language_docids.iter_mut(self.wtxn)?.remap_key_type(),
-            &self.to_delete_docids,
-        )?;
-        // We delete the documents ids that are under the facet field id values.
-        remove_docids_from_facet_id_docids(
-            self.wtxn,
-            facet_id_exists_docids,
-            &self.to_delete_docids,
-        )?;
-
-        // We delete the documents ids that are under the facet field id values.
-        remove_docids_from_facet_id_docids(
-            self.wtxn,
-            facet_id_is_null_docids,
-            &self.to_delete_docids,
-        )?;
-
-        // We delete the documents ids that are under the facet field id values.
-        remove_docids_from_facet_id_docids(
-            self.wtxn,
-            facet_id_is_empty_docids,
-            &self.to_delete_docids,
-        )?;
-
-        // An ugly and slow way to remove the vectors from the HNSW
-        // It basically reconstructs the HNSW from scratch without editing the current one.
-        if let Some(current_hnsw) = self.index.vector_hnsw(self.wtxn)? {
-            let mut points = Vec::new();
-            let mut docids = Vec::new();
-            for result in vector_id_docid.iter(self.wtxn)? {
-                let (vector_id, docid) = result?;
-                if !self.to_delete_docids.contains(docid.get()) {
-                    let pid = PointId::from(vector_id.get());
-                    let vector = current_hnsw[pid].clone();
-                    points.push(vector);
-                    docids.push(docid);
-                }
-            }
-
-            let (new_hnsw, pids) = Hnsw::builder().build_hnsw(points);
-
-            vector_id_docid.clear(self.wtxn)?;
-            for (pid, docid) in pids.into_iter().zip(docids) {
-                vector_id_docid.put(self.wtxn, &BEU32::new(pid.into_inner()), &docid)?;
-            }
-            self.index.put_vector_hnsw(self.wtxn, &new_hnsw)?;
-        }
-
-        self.index.put_soft_deleted_documents_ids(self.wtxn, &RoaringBitmap::new())?;
-
-        Ok(DetailedDocumentDeletionResult {
-            deleted_documents: self.to_delete_docids.len(),
-            remaining_documents: documents_ids.len(),
-        })
-    }
-
-    fn delete_from_db<C>(
-        mut iter: RwIter<UnalignedSlice<u8>, C>,
-        to_delete_docids: &RoaringBitmap,
-    ) -> Result<()>
-    where
-        C: for<'a> BytesDecode<'a, DItem = RoaringBitmap>
-            + for<'a> BytesEncode<'a, EItem = RoaringBitmap>,
-    {
-        puffin::profile_function!();
-
-        while let Some(result) = iter.next() {
-            let (bytes, mut docids) = result?;
-            let previous_len = docids.len();
-            docids -= to_delete_docids;
-            if docids.is_empty() {
-                // safety: we don't keep references from inside the LMDB database.
-                unsafe { iter.del_current()? };
-            } else if docids.len() != previous_len {
-                let bytes = bytes.to_owned();
-                // safety: we don't keep references from inside the LMDB database.
-                unsafe { iter.put_current(&bytes, &docids)? };
-            }
-        }
-        Ok(())
-    }
-}
-
-fn remove_from_word_prefix_docids(
-    txn: &mut heed::RwTxn,
-    db: &Database<Str, CboRoaringBitmapCodec>,
-    to_remove: &RoaringBitmap,
-) -> Result<fst::Set<Vec<u8>>> {
-    puffin::profile_function!();
-
-    let mut prefixes_to_delete = fst::SetBuilder::memory();
-
-    // We iterate over the word prefix docids database and remove the deleted documents ids
-    // from every docids lists. We register the empty prefixes in an fst Set for futur deletion.
-    let mut iter = db.iter_mut(txn)?;
-    while let Some(result) = iter.next() {
-        let (prefix, mut docids) = result?;
-        let prefix = prefix.to_owned();
-        let previous_len = docids.len();
-        docids -= to_remove;
-        if docids.is_empty() {
-            // safety: we don't keep references from inside the LMDB database.
-            unsafe { iter.del_current()? };
-            prefixes_to_delete.insert(prefix)?;
-        } else if docids.len() != previous_len {
-            // safety: we don't keep references from inside the LMDB database.
-            unsafe { iter.put_current(&prefix, &docids)? };
-        }
-    }
-
-    Ok(prefixes_to_delete.into_set())
-}
-
-fn remove_from_word_docids(
-    txn: &mut heed::RwTxn,
-    db: &heed::Database<Str, CboRoaringBitmapCodec>,
-    to_remove: &RoaringBitmap,
-    words_to_keep: &mut BTreeSet<String>,
-    words_to_remove: &mut BTreeSet<String>,
-) -> Result<()> {
-    puffin::profile_function!();
-
-    // We create an iterator to be able to get the content and delete the word docids.
-    // It's faster to acquire a cursor to get and delete or put, as we avoid traversing
-    // the LMDB B-Tree two times but only once.
-    let mut iter = db.iter_mut(txn)?;
-    while let Some((key, mut docids)) = iter.next().transpose()? {
-        let previous_len = docids.len();
-        docids -= to_remove;
-        if docids.is_empty() {
-            // safety: we don't keep references from inside the LMDB database.
-            unsafe { iter.del_current()? };
-            words_to_remove.insert(key.to_owned());
-        } else {
-            words_to_keep.insert(key.to_owned());
-            if docids.len() != previous_len {
-                let key = key.to_owned();
-                // safety: we don't keep references from inside the LMDB database.
-                unsafe { iter.put_current(&key, &docids)? };
-            }
-        }
-    }
-
-    Ok(())
-}
-
-fn remove_docids_from_field_id_docid_facet_value(
-    index: &Index,
-    wtxn: &mut heed::RwTxn,
-    facet_type: FacetType,
-    field_id: FieldId,
-    to_remove: &RoaringBitmap,
-) -> heed::Result<HashSet<Vec<u8>>> {
-    puffin::profile_function!();
-
-    let db = match facet_type {
-        FacetType::String => {
-            index.field_id_docid_facet_strings.remap_types::<ByteSlice, DecodeIgnore>()
-        }
-        FacetType::Number => {
-            index.field_id_docid_facet_f64s.remap_types::<ByteSlice, DecodeIgnore>()
-        }
-    };
-    let mut all_affected_facet_values = HashSet::default();
-    let mut iter = db
-        .prefix_iter_mut(wtxn, &field_id.to_be_bytes())?
-        .remap_key_type::<FieldDocIdFacetCodec<ByteSlice>>();
-
-    while let Some(result) = iter.next() {
-        let ((_, docid, facet_value), _) = result?;
-        if to_remove.contains(docid) {
-            if !all_affected_facet_values.contains(facet_value) {
-                all_affected_facet_values.insert(facet_value.to_owned());
-            }
-            // safety: we don't keep references from inside the LMDB database.
-            unsafe { iter.del_current()? };
-        }
-    }
-
-    Ok(all_affected_facet_values)
-}
-
-fn remove_docids_from_facet_id_docids<'a, C>(
-    wtxn: &'a mut heed::RwTxn,
-    db: &heed::Database<C, CboRoaringBitmapCodec>,
-    to_remove: &RoaringBitmap,
-) -> heed::Result<()>
-where
-    C: heed::BytesDecode<'a> + heed::BytesEncode<'a>,
-{
-    puffin::profile_function!();
-
-    let mut iter = db.remap_key_type::<ByteSlice>().iter_mut(wtxn)?;
-    while let Some(result) = iter.next() {
-        let (bytes, mut docids) = result?;
-        let previous_len = docids.len();
-        docids -= to_remove;
-        if docids.is_empty() {
-            // safety: we don't keep references from inside the LMDB database.
-            unsafe { iter.del_current()? };
-        } else if docids.len() != previous_len {
-            let bytes = bytes.to_owned();
-            // safety: we don't keep references from inside the LMDB database.
-            unsafe { iter.put_current(&bytes, &docids)? };
-        }
-    }
-
-    Ok(())
-}
-
-#[cfg(test)]
-mod tests {
-    use big_s::S;
-    use heed::RwTxn;
-    use maplit::hashset;
-
-    use super::*;
-    use crate::index::tests::TempIndex;
-    use crate::{db_snap, Filter, Search};
-
-    fn delete_documents<'t>(
-        wtxn: &mut RwTxn<'t, '_>,
-        index: &'t Index,
-        external_ids: &[&str],
-        strategy: DeletionStrategy,
-    ) -> Vec<u32> {
-        let external_document_ids = index.external_documents_ids(wtxn).unwrap();
-        let ids_to_delete: Vec<u32> = external_ids
-            .iter()
-            .map(|id| external_document_ids.get(id.as_bytes()).unwrap())
-            .collect();
-
-        // Delete some documents.
-        let mut builder = DeleteDocuments::new(wtxn, index).unwrap();
-        builder.strategy(strategy);
-        external_ids.iter().for_each(|id| {
-            builder.delete_external_id(id);
-        });
-        builder.execute().unwrap();
-
-        ids_to_delete
-    }
-
-    fn delete_documents_with_numbers_as_primary_key_(deletion_strategy: DeletionStrategy) {
-        let index = TempIndex::new();
-
-        let mut wtxn = index.write_txn().unwrap();
-        index
-            .add_documents_using_wtxn(
-                &mut wtxn,
-                documents!([
-                    { "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } },
-                    { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] },
-                    { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] }
-                ]),
-            )
-            .unwrap();
-
-        // delete those documents, ids are synchronous therefore 0, 1, and 2.
-        let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
-        builder.delete_document(0);
-        builder.delete_document(1);
-        builder.delete_document(2);
-        builder.strategy(deletion_strategy);
-        builder.execute().unwrap();
-
-        wtxn.commit().unwrap();
-
-        // All these snapshots should be empty since the database was cleared
-        db_snap!(index, documents_ids, deletion_strategy);
-        db_snap!(index, word_docids, deletion_strategy);
-        db_snap!(index, word_pair_proximity_docids, deletion_strategy);
-        db_snap!(index, facet_id_exists_docids, deletion_strategy);
-        db_snap!(index, soft_deleted_documents_ids, deletion_strategy);
-
-        let rtxn = index.read_txn().unwrap();
-
-        assert!(index.field_distribution(&rtxn).unwrap().is_empty());
-    }
-
-    #[test]
-    fn delete_documents_with_numbers_as_primary_key() {
-        delete_documents_with_numbers_as_primary_key_(DeletionStrategy::AlwaysHard);
-        delete_documents_with_numbers_as_primary_key_(DeletionStrategy::AlwaysSoft);
-    }
-
-    fn delete_documents_with_strange_primary_key_(strategy: DeletionStrategy) {
-        let index = TempIndex::new();
-
-        index
-            .update_settings(|settings| settings.set_searchable_fields(vec!["name".to_string()]))
-            .unwrap();
-
-        let mut wtxn = index.write_txn().unwrap();
-        index
-            .add_documents_using_wtxn(
-                &mut wtxn,
-                documents!([
-                    { "mysuperid": 0, "name": "kevin" },
-                    { "mysuperid": 1, "name": "kevina" },
-                    { "mysuperid": 2, "name": "benoit" }
-                ]),
-            )
-            .unwrap();
-        wtxn.commit().unwrap();
-
-        let mut wtxn = index.write_txn().unwrap();
-
-        // Delete not all of the documents but some of them.
-        let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
-        builder.delete_external_id("0");
-        builder.delete_external_id("1");
-        builder.strategy(strategy);
-        builder.execute().unwrap();
-        wtxn.commit().unwrap();
-
-        db_snap!(index, documents_ids, strategy);
-        db_snap!(index, word_docids, strategy);
-        db_snap!(index, word_pair_proximity_docids, strategy);
-        db_snap!(index, soft_deleted_documents_ids, strategy);
-    }
-
-    #[test]
-    fn delete_documents_with_strange_primary_key() {
-        delete_documents_with_strange_primary_key_(DeletionStrategy::AlwaysHard);
-        delete_documents_with_strange_primary_key_(DeletionStrategy::AlwaysSoft);
-    }
-
-    fn filtered_placeholder_search_should_not_return_deleted_documents_(
-        deletion_strategy: DeletionStrategy,
-    ) {
-        let index = TempIndex::new();
-
-        let mut wtxn = index.write_txn().unwrap();
-
-        index
-            .update_settings_using_wtxn(&mut wtxn, |settings| {
-                settings.set_primary_key(S("docid"));
-                settings.set_filterable_fields(hashset! { S("label"), S("label2") });
-            })
-            .unwrap();
-
-        index
-            .add_documents_using_wtxn(
-                &mut wtxn,
-                documents!([
-                    { "docid": "1_4",  "label": ["sign"] },
-                    { "docid": "1_5",  "label": ["letter"] },
-                    { "docid": "1_7",  "label": ["abstract","cartoon","design","pattern"] },
-                    { "docid": "1_36", "label": ["drawing","painting","pattern"] },
-                    { "docid": "1_37", "label": ["art","drawing","outdoor"] },
-                    { "docid": "1_38", "label": ["aquarium","art","drawing"] },
-                    { "docid": "1_39", "label": ["abstract"] },
-                    { "docid": "1_40", "label": ["cartoon"] },
-                    { "docid": "1_41", "label": ["art","drawing"] },
-                    { "docid": "1_42", "label": ["art","pattern"] },
-                    { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
-                    { "docid": "1_44", "label": ["drawing"] },
-                    { "docid": "1_45", "label": ["art"] },
-                    { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
-                    { "docid": "1_47", "label": ["abstract","pattern"] },
-                    { "docid": "1_52", "label": ["abstract","cartoon"] },
-                    { "docid": "1_57", "label": ["abstract","drawing","pattern"] },
-                    { "docid": "1_58", "label": ["abstract","art","cartoon"] },
-                    { "docid": "1_68", "label": ["design"] },
-                    { "docid": "1_69", "label": ["geometry"] },
-                    { "docid": "1_70", "label2": ["geometry", 1.2] },
-                    { "docid": "1_71", "label2": ["design", 2.2] },
-                    { "docid": "1_72", "label2": ["geometry", 1.2] }
-                ]),
-            )
-            .unwrap();
-
-        delete_documents(&mut wtxn, &index, &["1_4", "1_70", "1_72"], deletion_strategy);
-
-        // Placeholder search with filter
-        let filter = Filter::from_str("label = sign").unwrap().unwrap();
-        let results = index.search(&wtxn).filter(filter).execute().unwrap();
-        assert!(results.documents_ids.is_empty());
-
-        wtxn.commit().unwrap();
-
-        db_snap!(index, soft_deleted_documents_ids, deletion_strategy);
-        db_snap!(index, word_docids, deletion_strategy);
-        db_snap!(index, facet_id_f64_docids, deletion_strategy);
-        db_snap!(index, word_pair_proximity_docids, deletion_strategy);
-        db_snap!(index, facet_id_exists_docids, deletion_strategy);
-        db_snap!(index, facet_id_string_docids, deletion_strategy);
-    }
-
-    #[test]
-    fn filtered_placeholder_search_should_not_return_deleted_documents() {
-        filtered_placeholder_search_should_not_return_deleted_documents_(
-            DeletionStrategy::AlwaysHard,
-        );
-        filtered_placeholder_search_should_not_return_deleted_documents_(
-            DeletionStrategy::AlwaysSoft,
-        );
-    }
-
-    fn placeholder_search_should_not_return_deleted_documents_(
-        deletion_strategy: DeletionStrategy,
-    ) {
-        let index = TempIndex::new();
-
-        let mut wtxn = index.write_txn().unwrap();
-        index
-            .update_settings_using_wtxn(&mut wtxn, |settings| {
-                settings.set_primary_key(S("docid"));
-            })
-            .unwrap();
-
-        index
-            .add_documents_using_wtxn(
-                &mut wtxn,
-                documents!([
-                    { "docid": "1_4",  "label": ["sign"] },
-                    { "docid": "1_5",  "label": ["letter"] },
-                    { "docid": "1_7",  "label": ["abstract","cartoon","design","pattern"] },
-                    { "docid": "1_36", "label": ["drawing","painting","pattern"] },
-                    { "docid": "1_37", "label": ["art","drawing","outdoor"] },
-                    { "docid": "1_38", "label": ["aquarium","art","drawing"] },
-                    { "docid": "1_39", "label": ["abstract"] },
-                    { "docid": "1_40", "label": ["cartoon"] },
-                    { "docid": "1_41", "label": ["art","drawing"] },
-                    { "docid": "1_42", "label": ["art","pattern"] },
-                    { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
-                    { "docid": "1_44", "label": ["drawing"] },
-                    { "docid": "1_45", "label": ["art"] },
-                    { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
-                    { "docid": "1_47", "label": ["abstract","pattern"] },
-                    { "docid": "1_52", "label": ["abstract","cartoon"] },
-                    { "docid": "1_57", "label": ["abstract","drawing","pattern"] },
-                    { "docid": "1_58", "label": ["abstract","art","cartoon"] },
-                    { "docid": "1_68", "label": ["design"] },
-                    { "docid": "1_69", "label": ["geometry"] },
-                    { "docid": "1_70", "label2": ["geometry", 1.2] },
-                    { "docid": "1_71", "label2": ["design", 2.2] },
-                    { "docid": "1_72", "label2": ["geometry", 1.2] }
-                ]),
-            )
-            .unwrap();
-
-        let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_4"], deletion_strategy);
-
-        // Placeholder search
-        let results = index.search(&wtxn).execute().unwrap();
-        assert!(!results.documents_ids.is_empty());
-        for id in results.documents_ids.iter() {
-            assert!(
-                !deleted_internal_ids.contains(id),
-                "The document {} was supposed to be deleted",
-                id
-            );
-        }
-
-        wtxn.commit().unwrap();
-    }
-
-    #[test]
-    fn placeholder_search_should_not_return_deleted_documents() {
-        placeholder_search_should_not_return_deleted_documents_(DeletionStrategy::AlwaysHard);
-        placeholder_search_should_not_return_deleted_documents_(DeletionStrategy::AlwaysSoft);
-    }
-
-    fn search_should_not_return_deleted_documents_(deletion_strategy: DeletionStrategy) {
-        let index = TempIndex::new();
-
-        let mut wtxn = index.write_txn().unwrap();
-        index
-            .update_settings_using_wtxn(&mut wtxn, |settings| {
-                settings.set_primary_key(S("docid"));
-            })
-            .unwrap();
-
-        index
-            .add_documents_using_wtxn(
-                &mut wtxn,
-                documents!([
-                    { "docid": "1_4",  "label": ["sign"] },
-                    { "docid": "1_5",  "label": ["letter"] },
-                    { "docid": "1_7",  "label": ["abstract","cartoon","design","pattern"] },
-                    { "docid": "1_36", "label": ["drawing","painting","pattern"] },
-                    { "docid": "1_37", "label": ["art","drawing","outdoor"] },
-                    { "docid": "1_38", "label": ["aquarium","art","drawing"] },
-                    { "docid": "1_39", "label": ["abstract"] },
-                    { "docid": "1_40", "label": ["cartoon"] },
-                    { "docid": "1_41", "label": ["art","drawing"] },
-                    { "docid": "1_42", "label": ["art","pattern"] },
-                    { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
-                    { "docid": "1_44", "label": ["drawing"] },
-                    { "docid": "1_45", "label": ["art"] },
-                    { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
-                    { "docid": "1_47", "label": ["abstract","pattern"] },
-                    { "docid": "1_52", "label": ["abstract","cartoon"] },
-                    { "docid": "1_57", "label": ["abstract","drawing","pattern"] },
-                    { "docid": "1_58", "label": ["abstract","art","cartoon"] },
-                    { "docid": "1_68", "label": ["design"] },
-                    { "docid": "1_69", "label": ["geometry"] },
-                    { "docid": "1_70", "label2": ["geometry", 1.2] },
-                    { "docid": "1_71", "label2": ["design", 2.2] },
-                    { "docid": "1_72", "label2": ["geometry", 1.2] }
-                ]),
-            )
-            .unwrap();
-
-        let deleted_internal_ids =
-            delete_documents(&mut wtxn, &index, &["1_7", "1_52"], deletion_strategy);
-
-        // search for abstract
-        let results = index.search(&wtxn).query("abstract").execute().unwrap();
-        assert!(!results.documents_ids.is_empty());
-        for id in results.documents_ids.iter() {
-            assert!(
-                !deleted_internal_ids.contains(id),
-                "The document {} was supposed to be deleted",
-                id
-            );
-        }
-
-        wtxn.commit().unwrap();
-
-        db_snap!(index, soft_deleted_documents_ids, deletion_strategy);
-    }
-
-    #[test]
-    fn search_should_not_return_deleted_documents() {
-        search_should_not_return_deleted_documents_(DeletionStrategy::AlwaysHard);
-        search_should_not_return_deleted_documents_(DeletionStrategy::AlwaysSoft);
-    }
-
-    fn geo_filtered_placeholder_search_should_not_return_deleted_documents_(
-        deletion_strategy: DeletionStrategy,
-    ) {
-        let index = TempIndex::new();
-
-        let mut wtxn = index.write_txn().unwrap();
-        index
-            .update_settings_using_wtxn(&mut wtxn, |settings| {
-                settings.set_primary_key(S("id"));
-                settings.set_filterable_fields(hashset!(S("_geo")));
-                settings.set_sortable_fields(hashset!(S("_geo")));
-            })
-            .unwrap();
-
-        index.add_documents_using_wtxn(&mut wtxn, documents!([
-            { "id": "1",  "city": "Lille",             "_geo": { "lat": 50.6299, "lng": 3.0569 } },
-            { "id": "2",  "city": "Mons-en-Barœul",    "_geo": { "lat": 50.6415, "lng": 3.1106 } },
-            { "id": "3",  "city": "Hellemmes",         "_geo": { "lat": 50.6312, "lng": 3.1106 } },
-            { "id": "4",  "city": "Villeneuve-d'Ascq", "_geo": { "lat": 50.6224, "lng": 3.1476 } },
-            { "id": "5",  "city": "Hem",               "_geo": { "lat": 50.6552, "lng": 3.1897 } },
-            { "id": "6",  "city": "Roubaix",           "_geo": { "lat": 50.6924, "lng": 3.1763 } },
-            { "id": "7",  "city": "Tourcoing",         "_geo": { "lat": 50.7263, "lng": 3.1541 } },
-            { "id": "8",  "city": "Mouscron",          "_geo": { "lat": 50.7453, "lng": 3.2206 } },
-            { "id": "9",  "city": "Tournai",           "_geo": { "lat": 50.6053, "lng": 3.3758 } },
-            { "id": "10", "city": "Ghent",             "_geo": { "lat": 51.0537, "lng": 3.6957 } },
-            { "id": "11", "city": "Brussels",          "_geo": { "lat": 50.8466, "lng": 4.3370 } },
-            { "id": "12", "city": "Charleroi",         "_geo": { "lat": 50.4095, "lng": 4.4347 } },
-            { "id": "13", "city": "Mons",              "_geo": { "lat": 50.4502, "lng": 3.9623 } },
-            { "id": "14", "city": "Valenciennes",      "_geo": { "lat": 50.3518, "lng": 3.5326 } },
-            { "id": "15", "city": "Arras",             "_geo": { "lat": 50.2844, "lng": 2.7637 } },
-            { "id": "16", "city": "Cambrai",           "_geo": { "lat": 50.1793, "lng": 3.2189 } },
-            { "id": "17", "city": "Bapaume",           "_geo": { "lat": 50.1112, "lng": 2.8547 } },
-            { "id": "18", "city": "Amiens",            "_geo": { "lat": 49.9314, "lng": 2.2710 } },
-            { "id": "19", "city": "Compiègne",         "_geo": { "lat": 49.4449, "lng": 2.7913 } },
-            { "id": "20", "city": "Paris",             "_geo": { "lat": 48.9021, "lng": 2.3708 } }
-        ])).unwrap();
-
-        let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"];
-        let deleted_internal_ids =
-            delete_documents(&mut wtxn, &index, &external_ids_to_delete, deletion_strategy);
-
-        // Placeholder search with geo filter
-        let filter = Filter::from_str("_geoRadius(50.6924, 3.1763, 20000)").unwrap().unwrap();
-        let results = index.search(&wtxn).filter(filter).execute().unwrap();
-        assert!(!results.documents_ids.is_empty());
-        for id in results.documents_ids.iter() {
-            assert!(
-                !deleted_internal_ids.contains(id),
-                "The document {} was supposed to be deleted",
-                id
-            );
-        }
-
-        wtxn.commit().unwrap();
-
-        db_snap!(index, soft_deleted_documents_ids, deletion_strategy);
-        db_snap!(index, facet_id_f64_docids, deletion_strategy);
-        db_snap!(index, facet_id_string_docids, deletion_strategy);
-    }
-
-    #[test]
-    fn geo_filtered_placeholder_search_should_not_return_deleted_documents() {
-        geo_filtered_placeholder_search_should_not_return_deleted_documents_(
-            DeletionStrategy::AlwaysHard,
-        );
-        geo_filtered_placeholder_search_should_not_return_deleted_documents_(
-            DeletionStrategy::AlwaysSoft,
-        );
-    }
-
-    fn get_documents_should_not_return_deleted_documents_(deletion_strategy: DeletionStrategy) {
-        let index = TempIndex::new();
-
-        let mut wtxn = index.write_txn().unwrap();
-        index
-            .update_settings_using_wtxn(&mut wtxn, |settings| {
-                settings.set_primary_key(S("docid"));
-            })
-            .unwrap();
-
-        index
-            .add_documents_using_wtxn(
-                &mut wtxn,
-                documents!([
-                    { "docid": "1_4",  "label": ["sign"] },
-                    { "docid": "1_5",  "label": ["letter"] },
-                    { "docid": "1_7",  "label": ["abstract","cartoon","design","pattern"] },
-                    { "docid": "1_36", "label": ["drawing","painting","pattern"] },
-                    { "docid": "1_37", "label": ["art","drawing","outdoor"] },
-                    { "docid": "1_38", "label": ["aquarium","art","drawing"] },
-                    { "docid": "1_39", "label": ["abstract"] },
-                    { "docid": "1_40", "label": ["cartoon"] },
-                    { "docid": "1_41", "label": ["art","drawing"] },
-                    { "docid": "1_42", "label": ["art","pattern"] },
-                    { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
-                    { "docid": "1_44", "label": ["drawing"] },
-                    { "docid": "1_45", "label": ["art"] },
-                    { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
-                    { "docid": "1_47", "label": ["abstract","pattern"] },
-                    { "docid": "1_52", "label": ["abstract","cartoon"] },
-                    { "docid": "1_57", "label": ["abstract","drawing","pattern"] },
-                    { "docid": "1_58", "label": ["abstract","art","cartoon"] },
-                    { "docid": "1_68", "label": ["design"] },
-                    { "docid": "1_69", "label": ["geometry"] },
-                    { "docid": "1_70", "label2": ["geometry", 1.2] },
-                    { "docid": "1_71", "label2": ["design", 2.2] },
-                    { "docid": "1_72", "label2": ["geometry", 1.2] }
-                ]),
-            )
-            .unwrap();
-
-        let deleted_external_ids = ["1_7", "1_52"];
-        let deleted_internal_ids =
-            delete_documents(&mut wtxn, &index, &deleted_external_ids, deletion_strategy);
-
-        // list all documents
-        let results = index.all_documents(&wtxn).unwrap();
-        for result in results {
-            let (id, _) = result.unwrap();
-            assert!(
-                !deleted_internal_ids.contains(&id),
-                "The document {} was supposed to be deleted",
-                id
-            );
-        }
-
-        // list internal document ids
-        let results = index.documents_ids(&wtxn).unwrap();
-        for id in results {
-            assert!(
-                !deleted_internal_ids.contains(&id),
-                "The document {} was supposed to be deleted",
-                id
-            );
-        }
-        wtxn.commit().unwrap();
-
-        let rtxn = index.read_txn().unwrap();
-
-        // get internal docids from deleted external document ids
-        let results = index.external_documents_ids(&rtxn).unwrap();
-        for id in deleted_external_ids {
-            assert!(results.get(id).is_none(), "The document {} was supposed to be deleted", id);
-        }
-        drop(rtxn);
-
-        db_snap!(index, soft_deleted_documents_ids, deletion_strategy);
-    }
-
-    #[test]
-    fn get_documents_should_not_return_deleted_documents() {
-        get_documents_should_not_return_deleted_documents_(DeletionStrategy::AlwaysHard);
-        get_documents_should_not_return_deleted_documents_(DeletionStrategy::AlwaysSoft);
-    }
-
-    fn stats_should_not_return_deleted_documents_(deletion_strategy: DeletionStrategy) {
-        let index = TempIndex::new();
-
-        let mut wtxn = index.write_txn().unwrap();
-
-        index
-            .update_settings_using_wtxn(&mut wtxn, |settings| {
-                settings.set_primary_key(S("docid"));
-            })
-            .unwrap();
-
-        index.add_documents_using_wtxn(&mut wtxn, documents!([
-            { "docid": "1_4",  "label": ["sign"]},
-            { "docid": "1_5",  "label": ["letter"]},
-            { "docid": "1_7",  "label": ["abstract","cartoon","design","pattern"], "title": "Mickey Mouse"},
-            { "docid": "1_36", "label": ["drawing","painting","pattern"]},
-            { "docid": "1_37", "label": ["art","drawing","outdoor"]},
-            { "docid": "1_38", "label": ["aquarium","art","drawing"], "title": "Nemo"},
-            { "docid": "1_39", "label": ["abstract"]},
-            { "docid": "1_40", "label": ["cartoon"]},
-            { "docid": "1_41", "label": ["art","drawing"]},
-            { "docid": "1_42", "label": ["art","pattern"]},
-            { "docid": "1_43", "label": ["abstract","art","drawing","pattern"], "number": 32i32},
-            { "docid": "1_44", "label": ["drawing"], "number": 44i32},
-            { "docid": "1_45", "label": ["art"]},
-            { "docid": "1_46", "label": ["abstract","colorfulness","pattern"]},
-            { "docid": "1_47", "label": ["abstract","pattern"]},
-            { "docid": "1_52", "label": ["abstract","cartoon"]},
-            { "docid": "1_57", "label": ["abstract","drawing","pattern"]},
-            { "docid": "1_58", "label": ["abstract","art","cartoon"]},
-            { "docid": "1_68", "label": ["design"]},
-            { "docid": "1_69", "label": ["geometry"]}
-        ])).unwrap();
-
-        delete_documents(&mut wtxn, &index, &["1_7", "1_52"], deletion_strategy);
-
-        // count internal documents
-        let results = index.number_of_documents(&wtxn).unwrap();
-        assert_eq!(18, results);
-
-        // count field distribution
-        let results = index.field_distribution(&wtxn).unwrap();
-        assert_eq!(Some(&18), results.get("label"));
-        assert_eq!(Some(&1), results.get("title"));
-        assert_eq!(Some(&2), results.get("number"));
-
-        wtxn.commit().unwrap();
-
-        db_snap!(index, soft_deleted_documents_ids, deletion_strategy);
-    }
-
-    #[test]
-    fn stats_should_not_return_deleted_documents() {
-        stats_should_not_return_deleted_documents_(DeletionStrategy::AlwaysHard);
-        stats_should_not_return_deleted_documents_(DeletionStrategy::AlwaysSoft);
-    }
-
-    fn stored_detected_script_and_language_should_not_return_deleted_documents_(
-        deletion_strategy: DeletionStrategy,
-    ) {
-        use charabia::{Language, Script};
-        let index = TempIndex::new();
-        let mut wtxn = index.write_txn().unwrap();
-        index
-            .add_documents_using_wtxn(
-                &mut wtxn,
-                documents!([
-                { "id": "0", "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
-                { "id": "1", "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" },
-                { "id": "2", "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" },
-                { "id": "3", "title": "関西国際空港限定トートバッグ すもももももももものうち" },
-                { "id": "4", "title": "ภาษาไทยง่ายนิดเดียว" },
-                { "id": "5", "title": "The quick 在尊嚴和權利上一律平等。" },
-            ]))
-            .unwrap();
-
-        let key_cmn = (Script::Cj, Language::Cmn);
-        let cj_cmn_docs =
-            index.script_language_documents_ids(&wtxn, &key_cmn).unwrap().unwrap_or_default();
-        let mut expected_cj_cmn_docids = RoaringBitmap::new();
-        expected_cj_cmn_docids.push(1);
-        expected_cj_cmn_docids.push(5);
-        assert_eq!(cj_cmn_docs, expected_cj_cmn_docids);
-
-        delete_documents(&mut wtxn, &index, &["1"], deletion_strategy);
-        wtxn.commit().unwrap();
-
-        let rtxn = index.read_txn().unwrap();
-        let cj_cmn_docs =
-            index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap_or_default();
-        let mut expected_cj_cmn_docids = RoaringBitmap::new();
-        expected_cj_cmn_docids.push(5);
-        assert_eq!(cj_cmn_docs, expected_cj_cmn_docids);
-    }
-
-    #[test]
-    fn stored_detected_script_and_language_should_not_return_deleted_documents() {
-        stored_detected_script_and_language_should_not_return_deleted_documents_(
-            DeletionStrategy::AlwaysHard,
-        );
-        stored_detected_script_and_language_should_not_return_deleted_documents_(
-            DeletionStrategy::AlwaysSoft,
-        );
-    }
-
-    #[test]
-    fn delete_words_exact_attributes() {
-        let index = TempIndex::new();
-
-        index
-            .update_settings(|settings| {
-                settings.set_primary_key(S("id"));
-                settings.set_searchable_fields(vec![S("text"), S("exact")]);
-                settings.set_exact_attributes(vec![S("exact")].into_iter().collect());
-            })
-            .unwrap();
-
-        index
-            .add_documents(documents!([
-                { "id": 0, "text": "hello" },
-                { "id": 1, "exact": "hello"}
-            ]))
-            .unwrap();
-        db_snap!(index, word_docids, 1, @r###"
-        hello            [0, ]
-        "###);
-        db_snap!(index, exact_word_docids, 1, @r###"
-        hello            [1, ]
-        "###);
-        db_snap!(index, words_fst, 1, @"300000000000000001084cfcfc2ce1000000016000000090ea47f");
-
-        let mut wtxn = index.write_txn().unwrap();
-        let deleted_internal_ids =
-            delete_documents(&mut wtxn, &index, &["1"], DeletionStrategy::AlwaysHard);
-        wtxn.commit().unwrap();
-
-        db_snap!(index, word_docids, 2, @r###"
-        hello            [0, ]
-        "###);
-        db_snap!(index, exact_word_docids, 2, @"");
-        db_snap!(index, words_fst, 2, @"300000000000000001084cfcfc2ce1000000016000000090ea47f");
-
-        insta::assert_snapshot!(format!("{deleted_internal_ids:?}"), @"[1]");
-        let txn = index.read_txn().unwrap();
-        let words = index.words_fst(&txn).unwrap().into_stream().into_strs().unwrap();
-        insta::assert_snapshot!(format!("{words:?}"), @r###"["hello"]"###);
-
-        let mut s = Search::new(&txn, &index);
-        s.query("hello");
-        let crate::SearchResult { documents_ids, .. } = s.execute().unwrap();
-        insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
-    }
-}
diff --git a/milli/src/update/facet/delete.rs b/milli/src/update/facet/delete.rs
deleted file mode 100644
index 8bd3f196b..000000000
--- a/milli/src/update/facet/delete.rs
+++ /dev/null
@@ -1,349 +0,0 @@
-use std::collections::{HashMap, HashSet};
-
-use heed::RwTxn;
-use log::debug;
-use roaring::RoaringBitmap;
-use time::OffsetDateTime;
-
-use super::{FACET_GROUP_SIZE, FACET_MAX_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
-use crate::facet::FacetType;
-use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
-use crate::heed_codec::ByteSliceRefCodec;
-use crate::update::{FacetsUpdateBulk, FacetsUpdateIncrementalInner};
-use crate::{FieldId, Index, Result};
-
-/// A builder used to remove elements from the `facet_id_string_docids` or `facet_id_f64_docids` databases.
-///
-/// Depending on the number of removed elements and the existing size of the database, we use either
-/// a bulk delete method or an incremental delete method.
-pub struct FacetsDelete<'i, 'b> {
-    index: &'i Index,
-    database: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
-    facet_type: FacetType,
-    affected_facet_values: HashMap<FieldId, HashSet<Vec<u8>>>,
-    docids_to_delete: &'b RoaringBitmap,
-    group_size: u8,
-    max_group_size: u8,
-    min_level_size: u8,
-}
-impl<'i, 'b> FacetsDelete<'i, 'b> {
-    pub fn new(
-        index: &'i Index,
-        facet_type: FacetType,
-        affected_facet_values: HashMap<FieldId, HashSet<Vec<u8>>>,
-        docids_to_delete: &'b RoaringBitmap,
-    ) -> Self {
-        let database = match facet_type {
-            FacetType::String => index
-                .facet_id_string_docids
-                .remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(),
-            FacetType::Number => {
-                index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>()
-            }
-        };
-        Self {
-            index,
-            database,
-            facet_type,
-            affected_facet_values,
-            docids_to_delete,
-            group_size: FACET_GROUP_SIZE,
-            max_group_size: FACET_MAX_GROUP_SIZE,
-            min_level_size: FACET_MIN_LEVEL_SIZE,
-        }
-    }
-
-    pub fn execute(self, wtxn: &mut RwTxn) -> Result<()> {
-        debug!("Computing and writing the facet values levels docids into LMDB on disk...");
-        self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
-
-        for (field_id, affected_facet_values) in self.affected_facet_values {
-            // This is an incorrect condition, since we assume that the length of the database is equal
-            // to the number of facet values for the given field_id. It means that in some cases, we might
-            // wrongly choose the incremental indexer over the bulk indexer. But the only case where that could
-            // really be a performance problem is when we fully delete a large ratio of all facet values for
-            // each field id. This would almost never happen. Still, to be overly cautious, I have added a
-            // 2x penalty to the incremental indexer. That is, instead of assuming a 70x worst-case performance
-            // penalty to the incremental indexer, we assume a 150x worst-case performance penalty instead.
-            if affected_facet_values.len() >= (self.database.len(wtxn)? / 150) {
-                // Bulk delete
-                let mut modified = false;
-
-                for facet_value in affected_facet_values {
-                    let key =
-                        FacetGroupKey { field_id, level: 0, left_bound: facet_value.as_slice() };
-                    let mut old = self.database.get(wtxn, &key)?.unwrap();
-                    let previous_len = old.bitmap.len();
-                    old.bitmap -= self.docids_to_delete;
-                    if old.bitmap.is_empty() {
-                        modified = true;
-                        self.database.delete(wtxn, &key)?;
-                    } else if old.bitmap.len() != previous_len {
-                        modified = true;
-                        self.database.put(wtxn, &key, &old)?;
-                    }
-                }
-                if modified {
-                    let builder = FacetsUpdateBulk::new_not_updating_level_0(
-                        self.index,
-                        vec![field_id],
-                        self.facet_type,
-                    );
-                    builder.execute(wtxn)?;
-                }
-            } else {
-                // Incremental
-                let inc = FacetsUpdateIncrementalInner {
-                    db: self.database,
-                    group_size: self.group_size,
-                    min_level_size: self.min_level_size,
-                    max_group_size: self.max_group_size,
-                };
-                for facet_value in affected_facet_values {
-                    inc.delete(wtxn, field_id, facet_value.as_slice(), self.docids_to_delete)?;
-                }
-            }
-        }
-        Ok(())
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use std::iter::FromIterator;
-
-    use big_s::S;
-    use maplit::hashset;
-    use rand::seq::SliceRandom;
-    use rand::SeedableRng;
-    use roaring::RoaringBitmap;
-
-    use crate::db_snap;
-    use crate::documents::documents_batch_reader_from_objects;
-    use crate::index::tests::TempIndex;
-    use crate::update::facet::test_helpers::ordered_string;
-    use crate::update::{DeleteDocuments, DeletionStrategy};
-
-    #[test]
-    fn delete_mixed_incremental_and_bulk() {
-        // The point of this test is to create an index populated with documents
-        // containing different filterable attributes. Then, we delete a bunch of documents
-        // such that a mix of the incremental and bulk indexer is used (depending on the field id)
-        let index = TempIndex::new_with_map_size(4096 * 1000 * 100);
-
-        index
-            .update_settings(|settings| {
-                settings.set_filterable_fields(
-                    hashset! { S("id"), S("label"), S("timestamp"), S("colour") },
-                );
-            })
-            .unwrap();
-
-        let mut documents = vec![];
-        for i in 0..1000 {
-            documents.push(
-                serde_json::json! {
-                    {
-                        "id": i,
-                        "label": i / 10,
-                        "colour": i / 100,
-                        "timestamp": i / 2,
-                    }
-                }
-                .as_object()
-                .unwrap()
-                .clone(),
-            );
-        }
-
-        let documents = documents_batch_reader_from_objects(documents);
-        index.add_documents(documents).unwrap();
-
-        db_snap!(index, facet_id_f64_docids, 1, @"550cd138d6fe31ccdd42cd5392fbd576");
-
-        let mut wtxn = index.env.write_txn().unwrap();
-
-        let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
-        builder.strategy(DeletionStrategy::AlwaysHard);
-        builder.delete_documents(&RoaringBitmap::from_iter(0..100));
-        // by deleting the first 100 documents, we expect that:
-        // - the "id" part of the DB will be updated in bulk, since #affected_facet_value = 100 which is > database_len / 150 (= 13)
-        // - the "label" part will be updated incrementally, since #affected_facet_value = 10 which is < 13
-        // - the "colour" part will also be updated incrementally, since #affected_values = 1 which is < 13
-        // - the "timestamp" part will be updated in bulk, since #affected_values = 50 which is > 13
-        // This has to be verified manually by inserting breakpoint/adding print statements to the code when running the test
-        builder.execute().unwrap();
-        wtxn.commit().unwrap();
-
-        db_snap!(index, soft_deleted_documents_ids, @"[]");
-        db_snap!(index, facet_id_f64_docids, 2, @"d4d5f14e7f1e1f09b86821a0b6defcc6");
-    }
-
-    // Same test as above but working with string values for the facets
-    #[test]
-    fn delete_mixed_incremental_and_bulk_string() {
-        // The point of this test is to create an index populated with documents
-        // containing different filterable attributes. Then, we delete a bunch of documents
-        // such that a mix of the incremental and bulk indexer is used (depending on the field id)
-        let index = TempIndex::new_with_map_size(4096 * 1000 * 100);
-
-        index
-            .update_settings(|settings| {
-                settings.set_filterable_fields(
-                    hashset! { S("id"), S("label"), S("timestamp"), S("colour") },
-                );
-            })
-            .unwrap();
-
-        let mut documents = vec![];
-        for i in 0..1000 {
-            documents.push(
-                serde_json::json! {
-                    {
-                        "id": i,
-                        "label": ordered_string(i / 10),
-                        "colour": ordered_string(i / 100),
-                        "timestamp": ordered_string(i / 2),
-                    }
-                }
-                .as_object()
-                .unwrap()
-                .clone(),
-            );
-        }
-
-        let documents = documents_batch_reader_from_objects(documents);
-        index.add_documents(documents).unwrap();
-
-        // Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022)
-        db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503");
-
-        let mut wtxn = index.env.write_txn().unwrap();
-
-        let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
-        builder.strategy(DeletionStrategy::AlwaysHard);
-        builder.delete_documents(&RoaringBitmap::from_iter(0..100));
-        // by deleting the first 100 documents, we expect that:
-        // - the "id" part of the DB will be updated in bulk, since #affected_facet_value = 100 which is > database_len / 150 (= 13)
-        // - the "label" part will be updated incrementally, since #affected_facet_value = 10 which is < 13
-        // - the "colour" part will also be updated incrementally, since #affected_values = 1 which is < 13
-        // - the "timestamp" part will be updated in bulk, since #affected_values = 50 which is > 13
-        // This has to be verified manually by inserting breakpoint/adding print statements to the code when running the test
-        builder.execute().unwrap();
-        wtxn.commit().unwrap();
-
-        db_snap!(index, soft_deleted_documents_ids, @"[]");
-        db_snap!(index, facet_id_string_docids, 2, @"7f9c00b29e04d58c1821202a5dda0ebc");
-    }
-
-    #[test]
-    fn delete_almost_all_incrementally_string() {
-        let index = TempIndex::new_with_map_size(4096 * 1000 * 100);
-
-        index
-            .update_settings(|settings| {
-                settings.set_filterable_fields(
-                    hashset! { S("id"), S("label"), S("timestamp"), S("colour") },
-                );
-            })
-            .unwrap();
-
-        let mut documents = vec![];
-        for i in 0..1000 {
-            documents.push(
-                serde_json::json! {
-                    {
-                        "id": i,
-                        "label": ordered_string(i / 10),
-                        "colour": ordered_string(i / 100),
-                        "timestamp": ordered_string(i / 2),
-                    }
-                }
-                .as_object()
-                .unwrap()
-                .clone(),
-            );
-        }
-
-        let documents = documents_batch_reader_from_objects(documents);
-        index.add_documents(documents).unwrap();
-
-        // Note that empty strings are not stored in the facet db due to commit 4860fd452965 (comment written on 29 Nov 2022)
-        db_snap!(index, facet_id_string_docids, 1, @"5fd1bd0724c65a6dc1aafb6db93c7503");
-
-        let mut rng = rand::rngs::SmallRng::from_seed([0; 32]);
-
-        let mut docids_to_delete = (0..1000).collect::<Vec<_>>();
-        docids_to_delete.shuffle(&mut rng);
-        for docid in docids_to_delete.into_iter().take(990) {
-            let mut wtxn = index.env.write_txn().unwrap();
-            let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
-            builder.strategy(DeletionStrategy::AlwaysHard);
-            builder.delete_documents(&RoaringBitmap::from_iter([docid]));
-            builder.execute().unwrap();
-            wtxn.commit().unwrap();
-        }
-
-        db_snap!(index, soft_deleted_documents_ids, @"[]");
-        db_snap!(index, facet_id_string_docids, 2, @"ece56086e76d50e661fb2b58475b9f7d");
-    }
-}
-
-#[allow(unused)]
-#[cfg(test)]
-mod comparison_bench {
-    use std::iter::once;
-
-    use rand::Rng;
-    use roaring::RoaringBitmap;
-
-    use crate::heed_codec::facet::OrderedF64Codec;
-    use crate::update::facet::test_helpers::FacetIndex;
-
-    // This is a simple test to get an intuition on the relative speed
-    // of the incremental vs. bulk indexer.
-    //
-    // The benchmark shows the worst-case scenario for the incremental indexer, since
-    // each facet value contains only one document ID.
-    //
-    // In that scenario, it appears that the incremental indexer is about 70 times slower than the
-    // bulk indexer.
-    // #[test]
-    fn benchmark_facet_indexing_delete() {
-        let mut r = rand::thread_rng();
-
-        for i in 1..=20 {
-            let size = 50_000 * i;
-            let index = FacetIndex::<OrderedF64Codec>::new(4, 8, 5);
-
-            let mut txn = index.env.write_txn().unwrap();
-            let mut elements = Vec::<((u16, f64), RoaringBitmap)>::new();
-            for i in 0..size {
-                // field id = 0, left_bound = i, docids = [i]
-                elements.push(((0, i as f64), once(i).collect()));
-            }
-            let timer = std::time::Instant::now();
-            index.bulk_insert(&mut txn, &[0], elements.iter());
-            let time_spent = timer.elapsed().as_millis();
-            println!("bulk {size} : {time_spent}ms");
-
-            txn.commit().unwrap();
-
-            for nbr_doc in [1, 100, 1000, 10_000] {
-                let mut txn = index.env.write_txn().unwrap();
-                let timer = std::time::Instant::now();
-                //
-                // delete one document
-                //
-                for _ in 0..nbr_doc {
-                    let deleted_u32 = r.gen::<u32>() % size;
-                    let deleted_f64 = deleted_u32 as f64;
-                    index.delete_single_docid(&mut txn, 0, &deleted_f64, deleted_u32)
-                }
-                let time_spent = timer.elapsed().as_millis();
-                println!("    delete {nbr_doc} : {time_spent}ms");
-                txn.abort().unwrap();
-            }
-        }
-    }
-}
diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs
index 2b671e5cb..f932d5aee 100644
--- a/milli/src/update/facet/mod.rs
+++ b/milli/src/update/facet/mod.rs
@@ -98,7 +98,6 @@ use crate::update::merge_btreeset_string;
 use crate::{BEU16StrCodec, Index, Result, BEU16, MAX_FACET_VALUE_LENGTH};
 
 pub mod bulk;
-pub mod delete;
 pub mod incremental;
 
 /// A builder used to add new elements to the `facet_id_string_docids` or `facet_id_f64_docids` databases.
diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs
index 6224995a3..97d802d03 100644
--- a/milli/src/update/mod.rs
+++ b/milli/src/update/mod.rs
@@ -22,7 +22,6 @@ pub use self::words_prefixes_fst::WordsPrefixesFst;
 mod available_documents_ids;
 mod clear_documents;
 pub(crate) mod del_add;
-mod delete_documents;
 pub(crate) mod facet;
 mod index_documents;
 mod indexer_config;

From 2263dff02bf7ba62b410b9377bae14e7cd484f79 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Wed, 25 Oct 2023 13:40:46 +0200
Subject: [PATCH 054/127] Stop using removed delete pipelines almost everywhere

---
 benchmarks/benches/indexing.rs            |  4 +--
 milli/src/index.rs                        | 32 ++++++++++-------------
 milli/src/update/facet/mod.rs             |  4 ---
 milli/src/update/index_documents/mod.rs   |  7 ++---
 milli/src/update/mod.rs                   |  1 -
 milli/src/update/prefix_word_pairs/mod.rs |  8 +-----
 6 files changed, 18 insertions(+), 38 deletions(-)

diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs
index 9446c0b0f..cb220a5f0 100644
--- a/benchmarks/benches/indexing.rs
+++ b/benchmarks/benches/indexing.rs
@@ -6,9 +6,7 @@ use std::path::Path;
 
 use criterion::{criterion_group, criterion_main, Criterion};
 use milli::heed::{EnvOpenOptions, RwTxn};
-use milli::update::{
-    DeleteDocuments, IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings,
-};
+use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
 use milli::Index;
 use rand::seq::SliceRandom;
 use rand_chacha::rand_core::SeedableRng;
diff --git a/milli/src/index.rs b/milli/src/index.rs
index 61ec41788..3e48f5eb1 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -1469,8 +1469,7 @@ pub(crate) mod tests {
     use crate::error::{Error, InternalError};
     use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
     use crate::update::{
-        self, DeleteDocuments, DeletionStrategy, IndexDocuments, IndexDocumentsConfig,
-        IndexDocumentsMethod, IndexerConfig, Settings,
+        self, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings,
     };
     use crate::{db_snap, obkv_to_json, Filter, Index, Search, SearchResult};
 
@@ -1563,11 +1562,20 @@ pub(crate) mod tests {
         pub fn delete_document(&self, external_document_id: &str) {
             let mut wtxn = self.write_txn().unwrap();
 
-            let mut delete = DeleteDocuments::new(&mut wtxn, self).unwrap();
-            delete.strategy(self.index_documents_config.deletion_strategy);
+            let builder = IndexDocuments::new(
+                &mut wtxn,
+                self,
+                &self.indexer_config,
+                self.index_documents_config.clone(),
+                |_| (),
+                || false,
+            )
+            .unwrap();
+            let (builder, user_error) =
+                builder.remove_documents(vec![external_document_id.to_owned()]).unwrap();
+            user_error.unwrap();
+            builder.execute().unwrap();
 
-            delete.delete_external_id(external_document_id);
-            delete.execute().unwrap();
             wtxn.commit().unwrap();
         }
     }
@@ -1884,7 +1892,6 @@ pub(crate) mod tests {
         use maplit::hashset;
 
         let mut index = TempIndex::new();
-        index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft;
         let index = index;
 
         index
@@ -2055,8 +2062,6 @@ pub(crate) mod tests {
         }
         // Second Batch: replace the documents with soft-deletion
         {
-            index.index_documents_config.deletion_strategy =
-                crate::update::DeletionStrategy::AlwaysSoft;
             let mut docs1 = vec![];
             for i in 0..3 {
                 docs1.push(serde_json::json!(
@@ -2125,7 +2130,6 @@ pub(crate) mod tests {
         drop(rtxn);
         // Third Batch: replace the documents with soft-deletion again
         {
-            index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft;
             let mut docs1 = vec![];
             for i in 0..3 {
                 docs1.push(serde_json::json!(
@@ -2194,7 +2198,6 @@ pub(crate) mod tests {
 
         // Fourth Batch: replace the documents without soft-deletion
         {
-            index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard;
             let mut docs1 = vec![];
             for i in 0..3 {
                 docs1.push(serde_json::json!(
@@ -2266,7 +2269,6 @@ pub(crate) mod tests {
     fn bug_3021_first() {
         // https://github.com/meilisearch/meilisearch/issues/3021
         let mut index = TempIndex::new();
-        index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft;
         index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments;
 
         index
@@ -2379,7 +2381,6 @@ pub(crate) mod tests {
     fn bug_3021_second() {
         // https://github.com/meilisearch/meilisearch/issues/3021
         let mut index = TempIndex::new();
-        index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft;
         index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
 
         index
@@ -2505,7 +2506,6 @@ pub(crate) mod tests {
     fn bug_3021_third() {
         // https://github.com/meilisearch/meilisearch/issues/3021
         let mut index = TempIndex::new();
-        index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft;
         index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
 
         index
@@ -2544,8 +2544,6 @@ pub(crate) mod tests {
         "###);
         db_snap!(index, soft_deleted_documents_ids, 2, @"[0, ]");
 
-        index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard;
-
         index.add_documents(documents!([{ "primary_key": "4", "a": 2 }])).unwrap();
 
         db_snap!(index, documents_ids, @"[2, 3, ]");
@@ -2579,7 +2577,6 @@ pub(crate) mod tests {
         // https://github.com/meilisearch/meilisearch/issues/3021
         let mut index = TempIndex::new();
         index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
-        index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft;
 
         index
             .update_settings(|settings| {
@@ -2622,7 +2619,6 @@ pub(crate) mod tests {
 
         let mut wtxn = index.write_txn().unwrap();
         let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap();
-        delete.strategy(DeletionStrategy::AlwaysHard);
         delete.execute().unwrap();
         wtxn.commit().unwrap();
 
diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs
index f932d5aee..71e434599 100644
--- a/milli/src/update/facet/mod.rs
+++ b/milli/src/update/facet/mod.rs
@@ -563,14 +563,11 @@ mod tests {
     use crate::db_snap;
     use crate::documents::documents_batch_reader_from_objects;
     use crate::index::tests::TempIndex;
-    use crate::update::DeletionStrategy;
 
     #[test]
     fn replace_all_identical_soft_deletion_then_hard_deletion() {
         let mut index = TempIndex::new_with_map_size(4096 * 1000 * 100);
 
-        index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft;
-
         index
             .update_settings(|settings| {
                 settings.set_primary_key("id".to_owned());
@@ -622,7 +619,6 @@ mod tests {
         db_snap!(index, soft_deleted_documents_ids, "replaced_1_soft", @"6c975deb900f286d2f6456d2d5c3a123");
 
         // Then replace the last document while disabling soft_deletion
-        index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard;
         let mut documents = vec![];
         for i in 999..1000 {
             documents.push(
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 7a77f3a96..0b000da06 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -35,8 +35,8 @@ use crate::documents::{obkv_to_object, DocumentsBatchReader};
 use crate::error::{Error, InternalError, UserError};
 pub use crate::update::index_documents::helpers::CursorClonableMmap;
 use crate::update::{
-    DeletionStrategy, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep,
-    WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
+    IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, WordPrefixDocids,
+    WordPrefixIntegerDocids, WordsPrefixesFst,
 };
 use crate::{CboRoaringBitmapCodec, Index, Result};
 
@@ -89,7 +89,6 @@ pub struct IndexDocumentsConfig {
     pub words_positions_level_group_size: Option<NonZeroU32>,
     pub words_positions_min_level_size: Option<NonZeroU32>,
     pub update_method: IndexDocumentsMethod,
-    pub deletion_strategy: DeletionStrategy,
     pub autogenerate_docids: bool,
 }
 
@@ -2497,7 +2496,6 @@ mod tests {
 
         // Delete not all of the documents but some of them.
         let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
-        builder.strategy(DeletionStrategy::AlwaysHard);
         builder.delete_external_id("0");
         builder.delete_external_id("3");
         let result = builder.execute().unwrap();
@@ -2559,7 +2557,6 @@ mod tests {
         ]
         */
         let mut index = TempIndex::new();
-        index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard;
 
         // START OF BATCH
 
diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs
index 97d802d03..dd8851ccb 100644
--- a/milli/src/update/mod.rs
+++ b/milli/src/update/mod.rs
@@ -1,6 +1,5 @@
 pub use self::available_documents_ids::AvailableDocumentsIds;
 pub use self::clear_documents::ClearDocuments;
-pub use self::delete_documents::{DeleteDocuments, DeletionStrategy, DocumentDeletionResult};
 pub use self::facet::bulk::FacetsUpdateBulk;
 pub use self::facet::incremental::FacetsUpdateIncrementalInner;
 pub use self::index_documents::{
diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs
index e3135d546..7d77490bc 100644
--- a/milli/src/update/prefix_word_pairs/mod.rs
+++ b/milli/src/update/prefix_word_pairs/mod.rs
@@ -149,7 +149,7 @@ mod tests {
     use crate::db_snap;
     use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
     use crate::index::tests::TempIndex;
-    use crate::update::{DeleteDocuments, DeletionStrategy, IndexDocumentsMethod};
+    use crate::update::IndexDocumentsMethod;
 
     fn documents_with_enough_different_words_for_prefixes(
         prefixes: &[&str],
@@ -337,7 +337,6 @@ mod tests {
 
         let mut wtxn = index.write_txn().unwrap();
         let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap();
-        delete.strategy(DeletionStrategy::AlwaysHard);
         delete.delete_documents(&RoaringBitmap::from_iter([50]));
         delete.execute().unwrap();
         wtxn.commit().unwrap();
@@ -349,7 +348,6 @@ mod tests {
 
         let mut wtxn = index.write_txn().unwrap();
         let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap();
-        delete.strategy(DeletionStrategy::AlwaysHard);
         delete.delete_documents(&RoaringBitmap::from_iter(0..50));
         delete.execute().unwrap();
         wtxn.commit().unwrap();
@@ -421,7 +419,6 @@ mod tests {
 
         let mut wtxn = index.write_txn().unwrap();
         let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap();
-        delete.strategy(DeletionStrategy::AlwaysSoft);
         delete.delete_documents(&RoaringBitmap::from_iter([50]));
         delete.execute().unwrap();
         wtxn.commit().unwrap();
@@ -433,7 +430,6 @@ mod tests {
 
         let mut wtxn = index.write_txn().unwrap();
         let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap();
-        delete.strategy(DeletionStrategy::AlwaysSoft);
 
         delete.delete_documents(&RoaringBitmap::from_iter(0..50));
         delete.execute().unwrap();
@@ -460,7 +456,6 @@ mod tests {
         let mut index = TempIndex::new();
         index.index_documents_config.words_prefix_threshold = Some(50);
         index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments;
-        index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysSoft;
 
         index
             .update_settings(|settings| {
@@ -520,7 +515,6 @@ mod tests {
     fn replace_hard_deletion() {
         let mut index = TempIndex::new();
         index.index_documents_config.words_prefix_threshold = Some(50);
-        index.index_documents_config.deletion_strategy = DeletionStrategy::AlwaysHard;
         index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments;
 
         index

From c534a1b68764005018fceb767ec737a4dcc21784 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Wed, 25 Oct 2023 13:41:11 +0200
Subject: [PATCH 055/127] Stop using delete documents pipeline in batch runner

---
 index-scheduler/src/batch.rs            | 68 ++++++++++++++-----------
 milli/src/update/index_documents/mod.rs |  2 +
 2 files changed, 41 insertions(+), 29 deletions(-)

diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs
index 3e2cc4281..a4b7e5c45 100644
--- a/index-scheduler/src/batch.rs
+++ b/index-scheduler/src/batch.rs
@@ -30,8 +30,7 @@ use meilisearch_types::heed::{RoTxn, RwTxn};
 use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader};
 use meilisearch_types::milli::heed::CompactionOption;
 use meilisearch_types::milli::update::{
-    DeleteDocuments, DocumentDeletionResult, IndexDocumentsConfig, IndexDocumentsMethod,
-    Settings as MilliSettings,
+    IndexDocumentsConfig, IndexDocumentsMethod, Settings as MilliSettings,
 };
 use meilisearch_types::milli::{self, Filter, BEU32};
 use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked};
@@ -1238,7 +1237,8 @@ impl IndexScheduler {
                             let (new_builder, user_result) =
                                 builder.remove_documents(document_ids)?;
                             builder = new_builder;
-
+                            // Uses Invariant: remove documents actually always returns Ok for the inner result
+                            let count = user_result.unwrap();
                             let provided_ids =
                                 if let Some(Details::DocumentDeletion { provided_ids, .. }) =
                                     task.details
@@ -1249,23 +1249,11 @@ impl IndexScheduler {
                                     unreachable!();
                                 };
 
-                            match user_result {
-                                Ok(count) => {
-                                    task.status = Status::Succeeded;
-                                    task.details = Some(Details::DocumentDeletion {
-                                        provided_ids,
-                                        deleted_documents: Some(count),
-                                    });
-                                }
-                                Err(e) => {
-                                    task.status = Status::Failed;
-                                    task.details = Some(Details::DocumentDeletion {
-                                        provided_ids,
-                                        deleted_documents: Some(0),
-                                    });
-                                    task.error = Some(milli::Error::from(e).into());
-                                }
-                            }
+                            task.status = Status::Succeeded;
+                            task.details = Some(Details::DocumentDeletion {
+                                provided_ids,
+                                deleted_documents: Some(count),
+                            });
                         }
                     }
                 }
@@ -1288,21 +1276,42 @@ impl IndexScheduler {
                 Ok(tasks)
             }
             IndexOperation::DocumentDeletion { index_uid: _, documents, mut tasks } => {
-                let mut builder = milli::update::DeleteDocuments::new(index_wtxn, index)?;
-                documents.iter().flatten().for_each(|id| {
-                    builder.delete_external_id(id);
-                });
+                let indexer_config = self.index_mapper.indexer_config();
+                let config = IndexDocumentsConfig {
+                    update_method: IndexDocumentsMethod::ReplaceDocuments,
+                    ..Default::default()
+                };
+                let must_stop_processing = self.must_stop_processing.clone();
 
-                let DocumentDeletionResult { deleted_documents, .. } = builder.execute()?;
+                let mut builder = milli::update::IndexDocuments::new(
+                    index_wtxn,
+                    index,
+                    indexer_config,
+                    config,
+                    |indexing_step| debug!("update: {:?}", indexing_step),
+                    || must_stop_processing.get(),
+                )?;
+
+                let document_ids = documents.iter().cloned().flatten().collect();
+
+                let (new_builder, user_result) = builder.remove_documents(document_ids)?;
+                builder = new_builder;
+                // Uses Invariant: remove documents actually always returns Ok for the inner result
+                let count = user_result.unwrap();
 
                 for (task, documents) in tasks.iter_mut().zip(documents) {
                     task.status = Status::Succeeded;
                     task.details = Some(Details::DocumentDeletion {
                         provided_ids: documents.len(),
-                        deleted_documents: Some(deleted_documents.min(documents.len() as u64)),
+                        deleted_documents: Some(count.min(documents.len() as u64)),
                     });
                 }
 
+                if !tasks.iter().all(|res| res.error.is_some()) {
+                    let addition = builder.execute()?;
+                    info!("document deletion done: {:?}", addition);
+                }
+
                 Ok(tasks)
             }
             IndexOperation::IndexDocumentDeletionByFilter { mut task, index_uid: _ } => {
@@ -1558,9 +1567,10 @@ fn delete_document_by_filter<'a>(
             }
             e => e.into(),
         })?;
-        let mut delete_operation = DeleteDocuments::new(wtxn, index)?;
-        delete_operation.delete_documents(&candidates);
-        delete_operation.execute().map(|result| result.deleted_documents)?
+        todo!("need a way to get back the external ids from the internal ids");
+        // let mut delete_operation = DeleteDocuments::new(wtxn, index)?;
+        // delete_operation.delete_documents(&candidates);
+        // delete_operation.execute().map(|result| result.deleted_documents)?
     } else {
         0
     })
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 0b000da06..c8481bd48 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -180,6 +180,7 @@ where
 
         // Early return when there is no document to add
         if to_delete.is_empty() {
+            // Maintains Invariant: remove documents actually always returns Ok for the inner result
             return Ok((self, Ok(0)));
         }
 
@@ -192,6 +193,7 @@ where
 
         self.deleted_documents += deleted_documents;
 
+        // Maintains Invariant: remove documents actually always returns Ok for the inner result
         Ok((self, Ok(deleted_documents)))
     }
 

From 113527f4660b8c062beae43eace63cb16a9d2bd5 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Wed, 25 Oct 2023 14:14:15 +0200
Subject: [PATCH 056/127] Remove soft-deleted related methods from Index

---
 meilisearch-types/src/error.rs                |  1 -
 milli/src/error.rs                            |  2 -
 milli/src/index.rs                            | 67 +------------------
 milli/src/search/facet/filter.rs              |  3 -
 milli/src/snapshot_tests.rs                   | 10 ---
 milli/src/update/available_documents_ids.rs   | 35 ++--------
 milli/src/update/clear_documents.rs           |  1 -
 milli/src/update/facet/mod.rs                 |  3 -
 milli/src/update/index_documents/mod.rs       |  3 -
 milli/src/update/index_documents/transform.rs |  6 +-
 milli/src/update/prefix_word_pairs/mod.rs     |  2 -
 11 files changed, 9 insertions(+), 124 deletions(-)

diff --git a/meilisearch-types/src/error.rs b/meilisearch-types/src/error.rs
index 4b6711601..afe9c5189 100644
--- a/meilisearch-types/src/error.rs
+++ b/meilisearch-types/src/error.rs
@@ -324,7 +324,6 @@ impl ErrorCode for milli::Error {
                     UserError::SerdeJson(_)
                     | UserError::InvalidLmdbOpenOptions
                     | UserError::DocumentLimitReached
-                    | UserError::AccessingSoftDeletedDocument { .. }
                     | UserError::UnknownInternalDocumentId { .. } => Code::Internal,
                     UserError::InvalidStoreFile => Code::InvalidStoreFile,
                     UserError::NoSpaceLeftOnDevice => Code::NoSpaceLeftOnDevice,
diff --git a/milli/src/error.rs b/milli/src/error.rs
index e9e1fddd3..b249f2977 100644
--- a/milli/src/error.rs
+++ b/milli/src/error.rs
@@ -89,8 +89,6 @@ pub enum FieldIdMapMissingEntry {
 
 #[derive(Error, Debug)]
 pub enum UserError {
-    #[error("A soft deleted internal document id have been used: `{document_id}`.")]
-    AccessingSoftDeletedDocument { document_id: DocumentId },
     #[error("A document cannot contain more than 65,535 fields.")]
     AttributeLimitReached,
     #[error(transparent)]
diff --git a/milli/src/index.rs b/milli/src/index.rs
index 3e48f5eb1..b20674d4c 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -40,7 +40,6 @@ pub mod main_key {
     pub const DISPLAYED_FIELDS_KEY: &str = "displayed-fields";
     pub const DISTINCT_FIELD_KEY: &str = "distinct-field-key";
     pub const DOCUMENTS_IDS_KEY: &str = "documents-ids";
-    pub const SOFT_DELETED_DOCUMENTS_IDS_KEY: &str = "soft-deleted-documents-ids";
     pub const HIDDEN_FACETED_FIELDS_KEY: &str = "hidden-faceted-fields";
     pub const FILTERABLE_FIELDS_KEY: &str = "filterable-fields";
     pub const SORTABLE_FIELDS_KEY: &str = "sortable-fields";
@@ -367,29 +366,6 @@ impl Index {
         Ok(count.unwrap_or_default())
     }
 
-    /* deleted documents ids */
-
-    /// Writes the soft deleted documents ids.
-    pub(crate) fn put_soft_deleted_documents_ids(
-        &self,
-        wtxn: &mut RwTxn,
-        docids: &RoaringBitmap,
-    ) -> heed::Result<()> {
-        self.main.put::<_, Str, RoaringBitmapCodec>(
-            wtxn,
-            main_key::SOFT_DELETED_DOCUMENTS_IDS_KEY,
-            docids,
-        )
-    }
-
-    /// Returns the soft deleted documents ids.
-    pub(crate) fn soft_deleted_documents_ids(&self, rtxn: &RoTxn) -> heed::Result<RoaringBitmap> {
-        Ok(self
-            .main
-            .get::<_, Str, RoaringBitmapCodec>(rtxn, main_key::SOFT_DELETED_DOCUMENTS_IDS_KEY)?
-            .unwrap_or_default())
-    }
-
     /* primary key */
 
     /// Writes the documents primary key, this is the field name that is used to store the id.
@@ -1187,12 +1163,7 @@ impl Index {
         rtxn: &'t RoTxn,
         ids: impl IntoIterator<Item = DocumentId> + 'a,
     ) -> Result<impl Iterator<Item = Result<(DocumentId, obkv::KvReaderU16<'t>)>> + 'a> {
-        let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?;
-
         Ok(ids.into_iter().map(move |id| {
-            if soft_deleted_documents.contains(id) {
-                return Err(UserError::AccessingSoftDeletedDocument { document_id: id })?;
-            }
             let kv = self
                 .documents
                 .get(rtxn, &BEU32::new(id))?
@@ -1418,14 +1389,10 @@ impl Index {
         rtxn: &RoTxn,
         key: &(Script, Language),
     ) -> heed::Result<Option<RoaringBitmap>> {
-        let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?;
-        let doc_ids = self.script_language_docids.get(rtxn, key)?;
-        Ok(doc_ids.map(|ids| ids - soft_deleted_documents))
+        Ok(self.script_language_docids.get(rtxn, key)?)
     }
 
     pub fn script_language(&self, rtxn: &RoTxn) -> heed::Result<HashMap<Script, Vec<Language>>> {
-        let soft_deleted_documents = self.soft_deleted_documents_ids(rtxn)?;
-
         let mut script_language: HashMap<Script, Vec<Language>> = HashMap::new();
         let mut script_language_doc_count: Vec<(Script, Language, u64)> = Vec::new();
         let mut total = 0;
@@ -1433,7 +1400,7 @@ impl Index {
             let ((script, language), docids) = sl?;
 
             // keep only Languages that contains at least 1 document.
-            let remaining_documents_count = (docids - &soft_deleted_documents).len();
+            let remaining_documents_count = docids.len();
             total += remaining_documents_count;
             if remaining_documents_count > 0 {
                 script_language_doc_count.push((script, language, remaining_documents_count));
@@ -1918,7 +1885,6 @@ pub(crate) mod tests {
         2                        2
         3                        3
         "###);
-        db_snap!(index, soft_deleted_documents_ids, 1, @"[]");
         db_snap!(index, facet_id_f64_docids, 1, @r###"
         1   0  0      1  [0, ]
         1   0  1      1  [1, ]
@@ -1943,7 +1909,6 @@ pub(crate) mod tests {
         2                        6
         3                        3
         "###);
-        db_snap!(index, soft_deleted_documents_ids, 2, @"[0, 1, 2, ]");
         db_snap!(index, facet_id_f64_docids, 2, @r###"
         1   0  0      1  [0, ]
         1   0  1      1  [1, 4, ]
@@ -1965,7 +1930,6 @@ pub(crate) mod tests {
         2                        6
         3                        3
         "###);
-        db_snap!(index, soft_deleted_documents_ids, 3, @"[0, 1, 2, 3, ]");
         db_snap!(index, facet_id_f64_docids, 3, @r###"
         1   0  0      1  [0, ]
         1   0  1      1  [1, 4, ]
@@ -1989,7 +1953,6 @@ pub(crate) mod tests {
         2                        6
         3                        7
         "###);
-        db_snap!(index, soft_deleted_documents_ids, 3, @"[]");
         db_snap!(index, facet_id_f64_docids, 3, @r###"
         0   0  0      1  [4, ]
         0   0  1      1  [5, ]
@@ -2052,7 +2015,6 @@ pub(crate) mod tests {
             2                        2
             3                        3
             "###);
-            db_snap!(index, soft_deleted_documents_ids, 1, @"[]");
             db_snap!(index, facet_id_f64_docids, 1, @r###"
             1   0  0      1  [0, ]
             1   0  1      1  [1, ]
@@ -2085,7 +2047,6 @@ pub(crate) mod tests {
             2                        6
             3                        3
             "###);
-            db_snap!(index, soft_deleted_documents_ids, 1, @"[0, 1, 2, ]");
             db_snap!(index, facet_id_f64_docids, 1, @r###"
             1   0  0      1  [0, 4, ]
             1   0  1      1  [1, 5, ]
@@ -2153,7 +2114,6 @@ pub(crate) mod tests {
             2                        9
             3                        3
             "###);
-            db_snap!(index, soft_deleted_documents_ids, 1, @"[0, 1, 2, 4, 5, 6, ]");
             db_snap!(index, facet_id_f64_docids, 1, @r###"
             1   0  0      1  [0, 4, 7, ]
             1   0  1      1  [1, 5, 8, ]
@@ -2221,7 +2181,7 @@ pub(crate) mod tests {
             2                        12
             3                        3
             "###);
-            db_snap!(index, soft_deleted_documents_ids, 1, @"[]");
+
             db_snap!(index, facet_id_f64_docids, 1, @r###"
             1   0  0      1  [10, ]
             1   0  3      1  [3, 11, ]
@@ -2291,7 +2251,6 @@ pub(crate) mod tests {
         34                       1
         38                       0
         "###);
-        db_snap!(index, soft_deleted_documents_ids, 1, @"[]");
 
         index.delete_document("34");
 
@@ -2302,7 +2261,6 @@ pub(crate) mod tests {
         34                       1
         38                       0
         "###);
-        db_snap!(index, soft_deleted_documents_ids, 2, @"[1, ]");
 
         index
             .update_settings(|s| {
@@ -2318,7 +2276,6 @@ pub(crate) mod tests {
         hard:
         38                       0
         "###);
-        db_snap!(index, soft_deleted_documents_ids, 3, @"[]");
 
         // So that this document addition works correctly now.
         // It would be wrongly interpreted as a replacement before
@@ -2331,7 +2288,6 @@ pub(crate) mod tests {
         34                       1
         38                       0
         "###);
-        db_snap!(index, soft_deleted_documents_ids, 4, @"[]");
 
         // We do the test again, but deleting the document with id 0 instead of id 1 now
         index.delete_document("38");
@@ -2343,7 +2299,6 @@ pub(crate) mod tests {
         34                       1
         38                       0
         "###);
-        db_snap!(index, soft_deleted_documents_ids, 5, @"[0, ]");
 
         index
             .update_settings(|s| {
@@ -2357,7 +2312,6 @@ pub(crate) mod tests {
         hard:
         34                       1
         "###);
-        db_snap!(index, soft_deleted_documents_ids, 6, @"[]");
 
         // And adding lots of documents afterwards instead of just one.
         // These extra subtests don't add much, but it's better than nothing.
@@ -2374,7 +2328,6 @@ pub(crate) mod tests {
         41                       3
         42                       5
         "###);
-        db_snap!(index, soft_deleted_documents_ids, 7, @"[]");
     }
 
     #[test]
@@ -2403,7 +2356,6 @@ pub(crate) mod tests {
         30                       0
         34                       1
         "###);
-        db_snap!(index, soft_deleted_documents_ids, 1, @"[]");
 
         index.delete_document("34");
 
@@ -2414,7 +2366,6 @@ pub(crate) mod tests {
         30                       0
         34                       1
         "###);
-        db_snap!(index, soft_deleted_documents_ids, 2, @"[1, ]");
 
         index
             .update_settings(|s| {
@@ -2430,7 +2381,6 @@ pub(crate) mod tests {
         hard:
         30                       0
         "###);
-        db_snap!(index, soft_deleted_documents_ids, 3, @"[]");
 
         // So that when we add a new document
         index.add_documents(documents!({ "primary_key": 35, "b": 2 })).unwrap();
@@ -2444,7 +2394,6 @@ pub(crate) mod tests {
         30                       0
         35                       1
         "###);
-        db_snap!(index, soft_deleted_documents_ids, 4, @"[]");
 
         // And when we add 34 again, we don't replace document 35
         index.add_documents(documents!({ "primary_key": 34, "a": 1 })).unwrap();
@@ -2458,7 +2407,6 @@ pub(crate) mod tests {
         34                       2
         35                       1
         "###);
-        db_snap!(index, soft_deleted_documents_ids, 5, @"[]");
 
         let rtxn = index.read_txn().unwrap();
         let (_docid, obkv) = index.documents(&rtxn, [0]).unwrap()[0];
@@ -2499,7 +2447,6 @@ pub(crate) mod tests {
         38                       4
         39                       5
         "###);
-        db_snap!(index, soft_deleted_documents_ids, 6, @"[]");
     }
 
     #[test]
@@ -2530,7 +2477,6 @@ pub(crate) mod tests {
         4                        1
         5                        2
         "###);
-        db_snap!(index, soft_deleted_documents_ids, 1, @"[]");
 
         index.delete_document("3");
 
@@ -2542,7 +2488,6 @@ pub(crate) mod tests {
         4                        1
         5                        2
         "###);
-        db_snap!(index, soft_deleted_documents_ids, 2, @"[0, ]");
 
         index.add_documents(documents!([{ "primary_key": "4", "a": 2 }])).unwrap();
 
@@ -2553,7 +2498,6 @@ pub(crate) mod tests {
         4                        3
         5                        2
         "###);
-        db_snap!(index, soft_deleted_documents_ids, 2, @"[]");
 
         index
             .add_documents(documents!([
@@ -2569,7 +2513,6 @@ pub(crate) mod tests {
         4                        3
         5                        2
         "###);
-        db_snap!(index, soft_deleted_documents_ids, 2, @"[]");
     }
 
     #[test]
@@ -2598,7 +2541,6 @@ pub(crate) mod tests {
         11                       0
         4                        1
         "###);
-        db_snap!(index, soft_deleted_documents_ids, @"[]");
 
         index
             .add_documents(documents!([
@@ -2615,7 +2557,6 @@ pub(crate) mod tests {
         11                       0
         4                        2
         "###);
-        db_snap!(index, soft_deleted_documents_ids, @"[1, ]");
 
         let mut wtxn = index.write_txn().unwrap();
         let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap();
@@ -2630,7 +2571,6 @@ pub(crate) mod tests {
         11                       0
         4                        2
         "###);
-        db_snap!(index, soft_deleted_documents_ids, @"[]");
 
         index
             .add_documents(documents!([
@@ -2647,7 +2587,6 @@ pub(crate) mod tests {
         11                       0
         4                        1
         "###);
-        db_snap!(index, soft_deleted_documents_ids, @"[2, 3, ]");
 
         let rtxn = index.read_txn().unwrap();
         let search = Search::new(&rtxn, &index);
diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs
index fac7b68ea..4d9bbc183 100644
--- a/milli/src/search/facet/filter.rs
+++ b/milli/src/search/facet/filter.rs
@@ -223,12 +223,9 @@ impl<'a> Filter<'a> {
 impl<'a> Filter<'a> {
     pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result<RoaringBitmap> {
         // to avoid doing this for each recursive call we're going to do it ONCE ahead of time
-        let soft_deleted_documents = index.soft_deleted_documents_ids(rtxn)?;
         let filterable_fields = index.filterable_fields(rtxn)?;
 
-        // and finally we delete all the soft_deleted_documents, again, only once at the very end
         self.inner_evaluate(rtxn, index, &filterable_fields)
-            .map(|result| result - soft_deleted_documents)
     }
 
     fn evaluate_operator(
diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs
index 77d9f41ec..c22038f81 100644
--- a/milli/src/snapshot_tests.rs
+++ b/milli/src/snapshot_tests.rs
@@ -98,7 +98,6 @@ Create a snapshot test of the given database.
     - `facet_id_string_docids`
     - `documents_ids`
     - `stop_words`
-    - `soft_deleted_documents_ids`
     - `field_distribution`
     - `fields_ids_map`
     - `geo_faceted_documents_ids`
@@ -308,12 +307,6 @@ pub fn snap_stop_words(index: &Index) -> String {
     let snap = format!("{stop_words:?}");
     snap
 }
-pub fn snap_soft_deleted_documents_ids(index: &Index) -> String {
-    let rtxn = index.read_txn().unwrap();
-    let soft_deleted_documents_ids = index.soft_deleted_documents_ids(&rtxn).unwrap();
-
-    display_bitmap(&soft_deleted_documents_ids)
-}
 pub fn snap_field_distributions(index: &Index) -> String {
     let rtxn = index.read_txn().unwrap();
     let mut snap = String::new();
@@ -484,9 +477,6 @@ macro_rules! full_snap_of_db {
     ($index:ident, stop_words) => {{
         $crate::snapshot_tests::snap_stop_words(&$index)
     }};
-    ($index:ident, soft_deleted_documents_ids) => {{
-        $crate::snapshot_tests::snap_soft_deleted_documents_ids(&$index)
-    }};
     ($index:ident, field_distribution) => {{
         $crate::snapshot_tests::snap_field_distributions(&$index)
     }};
diff --git a/milli/src/update/available_documents_ids.rs b/milli/src/update/available_documents_ids.rs
index 784bee5a7..f460693ba 100644
--- a/milli/src/update/available_documents_ids.rs
+++ b/milli/src/update/available_documents_ids.rs
@@ -8,16 +8,11 @@ pub struct AvailableDocumentsIds {
 }
 
 impl AvailableDocumentsIds {
-    pub fn from_documents_ids(
-        docids: &RoaringBitmap,
-        soft_deleted_docids: &RoaringBitmap,
-    ) -> AvailableDocumentsIds {
-        let used_docids = docids | soft_deleted_docids;
-
-        match used_docids.max() {
+    pub fn from_documents_ids(docids: &RoaringBitmap) -> AvailableDocumentsIds {
+        match docids.max() {
             Some(last_id) => {
                 let mut available = RoaringBitmap::from_iter(0..last_id);
-                available -= used_docids;
+                available -= docids;
 
                 let iter = match last_id.checked_add(1) {
                     Some(id) => id..=u32::max_value(),
@@ -50,7 +45,7 @@ mod tests {
     #[test]
     fn empty() {
         let base = RoaringBitmap::new();
-        let left = AvailableDocumentsIds::from_documents_ids(&base, &RoaringBitmap::new());
+        let left = AvailableDocumentsIds::from_documents_ids(&base);
         let right = 0..=u32::max_value();
         left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
     }
@@ -63,28 +58,8 @@ mod tests {
         base.insert(100);
         base.insert(405);
 
-        let left = AvailableDocumentsIds::from_documents_ids(&base, &RoaringBitmap::new());
+        let left = AvailableDocumentsIds::from_documents_ids(&base);
         let right = (0..=u32::max_value()).filter(|&n| n != 0 && n != 10 && n != 100 && n != 405);
         left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
     }
-
-    #[test]
-    fn soft_deleted() {
-        let mut base = RoaringBitmap::new();
-        base.insert(0);
-        base.insert(10);
-        base.insert(100);
-        base.insert(405);
-
-        let mut soft_deleted = RoaringBitmap::new();
-        soft_deleted.insert(1);
-        soft_deleted.insert(11);
-        soft_deleted.insert(101);
-        soft_deleted.insert(406);
-
-        let left = AvailableDocumentsIds::from_documents_ids(&base, &soft_deleted);
-        let right =
-            (0..=u32::max_value()).filter(|&n| ![0, 1, 10, 11, 100, 101, 405, 406].contains(&n));
-        left.zip(right).take(500).for_each(|(l, r)| assert_eq!(l, r));
-    }
 }
diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs
index 3eb7e0910..ca5f69808 100644
--- a/milli/src/update/clear_documents.rs
+++ b/milli/src/update/clear_documents.rs
@@ -56,7 +56,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
         self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?;
         self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?;
         self.index.put_documents_ids(self.wtxn, &empty_roaring)?;
-        self.index.put_soft_deleted_documents_ids(self.wtxn, &empty_roaring)?;
         self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?;
         self.index.delete_geo_rtree(self.wtxn)?;
         self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs
index 71e434599..70a5e24c8 100644
--- a/milli/src/update/facet/mod.rs
+++ b/milli/src/update/facet/mod.rs
@@ -594,7 +594,6 @@ mod tests {
         index.add_documents(documents).unwrap();
 
         db_snap!(index, facet_id_f64_docids, "initial", @"777e0e221d778764b472c512617eeb3b");
-        db_snap!(index, soft_deleted_documents_ids, "initial", @"[]");
 
         let mut documents = vec![];
         for i in 0..999 {
@@ -616,7 +615,6 @@ mod tests {
         index.add_documents(documents).unwrap();
 
         db_snap!(index, facet_id_f64_docids, "replaced_1_soft", @"abba175d7bed727d0efadaef85a4388f");
-        db_snap!(index, soft_deleted_documents_ids, "replaced_1_soft", @"6c975deb900f286d2f6456d2d5c3a123");
 
         // Then replace the last document while disabling soft_deletion
         let mut documents = vec![];
@@ -639,7 +637,6 @@ mod tests {
         index.add_documents(documents).unwrap();
 
         db_snap!(index, facet_id_f64_docids, "replaced_2_hard", @"029e27a46d09c574ae949aa4289b45e6");
-        db_snap!(index, soft_deleted_documents_ids, "replaced_2_hard", @"[]");
     }
 }
 
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index c8481bd48..864e13d04 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -999,7 +999,6 @@ mod tests {
         assert_eq!(count, 6);
 
         db_snap!(index, word_docids, "updated");
-        db_snap!(index, soft_deleted_documents_ids, "updated", @"[0, 1, 4, ]");
 
         drop(rtxn);
     }
@@ -2649,8 +2648,6 @@ mod tests {
         0                        1
         "###);
 
-        db_snap!(index, soft_deleted_documents_ids, @"[]");
-
         // BATCH 3
 
         println!("--- ENTERING BATCH 3");
diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index e02da8cb5..872230d99 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -132,17 +132,13 @@ impl<'a, 'i> Transform<'a, 'i> {
             indexer_settings.max_memory.map(|mem| mem / 2),
         );
         let documents_ids = index.documents_ids(wtxn)?;
-        let soft_deleted_documents_ids = index.soft_deleted_documents_ids(wtxn)?;
 
         Ok(Transform {
             index,
             fields_ids_map: index.fields_ids_map(wtxn)?,
             indexer_settings,
             autogenerate_docids,
-            available_documents_ids: AvailableDocumentsIds::from_documents_ids(
-                &documents_ids,
-                &soft_deleted_documents_ids,
-            ),
+            available_documents_ids: AvailableDocumentsIds::from_documents_ids(&documents_ids),
             original_sorter,
             flattened_sorter,
             index_documents_method,
diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs
index 7d77490bc..d6aa8e5a3 100644
--- a/milli/src/update/prefix_word_pairs/mod.rs
+++ b/milli/src/update/prefix_word_pairs/mod.rs
@@ -508,7 +508,6 @@ mod tests {
         db_snap!(index, word_docids, "replaced");
         db_snap!(index, word_prefix_pair_proximity_docids, "replaced");
         db_snap!(index, prefix_word_pair_proximity_docids, "replaced");
-        db_snap!(index, soft_deleted_documents_ids, "replaced", @"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, ]");
     }
 
     #[test]
@@ -568,6 +567,5 @@ mod tests {
         db_snap!(index, word_docids, "replaced");
         db_snap!(index, word_prefix_pair_proximity_docids, "replaced");
         db_snap!(index, prefix_word_pair_proximity_docids, "replaced");
-        db_snap!(index, soft_deleted_documents_ids, "replaced", @"[]");
     }
 }

From fa6c7f65cae9fcb232bcc671c82421304da47d25 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Wed, 25 Oct 2023 14:42:09 +0200
Subject: [PATCH 057/127] Add TempIndex::delete_documents

---
 milli/src/index.rs | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/milli/src/index.rs b/milli/src/index.rs
index b20674d4c..64aff636b 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -1526,7 +1526,7 @@ pub(crate) mod tests {
             Ok(())
         }
 
-        pub fn delete_document(&self, external_document_id: &str) {
+        pub fn delete_documents(&self, external_document_ids: Vec<String>) {
             let mut wtxn = self.write_txn().unwrap();
 
             let builder = IndexDocuments::new(
@@ -1538,13 +1538,16 @@ pub(crate) mod tests {
                 || false,
             )
             .unwrap();
-            let (builder, user_error) =
-                builder.remove_documents(vec![external_document_id.to_owned()]).unwrap();
+            let (builder, user_error) = builder.remove_documents(external_document_ids).unwrap();
             user_error.unwrap();
             builder.execute().unwrap();
 
             wtxn.commit().unwrap();
         }
+
+        pub fn delete_document(&self, external_document_id: &str) {
+            self.delete_documents(vec![external_document_id.to_string()])
+        }
     }
 
     #[test]

From 290e773d23a4c0108b9c9330ed2a5ca76028e973 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Wed, 25 Oct 2023 14:49:25 +0200
Subject: [PATCH 058/127] remove more warnings and fix some tests

---
 milli/src/index.rs                        | 10 +++-----
 milli/src/snapshot_tests.rs               |  3 +--
 milli/src/update/facet/mod.rs             |  2 +-
 milli/src/update/index_documents/mod.rs   | 25 ++++++--------------
 milli/src/update/prefix_word_pairs/mod.rs | 28 ++++-------------------
 milli/src/update/settings.rs              | 10 +++-----
 6 files changed, 19 insertions(+), 59 deletions(-)

diff --git a/milli/src/index.rs b/milli/src/index.rs
index 64aff636b..a280a1a48 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -1861,8 +1861,7 @@ pub(crate) mod tests {
         use big_s::S;
         use maplit::hashset;
 
-        let mut index = TempIndex::new();
-        let index = index;
+        let index = TempIndex::new();
 
         index
             .update_settings(|settings| {
@@ -1973,7 +1972,7 @@ pub(crate) mod tests {
         use big_s::S;
         use maplit::hashset;
 
-        let mut index = TempIndex::new();
+        let index = TempIndex::new();
 
         index
             .update_settings(|settings| {
@@ -2561,10 +2560,7 @@ pub(crate) mod tests {
         4                        2
         "###);
 
-        let mut wtxn = index.write_txn().unwrap();
-        let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap();
-        delete.execute().unwrap();
-        wtxn.commit().unwrap();
+        index.delete_documents(Default::default());
 
         db_snap!(index, documents_ids, @"[0, 2, 3, ]");
         db_snap!(index, external_documents_ids, @r###"
diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs
index c22038f81..1d8d63277 100644
--- a/milli/src/snapshot_tests.rs
+++ b/milli/src/snapshot_tests.rs
@@ -4,9 +4,8 @@ use std::path::Path;
 
 use roaring::RoaringBitmap;
 
-use crate::facet::FacetType;
 use crate::heed_codec::facet::{FacetGroupKey, FacetGroupValue};
-use crate::{make_db_snap_from_iter, obkv_to_json, ExternalDocumentsIds, Index};
+use crate::{make_db_snap_from_iter, obkv_to_json, Index};
 
 #[track_caller]
 pub fn default_db_snapshot_settings_for_test(name: Option<&str>) -> (insta::Settings, String) {
diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs
index 70a5e24c8..05e6a93d8 100644
--- a/milli/src/update/facet/mod.rs
+++ b/milli/src/update/facet/mod.rs
@@ -566,7 +566,7 @@ mod tests {
 
     #[test]
     fn replace_all_identical_soft_deletion_then_hard_deletion() {
-        let mut index = TempIndex::new_with_map_size(4096 * 1000 * 100);
+        let index = TempIndex::new_with_map_size(4096 * 1000 * 100);
 
         index
             .update_settings(|settings| {
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 864e13d04..c1e40373f 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -696,7 +696,6 @@ mod tests {
     use crate::documents::documents_batch_reader_from_objects;
     use crate::index::tests::TempIndex;
     use crate::search::TermsMatchingStrategy;
-    use crate::update::DeleteDocuments;
     use crate::{db_snap, BEU16};
 
     #[test]
@@ -1101,17 +1100,15 @@ mod tests {
                 { "objectId": 30,  "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } }
             ]))
             .unwrap();
-        let mut wtxn = index.write_txn().unwrap();
-        assert_eq!(index.primary_key(&wtxn).unwrap(), Some("objectId"));
 
         // Delete not all of the documents but some of them.
-        let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
-        builder.delete_external_id("30");
-        builder.execute().unwrap();
+        index.delete_document("30");
 
-        let external_documents_ids = index.external_documents_ids(&wtxn).unwrap();
+        let txn = index.read_txn().unwrap();
+        assert_eq!(index.primary_key(&txn).unwrap(), Some("objectId"));
+
+        let external_documents_ids = index.external_documents_ids(&txn).unwrap();
         assert!(external_documents_ids.get("30").is_none());
-        wtxn.commit().unwrap();
 
         index
             .add_documents(documents!([
@@ -2493,16 +2490,8 @@ mod tests {
         db_snap!(index, word_fid_docids, 2, @"a48d3f88db33f94bc23110a673ea49e4");
         db_snap!(index, word_position_docids, 2, @"3c9e66c6768ae2cf42b46b2c46e46a83");
 
-        let mut wtxn = index.write_txn().unwrap();
-
         // Delete not all of the documents but some of them.
-        let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
-        builder.delete_external_id("0");
-        builder.delete_external_id("3");
-        let result = builder.execute().unwrap();
-        println!("{result:?}");
-
-        wtxn.commit().unwrap();
+        index.delete_documents(vec!["0".into(), "3".into()]);
 
         db_snap!(index, word_fid_docids, 3, @"4c2e2a1832e5802796edc1638136d933");
         db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f");
@@ -2557,7 +2546,7 @@ mod tests {
             ),
         ]
         */
-        let mut index = TempIndex::new();
+        let index = TempIndex::new();
 
         // START OF BATCH
 
diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs
index d6aa8e5a3..1ec57e080 100644
--- a/milli/src/update/prefix_word_pairs/mod.rs
+++ b/milli/src/update/prefix_word_pairs/mod.rs
@@ -142,9 +142,6 @@ pub fn write_into_lmdb_database_without_merging(
 #[cfg(test)]
 mod tests {
     use std::io::Cursor;
-    use std::iter::FromIterator;
-
-    use roaring::RoaringBitmap;
 
     use crate::db_snap;
     use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
@@ -335,22 +332,14 @@ mod tests {
         db_snap!(index, word_prefix_pair_proximity_docids, "initial");
         db_snap!(index, prefix_word_pair_proximity_docids, "initial");
 
-        let mut wtxn = index.write_txn().unwrap();
-        let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap();
-        delete.delete_documents(&RoaringBitmap::from_iter([50]));
-        delete.execute().unwrap();
-        wtxn.commit().unwrap();
+        index.delete_document("9000");
 
         db_snap!(index, documents_ids, "first_delete");
         db_snap!(index, word_docids, "first_delete");
         db_snap!(index, word_prefix_pair_proximity_docids, "first_delete");
         db_snap!(index, prefix_word_pair_proximity_docids, "first_delete");
 
-        let mut wtxn = index.write_txn().unwrap();
-        let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap();
-        delete.delete_documents(&RoaringBitmap::from_iter(0..50));
-        delete.execute().unwrap();
-        wtxn.commit().unwrap();
+        index.delete_documents((0..50).map(|id| id.to_string()).collect());
 
         db_snap!(index, documents_ids, "second_delete");
         db_snap!(index, word_docids, "second_delete");
@@ -417,23 +406,14 @@ mod tests {
         db_snap!(index, word_prefix_pair_proximity_docids, "initial");
         db_snap!(index, prefix_word_pair_proximity_docids, "initial");
 
-        let mut wtxn = index.write_txn().unwrap();
-        let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap();
-        delete.delete_documents(&RoaringBitmap::from_iter([50]));
-        delete.execute().unwrap();
-        wtxn.commit().unwrap();
+        index.delete_document("9000");
 
         db_snap!(index, documents_ids, "first_delete");
         db_snap!(index, word_docids, "first_delete");
         db_snap!(index, word_prefix_pair_proximity_docids, "first_delete");
         db_snap!(index, prefix_word_pair_proximity_docids, "first_delete");
 
-        let mut wtxn = index.write_txn().unwrap();
-        let mut delete = DeleteDocuments::new(&mut wtxn, &index).unwrap();
-
-        delete.delete_documents(&RoaringBitmap::from_iter(0..50));
-        delete.execute().unwrap();
-        wtxn.commit().unwrap();
+        index.delete_documents((0..50).map(|id| id.to_string()).collect());
 
         db_snap!(index, documents_ids, "second_delete");
         db_snap!(index, word_docids, "second_delete");
diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs
index c2c0e9084..fd7ffa760 100644
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -923,7 +923,7 @@ mod tests {
     use super::*;
     use crate::error::Error;
     use crate::index::tests::TempIndex;
-    use crate::update::{ClearDocuments, DeleteDocuments};
+    use crate::update::ClearDocuments;
     use crate::{Criterion, Filter, SearchResult};
 
     #[test]
@@ -1768,13 +1768,9 @@ mod tests {
         }
         index.add_documents(documents! { docs }).unwrap();
 
-        let mut wtxn = index.write_txn().unwrap();
-        let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
-        (0..5).for_each(|id| {
-            builder.delete_external_id(&id.to_string());
-        });
-        builder.execute().unwrap();
+        index.delete_documents((0..5).map(|id| id.to_string()).collect());
 
+        let mut wtxn = index.write_txn().unwrap();
         index
             .update_settings_using_wtxn(&mut wtxn, |settings| {
                 settings.set_searchable_fields(vec!["id".to_string()]);

From 73c06d31d973771d667ba40939db8f399061a5cb Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Wed, 25 Oct 2023 16:50:49 +0200
Subject: [PATCH 059/127] Snapshots always display external ids in a consistent order

---
 milli/src/snapshot_tests.rs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs
index 1d8d63277..730d0a5c8 100644
--- a/milli/src/snapshot_tests.rs
+++ b/milli/src/snapshot_tests.rs
@@ -333,6 +333,9 @@ pub fn snap_geo_faceted_documents_ids(index: &Index) -> String {
 pub fn snap_external_documents_ids(index: &Index) -> String {
     let rtxn = index.read_txn().unwrap();
     let external_ids = index.external_documents_ids(&rtxn).unwrap().to_hash_map();
+    // ensure fixed order (not guaranteed by hashmap)
+    let mut external_ids: Vec<(String, u32)> = external_ids.into_iter().collect();
+    external_ids.sort_by(|(l, _), (r, _)| l.cmp(r));
 
     let mut snap = String::new();
 

From 3c158818187313eec79684ac498e43fd71e47409 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Wed, 25 Oct 2023 17:32:36 +0200
Subject: [PATCH 060/127] Add simple delete test

---
 milli/src/index.rs | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/milli/src/index.rs b/milli/src/index.rs
index a280a1a48..481f698fc 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -2332,6 +2332,32 @@ pub(crate) mod tests {
         "###);
     }
 
+    #[test]
+    fn simple_delete() {
+        let mut index = TempIndex::new();
+        index.index_documents_config.update_method = IndexDocumentsMethod::UpdateDocuments;
+        index
+            .add_documents(documents!([
+                { "id": 30 },
+                { "id": 34 }
+            ]))
+            .unwrap();
+
+        db_snap!(index, documents_ids, @"[0, 1, ]");
+        db_snap!(index, external_documents_ids, 1, @r###"
+        docids:
+        30                       0
+        34                       1"###);
+
+        index.delete_document("34");
+
+        db_snap!(index, documents_ids, @"[0, ]");
+        db_snap!(index, external_documents_ids, 2, @r###"
+        docids:
+        30                       0
+        "###);
+    }
+
     #[test]
     fn bug_3021_second() {
         // https://github.com/meilisearch/meilisearch/issues/3021

From e78281785ca8568e4a55833ec1fa4139dc097611 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Wed, 25 Oct 2023 17:32:45 +0200
Subject: [PATCH 061/127] Actually execute the transform even if there are only
 documents to delete

---
 milli/src/update/index_documents/mod.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index c1e40373f..ee1dea7d5 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -201,7 +201,7 @@ where
     pub fn execute(mut self) -> Result<DocumentAdditionResult> {
         puffin::profile_function!();
 
-        if self.added_documents == 0 {
+        if self.added_documents == 0 && self.deleted_documents == 0 {
             let number_of_documents = self.index.number_of_documents(self.wtxn)?;
             return Ok(DocumentAdditionResult { indexed_documents: 0, number_of_documents });
         }

From a35988550cc785b66368f0e7cc4904930486f793 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Wed, 25 Oct 2023 18:02:43 +0200
Subject: [PATCH 062/127] Fix some snapshots

---
 milli/src/index.rs                      | 144 +++++++++---------------
 milli/src/update/index_documents/mod.rs |   6 +-
 2 files changed, 58 insertions(+), 92 deletions(-)

diff --git a/milli/src/index.rs b/milli/src/index.rs
index 481f698fc..ba00111b3 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -1880,8 +1880,7 @@ pub(crate) mod tests {
 
         db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]");
         db_snap!(index, external_documents_ids, 1, @r###"
-        soft:
-        hard:
+        docids:
         0                        0
         1                        1
         2                        2
@@ -1902,13 +1901,12 @@ pub(crate) mod tests {
         }
         index.add_documents(documents!(docs)).unwrap();
 
-        db_snap!(index, documents_ids, @"[3, 4, 5, 6, ]");
+        db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]");
         db_snap!(index, external_documents_ids, 2, @r###"
-        soft:
-        hard:
-        0                        4
-        1                        5
-        2                        6
+        docids:
+        0                        0
+        1                        1
+        2                        2
         3                        3
         "###);
         db_snap!(index, facet_id_f64_docids, 2, @r###"
@@ -1922,14 +1920,12 @@ pub(crate) mod tests {
             .add_documents(documents!([{ "id": 3, "doggo": 4 }, { "id": 3, "doggo": 5 },{ "id": 3, "doggo": 4 }]))
             .unwrap();
 
-        db_snap!(index, documents_ids, @"[4, 5, 6, 7, ]");
+        db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]");
         db_snap!(index, external_documents_ids, 3, @r###"
-        soft:
-        3                        7
-        hard:
-        0                        4
-        1                        5
-        2                        6
+        docids:
+        0                        0
+        1                        1
+        2                        2
         3                        3
         "###);
         db_snap!(index, facet_id_f64_docids, 3, @r###"
@@ -1946,14 +1942,13 @@ pub(crate) mod tests {
             })
             .unwrap();
 
-        db_snap!(index, documents_ids, @"[4, 5, 6, 7, ]");
+        db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]");
         db_snap!(index, external_documents_ids, 3, @r###"
-        soft:
-        hard:
-        0                        4
-        1                        5
-        2                        6
-        3                        7
+        docids:
+        0                        0
+        1                        1
+        2                        2
+        3                        3
         "###);
         db_snap!(index, facet_id_f64_docids, 3, @r###"
         0   0  0      1  [4, ]
@@ -2010,8 +2005,7 @@ pub(crate) mod tests {
 
             db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]");
             db_snap!(index, external_documents_ids, 1, @r###"
-            soft:
-            hard:
+            docids:
             0                        0
             1                        1
             2                        2
@@ -2040,13 +2034,12 @@ pub(crate) mod tests {
             }
             add_documents(&index, vec![docs1, docs2]);
 
-            db_snap!(index, documents_ids, @"[3, 4, 5, 6, ]");
+            db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]");
             db_snap!(index, external_documents_ids, 1, @r###"
-            soft:
-            hard:
-            0                        4
-            1                        5
-            2                        6
+            docids:
+            0                        0
+            1                        1
+            2                        2
             3                        3
             "###);
             db_snap!(index, facet_id_f64_docids, 1, @r###"
@@ -2248,8 +2241,7 @@ pub(crate) mod tests {
 
         db_snap!(index, documents_ids, @"[0, 1, ]");
         db_snap!(index, external_documents_ids, 1, @r###"
-        soft:
-        hard:
+        docids:
         34                       1
         38                       0
         "###);
@@ -2258,9 +2250,7 @@ pub(crate) mod tests {
 
         db_snap!(index, documents_ids, @"[0, ]");
         db_snap!(index, external_documents_ids, 2, @r###"
-        soft:
-        hard:
-        34                       1
+        docids:
         38                       0
         "###);
 
@@ -2274,8 +2264,7 @@ pub(crate) mod tests {
         // do not contain any entry for previously soft-deleted document ids
         db_snap!(index, documents_ids, @"[0, ]");
         db_snap!(index, external_documents_ids, 3, @r###"
-        soft:
-        hard:
+        docids:
         38                       0
         "###);
 
@@ -2285,8 +2274,7 @@ pub(crate) mod tests {
 
         db_snap!(index, documents_ids, @"[0, 1, ]");
         db_snap!(index, external_documents_ids, 4, @r###"
-        soft:
-        hard:
+        docids:
         34                       1
         38                       0
         "###);
@@ -2296,10 +2284,8 @@ pub(crate) mod tests {
 
         db_snap!(index, documents_ids, @"[1, ]");
         db_snap!(index, external_documents_ids, 5, @r###"
-        soft:
-        hard:
+        docids:
         34                       1
-        38                       0
         "###);
 
         index
@@ -2310,8 +2296,7 @@ pub(crate) mod tests {
 
         db_snap!(index, documents_ids, @"[1, ]");
         db_snap!(index, external_documents_ids, 6, @r###"
-        soft:
-        hard:
+        docids:
         34                       1
         "###);
 
@@ -2321,8 +2306,7 @@ pub(crate) mod tests {
 
         db_snap!(index, documents_ids, @"[0, 1, 2, 3, 4, 5, ]");
         db_snap!(index, external_documents_ids, 7, @r###"
-        soft:
-        hard:
+        docids:
         34                       1
         38                       0
         39                       2
@@ -2379,8 +2363,7 @@ pub(crate) mod tests {
 
         db_snap!(index, documents_ids, @"[0, 1, ]");
         db_snap!(index, external_documents_ids, 1, @r###"
-        soft:
-        hard:
+        docids:
         30                       0
         34                       1
         "###);
@@ -2389,10 +2372,8 @@ pub(crate) mod tests {
 
         db_snap!(index, documents_ids, @"[0, ]");
         db_snap!(index, external_documents_ids, 2, @r###"
-        soft:
-        hard:
+        docids:
         30                       0
-        34                       1
         "###);
 
         index
@@ -2405,8 +2386,7 @@ pub(crate) mod tests {
         // do not contain any entry for previously soft-deleted document ids
         db_snap!(index, documents_ids, @"[0, ]");
         db_snap!(index, external_documents_ids, 3, @r###"
-        soft:
-        hard:
+        docids:
         30                       0
         "###);
 
@@ -2417,8 +2397,7 @@ pub(crate) mod tests {
         // The external documents ids don't have several external ids pointing to the same
         // internal document id
         db_snap!(index, external_documents_ids, 4, @r###"
-        soft:
-        hard:
+        docids:
         30                       0
         35                       1
         "###);
@@ -2429,8 +2408,7 @@ pub(crate) mod tests {
         // And document 35 still exists, is not deleted
         db_snap!(index, documents_ids, @"[0, 1, 2, ]");
         db_snap!(index, external_documents_ids, 5, @r###"
-        soft:
-        hard:
+        docids:
         30                       0
         34                       2
         35                       1
@@ -2466,8 +2444,7 @@ pub(crate) mod tests {
 
         db_snap!(index, documents_ids, @"[0, 1, 2, 3, 4, 5, ]");
         db_snap!(index, external_documents_ids, 6, @r###"
-        soft:
-        hard:
+        docids:
         30                       0
         34                       2
         35                       1
@@ -2499,8 +2476,7 @@ pub(crate) mod tests {
 
         db_snap!(index, documents_ids, @"[0, 1, 2, ]");
         db_snap!(index, external_documents_ids, 1, @r###"
-        soft:
-        hard:
+        docids:
         3                        0
         4                        1
         5                        2
@@ -2510,20 +2486,17 @@ pub(crate) mod tests {
 
         db_snap!(index, documents_ids, @"[1, 2, ]");
         db_snap!(index, external_documents_ids, 2, @r###"
-        soft:
-        hard:
-        3                        0
+        docids:
         4                        1
         5                        2
         "###);
 
         index.add_documents(documents!([{ "primary_key": "4", "a": 2 }])).unwrap();
 
-        db_snap!(index, documents_ids, @"[2, 3, ]");
+        db_snap!(index, documents_ids, @"[1, 2, ]");
         db_snap!(index, external_documents_ids, 2, @r###"
-        soft:
-        hard:
-        4                        3
+        docids:
+        4                        1
         5                        2
         "###);
 
@@ -2533,12 +2506,11 @@ pub(crate) mod tests {
             ]))
             .unwrap();
 
-        db_snap!(index, documents_ids, @"[0, 2, 3, ]");
+        db_snap!(index, documents_ids, @"[0, 1, 2, ]");
         db_snap!(index, external_documents_ids, 2, @r###"
-        soft:
-        hard:
+        docids:
         3                        0
-        4                        3
+        4                        1
         5                        2
         "###);
     }
@@ -2564,8 +2536,7 @@ pub(crate) mod tests {
 
         db_snap!(index, documents_ids, @"[0, 1, ]");
         db_snap!(index, external_documents_ids, @r###"
-        soft:
-        hard:
+        docids:
         11                       0
         4                        1
         "###);
@@ -2577,24 +2548,22 @@ pub(crate) mod tests {
             ]))
             .unwrap();
 
-        db_snap!(index, documents_ids, @"[0, 2, 3, ]");
+        db_snap!(index, documents_ids, @"[0, 1, 2, ]");
         db_snap!(index, external_documents_ids, @r###"
-        soft:
-        hard:
-        1                        3
+        docids:
+        1                        2
         11                       0
-        4                        2
+        4                        1
         "###);
 
         index.delete_documents(Default::default());
 
-        db_snap!(index, documents_ids, @"[0, 2, 3, ]");
+        db_snap!(index, documents_ids, @"[0, 1, 2, ]");
         db_snap!(index, external_documents_ids, @r###"
-        soft:
-        hard:
-        1                        3
+        docids:
+        1                        2
         11                       0
-        4                        2
+        4                        1
         "###);
 
         index
@@ -2604,11 +2573,10 @@ pub(crate) mod tests {
             ]))
             .unwrap();
 
-        db_snap!(index, documents_ids, @"[0, 1, 4, ]");
+        db_snap!(index, documents_ids, @"[0, 1, 2, ]");
         db_snap!(index, external_documents_ids, @r###"
-        soft:
-        hard:
-        1                        4
+        docids:
+        1                        2
         11                       0
         4                        1
         "###);
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index ee1dea7d5..5f5c418d9 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -2586,8 +2586,7 @@ mod tests {
         {"id":1,"doggo":"bernese"}
         "###);
         db_snap!(index, external_documents_ids, @r###"
-        soft:
-        hard:
+        docids:
         1                        0
         "###);
 
@@ -2632,8 +2631,7 @@ mod tests {
         "###);
 
         db_snap!(index, external_documents_ids, @r###"
-        soft:
-        hard:
+        docids:
         0                        1
         "###);
 

From 9a2dccc3bcda089cdc596481d585b486f9b38729 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Thu, 26 Oct 2023 10:36:34 +0200
Subject: [PATCH 063/127] Add iterator to find external ids of a bitmap of
 internal ids

---
 milli/src/external_documents_ids.rs | 68 ++++++++++++++++++++++++++++-
 1 file changed, 67 insertions(+), 1 deletion(-)

diff --git a/milli/src/external_documents_ids.rs b/milli/src/external_documents_ids.rs
index 12db4eb1d..02794609f 100644
--- a/milli/src/external_documents_ids.rs
+++ b/milli/src/external_documents_ids.rs
@@ -4,6 +4,7 @@ use std::convert::TryInto;
 use std::fmt;
 
 use fst::Streamer;
+use roaring::RoaringBitmap;
 
 use crate::DocumentId;
 
@@ -55,7 +56,24 @@ impl<'a> ExternalDocumentsIds<'a> {
         self.0.as_fst().as_bytes()
     }
 
-    /// Apply the list of operations passed as argument, modifying the current external to internal id mapping.
+    /// Looks for the internal ids in the passed bitmap, and returns an iterator over the mapping between
+    /// these internal ids and their external id.
+    ///
+    /// The returned iterator has `Result<(String, DocumentId), RoaringBitmap>` as `Item`,
+    /// where the returned values can be:
+    /// - `Ok((external_id, internal_id))`: if a mapping was found
+    /// - `Err(remaining_ids)`: if the external ids for some of the requested internal ids weren't found.
+    ///   In that case the returned bitmap contains the internal ids whose external ids were not found after traversing
+    ///   the entire fst.
+    pub fn find_external_id_of(
+        &self,
+        internal_ids: RoaringBitmap,
+    ) -> ExternalToInternalOwnedIterator<'_> {
+        let it = ExternalToInternalOwnedIterator { stream: self.0.stream(), internal_ids };
+        it
+    }
+
+    /// Applies the list of operations passed as argument, modifying the current external to internal id mapping.
     ///
     /// If the list contains multiple operations on the same external id, then the result is unspecified.
     ///
@@ -129,3 +147,51 @@ impl Default for ExternalDocumentsIds<'static> {
         ExternalDocumentsIds(fst::Map::default().map_data(Cow::Owned).unwrap())
     }
 }
+
+/// An iterator over mappings between requested internal ids and external ids.
+///
+/// See [`ExternalDocumentsIds::find_external_id_of`] for details.
+pub struct ExternalToInternalOwnedIterator<'it> {
+    stream: fst::map::Stream<'it>,
+    internal_ids: RoaringBitmap,
+}
+
+impl<'it> Iterator for ExternalToInternalOwnedIterator<'it> {
+    /// A result indicating if a mapping was found, or if the stream was exhausted without finding all internal ids.
+    type Item = Result<(String, DocumentId), RoaringBitmap>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        // if all requested ids were found, we won't find any other, so short-circuit
+        if self.internal_ids.is_empty() {
+            return None;
+        }
+        loop {
+            let Some((external, internal)) = self.stream.next() else {
+                // we exhausted the stream but we still have some internal ids to find
+                let remaining_ids = std::mem::take(&mut self.internal_ids);
+                return Some(Err(remaining_ids));
+                // note: next calls to `next` will return `None` since we replaced the internal_ids
+                // with the default empty bitmap
+            };
+            let internal = internal.try_into().unwrap();
+            let was_contained = self.internal_ids.remove(internal);
+            if was_contained {
+                return Some(Ok((std::str::from_utf8(external).unwrap().to_owned(), internal)));
+            }
+        }
+    }
+}
+
+impl<'it> ExternalToInternalOwnedIterator<'it> {
+    /// Returns the bitmap of internal ids whose external id are yet to be found
+    pub fn remaining_internal_ids(&self) -> &RoaringBitmap {
+        &self.internal_ids
+    }
+
+    /// Consumes this iterator and returns an iterator over only the external ids, ignoring the internal ids.
+    ///
+    /// Use this when you don't need the mapping between the external and the internal ids.
+    pub fn only_external_ids(self) -> impl Iterator<Item = Result<String, RoaringBitmap>> + 'it {
+        self.map(|res| res.map(|(external, _internal)| external))
+    }
+}

From 652ac3052d518433c69fab9b3cdfaccc0f6bed68 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Thu, 26 Oct 2023 10:54:20 +0200
Subject: [PATCH 064/127] use new iterator in batch

---
 index-scheduler/src/batch.rs | 50 +++++++++++++++++++++++++++++++-----
 1 file changed, 43 insertions(+), 7 deletions(-)

diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs
index a4b7e5c45..c4f9c12be 100644
--- a/index-scheduler/src/batch.rs
+++ b/index-scheduler/src/batch.rs
@@ -30,7 +30,7 @@ use meilisearch_types::heed::{RoTxn, RwTxn};
 use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader};
 use meilisearch_types::milli::heed::CompactionOption;
 use meilisearch_types::milli::update::{
-    IndexDocumentsConfig, IndexDocumentsMethod, Settings as MilliSettings,
+    IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings as MilliSettings,
 };
 use meilisearch_types::milli::{self, Filter, BEU32};
 use meilisearch_types::settings::{apply_settings_to_builder, Settings, Unchecked};
@@ -43,7 +43,7 @@ use uuid::Uuid;
 
 use crate::autobatcher::{self, BatchKind};
 use crate::utils::{self, swap_index_uid_in_task};
-use crate::{Error, IndexScheduler, ProcessingTasks, Result, TaskId};
+use crate::{Error, IndexScheduler, MustStopProcessing, ProcessingTasks, Result, TaskId};
 
 /// Represents a combination of tasks that can all be processed at the same time.
 ///
@@ -1323,7 +1323,13 @@ impl IndexScheduler {
                     } else {
                         unreachable!()
                     };
-                let deleted_documents = delete_document_by_filter(index_wtxn, filter, index);
+                let deleted_documents = delete_document_by_filter(
+                    index_wtxn,
+                    filter,
+                    self.index_mapper.indexer_config(),
+                    self.must_stop_processing.clone(),
+                    index,
+                );
                 let original_filter = if let Some(Details::DocumentDeletionByFilter {
                     original_filter,
                     deleted_documents: _,
@@ -1557,6 +1563,8 @@ impl IndexScheduler {
 fn delete_document_by_filter<'a>(
     wtxn: &mut RwTxn<'a, '_>,
     filter: &serde_json::Value,
+    indexer_config: &IndexerConfig,
+    must_stop_processing: MustStopProcessing,
     index: &'a Index,
 ) -> Result<u64> {
     let filter = Filter::from_json(filter)?;
@@ -1567,10 +1575,38 @@ fn delete_document_by_filter<'a>(
             }
             e => e.into(),
         })?;
-        todo!("need a way to get back the external ids from the internal ids");
-        // let mut delete_operation = DeleteDocuments::new(wtxn, index)?;
-        // delete_operation.delete_documents(&candidates);
-        // delete_operation.execute().map(|result| result.deleted_documents)?
+        let external_documents_ids = index.external_documents_ids(wtxn)?;
+        // FIXME: for filters matching a lot of documents, this will allocate a huge vec of external docids (strings).
+        // Since what we have is an iterator, it would be better to delete in chunks
+        let external_to_internal: std::result::Result<Vec<_>, RoaringBitmap> =
+            external_documents_ids.find_external_id_of(candidates).only_external_ids().collect();
+        let document_ids = match external_to_internal {
+            Ok(external_ids) => external_ids,
+            Err(remaining_ids) => panic!("Couldn't find some external ids {:?}", remaining_ids),
+        };
+
+        let config = IndexDocumentsConfig {
+            update_method: IndexDocumentsMethod::ReplaceDocuments,
+            ..Default::default()
+        };
+
+        let mut builder = milli::update::IndexDocuments::new(
+            wtxn,
+            index,
+            indexer_config,
+            config,
+            |indexing_step| debug!("update: {:?}", indexing_step),
+            || must_stop_processing.get(),
+        )?;
+
+        let (new_builder, user_result) = builder.remove_documents(document_ids)?;
+        builder = new_builder;
+        // Relies on an invariant: `remove_documents` always returns `Ok` for the inner result
+        let count = user_result.unwrap();
+
+        let _ = builder.execute()?;
+
+        count
     } else {
         0
     })

From ae4ec8ea55bc976cd3aacab90cb6d845642e40d3 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Thu, 26 Oct 2023 12:15:55 +0200
Subject: [PATCH 065/127] Add delete_document_using_wtxn to TempIndex

---
 milli/src/index.rs | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/milli/src/index.rs b/milli/src/index.rs
index ba00111b3..d99c36b65 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -1526,11 +1526,13 @@ pub(crate) mod tests {
             Ok(())
         }
 
-        pub fn delete_documents(&self, external_document_ids: Vec<String>) {
-            let mut wtxn = self.write_txn().unwrap();
-
+        pub fn delete_documents_using_wtxn<'t>(
+            &'t self,
+            wtxn: &mut RwTxn<'t, '_>,
+            external_document_ids: Vec<String>,
+        ) {
             let builder = IndexDocuments::new(
-                &mut wtxn,
+                wtxn,
                 self,
                 &self.indexer_config,
                 self.index_documents_config.clone(),
@@ -1541,6 +1543,12 @@ pub(crate) mod tests {
             let (builder, user_error) = builder.remove_documents(external_document_ids).unwrap();
             user_error.unwrap();
             builder.execute().unwrap();
+        }
+
+        pub fn delete_documents(&self, external_document_ids: Vec<String>) {
+            let mut wtxn = self.write_txn().unwrap();
+
+            self.delete_documents_using_wtxn(&mut wtxn, external_document_ids);
 
             wtxn.commit().unwrap();
         }

From 8e0d9c9a5e89e0fc0612ba61af6f25fbc358e2b6 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Thu, 26 Oct 2023 12:16:16 +0200
Subject: [PATCH 066/127] Recover delete_documents tests that were too eagerly
 deleted

---
 milli/src/update/index_documents/mod.rs | 533 +++++++++++++++++++++++-
 1 file changed, 532 insertions(+), 1 deletion(-)

diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 5f5c418d9..b439ca409 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -690,13 +690,15 @@ fn execute_word_prefix_docids(
 #[cfg(test)]
 mod tests {
     use big_s::S;
+    use fst::IntoStreamer;
+    use heed::RwTxn;
     use maplit::hashset;
 
     use super::*;
     use crate::documents::documents_batch_reader_from_objects;
     use crate::index::tests::TempIndex;
     use crate::search::TermsMatchingStrategy;
-    use crate::{db_snap, BEU16};
+    use crate::{db_snap, Filter, Search, BEU16};
 
     #[test]
     fn simple_document_replacement() {
@@ -2676,4 +2678,533 @@ mod tests {
         let res = index.search(&rtxn).execute().unwrap();
         index.documents(&rtxn, res.documents_ids).unwrap();
     }
+
+    fn delete_documents<'t>(
+        wtxn: &mut RwTxn<'t, '_>,
+        index: &'t TempIndex,
+        external_ids: &[&str],
+    ) -> Vec<u32> {
+        let external_document_ids = index.external_documents_ids(wtxn).unwrap();
+        let ids_to_delete: Vec<u32> = external_ids
+            .iter()
+            .map(|id| external_document_ids.get(id.as_bytes()).unwrap())
+            .collect();
+
+        // Delete some documents.
+        index.delete_documents_using_wtxn(
+            wtxn,
+            external_ids.iter().map(ToString::to_string).collect(),
+        );
+
+        ids_to_delete
+    }
+
+    #[test]
+    fn delete_documents_with_numbers_as_primary_key() {
+        let index = TempIndex::new();
+
+        let mut wtxn = index.write_txn().unwrap();
+        index
+            .add_documents_using_wtxn(
+                &mut wtxn,
+                documents!([
+                    { "id": 0, "name": "kevin", "object": { "key1": "value1", "key2": "value2" } },
+                    { "id": 1, "name": "kevina", "array": ["I", "am", "fine"] },
+                    { "id": 2, "name": "benoit", "array_of_object": [{ "wow": "amazing" }] }
+                ]),
+            )
+            .unwrap();
+
+        // delete those documents, ids are synchronous therefore 0, 1, and 2.
+        index.delete_documents_using_wtxn(&mut wtxn, vec![S("0"), S("1"), S("2")]);
+
+        wtxn.commit().unwrap();
+
+        // All these snapshots should be empty since the database was cleared
+        db_snap!(index, documents_ids);
+        db_snap!(index, word_docids);
+        db_snap!(index, word_pair_proximity_docids);
+        db_snap!(index, facet_id_exists_docids);
+
+        let rtxn = index.read_txn().unwrap();
+
+        assert!(index.field_distribution(&rtxn).unwrap().is_empty());
+    }
+
+    #[test]
+    fn delete_documents_with_strange_primary_key() {
+        let index = TempIndex::new();
+
+        index
+            .update_settings(|settings| settings.set_searchable_fields(vec!["name".to_string()]))
+            .unwrap();
+
+        let mut wtxn = index.write_txn().unwrap();
+        index
+            .add_documents_using_wtxn(
+                &mut wtxn,
+                documents!([
+                    { "mysuperid": 0, "name": "kevin" },
+                    { "mysuperid": 1, "name": "kevina" },
+                    { "mysuperid": 2, "name": "benoit" }
+                ]),
+            )
+            .unwrap();
+        wtxn.commit().unwrap();
+
+        let mut wtxn = index.write_txn().unwrap();
+
+        // Delete only some of the documents, not all of them.
+        index.delete_documents_using_wtxn(&mut wtxn, vec![S("0"), S("1")]);
+
+        wtxn.commit().unwrap();
+
+        db_snap!(index, documents_ids);
+        db_snap!(index, word_docids);
+        db_snap!(index, word_pair_proximity_docids);
+    }
+
+    #[test]
+    fn filtered_placeholder_search_should_not_return_deleted_documents_() {
+        let index = TempIndex::new();
+
+        let mut wtxn = index.write_txn().unwrap();
+
+        index
+            .update_settings_using_wtxn(&mut wtxn, |settings| {
+                settings.set_primary_key(S("docid"));
+                settings.set_filterable_fields(hashset! { S("label"), S("label2") });
+            })
+            .unwrap();
+
+        index
+            .add_documents_using_wtxn(
+                &mut wtxn,
+                documents!([
+                    { "docid": "1_4",  "label": ["sign"] },
+                    { "docid": "1_5",  "label": ["letter"] },
+                    { "docid": "1_7",  "label": ["abstract","cartoon","design","pattern"] },
+                    { "docid": "1_36", "label": ["drawing","painting","pattern"] },
+                    { "docid": "1_37", "label": ["art","drawing","outdoor"] },
+                    { "docid": "1_38", "label": ["aquarium","art","drawing"] },
+                    { "docid": "1_39", "label": ["abstract"] },
+                    { "docid": "1_40", "label": ["cartoon"] },
+                    { "docid": "1_41", "label": ["art","drawing"] },
+                    { "docid": "1_42", "label": ["art","pattern"] },
+                    { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
+                    { "docid": "1_44", "label": ["drawing"] },
+                    { "docid": "1_45", "label": ["art"] },
+                    { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
+                    { "docid": "1_47", "label": ["abstract","pattern"] },
+                    { "docid": "1_52", "label": ["abstract","cartoon"] },
+                    { "docid": "1_57", "label": ["abstract","drawing","pattern"] },
+                    { "docid": "1_58", "label": ["abstract","art","cartoon"] },
+                    { "docid": "1_68", "label": ["design"] },
+                    { "docid": "1_69", "label": ["geometry"] },
+                    { "docid": "1_70", "label2": ["geometry", 1.2] },
+                    { "docid": "1_71", "label2": ["design", 2.2] },
+                    { "docid": "1_72", "label2": ["geometry", 1.2] }
+                ]),
+            )
+            .unwrap();
+
+        delete_documents(&mut wtxn, &index, &["1_4", "1_70", "1_72"]);
+
+        // Placeholder search with filter
+        let filter = Filter::from_str("label = sign").unwrap().unwrap();
+        let results = index.search(&wtxn).filter(filter).execute().unwrap();
+        assert!(results.documents_ids.is_empty());
+
+        wtxn.commit().unwrap();
+
+        db_snap!(index, word_docids);
+        db_snap!(index, facet_id_f64_docids);
+        db_snap!(index, word_pair_proximity_docids);
+        db_snap!(index, facet_id_exists_docids);
+        db_snap!(index, facet_id_string_docids);
+    }
+
+    #[test]
+    fn placeholder_search_should_not_return_deleted_documents() {
+        let index = TempIndex::new();
+
+        let mut wtxn = index.write_txn().unwrap();
+        index
+            .update_settings_using_wtxn(&mut wtxn, |settings| {
+                settings.set_primary_key(S("docid"));
+            })
+            .unwrap();
+
+        index
+            .add_documents_using_wtxn(
+                &mut wtxn,
+                documents!([
+                    { "docid": "1_4",  "label": ["sign"] },
+                    { "docid": "1_5",  "label": ["letter"] },
+                    { "docid": "1_7",  "label": ["abstract","cartoon","design","pattern"] },
+                    { "docid": "1_36", "label": ["drawing","painting","pattern"] },
+                    { "docid": "1_37", "label": ["art","drawing","outdoor"] },
+                    { "docid": "1_38", "label": ["aquarium","art","drawing"] },
+                    { "docid": "1_39", "label": ["abstract"] },
+                    { "docid": "1_40", "label": ["cartoon"] },
+                    { "docid": "1_41", "label": ["art","drawing"] },
+                    { "docid": "1_42", "label": ["art","pattern"] },
+                    { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
+                    { "docid": "1_44", "label": ["drawing"] },
+                    { "docid": "1_45", "label": ["art"] },
+                    { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
+                    { "docid": "1_47", "label": ["abstract","pattern"] },
+                    { "docid": "1_52", "label": ["abstract","cartoon"] },
+                    { "docid": "1_57", "label": ["abstract","drawing","pattern"] },
+                    { "docid": "1_58", "label": ["abstract","art","cartoon"] },
+                    { "docid": "1_68", "label": ["design"] },
+                    { "docid": "1_69", "label": ["geometry"] },
+                    { "docid": "1_70", "label2": ["geometry", 1.2] },
+                    { "docid": "1_71", "label2": ["design", 2.2] },
+                    { "docid": "1_72", "label2": ["geometry", 1.2] }
+                ]),
+            )
+            .unwrap();
+
+        let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_4"]);
+
+        // Placeholder search
+        let results = index.search(&wtxn).execute().unwrap();
+        assert!(!results.documents_ids.is_empty());
+        for id in results.documents_ids.iter() {
+            assert!(
+                !deleted_internal_ids.contains(id),
+                "The document {} was supposed to be deleted",
+                id
+            );
+        }
+
+        wtxn.commit().unwrap();
+    }
+
+    #[test]
+    fn search_should_not_return_deleted_documents() {
+        let index = TempIndex::new();
+
+        let mut wtxn = index.write_txn().unwrap();
+        index
+            .update_settings_using_wtxn(&mut wtxn, |settings| {
+                settings.set_primary_key(S("docid"));
+            })
+            .unwrap();
+
+        index
+            .add_documents_using_wtxn(
+                &mut wtxn,
+                documents!([
+                    { "docid": "1_4",  "label": ["sign"] },
+                    { "docid": "1_5",  "label": ["letter"] },
+                    { "docid": "1_7",  "label": ["abstract","cartoon","design","pattern"] },
+                    { "docid": "1_36", "label": ["drawing","painting","pattern"] },
+                    { "docid": "1_37", "label": ["art","drawing","outdoor"] },
+                    { "docid": "1_38", "label": ["aquarium","art","drawing"] },
+                    { "docid": "1_39", "label": ["abstract"] },
+                    { "docid": "1_40", "label": ["cartoon"] },
+                    { "docid": "1_41", "label": ["art","drawing"] },
+                    { "docid": "1_42", "label": ["art","pattern"] },
+                    { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
+                    { "docid": "1_44", "label": ["drawing"] },
+                    { "docid": "1_45", "label": ["art"] },
+                    { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
+                    { "docid": "1_47", "label": ["abstract","pattern"] },
+                    { "docid": "1_52", "label": ["abstract","cartoon"] },
+                    { "docid": "1_57", "label": ["abstract","drawing","pattern"] },
+                    { "docid": "1_58", "label": ["abstract","art","cartoon"] },
+                    { "docid": "1_68", "label": ["design"] },
+                    { "docid": "1_69", "label": ["geometry"] },
+                    { "docid": "1_70", "label2": ["geometry", 1.2] },
+                    { "docid": "1_71", "label2": ["design", 2.2] },
+                    { "docid": "1_72", "label2": ["geometry", 1.2] }
+                ]),
+            )
+            .unwrap();
+
+        let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1_7", "1_52"]);
+
+        // search for abstract
+        let results = index.search(&wtxn).query("abstract").execute().unwrap();
+        assert!(!results.documents_ids.is_empty());
+        for id in results.documents_ids.iter() {
+            assert!(
+                !deleted_internal_ids.contains(id),
+                "The document {} was supposed to be deleted",
+                id
+            );
+        }
+
+        wtxn.commit().unwrap();
+    }
+
+    #[test]
+    fn geo_filtered_placeholder_search_should_not_return_deleted_documents() {
+        let index = TempIndex::new();
+
+        let mut wtxn = index.write_txn().unwrap();
+        index
+            .update_settings_using_wtxn(&mut wtxn, |settings| {
+                settings.set_primary_key(S("id"));
+                settings.set_filterable_fields(hashset!(S("_geo")));
+                settings.set_sortable_fields(hashset!(S("_geo")));
+            })
+            .unwrap();
+
+        index.add_documents_using_wtxn(&mut wtxn, documents!([
+            { "id": "1",  "city": "Lille",             "_geo": { "lat": 50.6299, "lng": 3.0569 } },
+            { "id": "2",  "city": "Mons-en-Barœul",    "_geo": { "lat": 50.6415, "lng": 3.1106 } },
+            { "id": "3",  "city": "Hellemmes",         "_geo": { "lat": 50.6312, "lng": 3.1106 } },
+            { "id": "4",  "city": "Villeneuve-d'Ascq", "_geo": { "lat": 50.6224, "lng": 3.1476 } },
+            { "id": "5",  "city": "Hem",               "_geo": { "lat": 50.6552, "lng": 3.1897 } },
+            { "id": "6",  "city": "Roubaix",           "_geo": { "lat": 50.6924, "lng": 3.1763 } },
+            { "id": "7",  "city": "Tourcoing",         "_geo": { "lat": 50.7263, "lng": 3.1541 } },
+            { "id": "8",  "city": "Mouscron",          "_geo": { "lat": 50.7453, "lng": 3.2206 } },
+            { "id": "9",  "city": "Tournai",           "_geo": { "lat": 50.6053, "lng": 3.3758 } },
+            { "id": "10", "city": "Ghent",             "_geo": { "lat": 51.0537, "lng": 3.6957 } },
+            { "id": "11", "city": "Brussels",          "_geo": { "lat": 50.8466, "lng": 4.3370 } },
+            { "id": "12", "city": "Charleroi",         "_geo": { "lat": 50.4095, "lng": 4.4347 } },
+            { "id": "13", "city": "Mons",              "_geo": { "lat": 50.4502, "lng": 3.9623 } },
+            { "id": "14", "city": "Valenciennes",      "_geo": { "lat": 50.3518, "lng": 3.5326 } },
+            { "id": "15", "city": "Arras",             "_geo": { "lat": 50.2844, "lng": 2.7637 } },
+            { "id": "16", "city": "Cambrai",           "_geo": { "lat": 50.1793, "lng": 3.2189 } },
+            { "id": "17", "city": "Bapaume",           "_geo": { "lat": 50.1112, "lng": 2.8547 } },
+            { "id": "18", "city": "Amiens",            "_geo": { "lat": 49.9314, "lng": 2.2710 } },
+            { "id": "19", "city": "Compiègne",         "_geo": { "lat": 49.4449, "lng": 2.7913 } },
+            { "id": "20", "city": "Paris",             "_geo": { "lat": 48.9021, "lng": 2.3708 } }
+        ])).unwrap();
+
+        let external_ids_to_delete = ["5", "6", "7", "12", "17", "19"];
+        let deleted_internal_ids = delete_documents(&mut wtxn, &index, &external_ids_to_delete);
+
+        // Placeholder search with geo filter
+        let filter = Filter::from_str("_geoRadius(50.6924, 3.1763, 20000)").unwrap().unwrap();
+        let results = index.search(&wtxn).filter(filter).execute().unwrap();
+        assert!(!results.documents_ids.is_empty());
+        for id in results.documents_ids.iter() {
+            assert!(
+                !deleted_internal_ids.contains(id),
+                "The document {} was supposed to be deleted",
+                id
+            );
+        }
+
+        wtxn.commit().unwrap();
+
+        db_snap!(index, facet_id_f64_docids);
+        db_snap!(index, facet_id_string_docids);
+    }
+
+    #[test]
+    fn get_documents_should_not_return_deleted_documents() {
+        let index = TempIndex::new();
+
+        let mut wtxn = index.write_txn().unwrap();
+        index
+            .update_settings_using_wtxn(&mut wtxn, |settings| {
+                settings.set_primary_key(S("docid"));
+            })
+            .unwrap();
+
+        index
+            .add_documents_using_wtxn(
+                &mut wtxn,
+                documents!([
+                    { "docid": "1_4",  "label": ["sign"] },
+                    { "docid": "1_5",  "label": ["letter"] },
+                    { "docid": "1_7",  "label": ["abstract","cartoon","design","pattern"] },
+                    { "docid": "1_36", "label": ["drawing","painting","pattern"] },
+                    { "docid": "1_37", "label": ["art","drawing","outdoor"] },
+                    { "docid": "1_38", "label": ["aquarium","art","drawing"] },
+                    { "docid": "1_39", "label": ["abstract"] },
+                    { "docid": "1_40", "label": ["cartoon"] },
+                    { "docid": "1_41", "label": ["art","drawing"] },
+                    { "docid": "1_42", "label": ["art","pattern"] },
+                    { "docid": "1_43", "label": ["abstract","art","drawing","pattern"] },
+                    { "docid": "1_44", "label": ["drawing"] },
+                    { "docid": "1_45", "label": ["art"] },
+                    { "docid": "1_46", "label": ["abstract","colorfulness","pattern"] },
+                    { "docid": "1_47", "label": ["abstract","pattern"] },
+                    { "docid": "1_52", "label": ["abstract","cartoon"] },
+                    { "docid": "1_57", "label": ["abstract","drawing","pattern"] },
+                    { "docid": "1_58", "label": ["abstract","art","cartoon"] },
+                    { "docid": "1_68", "label": ["design"] },
+                    { "docid": "1_69", "label": ["geometry"] },
+                    { "docid": "1_70", "label2": ["geometry", 1.2] },
+                    { "docid": "1_71", "label2": ["design", 2.2] },
+                    { "docid": "1_72", "label2": ["geometry", 1.2] }
+                ]),
+            )
+            .unwrap();
+
+        let deleted_external_ids = ["1_7", "1_52"];
+        let deleted_internal_ids = delete_documents(&mut wtxn, &index, &deleted_external_ids);
+
+        // list all documents
+        let results = index.all_documents(&wtxn).unwrap();
+        for result in results {
+            let (id, _) = result.unwrap();
+            assert!(
+                !deleted_internal_ids.contains(&id),
+                "The document {} was supposed to be deleted",
+                id
+            );
+        }
+
+        // list internal document ids
+        let results = index.documents_ids(&wtxn).unwrap();
+        for id in results {
+            assert!(
+                !deleted_internal_ids.contains(&id),
+                "The document {} was supposed to be deleted",
+                id
+            );
+        }
+        wtxn.commit().unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+
+        // get internal docids from deleted external document ids
+        let results = index.external_documents_ids(&rtxn).unwrap();
+        for id in deleted_external_ids {
+            assert!(results.get(id).is_none(), "The document {} was supposed to be deleted", id);
+        }
+        drop(rtxn);
+    }
+
+    #[test]
+    fn stats_should_not_return_deleted_documents() {
+        let index = TempIndex::new();
+
+        let mut wtxn = index.write_txn().unwrap();
+
+        index
+            .update_settings_using_wtxn(&mut wtxn, |settings| {
+                settings.set_primary_key(S("docid"));
+            })
+            .unwrap();
+
+        index.add_documents_using_wtxn(&mut wtxn, documents!([
+            { "docid": "1_4",  "label": ["sign"]},
+            { "docid": "1_5",  "label": ["letter"]},
+            { "docid": "1_7",  "label": ["abstract","cartoon","design","pattern"], "title": "Mickey Mouse"},
+            { "docid": "1_36", "label": ["drawing","painting","pattern"]},
+            { "docid": "1_37", "label": ["art","drawing","outdoor"]},
+            { "docid": "1_38", "label": ["aquarium","art","drawing"], "title": "Nemo"},
+            { "docid": "1_39", "label": ["abstract"]},
+            { "docid": "1_40", "label": ["cartoon"]},
+            { "docid": "1_41", "label": ["art","drawing"]},
+            { "docid": "1_42", "label": ["art","pattern"]},
+            { "docid": "1_43", "label": ["abstract","art","drawing","pattern"], "number": 32i32},
+            { "docid": "1_44", "label": ["drawing"], "number": 44i32},
+            { "docid": "1_45", "label": ["art"]},
+            { "docid": "1_46", "label": ["abstract","colorfulness","pattern"]},
+            { "docid": "1_47", "label": ["abstract","pattern"]},
+            { "docid": "1_52", "label": ["abstract","cartoon"]},
+            { "docid": "1_57", "label": ["abstract","drawing","pattern"]},
+            { "docid": "1_58", "label": ["abstract","art","cartoon"]},
+            { "docid": "1_68", "label": ["design"]},
+            { "docid": "1_69", "label": ["geometry"]}
+        ])).unwrap();
+
+        delete_documents(&mut wtxn, &index, &["1_7", "1_52"]);
+
+        // count internal documents
+        let results = index.number_of_documents(&wtxn).unwrap();
+        assert_eq!(18, results);
+
+        // count field distribution
+        let results = index.field_distribution(&wtxn).unwrap();
+        assert_eq!(Some(&18), results.get("label"));
+        assert_eq!(Some(&1), results.get("title"));
+        assert_eq!(Some(&2), results.get("number"));
+
+        wtxn.commit().unwrap();
+    }
+
+    #[test]
+    fn stored_detected_script_and_language_should_not_return_deleted_documents() {
+        use charabia::{Language, Script};
+        let index = TempIndex::new();
+        let mut wtxn = index.write_txn().unwrap();
+        index
+            .add_documents_using_wtxn(
+                &mut wtxn,
+                documents!([
+                { "id": "0", "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" },
+                { "id": "1", "title": "人人生而自由﹐在尊嚴和權利上一律平等。他們賦有理性和良心﹐並應以兄弟關係的精神互相對待。" },
+                { "id": "2", "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" },
+                { "id": "3", "title": "関西国際空港限定トートバッグ すもももももももものうち" },
+                { "id": "4", "title": "ภาษาไทยง่ายนิดเดียว" },
+                { "id": "5", "title": "The quick 在尊嚴和權利上一律平等。" },
+            ]))
+            .unwrap();
+
+        let key_cmn = (Script::Cj, Language::Cmn);
+        let cj_cmn_docs =
+            index.script_language_documents_ids(&wtxn, &key_cmn).unwrap().unwrap_or_default();
+        let mut expected_cj_cmn_docids = RoaringBitmap::new();
+        expected_cj_cmn_docids.push(1);
+        expected_cj_cmn_docids.push(5);
+        assert_eq!(cj_cmn_docs, expected_cj_cmn_docids);
+
+        delete_documents(&mut wtxn, &index, &["1"]);
+        wtxn.commit().unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+        let cj_cmn_docs =
+            index.script_language_documents_ids(&rtxn, &key_cmn).unwrap().unwrap_or_default();
+        let mut expected_cj_cmn_docids = RoaringBitmap::new();
+        expected_cj_cmn_docids.push(5);
+        assert_eq!(cj_cmn_docs, expected_cj_cmn_docids);
+    }
+
+    #[test]
+    fn delete_words_exact_attributes() {
+        let index = TempIndex::new();
+
+        index
+            .update_settings(|settings| {
+                settings.set_primary_key(S("id"));
+                settings.set_searchable_fields(vec![S("text"), S("exact")]);
+                settings.set_exact_attributes(vec![S("exact")].into_iter().collect());
+            })
+            .unwrap();
+
+        index
+            .add_documents(documents!([
+                { "id": 0, "text": "hello" },
+                { "id": 1, "exact": "hello"}
+            ]))
+            .unwrap();
+        db_snap!(index, word_docids, 1, @r###"
+        hello            [0, ]
+        "###);
+        db_snap!(index, exact_word_docids, 1, @r###"
+        hello            [1, ]
+        "###);
+        db_snap!(index, words_fst, 1, @"300000000000000001084cfcfc2ce1000000016000000090ea47f");
+
+        let mut wtxn = index.write_txn().unwrap();
+        let deleted_internal_ids = delete_documents(&mut wtxn, &index, &["1"]);
+        wtxn.commit().unwrap();
+
+        db_snap!(index, word_docids, 2, @r###"
+        hello            [0, ]
+        "###);
+        db_snap!(index, exact_word_docids, 2, @"");
+        db_snap!(index, words_fst, 2, @"300000000000000001084cfcfc2ce1000000016000000090ea47f");
+
+        insta::assert_snapshot!(format!("{deleted_internal_ids:?}"), @"[1]");
+        let txn = index.read_txn().unwrap();
+        let words = index.words_fst(&txn).unwrap().into_stream().into_strs().unwrap();
+        insta::assert_snapshot!(format!("{words:?}"), @r###"["hello"]"###);
+
+        let mut s = Search::new(&txn, &index);
+        s.query("hello");
+        let crate::SearchResult { documents_ids, .. } = s.execute().unwrap();
+        insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
+    }
 }

From 6260cff65ff435aae61878c45275e0d8922546c9 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Thu, 26 Oct 2023 18:06:41 +0200
Subject: [PATCH 067/127] Actually delete documents from DB when the merge
 function says so

---
 .../cbo_roaring_bitmap_codec.rs               | 13 +++++++---
 milli/src/update/index_documents/mod.rs       | 17 +-----------
 .../src/update/index_documents/typed_chunk.rs | 26 ++++++++++---------
 3 files changed, 24 insertions(+), 32 deletions(-)

diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs
index 117da1308..f635e55af 100644
--- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs
+++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs
@@ -102,11 +102,11 @@ impl CboRoaringBitmapCodec {
     }
 
     /// Merges a DelAdd delta into a CboRoaringBitmap.
-    pub fn merge_deladd_into(
+    pub fn merge_deladd_into<'a>(
         deladd: KvReaderDelAdd<'_>,
         previous: &[u8],
-        buffer: &mut Vec<u8>,
-    ) -> io::Result<()> {
+        buffer: &'a mut Vec<u8>,
+    ) -> io::Result<Option<&'a [u8]>> {
         // Deserialize the bitmap that is already there
         let mut previous = Self::deserialize_from(previous)?;
 
@@ -120,7 +120,12 @@ impl CboRoaringBitmapCodec {
             previous |= Self::deserialize_from(value)?;
         }
 
-        previous.serialize_into(buffer)
+        if previous.is_empty() {
+            return Ok(None);
+        }
+
+        Self::serialize_into(&previous, buffer);
+        Ok(Some(&buffer[..]))
     }
 }
 
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index b439ca409..45ceec7b0 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -390,22 +390,7 @@ where
                 return Err(Error::InternalError(InternalError::AbortedIndexation));
             }
 
-            let typed_chunk = match result? {
-                TypedChunk::WordDocids {
-                    word_docids_reader,
-                    exact_word_docids_reader,
-                    word_fid_docids_reader,
-                } => TypedChunk::WordDocids {
-                    word_docids_reader,
-                    exact_word_docids_reader,
-                    word_fid_docids_reader,
-                },
-                TypedChunk::WordPairProximityDocids(chunk) => {
-                    TypedChunk::WordPairProximityDocids(chunk)
-                }
-                TypedChunk::WordPositionDocids(chunk) => TypedChunk::WordPositionDocids(chunk),
-                otherwise => otherwise,
-            };
+            let typed_chunk = result?;
 
             // FIXME: return newly added as well as newly deleted documents
             let (docids, is_merged_database) =
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index 1f1ac4adf..8257f7c93 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -484,11 +484,11 @@ fn deladd_serialize_add_side<'a>(obkv: &'a [u8], _buffer: &mut Vec<u8>) -> Resul
 ///
 /// The first argument is the DelAdd obkv of CboRoaringBitmaps and
 /// the second one is the CboRoaringBitmap to merge into.
-fn merge_deladd_cbo_roaring_bitmaps(
+fn merge_deladd_cbo_roaring_bitmaps<'a>(
     deladd_obkv: &[u8],
     previous: &[u8],
-    buffer: &mut Vec<u8>,
-) -> Result<()> {
+    buffer: &'a mut Vec<u8>,
+) -> Result<Option<&'a [u8]>> {
     Ok(CboRoaringBitmapCodec::merge_deladd_into(
         KvReaderDelAdd::new(deladd_obkv),
         previous,
@@ -509,7 +509,7 @@ fn write_entries_into_database<R, K, V, FS, FM>(
 where
     R: io::Read + io::Seek,
     FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
-    FM: Fn(&[u8], &[u8], &mut Vec<u8>) -> Result<()>,
+    FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>,
 {
     puffin::profile_function!(format!("number of entries: {}", data.len()));
 
@@ -521,17 +521,19 @@ where
         if valid_lmdb_key(key) {
             buffer.clear();
             let value = if index_is_empty {
-                serialize_value(value, &mut buffer)?
+                Some(serialize_value(value, &mut buffer)?)
             } else {
                 match database.get(wtxn, key)? {
-                    Some(prev_value) => {
-                        merge_values(value, prev_value, &mut buffer)?;
-                        &buffer[..]
-                    }
-                    None => serialize_value(value, &mut buffer)?,
+                    Some(prev_value) => merge_values(value, prev_value, &mut buffer)?,
+                    None => Some(serialize_value(value, &mut buffer)?),
                 }
             };
-            database.put(wtxn, key, value)?;
+            match value {
+                Some(value) => database.put(wtxn, key, value)?,
+                None => {
+                    database.delete(wtxn, key)?;
+                }
+            }
         }
     }
 
@@ -553,7 +555,7 @@ fn append_entries_into_database<R, K, V, FS, FM>(
 where
     R: io::Read + io::Seek,
     FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
-    FM: Fn(&[u8], &[u8], &mut Vec<u8>) -> Result<()>,
+    FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>,
     K: for<'a> heed::BytesDecode<'a>,
 {
     puffin::profile_function!(format!("number of entries: {}", data.len()));

From fdf3f7f627aad98ecbd599e04230a505907f97c2 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Thu, 26 Oct 2023 18:22:03 +0200
Subject: [PATCH 068/127] Fix facet distribution test

---
 .../src/update/index_documents/typed_chunk.rs | 24 +++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index 8257f7c93..192f3d139 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -328,8 +328,18 @@ pub(crate) fn write_typed_chunk_into_index(
                 index.field_id_docid_facet_f64s.remap_types::<ByteSlice, ByteSlice>();
             let mut cursor = fid_docid_facet_number.into_cursor()?;
             while let Some((key, value)) = cursor.move_on_next()? {
+                let reader = KvReaderDelAdd::new(value);
                 if valid_lmdb_key(key) {
-                    index_fid_docid_facet_numbers.put(wtxn, key, value)?;
+                    match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) {
+                        (None, None) => {}
+                        (None, Some(new)) => index_fid_docid_facet_numbers.put(wtxn, key, new)?,
+                        (Some(_), None) => {
+                            index_fid_docid_facet_numbers.delete(wtxn, key)?;
+                        }
+                        (Some(_), Some(new)) => {
+                            index_fid_docid_facet_numbers.put(wtxn, key, new)?
+                        }
+                    }
                 }
             }
         }
@@ -338,8 +348,18 @@ pub(crate) fn write_typed_chunk_into_index(
                 index.field_id_docid_facet_strings.remap_types::<ByteSlice, ByteSlice>();
             let mut cursor = fid_docid_facet_string.into_cursor()?;
             while let Some((key, value)) = cursor.move_on_next()? {
+                let reader = KvReaderDelAdd::new(value);
                 if valid_lmdb_key(key) {
-                    index_fid_docid_facet_strings.put(wtxn, key, value)?;
+                    match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) {
+                        (None, None) => {}
+                        (None, Some(new)) => index_fid_docid_facet_strings.put(wtxn, key, new)?,
+                        (Some(_), None) => {
+                            index_fid_docid_facet_strings.delete(wtxn, key)?;
+                        }
+                        (Some(_), Some(new)) => {
+                            index_fid_docid_facet_strings.put(wtxn, key, new)?
+                        }
+                    }
                 }
             }
         }

From dfab6293c9f8829c41833e95d92963a6323f9b03 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Sat, 28 Oct 2023 12:56:46 +0200
Subject: [PATCH 069/127] Use an LMDB database to store the external documents
 ids

---
 index-scheduler/src/batch.rs                  |   7 +-
 meilisearch/src/routes/indexes/documents.rs   |   4 +-
 milli/src/external_documents_ids.rs           | 157 ++++++------------
 milli/src/index.rs                            |  33 ++--
 milli/src/update/clear_documents.rs           |   5 +-
 milli/src/update/index_documents/transform.rs |   8 +-
 .../src/update/index_documents/typed_chunk.rs |   6 +-
 7 files changed, 79 insertions(+), 141 deletions(-)

diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs
index c4f9c12be..c273d8ebb 100644
--- a/index-scheduler/src/batch.rs
+++ b/index-scheduler/src/batch.rs
@@ -1575,11 +1575,14 @@ fn delete_document_by_filter<'a>(
             }
             e => e.into(),
         })?;
-        let external_documents_ids = index.external_documents_ids(wtxn)?;
+        let external_documents_ids = index.external_documents_ids();
         // FIXME: for filters matching a lot of documents, this will allocate a huge vec of external docids (strings).
         // Since what we have is an iterator, it would be better to delete in chunks
         let external_to_internal: std::result::Result<Vec<_>, RoaringBitmap> =
-            external_documents_ids.find_external_id_of(candidates).only_external_ids().collect();
+            external_documents_ids
+                .find_external_id_of(wtxn, candidates)?
+                .only_external_ids()
+                .collect();
         let document_ids = match external_to_internal {
             Ok(external_ids) => external_ids,
             Err(remaining_ids) => panic!("Couldn't find some external ids {:?}", remaining_ids),
diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs
index 2afc1b5fb..b6950ae6e 100644
--- a/meilisearch/src/routes/indexes/documents.rs
+++ b/meilisearch/src/routes/indexes/documents.rs
@@ -612,8 +612,8 @@ fn retrieve_document<S: AsRef<str>>(
     let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
 
     let internal_id = index
-        .external_documents_ids(&txn)?
-        .get(doc_id.as_bytes())
+        .external_documents_ids()
+        .get(&txn, doc_id)?
         .ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?;
 
     let document = index
diff --git a/milli/src/external_documents_ids.rs b/milli/src/external_documents_ids.rs
index 02794609f..1bf08396a 100644
--- a/milli/src/external_documents_ids.rs
+++ b/milli/src/external_documents_ids.rs
@@ -1,12 +1,11 @@
-use std::borrow::Cow;
 use std::collections::HashMap;
 use std::convert::TryInto;
-use std::fmt;
 
-use fst::Streamer;
+use heed::types::{OwnedType, Str};
+use heed::{Database, RoIter, RoTxn, RwTxn};
 use roaring::RoaringBitmap;
 
-use crate::DocumentId;
+use crate::{DocumentId, BEU32};
 
 pub enum DocumentOperationKind {
     Create,
@@ -19,41 +18,31 @@ pub struct DocumentOperation {
     pub kind: DocumentOperationKind,
 }
 
-pub struct ExternalDocumentsIds<'a>(fst::Map<Cow<'a, [u8]>>);
+pub struct ExternalDocumentsIds(Database<Str, OwnedType<BEU32>>);
 
-impl<'a> ExternalDocumentsIds<'a> {
-    pub fn new(fst: fst::Map<Cow<'a, [u8]>>) -> ExternalDocumentsIds<'a> {
-        ExternalDocumentsIds(fst)
-    }
-
-    pub fn into_static(self) -> ExternalDocumentsIds<'static> {
-        ExternalDocumentsIds(self.0.map_data(|c| Cow::Owned(c.into_owned())).unwrap())
+impl ExternalDocumentsIds {
+    pub fn new(db: Database<Str, OwnedType<BEU32>>) -> ExternalDocumentsIds {
+        ExternalDocumentsIds(db)
     }
 
     /// Returns `true` if hard and soft external documents lists are empty.
-    pub fn is_empty(&self) -> bool {
-        self.0.is_empty()
+    pub fn is_empty(&self, rtxn: &RoTxn) -> heed::Result<bool> {
+        self.0.is_empty(rtxn).map_err(Into::into)
     }
 
-    pub fn get<A: AsRef<[u8]>>(&self, external_id: A) -> Option<u32> {
-        let external_id = external_id.as_ref();
-        self.0.get(external_id).map(|x| x.try_into().unwrap())
+    pub fn get<A: AsRef<str>>(&self, rtxn: &RoTxn, external_id: A) -> heed::Result<Option<u32>> {
+        Ok(self.0.get(rtxn, external_id.as_ref())?.map(|x| x.get().try_into().unwrap()))
     }
 
     /// An helper function to debug this type, returns an `HashMap` of both,
     /// soft and hard fst maps, combined.
-    pub fn to_hash_map(&self) -> HashMap<String, u32> {
+    pub fn to_hash_map(&self, rtxn: &RoTxn) -> heed::Result<HashMap<String, u32>> {
         let mut map = HashMap::default();
-        let mut stream = self.0.stream();
-        while let Some((k, v)) = stream.next() {
-            let k = String::from_utf8(k.to_vec()).unwrap();
-            map.insert(k, v.try_into().unwrap());
+        for result in self.0.iter(rtxn)? {
+            let (external, internal) = result?;
+            map.insert(external.to_owned(), internal.get().try_into().unwrap());
         }
-        map
-    }
-
-    pub fn as_bytes(&self) -> &[u8] {
-        self.0.as_fst().as_bytes()
+        Ok(map)
     }
 
     /// Looks for the internal ids in the passed bitmap, and returns an iterator over the mapping between
@@ -65,12 +54,12 @@ impl<'a> ExternalDocumentsIds<'a> {
     /// - `Err(remaining_ids)`: if the external ids for some of the requested internal ids weren't found.
     ///   In that case the returned bitmap contains the internal ids whose external ids were not found after traversing
     ///   the entire fst.
-    pub fn find_external_id_of(
+    pub fn find_external_id_of<'t>(
         &self,
+        rtxn: &'t RoTxn,
         internal_ids: RoaringBitmap,
-    ) -> ExternalToInternalOwnedIterator<'_> {
-        let it = ExternalToInternalOwnedIterator { stream: self.0.stream(), internal_ids };
-        it
+    ) -> heed::Result<ExternalToInternalOwnedIterator<'t>> {
+        self.0.iter(rtxn).map(|iter| ExternalToInternalOwnedIterator { iter, internal_ids })
     }
 
     /// Applies the list of operations passed as argument, modifying the current external to internal id mapping.
@@ -81,84 +70,39 @@ impl<'a> ExternalDocumentsIds<'a> {
     ///
     /// - If attempting to delete a document that doesn't exist
     /// - If attempting to create a document that already exists
-    pub fn apply(&mut self, mut operations: Vec<DocumentOperation>) {
-        operations.sort_unstable_by(|left, right| left.external_id.cmp(&right.external_id));
-        operations.dedup_by(|left, right| left.external_id == right.external_id);
-
-        let mut builder = fst::MapBuilder::memory();
-
-        let mut stream = self.0.stream();
-        let mut next_stream = stream.next();
-        let mut operations = operations.iter();
-        let mut next_operation = operations.next();
-
-        loop {
-            (next_stream, next_operation) = match (next_stream.take(), next_operation.take()) {
-                (None, None) => break,
-                (None, Some(DocumentOperation { external_id, internal_id, kind })) => {
-                    if matches!(kind, DocumentOperationKind::Delete) {
+    pub fn apply(&self, wtxn: &mut RwTxn, operations: Vec<DocumentOperation>) -> heed::Result<()> {
+        for DocumentOperation { external_id, internal_id, kind } in operations {
+            match kind {
+                DocumentOperationKind::Create => {
+                    // TODO should we get before insert to be able to detect bugs?
+                    // if matches!(kind, DocumentOperationKind::Create) {
+                    //     panic!("Attempting to create an already-existing document");
+                    // }
+                    self.0.put(wtxn, &external_id, &BEU32::new(internal_id))?;
+                }
+                DocumentOperationKind::Delete => {
+                    if !self.0.delete(wtxn, &external_id)? {
                         panic!("Attempting to delete a non-existing document")
                     }
-                    builder.insert(external_id, (*internal_id).into()).unwrap();
-                    (None, operations.next())
                 }
-                (Some((k, v)), None) => {
-                    builder.insert(k, v).unwrap();
-                    (stream.next(), None)
-                }
-                (
-                    current_stream @ Some((left_external_id, left_internal_id)),
-                    current_operation @ Some(DocumentOperation {
-                        external_id: right_external_id,
-                        internal_id: right_internal_id,
-                        kind,
-                    }),
-                ) => match left_external_id.cmp(right_external_id.as_bytes()) {
-                    std::cmp::Ordering::Less => {
-                        builder.insert(left_external_id, left_internal_id).unwrap();
-                        (stream.next(), current_operation)
-                    }
-                    std::cmp::Ordering::Greater => {
-                        builder.insert(right_external_id, (*right_internal_id).into()).unwrap();
-                        (current_stream, operations.next())
-                    }
-                    std::cmp::Ordering::Equal => {
-                        if matches!(kind, DocumentOperationKind::Create) {
-                            panic!("Attempting to create an already-existing document");
-                        }
-                        // we delete the document, so we just advance both iterators to skip in stream
-                        (stream.next(), operations.next())
-                    }
-                },
             }
         }
-        self.0 = builder.into_map().map_data(Cow::Owned).unwrap();
-    }
-}
 
-impl fmt::Debug for ExternalDocumentsIds<'_> {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        f.debug_tuple("ExternalDocumentsIds").field(&self.to_hash_map()).finish()
-    }
-}
-
-impl Default for ExternalDocumentsIds<'static> {
-    fn default() -> Self {
-        ExternalDocumentsIds(fst::Map::default().map_data(Cow::Owned).unwrap())
+        Ok(())
     }
 }
 
 /// An iterator over mappings between requested internal ids and external ids.
 ///
 /// See [`ExternalDocumentsIds::find_external_id_of`] for details.
-pub struct ExternalToInternalOwnedIterator<'it> {
-    stream: fst::map::Stream<'it>,
+pub struct ExternalToInternalOwnedIterator<'t> {
+    iter: RoIter<'t, Str, OwnedType<BEU32>>,
     internal_ids: RoaringBitmap,
 }
 
-impl<'it> Iterator for ExternalToInternalOwnedIterator<'it> {
+impl<'t> Iterator for ExternalToInternalOwnedIterator<'t> {
     /// A result indicating if a mapping was found, or if the stream was exhausted without finding all internal ids.
-    type Item = Result<(String, DocumentId), RoaringBitmap>;
+    type Item = Result<(&'t str, DocumentId), RoaringBitmap>;
 
     fn next(&mut self) -> Option<Self::Item> {
         // if all requested ids were found, we won't find any other, so short-circuit
@@ -166,23 +110,28 @@ impl<'it> Iterator for ExternalToInternalOwnedIterator<'it> {
             return None;
         }
         loop {
-            let Some((external, internal)) = self.stream.next() else {
-                // we exhausted the stream but we still have some internal ids to find
-                let remaining_ids = std::mem::take(&mut self.internal_ids);
-                return Some(Err(remaining_ids));
-                // note: next calls to `next` will return `None` since we replaced the internal_ids
-                // with the default empty bitmap
+            let (external, internal) = match self.iter.next() {
+                Some(Ok((external, internal))) => (external, internal),
+                // TODO manage this better, remove panic
+                Some(Err(e)) => panic!("{}", e),
+                _ => {
+                    // we exhausted the stream but we still have some internal ids to find
+                    let remaining_ids = std::mem::take(&mut self.internal_ids);
+                    return Some(Err(remaining_ids));
+                    // note: next calls to `next` will return `None` since we replaced the internal_ids
+                    // with the default empty bitmap
+                }
             };
-            let internal = internal.try_into().unwrap();
+            let internal = internal.get();
             let was_contained = self.internal_ids.remove(internal);
             if was_contained {
-                return Some(Ok((std::str::from_utf8(external).unwrap().to_owned(), internal)));
+                return Some(Ok((external, internal)));
             }
         }
     }
 }
 
-impl<'it> ExternalToInternalOwnedIterator<'it> {
+impl<'t> ExternalToInternalOwnedIterator<'t> {
     /// Returns the bitmap of internal ids whose external id are yet to be found
     pub fn remaining_internal_ids(&self) -> &RoaringBitmap {
         &self.internal_ids
@@ -191,7 +140,7 @@ impl<'it> ExternalToInternalOwnedIterator<'it> {
     /// Consumes this iterator and returns an iterator over only the external ids, ignoring the internal ids.
     ///
     /// Use this when you don't need the mapping between the external and the internal ids.
-    pub fn only_external_ids(self) -> impl Iterator<Item = Result<String, RoaringBitmap>> + 'it {
-        self.map(|res| res.map(|(external, _internal)| external))
+    pub fn only_external_ids(self) -> impl Iterator<Item = Result<String, RoaringBitmap>> + 't {
+        self.map(|res| res.map(|(external, _internal)| external.to_owned()))
     }
 }
diff --git a/milli/src/index.rs b/milli/src/index.rs
index d99c36b65..f8a37fb2b 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -51,7 +51,6 @@ pub mod main_key {
     /// It is concatenated with a big-endian encoded number (non-human readable).
     /// e.g. vector-hnsw0x0032.
     pub const VECTOR_HNSW_KEY_PREFIX: &str = "vector-hnsw";
-    pub const EXTERNAL_DOCUMENTS_IDS_KEY: &str = "external-documents-ids";
     pub const PRIMARY_KEY_KEY: &str = "primary-key";
     pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields";
     pub const USER_DEFINED_SEARCHABLE_FIELDS_KEY: &str = "user-defined-searchable-fields";
@@ -81,6 +80,7 @@ pub mod db_name {
     pub const EXACT_WORD_DOCIDS: &str = "exact-word-docids";
     pub const WORD_PREFIX_DOCIDS: &str = "word-prefix-docids";
     pub const EXACT_WORD_PREFIX_DOCIDS: &str = "exact-word-prefix-docids";
+    pub const EXTERNAL_DOCUMENTS_IDS: &str = "external-documents-ids";
     pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions";
     pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids";
     pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids";
@@ -112,6 +112,9 @@ pub struct Index {
     /// Contains many different types (e.g. the fields ids map).
     pub(crate) main: PolyDatabase,
 
+    /// Maps the external documents ids with the internal document id.
+    pub external_documents_ids: Database<Str, OwnedType<BEU32>>,
+
     /// A word and all the documents ids containing the word.
     pub word_docids: Database<Str, CboRoaringBitmapCodec>,
 
@@ -183,13 +186,15 @@ impl Index {
     ) -> Result<Index> {
         use db_name::*;
 
-        options.max_dbs(25);
+        options.max_dbs(26);
         unsafe { options.flag(Flags::MdbAlwaysFreePages) };
 
         let env = options.open(path)?;
         let mut wtxn = env.write_txn()?;
         let main = env.create_poly_database(&mut wtxn, Some(MAIN))?;
         let word_docids = env.create_database(&mut wtxn, Some(WORD_DOCIDS))?;
+        let external_documents_ids =
+            env.create_database(&mut wtxn, Some(EXTERNAL_DOCUMENTS_IDS))?;
         let exact_word_docids = env.create_database(&mut wtxn, Some(EXACT_WORD_DOCIDS))?;
         let word_prefix_docids = env.create_database(&mut wtxn, Some(WORD_PREFIX_DOCIDS))?;
         let exact_word_prefix_docids =
@@ -235,6 +240,7 @@ impl Index {
         Ok(Index {
             env,
             main,
+            external_documents_ids,
             word_docids,
             exact_word_docids,
             word_prefix_docids,
@@ -386,29 +392,10 @@ impl Index {
 
     /* external documents ids */
 
-    /// Writes the external documents ids and internal ids (i.e. `u32`).
-    pub(crate) fn put_external_documents_ids(
-        &self,
-        wtxn: &mut RwTxn,
-        external_documents_ids: &ExternalDocumentsIds<'_>,
-    ) -> heed::Result<()> {
-        self.main.put::<_, Str, ByteSlice>(
-            wtxn,
-            main_key::EXTERNAL_DOCUMENTS_IDS_KEY,
-            external_documents_ids.as_bytes(),
-        )?;
-        Ok(())
-    }
-
     /// Returns the external documents ids map which associate the external ids
     /// with the internal ids (i.e. `u32`).
-    pub fn external_documents_ids<'t>(&self, rtxn: &'t RoTxn) -> Result<ExternalDocumentsIds<'t>> {
-        let fst = self.main.get::<_, Str, ByteSlice>(rtxn, main_key::EXTERNAL_DOCUMENTS_IDS_KEY)?;
-        let fst = match fst {
-            Some(fst) => fst::Map::new(fst)?.map_data(Cow::Borrowed)?,
-            None => fst::Map::default().map_data(Cow::Owned)?,
-        };
-        Ok(ExternalDocumentsIds::new(fst))
+    pub fn external_documents_ids(&self) -> ExternalDocumentsIds {
+        ExternalDocumentsIds::new(self.external_documents_ids)
     }
 
     /* fields ids map */
diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs
index ca5f69808..7f528e928 100644
--- a/milli/src/update/clear_documents.rs
+++ b/milli/src/update/clear_documents.rs
@@ -1,7 +1,7 @@
 use roaring::RoaringBitmap;
 use time::OffsetDateTime;
 
-use crate::{ExternalDocumentsIds, FieldDistribution, Index, Result};
+use crate::{FieldDistribution, Index, Result};
 
 pub struct ClearDocuments<'t, 'u, 'i> {
     wtxn: &'t mut heed::RwTxn<'i, 'u>,
@@ -20,6 +20,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
         let Index {
             env: _env,
             main: _main,
+            external_documents_ids,
             word_docids,
             exact_word_docids,
             word_prefix_docids,
@@ -54,7 +55,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
         // We clean some of the main engine datastructures.
         self.index.put_words_fst(self.wtxn, &fst::Set::default())?;
         self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?;
-        self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?;
         self.index.put_documents_ids(self.wtxn, &empty_roaring)?;
         self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?;
         self.index.delete_geo_rtree(self.wtxn)?;
@@ -62,6 +62,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
         self.index.delete_vector_hnsw(self.wtxn)?;
 
         // Clear the other databases.
+        external_documents_ids.clear(self.wtxn)?;
         word_docids.clear(self.wtxn)?;
         exact_word_docids.clear(self.wtxn)?;
         word_prefix_docids.clear(self.wtxn)?;
diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index 872230d99..98079e07b 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -162,7 +162,7 @@ impl<'a, 'i> Transform<'a, 'i> {
         FA: Fn() -> bool + Sync,
     {
         let (mut cursor, fields_index) = reader.into_cursor_and_fields_index();
-        let external_documents_ids = self.index.external_documents_ids(wtxn)?;
+        let external_documents_ids = self.index.external_documents_ids();
         let mapping = create_fields_mapping(&mut self.fields_ids_map, &fields_index)?;
 
         let primary_key = cursor.primary_key().to_string();
@@ -221,7 +221,7 @@ impl<'a, 'i> Transform<'a, 'i> {
             let docid = match self.new_external_documents_ids_builder.entry((*external_id).into()) {
                 Entry::Occupied(entry) => *entry.get() as u32,
                 Entry::Vacant(entry) => {
-                    let docid = match external_documents_ids.get(entry.key()) {
+                    let docid = match external_documents_ids.get(wtxn, entry.key())? {
                         Some(docid) => {
                             // If it was already in the list of replaced documents it means it was deleted
                             // by the remove_document method. We should starts as if it never existed.
@@ -373,7 +373,7 @@ impl<'a, 'i> Transform<'a, 'i> {
         to_remove.sort_unstable();
         to_remove.dedup();
 
-        let external_documents_ids = self.index.external_documents_ids(wtxn)?;
+        let external_documents_ids = self.index.external_documents_ids();
 
         let mut documents_deleted = 0;
         let mut document_sorter_buffer = Vec::new();
@@ -410,7 +410,7 @@ impl<'a, 'i> Transform<'a, 'i> {
 
             // If the document was already in the db we mark it as a `to_delete` document.
             // Then we push the document in sorters in deletion mode.
-            let deleted_from_db = match external_documents_ids.get(&to_remove) {
+            let deleted_from_db = match external_documents_ids.get(wtxn, &to_remove)? {
                 Some(docid) => {
                     self.replaced_documents_ids.insert(docid);
 
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index 192f3d139..1b38be03b 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -194,10 +194,8 @@ pub(crate) fn write_typed_chunk_into_index(
                     db.delete(wtxn, &BEU32::new(docid))?;
                 }
             }
-            let mut external_documents_docids = index.external_documents_ids(wtxn)?.into_static();
-            external_documents_docids.apply(operations);
-            index.put_external_documents_ids(wtxn, &external_documents_docids)?;
-
+            let external_documents_docids = index.external_documents_ids();
+            external_documents_docids.apply(wtxn, operations)?;
             index.put_documents_ids(wtxn, &docids)?;
         }
         TypedChunk::FieldIdWordCountDocids(fid_word_count_docids_iter) => {

From abf424ebfc1addeb60ad897e9bf210e9d4a38e04 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Mon, 30 Oct 2023 11:41:56 +0100
Subject: [PATCH 070/127] Remove unused FromIterator

---
 milli/src/update/index_documents/extract/extract_word_docids.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs
index a95162236..5266e9bff 100644
--- a/milli/src/update/index_documents/extract/extract_word_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_docids.rs
@@ -1,7 +1,6 @@
 use std::collections::{BTreeSet, HashSet};
 use std::fs::File;
 use std::io::{self, BufReader};
-use std::iter::FromIterator;
 
 use heed::BytesDecode;
 use obkv::KvReaderU16;

From 58690dfb19971fa4d5dc949135e987fc19ea4b63 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Mon, 30 Oct 2023 13:34:07 +0100
Subject: [PATCH 071/127] Fix tests compilation after changes to
 ExternalDocumentsIds API

---
 milli/src/snapshot_tests.rs             |  2 +-
 milli/src/update/clear_documents.rs     |  2 +-
 milli/src/update/index_documents/mod.rs | 26 ++++++++++++++-----------
 milli/tests/search/mod.rs               |  8 +++++---
 4 files changed, 22 insertions(+), 16 deletions(-)

diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs
index 730d0a5c8..f3f1eb5a5 100644
--- a/milli/src/snapshot_tests.rs
+++ b/milli/src/snapshot_tests.rs
@@ -332,7 +332,7 @@ pub fn snap_geo_faceted_documents_ids(index: &Index) -> String {
 }
 pub fn snap_external_documents_ids(index: &Index) -> String {
     let rtxn = index.read_txn().unwrap();
-    let external_ids = index.external_documents_ids(&rtxn).unwrap().to_hash_map();
+    let external_ids = index.external_documents_ids().to_hash_map(&rtxn).unwrap();
     // ensure fixed order (not guaranteed by hashmap)
     let mut external_ids: Vec<(String, u32)> = external_ids.into_iter().collect();
     external_ids.sort_by(|(l, _), (r, _)| l.cmp(r));
diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs
index 7f528e928..265c6f15a 100644
--- a/milli/src/update/clear_documents.rs
+++ b/milli/src/update/clear_documents.rs
@@ -122,7 +122,7 @@ mod tests {
 
         assert!(index.words_fst(&rtxn).unwrap().is_empty());
         assert!(index.words_prefixes_fst(&rtxn).unwrap().is_empty());
-        assert!(index.external_documents_ids(&rtxn).unwrap().is_empty());
+        assert!(index.external_documents_ids().is_empty(&rtxn).unwrap());
         assert!(index.documents_ids(&rtxn).unwrap().is_empty());
         assert!(index.field_distribution(&rtxn).unwrap().is_empty());
         assert!(index.geo_rtree(&rtxn).unwrap().is_none());
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 45ceec7b0..3026ce81c 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -1094,8 +1094,8 @@ mod tests {
         let txn = index.read_txn().unwrap();
         assert_eq!(index.primary_key(&txn).unwrap(), Some("objectId"));
 
-        let external_documents_ids = index.external_documents_ids(&txn).unwrap();
-        assert!(external_documents_ids.get("30").is_none());
+        let external_documents_ids = index.external_documents_ids();
+        assert!(external_documents_ids.get(&txn, "30").unwrap().is_none());
 
         index
             .add_documents(documents!([
@@ -1104,8 +1104,8 @@ mod tests {
             .unwrap();
 
         let wtxn = index.write_txn().unwrap();
-        let external_documents_ids = index.external_documents_ids(&wtxn).unwrap();
-        assert!(external_documents_ids.get("30").is_some());
+        let external_documents_ids = index.external_documents_ids();
+        assert!(external_documents_ids.get(&wtxn, "30").unwrap().is_some());
         wtxn.commit().unwrap();
 
         index
@@ -1399,8 +1399,8 @@ mod tests {
         index.add_documents(documents!({ "a" : { "b" : { "c" :  1 }}})).unwrap();
 
         let rtxn = index.read_txn().unwrap();
-        let external_documents_ids = index.external_documents_ids(&rtxn).unwrap();
-        assert!(external_documents_ids.get("1").is_some());
+        let external_documents_ids = index.external_documents_ids();
+        assert!(external_documents_ids.get(&rtxn, "1").unwrap().is_some());
     }
 
     #[test]
@@ -1665,7 +1665,7 @@ mod tests {
 
         let wtxn = index.read_txn().unwrap();
 
-        let map = index.external_documents_ids(&wtxn).unwrap().to_hash_map();
+        let map = index.external_documents_ids().to_hash_map(&wtxn).unwrap();
         let ids = map.values().collect::<HashSet<_>>();
 
         assert_eq!(ids.len(), map.len());
@@ -2669,10 +2669,10 @@ mod tests {
         index: &'t TempIndex,
         external_ids: &[&str],
     ) -> Vec<u32> {
-        let external_document_ids = index.external_documents_ids(wtxn).unwrap();
+        let external_document_ids = index.external_documents_ids();
         let ids_to_delete: Vec<u32> = external_ids
             .iter()
-            .map(|id| external_document_ids.get(id.as_bytes()).unwrap())
+            .map(|id| external_document_ids.get(&wtxn, id).unwrap().unwrap())
             .collect();
 
         // Delete some documents.
@@ -3052,9 +3052,13 @@ mod tests {
         let rtxn = index.read_txn().unwrap();
 
         // get internal docids from deleted external document ids
-        let results = index.external_documents_ids(&rtxn).unwrap();
+        let results = index.external_documents_ids();
         for id in deleted_external_ids {
-            assert!(results.get(id).is_none(), "The document {} was supposed to be deleted", id);
+            assert!(
+                results.get(&rtxn, id).unwrap().is_none(),
+                "The document {} was supposed to be deleted",
+                id
+            );
         }
         drop(rtxn);
     }
diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs
index 1c68cfff2..9193ab762 100644
--- a/milli/tests/search/mod.rs
+++ b/milli/tests/search/mod.rs
@@ -88,9 +88,11 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
 
 pub fn internal_to_external_ids(index: &Index, internal_ids: &[DocumentId]) -> Vec<String> {
     let rtxn = index.read_txn().unwrap();
-    let docid_map = index.external_documents_ids(&rtxn).unwrap();
-    let docid_map: std::collections::HashMap<_, _> =
-        EXTERNAL_DOCUMENTS_IDS.iter().map(|id| (docid_map.get(id).unwrap(), id)).collect();
+    let docid_map = index.external_documents_ids();
+    let docid_map: std::collections::HashMap<_, _> = EXTERNAL_DOCUMENTS_IDS
+        .iter()
+        .map(|id| (docid_map.get(&rtxn, id).unwrap().unwrap(), id))
+        .collect();
     internal_ids.iter().map(|id| docid_map.get(id).unwrap().to_string()).collect()
 }
 

From 54d07a8da3854a99263c6c74096d09fd139d5f20 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Mon, 30 Oct 2023 14:47:51 +0100
Subject: [PATCH 072/127] Update field distribution taking into account both
 deletions and additions

---
 milli/src/update/index_documents/transform.rs | 65 ++++++++++++++-----
 1 file changed, 50 insertions(+), 15 deletions(-)

diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index 98079e07b..05940822a 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -1,5 +1,6 @@
 use std::borrow::Cow;
-use std::collections::hash_map::Entry;
+use std::collections::btree_map::Entry as BEntry;
+use std::collections::hash_map::Entry as HEntry;
 use std::collections::{HashMap, HashSet};
 use std::fs::File;
 use std::io::{Read, Seek};
@@ -20,7 +21,7 @@ use super::{IndexDocumentsMethod, IndexerConfig};
 use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
 use crate::error::{Error, InternalError, UserError};
 use crate::index::{db_name, main_key};
-use crate::update::del_add::into_del_add_obkv;
+use crate::update::del_add::{into_del_add_obkv, DelAdd, KvReaderDelAdd};
 use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep};
 use crate::{
     FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, BEU32,
@@ -219,8 +220,8 @@ impl<'a, 'i> Transform<'a, 'i> {
             let mut original_docid = None;
 
             let docid = match self.new_external_documents_ids_builder.entry((*external_id).into()) {
-                Entry::Occupied(entry) => *entry.get() as u32,
-                Entry::Vacant(entry) => {
+                HEntry::Occupied(entry) => *entry.get() as u32,
+                HEntry::Vacant(entry) => {
                     let docid = match external_documents_ids.get(wtxn, entry.key())? {
                         Some(docid) => {
                             // If it was already in the list of replaced documents it means it was deleted
@@ -388,7 +389,7 @@ impl<'a, 'i> Transform<'a, 'i> {
                 .entry((*to_remove).into())
             {
                 // if the document was added in a previous iteration of the transform we make it as deleted in the sorters.
-                Entry::Occupied(entry) => {
+                HEntry::Occupied(entry) => {
                     let doc_id = *entry.get() as u32;
                     document_sorter_buffer.clear();
                     document_sorter_buffer.push(Operation::Deletion as u8);
@@ -405,7 +406,7 @@ impl<'a, 'i> Transform<'a, 'i> {
                     entry.remove_entry();
                     true
                 }
-                Entry::Vacant(_) => false,
+                HEntry::Vacant(_) => false,
             };
 
             // If the document was already in the db we mark it as a `to_delete` document.
@@ -657,8 +658,6 @@ impl<'a, 'i> Transform<'a, 'i> {
         // 2. Add all the new documents to the field distribution
         let mut field_distribution = self.index.field_distribution(wtxn)?;
 
-        self.remove_deleted_documents_from_field_distribution(wtxn, &mut field_distribution)?;
-
         // Here we are going to do the document count + field distribution + `write_into_stream_writer`
         let mut iter = self.original_sorter.into_stream_merger_iter()?;
         // used only for the callback
@@ -678,13 +677,49 @@ impl<'a, 'i> Transform<'a, 'i> {
             // We increment all the field of the current document in the field distribution.
             let obkv = KvReader::new(val);
 
-            for (key, _) in obkv.iter() {
-                let name =
-                    self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId {
-                        field_id: key,
-                        process: "Computing field distribution in transform.",
-                    })?;
-                *field_distribution.entry(name.to_string()).or_insert(0) += 1;
+            for (key, value) in obkv.iter() {
+                let reader = KvReaderDelAdd::new(value);
+                match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) {
+                    (None, None) => {}
+                    (None, Some(_)) => {
+                        // New field
+                        let name = self.fields_ids_map.name(key).ok_or(
+                            FieldIdMapMissingEntry::FieldId {
+                                field_id: key,
+                                process: "Computing field distribution in transform.",
+                            },
+                        )?;
+                        *field_distribution.entry(name.to_string()).or_insert(0) += 1;
+                    }
+                    (Some(_), None) => {
+                        // Field removed
+                        let name = self.fields_ids_map.name(key).ok_or(
+                            FieldIdMapMissingEntry::FieldId {
+                                field_id: key,
+                                process: "Computing field distribution in transform.",
+                            },
+                        )?;
+                        match field_distribution.entry(name.to_string()) {
+                            BEntry::Vacant(_) => { /* Bug? trying to remove a non-existing field */
+                            }
+                            BEntry::Occupied(mut entry) => {
+                                // attempt to remove one
+                                match entry.get_mut().checked_sub(1) {
+                                    Some(new_val) => {
+                                        *entry.get_mut() = new_val;
+                                    }
+                                    None => {
+                                        // was 0, remove field from distribution
+                                        entry.remove();
+                                    }
+                                }
+                            }
+                        }
+                    }
+                    (Some(_), Some(_)) => {
+                        // Value change, no field distribution change
+                    }
+                }
             }
             writer.insert(key, val)?;
         }

From 9fedd8101aaa380f68738778994f603ddb75de2b Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Mon, 30 Oct 2023 14:48:41 +0100
Subject: [PATCH 073/127] Fix tests

---
 .../tests/documents/delete_documents.rs       |  2 +-
 milli/src/index.rs                            | 22 +++---
 .../documents_ids.snap                        |  4 ++
 .../facet_id_exists_docids.snap               |  4 ++
 .../word_docids.snap                          |  4 ++
 .../word_pair_proximity_docids.snap           |  4 ++
 .../documents_ids.snap                        |  4 ++
 .../word_docids.snap                          |  5 ++
 .../word_pair_proximity_docids.snap           |  4 ++
 .../facet_id_exists_docids.snap               |  6 ++
 .../facet_id_f64_docids.snap                  | 53 +++++++++++++++
 .../facet_id_string_docids.snap               |  4 ++
 .../updated/word_docids.snap                  | 68 +++++++++----------
 13 files changed, 136 insertions(+), 48 deletions(-)
 create mode 100644 milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/documents_ids.snap
 create mode 100644 milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/facet_id_exists_docids.snap
 create mode 100644 milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/word_docids.snap
 create mode 100644 milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/word_pair_proximity_docids.snap
 create mode 100644 milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/documents_ids.snap
 create mode 100644 milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/word_docids.snap
 create mode 100644 milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/word_pair_proximity_docids.snap
 create mode 100644 milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents_/facet_id_exists_docids.snap
 create mode 100644 milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap
 create mode 100644 milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_string_docids.snap

diff --git a/meilisearch/tests/documents/delete_documents.rs b/meilisearch/tests/documents/delete_documents.rs
index b3f04aea0..5a15e95ff 100644
--- a/meilisearch/tests/documents/delete_documents.rs
+++ b/meilisearch/tests/documents/delete_documents.rs
@@ -397,7 +397,7 @@ async fn delete_document_by_complex_filter() {
       "canceledBy": null,
       "details": {
         "providedIds": 0,
-        "deletedDocuments": 4,
+        "deletedDocuments": 2,
         "originalFilter": "[[\"color = green\",\"color NOT EXISTS\"]]"
       },
       "error": null,
diff --git a/milli/src/index.rs b/milli/src/index.rs
index f8a37fb2b..27ad72fad 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -1946,14 +1946,14 @@ pub(crate) mod tests {
         3                        3
         "###);
         db_snap!(index, facet_id_f64_docids, 3, @r###"
-        0   0  0      1  [4, ]
-        0   0  1      1  [5, ]
-        0   0  2      1  [6, ]
-        0   0  3      1  [7, ]
-        1   0  1      1  [4, ]
-        1   0  2      1  [5, ]
-        1   0  3      1  [6, ]
-        1   0  4      1  [7, ]
+        0   0  0      1  [0, ]
+        0   0  1      1  [1, ]
+        0   0  2      1  [2, ]
+        0   0  3      1  [3, ]
+        1   0  1      1  [0, ]
+        1   0  2      1  [1, ]
+        1   0  3      1  [2, ]
+        1   0  4      1  [3, ]
         "###);
     }
 
@@ -2038,9 +2038,9 @@ pub(crate) mod tests {
             3                        3
             "###);
             db_snap!(index, facet_id_f64_docids, 1, @r###"
-            1   0  0      1  [0, 4, ]
-            1   0  1      1  [1, 5, ]
-            1   0  2      1  [2, 6, ]
+            1   0  0      1  [0, ]
+            1   0  1      1  [1, ]
+            1   0  2      1  [2, ]
             1   0  3      1  [3, ]
             "###);
         }
diff --git a/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/documents_ids.snap b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/documents_ids.snap
new file mode 100644
index 000000000..8b27dcb0d
--- /dev/null
+++ b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/documents_ids.snap
@@ -0,0 +1,4 @@
+---
+source: milli/src/update/index_documents/mod.rs
+---
+[]
diff --git a/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/facet_id_exists_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/facet_id_exists_docids.snap
new file mode 100644
index 000000000..cdff1a607
--- /dev/null
+++ b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/facet_id_exists_docids.snap
@@ -0,0 +1,4 @@
+---
+source: milli/src/update/index_documents/mod.rs
+---
+
diff --git a/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/word_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/word_docids.snap
new file mode 100644
index 000000000..cdff1a607
--- /dev/null
+++ b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/word_docids.snap
@@ -0,0 +1,4 @@
+---
+source: milli/src/update/index_documents/mod.rs
+---
+
diff --git a/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/word_pair_proximity_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/word_pair_proximity_docids.snap
new file mode 100644
index 000000000..cdff1a607
--- /dev/null
+++ b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_numbers_as_primary_key/word_pair_proximity_docids.snap
@@ -0,0 +1,4 @@
+---
+source: milli/src/update/index_documents/mod.rs
+---
+
diff --git a/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/documents_ids.snap b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/documents_ids.snap
new file mode 100644
index 000000000..8a9805f8d
--- /dev/null
+++ b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/documents_ids.snap
@@ -0,0 +1,4 @@
+---
+source: milli/src/update/index_documents/mod.rs
+---
+[2, ]
diff --git a/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/word_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/word_docids.snap
new file mode 100644
index 000000000..bb2f64873
--- /dev/null
+++ b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/word_docids.snap
@@ -0,0 +1,5 @@
+---
+source: milli/src/update/index_documents/mod.rs
+---
+benoit           [2, ]
+
diff --git a/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/word_pair_proximity_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/word_pair_proximity_docids.snap
new file mode 100644
index 000000000..cdff1a607
--- /dev/null
+++ b/milli/src/update/index_documents/snapshots/mod.rs/delete_documents_with_strange_primary_key/word_pair_proximity_docids.snap
@@ -0,0 +1,4 @@
+---
+source: milli/src/update/index_documents/mod.rs
+---
+
diff --git a/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents_/facet_id_exists_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents_/facet_id_exists_docids.snap
new file mode 100644
index 000000000..ed120bf02
--- /dev/null
+++ b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents_/facet_id_exists_docids.snap
@@ -0,0 +1,6 @@
+---
+source: milli/src/update/index_documents/mod.rs
+---
+1   [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ]
+2   [21, ]
+
diff --git a/milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap
new file mode 100644
index 000000000..5d6009823
--- /dev/null
+++ b/milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap
@@ -0,0 +1,53 @@
+---
+source: milli/src/update/index_documents/mod.rs
+---
+3   0  48.9021 1  [19, ]
+3   0  49.4449 1  []
+3   0  49.9314 1  [17, ]
+3   0  50.1112 1  []
+3   0  50.1793 1  [15, ]
+3   0  50.2844 1  [14, ]
+3   0  50.3518 1  [13, ]
+3   0  50.4095 1  []
+3   0  50.4502 1  [12, ]
+3   0  50.6053 1  [8, ]
+3   0  50.6224 1  [3, ]
+3   0  50.6299 1  [0, ]
+3   0  50.6312 1  [2, ]
+3   0  50.6415 1  [1, ]
+3   0  50.6552 1  []
+3   0  50.6924 1  []
+3   0  50.7263 1  []
+3   0  50.7453 1  [7, ]
+3   0  50.8466 1  [10, ]
+3   0  51.0537 1  [9, ]
+3   1  48.9021 4  [17, 19, ]
+3   1  50.1793 4  [13, 14, 15, ]
+3   1  50.4502 4  [0, 3, 8, 12, ]
+3   1  50.6312 4  [1, 2, ]
+3   1  50.7263 4  [7, 9, 10, ]
+4   0  2.271  1  [17, ]
+4   0  2.3708 1  [19, ]
+4   0  2.7637 1  [14, ]
+4   0  2.7913 1  []
+4   0  2.8547 1  []
+4   0  3.0569 1  [0, ]
+4   0  3.1106 1  [1, 2, ]
+4   0  3.1476 1  [3, ]
+4   0  3.1541 1  []
+4   0  3.1763 1  []
+4   0  3.1897 1  []
+4   0  3.2189 1  [15, ]
+4   0  3.2206 1  [7, ]
+4   0  3.3758 1  [8, ]
+4   0  3.5326 1  [13, ]
+4   0  3.6957 1  [9, ]
+4   0  3.9623 1  [12, ]
+4   0  4.337  1  [10, ]
+4   0  4.4347 1  []
+4   1  2.271  4  [14, 17, 19, ]
+4   1  2.8547 4  [0, 1, 2, 3, ]
+4   1  3.1541 4  [15, ]
+4   1  3.2206 4  [7, 8, 9, 13, ]
+4   1  3.9623 3  [10, 12, ]
+
diff --git a/milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_string_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_string_docids.snap
new file mode 100644
index 000000000..cdff1a607
--- /dev/null
+++ b/milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_string_docids.snap
@@ -0,0 +1,4 @@
+---
+source: milli/src/update/index_documents/mod.rs
+---
+
diff --git a/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap
index b0ef38b93..80dbce9e8 100644
--- a/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap
+++ b/milli/src/update/index_documents/snapshots/mod.rs/simple_documents_replace/updated/word_docids.snap
@@ -1,60 +1,56 @@
 ---
 source: milli/src/update/index_documents/mod.rs
 ---
-0                [1, 7, ]
+0                [1, ]
 1                [2, ]
-10               [1, 7, ]
-12               [0, 8, ]
+10               [1, ]
+12               [0, ]
 1344             [3, ]
-1813             [8, ]
-2                [0, 8, ]
+1813             [0, ]
+2                [0, ]
 23               [5, ]
 25               [2, ]
-3                [0, 8, ]
+3                [0, ]
 35               [5, ]
-4                [4, 6, ]
-42               [0, 5, 8, ]
-456              [1, 7, ]
-5                [0, 8, ]
+4                [4, ]
+42               [0, 5, ]
+456              [1, ]
+5                [0, ]
 99               [2, ]
 adams            [5, ]
-adventure        [1, 7, ]
+adventure        [1, ]
 alice            [2, ]
-and              [0, 4, 6, 8, ]
-antoine          [1, 7, ]
-austen           [8, ]
-austin           [0, ]
-blood            [4, 6, ]
+and              [0, 4, ]
+antoine          [1, ]
+austen           [0, ]
+blood            [4, ]
 carroll          [2, ]
-de               [1, 7, ]
+de               [1, ]
 douglas          [5, ]
-exupery          [1, 7, ]
-fantasy          [2, 3, 4, 6, ]
+exupery          [1, ]
+fantasy          [2, 3, 4, ]
 galaxy           [5, ]
 guide            [5, ]
-half             [4, 6, ]
-harry            [4, 6, ]
+half             [4, ]
+harry            [4, ]
 hitchhiker       [5, ]
 hobbit           [3, ]
 in               [2, ]
-j                [3, 4, 6, 8, ]
-jane             [0, ]
-k                [4, 6, ]
-le               [1, ]
+j                [0, 3, 4, ]
+k                [4, ]
 lewis            [2, ]
-little           [7, ]
-petit            [1, ]
-potter           [4, 6, ]
-prejudice        [0, 8, ]
-pride            [0, 8, ]
-prince           [1, 4, 7, ]
-princess         [6, ]
+little           [1, ]
+potter           [4, ]
+prejudice        [0, ]
+pride            [0, ]
+prince           [1, ]
+princess         [4, ]
 r                [3, ]
-romance          [0, 8, ]
-rowling          [4, 6, ]
+romance          [0, ]
+rowling          [4, ]
 s                [5, ]
-saint            [1, 7, ]
-the              [3, 4, 5, 6, 7, ]
+saint            [1, ]
+the              [1, 3, 4, 5, ]
 to               [5, ]
 tolkien          [3, ]
 wonderland       [2, ]

From be395c7944204d90e9fd663f8bc5d01f1855be50 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Mon, 30 Oct 2023 16:26:29 +0100
Subject: [PATCH 074/127] Change order of arguments to tokenizer_builder

---
 .../index_documents/extract/extract_docid_word_positions.rs   | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
index e5d95cbdb..96156adb4 100644
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -56,7 +56,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
     let mut value_buffer = Vec::new();
 
     // initialize tokenizer.
-    let mut builder = tokenizer_builder(stop_words, dictionary, allowed_separators, None);
+    let mut builder = tokenizer_builder(stop_words, allowed_separators, dictionary, None);
     let tokenizer = builder.build();
 
     // iterate over documents.
@@ -247,8 +247,8 @@ fn lang_safe_tokens_from_document<'a>(
             // build a new temporary tokenizer including the allow list.
             let mut builder = tokenizer_builder(
                 stop_words,
-                dictionary,
                 allowed_separators,
+                dictionary,
                 Some(&script_language),
             );
             let tokenizer = builder.build();

From de10f20732accd83b096a6b0dea5121673bf4ab4 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Mon, 30 Oct 2023 16:57:08 +0100
Subject: [PATCH 075/127] Fix field distribution again

---
 milli/src/update/index_documents/transform.rs | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index 05940822a..840bade2e 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -674,10 +674,7 @@ impl<'a, 'i> Transform<'a, 'i> {
                 total_documents: self.documents_count,
             });
 
-            // We increment all the field of the current document in the field distribution.
-            let obkv = KvReader::new(val);
-
-            for (key, value) in obkv.iter() {
+            for (key, value) in KvReader::new(val) {
                 let reader = KvReaderDelAdd::new(value);
                 match (reader.get(DelAdd::Deletion), reader.get(DelAdd::Addition)) {
                     (None, None) => {}
@@ -705,12 +702,14 @@ impl<'a, 'i> Transform<'a, 'i> {
                             BEntry::Occupied(mut entry) => {
                                 // attempt to remove one
                                 match entry.get_mut().checked_sub(1) {
+                                    Some(0) => {
+                                        entry.remove();
+                                    }
                                     Some(new_val) => {
                                         *entry.get_mut() = new_val;
                                     }
                                     None => {
-                                        // was 0, remove field from distribution
-                                        entry.remove();
+                                        unreachable!("Attempting to remove a field that wasn't in the field distribution")
                                     }
                                 }
                             }

From 4e91707a064f2ab0c5b5cf2baab425f15e2a915e Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Tue, 31 Oct 2023 09:41:17 +0100
Subject: [PATCH 076/127] Rename test

---
 milli/src/update/index_documents/mod.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 3026ce81c..ad2f63beb 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -2750,7 +2750,7 @@ mod tests {
     }
 
     #[test]
-    fn filtered_placeholder_search_should_not_return_deleted_documents_() {
+    fn filtered_placeholder_search_should_not_return_deleted_documents() {
         let index = TempIndex::new();
 
         let mut wtxn = index.write_txn().unwrap();

From dad78cbf8de82434b00621a4a3693b32a33c2a70 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Tue, 31 Oct 2023 09:53:55 +0100
Subject: [PATCH 077/127] Bulk facet remove deletes keys from DB when value
 empty

---
 milli/src/update/facet/bulk.rs | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/milli/src/update/facet/bulk.rs b/milli/src/update/facet/bulk.rs
index c0b159e57..5626a4aae 100644
--- a/milli/src/update/facet/bulk.rs
+++ b/milli/src/update/facet/bulk.rs
@@ -3,7 +3,7 @@ use std::io::BufReader;
 
 use grenad::CompressionType;
 use heed::types::ByteSlice;
-use heed::{BytesEncode, Error, RoTxn, RwTxn};
+use heed::{BytesDecode, BytesEncode, Error, RoTxn, RwTxn};
 use roaring::RoaringBitmap;
 
 use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE};
@@ -14,7 +14,7 @@ use crate::heed_codec::facet::{
 use crate::heed_codec::ByteSliceRefCodec;
 use crate::update::del_add::{DelAdd, KvReaderDelAdd};
 use crate::update::index_documents::{create_writer, valid_lmdb_key, writer_into_reader};
-use crate::{CboRoaringBitmapCodec, FieldId, Index, Result};
+use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, FieldId, Index, Result};
 
 /// Algorithm to insert elememts into the `facet_id_(string/f64)_docids` databases
 /// by rebuilding the database "from scratch".
@@ -181,7 +181,13 @@ impl<R: std::io::Read + std::io::Seek> FacetsUpdateBulkInner<R> {
                         buffer.extend_from_slice(value);
                     }
                 };
-                database.put(wtxn, key, &buffer)?;
+                let new_bitmap = &buffer[1..];
+                // if the new bitmap is empty, let's remove it
+                if CboRoaringBitmapLenCodec::bytes_decode(new_bitmap).unwrap_or_default() == 0 {
+                    database.delete(wtxn, key)?;
+                } else {
+                    database.put(wtxn, key, &buffer)?;
+                }
             }
         }
         Ok(())

From 9d59e8011ace80b403b290cef29abf0d84f66835 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Tue, 31 Oct 2023 10:08:36 +0100
Subject: [PATCH 078/127] fix some tests

---
 milli/src/index.rs                            | 16 ++++++--------
 .../facet_id_f64_docids.snap                  | 22 -------------------
 2 files changed, 7 insertions(+), 31 deletions(-)

diff --git a/milli/src/index.rs b/milli/src/index.rs
index 27ad72fad..f7450a672 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -1905,10 +1905,9 @@ pub(crate) mod tests {
         3                        3
         "###);
         db_snap!(index, facet_id_f64_docids, 2, @r###"
-        1   0  0      1  [0, ]
-        1   0  1      1  [1, 4, ]
-        1   0  2      1  [2, 5, ]
-        1   0  3      1  [3, 6, ]
+        1   0  1      1  [0, ]
+        1   0  2      1  [1, ]
+        1   0  3      1  [2, 3, ]
         "###);
 
         index
@@ -1924,11 +1923,10 @@ pub(crate) mod tests {
         3                        3
         "###);
         db_snap!(index, facet_id_f64_docids, 3, @r###"
-        1   0  0      1  [0, ]
-        1   0  1      1  [1, 4, ]
-        1   0  2      1  [2, 5, ]
-        1   0  3      1  [3, 6, ]
-        1   0  4      1  [7, ]
+        1   0  1      1  [0, ]
+        1   0  2      1  [1, ]
+        1   0  3      1  [2, ]
+        1   0  4      1  [3, ]
         "###);
 
         index
diff --git a/milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap
index 5d6009823..c45c350e7 100644
--- a/milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap
+++ b/milli/src/update/index_documents/snapshots/mod.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap
@@ -2,41 +2,25 @@
 source: milli/src/update/index_documents/mod.rs
 ---
 3   0  48.9021 1  [19, ]
-3   0  49.4449 1  []
 3   0  49.9314 1  [17, ]
-3   0  50.1112 1  []
 3   0  50.1793 1  [15, ]
 3   0  50.2844 1  [14, ]
 3   0  50.3518 1  [13, ]
-3   0  50.4095 1  []
 3   0  50.4502 1  [12, ]
 3   0  50.6053 1  [8, ]
 3   0  50.6224 1  [3, ]
 3   0  50.6299 1  [0, ]
 3   0  50.6312 1  [2, ]
 3   0  50.6415 1  [1, ]
-3   0  50.6552 1  []
-3   0  50.6924 1  []
-3   0  50.7263 1  []
 3   0  50.7453 1  [7, ]
 3   0  50.8466 1  [10, ]
 3   0  51.0537 1  [9, ]
-3   1  48.9021 4  [17, 19, ]
-3   1  50.1793 4  [13, 14, 15, ]
-3   1  50.4502 4  [0, 3, 8, 12, ]
-3   1  50.6312 4  [1, 2, ]
-3   1  50.7263 4  [7, 9, 10, ]
 4   0  2.271  1  [17, ]
 4   0  2.3708 1  [19, ]
 4   0  2.7637 1  [14, ]
-4   0  2.7913 1  []
-4   0  2.8547 1  []
 4   0  3.0569 1  [0, ]
 4   0  3.1106 1  [1, 2, ]
 4   0  3.1476 1  [3, ]
-4   0  3.1541 1  []
-4   0  3.1763 1  []
-4   0  3.1897 1  []
 4   0  3.2189 1  [15, ]
 4   0  3.2206 1  [7, ]
 4   0  3.3758 1  [8, ]
@@ -44,10 +28,4 @@ source: milli/src/update/index_documents/mod.rs
 4   0  3.6957 1  [9, ]
 4   0  3.9623 1  [12, ]
 4   0  4.337  1  [10, ]
-4   0  4.4347 1  []
-4   1  2.271  4  [14, 17, 19, ]
-4   1  2.8547 4  [0, 1, 2, 3, ]
-4   1  3.1541 4  [15, ]
-4   1  3.2206 4  [7, 8, 9, 13, ]
-4   1  3.9623 3  [10, 12, ]
 

From d8bf3f3fc2fc3511ceb7d2d15dc445766c597b76 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Tue, 31 Oct 2023 10:12:49 +0100
Subject: [PATCH 079/127] Remove unused snapshots

---
 ...dump__reader__test__import_dump_v1-11.snap | 24 -------
 .../dump__reader__test__import_dump_v1-5.snap | 38 -----------
 .../dump__reader__test__import_dump_v1-8.snap | 31 ---------
 .../bulk.rs/insert/default.hash.snap          |  4 --
 .../large_group_small_min_level.hash.snap     |  4 --
 .../insert/odd_group_odd_min_level.hash.snap  |  4 --
 .../small_group_large_min_level.hash.snap     |  4 --
 .../small_group_small_min_level.hash.snap     |  4 --
 .../default.hash.snap                         |  4 --
 .../large_group_small_min_level.hash.snap     |  4 --
 .../odd_group_odd_min_level.hash.snap         |  4 --
 .../small_group_large_min_level.hash.snap     |  4 --
 .../small_group_small_min_level.hash.snap     |  4 --
 .../bulk.rs/insert_string/default.hash.snap   |  4 --
 .../large_group_small_min_level.hash.snap     |  4 --
 .../odd_group_odd_min_level.hash.snap         |  4 --
 .../small_group_large_min_level.hash.snap     |  4 --
 .../small_group_small_min_level.hash.snap     |  4 --
 .../facet_id_exists_docids.snap               |  6 --
 .../prefix_word_pair_proximity_docids.snap    | 20 ------
 .../word_prefix_pair_proximity_docids.snap    | 23 -------
 .../prefix_word_pair_proximity_docids.snap    | 29 ---------
 .../update/word_pair_proximity_docids.snap    | 33 ----------
 .../word_prefix_pair_proximity_docids.snap    | 31 ---------
 .../prefix_word_pair_proximity_docids.snap    |  4 --
 .../word_pair_proximity_docids.snap           |  8 ---
 .../word_prefix_pair_proximity_docids.snap    |  7 --
 .../first_delete/documents_ids.snap           |  4 --
 .../prefix_word_pair_proximity_docids.snap    |  6 --
 .../first_delete/word_docids.snap             | 60 -----------------
 .../word_prefix_pair_proximity_docids.snap    | 10 ---
 .../initial/documents_ids.snap                |  4 --
 .../prefix_word_pair_proximity_docids.snap    | 14 ----
 .../initial/word_docids.snap                  | 65 -------------------
 .../word_prefix_pair_proximity_docids.snap    | 15 -----
 .../reupdate/documents_ids.snap               |  4 --
 .../prefix_word_pair_proximity_docids.snap    |  6 --
 .../reupdate/word_docids.snap                 | 60 -----------------
 .../word_prefix_pair_proximity_docids.snap    |  5 --
 .../second_delete/documents_ids.snap          |  4 --
 .../prefix_word_pair_proximity_docids.snap    |  6 --
 .../second_delete/word_docids.snap            | 10 ---
 .../word_prefix_pair_proximity_docids.snap    | 10 ---
 .../initial/documents_ids.snap                |  4 --
 .../prefix_word_pair_proximity_docids.snap    |  9 ---
 .../initial/word_docids.snap                  | 61 -----------------
 .../word_prefix_pair_proximity_docids.snap    |  7 --
 .../replaced/documents_ids.snap               |  4 --
 .../prefix_word_pair_proximity_docids.snap    |  5 --
 .../replaced/word_docids.snap                 | 61 -----------------
 .../word_prefix_pair_proximity_docids.snap    |  5 --
 .../initial/documents_ids.snap                |  4 --
 .../prefix_word_pair_proximity_docids.snap    |  9 ---
 .../initial/word_docids.snap                  | 61 -----------------
 .../word_prefix_pair_proximity_docids.snap    |  7 --
 .../replaced/documents_ids.snap               |  4 --
 .../prefix_word_pair_proximity_docids.snap    | 10 ---
 .../replaced/word_docids.hash.snap            |  4 --
 .../word_prefix_pair_proximity_docids.snap    |  8 ---
 .../first_delete/documents_ids.snap           |  4 --
 .../prefix_word_pair_proximity_docids.snap    | 14 ----
 .../first_delete/word_docids.snap             | 65 -------------------
 .../word_prefix_pair_proximity_docids.snap    | 15 -----
 .../initial/documents_ids.snap                |  4 --
 .../prefix_word_pair_proximity_docids.snap    | 14 ----
 .../initial/word_docids.snap                  | 65 -------------------
 .../word_prefix_pair_proximity_docids.snap    | 15 -----
 .../reupdate/documents_ids.snap               |  4 --
 .../prefix_word_pair_proximity_docids.snap    | 17 -----
 .../reupdate/word_docids.hash.snap            |  4 --
 .../word_prefix_pair_proximity_docids.snap    | 19 ------
 .../second_delete/documents_ids.snap          |  4 --
 .../prefix_word_pair_proximity_docids.snap    | 14 ----
 .../second_delete/word_docids.snap            | 65 -------------------
 .../word_prefix_pair_proximity_docids.snap    | 15 -----
 .../always_hard/documents_ids.snap            |  4 --
 .../always_hard/facet_id_exists_docids.snap   |  4 --
 .../soft_deleted_documents_ids.snap           |  4 --
 .../always_hard/word_docids.snap              |  4 --
 .../word_pair_proximity_docids.snap           |  4 --
 .../always_soft/documents_ids.snap            |  4 --
 .../always_soft/facet_id_exists_docids.snap   |  4 --
 .../soft_deleted_documents_ids.snap           |  4 --
 .../always_soft/word_docids.snap              |  4 --
 .../word_pair_proximity_docids.snap           |  4 --
 .../always_hard/documents_ids.snap            |  4 --
 .../soft_deleted_documents_ids.snap           |  4 --
 .../always_hard/word_docids.snap              |  5 --
 .../word_pair_proximity_docids.snap           |  4 --
 .../always_soft/documents_ids.snap            |  4 --
 .../soft_deleted_documents_ids.snap           |  4 --
 .../always_soft/word_docids.snap              |  7 --
 .../word_pair_proximity_docids.snap           |  4 --
 .../always_hard/facet_id_exists_docids.snap   |  6 --
 .../always_hard/facet_id_f64_docids.snap      |  5 --
 .../always_hard/facet_id_string_docids.snap   | 17 -----
 .../soft_deleted_documents_ids.snap           |  4 --
 .../always_hard/word_docids.snap              | 38 -----------
 .../word_pair_proximity_docids.snap           | 25 -------
 .../always_soft/facet_id_exists_docids.snap   |  6 --
 .../always_soft/facet_id_f64_docids.snap      |  6 --
 .../always_soft/facet_id_string_docids.snap   | 19 ------
 .../soft_deleted_documents_ids.snap           |  4 --
 .../always_soft/word_docids.snap              | 42 ------------
 .../word_pair_proximity_docids.snap           | 29 ---------
 .../always_hard/facet_id_f64_docids.snap      | 31 ---------
 .../always_hard/facet_id_string_docids.snap   |  4 --
 .../soft_deleted_documents_ids.snap           |  4 --
 .../always_soft/facet_id_f64_docids.snap      | 53 ---------------
 .../always_soft/facet_id_string_docids.snap   |  4 --
 .../soft_deleted_documents_ids.snap           |  4 --
 .../soft_deleted_documents_ids.snap           |  4 --
 .../soft_deleted_documents_ids.snap           |  4 --
 .../soft_deleted_documents_ids.snap           |  4 --
 .../soft_deleted_documents_ids.snap           |  4 --
 .../soft_deleted_documents_ids.snap           |  4 --
 .../soft_deleted_documents_ids.snap           |  4 --
 117 files changed, 1599 deletions(-)
 delete mode 100644 dump/src/reader/snapshots/dump__reader__test__import_dump_v1-11.snap
 delete mode 100644 dump/src/reader/snapshots/dump__reader__test__import_dump_v1-5.snap
 delete mode 100644 dump/src/reader/snapshots/dump__reader__test__import_dump_v1-8.snap
 delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert/default.hash.snap
 delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert/large_group_small_min_level.hash.snap
 delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert/odd_group_odd_min_level.hash.snap
 delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert/small_group_large_min_level.hash.snap
 delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert/small_group_small_min_level.hash.snap
 delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/default.hash.snap
 delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/large_group_small_min_level.hash.snap
 delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/odd_group_odd_min_level.hash.snap
 delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_large_min_level.hash.snap
 delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_small_min_level.hash.snap
 delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_string/default.hash.snap
 delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_string/large_group_small_min_level.hash.snap
 delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_string/odd_group_odd_min_level.hash.snap
 delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_large_min_level.hash.snap
 delete mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_small_min_level.hash.snap
 delete mode 100644 milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents_/facet_id_exists_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/prefix_word_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/word_prefix_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/prefix_word_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_prefix_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/prefix_word_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_prefix_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/documents_ids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/documents_ids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/documents_ids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/documents_ids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/documents_ids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/prefix_word_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_prefix_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/documents_ids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/prefix_word_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_prefix_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/documents_ids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/prefix_word_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_prefix_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/documents_ids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/prefix_word_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_docids.hash.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_prefix_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/documents_ids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/documents_ids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/documents_ids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_docids.hash.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/documents_ids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_docids.snap
 delete mode 100644 milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/documents_ids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/facet_id_exists_docids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/soft_deleted_documents_ids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_docids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/documents_ids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/facet_id_exists_docids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/soft_deleted_documents_ids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_docids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/documents_ids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/soft_deleted_documents_ids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_docids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/documents_ids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/soft_deleted_documents_ids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_docids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_exists_docids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_docids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_exists_docids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_docids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_pair_proximity_docids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap
 delete mode 100644 milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap

diff --git a/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-11.snap b/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-11.snap
deleted file mode 100644
index 92fc61d72..000000000
--- a/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-11.snap
+++ /dev/null
@@ -1,24 +0,0 @@
----
-source: dump/src/reader/mod.rs
-expression: spells.settings().unwrap()
----
-{
-  "displayedAttributes": [
-    "*"
-  ],
-  "searchableAttributes": [
-    "*"
-  ],
-  "filterableAttributes": [],
-  "sortableAttributes": [],
-  "rankingRules": [
-    "typo",
-    "words",
-    "proximity",
-    "attribute",
-    "exactness"
-  ],
-  "stopWords": [],
-  "synonyms": {},
-  "distinctAttribute": null
-}
diff --git a/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-5.snap b/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-5.snap
deleted file mode 100644
index b0b54c136..000000000
--- a/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-5.snap
+++ /dev/null
@@ -1,38 +0,0 @@
----
-source: dump/src/reader/mod.rs
-expression: products.settings().unwrap()
----
-{
-  "displayedAttributes": [
-    "*"
-  ],
-  "searchableAttributes": [
-    "*"
-  ],
-  "filterableAttributes": [],
-  "sortableAttributes": [],
-  "rankingRules": [
-    "typo",
-    "words",
-    "proximity",
-    "attribute",
-    "exactness"
-  ],
-  "stopWords": [],
-  "synonyms": {
-    "android": [
-      "phone",
-      "smartphone"
-    ],
-    "iphone": [
-      "phone",
-      "smartphone"
-    ],
-    "phone": [
-      "android",
-      "iphone",
-      "smartphone"
-    ]
-  },
-  "distinctAttribute": null
-}
diff --git a/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-8.snap b/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-8.snap
deleted file mode 100644
index 5c12a0438..000000000
--- a/dump/src/reader/snapshots/dump__reader__test__import_dump_v1-8.snap
+++ /dev/null
@@ -1,31 +0,0 @@
----
-source: dump/src/reader/mod.rs
-expression: movies.settings().unwrap()
----
-{
-  "displayedAttributes": [
-    "*"
-  ],
-  "searchableAttributes": [
-    "*"
-  ],
-  "filterableAttributes": [
-    "genres",
-    "id"
-  ],
-  "sortableAttributes": [
-    "genres",
-    "id"
-  ],
-  "rankingRules": [
-    "typo",
-    "words",
-    "proximity",
-    "attribute",
-    "exactness",
-    "release_date:asc"
-  ],
-  "stopWords": [],
-  "synonyms": {},
-  "distinctAttribute": null
-}
diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert/default.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/default.hash.snap
deleted file mode 100644
index bef20823c..000000000
--- a/milli/src/update/facet/snapshots/bulk.rs/insert/default.hash.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/facet/bulk.rs
----
-b40dd31a65e033ffc6b35c027ce19506
diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert/large_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/large_group_small_min_level.hash.snap
deleted file mode 100644
index 74c40e6a3..000000000
--- a/milli/src/update/facet/snapshots/bulk.rs/insert/large_group_small_min_level.hash.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/facet/bulk.rs
----
-7ee22d8e9387e72758f00918eb67e4c6
diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert/odd_group_odd_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/odd_group_odd_min_level.hash.snap
deleted file mode 100644
index 6fb086d35..000000000
--- a/milli/src/update/facet/snapshots/bulk.rs/insert/odd_group_odd_min_level.hash.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/facet/bulk.rs
----
-60f567359382507afdaf45fb075740c3
diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_large_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_large_min_level.hash.snap
deleted file mode 100644
index 0271a6c6b..000000000
--- a/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_large_min_level.hash.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/facet/bulk.rs
----
-b986d6e6cbf425685f409a8b417010e1
diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_small_min_level.hash.snap
deleted file mode 100644
index d801ef19f..000000000
--- a/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_small_min_level.hash.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/facet/bulk.rs
----
-ee10dd2ae2b5c6621a89a5d0a9aa8ccc
diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/default.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/default.hash.snap
deleted file mode 100644
index e9988f527..000000000
--- a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/default.hash.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/facet/bulk.rs
----
-fa877559eef78b383b496c15a364a2dc
diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/large_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/large_group_small_min_level.hash.snap
deleted file mode 100644
index aa52901da..000000000
--- a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/large_group_small_min_level.hash.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/facet/bulk.rs
----
-16a96353bc42f2ff3e91611ca4d5b184
diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/odd_group_odd_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/odd_group_odd_min_level.hash.snap
deleted file mode 100644
index 64f5012a4..000000000
--- a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/odd_group_odd_min_level.hash.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/facet/bulk.rs
----
-be1b08073b9d9788d18080c1320151d7
diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_large_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_large_min_level.hash.snap
deleted file mode 100644
index aa52901da..000000000
--- a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_large_min_level.hash.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/facet/bulk.rs
----
-16a96353bc42f2ff3e91611ca4d5b184
diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_small_min_level.hash.snap
deleted file mode 100644
index bb0e9aa69..000000000
--- a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_small_min_level.hash.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/facet/bulk.rs
----
-32a45d555df2e001420fea149818d376
diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_string/default.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_string/default.hash.snap
deleted file mode 100644
index b7705b72e..000000000
--- a/milli/src/update/facet/snapshots/bulk.rs/insert_string/default.hash.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/facet/bulk.rs
----
-353d70f52eea66e5031dca989ea8a037
diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_string/large_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_string/large_group_small_min_level.hash.snap
deleted file mode 100644
index 15030a1ea..000000000
--- a/milli/src/update/facet/snapshots/bulk.rs/insert_string/large_group_small_min_level.hash.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/facet/bulk.rs
----
-52a093c909133d84023a4a7b83864808
diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_string/odd_group_odd_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_string/odd_group_odd_min_level.hash.snap
deleted file mode 100644
index 949ec6647..000000000
--- a/milli/src/update/facet/snapshots/bulk.rs/insert_string/odd_group_odd_min_level.hash.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/facet/bulk.rs
----
-9d86c72ddb241d0aeca2995d61a3648a
diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_large_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_large_min_level.hash.snap
deleted file mode 100644
index d8797f1ab..000000000
--- a/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_large_min_level.hash.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/facet/bulk.rs
----
-c0943177594534bfe5527cbf40fe388e
diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_small_min_level.hash.snap
deleted file mode 100644
index f7949c5f3..000000000
--- a/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_small_min_level.hash.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/facet/bulk.rs
----
-6ed86f234028ae3df5881bee5512f11e
diff --git a/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents_/facet_id_exists_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents_/facet_id_exists_docids.snap
deleted file mode 100644
index ed120bf02..000000000
--- a/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents_/facet_id_exists_docids.snap
+++ /dev/null
@@ -1,6 +0,0 @@
----
-source: milli/src/update/index_documents/mod.rs
----
-1   [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ]
-2   [21, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/prefix_word_pair_proximity_docids.snap
deleted file mode 100644
index 6609786a3..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/prefix_word_pair_proximity_docids.snap
+++ /dev/null
@@ -1,20 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-1  a    5                [101, ]
-1  a    amazing          [100, ]
-1  a    an               [100, ]
-1  a    and              [100, ]
-1  a    beautiful        [100, ]
-1  b    house            [100, ]
-1  b    rings            [101, ]
-1  be   house            [100, ]
-1  be   rings            [101, ]
-2  a    am               [101, ]
-2  a    amazing          [100, ]
-2  a    and              [100, ]
-2  a    beautiful        [100, ]
-2  a    house            [100, ]
-2  b    at               [101, ]
-2  be   at               [101, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/word_prefix_pair_proximity_docids.snap
deleted file mode 100644
index 52b29e136..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/initial/word_prefix_pair_proximity_docids.snap
+++ /dev/null
@@ -1,23 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-1  5                a    [101, ]
-1  amazing          a    [100, ]
-1  an               a    [100, ]
-1  and              b    [100, ]
-1  and              be   [100, ]
-1  at               a    [100, ]
-1  rings            a    [101, ]
-1  the              b    [101, ]
-1  the              be   [101, ]
-2  amazing          b    [100, ]
-2  amazing          be   [100, ]
-2  an               a    [100, ]
-2  at               a    [100, 101, ]
-2  bell             a    [101, ]
-3  an               b    [100, ]
-3  an               be   [100, ]
-3  at               a    [100, ]
-3  rings            a    [101, ]
-3  the              a    [101, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/prefix_word_pair_proximity_docids.snap
deleted file mode 100644
index 7644c433d..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/prefix_word_pair_proximity_docids.snap
+++ /dev/null
@@ -1,29 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-1  a    5                [101, ]
-1  a    amazing          [100, ]
-1  a    an               [100, 202, ]
-1  a    and              [100, ]
-1  a    beautiful        [100, ]
-1  a    extraordinary    [202, ]
-1  am   and              [100, ]
-1  an   amazing          [100, ]
-1  an   beautiful        [100, ]
-1  an   extraordinary    [202, ]
-1  b    house            [100, ]
-1  b    rings            [101, ]
-1  be   house            [100, ]
-1  be   rings            [101, ]
-2  a    am               [101, ]
-2  a    amazing          [100, ]
-2  a    and              [100, ]
-2  a    beautiful        [100, ]
-2  a    extraordinary    [202, ]
-2  a    house            [100, 202, ]
-2  am   beautiful        [100, ]
-2  an   and              [100, ]
-2  an   house            [100, 202, ]
-2  b    at               [101, ]
-2  be   at               [101, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_pair_proximity_docids.snap
deleted file mode 100644
index 1b56974c2..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_pair_proximity_docids.snap
+++ /dev/null
@@ -1,33 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-1  5                am               [101, ]
-1  amazing          and              [100, ]
-1  an               amazing          [100, ]
-1  an               extraordinary    [202, ]
-1  and              beautiful        [100, ]
-1  at               5                [101, ]
-1  at               an               [100, 202, ]
-1  beautiful        house            [100, ]
-1  bell             rings            [101, ]
-1  extraordinary    house            [202, ]
-1  rings            at               [101, ]
-1  the              bell             [101, ]
-2  amazing          beautiful        [100, ]
-2  an               and              [100, ]
-2  an               house            [202, ]
-2  and              house            [100, ]
-2  at               am               [101, ]
-2  at               amazing          [100, ]
-2  at               extraordinary    [202, ]
-2  bell             at               [101, ]
-2  rings            5                [101, ]
-2  the              rings            [101, ]
-3  amazing          house            [100, ]
-3  an               beautiful        [100, ]
-3  at               and              [100, ]
-3  at               house            [202, ]
-3  bell             5                [101, ]
-3  rings            am               [101, ]
-3  the              at               [101, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_prefix_pair_proximity_docids.snap
deleted file mode 100644
index 008a4b21d..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/add_new_documents/update/word_prefix_pair_proximity_docids.snap
+++ /dev/null
@@ -1,31 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-1  5                a    [101, ]
-1  5                am   [101, ]
-1  amazing          a    [100, ]
-1  amazing          an   [100, ]
-1  an               a    [100, ]
-1  an               am   [100, ]
-1  and              b    [100, ]
-1  and              be   [100, ]
-1  at               a    [100, 202, ]
-1  at               an   [100, 202, ]
-1  rings            a    [101, ]
-1  the              b    [101, ]
-1  the              be   [101, ]
-2  amazing          b    [100, ]
-2  amazing          be   [100, ]
-2  an               a    [100, ]
-2  an               an   [100, ]
-2  at               a    [100, 101, ]
-2  at               am   [100, 101, ]
-2  bell             a    [101, ]
-3  an               b    [100, ]
-3  an               be   [100, ]
-3  at               a    [100, ]
-3  at               an   [100, ]
-3  rings            a    [101, ]
-3  rings            am   [101, ]
-3  the              a    [101, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/prefix_word_pair_proximity_docids.snap
deleted file mode 100644
index d212999bb..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/prefix_word_pair_proximity_docids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_pair_proximity_docids.snap
deleted file mode 100644
index 816895dcf..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_pair_proximity_docids.snap
+++ /dev/null
@@ -1,8 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-1  a                y                [51, ]
-1  x                a                [51, ]
-1  x                y                [50, ]
-2  x                y                [51, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_prefix_pair_proximity_docids.snap
deleted file mode 100644
index 03530a2f1..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/batch_bug_3043/word_prefix_pair_proximity_docids.snap
+++ /dev/null
@@ -1,7 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-1  a                y    [51, ]
-1  x                y    [50, ]
-2  x                y    [51, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/documents_ids.snap
deleted file mode 100644
index 39e9fbe65..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/documents_ids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 51, ]
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap
deleted file mode 100644
index 61987fd4a..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap
+++ /dev/null
@@ -1,6 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-1  a    5                [51, ]
-2  a    am               [51, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_docids.snap
deleted file mode 100644
index 1caf1a9a3..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_docids.snap
+++ /dev/null
@@ -1,60 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-5                [51, ]
-a0               [0, ]
-a1               [1, ]
-a10              [16, ]
-a11              [17, ]
-a12              [18, ]
-a13              [19, ]
-a14              [20, ]
-a15              [21, ]
-a16              [22, ]
-a17              [23, ]
-a18              [24, ]
-a19              [25, ]
-a1a              [26, ]
-a1b              [27, ]
-a1c              [28, ]
-a1d              [29, ]
-a1e              [30, ]
-a1f              [31, ]
-a2               [2, ]
-a20              [32, ]
-a21              [33, ]
-a22              [34, ]
-a23              [35, ]
-a24              [36, ]
-a25              [37, ]
-a26              [38, ]
-a27              [39, ]
-a28              [40, ]
-a29              [41, ]
-a2a              [42, ]
-a2b              [43, ]
-a2c              [44, ]
-a2d              [45, ]
-a2e              [46, ]
-a2f              [47, ]
-a3               [3, ]
-a30              [48, ]
-a31              [49, ]
-a4               [4, ]
-a5               [5, ]
-a6               [6, ]
-a7               [7, ]
-a8               [8, ]
-a9               [9, ]
-aa               [10, ]
-ab               [11, ]
-ac               [12, ]
-ad               [13, ]
-ae               [14, ]
-af               [15, ]
-am               [51, ]
-at               [51, ]
-bell             [51, ]
-rings            [51, ]
-the              [51, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap
deleted file mode 100644
index 618a0b076..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap
+++ /dev/null
@@ -1,10 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-1  5                a    [51, ]
-1  rings            a    [51, ]
-2  at               a    [51, ]
-2  bell             a    [51, ]
-3  rings            a    [51, ]
-3  the              a    [51, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/documents_ids.snap
deleted file mode 100644
index 78008f83b..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/documents_ids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ]
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap
deleted file mode 100644
index b380ba9b5..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap
+++ /dev/null
@@ -1,14 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-1  a    5                [51, ]
-1  a    amazing          [50, ]
-1  a    an               [50, ]
-1  a    and              [50, ]
-1  a    beautiful        [50, ]
-2  a    am               [51, ]
-2  a    amazing          [50, ]
-2  a    and              [50, ]
-2  a    beautiful        [50, ]
-2  a    house            [50, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_docids.snap
deleted file mode 100644
index 6b5658b74..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_docids.snap
+++ /dev/null
@@ -1,65 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-5                [51, ]
-a0               [0, ]
-a1               [1, ]
-a10              [16, ]
-a11              [17, ]
-a12              [18, ]
-a13              [19, ]
-a14              [20, ]
-a15              [21, ]
-a16              [22, ]
-a17              [23, ]
-a18              [24, ]
-a19              [25, ]
-a1a              [26, ]
-a1b              [27, ]
-a1c              [28, ]
-a1d              [29, ]
-a1e              [30, ]
-a1f              [31, ]
-a2               [2, ]
-a20              [32, ]
-a21              [33, ]
-a22              [34, ]
-a23              [35, ]
-a24              [36, ]
-a25              [37, ]
-a26              [38, ]
-a27              [39, ]
-a28              [40, ]
-a29              [41, ]
-a2a              [42, ]
-a2b              [43, ]
-a2c              [44, ]
-a2d              [45, ]
-a2e              [46, ]
-a2f              [47, ]
-a3               [3, ]
-a30              [48, ]
-a31              [49, ]
-a4               [4, ]
-a5               [5, ]
-a6               [6, ]
-a7               [7, ]
-a8               [8, ]
-a9               [9, ]
-aa               [10, ]
-ab               [11, ]
-ac               [12, ]
-ad               [13, ]
-ae               [14, ]
-af               [15, ]
-am               [51, ]
-amazing          [50, ]
-an               [50, ]
-and              [50, ]
-at               [50, 51, ]
-beautiful        [50, ]
-bell             [51, ]
-house            [50, ]
-rings            [51, ]
-the              [51, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap
deleted file mode 100644
index 885985bdf..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap
+++ /dev/null
@@ -1,15 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-1  5                a    [51, ]
-1  amazing          a    [50, ]
-1  an               a    [50, ]
-1  at               a    [50, ]
-1  rings            a    [51, ]
-2  an               a    [50, ]
-2  at               a    [50, 51, ]
-2  bell             a    [51, ]
-3  at               a    [50, ]
-3  rings            a    [51, ]
-3  the              a    [51, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/documents_ids.snap
deleted file mode 100644
index 39e9fbe65..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/documents_ids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 51, ]
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap
deleted file mode 100644
index 267a1c01d..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap
+++ /dev/null
@@ -1,6 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-1  b    rings            [51, ]
-2  b    at               [51, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_docids.snap
deleted file mode 100644
index e5336d58c..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_docids.snap
+++ /dev/null
@@ -1,60 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-5                [51, ]
-am               [51, ]
-at               [51, ]
-b0               [0, ]
-b1               [1, ]
-b10              [16, ]
-b11              [17, ]
-b12              [18, ]
-b13              [19, ]
-b14              [20, ]
-b15              [21, ]
-b16              [22, ]
-b17              [23, ]
-b18              [24, ]
-b19              [25, ]
-b1a              [26, ]
-b1b              [27, ]
-b1c              [28, ]
-b1d              [29, ]
-b1e              [30, ]
-b1f              [31, ]
-b2               [2, ]
-b20              [32, ]
-b21              [33, ]
-b22              [34, ]
-b23              [35, ]
-b24              [36, ]
-b25              [37, ]
-b26              [38, ]
-b27              [39, ]
-b28              [40, ]
-b29              [41, ]
-b2a              [42, ]
-b2b              [43, ]
-b2c              [44, ]
-b2d              [45, ]
-b2e              [46, ]
-b2f              [47, ]
-b3               [3, ]
-b30              [48, ]
-b31              [49, ]
-b4               [4, ]
-b5               [5, ]
-b6               [6, ]
-b7               [7, ]
-b8               [8, ]
-b9               [9, ]
-ba               [10, ]
-bb               [11, ]
-bc               [12, ]
-bd               [13, ]
-be               [14, ]
-bell             [51, ]
-bf               [15, ]
-rings            [51, ]
-the              [51, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap
deleted file mode 100644
index 4cdf756ac..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap
+++ /dev/null
@@ -1,5 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-1  the              b    [51, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/documents_ids.snap
deleted file mode 100644
index 4dca775e6..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/documents_ids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-[51, ]
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap
deleted file mode 100644
index 61987fd4a..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap
+++ /dev/null
@@ -1,6 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-1  a    5                [51, ]
-2  a    am               [51, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_docids.snap
deleted file mode 100644
index 7949d464e..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_docids.snap
+++ /dev/null
@@ -1,10 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-5                [51, ]
-am               [51, ]
-at               [51, ]
-bell             [51, ]
-rings            [51, ]
-the              [51, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap
deleted file mode 100644
index 618a0b076..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/hard_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap
+++ /dev/null
@@ -1,10 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-1  5                a    [51, ]
-1  rings            a    [51, ]
-2  at               a    [51, ]
-2  bell             a    [51, ]
-3  rings            a    [51, ]
-3  the              a    [51, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/documents_ids.snap
deleted file mode 100644
index 78008f83b..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/documents_ids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ]
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/prefix_word_pair_proximity_docids.snap
deleted file mode 100644
index 78b6a3885..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/prefix_word_pair_proximity_docids.snap
+++ /dev/null
@@ -1,9 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-1  a    amazing          [50, ]
-1  a    an               [50, ]
-1  a    house            [50, ]
-2  a    amazing          [50, ]
-2  a    house            [50, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_docids.snap
deleted file mode 100644
index 8c7809973..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_docids.snap
+++ /dev/null
@@ -1,61 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-a0               [0, ]
-a1               [1, ]
-a10              [16, ]
-a11              [17, ]
-a12              [18, ]
-a13              [19, ]
-a14              [20, ]
-a15              [21, ]
-a16              [22, ]
-a17              [23, ]
-a18              [24, ]
-a19              [25, ]
-a1a              [26, ]
-a1b              [27, ]
-a1c              [28, ]
-a1d              [29, ]
-a1e              [30, ]
-a1f              [31, ]
-a2               [2, ]
-a20              [32, ]
-a21              [33, ]
-a22              [34, ]
-a23              [35, ]
-a24              [36, ]
-a25              [37, ]
-a26              [38, ]
-a27              [39, ]
-a28              [40, ]
-a29              [41, ]
-a2a              [42, ]
-a2b              [43, ]
-a2c              [44, ]
-a2d              [45, ]
-a2e              [46, ]
-a2f              [47, ]
-a3               [3, ]
-a30              [48, ]
-a31              [49, ]
-a4               [4, ]
-a5               [5, ]
-a6               [6, ]
-a7               [7, ]
-a8               [8, ]
-a9               [9, ]
-aa               [10, ]
-ab               [11, ]
-ac               [12, ]
-ad               [13, ]
-ae               [14, ]
-af               [15, ]
-amazing          [50, ]
-an               [50, ]
-at               [50, ]
-bell             [51, ]
-house            [50, ]
-rings            [51, ]
-the              [51, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_prefix_pair_proximity_docids.snap
deleted file mode 100644
index 65d8b806b..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/initial/word_prefix_pair_proximity_docids.snap
+++ /dev/null
@@ -1,7 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-1  an               a    [50, ]
-1  at               a    [50, ]
-2  at               a    [50, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/documents_ids.snap
deleted file mode 100644
index 775d41a3d..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/documents_ids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-[50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, ]
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/prefix_word_pair_proximity_docids.snap
deleted file mode 100644
index 54c9e4b9b..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/prefix_word_pair_proximity_docids.snap
+++ /dev/null
@@ -1,5 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-1  b    rings            [51, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_docids.snap
deleted file mode 100644
index f86fdcb8b..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_docids.snap
+++ /dev/null
@@ -1,61 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-amazing          [50, ]
-an               [50, ]
-at               [50, ]
-b0               [52, ]
-b1               [53, ]
-b10              [68, ]
-b11              [69, ]
-b12              [70, ]
-b13              [71, ]
-b14              [72, ]
-b15              [73, ]
-b16              [74, ]
-b17              [75, ]
-b18              [76, ]
-b19              [77, ]
-b1a              [78, ]
-b1b              [79, ]
-b1c              [80, ]
-b1d              [81, ]
-b1e              [82, ]
-b1f              [83, ]
-b2               [54, ]
-b20              [84, ]
-b21              [85, ]
-b22              [86, ]
-b23              [87, ]
-b24              [88, ]
-b25              [89, ]
-b26              [90, ]
-b27              [91, ]
-b28              [92, ]
-b29              [93, ]
-b2a              [94, ]
-b2b              [95, ]
-b2c              [96, ]
-b2d              [97, ]
-b2e              [98, ]
-b2f              [99, ]
-b3               [55, ]
-b30              [100, ]
-b31              [101, ]
-b4               [56, ]
-b5               [57, ]
-b6               [58, ]
-b7               [59, ]
-b8               [60, ]
-b9               [61, ]
-ba               [62, ]
-bb               [63, ]
-bc               [64, ]
-bd               [65, ]
-be               [66, ]
-bell             [51, ]
-bf               [67, ]
-house            [50, ]
-rings            [51, ]
-the              [51, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_prefix_pair_proximity_docids.snap
deleted file mode 100644
index 4cdf756ac..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_hard_deletion/replaced/word_prefix_pair_proximity_docids.snap
+++ /dev/null
@@ -1,5 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-1  the              b    [51, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/documents_ids.snap
deleted file mode 100644
index 78008f83b..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/documents_ids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ]
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/prefix_word_pair_proximity_docids.snap
deleted file mode 100644
index 78b6a3885..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/prefix_word_pair_proximity_docids.snap
+++ /dev/null
@@ -1,9 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-1  a    amazing          [50, ]
-1  a    an               [50, ]
-1  a    house            [50, ]
-2  a    amazing          [50, ]
-2  a    house            [50, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_docids.snap
deleted file mode 100644
index 8c7809973..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_docids.snap
+++ /dev/null
@@ -1,61 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-a0               [0, ]
-a1               [1, ]
-a10              [16, ]
-a11              [17, ]
-a12              [18, ]
-a13              [19, ]
-a14              [20, ]
-a15              [21, ]
-a16              [22, ]
-a17              [23, ]
-a18              [24, ]
-a19              [25, ]
-a1a              [26, ]
-a1b              [27, ]
-a1c              [28, ]
-a1d              [29, ]
-a1e              [30, ]
-a1f              [31, ]
-a2               [2, ]
-a20              [32, ]
-a21              [33, ]
-a22              [34, ]
-a23              [35, ]
-a24              [36, ]
-a25              [37, ]
-a26              [38, ]
-a27              [39, ]
-a28              [40, ]
-a29              [41, ]
-a2a              [42, ]
-a2b              [43, ]
-a2c              [44, ]
-a2d              [45, ]
-a2e              [46, ]
-a2f              [47, ]
-a3               [3, ]
-a30              [48, ]
-a31              [49, ]
-a4               [4, ]
-a5               [5, ]
-a6               [6, ]
-a7               [7, ]
-a8               [8, ]
-a9               [9, ]
-aa               [10, ]
-ab               [11, ]
-ac               [12, ]
-ad               [13, ]
-ae               [14, ]
-af               [15, ]
-amazing          [50, ]
-an               [50, ]
-at               [50, ]
-bell             [51, ]
-house            [50, ]
-rings            [51, ]
-the              [51, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_prefix_pair_proximity_docids.snap
deleted file mode 100644
index 65d8b806b..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/initial/word_prefix_pair_proximity_docids.snap
+++ /dev/null
@@ -1,7 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-1  an               a    [50, ]
-1  at               a    [50, ]
-2  at               a    [50, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/documents_ids.snap
deleted file mode 100644
index 775d41a3d..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/documents_ids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-[50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, ]
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/prefix_word_pair_proximity_docids.snap
deleted file mode 100644
index 0241f26a5..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/prefix_word_pair_proximity_docids.snap
+++ /dev/null
@@ -1,10 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-1  a    amazing          [50, ]
-1  a    an               [50, ]
-1  a    house            [50, ]
-1  b    rings            [51, ]
-2  a    amazing          [50, ]
-2  a    house            [50, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_docids.hash.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_docids.hash.snap
deleted file mode 100644
index 6a481eeee..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_docids.hash.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-5f6443e54fae188aa96d4f27fce28939
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_prefix_pair_proximity_docids.snap
deleted file mode 100644
index d20582970..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/replace_soft_deletion/replaced/word_prefix_pair_proximity_docids.snap
+++ /dev/null
@@ -1,8 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-1  an               a    [50, ]
-1  at               a    [50, ]
-1  the              b    [51, ]
-2  at               a    [50, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/documents_ids.snap
deleted file mode 100644
index 39e9fbe65..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/documents_ids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 51, ]
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap
deleted file mode 100644
index b380ba9b5..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/prefix_word_pair_proximity_docids.snap
+++ /dev/null
@@ -1,14 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-1  a    5                [51, ]
-1  a    amazing          [50, ]
-1  a    an               [50, ]
-1  a    and              [50, ]
-1  a    beautiful        [50, ]
-2  a    am               [51, ]
-2  a    amazing          [50, ]
-2  a    and              [50, ]
-2  a    beautiful        [50, ]
-2  a    house            [50, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_docids.snap
deleted file mode 100644
index 6b5658b74..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_docids.snap
+++ /dev/null
@@ -1,65 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-5                [51, ]
-a0               [0, ]
-a1               [1, ]
-a10              [16, ]
-a11              [17, ]
-a12              [18, ]
-a13              [19, ]
-a14              [20, ]
-a15              [21, ]
-a16              [22, ]
-a17              [23, ]
-a18              [24, ]
-a19              [25, ]
-a1a              [26, ]
-a1b              [27, ]
-a1c              [28, ]
-a1d              [29, ]
-a1e              [30, ]
-a1f              [31, ]
-a2               [2, ]
-a20              [32, ]
-a21              [33, ]
-a22              [34, ]
-a23              [35, ]
-a24              [36, ]
-a25              [37, ]
-a26              [38, ]
-a27              [39, ]
-a28              [40, ]
-a29              [41, ]
-a2a              [42, ]
-a2b              [43, ]
-a2c              [44, ]
-a2d              [45, ]
-a2e              [46, ]
-a2f              [47, ]
-a3               [3, ]
-a30              [48, ]
-a31              [49, ]
-a4               [4, ]
-a5               [5, ]
-a6               [6, ]
-a7               [7, ]
-a8               [8, ]
-a9               [9, ]
-aa               [10, ]
-ab               [11, ]
-ac               [12, ]
-ad               [13, ]
-ae               [14, ]
-af               [15, ]
-am               [51, ]
-amazing          [50, ]
-an               [50, ]
-and              [50, ]
-at               [50, 51, ]
-beautiful        [50, ]
-bell             [51, ]
-house            [50, ]
-rings            [51, ]
-the              [51, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap
deleted file mode 100644
index 885985bdf..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/first_delete/word_prefix_pair_proximity_docids.snap
+++ /dev/null
@@ -1,15 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-1  5                a    [51, ]
-1  amazing          a    [50, ]
-1  an               a    [50, ]
-1  at               a    [50, ]
-1  rings            a    [51, ]
-2  an               a    [50, ]
-2  at               a    [50, 51, ]
-2  bell             a    [51, ]
-3  at               a    [50, ]
-3  rings            a    [51, ]
-3  the              a    [51, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/documents_ids.snap
deleted file mode 100644
index 78008f83b..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/documents_ids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, ]
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap
deleted file mode 100644
index b380ba9b5..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/prefix_word_pair_proximity_docids.snap
+++ /dev/null
@@ -1,14 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-1  a    5                [51, ]
-1  a    amazing          [50, ]
-1  a    an               [50, ]
-1  a    and              [50, ]
-1  a    beautiful        [50, ]
-2  a    am               [51, ]
-2  a    amazing          [50, ]
-2  a    and              [50, ]
-2  a    beautiful        [50, ]
-2  a    house            [50, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_docids.snap
deleted file mode 100644
index 6b5658b74..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_docids.snap
+++ /dev/null
@@ -1,65 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-5                [51, ]
-a0               [0, ]
-a1               [1, ]
-a10              [16, ]
-a11              [17, ]
-a12              [18, ]
-a13              [19, ]
-a14              [20, ]
-a15              [21, ]
-a16              [22, ]
-a17              [23, ]
-a18              [24, ]
-a19              [25, ]
-a1a              [26, ]
-a1b              [27, ]
-a1c              [28, ]
-a1d              [29, ]
-a1e              [30, ]
-a1f              [31, ]
-a2               [2, ]
-a20              [32, ]
-a21              [33, ]
-a22              [34, ]
-a23              [35, ]
-a24              [36, ]
-a25              [37, ]
-a26              [38, ]
-a27              [39, ]
-a28              [40, ]
-a29              [41, ]
-a2a              [42, ]
-a2b              [43, ]
-a2c              [44, ]
-a2d              [45, ]
-a2e              [46, ]
-a2f              [47, ]
-a3               [3, ]
-a30              [48, ]
-a31              [49, ]
-a4               [4, ]
-a5               [5, ]
-a6               [6, ]
-a7               [7, ]
-a8               [8, ]
-a9               [9, ]
-aa               [10, ]
-ab               [11, ]
-ac               [12, ]
-ad               [13, ]
-ae               [14, ]
-af               [15, ]
-am               [51, ]
-amazing          [50, ]
-an               [50, ]
-and              [50, ]
-at               [50, 51, ]
-beautiful        [50, ]
-bell             [51, ]
-house            [50, ]
-rings            [51, ]
-the              [51, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap
deleted file mode 100644
index 885985bdf..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/initial/word_prefix_pair_proximity_docids.snap
+++ /dev/null
@@ -1,15 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-1  5                a    [51, ]
-1  amazing          a    [50, ]
-1  an               a    [50, ]
-1  at               a    [50, ]
-1  rings            a    [51, ]
-2  an               a    [50, ]
-2  at               a    [50, 51, ]
-2  bell             a    [51, ]
-3  at               a    [50, ]
-3  rings            a    [51, ]
-3  the              a    [51, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/documents_ids.snap
deleted file mode 100644
index c8a1e54b4..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/documents_ids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-[51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, ]
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap
deleted file mode 100644
index db62b6566..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/prefix_word_pair_proximity_docids.snap
+++ /dev/null
@@ -1,17 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-1  a    5                [51, ]
-1  a    amazing          [50, ]
-1  a    an               [50, ]
-1  a    and              [50, ]
-1  a    beautiful        [50, ]
-1  b    house            [50, ]
-1  b    rings            [51, ]
-2  a    am               [51, ]
-2  a    amazing          [50, ]
-2  a    and              [50, ]
-2  a    beautiful        [50, ]
-2  a    house            [50, ]
-2  b    at               [51, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_docids.hash.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_docids.hash.snap
deleted file mode 100644
index 7fd726325..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_docids.hash.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-9f4866b80177e321a33ce434992022b5
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap
deleted file mode 100644
index 2ea0d46f4..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/reupdate/word_prefix_pair_proximity_docids.snap
+++ /dev/null
@@ -1,19 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-1  5                a    [51, ]
-1  amazing          a    [50, ]
-1  an               a    [50, ]
-1  and              b    [50, ]
-1  at               a    [50, ]
-1  rings            a    [51, ]
-1  the              b    [51, ]
-2  amazing          b    [50, ]
-2  an               a    [50, ]
-2  at               a    [50, 51, ]
-2  bell             a    [51, ]
-3  an               b    [50, ]
-3  at               a    [50, ]
-3  rings            a    [51, ]
-3  the              a    [51, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/documents_ids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/documents_ids.snap
deleted file mode 100644
index 4dca775e6..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/documents_ids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-[51, ]
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap
deleted file mode 100644
index b380ba9b5..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/prefix_word_pair_proximity_docids.snap
+++ /dev/null
@@ -1,14 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-1  a    5                [51, ]
-1  a    amazing          [50, ]
-1  a    an               [50, ]
-1  a    and              [50, ]
-1  a    beautiful        [50, ]
-2  a    am               [51, ]
-2  a    amazing          [50, ]
-2  a    and              [50, ]
-2  a    beautiful        [50, ]
-2  a    house            [50, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_docids.snap
deleted file mode 100644
index 6b5658b74..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_docids.snap
+++ /dev/null
@@ -1,65 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-5                [51, ]
-a0               [0, ]
-a1               [1, ]
-a10              [16, ]
-a11              [17, ]
-a12              [18, ]
-a13              [19, ]
-a14              [20, ]
-a15              [21, ]
-a16              [22, ]
-a17              [23, ]
-a18              [24, ]
-a19              [25, ]
-a1a              [26, ]
-a1b              [27, ]
-a1c              [28, ]
-a1d              [29, ]
-a1e              [30, ]
-a1f              [31, ]
-a2               [2, ]
-a20              [32, ]
-a21              [33, ]
-a22              [34, ]
-a23              [35, ]
-a24              [36, ]
-a25              [37, ]
-a26              [38, ]
-a27              [39, ]
-a28              [40, ]
-a29              [41, ]
-a2a              [42, ]
-a2b              [43, ]
-a2c              [44, ]
-a2d              [45, ]
-a2e              [46, ]
-a2f              [47, ]
-a3               [3, ]
-a30              [48, ]
-a31              [49, ]
-a4               [4, ]
-a5               [5, ]
-a6               [6, ]
-a7               [7, ]
-a8               [8, ]
-a9               [9, ]
-aa               [10, ]
-ab               [11, ]
-ac               [12, ]
-ad               [13, ]
-ae               [14, ]
-af               [15, ]
-am               [51, ]
-amazing          [50, ]
-an               [50, ]
-and              [50, ]
-at               [50, 51, ]
-beautiful        [50, ]
-bell             [51, ]
-house            [50, ]
-rings            [51, ]
-the              [51, ]
-
diff --git a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap b/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap
deleted file mode 100644
index 885985bdf..000000000
--- a/milli/src/update/prefix_word_pairs/snapshots/mod.rs/soft_delete_and_reupdate/second_delete/word_prefix_pair_proximity_docids.snap
+++ /dev/null
@@ -1,15 +0,0 @@
----
-source: milli/src/update/prefix_word_pairs/mod.rs
----
-1  5                a    [51, ]
-1  amazing          a    [50, ]
-1  an               a    [50, ]
-1  at               a    [50, ]
-1  rings            a    [51, ]
-2  an               a    [50, ]
-2  at               a    [50, 51, ]
-2  bell             a    [51, ]
-3  at               a    [50, ]
-3  rings            a    [51, ]
-3  the              a    [51, ]
-
diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/documents_ids.snap
deleted file mode 100644
index e87bce206..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/documents_ids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-[]
diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/facet_id_exists_docids.snap
deleted file mode 100644
index 88031d24a..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/facet_id_exists_docids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-
diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/soft_deleted_documents_ids.snap
deleted file mode 100644
index e87bce206..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/soft_deleted_documents_ids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-[]
diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_docids.snap
deleted file mode 100644
index 88031d24a..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_docids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-
diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_pair_proximity_docids.snap
deleted file mode 100644
index 88031d24a..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_hard/word_pair_proximity_docids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-
diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/documents_ids.snap
deleted file mode 100644
index e87bce206..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/documents_ids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-[]
diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/facet_id_exists_docids.snap
deleted file mode 100644
index 88031d24a..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/facet_id_exists_docids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-
diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/soft_deleted_documents_ids.snap
deleted file mode 100644
index e87bce206..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/soft_deleted_documents_ids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-[]
diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_docids.snap
deleted file mode 100644
index 88031d24a..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_docids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-
diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_pair_proximity_docids.snap
deleted file mode 100644
index 88031d24a..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_numbers_as_primary_key/always_soft/word_pair_proximity_docids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-
diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/documents_ids.snap
deleted file mode 100644
index 6d69b2ffb..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/documents_ids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-[2, ]
diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/soft_deleted_documents_ids.snap
deleted file mode 100644
index e87bce206..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/soft_deleted_documents_ids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-[]
diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_docids.snap
deleted file mode 100644
index 88d3a98aa..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_docids.snap
+++ /dev/null
@@ -1,5 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-benoit           [2, ]
-
diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_pair_proximity_docids.snap
deleted file mode 100644
index 88031d24a..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_hard/word_pair_proximity_docids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-
diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/documents_ids.snap
deleted file mode 100644
index 6d69b2ffb..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/documents_ids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-[2, ]
diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/soft_deleted_documents_ids.snap
deleted file mode 100644
index 9139b7a05..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/soft_deleted_documents_ids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-[0, 1, ]
diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_docids.snap
deleted file mode 100644
index 15c881e87..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_docids.snap
+++ /dev/null
@@ -1,7 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-benoit           [2, ]
-kevin            [0, ]
-kevina           [1, ]
-
diff --git a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_pair_proximity_docids.snap
deleted file mode 100644
index 88031d24a..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/delete_documents_with_strange_primary_key/always_soft/word_pair_proximity_docids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-
diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_exists_docids.snap
deleted file mode 100644
index 7481b11c4..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_exists_docids.snap
+++ /dev/null
@@ -1,6 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-1   [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ]
-2   [21, ]
-
diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap
deleted file mode 100644
index 87856f6dc..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap
+++ /dev/null
@@ -1,5 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-2   0  2.2    1  [21, ]
-
diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap
deleted file mode 100644
index ab1d2175f..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap
+++ /dev/null
@@ -1,17 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-1   0  abstract     1  [2, 6, 10, 13, 14, 15, 16, 17, ]
-1   0  aquarium     1  [5, ]
-1   0  art          1  [4, 5, 8, 9, 10, 12, 17, ]
-1   0  cartoon      1  [2, 7, 15, 17, ]
-1   0  colorfulness 1  [13, ]
-1   0  design       1  [2, 18, ]
-1   0  drawing      1  [3, 4, 5, 8, 10, 11, 16, ]
-1   0  geometry     1  [19, ]
-1   0  letter       1  [1, ]
-1   0  outdoor      1  [4, ]
-1   0  painting     1  [3, ]
-1   0  pattern      1  [2, 3, 9, 10, 13, 14, 16, ]
-2   0  design       1  [21, ]
-
diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap
deleted file mode 100644
index e87bce206..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-[]
diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_docids.snap
deleted file mode 100644
index f8d64e001..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_docids.snap
+++ /dev/null
@@ -1,38 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-1                [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, ]
-2                [21, ]
-36               [3, ]
-37               [4, ]
-38               [5, ]
-39               [6, ]
-40               [7, ]
-41               [8, ]
-42               [9, ]
-43               [10, ]
-44               [11, ]
-45               [12, ]
-46               [13, ]
-47               [14, ]
-5                [1, ]
-52               [15, ]
-57               [16, ]
-58               [17, ]
-68               [18, ]
-69               [19, ]
-7                [2, ]
-71               [21, ]
-abstract         [2, 6, 10, 13, 14, 15, 16, 17, ]
-aquarium         [5, ]
-art              [4, 5, 8, 9, 10, 12, 17, ]
-cartoon          [2, 7, 15, 17, ]
-colorfulness     [13, ]
-design           [2, 18, 21, ]
-drawing          [3, 4, 5, 8, 10, 11, 16, ]
-geometry         [19, ]
-letter           [1, ]
-outdoor          [4, ]
-painting         [3, ]
-pattern          [2, 3, 9, 10, 13, 14, 16, ]
-
diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_pair_proximity_docids.snap
deleted file mode 100644
index 36add107b..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_hard/word_pair_proximity_docids.snap
+++ /dev/null
@@ -1,25 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-1  1                36               [3, ]
-1  1                37               [4, ]
-1  1                38               [5, ]
-1  1                39               [6, ]
-1  1                40               [7, ]
-1  1                41               [8, ]
-1  1                42               [9, ]
-1  1                43               [10, ]
-1  1                44               [11, ]
-1  1                45               [12, ]
-1  1                46               [13, ]
-1  1                47               [14, ]
-1  1                5                [1, ]
-1  1                52               [15, ]
-1  1                57               [16, ]
-1  1                58               [17, ]
-1  1                68               [18, ]
-1  1                69               [19, ]
-1  1                7                [2, ]
-1  1                71               [21, ]
-1  2                2                [21, ]
-
diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_exists_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_exists_docids.snap
deleted file mode 100644
index a7ee4348d..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_exists_docids.snap
+++ /dev/null
@@ -1,6 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-1   [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ]
-2   [20, 21, 22, ]
-
diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap
deleted file mode 100644
index cfa649653..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap
+++ /dev/null
@@ -1,6 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-2   0  1.2    1  [20, 22, ]
-2   0  2.2    1  [21, ]
-
diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap
deleted file mode 100644
index 8336bd712..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap
+++ /dev/null
@@ -1,19 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-1   0  abstract     1  [2, 6, 10, 13, 14, 15, 16, 17, ]
-1   0  aquarium     1  [5, ]
-1   0  art          1  [4, 5, 8, 9, 10, 12, 17, ]
-1   0  cartoon      1  [2, 7, 15, 17, ]
-1   0  colorfulness 1  [13, ]
-1   0  design       1  [2, 18, ]
-1   0  drawing      1  [3, 4, 5, 8, 10, 11, 16, ]
-1   0  geometry     1  [19, ]
-1   0  letter       1  [1, ]
-1   0  outdoor      1  [4, ]
-1   0  painting     1  [3, ]
-1   0  pattern      1  [2, 3, 9, 10, 13, 14, 16, ]
-1   0  sign         1  [0, ]
-2   0  design       1  [21, ]
-2   0  geometry     1  [20, 22, ]
-
diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap
deleted file mode 100644
index dfac98e59..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-[0, 20, 22, ]
diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_docids.snap
deleted file mode 100644
index 972a733e2..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_docids.snap
+++ /dev/null
@@ -1,42 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-1                [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, ]
-2                [20, 21, 22, ]
-36               [3, ]
-37               [4, ]
-38               [5, ]
-39               [6, ]
-4                [0, ]
-40               [7, ]
-41               [8, ]
-42               [9, ]
-43               [10, ]
-44               [11, ]
-45               [12, ]
-46               [13, ]
-47               [14, ]
-5                [1, ]
-52               [15, ]
-57               [16, ]
-58               [17, ]
-68               [18, ]
-69               [19, ]
-7                [2, ]
-70               [20, ]
-71               [21, ]
-72               [22, ]
-abstract         [2, 6, 10, 13, 14, 15, 16, 17, ]
-aquarium         [5, ]
-art              [4, 5, 8, 9, 10, 12, 17, ]
-cartoon          [2, 7, 15, 17, ]
-colorfulness     [13, ]
-design           [2, 18, 21, ]
-drawing          [3, 4, 5, 8, 10, 11, 16, ]
-geometry         [19, 20, 22, ]
-letter           [1, ]
-outdoor          [4, ]
-painting         [3, ]
-pattern          [2, 3, 9, 10, 13, 14, 16, ]
-sign             [0, ]
-
diff --git a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_pair_proximity_docids.snap b/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_pair_proximity_docids.snap
deleted file mode 100644
index 941838e34..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/filtered_placeholder_search_should_not_return_deleted_documents/always_soft/word_pair_proximity_docids.snap
+++ /dev/null
@@ -1,29 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-1  1                2                [20, 22, ]
-1  1                36               [3, ]
-1  1                37               [4, ]
-1  1                38               [5, ]
-1  1                39               [6, ]
-1  1                4                [0, ]
-1  1                40               [7, ]
-1  1                41               [8, ]
-1  1                42               [9, ]
-1  1                43               [10, ]
-1  1                44               [11, ]
-1  1                45               [12, ]
-1  1                46               [13, ]
-1  1                47               [14, ]
-1  1                5                [1, ]
-1  1                52               [15, ]
-1  1                57               [16, ]
-1  1                58               [17, ]
-1  1                68               [18, ]
-1  1                69               [19, ]
-1  1                7                [2, ]
-1  1                70               [20, ]
-1  1                71               [21, ]
-1  1                72               [22, ]
-1  2                2                [21, ]
-
diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap
deleted file mode 100644
index 18a9d9309..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_f64_docids.snap
+++ /dev/null
@@ -1,31 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-3   0  48.9021 1  [19, ]
-3   0  49.9314 1  [17, ]
-3   0  50.1793 1  [15, ]
-3   0  50.2844 1  [14, ]
-3   0  50.3518 1  [13, ]
-3   0  50.4502 1  [12, ]
-3   0  50.6053 1  [8, ]
-3   0  50.6224 1  [3, ]
-3   0  50.6299 1  [0, ]
-3   0  50.6312 1  [2, ]
-3   0  50.6415 1  [1, ]
-3   0  50.7453 1  [7, ]
-3   0  50.8466 1  [10, ]
-3   0  51.0537 1  [9, ]
-4   0  2.271  1  [17, ]
-4   0  2.3708 1  [19, ]
-4   0  2.7637 1  [14, ]
-4   0  3.0569 1  [0, ]
-4   0  3.1106 1  [1, 2, ]
-4   0  3.1476 1  [3, ]
-4   0  3.2189 1  [15, ]
-4   0  3.2206 1  [7, ]
-4   0  3.3758 1  [8, ]
-4   0  3.5326 1  [13, ]
-4   0  3.6957 1  [9, ]
-4   0  3.9623 1  [12, ]
-4   0  4.337  1  [10, ]
-
diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap
deleted file mode 100644
index 88031d24a..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/facet_id_string_docids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-
diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap
deleted file mode 100644
index e87bce206..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-[]
diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap
deleted file mode 100644
index c909a3cd8..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_f64_docids.snap
+++ /dev/null
@@ -1,53 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-3   0  48.9021 1  [19, ]
-3   0  49.4449 1  [18, ]
-3   0  49.9314 1  [17, ]
-3   0  50.1112 1  [16, ]
-3   0  50.1793 1  [15, ]
-3   0  50.2844 1  [14, ]
-3   0  50.3518 1  [13, ]
-3   0  50.4095 1  [11, ]
-3   0  50.4502 1  [12, ]
-3   0  50.6053 1  [8, ]
-3   0  50.6224 1  [3, ]
-3   0  50.6299 1  [0, ]
-3   0  50.6312 1  [2, ]
-3   0  50.6415 1  [1, ]
-3   0  50.6552 1  [4, ]
-3   0  50.6924 1  [5, ]
-3   0  50.7263 1  [6, ]
-3   0  50.7453 1  [7, ]
-3   0  50.8466 1  [10, ]
-3   0  51.0537 1  [9, ]
-3   1  48.9021 4  [16, 17, 18, 19, ]
-3   1  50.1793 4  [11, 13, 14, 15, ]
-3   1  50.4502 4  [0, 3, 8, 12, ]
-3   1  50.6312 4  [1, 2, 4, 5, ]
-3   1  50.7263 4  [6, 7, 9, 10, ]
-4   0  2.271  1  [17, ]
-4   0  2.3708 1  [19, ]
-4   0  2.7637 1  [14, ]
-4   0  2.7913 1  [18, ]
-4   0  2.8547 1  [16, ]
-4   0  3.0569 1  [0, ]
-4   0  3.1106 1  [1, 2, ]
-4   0  3.1476 1  [3, ]
-4   0  3.1541 1  [6, ]
-4   0  3.1763 1  [5, ]
-4   0  3.1897 1  [4, ]
-4   0  3.2189 1  [15, ]
-4   0  3.2206 1  [7, ]
-4   0  3.3758 1  [8, ]
-4   0  3.5326 1  [13, ]
-4   0  3.6957 1  [9, ]
-4   0  3.9623 1  [12, ]
-4   0  4.337  1  [10, ]
-4   0  4.4347 1  [11, ]
-4   1  2.271  4  [14, 17, 18, 19, ]
-4   1  2.8547 4  [0, 1, 2, 3, 16, ]
-4   1  3.1541 4  [4, 5, 6, 15, ]
-4   1  3.2206 4  [7, 8, 9, 13, ]
-4   1  3.9623 3  [10, 11, 12, ]
-
diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap
deleted file mode 100644
index 88031d24a..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/facet_id_string_docids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-
diff --git a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap
deleted file mode 100644
index 1260b12de..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/geo_filtered_placeholder_search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-[4, 5, 6, 11, 16, 18, ]
diff --git a/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap
deleted file mode 100644
index e87bce206..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-[]
diff --git a/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap
deleted file mode 100644
index efcd7af8c..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/get_documents_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-[2, 15, ]
diff --git a/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap
deleted file mode 100644
index e87bce206..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-[]
diff --git a/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap
deleted file mode 100644
index efcd7af8c..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/search_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-[2, 15, ]
diff --git a/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap
deleted file mode 100644
index e87bce206..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_hard/soft_deleted_documents_ids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-[]
diff --git a/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap b/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap
deleted file mode 100644
index efcd7af8c..000000000
--- a/milli/src/update/snapshots/delete_documents.rs/stats_should_not_return_deleted_documents/always_soft/soft_deleted_documents_ids.snap
+++ /dev/null
@@ -1,4 +0,0 @@
----
-source: milli/src/update/delete_documents.rs
----
-[2, 15, ]

From b40253bf18bbb1a2981124936028a5e9e4208d78 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Tue, 31 Oct 2023 10:15:34 +0100
Subject: [PATCH 080/127] update snapshots

---
 .../facet_id_exists_docids.snap               |  6 +++
 .../facet_id_f64_docids.snap                  |  5 +++
 .../facet_id_string_docids.snap               | 17 +++++++++
 .../word_docids.snap                          | 38 +++++++++++++++++++
 .../word_pair_proximity_docids.snap           | 25 ++++++++++++
 5 files changed, 91 insertions(+)
 create mode 100644 milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_exists_docids.snap
 create mode 100644 milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap
 create mode 100644 milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_string_docids.snap
 create mode 100644 milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_docids.snap
 create mode 100644 milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_pair_proximity_docids.snap

diff --git a/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_exists_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_exists_docids.snap
new file mode 100644
index 000000000..ed120bf02
--- /dev/null
+++ b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_exists_docids.snap
@@ -0,0 +1,6 @@
+---
+source: milli/src/update/index_documents/mod.rs
+---
+1   [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, ]
+2   [21, ]
+
diff --git a/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap
new file mode 100644
index 000000000..deeddff0d
--- /dev/null
+++ b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_f64_docids.snap
@@ -0,0 +1,5 @@
+---
+source: milli/src/update/index_documents/mod.rs
+---
+2   0  2.2    1  [21, ]
+
diff --git a/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_string_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_string_docids.snap
new file mode 100644
index 000000000..2d0b98623
--- /dev/null
+++ b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/facet_id_string_docids.snap
@@ -0,0 +1,17 @@
+---
+source: milli/src/update/index_documents/mod.rs
+---
+1   0  abstract     1  [2, 6, 10, 13, 14, 15, 16, 17, ]
+1   0  aquarium     1  [5, ]
+1   0  art          1  [4, 5, 8, 9, 10, 12, 17, ]
+1   0  cartoon      1  [2, 7, 15, 17, ]
+1   0  colorfulness 1  [13, ]
+1   0  design       1  [2, 18, ]
+1   0  drawing      1  [3, 4, 5, 8, 10, 11, 16, ]
+1   0  geometry     1  [19, ]
+1   0  letter       1  [1, ]
+1   0  outdoor      1  [4, ]
+1   0  painting     1  [3, ]
+1   0  pattern      1  [2, 3, 9, 10, 13, 14, 16, ]
+2   0  design       1  [21, ]
+
diff --git a/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_docids.snap
new file mode 100644
index 000000000..73503f098
--- /dev/null
+++ b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_docids.snap
@@ -0,0 +1,38 @@
+---
+source: milli/src/update/index_documents/mod.rs
+---
+1                [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, ]
+2                [21, ]
+36               [3, ]
+37               [4, ]
+38               [5, ]
+39               [6, ]
+40               [7, ]
+41               [8, ]
+42               [9, ]
+43               [10, ]
+44               [11, ]
+45               [12, ]
+46               [13, ]
+47               [14, ]
+5                [1, ]
+52               [15, ]
+57               [16, ]
+58               [17, ]
+68               [18, ]
+69               [19, ]
+7                [2, ]
+71               [21, ]
+abstract         [2, 6, 10, 13, 14, 15, 16, 17, ]
+aquarium         [5, ]
+art              [4, 5, 8, 9, 10, 12, 17, ]
+cartoon          [2, 7, 15, 17, ]
+colorfulness     [13, ]
+design           [2, 18, 21, ]
+drawing          [3, 4, 5, 8, 10, 11, 16, ]
+geometry         [19, ]
+letter           [1, ]
+outdoor          [4, ]
+painting         [3, ]
+pattern          [2, 3, 9, 10, 13, 14, 16, ]
+
diff --git a/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_pair_proximity_docids.snap b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_pair_proximity_docids.snap
new file mode 100644
index 000000000..022e9f5b1
--- /dev/null
+++ b/milli/src/update/index_documents/snapshots/mod.rs/filtered_placeholder_search_should_not_return_deleted_documents/word_pair_proximity_docids.snap
@@ -0,0 +1,25 @@
+---
+source: milli/src/update/index_documents/mod.rs
+---
+1  1                36               [3, ]
+1  1                37               [4, ]
+1  1                38               [5, ]
+1  1                39               [6, ]
+1  1                40               [7, ]
+1  1                41               [8, ]
+1  1                42               [9, ]
+1  1                43               [10, ]
+1  1                44               [11, ]
+1  1                45               [12, ]
+1  1                46               [13, ]
+1  1                47               [14, ]
+1  1                5                [1, ]
+1  1                52               [15, ]
+1  1                57               [16, ]
+1  1                58               [17, ]
+1  1                68               [18, ]
+1  1                69               [19, ]
+1  1                7                [2, ]
+1  1                71               [21, ]
+1  2                2                [21, ]
+

From 94206b00552523c104bebf720716bf53a3c6e12a Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Tue, 31 Oct 2023 13:48:47 +0100
Subject: [PATCH 081/127] Update tests

---
 milli/src/search/new/tests/proximity.rs                | 10 +++++-----
 ...__new__tests__proximity__proximity_prefix_db-8.snap |  8 ++++----
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/milli/src/search/new/tests/proximity.rs b/milli/src/search/new/tests/proximity.rs
index 4d340ae1c..217ebe9b3 100644
--- a/milli/src/search/new/tests/proximity.rs
+++ b/milli/src/search/new/tests/proximity.rs
@@ -423,20 +423,20 @@ fn test_proximity_prefix_db() {
     s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
     s.query("best win");
     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
-    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[15, 16, 17, 18, 19, 20, 21, 22]");
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 15, 16, 17, 20]");
     insta::assert_snapshot!(format!("{document_scores:#?}"));
     let texts = collect_field_values(&index, &txn, "text", &documents_ids);
 
     insta::assert_debug_snapshot!(texts, @r###"
     [
+        "\"this is the best winter meal\"",
+        "\"winter best\"",
+        "\"this is the best meal of winter\"",
+        "\"winter x best\"",
         "\"this is the best meal I have ever had in such a beautiful winter day\"",
         "\"this is the best cooked meal of the winter\"",
         "\"this is the best meal of the winter\"",
-        "\"this is the best meal of winter\"",
-        "\"this is the best winter meal\"",
         "\"winter x y best\"",
-        "\"winter x best\"",
-        "\"winter best\"",
     ]
     "###);
 
diff --git a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-8.snap b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-8.snap
index 5129f1b3b..8f3b964c1 100644
--- a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-8.snap
+++ b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-8.snap
@@ -6,7 +6,7 @@ expression: "format!(\"{document_scores:#?}\")"
     [
         Proximity(
             Rank {
-                rank: 1,
+                rank: 4,
                 max_rank: 4,
             },
         ),
@@ -14,7 +14,7 @@ expression: "format!(\"{document_scores:#?}\")"
     [
         Proximity(
             Rank {
-                rank: 1,
+                rank: 3,
                 max_rank: 4,
             },
         ),
@@ -22,7 +22,7 @@ expression: "format!(\"{document_scores:#?}\")"
     [
         Proximity(
             Rank {
-                rank: 1,
+                rank: 2,
                 max_rank: 4,
             },
         ),
@@ -30,7 +30,7 @@ expression: "format!(\"{document_scores:#?}\")"
     [
         Proximity(
             Rank {
-                rank: 1,
+                rank: 2,
                 max_rank: 4,
             },
         ),

From da0503ef80f57cff27eb521aa3089f4146eff2c3 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Tue, 31 Oct 2023 10:57:08 +0100
Subject: [PATCH 082/127] Fix document count

---
 milli/src/update/index_documents/mod.rs | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index ad2f63beb..0174fe319 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -371,12 +371,11 @@ where
                 let _ = lmdb_writer_sx.send(Err(e));
             }
 
-            // needs to be droped to avoid channel waiting lock.
+            // needs to be dropped to avoid channel waiting lock.
             drop(lmdb_writer_sx)
         });
 
-        let index_documents_ids = self.index.documents_ids(self.wtxn)?;
-        let index_is_empty = index_documents_ids.is_empty();
+        let index_is_empty = self.index.number_of_documents(self.wtxn)? == 0;
         let mut final_documents_ids = RoaringBitmap::new();
 
         let mut databases_seen = 0;
@@ -422,16 +421,6 @@ where
         // We write the primary key field id into the main database
         self.index.put_primary_key(self.wtxn, &primary_key)?;
 
-        // We write the external documents ids into the main database.
-        //let mut external_documents_ids = self.index.external_documents_ids(self.wtxn)?;
-        //external_documents_ids.insert_ids(&new_external_documents_ids)?;
-        //let external_documents_ids = external_documents_ids.into_static();
-        //self.index.put_external_documents_ids(self.wtxn, &external_documents_ids)?;
-
-        // FIXME: remove `new_documents_ids` entirely and `replaced_documents_ids`
-        let all_documents_ids = index_documents_ids | new_documents_ids;
-        //self.index.put_documents_ids(self.wtxn, &all_documents_ids)?;
-
         // TODO: reactivate prefix DB with diff-indexing
         // self.execute_prefix_databases(
         //     word_docids,
@@ -441,7 +430,7 @@ where
         //     word_fid_docids,
         // )?;
 
-        Ok(all_documents_ids.len())
+        self.index.number_of_documents(self.wtxn)
     }
 
     #[logging_timer::time("IndexDocuments::{}")]

From c855cc27215ca6a88ae41e010872b9ab337be4c1 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Tue, 31 Oct 2023 11:54:15 +0100
Subject: [PATCH 083/127] Remove unused test

---
 milli/src/index.rs | 258 ---------------------------------------------
 1 file changed, 258 deletions(-)

diff --git a/milli/src/index.rs b/milli/src/index.rs
index f7450a672..a52033fb6 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -1955,264 +1955,6 @@ pub(crate) mod tests {
         "###);
     }
 
-    #[test]
-    fn replace_documents_in_batches_external_ids_and_soft_deletion_check() {
-        use big_s::S;
-        use maplit::hashset;
-
-        let index = TempIndex::new();
-
-        index
-            .update_settings(|settings| {
-                settings.set_primary_key("id".to_owned());
-                settings.set_filterable_fields(hashset! { S("doggo") });
-            })
-            .unwrap();
-
-        let add_documents = |index: &TempIndex, docs: Vec<Vec<serde_json::Value>>| {
-            let mut wtxn = index.write_txn().unwrap();
-            let mut builder = IndexDocuments::new(
-                &mut wtxn,
-                index,
-                &index.indexer_config,
-                index.index_documents_config.clone(),
-                |_| (),
-                || false,
-            )
-            .unwrap();
-            for docs in docs {
-                (builder, _) = builder.add_documents(documents!(docs)).unwrap();
-            }
-            builder.execute().unwrap();
-            wtxn.commit().unwrap();
-        };
-        // First Batch
-        {
-            let mut docs1 = vec![];
-            for i in 0..4 {
-                docs1.push(serde_json::json!(
-                    { "id": i, "doggo": i }
-                ));
-            }
-            add_documents(&index, vec![docs1]);
-
-            db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]");
-            db_snap!(index, external_documents_ids, 1, @r###"
-            docids:
-            0                        0
-            1                        1
-            2                        2
-            3                        3
-            "###);
-            db_snap!(index, facet_id_f64_docids, 1, @r###"
-            1   0  0      1  [0, ]
-            1   0  1      1  [1, ]
-            1   0  2      1  [2, ]
-            1   0  3      1  [3, ]
-            "###);
-        }
-        // Second Batch: replace the documents with soft-deletion
-        {
-            let mut docs1 = vec![];
-            for i in 0..3 {
-                docs1.push(serde_json::json!(
-                    { "id": i, "doggo": i+1 }
-                ));
-            }
-            let mut docs2 = vec![];
-            for i in 0..3 {
-                docs2.push(serde_json::json!(
-                    { "id": i, "doggo": i }
-                ));
-            }
-            add_documents(&index, vec![docs1, docs2]);
-
-            db_snap!(index, documents_ids, @"[0, 1, 2, 3, ]");
-            db_snap!(index, external_documents_ids, 1, @r###"
-            docids:
-            0                        0
-            1                        1
-            2                        2
-            3                        3
-            "###);
-            db_snap!(index, facet_id_f64_docids, 1, @r###"
-            1   0  0      1  [0, ]
-            1   0  1      1  [1, ]
-            1   0  2      1  [2, ]
-            1   0  3      1  [3, ]
-            "###);
-        }
-        let rtxn = index.read_txn().unwrap();
-        let (_docid, obkv) = index.documents(&rtxn, [3]).unwrap()[0];
-        let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap();
-        insta::assert_debug_snapshot!(json, @r###"
-        {
-            "id": Number(3),
-            "doggo": Number(3),
-        }
-        "###);
-        let (_docid, obkv) = index.documents(&rtxn, [4]).unwrap()[0];
-
-        let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap();
-        insta::assert_debug_snapshot!(json, @r###"
-        {
-            "id": Number(0),
-            "doggo": Number(0),
-        }
-        "###);
-        let (_docid, obkv) = index.documents(&rtxn, [5]).unwrap()[0];
-        let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap();
-        insta::assert_debug_snapshot!(json, @r###"
-        {
-            "id": Number(1),
-            "doggo": Number(1),
-        }
-        "###);
-        let (_docid, obkv) = index.documents(&rtxn, [6]).unwrap()[0];
-        let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap();
-        insta::assert_debug_snapshot!(json, @r###"
-        {
-            "id": Number(2),
-            "doggo": Number(2),
-        }
-        "###);
-        drop(rtxn);
-        // Third Batch: replace the documents with soft-deletion again
-        {
-            let mut docs1 = vec![];
-            for i in 0..3 {
-                docs1.push(serde_json::json!(
-                    { "id": i, "doggo": i+1 }
-                ));
-            }
-            let mut docs2 = vec![];
-            for i in 0..4 {
-                docs2.push(serde_json::json!(
-                    { "id": i, "doggo": i }
-                ));
-            }
-            add_documents(&index, vec![docs1, docs2]);
-
-            db_snap!(index, documents_ids, @"[3, 7, 8, 9, ]");
-            db_snap!(index, external_documents_ids, 1, @r###"
-            soft:
-            hard:
-            0                        7
-            1                        8
-            2                        9
-            3                        3
-            "###);
-            db_snap!(index, facet_id_f64_docids, 1, @r###"
-            1   0  0      1  [0, 4, 7, ]
-            1   0  1      1  [1, 5, 8, ]
-            1   0  2      1  [2, 6, 9, ]
-            1   0  3      1  [3, ]
-            "###);
-        }
-        let rtxn = index.read_txn().unwrap();
-        let (_docid, obkv) = index.documents(&rtxn, [3]).unwrap()[0];
-        let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap();
-        insta::assert_debug_snapshot!(json, @r###"
-        {
-            "id": Number(3),
-            "doggo": Number(3),
-        }
-        "###);
-        let (_docid, obkv) = index.documents(&rtxn, [7]).unwrap()[0];
-        let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap();
-        insta::assert_debug_snapshot!(json, @r###"
-        {
-            "id": Number(0),
-            "doggo": Number(0),
-        }
-        "###);
-        let (_docid, obkv) = index.documents(&rtxn, [8]).unwrap()[0];
-        let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap();
-        insta::assert_debug_snapshot!(json, @r###"
-        {
-            "id": Number(1),
-            "doggo": Number(1),
-        }
-        "###);
-        let (_docid, obkv) = index.documents(&rtxn, [9]).unwrap()[0];
-        let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap();
-        insta::assert_debug_snapshot!(json, @r###"
-        {
-            "id": Number(2),
-            "doggo": Number(2),
-        }
-        "###);
-        drop(rtxn);
-
-        // Fourth Batch: replace the documents without soft-deletion
-        {
-            let mut docs1 = vec![];
-            for i in 0..3 {
-                docs1.push(serde_json::json!(
-                    { "id": i, "doggo": i+2 }
-                ));
-            }
-            let mut docs2 = vec![];
-            for i in 0..1 {
-                docs2.push(serde_json::json!(
-                    { "id": i, "doggo": i }
-                ));
-            }
-            add_documents(&index, vec![docs1, docs2]);
-
-            db_snap!(index, documents_ids, @"[3, 10, 11, 12, ]");
-            db_snap!(index, external_documents_ids, 1, @r###"
-            soft:
-            hard:
-            0                        10
-            1                        11
-            2                        12
-            3                        3
-            "###);
-
-            db_snap!(index, facet_id_f64_docids, 1, @r###"
-            1   0  0      1  [10, ]
-            1   0  3      1  [3, 11, ]
-            1   0  4      1  [12, ]
-            "###);
-
-            let rtxn = index.read_txn().unwrap();
-            let (_docid, obkv) = index.documents(&rtxn, [3]).unwrap()[0];
-            let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap();
-            insta::assert_debug_snapshot!(json, @r###"
-            {
-                "id": Number(3),
-                "doggo": Number(3),
-            }
-            "###);
-            let (_docid, obkv) = index.documents(&rtxn, [10]).unwrap()[0];
-            let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap();
-            insta::assert_debug_snapshot!(json, @r###"
-            {
-                "id": Number(0),
-                "doggo": Number(0),
-            }
-            "###);
-            let (_docid, obkv) = index.documents(&rtxn, [11]).unwrap()[0];
-            let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap();
-            insta::assert_debug_snapshot!(json, @r###"
-            {
-                "id": Number(1),
-                "doggo": Number(3),
-            }
-            "###);
-            let (_docid, obkv) = index.documents(&rtxn, [12]).unwrap()[0];
-            let json = obkv_to_json(&[0, 1], &index.fields_ids_map(&rtxn).unwrap(), obkv).unwrap();
-            insta::assert_debug_snapshot!(json, @r###"
-            {
-                "id": Number(2),
-                "doggo": Number(4),
-            }
-            "###);
-            drop(rtxn);
-        }
-    }
-
     #[test]
     fn bug_3021_first() {
         // https://github.com/meilisearch/meilisearch/issues/3021

From 03ddb4f3106466f9e4056835c0285604097927af Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Tue, 31 Oct 2023 11:54:22 +0100
Subject: [PATCH 084/127] use deladd in facet update tests

---
 milli/src/update/facet/mod.rs | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs
index 05e6a93d8..0839acf08 100644
--- a/milli/src/update/facet/mod.rs
+++ b/milli/src/update/facet/mod.rs
@@ -278,6 +278,7 @@ pub(crate) mod test_helpers {
     use crate::heed_codec::ByteSliceRefCodec;
     use crate::search::facet::get_highest_level;
     use crate::snapshot_tests::display_bitmap;
+    use crate::update::del_add::{DelAdd, KvWriterDelAdd};
     use crate::update::FacetsUpdateIncrementalInner;
     use crate::CboRoaringBitmapCodec;
 
@@ -454,8 +455,10 @@ pub(crate) mod test_helpers {
                 let key: FacetGroupKey<&[u8]> =
                     FacetGroupKey { field_id: *field_id, level: 0, left_bound: &left_bound_bytes };
                 let key = FacetGroupKeyCodec::<ByteSliceRefCodec>::bytes_encode(&key).unwrap();
+                let mut inner_writer = KvWriterDelAdd::memory();
                 let value = CboRoaringBitmapCodec::bytes_encode(docids).unwrap();
-                writer.insert(&key, &value).unwrap();
+                inner_writer.insert(DelAdd::Addition, value).unwrap();
+                writer.insert(&key, inner_writer.into_inner().unwrap()).unwrap();
             }
             writer.finish().unwrap();
             let reader = grenad::Reader::new(std::io::Cursor::new(new_data)).unwrap();

From f19332466eea45212bbefaa37e4736b5917511c4 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Tue, 31 Oct 2023 16:35:38 +0100
Subject: [PATCH 085/127] Extract field value as values instead of
 Option<Value>

---
 .../index_documents/extract/extract_fid_docid_facet_values.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
index 87320a675..2dce90cfc 100644
--- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
+++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
@@ -102,11 +102,11 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
 
                 let del_add_obkv = obkv::KvReader::new(field_bytes);
                 let del_value = match del_add_obkv.get(DelAdd::Deletion) {
-                    Some(bytes) => from_slice(bytes).map_err(InternalError::SerdeJson)?,
+                    Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?),
                     None => None,
                 };
                 let add_value = match del_add_obkv.get(DelAdd::Addition) {
-                    Some(bytes) => from_slice(bytes).map_err(InternalError::SerdeJson)?,
+                    Some(bytes) => Some(from_slice(bytes).map_err(InternalError::SerdeJson)?),
                     None => None,
                 };
 

From b1d1355b6983f099f770f6b1a453915fee88dcaf Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Tue, 31 Oct 2023 16:36:18 +0100
Subject: [PATCH 086/127] remove tests on soft-deleted

---
 milli/src/update/facet/mod.rs             |  85 --------------
 milli/src/update/index_documents/mod.rs   |   7 +-
 milli/src/update/prefix_word_pairs/mod.rs | 133 ----------------------
 3 files changed, 3 insertions(+), 222 deletions(-)

diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs
index 0839acf08..7358ceb6c 100644
--- a/milli/src/update/facet/mod.rs
+++ b/milli/src/update/facet/mod.rs
@@ -558,91 +558,6 @@ pub(crate) mod test_helpers {
     }
 }
 
-#[cfg(test)]
-mod tests {
-    use big_s::S;
-    use maplit::hashset;
-
-    use crate::db_snap;
-    use crate::documents::documents_batch_reader_from_objects;
-    use crate::index::tests::TempIndex;
-
-    #[test]
-    fn replace_all_identical_soft_deletion_then_hard_deletion() {
-        let index = TempIndex::new_with_map_size(4096 * 1000 * 100);
-
-        index
-            .update_settings(|settings| {
-                settings.set_primary_key("id".to_owned());
-                settings.set_filterable_fields(hashset! { S("size") });
-            })
-            .unwrap();
-
-        let mut documents = vec![];
-        for i in 0..1000 {
-            documents.push(
-                serde_json::json! {
-                    {
-                        "id": i,
-                        "size": i % 250,
-                    }
-                }
-                .as_object()
-                .unwrap()
-                .clone(),
-            );
-        }
-
-        let documents = documents_batch_reader_from_objects(documents);
-        index.add_documents(documents).unwrap();
-
-        db_snap!(index, facet_id_f64_docids, "initial", @"777e0e221d778764b472c512617eeb3b");
-
-        let mut documents = vec![];
-        for i in 0..999 {
-            documents.push(
-                serde_json::json! {
-                    {
-                        "id": i,
-                        "size": i % 250,
-                        "other": 0,
-                    }
-                }
-                .as_object()
-                .unwrap()
-                .clone(),
-            );
-        }
-
-        let documents = documents_batch_reader_from_objects(documents);
-        index.add_documents(documents).unwrap();
-
-        db_snap!(index, facet_id_f64_docids, "replaced_1_soft", @"abba175d7bed727d0efadaef85a4388f");
-
-        // Then replace the last document while disabling soft_deletion
-        let mut documents = vec![];
-        for i in 999..1000 {
-            documents.push(
-                serde_json::json! {
-                    {
-                        "id": i,
-                        "size": i % 250,
-                        "other": 0,
-                    }
-                }
-                .as_object()
-                .unwrap()
-                .clone(),
-            );
-        }
-
-        let documents = documents_batch_reader_from_objects(documents);
-        index.add_documents(documents).unwrap();
-
-        db_snap!(index, facet_id_f64_docids, "replaced_2_hard", @"029e27a46d09c574ae949aa4289b45e6");
-    }
-}
-
 #[allow(unused)]
 #[cfg(test)]
 mod comparison_bench {
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 0174fe319..c32f907b2 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -763,11 +763,10 @@ mod tests {
         assert_eq!(count, 1);
 
         // Check that we get only one document from the database.
-        // Since the document has been deleted and re-inserted, its internal docid has been incremented to 1
-        let docs = index.documents(&rtxn, Some(1)).unwrap();
+        let docs = index.documents(&rtxn, Some(0)).unwrap();
         assert_eq!(docs.len(), 1);
         let (id, doc) = docs[0];
-        assert_eq!(id, 1);
+        assert_eq!(id, 0);
 
         // Check that this document is equal to the last one sent.
         let mut doc_iter = doc.iter();
@@ -828,7 +827,7 @@ mod tests {
         assert_eq!(count, 3);
 
         // the document 0 has been deleted and reinserted with the id 3
-        let docs = index.documents(&rtxn, vec![1, 2, 3]).unwrap();
+        let docs = index.documents(&rtxn, vec![1, 2, 0]).unwrap();
         let kevin_position =
             docs.iter().position(|(_, d)| d.get(0).unwrap() == br#""updated kevin""#).unwrap();
         assert_eq!(kevin_position, 2);
diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs
index 1ec57e080..e718f9b77 100644
--- a/milli/src/update/prefix_word_pairs/mod.rs
+++ b/milli/src/update/prefix_word_pairs/mod.rs
@@ -357,139 +357,6 @@ mod tests {
         db_snap!(index, prefix_word_pair_proximity_docids, "reupdate");
     }
 
-    #[test]
-    fn soft_delete_and_reupdate() {
-        let mut index = TempIndex::new();
-        index.index_documents_config.words_prefix_threshold = Some(50);
-
-        index
-            .update_settings(|settings| {
-                settings.set_primary_key("id".to_owned());
-                settings.set_searchable_fields(vec!["text".to_owned()]);
-            })
-            .unwrap();
-
-        let batch_reader_from_documents = |documents| {
-            let mut builder = DocumentsBatchBuilder::new(Vec::new());
-            for object in documents {
-                builder.append_json_object(&object).unwrap();
-            }
-            DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
-        };
-
-        let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0);
-        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
-        documents.push(
-            serde_json::json!({
-                "id": 9000,
-                "text": "At an amazing and beautiful house"
-            })
-            .as_object()
-            .unwrap()
-            .clone(),
-        );
-        documents.push(
-            serde_json::json!({
-                "id": 9001,
-                "text": "The bell rings at 5 am"
-            })
-            .as_object()
-            .unwrap()
-            .clone(),
-        );
-
-        let documents = batch_reader_from_documents(documents);
-        index.add_documents(documents).unwrap();
-
-        db_snap!(index, documents_ids, "initial");
-        db_snap!(index, word_docids, "initial");
-        db_snap!(index, word_prefix_pair_proximity_docids, "initial");
-        db_snap!(index, prefix_word_pair_proximity_docids, "initial");
-
-        index.delete_document("9000");
-
-        db_snap!(index, documents_ids, "first_delete");
-        db_snap!(index, word_docids, "first_delete");
-        db_snap!(index, word_prefix_pair_proximity_docids, "first_delete");
-        db_snap!(index, prefix_word_pair_proximity_docids, "first_delete");
-
-        index.delete_documents((0..50).map(|id| id.to_string()).collect());
-
-        db_snap!(index, documents_ids, "second_delete");
-        db_snap!(index, word_docids, "second_delete");
-        db_snap!(index, word_prefix_pair_proximity_docids, "second_delete");
-        db_snap!(index, prefix_word_pair_proximity_docids, "second_delete");
-
-        let documents = documents_with_enough_different_words_for_prefixes(&["b"], 1000);
-        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
-
-        index.add_documents(batch_reader_from_documents(documents)).unwrap();
-
-        db_snap!(index, documents_ids, "reupdate");
-        db_snap!(index, word_docids, "reupdate");
-        db_snap!(index, word_prefix_pair_proximity_docids, "reupdate");
-        db_snap!(index, prefix_word_pair_proximity_docids, "reupdate");
-    }
-
-    #[test]
-    fn replace_soft_deletion() {
-        let mut index = TempIndex::new();
-        index.index_documents_config.words_prefix_threshold = Some(50);
-        index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments;
-
-        index
-            .update_settings(|settings| {
-                settings.set_primary_key("id".to_owned());
-                settings.set_searchable_fields(vec!["text".to_owned()]);
-            })
-            .unwrap();
-
-        let batch_reader_from_documents = |documents| {
-            let mut builder = DocumentsBatchBuilder::new(Vec::new());
-            for object in documents {
-                builder.append_json_object(&object).unwrap();
-            }
-            DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
-        };
-
-        let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0);
-        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
-        documents.push(
-            serde_json::json!({
-                "id": 9000,
-                "text": "At an amazing house"
-            })
-            .as_object()
-            .unwrap()
-            .clone(),
-        );
-        documents.push(
-            serde_json::json!({
-                "id": 9001,
-                "text": "The bell rings"
-            })
-            .as_object()
-            .unwrap()
-            .clone(),
-        );
-
-        let documents = batch_reader_from_documents(documents);
-        index.add_documents(documents).unwrap();
-
-        db_snap!(index, documents_ids, "initial");
-        db_snap!(index, word_docids, "initial");
-        db_snap!(index, word_prefix_pair_proximity_docids, "initial");
-        db_snap!(index, prefix_word_pair_proximity_docids, "initial");
-
-        let documents = documents_with_enough_different_words_for_prefixes(&["b"], 0);
-        index.add_documents(batch_reader_from_documents(documents)).unwrap();
-
-        db_snap!(index, documents_ids, "replaced");
-        db_snap!(index, word_docids, "replaced");
-        db_snap!(index, word_prefix_pair_proximity_docids, "replaced");
-        db_snap!(index, prefix_word_pair_proximity_docids, "replaced");
-    }
-
     #[test]
     fn replace_hard_deletion() {
         let mut index = TempIndex::new();

From 0fb6acefc3503c220796684a1e64fd550358d0eb Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Tue, 31 Oct 2023 17:11:08 +0100
Subject: [PATCH 087/127] Add snapshots for facets

---
 .../update/facet/snapshots/bulk.rs/insert/default.hash.snap   | 4 ++++
 .../bulk.rs/insert/large_group_small_min_level.hash.snap      | 4 ++++
 .../bulk.rs/insert/odd_group_odd_min_level.hash.snap          | 4 ++++
 .../bulk.rs/insert/small_group_large_min_level.hash.snap      | 4 ++++
 .../bulk.rs/insert/small_group_small_min_level.hash.snap      | 4 ++++
 .../bulk.rs/insert_delete_field_insert/default.hash.snap      | 4 ++++
 .../large_group_small_min_level.hash.snap                     | 4 ++++
 .../odd_group_odd_min_level.hash.snap                         | 4 ++++
 .../small_group_large_min_level.hash.snap                     | 4 ++++
 .../small_group_small_min_level.hash.snap                     | 4 ++++
 .../facet/snapshots/bulk.rs/insert_string/default.hash.snap   | 4 ++++
 .../insert_string/large_group_small_min_level.hash.snap       | 4 ++++
 .../bulk.rs/insert_string/odd_group_odd_min_level.hash.snap   | 4 ++++
 .../insert_string/small_group_large_min_level.hash.snap       | 4 ++++
 .../insert_string/small_group_small_min_level.hash.snap       | 4 ++++
 15 files changed, 60 insertions(+)
 create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert/default.hash.snap
 create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert/large_group_small_min_level.hash.snap
 create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert/odd_group_odd_min_level.hash.snap
 create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert/small_group_large_min_level.hash.snap
 create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert/small_group_small_min_level.hash.snap
 create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/default.hash.snap
 create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/large_group_small_min_level.hash.snap
 create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/odd_group_odd_min_level.hash.snap
 create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_large_min_level.hash.snap
 create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_small_min_level.hash.snap
 create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_string/default.hash.snap
 create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_string/large_group_small_min_level.hash.snap
 create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_string/odd_group_odd_min_level.hash.snap
 create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_large_min_level.hash.snap
 create mode 100644 milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_small_min_level.hash.snap

diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert/default.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/default.hash.snap
new file mode 100644
index 000000000..bef20823c
--- /dev/null
+++ b/milli/src/update/facet/snapshots/bulk.rs/insert/default.hash.snap
@@ -0,0 +1,4 @@
+---
+source: milli/src/update/facet/bulk.rs
+---
+b40dd31a65e033ffc6b35c027ce19506
diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert/large_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/large_group_small_min_level.hash.snap
new file mode 100644
index 000000000..74c40e6a3
--- /dev/null
+++ b/milli/src/update/facet/snapshots/bulk.rs/insert/large_group_small_min_level.hash.snap
@@ -0,0 +1,4 @@
+---
+source: milli/src/update/facet/bulk.rs
+---
+7ee22d8e9387e72758f00918eb67e4c6
diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert/odd_group_odd_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/odd_group_odd_min_level.hash.snap
new file mode 100644
index 000000000..6fb086d35
--- /dev/null
+++ b/milli/src/update/facet/snapshots/bulk.rs/insert/odd_group_odd_min_level.hash.snap
@@ -0,0 +1,4 @@
+---
+source: milli/src/update/facet/bulk.rs
+---
+60f567359382507afdaf45fb075740c3
diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_large_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_large_min_level.hash.snap
new file mode 100644
index 000000000..0271a6c6b
--- /dev/null
+++ b/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_large_min_level.hash.snap
@@ -0,0 +1,4 @@
+---
+source: milli/src/update/facet/bulk.rs
+---
+b986d6e6cbf425685f409a8b417010e1
diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_small_min_level.hash.snap
new file mode 100644
index 000000000..d801ef19f
--- /dev/null
+++ b/milli/src/update/facet/snapshots/bulk.rs/insert/small_group_small_min_level.hash.snap
@@ -0,0 +1,4 @@
+---
+source: milli/src/update/facet/bulk.rs
+---
+ee10dd2ae2b5c6621a89a5d0a9aa8ccc
diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/default.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/default.hash.snap
new file mode 100644
index 000000000..e9988f527
--- /dev/null
+++ b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/default.hash.snap
@@ -0,0 +1,4 @@
+---
+source: milli/src/update/facet/bulk.rs
+---
+fa877559eef78b383b496c15a364a2dc
diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/large_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/large_group_small_min_level.hash.snap
new file mode 100644
index 000000000..aa52901da
--- /dev/null
+++ b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/large_group_small_min_level.hash.snap
@@ -0,0 +1,4 @@
+---
+source: milli/src/update/facet/bulk.rs
+---
+16a96353bc42f2ff3e91611ca4d5b184
diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/odd_group_odd_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/odd_group_odd_min_level.hash.snap
new file mode 100644
index 000000000..64f5012a4
--- /dev/null
+++ b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/odd_group_odd_min_level.hash.snap
@@ -0,0 +1,4 @@
+---
+source: milli/src/update/facet/bulk.rs
+---
+be1b08073b9d9788d18080c1320151d7
diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_large_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_large_min_level.hash.snap
new file mode 100644
index 000000000..aa52901da
--- /dev/null
+++ b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_large_min_level.hash.snap
@@ -0,0 +1,4 @@
+---
+source: milli/src/update/facet/bulk.rs
+---
+16a96353bc42f2ff3e91611ca4d5b184
diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_small_min_level.hash.snap
new file mode 100644
index 000000000..bb0e9aa69
--- /dev/null
+++ b/milli/src/update/facet/snapshots/bulk.rs/insert_delete_field_insert/small_group_small_min_level.hash.snap
@@ -0,0 +1,4 @@
+---
+source: milli/src/update/facet/bulk.rs
+---
+32a45d555df2e001420fea149818d376
diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_string/default.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_string/default.hash.snap
new file mode 100644
index 000000000..b7705b72e
--- /dev/null
+++ b/milli/src/update/facet/snapshots/bulk.rs/insert_string/default.hash.snap
@@ -0,0 +1,4 @@
+---
+source: milli/src/update/facet/bulk.rs
+---
+353d70f52eea66e5031dca989ea8a037
diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_string/large_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_string/large_group_small_min_level.hash.snap
new file mode 100644
index 000000000..15030a1ea
--- /dev/null
+++ b/milli/src/update/facet/snapshots/bulk.rs/insert_string/large_group_small_min_level.hash.snap
@@ -0,0 +1,4 @@
+---
+source: milli/src/update/facet/bulk.rs
+---
+52a093c909133d84023a4a7b83864808
diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_string/odd_group_odd_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_string/odd_group_odd_min_level.hash.snap
new file mode 100644
index 000000000..949ec6647
--- /dev/null
+++ b/milli/src/update/facet/snapshots/bulk.rs/insert_string/odd_group_odd_min_level.hash.snap
@@ -0,0 +1,4 @@
+---
+source: milli/src/update/facet/bulk.rs
+---
+9d86c72ddb241d0aeca2995d61a3648a
diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_large_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_large_min_level.hash.snap
new file mode 100644
index 000000000..d8797f1ab
--- /dev/null
+++ b/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_large_min_level.hash.snap
@@ -0,0 +1,4 @@
+---
+source: milli/src/update/facet/bulk.rs
+---
+c0943177594534bfe5527cbf40fe388e
diff --git a/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_small_min_level.hash.snap b/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_small_min_level.hash.snap
new file mode 100644
index 000000000..f7949c5f3
--- /dev/null
+++ b/milli/src/update/facet/snapshots/bulk.rs/insert_string/small_group_small_min_level.hash.snap
@@ -0,0 +1,4 @@
+---
+source: milli/src/update/facet/bulk.rs
+---
+6ed86f234028ae3df5881bee5512f11e

From 0fc446c62f07ce4e5802a2affc39abdcd6a0ef1a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Wed, 1 Nov 2023 10:07:03 +0100
Subject: [PATCH 088/127] Add more timing logs to the Transform

---
 milli/src/update/index_documents/transform.rs | 130 ++++++------------
 1 file changed, 44 insertions(+), 86 deletions(-)

diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index 840bade2e..23b5c78c1 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -150,6 +150,7 @@ impl<'a, 'i> Transform<'a, 'i> {
         })
     }
 
+    #[logging_timer::time]
     pub fn read_documents<R, FP, FA>(
         &mut self,
         reader: EnrichedDocumentsBatchReader<R>,
@@ -162,6 +163,8 @@ impl<'a, 'i> Transform<'a, 'i> {
         FP: Fn(UpdateIndexingStep) + Sync,
         FA: Fn() -> bool + Sync,
     {
+        puffin::profile_function!();
+
         let (mut cursor, fields_index) = reader.into_cursor_and_fields_index();
         let external_documents_ids = self.index.external_documents_ids();
         let mapping = create_fields_mapping(&mut self.fields_ids_map, &fields_index)?;
@@ -212,13 +215,12 @@ impl<'a, 'i> Transform<'a, 'i> {
             field_buffer_cache.sort_unstable_by(|(f1, _), (f2, _)| f1.cmp(f2));
 
             // Build the new obkv document.
-            let mut writer = obkv::KvWriter::new(&mut obkv_buffer);
+            let mut writer = KvWriter::new(&mut obkv_buffer);
             for (k, v) in field_buffer_cache.iter() {
                 writer.insert(*k, v)?;
             }
 
             let mut original_docid = None;
-
             let docid = match self.new_external_documents_ids_builder.entry((*external_id).into()) {
                 HEntry::Occupied(entry) => *entry.get() as u32,
                 HEntry::Vacant(entry) => {
@@ -275,24 +277,19 @@ impl<'a, 'i> Transform<'a, 'i> {
                         &mut document_sorter_buffer,
                     )?;
                     self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
-                    match self.flatten_from_fields_ids_map(KvReader::new(base_obkv))? {
-                        Some(flattened_obkv) => {
-                            // we recreate our buffer with the flattened documents
-                            document_sorter_buffer.clear();
-                            document_sorter_buffer.push(Operation::Addition as u8);
-                            into_del_add_obkv(
-                                KvReaderU16::new(&flattened_obkv),
-                                true,
-                                keep_original_version,
-                                &mut document_sorter_buffer,
-                            )?;
-                            self.flattened_sorter
-                                .insert(docid.to_be_bytes(), &document_sorter_buffer)?
-                        }
-                        None => self
-                            .flattened_sorter
-                            .insert(docid.to_be_bytes(), &document_sorter_buffer)?,
+                    let base_obkv = KvReader::new(base_obkv);
+                    if let Some(flattened_obkv) = self.flatten_from_fields_ids_map(base_obkv)? {
+                        // we recreate our buffer with the flattened documents
+                        document_sorter_buffer.clear();
+                        document_sorter_buffer.push(Operation::Addition as u8);
+                        into_del_add_obkv(
+                            KvReaderU16::new(&flattened_obkv),
+                            true,
+                            keep_original_version,
+                            &mut document_sorter_buffer,
+                        )?;
                     }
+                    self.flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
                 }
             }
 
@@ -310,23 +307,18 @@ impl<'a, 'i> Transform<'a, 'i> {
                 // We use the extracted/generated user id as the key for this document.
                 self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
 
-                match self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? {
-                    Some(flattened_obkv) => {
-                        document_sorter_buffer.clear();
-                        document_sorter_buffer.push(Operation::Addition as u8);
-                        into_del_add_obkv(
-                            KvReaderU16::new(&flattened_obkv),
-                            false,
-                            true,
-                            &mut document_sorter_buffer,
-                        )?;
-                        self.flattened_sorter
-                            .insert(docid.to_be_bytes(), &document_sorter_buffer)?
-                    }
-                    None => self
-                        .flattened_sorter
-                        .insert(docid.to_be_bytes(), &document_sorter_buffer)?,
+                let flattened_obkv = KvReader::new(&obkv_buffer);
+                if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? {
+                    document_sorter_buffer.clear();
+                    document_sorter_buffer.push(Operation::Addition as u8);
+                    into_del_add_obkv(
+                        KvReaderU16::new(&obkv),
+                        false,
+                        true,
+                        &mut document_sorter_buffer,
+                    )?
                 }
+                self.flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
             }
             documents_count += 1;
 
@@ -361,6 +353,7 @@ impl<'a, 'i> Transform<'a, 'i> {
     /// - If the document to remove was inserted by the `read_documents` method before but was NOT present in the db,
     ///   it's added into the grenad to ensure we don't insert it + removed from the list of new documents ids.
     /// - If the document to remove was not present in either the db or the transform we do nothing.
+    #[logging_timer::time]
     pub fn remove_documents<FA>(
         &mut self,
         mut to_remove: Vec<String>,
@@ -370,6 +363,8 @@ impl<'a, 'i> Transform<'a, 'i> {
     where
         FA: Fn() -> bool + Sync,
     {
+        puffin::profile_function!();
+
         // there may be duplicates in the documents to remove.
         to_remove.sort_unstable();
         to_remove.dedup();
@@ -439,24 +434,19 @@ impl<'a, 'i> Transform<'a, 'i> {
                     self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
 
                     // flatten it and push it as to delete in the flattened_sorter
-                    match self.flatten_from_fields_ids_map(KvReader::new(base_obkv))? {
-                        Some(flattened_obkv) => {
-                            // we recreate our buffer with the flattened documents
-                            document_sorter_buffer.clear();
-                            document_sorter_buffer.push(Operation::Deletion as u8);
-                            into_del_add_obkv(
-                                KvReaderU16::new(&flattened_obkv),
-                                true,
-                                false,
-                                &mut document_sorter_buffer,
-                            )?;
-                            self.flattened_sorter
-                                .insert(docid.to_be_bytes(), &document_sorter_buffer)?
-                        }
-                        None => self
-                            .flattened_sorter
-                            .insert(docid.to_be_bytes(), &document_sorter_buffer)?,
+                    let flattened_obkv = KvReader::new(base_obkv);
+                    if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? {
+                        // we recreate our buffer with the flattened documents
+                        document_sorter_buffer.clear();
+                        document_sorter_buffer.push(Operation::Deletion as u8);
+                        into_del_add_obkv(
+                            KvReaderU16::new(&obkv),
+                            true,
+                            false,
+                            &mut document_sorter_buffer,
+                        )?;
                     }
+                    self.flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
 
                     true
                 }
@@ -591,42 +581,10 @@ impl<'a, 'i> Transform<'a, 'i> {
         Ok(())
     }
 
-    fn remove_deleted_documents_from_field_distribution(
-        &self,
-        rtxn: &RoTxn,
-        field_distribution: &mut FieldDistribution,
-    ) -> Result<()> {
-        for deleted_docid in self.replaced_documents_ids.iter() {
-            let obkv = self.index.documents.get(rtxn, &BEU32::new(deleted_docid))?.ok_or(
-                InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None },
-            )?;
-
-            for (key, _) in obkv.iter() {
-                let name =
-                    self.fields_ids_map.name(key).ok_or(FieldIdMapMissingEntry::FieldId {
-                        field_id: key,
-                        process: "Computing field distribution in transform.",
-                    })?;
-                // We checked that the document was in the db earlier. If we can't find it it means
-                // there is an inconsistency between the field distribution and the field id map.
-                let field =
-                    field_distribution.get_mut(name).ok_or(FieldIdMapMissingEntry::FieldId {
-                        field_id: key,
-                        process: "Accessing field distribution in transform.",
-                    })?;
-                *field -= 1;
-                if *field == 0 {
-                    // since we were able to get the field right before it's safe to unwrap here
-                    field_distribution.remove(name).unwrap();
-                }
-            }
-        }
-        Ok(())
-    }
-
     /// Generate the `TransformOutput` based on the given sorter that can be generated from any
     /// format like CSV, JSON or JSON stream. This sorter must contain a key that is the document
     /// id for the user side and the value must be an obkv where keys are valid fields ids.
+    #[logging_timer::time]
     pub(crate) fn output_from_sorter<F>(
         self,
         wtxn: &mut heed::RwTxn,
@@ -816,7 +774,7 @@ impl<'a, 'i> Transform<'a, 'i> {
             let (docid, obkv) = result?;
 
             obkv_buffer.clear();
-            let mut obkv_writer = obkv::KvWriter::<_, FieldId>::new(&mut obkv_buffer);
+            let mut obkv_writer = KvWriter::<_, FieldId>::new(&mut obkv_buffer);
 
             // We iterate over the new `FieldsIdsMap` ids in order and construct the new obkv.
             for (id, name) in new_fields_ids_map.iter() {

From c71b1d33ae5de96ae013e4695b13bc16263b4c3a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Wed, 1 Nov 2023 10:39:16 +0100
Subject: [PATCH 089/127] Sort entries using rayon in the transform sorters

---
 Cargo.lock                                    |  5 +-
 milli/Cargo.toml                              |  3 +-
 milli/src/update/index_documents/transform.rs | 51 +++++++++++++------
 3 files changed, 40 insertions(+), 19 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 2ab2f706a..957dffbe4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1664,11 +1664,12 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
 [[package]]
 name = "grenad"
 version = "0.4.4"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5232b2d157b7bf63d7abe1b12177039e58db2f29e377517c0cdee1578cca4c93"
+source = "git+https://github.com/meilisearch/grenad?branch=parallel-sorter#eafb6ae795af6078e087edf77e7cd31a26238707"
 dependencies = [
  "bytemuck",
  "byteorder",
+ "crossbeam-channel",
+ "rayon",
  "tempfile",
 ]
 
diff --git a/milli/Cargo.toml b/milli/Cargo.toml
index 68bc2d2b5..da259c65d 100644
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -26,7 +26,8 @@ flatten-serde-json = { path = "../flatten-serde-json" }
 fst = "0.4.7"
 fxhash = "0.2.1"
 geoutils = "0.5.1"
-grenad = { version = "0.4.4", default-features = false, features = [
+grenad = { git = "https://github.com/meilisearch/grenad", branch = "parallel-sorter", default-features = false, features = [
+    "rayon",
     "tempfile",
 ] }
 heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.7", default-features = false, features = [
diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index 23b5c78c1..8d1750c49 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -114,24 +114,43 @@ impl<'a, 'i> Transform<'a, 'i> {
         };
 
         // We initialize the sorter with the user indexing settings.
-        let original_sorter = create_sorter(
-            grenad::SortAlgorithm::Stable,
-            merge_function,
-            indexer_settings.chunk_compression_type,
-            indexer_settings.chunk_compression_level,
-            indexer_settings.max_nb_chunks,
-            indexer_settings.max_memory.map(|mem| mem / 2),
-        );
+        let original_sorter = {
+            let mut builder = grenad::Sorter::builder(merge_function);
+            builder.chunk_compression_type(indexer_settings.chunk_compression_type);
+            if let Some(level) = indexer_settings.chunk_compression_level {
+                builder.chunk_compression_level(level);
+            }
+            if let Some(nb_chunks) = indexer_settings.max_nb_chunks {
+                builder.max_nb_chunks(nb_chunks);
+            }
+            if let Some(memory) = indexer_settings.max_memory.map(|mem| mem / 2) {
+                builder.dump_threshold(memory);
+                builder.allow_realloc(false);
+            }
+            builder.sort_algorithm(grenad::SortAlgorithm::Stable);
+            builder.sort_in_parallel(true);
+            builder.build()
+        };
 
         // We initialize the sorter with the user indexing settings.
-        let flattened_sorter = create_sorter(
-            grenad::SortAlgorithm::Stable,
-            merge_function,
-            indexer_settings.chunk_compression_type,
-            indexer_settings.chunk_compression_level,
-            indexer_settings.max_nb_chunks,
-            indexer_settings.max_memory.map(|mem| mem / 2),
-        );
+        let flattened_sorter = {
+            let mut builder = grenad::Sorter::builder(merge_function);
+            builder.chunk_compression_type(indexer_settings.chunk_compression_type);
+            if let Some(level) = indexer_settings.chunk_compression_level {
+                builder.chunk_compression_level(level);
+            }
+            if let Some(nb_chunks) = indexer_settings.max_nb_chunks {
+                builder.max_nb_chunks(nb_chunks);
+            }
+            if let Some(memory) = indexer_settings.max_memory.map(|mem| mem / 2) {
+                builder.dump_threshold(memory);
+                builder.allow_realloc(false);
+            }
+            builder.sort_algorithm(grenad::SortAlgorithm::Stable);
+            builder.sort_in_parallel(true);
+            builder.build()
+        };
+
         let documents_ids = index.documents_ids(wtxn)?;
 
         Ok(Transform {

From e507ef593267795b4a88fde05477e58fb6948724 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Wed, 1 Nov 2023 11:06:58 +0100
Subject: [PATCH 090/127] Slow the logging down

---
 index-scheduler/src/batch.rs | 8 ++++----
 meilisearch/src/lib.rs       | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs
index c273d8ebb..ebdba0a8c 100644
--- a/index-scheduler/src/batch.rs
+++ b/index-scheduler/src/batch.rs
@@ -24,7 +24,7 @@ use std::fs::{self, File};
 use std::io::BufWriter;
 
 use dump::IndexMetadata;
-use log::{debug, error, info};
+use log::{debug, error, info, trace};
 use meilisearch_types::error::Code;
 use meilisearch_types::heed::{RoTxn, RwTxn};
 use meilisearch_types::milli::documents::{obkv_to_object, DocumentsBatchReader};
@@ -1190,7 +1190,7 @@ impl IndexScheduler {
                     index,
                     indexer_config,
                     config,
-                    |indexing_step| debug!("update: {:?}", indexing_step),
+                    |indexing_step| trace!("update: {:?}", indexing_step),
                     || must_stop_processing.get(),
                 )?;
 
@@ -1268,7 +1268,7 @@ impl IndexScheduler {
                         milli::update::Settings::new(index_wtxn, index, indexer_config);
                     builder.reset_primary_key();
                     builder.execute(
-                        |indexing_step| debug!("update: {:?}", indexing_step),
+                        |indexing_step| trace!("update: {:?}", indexing_step),
                         || must_stop_processing.clone().get(),
                     )?;
                 }
@@ -1288,7 +1288,7 @@ impl IndexScheduler {
                     index,
                     indexer_config,
                     config,
-                    |indexing_step| debug!("update: {:?}", indexing_step),
+                    |indexing_step| trace!("update: {:?}", indexing_step),
                     || must_stop_processing.get(),
                 )?;
 
diff --git a/meilisearch/src/lib.rs b/meilisearch/src/lib.rs
index 603d8ff86..16c08c6c2 100644
--- a/meilisearch/src/lib.rs
+++ b/meilisearch/src/lib.rs
@@ -362,7 +362,7 @@ fn import_dump(
                 update_method: IndexDocumentsMethod::ReplaceDocuments,
                 ..Default::default()
             },
-            |indexing_step| log::debug!("update: {:?}", indexing_step),
+            |indexing_step| log::trace!("update: {:?}", indexing_step),
             || false,
         )?;
 

From b10c060bf7e5c99d4789096d0cf15d4aa9e4fa24 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Wed, 1 Nov 2023 13:55:18 +0100
Subject: [PATCH 091/127] Cleanup TOML

---
 Cargo.lock       | 6 +++---
 milli/Cargo.toml | 5 ++---
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 957dffbe4..91fdc13be 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1663,12 +1663,12 @@ checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
 
 [[package]]
 name = "grenad"
-version = "0.4.4"
-source = "git+https://github.com/meilisearch/grenad?branch=parallel-sorter#eafb6ae795af6078e087edf77e7cd31a26238707"
+version = "0.4.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6a007932af5475ebb5c63bef8812bb1c36f317983bb4ca663e9d6dd58d6a0f8c"
 dependencies = [
  "bytemuck",
  "byteorder",
- "crossbeam-channel",
  "rayon",
  "tempfile",
 ]
diff --git a/milli/Cargo.toml b/milli/Cargo.toml
index da259c65d..9cef4795b 100644
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -26,9 +26,8 @@ flatten-serde-json = { path = "../flatten-serde-json" }
 fst = "0.4.7"
 fxhash = "0.2.1"
 geoutils = "0.5.1"
-grenad = { git = "https://github.com/meilisearch/grenad", branch = "parallel-sorter", default-features = false, features = [
-    "rayon",
-    "tempfile",
+grenad = { version = "0.4.5", default-features = false, features = [
+    "rayon", "tempfile"
 ] }
 heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.7", default-features = false, features = [
     "lmdb", "read-txn-no-tls"

From 4d864f0702578e6540207c1472992fab06d63b15 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Thu, 2 Nov 2023 14:47:43 +0100
Subject: [PATCH 092/127] Always sort internal Sorter entries in parallel

---
 .../index_documents/helpers/grenad_helpers.rs |  1 +
 milli/src/update/index_documents/transform.rs | 51 ++++++-------------
 2 files changed, 17 insertions(+), 35 deletions(-)

diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs
index cc0ccb609..03a3d6f5f 100644
--- a/milli/src/update/index_documents/helpers/grenad_helpers.rs
+++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs
@@ -47,6 +47,7 @@ pub fn create_sorter(
         builder.allow_realloc(false);
     }
     builder.sort_algorithm(sort_algorithm);
+    builder.sort_in_parallel(true);
     builder.build()
 }
 
diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index 8d1750c49..23b5c78c1 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -114,43 +114,24 @@ impl<'a, 'i> Transform<'a, 'i> {
         };
 
         // We initialize the sorter with the user indexing settings.
-        let original_sorter = {
-            let mut builder = grenad::Sorter::builder(merge_function);
-            builder.chunk_compression_type(indexer_settings.chunk_compression_type);
-            if let Some(level) = indexer_settings.chunk_compression_level {
-                builder.chunk_compression_level(level);
-            }
-            if let Some(nb_chunks) = indexer_settings.max_nb_chunks {
-                builder.max_nb_chunks(nb_chunks);
-            }
-            if let Some(memory) = indexer_settings.max_memory.map(|mem| mem / 2) {
-                builder.dump_threshold(memory);
-                builder.allow_realloc(false);
-            }
-            builder.sort_algorithm(grenad::SortAlgorithm::Stable);
-            builder.sort_in_parallel(true);
-            builder.build()
-        };
+        let original_sorter = create_sorter(
+            grenad::SortAlgorithm::Stable,
+            merge_function,
+            indexer_settings.chunk_compression_type,
+            indexer_settings.chunk_compression_level,
+            indexer_settings.max_nb_chunks,
+            indexer_settings.max_memory.map(|mem| mem / 2),
+        );
 
         // We initialize the sorter with the user indexing settings.
-        let flattened_sorter = {
-            let mut builder = grenad::Sorter::builder(merge_function);
-            builder.chunk_compression_type(indexer_settings.chunk_compression_type);
-            if let Some(level) = indexer_settings.chunk_compression_level {
-                builder.chunk_compression_level(level);
-            }
-            if let Some(nb_chunks) = indexer_settings.max_nb_chunks {
-                builder.max_nb_chunks(nb_chunks);
-            }
-            if let Some(memory) = indexer_settings.max_memory.map(|mem| mem / 2) {
-                builder.dump_threshold(memory);
-                builder.allow_realloc(false);
-            }
-            builder.sort_algorithm(grenad::SortAlgorithm::Stable);
-            builder.sort_in_parallel(true);
-            builder.build()
-        };
-
+        let flattened_sorter = create_sorter(
+            grenad::SortAlgorithm::Stable,
+            merge_function,
+            indexer_settings.chunk_compression_type,
+            indexer_settings.chunk_compression_level,
+            indexer_settings.max_nb_chunks,
+            indexer_settings.max_memory.map(|mem| mem / 2),
+        );
         let documents_ids = index.documents_ids(wtxn)?;
 
         Ok(Transform {

From 12323d610e33b1f0dcdaa97ddc90c5b59b599417 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Tue, 31 Oct 2023 16:46:16 +0100
Subject: [PATCH 093/127] Change the original document sorter key from the
 internal docid to a concatenation of the internal and the external docid

---
 milli/src/update/index_documents/mod.rs       |   2 +
 milli/src/update/index_documents/transform.rs | 116 ++++++++++--------
 2 files changed, 69 insertions(+), 49 deletions(-)

diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index c32f907b2..129b67cf0 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -1387,6 +1387,8 @@ mod tests {
         index.add_documents(documents!({ "a" : { "b" : { "c" :  1 }}})).unwrap();
 
         let rtxn = index.read_txn().unwrap();
+        let all_documents_count = index.all_documents(&rtxn).unwrap().count();
+        assert_eq!(all_documents_count, 1);
         let external_documents_ids = index.external_documents_ids();
         assert!(external_documents_ids.get(&rtxn, "1").unwrap().is_some());
     }
diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index 23b5c78c1..3863d5a54 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -174,7 +174,8 @@ impl<'a, 'i> Transform<'a, 'i> {
             self.fields_ids_map.insert(&primary_key).ok_or(UserError::AttributeLimitReached)?;
 
         let mut obkv_buffer = Vec::new();
-        let mut document_sorter_buffer = Vec::new();
+        let mut document_sorter_value_buffer = Vec::new();
+        let mut document_sorter_key_buffer = Vec::new();
         let mut documents_count = 0;
         let mut docid_buffer: Vec<u8> = Vec::new();
         let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new();
@@ -268,57 +269,64 @@ impl<'a, 'i> Transform<'a, 'i> {
                     // we associate the base document with the new key, everything will get merged later.
                     let keep_original_version =
                         self.index_documents_method == IndexDocumentsMethod::UpdateDocuments;
-                    document_sorter_buffer.clear();
-                    document_sorter_buffer.push(Operation::Addition as u8);
+                    document_sorter_key_buffer.clear();
+                    document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes());
+                    document_sorter_key_buffer.extend_from_slice(external_id.as_bytes());
+                    document_sorter_value_buffer.clear();
+                    document_sorter_value_buffer.push(Operation::Addition as u8);
                     into_del_add_obkv(
                         KvReaderU16::new(base_obkv),
                         true,
                         keep_original_version,
-                        &mut document_sorter_buffer,
+                        &mut document_sorter_value_buffer,
                     )?;
-                    self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
+                    self.original_sorter.insert(&document_sorter_key_buffer, &document_sorter_buffer)?;
                     let base_obkv = KvReader::new(base_obkv);
                     if let Some(flattened_obkv) = self.flatten_from_fields_ids_map(base_obkv)? {
                         // we recreate our buffer with the flattened documents
-                        document_sorter_buffer.clear();
-                        document_sorter_buffer.push(Operation::Addition as u8);
+                        document_sorter_value_buffer.clear();
+                        document_sorter_value_buffer.push(Operation::Addition as u8);
                         into_del_add_obkv(
                             KvReaderU16::new(&flattened_obkv),
                             true,
                             keep_original_version,
-                            &mut document_sorter_buffer,
+                            &mut document_sorter_value_buffer,
                         )?;
                     }
-                    self.flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
+                    self.flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?;
                 }
             }
 
             if !skip_insertion {
                 self.new_documents_ids.insert(docid);
 
-                document_sorter_buffer.clear();
-                document_sorter_buffer.push(Operation::Addition as u8);
+                document_sorter_key_buffer.clear();
+                document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes());
+                document_sorter_key_buffer.extend_from_slice(external_id.as_bytes());
+                document_sorter_value_buffer.clear();
+                document_sorter_value_buffer.push(Operation::Addition as u8);
                 into_del_add_obkv(
                     KvReaderU16::new(&obkv_buffer),
                     false,
                     true,
-                    &mut document_sorter_buffer,
+                    &mut document_sorter_value_buffer,
                 )?;
                 // We use the extracted/generated user id as the key for this document.
-                self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
+                self.original_sorter
+                    .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
 
                 let flattened_obkv = KvReader::new(&obkv_buffer);
                 if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? {
-                    document_sorter_buffer.clear();
-                    document_sorter_buffer.push(Operation::Addition as u8);
+                    document_sorter_value_buffer.clear();
+                    document_sorter_value_buffer.push(Operation::Addition as u8);
                     into_del_add_obkv(
                         KvReaderU16::new(&obkv),
                         false,
                         true,
-                        &mut document_sorter_buffer,
+                        &mut document_sorter_value_buffer,
                     )?
                 }
-                self.flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
+                self.flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?;
             }
             documents_count += 1;
 
@@ -372,37 +380,42 @@ impl<'a, 'i> Transform<'a, 'i> {
         let external_documents_ids = self.index.external_documents_ids();
 
         let mut documents_deleted = 0;
-        let mut document_sorter_buffer = Vec::new();
+        let mut document_sorter_value_buffer = Vec::new();
+        let mut document_sorter_key_buffer = Vec::new();
         for to_remove in to_remove {
             if should_abort() {
                 return Err(Error::InternalError(InternalError::AbortedIndexation));
             }
 
             // Check if the document has been added in the current indexing process.
-            let deleted_from_current = match self
-                .new_external_documents_ids_builder
-                .entry((*to_remove).into())
-            {
-                // if the document was added in a previous iteration of the transform we make it as deleted in the sorters.
-                HEntry::Occupied(entry) => {
-                    let doc_id = *entry.get() as u32;
-                    document_sorter_buffer.clear();
-                    document_sorter_buffer.push(Operation::Deletion as u8);
-                    obkv::KvWriterU16::new(&mut document_sorter_buffer).finish().unwrap();
-                    self.original_sorter.insert(doc_id.to_be_bytes(), &document_sorter_buffer)?;
-                    self.flattened_sorter.insert(doc_id.to_be_bytes(), &document_sorter_buffer)?;
+            let deleted_from_current =
+                match self.new_external_documents_ids_builder.entry((*to_remove).into()) {
+                    // if the document was added in a previous iteration of the transform we make it as deleted in the sorters.
+                    HEntry::Occupied(entry) => {
+                        let docid = *entry.get() as u32;
+                        // Key is the concatenation of the internal docid and the external one.
+                        document_sorter_key_buffer.clear();
+                        document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes());
+                        document_sorter_key_buffer.extend_from_slice(to_remove.as_bytes());
+                        document_sorter_value_buffer.clear();
+                        document_sorter_value_buffer.push(Operation::Deletion as u8);
+                        obkv::KvWriterU16::new(&mut document_sorter_value_buffer).finish().unwrap();
+                        self.original_sorter
+                            .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
+                        self.flattened_sorter
+                            .insert(docid.to_be_bytes(), &document_sorter_value_buffer)?;
 
-                    // we must NOT update the list of replaced_documents_ids
-                    // Either:
-                    // 1. It's already in it and there is nothing to do
-                    // 2. It wasn't in it because the document was created by a previous batch and since
-                    //    we're removing it there is nothing to do.
-                    self.new_documents_ids.remove(doc_id);
-                    entry.remove_entry();
-                    true
-                }
-                HEntry::Vacant(_) => false,
-            };
+                        // we must NOT update the list of replaced_documents_ids
+                        // Either:
+                        // 1. It's already in it and there is nothing to do
+                        // 2. It wasn't in it because the document was created by a previous batch and since
+                        //    we're removing it there is nothing to do.
+                        self.new_documents_ids.remove(docid);
+                        entry.remove_entry();
+                        true
+                    }
+                    HEntry::Vacant(_) => false,
+                };
 
             // If the document was already in the db we mark it as a `to_delete` document.
             // Then we push the document in sorters in deletion mode.
@@ -422,31 +435,36 @@ impl<'a, 'i> Transform<'a, 'i> {
                             key: None,
                         })?;
 
+                    // Key is the concatenation of the internal docid and the external one.
+                    document_sorter_key_buffer.clear();
+                    document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes());
+                    document_sorter_key_buffer.extend_from_slice(to_remove.as_bytes());
                     // push it as to delete in the original_sorter
-                    document_sorter_buffer.clear();
-                    document_sorter_buffer.push(Operation::Deletion as u8);
+                    document_sorter_value_buffer.clear();
+                    document_sorter_value_buffer.push(Operation::Deletion as u8);
                     into_del_add_obkv(
                         KvReaderU16::new(base_obkv),
                         true,
                         false,
-                        &mut document_sorter_buffer,
+                        &mut document_sorter_value_buffer,
                     )?;
-                    self.original_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
+                    self.original_sorter
+                        .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
 
                     // flatten it and push it as to delete in the flattened_sorter
                     let flattened_obkv = KvReader::new(base_obkv);
                     if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? {
                         // we recreate our buffer with the flattened documents
-                        document_sorter_buffer.clear();
-                        document_sorter_buffer.push(Operation::Deletion as u8);
+                        document_sorter_value_buffer.clear();
+                        document_sorter_value_buffer.push(Operation::Deletion as u8);
                         into_del_add_obkv(
                             KvReaderU16::new(&obkv),
                             true,
                             false,
-                            &mut document_sorter_buffer,
+                            &mut document_sorter_value_buffer,
                         )?;
                     }
-                    self.flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
+                    self.flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?;
 
                     true
                 }

From 4b64c33aa2525a8fc79e7a318ec2566c867e5f66 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Tue, 31 Oct 2023 17:44:42 +0100
Subject: [PATCH 094/127] update vector extractor

---
 .../extract/extract_vector_points.rs              | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs
index 863bc07c3..9aed862ab 100644
--- a/milli/src/update/index_documents/extract/extract_vector_points.rs
+++ b/milli/src/update/index_documents/extract/extract_vector_points.rs
@@ -7,7 +7,8 @@ use serde_json::{from_slice, Value};
 
 use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
 use crate::error::UserError;
-use crate::{FieldId, InternalError, Result, VectorOrArrayOfVectors};
+use crate::update::index_documents::helpers::try_split_at;
+use crate::{DocumentId, FieldId, InternalError, Result, VectorOrArrayOfVectors};
 
 /// Extracts the embedding vector contained in each document under the `_vectors` field.
 ///
@@ -28,15 +29,17 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
     );
 
     let mut cursor = obkv_documents.into_cursor()?;
-    while let Some((docid_bytes, value)) = cursor.move_on_next()? {
+    while let Some((key, value)) = cursor.move_on_next()? {
+        // this must always be serialized as (docid, external_docid);
+        let (docid_bytes, external_id_bytes) =
+            try_split_at(key, std::mem::size_of::<DocumentId>()).unwrap();
+        debug_assert!(std::str::from_utf8(external_id_bytes).is_ok());
+
         let obkv = obkv::KvReader::new(value);
 
         // since we only needs the primary key when we throw an error we create this getter to
         // lazily get it when needed
-        let document_id = || -> Value {
-            let document_id = obkv.get(primary_key_id).unwrap();
-            from_slice(document_id).unwrap()
-        };
+        let document_id = || -> Value { std::str::from_utf8(external_id_bytes).unwrap().into() };
 
         // first we retrieve the _vectors field
         if let Some(vectors) = obkv.get(vectors_fid) {

From 1b4ff991c03ec56455a8d18f72e657c63ccd07a1 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Tue, 31 Oct 2023 17:44:57 +0100
Subject: [PATCH 095/127] update typed chunks

---
 milli/src/external_documents_ids.rs           |  4 --
 .../extract/extract_vector_points.rs          |  1 -
 .../src/update/index_documents/extract/mod.rs |  9 +--
 .../src/update/index_documents/typed_chunk.rs | 62 +++++++------------
 4 files changed, 22 insertions(+), 54 deletions(-)

diff --git a/milli/src/external_documents_ids.rs b/milli/src/external_documents_ids.rs
index 1bf08396a..ee8d29ffc 100644
--- a/milli/src/external_documents_ids.rs
+++ b/milli/src/external_documents_ids.rs
@@ -74,10 +74,6 @@ impl ExternalDocumentsIds {
         for DocumentOperation { external_id, internal_id, kind } in operations {
             match kind {
                 DocumentOperationKind::Create => {
-                    // TODO should we get before insert to be able to detect bugs?
-                    // if matches!(kind, DocumentOperationKind::Create) {
-                    //     panic!("Attempting to create an already-existing document");
-                    // }
                     self.0.put(wtxn, &external_id, &BEU32::new(internal_id))?;
                 }
                 DocumentOperationKind::Delete => {
diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs
index 9aed862ab..1f5edeeeb 100644
--- a/milli/src/update/index_documents/extract/extract_vector_points.rs
+++ b/milli/src/update/index_documents/extract/extract_vector_points.rs
@@ -17,7 +17,6 @@ use crate::{DocumentId, FieldId, InternalError, Result, VectorOrArrayOfVectors};
 pub fn extract_vector_points<R: io::Read + io::Seek>(
     obkv_documents: grenad::Reader<R>,
     indexer: GrenadParameters,
-    primary_key_id: FieldId,
     vectors_fid: FieldId,
 ) -> Result<grenad::Reader<BufReader<File>>> {
     puffin::profile_function!();
diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
index 41722a53e..ee8713ee8 100644
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -63,7 +63,6 @@ pub(crate) fn data_from_obkv_documents(
                 indexer,
                 lmdb_writer_sx.clone(),
                 vectors_field_id,
-                primary_key_id,
             )
         })
         .collect::<Result<()>>()?;
@@ -274,7 +273,6 @@ fn send_original_documents_data(
     indexer: GrenadParameters,
     lmdb_writer_sx: Sender<Result<TypedChunk>>,
     vectors_field_id: Option<FieldId>,
-    primary_key_id: FieldId,
 ) -> Result<()> {
     let original_documents_chunk =
         original_documents_chunk.and_then(|c| unsafe { as_cloneable_grenad(&c) })?;
@@ -283,12 +281,7 @@ fn send_original_documents_data(
         let documents_chunk_cloned = original_documents_chunk.clone();
         let lmdb_writer_sx_cloned = lmdb_writer_sx.clone();
         rayon::spawn(move || {
-            let result = extract_vector_points(
-                documents_chunk_cloned,
-                indexer,
-                primary_key_id,
-                vectors_field_id,
-            );
+            let result = extract_vector_points(documents_chunk_cloned, indexer, vectors_field_id);
             let _ = match result {
                 Ok(vector_points) => {
                     lmdb_writer_sx_cloned.send(Ok(TypedChunk::VectorPoints(vector_points)))
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index 1b38be03b..7c3f587d2 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -17,6 +17,7 @@ use crate::distance::NDotProductPoint;
 use crate::error::UserError;
 use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind};
 use crate::facet::FacetType;
+use crate::index::db_name::DOCUMENTS;
 use crate::index::Hnsw;
 use crate::update::del_add::{DelAdd, KvReaderDelAdd};
 use crate::update::facet::FacetsUpdate;
@@ -24,7 +25,7 @@ use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_arr
 use crate::update::index_documents::validate_document_id_value;
 use crate::{
     lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, InternalError,
-    Result, BEU32,
+    Result, SerializationError, BEU32,
 };
 
 pub(crate) enum TypedChunk {
@@ -124,13 +125,15 @@ pub(crate) fn write_typed_chunk_into_index(
             let mut operations: Vec<DocumentOperation> = Default::default();
 
             let mut docids = index.documents_ids(wtxn)?;
-            let primary_key = index.primary_key(wtxn)?.unwrap();
-            let primary_key = index.fields_ids_map(wtxn)?.id(primary_key).unwrap();
             let mut cursor = obkv_documents_iter.into_cursor()?;
-            while let Some((docid, reader)) = cursor.move_on_next()? {
+            while let Some((key, reader)) = cursor.move_on_next()? {
                 let mut writer: KvWriter<_, FieldId> = KvWriter::memory();
                 let reader: KvReader<FieldId> = KvReader::new(reader);
-                let docid = docid.try_into().map(DocumentId::from_be_bytes).unwrap();
+
+                let (document_id_bytes, external_id_bytes) = try_split_array_at(key)
+                    .ok_or(SerializationError::Decoding { db_name: Some(DOCUMENTS) })?;
+                let docid = DocumentId::from_be_bytes(document_id_bytes);
+                let external_id = std::str::from_utf8(external_id_bytes)?;
 
                 for (field_id, value) in reader.iter() {
                     let del_add_reader = KvReaderDelAdd::new(value);
@@ -140,45 +143,10 @@ pub(crate) fn write_typed_chunk_into_index(
                     ) {
                         (None, None) => {}
                         (None, Some(value)) => {
-                            // if primary key, new document
-                            if field_id == primary_key {
-                                // FIXME: we already extracted the external docid before. We should retrieve it in the typed chunk
-                                // rather than re-extract it here
-                                // FIXME: unwraps
-                                let document_id = serde_json::from_slice(value)
-                                    .map_err(InternalError::SerdeJson)
-                                    .unwrap();
-                                let external_id =
-                                    validate_document_id_value(document_id).unwrap().unwrap();
-                                operations.push(DocumentOperation {
-                                    external_id,
-                                    internal_id: docid,
-                                    kind: DocumentOperationKind::Create,
-                                });
-                                docids.insert(docid);
-                            }
                             // anyway, write
                             writer.insert(field_id, value)?;
                         }
-                        (Some(value), None) => {
-                            // if primary key, deleted document
-                            if field_id == primary_key {
-                                // FIXME: we already extracted the external docid before. We should retrieve it in the typed chunk
-                                // rather than re-extract it here
-                                // FIXME: unwraps
-                                let document_id = serde_json::from_slice(value)
-                                    .map_err(InternalError::SerdeJson)
-                                    .unwrap();
-                                let external_id =
-                                    validate_document_id_value(document_id).unwrap().unwrap();
-                                operations.push(DocumentOperation {
-                                    external_id,
-                                    internal_id: docid,
-                                    kind: DocumentOperationKind::Delete,
-                                });
-                                docids.remove(docid);
-                            }
-                        }
+                        (Some(_), None) => {}
                         (Some(_), Some(value)) => {
                             // updated field, write
                             writer.insert(field_id, value)?;
@@ -190,8 +158,20 @@ pub(crate) fn write_typed_chunk_into_index(
 
                 if !writer.is_empty() {
                     db.put(wtxn, &BEU32::new(docid), &writer.into_inner().unwrap())?;
+                    operations.push(DocumentOperation {
+                        external_id: external_id.to_string(),
+                        internal_id: docid,
+                        kind: DocumentOperationKind::Create,
+                    });
+                    docids.insert(docid);
                 } else {
                     db.delete(wtxn, &BEU32::new(docid))?;
+                    operations.push(DocumentOperation {
+                        external_id: external_id.to_string(),
+                        internal_id: docid,
+                        kind: DocumentOperationKind::Delete,
+                    });
+                    docids.remove(docid);
                 }
             }
             let external_documents_docids = index.external_documents_ids();

From bc51d6157adceee09f529789e66f66f9d08d8fac Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Thu, 2 Nov 2023 13:37:54 +0100
Subject: [PATCH 096/127] Fix transform reindexing path

---
 milli/src/external_documents_ids.rs           |  2 +-
 milli/src/update/index_documents/transform.rs | 82 +++++++++++++------
 2 files changed, 56 insertions(+), 28 deletions(-)

diff --git a/milli/src/external_documents_ids.rs b/milli/src/external_documents_ids.rs
index ee8d29ffc..a002fc064 100644
--- a/milli/src/external_documents_ids.rs
+++ b/milli/src/external_documents_ids.rs
@@ -18,7 +18,7 @@ pub struct DocumentOperation {
     pub kind: DocumentOperationKind,
 }
 
-pub struct ExternalDocumentsIds(Database<Str, OwnedType<BEU32>>);
+pub struct ExternalDocumentsIds(pub Database<Str, OwnedType<BEU32>>);
 
 impl ExternalDocumentsIds {
     pub fn new(db: Database<Str, OwnedType<BEU32>>) -> ExternalDocumentsIds {
diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index 3863d5a54..82cf55d42 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -14,14 +14,15 @@ use serde_json::Value;
 use smartstring::SmartString;
 
 use super::helpers::{
-    create_sorter, create_writer, obkvs_keep_last_addition_merge_deletions,
-    obkvs_merge_additions_and_deletions, MergeFn,
+    create_sorter, create_writer, keep_first, obkvs_keep_last_addition_merge_deletions,
+    obkvs_merge_additions_and_deletions, sorter_into_reader, MergeFn,
 };
 use super::{IndexDocumentsMethod, IndexerConfig};
 use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
 use crate::error::{Error, InternalError, UserError};
 use crate::index::{db_name, main_key};
 use crate::update::del_add::{into_del_add_obkv, DelAdd, KvReaderDelAdd};
+use crate::update::index_documents::GrenadParameters;
 use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep};
 use crate::{
     FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, BEU32,
@@ -772,24 +773,35 @@ impl<'a, 'i> Transform<'a, 'i> {
         let documents_ids = self.index.documents_ids(wtxn)?;
         let documents_count = documents_ids.len() as usize;
 
-        // We create a final writer to write the new documents in order from the sorter.
-        let mut original_writer = create_writer(
+        // We initialize the sorter with the user indexing settings.
+        let mut original_sorter = create_sorter(
+            grenad::SortAlgorithm::Stable,
+            keep_first,
             self.indexer_settings.chunk_compression_type,
             self.indexer_settings.chunk_compression_level,
-            tempfile::tempfile()?,
+            self.indexer_settings.max_nb_chunks,
+            self.indexer_settings.max_memory.map(|mem| mem / 2),
         );
 
-        // We create a final writer to write the new documents in order from the sorter.
-        let mut flattened_writer = create_writer(
+        // We initialize the sorter with the user indexing settings.
+        let mut flattened_sorter = create_sorter(
+            grenad::SortAlgorithm::Stable,
+            keep_first,
             self.indexer_settings.chunk_compression_type,
             self.indexer_settings.chunk_compression_level,
-            tempfile::tempfile()?,
+            self.indexer_settings.max_nb_chunks,
+            self.indexer_settings.max_memory.map(|mem| mem / 2),
         );
 
         let mut obkv_buffer = Vec::new();
-        let mut document_sorter_buffer = Vec::new();
-        for result in self.index.all_documents(wtxn)? {
-            let (docid, obkv) = result?;
+        let mut document_sorter_key_buffer = Vec::new();
+        let mut document_sorter_value_buffer = Vec::new();
+        for result in self.index.external_documents_ids().0.iter(wtxn)? {
+            let (external_id, docid) = result?;
+            let obkv = self.index.documents.get(wtxn, &docid)?.ok_or(
+                InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None },
+            )?;
+            let docid = docid.get();
 
             obkv_buffer.clear();
             let mut obkv_writer = KvWriter::<_, FieldId>::new(&mut obkv_buffer);
@@ -802,9 +814,18 @@ impl<'a, 'i> Transform<'a, 'i> {
             }
 
             let buffer = obkv_writer.into_inner()?;
-            document_sorter_buffer.clear();
-            into_del_add_obkv(KvReaderU16::new(buffer), false, true, &mut document_sorter_buffer)?;
-            original_writer.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
+
+            document_sorter_key_buffer.clear();
+            document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes());
+            document_sorter_key_buffer.extend_from_slice(external_id.as_bytes());
+            document_sorter_value_buffer.clear();
+            into_del_add_obkv(
+                KvReaderU16::new(buffer),
+                false,
+                true,
+                &mut document_sorter_value_buffer,
+            )?;
+            original_sorter.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
 
             // Once we have the document. We're going to flatten it
             // and insert it in the flattened sorter.
@@ -839,18 +860,27 @@ impl<'a, 'i> Transform<'a, 'i> {
                 let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?;
                 writer.insert(fid, &value)?;
             }
-            document_sorter_buffer.clear();
-            into_del_add_obkv(KvReaderU16::new(&buffer), false, true, &mut document_sorter_buffer)?;
-            flattened_writer.insert(docid.to_be_bytes(), &document_sorter_buffer)?;
+            document_sorter_value_buffer.clear();
+            into_del_add_obkv(
+                KvReaderU16::new(&buffer),
+                false,
+                true,
+                &mut document_sorter_value_buffer,
+            )?;
+            flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?;
         }
 
-        // Once we have written all the documents, we extract
-        // the file and reset the seek to be able to read it again.
-        let mut original_documents = original_writer.into_inner()?;
-        original_documents.rewind()?;
+        let grenad_params = GrenadParameters {
+            chunk_compression_type: self.indexer_settings.chunk_compression_type,
+            chunk_compression_level: self.indexer_settings.chunk_compression_level,
+            max_memory: self.indexer_settings.max_memory,
+            max_nb_chunks: self.indexer_settings.max_nb_chunks, // default value, may be chosen.
+        };
 
-        let mut flattened_documents = flattened_writer.into_inner()?;
-        flattened_documents.rewind()?;
+        // Once we have written all the documents, we merge everything into a Reader.
+        let original_documents = sorter_into_reader(original_sorter, grenad_params)?;
+
+        let flattened_documents = sorter_into_reader(flattened_sorter, grenad_params)?;
 
         let output = TransformOutput {
             primary_key,
@@ -862,10 +892,8 @@ impl<'a, 'i> Transform<'a, 'i> {
             // FIXME: remove this now unused field
             replaced_documents_ids: RoaringBitmap::default(),
             documents_count,
-            original_documents: original_documents.into_inner().map_err(|err| err.into_error())?,
-            flattened_documents: flattened_documents
-                .into_inner()
-                .map_err(|err| err.into_error())?,
+            original_documents: original_documents.into_inner().into_inner(),
+            flattened_documents: flattened_documents.into_inner().into_inner(),
         };
 
         let new_facets = output.compute_real_facets(wtxn, self.index)?;

From 5b20e625f3a9cf543a47e88a70446de55a463844 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Thu, 2 Nov 2023 15:31:37 +0100
Subject: [PATCH 097/127] fix merge

---
 milli/src/update/index_documents/transform.rs | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index 82cf55d42..2eec69da5 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -281,7 +281,8 @@ impl<'a, 'i> Transform<'a, 'i> {
                         keep_original_version,
                         &mut document_sorter_value_buffer,
                     )?;
-                    self.original_sorter.insert(&document_sorter_key_buffer, &document_sorter_buffer)?;
+                    self.original_sorter
+                        .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
                     let base_obkv = KvReader::new(base_obkv);
                     if let Some(flattened_obkv) = self.flatten_from_fields_ids_map(base_obkv)? {
                         // we recreate our buffer with the flattened documents
@@ -294,7 +295,8 @@ impl<'a, 'i> Transform<'a, 'i> {
                             &mut document_sorter_value_buffer,
                         )?;
                     }
-                    self.flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?;
+                    self.flattened_sorter
+                        .insert(docid.to_be_bytes(), &document_sorter_value_buffer)?;
                 }
             }
 
@@ -465,7 +467,8 @@ impl<'a, 'i> Transform<'a, 'i> {
                             &mut document_sorter_value_buffer,
                         )?;
                     }
-                    self.flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?;
+                    self.flattened_sorter
+                        .insert(docid.to_be_bytes(), &document_sorter_value_buffer)?;
 
                     true
                 }

From bf0651f23cdb7decc9b3c7fe31dbc9fc21e429be Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Thu, 2 Nov 2023 15:37:05 +0100
Subject: [PATCH 098/127] Implement iter method on ExternalDocumentsIds

---
 milli/src/external_documents_ids.rs           | 7 ++++++-
 milli/src/update/index_documents/transform.rs | 2 +-
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/milli/src/external_documents_ids.rs b/milli/src/external_documents_ids.rs
index a002fc064..e0a71b7cd 100644
--- a/milli/src/external_documents_ids.rs
+++ b/milli/src/external_documents_ids.rs
@@ -18,7 +18,7 @@ pub struct DocumentOperation {
     pub kind: DocumentOperationKind,
 }
 
-pub struct ExternalDocumentsIds(pub Database<Str, OwnedType<BEU32>>);
+pub struct ExternalDocumentsIds(Database<Str, OwnedType<BEU32>>);
 
 impl ExternalDocumentsIds {
     pub fn new(db: Database<Str, OwnedType<BEU32>>) -> ExternalDocumentsIds {
@@ -86,6 +86,11 @@ impl ExternalDocumentsIds {
 
         Ok(())
     }
+
+    /// Returns an iterator over all the external ids.
+    pub fn iter<'t>(&self, rtxn: &'t RoTxn) -> heed::Result<RoIter<'t, Str, OwnedType<BEU32>>> {
+        self.0.iter(rtxn)
+    }
 }
 
 /// An iterator over mappings between requested internal ids and external ids.
diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index 2eec69da5..7c500799d 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -799,7 +799,7 @@ impl<'a, 'i> Transform<'a, 'i> {
         let mut obkv_buffer = Vec::new();
         let mut document_sorter_key_buffer = Vec::new();
         let mut document_sorter_value_buffer = Vec::new();
-        for result in self.index.external_documents_ids().0.iter(wtxn)? {
+        for result in self.index.external_documents_ids().iter(wtxn)? {
             let (external_id, docid) = result?;
             let obkv = self.index.documents.get(wtxn, &docid)?.ok_or(
                 InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None },

From ff522c919d5065499fa01bdf2c21747bade40e60 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= <clement@meilisearch.com>
Date: Thu, 2 Nov 2023 15:58:08 +0100
Subject: [PATCH 099/127] Fix the vector extractions for the diff indexing

---
 .../extract/extract_vector_points.rs          | 127 ++++++++++++++----
 .../src/update/index_documents/typed_chunk.rs |  81 +++++++----
 2 files changed, 156 insertions(+), 52 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs
index 1f5edeeeb..317a9aec3 100644
--- a/milli/src/update/index_documents/extract/extract_vector_points.rs
+++ b/milli/src/update/index_documents/extract/extract_vector_points.rs
@@ -1,15 +1,25 @@
+use std::cmp::Ordering;
 use std::convert::TryFrom;
 use std::fs::File;
-use std::io::{self, BufReader};
+use std::io::{self, BufReader, BufWriter};
+use std::mem::size_of;
+use std::str::from_utf8;
 
 use bytemuck::cast_slice;
+use grenad::Writer;
+use itertools::EitherOrBoth;
+use ordered_float::OrderedFloat;
 use serde_json::{from_slice, Value};
 
 use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
 use crate::error::UserError;
+use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
 use crate::update::index_documents::helpers::try_split_at;
 use crate::{DocumentId, FieldId, InternalError, Result, VectorOrArrayOfVectors};
 
+/// The length of the elements that are always in the buffer when inserting new values.
+const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
+
 /// Extracts the embedding vector contained in each document under the `_vectors` field.
 ///
 /// Returns the generated grenad reader containing the docid as key associated to the Vec<f32>
@@ -27,45 +37,112 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
         tempfile::tempfile()?,
     );
 
+    let mut key_buffer = Vec::new();
     let mut cursor = obkv_documents.into_cursor()?;
     while let Some((key, value)) = cursor.move_on_next()? {
         // this must always be serialized as (docid, external_docid);
         let (docid_bytes, external_id_bytes) =
             try_split_at(key, std::mem::size_of::<DocumentId>()).unwrap();
-        debug_assert!(std::str::from_utf8(external_id_bytes).is_ok());
+        debug_assert!(from_utf8(external_id_bytes).is_ok());
 
         let obkv = obkv::KvReader::new(value);
+        key_buffer.clear();
+        key_buffer.extend_from_slice(docid_bytes);
 
         // since we only needs the primary key when we throw an error we create this getter to
         // lazily get it when needed
-        let document_id = || -> Value { std::str::from_utf8(external_id_bytes).unwrap().into() };
+        let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() };
 
         // first we retrieve the _vectors field
-        if let Some(vectors) = obkv.get(vectors_fid) {
-            // extract the vectors
-            let vectors = match from_slice(vectors) {
-                Ok(vectors) => VectorOrArrayOfVectors::into_array_of_vectors(vectors),
-                Err(_) => {
-                    return Err(UserError::InvalidVectorsType {
-                        document_id: document_id(),
-                        value: from_slice(vectors).map_err(InternalError::SerdeJson)?,
-                    }
-                    .into())
-                }
-            };
+        if let Some(value) = obkv.get(vectors_fid) {
+            let vectors_obkv = KvReaderDelAdd::new(value);
 
-            if let Some(vectors) = vectors {
-                for (i, vector) in vectors.into_iter().enumerate().take(u16::MAX as usize) {
-                    let index = u16::try_from(i).unwrap();
-                    let mut key = docid_bytes.to_vec();
-                    key.extend_from_slice(&index.to_be_bytes());
-                    let bytes = cast_slice(&vector);
-                    writer.insert(key, bytes)?;
-                }
-            }
+            // then we extract the values
+            let del_vectors = vectors_obkv
+                .get(DelAdd::Deletion)
+                .map(|vectors| extract_vectors(vectors, document_id))
+                .transpose()?
+                .flatten();
+            let add_vectors = vectors_obkv
+                .get(DelAdd::Addition)
+                .map(|vectors| extract_vectors(vectors, document_id))
+                .transpose()?
+                .flatten();
+
+            // and we finally push the unique vectors into the writer
+            push_vectors_diff(
+                &mut writer,
+                &mut key_buffer,
+                del_vectors.unwrap_or_default(),
+                add_vectors.unwrap_or_default(),
+            )?;
         }
-        // else => the `_vectors` object was `null`, there is nothing to do
     }
 
     writer_into_reader(writer)
 }
+
+/// Computes the diff between both the Del and Add vectors and
+/// only inserts the parts that differ into the writer.
+fn push_vectors_diff(
+    writer: &mut Writer<BufWriter<File>>,
+    key_buffer: &mut Vec<u8>,
+    mut del_vectors: Vec<Vec<f32>>,
+    mut add_vectors: Vec<Vec<f32>>,
+) -> Result<()> {
+    // We sort and dedup the vectors
+    del_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
+    add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
+    del_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq());
+    add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq());
+
+    let merged_vectors_iter =
+        itertools::merge_join_by(del_vectors, add_vectors, |del, add| compare_vectors(del, add));
+
+    // insert vectors into the writer
+    for (i, eob) in merged_vectors_iter.into_iter().enumerate().take(u16::MAX as usize) {
+        // Generate the key by extending the unique index to it.
+        key_buffer.truncate(TRUNCATE_SIZE);
+        let index = u16::try_from(i).unwrap();
+        key_buffer.extend_from_slice(&index.to_be_bytes());
+
+        match eob {
+            EitherOrBoth::Both(_, _) => (), // no need to touch anything
+            EitherOrBoth::Left(vector) => {
+                // We insert only the Del part of the Obkv to inform
+                // that we only want to remove all those vectors.
+                let mut obkv = KvWriterDelAdd::memory();
+                obkv.insert(DelAdd::Deletion, cast_slice(&vector))?;
+                let bytes = obkv.into_inner()?;
+                writer.insert(&key_buffer, bytes)?;
+            }
+            EitherOrBoth::Right(vector) => {
+                // We insert only the Add part of the Obkv to inform
+                // that we only want to add all those vectors.
+                let mut obkv = KvWriterDelAdd::memory();
+                obkv.insert(DelAdd::Addition, cast_slice(&vector))?;
+                let bytes = obkv.into_inner()?;
+                writer.insert(&key_buffer, bytes)?;
+            }
+        }
+    }
+
+    Ok(())
+}
+
+/// Compares two vectors by using the OrderedFloat helper.
+fn compare_vectors(a: &[f32], b: &[f32]) -> Ordering {
+    a.iter().copied().map(OrderedFloat).cmp(b.iter().copied().map(OrderedFloat))
+}
+
+/// Extracts the vectors from a JSON value.
+fn extract_vectors(value: &[u8], document_id: impl Fn() -> Value) -> Result<Option<Vec<Vec<f32>>>> {
+    match from_slice(value) {
+        Ok(vectors) => Ok(VectorOrArrayOfVectors::into_array_of_vectors(vectors)),
+        Err(_) => Err(UserError::InvalidVectorsType {
+            document_id: document_id(),
+            value: from_slice(value).map_err(InternalError::SerdeJson)?,
+        }
+        .into()),
+    }
+}
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index 7c3f587d2..80671e39f 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -1,4 +1,4 @@
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 use std::convert::TryInto;
 use std::fs::File;
 use std::io::{self, BufReader};
@@ -8,7 +8,9 @@ use charabia::{Language, Script};
 use grenad::MergerBuilder;
 use heed::types::ByteSlice;
 use heed::RwTxn;
+use log::error;
 use obkv::{KvReader, KvWriter};
+use ordered_float::OrderedFloat;
 use roaring::RoaringBitmap;
 
 use super::helpers::{self, merge_ignore_values, valid_lmdb_key, CursorClonableMmap};
@@ -22,10 +24,9 @@ use crate::index::Hnsw;
 use crate::update::del_add::{DelAdd, KvReaderDelAdd};
 use crate::update::facet::FacetsUpdate;
 use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at};
-use crate::update::index_documents::validate_document_id_value;
 use crate::{
-    lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, InternalError,
-    Result, SerializationError, BEU32,
+    lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, Result,
+    SerializationError, BEU32,
 };
 
 pub(crate) enum TypedChunk {
@@ -366,44 +367,70 @@ pub(crate) fn write_typed_chunk_into_index(
             index.put_geo_faceted_documents_ids(wtxn, &geo_faceted_docids)?;
         }
         TypedChunk::VectorPoints(vector_points) => {
-            let (pids, mut points): (Vec<_>, Vec<_>) = match index.vector_hnsw(wtxn)? {
-                Some(hnsw) => hnsw.iter().map(|(pid, point)| (pid, point.clone())).unzip(),
-                None => Default::default(),
-            };
-
-            // Convert the PointIds into DocumentIds
-            let mut docids = Vec::new();
-            for pid in pids {
-                let docid =
-                    index.vector_id_docid.get(wtxn, &BEU32::new(pid.into_inner()))?.unwrap();
-                docids.push(docid.get());
+            let mut vectors_set = HashSet::new();
+            // We extract and store the previous vectors
+            if let Some(hnsw) = index.vector_hnsw(wtxn)? {
+                for (pid, point) in hnsw.iter() {
+                    let pid_key = BEU32::new(pid.into_inner());
+                    let docid = index.vector_id_docid.get(wtxn, &pid_key)?.unwrap().get();
+                    let vector: Vec<_> = point.iter().copied().map(OrderedFloat).collect();
+                    vectors_set.insert((docid, vector));
+                }
             }
 
-            let mut expected_dimensions = points.get(0).map(|p| p.len());
             let mut cursor = vector_points.into_cursor()?;
             while let Some((key, value)) = cursor.move_on_next()? {
                 // convert the key back to a u32 (4 bytes)
                 let (left, _index) = try_split_array_at(key).unwrap();
                 let docid = DocumentId::from_be_bytes(left);
-                // convert the vector back to a Vec<f32>
-                let vector: Vec<f32> = pod_collect_to_vec(value);
 
-                // TODO Inform the user about the document that has a wrong `_vectors`
-                let found = vector.len();
-                let expected = *expected_dimensions.get_or_insert(found);
-                if expected != found {
-                    return Err(UserError::InvalidVectorDimensions { expected, found }.into());
+                let vector_deladd_obkv = KvReaderDelAdd::new(value);
+                if let Some(value) = vector_deladd_obkv.get(DelAdd::Deletion) {
+                    // convert the vector back to a Vec<f32>
+                    let vector = pod_collect_to_vec(value).into_iter().map(OrderedFloat).collect();
+                    let key = (docid, vector);
+                    if !vectors_set.remove(&key) {
+                        error!("Unable to delete the vector: {:?}", key.1);
+                    }
+                }
+                if let Some(value) = vector_deladd_obkv.get(DelAdd::Addition) {
+                    // convert the vector back to a Vec<f32>
+                    let vector = pod_collect_to_vec(value).into_iter().map(OrderedFloat).collect();
+                    vectors_set.insert((docid, vector));
                 }
-
-                points.push(NDotProductPoint::new(vector));
-                docids.push(docid);
             }
 
-            assert_eq!(docids.len(), points.len());
+            // Extract the most common vector dimension
+            let expected_dimension_size = {
+                let mut dims = HashMap::new();
+                vectors_set.iter().for_each(|(_, v)| *dims.entry(v.len()).or_insert(0) += 1);
+                dims.into_iter().max_by_key(|(_, count)| *count).map(|(len, _)| len)
+            };
+
+            // Ensure that the vector lengths are correct and
+            // prepare the vectors before inserting them in the HNSW.
+            let mut points = Vec::new();
+            let mut docids = Vec::new();
+            for (docid, vector) in vectors_set {
+                if expected_dimension_size.map_or(false, |expected| expected != vector.len()) {
+                    return Err(UserError::InvalidVectorDimensions {
+                        expected: expected_dimension_size.unwrap_or(vector.len()),
+                        found: vector.len(),
+                    }
+                    .into());
+                } else {
+                    let vector = vector.into_iter().map(OrderedFloat::into_inner).collect();
+                    points.push(NDotProductPoint::new(vector));
+                    docids.push(docid);
+                }
+            }
 
             let hnsw_length = points.len();
             let (new_hnsw, pids) = Hnsw::builder().build_hnsw(points);
 
+            assert_eq!(docids.len(), pids.len());
+
+            // Store the vectors in the point-docid relation database
             index.vector_id_docid.clear(wtxn)?;
             for (docid, pid) in docids.into_iter().zip(pids) {
                 index.vector_id_docid.put(

From 87610a5f988ac59c786feff1cd1fd019ccf67366 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Thu, 2 Nov 2023 16:49:03 +0100
Subject: [PATCH 100/127] Don't try to delete a document that is not in the
 database

---
 .../update/index_documents/helpers/grenad_helpers.rs   | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs
index 03a3d6f5f..4f764ab95 100644
--- a/milli/src/update/index_documents/helpers/grenad_helpers.rs
+++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs
@@ -223,11 +223,13 @@ pub fn grenad_obkv_into_chunks<R: io::Read + io::Seek>(
         );
 
         while let Some((document_id, obkv)) = cursor.move_on_next()? {
-            obkv_documents.insert(document_id, obkv)?;
-            current_chunk_size += document_id.len() as u64 + obkv.len() as u64;
+            if !obkv.is_empty() {
+                obkv_documents.insert(document_id, obkv)?;
+                current_chunk_size += document_id.len() as u64 + obkv.len() as u64;
 
-            if current_chunk_size >= documents_chunk_size as u64 {
-                return writer_into_reader(obkv_documents).map(Some);
+                if current_chunk_size >= documents_chunk_size as u64 {
+                    return writer_into_reader(obkv_documents).map(Some);
+                }
             }
         }
 

From 1ad1fcc8c83863d4ff261d62787bf14cddc0c78a Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Mon, 6 Nov 2023 10:31:14 +0100
Subject: [PATCH 101/127] Remove all warnings

---
 .../extract/extract_docid_word_positions.rs          |  4 ++--
 milli/src/update/index_documents/extract/mod.rs      |  2 +-
 milli/src/update/index_documents/mod.rs              |  3 ---
 milli/src/update/index_documents/transform.rs        | 12 ------------
 4 files changed, 3 insertions(+), 18 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
index 96156adb4..9895c1a64 100644
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -30,7 +30,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
     allowed_separators: Option<&[&str]>,
     dictionary: Option<&[&str]>,
     max_positions_per_attributes: Option<u32>,
-) -> Result<(RoaringBitmap, grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> {
+) -> Result<(grenad::Reader<BufReader<File>>, ScriptLanguageDocidsMap)> {
     puffin::profile_function!();
 
     let max_positions_per_attributes = max_positions_per_attributes
@@ -154,7 +154,7 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
 
     // the returned sorter is serialized as: key: (DocId, FieldId), value: KV<DelAdd, KV<u16, String>>.
     sorter_into_reader(docid_word_positions_sorter, indexer)
-        .map(|reader| (documents_ids, reader, script_language_docids))
+        .map(|reader| (reader, script_language_docids))
 }
 
 /// Check if any searchable fields of a document changed.
diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
index ee8713ee8..91f3e1c62 100644
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -348,7 +348,7 @@ fn send_and_extract_flattened_documents_data(
     let (docid_word_positions_chunk, fid_docid_facet_values_chunks): (Result<_>, Result<_>) =
         rayon::join(
             || {
-                let (documents_ids, docid_word_positions_chunk, script_language_pair) =
+                let (docid_word_positions_chunk, script_language_pair) =
                     extract_docid_word_positions(
                         flattened_documents_chunk.clone(),
                         indexer,
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 129b67cf0..66e6d16dc 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -245,9 +245,6 @@ where
             primary_key,
             fields_ids_map,
             field_distribution,
-            new_external_documents_ids,
-            new_documents_ids,
-            replaced_documents_ids,
             documents_count,
             original_documents,
             flattened_documents,
diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index 7c500799d..186974bfe 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -32,9 +32,6 @@ pub struct TransformOutput {
     pub primary_key: String,
     pub fields_ids_map: FieldsIdsMap,
     pub field_distribution: FieldDistribution,
-    pub new_external_documents_ids: fst::Map<Cow<'static, [u8]>>,
-    pub new_documents_ids: RoaringBitmap,
-    pub replaced_documents_ids: RoaringBitmap,
     pub documents_count: usize,
     pub original_documents: File,
     pub flattened_documents: File,
@@ -735,15 +732,11 @@ impl<'a, 'i> Transform<'a, 'i> {
         new_external_documents_ids_builder.into_iter().try_for_each(|(key, value)| {
             fst_new_external_documents_ids_builder.insert(key, value)
         })?;
-        let new_external_documents_ids = fst_new_external_documents_ids_builder.into_map();
 
         Ok(TransformOutput {
             primary_key,
             fields_ids_map: self.fields_ids_map,
             field_distribution,
-            new_external_documents_ids: new_external_documents_ids.map_data(Cow::Owned).unwrap(),
-            new_documents_ids: self.new_documents_ids,
-            replaced_documents_ids: self.replaced_documents_ids,
             documents_count: self.documents_count,
             original_documents: original_documents.into_inner().map_err(|err| err.into_error())?,
             flattened_documents: flattened_documents
@@ -889,11 +882,6 @@ impl<'a, 'i> Transform<'a, 'i> {
             primary_key,
             fields_ids_map: new_fields_ids_map,
             field_distribution,
-            // FIXME: remove this now unused field
-            new_external_documents_ids: fst::Map::default().map_data(Cow::Owned).unwrap(),
-            new_documents_ids: documents_ids,
-            // FIXME: remove this now unused field
-            replaced_documents_ids: RoaringBitmap::default(),
             documents_count,
             original_documents: original_documents.into_inner().into_inner(),
             flattened_documents: flattened_documents.into_inner().into_inner(),

From 1b2ea6cf19309782a2e3b2ff2fe6d7708dd5de4f Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Mon, 6 Nov 2023 10:46:22 +0100
Subject: [PATCH 102/127] REVERT ME: ignore prefix pair databases tests

---
 milli/src/update/prefix_word_pairs/mod.rs | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs
index e718f9b77..320c01461 100644
--- a/milli/src/update/prefix_word_pairs/mod.rs
+++ b/milli/src/update/prefix_word_pairs/mod.rs
@@ -171,6 +171,7 @@ mod tests {
         documents
     }
 
+    #[ignore]
     #[test]
     fn add_new_documents() {
         let mut index = TempIndex::new();
@@ -235,6 +236,7 @@ mod tests {
         db_snap!(index, word_prefix_pair_proximity_docids, "update");
         db_snap!(index, prefix_word_pair_proximity_docids, "update");
     }
+    #[ignore]
     #[test]
     fn batch_bug_3043() {
         // https://github.com/meilisearch/meilisearch/issues/3043
@@ -283,6 +285,7 @@ mod tests {
         db_snap!(index, prefix_word_pair_proximity_docids);
     }
 
+    #[ignore]
     #[test]
     fn hard_delete_and_reupdate() {
         let mut index = TempIndex::new();
@@ -357,6 +360,7 @@ mod tests {
         db_snap!(index, prefix_word_pair_proximity_docids, "reupdate");
     }
 
+    #[ignore]
     #[test]
     fn replace_hard_deletion() {
         let mut index = TempIndex::new();

From 1bccf2079ed6c92669f7100dbebeb79d23668956 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Mon, 6 Nov 2023 11:03:56 +0100
Subject: [PATCH 103/127] Correctly mark non-tests as non-tests

---
 milli/src/search/facet/facet_sort_ascending.rs | 4 ++--
 milli/src/update/facet/mod.rs                  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs
index 892401c08..0197639e4 100644
--- a/milli/src/search/facet/facet_sort_ascending.rs
+++ b/milli/src/search/facet/facet_sort_ascending.rs
@@ -13,7 +13,7 @@ use crate::heed_codec::ByteSliceRefCodec;
 /// The documents returned by the iterator are grouped by the facet values that
 /// determined their rank. For example, given the documents:
 ///
-/// ```ignore
+/// ```text
 /// 0: { "colour": ["blue", "green"] }
 /// 1: { "colour": ["blue", "red"] }
 /// 2: { "colour": ["orange", "red"] }
@@ -22,7 +22,7 @@ use crate::heed_codec::ByteSliceRefCodec;
 /// ```
 /// Then calling the function on the candidates `[0, 2, 3, 4]` will return an iterator
 /// over the following elements:
-/// ```ignore
+/// ```text
 /// [0, 4]  // corresponds to all the documents within the candidates that have the facet value "blue"
 /// [3]     // same for "green"
 /// [2]     // same for "orange"
diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs
index 7358ceb6c..52fea0f5f 100644
--- a/milli/src/update/facet/mod.rs
+++ b/milli/src/update/facet/mod.rs
@@ -14,7 +14,7 @@ The databases must be able to return results for queries such as:
 The algorithms that implement these queries are found in the `src/search/facet` folder.
 
 To make these queries fast to compute, the database adopts a tree structure:
-```ignore
+```text
             ┌───────────────────────────────┬───────────────────────────────┬───────────────┐
 ┌───────┐   │           "ab" (2)            │           "gaf" (2)           │   "woz" (1)   │
 │Level 2│   │                               │                               │               │
@@ -41,7 +41,7 @@ These documents all contain a facet value that is contained within `ab .. gaf`.
 In the database, each node is represented by a key/value pair encoded as a [`FacetGroupKey`] and a
 [`FacetGroupValue`], which have the following format:
 
-```ignore
+```text
 FacetGroupKey:
 - field id  : u16
 - level     : u8

From cbaa54cafdf8e91c798958b5335e125880b78a1d Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Mon, 6 Nov 2023 11:19:31 +0100
Subject: [PATCH 104/127] Fix clippy issues

---
 index-scheduler/src/batch.rs                         |  2 +-
 milli/src/external_documents_ids.rs                  |  5 ++---
 milli/src/index.rs                                   |  2 +-
 .../extract/extract_docid_word_positions.rs          |  9 +++++----
 .../extract/extract_fid_word_count_docids.rs         |  2 +-
 .../index_documents/extract/extract_word_docids.rs   | 12 ++++++------
 .../extract/extract_word_pair_proximity_docids.rs    |  9 ++++-----
 .../extract/extract_word_position_docids.rs          |  2 +-
 .../index_documents/helpers/merge_functions.rs       |  2 +-
 milli/src/update/index_documents/mod.rs              |  2 +-
 milli/src/update/index_documents/typed_chunk.rs      |  3 ++-
 11 files changed, 25 insertions(+), 25 deletions(-)

diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs
index ebdba0a8c..d96891d82 100644
--- a/index-scheduler/src/batch.rs
+++ b/index-scheduler/src/batch.rs
@@ -1292,7 +1292,7 @@ impl IndexScheduler {
                     || must_stop_processing.get(),
                 )?;
 
-                let document_ids = documents.iter().cloned().flatten().collect();
+                let document_ids = documents.iter().flatten().cloned().collect();
 
                 let (new_builder, user_result) = builder.remove_documents(document_ids)?;
                 builder = new_builder;
diff --git a/milli/src/external_documents_ids.rs b/milli/src/external_documents_ids.rs
index e0a71b7cd..0e4891649 100644
--- a/milli/src/external_documents_ids.rs
+++ b/milli/src/external_documents_ids.rs
@@ -1,5 +1,4 @@
 use std::collections::HashMap;
-use std::convert::TryInto;
 
 use heed::types::{OwnedType, Str};
 use heed::{Database, RoIter, RoTxn, RwTxn};
@@ -31,7 +30,7 @@ impl ExternalDocumentsIds {
     }
 
     pub fn get<A: AsRef<str>>(&self, rtxn: &RoTxn, external_id: A) -> heed::Result<Option<u32>> {
-        Ok(self.0.get(rtxn, external_id.as_ref())?.map(|x| x.get().try_into().unwrap()))
+        Ok(self.0.get(rtxn, external_id.as_ref())?.map(|x| x.get()))
     }
 
     /// An helper function to debug this type, returns an `HashMap` of both,
@@ -40,7 +39,7 @@ impl ExternalDocumentsIds {
         let mut map = HashMap::default();
         for result in self.0.iter(rtxn)? {
             let (external, internal) = result?;
-            map.insert(external.to_owned(), internal.get().try_into().unwrap());
+            map.insert(external.to_owned(), internal.get());
         }
         Ok(map)
     }
diff --git a/milli/src/index.rs b/milli/src/index.rs
index a52033fb6..86ef6105b 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -1376,7 +1376,7 @@ impl Index {
         rtxn: &RoTxn,
         key: &(Script, Language),
     ) -> heed::Result<Option<RoaringBitmap>> {
-        Ok(self.script_language_docids.get(rtxn, key)?)
+        self.script_language_docids.get(rtxn, key)
     }
 
     pub fn script_language(&self, rtxn: &RoTxn) -> heed::Result<HashMap<Script, Vec<Language>>> {
diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
index 9895c1a64..0dcd6a42a 100644
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -198,7 +198,7 @@ fn tokenizer_builder<'a>(
     }
 
     if let Some(script_language) = script_language {
-        tokenizer_builder.allow_list(&script_language);
+        tokenizer_builder.allow_list(script_language);
     }
 
     tokenizer_builder
@@ -206,6 +206,7 @@ fn tokenizer_builder<'a>(
 
 /// Extract words maped with their positions of a document,
 /// ensuring no Language detection mistakes was made.
+#[allow(clippy::too_many_arguments)] // FIXME: consider grouping arguments in a struct
 fn lang_safe_tokens_from_document<'a>(
     obkv: &KvReader<FieldId>,
     searchable_fields: &Option<HashSet<FieldId>>,
@@ -220,9 +221,9 @@ fn lang_safe_tokens_from_document<'a>(
     let mut script_language_word_count = HashMap::new();
 
     tokens_from_document(
-        &obkv,
+        obkv,
         searchable_fields,
-        &tokenizer,
+        tokenizer,
         max_positions_per_attributes,
         del_add,
         buffers,
@@ -257,7 +258,7 @@ fn lang_safe_tokens_from_document<'a>(
 
             // rerun the extraction.
             tokens_from_document(
-                &obkv,
+                obkv,
                 searchable_fields,
                 &tokenizer,
                 max_positions_per_attributes,
diff --git a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs
index accf4a510..182d0c5d8 100644
--- a/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_fid_word_count_docids.rs
@@ -45,7 +45,7 @@ pub fn extract_fid_word_count_docids<R: io::Read + io::Seek>(
             .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
         let document_id = u32::from_be_bytes(document_id_bytes);
 
-        let del_add_reader = KvReaderDelAdd::new(&value);
+        let del_add_reader = KvReaderDelAdd::new(value);
         let deletion = del_add_reader
             // get deleted words
             .get(DelAdd::Deletion)
diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs
index 5266e9bff..f278012c7 100644
--- a/milli/src/update/index_documents/extract/extract_word_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_docids.rs
@@ -57,17 +57,17 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
         let document_id = u32::from_be_bytes(document_id_bytes);
         let fid = u16::from_be_bytes(fid_bytes);
 
-        let del_add_reader = KvReaderDelAdd::new(&value);
+        let del_add_reader = KvReaderDelAdd::new(value);
         // extract all unique words to remove.
         if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) {
-            for (_pos, word) in KvReaderU16::new(&deletion).iter() {
+            for (_pos, word) in KvReaderU16::new(deletion).iter() {
                 del_words.insert(word.to_vec());
             }
         }
 
         // extract all unique additional words.
         if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
-            for (_pos, word) in KvReaderU16::new(&addition).iter() {
+            for (_pos, word) in KvReaderU16::new(addition).iter() {
                 add_words.insert(word.to_vec());
             }
         }
@@ -122,9 +122,9 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
 
         // every words contained in an attribute set to exact must be pushed in the exact_words list.
         if exact_attributes.contains(&fid) {
-            exact_word_docids_sorter.insert(word.as_bytes(), &value)?;
+            exact_word_docids_sorter.insert(word.as_bytes(), value)?;
         } else {
-            word_docids_sorter.insert(word.as_bytes(), &value)?;
+            word_docids_sorter.insert(word.as_bytes(), value)?;
         }
     }
 
@@ -169,7 +169,7 @@ fn words_into_sorter(
         };
 
         key_buffer.clear();
-        key_buffer.extend_from_slice(&word_bytes);
+        key_buffer.extend_from_slice(word_bytes);
         key_buffer.push(0);
         key_buffer.extend_from_slice(&fid.to_be_bytes());
         word_fid_docids_sorter.insert(&key_buffer, value_writer.into_inner().unwrap())?;
diff --git a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
index 76a1d1d68..b8a377247 100644
--- a/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_pair_proximity_docids.rs
@@ -29,7 +29,6 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
     let max_memory = indexer.max_memory_by_thread();
 
     let mut word_pair_proximity_docids_sorters: Vec<_> = (1..MAX_DISTANCE)
-        .into_iter()
         .map(|_| {
             create_sorter(
                 grenad::SortAlgorithm::Unstable,
@@ -75,7 +74,7 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
         let (del, add): (Result<_>, Result<_>) = rayon::join(
             || {
                 // deletions
-                if let Some(deletion) = KvReaderDelAdd::new(&value).get(DelAdd::Deletion) {
+                if let Some(deletion) = KvReaderDelAdd::new(value).get(DelAdd::Deletion) {
                     for (position, word) in KvReaderU16::new(deletion).iter() {
                         // drain the proximity window until the head word is considered close to the word we are inserting.
                         while del_word_positions.get(0).map_or(false, |(_w, p)| {
@@ -104,7 +103,7 @@ pub fn extract_word_pair_proximity_docids<R: io::Read + io::Seek>(
             },
             || {
                 // additions
-                if let Some(addition) = KvReaderDelAdd::new(&value).get(DelAdd::Addition) {
+                if let Some(addition) = KvReaderDelAdd::new(value).get(DelAdd::Addition) {
                     for (position, word) in KvReaderU16::new(addition).iter() {
                         // drain the proximity window until the head word is considered close to the word we are inserting.
                         while add_word_positions.get(0).map_or(false, |(_w, p)| {
@@ -170,7 +169,7 @@ fn document_word_positions_into_sorter(
     document_id: DocumentId,
     del_word_pair_proximity: &BTreeMap<(String, String), u8>,
     add_word_pair_proximity: &BTreeMap<(String, String), u8>,
-    word_pair_proximity_docids_sorters: &mut Vec<grenad::Sorter<MergeFn>>,
+    word_pair_proximity_docids_sorters: &mut [grenad::Sorter<MergeFn>],
 ) -> Result<()> {
     use itertools::merge_join_by;
     use itertools::EitherOrBoth::{Both, Left, Right};
@@ -201,7 +200,7 @@ fn document_word_positions_into_sorter(
         };
 
         key_buffer.clear();
-        key_buffer.push(*prox as u8);
+        key_buffer.push(*prox);
         key_buffer.extend_from_slice(w1.as_bytes());
         key_buffer.push(0);
         key_buffer.extend_from_slice(w2.as_bytes());
diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs
index 2ff2f2ad5..1b9ec66ff 100644
--- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs
@@ -60,7 +60,7 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
 
         current_document_id = Some(document_id);
 
-        let del_add_reader = KvReaderDelAdd::new(&value);
+        let del_add_reader = KvReaderDelAdd::new(value);
         // extract all unique words to remove.
         if let Some(deletion) = del_add_reader.get(DelAdd::Deletion) {
             for (position, word_bytes) in KvReaderU16::new(deletion).iter() {
diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs
index 770629c8e..98c1c1a04 100644
--- a/milli/src/update/index_documents/helpers/merge_functions.rs
+++ b/milli/src/update/index_documents/helpers/merge_functions.rs
@@ -157,7 +157,7 @@ fn inner_merge_del_add_obkvs<'a>(
     let mut acc = newest[1..].to_vec();
     let mut buffer = Vec::new();
     // reverse iter from the most recent to the oldest.
-    for current in obkvs.into_iter().rev() {
+    for current in obkvs.iter().rev() {
         // if in the previous iteration there was a complete deletion,
         // stop the merge process.
         if acc_operation_type == Operation::Deletion as u8 {
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 66e6d16dc..2be410ace 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -2659,7 +2659,7 @@ mod tests {
         let external_document_ids = index.external_documents_ids();
         let ids_to_delete: Vec<u32> = external_ids
             .iter()
-            .map(|id| external_document_ids.get(&wtxn, id).unwrap().unwrap())
+            .map(|id| external_document_ids.get(wtxn, id).unwrap().unwrap())
             .collect();
 
         // Delete some documents.
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index 80671e39f..b53d859cd 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -456,7 +456,7 @@ pub(crate) fn write_typed_chunk_into_index(
 
                 if final_value.is_empty() {
                     // If the database entry exists, delete it.
-                    if db_key_exists == true {
+                    if db_key_exists {
                         index.script_language_docids.delete(wtxn, &key)?;
                     }
                 } else {
@@ -501,6 +501,7 @@ fn merge_word_docids_reader_into_fst(
 ///
 /// If there is no Add side we currently write an empty buffer
 /// which is a valid CboRoaringBitmap.
+#[allow(clippy::ptr_arg)] // required to avoid signature mismatch
 fn deladd_serialize_add_side<'a>(obkv: &'a [u8], _buffer: &mut Vec<u8>) -> Result<&'a [u8]> {
     Ok(KvReaderDelAdd::new(obkv).get(DelAdd::Addition).unwrap_or_default())
 }

From 620fee35f98db38715ad9be6ad54c15b13d692a3 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Mon, 6 Nov 2023 11:56:46 +0100
Subject: [PATCH 105/127] Fix benches

---
 benchmarks/benches/indexing.rs | 89 +++++++++++++---------------------
 1 file changed, 34 insertions(+), 55 deletions(-)

diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs
index cb220a5f0..c31bfab89 100644
--- a/benchmarks/benches/indexing.rs
+++ b/benchmarks/benches/indexing.rs
@@ -264,17 +264,7 @@ fn deleting_songs_in_batches_default(c: &mut Criterion) {
                 (index, document_ids_to_delete)
             },
             move |(index, document_ids_to_delete)| {
-                let mut wtxn = index.write_txn().unwrap();
-
-                for ids in document_ids_to_delete {
-                    let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
-                    builder.delete_documents(&ids);
-                    builder.execute().unwrap();
-                }
-
-                wtxn.commit().unwrap();
-
-                index.prepare_for_closing().wait();
+                delete_documents_from_ids(index, document_ids_to_delete)
             },
         )
     });
@@ -611,17 +601,7 @@ fn deleting_wiki_in_batches_default(c: &mut Criterion) {
                 (index, document_ids_to_delete)
             },
             move |(index, document_ids_to_delete)| {
-                let mut wtxn = index.write_txn().unwrap();
-
-                for ids in document_ids_to_delete {
-                    let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
-                    builder.delete_documents(&ids);
-                    builder.execute().unwrap();
-                }
-
-                wtxn.commit().unwrap();
-
-                index.prepare_for_closing().wait();
+                delete_documents_from_ids(index, document_ids_to_delete)
             },
         )
     });
@@ -873,22 +853,41 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) {
                 (index, document_ids_to_delete)
             },
             move |(index, document_ids_to_delete)| {
-                let mut wtxn = index.write_txn().unwrap();
-
-                for ids in document_ids_to_delete {
-                    let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
-                    builder.delete_documents(&ids);
-                    builder.execute().unwrap();
-                }
-
-                wtxn.commit().unwrap();
-
-                index.prepare_for_closing().wait();
+                delete_documents_from_ids(index, document_ids_to_delete)
             },
         )
     });
 }
 
+fn delete_documents_from_ids(index: Index, document_ids_to_delete: Vec<RoaringBitmap>) {
+    let mut wtxn = index.write_txn().unwrap(); // one write txn shared by every batch below
+
+    let indexer_config = IndexerConfig::default();
+    for ids in document_ids_to_delete { // one deletion run per bitmap of ids
+        let external_documents_ids = index.external_documents_ids();
+        // FIXME: for bitmaps holding a lot of documents, this will allocate a huge vec of external docids (strings).
+        // Since what we have is an iterator, it would be better to delete in chunks.
+        let external_to_internal: std::result::Result<Vec<_>, RoaringBitmap> =
+            external_documents_ids
+                .find_external_id_of(&wtxn, ids)
+                .unwrap()
+                .only_external_ids()
+                .collect();
+        let ids = external_to_internal.unwrap(); // NOTE(review): external ids despite the name?
+        let config = IndexDocumentsConfig::default();
+
+        let mut builder =
+            IndexDocuments::new(&mut wtxn, &index, &indexer_config, config, |_| (), || false)
+                .unwrap();
+        (builder, _) = builder.remove_documents(ids).unwrap();
+        builder.execute().unwrap();
+    }
+
+    wtxn.commit().unwrap(); // committed once, after all batches ran
+
+    index.prepare_for_closing().wait();
+}
+
 fn indexing_movies_in_three_batches(c: &mut Criterion) {
     let mut group = c.benchmark_group("indexing");
     group.sample_size(BENCHMARK_ITERATION);
@@ -1110,17 +1109,7 @@ fn deleting_nested_movies_in_batches_default(c: &mut Criterion) {
                 (index, document_ids_to_delete)
             },
             move |(index, document_ids_to_delete)| {
-                let mut wtxn = index.write_txn().unwrap();
-
-                for ids in document_ids_to_delete {
-                    let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
-                    builder.delete_documents(&ids);
-                    builder.execute().unwrap();
-                }
-
-                wtxn.commit().unwrap();
-
-                index.prepare_for_closing().wait();
+                delete_documents_from_ids(index, document_ids_to_delete)
             },
         )
     });
@@ -1336,17 +1325,7 @@ fn deleting_geo_in_batches_default(c: &mut Criterion) {
                 (index, document_ids_to_delete)
             },
             move |(index, document_ids_to_delete)| {
-                let mut wtxn = index.write_txn().unwrap();
-
-                for ids in document_ids_to_delete {
-                    let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
-                    builder.delete_documents(&ids);
-                    builder.execute().unwrap();
-                }
-
-                wtxn.commit().unwrap();
-
-                index.prepare_for_closing().wait();
+                delete_documents_from_ids(index, document_ids_to_delete)
             },
         )
     });

From ef6fa10f7a93e8aedd36feb3dce327bd6f896636 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Mon, 6 Nov 2023 12:16:15 +0100
Subject: [PATCH 106/127] Remove `IndexOperation::DocumentDeletion`

---
 index-scheduler/src/batch.rs | 65 +++++++-----------------------------
 1 file changed, 12 insertions(+), 53 deletions(-)

diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs
index d96891d82..c9deedb37 100644
--- a/index-scheduler/src/batch.rs
+++ b/index-scheduler/src/batch.rs
@@ -104,12 +104,6 @@ pub(crate) enum IndexOperation {
         operations: Vec<DocumentOperation>,
         tasks: Vec<Task>,
     },
-    DocumentDeletion {
-        index_uid: String,
-        // The vec associated with each document deletion tasks.
-        documents: Vec<Vec<String>>,
-        tasks: Vec<Task>,
-    },
     IndexDocumentDeletionByFilter {
         index_uid: String,
         task: Task,
@@ -161,7 +155,6 @@ impl Batch {
             }
             Batch::IndexOperation { op, .. } => match op {
                 IndexOperation::DocumentOperation { tasks, .. }
-                | IndexOperation::DocumentDeletion { tasks, .. }
                 | IndexOperation::Settings { tasks, .. }
                 | IndexOperation::DocumentClear { tasks, .. } => {
                     tasks.iter().map(|task| task.uid).collect()
@@ -226,7 +219,6 @@ impl IndexOperation {
     pub fn index_uid(&self) -> &str {
         match self {
             IndexOperation::DocumentOperation { index_uid, .. }
-            | IndexOperation::DocumentDeletion { index_uid, .. }
             | IndexOperation::IndexDocumentDeletionByFilter { index_uid, .. }
             | IndexOperation::DocumentClear { index_uid, .. }
             | IndexOperation::Settings { index_uid, .. }
@@ -242,9 +234,6 @@ impl fmt::Display for IndexOperation {
             IndexOperation::DocumentOperation { .. } => {
                 f.write_str("IndexOperation::DocumentOperation")
             }
-            IndexOperation::DocumentDeletion { .. } => {
-                f.write_str("IndexOperation::DocumentDeletion")
-            }
             IndexOperation::IndexDocumentDeletionByFilter { .. } => {
                 f.write_str("IndexOperation::IndexDocumentDeletionByFilter")
             }
@@ -347,18 +336,27 @@ impl IndexScheduler {
             BatchKind::DocumentDeletion { deletion_ids } => {
                 let tasks = self.get_existing_tasks(rtxn, deletion_ids)?;
 
-                let mut documents = Vec::new();
+                let mut operations = Vec::with_capacity(tasks.len());
+                let mut documents_counts = Vec::with_capacity(tasks.len());
                 for task in &tasks {
                     match task.kind {
                         KindWithContent::DocumentDeletion { ref documents_ids, .. } => {
-                            documents.push(documents_ids.clone())
+                            operations.push(DocumentOperation::Delete(documents_ids.clone()));
+                            documents_counts.push(documents_ids.len() as u64);
                         }
                         _ => unreachable!(),
                     }
                 }
 
                 Ok(Some(Batch::IndexOperation {
-                    op: IndexOperation::DocumentDeletion { index_uid, documents, tasks },
+                    op: IndexOperation::DocumentOperation {
+                        index_uid,
+                        primary_key: None,
+                        method: IndexDocumentsMethod::ReplaceDocuments,
+                        documents_counts,
+                        operations,
+                        tasks,
+                    },
                     must_create_index,
                 }))
             }
@@ -1275,45 +1273,6 @@ impl IndexScheduler {
 
                 Ok(tasks)
             }
-            IndexOperation::DocumentDeletion { index_uid: _, documents, mut tasks } => {
-                let indexer_config = self.index_mapper.indexer_config();
-                let config = IndexDocumentsConfig {
-                    update_method: IndexDocumentsMethod::ReplaceDocuments,
-                    ..Default::default()
-                };
-                let must_stop_processing = self.must_stop_processing.clone();
-
-                let mut builder = milli::update::IndexDocuments::new(
-                    index_wtxn,
-                    index,
-                    indexer_config,
-                    config,
-                    |indexing_step| trace!("update: {:?}", indexing_step),
-                    || must_stop_processing.get(),
-                )?;
-
-                let document_ids = documents.iter().flatten().cloned().collect();
-
-                let (new_builder, user_result) = builder.remove_documents(document_ids)?;
-                builder = new_builder;
-                // Uses Invariant: remove documents actually always returns Ok for the inner result
-                let count = user_result.unwrap();
-
-                for (task, documents) in tasks.iter_mut().zip(documents) {
-                    task.status = Status::Succeeded;
-                    task.details = Some(Details::DocumentDeletion {
-                        provided_ids: documents.len(),
-                        deleted_documents: Some(count.min(documents.len() as u64)),
-                    });
-                }
-
-                if !tasks.iter().all(|res| res.error.is_some()) {
-                    let addition = builder.execute()?;
-                    info!("document deletion done: {:?}", addition);
-                }
-
-                Ok(tasks)
-            }
             IndexOperation::IndexDocumentDeletionByFilter { mut task, index_uid: _ } => {
                 let filter =
                     if let KindWithContent::DocumentDeletionByFilter { filter_expr, .. } =

From 1e2fbc6a421e5dbea38e40c04d4511f28a5b7ea0 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Tue, 7 Nov 2023 16:46:52 +0100
Subject: [PATCH 107/127] revert "REVERT ME: ignore prefix pair databases
 tests"

This reverts commit 1b2ea6cf19309782a2e3b2ff2fe6d7708dd5de4f.
---
 milli/src/update/prefix_word_pairs/mod.rs | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs
index 320c01461..e718f9b77 100644
--- a/milli/src/update/prefix_word_pairs/mod.rs
+++ b/milli/src/update/prefix_word_pairs/mod.rs
@@ -171,7 +171,6 @@ mod tests {
         documents
     }
 
-    #[ignore]
     #[test]
     fn add_new_documents() {
         let mut index = TempIndex::new();
@@ -236,7 +235,6 @@ mod tests {
         db_snap!(index, word_prefix_pair_proximity_docids, "update");
         db_snap!(index, prefix_word_pair_proximity_docids, "update");
     }
-    #[ignore]
     #[test]
     fn batch_bug_3043() {
         // https://github.com/meilisearch/meilisearch/issues/3043
@@ -285,7 +283,6 @@ mod tests {
         db_snap!(index, prefix_word_pair_proximity_docids);
     }
 
-    #[ignore]
     #[test]
     fn hard_delete_and_reupdate() {
         let mut index = TempIndex::new();
@@ -360,7 +357,6 @@ mod tests {
         db_snap!(index, prefix_word_pair_proximity_docids, "reupdate");
     }
 
-    #[ignore]
     #[test]
     fn replace_hard_deletion() {
         let mut index = TempIndex::new();

From 6dab826908b815c5e62e63f8848f22cf7196e5de Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Wed, 8 Nov 2023 11:52:08 +0100
Subject: [PATCH 108/127] Reactivate prefix databases

---
 milli/src/update/index_documents/mod.rs | 56 ++++++++++++++++++++-----
 1 file changed, 46 insertions(+), 10 deletions(-)

diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 2be410ace..f5fbe2797 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -381,12 +381,48 @@ where
             total_databases: TOTAL_POSTING_DATABASE_COUNT,
         });
 
+        let mut word_pair_proximity_docids = None;
+        let mut word_position_docids = None;
+        let mut word_fid_docids = None;
+        let mut word_docids = None;
+        let mut exact_word_docids = None;
+
         for result in lmdb_writer_rx {
             if (self.should_abort)() {
                 return Err(Error::InternalError(InternalError::AbortedIndexation));
             }
 
-            let typed_chunk = result?;
+            let typed_chunk = match result? {
+                TypedChunk::WordDocids {
+                    word_docids_reader,
+                    exact_word_docids_reader,
+                    word_fid_docids_reader,
+                } => {
+                    let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? };
+                    word_docids = Some(cloneable_chunk);
+                    let cloneable_chunk =
+                        unsafe { as_cloneable_grenad(&exact_word_docids_reader)? };
+                    exact_word_docids = Some(cloneable_chunk);
+                    let cloneable_chunk = unsafe { as_cloneable_grenad(&word_fid_docids_reader)? };
+                    word_fid_docids = Some(cloneable_chunk);
+                    TypedChunk::WordDocids {
+                        word_docids_reader,
+                        exact_word_docids_reader,
+                        word_fid_docids_reader,
+                    }
+                }
+                TypedChunk::WordPairProximityDocids(chunk) => {
+                    let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
+                    word_pair_proximity_docids = Some(cloneable_chunk);
+                    TypedChunk::WordPairProximityDocids(chunk)
+                }
+                TypedChunk::WordPositionDocids(chunk) => {
+                    let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
+                    word_position_docids = Some(cloneable_chunk);
+                    TypedChunk::WordPositionDocids(chunk)
+                }
+                otherwise => otherwise,
+            };
 
             // FIXME: return newly added as well as newly deleted documents
             let (docids, is_merged_database) =
@@ -417,17 +453,17 @@ where
 
         // We write the primary key field id into the main database
         self.index.put_primary_key(self.wtxn, &primary_key)?;
+        let number_of_documents = self.index.number_of_documents(self.wtxn)?;
 
-        // TODO: reactivate prefix DB with diff-indexing
-        // self.execute_prefix_databases(
-        //     word_docids,
-        //     exact_word_docids,
-        //     word_pair_proximity_docids,
-        //     word_position_docids,
-        //     word_fid_docids,
-        // )?;
+        self.execute_prefix_databases(
+            word_docids,
+            exact_word_docids,
+            word_pair_proximity_docids,
+            word_position_docids,
+            word_fid_docids,
+        )?;
 
-        self.index.number_of_documents(self.wtxn)
+        Ok(number_of_documents)
     }
 
     #[logging_timer::time("IndexDocuments::{}")]

From 688266c83e59d4c311b070fe8d274ac071cafae3 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Wed, 8 Nov 2023 14:16:01 +0100
Subject: [PATCH 109/127] Remove word pair proximity prefix cache and compute
 it at search time

---
 milli/src/index.rs                            |  14 +-
 milli/src/search/new/db_cache.rs              |  62 +-
 milli/src/update/clear_documents.rs           |   5 -
 milli/src/update/index_documents/mod.rs       |  39 +-
 milli/src/update/mod.rs                       |   5 -
 milli/src/update/prefix_word_pairs/mod.rs     | 418 ----------
 .../update/prefix_word_pairs/prefix_word.rs   | 182 -----
 .../update/prefix_word_pairs/word_prefix.rs   | 728 ------------------
 8 files changed, 41 insertions(+), 1412 deletions(-)
 delete mode 100644 milli/src/update/prefix_word_pairs/mod.rs
 delete mode 100644 milli/src/update/prefix_word_pairs/prefix_word.rs
 delete mode 100644 milli/src/update/prefix_word_pairs/word_prefix.rs

diff --git a/milli/src/index.rs b/milli/src/index.rs
index 86ef6105b..5023d8fa5 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -83,8 +83,6 @@ pub mod db_name {
     pub const EXTERNAL_DOCUMENTS_IDS: &str = "external-documents-ids";
     pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions";
     pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids";
-    pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids";
-    pub const PREFIX_WORD_PAIR_PROXIMITY_DOCIDS: &str = "prefix-word-pair-proximity-docids";
     pub const WORD_POSITION_DOCIDS: &str = "word-position-docids";
     pub const WORD_FIELD_ID_DOCIDS: &str = "word-field-id-docids";
     pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids";
@@ -129,10 +127,6 @@ pub struct Index {
 
     /// Maps the proximity between a pair of words with all the docids where this relation appears.
     pub word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
-    /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears.
-    pub word_prefix_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
-    /// Maps the proximity between a pair of prefix and word with all the docids where this relation appears.
-    pub prefix_word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
 
     /// Maps the word and the position with the docids that corresponds to it.
     pub word_position_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
@@ -186,7 +180,7 @@ impl Index {
     ) -> Result<Index> {
         use db_name::*;
 
-        options.max_dbs(26);
+        options.max_dbs(24);
         unsafe { options.flag(Flags::MdbAlwaysFreePages) };
 
         let env = options.open(path)?;
@@ -203,10 +197,6 @@ impl Index {
             env.create_database(&mut wtxn, Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
         let script_language_docids =
             env.create_database(&mut wtxn, Some(SCRIPT_LANGUAGE_DOCIDS))?;
-        let word_prefix_pair_proximity_docids =
-            env.create_database(&mut wtxn, Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?;
-        let prefix_word_pair_proximity_docids =
-            env.create_database(&mut wtxn, Some(PREFIX_WORD_PAIR_PROXIMITY_DOCIDS))?;
         let word_position_docids = env.create_database(&mut wtxn, Some(WORD_POSITION_DOCIDS))?;
         let word_fid_docids = env.create_database(&mut wtxn, Some(WORD_FIELD_ID_DOCIDS))?;
         let field_id_word_count_docids =
@@ -247,8 +237,6 @@ impl Index {
             exact_word_prefix_docids,
             word_pair_proximity_docids,
             script_language_docids,
-            word_prefix_pair_proximity_docids,
-            prefix_word_pair_proximity_docids,
             word_position_docids,
             word_fid_docids,
             word_prefix_position_docids,
diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs
index 3376cebb2..2c670658d 100644
--- a/milli/src/search/new/db_cache.rs
+++ b/milli/src/search/new/db_cache.rs
@@ -11,7 +11,9 @@ use super::interner::Interned;
 use super::Word;
 use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec};
 use crate::update::{merge_cbo_roaring_bitmaps, MergeFn};
-use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext};
+use crate::{
+    CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext, U8StrStrCodec,
+};
 
 /// A cache storing pointers to values in the LMDB databases.
 ///
@@ -23,7 +25,7 @@ pub struct DatabaseCache<'ctx> {
     pub word_pair_proximity_docids:
         FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>,
     pub word_prefix_pair_proximity_docids:
-        FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>,
+        FxHashMap<(u8, Interned<String>, Interned<String>), Option<RoaringBitmap>>,
     pub prefix_word_pair_proximity_docids:
         FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>,
     pub word_docids: FxHashMap<Interned<String>, Option<Cow<'ctx, [u8]>>>,
@@ -295,35 +297,47 @@ impl<'ctx> SearchContext<'ctx> {
         prefix2: Interned<String>,
         proximity: u8,
     ) -> Result<Option<RoaringBitmap>> {
-        DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
-            self.txn,
-            (proximity, word1, prefix2),
-            &(
-                proximity,
-                self.word_interner.get(word1).as_str(),
-                self.word_interner.get(prefix2).as_str(),
-            ),
-            &mut self.db_cache.word_prefix_pair_proximity_docids,
-            self.index.word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>(),
-        )
+        let docids = match self
+            .db_cache
+            .word_prefix_pair_proximity_docids
+            .entry((proximity, word1, prefix2))
+        {
+            Entry::Occupied(docids) => docids.get().clone(),
+            Entry::Vacant(entry) => {
+                // Cache miss: union the docids of all keys with this prefix and cache the result.
+                let key = U8StrStrCodec::bytes_encode(&(
+                    proximity,
+                    self.word_interner.get(word1).as_str(),
+                    self.word_interner.get(prefix2).as_str(),
+                ))
+                .unwrap()
+                .into_owned();
+                let mut prefix_docids = RoaringBitmap::new();
+                let remap_key_type = self
+                    .index
+                    .word_pair_proximity_docids
+                    .remap_key_type::<ByteSlice>()
+                    .prefix_iter(self.txn, &key)?;
+                for result in remap_key_type {
+                    let (_, docids) = result?;
+
+                    prefix_docids |= docids;
+                }
+                entry.insert(Some(prefix_docids.clone()));
+                Some(prefix_docids)
+            }
+        };
+        Ok(docids)
     }
+
     pub fn get_db_prefix_word_pair_proximity_docids(
         &mut self,
         left_prefix: Interned<String>,
         right: Interned<String>,
         proximity: u8,
     ) -> Result<Option<RoaringBitmap>> {
-        DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
-            self.txn,
-            (proximity, left_prefix, right),
-            &(
-                proximity,
-                self.word_interner.get(left_prefix).as_str(),
-                self.word_interner.get(right).as_str(),
-            ),
-            &mut self.db_cache.prefix_word_pair_proximity_docids,
-            self.index.prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
-        )
+        // Only accept exact matches on reversed positions: the prefix is looked up as a full word.
+        self.get_db_word_pair_proximity_docids(left_prefix, right, proximity)
     }
 
     pub fn get_db_word_fid_docids(
diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs
index 265c6f15a..afe0191b1 100644
--- a/milli/src/update/clear_documents.rs
+++ b/milli/src/update/clear_documents.rs
@@ -26,8 +26,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
             word_prefix_docids,
             exact_word_prefix_docids,
             word_pair_proximity_docids,
-            word_prefix_pair_proximity_docids,
-            prefix_word_pair_proximity_docids,
             word_position_docids,
             word_fid_docids,
             field_id_word_count_docids,
@@ -68,8 +66,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
         word_prefix_docids.clear(self.wtxn)?;
         exact_word_prefix_docids.clear(self.wtxn)?;
         word_pair_proximity_docids.clear(self.wtxn)?;
-        word_prefix_pair_proximity_docids.clear(self.wtxn)?;
-        prefix_word_pair_proximity_docids.clear(self.wtxn)?;
         word_position_docids.clear(self.wtxn)?;
         word_fid_docids.clear(self.wtxn)?;
         field_id_word_count_docids.clear(self.wtxn)?;
@@ -132,7 +128,6 @@ mod tests {
         assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap());
         assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap());
         assert!(index.field_id_word_count_docids.is_empty(&rtxn).unwrap());
-        assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap());
         assert!(index.facet_id_f64_docids.is_empty(&rtxn).unwrap());
         assert!(index.facet_id_string_docids.is_empty(&rtxn).unwrap());
         assert!(index.field_id_docid_facet_f64s.is_empty(&rtxn).unwrap());
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index f5fbe2797..8552cf52b 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -35,13 +35,12 @@ use crate::documents::{obkv_to_object, DocumentsBatchReader};
 use crate::error::{Error, InternalError, UserError};
 pub use crate::update::index_documents::helpers::CursorClonableMmap;
 use crate::update::{
-    IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, WordPrefixDocids,
-    WordPrefixIntegerDocids, WordsPrefixesFst,
+    IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
 };
 use crate::{CboRoaringBitmapCodec, Index, Result};
 
 static MERGED_DATABASE_COUNT: usize = 7;
-static PREFIX_DATABASE_COUNT: usize = 5;
+static PREFIX_DATABASE_COUNT: usize = 4;
 static TOTAL_POSTING_DATABASE_COUNT: usize = MERGED_DATABASE_COUNT + PREFIX_DATABASE_COUNT;
 
 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -381,7 +380,6 @@ where
             total_databases: TOTAL_POSTING_DATABASE_COUNT,
         });
 
-        let mut word_pair_proximity_docids = None;
         let mut word_position_docids = None;
         let mut word_fid_docids = None;
         let mut word_docids = None;
@@ -411,11 +409,6 @@ where
                         word_fid_docids_reader,
                     }
                 }
-                TypedChunk::WordPairProximityDocids(chunk) => {
-                    let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
-                    word_pair_proximity_docids = Some(cloneable_chunk);
-                    TypedChunk::WordPairProximityDocids(chunk)
-                }
                 TypedChunk::WordPositionDocids(chunk) => {
                     let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
                     word_position_docids = Some(cloneable_chunk);
@@ -458,7 +451,6 @@ where
         self.execute_prefix_databases(
             word_docids,
             exact_word_docids,
-            word_pair_proximity_docids,
             word_position_docids,
             word_fid_docids,
         )?;
@@ -471,7 +463,6 @@ where
         self,
         word_docids: Option<grenad::Reader<CursorClonableMmap>>,
         exact_word_docids: Option<grenad::Reader<CursorClonableMmap>>,
-        word_pair_proximity_docids: Option<grenad::Reader<CursorClonableMmap>>,
         word_position_docids: Option<grenad::Reader<CursorClonableMmap>>,
         word_fid_docids: Option<grenad::Reader<CursorClonableMmap>>,
     ) -> Result<()>
@@ -592,32 +583,6 @@ where
             total_databases: TOTAL_POSTING_DATABASE_COUNT,
         });
 
-        if let Some(word_pair_proximity_docids) = word_pair_proximity_docids {
-            // Run the word prefix pair proximity docids update operation.
-            PrefixWordPairsProximityDocids::new(
-                self.wtxn,
-                self.index,
-                self.indexer_config.chunk_compression_type,
-                self.indexer_config.chunk_compression_level,
-            )
-            .execute(
-                word_pair_proximity_docids,
-                &new_prefix_fst_words,
-                &common_prefix_fst_words,
-                &del_prefix_fst_words,
-            )?;
-        }
-
-        if (self.should_abort)() {
-            return Err(Error::InternalError(InternalError::AbortedIndexation));
-        }
-
-        databases_seen += 1;
-        (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
-            databases_seen,
-            total_databases: TOTAL_POSTING_DATABASE_COUNT,
-        });
-
         if let Some(word_position_docids) = word_position_docids {
             // Run the words prefix position docids update operation.
             let mut builder = WordPrefixIntegerDocids::new(
diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs
index dd8851ccb..eb2b6e69a 100644
--- a/milli/src/update/mod.rs
+++ b/milli/src/update/mod.rs
@@ -8,10 +8,6 @@ pub use self::index_documents::{
     MergeFn,
 };
 pub use self::indexer_config::IndexerConfig;
-pub use self::prefix_word_pairs::{
-    PrefixWordPairsProximityDocids, MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB,
-    MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB,
-};
 pub use self::settings::{Setting, Settings};
 pub use self::update_step::UpdateIndexingStep;
 pub use self::word_prefix_docids::WordPrefixDocids;
@@ -24,7 +20,6 @@ pub(crate) mod del_add;
 pub(crate) mod facet;
 mod index_documents;
 mod indexer_config;
-mod prefix_word_pairs;
 mod settings;
 mod update_step;
 mod word_prefix_docids;
diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs
deleted file mode 100644
index e718f9b77..000000000
--- a/milli/src/update/prefix_word_pairs/mod.rs
+++ /dev/null
@@ -1,418 +0,0 @@
-use std::borrow::Cow;
-use std::collections::HashSet;
-use std::io::{BufReader, BufWriter};
-
-use grenad::CompressionType;
-use heed::types::ByteSlice;
-
-use super::index_documents::{merge_cbo_roaring_bitmaps, CursorClonableMmap};
-use crate::{Index, Result};
-
-mod prefix_word;
-mod word_prefix;
-
-pub use prefix_word::index_prefix_word_database;
-pub use word_prefix::index_word_prefix_database;
-
-pub const MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB: u8 = 4;
-pub const MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB: usize = 2;
-
-pub struct PrefixWordPairsProximityDocids<'t, 'u, 'i> {
-    wtxn: &'t mut heed::RwTxn<'i, 'u>,
-    index: &'i Index,
-    max_proximity: u8,
-    max_prefix_length: usize,
-    chunk_compression_type: CompressionType,
-    chunk_compression_level: Option<u32>,
-}
-impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> {
-    pub fn new(
-        wtxn: &'t mut heed::RwTxn<'i, 'u>,
-        index: &'i Index,
-        chunk_compression_type: CompressionType,
-        chunk_compression_level: Option<u32>,
-    ) -> Self {
-        Self {
-            wtxn,
-            index,
-            max_proximity: MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB,
-            max_prefix_length: MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB,
-            chunk_compression_type,
-            chunk_compression_level,
-        }
-    }
-
-    #[logging_timer::time("WordPrefixPairProximityDocids::{}")]
-    pub fn execute<'a>(
-        self,
-        new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>,
-        new_prefix_fst_words: &'a [String],
-        common_prefix_fst_words: &[&'a [String]],
-        del_prefix_fst_words: &HashSet<Vec<u8>>,
-    ) -> Result<()> {
-        puffin::profile_function!();
-
-        index_word_prefix_database(
-            self.wtxn,
-            self.index.word_pair_proximity_docids,
-            self.index.word_prefix_pair_proximity_docids,
-            self.max_proximity,
-            self.max_prefix_length,
-            new_word_pair_proximity_docids.clone(),
-            new_prefix_fst_words,
-            common_prefix_fst_words,
-            del_prefix_fst_words,
-            self.chunk_compression_type,
-            self.chunk_compression_level,
-        )?;
-
-        index_prefix_word_database(
-            self.wtxn,
-            self.index.word_pair_proximity_docids,
-            self.index.prefix_word_pair_proximity_docids,
-            self.max_proximity,
-            self.max_prefix_length,
-            new_word_pair_proximity_docids,
-            new_prefix_fst_words,
-            common_prefix_fst_words,
-            del_prefix_fst_words,
-            self.chunk_compression_type,
-            self.chunk_compression_level,
-        )?;
-
-        Ok(())
-    }
-}
-
-// This is adapted from `sorter_into_lmdb_database`
-pub fn insert_into_database(
-    wtxn: &mut heed::RwTxn,
-    database: heed::PolyDatabase,
-    new_key: &[u8],
-    new_value: &[u8],
-) -> Result<()> {
-    let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, new_key)?;
-    match iter.next().transpose()? {
-        Some((key, old_val)) if new_key == key => {
-            let val =
-                merge_cbo_roaring_bitmaps(key, &[Cow::Borrowed(old_val), Cow::Borrowed(new_value)])
-                    .map_err(|_| {
-                        // TODO just wrap this error?
-                        crate::error::InternalError::IndexingMergingKeys {
-                            process: "get-put-merge",
-                        }
-                    })?;
-            // safety: we use the new_key, not the one from the database iterator, to avoid undefined behaviour
-            unsafe { iter.put_current(new_key, &val)? };
-        }
-        _ => {
-            drop(iter);
-            database.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, new_value)?;
-        }
-    }
-    Ok(())
-}
-
-// This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`,
-// but it uses `append` if the database is empty, and it assumes that the values in the
-// writer don't conflict with values in the database.
-pub fn write_into_lmdb_database_without_merging(
-    wtxn: &mut heed::RwTxn,
-    database: heed::PolyDatabase,
-    writer: grenad::Writer<BufWriter<std::fs::File>>,
-) -> Result<()> {
-    let file = writer.into_inner()?.into_inner().map_err(|err| err.into_error())?;
-    let reader = grenad::Reader::new(BufReader::new(file))?;
-    if database.is_empty(wtxn)? {
-        let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;
-        let mut cursor = reader.into_cursor()?;
-        while let Some((k, v)) = cursor.move_on_next()? {
-            // safety: the key comes from the grenad reader, not the database
-            unsafe { out_iter.append(k, v)? };
-        }
-    } else {
-        let mut cursor = reader.into_cursor()?;
-        while let Some((k, v)) = cursor.move_on_next()? {
-            database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
-        }
-    }
-    Ok(())
-}
-
-#[cfg(test)]
-mod tests {
-    use std::io::Cursor;
-
-    use crate::db_snap;
-    use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
-    use crate::index::tests::TempIndex;
-    use crate::update::IndexDocumentsMethod;
-
-    fn documents_with_enough_different_words_for_prefixes(
-        prefixes: &[&str],
-        start_id: usize,
-    ) -> Vec<crate::Object> {
-        let mut documents = Vec::new();
-        let mut id = start_id;
-        for prefix in prefixes {
-            for i in 0..50 {
-                documents.push(
-                    serde_json::json!({
-                        "id": id,
-                        "text": format!("{prefix}{i:x}"),
-                    })
-                    .as_object()
-                    .unwrap()
-                    .clone(),
-                );
-                id += 1;
-            }
-        }
-        documents
-    }
-
-    #[test]
-    fn add_new_documents() {
-        let mut index = TempIndex::new();
-        index.index_documents_config.words_prefix_threshold = Some(50);
-        index.index_documents_config.autogenerate_docids = true;
-
-        index
-            .update_settings(|settings| {
-                settings.set_searchable_fields(vec!["text".to_owned()]);
-            })
-            .unwrap();
-
-        let batch_reader_from_documents = |documents| {
-            let mut builder = DocumentsBatchBuilder::new(Vec::new());
-            for object in documents {
-                builder.append_json_object(&object).unwrap();
-            }
-            DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
-        };
-
-        let mut documents = documents_with_enough_different_words_for_prefixes(&["a", "be"], 0);
-        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
-        documents.push(
-            serde_json::json!({
-                "id": "9000",
-                "text": "At an amazing and beautiful house"
-            })
-            .as_object()
-            .unwrap()
-            .clone(),
-        );
-        documents.push(
-            serde_json::json!({
-                "id": "9001",
-                "text": "The bell rings at 5 am"
-            })
-            .as_object()
-            .unwrap()
-            .clone(),
-        );
-
-        let documents = batch_reader_from_documents(documents);
-        index.add_documents(documents).unwrap();
-
-        db_snap!(index, word_prefix_pair_proximity_docids, "initial");
-        db_snap!(index, prefix_word_pair_proximity_docids, "initial");
-
-        let mut documents = documents_with_enough_different_words_for_prefixes(&["am", "an"], 100);
-        documents.push(
-            serde_json::json!({
-                "id": "9002",
-                "text": "At an extraordinary house"
-            })
-            .as_object()
-            .unwrap()
-            .clone(),
-        );
-        let documents = batch_reader_from_documents(documents);
-        index.add_documents(documents).unwrap();
-
-        db_snap!(index, word_pair_proximity_docids, "update");
-        db_snap!(index, word_prefix_pair_proximity_docids, "update");
-        db_snap!(index, prefix_word_pair_proximity_docids, "update");
-    }
-    #[test]
-    fn batch_bug_3043() {
-        // https://github.com/meilisearch/meilisearch/issues/3043
-        let mut index = TempIndex::new();
-        index.index_documents_config.words_prefix_threshold = Some(50);
-        index.index_documents_config.autogenerate_docids = true;
-
-        index
-            .update_settings(|settings| {
-                settings.set_searchable_fields(vec!["text".to_owned()]);
-            })
-            .unwrap();
-
-        let batch_reader_from_documents = |documents| {
-            let mut builder = DocumentsBatchBuilder::new(Vec::new());
-            for object in documents {
-                builder.append_json_object(&object).unwrap();
-            }
-            DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
-        };
-
-        let mut documents = documents_with_enough_different_words_for_prefixes(&["y"], 0);
-        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
-        documents.push(
-            serde_json::json!({
-                "text": "x y"
-            })
-            .as_object()
-            .unwrap()
-            .clone(),
-        );
-        documents.push(
-            serde_json::json!({
-                "text": "x a y"
-            })
-            .as_object()
-            .unwrap()
-            .clone(),
-        );
-
-        let documents = batch_reader_from_documents(documents);
-        index.add_documents(documents).unwrap();
-
-        db_snap!(index, word_pair_proximity_docids);
-        db_snap!(index, word_prefix_pair_proximity_docids);
-        db_snap!(index, prefix_word_pair_proximity_docids);
-    }
-
-    #[test]
-    fn hard_delete_and_reupdate() {
-        let mut index = TempIndex::new();
-        index.index_documents_config.words_prefix_threshold = Some(50);
-
-        index
-            .update_settings(|settings| {
-                settings.set_primary_key("id".to_owned());
-                settings.set_searchable_fields(vec!["text".to_owned()]);
-            })
-            .unwrap();
-
-        let batch_reader_from_documents = |documents| {
-            let mut builder = DocumentsBatchBuilder::new(Vec::new());
-            for object in documents {
-                builder.append_json_object(&object).unwrap();
-            }
-            DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
-        };
-
-        let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0);
-        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
-        documents.push(
-            serde_json::json!({
-                "id": 9000,
-                "text": "At an amazing and beautiful house"
-            })
-            .as_object()
-            .unwrap()
-            .clone(),
-        );
-        documents.push(
-            serde_json::json!({
-                "id": 9001,
-                "text": "The bell rings at 5 am"
-            })
-            .as_object()
-            .unwrap()
-            .clone(),
-        );
-
-        let documents = batch_reader_from_documents(documents);
-        index.add_documents(documents).unwrap();
-
-        db_snap!(index, documents_ids, "initial");
-        db_snap!(index, word_docids, "initial");
-        db_snap!(index, word_prefix_pair_proximity_docids, "initial");
-        db_snap!(index, prefix_word_pair_proximity_docids, "initial");
-
-        index.delete_document("9000");
-
-        db_snap!(index, documents_ids, "first_delete");
-        db_snap!(index, word_docids, "first_delete");
-        db_snap!(index, word_prefix_pair_proximity_docids, "first_delete");
-        db_snap!(index, prefix_word_pair_proximity_docids, "first_delete");
-
-        index.delete_documents((0..50).map(|id| id.to_string()).collect());
-
-        db_snap!(index, documents_ids, "second_delete");
-        db_snap!(index, word_docids, "second_delete");
-        db_snap!(index, word_prefix_pair_proximity_docids, "second_delete");
-        db_snap!(index, prefix_word_pair_proximity_docids, "second_delete");
-
-        let documents = documents_with_enough_different_words_for_prefixes(&["b"], 1000);
-        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
-
-        index.add_documents(batch_reader_from_documents(documents)).unwrap();
-
-        db_snap!(index, documents_ids, "reupdate");
-        db_snap!(index, word_docids, "reupdate");
-        db_snap!(index, word_prefix_pair_proximity_docids, "reupdate");
-        db_snap!(index, prefix_word_pair_proximity_docids, "reupdate");
-    }
-
-    #[test]
-    fn replace_hard_deletion() {
-        let mut index = TempIndex::new();
-        index.index_documents_config.words_prefix_threshold = Some(50);
-        index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments;
-
-        index
-            .update_settings(|settings| {
-                settings.set_primary_key("id".to_owned());
-                settings.set_searchable_fields(vec!["text".to_owned()]);
-            })
-            .unwrap();
-
-        let batch_reader_from_documents = |documents| {
-            let mut builder = DocumentsBatchBuilder::new(Vec::new());
-            for object in documents {
-                builder.append_json_object(&object).unwrap();
-            }
-            DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
-        };
-
-        let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0);
-        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
-        documents.push(
-            serde_json::json!({
-                "id": 9000,
-                "text": "At an amazing house"
-            })
-            .as_object()
-            .unwrap()
-            .clone(),
-        );
-        documents.push(
-            serde_json::json!({
-                "id": 9001,
-                "text": "The bell rings"
-            })
-            .as_object()
-            .unwrap()
-            .clone(),
-        );
-
-        let documents = batch_reader_from_documents(documents);
-        index.add_documents(documents).unwrap();
-
-        db_snap!(index, documents_ids, "initial");
-        db_snap!(index, word_docids, "initial");
-        db_snap!(index, word_prefix_pair_proximity_docids, "initial");
-        db_snap!(index, prefix_word_pair_proximity_docids, "initial");
-
-        let documents = documents_with_enough_different_words_for_prefixes(&["b"], 0);
-        index.add_documents(batch_reader_from_documents(documents)).unwrap();
-
-        db_snap!(index, documents_ids, "replaced");
-        db_snap!(index, word_docids, "replaced");
-        db_snap!(index, word_prefix_pair_proximity_docids, "replaced");
-        db_snap!(index, prefix_word_pair_proximity_docids, "replaced");
-    }
-}
diff --git a/milli/src/update/prefix_word_pairs/prefix_word.rs b/milli/src/update/prefix_word_pairs/prefix_word.rs
deleted file mode 100644
index 1ec66d010..000000000
--- a/milli/src/update/prefix_word_pairs/prefix_word.rs
+++ /dev/null
@@ -1,182 +0,0 @@
-use std::borrow::Cow;
-use std::collections::{BTreeMap, HashSet};
-
-use grenad::CompressionType;
-use heed::types::ByteSlice;
-use heed::BytesDecode;
-use log::debug;
-
-use crate::update::index_documents::{create_writer, CursorClonableMmap};
-use crate::update::prefix_word_pairs::{
-    insert_into_database, write_into_lmdb_database_without_merging,
-};
-use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec};
-
-#[allow(clippy::too_many_arguments)]
-#[logging_timer::time]
-pub fn index_prefix_word_database(
-    wtxn: &mut heed::RwTxn,
-    word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
-    prefix_word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
-    max_proximity: u8,
-    max_prefix_length: usize,
-    new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>,
-    new_prefix_fst_words: &[String],
-    common_prefix_fst_words: &[&[String]],
-    del_prefix_fst_words: &HashSet<Vec<u8>>,
-    chunk_compression_type: CompressionType,
-    chunk_compression_level: Option<u32>,
-) -> Result<()> {
-    puffin::profile_function!();
-
-    let max_proximity = max_proximity - 1;
-    debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
-
-    let common_prefixes: Vec<_> = common_prefix_fst_words
-        .iter()
-        .flat_map(|s| s.iter())
-        .map(|s| s.as_str())
-        .filter(|s| s.len() <= max_prefix_length)
-        .collect();
-
-    for proximity in 1..max_proximity {
-        for prefix in common_prefixes.iter() {
-            let mut prefix_key = vec![proximity];
-            prefix_key.extend_from_slice(prefix.as_bytes());
-            let mut cursor = new_word_pair_proximity_docids.clone().into_prefix_iter(prefix_key)?;
-            // This is the core of the algorithm
-            execute_on_word_pairs_and_prefixes(
-                proximity,
-                prefix.as_bytes(),
-                // the next two arguments tell how to iterate over the new word pairs
-                &mut cursor,
-                |cursor| {
-                    if let Some((key, value)) = cursor.next()? {
-                        let (_, _, word2) = UncheckedU8StrStrCodec::bytes_decode(key)
-                            .ok_or(heed::Error::Decoding)?;
-                        Ok(Some((word2, value)))
-                    } else {
-                        Ok(None)
-                    }
-                },
-                // and this argument tells what to do with each new key (proximity, prefix, word2) and value (roaring bitmap)
-                |key, value| {
-                    insert_into_database(
-                        wtxn,
-                        *prefix_word_pair_proximity_docids.as_polymorph(),
-                        key,
-                        value,
-                    )
-                },
-            )?;
-        }
-    }
-
-    // Now we do the same thing with the new prefixes and all word pairs in the DB
-    let new_prefixes: Vec<_> = new_prefix_fst_words
-        .iter()
-        .map(|s| s.as_str())
-        .filter(|s| s.len() <= max_prefix_length)
-        .collect();
-
-    // Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity)
-    // element in an intermediary grenad
-    let mut writer =
-        create_writer(chunk_compression_type, chunk_compression_level, tempfile::tempfile()?);
-
-    for proximity in 1..max_proximity {
-        for prefix in new_prefixes.iter() {
-            let mut prefix_key = vec![proximity];
-            prefix_key.extend_from_slice(prefix.as_bytes());
-            let mut db_iter = word_pair_proximity_docids
-                .as_polymorph()
-                .prefix_iter::<_, ByteSlice, ByteSlice>(wtxn, prefix_key.as_slice())?
-                .remap_key_type::<UncheckedU8StrStrCodec>();
-            execute_on_word_pairs_and_prefixes(
-                proximity,
-                prefix.as_bytes(),
-                &mut db_iter,
-                |db_iter| {
-                    db_iter
-                        .next()
-                        .transpose()
-                        .map(|x| x.map(|((_, _, word2), value)| (word2, value)))
-                        .map_err(|e| e.into())
-                },
-                |key, value| writer.insert(key, value).map_err(|e| e.into()),
-            )?;
-            drop(db_iter);
-        }
-    }
-
-    // and then we write the grenad into the DB
-    // Since the grenad contains only new prefixes, we know in advance that none
-    // of its elements already exist in the DB, thus there is no need to specify
-    // how to merge conflicting elements
-    write_into_lmdb_database_without_merging(
-        wtxn,
-        *prefix_word_pair_proximity_docids.as_polymorph(),
-        writer,
-    )?;
-
-    // All of the word prefix pairs in the database that have a w2
-    // that is contained in the `suppr_pw` set must be removed as well.
-    if !del_prefix_fst_words.is_empty() {
-        let mut iter =
-            prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter_mut(wtxn)?;
-        while let Some(((_, prefix, _), _)) = iter.next().transpose()? {
-            if del_prefix_fst_words.contains(prefix.as_bytes()) {
-                // Delete this entry as the w2 prefix is no more in the words prefix fst.
-                unsafe { iter.del_current()? };
-            }
-        }
-    }
-
-    Ok(())
-}
-
-/// This is the core of the algorithm to initialise the Prefix Word Pair Proximity Docids database.
-///
-/// Its arguments are:
-/// - an iterator over the words following the given `prefix` with the given `proximity`
-/// - a closure to describe how to handle the new computed (proximity, prefix, word2) elements
-fn execute_on_word_pairs_and_prefixes<I>(
-    proximity: u8,
-    prefix: &[u8],
-    iter: &mut I,
-    mut next_word2_and_docids: impl for<'a> FnMut(&'a mut I) -> Result<Option<(&'a [u8], &'a [u8])>>,
-    mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>,
-) -> Result<()> {
-    let mut batch: BTreeMap<Vec<u8>, Vec<Cow<'static, [u8]>>> = BTreeMap::default();
-
-    // Memory usage check:
-    // The content of the loop will be called for each `word2` that follows a word beginning
-    // with `prefix` with the given proximity.
-    // In practice, I don't think the batch can ever get too big.
-    while let Some((word2, docids)) = next_word2_and_docids(iter)? {
-        let entry = batch.entry(word2.to_owned()).or_default();
-        entry.push(Cow::Owned(docids.to_owned()));
-    }
-
-    let mut key_buffer = Vec::with_capacity(512);
-    key_buffer.push(proximity);
-    key_buffer.extend_from_slice(prefix);
-    key_buffer.push(0);
-
-    let mut value_buffer = Vec::with_capacity(65_536);
-
-    for (word2, docids) in batch {
-        key_buffer.truncate(prefix.len() + 2);
-        value_buffer.clear();
-
-        key_buffer.extend_from_slice(&word2);
-        let data = if docids.len() > 1 {
-            CboRoaringBitmapCodec::merge_into(&docids, &mut value_buffer)?;
-            value_buffer.as_slice()
-        } else {
-            &docids[0]
-        };
-        insert(key_buffer.as_slice(), data)?;
-    }
-    Ok(())
-}
diff --git a/milli/src/update/prefix_word_pairs/word_prefix.rs b/milli/src/update/prefix_word_pairs/word_prefix.rs
deleted file mode 100644
index 570adece9..000000000
--- a/milli/src/update/prefix_word_pairs/word_prefix.rs
+++ /dev/null
@@ -1,728 +0,0 @@
-/*!
-The word-prefix-pair-proximity-docids database is a database whose keys are of
-the form `(proximity, word, prefix)` and the values are roaring bitmaps of
-the documents which contain `word` followed by another word starting with
-`prefix` at a distance of `proximity`.
-
-The prefixes present in this database are only those that correspond to many
-different words in the documents.
-
-## How is it created/updated? (simplified version)
-To compute it, we have access to (mainly) two inputs:
-
-* a list of sorted prefixes, such as:
-```text
-c
-ca
-cat
-d
-do
-dog
-```
-Note that only prefixes which correspond to more than a certain number of
-different words from the database are included in this list.
-
-* a sorted list of proximities and word pairs (the proximity is the distance between the two words),
-associated with a roaring bitmap, such as:
-```text
-1 good doggo         -> docids1: [8]
-1 good door          -> docids2: [7, 19, 20]
-1 good ghost         -> docids3: [1]
-2 good dog           -> docids4: [2, 5, 6]
-2 horror cathedral   -> docids5: [1, 2]
-```
-
-I illustrate a simplified version of the algorithm to create the word-prefix
-pair-proximity database below:
-
-1. **Outer loop:** First, we iterate over each proximity and word pair:
-```text
-proximity: 1
-word1    : good
-word2    : doggo
-```
-2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are
-in the list of sorted prefixes. And we insert the key `prefix`
-and the value (`docids`) to a sorted map which we call the “batch”. For example,
-at the end of the first outer loop, we may have:
-```text
-Outer loop 1:
-------------------------------
-proximity: 1
-word1    : good
-word2    : doggo
-docids   : docids1
-
-prefixes: [d, do, dog]
-
-batch: [
-    d,   -> [docids1]
-    do   -> [docids1]
-    dog  -> [docids1]
-]
-```
-3. For illustration purpose, let's run through a second iteration of the outer loop:
-```text
-Outer loop 2:
-------------------------------
-proximity: 1
-word1    : good
-word2    : door
-docids   : docids2
-
-prefixes: [d, do, doo]
-
-batch: [
-    d   -> [docids1, docids2]
-    do  -> [docids1, docids2]
-    dog -> [docids1]
-    doo -> [docids2]
-]
-```
-Notice that there were some conflicts which were resolved by merging the
-conflicting values together. Also, an additional prefix was added at the
-end of the batch.
-
-4. On the third iteration of the outer loop, we have:
-```text
-Outer loop 3:
-------------------------------
-proximity: 1
-word1    : good
-word2    : ghost
-```
-Because `word2` begins with a different letter than the previous `word2`,
-we know that all the prefixes of `word2` are greater than the prefixes of the previous word2
-
-Therefore, we know that we can insert every element from the batch into the
-database before proceeding any further. This operation is called
-“flushing the batch”. Flushing the batch should also be done whenever:
-* `proximity` is different than the previous `proximity`.
-* `word1` is different than the previous `word1`.
-* `word2` starts with a different letter than the previous word2
-
-6. **Flushing the batch:** to flush the batch, we iterate over its elements:
-```text
-Flushing Batch loop 1:
-------------------------------
-proximity  : 1
-word1      : good
-prefix     : d
-
-docids   : [docids2, docids3]
-```
-We then merge the array of `docids` (of type `Vec<Vec<u8>>`) using
-`merge_cbo_roaring_bitmap` in order to get a single byte vector representing a
-roaring bitmap of all the document ids where `word1` is followed by `prefix`
-at a distance of `proximity`.
-Once we have done that, we insert `(proximity, word1, prefix) -> merged_docids`
-into the database.
-
-7. That's it! ... except...
-
-## How is it created/updated (continued)
-
-I lied a little bit about the input data. In reality, we get two sets of the
-inputs described above, which come from different places:
-
-* For the list of sorted prefixes, we have:
-    1. `new_prefixes`, which are all the prefixes that were not present in the
-    database before the insertion of the new documents
-
-    2. `common_prefixes` which are the prefixes that are present both in the
-    database and in the newly added documents
-
-* For the list of word pairs and proximities, we have:
-    1. `new_word_pairs`, which is the list of word pairs and their proximities
-    present in the newly added documents
-
-    2. `word_pairs_db`, which is the list of word pairs from the database.
-    This list includes all elements in `new_word_pairs` since `new_word_pairs`
-    was added to the database prior to calling the `WordPrefix::execute`
-    function.
-
-To update the prefix database correctly, we call the algorithm described earlier first
-on (`common_prefixes`, `new_word_pairs`) and then on (`new_prefixes`, `word_pairs_db`).
-Thus:
-
-1. For all the word pairs that were already present in the DB, we insert them
-again with the `new_prefixes`. Calling the algorithm on them with the
-`common_prefixes` would not result in any new data.
-
-2. For all the new word pairs, we insert them twice: first with the `common_prefixes`,
-and then, because they are part of `word_pairs_db`, with the `new_prefixes`.
-
-Note, also, that since we read data from the database when iterating over
-`word_pairs_db`, we cannot insert the computed word-prefix-pair-proximity-
-docids from the batch directly into the database (we would have a concurrent
-reader and writer). Therefore, when calling the algorithm on
-`(new_prefixes, word_pairs_db)`, we insert the computed
-`((proximity, word, prefix), docids)` elements in an intermediary grenad
-Writer instead of the DB. At the end of the outer loop, we finally read from
-the grenad and insert its elements in the database.
-*/
-
-use std::borrow::Cow;
-use std::collections::HashSet;
-
-use grenad::CompressionType;
-use heed::types::ByteSlice;
-use heed::BytesDecode;
-use log::debug;
-
-use crate::update::index_documents::{create_writer, CursorClonableMmap};
-use crate::update::prefix_word_pairs::{
-    insert_into_database, write_into_lmdb_database_without_merging,
-};
-use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec};
-
-#[allow(clippy::too_many_arguments)]
-#[logging_timer::time]
-pub fn index_word_prefix_database(
-    wtxn: &mut heed::RwTxn,
-    word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
-    word_prefix_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
-    max_proximity: u8,
-    max_prefix_length: usize,
-    new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>,
-    new_prefix_fst_words: &[String],
-    common_prefix_fst_words: &[&[String]],
-    del_prefix_fst_words: &HashSet<Vec<u8>>,
-    chunk_compression_type: CompressionType,
-    chunk_compression_level: Option<u32>,
-) -> Result<()> {
-    puffin::profile_function!();
-    debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
-
-    // Make a prefix trie from the common prefixes that are shorter than self.max_prefix_length
-    let prefixes = PrefixTrieNode::from_sorted_prefixes(
-        common_prefix_fst_words
-            .iter()
-            .flat_map(|s| s.iter())
-            .map(|s| s.as_str())
-            .filter(|s| s.len() <= max_prefix_length),
-    );
-
-    // If the prefix trie is not empty, then we can iterate over all new
-    // word pairs to look for new (proximity, word1, common_prefix) elements
-    // to insert in the DB
-    if !prefixes.is_empty() {
-        let mut cursor = new_word_pair_proximity_docids.into_cursor()?;
-        // This is the core of the algorithm
-        execute_on_word_pairs_and_prefixes(
-            // the first two arguments tell how to iterate over the new word pairs
-            &mut cursor,
-            |cursor| {
-                if let Some((key, value)) = cursor.move_on_next()? {
-                    let (proximity, word1, word2) =
-                        UncheckedU8StrStrCodec::bytes_decode(key).ok_or(heed::Error::Decoding)?;
-                    Ok(Some(((proximity, word1, word2), value)))
-                } else {
-                    Ok(None)
-                }
-            },
-            &prefixes,
-            max_proximity,
-            // and this argument tells what to do with each new key (proximity, word1, prefix) and value (roaring bitmap)
-            |key, value| {
-                insert_into_database(
-                    wtxn,
-                    *word_prefix_pair_proximity_docids.as_polymorph(),
-                    key,
-                    value,
-                )
-            },
-        )?;
-    }
-
-    // Now we do the same thing with the new prefixes and all word pairs in the DB
-
-    let prefixes = PrefixTrieNode::from_sorted_prefixes(
-        new_prefix_fst_words.iter().map(|s| s.as_str()).filter(|s| s.len() <= max_prefix_length),
-    );
-
-    if !prefixes.is_empty() {
-        let mut db_iter = word_pair_proximity_docids
-            .remap_key_type::<UncheckedU8StrStrCodec>()
-            .remap_data_type::<ByteSlice>()
-            .iter(wtxn)?;
-
-        // Since we read the DB, we can't write to it directly, so we add each new (proximity, word1, prefix)
-        // element in an intermediary grenad
-        let mut writer =
-            create_writer(chunk_compression_type, chunk_compression_level, tempfile::tempfile()?);
-
-        execute_on_word_pairs_and_prefixes(
-            &mut db_iter,
-            |db_iter| db_iter.next().transpose().map_err(|e| e.into()),
-            &prefixes,
-            max_proximity,
-            |key, value| writer.insert(key, value).map_err(|e| e.into()),
-        )?;
-        drop(db_iter);
-
-        // and then we write the grenad into the DB
-        // Since the grenad contains only new prefixes, we know in advance that none
-        // of its elements already exist in the DB, thus there is no need to specify
-        // how to merge conflicting elements
-        write_into_lmdb_database_without_merging(
-            wtxn,
-            *word_prefix_pair_proximity_docids.as_polymorph(),
-            writer,
-        )?;
-    }
-
-    // All of the word prefix pairs in the database that have a w2
-    // that is contained in the `suppr_pw` set must be removed as well.
-    if !del_prefix_fst_words.is_empty() {
-        let mut iter =
-            word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>().iter_mut(wtxn)?;
-        while let Some(((_, _, prefix), _)) = iter.next().transpose()? {
-            if del_prefix_fst_words.contains(prefix.as_bytes()) {
-                // Delete this entry as the w2 prefix is no more in the words prefix fst.
-                unsafe { iter.del_current()? };
-            }
-        }
-    }
-
-    Ok(())
-}
-
-/// This is the core of the algorithm to initialise the Word Prefix Pair Proximity Docids database.
-///
-/// Its main arguments are:
-/// 1. a sorted iterator over ((proximity, word1, word2), docids) elements
-/// 2. a prefix trie
-/// 3. a closure to describe how to handle the new computed (proximity, word1, prefix) elements
-///
-/// For more information about what this function does, read the module documentation.
-fn execute_on_word_pairs_and_prefixes<I>(
-    iter: &mut I,
-    mut next_word_pair_proximity: impl for<'a> FnMut(
-        &'a mut I,
-    ) -> Result<
-        Option<((u8, &'a [u8], &'a [u8]), &'a [u8])>,
-    >,
-    prefixes: &PrefixTrieNode,
-    max_proximity: u8,
-    mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>,
-) -> Result<()> {
-    let mut batch = PrefixAndProximityBatch::default();
-    let mut prev_word2_start = 0;
-
-    // Optimisation: the index at the root of the prefix trie where to search for
-    let mut prefix_search_start = PrefixTrieNodeSearchStart(0);
-
-    // Optimisation: true if there are no potential prefixes for the current word2 based on its first letter
-    let mut empty_prefixes = false;
-
-    let mut prefix_buffer = Vec::with_capacity(8);
-    let mut merge_buffer = Vec::with_capacity(65_536);
-
-    while let Some(((proximity, word1, word2), data)) = next_word_pair_proximity(iter)? {
-        // stop indexing if the proximity is over the threshold
-        if proximity > max_proximity {
-            break;
-        };
-        let word2_start_different_than_prev = word2[0] != prev_word2_start;
-        // if there were no potential prefixes for the previous word2 based on its first letter,
-        // and if the current word2 starts with the same letter, then there is also no potential
-        // prefixes for the current word2, and we can skip to the next iteration
-        if empty_prefixes && !word2_start_different_than_prev {
-            continue;
-        }
-
-        // if the proximity is different to the previous one, OR
-        // if word1 is different than the previous word1, OR
-        // if the start of word2 is different than the previous start of word2,
-        // THEN we'll need to flush the batch
-        let prox_different_than_prev = proximity != batch.proximity;
-        let word1_different_than_prev = word1 != batch.word1;
-        if prox_different_than_prev || word1_different_than_prev || word2_start_different_than_prev
-        {
-            batch.flush(&mut merge_buffer, &mut insert)?;
-            batch.proximity = proximity;
-            // don't forget to reset the value of batch.word1 and prev_word2_start
-            if word1_different_than_prev {
-                batch.word1.clear();
-                batch.word1.extend_from_slice(word1);
-            }
-            if word2_start_different_than_prev {
-                prev_word2_start = word2[0];
-            }
-            prefix_search_start.0 = 0;
-            // Optimisation: find the search start in the prefix trie to iterate over the prefixes of word2
-            empty_prefixes = !prefixes.set_search_start(word2, &mut prefix_search_start);
-        }
-
-        if !empty_prefixes {
-            // All conditions are satisfied, we can now insert each new prefix of word2 into the batch
-            prefix_buffer.clear();
-            prefixes.for_each_prefix_of(
-                word2,
-                &mut prefix_buffer,
-                &prefix_search_start,
-                |prefix_buffer| {
-                    batch.insert(prefix_buffer, data.to_vec());
-                },
-            );
-        }
-    }
-    batch.flush(&mut merge_buffer, &mut insert)?;
-    Ok(())
-}
-/**
-A map structure whose keys are prefixes and whose values are vectors of bitstrings (serialized roaring bitmaps).
-The keys are sorted and conflicts are resolved by merging the vectors of bitstrings together.
-
-It is used to ensure that all ((proximity, word1, prefix), docids) are inserted into the database in sorted order and efficiently.
-
-The batch is flushed as often as possible, when we are sure that every (proximity, word1, prefix) key derived from its content
-can be inserted into the database in sorted order. When it is flushed, it calls a user-provided closure with the following arguments:
-- key   : (proximity, word1, prefix) as bytes
-- value : merged roaring bitmaps from all values associated with prefix in the batch, serialised to bytes
-*/
-#[derive(Default)]
-struct PrefixAndProximityBatch {
-    proximity: u8,
-    word1: Vec<u8>,
-    #[allow(clippy::type_complexity)]
-    batch: Vec<(Vec<u8>, Vec<Cow<'static, [u8]>>)>,
-}
-
-impl PrefixAndProximityBatch {
-    /// Insert the new key and value into the batch
-    ///
-    /// The key must either exist in the batch or be greater than all existing keys
-    fn insert(&mut self, new_key: &[u8], new_value: Vec<u8>) {
-        match self.batch.iter_mut().find(|el| el.0 == new_key) {
-            Some((_prefix, docids)) => docids.push(Cow::Owned(new_value)),
-            None => self.batch.push((new_key.to_vec(), vec![Cow::Owned(new_value)])),
-        }
-    }
-
-    /// Empties the batch, calling `insert` on each element.
-    ///
-    /// The key given to `insert` is `(proximity, word1, prefix)` and the value is the associated merged roaring bitmap.
-    fn flush(
-        &mut self,
-        merge_buffer: &mut Vec<u8>,
-        insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>,
-    ) -> Result<()> {
-        let PrefixAndProximityBatch { proximity, word1, batch } = self;
-        if batch.is_empty() {
-            return Ok(());
-        }
-        merge_buffer.clear();
-
-        let mut buffer = Vec::with_capacity(word1.len() + 1 + 6);
-        buffer.push(*proximity);
-        buffer.extend_from_slice(word1);
-        buffer.push(0);
-
-        for (key, mergeable_data) in batch.drain(..) {
-            buffer.truncate(1 + word1.len() + 1);
-            buffer.extend_from_slice(key.as_slice());
-
-            let data = if mergeable_data.len() > 1 {
-                CboRoaringBitmapCodec::merge_into(&mergeable_data, merge_buffer)?;
-                merge_buffer.as_slice()
-            } else {
-                &mergeable_data[0]
-            };
-            insert(buffer.as_slice(), data)?;
-            merge_buffer.clear();
-        }
-
-        Ok(())
-    }
-}
-
-/** A prefix trie. Used to iterate quickly over the prefixes of a word that are
-within a set.
-
-## Structure
-The trie is made of nodes composed of:
-1. a byte character (e.g. 'a')
-2. whether the node is an end node or not
-3. a list of children nodes, sorted by their byte character
-
-For example, the trie that stores the strings `[ac, ae, ar, ch, cei, cel, ch, r, rel, ri]`
-is drawn below. Nodes with a double border are "end nodes".
-
-┌──────────────────────┐ ┌──────────────────────┐ ╔══════════════════════╗
-│          a           │ │          c           │ ║          r           ║
-└──────────────────────┘ └──────────────────────┘ ╚══════════════════════╝
-╔══════╗╔══════╗╔══════╗ ┌─────────┐  ╔═════════╗ ┌─────────┐ ╔══════════╗
-║  c   ║║  e   ║║  r   ║ │    e    │  ║    h    ║ │    e    │ ║    i     ║
-╚══════╝╚══════╝╚══════╝ └─────────┘  ╚═════════╝ └─────────┘ ╚══════════╝
-                         ╔═══╗ ╔═══╗                 ╔═══╗
-                         ║ i ║ ║ l ║                 ║ l ║
-                         ╚═══╝ ╚═══╝                 ╚═══╝
-*/
-#[derive(Default, Debug)]
-struct PrefixTrieNode {
-    children: Vec<(PrefixTrieNode, u8)>,
-    is_end_node: bool,
-}
-
-#[derive(Debug)]
-struct PrefixTrieNodeSearchStart(usize);
-
-impl PrefixTrieNode {
-    fn is_empty(&self) -> bool {
-        self.children.is_empty()
-    }
-
-    /// Returns false if the trie does not contain a prefix of the given word.
-    /// Returns true if the trie *may* contain a prefix of the given word.
-    ///
-    /// Moves the search start to the first node equal to the first letter of the word,
-    /// or to 0 otherwise.
-    fn set_search_start(&self, word: &[u8], search_start: &mut PrefixTrieNodeSearchStart) -> bool {
-        let byte = word[0];
-        if self.children[search_start.0].1 == byte {
-            true
-        } else {
-            match self.children[search_start.0..].binary_search_by_key(&byte, |x| x.1) {
-                Ok(position) => {
-                    search_start.0 += position;
-                    true
-                }
-                Err(_) => {
-                    search_start.0 = 0;
-                    false
-                }
-            }
-        }
-    }
-
-    fn from_sorted_prefixes<'a>(prefixes: impl Iterator<Item = &'a str>) -> Self {
-        let mut node = PrefixTrieNode::default();
-        for prefix in prefixes {
-            node.insert_sorted_prefix(prefix.as_bytes().iter());
-        }
-        node
-    }
-    fn insert_sorted_prefix(&mut self, mut prefix: std::slice::Iter<u8>) {
-        if let Some(&c) = prefix.next() {
-            if let Some((node, byte)) = self.children.last_mut() {
-                if *byte == c {
-                    node.insert_sorted_prefix(prefix);
-                    return;
-                }
-            }
-            let mut new_node = PrefixTrieNode::default();
-            new_node.insert_sorted_prefix(prefix);
-            self.children.push((new_node, c));
-        } else {
-            self.is_end_node = true;
-        }
-    }
-
-    /// Call the given closure on each prefix of the word contained in the prefix trie.
-    ///
-    /// The search starts from the given `search_start`.
-    fn for_each_prefix_of(
-        &self,
-        word: &[u8],
-        buffer: &mut Vec<u8>,
-        search_start: &PrefixTrieNodeSearchStart,
-        mut do_fn: impl FnMut(&mut Vec<u8>),
-    ) {
-        let first_byte = word[0];
-        let mut cur_node = self;
-        buffer.push(first_byte);
-        if let Some((child_node, c)) =
-            cur_node.children[search_start.0..].iter().find(|(_, c)| *c >= first_byte)
-        {
-            if *c == first_byte {
-                cur_node = child_node;
-                if cur_node.is_end_node {
-                    do_fn(buffer);
-                }
-                for &byte in &word[1..] {
-                    buffer.push(byte);
-                    if let Some((child_node, c)) =
-                        cur_node.children.iter().find(|(_, c)| *c >= byte)
-                    {
-                        if *c == byte {
-                            cur_node = child_node;
-                            if cur_node.is_end_node {
-                                do_fn(buffer);
-                            }
-                        } else {
-                            break;
-                        }
-                    } else {
-                        break;
-                    }
-                }
-            }
-        }
-    }
-}
-#[cfg(test)]
-mod tests {
-    use roaring::RoaringBitmap;
-
-    use super::*;
-    use crate::{CboRoaringBitmapCodec, U8StrStrCodec};
-
-    fn check_prefixes(
-        trie: &PrefixTrieNode,
-        search_start: &PrefixTrieNodeSearchStart,
-        word: &str,
-        expected_prefixes: &[&str],
-    ) {
-        let mut actual_prefixes = vec![];
-        trie.for_each_prefix_of(word.as_bytes(), &mut Vec::new(), search_start, |x| {
-            let s = String::from_utf8(x.to_owned()).unwrap();
-            actual_prefixes.push(s);
-        });
-        assert_eq!(actual_prefixes, expected_prefixes);
-    }
-
-    #[test]
-    fn test_trie() {
-        let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([
-            "1", "19", "2", "a", "ab", "ac", "ad", "al", "am", "an", "ap", "ar", "as", "at", "au",
-            "b", "ba", "bar", "be", "bi", "bl", "bla", "bo", "br", "bra", "bri", "bro", "bu", "c",
-            "ca", "car", "ce", "ch", "cha", "che", "chi", "ci", "cl", "cla", "co", "col", "com",
-            "comp", "con", "cons", "cont", "cor", "cou", "cr", "cu", "d", "da", "de", "dec", "des",
-            "di", "dis", "do", "dr", "du", "e", "el", "em", "en", "es", "ev", "ex", "exp", "f",
-            "fa", "fe", "fi", "fl", "fo", "for", "fr", "fra", "fre", "fu", "g", "ga", "ge", "gi",
-            "gl", "go", "gr", "gra", "gu", "h", "ha", "har", "he", "hea", "hi", "ho", "hu", "i",
-            "im", "imp", "in", "ind", "ins", "int", "inte", "j", "ja", "je", "jo", "ju", "k", "ka",
-            "ke", "ki", "ko", "l", "la", "le", "li", "lo", "lu", "m", "ma", "mal", "man", "mar",
-            "mat", "mc", "me", "mi", "min", "mis", "mo", "mon", "mor", "mu", "n", "na", "ne", "ni",
-            "no", "o", "or", "ou", "ov", "ove", "over", "p", "pa", "par", "pe", "per", "ph", "pi",
-            "pl", "po", "pr", "pre", "pro", "pu", "q", "qu", "r", "ra", "re", "rec", "rep", "res",
-            "ri", "ro", "ru", "s", "sa", "san", "sc", "sch", "se", "sh", "sha", "shi", "sho", "si",
-            "sk", "sl", "sn", "so", "sp", "st", "sta", "ste", "sto", "str", "su", "sup", "sw", "t",
-            "ta", "te", "th", "ti", "to", "tr", "tra", "tri", "tu", "u", "un", "v", "va", "ve",
-            "vi", "vo", "w", "wa", "we", "wh", "wi", "wo", "y", "yo", "z",
-        ]));
-
-        let mut search_start = PrefixTrieNodeSearchStart(0);
-
-        let is_empty = !trie.set_search_start("affair".as_bytes(), &mut search_start);
-        assert!(!is_empty);
-        assert_eq!(search_start.0, 2);
-
-        check_prefixes(&trie, &search_start, "affair", &["a"]);
-        check_prefixes(&trie, &search_start, "shampoo", &["s", "sh", "sha"]);
-
-        let is_empty = !trie.set_search_start("unique".as_bytes(), &mut search_start);
-        assert!(!is_empty);
-        assert_eq!(trie.children[search_start.0].1, b'u');
-
-        check_prefixes(&trie, &search_start, "unique", &["u", "un"]);
-
-        // NOTE: this should fail, because the search start is already beyong 'a'
-        let is_empty = trie.set_search_start("abba".as_bytes(), &mut search_start);
-        assert!(!is_empty);
-        // search start is reset
-        assert_eq!(search_start.0, 0);
-
-        let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([
-            "arb", "arbre", "cat", "catto",
-        ]));
-        check_prefixes(&trie, &search_start, "arbres", &["arb", "arbre"]);
-        check_prefixes(&trie, &search_start, "cattos", &["cat", "catto"]);
-    }
-
-    #[test]
-    fn test_execute_on_word_pairs_and_prefixes() {
-        let prefixes = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([
-            "arb", "arbre", "cat", "catto",
-        ]));
-
-        let mut serialised_bitmap123 = vec![];
-        let mut bitmap123 = RoaringBitmap::new();
-        bitmap123.insert(1);
-        bitmap123.insert(2);
-        bitmap123.insert(3);
-        CboRoaringBitmapCodec::serialize_into(&bitmap123, &mut serialised_bitmap123);
-
-        let mut serialised_bitmap456 = vec![];
-        let mut bitmap456 = RoaringBitmap::new();
-        bitmap456.insert(4);
-        bitmap456.insert(5);
-        bitmap456.insert(6);
-        CboRoaringBitmapCodec::serialize_into(&bitmap456, &mut serialised_bitmap456);
-
-        let mut serialised_bitmap789 = vec![];
-        let mut bitmap789 = RoaringBitmap::new();
-        bitmap789.insert(7);
-        bitmap789.insert(8);
-        bitmap789.insert(9);
-        CboRoaringBitmapCodec::serialize_into(&bitmap789, &mut serialised_bitmap789);
-
-        let mut serialised_bitmap_ranges = vec![];
-        let mut bitmap_ranges = RoaringBitmap::new();
-        bitmap_ranges.insert_range(63_000..65_000);
-        bitmap_ranges.insert_range(123_000..128_000);
-        CboRoaringBitmapCodec::serialize_into(&bitmap_ranges, &mut serialised_bitmap_ranges);
-
-        let word_pairs = [
-            ((1, "healthy", "arbres"), &serialised_bitmap123),
-            ((1, "healthy", "boat"), &serialised_bitmap123),
-            ((1, "healthy", "ca"), &serialised_bitmap123),
-            ((1, "healthy", "cats"), &serialised_bitmap456),
-            ((1, "healthy", "cattos"), &serialised_bitmap123),
-            ((1, "jittery", "cat"), &serialised_bitmap123),
-            ((1, "jittery", "cata"), &serialised_bitmap456),
-            ((1, "jittery", "catb"), &serialised_bitmap789),
-            ((1, "jittery", "catc"), &serialised_bitmap_ranges),
-            ((2, "healthy", "arbre"), &serialised_bitmap123),
-            ((2, "healthy", "arbres"), &serialised_bitmap456),
-            ((2, "healthy", "cats"), &serialised_bitmap789),
-            ((2, "healthy", "cattos"), &serialised_bitmap_ranges),
-            ((3, "healthy", "arbre"), &serialised_bitmap456),
-            ((3, "healthy", "arbres"), &serialised_bitmap789),
-        ];
-
-        let expected_result = [
-            ((1, "healthy", "arb"), bitmap123.clone()),
-            ((1, "healthy", "arbre"), bitmap123.clone()),
-            ((1, "healthy", "cat"), &bitmap456 | &bitmap123),
-            ((1, "healthy", "catto"), bitmap123.clone()),
-            ((1, "jittery", "cat"), (&bitmap123 | &bitmap456 | &bitmap789 | &bitmap_ranges)),
-            ((2, "healthy", "arb"), &bitmap123 | &bitmap456),
-            ((2, "healthy", "arbre"), &bitmap123 | &bitmap456),
-            ((2, "healthy", "cat"), &bitmap789 | &bitmap_ranges),
-            ((2, "healthy", "catto"), bitmap_ranges.clone()),
-        ];
-
-        let mut result = vec![];
-
-        let mut iter =
-            IntoIterator::into_iter(word_pairs).map(|((proximity, word1, word2), data)| {
-                ((proximity, word1.as_bytes(), word2.as_bytes()), data.as_slice())
-            });
-        execute_on_word_pairs_and_prefixes(
-            &mut iter,
-            |iter| Ok(iter.next()),
-            &prefixes,
-            2,
-            |k, v| {
-                let (proximity, word1, prefix) = U8StrStrCodec::bytes_decode(k).unwrap();
-                let bitmap = CboRoaringBitmapCodec::bytes_decode(v).unwrap();
-                result.push(((proximity.to_owned(), word1.to_owned(), prefix.to_owned()), bitmap));
-                Ok(())
-            },
-        )
-        .unwrap();
-
-        for (x, y) in result.into_iter().zip(IntoIterator::into_iter(expected_result)) {
-            let ((actual_proximity, actual_word1, actual_prefix), actual_bitmap) = x;
-            let ((expected_proximity, expected_word1, expected_prefix), expected_bitmap) = y;
-
-            assert_eq!(actual_word1, expected_word1);
-            assert_eq!(actual_prefix, expected_prefix);
-            assert_eq!(actual_proximity, expected_proximity);
-            assert_eq!(actual_bitmap, expected_bitmap);
-        }
-    }
-}

From 70ce40828c3d58a667a314a063d1c7c5b5a05645 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Wed, 8 Nov 2023 16:41:26 +0100
Subject: [PATCH 110/127] Compute word docids prefix cache

---
 milli/src/update/del_add.rs                   | 14 +++++
 .../index_documents/helpers/grenad_helpers.rs | 44 +++++++++++++++
 .../helpers/merge_functions.rs                | 16 ++++++
 .../src/update/index_documents/helpers/mod.rs |  7 ++-
 milli/src/update/index_documents/mod.rs       |  6 +-
 .../src/update/index_documents/typed_chunk.rs | 55 +++++--------------
 milli/src/update/word_prefix_docids.rs        | 27 ++++++---
 7 files changed, 116 insertions(+), 53 deletions(-)

diff --git a/milli/src/update/del_add.rs b/milli/src/update/del_add.rs
index c8b7f0f6a..dc7c0409a 100644
--- a/milli/src/update/del_add.rs
+++ b/milli/src/update/del_add.rs
@@ -102,3 +102,17 @@ pub fn del_add_from_two_obkvs<K: obkv::Key + PartialOrd + Ord>(
 pub fn is_noop_del_add_obkv(del_add: KvReaderDelAdd) -> bool {
     del_add.get(DelAdd::Deletion) == del_add.get(DelAdd::Addition)
 }
+
+/// A function that extracts and returns the Add side of a DelAdd obkv.
+/// This is useful when there is no previous value in the database and
+/// therefore we don't need to do a diff with what's already there.
+///
+/// If there is no Add side we currently write an empty buffer
+/// which is a valid CboRoaringBitmap.
+#[allow(clippy::ptr_arg)] // required to avoid signature mismatch
+pub fn deladd_serialize_add_side<'a>(
+    obkv: &'a [u8],
+    _buffer: &mut Vec<u8>,
+) -> crate::Result<&'a [u8]> {
+    Ok(KvReaderDelAdd::new(obkv).get(DelAdd::Addition).unwrap_or_default())
+}
diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs
index 4f764ab95..f520ea7b0 100644
--- a/milli/src/update/index_documents/helpers/grenad_helpers.rs
+++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs
@@ -9,6 +9,7 @@ use log::debug;
 
 use super::{ClonableMmap, MergeFn};
 use crate::error::InternalError;
+use crate::update::index_documents::valid_lmdb_key;
 use crate::Result;
 
 pub type CursorClonableMmap = io::Cursor<ClonableMmap>;
@@ -282,6 +283,49 @@ pub fn sorter_into_lmdb_database(
     Ok(())
 }
 
+/// Write provided sorter in database using serialize_value function.
+/// merge_values function is used if an entry already exist in the database.
+pub fn write_sorter_into_database<K, V, FS, FM>(
+    sorter: Sorter<MergeFn>,
+    database: &heed::Database<K, V>,
+    wtxn: &mut heed::RwTxn,
+    index_is_empty: bool,
+    serialize_value: FS,
+    merge_values: FM,
+) -> Result<()>
+where
+    FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
+    FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>,
+{
+    puffin::profile_function!();
+
+    let mut buffer = Vec::new();
+    let database = database.remap_types::<ByteSlice, ByteSlice>();
+
+    let mut merger_iter = sorter.into_stream_merger_iter()?;
+    while let Some((key, value)) = merger_iter.next()? {
+        if valid_lmdb_key(key) {
+            buffer.clear();
+            let value = if index_is_empty {
+                Some(serialize_value(value, &mut buffer)?)
+            } else {
+                match database.get(wtxn, key)? {
+                    Some(prev_value) => merge_values(value, prev_value, &mut buffer)?,
+                    None => Some(serialize_value(value, &mut buffer)?),
+                }
+            };
+            match value {
+                Some(value) => database.put(wtxn, key, value)?,
+                None => {
+                    database.delete(wtxn, key)?;
+                }
+            }
+        }
+    }
+
+    Ok(())
+}
+
 /// Used when trying to merge readers, but you don't actually care about the values.
 pub fn merge_ignore_values<'a>(_key: &[u8], _values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
     Ok(Cow::Owned(Vec::new()))
diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs
index 98c1c1a04..5d9ca7ef2 100644
--- a/milli/src/update/index_documents/helpers/merge_functions.rs
+++ b/milli/src/update/index_documents/helpers/merge_functions.rs
@@ -239,3 +239,19 @@ pub fn merge_deladd_cbo_roaring_bitmaps<'a>(
         output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into)
     }
 }
+
+/// A function that merges a DelAdd of bitmaps into an already existing bitmap.
+///
+/// The first argument is the DelAdd obkv of CboRoaringBitmaps and
+/// the second one is the CboRoaringBitmap to merge into.
+pub fn merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap<'a>(
+    deladd_obkv: &[u8],
+    previous: &[u8],
+    buffer: &'a mut Vec<u8>,
+) -> Result<Option<&'a [u8]>> {
+    Ok(CboRoaringBitmapCodec::merge_deladd_into(
+        KvReaderDelAdd::new(deladd_obkv),
+        previous,
+        buffer,
+    )?)
+}
diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs
index 1f2f8e6ef..c167f1cd3 100644
--- a/milli/src/update/index_documents/helpers/mod.rs
+++ b/milli/src/update/index_documents/helpers/mod.rs
@@ -9,12 +9,13 @@ pub use clonable_mmap::{ClonableMmap, CursorClonableMmap};
 use fst::{IntoStreamer, Streamer};
 pub use grenad_helpers::{
     as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks,
-    merge_ignore_values, sorter_into_lmdb_database, sorter_into_reader, writer_into_reader,
-    GrenadParameters, MergeableReader,
+    merge_ignore_values, sorter_into_lmdb_database, sorter_into_reader, write_sorter_into_database,
+    writer_into_reader, GrenadParameters, MergeableReader,
 };
 pub use merge_functions::{
     concat_u32s_array, keep_first, keep_latest_obkv, merge_btreeset_string,
-    merge_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps, merge_roaring_bitmaps,
+    merge_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps,
+    merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_roaring_bitmaps,
     obkvs_keep_last_addition_merge_deletions, obkvs_merge_additions_and_deletions,
     serialize_roaring_bitmap, MergeFn,
 };
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 8552cf52b..5dbb4dd0b 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -26,8 +26,10 @@ pub use self::enrich::{
 };
 pub use self::helpers::{
     as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset,
-    fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
-    sorter_into_lmdb_database, valid_lmdb_key, writer_into_reader, ClonableMmap, MergeFn,
+    fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps,
+    merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
+    merge_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, write_sorter_into_database,
+    writer_into_reader, ClonableMmap, MergeFn,
 };
 use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
 pub use self::transform::{Transform, TransformOutput};
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index b53d859cd..90f9b7739 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -13,7 +13,10 @@ use obkv::{KvReader, KvWriter};
 use ordered_float::OrderedFloat;
 use roaring::RoaringBitmap;
 
-use super::helpers::{self, merge_ignore_values, valid_lmdb_key, CursorClonableMmap};
+use super::helpers::{
+    self, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_ignore_values,
+    valid_lmdb_key, CursorClonableMmap,
+};
 use super::{ClonableMmap, MergeFn};
 use crate::distance::NDotProductPoint;
 use crate::error::UserError;
@@ -21,12 +24,11 @@ use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind};
 use crate::facet::FacetType;
 use crate::index::db_name::DOCUMENTS;
 use crate::index::Hnsw;
-use crate::update::del_add::{DelAdd, KvReaderDelAdd};
+use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd};
 use crate::update::facet::FacetsUpdate;
 use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at};
 use crate::{
-    lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, Result,
-    SerializationError, BEU32,
+    lat_lng_to_xyz, DocumentId, FieldId, GeoPoint, Index, Result, SerializationError, BEU32,
 };
 
 pub(crate) enum TypedChunk {
@@ -186,7 +188,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 deladd_serialize_add_side,
-                merge_deladd_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
             )?;
             is_merged_database = true;
         }
@@ -202,7 +204,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 deladd_serialize_add_side,
-                merge_deladd_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
             )?;
 
             let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
@@ -212,7 +214,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 deladd_serialize_add_side,
-                merge_deladd_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
             )?;
 
             let word_fid_docids_iter = unsafe { as_cloneable_grenad(&word_fid_docids_reader) }?;
@@ -222,7 +224,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 deladd_serialize_add_side,
-                merge_deladd_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
             )?;
 
             // create fst from word docids
@@ -244,7 +246,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 deladd_serialize_add_side,
-                merge_deladd_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
             )?;
             is_merged_database = true;
         }
@@ -265,7 +267,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 deladd_serialize_add_side,
-                merge_deladd_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
             )?;
             is_merged_database = true;
         }
@@ -276,7 +278,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 deladd_serialize_add_side,
-                merge_deladd_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
             )?;
             is_merged_database = true;
         }
@@ -287,7 +289,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 deladd_serialize_add_side,
-                merge_deladd_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
             )?;
             is_merged_database = true;
         }
@@ -298,7 +300,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 deladd_serialize_add_side,
-                merge_deladd_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
             )?;
             is_merged_database = true;
         }
@@ -495,33 +497,6 @@ fn merge_word_docids_reader_into_fst(
     Ok(builder.into_set())
 }
 
-/// A function that extracts and returns the Add side of a DelAdd obkv.
-/// This is useful when there are no previous value in the database and
-/// therefore we don't need to do a diff with what's already there.
-///
-/// If there is no Add side we currently write an empty buffer
-/// which is a valid CboRoaringBitmap.
-#[allow(clippy::ptr_arg)] // required to avoid signature mismatch
-fn deladd_serialize_add_side<'a>(obkv: &'a [u8], _buffer: &mut Vec<u8>) -> Result<&'a [u8]> {
-    Ok(KvReaderDelAdd::new(obkv).get(DelAdd::Addition).unwrap_or_default())
-}
-
-/// A function that merges a DelAdd of bitmao into an already existing bitmap.
-///
-/// The first argument is the DelAdd obkv of CboRoaringBitmaps and
-/// the second one is the CboRoaringBitmap to merge into.
-fn merge_deladd_cbo_roaring_bitmaps<'a>(
-    deladd_obkv: &[u8],
-    previous: &[u8],
-    buffer: &'a mut Vec<u8>,
-) -> Result<Option<&'a [u8]>> {
-    Ok(CboRoaringBitmapCodec::merge_deladd_into(
-        KvReaderDelAdd::new(deladd_obkv),
-        previous,
-        buffer,
-    )?)
-}
-
 /// Write provided entries in database using serialize_value function.
 /// merge_values function is used if an entry already exist in the database.
 fn write_entries_into_database<R, K, V, FS, FM>(
diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs
index 8220aa777..618f451dc 100644
--- a/milli/src/update/word_prefix_docids.rs
+++ b/milli/src/update/word_prefix_docids.rs
@@ -4,9 +4,11 @@ use grenad::CompressionType;
 use heed::types::{ByteSlice, Str};
 use heed::Database;
 
+use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvWriterDelAdd};
 use crate::update::index_documents::{
-    create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key,
-    CursorClonableMmap, MergeFn,
+    create_sorter, merge_deladd_cbo_roaring_bitmaps,
+    merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key,
+    write_sorter_into_database, CursorClonableMmap, MergeFn,
 };
 use crate::{CboRoaringBitmapCodec, Result};
 
@@ -51,7 +53,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
         // and write into it at the same time, therefore we write into another file.
         let mut prefix_docids_sorter = create_sorter(
             grenad::SortAlgorithm::Unstable,
-            merge_cbo_roaring_bitmaps,
+            merge_deladd_cbo_roaring_bitmaps,
             self.chunk_compression_type,
             self.chunk_compression_level,
             self.max_nb_chunks,
@@ -92,11 +94,16 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
 
         // We fetch the docids associated to the newly added word prefix fst only.
         let db = self.word_docids.remap_data_type::<ByteSlice>();
+        let mut buffer = Vec::new();
         for prefix in new_prefix_fst_words {
             let prefix = std::str::from_utf8(prefix.as_bytes())?;
             for result in db.prefix_iter(self.wtxn, prefix)? {
                 let (_word, data) = result?;
-                prefix_docids_sorter.insert(prefix, data)?;
+                buffer.clear();
+                let mut writer = KvWriterDelAdd::new(&mut buffer);
+                writer.insert(DelAdd::Addition, data)?;
+
+                prefix_docids_sorter.insert(prefix, writer.into_inner()?)?;
             }
         }
 
@@ -110,12 +117,16 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
 
         drop(iter);
 
+        let database_is_empty = self.word_prefix_docids.is_empty(self.wtxn)?;
+
         // We finally write the word prefix docids into the LMDB database.
-        sorter_into_lmdb_database(
-            self.wtxn,
-            *self.word_prefix_docids.as_polymorph(),
+        write_sorter_into_database(
             prefix_docids_sorter,
-            merge_cbo_roaring_bitmaps,
+            &self.word_prefix_docids,
+            self.wtxn,
+            database_is_empty,
+            deladd_serialize_add_side,
+            merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
         )?;
 
         Ok(())

From 5a9c96e1db0b2ec1de77c0c01b76676072aec754 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Thu, 9 Nov 2023 11:34:26 +0100
Subject: [PATCH 111/127] Compute word integer prefix cache

---
 .../src/update/words_prefix_integer_docids.rs | 28 +++++++++++++------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/milli/src/update/words_prefix_integer_docids.rs b/milli/src/update/words_prefix_integer_docids.rs
index c65438928..e083f510a 100644
--- a/milli/src/update/words_prefix_integer_docids.rs
+++ b/milli/src/update/words_prefix_integer_docids.rs
@@ -9,9 +9,11 @@ use log::debug;
 use crate::error::SerializationError;
 use crate::heed_codec::StrBEU16Codec;
 use crate::index::main_key::WORDS_PREFIXES_FST_KEY;
+use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvWriterDelAdd};
 use crate::update::index_documents::{
-    create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key,
-    CursorClonableMmap, MergeFn,
+    create_sorter, merge_deladd_cbo_roaring_bitmaps,
+    merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key,
+    write_sorter_into_database, CursorClonableMmap, MergeFn,
 };
 use crate::{CboRoaringBitmapCodec, Result};
 
@@ -55,7 +57,7 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> {
 
         let mut prefix_integer_docids_sorter = create_sorter(
             grenad::SortAlgorithm::Unstable,
-            merge_cbo_roaring_bitmaps,
+            merge_deladd_cbo_roaring_bitmaps,
             self.chunk_compression_type,
             self.chunk_compression_level,
             self.max_nb_chunks,
@@ -108,6 +110,7 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> {
 
         // We fetch the docids associated to the newly added word prefix fst only.
         let db = self.word_database.remap_data_type::<ByteSlice>();
+        let mut buffer = Vec::new();
         for prefix_bytes in new_prefix_fst_words {
             let prefix = str::from_utf8(prefix_bytes.as_bytes()).map_err(|_| {
                 SerializationError::Decoding { db_name: Some(WORDS_PREFIXES_FST_KEY) }
@@ -123,7 +126,11 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> {
                 if word.starts_with(prefix) {
                     let key = (prefix, pos);
                     let bytes = StrBEU16Codec::bytes_encode(&key).unwrap();
-                    prefix_integer_docids_sorter.insert(bytes, data)?;
+
+                    buffer.clear();
+                    let mut writer = KvWriterDelAdd::new(&mut buffer);
+                    writer.insert(DelAdd::Addition, data)?;
+                    prefix_integer_docids_sorter.insert(bytes, writer.into_inner()?)?;
                 }
             }
         }
@@ -143,12 +150,16 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> {
             drop(iter);
         }
 
+        let database_is_empty = self.prefix_database.is_empty(self.wtxn)?;
+
         // We finally write all the word prefix integer docids into the LMDB database.
-        sorter_into_lmdb_database(
-            self.wtxn,
-            *self.prefix_database.as_polymorph(),
+        write_sorter_into_database(
             prefix_integer_docids_sorter,
-            merge_cbo_roaring_bitmaps,
+            &self.prefix_database,
+            self.wtxn,
+            database_is_empty,
+            deladd_serialize_add_side,
+            merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
         )?;
 
         Ok(())
@@ -159,6 +170,7 @@ fn write_prefixes_in_sorter(
     prefixes: &mut HashMap<Vec<u8>, Vec<Vec<u8>>>,
     sorter: &mut grenad::Sorter<MergeFn>,
 ) -> Result<()> {
+    // TODO: Merge before insertion.
     for (key, data_slices) in prefixes.drain() {
         for data in data_slices {
             if valid_lmdb_key(&key) {

From 882ab9cc857fde9394b9fc4f1d46599617b8ccd7 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Thu, 9 Nov 2023 11:35:33 +0100
Subject: [PATCH 112/127] remove warnings

---
 .../index_documents/helpers/grenad_helpers.rs | 45 -------------------
 .../src/update/index_documents/helpers/mod.rs |  4 +-
 milli/src/update/index_documents/mod.rs       |  4 +-
 3 files changed, 4 insertions(+), 49 deletions(-)

diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs
index f520ea7b0..061cbe5a0 100644
--- a/milli/src/update/index_documents/helpers/grenad_helpers.rs
+++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs
@@ -1,14 +1,11 @@
 use std::borrow::Cow;
 use std::fs::File;
 use std::io::{self, BufReader, BufWriter, Seek};
-use std::time::Instant;
 
 use grenad::{CompressionType, Sorter};
 use heed::types::ByteSlice;
-use log::debug;
 
 use super::{ClonableMmap, MergeFn};
-use crate::error::InternalError;
 use crate::update::index_documents::valid_lmdb_key;
 use crate::Result;
 
@@ -241,48 +238,6 @@ pub fn grenad_obkv_into_chunks<R: io::Read + io::Seek>(
     Ok(std::iter::from_fn(move || transposer().transpose()))
 }
 
-pub fn sorter_into_lmdb_database(
-    wtxn: &mut heed::RwTxn,
-    database: heed::PolyDatabase,
-    sorter: Sorter<MergeFn>,
-    merge: MergeFn,
-) -> Result<()> {
-    puffin::profile_function!();
-    debug!("Writing MTBL sorter...");
-    let before = Instant::now();
-
-    let mut merger_iter = sorter.into_stream_merger_iter()?;
-    if database.is_empty(wtxn)? {
-        let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;
-        while let Some((k, v)) = merger_iter.next()? {
-            // safety: we don't keep references from inside the LMDB database.
-            unsafe { out_iter.append(k, v)? };
-        }
-    } else {
-        while let Some((k, v)) = merger_iter.next()? {
-            let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?;
-            match iter.next().transpose()? {
-                Some((key, old_val)) if key == k => {
-                    let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)];
-                    let val = merge(k, &vals).map_err(|_| {
-                        // TODO just wrap this error?
-                        InternalError::IndexingMergingKeys { process: "get-put-merge" }
-                    })?;
-                    // safety: we don't keep references from inside the LMDB database.
-                    unsafe { iter.put_current(k, &val)? };
-                }
-                _ => {
-                    drop(iter);
-                    database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
-                }
-            }
-        }
-    }
-
-    debug!("MTBL sorter writen in {:.02?}!", before.elapsed());
-    Ok(())
-}
-
 /// Write provided sorter in database using serialize_value function.
 /// merge_values function is used if an entry already exist in the database.
 pub fn write_sorter_into_database<K, V, FS, FM>(
diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs
index c167f1cd3..841c09543 100644
--- a/milli/src/update/index_documents/helpers/mod.rs
+++ b/milli/src/update/index_documents/helpers/mod.rs
@@ -9,8 +9,8 @@ pub use clonable_mmap::{ClonableMmap, CursorClonableMmap};
 use fst::{IntoStreamer, Streamer};
 pub use grenad_helpers::{
     as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks,
-    merge_ignore_values, sorter_into_lmdb_database, sorter_into_reader, write_sorter_into_database,
-    writer_into_reader, GrenadParameters, MergeableReader,
+    merge_ignore_values, sorter_into_reader, write_sorter_into_database, writer_into_reader,
+    GrenadParameters, MergeableReader,
 };
 pub use merge_functions::{
     concat_u32s_array, keep_first, keep_latest_obkv, merge_btreeset_string,
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 5dbb4dd0b..de0361936 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -28,8 +28,8 @@ pub use self::helpers::{
     as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset,
     fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps,
     merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
-    merge_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, write_sorter_into_database,
-    writer_into_reader, ClonableMmap, MergeFn,
+    merge_roaring_bitmaps, valid_lmdb_key, write_sorter_into_database, writer_into_reader,
+    ClonableMmap, MergeFn,
 };
 use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
 pub use self::transform::{Transform, TransformOutput};

From db2fb86b8bbb69cb79781d74dda885460ea45560 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Thu, 9 Nov 2023 14:19:16 +0100
Subject: [PATCH 113/127] Extract PrimaryKey logic to a type

---
 milli/src/documents/mod.rs         |  10 ++
 milli/src/documents/primary_key.rs | 168 +++++++++++++++++++++++++++++
 milli/src/fields_ids_map.rs        |   6 ++
 3 files changed, 184 insertions(+)
 create mode 100644 milli/src/documents/primary_key.rs

diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs
index 7c037b3bf..4429f083d 100644
--- a/milli/src/documents/mod.rs
+++ b/milli/src/documents/mod.rs
@@ -1,5 +1,6 @@
 mod builder;
 mod enriched;
+mod primary_key;
 mod reader;
 mod serde_impl;
 
@@ -11,6 +12,9 @@ use bimap::BiHashMap;
 pub use builder::DocumentsBatchBuilder;
 pub use enriched::{EnrichedDocument, EnrichedDocumentsBatchCursor, EnrichedDocumentsBatchReader};
 use obkv::KvReader;
+pub use primary_key::{
+    DocumentIdExtractionError, FieldDistribution, PrimaryKey, DEFAULT_PRIMARY_KEY,
+};
 pub use reader::{DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchReader};
 use serde::{Deserialize, Serialize};
 
@@ -87,6 +91,12 @@ impl DocumentsBatchIndex {
     }
 }
 
+impl FieldDistribution for DocumentsBatchIndex {
+    fn id(&self, name: &str) -> Option<FieldId> {
+        self.id(name)
+    }
+}
+
 #[derive(Debug, thiserror::Error)]
 pub enum Error {
     #[error("Error parsing number {value:?} at line {line}: {error}")]
diff --git a/milli/src/documents/primary_key.rs b/milli/src/documents/primary_key.rs
new file mode 100644
index 000000000..dd97f2608
--- /dev/null
+++ b/milli/src/documents/primary_key.rs
@@ -0,0 +1,168 @@
+use std::iter;
+use std::result::Result as StdResult;
+
+use serde_json::Value;
+
+use crate::{FieldId, InternalError, Object, Result, UserError};
+
+/// The symbol used to define levels in a nested primary key.
+const PRIMARY_KEY_SPLIT_SYMBOL: char = '.';
+
+/// The default primary key that is used when not specified.
+pub const DEFAULT_PRIMARY_KEY: &str = "id";
+
+pub trait FieldDistribution {
+    fn id(&self, name: &str) -> Option<FieldId>;
+}
+
+/// A type that represents the type of primary key that has been set
+/// for this index, a classic flat one or a nested one.
+#[derive(Debug, Clone, Copy)]
+pub enum PrimaryKey<'a> {
+    Flat { name: &'a str, field_id: FieldId },
+    Nested { name: &'a str },
+}
+
+pub enum DocumentIdExtractionError {
+    InvalidDocumentId(UserError),
+    MissingDocumentId,
+    TooManyDocumentIds(usize),
+}
+
+impl<'a> PrimaryKey<'a> {
+    pub fn new(path: &'a str, fields: &impl FieldDistribution) -> Option<Self> {
+        Some(if path.contains(PRIMARY_KEY_SPLIT_SYMBOL) {
+            Self::Nested { name: path }
+        } else {
+            let field_id = fields.id(path)?;
+            Self::Flat { name: path, field_id }
+        })
+    }
+
+    pub fn name(&self) -> &str {
+        match self {
+            PrimaryKey::Flat { name, .. } => name,
+            PrimaryKey::Nested { name } => name,
+        }
+    }
+
+    pub fn document_id(
+        &self,
+        document: &obkv::KvReader<FieldId>,
+        fields: &impl FieldDistribution,
+    ) -> Result<StdResult<String, DocumentIdExtractionError>> {
+        match self {
+            PrimaryKey::Flat { name: _, field_id } => match document.get(*field_id) {
+                Some(document_id_bytes) => {
+                    let document_id = serde_json::from_slice(document_id_bytes)
+                        .map_err(InternalError::SerdeJson)?;
+                    match validate_document_id_value(document_id)? {
+                        Ok(document_id) => Ok(Ok(document_id)),
+                        Err(user_error) => {
+                            Ok(Err(DocumentIdExtractionError::InvalidDocumentId(user_error)))
+                        }
+                    }
+                }
+                None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)),
+            },
+            nested @ PrimaryKey::Nested { .. } => {
+                let mut matching_documents_ids = Vec::new();
+                for (first_level_name, right) in nested.possible_level_names() {
+                    if let Some(field_id) = fields.id(first_level_name) {
+                        if let Some(value_bytes) = document.get(field_id) {
+                            let object = serde_json::from_slice(value_bytes)
+                                .map_err(InternalError::SerdeJson)?;
+                            fetch_matching_values(object, right, &mut matching_documents_ids);
+
+                            if matching_documents_ids.len() >= 2 {
+                                return Ok(Err(DocumentIdExtractionError::TooManyDocumentIds(
+                                    matching_documents_ids.len(),
+                                )));
+                            }
+                        }
+                    }
+                }
+
+                match matching_documents_ids.pop() {
+                    Some(document_id) => match validate_document_id_value(document_id)? {
+                        Ok(document_id) => Ok(Ok(document_id)),
+                        Err(user_error) => {
+                            Ok(Err(DocumentIdExtractionError::InvalidDocumentId(user_error)))
+                        }
+                    },
+                    None => Ok(Err(DocumentIdExtractionError::MissingDocumentId)),
+                }
+            }
+        }
+    }
+
+    /// Returns an `Iterator` that gives all the possible field names the primary key
+    /// can have depending on the first level name and depth of the objects.
+    pub fn possible_level_names(&self) -> impl Iterator<Item = (&str, &str)> + '_ {
+        let name = self.name();
+        name.match_indices(PRIMARY_KEY_SPLIT_SYMBOL)
+            .map(move |(i, _)| (&name[..i], &name[i + PRIMARY_KEY_SPLIT_SYMBOL.len_utf8()..]))
+            .chain(iter::once((name, "")))
+    }
+}
+
+fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec<Value>) {
+    match value {
+        Value::Object(object) => fetch_matching_values_in_object(object, selector, "", output),
+        otherwise => output.push(otherwise),
+    }
+}
+
+fn fetch_matching_values_in_object(
+    object: Object,
+    selector: &str,
+    base_key: &str,
+    output: &mut Vec<Value>,
+) {
+    for (key, value) in object {
+        let base_key = if base_key.is_empty() {
+            key.to_string()
+        } else {
+            format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, key)
+        };
+
+        if starts_with(selector, &base_key) {
+            match value {
+                Value::Object(object) => {
+                    fetch_matching_values_in_object(object, selector, &base_key, output)
+                }
+                value => output.push(value),
+            }
+        }
+    }
+}
+
+fn starts_with(selector: &str, key: &str) -> bool {
+    selector.strip_prefix(key).map_or(false, |tail| {
+        tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true)
+    })
+}
+
+// FIXME: move to a DocumentId struct
+
+fn validate_document_id(document_id: &str) -> Option<&str> {
+    if !document_id.is_empty()
+        && document_id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_'))
+    {
+        Some(document_id)
+    } else {
+        None
+    }
+}
+
+pub fn validate_document_id_value(document_id: Value) -> Result<StdResult<String, UserError>> {
+    match document_id {
+        Value::String(string) => match validate_document_id(&string) {
+            Some(s) if s.len() == string.len() => Ok(Ok(string)),
+            Some(s) => Ok(Ok(s.to_string())),
+            None => Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })),
+        },
+        Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())),
+        content => Ok(Err(UserError::InvalidDocumentId { document_id: content })),
+    }
+}
diff --git a/milli/src/fields_ids_map.rs b/milli/src/fields_ids_map.rs
index 810ff755b..85320c168 100644
--- a/milli/src/fields_ids_map.rs
+++ b/milli/src/fields_ids_map.rs
@@ -81,6 +81,12 @@ impl Default for FieldsIdsMap {
     }
 }
 
+impl crate::documents::FieldDistribution for FieldsIdsMap {
+    fn id(&self, name: &str) -> Option<FieldId> {
+        self.id(name)
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;

From 9cef800b2aa8bceb31bd82ca3bcd11a59157a8dc Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Thu, 9 Nov 2023 14:22:05 +0100
Subject: [PATCH 114/127] Enrich uses the new type

---
 milli/src/update/index_documents/enrich.rs | 207 ++++-----------------
 milli/src/update/index_documents/mod.rs    |   5 +-
 2 files changed, 34 insertions(+), 178 deletions(-)

diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs
index 22b16f253..03eb3f4de 100644
--- a/milli/src/update/index_documents/enrich.rs
+++ b/milli/src/update/index_documents/enrich.rs
@@ -1,20 +1,17 @@
+use std::fmt;
 use std::io::{BufWriter, Read, Seek};
 use std::result::Result as StdResult;
-use std::{fmt, iter};
 
 use serde::{Deserialize, Serialize};
 use serde_json::Value;
 
-use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader, EnrichedDocumentsBatchReader};
+use crate::documents::{
+    DocumentIdExtractionError, DocumentsBatchIndex, DocumentsBatchReader,
+    EnrichedDocumentsBatchReader, PrimaryKey, DEFAULT_PRIMARY_KEY,
+};
 use crate::error::{GeoError, InternalError, UserError};
 use crate::update::index_documents::{obkv_to_object, writer_into_reader};
-use crate::{FieldId, Index, Object, Result};
-
-/// The symbol used to define levels in a nested primary key.
-const PRIMARY_KEY_SPLIT_SYMBOL: char = '.';
-
-/// The default primary that is used when not specified.
-const DEFAULT_PRIMARY_KEY: &str = "id";
+use crate::{FieldId, Index, Result};
 
 /// This function validates and enrich the documents by checking that:
 ///  - we can infer a primary key,
@@ -41,14 +38,12 @@ pub fn enrich_documents_batch<R: Read + Seek>(
     // The primary key *field id* that has already been set for this index or the one
     // we will guess by searching for the first key that contains "id" as a substring.
     let primary_key = match index.primary_key(rtxn)? {
-        Some(primary_key) if primary_key.contains(PRIMARY_KEY_SPLIT_SYMBOL) => {
-            PrimaryKey::nested(primary_key)
-        }
-        Some(primary_key) => match documents_batch_index.id(primary_key) {
-            Some(id) => PrimaryKey::flat(primary_key, id),
-            None if autogenerate_docids => {
-                PrimaryKey::flat(primary_key, documents_batch_index.insert(primary_key))
-            }
+        Some(primary_key) => match PrimaryKey::new(primary_key, &documents_batch_index) {
+            Some(primary_key) => primary_key,
+            None if autogenerate_docids => PrimaryKey::Flat {
+                name: primary_key,
+                field_id: documents_batch_index.insert(primary_key),
+            },
             None => {
                 return match cursor.next_document()? {
                     Some(first_document) => Ok(Err(UserError::MissingDocumentId {
@@ -76,14 +71,14 @@ pub fn enrich_documents_batch<R: Read + Seek>(
             });
 
             match guesses.as_slice() {
-                [] if autogenerate_docids => PrimaryKey::flat(
-                    DEFAULT_PRIMARY_KEY,
-                    documents_batch_index.insert(DEFAULT_PRIMARY_KEY),
-                ),
+                [] if autogenerate_docids => PrimaryKey::Flat {
+                    name: DEFAULT_PRIMARY_KEY,
+                    field_id: documents_batch_index.insert(DEFAULT_PRIMARY_KEY),
+                },
                 [] => return Ok(Err(UserError::NoPrimaryKeyCandidateFound)),
                 [(field_id, name)] => {
                     log::info!("Primary key was not specified in index. Inferred to '{name}'");
-                    PrimaryKey::flat(name, *field_id)
+                    PrimaryKey::Flat { name, field_id: *field_id }
                 }
                 multiple => {
                     return Ok(Err(UserError::MultiplePrimaryKeyCandidatesFound {
@@ -156,92 +151,24 @@ fn fetch_or_generate_document_id(
     uuid_buffer: &mut [u8; uuid::fmt::Hyphenated::LENGTH],
     count: u32,
 ) -> Result<StdResult<DocumentId, UserError>> {
-    match primary_key {
-        PrimaryKey::Flat { name: primary_key, field_id: primary_key_id } => {
-            match document.get(primary_key_id) {
-                Some(document_id_bytes) => {
-                    let document_id = serde_json::from_slice(document_id_bytes)
-                        .map_err(InternalError::SerdeJson)?;
-                    match validate_document_id_value(document_id)? {
-                        Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))),
-                        Err(user_error) => Ok(Err(user_error)),
-                    }
-                }
-                None if autogenerate_docids => {
-                    let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer);
-                    Ok(Ok(DocumentId::generated(uuid.to_string(), count)))
-                }
-                None => Ok(Err(UserError::MissingDocumentId {
-                    primary_key: primary_key.to_string(),
-                    document: obkv_to_object(document, documents_batch_index)?,
-                })),
-            }
+    Ok(match primary_key.document_id(document, documents_batch_index)? {
+        Ok(document_id) => Ok(DocumentId::Retrieved { value: document_id }),
+        Err(DocumentIdExtractionError::InvalidDocumentId(user_error)) => Err(user_error),
+        Err(DocumentIdExtractionError::MissingDocumentId) if autogenerate_docids => {
+            let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer);
+            Ok(DocumentId::Generated { value: uuid.to_string(), document_nth: count })
         }
-        nested @ PrimaryKey::Nested { .. } => {
-            let mut matching_documents_ids = Vec::new();
-            for (first_level_name, right) in nested.possible_level_names() {
-                if let Some(field_id) = documents_batch_index.id(first_level_name) {
-                    if let Some(value_bytes) = document.get(field_id) {
-                        let object = serde_json::from_slice(value_bytes)
-                            .map_err(InternalError::SerdeJson)?;
-                        fetch_matching_values(object, right, &mut matching_documents_ids);
-
-                        if matching_documents_ids.len() >= 2 {
-                            return Ok(Err(UserError::TooManyDocumentIds {
-                                primary_key: nested.name().to_string(),
-                                document: obkv_to_object(document, documents_batch_index)?,
-                            }));
-                        }
-                    }
-                }
-            }
-
-            match matching_documents_ids.pop() {
-                Some(document_id) => match validate_document_id_value(document_id)? {
-                    Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))),
-                    Err(user_error) => Ok(Err(user_error)),
-                },
-                None => Ok(Err(UserError::MissingDocumentId {
-                    primary_key: nested.name().to_string(),
-                    document: obkv_to_object(document, documents_batch_index)?,
-                })),
-            }
+        Err(DocumentIdExtractionError::MissingDocumentId) => Err(UserError::MissingDocumentId {
+            primary_key: primary_key.name().to_string(),
+            document: obkv_to_object(document, documents_batch_index)?,
+        }),
+        Err(DocumentIdExtractionError::TooManyDocumentIds(_)) => {
+            Err(UserError::TooManyDocumentIds {
+                primary_key: primary_key.name().to_string(),
+                document: obkv_to_object(document, documents_batch_index)?,
+            })
         }
-    }
-}
-
-/// A type that represent the type of primary key that has been set
-/// for this index, a classic flat one or a nested one.
-#[derive(Debug, Clone, Copy)]
-enum PrimaryKey<'a> {
-    Flat { name: &'a str, field_id: FieldId },
-    Nested { name: &'a str },
-}
-
-impl PrimaryKey<'_> {
-    fn flat(name: &str, field_id: FieldId) -> PrimaryKey {
-        PrimaryKey::Flat { name, field_id }
-    }
-
-    fn nested(name: &str) -> PrimaryKey {
-        PrimaryKey::Nested { name }
-    }
-
-    fn name(&self) -> &str {
-        match self {
-            PrimaryKey::Flat { name, .. } => name,
-            PrimaryKey::Nested { name } => name,
-        }
-    }
-
-    /// Returns an `Iterator` that gives all the possible fields names the primary key
-    /// can have depending of the first level name and deepnes of the objects.
-    fn possible_level_names(&self) -> impl Iterator<Item = (&str, &str)> + '_ {
-        let name = self.name();
-        name.match_indices(PRIMARY_KEY_SPLIT_SYMBOL)
-            .map(move |(i, _)| (&name[..i], &name[i + PRIMARY_KEY_SPLIT_SYMBOL.len_utf8()..]))
-            .chain(iter::once((name, "")))
-    }
+    })
 }
 
 /// A type that represents a document id that has been retrieved from a document or auto-generated.
@@ -255,14 +182,6 @@ pub enum DocumentId {
 }
 
 impl DocumentId {
-    fn retrieved(value: String) -> DocumentId {
-        DocumentId::Retrieved { value }
-    }
-
-    fn generated(value: String, document_nth: u32) -> DocumentId {
-        DocumentId::Generated { value, document_nth }
-    }
-
     fn debug(&self) -> String {
         format!("{:?}", self)
     }
@@ -290,66 +209,6 @@ impl fmt::Debug for DocumentId {
     }
 }
 
-fn starts_with(selector: &str, key: &str) -> bool {
-    selector.strip_prefix(key).map_or(false, |tail| {
-        tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true)
-    })
-}
-
-pub fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec<Value>) {
-    match value {
-        Value::Object(object) => fetch_matching_values_in_object(object, selector, "", output),
-        otherwise => output.push(otherwise),
-    }
-}
-
-pub fn fetch_matching_values_in_object(
-    object: Object,
-    selector: &str,
-    base_key: &str,
-    output: &mut Vec<Value>,
-) {
-    for (key, value) in object {
-        let base_key = if base_key.is_empty() {
-            key.to_string()
-        } else {
-            format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, key)
-        };
-
-        if starts_with(selector, &base_key) {
-            match value {
-                Value::Object(object) => {
-                    fetch_matching_values_in_object(object, selector, &base_key, output)
-                }
-                value => output.push(value),
-            }
-        }
-    }
-}
-
-pub fn validate_document_id(document_id: &str) -> Option<&str> {
-    if !document_id.is_empty()
-        && document_id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_'))
-    {
-        Some(document_id)
-    } else {
-        None
-    }
-}
-
-/// Parses a Json encoded document id and validate it, returning a user error when it is one.
-pub fn validate_document_id_value(document_id: Value) -> Result<StdResult<String, UserError>> {
-    match document_id {
-        Value::String(string) => match validate_document_id(&string) {
-            Some(s) if s.len() == string.len() => Ok(Ok(string)),
-            Some(s) => Ok(Ok(s.to_string())),
-            None => Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })),
-        },
-        Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())),
-        content => Ok(Err(UserError::InvalidDocumentId { document_id: content })),
-    }
-}
-
 /// Try to extract an `f64` from a JSON `Value` and return the `Value`
 /// in the `Err` variant if it failed.
 pub fn extract_finite_float_from_value(value: Value) -> StdResult<f64, Value> {
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 2be410ace..d60006289 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -20,10 +20,7 @@ use slice_group_by::GroupBy;
 use typed_chunk::{write_typed_chunk_into_index, TypedChunk};
 
 use self::enrich::enrich_documents_batch;
-pub use self::enrich::{
-    extract_finite_float_from_value, validate_document_id, validate_document_id_value,
-    validate_geo_from_json, DocumentId,
-};
+pub use self::enrich::{extract_finite_float_from_value, validate_geo_from_json, DocumentId};
 pub use self::helpers::{
     as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset,
     fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,

From b11c2afac09bd1eae5a1f73e97efe1651add7e67 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Thu, 9 Nov 2023 14:22:43 +0100
Subject: [PATCH 115/127] Index::external_id_of

---
 milli/src/index.rs | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/milli/src/index.rs b/milli/src/index.rs
index 86ef6105b..5b705e0b2 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -12,6 +12,7 @@ use rstar::RTree;
 use time::OffsetDateTime;
 
 use crate::distance::NDotProductPoint;
+use crate::documents::PrimaryKey;
 use crate::error::{InternalError, UserError};
 use crate::fields_ids_map::FieldsIdsMap;
 use crate::heed_codec::facet::{
@@ -1176,6 +1177,36 @@ impl Index {
         self.iter_documents(rtxn, self.documents_ids(rtxn)?)
     }
 
+    pub fn external_id_of<'a, 't: 'a>(
+        &'a self,
+        rtxn: &'t RoTxn,
+        ids: impl IntoIterator<Item = DocumentId> + 'a,
+    ) -> Result<impl IntoIterator<Item = Result<String>> + 'a> {
+        let fields = self.fields_ids_map(rtxn)?;
+
+        // uses precondition "never called on an empty index"
+        let primary_key = self.primary_key(rtxn)?.ok_or(InternalError::DatabaseMissingEntry {
+            db_name: db_name::MAIN,
+            key: Some(main_key::PRIMARY_KEY_KEY),
+        })?;
+        let primary_key = PrimaryKey::new(primary_key, &fields).ok_or_else(|| {
+            InternalError::FieldIdMapMissingEntry(crate::FieldIdMapMissingEntry::FieldName {
+                field_name: primary_key.to_owned(),
+                process: "external_id_of",
+            })
+        })?;
+        Ok(self.iter_documents(rtxn, ids)?.map(move |entry| -> Result<_> {
+            let (_docid, obkv) = entry?;
+            match primary_key.document_id(&obkv, &fields)? {
+                Ok(document_id) => Ok(document_id),
+                Err(_) => Err(InternalError::DocumentsError(
+                    crate::documents::Error::InvalidDocumentFormat,
+                )
+                .into()),
+            }
+        }))
+    }
+
     pub fn facets_distribution<'a>(&'a self, rtxn: &'a RoTxn) -> FacetDistribution<'a> {
         FacetDistribution::new(rtxn, self)
     }

From 3053e01c05df5d840c1d4efe4810cdafef5a8c70 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Thu, 9 Nov 2023 14:23:02 +0100
Subject: [PATCH 116/127] Batch::remove_documents_from_db_no_batch

---
 milli/src/update/index_documents/mod.rs       | 33 ++++++++
 milli/src/update/index_documents/transform.rs | 83 +++++++++++++++++++
 2 files changed, 116 insertions(+)

diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index d60006289..de40e0b9b 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -194,6 +194,39 @@ where
         Ok((self, Ok(deleted_documents)))
     }
 
+    /// Removes documents from db using their internal document ids.
+    ///
+    /// # Warning
+    ///
+    /// This function is dangerous and will only work correctly if:
+    ///
+    /// - All the passed ids currently exist in the database
+    /// - No batching using the standard `remove_documents` and `add_documents` took place
+    ///
+    /// TODO: make it impossible to call `remove_documents` or `add_documents` on an instance that calls this function.
+    pub fn remove_documents_from_db_no_batch(
+        mut self,
+        to_delete: &RoaringBitmap,
+    ) -> Result<(Self, u64)> {
+        puffin::profile_function!();
+
+        // Early return when there is no document to delete
+        if to_delete.is_empty() {
+            return Ok((self, 0));
+        }
+
+        let deleted_documents = self
+            .transform
+            .as_mut()
+            .expect("Invalid document deletion state")
+            .remove_documents_from_db_no_batch(to_delete, self.wtxn, &self.should_abort)?
+            as u64;
+
+        self.deleted_documents += deleted_documents;
+
+        Ok((self, deleted_documents))
+    }
+
     #[logging_timer::time("IndexDocuments::{}")]
     pub fn execute(mut self) -> Result<DocumentAdditionResult> {
         puffin::profile_function!();
diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index 186974bfe..5f5e698d3 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -481,6 +481,89 @@ impl<'a, 'i> Transform<'a, 'i> {
         Ok(documents_deleted)
     }
 
+    /// The counter part of `read_documents` that removes documents either from the transform or the database.
+    /// It can be called before, after or in between two calls of the `read_documents`.
+    ///
+    /// It needs to update all the internal datastructure in the transform.
+    /// - If the document is coming from the database -> it's marked as a to_delete document
+    /// - If the document to remove was inserted by the `read_documents` method before AND was present in the db,
+    ///   it's marked as `to_delete` + added into the grenad to ensure we don't reinsert it.
+    /// - If the document to remove was inserted by the `read_documents` method before but was NOT present in the db,
+    ///   it's added into the grenad to ensure we don't insert it + removed from the list of new documents ids.
+    /// - If the document to remove was not present in either the db or the transform we do nothing.
+    #[logging_timer::time]
+    pub fn remove_documents_from_db_no_batch<FA>(
+        &mut self,
+        to_remove: &RoaringBitmap,
+        wtxn: &mut heed::RwTxn,
+        should_abort: FA,
+    ) -> Result<usize>
+    where
+        FA: Fn() -> bool + Sync,
+    {
+        puffin::profile_function!();
+
+        let mut documents_deleted = 0;
+        let mut document_sorter_value_buffer = Vec::new();
+        let mut document_sorter_key_buffer = Vec::new();
+        let external_ids = self.index.external_id_of(wtxn, to_remove.iter())?;
+
+        for (to_remove, external_docid) in to_remove.iter().zip(external_ids) {
+            let external_docid = external_docid?;
+            if should_abort() {
+                return Err(Error::InternalError(InternalError::AbortedIndexation));
+            }
+            self.replaced_documents_ids.insert(to_remove);
+
+            // fetch the obkv document
+            let original_key = BEU32::new(to_remove);
+            let base_obkv = self
+                .index
+                .documents
+                .remap_data_type::<heed::types::ByteSlice>()
+                .get(wtxn, &original_key)?
+                .ok_or(InternalError::DatabaseMissingEntry {
+                    db_name: db_name::DOCUMENTS,
+                    key: None,
+                })?;
+
+            // Key is the concatenation of the internal docid and the external one.
+            document_sorter_key_buffer.clear();
+            document_sorter_key_buffer.extend_from_slice(&to_remove.to_be_bytes());
+            document_sorter_key_buffer.extend_from_slice(external_docid.as_bytes());
+            // push it as to delete in the original_sorter
+            document_sorter_value_buffer.clear();
+            document_sorter_value_buffer.push(Operation::Deletion as u8);
+            into_del_add_obkv(
+                KvReaderU16::new(base_obkv),
+                true,
+                false,
+                &mut document_sorter_value_buffer,
+            )?;
+            self.original_sorter
+                .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
+
+            // flatten it and push it as to delete in the flattened_sorter
+            let flattened_obkv = KvReader::new(base_obkv);
+            if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? {
+                // we recreate our buffer with the flattened documents
+                document_sorter_value_buffer.clear();
+                document_sorter_value_buffer.push(Operation::Deletion as u8);
+                into_del_add_obkv(
+                    KvReaderU16::new(&obkv),
+                    true,
+                    false,
+                    &mut document_sorter_value_buffer,
+                )?;
+            }
+            self.flattened_sorter.insert(to_remove.to_be_bytes(), &document_sorter_value_buffer)?;
+
+            documents_deleted += 1;
+        }
+
+        Ok(documents_deleted)
+    }
+
     // Flatten a document from the fields ids map contained in self and insert the new
     // created fields. Returns `None` if the document doesn't need to be flattened.
     fn flatten_from_fields_ids_map(&mut self, obkv: KvReader<FieldId>) -> Result<Option<Vec<u8>>> {

From f8289cd974d957d38645ca66c993ca518ec81955 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Thu, 9 Nov 2023 14:23:15 +0100
Subject: [PATCH 117/127] Use it from delete-by-filter

---
 index-scheduler/src/batch.rs | 17 +----------------
 1 file changed, 1 insertion(+), 16 deletions(-)

diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs
index c9deedb37..5260a9d7e 100644
--- a/index-scheduler/src/batch.rs
+++ b/index-scheduler/src/batch.rs
@@ -1534,18 +1534,6 @@ fn delete_document_by_filter<'a>(
             }
             e => e.into(),
         })?;
-        let external_documents_ids = index.external_documents_ids();
-        // FIXME: for filters matching a lot of documents, this will allocate a huge vec of external docids (strings).
-        // Since what we have is an iterator, it would be better to delete in chunks
-        let external_to_internal: std::result::Result<Vec<_>, RoaringBitmap> =
-            external_documents_ids
-                .find_external_id_of(wtxn, candidates)?
-                .only_external_ids()
-                .collect();
-        let document_ids = match external_to_internal {
-            Ok(external_ids) => external_ids,
-            Err(remaining_ids) => panic!("Couldn't find some external ids {:?}", remaining_ids),
-        };
 
         let config = IndexDocumentsConfig {
             update_method: IndexDocumentsMethod::ReplaceDocuments,
@@ -1561,13 +1549,10 @@ fn delete_document_by_filter<'a>(
             || must_stop_processing.get(),
         )?;
 
-        let (new_builder, user_result) = builder.remove_documents(document_ids)?;
+        let (new_builder, count) = builder.remove_documents_from_db_no_batch(&candidates)?;
         builder = new_builder;
-        // Uses Invariant: remove documents actually always returns Ok for the inner result
-        let count = user_result.unwrap();
 
         let _ = builder.execute()?;
-
         count
     } else {
         0

From 825257da76b33809cbb0496773449c63de023260 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Thu, 9 Nov 2023 16:13:15 +0100
Subject: [PATCH 118/127] Use more efficient method for deletion in benchmarks

---
 benchmarks/benches/indexing.rs | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs
index c31bfab89..65f581b93 100644
--- a/benchmarks/benches/indexing.rs
+++ b/benchmarks/benches/indexing.rs
@@ -864,22 +864,12 @@ fn delete_documents_from_ids(index: Index, document_ids_to_delete: Vec<RoaringBi
 
     let indexer_config = IndexerConfig::default();
     for ids in document_ids_to_delete {
-        let external_documents_ids = index.external_documents_ids();
-        // FIXME: for filters matching a lot of documents, this will allocate a huge vec of external docids (strings).
-        // Since what we have is an iterator, it would be better to delete in chunks
-        let external_to_internal: std::result::Result<Vec<_>, RoaringBitmap> =
-            external_documents_ids
-                .find_external_id_of(&wtxn, ids)
-                .unwrap()
-                .only_external_ids()
-                .collect();
-        let ids = external_to_internal.unwrap();
         let config = IndexDocumentsConfig::default();
 
         let mut builder =
             IndexDocuments::new(&mut wtxn, &index, &indexer_config, config, |_| (), || false)
                 .unwrap();
-        (builder, _) = builder.remove_documents(ids).unwrap();
+        (builder, _) = builder.remove_documents_from_db_no_batch(&ids).unwrap();
         builder.execute().unwrap();
     }
 

From 264b10ec20956cfe599cfeb5e3fc08ae2298bfc8 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Thu, 9 Nov 2023 16:23:20 +0100
Subject: [PATCH 119/127] Fixup documentation

---
 milli/src/update/index_documents/transform.rs | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index 5f5e698d3..23313547a 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -481,16 +481,16 @@ impl<'a, 'i> Transform<'a, 'i> {
         Ok(documents_deleted)
     }
 
-    /// The counter part of `read_documents` that removes documents either from the transform or the database.
-    /// It can be called before, after or in between two calls of the `read_documents`.
+    /// Removes documents from db using their internal document ids.
     ///
-    /// It needs to update all the internal datastructure in the transform.
-    /// - If the document is coming from the database -> it's marked as a to_delete document
-    /// - If the document to remove was inserted by the `read_documents` method before AND was present in the db,
-    ///   it's marked as `to_delete` + added into the grenad to ensure we don't reinsert it.
-    /// - If the document to remove was inserted by the `read_documents` method before but was NOT present in the db,
-    ///   it's added into the grenad to ensure we don't insert it + removed from the list of new documents ids.
-    /// - If the document to remove was not present in either the db or the transform we do nothing.
+    /// # Warning
+    ///
+    /// This function is dangerous and will only work correctly if:
+    ///
+    /// - All the passed ids currently exist in the database
+    /// - No batching using the standard `remove_documents` and `add_documents` took place
+    ///
+    /// TODO: make it impossible to call `remove_documents` or `add_documents` on an instance that calls this function.
     #[logging_timer::time]
     pub fn remove_documents_from_db_no_batch<FA>(
         &mut self,

From 1f364105419170f8c5a65a57e23f43a45c58725d Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Mon, 13 Nov 2023 13:36:39 +0100
Subject: [PATCH 120/127] Update tests

---
 milli/src/search/new/tests/proximity.rs        | 18 +++++++++---------
 ...sts__proximity__proximity_prefix_db-14.snap | 18 +++++++++---------
 ...ests__proximity__proximity_prefix_db-2.snap | 18 +++++++++---------
 ...ests__proximity__proximity_prefix_db-8.snap | 18 +++++++++---------
 milli/src/snapshot_tests.rs                    | 16 ----------------
 5 files changed, 36 insertions(+), 52 deletions(-)

diff --git a/milli/src/search/new/tests/proximity.rs b/milli/src/search/new/tests/proximity.rs
index 217ebe9b3..2d181a537 100644
--- a/milli/src/search/new/tests/proximity.rs
+++ b/milli/src/search/new/tests/proximity.rs
@@ -371,7 +371,7 @@ fn test_proximity_prefix_db() {
     s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
     s.query("best s");
     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
-    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 6, 7, 8, 11, 15]");
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 9, 6, 7, 8, 11, 12, 13, 15]");
     insta::assert_snapshot!(format!("{document_scores:#?}"));
     let texts = collect_field_values(&index, &txn, "text", &documents_ids);
 
@@ -379,13 +379,13 @@ fn test_proximity_prefix_db() {
     insta::assert_debug_snapshot!(texts, @r###"
     [
         "\"this is the best summer meal\"",
-        "\"summer best\"",
         "\"this is the best meal of summer\"",
-        "\"summer x best\"",
         "\"this is the best meal I have ever had in such a beautiful summer day\"",
         "\"this is the best cooked meal of the summer\"",
         "\"this is the best meal of the summer\"",
         "\"summer x y best\"",
+        "\"summer x best\"",
+        "\"summer best\"",
         "\"this is the best meal I have ever had in such a beautiful winter day\"",
     ]
     "###);
@@ -423,20 +423,20 @@ fn test_proximity_prefix_db() {
     s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
     s.query("best win");
     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
-    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 15, 16, 17, 20]");
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 18, 15, 16, 17, 20, 21, 22]");
     insta::assert_snapshot!(format!("{document_scores:#?}"));
     let texts = collect_field_values(&index, &txn, "text", &documents_ids);
 
     insta::assert_debug_snapshot!(texts, @r###"
     [
         "\"this is the best winter meal\"",
-        "\"winter best\"",
         "\"this is the best meal of winter\"",
-        "\"winter x best\"",
         "\"this is the best meal I have ever had in such a beautiful winter day\"",
         "\"this is the best cooked meal of the winter\"",
         "\"this is the best meal of the winter\"",
         "\"winter x y best\"",
+        "\"winter x best\"",
+        "\"winter best\"",
     ]
     "###);
 
@@ -471,20 +471,20 @@ fn test_proximity_prefix_db() {
     s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
     s.query("best wi");
     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
-    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 15, 16, 17, 20]");
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 18, 15, 16, 17, 20, 21, 22]");
     insta::assert_snapshot!(format!("{document_scores:#?}"));
     let texts = collect_field_values(&index, &txn, "text", &documents_ids);
 
     insta::assert_debug_snapshot!(texts, @r###"
     [
         "\"this is the best winter meal\"",
-        "\"winter best\"",
         "\"this is the best meal of winter\"",
-        "\"winter x best\"",
         "\"this is the best meal I have ever had in such a beautiful winter day\"",
         "\"this is the best cooked meal of the winter\"",
         "\"this is the best meal of the winter\"",
         "\"winter x y best\"",
+        "\"winter x best\"",
+        "\"winter best\"",
     ]
     "###);
 }
diff --git a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-14.snap b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-14.snap
index 8f3b964c1..efcfef7f1 100644
--- a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-14.snap
+++ b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-14.snap
@@ -11,14 +11,6 @@ expression: "format!(\"{document_scores:#?}\")"
             },
         ),
     ],
-    [
-        Proximity(
-            Rank {
-                rank: 3,
-                max_rank: 4,
-            },
-        ),
-    ],
     [
         Proximity(
             Rank {
@@ -30,7 +22,15 @@ expression: "format!(\"{document_scores:#?}\")"
     [
         Proximity(
             Rank {
-                rank: 2,
+                rank: 1,
+                max_rank: 4,
+            },
+        ),
+    ],
+    [
+        Proximity(
+            Rank {
+                rank: 1,
                 max_rank: 4,
             },
         ),
diff --git a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-2.snap b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-2.snap
index 1ee6bfc91..242bc3424 100644
--- a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-2.snap
+++ b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-2.snap
@@ -11,14 +11,6 @@ expression: "format!(\"{document_scores:#?}\")"
             },
         ),
     ],
-    [
-        Proximity(
-            Rank {
-                rank: 3,
-                max_rank: 4,
-            },
-        ),
-    ],
     [
         Proximity(
             Rank {
@@ -30,7 +22,15 @@ expression: "format!(\"{document_scores:#?}\")"
     [
         Proximity(
             Rank {
-                rank: 2,
+                rank: 1,
+                max_rank: 4,
+            },
+        ),
+    ],
+    [
+        Proximity(
+            Rank {
+                rank: 1,
                 max_rank: 4,
             },
         ),
diff --git a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-8.snap b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-8.snap
index 8f3b964c1..efcfef7f1 100644
--- a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-8.snap
+++ b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-8.snap
@@ -11,14 +11,6 @@ expression: "format!(\"{document_scores:#?}\")"
             },
         ),
     ],
-    [
-        Proximity(
-            Rank {
-                rank: 3,
-                max_rank: 4,
-            },
-        ),
-    ],
     [
         Proximity(
             Rank {
@@ -30,7 +22,15 @@ expression: "format!(\"{document_scores:#?}\")"
     [
         Proximity(
             Rank {
-                rank: 2,
+                rank: 1,
+                max_rank: 4,
+            },
+        ),
+    ],
+    [
+        Proximity(
+            Rank {
+                rank: 1,
                 max_rank: 4,
             },
         ),
diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs
index f3f1eb5a5..28c4cb45c 100644
--- a/milli/src/snapshot_tests.rs
+++ b/milli/src/snapshot_tests.rs
@@ -219,22 +219,6 @@ pub fn snap_word_pair_proximity_docids(index: &Index) -> String {
         &format!("{proximity:<2} {word1:<16} {word2:<16} {}", display_bitmap(&b))
     })
 }
-pub fn snap_word_prefix_pair_proximity_docids(index: &Index) -> String {
-    make_db_snap_from_iter!(index, word_prefix_pair_proximity_docids, |(
-        (proximity, word1, prefix),
-        b,
-    )| {
-        &format!("{proximity:<2} {word1:<16} {prefix:<4} {}", display_bitmap(&b))
-    })
-}
-pub fn snap_prefix_word_pair_proximity_docids(index: &Index) -> String {
-    make_db_snap_from_iter!(index, prefix_word_pair_proximity_docids, |(
-        (proximity, prefix, word2),
-        b,
-    )| {
-        &format!("{proximity:<2} {prefix:<4} {word2:<16} {}", display_bitmap(&b))
-    })
-}
 pub fn snap_word_position_docids(index: &Index) -> String {
     make_db_snap_from_iter!(index, word_position_docids, |((word, position), b)| {
         &format!("{word:<16} {position:<6} {}", display_bitmap(&b))

From 378deb0bef48269ee373fc3a6426e24c80393b2e Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Mon, 13 Nov 2023 13:37:58 +0100
Subject: [PATCH 121/127] Rename trait

---
 milli/src/documents/mod.rs         |  6 ++----
 milli/src/documents/primary_key.rs | 10 +++++++---
 milli/src/fields_ids_map.rs        |  2 +-
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs
index 4429f083d..a874ac17e 100644
--- a/milli/src/documents/mod.rs
+++ b/milli/src/documents/mod.rs
@@ -12,9 +12,7 @@ use bimap::BiHashMap;
 pub use builder::DocumentsBatchBuilder;
 pub use enriched::{EnrichedDocument, EnrichedDocumentsBatchCursor, EnrichedDocumentsBatchReader};
 use obkv::KvReader;
-pub use primary_key::{
-    DocumentIdExtractionError, FieldDistribution, PrimaryKey, DEFAULT_PRIMARY_KEY,
-};
+pub use primary_key::{DocumentIdExtractionError, FieldIdMapper, PrimaryKey, DEFAULT_PRIMARY_KEY};
 pub use reader::{DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchReader};
 use serde::{Deserialize, Serialize};
 
@@ -91,7 +89,7 @@ impl DocumentsBatchIndex {
     }
 }
 
-impl FieldDistribution for DocumentsBatchIndex {
+impl FieldIdMapper for DocumentsBatchIndex {
     fn id(&self, name: &str) -> Option<FieldId> {
         self.id(name)
     }
diff --git a/milli/src/documents/primary_key.rs b/milli/src/documents/primary_key.rs
index dd97f2608..16a95c21f 100644
--- a/milli/src/documents/primary_key.rs
+++ b/milli/src/documents/primary_key.rs
@@ -11,7 +11,11 @@ const PRIMARY_KEY_SPLIT_SYMBOL: char = '.';
 /// The default primary that is used when not specified.
 pub const DEFAULT_PRIMARY_KEY: &str = "id";
 
-pub trait FieldDistribution {
+/// Trait for objects that can map the name of a field to its [`FieldId`].
+pub trait FieldIdMapper {
+    /// Attempts to map the passed name to its [`FieldId`].
+    ///
+    /// `None` if the field with this name was not found.
     fn id(&self, name: &str) -> Option<FieldId>;
 }
 
@@ -30,7 +34,7 @@ pub enum DocumentIdExtractionError {
 }
 
 impl<'a> PrimaryKey<'a> {
-    pub fn new(path: &'a str, fields: &impl FieldDistribution) -> Option<Self> {
+    pub fn new(path: &'a str, fields: &impl FieldIdMapper) -> Option<Self> {
         Some(if path.contains(PRIMARY_KEY_SPLIT_SYMBOL) {
             Self::Nested { name: path }
         } else {
@@ -49,7 +53,7 @@ impl<'a> PrimaryKey<'a> {
     pub fn document_id(
         &self,
         document: &obkv::KvReader<FieldId>,
-        fields: &impl FieldDistribution,
+        fields: &impl FieldIdMapper,
     ) -> Result<StdResult<String, DocumentIdExtractionError>> {
         match self {
             PrimaryKey::Flat { name: _, field_id } => match document.get(*field_id) {
diff --git a/milli/src/fields_ids_map.rs b/milli/src/fields_ids_map.rs
index 85320c168..9c1c87f82 100644
--- a/milli/src/fields_ids_map.rs
+++ b/milli/src/fields_ids_map.rs
@@ -81,7 +81,7 @@ impl Default for FieldsIdsMap {
     }
 }
 
-impl crate::documents::FieldDistribution for FieldsIdsMap {
+impl crate::documents::FieldIdMapper for FieldsIdsMap {
     fn id(&self, name: &str) -> Option<FieldId> {
         self.id(name)
     }

From 772964125d6f59ef4eaa1957c305211f45f07526 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Mon, 13 Nov 2023 13:51:22 +0100
Subject: [PATCH 122/127] Factor removal of document from DB

---
 milli/src/update/index_documents/transform.rs | 143 +++++++-----------
 1 file changed, 56 insertions(+), 87 deletions(-)

diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index 23313547a..8dc88efb9 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -421,52 +421,13 @@ impl<'a, 'i> Transform<'a, 'i> {
             // Then we push the document in sorters in deletion mode.
             let deleted_from_db = match external_documents_ids.get(wtxn, &to_remove)? {
                 Some(docid) => {
-                    self.replaced_documents_ids.insert(docid);
-
-                    // fetch the obkv document
-                    let original_key = BEU32::new(docid);
-                    let base_obkv = self
-                        .index
-                        .documents
-                        .remap_data_type::<heed::types::ByteSlice>()
-                        .get(wtxn, &original_key)?
-                        .ok_or(InternalError::DatabaseMissingEntry {
-                            db_name: db_name::DOCUMENTS,
-                            key: None,
-                        })?;
-
-                    // Key is the concatenation of the internal docid and the external one.
-                    document_sorter_key_buffer.clear();
-                    document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes());
-                    document_sorter_key_buffer.extend_from_slice(to_remove.as_bytes());
-                    // push it as to delete in the original_sorter
-                    document_sorter_value_buffer.clear();
-                    document_sorter_value_buffer.push(Operation::Deletion as u8);
-                    into_del_add_obkv(
-                        KvReaderU16::new(base_obkv),
-                        true,
-                        false,
+                    self.remove_document_from_db(
+                        docid,
+                        to_remove,
+                        wtxn,
+                        &mut document_sorter_key_buffer,
                         &mut document_sorter_value_buffer,
                     )?;
-                    self.original_sorter
-                        .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
-
-                    // flatten it and push it as to delete in the flattened_sorter
-                    let flattened_obkv = KvReader::new(base_obkv);
-                    if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? {
-                        // we recreate our buffer with the flattened documents
-                        document_sorter_value_buffer.clear();
-                        document_sorter_value_buffer.push(Operation::Deletion as u8);
-                        into_del_add_obkv(
-                            KvReaderU16::new(&obkv),
-                            true,
-                            false,
-                            &mut document_sorter_value_buffer,
-                        )?;
-                    }
-                    self.flattened_sorter
-                        .insert(docid.to_be_bytes(), &document_sorter_value_buffer)?;
-
                     true
                 }
                 None => false,
@@ -508,55 +469,18 @@ impl<'a, 'i> Transform<'a, 'i> {
         let mut document_sorter_key_buffer = Vec::new();
         let external_ids = self.index.external_id_of(wtxn, to_remove.iter())?;
 
-        for (to_remove, external_docid) in to_remove.iter().zip(external_ids) {
+        for (internal_docid, external_docid) in to_remove.iter().zip(external_ids) {
             let external_docid = external_docid?;
             if should_abort() {
                 return Err(Error::InternalError(InternalError::AbortedIndexation));
             }
-            self.replaced_documents_ids.insert(to_remove);
-
-            // fetch the obkv document
-            let original_key = BEU32::new(to_remove);
-            let base_obkv = self
-                .index
-                .documents
-                .remap_data_type::<heed::types::ByteSlice>()
-                .get(wtxn, &original_key)?
-                .ok_or(InternalError::DatabaseMissingEntry {
-                    db_name: db_name::DOCUMENTS,
-                    key: None,
-                })?;
-
-            // Key is the concatenation of the internal docid and the external one.
-            document_sorter_key_buffer.clear();
-            document_sorter_key_buffer.extend_from_slice(&to_remove.to_be_bytes());
-            document_sorter_key_buffer.extend_from_slice(external_docid.as_bytes());
-            // push it as to delete in the original_sorter
-            document_sorter_value_buffer.clear();
-            document_sorter_value_buffer.push(Operation::Deletion as u8);
-            into_del_add_obkv(
-                KvReaderU16::new(base_obkv),
-                true,
-                false,
+            self.remove_document_from_db(
+                internal_docid,
+                external_docid,
+                wtxn,
+                &mut document_sorter_key_buffer,
                 &mut document_sorter_value_buffer,
             )?;
-            self.original_sorter
-                .insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
-
-            // flatten it and push it as to delete in the flattened_sorter
-            let flattened_obkv = KvReader::new(base_obkv);
-            if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? {
-                // we recreate our buffer with the flattened documents
-                document_sorter_value_buffer.clear();
-                document_sorter_value_buffer.push(Operation::Deletion as u8);
-                into_del_add_obkv(
-                    KvReaderU16::new(&obkv),
-                    true,
-                    false,
-                    &mut document_sorter_value_buffer,
-                )?;
-            }
-            self.flattened_sorter.insert(to_remove.to_be_bytes(), &document_sorter_value_buffer)?;
 
             documents_deleted += 1;
         }
@@ -564,6 +488,51 @@ impl<'a, 'i> Transform<'a, 'i> {
         Ok(documents_deleted)
     }
 
+    fn remove_document_from_db(
+        &mut self,
+        internal_docid: u32,
+        external_docid: String,
+        txn: &heed::RoTxn,
+        document_sorter_key_buffer: &mut Vec<u8>,
+        document_sorter_value_buffer: &mut Vec<u8>,
+    ) -> Result<()> {
+        self.replaced_documents_ids.insert(internal_docid);
+
+        // fetch the obkv document
+        let original_key = BEU32::new(internal_docid);
+        let base_obkv = self
+            .index
+            .documents
+            .remap_data_type::<heed::types::ByteSlice>()
+            .get(txn, &original_key)?
+            .ok_or(InternalError::DatabaseMissingEntry {
+                db_name: db_name::DOCUMENTS,
+                key: None,
+            })?;
+
+        // Key is the concatenation of the internal docid and the external one.
+        document_sorter_key_buffer.clear();
+        document_sorter_key_buffer.extend_from_slice(&internal_docid.to_be_bytes());
+        document_sorter_key_buffer.extend_from_slice(external_docid.as_bytes());
+        // push it as to delete in the original_sorter
+        document_sorter_value_buffer.clear();
+        document_sorter_value_buffer.push(Operation::Deletion as u8);
+        into_del_add_obkv(KvReaderU16::new(base_obkv), true, false, document_sorter_value_buffer)?;
+        self.original_sorter.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
+
+        // flatten it and push it as to delete in the flattened_sorter
+        let flattened_obkv = KvReader::new(base_obkv);
+        if let Some(obkv) = self.flatten_from_fields_ids_map(flattened_obkv)? {
+            // we recreate our buffer with the flattened documents
+            document_sorter_value_buffer.clear();
+            document_sorter_value_buffer.push(Operation::Deletion as u8);
+            into_del_add_obkv(KvReaderU16::new(&obkv), true, false, document_sorter_value_buffer)?;
+        }
+        self.flattened_sorter
+            .insert(internal_docid.to_be_bytes(), &document_sorter_value_buffer)?;
+        Ok(())
+    }
+
     // Flatten a document from the fields ids map contained in self and insert the new
     // created fields. Returns `None` if the document doesn't need to be flattened.
     fn flatten_from_fields_ids_map(&mut self, obkv: KvReader<FieldId>) -> Result<Option<Vec<u8>>> {

From 263e82561973020c112cd7d74a76c479d76de1c8 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Mon, 20 Nov 2023 10:06:29 +0100
Subject: [PATCH 123/127] Fix typos in comments

---
 .../extract/extract_docid_word_positions.rs               | 8 ++++----
 .../extract/extract_fid_docid_facet_values.rs             | 4 ++--
 milli/src/update/index_documents/mod.rs                   | 1 -
 milli/src/update/index_documents/typed_chunk.rs           | 2 +-
 4 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
index 0dcd6a42a..303b64271 100644
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -204,7 +204,7 @@ fn tokenizer_builder<'a>(
     tokenizer_builder
 }
 
-/// Extract words maped with their positions of a document,
+/// Extract words mapped with their positions of a document,
 /// ensuring no Language detection mistakes was made.
 #[allow(clippy::too_many_arguments)] // FIXME: consider grouping arguments in a struct
 fn lang_safe_tokens_from_document<'a>(
@@ -273,7 +273,7 @@ fn lang_safe_tokens_from_document<'a>(
     Ok((&buffers.obkv_buffer, script_language_word_count))
 }
 
-/// Extract words maped with their positions of a document.
+/// Extract words mapped with their positions of a document.
 fn tokens_from_document<'a>(
     obkv: &KvReader<FieldId>,
     searchable_fields: &Option<HashSet<FieldId>>,
@@ -294,11 +294,11 @@ fn tokens_from_document<'a>(
                 let value =
                     serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
 
-                // prepare writting destination.
+                // prepare writing destination.
                 buffers.obkv_positions_buffer.clear();
                 let mut writer = KvWriterU16::new(&mut buffers.obkv_positions_buffer);
 
-                // convert json into an unique string.
+                // convert json into a unique string.
                 buffers.field_buffer.clear();
                 if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) {
                     // create an iterator of token with their positions.
diff --git a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
index 2dce90cfc..3fcec3e79 100644
--- a/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
+++ b/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs
@@ -75,7 +75,7 @@ pub fn extract_fid_docid_facet_values<R: io::Read + io::Seek>(
     let mut facet_is_null_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
     let mut facet_is_empty_docids = BTreeMap::<FieldId, (RoaringBitmap, RoaringBitmap)>::new();
 
-    // We create two buffer for mutable ref issues with closures.
+    // We create two buffers for mutable ref issues with closures.
     let mut numbers_key_buffer = Vec::new();
     let mut strings_key_buffer = Vec::new();
 
@@ -333,7 +333,7 @@ where
                     key_buffer.extend_from_slice(&value_bytes);
                     key_buffer.extend_from_slice(&number.to_be_bytes());
 
-                    // We insert only the Del part of the Obkv to inform
+                    // We insert only the Add part of the Obkv to inform
                     // that we only want to remove all those numbers.
                     let mut obkv = KvWriterDelAdd::memory();
                     obkv.insert(DelAdd::Addition, ().as_bytes())?;
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 2289666ed..113114681 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -449,7 +449,6 @@ where
                 otherwise => otherwise,
             };
 
-            // FIXME: return newly added as well as newly deleted documents
             let (docids, is_merged_database) =
                 write_typed_chunk_into_index(typed_chunk, self.index, self.wtxn, index_is_empty)?;
             if !docids.is_empty() {
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index 90f9b7739..dda2ebc1c 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -409,7 +409,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 dims.into_iter().max_by_key(|(_, count)| *count).map(|(len, _)| len)
             };
 
-            // Ensure that the vector lenghts are correct and
+            // Ensure that the vector lengths are correct and
             // prepare the vectors before inserting them in the HNSW.
             let mut points = Vec::new();
             let mut docids = Vec::new();

From d59b7db8d09bb5881adc223c59cc2e3c8a999546 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Mon, 20 Nov 2023 10:10:05 +0100
Subject: [PATCH 124/127] remove unused code

---
 milli/src/external_documents_ids.rs           | 71 -------------------
 .../helpers/merge_functions.rs                | 12 ----
 .../src/update/index_documents/helpers/mod.rs | 14 ++--
 3 files changed, 4 insertions(+), 93 deletions(-)

diff --git a/milli/src/external_documents_ids.rs b/milli/src/external_documents_ids.rs
index 0e4891649..ec419446c 100644
--- a/milli/src/external_documents_ids.rs
+++ b/milli/src/external_documents_ids.rs
@@ -2,7 +2,6 @@ use std::collections::HashMap;
 
 use heed::types::{OwnedType, Str};
 use heed::{Database, RoIter, RoTxn, RwTxn};
-use roaring::RoaringBitmap;
 
 use crate::{DocumentId, BEU32};
 
@@ -44,23 +43,6 @@ impl ExternalDocumentsIds {
         Ok(map)
     }
 
-    /// Looks for the internal ids in the passed bitmap, and returns an iterator over the mapping between
-    /// these internal ids and their external id.
-    ///
-    /// The returned iterator has `Result<(String, DocumentId), RoaringBitmap>` as `Item`,
-    /// where the returned values can be:
-    /// - `Ok((external_id, internal_id))`: if a mapping was found
-    /// - `Err(remaining_ids)`: if the external ids for some of the requested internal ids weren't found.
-    ///   In that case the returned bitmap contains the internal ids whose external ids were not found after traversing
-    ///   the entire fst.
-    pub fn find_external_id_of<'t>(
-        &self,
-        rtxn: &'t RoTxn,
-        internal_ids: RoaringBitmap,
-    ) -> heed::Result<ExternalToInternalOwnedIterator<'t>> {
-        self.0.iter(rtxn).map(|iter| ExternalToInternalOwnedIterator { iter, internal_ids })
-    }
-
     /// Applies the list of operations passed as argument, modifying the current external to internal id mapping.
     ///
     /// If the list contains multiple operations on the same external id, then the result is unspecified.
@@ -91,56 +73,3 @@ impl ExternalDocumentsIds {
         self.0.iter(rtxn)
     }
 }
-
-/// An iterator over mappings between requested internal ids and external ids.
-///
-/// See [`ExternalDocumentsIds::find_external_id_of`] for details.
-pub struct ExternalToInternalOwnedIterator<'t> {
-    iter: RoIter<'t, Str, OwnedType<BEU32>>,
-    internal_ids: RoaringBitmap,
-}
-
-impl<'t> Iterator for ExternalToInternalOwnedIterator<'t> {
-    /// A result indicating if a mapping was found, or if the stream was exhausted without finding all internal ids.
-    type Item = Result<(&'t str, DocumentId), RoaringBitmap>;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        // if all requested ids were found, we won't find any other, so short-circuit
-        if self.internal_ids.is_empty() {
-            return None;
-        }
-        loop {
-            let (external, internal) = match self.iter.next() {
-                Some(Ok((external, internal))) => (external, internal),
-                // TODO manage this better, remove panic
-                Some(Err(e)) => panic!("{}", e),
-                _ => {
-                    // we exhausted the stream but we still have some internal ids to find
-                    let remaining_ids = std::mem::take(&mut self.internal_ids);
-                    return Some(Err(remaining_ids));
-                    // note: next calls to `next` will return `None` since we replaced the internal_ids
-                    // with the default empty bitmap
-                }
-            };
-            let internal = internal.get();
-            let was_contained = self.internal_ids.remove(internal);
-            if was_contained {
-                return Some(Ok((external, internal)));
-            }
-        }
-    }
-}
-
-impl<'t> ExternalToInternalOwnedIterator<'t> {
-    /// Returns the bitmap of internal ids whose external id are yet to be found
-    pub fn remaining_internal_ids(&self) -> &RoaringBitmap {
-        &self.internal_ids
-    }
-
-    /// Consumes this iterator and returns an iterator over only the external ids, ignoring the internal ids.
-    ///
-    /// Use this when you don't need the mapping between the external and the internal ids.
-    pub fn only_external_ids(self) -> impl Iterator<Item = Result<String, RoaringBitmap>> + 't {
-        self.map(|res| res.map(|(external, _internal)| external.to_owned()))
-    }
-}
diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs
index 5d9ca7ef2..d355ead68 100644
--- a/milli/src/update/index_documents/helpers/merge_functions.rs
+++ b/milli/src/update/index_documents/helpers/merge_functions.rs
@@ -12,18 +12,6 @@ use crate::Result;
 
 pub type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>>;
 
-#[allow(unused)]
-pub fn concat_u32s_array<'a>(_key: &[u8], values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
-    if values.len() == 1 {
-        Ok(values[0].clone())
-    } else {
-        let capacity = values.iter().map(|v| v.len()).sum::<usize>();
-        let mut output = Vec::with_capacity(capacity);
-        values.iter().for_each(|integers| output.extend_from_slice(integers));
-        Ok(Cow::Owned(output))
-    }
-}
-
 pub fn serialize_roaring_bitmap(bitmap: &RoaringBitmap, buffer: &mut Vec<u8>) -> io::Result<()> {
     buffer.clear();
     buffer.reserve(bitmap.serialized_size());
diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs
index 841c09543..52638d6f6 100644
--- a/milli/src/update/index_documents/helpers/mod.rs
+++ b/milli/src/update/index_documents/helpers/mod.rs
@@ -13,11 +13,10 @@ pub use grenad_helpers::{
     GrenadParameters, MergeableReader,
 };
 pub use merge_functions::{
-    concat_u32s_array, keep_first, keep_latest_obkv, merge_btreeset_string,
-    merge_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps,
-    merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_roaring_bitmaps,
-    obkvs_keep_last_addition_merge_deletions, obkvs_merge_additions_and_deletions,
-    serialize_roaring_bitmap, MergeFn,
+    keep_first, keep_latest_obkv, merge_btreeset_string, merge_cbo_roaring_bitmaps,
+    merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
+    merge_roaring_bitmaps, obkvs_keep_last_addition_merge_deletions,
+    obkvs_merge_additions_and_deletions, serialize_roaring_bitmap, MergeFn,
 };
 
 use crate::MAX_WORD_LENGTH;
@@ -46,11 +45,6 @@ where
     Some((head, tail))
 }
 
-#[allow(unused)]
-pub fn read_u32_ne_bytes(bytes: &[u8]) -> impl Iterator<Item = u32> + '_ {
-    bytes.chunks_exact(4).flat_map(TryInto::try_into).map(u32::from_ne_bytes)
-}
-
 /// Converts an fst Stream into an HashSet of Strings.
 pub fn fst_stream_into_hashset<'f, I, S>(stream: I) -> HashSet<Vec<u8>>
 where

From ebef6bc24db04dac8b463c820b372d1895ea7584 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Mon, 20 Nov 2023 10:14:57 +0100
Subject: [PATCH 125/127] Simplify documents database writing

---
 milli/src/update/index_documents/typed_chunk.rs | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index dda2ebc1c..4f9f0ef6f 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -140,20 +140,9 @@ pub(crate) fn write_typed_chunk_into_index(
 
                 for (field_id, value) in reader.iter() {
                     let del_add_reader = KvReaderDelAdd::new(value);
-                    match (
-                        del_add_reader.get(DelAdd::Deletion),
-                        del_add_reader.get(DelAdd::Addition),
-                    ) {
-                        (None, None) => {}
-                        (None, Some(value)) => {
-                            // anyway, write
-                            writer.insert(field_id, value)?;
-                        }
-                        (Some(_), None) => {}
-                        (Some(_), Some(value)) => {
-                            // updated field, write
-                            writer.insert(field_id, value)?;
-                        }
+
+                    if let Some(addition) = del_add_reader.get(DelAdd::Addition) {
+                        writer.insert(field_id, addition)?;
                     }
                 }
 

From 39cbb499c2db09fe2385a5bdb2f294298d5b6366 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Mon, 20 Nov 2023 10:20:39 +0100
Subject: [PATCH 126/127] Small fixes

---
 milli/src/update/del_add.rs                                   | 4 +++-
 .../index_documents/extract/extract_word_position_docids.rs   | 1 +
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/milli/src/update/del_add.rs b/milli/src/update/del_add.rs
index dc7c0409a..07a20b025 100644
--- a/milli/src/update/del_add.rs
+++ b/milli/src/update/del_add.rs
@@ -53,7 +53,9 @@ pub fn into_del_add_obkv<K: obkv::Key + PartialOrd>(
             value_writer.insert(DelAdd::Addition, value)?;
         }
         value_writer.finish()?;
-        writer.insert(key, &value_buffer)?;
+        if !value_buffer.is_empty() {
+            writer.insert(key, &value_buffer)?;
+        }
     }
 
     writer.finish()
diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs
index 1b9ec66ff..89b77d140 100644
--- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs
@@ -121,6 +121,7 @@ fn words_position_into_sorter(
                 key
             }
             Both(key, _) => {
+                // both values need to be kept because they will be used in other extractors.
                 value_writer.insert(DelAdd::Deletion, document_id.to_ne_bytes()).unwrap();
                 value_writer.insert(DelAdd::Addition, document_id.to_ne_bytes()).unwrap();
                 key

From d3575fb0280cb5a37cd17d1a904026080092ffe4 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Mon, 20 Nov 2023 10:53:40 +0100
Subject: [PATCH 127/127] Make into_del_add_obkv parameters more human readable

---
 milli/src/update/del_add.rs                   | 25 ++++---
 milli/src/update/index_documents/transform.rs | 75 +++++++++++++------
 2 files changed, 68 insertions(+), 32 deletions(-)

diff --git a/milli/src/update/del_add.rs b/milli/src/update/del_add.rs
index 07a20b025..794beb5df 100644
--- a/milli/src/update/del_add.rs
+++ b/milli/src/update/del_add.rs
@@ -32,13 +32,12 @@ impl Key for DelAdd {
 
 /// Creates a Kv<K, Kv<DelAdd, value>> from Kv<K, value>
 ///
-/// if deletion is `true`, the value will be inserted behind a DelAdd::Deletion key.
-/// if addition is `true`, the value will be inserted behind a DelAdd::Addition key.
-/// if both deletion and addition are `true, the value will be inserted in both keys.
+/// - `Deletion`: put all the values under `DelAdd::Deletion`.
+/// - `Addition`: put all the values under `DelAdd::Addition`.
+/// - `DeletionAndAddition`: put all the values under `DelAdd::Deletion` and `DelAdd::Addition`.
 pub fn into_del_add_obkv<K: obkv::Key + PartialOrd>(
     reader: obkv::KvReader<K>,
-    deletion: bool,
-    addition: bool,
+    operation: DelAddOperation,
     buffer: &mut Vec<u8>,
 ) -> Result<(), std::io::Error> {
     let mut writer = obkv::KvWriter::new(buffer);
@@ -46,21 +45,27 @@ pub fn into_del_add_obkv<K: obkv::Key + PartialOrd>(
     for (key, value) in reader.iter() {
         value_buffer.clear();
         let mut value_writer = KvWriterDelAdd::new(&mut value_buffer);
-        if deletion {
+        if matches!(operation, DelAddOperation::Deletion | DelAddOperation::DeletionAndAddition) {
             value_writer.insert(DelAdd::Deletion, value)?;
         }
-        if addition {
+        if matches!(operation, DelAddOperation::Addition | DelAddOperation::DeletionAndAddition) {
             value_writer.insert(DelAdd::Addition, value)?;
         }
         value_writer.finish()?;
-        if !value_buffer.is_empty() {
-            writer.insert(key, &value_buffer)?;
-        }
+        writer.insert(key, &value_buffer)?;
     }
 
     writer.finish()
 }
 
+/// Enum controlling the side of the DelAdd obkv in which the provided value will be written.
+#[derive(Debug, Clone, Copy)]
+pub enum DelAddOperation {
+    Deletion,
+    Addition,
+    DeletionAndAddition,
+}
+
 /// Creates a Kv<K, Kv<DelAdd, value>> from two Kv<K, value>
 ///
 /// putting each deletion obkv's keys under an DelAdd::Deletion
diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index 8dc88efb9..323bc3da7 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -21,7 +21,7 @@ use super::{IndexDocumentsMethod, IndexerConfig};
 use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
 use crate::error::{Error, InternalError, UserError};
 use crate::index::{db_name, main_key};
-use crate::update::del_add::{into_del_add_obkv, DelAdd, KvReaderDelAdd};
+use crate::update::del_add::{into_del_add_obkv, DelAdd, DelAddOperation, KvReaderDelAdd};
 use crate::update::index_documents::GrenadParameters;
 use crate::update::{AvailableDocumentsIds, ClearDocuments, UpdateIndexingStep};
 use crate::{
@@ -265,8 +265,12 @@ impl<'a, 'i> Transform<'a, 'i> {
                     skip_insertion = true;
                 } else {
                     // we associate the base document with the new key, everything will get merged later.
-                    let keep_original_version =
-                        self.index_documents_method == IndexDocumentsMethod::UpdateDocuments;
+                    let deladd_operation = match self.index_documents_method {
+                        IndexDocumentsMethod::UpdateDocuments => {
+                            DelAddOperation::DeletionAndAddition
+                        }
+                        IndexDocumentsMethod::ReplaceDocuments => DelAddOperation::Deletion,
+                    };
                     document_sorter_key_buffer.clear();
                     document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes());
                     document_sorter_key_buffer.extend_from_slice(external_id.as_bytes());
@@ -274,8 +278,7 @@ impl<'a, 'i> Transform<'a, 'i> {
                     document_sorter_value_buffer.push(Operation::Addition as u8);
                     into_del_add_obkv(
                         KvReaderU16::new(base_obkv),
-                        true,
-                        keep_original_version,
+                        deladd_operation,
                         &mut document_sorter_value_buffer,
                     )?;
                     self.original_sorter
@@ -287,8 +290,7 @@ impl<'a, 'i> Transform<'a, 'i> {
                         document_sorter_value_buffer.push(Operation::Addition as u8);
                         into_del_add_obkv(
                             KvReaderU16::new(&flattened_obkv),
-                            true,
-                            keep_original_version,
+                            deladd_operation,
                             &mut document_sorter_value_buffer,
                         )?;
                     }
@@ -307,8 +309,7 @@ impl<'a, 'i> Transform<'a, 'i> {
                 document_sorter_value_buffer.push(Operation::Addition as u8);
                 into_del_add_obkv(
                     KvReaderU16::new(&obkv_buffer),
-                    false,
-                    true,
+                    DelAddOperation::Addition,
                     &mut document_sorter_value_buffer,
                 )?;
                 // We use the extracted/generated user id as the key for this document.
@@ -321,8 +322,7 @@ impl<'a, 'i> Transform<'a, 'i> {
                     document_sorter_value_buffer.push(Operation::Addition as u8);
                     into_del_add_obkv(
                         KvReaderU16::new(&obkv),
-                        false,
-                        true,
+                        DelAddOperation::Addition,
                         &mut document_sorter_value_buffer,
                     )?
                 }
@@ -517,7 +517,11 @@ impl<'a, 'i> Transform<'a, 'i> {
         // push it as to delete in the original_sorter
         document_sorter_value_buffer.clear();
         document_sorter_value_buffer.push(Operation::Deletion as u8);
-        into_del_add_obkv(KvReaderU16::new(base_obkv), true, false, document_sorter_value_buffer)?;
+        into_del_add_obkv(
+            KvReaderU16::new(base_obkv),
+            DelAddOperation::Deletion,
+            document_sorter_value_buffer,
+        )?;
         self.original_sorter.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
 
         // flatten it and push it as to delete in the flattened_sorter
@@ -526,7 +530,11 @@ impl<'a, 'i> Transform<'a, 'i> {
             // we recreate our buffer with the flattened documents
             document_sorter_value_buffer.clear();
             document_sorter_value_buffer.push(Operation::Deletion as u8);
-            into_del_add_obkv(KvReaderU16::new(&obkv), true, false, document_sorter_value_buffer)?;
+            into_del_add_obkv(
+                KvReaderU16::new(&obkv),
+                DelAddOperation::Deletion,
+                document_sorter_value_buffer,
+            )?;
         }
         self.flattened_sorter
             .insert(internal_docid.to_be_bytes(), &document_sorter_value_buffer)?;
@@ -869,8 +877,7 @@ impl<'a, 'i> Transform<'a, 'i> {
             document_sorter_value_buffer.clear();
             into_del_add_obkv(
                 KvReaderU16::new(buffer),
-                false,
-                true,
+                DelAddOperation::Addition,
                 &mut document_sorter_value_buffer,
             )?;
             original_sorter.insert(&document_sorter_key_buffer, &document_sorter_value_buffer)?;
@@ -911,8 +918,7 @@ impl<'a, 'i> Transform<'a, 'i> {
             document_sorter_value_buffer.clear();
             into_del_add_obkv(
                 KvReaderU16::new(&buffer),
-                false,
-                true,
+                DelAddOperation::Addition,
                 &mut document_sorter_value_buffer,
             )?;
             flattened_sorter.insert(docid.to_be_bytes(), &document_sorter_value_buffer)?;
@@ -986,18 +992,38 @@ mod test {
         let mut kv_writer = KvWriter::memory();
         kv_writer.insert(0_u8, [0]).unwrap();
         let buffer = kv_writer.into_inner().unwrap();
-        into_del_add_obkv(KvReaderU16::new(&buffer), false, true, &mut additive_doc_0).unwrap();
+        into_del_add_obkv(
+            KvReaderU16::new(&buffer),
+            DelAddOperation::Addition,
+            &mut additive_doc_0,
+        )
+        .unwrap();
         additive_doc_0.insert(0, Operation::Addition as u8);
-        into_del_add_obkv(KvReaderU16::new(&buffer), true, false, &mut deletive_doc_0).unwrap();
+        into_del_add_obkv(
+            KvReaderU16::new(&buffer),
+            DelAddOperation::Deletion,
+            &mut deletive_doc_0,
+        )
+        .unwrap();
         deletive_doc_0.insert(0, Operation::Deletion as u8);
-        into_del_add_obkv(KvReaderU16::new(&buffer), true, true, &mut del_add_doc_0).unwrap();
+        into_del_add_obkv(
+            KvReaderU16::new(&buffer),
+            DelAddOperation::DeletionAndAddition,
+            &mut del_add_doc_0,
+        )
+        .unwrap();
         del_add_doc_0.insert(0, Operation::Addition as u8);
 
         let mut additive_doc_1 = Vec::new();
         let mut kv_writer = KvWriter::memory();
         kv_writer.insert(1_u8, [1]).unwrap();
         let buffer = kv_writer.into_inner().unwrap();
-        into_del_add_obkv(KvReaderU16::new(&buffer), false, true, &mut additive_doc_1).unwrap();
+        into_del_add_obkv(
+            KvReaderU16::new(&buffer),
+            DelAddOperation::Addition,
+            &mut additive_doc_1,
+        )
+        .unwrap();
         additive_doc_1.insert(0, Operation::Addition as u8);
 
         let mut additive_doc_0_1 = Vec::new();
@@ -1005,7 +1031,12 @@ mod test {
         kv_writer.insert(0_u8, [0]).unwrap();
         kv_writer.insert(1_u8, [1]).unwrap();
         let buffer = kv_writer.into_inner().unwrap();
-        into_del_add_obkv(KvReaderU16::new(&buffer), false, true, &mut additive_doc_0_1).unwrap();
+        into_del_add_obkv(
+            KvReaderU16::new(&buffer),
+            DelAddOperation::Addition,
+            &mut additive_doc_0_1,
+        )
+        .unwrap();
         additive_doc_0_1.insert(0, Operation::Addition as u8);
 
         let ret = obkvs_merge_additions_and_deletions(&[], &[Cow::from(additive_doc_0.as_slice())])