From 1e2fbc6a421e5dbea38e40c04d4511f28a5b7ea0 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Tue, 7 Nov 2023 16:46:52 +0100
Subject: [PATCH 1/7] revert "REVERT ME: ignore prefix pair databases tests"

This reverts commit 1b2ea6cf19309782a2e3b2ff2fe6d7708dd5de4f.
---
 milli/src/update/prefix_word_pairs/mod.rs | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs
index 320c01461..e718f9b77 100644
--- a/milli/src/update/prefix_word_pairs/mod.rs
+++ b/milli/src/update/prefix_word_pairs/mod.rs
@@ -171,7 +171,6 @@ mod tests {
         documents
     }
 
-    #[ignore]
     #[test]
     fn add_new_documents() {
         let mut index = TempIndex::new();
@@ -236,7 +235,6 @@ mod tests {
         db_snap!(index, word_prefix_pair_proximity_docids, "update");
         db_snap!(index, prefix_word_pair_proximity_docids, "update");
     }
-    #[ignore]
     #[test]
     fn batch_bug_3043() {
         // https://github.com/meilisearch/meilisearch/issues/3043
@@ -285,7 +283,6 @@ mod tests {
         db_snap!(index, prefix_word_pair_proximity_docids);
     }
 
-    #[ignore]
     #[test]
     fn hard_delete_and_reupdate() {
         let mut index = TempIndex::new();
@@ -360,7 +357,6 @@ mod tests {
         db_snap!(index, prefix_word_pair_proximity_docids, "reupdate");
     }
 
-    #[ignore]
     #[test]
     fn replace_hard_deletion() {
         let mut index = TempIndex::new();

From 6dab826908b815c5e62e63f8848f22cf7196e5de Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Wed, 8 Nov 2023 11:52:08 +0100
Subject: [PATCH 2/7] Reactivate prefix databases

---
 milli/src/update/index_documents/mod.rs | 56 ++++++++++++++++++++-----
 1 file changed, 46 insertions(+), 10 deletions(-)

diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 2be410ace..f5fbe2797 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -381,12 +381,48 @@ where
             total_databases: TOTAL_POSTING_DATABASE_COUNT,
         });
 
+        let mut word_pair_proximity_docids = None;
+        let mut word_position_docids = None;
+        let mut word_fid_docids = None;
+        let mut word_docids = None;
+        let mut exact_word_docids = None;
+
         for result in lmdb_writer_rx {
             if (self.should_abort)() {
                 return Err(Error::InternalError(InternalError::AbortedIndexation));
             }
 
-            let typed_chunk = result?;
+            let typed_chunk = match result? {
+                TypedChunk::WordDocids {
+                    word_docids_reader,
+                    exact_word_docids_reader,
+                    word_fid_docids_reader,
+                } => {
+                    let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? };
+                    word_docids = Some(cloneable_chunk);
+                    let cloneable_chunk =
+                        unsafe { as_cloneable_grenad(&exact_word_docids_reader)? };
+                    exact_word_docids = Some(cloneable_chunk);
+                    let cloneable_chunk = unsafe { as_cloneable_grenad(&word_fid_docids_reader)? };
+                    word_fid_docids = Some(cloneable_chunk);
+                    TypedChunk::WordDocids {
+                        word_docids_reader,
+                        exact_word_docids_reader,
+                        word_fid_docids_reader,
+                    }
+                }
+                TypedChunk::WordPairProximityDocids(chunk) => {
+                    let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
+                    word_pair_proximity_docids = Some(cloneable_chunk);
+                    TypedChunk::WordPairProximityDocids(chunk)
+                }
+                TypedChunk::WordPositionDocids(chunk) => {
+                    let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
+                    word_position_docids = Some(cloneable_chunk);
+                    TypedChunk::WordPositionDocids(chunk)
+                }
+                otherwise => otherwise,
+            };
 
             // FIXME: return newly added as well as newly deleted documents
             let (docids, is_merged_database) =
@@ -417,17 +453,17 @@ where
 
         // We write the primary key field id into the main database
         self.index.put_primary_key(self.wtxn, &primary_key)?;
+        let number_of_documents = self.index.number_of_documents(self.wtxn)?;
 
-        // TODO: reactivate prefix DB with diff-indexing
-        // self.execute_prefix_databases(
-        //     word_docids,
-        //     exact_word_docids,
-        //     word_pair_proximity_docids,
-        //     word_position_docids,
-        //     word_fid_docids,
-        // )?;
+        self.execute_prefix_databases(
+            word_docids,
+            exact_word_docids,
+            word_pair_proximity_docids,
+            word_position_docids,
+            word_fid_docids,
+        )?;
 
-        self.index.number_of_documents(self.wtxn)
+        Ok(number_of_documents)
     }
 
     #[logging_timer::time("IndexDocuments::{}")]

From 688266c83e59d4c311b070fe8d274ac071cafae3 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Wed, 8 Nov 2023 14:16:01 +0100
Subject: [PATCH 3/7] Remove word pair proximity prefix cache and compute it at
 search time

---
 milli/src/index.rs                            |  14 +-
 milli/src/search/new/db_cache.rs              |  62 +-
 milli/src/update/clear_documents.rs           |   5 -
 milli/src/update/index_documents/mod.rs       |  39 +-
 milli/src/update/mod.rs                       |   5 -
 milli/src/update/prefix_word_pairs/mod.rs     | 418 ----------
 .../update/prefix_word_pairs/prefix_word.rs   | 182 -----
 .../update/prefix_word_pairs/word_prefix.rs   | 728 ------------------
 8 files changed, 41 insertions(+), 1412 deletions(-)
 delete mode 100644 milli/src/update/prefix_word_pairs/mod.rs
 delete mode 100644 milli/src/update/prefix_word_pairs/prefix_word.rs
 delete mode 100644 milli/src/update/prefix_word_pairs/word_prefix.rs

diff --git a/milli/src/index.rs b/milli/src/index.rs
index 86ef6105b..5023d8fa5 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -83,8 +83,6 @@ pub mod db_name {
     pub const EXTERNAL_DOCUMENTS_IDS: &str = "external-documents-ids";
     pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions";
     pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids";
-    pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids";
-    pub const PREFIX_WORD_PAIR_PROXIMITY_DOCIDS: &str = "prefix-word-pair-proximity-docids";
     pub const WORD_POSITION_DOCIDS: &str = "word-position-docids";
     pub const WORD_FIELD_ID_DOCIDS: &str = "word-field-id-docids";
     pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids";
@@ -129,10 +127,6 @@ pub struct Index {
 
     /// Maps the proximity between a pair of words with all the docids where this relation appears.
     pub word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
-    /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears.
-    pub word_prefix_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
-    /// Maps the proximity between a pair of prefix and word with all the docids where this relation appears.
-    pub prefix_word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
 
     /// Maps the word and the position with the docids that corresponds to it.
     pub word_position_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
@@ -186,7 +180,7 @@ impl Index {
     ) -> Result<Index> {
         use db_name::*;
 
-        options.max_dbs(26);
+        options.max_dbs(24);
         unsafe { options.flag(Flags::MdbAlwaysFreePages) };
 
         let env = options.open(path)?;
@@ -203,10 +197,6 @@ impl Index {
             env.create_database(&mut wtxn, Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
         let script_language_docids =
             env.create_database(&mut wtxn, Some(SCRIPT_LANGUAGE_DOCIDS))?;
-        let word_prefix_pair_proximity_docids =
-            env.create_database(&mut wtxn, Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?;
-        let prefix_word_pair_proximity_docids =
-            env.create_database(&mut wtxn, Some(PREFIX_WORD_PAIR_PROXIMITY_DOCIDS))?;
         let word_position_docids = env.create_database(&mut wtxn, Some(WORD_POSITION_DOCIDS))?;
         let word_fid_docids = env.create_database(&mut wtxn, Some(WORD_FIELD_ID_DOCIDS))?;
         let field_id_word_count_docids =
@@ -247,8 +237,6 @@ impl Index {
             exact_word_prefix_docids,
             word_pair_proximity_docids,
             script_language_docids,
-            word_prefix_pair_proximity_docids,
-            prefix_word_pair_proximity_docids,
             word_position_docids,
             word_fid_docids,
             word_prefix_position_docids,
diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs
index 3376cebb2..2c670658d 100644
--- a/milli/src/search/new/db_cache.rs
+++ b/milli/src/search/new/db_cache.rs
@@ -11,7 +11,9 @@ use super::interner::Interned;
 use super::Word;
 use crate::heed_codec::{BytesDecodeOwned, StrBEU16Codec};
 use crate::update::{merge_cbo_roaring_bitmaps, MergeFn};
-use crate::{CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext};
+use crate::{
+    CboRoaringBitmapCodec, CboRoaringBitmapLenCodec, Result, SearchContext, U8StrStrCodec,
+};
 
 /// A cache storing pointers to values in the LMDB databases.
 ///
@@ -23,7 +25,7 @@ pub struct DatabaseCache<'ctx> {
     pub word_pair_proximity_docids:
         FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>,
     pub word_prefix_pair_proximity_docids:
-        FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>,
+        FxHashMap<(u8, Interned<String>, Interned<String>), Option<RoaringBitmap>>,
     pub prefix_word_pair_proximity_docids:
         FxHashMap<(u8, Interned<String>, Interned<String>), Option<Cow<'ctx, [u8]>>>,
     pub word_docids: FxHashMap<Interned<String>, Option<Cow<'ctx, [u8]>>>,
@@ -295,35 +297,47 @@ impl<'ctx> SearchContext<'ctx> {
         prefix2: Interned<String>,
         proximity: u8,
     ) -> Result<Option<RoaringBitmap>> {
-        DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
-            self.txn,
-            (proximity, word1, prefix2),
-            &(
-                proximity,
-                self.word_interner.get(word1).as_str(),
-                self.word_interner.get(prefix2).as_str(),
-            ),
-            &mut self.db_cache.word_prefix_pair_proximity_docids,
-            self.index.word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>(),
-        )
+        let docids = match self
+            .db_cache
+            .word_prefix_pair_proximity_docids
+            .entry((proximity, word1, prefix2))
+        {
+            Entry::Occupied(docids) => docids.get().clone(),
+            Entry::Vacant(entry) => {
+                // compute docids using prefix iter and store the result in the cache.
+                let key = U8StrStrCodec::bytes_encode(&(
+                    proximity,
+                    self.word_interner.get(word1).as_str(),
+                    self.word_interner.get(prefix2).as_str(),
+                ))
+                .unwrap()
+                .into_owned();
+                let mut prefix_docids = RoaringBitmap::new();
+                let remap_key_type = self
+                    .index
+                    .word_pair_proximity_docids
+                    .remap_key_type::<ByteSlice>()
+                    .prefix_iter(self.txn, &key)?;
+                for result in remap_key_type {
+                    let (_, docids) = result?;
+
+                    prefix_docids |= docids;
+                }
+                entry.insert(Some(prefix_docids.clone()));
+                Some(prefix_docids)
+            }
+        };
+        Ok(docids)
     }
+
     pub fn get_db_prefix_word_pair_proximity_docids(
         &mut self,
         left_prefix: Interned<String>,
         right: Interned<String>,
         proximity: u8,
     ) -> Result<Option<RoaringBitmap>> {
-        DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>(
-            self.txn,
-            (proximity, left_prefix, right),
-            &(
-                proximity,
-                self.word_interner.get(left_prefix).as_str(),
-                self.word_interner.get(right).as_str(),
-            ),
-            &mut self.db_cache.prefix_word_pair_proximity_docids,
-            self.index.prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>(),
-        )
+        // only accept exact matches on reversed positions
+        self.get_db_word_pair_proximity_docids(left_prefix, right, proximity)
     }
 
     pub fn get_db_word_fid_docids(
diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs
index 265c6f15a..afe0191b1 100644
--- a/milli/src/update/clear_documents.rs
+++ b/milli/src/update/clear_documents.rs
@@ -26,8 +26,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
             word_prefix_docids,
             exact_word_prefix_docids,
             word_pair_proximity_docids,
-            word_prefix_pair_proximity_docids,
-            prefix_word_pair_proximity_docids,
             word_position_docids,
             word_fid_docids,
             field_id_word_count_docids,
@@ -68,8 +66,6 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
         word_prefix_docids.clear(self.wtxn)?;
         exact_word_prefix_docids.clear(self.wtxn)?;
         word_pair_proximity_docids.clear(self.wtxn)?;
-        word_prefix_pair_proximity_docids.clear(self.wtxn)?;
-        prefix_word_pair_proximity_docids.clear(self.wtxn)?;
         word_position_docids.clear(self.wtxn)?;
         word_fid_docids.clear(self.wtxn)?;
         field_id_word_count_docids.clear(self.wtxn)?;
@@ -132,7 +128,6 @@ mod tests {
         assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap());
         assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap());
         assert!(index.field_id_word_count_docids.is_empty(&rtxn).unwrap());
-        assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap());
         assert!(index.facet_id_f64_docids.is_empty(&rtxn).unwrap());
         assert!(index.facet_id_string_docids.is_empty(&rtxn).unwrap());
         assert!(index.field_id_docid_facet_f64s.is_empty(&rtxn).unwrap());
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index f5fbe2797..8552cf52b 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -35,13 +35,12 @@ use crate::documents::{obkv_to_object, DocumentsBatchReader};
 use crate::error::{Error, InternalError, UserError};
 pub use crate::update::index_documents::helpers::CursorClonableMmap;
 use crate::update::{
-    IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep, WordPrefixDocids,
-    WordPrefixIntegerDocids, WordsPrefixesFst,
+    IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
 };
 use crate::{CboRoaringBitmapCodec, Index, Result};
 
 static MERGED_DATABASE_COUNT: usize = 7;
-static PREFIX_DATABASE_COUNT: usize = 5;
+static PREFIX_DATABASE_COUNT: usize = 4;
 static TOTAL_POSTING_DATABASE_COUNT: usize = MERGED_DATABASE_COUNT + PREFIX_DATABASE_COUNT;
 
 #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -381,7 +380,6 @@ where
             total_databases: TOTAL_POSTING_DATABASE_COUNT,
         });
 
-        let mut word_pair_proximity_docids = None;
         let mut word_position_docids = None;
         let mut word_fid_docids = None;
         let mut word_docids = None;
@@ -411,11 +409,6 @@ where
                         word_fid_docids_reader,
                     }
                 }
-                TypedChunk::WordPairProximityDocids(chunk) => {
-                    let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
-                    word_pair_proximity_docids = Some(cloneable_chunk);
-                    TypedChunk::WordPairProximityDocids(chunk)
-                }
                 TypedChunk::WordPositionDocids(chunk) => {
                     let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
                     word_position_docids = Some(cloneable_chunk);
@@ -458,7 +451,6 @@ where
         self.execute_prefix_databases(
             word_docids,
             exact_word_docids,
-            word_pair_proximity_docids,
             word_position_docids,
             word_fid_docids,
         )?;
@@ -471,7 +463,6 @@ where
         self,
         word_docids: Option<grenad::Reader<CursorClonableMmap>>,
         exact_word_docids: Option<grenad::Reader<CursorClonableMmap>>,
-        word_pair_proximity_docids: Option<grenad::Reader<CursorClonableMmap>>,
         word_position_docids: Option<grenad::Reader<CursorClonableMmap>>,
         word_fid_docids: Option<grenad::Reader<CursorClonableMmap>>,
     ) -> Result<()>
@@ -592,32 +583,6 @@ where
             total_databases: TOTAL_POSTING_DATABASE_COUNT,
         });
 
-        if let Some(word_pair_proximity_docids) = word_pair_proximity_docids {
-            // Run the word prefix pair proximity docids update operation.
-            PrefixWordPairsProximityDocids::new(
-                self.wtxn,
-                self.index,
-                self.indexer_config.chunk_compression_type,
-                self.indexer_config.chunk_compression_level,
-            )
-            .execute(
-                word_pair_proximity_docids,
-                &new_prefix_fst_words,
-                &common_prefix_fst_words,
-                &del_prefix_fst_words,
-            )?;
-        }
-
-        if (self.should_abort)() {
-            return Err(Error::InternalError(InternalError::AbortedIndexation));
-        }
-
-        databases_seen += 1;
-        (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
-            databases_seen,
-            total_databases: TOTAL_POSTING_DATABASE_COUNT,
-        });
-
         if let Some(word_position_docids) = word_position_docids {
             // Run the words prefix position docids update operation.
             let mut builder = WordPrefixIntegerDocids::new(
diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs
index dd8851ccb..eb2b6e69a 100644
--- a/milli/src/update/mod.rs
+++ b/milli/src/update/mod.rs
@@ -8,10 +8,6 @@ pub use self::index_documents::{
     MergeFn,
 };
 pub use self::indexer_config::IndexerConfig;
-pub use self::prefix_word_pairs::{
-    PrefixWordPairsProximityDocids, MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB,
-    MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB,
-};
 pub use self::settings::{Setting, Settings};
 pub use self::update_step::UpdateIndexingStep;
 pub use self::word_prefix_docids::WordPrefixDocids;
@@ -24,7 +20,6 @@ pub(crate) mod del_add;
 pub(crate) mod facet;
 mod index_documents;
 mod indexer_config;
-mod prefix_word_pairs;
 mod settings;
 mod update_step;
 mod word_prefix_docids;
diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs
deleted file mode 100644
index e718f9b77..000000000
--- a/milli/src/update/prefix_word_pairs/mod.rs
+++ /dev/null
@@ -1,418 +0,0 @@
-use std::borrow::Cow;
-use std::collections::HashSet;
-use std::io::{BufReader, BufWriter};
-
-use grenad::CompressionType;
-use heed::types::ByteSlice;
-
-use super::index_documents::{merge_cbo_roaring_bitmaps, CursorClonableMmap};
-use crate::{Index, Result};
-
-mod prefix_word;
-mod word_prefix;
-
-pub use prefix_word::index_prefix_word_database;
-pub use word_prefix::index_word_prefix_database;
-
-pub const MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB: u8 = 4;
-pub const MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB: usize = 2;
-
-pub struct PrefixWordPairsProximityDocids<'t, 'u, 'i> {
-    wtxn: &'t mut heed::RwTxn<'i, 'u>,
-    index: &'i Index,
-    max_proximity: u8,
-    max_prefix_length: usize,
-    chunk_compression_type: CompressionType,
-    chunk_compression_level: Option<u32>,
-}
-impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> {
-    pub fn new(
-        wtxn: &'t mut heed::RwTxn<'i, 'u>,
-        index: &'i Index,
-        chunk_compression_type: CompressionType,
-        chunk_compression_level: Option<u32>,
-    ) -> Self {
-        Self {
-            wtxn,
-            index,
-            max_proximity: MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB,
-            max_prefix_length: MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB,
-            chunk_compression_type,
-            chunk_compression_level,
-        }
-    }
-
-    #[logging_timer::time("WordPrefixPairProximityDocids::{}")]
-    pub fn execute<'a>(
-        self,
-        new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>,
-        new_prefix_fst_words: &'a [String],
-        common_prefix_fst_words: &[&'a [String]],
-        del_prefix_fst_words: &HashSet<Vec<u8>>,
-    ) -> Result<()> {
-        puffin::profile_function!();
-
-        index_word_prefix_database(
-            self.wtxn,
-            self.index.word_pair_proximity_docids,
-            self.index.word_prefix_pair_proximity_docids,
-            self.max_proximity,
-            self.max_prefix_length,
-            new_word_pair_proximity_docids.clone(),
-            new_prefix_fst_words,
-            common_prefix_fst_words,
-            del_prefix_fst_words,
-            self.chunk_compression_type,
-            self.chunk_compression_level,
-        )?;
-
-        index_prefix_word_database(
-            self.wtxn,
-            self.index.word_pair_proximity_docids,
-            self.index.prefix_word_pair_proximity_docids,
-            self.max_proximity,
-            self.max_prefix_length,
-            new_word_pair_proximity_docids,
-            new_prefix_fst_words,
-            common_prefix_fst_words,
-            del_prefix_fst_words,
-            self.chunk_compression_type,
-            self.chunk_compression_level,
-        )?;
-
-        Ok(())
-    }
-}
-
-// This is adapted from `sorter_into_lmdb_database`
-pub fn insert_into_database(
-    wtxn: &mut heed::RwTxn,
-    database: heed::PolyDatabase,
-    new_key: &[u8],
-    new_value: &[u8],
-) -> Result<()> {
-    let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, new_key)?;
-    match iter.next().transpose()? {
-        Some((key, old_val)) if new_key == key => {
-            let val =
-                merge_cbo_roaring_bitmaps(key, &[Cow::Borrowed(old_val), Cow::Borrowed(new_value)])
-                    .map_err(|_| {
-                        // TODO just wrap this error?
-                        crate::error::InternalError::IndexingMergingKeys {
-                            process: "get-put-merge",
-                        }
-                    })?;
-            // safety: we use the new_key, not the one from the database iterator, to avoid undefined behaviour
-            unsafe { iter.put_current(new_key, &val)? };
-        }
-        _ => {
-            drop(iter);
-            database.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, new_value)?;
-        }
-    }
-    Ok(())
-}
-
-// This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`,
-// but it uses `append` if the database is empty, and it assumes that the values in the
-// writer don't conflict with values in the database.
-pub fn write_into_lmdb_database_without_merging(
-    wtxn: &mut heed::RwTxn,
-    database: heed::PolyDatabase,
-    writer: grenad::Writer<BufWriter<std::fs::File>>,
-) -> Result<()> {
-    let file = writer.into_inner()?.into_inner().map_err(|err| err.into_error())?;
-    let reader = grenad::Reader::new(BufReader::new(file))?;
-    if database.is_empty(wtxn)? {
-        let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;
-        let mut cursor = reader.into_cursor()?;
-        while let Some((k, v)) = cursor.move_on_next()? {
-            // safety: the key comes from the grenad reader, not the database
-            unsafe { out_iter.append(k, v)? };
-        }
-    } else {
-        let mut cursor = reader.into_cursor()?;
-        while let Some((k, v)) = cursor.move_on_next()? {
-            database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
-        }
-    }
-    Ok(())
-}
-
-#[cfg(test)]
-mod tests {
-    use std::io::Cursor;
-
-    use crate::db_snap;
-    use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
-    use crate::index::tests::TempIndex;
-    use crate::update::IndexDocumentsMethod;
-
-    fn documents_with_enough_different_words_for_prefixes(
-        prefixes: &[&str],
-        start_id: usize,
-    ) -> Vec<crate::Object> {
-        let mut documents = Vec::new();
-        let mut id = start_id;
-        for prefix in prefixes {
-            for i in 0..50 {
-                documents.push(
-                    serde_json::json!({
-                        "id": id,
-                        "text": format!("{prefix}{i:x}"),
-                    })
-                    .as_object()
-                    .unwrap()
-                    .clone(),
-                );
-                id += 1;
-            }
-        }
-        documents
-    }
-
-    #[test]
-    fn add_new_documents() {
-        let mut index = TempIndex::new();
-        index.index_documents_config.words_prefix_threshold = Some(50);
-        index.index_documents_config.autogenerate_docids = true;
-
-        index
-            .update_settings(|settings| {
-                settings.set_searchable_fields(vec!["text".to_owned()]);
-            })
-            .unwrap();
-
-        let batch_reader_from_documents = |documents| {
-            let mut builder = DocumentsBatchBuilder::new(Vec::new());
-            for object in documents {
-                builder.append_json_object(&object).unwrap();
-            }
-            DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
-        };
-
-        let mut documents = documents_with_enough_different_words_for_prefixes(&["a", "be"], 0);
-        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
-        documents.push(
-            serde_json::json!({
-                "id": "9000",
-                "text": "At an amazing and beautiful house"
-            })
-            .as_object()
-            .unwrap()
-            .clone(),
-        );
-        documents.push(
-            serde_json::json!({
-                "id": "9001",
-                "text": "The bell rings at 5 am"
-            })
-            .as_object()
-            .unwrap()
-            .clone(),
-        );
-
-        let documents = batch_reader_from_documents(documents);
-        index.add_documents(documents).unwrap();
-
-        db_snap!(index, word_prefix_pair_proximity_docids, "initial");
-        db_snap!(index, prefix_word_pair_proximity_docids, "initial");
-
-        let mut documents = documents_with_enough_different_words_for_prefixes(&["am", "an"], 100);
-        documents.push(
-            serde_json::json!({
-                "id": "9002",
-                "text": "At an extraordinary house"
-            })
-            .as_object()
-            .unwrap()
-            .clone(),
-        );
-        let documents = batch_reader_from_documents(documents);
-        index.add_documents(documents).unwrap();
-
-        db_snap!(index, word_pair_proximity_docids, "update");
-        db_snap!(index, word_prefix_pair_proximity_docids, "update");
-        db_snap!(index, prefix_word_pair_proximity_docids, "update");
-    }
-    #[test]
-    fn batch_bug_3043() {
-        // https://github.com/meilisearch/meilisearch/issues/3043
-        let mut index = TempIndex::new();
-        index.index_documents_config.words_prefix_threshold = Some(50);
-        index.index_documents_config.autogenerate_docids = true;
-
-        index
-            .update_settings(|settings| {
-                settings.set_searchable_fields(vec!["text".to_owned()]);
-            })
-            .unwrap();
-
-        let batch_reader_from_documents = |documents| {
-            let mut builder = DocumentsBatchBuilder::new(Vec::new());
-            for object in documents {
-                builder.append_json_object(&object).unwrap();
-            }
-            DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
-        };
-
-        let mut documents = documents_with_enough_different_words_for_prefixes(&["y"], 0);
-        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
-        documents.push(
-            serde_json::json!({
-                "text": "x y"
-            })
-            .as_object()
-            .unwrap()
-            .clone(),
-        );
-        documents.push(
-            serde_json::json!({
-                "text": "x a y"
-            })
-            .as_object()
-            .unwrap()
-            .clone(),
-        );
-
-        let documents = batch_reader_from_documents(documents);
-        index.add_documents(documents).unwrap();
-
-        db_snap!(index, word_pair_proximity_docids);
-        db_snap!(index, word_prefix_pair_proximity_docids);
-        db_snap!(index, prefix_word_pair_proximity_docids);
-    }
-
-    #[test]
-    fn hard_delete_and_reupdate() {
-        let mut index = TempIndex::new();
-        index.index_documents_config.words_prefix_threshold = Some(50);
-
-        index
-            .update_settings(|settings| {
-                settings.set_primary_key("id".to_owned());
-                settings.set_searchable_fields(vec!["text".to_owned()]);
-            })
-            .unwrap();
-
-        let batch_reader_from_documents = |documents| {
-            let mut builder = DocumentsBatchBuilder::new(Vec::new());
-            for object in documents {
-                builder.append_json_object(&object).unwrap();
-            }
-            DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
-        };
-
-        let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0);
-        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
-        documents.push(
-            serde_json::json!({
-                "id": 9000,
-                "text": "At an amazing and beautiful house"
-            })
-            .as_object()
-            .unwrap()
-            .clone(),
-        );
-        documents.push(
-            serde_json::json!({
-                "id": 9001,
-                "text": "The bell rings at 5 am"
-            })
-            .as_object()
-            .unwrap()
-            .clone(),
-        );
-
-        let documents = batch_reader_from_documents(documents);
-        index.add_documents(documents).unwrap();
-
-        db_snap!(index, documents_ids, "initial");
-        db_snap!(index, word_docids, "initial");
-        db_snap!(index, word_prefix_pair_proximity_docids, "initial");
-        db_snap!(index, prefix_word_pair_proximity_docids, "initial");
-
-        index.delete_document("9000");
-
-        db_snap!(index, documents_ids, "first_delete");
-        db_snap!(index, word_docids, "first_delete");
-        db_snap!(index, word_prefix_pair_proximity_docids, "first_delete");
-        db_snap!(index, prefix_word_pair_proximity_docids, "first_delete");
-
-        index.delete_documents((0..50).map(|id| id.to_string()).collect());
-
-        db_snap!(index, documents_ids, "second_delete");
-        db_snap!(index, word_docids, "second_delete");
-        db_snap!(index, word_prefix_pair_proximity_docids, "second_delete");
-        db_snap!(index, prefix_word_pair_proximity_docids, "second_delete");
-
-        let documents = documents_with_enough_different_words_for_prefixes(&["b"], 1000);
-        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
-
-        index.add_documents(batch_reader_from_documents(documents)).unwrap();
-
-        db_snap!(index, documents_ids, "reupdate");
-        db_snap!(index, word_docids, "reupdate");
-        db_snap!(index, word_prefix_pair_proximity_docids, "reupdate");
-        db_snap!(index, prefix_word_pair_proximity_docids, "reupdate");
-    }
-
-    #[test]
-    fn replace_hard_deletion() {
-        let mut index = TempIndex::new();
-        index.index_documents_config.words_prefix_threshold = Some(50);
-        index.index_documents_config.update_method = IndexDocumentsMethod::ReplaceDocuments;
-
-        index
-            .update_settings(|settings| {
-                settings.set_primary_key("id".to_owned());
-                settings.set_searchable_fields(vec!["text".to_owned()]);
-            })
-            .unwrap();
-
-        let batch_reader_from_documents = |documents| {
-            let mut builder = DocumentsBatchBuilder::new(Vec::new());
-            for object in documents {
-                builder.append_json_object(&object).unwrap();
-            }
-            DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
-        };
-
-        let mut documents = documents_with_enough_different_words_for_prefixes(&["a"], 0);
-        // now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
-        documents.push(
-            serde_json::json!({
-                "id": 9000,
-                "text": "At an amazing house"
-            })
-            .as_object()
-            .unwrap()
-            .clone(),
-        );
-        documents.push(
-            serde_json::json!({
-                "id": 9001,
-                "text": "The bell rings"
-            })
-            .as_object()
-            .unwrap()
-            .clone(),
-        );
-
-        let documents = batch_reader_from_documents(documents);
-        index.add_documents(documents).unwrap();
-
-        db_snap!(index, documents_ids, "initial");
-        db_snap!(index, word_docids, "initial");
-        db_snap!(index, word_prefix_pair_proximity_docids, "initial");
-        db_snap!(index, prefix_word_pair_proximity_docids, "initial");
-
-        let documents = documents_with_enough_different_words_for_prefixes(&["b"], 0);
-        index.add_documents(batch_reader_from_documents(documents)).unwrap();
-
-        db_snap!(index, documents_ids, "replaced");
-        db_snap!(index, word_docids, "replaced");
-        db_snap!(index, word_prefix_pair_proximity_docids, "replaced");
-        db_snap!(index, prefix_word_pair_proximity_docids, "replaced");
-    }
-}
diff --git a/milli/src/update/prefix_word_pairs/prefix_word.rs b/milli/src/update/prefix_word_pairs/prefix_word.rs
deleted file mode 100644
index 1ec66d010..000000000
--- a/milli/src/update/prefix_word_pairs/prefix_word.rs
+++ /dev/null
@@ -1,182 +0,0 @@
-use std::borrow::Cow;
-use std::collections::{BTreeMap, HashSet};
-
-use grenad::CompressionType;
-use heed::types::ByteSlice;
-use heed::BytesDecode;
-use log::debug;
-
-use crate::update::index_documents::{create_writer, CursorClonableMmap};
-use crate::update::prefix_word_pairs::{
-    insert_into_database, write_into_lmdb_database_without_merging,
-};
-use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec};
-
-#[allow(clippy::too_many_arguments)]
-#[logging_timer::time]
-pub fn index_prefix_word_database(
-    wtxn: &mut heed::RwTxn,
-    word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
-    prefix_word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
-    max_proximity: u8,
-    max_prefix_length: usize,
-    new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>,
-    new_prefix_fst_words: &[String],
-    common_prefix_fst_words: &[&[String]],
-    del_prefix_fst_words: &HashSet<Vec<u8>>,
-    chunk_compression_type: CompressionType,
-    chunk_compression_level: Option<u32>,
-) -> Result<()> {
-    puffin::profile_function!();
-
-    let max_proximity = max_proximity - 1;
-    debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
-
-    let common_prefixes: Vec<_> = common_prefix_fst_words
-        .iter()
-        .flat_map(|s| s.iter())
-        .map(|s| s.as_str())
-        .filter(|s| s.len() <= max_prefix_length)
-        .collect();
-
-    for proximity in 1..max_proximity {
-        for prefix in common_prefixes.iter() {
-            let mut prefix_key = vec![proximity];
-            prefix_key.extend_from_slice(prefix.as_bytes());
-            let mut cursor = new_word_pair_proximity_docids.clone().into_prefix_iter(prefix_key)?;
-            // This is the core of the algorithm
-            execute_on_word_pairs_and_prefixes(
-                proximity,
-                prefix.as_bytes(),
-                // the next two arguments tell how to iterate over the new word pairs
-                &mut cursor,
-                |cursor| {
-                    if let Some((key, value)) = cursor.next()? {
-                        let (_, _, word2) = UncheckedU8StrStrCodec::bytes_decode(key)
-                            .ok_or(heed::Error::Decoding)?;
-                        Ok(Some((word2, value)))
-                    } else {
-                        Ok(None)
-                    }
-                },
-                // and this argument tells what to do with each new key (proximity, prefix, word2) and value (roaring bitmap)
-                |key, value| {
-                    insert_into_database(
-                        wtxn,
-                        *prefix_word_pair_proximity_docids.as_polymorph(),
-                        key,
-                        value,
-                    )
-                },
-            )?;
-        }
-    }
-
-    // Now we do the same thing with the new prefixes and all word pairs in the DB
-    let new_prefixes: Vec<_> = new_prefix_fst_words
-        .iter()
-        .map(|s| s.as_str())
-        .filter(|s| s.len() <= max_prefix_length)
-        .collect();
-
-    // Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity)
-    // element in an intermediary grenad
-    let mut writer =
-        create_writer(chunk_compression_type, chunk_compression_level, tempfile::tempfile()?);
-
-    for proximity in 1..max_proximity {
-        for prefix in new_prefixes.iter() {
-            let mut prefix_key = vec![proximity];
-            prefix_key.extend_from_slice(prefix.as_bytes());
-            let mut db_iter = word_pair_proximity_docids
-                .as_polymorph()
-                .prefix_iter::<_, ByteSlice, ByteSlice>(wtxn, prefix_key.as_slice())?
-                .remap_key_type::<UncheckedU8StrStrCodec>();
-            execute_on_word_pairs_and_prefixes(
-                proximity,
-                prefix.as_bytes(),
-                &mut db_iter,
-                |db_iter| {
-                    db_iter
-                        .next()
-                        .transpose()
-                        .map(|x| x.map(|((_, _, word2), value)| (word2, value)))
-                        .map_err(|e| e.into())
-                },
-                |key, value| writer.insert(key, value).map_err(|e| e.into()),
-            )?;
-            drop(db_iter);
-        }
-    }
-
-    // and then we write the grenad into the DB
-    // Since the grenad contains only new prefixes, we know in advance that none
-    // of its elements already exist in the DB, thus there is no need to specify
-    // how to merge conflicting elements
-    write_into_lmdb_database_without_merging(
-        wtxn,
-        *prefix_word_pair_proximity_docids.as_polymorph(),
-        writer,
-    )?;
-
-    // All of the word prefix pairs in the database that have a w2
-    // that is contained in the `suppr_pw` set must be removed as well.
-    if !del_prefix_fst_words.is_empty() {
-        let mut iter =
-            prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter_mut(wtxn)?;
-        while let Some(((_, prefix, _), _)) = iter.next().transpose()? {
-            if del_prefix_fst_words.contains(prefix.as_bytes()) {
-                // Delete this entry as the w2 prefix is no more in the words prefix fst.
-                unsafe { iter.del_current()? };
-            }
-        }
-    }
-
-    Ok(())
-}
-
-/// This is the core of the algorithm to initialise the Prefix Word Pair Proximity Docids database.
-///
-/// Its arguments are:
-/// - an iterator over the words following the given `prefix` with the given `proximity`
-/// - a closure to describe how to handle the new computed (proximity, prefix, word2) elements
-fn execute_on_word_pairs_and_prefixes<I>(
-    proximity: u8,
-    prefix: &[u8],
-    iter: &mut I,
-    mut next_word2_and_docids: impl for<'a> FnMut(&'a mut I) -> Result<Option<(&'a [u8], &'a [u8])>>,
-    mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>,
-) -> Result<()> {
-    let mut batch: BTreeMap<Vec<u8>, Vec<Cow<'static, [u8]>>> = BTreeMap::default();
-
-    // Memory usage check:
-    // The content of the loop will be called for each `word2` that follows a word beginning
-    // with `prefix` with the given proximity.
-    // In practice, I don't think the batch can ever get too big.
-    while let Some((word2, docids)) = next_word2_and_docids(iter)? {
-        let entry = batch.entry(word2.to_owned()).or_default();
-        entry.push(Cow::Owned(docids.to_owned()));
-    }
-
-    let mut key_buffer = Vec::with_capacity(512);
-    key_buffer.push(proximity);
-    key_buffer.extend_from_slice(prefix);
-    key_buffer.push(0);
-
-    let mut value_buffer = Vec::with_capacity(65_536);
-
-    for (word2, docids) in batch {
-        key_buffer.truncate(prefix.len() + 2);
-        value_buffer.clear();
-
-        key_buffer.extend_from_slice(&word2);
-        let data = if docids.len() > 1 {
-            CboRoaringBitmapCodec::merge_into(&docids, &mut value_buffer)?;
-            value_buffer.as_slice()
-        } else {
-            &docids[0]
-        };
-        insert(key_buffer.as_slice(), data)?;
-    }
-    Ok(())
-}
diff --git a/milli/src/update/prefix_word_pairs/word_prefix.rs b/milli/src/update/prefix_word_pairs/word_prefix.rs
deleted file mode 100644
index 570adece9..000000000
--- a/milli/src/update/prefix_word_pairs/word_prefix.rs
+++ /dev/null
@@ -1,728 +0,0 @@
-/*!
-The word-prefix-pair-proximity-docids database is a database whose keys are of
-the form `(proximity, word, prefix)` and the values are roaring bitmaps of
-the documents which contain `word` followed by another word starting with
-`prefix` at a distance of `proximity`.
-
-The prefixes present in this database are only those that correspond to many
-different words in the documents.
-
-## How is it created/updated? (simplified version)
-To compute it, we have access to (mainly) two inputs:
-
-* a list of sorted prefixes, such as:
-```text
-c
-ca
-cat
-d
-do
-dog
-```
-Note that only prefixes which correspond to more than a certain number of
-different words from the database are included in this list.
-
-* a sorted list of proximities and word pairs (the proximity is the distance between the two words),
-associated with a roaring bitmap, such as:
-```text
-1 good doggo         -> docids1: [8]
-1 good door          -> docids2: [7, 19, 20]
-1 good ghost         -> docids3: [1]
-2 good dog           -> docids4: [2, 5, 6]
-2 horror cathedral   -> docids5: [1, 2]
-```
-
-I illustrate a simplified version of the algorithm to create the word-prefix
-pair-proximity database below:
-
-1. **Outer loop:** First, we iterate over each proximity and word pair:
-```text
-proximity: 1
-word1    : good
-word2    : doggo
-```
-2. **Inner loop:** Then, we iterate over all the prefixes of `word2` that are
-in the list of sorted prefixes. And we insert the key `prefix`
-and the value (`docids`) to a sorted map which we call the “batch”. For example,
-at the end of the first outer loop, we may have:
-```text
-Outer loop 1:
-------------------------------
-proximity: 1
-word1    : good
-word2    : doggo
-docids   : docids1
-
-prefixes: [d, do, dog]
-
-batch: [
-    d,   -> [docids1]
-    do   -> [docids1]
-    dog  -> [docids1]
-]
-```
-3. For illustration purpose, let's run through a second iteration of the outer loop:
-```text
-Outer loop 2:
-------------------------------
-proximity: 1
-word1    : good
-word2    : door
-docids   : docids2
-
-prefixes: [d, do, doo]
-
-batch: [
-    d   -> [docids1, docids2]
-    do  -> [docids1, docids2]
-    dog -> [docids1]
-    doo -> [docids2]
-]
-```
-Notice that there were some conflicts which were resolved by merging the
-conflicting values together. Also, an additional prefix was added at the
-end of the batch.
-
-4. On the third iteration of the outer loop, we have:
-```text
-Outer loop 3:
-------------------------------
-proximity: 1
-word1    : good
-word2    : ghost
-```
-Because `word2` begins with a different letter than the previous `word2`,
-we know that all the prefixes of `word2` are greater than the prefixes of the previous word2
-
-Therefore, we know that we can insert every element from the batch into the
-database before proceeding any further. This operation is called
-“flushing the batch”. Flushing the batch should also be done whenever:
-* `proximity` is different than the previous `proximity`.
-* `word1` is different than the previous `word1`.
-* `word2` starts with a different letter than the previous word2
-
-6. **Flushing the batch:** to flush the batch, we iterate over its elements:
-```text
-Flushing Batch loop 1:
-------------------------------
-proximity  : 1
-word1      : good
-prefix     : d
-
-docids   : [docids2, docids3]
-```
-We then merge the array of `docids` (of type `Vec<Vec<u8>>`) using
-`merge_cbo_roaring_bitmap` in order to get a single byte vector representing a
-roaring bitmap of all the document ids where `word1` is followed by `prefix`
-at a distance of `proximity`.
-Once we have done that, we insert `(proximity, word1, prefix) -> merged_docids`
-into the database.
-
-7. That's it! ... except...
-
-## How is it created/updated (continued)
-
-I lied a little bit about the input data. In reality, we get two sets of the
-inputs described above, which come from different places:
-
-* For the list of sorted prefixes, we have:
-    1. `new_prefixes`, which are all the prefixes that were not present in the
-    database before the insertion of the new documents
-
-    2. `common_prefixes` which are the prefixes that are present both in the
-    database and in the newly added documents
-
-* For the list of word pairs and proximities, we have:
-    1. `new_word_pairs`, which is the list of word pairs and their proximities
-    present in the newly added documents
-
-    2. `word_pairs_db`, which is the list of word pairs from the database.
-    This list includes all elements in `new_word_pairs` since `new_word_pairs`
-    was added to the database prior to calling the `WordPrefix::execute`
-    function.
-
-To update the prefix database correctly, we call the algorithm described earlier first
-on (`common_prefixes`, `new_word_pairs`) and then on (`new_prefixes`, `word_pairs_db`).
-Thus:
-
-1. For all the word pairs that were already present in the DB, we insert them
-again with the `new_prefixes`. Calling the algorithm on them with the
-`common_prefixes` would not result in any new data.
-
-2. For all the new word pairs, we insert them twice: first with the `common_prefixes`,
-and then, because they are part of `word_pairs_db`, with the `new_prefixes`.
-
-Note, also, that since we read data from the database when iterating over
-`word_pairs_db`, we cannot insert the computed word-prefix-pair-proximity-
-docids from the batch directly into the database (we would have a concurrent
-reader and writer). Therefore, when calling the algorithm on
-`(new_prefixes, word_pairs_db)`, we insert the computed
-`((proximity, word, prefix), docids)` elements in an intermediary grenad
-Writer instead of the DB. At the end of the outer loop, we finally read from
-the grenad and insert its elements in the database.
-*/
-
-use std::borrow::Cow;
-use std::collections::HashSet;
-
-use grenad::CompressionType;
-use heed::types::ByteSlice;
-use heed::BytesDecode;
-use log::debug;
-
-use crate::update::index_documents::{create_writer, CursorClonableMmap};
-use crate::update::prefix_word_pairs::{
-    insert_into_database, write_into_lmdb_database_without_merging,
-};
-use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec};
-
-#[allow(clippy::too_many_arguments)]
-#[logging_timer::time]
-pub fn index_word_prefix_database(
-    wtxn: &mut heed::RwTxn,
-    word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
-    word_prefix_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
-    max_proximity: u8,
-    max_prefix_length: usize,
-    new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>,
-    new_prefix_fst_words: &[String],
-    common_prefix_fst_words: &[&[String]],
-    del_prefix_fst_words: &HashSet<Vec<u8>>,
-    chunk_compression_type: CompressionType,
-    chunk_compression_level: Option<u32>,
-) -> Result<()> {
-    puffin::profile_function!();
-    debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
-
-    // Make a prefix trie from the common prefixes that are shorter than self.max_prefix_length
-    let prefixes = PrefixTrieNode::from_sorted_prefixes(
-        common_prefix_fst_words
-            .iter()
-            .flat_map(|s| s.iter())
-            .map(|s| s.as_str())
-            .filter(|s| s.len() <= max_prefix_length),
-    );
-
-    // If the prefix trie is not empty, then we can iterate over all new
-    // word pairs to look for new (proximity, word1, common_prefix) elements
-    // to insert in the DB
-    if !prefixes.is_empty() {
-        let mut cursor = new_word_pair_proximity_docids.into_cursor()?;
-        // This is the core of the algorithm
-        execute_on_word_pairs_and_prefixes(
-            // the first two arguments tell how to iterate over the new word pairs
-            &mut cursor,
-            |cursor| {
-                if let Some((key, value)) = cursor.move_on_next()? {
-                    let (proximity, word1, word2) =
-                        UncheckedU8StrStrCodec::bytes_decode(key).ok_or(heed::Error::Decoding)?;
-                    Ok(Some(((proximity, word1, word2), value)))
-                } else {
-                    Ok(None)
-                }
-            },
-            &prefixes,
-            max_proximity,
-            // and this argument tells what to do with each new key (proximity, word1, prefix) and value (roaring bitmap)
-            |key, value| {
-                insert_into_database(
-                    wtxn,
-                    *word_prefix_pair_proximity_docids.as_polymorph(),
-                    key,
-                    value,
-                )
-            },
-        )?;
-    }
-
-    // Now we do the same thing with the new prefixes and all word pairs in the DB
-
-    let prefixes = PrefixTrieNode::from_sorted_prefixes(
-        new_prefix_fst_words.iter().map(|s| s.as_str()).filter(|s| s.len() <= max_prefix_length),
-    );
-
-    if !prefixes.is_empty() {
-        let mut db_iter = word_pair_proximity_docids
-            .remap_key_type::<UncheckedU8StrStrCodec>()
-            .remap_data_type::<ByteSlice>()
-            .iter(wtxn)?;
-
-        // Since we read the DB, we can't write to it directly, so we add each new (proximity, word1, prefix)
-        // element in an intermediary grenad
-        let mut writer =
-            create_writer(chunk_compression_type, chunk_compression_level, tempfile::tempfile()?);
-
-        execute_on_word_pairs_and_prefixes(
-            &mut db_iter,
-            |db_iter| db_iter.next().transpose().map_err(|e| e.into()),
-            &prefixes,
-            max_proximity,
-            |key, value| writer.insert(key, value).map_err(|e| e.into()),
-        )?;
-        drop(db_iter);
-
-        // and then we write the grenad into the DB
-        // Since the grenad contains only new prefixes, we know in advance that none
-        // of its elements already exist in the DB, thus there is no need to specify
-        // how to merge conflicting elements
-        write_into_lmdb_database_without_merging(
-            wtxn,
-            *word_prefix_pair_proximity_docids.as_polymorph(),
-            writer,
-        )?;
-    }
-
-    // All of the word prefix pairs in the database that have a w2
-    // that is contained in the `suppr_pw` set must be removed as well.
-    if !del_prefix_fst_words.is_empty() {
-        let mut iter =
-            word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>().iter_mut(wtxn)?;
-        while let Some(((_, _, prefix), _)) = iter.next().transpose()? {
-            if del_prefix_fst_words.contains(prefix.as_bytes()) {
-                // Delete this entry as the w2 prefix is no more in the words prefix fst.
-                unsafe { iter.del_current()? };
-            }
-        }
-    }
-
-    Ok(())
-}
-
-/// This is the core of the algorithm to initialise the Word Prefix Pair Proximity Docids database.
-///
-/// Its main arguments are:
-/// 1. a sorted iterator over ((proximity, word1, word2), docids) elements
-/// 2. a prefix trie
-/// 3. a closure to describe how to handle the new computed (proximity, word1, prefix) elements
-///
-/// For more information about what this function does, read the module documentation.
-fn execute_on_word_pairs_and_prefixes<I>(
-    iter: &mut I,
-    mut next_word_pair_proximity: impl for<'a> FnMut(
-        &'a mut I,
-    ) -> Result<
-        Option<((u8, &'a [u8], &'a [u8]), &'a [u8])>,
-    >,
-    prefixes: &PrefixTrieNode,
-    max_proximity: u8,
-    mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>,
-) -> Result<()> {
-    let mut batch = PrefixAndProximityBatch::default();
-    let mut prev_word2_start = 0;
-
-    // Optimisation: the index at the root of the prefix trie where to search for
-    let mut prefix_search_start = PrefixTrieNodeSearchStart(0);
-
-    // Optimisation: true if there are no potential prefixes for the current word2 based on its first letter
-    let mut empty_prefixes = false;
-
-    let mut prefix_buffer = Vec::with_capacity(8);
-    let mut merge_buffer = Vec::with_capacity(65_536);
-
-    while let Some(((proximity, word1, word2), data)) = next_word_pair_proximity(iter)? {
-        // stop indexing if the proximity is over the threshold
-        if proximity > max_proximity {
-            break;
-        };
-        let word2_start_different_than_prev = word2[0] != prev_word2_start;
-        // if there were no potential prefixes for the previous word2 based on its first letter,
-        // and if the current word2 starts with the same letter, then there is also no potential
-        // prefixes for the current word2, and we can skip to the next iteration
-        if empty_prefixes && !word2_start_different_than_prev {
-            continue;
-        }
-
-        // if the proximity is different to the previous one, OR
-        // if word1 is different than the previous word1, OR
-        // if the start of word2 is different than the previous start of word2,
-        // THEN we'll need to flush the batch
-        let prox_different_than_prev = proximity != batch.proximity;
-        let word1_different_than_prev = word1 != batch.word1;
-        if prox_different_than_prev || word1_different_than_prev || word2_start_different_than_prev
-        {
-            batch.flush(&mut merge_buffer, &mut insert)?;
-            batch.proximity = proximity;
-            // don't forget to reset the value of batch.word1 and prev_word2_start
-            if word1_different_than_prev {
-                batch.word1.clear();
-                batch.word1.extend_from_slice(word1);
-            }
-            if word2_start_different_than_prev {
-                prev_word2_start = word2[0];
-            }
-            prefix_search_start.0 = 0;
-            // Optimisation: find the search start in the prefix trie to iterate over the prefixes of word2
-            empty_prefixes = !prefixes.set_search_start(word2, &mut prefix_search_start);
-        }
-
-        if !empty_prefixes {
-            // All conditions are satisfied, we can now insert each new prefix of word2 into the batch
-            prefix_buffer.clear();
-            prefixes.for_each_prefix_of(
-                word2,
-                &mut prefix_buffer,
-                &prefix_search_start,
-                |prefix_buffer| {
-                    batch.insert(prefix_buffer, data.to_vec());
-                },
-            );
-        }
-    }
-    batch.flush(&mut merge_buffer, &mut insert)?;
-    Ok(())
-}
-/**
-A map structure whose keys are prefixes and whose values are vectors of bitstrings (serialized roaring bitmaps).
-The keys are sorted and conflicts are resolved by merging the vectors of bitstrings together.
-
-It is used to ensure that all ((proximity, word1, prefix), docids) are inserted into the database in sorted order and efficiently.
-
-The batch is flushed as often as possible, when we are sure that every (proximity, word1, prefix) key derived from its content
-can be inserted into the database in sorted order. When it is flushed, it calls a user-provided closure with the following arguments:
-- key   : (proximity, word1, prefix) as bytes
-- value : merged roaring bitmaps from all values associated with prefix in the batch, serialised to bytes
-*/
-#[derive(Default)]
-struct PrefixAndProximityBatch {
-    proximity: u8,
-    word1: Vec<u8>,
-    #[allow(clippy::type_complexity)]
-    batch: Vec<(Vec<u8>, Vec<Cow<'static, [u8]>>)>,
-}
-
-impl PrefixAndProximityBatch {
-    /// Insert the new key and value into the batch
-    ///
-    /// The key must either exist in the batch or be greater than all existing keys
-    fn insert(&mut self, new_key: &[u8], new_value: Vec<u8>) {
-        match self.batch.iter_mut().find(|el| el.0 == new_key) {
-            Some((_prefix, docids)) => docids.push(Cow::Owned(new_value)),
-            None => self.batch.push((new_key.to_vec(), vec![Cow::Owned(new_value)])),
-        }
-    }
-
-    /// Empties the batch, calling `insert` on each element.
-    ///
-    /// The key given to `insert` is `(proximity, word1, prefix)` and the value is the associated merged roaring bitmap.
-    fn flush(
-        &mut self,
-        merge_buffer: &mut Vec<u8>,
-        insert: &mut impl for<'buffer> FnMut(&'buffer [u8], &'buffer [u8]) -> Result<()>,
-    ) -> Result<()> {
-        let PrefixAndProximityBatch { proximity, word1, batch } = self;
-        if batch.is_empty() {
-            return Ok(());
-        }
-        merge_buffer.clear();
-
-        let mut buffer = Vec::with_capacity(word1.len() + 1 + 6);
-        buffer.push(*proximity);
-        buffer.extend_from_slice(word1);
-        buffer.push(0);
-
-        for (key, mergeable_data) in batch.drain(..) {
-            buffer.truncate(1 + word1.len() + 1);
-            buffer.extend_from_slice(key.as_slice());
-
-            let data = if mergeable_data.len() > 1 {
-                CboRoaringBitmapCodec::merge_into(&mergeable_data, merge_buffer)?;
-                merge_buffer.as_slice()
-            } else {
-                &mergeable_data[0]
-            };
-            insert(buffer.as_slice(), data)?;
-            merge_buffer.clear();
-        }
-
-        Ok(())
-    }
-}
-
-/** A prefix trie. Used to iterate quickly over the prefixes of a word that are
-within a set.
-
-## Structure
-The trie is made of nodes composed of:
-1. a byte character (e.g. 'a')
-2. whether the node is an end node or not
-3. a list of children nodes, sorted by their byte character
-
-For example, the trie that stores the strings `[ac, ae, ar, ch, cei, cel, ch, r, rel, ri]`
-is drawn below. Nodes with a double border are "end nodes".
-
-┌──────────────────────┐ ┌──────────────────────┐ ╔══════════════════════╗
-│          a           │ │          c           │ ║          r           ║
-└──────────────────────┘ └──────────────────────┘ ╚══════════════════════╝
-╔══════╗╔══════╗╔══════╗ ┌─────────┐  ╔═════════╗ ┌─────────┐ ╔══════════╗
-║  c   ║║  e   ║║  r   ║ │    e    │  ║    h    ║ │    e    │ ║    i     ║
-╚══════╝╚══════╝╚══════╝ └─────────┘  ╚═════════╝ └─────────┘ ╚══════════╝
-                         ╔═══╗ ╔═══╗                 ╔═══╗
-                         ║ i ║ ║ l ║                 ║ l ║
-                         ╚═══╝ ╚═══╝                 ╚═══╝
-*/
-#[derive(Default, Debug)]
-struct PrefixTrieNode {
-    children: Vec<(PrefixTrieNode, u8)>,
-    is_end_node: bool,
-}
-
-#[derive(Debug)]
-struct PrefixTrieNodeSearchStart(usize);
-
-impl PrefixTrieNode {
-    fn is_empty(&self) -> bool {
-        self.children.is_empty()
-    }
-
-    /// Returns false if the trie does not contain a prefix of the given word.
-    /// Returns true if the trie *may* contain a prefix of the given word.
-    ///
-    /// Moves the search start to the first node equal to the first letter of the word,
-    /// or to 0 otherwise.
-    fn set_search_start(&self, word: &[u8], search_start: &mut PrefixTrieNodeSearchStart) -> bool {
-        let byte = word[0];
-        if self.children[search_start.0].1 == byte {
-            true
-        } else {
-            match self.children[search_start.0..].binary_search_by_key(&byte, |x| x.1) {
-                Ok(position) => {
-                    search_start.0 += position;
-                    true
-                }
-                Err(_) => {
-                    search_start.0 = 0;
-                    false
-                }
-            }
-        }
-    }
-
-    fn from_sorted_prefixes<'a>(prefixes: impl Iterator<Item = &'a str>) -> Self {
-        let mut node = PrefixTrieNode::default();
-        for prefix in prefixes {
-            node.insert_sorted_prefix(prefix.as_bytes().iter());
-        }
-        node
-    }
-    fn insert_sorted_prefix(&mut self, mut prefix: std::slice::Iter<u8>) {
-        if let Some(&c) = prefix.next() {
-            if let Some((node, byte)) = self.children.last_mut() {
-                if *byte == c {
-                    node.insert_sorted_prefix(prefix);
-                    return;
-                }
-            }
-            let mut new_node = PrefixTrieNode::default();
-            new_node.insert_sorted_prefix(prefix);
-            self.children.push((new_node, c));
-        } else {
-            self.is_end_node = true;
-        }
-    }
-
-    /// Call the given closure on each prefix of the word contained in the prefix trie.
-    ///
-    /// The search starts from the given `search_start`.
-    fn for_each_prefix_of(
-        &self,
-        word: &[u8],
-        buffer: &mut Vec<u8>,
-        search_start: &PrefixTrieNodeSearchStart,
-        mut do_fn: impl FnMut(&mut Vec<u8>),
-    ) {
-        let first_byte = word[0];
-        let mut cur_node = self;
-        buffer.push(first_byte);
-        if let Some((child_node, c)) =
-            cur_node.children[search_start.0..].iter().find(|(_, c)| *c >= first_byte)
-        {
-            if *c == first_byte {
-                cur_node = child_node;
-                if cur_node.is_end_node {
-                    do_fn(buffer);
-                }
-                for &byte in &word[1..] {
-                    buffer.push(byte);
-                    if let Some((child_node, c)) =
-                        cur_node.children.iter().find(|(_, c)| *c >= byte)
-                    {
-                        if *c == byte {
-                            cur_node = child_node;
-                            if cur_node.is_end_node {
-                                do_fn(buffer);
-                            }
-                        } else {
-                            break;
-                        }
-                    } else {
-                        break;
-                    }
-                }
-            }
-        }
-    }
-}
-#[cfg(test)]
-mod tests {
-    use roaring::RoaringBitmap;
-
-    use super::*;
-    use crate::{CboRoaringBitmapCodec, U8StrStrCodec};
-
-    fn check_prefixes(
-        trie: &PrefixTrieNode,
-        search_start: &PrefixTrieNodeSearchStart,
-        word: &str,
-        expected_prefixes: &[&str],
-    ) {
-        let mut actual_prefixes = vec![];
-        trie.for_each_prefix_of(word.as_bytes(), &mut Vec::new(), search_start, |x| {
-            let s = String::from_utf8(x.to_owned()).unwrap();
-            actual_prefixes.push(s);
-        });
-        assert_eq!(actual_prefixes, expected_prefixes);
-    }
-
-    #[test]
-    fn test_trie() {
-        let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([
-            "1", "19", "2", "a", "ab", "ac", "ad", "al", "am", "an", "ap", "ar", "as", "at", "au",
-            "b", "ba", "bar", "be", "bi", "bl", "bla", "bo", "br", "bra", "bri", "bro", "bu", "c",
-            "ca", "car", "ce", "ch", "cha", "che", "chi", "ci", "cl", "cla", "co", "col", "com",
-            "comp", "con", "cons", "cont", "cor", "cou", "cr", "cu", "d", "da", "de", "dec", "des",
-            "di", "dis", "do", "dr", "du", "e", "el", "em", "en", "es", "ev", "ex", "exp", "f",
-            "fa", "fe", "fi", "fl", "fo", "for", "fr", "fra", "fre", "fu", "g", "ga", "ge", "gi",
-            "gl", "go", "gr", "gra", "gu", "h", "ha", "har", "he", "hea", "hi", "ho", "hu", "i",
-            "im", "imp", "in", "ind", "ins", "int", "inte", "j", "ja", "je", "jo", "ju", "k", "ka",
-            "ke", "ki", "ko", "l", "la", "le", "li", "lo", "lu", "m", "ma", "mal", "man", "mar",
-            "mat", "mc", "me", "mi", "min", "mis", "mo", "mon", "mor", "mu", "n", "na", "ne", "ni",
-            "no", "o", "or", "ou", "ov", "ove", "over", "p", "pa", "par", "pe", "per", "ph", "pi",
-            "pl", "po", "pr", "pre", "pro", "pu", "q", "qu", "r", "ra", "re", "rec", "rep", "res",
-            "ri", "ro", "ru", "s", "sa", "san", "sc", "sch", "se", "sh", "sha", "shi", "sho", "si",
-            "sk", "sl", "sn", "so", "sp", "st", "sta", "ste", "sto", "str", "su", "sup", "sw", "t",
-            "ta", "te", "th", "ti", "to", "tr", "tra", "tri", "tu", "u", "un", "v", "va", "ve",
-            "vi", "vo", "w", "wa", "we", "wh", "wi", "wo", "y", "yo", "z",
-        ]));
-
-        let mut search_start = PrefixTrieNodeSearchStart(0);
-
-        let is_empty = !trie.set_search_start("affair".as_bytes(), &mut search_start);
-        assert!(!is_empty);
-        assert_eq!(search_start.0, 2);
-
-        check_prefixes(&trie, &search_start, "affair", &["a"]);
-        check_prefixes(&trie, &search_start, "shampoo", &["s", "sh", "sha"]);
-
-        let is_empty = !trie.set_search_start("unique".as_bytes(), &mut search_start);
-        assert!(!is_empty);
-        assert_eq!(trie.children[search_start.0].1, b'u');
-
-        check_prefixes(&trie, &search_start, "unique", &["u", "un"]);
-
-        // NOTE: this should fail, because the search start is already beyong 'a'
-        let is_empty = trie.set_search_start("abba".as_bytes(), &mut search_start);
-        assert!(!is_empty);
-        // search start is reset
-        assert_eq!(search_start.0, 0);
-
-        let trie = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([
-            "arb", "arbre", "cat", "catto",
-        ]));
-        check_prefixes(&trie, &search_start, "arbres", &["arb", "arbre"]);
-        check_prefixes(&trie, &search_start, "cattos", &["cat", "catto"]);
-    }
-
-    #[test]
-    fn test_execute_on_word_pairs_and_prefixes() {
-        let prefixes = PrefixTrieNode::from_sorted_prefixes(IntoIterator::into_iter([
-            "arb", "arbre", "cat", "catto",
-        ]));
-
-        let mut serialised_bitmap123 = vec![];
-        let mut bitmap123 = RoaringBitmap::new();
-        bitmap123.insert(1);
-        bitmap123.insert(2);
-        bitmap123.insert(3);
-        CboRoaringBitmapCodec::serialize_into(&bitmap123, &mut serialised_bitmap123);
-
-        let mut serialised_bitmap456 = vec![];
-        let mut bitmap456 = RoaringBitmap::new();
-        bitmap456.insert(4);
-        bitmap456.insert(5);
-        bitmap456.insert(6);
-        CboRoaringBitmapCodec::serialize_into(&bitmap456, &mut serialised_bitmap456);
-
-        let mut serialised_bitmap789 = vec![];
-        let mut bitmap789 = RoaringBitmap::new();
-        bitmap789.insert(7);
-        bitmap789.insert(8);
-        bitmap789.insert(9);
-        CboRoaringBitmapCodec::serialize_into(&bitmap789, &mut serialised_bitmap789);
-
-        let mut serialised_bitmap_ranges = vec![];
-        let mut bitmap_ranges = RoaringBitmap::new();
-        bitmap_ranges.insert_range(63_000..65_000);
-        bitmap_ranges.insert_range(123_000..128_000);
-        CboRoaringBitmapCodec::serialize_into(&bitmap_ranges, &mut serialised_bitmap_ranges);
-
-        let word_pairs = [
-            ((1, "healthy", "arbres"), &serialised_bitmap123),
-            ((1, "healthy", "boat"), &serialised_bitmap123),
-            ((1, "healthy", "ca"), &serialised_bitmap123),
-            ((1, "healthy", "cats"), &serialised_bitmap456),
-            ((1, "healthy", "cattos"), &serialised_bitmap123),
-            ((1, "jittery", "cat"), &serialised_bitmap123),
-            ((1, "jittery", "cata"), &serialised_bitmap456),
-            ((1, "jittery", "catb"), &serialised_bitmap789),
-            ((1, "jittery", "catc"), &serialised_bitmap_ranges),
-            ((2, "healthy", "arbre"), &serialised_bitmap123),
-            ((2, "healthy", "arbres"), &serialised_bitmap456),
-            ((2, "healthy", "cats"), &serialised_bitmap789),
-            ((2, "healthy", "cattos"), &serialised_bitmap_ranges),
-            ((3, "healthy", "arbre"), &serialised_bitmap456),
-            ((3, "healthy", "arbres"), &serialised_bitmap789),
-        ];
-
-        let expected_result = [
-            ((1, "healthy", "arb"), bitmap123.clone()),
-            ((1, "healthy", "arbre"), bitmap123.clone()),
-            ((1, "healthy", "cat"), &bitmap456 | &bitmap123),
-            ((1, "healthy", "catto"), bitmap123.clone()),
-            ((1, "jittery", "cat"), (&bitmap123 | &bitmap456 | &bitmap789 | &bitmap_ranges)),
-            ((2, "healthy", "arb"), &bitmap123 | &bitmap456),
-            ((2, "healthy", "arbre"), &bitmap123 | &bitmap456),
-            ((2, "healthy", "cat"), &bitmap789 | &bitmap_ranges),
-            ((2, "healthy", "catto"), bitmap_ranges.clone()),
-        ];
-
-        let mut result = vec![];
-
-        let mut iter =
-            IntoIterator::into_iter(word_pairs).map(|((proximity, word1, word2), data)| {
-                ((proximity, word1.as_bytes(), word2.as_bytes()), data.as_slice())
-            });
-        execute_on_word_pairs_and_prefixes(
-            &mut iter,
-            |iter| Ok(iter.next()),
-            &prefixes,
-            2,
-            |k, v| {
-                let (proximity, word1, prefix) = U8StrStrCodec::bytes_decode(k).unwrap();
-                let bitmap = CboRoaringBitmapCodec::bytes_decode(v).unwrap();
-                result.push(((proximity.to_owned(), word1.to_owned(), prefix.to_owned()), bitmap));
-                Ok(())
-            },
-        )
-        .unwrap();
-
-        for (x, y) in result.into_iter().zip(IntoIterator::into_iter(expected_result)) {
-            let ((actual_proximity, actual_word1, actual_prefix), actual_bitmap) = x;
-            let ((expected_proximity, expected_word1, expected_prefix), expected_bitmap) = y;
-
-            assert_eq!(actual_word1, expected_word1);
-            assert_eq!(actual_prefix, expected_prefix);
-            assert_eq!(actual_proximity, expected_proximity);
-            assert_eq!(actual_bitmap, expected_bitmap);
-        }
-    }
-}

From 70ce40828c3d58a667a314a063d1c7c5b5a05645 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Wed, 8 Nov 2023 16:41:26 +0100
Subject: [PATCH 4/7] Compute word docids prefix cache

---
 milli/src/update/del_add.rs                   | 14 +++++
 .../index_documents/helpers/grenad_helpers.rs | 44 +++++++++++++++
 .../helpers/merge_functions.rs                | 16 ++++++
 .../src/update/index_documents/helpers/mod.rs |  7 ++-
 milli/src/update/index_documents/mod.rs       |  6 +-
 .../src/update/index_documents/typed_chunk.rs | 55 +++++--------------
 milli/src/update/word_prefix_docids.rs        | 27 ++++++---
 7 files changed, 116 insertions(+), 53 deletions(-)

diff --git a/milli/src/update/del_add.rs b/milli/src/update/del_add.rs
index c8b7f0f6a..dc7c0409a 100644
--- a/milli/src/update/del_add.rs
+++ b/milli/src/update/del_add.rs
@@ -102,3 +102,17 @@ pub fn del_add_from_two_obkvs<K: obkv::Key + PartialOrd + Ord>(
 pub fn is_noop_del_add_obkv(del_add: KvReaderDelAdd) -> bool {
     del_add.get(DelAdd::Deletion) == del_add.get(DelAdd::Addition)
 }
+
+/// A function that extracts and returns the Add side of a DelAdd obkv.
+/// This is useful when there is no previous value in the database and
+/// therefore we don't need to do a diff with what's already there.
+///
+/// If there is no Add side we currently write an empty buffer
+/// which is a valid CboRoaringBitmap.
+#[allow(clippy::ptr_arg)] // required to avoid signature mismatch
+pub fn deladd_serialize_add_side<'a>(
+    obkv: &'a [u8],
+    _buffer: &mut Vec<u8>,
+) -> crate::Result<&'a [u8]> {
+    Ok(KvReaderDelAdd::new(obkv).get(DelAdd::Addition).unwrap_or_default())
+}
diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs
index 4f764ab95..f520ea7b0 100644
--- a/milli/src/update/index_documents/helpers/grenad_helpers.rs
+++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs
@@ -9,6 +9,7 @@ use log::debug;
 
 use super::{ClonableMmap, MergeFn};
 use crate::error::InternalError;
+use crate::update::index_documents::valid_lmdb_key;
 use crate::Result;
 
 pub type CursorClonableMmap = io::Cursor<ClonableMmap>;
@@ -282,6 +283,49 @@ pub fn sorter_into_lmdb_database(
     Ok(())
 }
 
+/// Write provided sorter in database using serialize_value function.
+/// merge_values function is used if an entry already exist in the database.
+pub fn write_sorter_into_database<K, V, FS, FM>(
+    sorter: Sorter<MergeFn>,
+    database: &heed::Database<K, V>,
+    wtxn: &mut heed::RwTxn,
+    index_is_empty: bool,
+    serialize_value: FS,
+    merge_values: FM,
+) -> Result<()>
+where
+    FS: for<'a> Fn(&'a [u8], &'a mut Vec<u8>) -> Result<&'a [u8]>,
+    FM: for<'a> Fn(&[u8], &[u8], &'a mut Vec<u8>) -> Result<Option<&'a [u8]>>,
+{
+    puffin::profile_function!();
+
+    let mut buffer = Vec::new();
+    let database = database.remap_types::<ByteSlice, ByteSlice>();
+
+    let mut merger_iter = sorter.into_stream_merger_iter()?;
+    while let Some((key, value)) = merger_iter.next()? {
+        if valid_lmdb_key(key) {
+            buffer.clear();
+            let value = if index_is_empty {
+                Some(serialize_value(value, &mut buffer)?)
+            } else {
+                match database.get(wtxn, key)? {
+                    Some(prev_value) => merge_values(value, prev_value, &mut buffer)?,
+                    None => Some(serialize_value(value, &mut buffer)?),
+                }
+            };
+            match value {
+                Some(value) => database.put(wtxn, key, value)?,
+                None => {
+                    database.delete(wtxn, key)?;
+                }
+            }
+        }
+    }
+
+    Ok(())
+}
+
 /// Used when trying to merge readers, but you don't actually care about the values.
 pub fn merge_ignore_values<'a>(_key: &[u8], _values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
     Ok(Cow::Owned(Vec::new()))
diff --git a/milli/src/update/index_documents/helpers/merge_functions.rs b/milli/src/update/index_documents/helpers/merge_functions.rs
index 98c1c1a04..5d9ca7ef2 100644
--- a/milli/src/update/index_documents/helpers/merge_functions.rs
+++ b/milli/src/update/index_documents/helpers/merge_functions.rs
@@ -239,3 +239,19 @@ pub fn merge_deladd_cbo_roaring_bitmaps<'a>(
         output_deladd_obkv.into_inner().map(Cow::from).map_err(Into::into)
     }
 }
+
+/// A function that merges a DelAdd of bitmaps into an already existing bitmap.
+///
+/// The first argument is the DelAdd obkv of CboRoaringBitmaps and
+/// the second one is the CboRoaringBitmap to merge into.
+pub fn merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap<'a>(
+    deladd_obkv: &[u8],
+    previous: &[u8],
+    buffer: &'a mut Vec<u8>,
+) -> Result<Option<&'a [u8]>> {
+    Ok(CboRoaringBitmapCodec::merge_deladd_into(
+        KvReaderDelAdd::new(deladd_obkv),
+        previous,
+        buffer,
+    )?)
+}
diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs
index 1f2f8e6ef..c167f1cd3 100644
--- a/milli/src/update/index_documents/helpers/mod.rs
+++ b/milli/src/update/index_documents/helpers/mod.rs
@@ -9,12 +9,13 @@ pub use clonable_mmap::{ClonableMmap, CursorClonableMmap};
 use fst::{IntoStreamer, Streamer};
 pub use grenad_helpers::{
     as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks,
-    merge_ignore_values, sorter_into_lmdb_database, sorter_into_reader, writer_into_reader,
-    GrenadParameters, MergeableReader,
+    merge_ignore_values, sorter_into_lmdb_database, sorter_into_reader, write_sorter_into_database,
+    writer_into_reader, GrenadParameters, MergeableReader,
 };
 pub use merge_functions::{
     concat_u32s_array, keep_first, keep_latest_obkv, merge_btreeset_string,
-    merge_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps, merge_roaring_bitmaps,
+    merge_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps,
+    merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_roaring_bitmaps,
     obkvs_keep_last_addition_merge_deletions, obkvs_merge_additions_and_deletions,
     serialize_roaring_bitmap, MergeFn,
 };
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 8552cf52b..5dbb4dd0b 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -26,8 +26,10 @@ pub use self::enrich::{
 };
 pub use self::helpers::{
     as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset,
-    fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
-    sorter_into_lmdb_database, valid_lmdb_key, writer_into_reader, ClonableMmap, MergeFn,
+    fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps,
+    merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
+    merge_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, write_sorter_into_database,
+    writer_into_reader, ClonableMmap, MergeFn,
 };
 use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
 pub use self::transform::{Transform, TransformOutput};
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index b53d859cd..90f9b7739 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -13,7 +13,10 @@ use obkv::{KvReader, KvWriter};
 use ordered_float::OrderedFloat;
 use roaring::RoaringBitmap;
 
-use super::helpers::{self, merge_ignore_values, valid_lmdb_key, CursorClonableMmap};
+use super::helpers::{
+    self, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, merge_ignore_values,
+    valid_lmdb_key, CursorClonableMmap,
+};
 use super::{ClonableMmap, MergeFn};
 use crate::distance::NDotProductPoint;
 use crate::error::UserError;
@@ -21,12 +24,11 @@ use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind};
 use crate::facet::FacetType;
 use crate::index::db_name::DOCUMENTS;
 use crate::index::Hnsw;
-use crate::update::del_add::{DelAdd, KvReaderDelAdd};
+use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd};
 use crate::update::facet::FacetsUpdate;
 use crate::update::index_documents::helpers::{as_cloneable_grenad, try_split_array_at};
 use crate::{
-    lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, Result,
-    SerializationError, BEU32,
+    lat_lng_to_xyz, DocumentId, FieldId, GeoPoint, Index, Result, SerializationError, BEU32,
 };
 
 pub(crate) enum TypedChunk {
@@ -186,7 +188,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 deladd_serialize_add_side,
-                merge_deladd_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
             )?;
             is_merged_database = true;
         }
@@ -202,7 +204,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 deladd_serialize_add_side,
-                merge_deladd_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
             )?;
 
             let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
@@ -212,7 +214,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 deladd_serialize_add_side,
-                merge_deladd_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
             )?;
 
             let word_fid_docids_iter = unsafe { as_cloneable_grenad(&word_fid_docids_reader) }?;
@@ -222,7 +224,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 deladd_serialize_add_side,
-                merge_deladd_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
             )?;
 
             // create fst from word docids
@@ -244,7 +246,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 deladd_serialize_add_side,
-                merge_deladd_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
             )?;
             is_merged_database = true;
         }
@@ -265,7 +267,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 deladd_serialize_add_side,
-                merge_deladd_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
             )?;
             is_merged_database = true;
         }
@@ -276,7 +278,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 deladd_serialize_add_side,
-                merge_deladd_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
             )?;
             is_merged_database = true;
         }
@@ -287,7 +289,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 deladd_serialize_add_side,
-                merge_deladd_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
             )?;
             is_merged_database = true;
         }
@@ -298,7 +300,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 wtxn,
                 index_is_empty,
                 deladd_serialize_add_side,
-                merge_deladd_cbo_roaring_bitmaps,
+                merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
             )?;
             is_merged_database = true;
         }
@@ -495,33 +497,6 @@ fn merge_word_docids_reader_into_fst(
     Ok(builder.into_set())
 }
 
-/// A function that extracts and returns the Add side of a DelAdd obkv.
-/// This is useful when there are no previous value in the database and
-/// therefore we don't need to do a diff with what's already there.
-///
-/// If there is no Add side we currently write an empty buffer
-/// which is a valid CboRoaringBitmap.
-#[allow(clippy::ptr_arg)] // required to avoid signature mismatch
-fn deladd_serialize_add_side<'a>(obkv: &'a [u8], _buffer: &mut Vec<u8>) -> Result<&'a [u8]> {
-    Ok(KvReaderDelAdd::new(obkv).get(DelAdd::Addition).unwrap_or_default())
-}
-
-/// A function that merges a DelAdd of bitmao into an already existing bitmap.
-///
-/// The first argument is the DelAdd obkv of CboRoaringBitmaps and
-/// the second one is the CboRoaringBitmap to merge into.
-fn merge_deladd_cbo_roaring_bitmaps<'a>(
-    deladd_obkv: &[u8],
-    previous: &[u8],
-    buffer: &'a mut Vec<u8>,
-) -> Result<Option<&'a [u8]>> {
-    Ok(CboRoaringBitmapCodec::merge_deladd_into(
-        KvReaderDelAdd::new(deladd_obkv),
-        previous,
-        buffer,
-    )?)
-}
-
 /// Write provided entries in database using serialize_value function.
 /// merge_values function is used if an entry already exist in the database.
 fn write_entries_into_database<R, K, V, FS, FM>(
diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs
index 8220aa777..618f451dc 100644
--- a/milli/src/update/word_prefix_docids.rs
+++ b/milli/src/update/word_prefix_docids.rs
@@ -4,9 +4,11 @@ use grenad::CompressionType;
 use heed::types::{ByteSlice, Str};
 use heed::Database;
 
+use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvWriterDelAdd};
 use crate::update::index_documents::{
-    create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key,
-    CursorClonableMmap, MergeFn,
+    create_sorter, merge_deladd_cbo_roaring_bitmaps,
+    merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key,
+    write_sorter_into_database, CursorClonableMmap, MergeFn,
 };
 use crate::{CboRoaringBitmapCodec, Result};
 
@@ -51,7 +53,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
         // and write into it at the same time, therefore we write into another file.
         let mut prefix_docids_sorter = create_sorter(
             grenad::SortAlgorithm::Unstable,
-            merge_cbo_roaring_bitmaps,
+            merge_deladd_cbo_roaring_bitmaps,
             self.chunk_compression_type,
             self.chunk_compression_level,
             self.max_nb_chunks,
@@ -92,11 +94,16 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
 
         // We fetch the docids associated to the newly added word prefix fst only.
         let db = self.word_docids.remap_data_type::<ByteSlice>();
+        let mut buffer = Vec::new();
         for prefix in new_prefix_fst_words {
             let prefix = std::str::from_utf8(prefix.as_bytes())?;
             for result in db.prefix_iter(self.wtxn, prefix)? {
                 let (_word, data) = result?;
-                prefix_docids_sorter.insert(prefix, data)?;
+                buffer.clear();
+                let mut writer = KvWriterDelAdd::new(&mut buffer);
+                writer.insert(DelAdd::Addition, data)?;
+
+                prefix_docids_sorter.insert(prefix, writer.into_inner()?)?;
             }
         }
 
@@ -110,12 +117,16 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
 
         drop(iter);
 
+        let database_is_empty = self.word_prefix_docids.is_empty(self.wtxn)?;
+
         // We finally write the word prefix docids into the LMDB database.
-        sorter_into_lmdb_database(
-            self.wtxn,
-            *self.word_prefix_docids.as_polymorph(),
+        write_sorter_into_database(
             prefix_docids_sorter,
-            merge_cbo_roaring_bitmaps,
+            &self.word_prefix_docids,
+            self.wtxn,
+            database_is_empty,
+            deladd_serialize_add_side,
+            merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
         )?;
 
         Ok(())

From 5a9c96e1db0b2ec1de77c0c01b76676072aec754 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Thu, 9 Nov 2023 11:34:26 +0100
Subject: [PATCH 5/7] Compute word integer prefix cache

---
 .../src/update/words_prefix_integer_docids.rs | 28 +++++++++++++------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/milli/src/update/words_prefix_integer_docids.rs b/milli/src/update/words_prefix_integer_docids.rs
index c65438928..e083f510a 100644
--- a/milli/src/update/words_prefix_integer_docids.rs
+++ b/milli/src/update/words_prefix_integer_docids.rs
@@ -9,9 +9,11 @@ use log::debug;
 use crate::error::SerializationError;
 use crate::heed_codec::StrBEU16Codec;
 use crate::index::main_key::WORDS_PREFIXES_FST_KEY;
+use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvWriterDelAdd};
 use crate::update::index_documents::{
-    create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key,
-    CursorClonableMmap, MergeFn,
+    create_sorter, merge_deladd_cbo_roaring_bitmaps,
+    merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap, valid_lmdb_key,
+    write_sorter_into_database, CursorClonableMmap, MergeFn,
 };
 use crate::{CboRoaringBitmapCodec, Result};
 
@@ -55,7 +57,7 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> {
 
         let mut prefix_integer_docids_sorter = create_sorter(
             grenad::SortAlgorithm::Unstable,
-            merge_cbo_roaring_bitmaps,
+            merge_deladd_cbo_roaring_bitmaps,
             self.chunk_compression_type,
             self.chunk_compression_level,
             self.max_nb_chunks,
@@ -108,6 +110,7 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> {
 
         // We fetch the docids associated to the newly added word prefix fst only.
         let db = self.word_database.remap_data_type::<ByteSlice>();
+        let mut buffer = Vec::new();
         for prefix_bytes in new_prefix_fst_words {
             let prefix = str::from_utf8(prefix_bytes.as_bytes()).map_err(|_| {
                 SerializationError::Decoding { db_name: Some(WORDS_PREFIXES_FST_KEY) }
@@ -123,7 +126,11 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> {
                 if word.starts_with(prefix) {
                     let key = (prefix, pos);
                     let bytes = StrBEU16Codec::bytes_encode(&key).unwrap();
-                    prefix_integer_docids_sorter.insert(bytes, data)?;
+
+                    buffer.clear();
+                    let mut writer = KvWriterDelAdd::new(&mut buffer);
+                    writer.insert(DelAdd::Addition, data)?;
+                    prefix_integer_docids_sorter.insert(bytes, writer.into_inner()?)?;
                 }
             }
         }
@@ -143,12 +150,16 @@ impl<'t, 'u, 'i> WordPrefixIntegerDocids<'t, 'u, 'i> {
             drop(iter);
         }
 
+        let database_is_empty = self.prefix_database.is_empty(self.wtxn)?;
+
         // We finally write all the word prefix integer docids into the LMDB database.
-        sorter_into_lmdb_database(
-            self.wtxn,
-            *self.prefix_database.as_polymorph(),
+        write_sorter_into_database(
             prefix_integer_docids_sorter,
-            merge_cbo_roaring_bitmaps,
+            &self.prefix_database,
+            self.wtxn,
+            database_is_empty,
+            deladd_serialize_add_side,
+            merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
         )?;
 
         Ok(())
@@ -159,6 +170,7 @@ fn write_prefixes_in_sorter(
     prefixes: &mut HashMap<Vec<u8>, Vec<Vec<u8>>>,
     sorter: &mut grenad::Sorter<MergeFn>,
 ) -> Result<()> {
+    // TODO: Merge before insertion.
     for (key, data_slices) in prefixes.drain() {
         for data in data_slices {
             if valid_lmdb_key(&key) {

From 882ab9cc857fde9394b9fc4f1d46599617b8ccd7 Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Thu, 9 Nov 2023 11:35:33 +0100
Subject: [PATCH 6/7] remove warnings

---
 .../index_documents/helpers/grenad_helpers.rs | 45 -------------------
 .../src/update/index_documents/helpers/mod.rs |  4 +-
 milli/src/update/index_documents/mod.rs       |  4 +-
 3 files changed, 4 insertions(+), 49 deletions(-)

diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs
index f520ea7b0..061cbe5a0 100644
--- a/milli/src/update/index_documents/helpers/grenad_helpers.rs
+++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs
@@ -1,14 +1,11 @@
 use std::borrow::Cow;
 use std::fs::File;
 use std::io::{self, BufReader, BufWriter, Seek};
-use std::time::Instant;
 
 use grenad::{CompressionType, Sorter};
 use heed::types::ByteSlice;
-use log::debug;
 
 use super::{ClonableMmap, MergeFn};
-use crate::error::InternalError;
 use crate::update::index_documents::valid_lmdb_key;
 use crate::Result;
 
@@ -241,48 +238,6 @@ pub fn grenad_obkv_into_chunks<R: io::Read + io::Seek>(
     Ok(std::iter::from_fn(move || transposer().transpose()))
 }
 
-pub fn sorter_into_lmdb_database(
-    wtxn: &mut heed::RwTxn,
-    database: heed::PolyDatabase,
-    sorter: Sorter<MergeFn>,
-    merge: MergeFn,
-) -> Result<()> {
-    puffin::profile_function!();
-    debug!("Writing MTBL sorter...");
-    let before = Instant::now();
-
-    let mut merger_iter = sorter.into_stream_merger_iter()?;
-    if database.is_empty(wtxn)? {
-        let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;
-        while let Some((k, v)) = merger_iter.next()? {
-            // safety: we don't keep references from inside the LMDB database.
-            unsafe { out_iter.append(k, v)? };
-        }
-    } else {
-        while let Some((k, v)) = merger_iter.next()? {
-            let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, k)?;
-            match iter.next().transpose()? {
-                Some((key, old_val)) if key == k => {
-                    let vals = vec![Cow::Borrowed(old_val), Cow::Borrowed(v)];
-                    let val = merge(k, &vals).map_err(|_| {
-                        // TODO just wrap this error?
-                        InternalError::IndexingMergingKeys { process: "get-put-merge" }
-                    })?;
-                    // safety: we don't keep references from inside the LMDB database.
-                    unsafe { iter.put_current(k, &val)? };
-                }
-                _ => {
-                    drop(iter);
-                    database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
-                }
-            }
-        }
-    }
-
-    debug!("MTBL sorter writen in {:.02?}!", before.elapsed());
-    Ok(())
-}
-
 /// Write provided sorter in database using serialize_value function.
 /// merge_values function is used if an entry already exist in the database.
 pub fn write_sorter_into_database<K, V, FS, FM>(
diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs
index c167f1cd3..841c09543 100644
--- a/milli/src/update/index_documents/helpers/mod.rs
+++ b/milli/src/update/index_documents/helpers/mod.rs
@@ -9,8 +9,8 @@ pub use clonable_mmap::{ClonableMmap, CursorClonableMmap};
 use fst::{IntoStreamer, Streamer};
 pub use grenad_helpers::{
     as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks,
-    merge_ignore_values, sorter_into_lmdb_database, sorter_into_reader, write_sorter_into_database,
-    writer_into_reader, GrenadParameters, MergeableReader,
+    merge_ignore_values, sorter_into_reader, write_sorter_into_database, writer_into_reader,
+    GrenadParameters, MergeableReader,
 };
 pub use merge_functions::{
     concat_u32s_array, keep_first, keep_latest_obkv, merge_btreeset_string,
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 5dbb4dd0b..de0361936 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -28,8 +28,8 @@ pub use self::helpers::{
     as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset,
     fst_stream_into_vec, merge_btreeset_string, merge_cbo_roaring_bitmaps,
     merge_deladd_cbo_roaring_bitmaps, merge_deladd_cbo_roaring_bitmaps_into_cbo_roaring_bitmap,
-    merge_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key, write_sorter_into_database,
-    writer_into_reader, ClonableMmap, MergeFn,
+    merge_roaring_bitmaps, valid_lmdb_key, write_sorter_into_database, writer_into_reader,
+    ClonableMmap, MergeFn,
 };
 use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
 pub use self::transform::{Transform, TransformOutput};

From 1f364105419170f8c5a65a57e23f43a45c58725d Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Mon, 13 Nov 2023 13:36:39 +0100
Subject: [PATCH 7/7] Update tests

---
 milli/src/search/new/tests/proximity.rs        | 18 +++++++++---------
 ...sts__proximity__proximity_prefix_db-14.snap | 18 +++++++++---------
 ...ests__proximity__proximity_prefix_db-2.snap | 18 +++++++++---------
 ...ests__proximity__proximity_prefix_db-8.snap | 18 +++++++++---------
 milli/src/snapshot_tests.rs                    | 16 ----------------
 5 files changed, 36 insertions(+), 52 deletions(-)

diff --git a/milli/src/search/new/tests/proximity.rs b/milli/src/search/new/tests/proximity.rs
index 217ebe9b3..2d181a537 100644
--- a/milli/src/search/new/tests/proximity.rs
+++ b/milli/src/search/new/tests/proximity.rs
@@ -371,7 +371,7 @@ fn test_proximity_prefix_db() {
     s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
     s.query("best s");
     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
-    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 6, 7, 8, 11, 15]");
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 9, 6, 7, 8, 11, 12, 13, 15]");
     insta::assert_snapshot!(format!("{document_scores:#?}"));
     let texts = collect_field_values(&index, &txn, "text", &documents_ids);
 
@@ -379,13 +379,13 @@ fn test_proximity_prefix_db() {
     insta::assert_debug_snapshot!(texts, @r###"
     [
         "\"this is the best summer meal\"",
-        "\"summer best\"",
         "\"this is the best meal of summer\"",
-        "\"summer x best\"",
         "\"this is the best meal I have ever had in such a beautiful summer day\"",
         "\"this is the best cooked meal of the summer\"",
         "\"this is the best meal of the summer\"",
         "\"summer x y best\"",
+        "\"summer x best\"",
+        "\"summer best\"",
         "\"this is the best meal I have ever had in such a beautiful winter day\"",
     ]
     "###);
@@ -423,20 +423,20 @@ fn test_proximity_prefix_db() {
     s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
     s.query("best win");
     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
-    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 15, 16, 17, 20]");
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 18, 15, 16, 17, 20, 21, 22]");
     insta::assert_snapshot!(format!("{document_scores:#?}"));
     let texts = collect_field_values(&index, &txn, "text", &documents_ids);
 
     insta::assert_debug_snapshot!(texts, @r###"
     [
         "\"this is the best winter meal\"",
-        "\"winter best\"",
         "\"this is the best meal of winter\"",
-        "\"winter x best\"",
         "\"this is the best meal I have ever had in such a beautiful winter day\"",
         "\"this is the best cooked meal of the winter\"",
         "\"this is the best meal of the winter\"",
         "\"winter x y best\"",
+        "\"winter x best\"",
+        "\"winter best\"",
     ]
     "###);
 
@@ -471,20 +471,20 @@ fn test_proximity_prefix_db() {
     s.scoring_strategy(crate::score_details::ScoringStrategy::Detailed);
     s.query("best wi");
     let SearchResult { documents_ids, document_scores, .. } = s.execute().unwrap();
-    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 15, 16, 17, 20]");
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 18, 15, 16, 17, 20, 21, 22]");
     insta::assert_snapshot!(format!("{document_scores:#?}"));
     let texts = collect_field_values(&index, &txn, "text", &documents_ids);
 
     insta::assert_debug_snapshot!(texts, @r###"
     [
         "\"this is the best winter meal\"",
-        "\"winter best\"",
         "\"this is the best meal of winter\"",
-        "\"winter x best\"",
         "\"this is the best meal I have ever had in such a beautiful winter day\"",
         "\"this is the best cooked meal of the winter\"",
         "\"this is the best meal of the winter\"",
         "\"winter x y best\"",
+        "\"winter x best\"",
+        "\"winter best\"",
     ]
     "###);
 }
diff --git a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-14.snap b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-14.snap
index 8f3b964c1..efcfef7f1 100644
--- a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-14.snap
+++ b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-14.snap
@@ -11,14 +11,6 @@ expression: "format!(\"{document_scores:#?}\")"
             },
         ),
     ],
-    [
-        Proximity(
-            Rank {
-                rank: 3,
-                max_rank: 4,
-            },
-        ),
-    ],
     [
         Proximity(
             Rank {
@@ -30,7 +22,15 @@ expression: "format!(\"{document_scores:#?}\")"
     [
         Proximity(
             Rank {
-                rank: 2,
+                rank: 1,
+                max_rank: 4,
+            },
+        ),
+    ],
+    [
+        Proximity(
+            Rank {
+                rank: 1,
                 max_rank: 4,
             },
         ),
diff --git a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-2.snap b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-2.snap
index 1ee6bfc91..242bc3424 100644
--- a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-2.snap
+++ b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-2.snap
@@ -11,14 +11,6 @@ expression: "format!(\"{document_scores:#?}\")"
             },
         ),
     ],
-    [
-        Proximity(
-            Rank {
-                rank: 3,
-                max_rank: 4,
-            },
-        ),
-    ],
     [
         Proximity(
             Rank {
@@ -30,7 +22,15 @@ expression: "format!(\"{document_scores:#?}\")"
     [
         Proximity(
             Rank {
-                rank: 2,
+                rank: 1,
+                max_rank: 4,
+            },
+        ),
+    ],
+    [
+        Proximity(
+            Rank {
+                rank: 1,
                 max_rank: 4,
             },
         ),
diff --git a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-8.snap b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-8.snap
index 8f3b964c1..efcfef7f1 100644
--- a/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-8.snap
+++ b/milli/src/search/new/tests/snapshots/milli__search__new__tests__proximity__proximity_prefix_db-8.snap
@@ -11,14 +11,6 @@ expression: "format!(\"{document_scores:#?}\")"
             },
         ),
     ],
-    [
-        Proximity(
-            Rank {
-                rank: 3,
-                max_rank: 4,
-            },
-        ),
-    ],
     [
         Proximity(
             Rank {
@@ -30,7 +22,15 @@ expression: "format!(\"{document_scores:#?}\")"
     [
         Proximity(
             Rank {
-                rank: 2,
+                rank: 1,
+                max_rank: 4,
+            },
+        ),
+    ],
+    [
+        Proximity(
+            Rank {
+                rank: 1,
                 max_rank: 4,
             },
         ),
diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs
index f3f1eb5a5..28c4cb45c 100644
--- a/milli/src/snapshot_tests.rs
+++ b/milli/src/snapshot_tests.rs
@@ -219,22 +219,6 @@ pub fn snap_word_pair_proximity_docids(index: &Index) -> String {
         &format!("{proximity:<2} {word1:<16} {word2:<16} {}", display_bitmap(&b))
     })
 }
-pub fn snap_word_prefix_pair_proximity_docids(index: &Index) -> String {
-    make_db_snap_from_iter!(index, word_prefix_pair_proximity_docids, |(
-        (proximity, word1, prefix),
-        b,
-    )| {
-        &format!("{proximity:<2} {word1:<16} {prefix:<4} {}", display_bitmap(&b))
-    })
-}
-pub fn snap_prefix_word_pair_proximity_docids(index: &Index) -> String {
-    make_db_snap_from_iter!(index, prefix_word_pair_proximity_docids, |(
-        (proximity, prefix, word2),
-        b,
-    )| {
-        &format!("{proximity:<2} {prefix:<4} {word2:<16} {}", display_bitmap(&b))
-    })
-}
 pub fn snap_word_position_docids(index: &Index) -> String {
     make_db_snap_from_iter!(index, word_position_docids, |((word, position), b)| {
         &format!("{word:<16} {position:<6} {}", display_bitmap(&b))