Add prefix_word_pair_proximity database

Similar to the word_prefix_pair_proximity database, except that its keys are:
(proximity, prefix, word2)
Loïc Lecrenier 2022-09-14 15:33:13 +02:00 committed by Loïc Lecrenier
parent 1dbbd8694f
commit 264a04922d
14 changed files with 653 additions and 339 deletions
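
For context (not part of the commit), a minimal sketch of the two key shapes side by side, assuming the `[proximity][str1]\0[str2]` byte layout that the key-building code later in this diff suggests for `U8StrStrCodec`:

// Illustrative only: hypothetical helpers showing the shape of the keys.
fn word_prefix_key(proximity: u8, word1: &str, prefix: &str) -> Vec<u8> {
    // word_prefix_pair_proximity_docids key: (proximity, word1, prefix)
    let mut key = vec![proximity];
    key.extend_from_slice(word1.as_bytes());
    key.push(0);
    key.extend_from_slice(prefix.as_bytes());
    key
}

fn prefix_word_key(proximity: u8, prefix: &str, word2: &str) -> Vec<u8> {
    // prefix_word_pair_proximity_docids key: (proximity, prefix, word2)
    let mut key = vec![proximity];
    key.extend_from_slice(prefix.as_bytes());
    key.push(0);
    key.extend_from_slice(word2.as_bytes());
    key
}
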

View File

@ -71,6 +71,7 @@ pub mod db_name {
    pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions";
    pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids";
    pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids";
+   pub const PREFIX_WORD_PAIR_PROXIMITY_DOCIDS: &str = "prefix-word-pair-proximity-docids";
    pub const WORD_POSITION_DOCIDS: &str = "word-position-docids";
    pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids";
    pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids";
@ -109,6 +110,8 @@ pub struct Index {
    pub word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
    /// Maps the proximity between a pair of word and prefix with all the docids where this relation appears.
    pub word_prefix_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
+   /// Maps the proximity between a pair of prefix and word with all the docids where this relation appears.
+   pub prefix_word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
    /// Maps the word and the position with the docids that corresponds to it.
    pub word_position_docids: Database<StrBEU32Codec, CboRoaringBitmapCodec>,
@ -138,7 +141,7 @@ impl Index {
    pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> Result<Index> {
        use db_name::*;
-       options.max_dbs(17);
+       options.max_dbs(18);
        unsafe { options.flag(Flags::MdbAlwaysFreePages) };
        let env = options.open(path)?;
@ -151,6 +154,8 @@ impl Index {
        let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
        let word_prefix_pair_proximity_docids =
            env.create_database(Some(WORD_PREFIX_PAIR_PROXIMITY_DOCIDS))?;
+       let prefix_word_pair_proximity_docids =
+           env.create_database(Some(PREFIX_WORD_PAIR_PROXIMITY_DOCIDS))?;
        let word_position_docids = env.create_database(Some(WORD_POSITION_DOCIDS))?;
        let field_id_word_count_docids = env.create_database(Some(FIELD_ID_WORD_COUNT_DOCIDS))?;
        let word_prefix_position_docids = env.create_database(Some(WORD_PREFIX_POSITION_DOCIDS))?;
@ -175,6 +180,7 @@ impl Index {
            docid_word_positions,
            word_pair_proximity_docids,
            word_prefix_pair_proximity_docids,
+           prefix_word_pair_proximity_docids,
            word_position_docids,
            word_prefix_position_docids,
            field_id_word_count_docids,

View File

@ -198,6 +198,15 @@ pub fn snap_word_prefix_pair_proximity_docids(index: &Index) -> String {
    });
    snap
}
+ pub fn snap_prefix_word_pair_proximity_docids(index: &Index) -> String {
+     let snap = make_db_snap_from_iter!(index, prefix_word_pair_proximity_docids, |(
+         (proximity, prefix, word2),
+         b,
+     )| {
+         &format!("{proximity:<2} {prefix:<4} {word2:<16} {}", display_bitmap(&b))
+     });
+     snap
+ }
pub fn snap_word_position_docids(index: &Index) -> String {
    let snap = make_db_snap_from_iter!(index, word_position_docids, |((word, position), b)| {
        &format!("{word:<16} {position:<6} {}", display_bitmap(&b))
@ -427,6 +436,9 @@ macro_rules! full_snap_of_db {
    ($index:ident, word_prefix_pair_proximity_docids) => {{
        $crate::snapshot_tests::snap_word_prefix_pair_proximity_docids(&$index)
    }};
+   ($index:ident, prefix_word_pair_proximity_docids) => {{
+       $crate::snapshot_tests::snap_prefix_word_pair_proximity_docids(&$index)
+   }};
    ($index:ident, word_position_docids) => {{
        $crate::snapshot_tests::snap_word_position_docids(&$index)
    }};

View File

@ -25,6 +25,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
            docid_word_positions,
            word_pair_proximity_docids,
            word_prefix_pair_proximity_docids,
+           prefix_word_pair_proximity_docids,
            word_position_docids,
            field_id_word_count_docids,
            word_prefix_position_docids,
@ -66,6 +67,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
        docid_word_positions.clear(self.wtxn)?;
        word_pair_proximity_docids.clear(self.wtxn)?;
        word_prefix_pair_proximity_docids.clear(self.wtxn)?;
+       prefix_word_pair_proximity_docids.clear(self.wtxn)?;
        word_position_docids.clear(self.wtxn)?;
        field_id_word_count_docids.clear(self.wtxn)?;
        word_prefix_position_docids.clear(self.wtxn)?;

View File

@ -183,6 +183,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
            word_pair_proximity_docids,
            field_id_word_count_docids,
            word_prefix_pair_proximity_docids,
+           prefix_word_pair_proximity_docids,
            word_position_docids,
            word_prefix_position_docids,
            facet_id_f64_docids,
@ -327,26 +328,26 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
            self.index.put_words_prefixes_fst(self.wtxn, &new_words_prefixes_fst)?;
        }

-       // We delete the documents ids from the word prefix pair proximity database docids
-       // and remove the empty pairs too.
-       let db = word_prefix_pair_proximity_docids.remap_key_type::<ByteSlice>();
-       let mut iter = db.iter_mut(self.wtxn)?;
-       while let Some(result) = iter.next() {
-           let (key, mut docids) = result?;
-           let previous_len = docids.len();
-           docids -= &self.to_delete_docids;
-           if docids.is_empty() {
-               // safety: we don't keep references from inside the LMDB database.
-               unsafe { iter.del_current()? };
-           } else if docids.len() != previous_len {
-               let key = key.to_owned();
-               // safety: we don't keep references from inside the LMDB database.
-               unsafe { iter.put_current(&key, &docids)? };
-           }
-       }
-       drop(iter);
+       for db in [word_prefix_pair_proximity_docids, prefix_word_pair_proximity_docids] {
+           // We delete the documents ids from the word prefix pair proximity database docids
+           // and remove the empty pairs too.
+           let db = db.remap_key_type::<ByteSlice>();
+           let mut iter = db.iter_mut(self.wtxn)?;
+           while let Some(result) = iter.next() {
+               let (key, mut docids) = result?;
+               let previous_len = docids.len();
+               docids -= &self.to_delete_docids;
+               if docids.is_empty() {
+                   // safety: we don't keep references from inside the LMDB database.
+                   unsafe { iter.del_current()? };
+               } else if docids.len() != previous_len {
+                   let key = key.to_owned();
+                   // safety: we don't keep references from inside the LMDB database.
+                   unsafe { iter.put_current(&key, &docids)? };
+               }
+           }
+       }

        // We delete the documents ids that are under the pairs of words,
        // it is faster and use no memory to iterate over all the words pairs than
        // to compute the cartesian product of every words of the deleted documents.
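
A minimal standalone sketch (not part of the diff) of the subtract-and-prune step performed by the loop above, with plain `roaring::RoaringBitmap` values standing in for the CBO-encoded ones:

use roaring::RoaringBitmap;

// Mirrors the loop body: remove the deleted documents from one entry's bitmap,
// then decide whether the entry must be dropped, rewritten, or left untouched.
enum EntryUpdate {
    Delete,
    Rewrite(RoaringBitmap),
    Keep,
}

fn prune_entry(docids: &RoaringBitmap, to_delete: &RoaringBitmap) -> EntryUpdate {
    let remaining = docids - to_delete;
    if remaining.is_empty() {
        EntryUpdate::Delete
    } else if remaining.len() != docids.len() {
        EntryUpdate::Rewrite(remaining)
    } else {
        EntryUpdate::Keep
    }
}
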

View File

@ -36,8 +36,8 @@ use crate::documents::{obkv_to_object, DocumentsBatchReader};
use crate::error::UserError;
pub use crate::update::index_documents::helpers::CursorClonableMmap;
use crate::update::{
-   self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids,
-   WordPrefixPairProximityDocids, WordPrefixPositionDocids, WordsPrefixesFst,
+   self, Facets, IndexerConfig, PrefixWordPairsProximityDocids, UpdateIndexingStep,
+   WordPrefixDocids, WordPrefixPositionDocids, WordsPrefixesFst,
};
use crate::{Index, Result, RoaringBitmapCodec};
@ -528,12 +528,7 @@ where
        if let Some(word_pair_proximity_docids) = word_pair_proximity_docids {
            // Run the word prefix pair proximity docids update operation.
-           let mut builder = WordPrefixPairProximityDocids::new(self.wtxn, self.index);
-           builder.chunk_compression_type = self.indexer_config.chunk_compression_type;
-           builder.chunk_compression_level = self.indexer_config.chunk_compression_level;
-           builder.max_nb_chunks = self.indexer_config.max_nb_chunks;
-           builder.max_memory = self.indexer_config.max_memory;
-           builder.execute(
+           PrefixWordPairsProximityDocids::new(self.wtxn, self.index).execute(
                word_pair_proximity_docids,
                &new_prefix_fst_words,
                &common_prefix_fst_words,

View File

@ -6,10 +6,10 @@ pub use self::index_documents::{
    DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod,
};
pub use self::indexer_config::IndexerConfig;
+ pub use self::prefix_word_pairs::PrefixWordPairsProximityDocids;
pub use self::settings::{Setting, Settings};
pub use self::update_step::UpdateIndexingStep;
pub use self::word_prefix_docids::WordPrefixDocids;
- pub use self::word_prefix_pair_proximity_docids::WordPrefixPairProximityDocids;
pub use self::words_prefix_position_docids::WordPrefixPositionDocids;
pub use self::words_prefixes_fst::WordsPrefixesFst;
@ -19,9 +19,9 @@ mod delete_documents;
mod facets;
mod index_documents;
mod indexer_config;
+ mod prefix_word_pairs;
mod settings;
mod update_step;
mod word_prefix_docids;
- mod word_prefix_pair_proximity_docids;
mod words_prefix_position_docids;
mod words_prefixes_fst;

View File

@ -0,0 +1,216 @@
use super::index_documents::{merge_cbo_roaring_bitmaps, CursorClonableMmap};
use crate::{Index, Result};
use heed::types::ByteSlice;
use std::{borrow::Cow, collections::HashSet, io::BufReader};
mod prefix_word;
mod word_prefix;
pub use prefix_word::index_prefix_word_database;
pub use word_prefix::index_word_prefix_database;
pub struct PrefixWordPairsProximityDocids<'t, 'u, 'i> {
wtxn: &'t mut heed::RwTxn<'i, 'u>,
index: &'i Index,
max_proximity: u8,
max_prefix_length: usize,
}
impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> {
pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> Self {
Self { wtxn, index, max_proximity: 4, max_prefix_length: 2 }
}
/// Set the maximum proximity required to make a prefix be part of the words prefixes
/// database. If two words are too far from the threshold the associated documents will
/// not be part of the prefix database.
///
/// Default value is 4. This value must be lower or equal than 7 and will be clamped
/// to this bound otherwise.
pub fn max_proximity(&mut self, value: u8) -> &mut Self {
self.max_proximity = value.max(7);
self
}
/// Set the maximum length the prefix of a word pair is allowed to have to be part of the words
/// prefixes database. If the prefix length is higher than the threshold, the associated documents
/// will not be part of the prefix database.
///
/// Default value is 2.
pub fn max_prefix_length(&mut self, value: usize) -> &mut Self {
self.max_prefix_length = value;
self
}
#[logging_timer::time("WordPrefixPairProximityDocids::{}")]
pub fn execute<'a>(
self,
new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>,
new_prefix_fst_words: &'a [String],
common_prefix_fst_words: &[&'a [String]],
del_prefix_fst_words: &HashSet<Vec<u8>>,
) -> Result<()> {
index_word_prefix_database(
self.wtxn,
self.index.word_pair_proximity_docids,
self.index.word_prefix_pair_proximity_docids,
self.max_proximity,
self.max_prefix_length,
new_word_pair_proximity_docids.clone(),
new_prefix_fst_words,
common_prefix_fst_words,
del_prefix_fst_words,
)?;
index_prefix_word_database(
self.wtxn,
self.index.word_pair_proximity_docids,
self.index.prefix_word_pair_proximity_docids,
self.max_proximity,
self.max_prefix_length,
new_word_pair_proximity_docids,
new_prefix_fst_words,
common_prefix_fst_words,
del_prefix_fst_words,
)?;
Ok(())
}
}
// This is adapted from `sorter_into_lmdb_database`
pub fn insert_into_database(
wtxn: &mut heed::RwTxn,
database: heed::PolyDatabase,
new_key: &[u8],
new_value: &[u8],
) -> Result<()> {
let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, new_key)?;
match iter.next().transpose()? {
Some((key, old_val)) if new_key == key => {
let val =
merge_cbo_roaring_bitmaps(key, &[Cow::Borrowed(old_val), Cow::Borrowed(new_value)])
.map_err(|_| {
// TODO just wrap this error?
crate::error::InternalError::IndexingMergingKeys {
process: "get-put-merge",
}
})?;
// safety: we use the new_key, not the one from the database iterator, to avoid undefined behaviour
unsafe { iter.put_current(new_key, &val)? };
}
_ => {
drop(iter);
database.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, new_value)?;
}
}
Ok(())
}
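
As an aside (not from the diff), the get-merge-put pattern of `insert_into_database` boils down to the following toy model, where a `BTreeMap` stands in for the LMDB database and a bitmap union stands in for `merge_cbo_roaring_bitmaps`:

use std::collections::BTreeMap;
use roaring::RoaringBitmap;

// Toy model of `insert_into_database`: merge with the existing value if the key
// is already present, otherwise insert the new value as-is.
fn upsert(db: &mut BTreeMap<Vec<u8>, RoaringBitmap>, key: &[u8], value: &RoaringBitmap) {
    match db.get_mut(key) {
        Some(existing) => *existing |= value,
        None => {
            db.insert(key.to_vec(), value.clone());
        }
    }
}
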
// This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`,
// but it uses `append` if the database is empty, and it assumes that the values in the
// writer don't conflict with values in the database.
pub fn write_into_lmdb_database_without_merging(
wtxn: &mut heed::RwTxn,
database: heed::PolyDatabase,
writer: grenad::Writer<std::fs::File>,
) -> Result<()> {
let file = writer.into_inner()?;
let reader = grenad::Reader::new(BufReader::new(file))?;
if database.is_empty(wtxn)? {
let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;
let mut cursor = reader.into_cursor()?;
while let Some((k, v)) = cursor.move_on_next()? {
// safety: the key comes from the grenad reader, not the database
unsafe { out_iter.append(k, v)? };
}
} else {
let mut cursor = reader.into_cursor()?;
while let Some((k, v)) = cursor.move_on_next()? {
database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
}
}
Ok(())
}
#[cfg(test)]
mod tests {
use crate::db_snap;
use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
use crate::index::tests::TempIndex;
use std::io::Cursor;
fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec<crate::Object> {
let mut documents = Vec::new();
for prefix in prefixes {
for i in 0..50 {
documents.push(
serde_json::json!({
"text": format!("{prefix}{i:x}"),
})
.as_object()
.unwrap()
.clone(),
)
}
}
documents
}
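
To make the helper's output concrete, a small illustrative test (not part of the commit) could assert its shape; the counter is rendered in lowercase hex, so every generated word starts with the given prefix:

#[test]
fn documents_helper_shape() {
    // Illustration: 50 distinct words per prefix, enough to reach the
    // words_prefix_threshold of 50 configured in test_update below.
    let docs = documents_with_enough_different_words_for_prefixes(&["be"]);
    assert_eq!(docs.len(), 50);
    assert_eq!(docs[0]["text"], "be0");
    assert_eq!(docs[49]["text"], "be31"); // 49 rendered in lowercase hex
}
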
#[test]
fn test_update() {
let mut index = TempIndex::new();
index.index_documents_config.words_prefix_threshold = Some(50);
index.index_documents_config.autogenerate_docids = true;
index
.update_settings(|settings| {
settings.set_searchable_fields(vec!["text".to_owned()]);
})
.unwrap();
let batch_reader_from_documents = |documents| {
let mut builder = DocumentsBatchBuilder::new(Vec::new());
for object in documents {
builder.append_json_object(&object).unwrap();
}
DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
};
let mut documents = documents_with_enough_different_words_for_prefixes(&["a", "be"]);
// now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
documents.push(
serde_json::json!({
"text": "At an amazing and beautiful house"
})
.as_object()
.unwrap()
.clone(),
);
documents.push(
serde_json::json!({
"text": "The bell rings at 5 am"
})
.as_object()
.unwrap()
.clone(),
);
let documents = batch_reader_from_documents(documents);
index.add_documents(documents).unwrap();
db_snap!(index, word_prefix_pair_proximity_docids, "initial");
let mut documents = documents_with_enough_different_words_for_prefixes(&["am", "an"]);
documents.push(
serde_json::json!({
"text": "At an extraordinary house"
})
.as_object()
.unwrap()
.clone(),
);
let documents = batch_reader_from_documents(documents);
index.add_documents(documents).unwrap();
db_snap!(index, word_pair_proximity_docids, "update");
db_snap!(index, word_prefix_pair_proximity_docids, "update");
db_snap!(index, prefix_word_pair_proximity_docids, "update");
}
}

View File

@ -0,0 +1,178 @@
use crate::update::index_documents::{create_writer, CursorClonableMmap};
use crate::update::prefix_word_pairs::{
insert_into_database, write_into_lmdb_database_without_merging,
};
use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec};
use grenad::CompressionType;
use heed::types::ByteSlice;
use heed::BytesDecode;
use log::debug;
use std::borrow::Cow;
use std::collections::{BTreeMap, HashSet};
#[logging_timer::time]
pub fn index_prefix_word_database(
wtxn: &mut heed::RwTxn,
word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
prefix_word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
max_proximity: u8,
max_prefix_length: usize,
new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>,
new_prefix_fst_words: &[String],
common_prefix_fst_words: &[&[String]],
del_prefix_fst_words: &HashSet<Vec<u8>>,
) -> Result<()> {
let max_proximity = max_proximity - 1;
debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
let common_prefixes: Vec<_> = common_prefix_fst_words
.into_iter()
.map(|s| s.into_iter())
.flatten()
.map(|s| s.as_str())
.filter(|s| s.len() <= max_prefix_length)
.collect();
// If the prefix trie is not empty, then we can iterate over all new
// word pairs to look for new (word1, common_prefix, proximity) elements
// to insert in the DB
for proximity in 1..=max_proximity - 1 {
for prefix in common_prefixes.iter() {
let mut prefix_key = vec![];
prefix_key.push(proximity);
prefix_key.extend_from_slice(prefix.as_bytes());
let mut cursor = new_word_pair_proximity_docids.clone().into_prefix_iter(prefix_key)?;
// This is the core of the algorithm
execute_on_word_pairs_and_prefixes(
proximity + 1,
prefix.as_bytes(),
// the next two arguments tell how to iterate over the new word pairs
&mut cursor,
|cursor| {
if let Some((key, value)) = cursor.next()? {
let (_, _, word2) = UncheckedU8StrStrCodec::bytes_decode(key)
.ok_or(heed::Error::Decoding)?;
Ok(Some((word2, value)))
} else {
Ok(None)
}
},
// and this argument tells what to do with each new key (proximity, prefix, word2) and value (roaring bitmap)
|key, value| {
insert_into_database(
wtxn,
*prefix_word_pair_proximity_docids.as_polymorph(),
key,
value,
)
},
)?;
}
}
// Now we do the same thing with the new prefixes and all word pairs in the DB
let new_prefixes: Vec<_> = new_prefix_fst_words
.into_iter()
.map(|s| s.as_str())
.filter(|s| s.len() <= max_prefix_length)
.collect();
// Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity)
// element in an intermediary grenad
let mut writer = create_writer(CompressionType::None, None, tempfile::tempfile()?);
for proximity in 1..=max_proximity - 1 {
for prefix in new_prefixes.iter() {
let mut prefix_key = vec![];
prefix_key.push(proximity);
prefix_key.extend_from_slice(prefix.as_bytes());
let mut db_iter = word_pair_proximity_docids
.as_polymorph()
.prefix_iter::<_, ByteSlice, ByteSlice>(wtxn, prefix_key.as_slice())?
.remap_key_type::<UncheckedU8StrStrCodec>();
execute_on_word_pairs_and_prefixes(
proximity + 1,
prefix.as_bytes(),
&mut db_iter,
|db_iter| {
db_iter
.next()
.transpose()
.map(|x| x.map(|((_, _, word2), value)| (word2, value)))
.map_err(|e| e.into())
},
|key, value| writer.insert(key, value).map_err(|e| e.into()),
)?;
drop(db_iter);
}
}
// and then we write the grenad into the DB
// Since the grenad contains only new prefixes, we know in advance that none
// of its elements already exist in the DB, thus there is no need to specify
// how to merge conflicting elements
write_into_lmdb_database_without_merging(
wtxn,
*prefix_word_pair_proximity_docids.as_polymorph(),
writer,
)?;
// All of the word prefix pairs in the database that have a w2
// that is contained in the `suppr_pw` set must be removed as well.
if !del_prefix_fst_words.is_empty() {
let mut iter =
prefix_word_pair_proximity_docids.remap_data_type::<ByteSlice>().iter_mut(wtxn)?;
while let Some(((_, prefix, _), _)) = iter.next().transpose()? {
if del_prefix_fst_words.contains(prefix.as_bytes()) {
// Delete this entry as the w2 prefix is no more in the words prefix fst.
unsafe { iter.del_current()? };
}
}
}
Ok(())
}
/// This is the core of the algorithm to initialise the Word Prefix Pair Proximity Docids database.
///
/// Its main arguments are:
/// 1. a sorted prefix iterator over ((word1, word2, proximity), docids) elements
/// 2. a closure to describe how to handle the new computed (word1, prefix, proximity) elements
///
/// For more information about what this function does, read the module documentation.
fn execute_on_word_pairs_and_prefixes<I>(
proximity: u8,
prefix: &[u8],
iter: &mut I,
mut next_word2_and_docids: impl for<'a> FnMut(&'a mut I) -> Result<Option<(&'a [u8], &'a [u8])>>,
mut insert: impl for<'a> FnMut(&'a [u8], &'a [u8]) -> Result<()>,
) -> Result<()> {
let mut batch: BTreeMap<Vec<u8>, Vec<Cow<'static, [u8]>>> = <_>::default();
while let Some((word2, data)) = next_word2_and_docids(iter)? {
let entry = batch.entry(word2.to_owned()).or_default();
entry.push(Cow::Owned(data.to_owned()));
}
let mut key_buffer = Vec::with_capacity(8);
key_buffer.push(proximity);
key_buffer.extend_from_slice(prefix);
key_buffer.push(0);
let mut value_buffer = Vec::with_capacity(65_536);
for (key, values) in batch {
key_buffer.truncate(prefix.len() + 2);
value_buffer.clear();
key_buffer.extend_from_slice(&key);
let data = if values.len() > 1 {
CboRoaringBitmapCodec::merge_into(&values, &mut value_buffer)?;
value_buffer.as_slice()
} else {
&values[0]
};
insert(key_buffer.as_slice(), data)?;
}
Ok(())
}
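
A self-contained sketch (not part of the diff) of the batching idea in `execute_on_word_pairs_and_prefixes`: every bitmap seen for the same word2 is grouped under one key and merged before a single (proximity, prefix, word2) entry is written; plain RoaringBitmaps stand in for the CBO-encoded values:

use std::collections::BTreeMap;
use roaring::RoaringBitmap;

// Group all bitmaps observed for the same word2, then merge each group into the
// single value that will be stored under (proximity, prefix, word2).
fn batch_and_merge(pairs: Vec<(&str, RoaringBitmap)>) -> BTreeMap<String, RoaringBitmap> {
    let mut batch: BTreeMap<String, Vec<RoaringBitmap>> = BTreeMap::new();
    for (word2, docids) in pairs {
        batch.entry(word2.to_owned()).or_default().push(docids);
    }
    batch
        .into_iter()
        .map(|(word2, bitmaps)| {
            let merged = bitmaps.into_iter().fold(RoaringBitmap::new(), |acc, b| acc | b);
            (word2, merged)
        })
        .collect()
}
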

View File

@ -0,0 +1,46 @@
---
source: milli/src/update/prefix_word_pairs/mod.rs
---
1 5 a [101, ]
1 amazing a [100, ]
1 an a [100, ]
1 and b [100, ]
1 and be [100, ]
1 at a [100, ]
1 rings a [101, ]
1 the b [101, ]
1 the be [101, ]
2 5 a [101, ]
2 amazing a [100, ]
2 amazing b [100, ]
2 amazing be [100, ]
2 an a [100, ]
2 and a [100, ]
2 at a [100, 101, ]
2 beautiful a [100, ]
2 bell a [101, ]
2 house b [100, ]
2 house be [100, ]
2 rings b [101, ]
2 rings be [101, ]
3 am a [101, ]
3 amazing a [100, ]
3 an b [100, ]
3 an be [100, ]
3 and a [100, ]
3 at a [100, ]
3 at b [101, ]
3 at be [101, ]
3 beautiful a [100, ]
3 house a [100, ]
3 rings a [101, ]
3 the a [101, ]
4 5 b [101, ]
4 5 be [101, ]
4 and a [100, ]
4 at b [100, ]
4 at be [100, ]
4 beautiful a [100, ]
4 bell a [101, ]
4 house a [100, ]

View File

@ -0,0 +1,4 @@
---
source: milli/src/update/prefix_word_pairs/mod.rs
---
fb88e49fd666886731b62baef8f44995

View File

@ -0,0 +1,41 @@
---
source: milli/src/update/prefix_word_pairs/mod.rs
---
2 a 5 [101, ]
2 a amazing [100, ]
2 a an [100, 202, ]
2 a and [100, ]
2 a beautiful [100, ]
2 a extraordinary [202, ]
2 am and [100, ]
2 an amazing [100, ]
2 an beautiful [100, ]
2 an extraordinary [202, ]
2 b house [100, ]
2 b rings [101, ]
2 be house [100, ]
2 be rings [101, ]
3 a 5 [101, ]
3 a am [101, ]
3 a amazing [100, ]
3 a an [100, ]
3 a and [100, ]
3 a at [100, 202, ]
3 a beautiful [100, ]
3 a extraordinary [202, ]
3 a house [100, 202, ]
3 a rings [101, ]
3 am 5 [101, ]
3 am an [100, ]
3 am beautiful [100, ]
3 an amazing [100, ]
3 an and [100, ]
3 an at [100, 202, ]
3 an house [100, 202, ]
3 b and [100, ]
3 b at [101, ]
3 b the [101, ]
3 be and [100, ]
3 be at [101, ]
3 be the [101, ]

View File

@ -0,0 +1,4 @@
---
source: milli/src/update/prefix_word_pairs/mod.rs
---
6965ecd1bf821f1cf921c2ab751b36cf

View File

@ -0,0 +1,4 @@
---
source: milli/src/update/prefix_word_pairs/mod.rs
---
fb88e49fd666886731b62baef8f44995

View File

@ -1,5 +1,5 @@
/*!
-## What is WordPrefixPairProximityDocids?
+## What is WordPrefix?
The word-prefix-pair-proximity-docids database is a database whose keys are of
the form `(proximity, word, prefix)` and the values are roaring bitmaps of
the documents which contain `word` followed by another word starting with
@ -139,7 +139,7 @@ inputs described above, which come from different places:
2. `word_pairs_db`, which is the list of word pairs from the database.
This list includes all elements in `new_word_pairs` since `new_word_pairs`
-was added to the database prior to calling the `WordPrefixPairProximityDocIds::execute`
+was added to the database prior to calling the `WordPrefix::execute`
function.
To update the prefix database correctly, we call the algorithm described earlier first
@ -161,196 +161,137 @@ reader and writer). Therefore, when calling the algorithm on
`((proximity, word, prefix), docids)` elements in an intermediary grenad
Writer instead of the DB. At the end of the outer loop, we finally read from
the grenad and insert its elements in the database.
*/
- use std::borrow::Cow;
- use std::collections::HashSet;
- use std::io::BufReader;
- use grenad::CompressionType;
- use heed::types::ByteSlice;
- use heed::BytesDecode;
- use log::debug;
- use crate::update::index_documents::{
-     create_writer, merge_cbo_roaring_bitmaps, CursorClonableMmap,
- };
- use crate::{CboRoaringBitmapCodec, Index, Result, UncheckedU8StrStrCodec};
- pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> {
-     wtxn: &'t mut heed::RwTxn<'i, 'u>,
-     index: &'i Index,
-     pub(crate) chunk_compression_type: CompressionType,
-     pub(crate) chunk_compression_level: Option<u32>,
-     pub(crate) max_nb_chunks: Option<usize>,
-     pub(crate) max_memory: Option<usize>,
-     max_proximity: u8,
-     max_prefix_length: usize,
- }
- impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
-     pub fn new(
-         wtxn: &'t mut heed::RwTxn<'i, 'u>,
-         index: &'i Index,
-     ) -> WordPrefixPairProximityDocids<'t, 'u, 'i> {
-         WordPrefixPairProximityDocids {
-             wtxn,
-             index,
-             chunk_compression_type: CompressionType::None,
-             chunk_compression_level: None,
-             max_nb_chunks: None,
-             max_memory: None,
-             max_proximity: 4,
-             max_prefix_length: 2,
-         }
-     }
-     /// Set the maximum proximity required to make a prefix be part of the words prefixes
-     /// database. If two words are too far from the threshold the associated documents will
-     /// not be part of the prefix database.
-     ///
-     /// Default value is 4. This value must be lower or equal than 7 and will be clamped
-     /// to this bound otherwise.
-     pub fn max_proximity(&mut self, value: u8) -> &mut Self {
-         self.max_proximity = value.max(7);
-         self
-     }
-     /// Set the maximum length the prefix of a word pair is allowed to have to be part of the words
-     /// prefixes database. If the prefix length is higher than the threshold, the associated documents
-     /// will not be part of the prefix database.
-     ///
-     /// Default value is 2.
-     pub fn max_prefix_length(&mut self, value: usize) -> &mut Self {
-         self.max_prefix_length = value;
-         self
-     }
-     #[logging_timer::time("WordPrefixPairProximityDocids::{}")]
-     pub fn execute<'a>(
-         mut self,
-         new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>,
-         new_prefix_fst_words: &'a [String],
-         common_prefix_fst_words: &[&'a [String]],
-         del_prefix_fst_words: &HashSet<Vec<u8>>,
-     ) -> Result<()> {
-         debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
-         // Make a prefix trie from the common prefixes that are shorter than self.max_prefix_length
-         let prefixes = PrefixTrieNode::from_sorted_prefixes(
-             common_prefix_fst_words
-                 .into_iter()
-                 .map(|s| s.into_iter())
-                 .flatten()
-                 .map(|s| s.as_str())
-                 .filter(|s| s.len() <= self.max_prefix_length),
-         );
-         // If the prefix trie is not empty, then we can iterate over all new
-         // word pairs to look for new (word1, common_prefix, proximity) elements
-         // to insert in the DB
-         if !prefixes.is_empty() {
-             let mut cursor = new_word_pair_proximity_docids.into_cursor()?;
-             // This is the core of the algorithm
-             execute_on_word_pairs_and_prefixes(
-                 // the first two arguments tell how to iterate over the new word pairs
-                 &mut cursor,
-                 |cursor| {
-                     if let Some((key, value)) = cursor.move_on_next()? {
-                         let (proximity, word1, word2) = UncheckedU8StrStrCodec::bytes_decode(key)
-                             .ok_or(heed::Error::Decoding)?;
-                         Ok(Some(((proximity, word1, word2), value)))
-                     } else {
-                         Ok(None)
-                     }
-                 },
-                 &prefixes,
-                 self.max_proximity,
-                 // and this argument tells what to do with each new key (word1, prefix, proximity) and value (roaring bitmap)
-                 |key, value| {
-                     insert_into_database(
-                         &mut self.wtxn,
-                         *self.index.word_prefix_pair_proximity_docids.as_polymorph(),
-                         key,
-                         value,
-                     )
-                 },
-             )?;
-         }
-         // Now we do the same thing with the new prefixes and all word pairs in the DB
-         let prefixes = PrefixTrieNode::from_sorted_prefixes(
-             new_prefix_fst_words
-                 .into_iter()
-                 .map(|s| s.as_str())
-                 .filter(|s| s.len() <= self.max_prefix_length),
-         );
-         if !prefixes.is_empty() {
-             let mut db_iter = self
-                 .index
-                 .word_pair_proximity_docids
-                 .remap_key_type::<UncheckedU8StrStrCodec>()
-                 .remap_data_type::<ByteSlice>()
-                 .iter(self.wtxn)?;
-             // Since we read the DB, we can't write to it directly, so we add each new (word1, prefix, proximity)
-             // element in an intermediary grenad
-             let mut writer = create_writer(
-                 self.chunk_compression_type,
-                 self.chunk_compression_level,
-                 tempfile::tempfile()?,
-             );
-             execute_on_word_pairs_and_prefixes(
-                 &mut db_iter,
-                 |db_iter| db_iter.next().transpose().map_err(|e| e.into()),
-                 &prefixes,
-                 self.max_proximity,
-                 |key, value| writer.insert(key, value).map_err(|e| e.into()),
-             )?;
-             drop(db_iter);
-             // and then we write the grenad into the DB
-             // Since the grenad contains only new prefixes, we know in advance that none
-             // of its elements already exist in the DB, thus there is no need to specify
-             // how to merge conflicting elements
-             write_into_lmdb_database_without_merging(
-                 self.wtxn,
-                 *self.index.word_prefix_pair_proximity_docids.as_polymorph(),
-                 writer,
-             )?;
-         }
-         // All of the word prefix pairs in the database that have a w2
-         // that is contained in the `suppr_pw` set must be removed as well.
-         if !del_prefix_fst_words.is_empty() {
-             let mut iter = self
-                 .index
-                 .word_prefix_pair_proximity_docids
-                 .remap_data_type::<ByteSlice>()
-                 .iter_mut(self.wtxn)?;
-             while let Some(((_, w2, _), _)) = iter.next().transpose()? {
-                 if del_prefix_fst_words.contains(w2.as_bytes()) {
-                     // Delete this entry as the w2 prefix is no more in the words prefix fst.
-                     unsafe { iter.del_current()? };
-                 }
-             }
-         }
-         Ok(())
-     }
- }

+ use crate::update::index_documents::{create_writer, CursorClonableMmap};
+ use crate::update::prefix_word_pairs::{
+     insert_into_database, write_into_lmdb_database_without_merging,
+ };
+ use crate::{CboRoaringBitmapCodec, Result, U8StrStrCodec, UncheckedU8StrStrCodec};
+ use grenad::CompressionType;
+ use heed::types::ByteSlice;
+ use heed::BytesDecode;
+ use log::debug;
+ use std::borrow::Cow;
+ use std::collections::HashSet;
+ #[logging_timer::time]
+ pub fn index_word_prefix_database(
+     wtxn: &mut heed::RwTxn,
+     word_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
+     word_prefix_pair_proximity_docids: heed::Database<U8StrStrCodec, CboRoaringBitmapCodec>,
+     max_proximity: u8,
+     max_prefix_length: usize,
+     new_word_pair_proximity_docids: grenad::Reader<CursorClonableMmap>,
+     new_prefix_fst_words: &[String],
+     common_prefix_fst_words: &[&[String]],
+     del_prefix_fst_words: &HashSet<Vec<u8>>,
+ ) -> Result<()> {
+     debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
+     // Make a prefix trie from the common prefixes that are shorter than self.max_prefix_length
+     let prefixes = PrefixTrieNode::from_sorted_prefixes(
+         common_prefix_fst_words
+             .into_iter()
+             .map(|s| s.into_iter())
+             .flatten()
+             .map(|s| s.as_str())
+             .filter(|s| s.len() <= max_prefix_length),
+     );
+     // If the prefix trie is not empty, then we can iterate over all new
+     // word pairs to look for new (proximity, word1, common_prefix) elements
+     // to insert in the DB
+     if !prefixes.is_empty() {
+         let mut cursor = new_word_pair_proximity_docids.into_cursor()?;
+         // This is the core of the algorithm
+         execute_on_word_pairs_and_prefixes(
+             // the first two arguments tell how to iterate over the new word pairs
+             &mut cursor,
+             |cursor| {
+                 if let Some((key, value)) = cursor.move_on_next()? {
+                     let (proximity, word1, word2) =
+                         UncheckedU8StrStrCodec::bytes_decode(key).ok_or(heed::Error::Decoding)?;
+                     Ok(Some(((proximity, word1, word2), value)))
+                 } else {
+                     Ok(None)
+                 }
+             },
+             &prefixes,
+             max_proximity,
+             // and this argument tells what to do with each new key (proximity, word1, prefix) and value (roaring bitmap)
+             |key, value| {
+                 insert_into_database(
+                     wtxn,
+                     *word_prefix_pair_proximity_docids.as_polymorph(),
+                     key,
+                     value,
+                 )
+             },
+         )?;
+     }
+     // Now we do the same thing with the new prefixes and all word pairs in the DB
+     let prefixes = PrefixTrieNode::from_sorted_prefixes(
+         new_prefix_fst_words
+             .into_iter()
+             .map(|s| s.as_str())
+             .filter(|s| s.len() <= max_prefix_length),
+     );
+     if !prefixes.is_empty() {
+         let mut db_iter = word_pair_proximity_docids
+             .remap_key_type::<UncheckedU8StrStrCodec>()
+             .remap_data_type::<ByteSlice>()
+             .iter(wtxn)?;
+         // Since we read the DB, we can't write to it directly, so we add each new (proximity, word1, prefix)
+         // element in an intermediary grenad
+         let mut writer = create_writer(CompressionType::None, None, tempfile::tempfile()?);
+         execute_on_word_pairs_and_prefixes(
+             &mut db_iter,
+             |db_iter| db_iter.next().transpose().map_err(|e| e.into()),
+             &prefixes,
+             max_proximity,
+             |key, value| writer.insert(key, value).map_err(|e| e.into()),
+         )?;
+         drop(db_iter);
+         // and then we write the grenad into the DB
+         // Since the grenad contains only new prefixes, we know in advance that none
+         // of its elements already exist in the DB, thus there is no need to specify
+         // how to merge conflicting elements
+         write_into_lmdb_database_without_merging(
+             wtxn,
+             *word_prefix_pair_proximity_docids.as_polymorph(),
+             writer,
+         )?;
+     }
+     // All of the word prefix pairs in the database that have a w2
+     // that is contained in the `suppr_pw` set must be removed as well.
+     if !del_prefix_fst_words.is_empty() {
+         let mut iter =
+             word_prefix_pair_proximity_docids.remap_data_type::<ByteSlice>().iter_mut(wtxn)?;
+         while let Some(((_, _, prefix), _)) = iter.next().transpose()? {
+             if del_prefix_fst_words.contains(prefix.as_bytes()) {
+                 // Delete this entry as the w2 prefix is no more in the words prefix fst.
+                 unsafe { iter.del_current()? };
+             }
+         }
+     }
+     Ok(())
+ }
/// This is the core of the algorithm to initialise the Word Prefix Pair Proximity Docids database.
///
/// Its main arguments are:
-/// 1. a sorted iterator over ((word1, word2, proximity), docids) elements
+/// 1. a sorted iterator over ((proximity, word1, word2), docids) elements
/// 2. a prefix trie
-/// 3. a closure to describe how to handle the new computed (word1, prefix, proximity) elements
+/// 3. a closure to describe how to handle the new computed (proximity, word1, prefix) elements
///
/// For more information about what this function does, read the module documentation.
fn execute_on_word_pairs_and_prefixes<I>(
@ -495,61 +436,6 @@ impl PrefixAndProximityBatch {
    }
}
// This is adapted from `sorter_into_lmdb_database`
fn insert_into_database(
wtxn: &mut heed::RwTxn,
database: heed::PolyDatabase,
new_key: &[u8],
new_value: &[u8],
) -> Result<()> {
let mut iter = database.prefix_iter_mut::<_, ByteSlice, ByteSlice>(wtxn, new_key)?;
match iter.next().transpose()? {
Some((key, old_val)) if new_key == key => {
let val =
merge_cbo_roaring_bitmaps(key, &[Cow::Borrowed(old_val), Cow::Borrowed(new_value)])
.map_err(|_| {
// TODO just wrap this error?
crate::error::InternalError::IndexingMergingKeys {
process: "get-put-merge",
}
})?;
// safety: we use the new_key, not the one from the database iterator, to avoid undefined behaviour
unsafe { iter.put_current(new_key, &val)? };
}
_ => {
drop(iter);
database.put::<_, ByteSlice, ByteSlice>(wtxn, new_key, new_value)?;
}
}
Ok(())
}
// This is adapted from `sorter_into_lmdb_database` and `write_into_lmdb_database`,
// but it uses `append` if the database is empty, and it assumes that the values in the
// writer don't conflict with values in the database.
pub fn write_into_lmdb_database_without_merging(
wtxn: &mut heed::RwTxn,
database: heed::PolyDatabase,
writer: grenad::Writer<std::fs::File>,
) -> Result<()> {
let file = writer.into_inner()?;
let reader = grenad::Reader::new(BufReader::new(file))?;
if database.is_empty(wtxn)? {
let mut out_iter = database.iter_mut::<_, ByteSlice, ByteSlice>(wtxn)?;
let mut cursor = reader.into_cursor()?;
while let Some((k, v)) = cursor.move_on_next()? {
// safety: the key comes from the grenad reader, not the database
unsafe { out_iter.append(k, v)? };
}
} else {
let mut cursor = reader.into_cursor()?;
while let Some((k, v)) = cursor.move_on_next()? {
database.put::<_, ByteSlice, ByteSlice>(wtxn, k, v)?;
}
}
Ok(())
}
/** A prefix trie. Used to iterate quickly over the prefixes of a word that are
within a set.
@ -676,90 +562,9 @@ impl PrefixTrieNode {
}
#[cfg(test)]
mod tests {
-   use std::io::Cursor;
-   use roaring::RoaringBitmap;
    use super::*;
-   use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
-   use crate::index::tests::TempIndex;
-   use crate::{db_snap, CboRoaringBitmapCodec, U8StrStrCodec};
+   use crate::{CboRoaringBitmapCodec, U8StrStrCodec};
+   use roaring::RoaringBitmap;
fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec<crate::Object> {
let mut documents = Vec::new();
for prefix in prefixes {
for i in 0..50 {
documents.push(
serde_json::json!({
"text": format!("{prefix}{i:x}"),
})
.as_object()
.unwrap()
.clone(),
)
}
}
documents
}
#[test]
fn test_update() {
let mut index = TempIndex::new();
index.index_documents_config.words_prefix_threshold = Some(50);
index.index_documents_config.autogenerate_docids = true;
index
.update_settings(|settings| {
settings.set_searchable_fields(vec!["text".to_owned()]);
})
.unwrap();
let batch_reader_from_documents = |documents| {
let mut builder = DocumentsBatchBuilder::new(Vec::new());
for object in documents {
builder.append_json_object(&object).unwrap();
}
DocumentsBatchReader::from_reader(Cursor::new(builder.into_inner().unwrap())).unwrap()
};
let mut documents = documents_with_enough_different_words_for_prefixes(&["a", "be"]);
// now we add some documents where the text should populate the word_prefix_pair_proximity_docids database
documents.push(
serde_json::json!({
"text": "At an amazing and beautiful house"
})
.as_object()
.unwrap()
.clone(),
);
documents.push(
serde_json::json!({
"text": "The bell rings at 5 am"
})
.as_object()
.unwrap()
.clone(),
);
let documents = batch_reader_from_documents(documents);
index.add_documents(documents).unwrap();
db_snap!(index, word_prefix_pair_proximity_docids, "initial");
let mut documents = documents_with_enough_different_words_for_prefixes(&["am", "an"]);
documents.push(
serde_json::json!({
"text": "At an extraordinary house"
})
.as_object()
.unwrap()
.clone(),
);
let documents = batch_reader_from_documents(documents);
index.add_documents(documents).unwrap();
db_snap!(index, word_prefix_pair_proximity_docids, "update");
}
    fn check_prefixes(
        trie: &PrefixTrieNode,
@ -899,9 +704,9 @@ mod tests {
            &prefixes,
            2,
            |k, v| {
-               let (word1, prefix, proximity) = U8StrStrCodec::bytes_decode(k).unwrap();
+               let (proximity, word1, prefix) = U8StrStrCodec::bytes_decode(k).unwrap();
                let bitmap = CboRoaringBitmapCodec::bytes_decode(v).unwrap();
-               result.push(((word1.to_owned(), prefix.to_owned(), proximity.to_owned()), bitmap));
+               result.push(((proximity.to_owned(), word1.to_owned(), prefix.to_owned()), bitmap));
                Ok(())
            },
        )