Split position DB into fid and relative position DB

2024-11-27 04:25:06 +08:00 · 2023-03-23 09:22:01 +01:00 · 2023-03-23 09:22:01 +01:00 · 9b2653427d
commit 9b2653427d
parent 56b7209f26
11 changed files with 162 additions and 135 deletions
--- a/milli/src/heed_codec/mod.rs
+++ b/milli/src/heed_codec/mod.rs
@ -21,5 +21,5 @@ pub use self::roaring_bitmap_length::{
    BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec,
 };
 pub use self::script_language_codec::ScriptLanguageCodec;
-pub use self::str_beu32_codec::StrBEU32Codec;
+pub use self::str_beu32_codec::{StrBEU32Codec, StrBEU16Codec};
 pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec};
--- a/milli/src/heed_codec/str_beu32_codec.rs
+++ b/milli/src/heed_codec/str_beu32_codec.rs
@ -36,3 +36,37 @@ impl<'a> heed::BytesEncode<'a> for StrBEU32Codec {
        Some(Cow::Owned(bytes))
    }
 }
 /// A codec for keys made of a UTF-8 word immediately followed by a big-endian `u16`.
 pub struct StrBEU16Codec;
 impl<'a> heed::BytesDecode<'a> for StrBEU16Codec {
    type DItem = (&'a str, u16);
    /// Splits the trailing `u16` footer off `bytes` and decodes the rest as a UTF-8 word.
    ///
    /// Returns `None` when `bytes` is shorter than the footer or when the word
    /// part is not valid UTF-8.
    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
        // `checked_sub` yields `None` when the input cannot even hold the footer.
        let word_len = bytes.len().checked_sub(size_of::<u16>())?;
        let (word_bytes, footer) = bytes.split_at(word_len);
        let word = str::from_utf8(word_bytes).ok()?;
        let footer: [u8; size_of::<u16>()] = footer.try_into().ok()?;
        Some((word, u16::from_be_bytes(footer)))
    }
 }
 impl<'a> heed::BytesEncode<'a> for StrBEU16Codec {
    type EItem = (&'a str, u16);
    /// Encodes the word bytes followed by the `u16` in big-endian order.
    ///
    /// This implementation always returns `Some`.
    fn bytes_encode((word, pos): &Self::EItem) -> Option<Cow<[u8]>> {
        let footer = pos.to_be_bytes();
        // Word first, then the footer.
        let encoded = [word.as_bytes(), &footer[..]].concat();
        Some(Cow::Owned(encoded))
    }
 }
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@ -19,12 +19,12 @@ use crate::heed_codec::facet::{
    FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
    FieldIdCodec, OrderedF64Codec,
 };
-use crate::heed_codec::{ScriptLanguageCodec, StrRefCodec};
+use crate::heed_codec::{ScriptLanguageCodec, StrBEU16Codec, StrRefCodec};
 use crate::{
    default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion,
    DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId,
    FieldIdWordCountCodec, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec,
-    Search, StrBEU32Codec, U8StrStrCodec, BEU16, BEU32,
+    Search, U8StrStrCodec, BEU16, BEU32,
 };
 pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;
@ -76,7 +76,9 @@ pub mod db_name {
    pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids";
    pub const PREFIX_WORD_PAIR_PROXIMITY_DOCIDS: &str = "prefix-word-pair-proximity-docids";
    pub const WORD_POSITION_DOCIDS: &str = "word-position-docids";
    pub const WORD_FIELD_ID_DOCIDS: &str = "word-field-id-docids";
    pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids";
    pub const WORD_PREFIX_FIELD_ID_DOCIDS: &str = "word-prefix-field-id-docids";
    pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids";
    pub const FACET_ID_F64_DOCIDS: &str = "facet-id-f64-docids";
    pub const FACET_ID_EXISTS_DOCIDS: &str = "facet-id-exists-docids";
@ -118,11 +120,16 @@ pub struct Index {
    pub prefix_word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,
    /// Maps the word and the position with the docids that corresponds to it.
-    pub word_position_docids: Database<StrBEU32Codec, CboRoaringBitmapCodec>,
+    pub word_position_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
    /// Maps the word and the field id with the docids that corresponds to it.
    pub word_fid_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
    /// Maps the field id and the word count with the docids that corresponds to it.
    pub field_id_word_count_docids: Database<FieldIdWordCountCodec, CboRoaringBitmapCodec>,
    /// Maps the position of a word prefix with all the docids where this prefix appears.
-    pub word_prefix_position_docids: Database<StrBEU32Codec, CboRoaringBitmapCodec>,
+    pub word_prefix_position_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
    /// Maps the word prefix and the field id with the docids that correspond to it.
    pub word_prefix_fid_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
    /// Maps the script and language with all the docids that corresponds to it.
    pub script_language_docids: Database<ScriptLanguageCodec, RoaringBitmapCodec>,
@ -153,7 +160,7 @@ impl Index {
    ) -> Result<Index> {
        use db_name::*;
-        options.max_dbs(19);
+        options.max_dbs(21);
        unsafe { options.flag(Flags::MdbAlwaysFreePages) };
        let env = options.open(path)?;
@ -170,8 +177,10 @@ impl Index {
        let prefix_word_pair_proximity_docids =
            env.create_database(Some(PREFIX_WORD_PAIR_PROXIMITY_DOCIDS))?;
        let word_position_docids = env.create_database(Some(WORD_POSITION_DOCIDS))?;
        let word_fid_docids = env.create_database(Some(WORD_FIELD_ID_DOCIDS))?;
        let field_id_word_count_docids = env.create_database(Some(FIELD_ID_WORD_COUNT_DOCIDS))?;
        let word_prefix_position_docids = env.create_database(Some(WORD_PREFIX_POSITION_DOCIDS))?;
        let word_prefix_fid_docids = env.create_database(Some(WORD_PREFIX_FIELD_ID_DOCIDS))?;
        let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?;
        let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?;
        let facet_id_exists_docids = env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?;
@ -196,7 +205,9 @@ impl Index {
            word_prefix_pair_proximity_docids,
            prefix_word_pair_proximity_docids,
            word_position_docids,
            word_fid_docids,
            word_prefix_position_docids,
            word_prefix_fid_docids,
            field_id_word_count_docids,
            facet_id_f64_docids,
            facet_id_string_docids,
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@ -152,6 +152,23 @@ pub fn relative_from_absolute_position(absolute: Position) -> (FieldId, Relative
 pub fn absolute_from_relative_position(field_id: FieldId, relative: RelativePosition) -> Position {
    // The field id is shifted left by 16 bits; the relative position is OR-ed in below it.
    let field_part = (field_id as u32) << 16;
    let relative_part = relative as u32;
    field_part | relative_part
 }
 // TODO: this is wrong, but will do for now
 /// Compute the "bucketed" relative position from a relative position in a field.
 ///
 /// In a bucketed position, the accuracy of the relative position is reduced exponentially as it gets larger:
 /// - positions below 16 are kept intact,
 /// - positions in `16..24` all become 24,
 /// - every other position is rounded up to the nearest power of 2 (a power of 2 maps to
 ///   itself), saturating at `u16::MAX` when that power of 2 does not fit in a `u16`.
 pub fn bucketed_position(relative: u16) -> u16 {
    match relative {
        // The first few relative positions are kept intact.
        0..=15 => relative,
        // Relative positions between 16 and 24 (excluded) all become equal to 24.
        16..=23 => 24,
        // Then, positions are rounded up to the smallest power of 2 that is greater than
        // or equal to them. Integer arithmetic avoids a float `log2`/`exp2` round-trip;
        // positions above 2^15 have no such power of 2 in a `u16` and saturate instead.
        _ => relative.checked_next_power_of_two().unwrap_or(u16::MAX),
    }
 }
 /// Transform a raw obkv store into a JSON Object.
 pub fn obkv_to_json(
--- a/milli/src/search/criteria/attribute.rs
+++ b/milli/src/search/criteria/attribute.rs
@ -199,7 +199,7 @@ impl<'t> Criterion for Attribute<'t> {
 struct QueryPositionIterator<'t> {
    #[allow(clippy::type_complexity)]
    inner:
-        Vec<Peekable<Box<dyn Iterator<Item = heed::Result<((&'t str, u32), RoaringBitmap)>> + 't>>>,
+        Vec<Peekable<Box<dyn Iterator<Item = heed::Result<((&'t str, u16), RoaringBitmap)>> + 't>>>,
 }
 impl<'t> QueryPositionIterator<'t> {
@ -241,7 +241,7 @@ impl<'t> QueryPositionIterator<'t> {
 }
 impl<'t> Iterator for QueryPositionIterator<'t> {
-    type Item = heed::Result<(u32, RoaringBitmap)>;
+    type Item = heed::Result<(u16, RoaringBitmap)>;
    fn next(&mut self) -> Option<Self::Item> {
        // sort inner words from the closest next position to the farthest next position.
@ -281,9 +281,9 @@ impl<'t> Iterator for QueryPositionIterator<'t> {
 /// A Branch represents a possible alternative of the original query and is built with the Query Tree.
 /// This branch allows us to iterate over meta-intervals of positions.
 struct Branch<'t> {
-    query_level_iterator: Vec<(u32, RoaringBitmap, Peekable<QueryPositionIterator<'t>>)>,
+    query_level_iterator: Vec<(u16, RoaringBitmap, Peekable<QueryPositionIterator<'t>>)>,
-    last_result: (u32, RoaringBitmap),
+    last_result: (u16, RoaringBitmap),
-    branch_size: u32,
+    branch_size: u16,
 }
 impl<'t> Branch<'t> {
@ -303,7 +303,7 @@ impl<'t> Branch<'t> {
        let mut branch = Self {
            query_level_iterator,
            last_result: (0, RoaringBitmap::new()),
-            branch_size: flatten_branch.len() as u32,
+            branch_size: flatten_branch.len() as u16,
        };
        branch.update_last_result();
@ -342,7 +342,7 @@ impl<'t> Branch<'t> {
                        Some(result) => {
                            result.as_ref().map(|(next_pos, _)| *next_pos - *pos).unwrap_or(0)
                        }
-                        None => u32::MAX,
+                        None => u16::MAX,
                    }
                }
            })
@ -378,7 +378,8 @@ impl<'t> Branch<'t> {
    fn compute_rank(&self) -> u32 {
        // we compute a rank from the position.
        let (pos, _) = self.last_result;
-        pos.saturating_sub((0..self.branch_size).sum()) * LCM_10_FIRST_NUMBERS / self.branch_size
+        pos.saturating_sub((0..self.branch_size).sum()) as u32 * LCM_10_FIRST_NUMBERS
            / self.branch_size as u32
    }
    fn cmp(&self, other: &Self) -> Ordering {
--- a/milli/src/search/criteria/mod.rs
+++ b/milli/src/search/criteria/mod.rs
@ -171,7 +171,7 @@ pub trait Context<'c> {
        &self,
        word: &str,
        in_prefix_cache: bool,
-    ) -> heed::Result<Box<dyn Iterator<Item = heed::Result<((&'c str, u32), RoaringBitmap)>> + 'c>>;
+    ) -> heed::Result<Box<dyn Iterator<Item = heed::Result<((&'c str, u16), RoaringBitmap)>> + 'c>>;
    fn synonyms(&self, word: &str) -> heed::Result<Option<Vec<Vec<String>>>>;
    fn searchable_fields_ids(&self) -> Result<Vec<FieldId>>;
    fn field_id_word_count_docids(
@ -322,11 +322,11 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> {
        &self,
        word: &str,
        in_prefix_cache: bool,
-    ) -> heed::Result<Box<dyn Iterator<Item = heed::Result<((&'c str, u32), RoaringBitmap)>> + 'c>>
+    ) -> heed::Result<Box<dyn Iterator<Item = heed::Result<((&'c str, u16), RoaringBitmap)>> + 'c>>
    {
        let range = {
-            let left = u32::min_value();
+            let left = u16::min_value(); // TODO: this is wrong
-            let right = u32::max_value();
+            let right = u16::max_value(); // TODO: this is wrong
            let left = (word, left);
            let right = (word, right);
            left..=right
@ -360,7 +360,7 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> {
    }
    fn word_position_docids(&self, word: &str, pos: u32) -> heed::Result<Option<RoaringBitmap>> {
-        let key = (word, pos);
+        let key = (word, pos as u16); // TODO: this is wrong
        self.index.word_position_docids.get(self.rtxn, &key)
    }
 }
@ -899,7 +899,7 @@ pub mod test {
            _word: &str,
            _in_prefix_cache: bool,
        ) -> heed::Result<
-            Box<dyn Iterator<Item = heed::Result<((&'c str, u32), RoaringBitmap)>> + 'c>,
+            Box<dyn Iterator<Item = heed::Result<((&'c str, u16), RoaringBitmap)>> + 'c>,
        > {
            todo!()
        }
--- a/milli/src/update/clear_documents.rs
+++ b/milli/src/update/clear_documents.rs
@ -28,8 +28,10 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
            word_prefix_pair_proximity_docids,
            prefix_word_pair_proximity_docids,
            word_position_docids,
            word_fid_docids,
            field_id_word_count_docids,
            word_prefix_position_docids,
            word_prefix_fid_docids,
            script_language_docids,
            facet_id_f64_docids,
            facet_id_string_docids,
@ -81,8 +83,10 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
        word_prefix_pair_proximity_docids.clear(self.wtxn)?;
        prefix_word_pair_proximity_docids.clear(self.wtxn)?;
        word_position_docids.clear(self.wtxn)?;
        word_fid_docids.clear(self.wtxn)?;
        field_id_word_count_docids.clear(self.wtxn)?;
        word_prefix_position_docids.clear(self.wtxn)?;
        word_prefix_fid_docids.clear(self.wtxn)?;
        script_language_docids.clear(self.wtxn)?;
        facet_id_f64_docids.clear(self.wtxn)?;
        facet_id_exists_docids.clear(self.wtxn)?;
--- a/milli/src/update/delete_documents.rs
+++ b/milli/src/update/delete_documents.rs
@ -2,8 +2,8 @@ use std::collections::btree_map::Entry;
 use std::collections::{HashMap, HashSet};
 use fst::IntoStreamer;
-use heed::types::{ByteSlice, DecodeIgnore, Str};
+use heed::types::{ByteSlice, DecodeIgnore, Str, UnalignedSlice};
-use heed::Database;
+use heed::{BytesDecode, BytesEncode, Database, RwIter};
 use roaring::RoaringBitmap;
 use serde::{Deserialize, Serialize};
 use time::OffsetDateTime;
@ -239,6 +239,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
            prefix_word_pair_proximity_docids,
            word_position_docids,
            word_prefix_position_docids,
            word_fid_docids,
            word_prefix_fid_docids,
            facet_id_f64_docids: _,
            facet_id_string_docids: _,
            field_id_docid_facet_f64s: _,
@ -361,97 +363,34 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
        for db in [word_prefix_pair_proximity_docids, prefix_word_pair_proximity_docids] {
            // We delete the documents ids from the word prefix pair proximity database docids
            // and remove the empty pairs too.
-            let db = db.remap_key_type::<ByteSlice>();
+            Self::delete_from_db(db.iter_mut(self.wtxn)?.remap_key_type(), &self.to_delete_docids)?;
            let mut iter = db.iter_mut(self.wtxn)?;
            while let Some(result) = iter.next() {
                let (key, mut docids) = result?;
                let previous_len = docids.len();
                docids -= &self.to_delete_docids;
                if docids.is_empty() {
                    // safety: we don't keep references from inside the LMDB database.
                    unsafe { iter.del_current()? };
                } else if docids.len() != previous_len {
                    let key = key.to_owned();
                    // safety: we don't keep references from inside the LMDB database.
                    unsafe { iter.put_current(&key, &docids)? };
        }
-            }
+        Self::delete_from_db(
-        }
+            word_pair_proximity_docids.iter_mut(self.wtxn)?.remap_key_type(),
-
+            &self.to_delete_docids,
-        // We delete the documents ids that are under the pairs of words,
+        )?;
-        // it is faster and use no memory to iterate over all the words pairs than
+        Self::delete_from_db(
-        // to compute the cartesian product of every words of the deleted documents.
+            word_position_docids.iter_mut(self.wtxn)?.remap_key_type(),
-        let mut iter =
+            &self.to_delete_docids,
-            word_pair_proximity_docids.remap_key_type::<ByteSlice>().iter_mut(self.wtxn)?;
+        )?;
-        while let Some(result) = iter.next() {
+        Self::delete_from_db(
-            let (bytes, mut docids) = result?;
+            word_prefix_position_docids.iter_mut(self.wtxn)?.remap_key_type(),
-            let previous_len = docids.len();
+            &self.to_delete_docids,
-            docids -= &self.to_delete_docids;
+        )?;
-            if docids.is_empty() {
+        Self::delete_from_db(
-                // safety: we don't keep references from inside the LMDB database.
+            word_fid_docids.iter_mut(self.wtxn)?.remap_key_type(),
-                unsafe { iter.del_current()? };
+            &self.to_delete_docids,
-            } else if docids.len() != previous_len {
+        )?;
-                let bytes = bytes.to_owned();
+        Self::delete_from_db(
-                // safety: we don't keep references from inside the LMDB database.
+            word_prefix_fid_docids.iter_mut(self.wtxn)?.remap_key_type(),
-                unsafe { iter.put_current(&bytes, &docids)? };
+            &self.to_delete_docids,
-            }
+        )?;
        }
        drop(iter);
        // We delete the documents ids that are under the word level position docids.
        let mut iter = word_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>();
        while let Some(result) = iter.next() {
            let (bytes, mut docids) = result?;
            let previous_len = docids.len();
            docids -= &self.to_delete_docids;
            if docids.is_empty() {
                // safety: we don't keep references from inside the LMDB database.
                unsafe { iter.del_current()? };
            } else if docids.len() != previous_len {
                let bytes = bytes.to_owned();
                // safety: we don't keep references from inside the LMDB database.
                unsafe { iter.put_current(&bytes, &docids)? };
            }
        }
        drop(iter);
        // We delete the documents ids that are under the word prefix level position docids.
        let mut iter =
            word_prefix_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>();
        while let Some(result) = iter.next() {
            let (bytes, mut docids) = result?;
            let previous_len = docids.len();
            docids -= &self.to_delete_docids;
            if docids.is_empty() {
                // safety: we don't keep references from inside the LMDB database.
                unsafe { iter.del_current()? };
            } else if docids.len() != previous_len {
                let bytes = bytes.to_owned();
                // safety: we don't keep references from inside the LMDB database.
                unsafe { iter.put_current(&bytes, &docids)? };
            }
        }
        drop(iter);
        // Remove the documents ids from the field id word count database.
-        let mut iter = field_id_word_count_docids.iter_mut(self.wtxn)?;
+        Self::delete_from_db(
-        while let Some((key, mut docids)) = iter.next().transpose()? {
+            field_id_word_count_docids.iter_mut(self.wtxn)?.remap_key_type(),
-            let previous_len = docids.len();
+            &self.to_delete_docids,
-            docids -= &self.to_delete_docids;
+        )?;
            if docids.is_empty() {
                // safety: we don't keep references from inside the LMDB database.
                unsafe { iter.del_current()? };
            } else if docids.len() != previous_len {
                let key = key.to_owned();
                // safety: we don't keep references from inside the LMDB database.
                unsafe { iter.put_current(&key, &docids)? };
            }
        }
        drop(iter);
        if let Some(mut rtree) = self.index.geo_rtree(self.wtxn)? {
            let mut geo_faceted_doc_ids = self.index.geo_faceted_documents_ids(self.wtxn)?;
@ -501,21 +440,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
        }
        // Remove the documents ids from the script language database.
-        let mut iter = script_language_docids.iter_mut(self.wtxn)?;
+        Self::delete_from_db(
-        while let Some((key, mut docids)) = iter.next().transpose()? {
+            script_language_docids.iter_mut(self.wtxn)?.remap_key_type(),
-            let previous_len = docids.len();
+            &self.to_delete_docids,
-            docids -= &self.to_delete_docids;
+        )?;
            if docids.is_empty() {
                // safety: we don't keep references from inside the LMDB database.
                unsafe { iter.del_current()? };
            } else if docids.len() != previous_len {
                let key = key.to_owned();
                // safety: we don't keep references from inside the LMDB database.
                unsafe { iter.put_current(&key, &docids)? };
            }
        }
        drop(iter);
        // We delete the documents ids that are under the facet field id values.
        remove_docids_from_facet_id_exists_docids(
            self.wtxn,
@ -531,6 +459,30 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
            soft_deletion_used: false,
        })
    }
    /// Subtracts `to_delete_docids` from every docids bitmap yielded by `iter`.
    ///
    /// Entries whose bitmap becomes empty are deleted, entries whose bitmap shrank are
    /// rewritten in place, and entries that are unaffected are left untouched.
    fn delete_from_db<C>(
        mut iter: RwIter<UnalignedSlice<u8>, C>,
        to_delete_docids: &RoaringBitmap,
    ) -> Result<()>
    where
        C: for<'a> BytesDecode<'a, DItem = RoaringBitmap>
            + for<'a> BytesEncode<'a, EItem = RoaringBitmap>,
    {
        while let Some(entry) = iter.next() {
            let (key, mut remaining) = entry?;
            let len_before = remaining.len();
            remaining -= to_delete_docids;

            if remaining.is_empty() {
                // safety: we don't keep references from inside the LMDB database.
                unsafe { iter.del_current()? };
                continue;
            }

            // Nothing was removed from this entry: no need to write it back.
            if remaining.len() == len_before {
                continue;
            }

            // Copy the key out of the database before overwriting the current entry.
            let owned_key = key.to_owned();
            // safety: we don't keep references from inside the LMDB database.
            unsafe { iter.put_current(&owned_key, &remaining)? };
        }
        Ok(())
    }
 }
 fn remove_from_word_prefix_docids(
--- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs
@ -7,14 +7,17 @@ use super::helpers::{
 };
 use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
-use crate::{DocumentId, Result};
+use crate::{
    absolute_from_relative_position, bucketed_position, relative_from_absolute_position,
    DocumentId, Result,
 };
 /// Extracts the word positions and the documents ids where this word appear.
 ///
 /// Returns a grenad reader with the list of extracted words at positions and
 /// documents ids from the given chunk of docid word positions.
 #[logging_timer::time]
-pub fn extract_word_position_docids<R: io::Read + io::Seek>(
+pub fn extract_word_fid_and_position_docids<R: io::Read + io::Seek>(
    docid_word_positions: grenad::Reader<R>,
    indexer: GrenadParameters,
 ) -> Result<grenad::Reader<File>> {
@ -39,11 +42,15 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
        for position in read_u32_ne_bytes(value) {
            key_buffer.clear();
            key_buffer.extend_from_slice(word_bytes);
            let (fid, position) = relative_from_absolute_position(position);
            let position = bucketed_position(position);
            let position = absolute_from_relative_position(fid, position);
            key_buffer.extend_from_slice(&position.to_be_bytes());
            word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
        }
    }
-    sorter_into_reader(word_position_docids_sorter, indexer)
+    let word_position_docids_reader = sorter_into_reader(word_position_docids_sorter, indexer)?;
    Ok(word_position_docids_reader)
 }
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@ -23,7 +23,7 @@ use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
 use self::extract_geo_points::extract_geo_points;
 use self::extract_word_docids::extract_word_docids;
 use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
-use self::extract_word_position_docids::extract_word_position_docids;
+use self::extract_word_position_docids::extract_word_fid_and_position_docids;
 use super::helpers::{
    as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap,
    GrenadParameters, MergeFn, MergeableReader,
@ -133,7 +133,7 @@ pub(crate) fn data_from_obkv_documents(
        docid_word_positions_chunks,
        indexer,
        lmdb_writer_sx.clone(),
-        extract_word_position_docids,
+        extract_word_fid_and_position_docids,
        merge_cbo_roaring_bitmaps,
        TypedChunk::WordPositionDocids,
        "word-position-docids",
--- a/milli/src/update/words_prefix_position_docids.rs
+++ b/milli/src/update/words_prefix_position_docids.rs
@ -8,13 +8,13 @@ use heed::{BytesDecode, BytesEncode};
 use log::debug;
 use crate::error::SerializationError;
-use crate::heed_codec::StrBEU32Codec;
+use crate::heed_codec::{StrBEU16Codec, StrBEU32Codec};
 use crate::index::main_key::WORDS_PREFIXES_FST_KEY;
 use crate::update::index_documents::{
    create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key,
    CursorClonableMmap, MergeFn,
 };
-use crate::{Index, Result};
+use crate::{bucketed_position, relative_from_absolute_position, Index, Result};
 pub struct WordPrefixPositionDocids<'t, 'u, 'i> {
    wtxn: &'t mut heed::RwTxn<'i, 'u>,
@ -82,6 +82,7 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> {
            let mut prefixes_cache = HashMap::new();
            while let Some((key, data)) = new_word_position_docids_iter.move_on_next()? {
                let (word, pos) = StrBEU32Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?;
                let (_fid, pos) = relative_from_absolute_position(pos);
                current_prefixes = match current_prefixes.take() {
                    Some(prefixes) if word.starts_with(&prefixes[0]) => Some(prefixes),
@ -127,12 +128,12 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> {
            let iter = db
                .remap_key_type::<ByteSlice>()
                .prefix_iter(self.wtxn, prefix_bytes.as_bytes())?
-                .remap_key_type::<StrBEU32Codec>();
+                .remap_key_type::<StrBEU16Codec>();
            for result in iter {
                let ((word, pos), data) = result?;
                if word.starts_with(prefix) {
                    let key = (prefix, pos);
-                    let bytes = StrBEU32Codec::bytes_encode(&key).unwrap();
+                    let bytes = StrBEU16Codec::bytes_encode(&key).unwrap();
                    prefix_position_docids_sorter.insert(bytes, data)?;
                }
            }