Split position DB into fid and relative position DB

2025-01-31 15:31:53 +08:00 · 2023-03-23 09:22:01 +01:00 · 2023-03-23 09:22:01 +01:00 · 9b2653427d
commit 9b2653427d
parent 56b7209f26
11 changed files with 162 additions and 135 deletions
--- a/milli/src/heed_codec/mod.rs
+++ b/milli/src/heed_codec/mod.rs
@ -21,5 +21,5 @@ pub use self::roaring_bitmap_length::{
    BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec,
 };
 pub use self::script_language_codec::ScriptLanguageCodec;
-pub use self::str_beu32_codec::StrBEU32Codec;
+pub use self::str_beu32_codec::{StrBEU32Codec, StrBEU16Codec};
 pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec};
--- a/milli/src/heed_codec/str_beu32_codec.rs
+++ b/milli/src/heed_codec/str_beu32_codec.rs
@ -36,3 +36,37 @@ impl<'a> heed::BytesEncode<'a> for StrBEU32Codec {
        Some(Cow::Owned(bytes))
    }
 }
+
+/// Codec for keys made of a UTF-8 string followed by a big-endian `u16`.
+pub struct StrBEU16Codec;
+
+impl<'a> heed::BytesDecode<'a> for StrBEU16Codec {
+    type DItem = (&'a str, u16);
+
+    /// Splits `bytes` into a leading `str` and a trailing big-endian `u16`.
+    ///
+    /// Returns `None` when the input is shorter than a `u16` or when the
+    /// leading part is not valid UTF-8.
+    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+        // The integer occupies the last `size_of::<u16>()` bytes of the input.
+        let footer_len = size_of::<u16>();
+
+        if bytes.len() < footer_len {
+            return None;
+        }
+
+        // Everything before the footer is the string part; it may be empty.
+        let (word, bytes) = bytes.split_at(bytes.len() - footer_len);
+        let word = str::from_utf8(word).ok()?;
+        let pos = bytes.try_into().map(u16::from_be_bytes).ok()?;
+
+        Some((word, pos))
+    }
+}
+
+impl<'a> heed::BytesEncode<'a> for StrBEU16Codec {
+    type EItem = (&'a str, u16);
+
+    /// Encodes the pair as the bytes of `word` followed by `pos` in big-endian order.
+    ///
+    /// Always returns `Some` with a freshly allocated buffer.
+    fn bytes_encode((word, pos): &Self::EItem) -> Option<Cow<[u8]>> {
+        let pos = pos.to_be_bytes();
+
+        // Allocate once: the string bytes plus the two bytes of the integer.
+        let mut bytes = Vec::with_capacity(word.len() + pos.len());
+        bytes.extend_from_slice(word.as_bytes());
+        bytes.extend_from_slice(&pos[..]);
+
+        Some(Cow::Owned(bytes))
+    }
+}
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@ -19,12 +19,12 @@ use crate::heed_codec::facet::{
    FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
    FieldIdCodec, OrderedF64Codec,
 };
-use crate::heed_codec::{ScriptLanguageCodec, StrRefCodec};
+use crate::heed_codec::{ScriptLanguageCodec, StrBEU16Codec, StrRefCodec};
 use crate::{
    default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion,
    DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId,
    FieldIdWordCountCodec, GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec,
-    Search, StrBEU32Codec, U8StrStrCodec, BEU16, BEU32,
+    Search, U8StrStrCodec, BEU16, BEU32,
 };

 pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;
@ -76,7 +76,9 @@ pub mod db_name {
    pub const WORD_PREFIX_PAIR_PROXIMITY_DOCIDS: &str = "word-prefix-pair-proximity-docids";
    pub const PREFIX_WORD_PAIR_PROXIMITY_DOCIDS: &str = "prefix-word-pair-proximity-docids";
    pub const WORD_POSITION_DOCIDS: &str = "word-position-docids";
+    pub const WORD_FIELD_ID_DOCIDS: &str = "word-field-id-docids";
    pub const WORD_PREFIX_POSITION_DOCIDS: &str = "word-prefix-position-docids";
+    pub const WORD_PREFIX_FIELD_ID_DOCIDS: &str = "word-prefix-field-id-docids";
    pub const FIELD_ID_WORD_COUNT_DOCIDS: &str = "field-id-word-count-docids";
    pub const FACET_ID_F64_DOCIDS: &str = "facet-id-f64-docids";
    pub const FACET_ID_EXISTS_DOCIDS: &str = "facet-id-exists-docids";
@ -118,11 +120,16 @@ pub struct Index {
    pub prefix_word_pair_proximity_docids: Database<U8StrStrCodec, CboRoaringBitmapCodec>,

    /// Maps the word and the position with the docids that corresponds to it.
-    pub word_position_docids: Database<StrBEU32Codec, CboRoaringBitmapCodec>,
+    pub word_position_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
+    /// Maps the word and the field id with the docids that corresponds to it.
+    pub word_fid_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
+
    /// Maps the field id and the word count with the docids that corresponds to it.
    pub field_id_word_count_docids: Database<FieldIdWordCountCodec, CboRoaringBitmapCodec>,
    /// Maps the position of a word prefix with all the docids where this prefix appears.
-    pub word_prefix_position_docids: Database<StrBEU32Codec, CboRoaringBitmapCodec>,
+    pub word_prefix_position_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,
+    /// Maps the word prefix and the field id with the docids that correspond to it.
+    pub word_prefix_fid_docids: Database<StrBEU16Codec, CboRoaringBitmapCodec>,

    /// Maps the script and language with all the docids that corresponds to it.
    pub script_language_docids: Database<ScriptLanguageCodec, RoaringBitmapCodec>,
@ -153,7 +160,7 @@ impl Index {
    ) -> Result<Index> {
        use db_name::*;

-        options.max_dbs(19);
+        options.max_dbs(21);
        unsafe { options.flag(Flags::MdbAlwaysFreePages) };

        let env = options.open(path)?;
@ -170,8 +177,10 @@ impl Index {
        let prefix_word_pair_proximity_docids =
            env.create_database(Some(PREFIX_WORD_PAIR_PROXIMITY_DOCIDS))?;
        let word_position_docids = env.create_database(Some(WORD_POSITION_DOCIDS))?;
+        let word_fid_docids = env.create_database(Some(WORD_FIELD_ID_DOCIDS))?;
        let field_id_word_count_docids = env.create_database(Some(FIELD_ID_WORD_COUNT_DOCIDS))?;
        let word_prefix_position_docids = env.create_database(Some(WORD_PREFIX_POSITION_DOCIDS))?;
+        let word_prefix_fid_docids = env.create_database(Some(WORD_PREFIX_FIELD_ID_DOCIDS))?;
        let facet_id_f64_docids = env.create_database(Some(FACET_ID_F64_DOCIDS))?;
        let facet_id_string_docids = env.create_database(Some(FACET_ID_STRING_DOCIDS))?;
        let facet_id_exists_docids = env.create_database(Some(FACET_ID_EXISTS_DOCIDS))?;
@ -196,7 +205,9 @@ impl Index {
            word_prefix_pair_proximity_docids,
            prefix_word_pair_proximity_docids,
            word_position_docids,
+            word_fid_docids,
            word_prefix_position_docids,
+            word_prefix_fid_docids,
            field_id_word_count_docids,
            facet_id_f64_docids,
            facet_id_string_docids,
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@ -152,6 +152,23 @@ pub fn relative_from_absolute_position(absolute: Position) -> (FieldId, Relative
 pub fn absolute_from_relative_position(field_id: FieldId, relative: RelativePosition) -> Position {
    (field_id as u32) << 16 | (relative as u32)
 }
+// TODO: this is wrong, but will do for now
+/// Compute the "bucketed" relative position from a relative position in a field.
+///
+/// In a bucketed position, the accuracy of the relative position is reduced exponentially as it gets larger.
+///
+/// Positions below 16 are returned unchanged, positions from 16 to 23 map to 24, and
+/// larger positions map to the smallest power of 2 that is greater than or equal to them.
+pub fn bucketed_position(relative: u16) -> u16 {
+    // The first few relative positions are kept intact.
+    if relative < 16 {
+        relative
+    } else if relative < 24 {
+        // Relative positions from 16 to 23 (inclusive) all become equal to 24
+        24
+    } else {
+        // Then, positions that share the same rounded-up base-2 logarithm are reduced to
+        // the same relative position: the smallest power of 2 that is greater than or equal to them.
+        // NOTE(review): 2^16 does not fit in a `u16`, so inputs above 32768 saturate to
+        // `u16::MAX` through the `as` cast; this also assumes `f64::log2` is exact on powers of 2.
+        (relative as f64).log2().ceil().exp2() as u16
+    }
+}

 /// Transform a raw obkv store into a JSON Object.
 pub fn obkv_to_json(
--- a/milli/src/search/criteria/attribute.rs
+++ b/milli/src/search/criteria/attribute.rs
@ -199,7 +199,7 @@ impl<'t> Criterion for Attribute<'t> {
 struct QueryPositionIterator<'t> {
    #[allow(clippy::type_complexity)]
    inner:
-        Vec<Peekable<Box<dyn Iterator<Item = heed::Result<((&'t str, u32), RoaringBitmap)>> + 't>>>,
+        Vec<Peekable<Box<dyn Iterator<Item = heed::Result<((&'t str, u16), RoaringBitmap)>> + 't>>>,
 }

 impl<'t> QueryPositionIterator<'t> {
@ -241,7 +241,7 @@ impl<'t> QueryPositionIterator<'t> {
 }

 impl<'t> Iterator for QueryPositionIterator<'t> {
-    type Item = heed::Result<(u32, RoaringBitmap)>;
+    type Item = heed::Result<(u16, RoaringBitmap)>;

    fn next(&mut self) -> Option<Self::Item> {
        // sort inner words from the closest next position to the farthest next position.
@ -281,9 +281,9 @@ impl<'t> Iterator for QueryPositionIterator<'t> {
 /// A Branch is represent a possible alternative of the original query and is build with the Query Tree,
 /// This branch allows us to iterate over meta-interval of positions.
 struct Branch<'t> {
-    query_level_iterator: Vec<(u32, RoaringBitmap, Peekable<QueryPositionIterator<'t>>)>,
-    last_result: (u32, RoaringBitmap),
-    branch_size: u32,
+    query_level_iterator: Vec<(u16, RoaringBitmap, Peekable<QueryPositionIterator<'t>>)>,
+    last_result: (u16, RoaringBitmap),
+    branch_size: u16,
 }

 impl<'t> Branch<'t> {
@ -303,7 +303,7 @@ impl<'t> Branch<'t> {
        let mut branch = Self {
            query_level_iterator,
            last_result: (0, RoaringBitmap::new()),
-            branch_size: flatten_branch.len() as u32,
+            branch_size: flatten_branch.len() as u16,
        };

        branch.update_last_result();
@ -342,7 +342,7 @@ impl<'t> Branch<'t> {
                        Some(result) => {
                            result.as_ref().map(|(next_pos, _)| *next_pos - *pos).unwrap_or(0)
                        }
-                        None => u32::MAX,
+                        None => u16::MAX,
                    }
                }
            })
@ -378,7 +378,8 @@ impl<'t> Branch<'t> {
    fn compute_rank(&self) -> u32 {
        // we compute a rank from the position.
        let (pos, _) = self.last_result;
-        pos.saturating_sub((0..self.branch_size).sum()) * LCM_10_FIRST_NUMBERS / self.branch_size
+        pos.saturating_sub((0..self.branch_size).sum()) as u32 * LCM_10_FIRST_NUMBERS
+            / self.branch_size as u32
    }

    fn cmp(&self, other: &Self) -> Ordering {
--- a/milli/src/search/criteria/mod.rs
+++ b/milli/src/search/criteria/mod.rs
@ -171,7 +171,7 @@ pub trait Context<'c> {
        &self,
        word: &str,
        in_prefix_cache: bool,
-    ) -> heed::Result<Box<dyn Iterator<Item = heed::Result<((&'c str, u32), RoaringBitmap)>> + 'c>>;
+    ) -> heed::Result<Box<dyn Iterator<Item = heed::Result<((&'c str, u16), RoaringBitmap)>> + 'c>>;
    fn synonyms(&self, word: &str) -> heed::Result<Option<Vec<Vec<String>>>>;
    fn searchable_fields_ids(&self) -> Result<Vec<FieldId>>;
    fn field_id_word_count_docids(
@ -322,11 +322,11 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> {
        &self,
        word: &str,
        in_prefix_cache: bool,
-    ) -> heed::Result<Box<dyn Iterator<Item = heed::Result<((&'c str, u32), RoaringBitmap)>> + 'c>>
+    ) -> heed::Result<Box<dyn Iterator<Item = heed::Result<((&'c str, u16), RoaringBitmap)>> + 'c>>
    {
        let range = {
-            let left = u32::min_value();
-            let right = u32::max_value();
+            let left = u16::min_value(); // TODO: this is wrong
+            let right = u16::max_value(); // TODO: this is wrong
            let left = (word, left);
            let right = (word, right);
            left..=right
@ -360,7 +360,7 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> {
    }

    fn word_position_docids(&self, word: &str, pos: u32) -> heed::Result<Option<RoaringBitmap>> {
-        let key = (word, pos);
+        let key = (word, pos as u16); // TODO: this is wrong
        self.index.word_position_docids.get(self.rtxn, &key)
    }
 }
@ -899,7 +899,7 @@ pub mod test {
            _word: &str,
            _in_prefix_cache: bool,
        ) -> heed::Result<
-            Box<dyn Iterator<Item = heed::Result<((&'c str, u32), RoaringBitmap)>> + 'c>,
+            Box<dyn Iterator<Item = heed::Result<((&'c str, u16), RoaringBitmap)>> + 'c>,
        > {
            todo!()
        }
--- a/milli/src/update/clear_documents.rs
+++ b/milli/src/update/clear_documents.rs
@ -28,8 +28,10 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
            word_prefix_pair_proximity_docids,
            prefix_word_pair_proximity_docids,
            word_position_docids,
+            word_fid_docids,
            field_id_word_count_docids,
            word_prefix_position_docids,
+            word_prefix_fid_docids,
            script_language_docids,
            facet_id_f64_docids,
            facet_id_string_docids,
@ -81,8 +83,10 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
        word_prefix_pair_proximity_docids.clear(self.wtxn)?;
        prefix_word_pair_proximity_docids.clear(self.wtxn)?;
        word_position_docids.clear(self.wtxn)?;
+        word_fid_docids.clear(self.wtxn)?;
        field_id_word_count_docids.clear(self.wtxn)?;
        word_prefix_position_docids.clear(self.wtxn)?;
+        word_prefix_fid_docids.clear(self.wtxn)?;
        script_language_docids.clear(self.wtxn)?;
        facet_id_f64_docids.clear(self.wtxn)?;
        facet_id_exists_docids.clear(self.wtxn)?;
--- a/milli/src/update/delete_documents.rs
+++ b/milli/src/update/delete_documents.rs
@ -2,8 +2,8 @@ use std::collections::btree_map::Entry;
 use std::collections::{HashMap, HashSet};

 use fst::IntoStreamer;
-use heed::types::{ByteSlice, DecodeIgnore, Str};
-use heed::Database;
+use heed::types::{ByteSlice, DecodeIgnore, Str, UnalignedSlice};
+use heed::{BytesDecode, BytesEncode, Database, RwIter};
 use roaring::RoaringBitmap;
 use serde::{Deserialize, Serialize};
 use time::OffsetDateTime;
@ -239,6 +239,8 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
            prefix_word_pair_proximity_docids,
            word_position_docids,
            word_prefix_position_docids,
+            word_fid_docids,
+            word_prefix_fid_docids,
            facet_id_f64_docids: _,
            facet_id_string_docids: _,
            field_id_docid_facet_f64s: _,
@ -361,97 +363,34 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
        for db in [word_prefix_pair_proximity_docids, prefix_word_pair_proximity_docids] {
            // We delete the documents ids from the word prefix pair proximity database docids
            // and remove the empty pairs too.
-            let db = db.remap_key_type::<ByteSlice>();
-            let mut iter = db.iter_mut(self.wtxn)?;
-            while let Some(result) = iter.next() {
-                let (key, mut docids) = result?;
-                let previous_len = docids.len();
-                docids -= &self.to_delete_docids;
-                if docids.is_empty() {
-                    // safety: we don't keep references from inside the LMDB database.
-                    unsafe { iter.del_current()? };
-                } else if docids.len() != previous_len {
-                    let key = key.to_owned();
-                    // safety: we don't keep references from inside the LMDB database.
-                    unsafe { iter.put_current(&key, &docids)? };
-                }
-            }
+            Self::delete_from_db(db.iter_mut(self.wtxn)?.remap_key_type(), &self.to_delete_docids)?;
        }
-
-        // We delete the documents ids that are under the pairs of words,
-        // it is faster and use no memory to iterate over all the words pairs than
-        // to compute the cartesian product of every words of the deleted documents.
-        let mut iter =
-            word_pair_proximity_docids.remap_key_type::<ByteSlice>().iter_mut(self.wtxn)?;
-        while let Some(result) = iter.next() {
-            let (bytes, mut docids) = result?;
-            let previous_len = docids.len();
-            docids -= &self.to_delete_docids;
-            if docids.is_empty() {
-                // safety: we don't keep references from inside the LMDB database.
-                unsafe { iter.del_current()? };
-            } else if docids.len() != previous_len {
-                let bytes = bytes.to_owned();
-                // safety: we don't keep references from inside the LMDB database.
-                unsafe { iter.put_current(&bytes, &docids)? };
-            }
-        }
-
-        drop(iter);
-
-        // We delete the documents ids that are under the word level position docids.
-        let mut iter = word_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>();
-        while let Some(result) = iter.next() {
-            let (bytes, mut docids) = result?;
-            let previous_len = docids.len();
-            docids -= &self.to_delete_docids;
-            if docids.is_empty() {
-                // safety: we don't keep references from inside the LMDB database.
-                unsafe { iter.del_current()? };
-            } else if docids.len() != previous_len {
-                let bytes = bytes.to_owned();
-                // safety: we don't keep references from inside the LMDB database.
-                unsafe { iter.put_current(&bytes, &docids)? };
-            }
-        }
-
-        drop(iter);
-
-        // We delete the documents ids that are under the word prefix level position docids.
-        let mut iter =
-            word_prefix_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>();
-        while let Some(result) = iter.next() {
-            let (bytes, mut docids) = result?;
-            let previous_len = docids.len();
-            docids -= &self.to_delete_docids;
-            if docids.is_empty() {
-                // safety: we don't keep references from inside the LMDB database.
-                unsafe { iter.del_current()? };
-            } else if docids.len() != previous_len {
-                let bytes = bytes.to_owned();
-                // safety: we don't keep references from inside the LMDB database.
-                unsafe { iter.put_current(&bytes, &docids)? };
-            }
-        }
-
-        drop(iter);
+        Self::delete_from_db(
+            word_pair_proximity_docids.iter_mut(self.wtxn)?.remap_key_type(),
+            &self.to_delete_docids,
+        )?;
+        Self::delete_from_db(
+            word_position_docids.iter_mut(self.wtxn)?.remap_key_type(),
+            &self.to_delete_docids,
+        )?;
+        Self::delete_from_db(
+            word_prefix_position_docids.iter_mut(self.wtxn)?.remap_key_type(),
+            &self.to_delete_docids,
+        )?;
+        Self::delete_from_db(
+            word_fid_docids.iter_mut(self.wtxn)?.remap_key_type(),
+            &self.to_delete_docids,
+        )?;
+        Self::delete_from_db(
+            word_prefix_fid_docids.iter_mut(self.wtxn)?.remap_key_type(),
+            &self.to_delete_docids,
+        )?;

        // Remove the documents ids from the field id word count database.
-        let mut iter = field_id_word_count_docids.iter_mut(self.wtxn)?;
-        while let Some((key, mut docids)) = iter.next().transpose()? {
-            let previous_len = docids.len();
-            docids -= &self.to_delete_docids;
-            if docids.is_empty() {
-                // safety: we don't keep references from inside the LMDB database.
-                unsafe { iter.del_current()? };
-            } else if docids.len() != previous_len {
-                let key = key.to_owned();
-                // safety: we don't keep references from inside the LMDB database.
-                unsafe { iter.put_current(&key, &docids)? };
-            }
-        }
-
-        drop(iter);
+        Self::delete_from_db(
+            field_id_word_count_docids.iter_mut(self.wtxn)?.remap_key_type(),
+            &self.to_delete_docids,
+        )?;

        if let Some(mut rtree) = self.index.geo_rtree(self.wtxn)? {
            let mut geo_faceted_doc_ids = self.index.geo_faceted_documents_ids(self.wtxn)?;
@ -501,21 +440,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
        }

        // Remove the documents ids from the script language database.
-        let mut iter = script_language_docids.iter_mut(self.wtxn)?;
-        while let Some((key, mut docids)) = iter.next().transpose()? {
-            let previous_len = docids.len();
-            docids -= &self.to_delete_docids;
-            if docids.is_empty() {
-                // safety: we don't keep references from inside the LMDB database.
-                unsafe { iter.del_current()? };
-            } else if docids.len() != previous_len {
-                let key = key.to_owned();
-                // safety: we don't keep references from inside the LMDB database.
-                unsafe { iter.put_current(&key, &docids)? };
-            }
-        }
-
-        drop(iter);
+        Self::delete_from_db(
+            script_language_docids.iter_mut(self.wtxn)?.remap_key_type(),
+            &self.to_delete_docids,
+        )?;
        // We delete the documents ids that are under the facet field id values.
        remove_docids_from_facet_id_exists_docids(
            self.wtxn,
@ -531,6 +459,30 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
            soft_deletion_used: false,
        })
    }
+
+    /// Removes `to_delete_docids` from every bitmap yielded by `iter`.
+    ///
+    /// Entries whose bitmap becomes empty are deleted, entries whose bitmap shrank are
+    /// written back under the same key, and entries that did not change are left as is.
+    ///
+    /// Returns an error as soon as reading, deleting or writing an entry fails.
+    fn delete_from_db<C>(
+        mut iter: RwIter<UnalignedSlice<u8>, C>,
+        to_delete_docids: &RoaringBitmap,
+    ) -> Result<()>
+    where
+        C: for<'a> BytesDecode<'a, DItem = RoaringBitmap>
+            + for<'a> BytesEncode<'a, EItem = RoaringBitmap>,
+    {
+        while let Some(result) = iter.next() {
+            let (bytes, mut docids) = result?;
+            // Remember the size so an unchanged bitmap is not rewritten.
+            let previous_len = docids.len();
+            docids -= to_delete_docids;
+            if docids.is_empty() {
+                // safety: we don't keep references from inside the LMDB database.
+                unsafe { iter.del_current()? };
+            } else if docids.len() != previous_len {
+                // Copy the key into an owned buffer before writing the entry back.
+                let bytes = bytes.to_owned();
+                // safety: we don't keep references from inside the LMDB database.
+                unsafe { iter.put_current(&bytes, &docids)? };
+            }
+        }
+        Ok(())
+    }
 }

 fn remove_from_word_prefix_docids(
--- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs
@ -7,14 +7,17 @@ use super::helpers::{
 };
 use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
-use crate::{DocumentId, Result};
+use crate::{
+    absolute_from_relative_position, bucketed_position, relative_from_absolute_position,
+    DocumentId, Result,
+};

 /// Extracts the word positions and the documents ids where this word appear.
 ///
 /// Returns a grenad reader with the list of extracted words at positions and
 /// documents ids from the given chunk of docid word positions.
 #[logging_timer::time]
-pub fn extract_word_position_docids<R: io::Read + io::Seek>(
+pub fn extract_word_fid_and_position_docids<R: io::Read + io::Seek>(
    docid_word_positions: grenad::Reader<R>,
    indexer: GrenadParameters,
 ) -> Result<grenad::Reader<File>> {
@ -39,11 +42,15 @@ pub fn extract_word_position_docids<R: io::Read + io::Seek>(
        for position in read_u32_ne_bytes(value) {
            key_buffer.clear();
            key_buffer.extend_from_slice(word_bytes);
+            let (fid, position) = relative_from_absolute_position(position);
+            let position = bucketed_position(position);
+            let position = absolute_from_relative_position(fid, position);
            key_buffer.extend_from_slice(&position.to_be_bytes());
-
            word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
        }
    }

-    sorter_into_reader(word_position_docids_sorter, indexer)
+    let word_position_docids_reader = sorter_into_reader(word_position_docids_sorter, indexer)?;
+
+    Ok(word_position_docids_reader)
 }
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@ -23,7 +23,7 @@ use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
 use self::extract_geo_points::extract_geo_points;
 use self::extract_word_docids::extract_word_docids;
 use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
-use self::extract_word_position_docids::extract_word_position_docids;
+use self::extract_word_position_docids::extract_word_fid_and_position_docids;
 use super::helpers::{
    as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap,
    GrenadParameters, MergeFn, MergeableReader,
@ -133,7 +133,7 @@ pub(crate) fn data_from_obkv_documents(
        docid_word_positions_chunks,
        indexer,
        lmdb_writer_sx.clone(),
-        extract_word_position_docids,
+        extract_word_fid_and_position_docids,
        merge_cbo_roaring_bitmaps,
        TypedChunk::WordPositionDocids,
        "word-position-docids",
--- a/milli/src/update/words_prefix_position_docids.rs
+++ b/milli/src/update/words_prefix_position_docids.rs
@ -8,13 +8,13 @@ use heed::{BytesDecode, BytesEncode};
 use log::debug;

 use crate::error::SerializationError;
-use crate::heed_codec::StrBEU32Codec;
+use crate::heed_codec::{StrBEU16Codec, StrBEU32Codec};
 use crate::index::main_key::WORDS_PREFIXES_FST_KEY;
 use crate::update::index_documents::{
    create_sorter, merge_cbo_roaring_bitmaps, sorter_into_lmdb_database, valid_lmdb_key,
    CursorClonableMmap, MergeFn,
 };
-use crate::{Index, Result};
+use crate::{bucketed_position, relative_from_absolute_position, Index, Result};

 pub struct WordPrefixPositionDocids<'t, 'u, 'i> {
    wtxn: &'t mut heed::RwTxn<'i, 'u>,
@ -82,6 +82,7 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> {
            let mut prefixes_cache = HashMap::new();
            while let Some((key, data)) = new_word_position_docids_iter.move_on_next()? {
                let (word, pos) = StrBEU32Codec::bytes_decode(key).ok_or(heed::Error::Decoding)?;
+                let (_fid, pos) = relative_from_absolute_position(pos);

                current_prefixes = match current_prefixes.take() {
                    Some(prefixes) if word.starts_with(&prefixes[0]) => Some(prefixes),
@ -127,12 +128,12 @@ impl<'t, 'u, 'i> WordPrefixPositionDocids<'t, 'u, 'i> {
            let iter = db
                .remap_key_type::<ByteSlice>()
                .prefix_iter(self.wtxn, prefix_bytes.as_bytes())?
-                .remap_key_type::<StrBEU32Codec>();
+                .remap_key_type::<StrBEU16Codec>();
            for result in iter {
                let ((word, pos), data) = result?;
                if word.starts_with(prefix) {
                    let key = (prefix, pos);
-                    let bytes = StrBEU32Codec::bytes_encode(&key).unwrap();
+                    let bytes = StrBEU16Codec::bytes_encode(&key).unwrap();
                    prefix_position_docids_sorter.insert(bytes, data)?;
                }
            }