From 5440f43fd3be28981933bd55e126ee88ed3324e8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= <loic.lecrenier@me.com>
Date: Wed, 5 Apr 2023 14:55:02 +0200
Subject: [PATCH] Fix indexing of word_position_docid and fid

---
 milli/src/snapshot_tests.rs                   |  8 +++
 .../extract/extract_word_fid_docids.rs        | 48 ++++++++++++++++
 .../extract/extract_word_position_docids.rs   |  4 +-
 .../src/update/index_documents/extract/mod.rs | 17 +++++-
 milli/src/update/index_documents/mod.rs       | 57 +++++++++++++++++++
 .../src/update/index_documents/typed_chunk.rs | 12 ++++
 6 files changed, 141 insertions(+), 5 deletions(-)
 create mode 100644 milli/src/update/index_documents/extract/extract_word_fid_docids.rs
diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs
index f7f1a97e6..eb94c4be9 100644
--- a/milli/src/snapshot_tests.rs
+++ b/milli/src/snapshot_tests.rs
@@ -248,6 +248,11 @@ pub fn snap_word_position_docids(index: &Index) -> String {
         &format!("{word:<16} {position:<6} {}", display_bitmap(&b))
     })
 }
+pub fn snap_word_fid_docids(index: &Index) -> String {
+    make_db_snap_from_iter!(index, word_fid_docids, |((word, fid), b)| {
+        &format!("{word:<16} {fid:<3} {}", display_bitmap(&b))
+    })
+}
 pub fn snap_field_id_word_count_docids(index: &Index) -> String {
     make_db_snap_from_iter!(index, field_id_word_count_docids, |((field_id, word_count), b)| {
         &format!("{field_id:<3} {word_count:<6} {}", display_bitmap(&b))
@@ -477,6 +482,9 @@ macro_rules! full_snap_of_db {
     ($index:ident, word_position_docids) => {{
         $crate::snapshot_tests::snap_word_position_docids(&$index)
     }};
+    ($index:ident, word_fid_docids) => {{
+        $crate::snapshot_tests::snap_word_fid_docids(&$index)
+    }};
     ($index:ident, field_id_word_count_docids) => {{
         $crate::snapshot_tests::snap_field_id_word_count_docids(&$index)
     }};
diff --git a/milli/src/update/index_documents/extract/extract_word_fid_docids.rs b/milli/src/update/index_documents/extract/extract_word_fid_docids.rs
new file mode 100644
index 000000000..72b30cddf
--- /dev/null
+++ b/milli/src/update/index_documents/extract/extract_word_fid_docids.rs
@@ -0,0 +1,48 @@
+use std::fs::File;
+use std::io;
+
+use super::helpers::{
+    create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
+    try_split_array_at, GrenadParameters,
+};
+use crate::error::SerializationError;
+use crate::index::db_name::DOCID_WORD_POSITIONS;
+use crate::{relative_from_absolute_position, DocumentId, Result};
+
+/// Extracts the word, the field id, and the ids of the documents where this word appears in this field.
+#[logging_timer::time]
+pub fn extract_word_fid_docids<R: io::Read + io::Seek>(
+    docid_word_positions: grenad::Reader<R>,
+    indexer: GrenadParameters,
+) -> Result<grenad::Reader<File>> {
+    let max_memory = indexer.max_memory_by_thread();
+
+    let mut word_fid_docids_sorter = create_sorter(
+        grenad::SortAlgorithm::Unstable,
+        merge_cbo_roaring_bitmaps,
+        indexer.chunk_compression_type,
+        indexer.chunk_compression_level,
+        indexer.max_nb_chunks,
+        max_memory,
+    );
+
+    let mut key_buffer = Vec::new();
+    let mut cursor = docid_word_positions.into_cursor()?;
+    while let Some((key, value)) = cursor.move_on_next()? {
+        let (document_id_bytes, word_bytes) = try_split_array_at(key)
+            .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?;
+        let document_id = DocumentId::from_be_bytes(document_id_bytes);
+
+        for position in read_u32_ne_bytes(value) {
+            key_buffer.clear();
+            key_buffer.extend_from_slice(word_bytes);
+            let (fid, _) = relative_from_absolute_position(position);
+            key_buffer.extend_from_slice(&fid.to_be_bytes());
+            word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
+        }
+    }
+
+    let word_fid_docids_reader = sorter_into_reader(word_fid_docids_sorter, indexer)?;
+
+    Ok(word_fid_docids_reader)
+}
diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs
index 734cf8778..80a36c308 100644
--- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs
@@ -14,7 +14,7 @@ use crate::{bucketed_position, relative_from_absolute_position, DocumentId, Resu
 /// Returns a grenad reader with the list of extracted words at positions and
 /// documents ids from the given chunk of docid word positions.
 #[logging_timer::time]
-pub fn extract_word_fid_and_position_docids<R: io::Read + io::Seek>(
+pub fn extract_word_position_docids<R: io::Read + io::Seek>(
     docid_word_positions: grenad::Reader<R>,
     indexer: GrenadParameters,
 ) -> Result<grenad::Reader<File>> {
@@ -39,7 +39,7 @@ pub fn extract_word_fid_and_position_docids<R: io::Read + io::Seek>(
         for position in read_u32_ne_bytes(value) {
             key_buffer.clear();
             key_buffer.extend_from_slice(word_bytes);
-            let (_fid, position) = relative_from_absolute_position(position);
+            let (_, position) = relative_from_absolute_position(position);
             let position = bucketed_position(position);
             key_buffer.extend_from_slice(&position.to_be_bytes());
             word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?;
diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
index 844efed36..db041de6f 100644
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -5,6 +5,7 @@ mod extract_fid_docid_facet_values;
 mod extract_fid_word_count_docids;
 mod extract_geo_points;
 mod extract_word_docids;
+mod extract_word_fid_docids;
 mod extract_word_pair_proximity_docids;
 mod extract_word_position_docids;
 
@@ -22,8 +23,9 @@ use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values;
 use self::extract_fid_word_count_docids::extract_fid_word_count_docids;
 use self::extract_geo_points::extract_geo_points;
 use self::extract_word_docids::extract_word_docids;
+use self::extract_word_fid_docids::extract_word_fid_docids;
 use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids;
-use self::extract_word_position_docids::extract_word_fid_and_position_docids;
+use self::extract_word_position_docids::extract_word_position_docids;
 use super::helpers::{
     as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap,
     GrenadParameters, MergeFn, MergeableReader,
@@ -130,14 +132,23 @@ pub(crate) fn data_from_obkv_documents(
     );
 
     spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
-        docid_word_positions_chunks,
+        docid_word_positions_chunks.clone(),
         indexer,
         lmdb_writer_sx.clone(),
-        extract_word_fid_and_position_docids,
+        extract_word_position_docids,
         merge_cbo_roaring_bitmaps,
         TypedChunk::WordPositionDocids,
         "word-position-docids",
     );
+    spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
+        docid_word_positions_chunks,
+        indexer,
+        lmdb_writer_sx.clone(),
+        extract_word_fid_docids,
+        merge_cbo_roaring_bitmaps,
+        TypedChunk::WordFidDocids,
+        "word-fid-docids",
+    );
 
     spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
         docid_fid_facet_strings_chunks,
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index ade217beb..235b35fc8 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -2255,4 +2255,61 @@ mod tests {
         {"id":1,"catto":"jorts"}
         "###);
     }
+
+    #[test]
+    fn test_word_fid_position() {
+        let index = TempIndex::new();
+
+        index
+            .add_documents(documents!([
+              {"id": 0, "text": "sun flowers are looking at the sun" },
+              {"id": 1, "text": "sun flowers are looking at the sun" },
+              {"id": 2, "text": "the sun is shining today" },
+              {
+                "id": 3,
+                "text": "a a a a a a a a a a a a a a a a a
+                a a a a a a a a a a a a a a a a a a a a a a a a a a 
+                a a a a a a a a a a a a a a a a a a a a a a a a a a 
+                a a a a a a a a a a a a a a a a a a a a a a a a a a 
+                a a a a a a a a a a a a a a a a a a a a a a a a a a 
+                a a a a a a a a a a a a a a a a a a a a a a a a a a 
+                a a a a a a a a a a a a a a a a a a a a a "
+             }
+            ]))
+            .unwrap();
+
+        db_snap!(index, word_fid_docids, 1, @"bf3355e493330de036c8823ddd1dbbd9");
+        db_snap!(index, word_position_docids, 1, @"896d54b29ed79c4c6f14084f326dcf6f");
+
+        index
+            .add_documents(documents!([
+              {"id": 4, "text": "sun flowers are looking at the sun" },
+              {"id": 5, "text2": "sun flowers are looking at the sun" },
+              {"id": 6, "text": "b b b" },
+              {
+                "id": 7,
+                "text2": "a a a a"
+             }
+            ]))
+            .unwrap();
+
+        db_snap!(index, word_fid_docids, 2, @"a48d3f88db33f94bc23110a673ea49e4");
+        db_snap!(index, word_position_docids, 2, @"3c9e66c6768ae2cf42b46b2c46e46a83");
+
+        let mut wtxn = index.write_txn().unwrap();
+
+        // Delete only some of the documents, not all of them.
+        let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap();
+        builder.strategy(DeletionStrategy::AlwaysHard);
+        builder.delete_external_id("0");
+        builder.delete_external_id("3");
+        let result = builder.execute().unwrap();
+        println!("{result:?}");
+
+        wtxn.commit().unwrap();
+
+        db_snap!(index, word_fid_docids, 3, @"4c2e2a1832e5802796edc1638136d933");
+        db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f");
+        db_snap!(index, docid_word_positions, 3, @"5287245332627675740b28bd46e1cde1");
+    }
 }
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index b9b11cfa8..14ba021bd 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -35,6 +35,7 @@ pub(crate) enum TypedChunk {
         exact_word_docids_reader: grenad::Reader<File>,
     },
     WordPositionDocids(grenad::Reader<File>),
+    WordFidDocids(grenad::Reader<File>),
     WordPairProximityDocids(grenad::Reader<File>),
     FieldIdFacetStringDocids(grenad::Reader<File>),
     FieldIdFacetNumberDocids(grenad::Reader<File>),
@@ -140,6 +141,17 @@ pub(crate) fn write_typed_chunk_into_index(
             )?;
             is_merged_database = true;
         }
+        TypedChunk::WordFidDocids(word_fid_docids_iter) => {
+            append_entries_into_database(
+                word_fid_docids_iter,
+                &index.word_fid_docids,
+                wtxn,
+                index_is_empty,
+                |value, _buffer| Ok(value),
+                merge_cbo_roaring_bitmaps,
+            )?;
+            is_merged_database = true;
+        }
         TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids_iter) => {
             let indexer = FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter);
             indexer.execute(wtxn)?;