From 0a77be4ec02f29df26242a6ffa7a94ddcb3b0724 Mon Sep 17 00:00:00 2001
From: ad hoc <postma.marin@protonmail.com>
Date: Thu, 24 Mar 2022 15:22:57 +0100
Subject: [PATCH] introduce exact_word_docids db

---
 milli/src/index.rs                            |  9 ++-
 milli/src/update/clear_documents.rs           |  2 +
 milli/src/update/delete_documents.rs          | 71 +++++++++++++------
 .../extract/extract_word_docids.rs            | 12 +++-
 .../src/update/index_documents/extract/mod.rs |  7 +-
 .../index_documents/helpers/grenad_helpers.rs |  5 ++
 .../src/update/index_documents/helpers/mod.rs |  2 +-
 milli/src/update/index_documents/mod.rs       | 18 +++--
 .../src/update/index_documents/typed_chunk.rs | 49 +++++++++----
 milli/src/update/word_prefix_docids.rs        |  5 +-
 10 files changed, 133 insertions(+), 47 deletions(-)
diff --git a/milli/src/index.rs b/milli/src/index.rs
index f4e17d93c..8f9c9beb7 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -59,6 +59,7 @@ pub mod main_key {
 pub mod db_name {
     pub const MAIN: &str = "main";
     pub const WORD_DOCIDS: &str = "word-docids";
+    pub const EXACT_WORD_DOCIDS: &str = "exact-word-docids";
     pub const WORD_PREFIX_DOCIDS: &str = "word-prefix-docids";
     pub const DOCID_WORD_POSITIONS: &str = "docid-word-positions";
     pub const WORD_PAIR_PROXIMITY_DOCIDS: &str = "word-pair-proximity-docids";
@@ -83,6 +84,10 @@ pub struct Index {
 
     /// A word and all the documents ids containing the word.
     pub word_docids: Database<Str, RoaringBitmapCodec>,
+
+    /// A word and all the documents ids containing the word, from attributes for which typos are not allowed.
+    pub exact_word_docids: Database<Str, RoaringBitmapCodec>,
+
     /// A prefix of word and all the documents ids containing this prefix.
     pub word_prefix_docids: Database<Str, RoaringBitmapCodec>,
 
@@ -119,12 +124,13 @@ impl Index {
     pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> Result<Index> {
         use db_name::*;
 
-        options.max_dbs(14);
+        options.max_dbs(15);
         unsafe { options.flag(Flags::MdbAlwaysFreePages) };
 
         let env = options.open(path)?;
         let main = env.create_poly_database(Some(MAIN))?;
         let word_docids = env.create_database(Some(WORD_DOCIDS))?;
+        let exact_word_docids = env.create_database(Some(EXACT_WORD_DOCIDS))?;
         let word_prefix_docids = env.create_database(Some(WORD_PREFIX_DOCIDS))?;
         let docid_word_positions = env.create_database(Some(DOCID_WORD_POSITIONS))?;
         let word_pair_proximity_docids = env.create_database(Some(WORD_PAIR_PROXIMITY_DOCIDS))?;
@@ -146,6 +152,7 @@ impl Index {
             env,
             main,
             word_docids,
+            exact_word_docids,
             word_prefix_docids,
             docid_word_positions,
             word_pair_proximity_docids,
diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs
index 644547b91..57c0969c7 100644
--- a/milli/src/update/clear_documents.rs
+++ b/milli/src/update/clear_documents.rs
@@ -19,6 +19,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
             env: _env,
             main: _main,
             word_docids,
+            exact_word_docids,
             word_prefix_docids,
             docid_word_positions,
             word_pair_proximity_docids,
@@ -55,6 +56,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
 
         // Clear the other databases.
         word_docids.clear(self.wtxn)?;
+        exact_word_docids.clear(self.wtxn)?;
         word_prefix_docids.clear(self.wtxn)?;
         docid_word_positions.clear(self.wtxn)?;
         word_pair_proximity_docids.clear(self.wtxn)?;
diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs
index 402cc61dd..46a4721c0 100644
--- a/milli/src/update/delete_documents.rs
+++ b/milli/src/update/delete_documents.rs
@@ -2,7 +2,7 @@ use std::collections::btree_map::Entry;
 use std::collections::HashMap;
 
 use fst::IntoStreamer;
-use heed::types::ByteSlice;
+use heed::types::{ByteSlice, Str};
 use heed::{BytesDecode, BytesEncode};
 use roaring::RoaringBitmap;
 use serde::{Deserialize, Serialize};
@@ -16,7 +16,10 @@ use crate::heed_codec::facet::{
 };
 use crate::heed_codec::CboRoaringBitmapCodec;
 use crate::index::{db_name, main_key};
-use crate::{DocumentId, ExternalDocumentsIds, FieldId, Index, Result, SmallString32, BEU32};
+use crate::{
+    DocumentId, ExternalDocumentsIds, FieldId, Index, Result, RoaringBitmapCodec, SmallString32,
+    BEU32,
+};
 
 pub struct DeleteDocuments<'t, 'u, 'i> {
     wtxn: &'t mut heed::RwTxn<'i, 'u>,
@@ -108,6 +111,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             env: _env,
             main: _main,
             word_docids,
+            exact_word_docids,
             word_prefix_docids,
             docid_word_positions,
             word_pair_proximity_docids,
@@ -204,25 +208,21 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
         // We iterate over the words and delete the documents ids
         // from the word docids database.
         for (word, must_remove) in &mut words {
-            // We create an iterator to be able to get the content and delete the word docids.
-            // It's faster to acquire a cursor to get and delete or put, as we avoid traversing
-            // the LMDB B-Tree two times but only once.
-            let mut iter = word_docids.prefix_iter_mut(self.wtxn, &word)?;
-            if let Some((key, mut docids)) = iter.next().transpose()? {
-                if key == word.as_str() {
-                    let previous_len = docids.len();
-                    docids -= &self.documents_ids;
-                    if docids.is_empty() {
-                        // safety: we don't keep references from inside the LMDB database.
-                        unsafe { iter.del_current()? };
-                        *must_remove = true;
-                    } else if docids.len() != previous_len {
-                        let key = key.to_owned();
-                        // safety: we don't keep references from inside the LMDB database.
-                        unsafe { iter.put_current(&key, &docids)? };
-                    }
-                }
-            }
+            remove_from_word_docids(
+                self.wtxn,
+                word_docids,
+                word.as_str(),
+                must_remove,
+                &self.documents_ids,
+            )?;
+
+            remove_from_word_docids(
+                self.wtxn,
+                exact_word_docids,
+                word.as_str(),
+                must_remove,
+                &self.documents_ids,
+            )?;
         }
 
         // We construct an FST set that contains the words to delete from the words FST.
@@ -457,6 +457,35 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
     }
 }
 
+fn remove_from_word_docids(
+    txn: &mut heed::RwTxn,
+    db: &heed::Database<Str, RoaringBitmapCodec>,
+    word: &str,
+    must_remove: &mut bool,
+    to_remove: &RoaringBitmap,
+) -> Result<()> {
+    // A single cursor reads the entry and then deletes or rewrites it, which avoids a second
+    // traversal of the LMDB B-Tree. Only an entry whose key is exactly `word` is modified.
+    // NOTE(review): `must_remove` is set once this db's entry empties; other dbs are not checked.
+    let mut iter = db.prefix_iter_mut(txn, &word)?;
+    if let Some((key, mut docids)) = iter.next().transpose()? {
+        if key == word {
+            let previous_len = docids.len();
+            docids -= to_remove;
+            if docids.is_empty() {
+                // safety: we don't keep references from inside the LMDB database.
+                unsafe { iter.del_current()? };
+                *must_remove = true;
+            } else if docids.len() != previous_len {
+                let key = key.to_owned();
+                // safety: we don't keep references from inside the LMDB database.
+                unsafe { iter.put_current(&key, &docids)? };
+            }
+        }
+    }
+    Ok(())
+}
+
 fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F, DC, V>(
     wtxn: &'a mut heed::RwTxn,
     db: &heed::Database<C, DC>,
diff --git a/milli/src/update/index_documents/extract/extract_word_docids.rs b/milli/src/update/index_documents/extract/extract_word_docids.rs
index 80d68298a..03bfada21 100644
--- a/milli/src/update/index_documents/extract/extract_word_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_docids.rs
@@ -10,17 +10,21 @@ use super::helpers::{
 };
 use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
+use crate::update::index_documents::MergeFn;
 use crate::Result;
 
 /// Extracts the word and the documents ids where this word appear.
 ///
 /// Returns a grenad reader with the list of extracted words and
 /// documents ids from the given chunk of docid word positions.
+///
+/// The first returned reader is the one for normal word_docids, and the second one is for
+/// exact_word_docids.
 #[logging_timer::time]
 pub fn extract_word_docids<R: io::Read + io::Seek>(
     docid_word_positions: grenad::Reader<R>,
     indexer: GrenadParameters,
-) -> Result<grenad::Reader<File>> {
+) -> Result<(grenad::Reader<File>, grenad::Reader<File>)> {
     let max_memory = indexer.max_memory_by_thread();
 
     let mut word_docids_sorter = create_sorter(
@@ -43,5 +47,9 @@ pub fn extract_word_docids<R: io::Read + io::Seek>(
         word_docids_sorter.insert(word_bytes, &value_buffer)?;
     }
 
-    sorter_into_reader(word_docids_sorter, indexer)
+    let empty_sorter = grenad::Sorter::new(merge_roaring_bitmaps as MergeFn);
+    Ok((
+        sorter_into_reader(word_docids_sorter, indexer)?,
+        sorter_into_reader(empty_sorter, indexer)?,
+    ))
 }
diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs
index 100431237..4e7f211ce 100644
--- a/milli/src/update/index_documents/extract/mod.rs
+++ b/milli/src/update/index_documents/extract/mod.rs
@@ -86,13 +86,16 @@ pub(crate) fn data_from_obkv_documents(
         "field-id-wordcount-docids",
     );
 
-    spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
+    spawn_extraction_task::<_, _, Vec<(grenad::Reader<File>, grenad::Reader<File>)>>(
         docid_word_positions_chunks.clone(),
         indexer.clone(),
         lmdb_writer_sx.clone(),
         extract_word_docids,
         merge_roaring_bitmaps,
-        TypedChunk::WordDocids,
+        |(word_docids_reader, exact_word_docids_reader)| TypedChunk::WordDocids {
+            word_docids_reader,
+            exact_word_docids_reader,
+        },
         "word-docids",
     );
 
diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs
index fc28860b2..fb5242910 100644
--- a/milli/src/update/index_documents/helpers/grenad_helpers.rs
+++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs
@@ -277,3 +277,8 @@ pub fn sorter_into_lmdb_database(
     debug!("MTBL sorter writen in {:.02?}!", before.elapsed());
     Ok(())
 }
+
+/// Merge function for key-only merges: ignores the key and values and returns an empty value.
+pub fn merge_nothing<'a>(_key: &[u8], _values: &[Cow<'a, [u8]>]) -> Result<Cow<'a, [u8]>> {
+    Ok(Cow::Owned(Vec::new()))
+}
diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs
index f4940af1d..4642bcf14 100644
--- a/milli/src/update/index_documents/helpers/mod.rs
+++ b/milli/src/update/index_documents/helpers/mod.rs
@@ -8,7 +8,7 @@ use std::convert::{TryFrom, TryInto};
 pub use clonable_mmap::{ClonableMmap, CursorClonableMmap};
 use fst::{IntoStreamer, Streamer};
 pub use grenad_helpers::{
-    as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks,
+    as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, merge_nothing,
     sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, writer_into_reader,
     GrenadParameters, MergeableReader,
 };
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 2d3004444..633b72cc9 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -20,7 +20,7 @@ pub use self::helpers::{
     fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps,
     sorter_into_lmdb_database, write_into_lmdb_database, writer_into_reader, ClonableMmap, MergeFn,
 };
-use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
+use self::helpers::{grenad_obkv_into_chunks, merge_nothing, GrenadParameters};
 pub use self::transform::{Transform, TransformOutput};
 use crate::documents::DocumentBatchReader;
 pub use crate::update::index_documents::helpers::CursorClonableMmap;
@@ -282,6 +282,7 @@ where
         let mut word_pair_proximity_docids = None;
         let mut word_position_docids = None;
         let mut word_docids = None;
+        let mut _exact_word_docids = None;
 
         let mut databases_seen = 0;
         (self.progress)(UpdateIndexingStep::MergeDataIntoFinalDatabase {
@@ -291,10 +292,13 @@ where
 
         for result in lmdb_writer_rx {
             let typed_chunk = match result? {
-                TypedChunk::WordDocids(chunk) => {
-                    let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
+                TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => {
+                    let cloneable_chunk = unsafe { as_cloneable_grenad(&word_docids_reader)? };
                     word_docids = Some(cloneable_chunk);
-                    TypedChunk::WordDocids(chunk)
+                    let cloneable_chunk =
+                        unsafe { as_cloneable_grenad(&exact_word_docids_reader)? };
+                    _exact_word_docids = Some(cloneable_chunk);
+                    TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader }
                 }
                 TypedChunk::WordPairProximityDocids(chunk) => {
                     let cloneable_chunk = unsafe { as_cloneable_grenad(&chunk)? };
@@ -425,6 +429,10 @@ where
         });
 
         if let Some(word_docids) = word_docids {
+            let mut word_docids_builder = grenad::MergerBuilder::new(merge_nothing as MergeFn);
+            word_docids_builder.push(word_docids.into_cursor()?);
+            // TODO: push exact_word_docids
+            let word_docids_iter = word_docids_builder.build().into_stream_merger_iter()?;
             // Run the word prefix docids update operation.
             let mut builder = WordPrefixDocids::new(self.wtxn, self.index);
             builder.chunk_compression_type = self.indexer_config.chunk_compression_type;
@@ -432,7 +440,7 @@ where
             builder.max_nb_chunks = self.indexer_config.max_nb_chunks;
             builder.max_memory = self.indexer_config.max_memory;
             builder.execute(
-                word_docids,
+                word_docids_iter,
                 &new_prefix_fst_words,
                 &common_prefix_fst_words,
                 &del_prefix_fst_words,
diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs
index 77ea31138..be440114f 100644
--- a/milli/src/update/index_documents/typed_chunk.rs
+++ b/milli/src/update/index_documents/typed_chunk.rs
@@ -3,14 +3,16 @@ use std::convert::TryInto;
 use std::fs::File;
 use std::io;
 
+use grenad::MergerBuilder;
 use heed::types::ByteSlice;
 use heed::{BytesDecode, RwTxn};
 use roaring::RoaringBitmap;
 
 use super::helpers::{
-    self, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, valid_lmdb_key,
+    self, merge_nothing, roaring_bitmap_from_u32s_array, serialize_roaring_bitmap, valid_lmdb_key,
     CursorClonableMmap,
 };
+use super::{ClonableMmap, MergeFn};
 use crate::heed_codec::facet::{decode_prefix_string, encode_prefix_string};
 use crate::update::index_documents::helpers::as_cloneable_grenad;
 use crate::{
@@ -25,7 +27,10 @@ pub(crate) enum TypedChunk {
     Documents(grenad::Reader<CursorClonableMmap>),
     FieldIdWordcountDocids(grenad::Reader<File>),
     NewDocumentsIds(RoaringBitmap),
-    WordDocids(grenad::Reader<File>),
+    WordDocids {
+        word_docids_reader: grenad::Reader<File>,
+        exact_word_docids_reader: grenad::Reader<File>,
+    },
     WordPositionDocids(grenad::Reader<File>),
     WordPairProximityDocids(grenad::Reader<File>),
     FieldIdFacetStringDocids(grenad::Reader<File>),
@@ -86,8 +91,8 @@ pub(crate) fn write_typed_chunk_into_index(
         TypedChunk::NewDocumentsIds(documents_ids) => {
             return Ok((documents_ids, is_merged_database))
         }
-        TypedChunk::WordDocids(word_docids_iter) => {
-            let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_iter) }?;
+        TypedChunk::WordDocids { word_docids_reader, exact_word_docids_reader } => {
+            let word_docids_iter = unsafe { as_cloneable_grenad(&word_docids_reader) }?;
             append_entries_into_database(
                 word_docids_iter.clone(),
                 &index.word_docids,
@@ -97,15 +102,18 @@ pub(crate) fn write_typed_chunk_into_index(
                 merge_roaring_bitmaps,
             )?;
 
+            let exact_word_docids_iter = unsafe { as_cloneable_grenad(&exact_word_docids_reader) }?;
+            append_entries_into_database(
+                exact_word_docids_iter.clone(),
+                &index.exact_word_docids,
+                wtxn,
+                index_is_empty,
+                |value, _buffer| Ok(value),
+                merge_roaring_bitmaps,
+            )?;
+
             // create fst from word docids
-            let mut builder = fst::SetBuilder::memory();
-            let mut cursor = word_docids_iter.into_cursor()?;
-            while let Some((word, _value)) = cursor.move_on_next()? {
-                // This is a lexicographically ordered word position
-                // we use the key to construct the words fst.
-                builder.insert(word)?;
-            }
-            let fst = builder.into_set().map_data(std::borrow::Cow::Owned)?;
+            let fst = merge_word_docids_reader_into_fst(word_docids_iter, exact_word_docids_iter)?;
             let db_fst = index.words_fst(wtxn)?;
 
             // merge new fst with database fst
@@ -214,6 +222,23 @@ pub(crate) fn write_typed_chunk_into_index(
     Ok((RoaringBitmap::new(), is_merged_database))
 }
 
+fn merge_word_docids_reader_into_fst(
+    word_docids_iter: grenad::Reader<io::Cursor<ClonableMmap>>,
+    exact_word_docids_iter: grenad::Reader<io::Cursor<ClonableMmap>>,
+) -> Result<fst::Set<Vec<u8>>> {
+    let mut merger_builder = MergerBuilder::new(merge_nothing as MergeFn);
+    merger_builder.push(word_docids_iter.into_cursor()?);
+    merger_builder.push(exact_word_docids_iter.into_cursor()?);
+    let mut iter = merger_builder.build().into_stream_merger_iter()?;
+    let mut builder = fst::SetBuilder::memory();
+    // Insert the keys coming from both readers into the set; the values are ignored.
+    while let Some((k, _)) = iter.next()? {
+        builder.insert(k)?;
+    }
+
+    Ok(builder.into_set())
+}
+
 fn merge_roaring_bitmaps(new_value: &[u8], db_value: &[u8], buffer: &mut Vec<u8>) -> Result<()> {
     let new_value = RoaringBitmap::deserialize_from(new_value)?;
     let db_value = RoaringBitmap::deserialize_from(db_value)?;
diff --git a/milli/src/update/word_prefix_docids.rs b/milli/src/update/word_prefix_docids.rs
index 076816f09..4114f8baf 100644
--- a/milli/src/update/word_prefix_docids.rs
+++ b/milli/src/update/word_prefix_docids.rs
@@ -35,7 +35,7 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
     #[logging_timer::time("WordPrefixDocids::{}")]
     pub fn execute(
         self,
-        new_word_docids: grenad::Reader<CursorClonableMmap>,
+        mut new_word_docids_iter: grenad::MergerIter<CursorClonableMmap, MergeFn>,
         new_prefix_fst_words: &[String],
         common_prefix_fst_words: &[&[String]],
         del_prefix_fst_words: &HashSet<Vec<u8>>,
@@ -51,10 +51,9 @@ impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
         );
 
         if !common_prefix_fst_words.is_empty() {
-            let mut new_word_docids_iter = new_word_docids.into_cursor()?;
             let mut current_prefixes: Option<&&[String]> = None;
             let mut prefixes_cache = HashMap::new();
-            while let Some((word, data)) = new_word_docids_iter.move_on_next()? {
+            while let Some((word, data)) = new_word_docids_iter.next()? {
                 current_prefixes = match current_prefixes.take() {
                     Some(prefixes) if word.starts_with(&prefixes[0].as_bytes()) => Some(prefixes),
                     _otherwise => {