From 406b8bd2489931afb2373ed60011e73d9530922e Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 4 Apr 2023 17:04:46 +0200 Subject: [PATCH 01/10] Add new db caches --- milli/src/search/new/db_cache.rs | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index af94108e2..effd123be 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -24,6 +24,8 @@ pub struct DatabaseCache<'ctx> { pub word_docids: FxHashMap, Option<&'ctx [u8]>>, pub exact_word_docids: FxHashMap, Option<&'ctx [u8]>>, pub word_prefix_docids: FxHashMap, Option<&'ctx [u8]>>, + pub word_position_docids: FxHashMap<(Interned, u16), Option<&'ctx [u8]>>, + pub word_fid_docids: FxHashMap<(Interned, u16), Option<&'ctx [u8]>>, } impl<'ctx> DatabaseCache<'ctx> { fn get_value<'v, K1, KC>( @@ -128,4 +130,32 @@ impl<'ctx> SearchContext<'ctx> { self.index.prefix_word_pair_proximity_docids.remap_data_type::(), ) } + + pub fn get_db_word_position_docids( + &mut self, + word: Interned, + position: u16, + ) -> Result> { + DatabaseCache::get_value( + self.txn, + (word, position), + &(self.word_interner.get(word).as_str(), position), + &mut self.db_cache.word_position_docids, + self.index.word_position_docids.remap_data_type::(), + ) + } + + pub fn get_db_word_fid_docids( + &mut self, + word: Interned, + fid: u16, + ) -> Result> { + DatabaseCache::get_value( + self.txn, + (word, fid), + &(self.word_interner.get(word).as_str(), fid), + &mut self.db_cache.word_fid_docids, + self.index.word_fid_docids.remap_data_type::(), + ) + } } From ec2f8e804003f9ece9d49f3f616a60152b5c0ed2 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 4 Apr 2023 17:06:07 +0200 Subject: [PATCH 02/10] Rename `is_multiple_words` to `is_ngram` and `zero_typo` to `exact` --- milli/src/search/new/logger/detailed.rs | 4 ++-- milli/src/search/new/query_term.rs | 32 +++++++++++++++---------- 2 files changed, 21 
insertions(+), 15 deletions(-) diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index 3a02950a8..3c4779ad9 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -441,7 +441,7 @@ results.{cur_ranking_rule}{cur_activated_id} {{ }) => { let QueryTerm { original, - is_multiple_words: _, + is_ngram: _, is_prefix: _, max_nbr_typos, zero_typo, @@ -458,7 +458,7 @@ results.{cur_ranking_rule}{cur_activated_id} {{ ) .unwrap(); - let ZeroTypoTerm { phrase, zero_typo, prefix_of, synonyms, use_prefix_db } = + let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db } = zero_typo; for w in zero_typo.iter().copied() { diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index d19ab6135..90b03d194 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -204,8 +204,13 @@ impl QueryTermSubset { } if !self.zero_typo_subset.is_empty() { - let ZeroTypoTerm { phrase: _, zero_typo, prefix_of, synonyms: _, use_prefix_db: _ } = - &original.zero_typo; + let ZeroTypoTerm { + phrase: _, + exact: zero_typo, + prefix_of, + synonyms: _, + use_prefix_db: _, + } = &original.zero_typo; result.extend(zero_typo.iter().copied()); result.extend(prefix_of.iter().copied()); }; @@ -258,7 +263,7 @@ impl QueryTermSubset { )?; } - let ZeroTypoTerm { phrase, zero_typo: _, prefix_of: _, synonyms, use_prefix_db: _ } = + let ZeroTypoTerm { phrase, exact: _, prefix_of: _, synonyms, use_prefix_db: _ } = &original.zero_typo; result.extend(phrase.iter().copied()); result.extend(synonyms.iter().copied()); @@ -302,7 +307,7 @@ impl QueryTerm { #[derive(Clone, PartialEq, Eq, Hash)] pub struct QueryTerm { pub original: Interned, - pub is_multiple_words: bool, + pub is_ngram: bool, pub max_nbr_typos: u8, pub is_prefix: bool, pub zero_typo: ZeroTypoTerm, @@ -318,7 +323,7 @@ pub struct ZeroTypoTerm { /// The original phrase, if any pub phrase: 
Option>, /// A single word equivalent to the original term, with zero typos - pub zero_typo: Option>, + pub exact: Option>, /// All the words that contain the original word as prefix pub prefix_of: BTreeSet>, /// All the synonyms of the original word or phrase @@ -341,7 +346,7 @@ pub struct TwoTypoTerm { impl ZeroTypoTerm { fn is_empty(&self) -> bool { - let ZeroTypoTerm { phrase, zero_typo, prefix_of, synonyms, use_prefix_db } = self; + let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db } = self; phrase.is_none() && zero_typo.is_none() && prefix_of.is_empty() @@ -370,12 +375,12 @@ impl QueryTerm { ) -> Self { Self { original: word_interner.insert(phrase.description(word_interner)), - is_multiple_words: false, + is_ngram: false, max_nbr_typos: 0, is_prefix: false, zero_typo: ZeroTypoTerm { phrase: Some(phrase_interner.insert(phrase)), - zero_typo: None, + exact: None, prefix_of: BTreeSet::default(), synonyms: BTreeSet::default(), use_prefix_db: None, @@ -387,7 +392,7 @@ impl QueryTerm { pub fn empty(word_interner: &mut DedupInterner, original: &str) -> Self { Self { original: word_interner.insert(original.to_owned()), - is_multiple_words: false, + is_ngram: false, is_prefix: false, max_nbr_typos: 0, zero_typo: <_>::default(), @@ -606,11 +611,12 @@ fn partially_initialized_term_from_word( Some(ctx.phrase_interner.insert(Phrase { words })) }) .collect(); - let zero_typo = ZeroTypoTerm { phrase: None, zero_typo, prefix_of, synonyms, use_prefix_db }; + let zero_typo = + ZeroTypoTerm { phrase: None, exact: zero_typo, prefix_of, synonyms, use_prefix_db }; Ok(QueryTerm { original: word_interned, - is_multiple_words: false, + is_ngram: false, max_nbr_typos: max_typo, is_prefix, zero_typo, @@ -765,7 +771,7 @@ fn split_best_frequency( impl QueryTerm { /// Return the original word from the given query term pub fn original_single_word(&self) -> Option> { - if self.is_multiple_words { + if self.is_ngram { None } else { Some(self.original) @@ -1039,7 
+1045,7 @@ pub fn make_ngram( let term = QueryTerm { original, - is_multiple_words: true, + is_ngram: true, is_prefix, max_nbr_typos, zero_typo: term.zero_typo, From 4d5bc9df4c4f3145ebc72ce73d3e51325f6fba1c Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 4 Apr 2023 17:07:26 +0200 Subject: [PATCH 03/10] Increase position by 8 on hard separator when building query terms --- milli/src/search/new/query_term.rs | 2 +- .../index_documents/extract/extract_docid_word_positions.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index 90b03d194..005c0a2e3 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -907,7 +907,7 @@ pub fn located_query_terms_from_string( TokenKind::Separator(separator_kind) => { match separator_kind { SeparatorKind::Hard => { - position += 1; + position += 8; } SeparatorKind::Soft => { position += 0; diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 2d51fcc1a..c362f8f1b 100644 --- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -153,7 +153,7 @@ fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a st /// take an iterator on tokens and compute their relative position depending on separator kinds /// if it's an `Hard` separator we add an additional relative proximity of 8 between words, -/// else we keep the standart proximity of 1 between words. +/// else we keep the standard proximity of 1 between words. 
fn process_tokens<'a>( tokens: impl Iterator>, ) -> impl Iterator)> { From 3951fe22ab72e9d9e44498c9d95ad29a0449a8dc Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 4 Apr 2023 17:09:32 +0200 Subject: [PATCH 04/10] Add ExactTerm and helper method --- milli/src/search/new/query_term.rs | 37 ++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index 005c0a2e3..4e3922980 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -4,6 +4,7 @@ use std::ops::{ControlFlow, RangeInclusive}; use charabia::normalizer::NormalizedTokenIter; use charabia::{SeparatorKind, TokenKind}; +use either::Either; use fst::automaton::Str; use fst::{Automaton, IntoStreamer, Streamer}; use heed::types::DecodeIgnore; @@ -138,7 +139,43 @@ pub struct LocatedQueryTermSubset { pub term_ids: RangeInclusive, } +#[derive(Clone, Copy)] +pub enum ExactTerm { + Phrase(Interned), + Word(Interned), +} + +impl ExactTerm { + pub fn interned_words<'ctx>( + &self, + ctx: &'ctx SearchContext<'ctx>, + ) -> impl Iterator>> + 'ctx { + match *self { + ExactTerm::Phrase(phrase) => { + let phrase = ctx.phrase_interner.get(phrase); + Either::Left(phrase.words.iter().copied()) + } + ExactTerm::Word(word) => Either::Right(std::iter::once(Some(word))), + } + } +} + impl QueryTermSubset { + pub fn exact_term(&self, ctx: &SearchContext) -> Option { + let full_query_term = ctx.term_interner.get(self.original); + if full_query_term.is_ngram { + return None; + } + // TODO: included in subset + if let Some(phrase) = full_query_term.zero_typo.phrase { + self.zero_typo_subset.contains_phrase(phrase).then_some(ExactTerm::Phrase(phrase)) + } else if let Some(word) = full_query_term.zero_typo.exact { + self.zero_typo_subset.contains_word(word).then_some(ExactTerm::Word(word)) + } else { + None + } + } + pub fn empty(for_term: Interned) -> Self { Self { original: for_term, From 
4b4ffb8ec993729fb53467a2899b198a14d320f9 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 4 Apr 2023 17:12:07 +0200 Subject: [PATCH 05/10] Add exactness ranking rules --- milli/src/search/new/exact_attribute.rs | 175 ++++++++++++++++++ .../search/new/graph_based_ranking_rule.rs | 10 +- milli/src/search/new/mod.rs | 12 +- .../new/ranking_rule_graph/exactness/mod.rs | 107 +++++++++++ .../src/search/new/ranking_rule_graph/mod.rs | 3 + 5 files changed, 301 insertions(+), 6 deletions(-) create mode 100644 milli/src/search/new/exact_attribute.rs create mode 100644 milli/src/search/new/ranking_rule_graph/exactness/mod.rs diff --git a/milli/src/search/new/exact_attribute.rs b/milli/src/search/new/exact_attribute.rs new file mode 100644 index 000000000..bb6299e28 --- /dev/null +++ b/milli/src/search/new/exact_attribute.rs @@ -0,0 +1,175 @@ +use heed::BytesDecode; +use roaring::MultiOps; + +use super::query_graph::QueryGraph; +use super::ranking_rules::{RankingRule, RankingRuleOutput}; +use crate::search::new::query_graph::QueryNodeData; +use crate::search::new::query_term::ExactTerm; +use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger}; + +/// FIXME: +/// +/// - A lot of work done in next_bucket that start_iteration could do. +/// - Consider calling the graph based rule directly from this one. 
+/// - currently we did exact term, don't forget about prefix +/// - some tests +pub struct ExactAttribute { + query_graph: Option, +} + +impl ExactAttribute { + pub fn new() -> Self { + Self { query_graph: None } + } +} + +impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { + fn id(&self) -> String { + "exact_attribute".to_owned() + } + + fn start_iteration( + &mut self, + _ctx: &mut SearchContext<'ctx>, + _logger: &mut dyn SearchLogger, + _universe: &roaring::RoaringBitmap, + query: &QueryGraph, + ) -> Result<()> { + self.query_graph = Some(query.clone()); + Ok(()) + } + + fn next_bucket( + &mut self, + ctx: &mut SearchContext<'ctx>, + _logger: &mut dyn SearchLogger, + universe: &roaring::RoaringBitmap, + ) -> Result>> { + // iterate on the nodes of the graph, retain LocatedQueryTermSubset + let query_graph = self.query_graph.as_ref().unwrap(); + let mut exact_term_position_ids: Vec<(ExactTerm, u16, u8)> = + Vec::with_capacity(query_graph.nodes.len() as usize); + for (_, node) in query_graph.nodes.iter() { + match &node.data { + QueryNodeData::Term(term) => { + let exact_term = if let Some(exact_term) = term.term_subset.exact_term(ctx) { + exact_term + } else { + // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules + return Ok(Some(RankingRuleOutput { + query: query_graph.clone(), + candidates: universe.clone(), + })); + }; + exact_term_position_ids.push(( + exact_term, + *term.positions.start(), + *term.term_ids.start(), + )) + } + QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue, + } + } + + exact_term_position_ids.sort_by_key(|(_, _, id)| *id); + // bail if there is a "hole" (missing word) in remaining query graph + let mut previous_id = 0; + for (_, _, id) in exact_term_position_ids.iter().copied() { + if id < previous_id || id - previous_id > 1 { + // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules + return 
Ok(Some(RankingRuleOutput { + query: query_graph.clone(), + candidates: universe.clone(), + })); + } else { + previous_id = id; + } + } + + // sample query: "sunflower are pretty" + // sunflower at pos 0 in attr A + // are at pos 1 in attr B + // pretty at pos 2 in attr C + // We want to eliminate such document + + // first check that for each term, there exists some attribute that has this term at the correct position + //"word-position-docids"; + let mut candidates = universe.clone(); + let words_positions: Vec<(Vec<_>, _)> = exact_term_position_ids + .iter() + .copied() + .map(|(term, position, _)| (term.interned_words(ctx).collect(), position)) + .collect(); + for (words, position) in &words_positions { + if candidates.is_empty() { + // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules + return Ok(Some(RankingRuleOutput { + query: query_graph.clone(), + candidates: universe.clone(), + })); + } + + 'words: for (offset, word) in words.iter().enumerate() { + let offset = offset as u16; + let word = if let Some(word) = word { + word + } else { + continue 'words; + }; + let word_position_docids = CboRoaringBitmapCodec::bytes_decode( + ctx.get_db_word_position_docids(*word, position + offset)?.unwrap_or_default(), + ) + .unwrap_or_default(); + candidates &= word_position_docids; + } + } + + let candidates = candidates; + + if candidates.is_empty() { + // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules + return Ok(Some(RankingRuleOutput { + query: query_graph.clone(), + candidates: universe.clone(), + })); + } + + let searchable_fields_ids = ctx.index.searchable_fields_ids(ctx.txn)?.unwrap_or_default(); + + let mut candidates_per_attributes = Vec::with_capacity(searchable_fields_ids.len()); + + // then check that there exists at least one attribute that has all of the terms + for fid in searchable_fields_ids { + let mut intersection = MultiOps::intersection( + 
words_positions + .iter() + .flat_map(|(words, ..)| words.iter()) + // ignore stop words words in phrases + .flatten() + .map(|word| -> Result<_> { + Ok(ctx + .get_db_word_fid_docids(*word, fid)? + .map(CboRoaringBitmapCodec::bytes_decode) + .unwrap_or_default() + .unwrap_or_default()) + }), + )?; + intersection &= &candidates; + if !intersection.is_empty() { + candidates_per_attributes.push(intersection); + } + } + // note we could have "false positives" where there both exist different attributes that collectively + // have the terms in the correct order and a single attribute that have all the terms, but in the incorrect order. + + let candidates = MultiOps::union(candidates_per_attributes.into_iter()); + Ok(Some(RankingRuleOutput { query: query_graph.clone(), candidates })) + } + + fn end_iteration( + &mut self, + _ctx: &mut SearchContext<'ctx>, + _logger: &mut dyn SearchLogger, + ) { + } +} diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index b8c58c726..28b4ed1f4 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -44,8 +44,8 @@ use super::interner::{Interned, MappedInterner}; use super::logger::SearchLogger; use super::query_graph::QueryNode; use super::ranking_rule_graph::{ - ConditionDocIdsCache, DeadEndsCache, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, - TypoGraph, + ConditionDocIdsCache, DeadEndsCache, ExactnessGraph, ProximityGraph, RankingRuleGraph, + RankingRuleGraphTrait, TypoGraph, }; use super::small_bitmap::SmallBitmap; use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; @@ -65,6 +65,12 @@ impl GraphBasedRankingRule { Self::new_with_id("typo".to_owned(), terms_matching_strategy) } } +pub type Exactness = GraphBasedRankingRule; +impl GraphBasedRankingRule { + pub fn new() -> Self { + Self::new_with_id("exactness".to_owned(), None) + } +} /// A generic graph-based ranking rule pub 
struct GraphBasedRankingRule { diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 4d561d25b..779e589b3 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -9,8 +9,9 @@ mod query_term; mod ranking_rule_graph; mod ranking_rules; mod resolve_query_graph; -// TODO: documentation + comments mod small_bitmap; + +mod exact_attribute; // TODO: documentation + comments // implementation is currently an adaptation of the previous implementation to fit with the new model mod sort; @@ -33,6 +34,8 @@ use resolve_query_graph::PhraseDocIdsCache; use roaring::RoaringBitmap; use words::Words; +use self::exact_attribute::ExactAttribute; +use self::graph_based_ranking_rule::Exactness; use self::interner::Interner; use self::ranking_rules::{BoxRankingRule, RankingRule}; use self::resolve_query_graph::compute_query_graph_docids; @@ -150,7 +153,7 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( let mut proximity = false; let mut sort = false; let attribute = false; - let exactness = false; + let mut exactness = false; let mut asc = HashSet::new(); let mut desc = HashSet::new(); @@ -211,8 +214,9 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( if exactness { continue; } - // todo!(); - // exactness = false; + ranking_rules.push(Box::new(ExactAttribute::new())); + ranking_rules.push(Box::new(Exactness::new())); + exactness = true; } crate::Criterion::Asc(field_name) => { if asc.contains(&field_name) { diff --git a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs new file mode 100644 index 000000000..a1e19a015 --- /dev/null +++ b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs @@ -0,0 +1,107 @@ +use roaring::RoaringBitmap; + +use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait}; +use crate::search::new::interner::{DedupInterner, Interned, MappedInterner}; +use crate::search::new::query_graph::{QueryGraph, QueryNode}; 
+use crate::search::new::query_term::{ExactTerm, LocatedQueryTermSubset}; +use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger}; + +/// - Exactness as first ranking rule: TermsMatchingStrategy? prefer a document that matches 1 word exactly and no other +/// word than a doc that matches 9 words non exactly but none exactly +/// - `TermsMatchingStrategy` as a word + exactness optimization: we could consider +/// +/// "naive vision" +/// condition from one node to another: +/// - word exactly present: cost 0 +/// - word typo/ngram/prefix/missing: cost 1, not remove from query graph, edge btwn the two nodes, return the universe without condition when resolving, destination query term is inside +/// +/// Three strategies: +/// 1. ExactAttribute: word position / word_fid_docid +/// 2. AttributeStart: +/// 3. AttributeContainsExact => implementable via `RankingRuleGraphTrait` + +#[derive(Clone, PartialEq, Eq, Hash)] +pub enum ExactnessCondition { + ExactInAttribute(LocatedQueryTermSubset), + Skip(LocatedQueryTermSubset), +} + +pub enum ExactnessGraph {} + +fn compute_docids( + ctx: &mut SearchContext, + dest_node: &LocatedQueryTermSubset, + universe: &RoaringBitmap, +) -> Result { + let exact_term = if let Some(exact_term) = dest_node.term_subset.exact_term(ctx) { + exact_term + } else { + return Ok(Default::default()); + }; + let mut candidates = match exact_term { + ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(phrase)?.clone(), + ExactTerm::Word(word) => { + if let Some(word_candidates) = ctx.get_db_word_docids(word)? { + CboRoaringBitmapCodec::deserialize_from(word_candidates)? + } else { + return Ok(Default::default()); + } + } + }; + // TODO: synonyms? 
+ candidates &= universe; + Ok(candidates) +} + +impl RankingRuleGraphTrait for ExactnessGraph { + type Condition = ExactnessCondition; + + fn resolve_condition( + ctx: &mut SearchContext, + condition: &Self::Condition, + universe: &RoaringBitmap, + ) -> Result { + let (docids, dest_node) = match condition { + ExactnessCondition::ExactInAttribute(dest_node) => { + (compute_docids(ctx, dest_node, universe)?, dest_node) + } + ExactnessCondition::Skip(dest_node) => (universe.clone(), dest_node), + }; + Ok(ComputedCondition { + docids, + universe_len: universe.len(), + start_term_subset: None, + end_term_subset: dest_node.clone(), + }) + } + + fn build_edges( + _ctx: &mut SearchContext, + conditions_interner: &mut DedupInterner, + _source_node: Option<&LocatedQueryTermSubset>, + dest_node: &LocatedQueryTermSubset, + ) -> Result)>> { + let exact_condition = ExactnessCondition::ExactInAttribute(dest_node.clone()); + let exact_condition = conditions_interner.insert(exact_condition); + + let skip_condition = ExactnessCondition::Skip(dest_node.clone()); + let skip_condition = conditions_interner.insert(skip_condition); + Ok(vec![(0, exact_condition), (1, skip_condition)]) + } + + fn log_state( + graph: &RankingRuleGraph, + paths: &[Vec>], + dead_ends_cache: &DeadEndsCache, + universe: &RoaringBitmap, + costs: &MappedInterner>, + cost: u64, + logger: &mut dyn SearchLogger, + ) { + todo!() + } + + fn label_for_condition(ctx: &mut SearchContext, condition: &Self::Condition) -> Result { + todo!() + } +} diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index 7c40008c8..936c3e942 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -10,6 +10,8 @@ mod cheapest_paths; mod condition_docids_cache; mod dead_ends_cache; +/// Implementation of the `exactness` ranking rule +mod exactness; /// Implementation of the `proximity` ranking rule mod proximity; /// 
Implementation of the `typo` ranking rule @@ -20,6 +22,7 @@ use std::hash::Hash; pub use cheapest_paths::PathVisitor; pub use condition_docids_cache::ConditionDocIdsCache; pub use dead_ends_cache::DeadEndsCache; +pub use exactness::{ExactnessCondition, ExactnessGraph}; pub use proximity::{ProximityCondition, ProximityGraph}; use roaring::RoaringBitmap; pub use typo::{TypoCondition, TypoGraph}; From 337e75b0e4fa63566ed5ac516f504a723117ecb2 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 5 Apr 2023 14:42:51 +0200 Subject: [PATCH 06/10] Exact attribute with state --- milli/src/search/new/exact_attribute.rs | 166 +++++++++++++++++------- 1 file changed, 122 insertions(+), 44 deletions(-) diff --git a/milli/src/search/new/exact_attribute.rs b/milli/src/search/new/exact_attribute.rs index bb6299e28..fa837272b 100644 --- a/milli/src/search/new/exact_attribute.rs +++ b/milli/src/search/new/exact_attribute.rs @@ -1,5 +1,5 @@ use heed::BytesDecode; -use roaring::MultiOps; +use roaring::{MultiOps, RoaringBitmap}; use super::query_graph::QueryGraph; use super::ranking_rules::{RankingRule, RankingRuleOutput}; @@ -7,19 +7,18 @@ use crate::search::new::query_graph::QueryNodeData; use crate::search::new::query_term::ExactTerm; use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger}; -/// FIXME: +/// A ranking rule that produces 3 disjoint buckets: /// -/// - A lot of work done in next_bucket that start_iteration could do. -/// - Consider calling the graph based rule directly from this one. -/// - currently we did exact term, don't forget about prefix -/// - some tests +/// 1. Documents from the universe whose value is exactly the query. +/// 2. Documents from the universe not in (1) whose value starts with the query. +/// 3. Documents from the universe not in (1) or (2). 
pub struct ExactAttribute { - query_graph: Option, + state: State, } impl ExactAttribute { pub fn new() -> Self { - Self { query_graph: None } + Self { state: Default::default() } } } @@ -30,23 +29,69 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { fn start_iteration( &mut self, - _ctx: &mut SearchContext<'ctx>, + ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger, - _universe: &roaring::RoaringBitmap, + universe: &roaring::RoaringBitmap, query: &QueryGraph, ) -> Result<()> { - self.query_graph = Some(query.clone()); + self.state = State::start_iteration(ctx, universe, query)?; + Ok(()) } fn next_bucket( &mut self, - ctx: &mut SearchContext<'ctx>, + _ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger, universe: &roaring::RoaringBitmap, ) -> Result>> { - // iterate on the nodes of the graph, retain LocatedQueryTermSubset - let query_graph = self.query_graph.as_ref().unwrap(); + let state = std::mem::take(&mut self.state); + let (state, output) = State::next(state, universe); + self.state = state; + + Ok(output) + } + + fn end_iteration( + &mut self, + _ctx: &mut SearchContext<'ctx>, + _logger: &mut dyn SearchLogger, + ) { + self.state = Default::default(); + } +} + +/// Inner state of the ranking rule. +#[derive(Default)] +enum State { + /// State between two iterations + #[default] + Uninitialized, + /// The next call to `next` will output the documents in the universe that have an attribute that is the exact query + ExactAttribute(QueryGraph, Vec), + /// The next call to `next` will output the documents in the universe that have an attribute that starts with the exact query, + /// but isn't the exact query. + AttributeStarts(QueryGraph, Vec), + /// The next calls to `next` will output the input universe. + Empty(QueryGraph), +} + +/// The candidates sorted by attributes +/// +/// Each of the bitmap in a single `FieldCandidates` struct applies to the same field. 
+struct FieldCandidates { + /// The candidates that start with all the words of the query in the field + start_with_exact: RoaringBitmap, + /// The candidates that have the same number of words as the query in the field + exact_word_count: RoaringBitmap, +} + +impl State { + fn start_iteration( + ctx: &mut SearchContext<'_>, + universe: &RoaringBitmap, + query_graph: &QueryGraph, + ) -> Result { let mut exact_term_position_ids: Vec<(ExactTerm, u16, u8)> = Vec::with_capacity(query_graph.nodes.len() as usize); for (_, node) in query_graph.nodes.iter() { @@ -55,11 +100,7 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { let exact_term = if let Some(exact_term) = term.term_subset.exact_term(ctx) { exact_term } else { - // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules - return Ok(Some(RankingRuleOutput { - query: query_graph.clone(), - candidates: universe.clone(), - })); + continue; }; exact_term_position_ids.push(( exact_term, @@ -73,14 +114,17 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { exact_term_position_ids.sort_by_key(|(_, _, id)| *id); // bail if there is a "hole" (missing word) in remaining query graph + if let Some((_, _, first_id)) = exact_term_position_ids.first() { + if *first_id != 0 { + return Ok(State::Empty(query_graph.clone())); + } + } else { + return Ok(State::Empty(query_graph.clone())); + } let mut previous_id = 0; for (_, _, id) in exact_term_position_ids.iter().copied() { if id < previous_id || id - previous_id > 1 { - // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules - return Ok(Some(RankingRuleOutput { - query: query_graph.clone(), - candidates: universe.clone(), - })); + return Ok(State::Empty(query_graph.clone())); } else { previous_id = id; } @@ -102,11 +146,7 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { .collect(); for (words, position) in &words_positions { if 
candidates.is_empty() { - // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules - return Ok(Some(RankingRuleOutput { - query: query_graph.clone(), - candidates: universe.clone(), - })); + return Ok(State::Empty(query_graph.clone())); } 'words: for (offset, word) in words.iter().enumerate() { @@ -116,8 +156,11 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { } else { continue 'words; }; + // Note: Since the position is stored bucketed in word_position_docids, for queries with a lot of + // longer phrases we'll be losing on precision here. + let bucketed_position = crate::bucketed_position(position + offset); let word_position_docids = CboRoaringBitmapCodec::bytes_decode( - ctx.get_db_word_position_docids(*word, position + offset)?.unwrap_or_default(), + ctx.get_db_word_position_docids(*word, bucketed_position)?.unwrap_or_default(), ) .unwrap_or_default(); candidates &= word_position_docids; @@ -127,16 +170,12 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { let candidates = candidates; if candidates.is_empty() { - // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules - return Ok(Some(RankingRuleOutput { - query: query_graph.clone(), - candidates: universe.clone(), - })); + return Ok(State::Empty(query_graph.clone())); } let searchable_fields_ids = ctx.index.searchable_fields_ids(ctx.txn)?.unwrap_or_default(); - let mut candidates_per_attributes = Vec::with_capacity(searchable_fields_ids.len()); + let mut candidates_per_attribute = Vec::with_capacity(searchable_fields_ids.len()); // then check that there exists at least one attribute that has all of the terms for fid in searchable_fields_ids { @@ -156,20 +195,59 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { )?; intersection &= &candidates; if !intersection.is_empty() { - candidates_per_attributes.push(intersection); + let candidates_with_exact_word_count = ctx + .index + 
.field_id_word_count_docids + .get(ctx.txn, &(fid, exact_term_position_ids.len() as u8))? + .unwrap_or_default(); + candidates_per_attribute.push(FieldCandidates { + start_with_exact: intersection, + exact_word_count: candidates_with_exact_word_count, + }); } } // note we could have "false positives" where there both exist different attributes that collectively // have the terms in the correct order and a single attribute that have all the terms, but in the incorrect order. - let candidates = MultiOps::union(candidates_per_attributes.into_iter()); - Ok(Some(RankingRuleOutput { query: query_graph.clone(), candidates })) + Ok(State::ExactAttribute(query_graph.clone(), candidates_per_attribute)) } - fn end_iteration( - &mut self, - _ctx: &mut SearchContext<'ctx>, - _logger: &mut dyn SearchLogger, - ) { + fn next( + state: State, + universe: &RoaringBitmap, + ) -> (State, Option>) { + let (state, output) = match state { + State::Uninitialized => (state, None), + State::ExactAttribute(query_graph, candidates_per_attribute) => { + let mut candidates = MultiOps::union(candidates_per_attribute.iter().map( + |FieldCandidates { start_with_exact, exact_word_count }| { + start_with_exact & exact_word_count + }, + )); + candidates &= universe; + ( + State::AttributeStarts(query_graph.clone(), candidates_per_attribute), + Some(RankingRuleOutput { query: query_graph, candidates }), + ) + } + State::AttributeStarts(query_graph, candidates_per_attribute) => { + let mut candidates = MultiOps::union(candidates_per_attribute.into_iter().map( + |FieldCandidates { mut start_with_exact, exact_word_count }| { + start_with_exact -= exact_word_count; + start_with_exact + }, + )); + candidates &= universe; + ( + State::Empty(query_graph.clone()), + Some(RankingRuleOutput { query: query_graph, candidates }), + ) + } + State::Empty(query_graph) => ( + State::Empty(query_graph.clone()), + Some(RankingRuleOutput { query: query_graph, candidates: universe.clone() }), + ), + }; + (state, output) } 
} From f7ecea142ec3c3d1403ad00969e37d211c861125 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 5 Apr 2023 14:43:16 +0200 Subject: [PATCH 07/10] Fix panics and issues in exactness graph ranking rule --- .../new/ranking_rule_graph/exactness/mod.rs | 48 ++++++++----------- .../extract/extract_word_position_docids.rs | 5 +- 2 files changed, 22 insertions(+), 31 deletions(-) diff --git a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs index a1e19a015..3d558e87b 100644 --- a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs @@ -1,24 +1,11 @@ +use heed::BytesDecode; use roaring::RoaringBitmap; use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait}; use crate::search::new::interner::{DedupInterner, Interned, MappedInterner}; use crate::search::new::query_graph::{QueryGraph, QueryNode}; use crate::search::new::query_term::{ExactTerm, LocatedQueryTermSubset}; -use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger}; - -/// - Exactness as first ranking rule: TermsMatchingStrategy? prefer a document that matches 1 word exactly and no other -/// word than a doc that matches 9 words non exactly but none exactly -/// - `TermsMatchingStrategy` as a word + exactness optimization: we could consider -/// -/// "naive vision" -/// condition from one node to another: -/// - word exactly present: cost 0 -/// - word typo/ngram/prefix/missing: cost 1, not remove from query graph, edge btwn the two nodes, return the universe without condition when resolving, destination query term is inside -/// -/// Three strategies: -/// 1. ExactAttribute: word position / word_fid_docid -/// 2. AttributeStart: -/// 3. 
AttributeContainsExact => implementable via `RankingRuleGraphTrait` +use crate::{Result, RoaringBitmapCodec, SearchContext, SearchLogger}; #[derive(Clone, PartialEq, Eq, Hash)] pub enum ExactnessCondition { @@ -42,7 +29,7 @@ fn compute_docids( ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(phrase)?.clone(), ExactTerm::Word(word) => { if let Some(word_candidates) = ctx.get_db_word_docids(word)? { - CboRoaringBitmapCodec::deserialize_from(word_candidates)? + RoaringBitmapCodec::bytes_decode(word_candidates).ok_or(heed::Error::Decoding)? } else { return Ok(Default::default()); } @@ -86,22 +73,29 @@ impl RankingRuleGraphTrait for ExactnessGraph { let skip_condition = ExactnessCondition::Skip(dest_node.clone()); let skip_condition = conditions_interner.insert(skip_condition); - Ok(vec![(0, exact_condition), (1, skip_condition)]) + + Ok(vec![(0, exact_condition), (dest_node.term_ids.len() as u32, skip_condition)]) } fn log_state( - graph: &RankingRuleGraph, - paths: &[Vec>], - dead_ends_cache: &DeadEndsCache, - universe: &RoaringBitmap, - costs: &MappedInterner>, - cost: u64, - logger: &mut dyn SearchLogger, + _graph: &RankingRuleGraph, + _paths: &[Vec>], + _dead_ends_cache: &DeadEndsCache, + _universe: &RoaringBitmap, + _costs: &MappedInterner>, + _cost: u64, + _logger: &mut dyn SearchLogger, ) { - todo!() } - fn label_for_condition(ctx: &mut SearchContext, condition: &Self::Condition) -> Result { - todo!() + fn label_for_condition( + _ctx: &mut SearchContext, + condition: &Self::Condition, + ) -> Result { + Ok(match condition { + ExactnessCondition::ExactInAttribute(_) => "exact", + ExactnessCondition::Skip(_) => "skip", + } + .to_owned()) } } diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index cd3ec691b..eef5089bc 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ 
b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -7,10 +7,7 @@ use super::helpers::{ }; use crate::error::SerializationError; use crate::index::db_name::DOCID_WORD_POSITIONS; -use crate::{ - absolute_from_relative_position, bucketed_position, relative_from_absolute_position, - DocumentId, Result, -}; +use crate::{bucketed_position, relative_from_absolute_position, DocumentId, Result}; /// Extracts the word positions and the documents ids where this word appear. /// From d1ddaa223d39b7ba74fde7f9a72b04662931935f Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 5 Apr 2023 18:05:44 +0200 Subject: [PATCH 08/10] Use correct codec in proximity --- .../ranking_rule_graph/proximity/compute_docids.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index 8496054b7..07bd102ca 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -1,14 +1,17 @@ #![allow(clippy::too_many_arguments)] +use std::collections::BTreeSet; + +use heed::BytesDecode; +use roaring::RoaringBitmap; + use super::ProximityCondition; use crate::search::new::interner::Interned; use crate::search::new::query_term::{Phrase, QueryTermSubset}; use crate::search::new::ranking_rule_graph::ComputedCondition; use crate::search::new::resolve_query_graph::compute_query_term_subset_docids; use crate::search::new::SearchContext; -use crate::{CboRoaringBitmapCodec, Result}; -use roaring::RoaringBitmap; -use std::collections::BTreeSet; +use crate::{CboRoaringBitmapCodec, Result, RoaringBitmapCodec}; pub fn compute_docids( ctx: &mut SearchContext, @@ -90,7 +93,8 @@ pub fn compute_docids( continue; } } else if let Some(lw_bytes) = ctx.get_db_word_docids(left_word)? 
{ - let left_word_docids = CboRoaringBitmapCodec::deserialize_from(lw_bytes)?; + let left_word_docids = + RoaringBitmapCodec::bytes_decode(lw_bytes).ok_or(heed::Error::Decoding)?; if universe.is_disjoint(&left_word_docids) { continue; } From d9460a76f43fa7b35cb2e4d423148c2a0ab174e5 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 5 Apr 2023 14:43:42 +0200 Subject: [PATCH 09/10] Fix word_position_docids indexing --- .../index_documents/extract/extract_word_position_docids.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index eef5089bc..734cf8778 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -39,9 +39,8 @@ pub fn extract_word_fid_and_position_docids( for position in read_u32_ne_bytes(value) { key_buffer.clear(); key_buffer.extend_from_slice(word_bytes); - let (fid, position) = relative_from_absolute_position(position); + let (_fid, position) = relative_from_absolute_position(position); let position = bucketed_position(position); - let position = absolute_from_relative_position(fid, position); key_buffer.extend_from_slice(&position.to_be_bytes()); word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; } From 5440f43fd3be28981933bd55e126ee88ed3324e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 5 Apr 2023 14:55:02 +0200 Subject: [PATCH 10/10] Fix indexing of word_position_docid and fid --- milli/src/snapshot_tests.rs | 8 +++ .../extract/extract_word_fid_docids.rs | 48 ++++++++++++++++ .../extract/extract_word_position_docids.rs | 4 +- .../src/update/index_documents/extract/mod.rs | 17 +++++- milli/src/update/index_documents/mod.rs | 57 +++++++++++++++++++ .../src/update/index_documents/typed_chunk.rs | 12 ++++ 6 files 
changed, 141 insertions(+), 5 deletions(-) create mode 100644 milli/src/update/index_documents/extract/extract_word_fid_docids.rs diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs index f7f1a97e6..eb94c4be9 100644 --- a/milli/src/snapshot_tests.rs +++ b/milli/src/snapshot_tests.rs @@ -248,6 +248,11 @@ pub fn snap_word_position_docids(index: &Index) -> String { &format!("{word:<16} {position:<6} {}", display_bitmap(&b)) }) } +pub fn snap_word_fid_docids(index: &Index) -> String { + make_db_snap_from_iter!(index, word_fid_docids, |((word, fid), b)| { + &format!("{word:<16} {fid:<3} {}", display_bitmap(&b)) + }) +} pub fn snap_field_id_word_count_docids(index: &Index) -> String { make_db_snap_from_iter!(index, field_id_word_count_docids, |((field_id, word_count), b)| { &format!("{field_id:<3} {word_count:<6} {}", display_bitmap(&b)) @@ -477,6 +482,9 @@ macro_rules! full_snap_of_db { ($index:ident, word_position_docids) => {{ $crate::snapshot_tests::snap_word_position_docids(&$index) }}; + ($index:ident, word_fid_docids) => {{ + $crate::snapshot_tests::snap_word_fid_docids(&$index) + }}; ($index:ident, field_id_word_count_docids) => {{ $crate::snapshot_tests::snap_field_id_word_count_docids(&$index) }}; diff --git a/milli/src/update/index_documents/extract/extract_word_fid_docids.rs b/milli/src/update/index_documents/extract/extract_word_fid_docids.rs new file mode 100644 index 000000000..72b30cddf --- /dev/null +++ b/milli/src/update/index_documents/extract/extract_word_fid_docids.rs @@ -0,0 +1,48 @@ +use std::fs::File; +use std::io; + +use super::helpers::{ + create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader, + try_split_array_at, GrenadParameters, +}; +use crate::error::SerializationError; +use crate::index::db_name::DOCID_WORD_POSITIONS; +use crate::{relative_from_absolute_position, DocumentId, Result}; + +/// Extracts the word, field id, and the documents ids where this word appear at this field id. 
+#[logging_timer::time] +pub fn extract_word_fid_docids( + docid_word_positions: grenad::Reader, + indexer: GrenadParameters, +) -> Result> { + let max_memory = indexer.max_memory_by_thread(); + + let mut word_fid_docids_sorter = create_sorter( + grenad::SortAlgorithm::Unstable, + merge_cbo_roaring_bitmaps, + indexer.chunk_compression_type, + indexer.chunk_compression_level, + indexer.max_nb_chunks, + max_memory, + ); + + let mut key_buffer = Vec::new(); + let mut cursor = docid_word_positions.into_cursor()?; + while let Some((key, value)) = cursor.move_on_next()? { + let (document_id_bytes, word_bytes) = try_split_array_at(key) + .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; + let document_id = DocumentId::from_be_bytes(document_id_bytes); + + for position in read_u32_ne_bytes(value) { + key_buffer.clear(); + key_buffer.extend_from_slice(word_bytes); + let (fid, _) = relative_from_absolute_position(position); + key_buffer.extend_from_slice(&fid.to_be_bytes()); + word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; + } + } + + let word_fid_docids_reader = sorter_into_reader(word_fid_docids_sorter, indexer)?; + + Ok(word_fid_docids_reader) +} diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index 734cf8778..80a36c308 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -14,7 +14,7 @@ use crate::{bucketed_position, relative_from_absolute_position, DocumentId, Resu /// Returns a grenad reader with the list of extracted words at positions and /// documents ids from the given chunk of docid word positions. 
#[logging_timer::time] -pub fn extract_word_fid_and_position_docids( +pub fn extract_word_position_docids( docid_word_positions: grenad::Reader, indexer: GrenadParameters, ) -> Result> { @@ -39,7 +39,7 @@ pub fn extract_word_fid_and_position_docids( for position in read_u32_ne_bytes(value) { key_buffer.clear(); key_buffer.extend_from_slice(word_bytes); - let (_fid, position) = relative_from_absolute_position(position); + let (_, position) = relative_from_absolute_position(position); let position = bucketed_position(position); key_buffer.extend_from_slice(&position.to_be_bytes()); word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 844efed36..db041de6f 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -5,6 +5,7 @@ mod extract_fid_docid_facet_values; mod extract_fid_word_count_docids; mod extract_geo_points; mod extract_word_docids; +mod extract_word_fid_docids; mod extract_word_pair_proximity_docids; mod extract_word_position_docids; @@ -22,8 +23,9 @@ use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values; use self::extract_fid_word_count_docids::extract_fid_word_count_docids; use self::extract_geo_points::extract_geo_points; use self::extract_word_docids::extract_word_docids; +use self::extract_word_fid_docids::extract_word_fid_docids; use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids; -use self::extract_word_position_docids::extract_word_fid_and_position_docids; +use self::extract_word_position_docids::extract_word_position_docids; use super::helpers::{ as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn, MergeableReader, @@ -130,14 +132,23 @@ pub(crate) fn data_from_obkv_documents( ); spawn_extraction_task::<_, _, Vec>>( - 
docid_word_positions_chunks, + docid_word_positions_chunks.clone(), indexer, lmdb_writer_sx.clone(), - extract_word_fid_and_position_docids, + extract_word_position_docids, merge_cbo_roaring_bitmaps, TypedChunk::WordPositionDocids, "word-position-docids", ); + spawn_extraction_task::<_, _, Vec>>( + docid_word_positions_chunks, + indexer, + lmdb_writer_sx.clone(), + extract_word_fid_docids, + merge_cbo_roaring_bitmaps, + TypedChunk::WordFidDocids, + "word-fid-docids", + ); spawn_extraction_task::<_, _, Vec>>( docid_fid_facet_strings_chunks, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index ade217beb..235b35fc8 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -2255,4 +2255,61 @@ mod tests { {"id":1,"catto":"jorts"} "###); } + + #[test] + fn test_word_fid_position() { + let index = TempIndex::new(); + + index + .add_documents(documents!([ + {"id": 0, "text": "sun flowers are looking at the sun" }, + {"id": 1, "text": "sun flowers are looking at the sun" }, + {"id": 2, "text": "the sun is shining today" }, + { + "id": 3, + "text": "a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a " + } + ])) + .unwrap(); + + db_snap!(index, word_fid_docids, 1, @"bf3355e493330de036c8823ddd1dbbd9"); + db_snap!(index, word_position_docids, 1, @"896d54b29ed79c4c6f14084f326dcf6f"); + + index + .add_documents(documents!([ + {"id": 4, "text": "sun flowers are looking at the sun" }, + {"id": 5, "text2": "sun flowers are looking at the sun" }, + {"id": 6, "text": "b b b" }, + { + "id": 7, + "text2": "a a a a" + } + ])) + .unwrap(); + + db_snap!(index, word_fid_docids, 2, 
@"a48d3f88db33f94bc23110a673ea49e4"); + db_snap!(index, word_position_docids, 2, @"3c9e66c6768ae2cf42b46b2c46e46a83"); + + let mut wtxn = index.write_txn().unwrap(); + + // Delete not all of the documents but some of them. + let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + builder.strategy(DeletionStrategy::AlwaysHard); + builder.delete_external_id("0"); + builder.delete_external_id("3"); + let result = builder.execute().unwrap(); + println!("{result:?}"); + + wtxn.commit().unwrap(); + + db_snap!(index, word_fid_docids, 3, @"4c2e2a1832e5802796edc1638136d933"); + db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f"); + db_snap!(index, docid_word_positions, 3, @"5287245332627675740b28bd46e1cde1"); + } } diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index b9b11cfa8..14ba021bd 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -35,6 +35,7 @@ pub(crate) enum TypedChunk { exact_word_docids_reader: grenad::Reader, }, WordPositionDocids(grenad::Reader), + WordFidDocids(grenad::Reader), WordPairProximityDocids(grenad::Reader), FieldIdFacetStringDocids(grenad::Reader), FieldIdFacetNumberDocids(grenad::Reader), @@ -140,6 +141,17 @@ pub(crate) fn write_typed_chunk_into_index( )?; is_merged_database = true; } + TypedChunk::WordFidDocids(word_fid_docids_iter) => { + append_entries_into_database( + word_fid_docids_iter, + &index.word_fid_docids, + wtxn, + index_is_empty, + |value, _buffer| Ok(value), + merge_cbo_roaring_bitmaps, + )?; + is_merged_database = true; + } TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids_iter) => { let indexer = FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter); indexer.execute(wtxn)?;