From 57582688660f7bb0c40865a558933fa4f9a73a27 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= <loic.lecrenier@me.com>
Date: Tue, 16 May 2023 16:22:23 +0200
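Subject: [PATCH] Don't compute split_words for phrases

Skip the split-words lookup when the term is a phrase.

Also rename the `QueryTerm::max_nbr_typos` field to
`max_levenshtein_distance` and `QueryTermSubset::max_nbr_typos()` to
`max_typo_cost()`. For a term whose maximum Levenshtein distance is 0,
`max_typo_cost()` now returns 1 when the term allows split words and 0
otherwise (previously always 0).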

---
 milli/src/search/new/logger/visual.rs         |  2 +-
 .../new/query_term/compute_derivations.rs     | 37 ++++++++++++++-----
 milli/src/search/new/query_term/mod.rs        | 17 +++++++--
 .../src/search/new/query_term/parse_query.rs  |  4 +-
 .../search/new/ranking_rule_graph/typo/mod.rs |  2 +-
 5 files changed, 45 insertions(+), 17 deletions(-)
diff --git a/milli/src/search/new/logger/visual.rs b/milli/src/search/new/logger/visual.rs
index f76782e63..8df56da89 100644
--- a/milli/src/search/new/logger/visual.rs
+++ b/milli/src/search/new/logger/visual.rs
@@ -462,7 +462,7 @@ fill: \"#B6E2D3\"
                 shape: class
                 max_nbr_typo: {}",
                     term_subset.description(ctx),
-                    term_subset.max_nbr_typos(ctx)
+                    term_subset.max_typo_cost(ctx)
                 )?;
 
                 for w in term_subset.all_single_words_except_prefix_db(ctx)? {
diff --git a/milli/src/search/new/query_term/compute_derivations.rs b/milli/src/search/new/query_term/compute_derivations.rs
index c26c4bc6b..d5dfbbcd0 100644
--- a/milli/src/search/new/query_term/compute_derivations.rs
+++ b/milli/src/search/new/query_term/compute_derivations.rs
@@ -28,14 +28,14 @@ pub enum ZeroOrOneTypo {
 impl Interned<QueryTerm> {
     pub fn compute_fully_if_needed(self, ctx: &mut SearchContext) -> Result<()> {
         let s = ctx.term_interner.get_mut(self);
-        if s.max_nbr_typos <= 1 && s.one_typo.is_uninit() {
+        if s.max_levenshtein_distance <= 1 && s.one_typo.is_uninit() {
             assert!(s.two_typo.is_uninit());
             // Initialize one_typo subterm even if max_nbr_typo is 0 because of split words
             self.initialize_one_typo_subterm(ctx)?;
             let s = ctx.term_interner.get_mut(self);
             assert!(s.one_typo.is_init());
             s.two_typo = Lazy::Init(TwoTypoTerm::default());
-        } else if s.max_nbr_typos > 1 && s.two_typo.is_uninit() {
+        } else if s.max_levenshtein_distance > 1 && s.two_typo.is_uninit() {
             assert!(s.two_typo.is_uninit());
             self.initialize_one_and_two_typo_subterm(ctx)?;
             let s = ctx.term_interner.get_mut(self);
@@ -185,7 +185,7 @@ pub fn partially_initialized_term_from_word(
                 original: ctx.word_interner.insert(word.to_owned()),
                 ngram_words: None,
                 is_prefix: false,
-                max_nbr_typos: 0,
+                max_levenshtein_distance: 0,
                 zero_typo: <_>::default(),
                 one_typo: Lazy::Init(<_>::default()),
                 two_typo: Lazy::Init(<_>::default()),
@@ -256,7 +256,7 @@ pub fn partially_initialized_term_from_word(
     Ok(QueryTerm {
         original: word_interned,
         ngram_words: None,
-        max_nbr_typos: max_typo,
+        max_levenshtein_distance: max_typo,
         is_prefix,
         zero_typo,
         one_typo: Lazy::Uninit,
@@ -275,7 +275,16 @@ fn find_split_words(ctx: &mut SearchContext, word: &str) -> Result<Option<Intern
 impl Interned<QueryTerm> {
     fn initialize_one_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> {
         let self_mut = ctx.term_interner.get_mut(self);
-        let QueryTerm { original, is_prefix, one_typo, max_nbr_typos, .. } = self_mut;
+
+        let allows_split_words = self_mut.allows_split_words();
+        let QueryTerm {
+            original,
+            is_prefix,
+            one_typo,
+            max_levenshtein_distance: max_nbr_typos,
+            ..
+        } = self_mut;
+
         let original = *original;
         let is_prefix = *is_prefix;
         // let original_str = ctx.word_interner.get(*original).to_owned();
@@ -300,13 +309,17 @@ impl Interned<QueryTerm> {
             })?;
         }
 
-        let original_str = ctx.word_interner.get(original).to_owned();
-        let split_words = find_split_words(ctx, original_str.as_str())?;
+        let split_words = if allows_split_words {
+            let original_str = ctx.word_interner.get(original).to_owned();
+            find_split_words(ctx, original_str.as_str())?
+        } else {
+            None
+        };
 
         let self_mut = ctx.term_interner.get_mut(self);
 
         // Only add the split words to the derivations if:
-        // 1. the term is not an ngram; OR
+        // 1. the term is neither an ngram nor a phrase; OR
         // 2. the term is an ngram, but the split words are different from the ngram's component words
         let split_words = if let Some((ngram_words, split_words)) =
             self_mut.ngram_words.as_ref().zip(split_words.as_ref())
@@ -328,7 +341,13 @@ impl Interned<QueryTerm> {
     }
     fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> {
         let self_mut = ctx.term_interner.get_mut(self);
-        let QueryTerm { original, is_prefix, two_typo, max_nbr_typos, .. } = self_mut;
+        let QueryTerm {
+            original,
+            is_prefix,
+            two_typo,
+            max_levenshtein_distance: max_nbr_typos,
+            ..
+        } = self_mut;
         let original_str = ctx.word_interner.get(*original).to_owned();
         if two_typo.is_init() {
             return Ok(());
diff --git a/milli/src/search/new/query_term/mod.rs b/milli/src/search/new/query_term/mod.rs
index bf521d9b2..fb749a797 100644
--- a/milli/src/search/new/query_term/mod.rs
+++ b/milli/src/search/new/query_term/mod.rs
@@ -43,7 +43,7 @@ pub struct QueryTermSubset {
 pub struct QueryTerm {
     original: Interned<String>,
     ngram_words: Option<Vec<Interned<String>>>,
-    max_nbr_typos: u8,
+    max_levenshtein_distance: u8,
     is_prefix: bool,
     zero_typo: ZeroTypoTerm,
     // May not be computed yet
@@ -342,10 +342,16 @@ impl QueryTermSubset {
         }
         None
     }
-    pub fn max_nbr_typos(&self, ctx: &SearchContext) -> u8 {
+    pub fn max_typo_cost(&self, ctx: &SearchContext) -> u8 {
         let t = ctx.term_interner.get(self.original);
-        match t.max_nbr_typos {
-            0 => 0,
+        match t.max_levenshtein_distance {
+            0 => {
+                if t.allows_split_words() {
+                    1
+                } else {
+                    0
+                }
+            }
             1 => {
                 if self.one_typo_subset.is_empty() {
                     0
@@ -438,6 +444,9 @@ impl QueryTerm {
 
         self.zero_typo.is_empty() && one_typo.is_empty() && two_typo.is_empty()
     }
+    fn allows_split_words(&self) -> bool {
+        self.zero_typo.phrase.is_none() // true unless the term is a phrase: phrases get no split words
+    }
 }
 
 impl Interned<QueryTerm> {
diff --git a/milli/src/search/new/query_term/parse_query.rs b/milli/src/search/new/query_term/parse_query.rs
index dc317a0fb..bf90748e4 100644
--- a/milli/src/search/new/query_term/parse_query.rs
+++ b/milli/src/search/new/query_term/parse_query.rs
@@ -217,7 +217,7 @@ pub fn make_ngram(
         original: ngram_str_interned,
         ngram_words: Some(words_interned),
         is_prefix,
-        max_nbr_typos,
+        max_levenshtein_distance: max_nbr_typos,
         zero_typo: term.zero_typo,
         one_typo: Lazy::Uninit,
         two_typo: Lazy::Uninit,
@@ -271,7 +271,7 @@ impl PhraseBuilder {
                 QueryTerm {
                     original: ctx.word_interner.insert(phrase_desc),
                     ngram_words: None,
-                    max_nbr_typos: 0,
+                    max_levenshtein_distance: 0,
                     is_prefix: false,
                     zero_typo: ZeroTypoTerm {
                         phrase: Some(phrase),
diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs
index da5198c23..a44be6015 100644
--- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs
@@ -50,7 +50,7 @@ impl RankingRuleGraphTrait for TypoGraph {
         // 3-gram -> equivalent to 2 typos
         let base_cost = if term.term_ids.len() == 1 { 0 } else { term.term_ids.len() as u32 };
 
-        for nbr_typos in 0..=term.term_subset.max_nbr_typos(ctx) {
+        for nbr_typos in 0..=term.term_subset.max_typo_cost(ctx) {
             let mut term = term.clone();
             match nbr_typos {
                 0 => {