From 4d352a21ac72228d721db3a5aa29f3d42105fc50 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= <loic.lecrenier@me.com>
Date: Wed, 10 May 2023 13:31:19 +0200
Subject: [PATCH] Compute split words derivations of terms that don't accept
 typos

---
 .../new/query_term/compute_derivations.rs     | 90 ++++++++++---------
 .../src/search/new/tests/ngram_split_words.rs | 62 ++++++++++++-
 milli/src/search/new/tests/typo.rs            |  7 +-
 3 files changed, 109 insertions(+), 50 deletions(-)
diff --git a/milli/src/search/new/query_term/compute_derivations.rs b/milli/src/search/new/query_term/compute_derivations.rs
index 0da841890..c26c4bc6b 100644
--- a/milli/src/search/new/query_term/compute_derivations.rs
+++ b/milli/src/search/new/query_term/compute_derivations.rs
@@ -28,11 +28,9 @@ pub enum ZeroOrOneTypo {
 impl Interned<QueryTerm> {
     pub fn compute_fully_if_needed(self, ctx: &mut SearchContext) -> Result<()> {
         let s = ctx.term_interner.get_mut(self);
-        if s.max_nbr_typos == 0 {
-            s.one_typo = Lazy::Init(OneTypoTerm::default());
-            s.two_typo = Lazy::Init(TwoTypoTerm::default());
-        } else if s.max_nbr_typos == 1 && s.one_typo.is_uninit() {
+        if s.max_nbr_typos <= 1 && s.one_typo.is_uninit() {
             assert!(s.two_typo.is_uninit());
+            // Initialize one_typo subterm even if max_nbr_typo is 0 because of split words
             self.initialize_one_typo_subterm(ctx)?;
             let s = ctx.term_interner.get_mut(self);
             assert!(s.one_typo.is_init());
@@ -277,7 +275,7 @@ fn find_split_words(ctx: &mut SearchContext, word: &str) -> Result<Option<Intern
 impl Interned<QueryTerm> {
     fn initialize_one_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> {
         let self_mut = ctx.term_interner.get_mut(self);
-        let QueryTerm { original, is_prefix, one_typo, .. } = self_mut;
+        let QueryTerm { original, is_prefix, one_typo, max_nbr_typos, .. } = self_mut;
         let original = *original;
         let is_prefix = *is_prefix;
         // let original_str = ctx.word_interner.get(*original).to_owned();
@@ -286,19 +284,22 @@ impl Interned<QueryTerm> {
         }
         let mut one_typo_words = BTreeSet::new();
 
-        find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| {
-            match nbr_typos {
-                ZeroOrOneTypo::Zero => {}
-                ZeroOrOneTypo::One => {
-                    if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
-                        one_typo_words.insert(derived_word);
-                    } else {
-                        return Ok(ControlFlow::Break(()));
+        if *max_nbr_typos > 0 {
+            find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| {
+                match nbr_typos {
+                    ZeroOrOneTypo::Zero => {}
+                    ZeroOrOneTypo::One => {
+                        if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
+                            one_typo_words.insert(derived_word);
+                        } else {
+                            return Ok(ControlFlow::Break(()));
+                        }
                     }
                 }
-            }
-            Ok(ControlFlow::Continue(()))
-        })?;
+                Ok(ControlFlow::Continue(()))
+            })?;
+        }
+
         let original_str = ctx.word_interner.get(original).to_owned();
         let split_words = find_split_words(ctx, original_str.as_str())?;
 
@@ -327,7 +328,7 @@ impl Interned<QueryTerm> {
     }
     fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> {
         let self_mut = ctx.term_interner.get_mut(self);
-        let QueryTerm { original, is_prefix, two_typo, .. } = self_mut;
+        let QueryTerm { original, is_prefix, two_typo, max_nbr_typos, .. } = self_mut;
         let original_str = ctx.word_interner.get(*original).to_owned();
         if two_typo.is_init() {
             return Ok(());
@@ -335,34 +336,37 @@ impl Interned<QueryTerm> {
         let mut one_typo_words = BTreeSet::new();
         let mut two_typo_words = BTreeSet::new();
 
-        find_zero_one_two_typo_derivations(
-            *original,
-            *is_prefix,
-            ctx.index.words_fst(ctx.txn)?,
-            &mut ctx.word_interner,
-            |derived_word, nbr_typos| {
-                if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT
-                    && two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT
-                {
-                    // No chance we will add either one- or two-typo derivations anymore, stop iterating.
-                    return Ok(ControlFlow::Break(()));
-                }
-                match nbr_typos {
-                    NumberOfTypos::Zero => {}
-                    NumberOfTypos::One => {
-                        if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
-                            one_typo_words.insert(derived_word);
+        if *max_nbr_typos > 0 {
+            find_zero_one_two_typo_derivations(
+                *original,
+                *is_prefix,
+                ctx.index.words_fst(ctx.txn)?,
+                &mut ctx.word_interner,
+                |derived_word, nbr_typos| {
+                    if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT
+                        && two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT
+                    {
+                        // No chance we will add either one- or two-typo derivations anymore, stop iterating.
+                        return Ok(ControlFlow::Break(()));
+                    }
+                    match nbr_typos {
+                        NumberOfTypos::Zero => {}
+                        NumberOfTypos::One => {
+                            if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
+                                one_typo_words.insert(derived_word);
+                            }
+                        }
+                        NumberOfTypos::Two => {
+                            if two_typo_words.len() < limits::MAX_TWO_TYPOS_COUNT {
+                                two_typo_words.insert(derived_word);
+                            }
                         }
                     }
-                    NumberOfTypos::Two => {
-                        if two_typo_words.len() < limits::MAX_TWO_TYPOS_COUNT {
-                            two_typo_words.insert(derived_word);
-                        }
-                    }
-                }
-                Ok(ControlFlow::Continue(()))
-            },
-        )?;
+                    Ok(ControlFlow::Continue(()))
+                },
+            )?;
+        }
+
         let split_words = find_split_words(ctx, original_str.as_str())?;
         let self_mut = ctx.term_interner.get_mut(self);
 
diff --git a/milli/src/search/new/tests/ngram_split_words.rs b/milli/src/search/new/tests/ngram_split_words.rs
index 2a0365bac..fb99b8ba2 100644
--- a/milli/src/search/new/tests/ngram_split_words.rs
+++ b/milli/src/search/new/tests/ngram_split_words.rs
@@ -3,9 +3,9 @@ This module tests the following properties:
 
 1. Two consecutive words from a query can be combined into a "2gram"
 2. Three consecutive words from a query can be combined into a "3gram"
-3. A word from the query can be split into two consecutive words (split words)
+3. A word from the query can be split into two consecutive words (split words), no matter how short it is
 4. A 2gram can be split into two words
-5. A 3gram cannot be split into two words
+5. A 3gram can be split into two words
 6. 2grams can contain up to 1 typo
 7. 3grams cannot have typos
 8. 2grams and 3grams can be prefix tolerant
@@ -14,6 +14,7 @@ This module tests the following properties:
 11. Disabling typo tolerance does not disable ngram tolerance
 12. Prefix tolerance is disabled for the last word if a space follows it
 13. Ngrams cannot be formed by combining a phrase and a word or two phrases
+14. Split words are not disabled by the `disableOnAttribute` or `disableOnWords` typo settings
 */
 
 use crate::index::tests::TempIndex;
@@ -56,6 +57,10 @@ fn create_index() -> TempIndex {
             {
                 "id": 5,
                 "text": "sunflowering is not a verb"
+            },
+            {
+                "id": 6,
+                "text": "xy z"
             }
         ]))
         .unwrap();
@@ -263,10 +268,11 @@ fn test_disable_split_words() {
     s.query("sunflower ");
     let SearchResult { documents_ids, .. } = s.execute().unwrap();
     // no document containing `sun flower`
-    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3]");
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 3]");
     let texts = collect_field_values(&index, &txn, "text", &documents_ids);
     insta::assert_debug_snapshot!(texts, @r###"
     [
+        "\"the sun flower is tall\"",
         "\"the sunflower is tall\"",
     ]
     "###);
@@ -307,10 +313,11 @@ fn test_3gram_no_split_words() {
     let SearchResult { documents_ids, .. } = s.execute().unwrap();
 
     // no document with `sun flower`
-    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 5]");
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 2, 3, 5]");
     let texts = collect_field_values(&index, &txn, "text", &documents_ids);
     insta::assert_debug_snapshot!(texts, @r###"
     [
+        "\"the sun flower is tall\"",
         "\"the sunflowers are pretty\"",
         "\"the sunflower is tall\"",
         "\"sunflowering is not a verb\"",
@@ -369,3 +376,50 @@ fn test_no_ngram_phrases() {
     ]
     "###);
 }
+
+#[test]
+fn test_short_split_words() {
+    let index = create_index();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("xyz");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"xy z\"",
+    ]
+    "###);
+}
+
+#[test]
+fn test_split_words_never_disabled() {
+    let index = create_index();
+
+    index
+        .update_settings(|s| {
+            s.set_exact_words(["sunflower"].iter().map(ToString::to_string).collect());
+            s.set_exact_attributes(["text"].iter().map(ToString::to_string).collect());
+        })
+        .unwrap();
+
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the sunflower is tall");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 3]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"the sun flower is tall\"",
+        "\"the sunflower is tall\"",
+    ]
+    "###);
+}
diff --git a/milli/src/search/new/tests/typo.rs b/milli/src/search/new/tests/typo.rs
index 33b165a94..8fd9de5fc 100644
--- a/milli/src/search/new/tests/typo.rs
+++ b/milli/src/search/new/tests/typo.rs
@@ -9,7 +9,7 @@ This module tests the following properties:
 6. A typo on the first letter of a word counts as two typos
 7. Phrases are not typo tolerant
 8. 2grams can have 1 typo if they are larger than `min_word_len_two_typos`
-9. 3grams are not typo tolerant
+9. 3grams are not typo tolerant (but they can be split into two words)
 10. The `typo` ranking rule assumes the role of the `words` ranking rule implicitly
 if `words` doesn't exist before it.
 11. The `typo` ranking rule places documents with the same number of typos in the same bucket
@@ -287,16 +287,17 @@ fn test_typo_exact_word() {
     ]
     "###);
 
-    // exact words do not disable prefix (sunflowering OK, but no sunflowar or sun flower)
+    // exact words do not disable prefix (sunflowering OK, but no sunflowar)
     let mut s = Search::new(&txn, &index);
     s.terms_matching_strategy(TermsMatchingStrategy::All);
     s.query("network interconnection sunflower");
     let SearchResult { documents_ids, .. } = s.execute().unwrap();
-    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[16, 18]");
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[16, 17, 18]");
     let texts = collect_field_values(&index, &txn, "text", &documents_ids);
     insta::assert_debug_snapshot!(texts, @r###"
     [
         "\"network interconnection sunflower\"",
+        "\"network interconnection sun flower\"",
         "\"network interconnection sunflowering\"",
     ]
     "###);