Compute split words derivations of terms that don't accept typos

2025-02-20 17:45:54 +08:00 · 2023-05-10 13:31:19 +02:00 · 2023-05-10 13:31:19 +02:00 · 4d352a21ac
commit 4d352a21ac
parent 4a4210c116
3 changed files with 109 additions and 50 deletions
--- a/milli/src/search/new/query_term/compute_derivations.rs
+++ b/milli/src/search/new/query_term/compute_derivations.rs
@ -28,11 +28,9 @@ pub enum ZeroOrOneTypo {
 impl Interned<QueryTerm> {
    pub fn compute_fully_if_needed(self, ctx: &mut SearchContext) -> Result<()> {
        let s = ctx.term_interner.get_mut(self);
-        if s.max_nbr_typos == 0 {
-            s.one_typo = Lazy::Init(OneTypoTerm::default());
-            s.two_typo = Lazy::Init(TwoTypoTerm::default());
-        } else if s.max_nbr_typos == 1 && s.one_typo.is_uninit() {
+        if s.max_nbr_typos <= 1 && s.one_typo.is_uninit() {
            assert!(s.two_typo.is_uninit());
+            // Initialize the one_typo subterm even if max_nbr_typos is 0, because split words must still be computed
            self.initialize_one_typo_subterm(ctx)?;
            let s = ctx.term_interner.get_mut(self);
            assert!(s.one_typo.is_init());
@ -277,7 +275,7 @@ fn find_split_words(ctx: &mut SearchContext, word: &str) -> Result<Option<Intern
 impl Interned<QueryTerm> {
    fn initialize_one_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> {
        let self_mut = ctx.term_interner.get_mut(self);
-        let QueryTerm { original, is_prefix, one_typo, .. } = self_mut;
+        let QueryTerm { original, is_prefix, one_typo, max_nbr_typos, .. } = self_mut;
        let original = *original;
        let is_prefix = *is_prefix;
        // let original_str = ctx.word_interner.get(*original).to_owned();
@ -286,19 +284,22 @@ impl Interned<QueryTerm> {
        }
        let mut one_typo_words = BTreeSet::new();

-        find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| {
-            match nbr_typos {
-                ZeroOrOneTypo::Zero => {}
-                ZeroOrOneTypo::One => {
-                    if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
-                        one_typo_words.insert(derived_word);
-                    } else {
-                        return Ok(ControlFlow::Break(()));
+        if *max_nbr_typos > 0 {
+            find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| {
+                match nbr_typos {
+                    ZeroOrOneTypo::Zero => {}
+                    ZeroOrOneTypo::One => {
+                        if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
+                            one_typo_words.insert(derived_word);
+                        } else {
+                            return Ok(ControlFlow::Break(()));
+                        }
                    }
                }
-            }
-            Ok(ControlFlow::Continue(()))
-        })?;
+                Ok(ControlFlow::Continue(()))
+            })?;
+        }
+
        let original_str = ctx.word_interner.get(original).to_owned();
        let split_words = find_split_words(ctx, original_str.as_str())?;

@ -327,7 +328,7 @@ impl Interned<QueryTerm> {
    }
    fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> {
        let self_mut = ctx.term_interner.get_mut(self);
-        let QueryTerm { original, is_prefix, two_typo, .. } = self_mut;
+        let QueryTerm { original, is_prefix, two_typo, max_nbr_typos, .. } = self_mut;
        let original_str = ctx.word_interner.get(*original).to_owned();
        if two_typo.is_init() {
            return Ok(());
@ -335,34 +336,37 @@ impl Interned<QueryTerm> {
        let mut one_typo_words = BTreeSet::new();
        let mut two_typo_words = BTreeSet::new();

-        find_zero_one_two_typo_derivations(
-            *original,
-            *is_prefix,
-            ctx.index.words_fst(ctx.txn)?,
-            &mut ctx.word_interner,
-            |derived_word, nbr_typos| {
-                if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT
-                    && two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT
-                {
-                    // No chance we will add either one- or two-typo derivations anymore, stop iterating.
-                    return Ok(ControlFlow::Break(()));
-                }
-                match nbr_typos {
-                    NumberOfTypos::Zero => {}
-                    NumberOfTypos::One => {
-                        if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
-                            one_typo_words.insert(derived_word);
+        if *max_nbr_typos > 0 {
+            find_zero_one_two_typo_derivations(
+                *original,
+                *is_prefix,
+                ctx.index.words_fst(ctx.txn)?,
+                &mut ctx.word_interner,
+                |derived_word, nbr_typos| {
+                    if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT
+                        && two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT
+                    {
+                        // No chance we will add either one- or two-typo derivations anymore, stop iterating.
+                        return Ok(ControlFlow::Break(()));
+                    }
+                    match nbr_typos {
+                        NumberOfTypos::Zero => {}
+                        NumberOfTypos::One => {
+                            if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
+                                one_typo_words.insert(derived_word);
+                            }
+                        }
+                        NumberOfTypos::Two => {
+                            if two_typo_words.len() < limits::MAX_TWO_TYPOS_COUNT {
+                                two_typo_words.insert(derived_word);
+                            }
                        }
                    }
-                    NumberOfTypos::Two => {
-                        if two_typo_words.len() < limits::MAX_TWO_TYPOS_COUNT {
-                            two_typo_words.insert(derived_word);
-                        }
-                    }
-                }
-                Ok(ControlFlow::Continue(()))
-            },
-        )?;
+                    Ok(ControlFlow::Continue(()))
+                },
+            )?;
+        }
+
        let split_words = find_split_words(ctx, original_str.as_str())?;
        let self_mut = ctx.term_interner.get_mut(self);

--- a/milli/src/search/new/tests/ngram_split_words.rs
+++ b/milli/src/search/new/tests/ngram_split_words.rs
@ -3,9 +3,9 @@ This module tests the following properties:

 1. Two consecutive words from a query can be combined into a "2gram"
 2. Three consecutive words from a query can be combined into a "3gram"
-3. A word from the query can be split into two consecutive words (split words)
+3. A word from the query can be split into two consecutive words (split words), no matter how short it is
 4. A 2gram can be split into two words
-5. A 3gram cannot be split into two words
+5. A 3gram can be split into two words
 6. 2grams can contain up to 1 typo
 7. 3grams cannot have typos
 8. 2grams and 3grams can be prefix tolerant
@ -14,6 +14,7 @@ This module tests the following properties:
 11. Disabling typo tolerance does not disable ngram tolerance
 12. Prefix tolerance is disabled for the last word if a space follows it
 13. Ngrams cannot be formed by combining a phrase and a word or two phrases
+14. Split words are not disabled by the `disableOnAttributes` or `disableOnWords` typo settings
 */

 use crate::index::tests::TempIndex;
@ -56,6 +57,10 @@ fn create_index() -> TempIndex {
            {
                "id": 5,
                "text": "sunflowering is not a verb"
+            },
+            {
+                "id": 6,
+                "text": "xy z"
            }
        ]))
        .unwrap();
@ -263,10 +268,11 @@ fn test_disable_split_words() {
    s.query("sunflower ");
    let SearchResult { documents_ids, .. } = s.execute().unwrap();
    // no document containing `sun flower`
-    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3]");
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 3]");
    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
    insta::assert_debug_snapshot!(texts, @r###"
    [
+        "\"the sun flower is tall\"",
        "\"the sunflower is tall\"",
    ]
    "###);
@ -307,10 +313,11 @@ fn test_3gram_no_split_words() {
    let SearchResult { documents_ids, .. } = s.execute().unwrap();

    // no document with `sun flower`
-    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 5]");
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 2, 3, 5]");
    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
    insta::assert_debug_snapshot!(texts, @r###"
    [
+        "\"the sun flower is tall\"",
        "\"the sunflowers are pretty\"",
        "\"the sunflower is tall\"",
        "\"sunflowering is not a verb\"",
@ -369,3 +376,50 @@ fn test_no_ngram_phrases() {
    ]
    "###);
 }
+
+#[test]
+fn test_short_split_words() {
+    let index = create_index();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("xyz");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"xy z\"",
+    ]
+    "###);
+}
+
+#[test]
+fn test_split_words_never_disabled() {
+    let index = create_index();
+
+    index
+        .update_settings(|s| {
+            s.set_exact_words(["sunflower"].iter().map(ToString::to_string).collect());
+            s.set_exact_attributes(["text"].iter().map(ToString::to_string).collect());
+        })
+        .unwrap();
+
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the sunflower is tall");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 3]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"the sun flower is tall\"",
+        "\"the sunflower is tall\"",
+    ]
+    "###);
+}
--- a/milli/src/search/new/tests/typo.rs
+++ b/milli/src/search/new/tests/typo.rs
@ -9,7 +9,7 @@ This module tests the following properties:
 6. A typo on the first letter of a word counts as two typos
 7. Phrases are not typo tolerant
 8. 2grams can have 1 typo if they are larger than `min_word_len_two_typos`
-9. 3grams are not typo tolerant
+9. 3grams are not typo tolerant (but they can be split into two words)
 10. The `typo` ranking rule assumes the role of the `words` ranking rule implicitly
 if `words` doesn't exist before it.
 11. The `typo` ranking rule places documents with the same number of typos in the same bucket
@ -287,16 +287,17 @@ fn test_typo_exact_word() {
    ]
    "###);

-    // exact words do not disable prefix (sunflowering OK, but no sunflowar or sun flower)
+    // exact words do not disable prefix (sunflowering OK, but no sunflowar)
    let mut s = Search::new(&txn, &index);
    s.terms_matching_strategy(TermsMatchingStrategy::All);
    s.query("network interconnection sunflower");
    let SearchResult { documents_ids, .. } = s.execute().unwrap();
-    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[16, 18]");
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[16, 17, 18]");
    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
    insta::assert_debug_snapshot!(texts, @r###"
    [
        "\"network interconnection sunflower\"",
+        "\"network interconnection sun flower\"",
        "\"network interconnection sunflowering\"",
    ]
    "###);