From 4d352a21ac72228d721db3a5aa29f3d42105fc50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 10 May 2023 13:31:19 +0200 Subject: [PATCH] Compute split words derivations of terms that don't accept typos --- .../new/query_term/compute_derivations.rs | 90 ++++++++++--------- .../src/search/new/tests/ngram_split_words.rs | 62 ++++++++++++- milli/src/search/new/tests/typo.rs | 7 +- 3 files changed, 109 insertions(+), 50 deletions(-) diff --git a/milli/src/search/new/query_term/compute_derivations.rs b/milli/src/search/new/query_term/compute_derivations.rs index 0da841890..c26c4bc6b 100644 --- a/milli/src/search/new/query_term/compute_derivations.rs +++ b/milli/src/search/new/query_term/compute_derivations.rs @@ -28,11 +28,9 @@ pub enum ZeroOrOneTypo { impl Interned { pub fn compute_fully_if_needed(self, ctx: &mut SearchContext) -> Result<()> { let s = ctx.term_interner.get_mut(self); - if s.max_nbr_typos == 0 { - s.one_typo = Lazy::Init(OneTypoTerm::default()); - s.two_typo = Lazy::Init(TwoTypoTerm::default()); - } else if s.max_nbr_typos == 1 && s.one_typo.is_uninit() { + if s.max_nbr_typos <= 1 && s.one_typo.is_uninit() { assert!(s.two_typo.is_uninit()); + // Initialize one_typo subterm even if max_nbr_typo is 0 because of split words self.initialize_one_typo_subterm(ctx)?; let s = ctx.term_interner.get_mut(self); assert!(s.one_typo.is_init()); @@ -277,7 +275,7 @@ fn find_split_words(ctx: &mut SearchContext, word: &str) -> Result { fn initialize_one_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> { let self_mut = ctx.term_interner.get_mut(self); - let QueryTerm { original, is_prefix, one_typo, .. } = self_mut; + let QueryTerm { original, is_prefix, one_typo, max_nbr_typos, .. } = self_mut; let original = *original; let is_prefix = *is_prefix; // let original_str = ctx.word_interner.get(*original).to_owned(); @@ -286,19 +284,22 @@ impl Interned { } let mut one_typo_words = BTreeSet::new(); - find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| { - match nbr_typos { - ZeroOrOneTypo::Zero => {} - ZeroOrOneTypo::One => { - if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { - one_typo_words.insert(derived_word); - } else { - return Ok(ControlFlow::Break(())); + if *max_nbr_typos > 0 { + find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| { + match nbr_typos { + ZeroOrOneTypo::Zero => {} + ZeroOrOneTypo::One => { + if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { + one_typo_words.insert(derived_word); + } else { + return Ok(ControlFlow::Break(())); + } } } - } - Ok(ControlFlow::Continue(())) - })?; + Ok(ControlFlow::Continue(())) + })?; + } + let original_str = ctx.word_interner.get(original).to_owned(); let split_words = find_split_words(ctx, original_str.as_str())?; @@ -327,7 +328,7 @@ impl Interned { } fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> { let self_mut = ctx.term_interner.get_mut(self); - let QueryTerm { original, is_prefix, two_typo, .. } = self_mut; + let QueryTerm { original, is_prefix, two_typo, max_nbr_typos, .. } = self_mut; let original_str = ctx.word_interner.get(*original).to_owned(); if two_typo.is_init() { return Ok(()); @@ -335,34 +336,37 @@ impl Interned { let mut one_typo_words = BTreeSet::new(); let mut two_typo_words = BTreeSet::new(); - find_zero_one_two_typo_derivations( - *original, - *is_prefix, - ctx.index.words_fst(ctx.txn)?, - &mut ctx.word_interner, - |derived_word, nbr_typos| { - if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT - && two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT - { - // No chance we will add either one- or two-typo derivations anymore, stop iterating. - return Ok(ControlFlow::Break(())); - } - match nbr_typos { - NumberOfTypos::Zero => {} - NumberOfTypos::One => { - if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { - one_typo_words.insert(derived_word); + if *max_nbr_typos > 0 { + find_zero_one_two_typo_derivations( + *original, + *is_prefix, + ctx.index.words_fst(ctx.txn)?, + &mut ctx.word_interner, + |derived_word, nbr_typos| { + if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT + && two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT + { + // No chance we will add either one- or two-typo derivations anymore, stop iterating. + return Ok(ControlFlow::Break(())); + } + match nbr_typos { + NumberOfTypos::Zero => {} + NumberOfTypos::One => { + if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { + one_typo_words.insert(derived_word); + } + } + NumberOfTypos::Two => { + if two_typo_words.len() < limits::MAX_TWO_TYPOS_COUNT { + two_typo_words.insert(derived_word); + } } } - NumberOfTypos::Two => { - if two_typo_words.len() < limits::MAX_TWO_TYPOS_COUNT { - two_typo_words.insert(derived_word); - } - } - } - Ok(ControlFlow::Continue(())) - }, - )?; + Ok(ControlFlow::Continue(())) + }, + )?; + } + let split_words = find_split_words(ctx, original_str.as_str())?; let self_mut = ctx.term_interner.get_mut(self); diff --git a/milli/src/search/new/tests/ngram_split_words.rs b/milli/src/search/new/tests/ngram_split_words.rs index 2a0365bac..fb99b8ba2 100644 --- a/milli/src/search/new/tests/ngram_split_words.rs +++ b/milli/src/search/new/tests/ngram_split_words.rs @@ -3,9 +3,9 @@ This module tests the following properties: 1. Two consecutive words from a query can be combined into a "2gram" 2. Three consecutive words from a query can be combined into a "3gram" -3. A word from the query can be split into two consecutive words (split words) +3. A word from the query can be split into two consecutive words (split words), no matter how short it is 4. A 2gram can be split into two words -5. A 3gram cannot be split into two words +5. A 3gram can be split into two words 6. 2grams can contain up to 1 typo 7. 3grams cannot have typos 8. 2grams and 3grams can be prefix tolerant @@ -14,6 +14,7 @@ This module tests the following properties: 11. Disabling typo tolerance does not disable ngram tolerance 12. Prefix tolerance is disabled for the last word if a space follows it 13. Ngrams cannot be formed by combining a phrase and a word or two phrases +14. Split words are not disabled by the `disableOnAttribute` or `disableOnWords` typo settings */ use crate::index::tests::TempIndex; @@ -56,6 +57,10 @@ fn create_index() -> TempIndex { { "id": 5, "text": "sunflowering is not a verb" + }, + { + "id": 6, + "text": "xy z" } ])) .unwrap(); @@ -263,10 +268,11 @@ fn test_disable_split_words() { s.query("sunflower "); let SearchResult { documents_ids, .. } = s.execute().unwrap(); // no document containing `sun flower` - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 3]"); let texts = collect_field_values(&index, &txn, "text", &documents_ids); insta::assert_debug_snapshot!(texts, @r###" [ + "\"the sun flower is tall\"", "\"the sunflower is tall\"", ] "###); @@ -307,10 +313,11 @@ fn test_3gram_no_split_words() { let SearchResult { documents_ids, .. } = s.execute().unwrap(); // no document with `sun flower` - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 5]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 2, 3, 5]"); let texts = collect_field_values(&index, &txn, "text", &documents_ids); insta::assert_debug_snapshot!(texts, @r###" [ + "\"the sun flower is tall\"", "\"the sunflowers are pretty\"", "\"the sunflower is tall\"", "\"sunflowering is not a verb\"", @@ -369,3 +376,50 @@ fn test_no_ngram_phrases() { ] "###); } + +#[test] +fn test_short_split_words() { + let index = create_index(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("xyz"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"xy z\"", + ] + "###); +} + +#[test] +fn test_split_words_never_disabled() { + let index = create_index(); + + index + .update_settings(|s| { + s.set_exact_words(["sunflower"].iter().map(ToString::to_string).collect()); + s.set_exact_attributes(["text"].iter().map(ToString::to_string).collect()); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the sunflower is tall"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 3]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the sun flower is tall\"", + "\"the sunflower is tall\"", + ] + "###); +} diff --git a/milli/src/search/new/tests/typo.rs b/milli/src/search/new/tests/typo.rs index 33b165a94..8fd9de5fc 100644 --- a/milli/src/search/new/tests/typo.rs +++ b/milli/src/search/new/tests/typo.rs @@ -9,7 +9,7 @@ This module tests the following properties: 6. A typo on the first letter of a word counts as two typos 7. Phrases are not typo tolerant 8. 2grams can have 1 typo if they are larger than `min_word_len_two_typos` -9. 3grams are not typo tolerant +9. 3grams are not typo tolerant (but they can be split into two words) 10. The `typo` ranking rule assumes the role of the `words` ranking rule implicitly if `words` doesn't exist before it. 11. The `typo` ranking rule places documents with the same number of typos in the same bucket @@ -287,16 +287,17 @@ fn test_typo_exact_word() { ] "###); - // exact words do not disable prefix (sunflowering OK, but no sunflowar or sun flower) + // exact words do not disable prefix (sunflowering OK, but no sunflowar) let mut s = Search::new(&txn, &index); s.terms_matching_strategy(TermsMatchingStrategy::All); s.query("network interconnection sunflower"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[16, 18]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[16, 17, 18]"); let texts = collect_field_values(&index, &txn, "text", &documents_ids); insta::assert_debug_snapshot!(texts, @r###" [ "\"network interconnection sunflower\"", + "\"network interconnection sun flower\"", "\"network interconnection sunflowering\"", ] "###);