From 57582688660f7bb0c40865a558933fa4f9a73a27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 16 May 2023 16:22:23 +0200 Subject: [PATCH] Don't compute split_words for phrases --- milli/src/search/new/logger/visual.rs | 2 +- .../new/query_term/compute_derivations.rs | 37 ++++++++++++++----- milli/src/search/new/query_term/mod.rs | 17 +++++++-- .../src/search/new/query_term/parse_query.rs | 4 +- .../search/new/ranking_rule_graph/typo/mod.rs | 2 +- 5 files changed, 45 insertions(+), 17 deletions(-) diff --git a/milli/src/search/new/logger/visual.rs b/milli/src/search/new/logger/visual.rs index f76782e63..8df56da89 100644 --- a/milli/src/search/new/logger/visual.rs +++ b/milli/src/search/new/logger/visual.rs @@ -462,7 +462,7 @@ fill: \"#B6E2D3\" shape: class max_nbr_typo: {}", term_subset.description(ctx), - term_subset.max_nbr_typos(ctx) + term_subset.max_typo_cost(ctx) )?; for w in term_subset.all_single_words_except_prefix_db(ctx)? { diff --git a/milli/src/search/new/query_term/compute_derivations.rs b/milli/src/search/new/query_term/compute_derivations.rs index c26c4bc6b..d5dfbbcd0 100644 --- a/milli/src/search/new/query_term/compute_derivations.rs +++ b/milli/src/search/new/query_term/compute_derivations.rs @@ -28,14 +28,14 @@ pub enum ZeroOrOneTypo { impl Interned { pub fn compute_fully_if_needed(self, ctx: &mut SearchContext) -> Result<()> { let s = ctx.term_interner.get_mut(self); - if s.max_nbr_typos <= 1 && s.one_typo.is_uninit() { + if s.max_levenshtein_distance <= 1 && s.one_typo.is_uninit() { assert!(s.two_typo.is_uninit()); // Initialize one_typo subterm even if max_nbr_typo is 0 because of split words self.initialize_one_typo_subterm(ctx)?; let s = ctx.term_interner.get_mut(self); assert!(s.one_typo.is_init()); s.two_typo = Lazy::Init(TwoTypoTerm::default()); - } else if s.max_nbr_typos > 1 && s.two_typo.is_uninit() { + } else if s.max_levenshtein_distance > 1 && s.two_typo.is_uninit() { assert!(s.two_typo.is_uninit()); self.initialize_one_and_two_typo_subterm(ctx)?; let s = ctx.term_interner.get_mut(self); @@ -185,7 +185,7 @@ pub fn partially_initialized_term_from_word( original: ctx.word_interner.insert(word.to_owned()), ngram_words: None, is_prefix: false, - max_nbr_typos: 0, + max_levenshtein_distance: 0, zero_typo: <_>::default(), one_typo: Lazy::Init(<_>::default()), two_typo: Lazy::Init(<_>::default()), @@ -256,7 +256,7 @@ pub fn partially_initialized_term_from_word( Ok(QueryTerm { original: word_interned, ngram_words: None, - max_nbr_typos: max_typo, + max_levenshtein_distance: max_typo, is_prefix, zero_typo, one_typo: Lazy::Uninit, @@ -275,7 +275,16 @@ fn find_split_words(ctx: &mut SearchContext, word: &str) -> Result { fn initialize_one_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> { let self_mut = ctx.term_interner.get_mut(self); - let QueryTerm { original, is_prefix, one_typo, max_nbr_typos, .. } = self_mut; + + let allows_split_words = self_mut.allows_split_words(); + let QueryTerm { + original, + is_prefix, + one_typo, + max_levenshtein_distance: max_nbr_typos, + .. + } = self_mut; + let original = *original; let is_prefix = *is_prefix; // let original_str = ctx.word_interner.get(*original).to_owned(); @@ -300,13 +309,17 @@ impl Interned { })?; } - let original_str = ctx.word_interner.get(original).to_owned(); - let split_words = find_split_words(ctx, original_str.as_str())?; + let split_words = if allows_split_words { + let original_str = ctx.word_interner.get(original).to_owned(); + find_split_words(ctx, original_str.as_str())? + } else { + None + }; let self_mut = ctx.term_interner.get_mut(self); // Only add the split words to the derivations if: - // 1. the term is not an ngram; OR + // 1. the term is neither an ngram nor a phrase; OR // 2. the term is an ngram, but the split words are different from the ngram's component words let split_words = if let Some((ngram_words, split_words)) = self_mut.ngram_words.as_ref().zip(split_words.as_ref()) @@ -328,7 +341,13 @@ impl Interned { } fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> { let self_mut = ctx.term_interner.get_mut(self); - let QueryTerm { original, is_prefix, two_typo, max_nbr_typos, .. } = self_mut; + let QueryTerm { + original, + is_prefix, + two_typo, + max_levenshtein_distance: max_nbr_typos, + .. + } = self_mut; let original_str = ctx.word_interner.get(*original).to_owned(); if two_typo.is_init() { return Ok(()); diff --git a/milli/src/search/new/query_term/mod.rs b/milli/src/search/new/query_term/mod.rs index bf521d9b2..fb749a797 100644 --- a/milli/src/search/new/query_term/mod.rs +++ b/milli/src/search/new/query_term/mod.rs @@ -43,7 +43,7 @@ pub struct QueryTermSubset { pub struct QueryTerm { original: Interned, ngram_words: Option>>, - max_nbr_typos: u8, + max_levenshtein_distance: u8, is_prefix: bool, zero_typo: ZeroTypoTerm, // May not be computed yet @@ -342,10 +342,16 @@ impl QueryTermSubset { } None } - pub fn max_nbr_typos(&self, ctx: &SearchContext) -> u8 { + pub fn max_typo_cost(&self, ctx: &SearchContext) -> u8 { let t = ctx.term_interner.get(self.original); - match t.max_nbr_typos { - 0 => 0, + match t.max_levenshtein_distance { + 0 => { + if t.allows_split_words() { + 1 + } else { + 0 + } + } 1 => { if self.one_typo_subset.is_empty() { 0 @@ -438,6 +444,9 @@ impl QueryTerm { self.zero_typo.is_empty() && one_typo.is_empty() && two_typo.is_empty() } + fn allows_split_words(&self) -> bool { + self.zero_typo.phrase.is_none() + } } impl Interned { diff --git a/milli/src/search/new/query_term/parse_query.rs b/milli/src/search/new/query_term/parse_query.rs index dc317a0fb..bf90748e4 100644 --- a/milli/src/search/new/query_term/parse_query.rs +++ b/milli/src/search/new/query_term/parse_query.rs @@ -217,7 +217,7 @@ pub fn make_ngram( original: ngram_str_interned, ngram_words: Some(words_interned), is_prefix, - max_nbr_typos, + max_levenshtein_distance: max_nbr_typos, zero_typo: term.zero_typo, one_typo: Lazy::Uninit, two_typo: Lazy::Uninit, @@ -271,7 +271,7 @@ impl PhraseBuilder { QueryTerm { original: ctx.word_interner.insert(phrase_desc), ngram_words: None, - max_nbr_typos: 0, + max_levenshtein_distance: 0, is_prefix: false, zero_typo: ZeroTypoTerm { phrase: Some(phrase), diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index da5198c23..a44be6015 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -50,7 +50,7 @@ impl RankingRuleGraphTrait for TypoGraph { // 3-gram -> equivalent to 2 typos let base_cost = if term.term_ids.len() == 1 { 0 } else { term.term_ids.len() as u32 }; - for nbr_typos in 0..=term.term_subset.max_nbr_typos(ctx) { + for nbr_typos in 0..=term.term_subset.max_typo_cost(ctx) { let mut term = term.clone(); match nbr_typos { 0 => {