From b439d36807f663d29953a1fe7dfc6488524a136b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Tue, 4 Apr 2023 15:38:30 +0200
Subject: [PATCH] Split query_term module into multiple submodules

---
 milli/src/search/new/logger/detailed.rs       |   89 +-
 milli/src/search/new/logger/mod.rs            |    2 +
 milli/src/search/new/mod.rs                   |    3 +
 milli/src/search/new/query_graph.rs           |   26 +-
 milli/src/search/new/query_term.rs            | 1008 -----------------
 .../new/query_term/compute_derivations.rs     |  380 ++++++
 milli/src/search/new/query_term/mod.rs        |  331 ++++++
 .../src/search/new/query_term/ntypo_subset.rs |   80 ++
 .../src/search/new/query_term/parse_query.rs  |  281 +++++
 milli/src/search/new/query_term/phrase.rs     |   16 +
 .../new/ranking_rule_graph/proximity/mod.rs   |    4 +-
 .../search/new/ranking_rule_graph/typo/mod.rs |   24 +-
 12 files changed, 1122 insertions(+), 1122 deletions(-)
 delete mode 100644 milli/src/search/new/query_term.rs
 create mode 100644 milli/src/search/new/query_term/compute_derivations.rs
 create mode 100644 milli/src/search/new/query_term/mod.rs
 create mode 100644 milli/src/search/new/query_term/ntypo_subset.rs
 create mode 100644 milli/src/search/new/query_term/parse_query.rs
 create mode 100644 milli/src/search/new/query_term/phrase.rs

diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs
index 3a02950a8..86568d5d2 100644
--- a/milli/src/search/new/logger/detailed.rs
+++ b/milli/src/search/new/logger/detailed.rs
@@ -8,9 +8,7 @@ use roaring::RoaringBitmap;
 
 use crate::search::new::interner::{Interned, MappedInterner};
 use crate::search::new::query_graph::QueryNodeData;
-use crate::search::new::query_term::{
-    Lazy, LocatedQueryTermSubset, OneTypoTerm, QueryTerm, TwoTypoTerm, ZeroTypoTerm,
-};
+use crate::search::new::query_term::LocatedQueryTermSubset;
 use crate::search::new::ranking_rule_graph::{
     DeadEndsCache, Edge, ProximityCondition, ProximityGraph, RankingRuleGraph,
     RankingRuleGraphTrait, TypoCondition, TypoGraph,
@@ -439,87 +437,26 @@ results.{cur_ranking_rule}{cur_activated_id} {{
                 positions: _,
                 term_ids: _,
             }) => {
-                let QueryTerm {
-                    original,
-                    is_multiple_words: _,
-                    is_prefix: _,
-                    max_nbr_typos,
-                    zero_typo,
-                    one_typo,
-                    two_typo,
-                } = ctx.term_interner.get(term_subset.original);
-
-                let original = ctx.word_interner.get(*original);
                 writeln!(
                     file,
-                    "{node_idx} : \"{original}\" {{
+                    "{node_idx} : \"{}\" {{
 shape: class
-max_nbr_typo: {max_nbr_typos}"
+max_nbr_typo: {}",
+                    term_subset.description(ctx),
+                    term_subset.max_nbr_typos(ctx)
                 )
                 .unwrap();
 
-                let ZeroTypoTerm { phrase, zero_typo, prefix_of, synonyms, use_prefix_db } =
-                    zero_typo;
-
-                for w in zero_typo.iter().copied() {
-                    if term_subset.zero_typo_subset.contains_word(w) {
-                        let w = ctx.word_interner.get(w);
-                        writeln!(file, "\"{w}\" : 0").unwrap();
-                    }
+                for w in term_subset.all_single_words_except_prefix_db(ctx).unwrap() {
+                    let w = ctx.word_interner.get(w);
+                    writeln!(file, "{w}: word").unwrap();
                 }
-                for w in prefix_of.iter().copied() {
-                    if term_subset.zero_typo_subset.contains_word(w) {
-                        let w = ctx.word_interner.get(w);
-                        writeln!(file, "\"{w}\" : 0P").unwrap();
-                    }
+                for p in term_subset.all_phrases(ctx).unwrap() {
+                    writeln!(file, "{}: phrase", p.description(ctx)).unwrap();
                 }
-
-                if let Some(phrase) = phrase {
-                    if term_subset.zero_typo_subset.contains_phrase(*phrase) {
-                        let phrase = ctx.phrase_interner.get(*phrase);
-                        let phrase_str = phrase.description(&ctx.word_interner);
-                        writeln!(file, "\"{phrase_str}\" : phrase").unwrap();
-                    }
-                }
-
-                for synonym in synonyms.iter().copied() {
-                    if term_subset.zero_typo_subset.contains_phrase(synonym) {
-                        let phrase = ctx.phrase_interner.get(synonym);
-                        let phrase_str = phrase.description(&ctx.word_interner);
-                        writeln!(file, "\"{phrase_str}\" : synonym").unwrap();
-                    }
-                }
-                if let Some(use_prefix_db) = use_prefix_db {
-                    if term_subset.zero_typo_subset.contains_word(*use_prefix_db) {
-                        let p = ctx.word_interner.get(*use_prefix_db);
-                        writeln!(file, "use prefix DB : {p}").unwrap();
-                    }
-                }
-                if let Lazy::Init(one_typo) = one_typo {
-                    let OneTypoTerm { split_words, one_typo } = one_typo;
-
-                    for w in one_typo.iter().copied() {
-                        if term_subset.one_typo_subset.contains_word(w) {
-                            let w = ctx.word_interner.get(w);
-                            writeln!(file, "\"{w}\" : 1").unwrap();
-                        }
-                    }
-                    if let Some(split_words) = split_words {
-                        if term_subset.one_typo_subset.contains_phrase(*split_words) {
-                            let phrase = ctx.phrase_interner.get(*split_words);
-                            let phrase_str = phrase.description(&ctx.word_interner);
-                            writeln!(file, "\"{phrase_str}\" : split_words").unwrap();
-                        }
-                    }
-                }
-                if let Lazy::Init(two_typo) = two_typo {
-                    let TwoTypoTerm { two_typos } = two_typo;
-                    for w in two_typos.iter().copied() {
-                        if term_subset.two_typo_subset.contains_word(w) {
-                            let w = ctx.word_interner.get(w);
-                            writeln!(file, "\"{w}\" : 2").unwrap();
-                        }
-                    }
+                if let Some(w) = term_subset.use_prefix_db(ctx) {
+                    let w = ctx.word_interner.get(w);
+                    writeln!(file, "{w}: prefix db").unwrap();
                 }
 
                 writeln!(file, "}}").unwrap();
diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs
index 889e811ad..15cb78784 100644
--- a/milli/src/search/new/logger/mod.rs
+++ b/milli/src/search/new/logger/mod.rs
@@ -1,6 +1,8 @@
 // #[cfg(test)]
 pub mod detailed;
 
+pub mod test_logger;
+
 use roaring::RoaringBitmap;
 
 use super::interner::{Interned, MappedInterner};
diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs
index 4d561d25b..4456d693d 100644
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@@ -17,6 +17,9 @@ mod sort;
 // TODO: documentation + comments
 mod words;
 
+#[cfg(test)]
+mod tests;
+
 use std::collections::HashSet;
 
 use charabia::TokenizerBuilder;
diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs
index 1eede33c2..33e178494 100644
--- a/milli/src/search/new/query_graph.rs
+++ b/milli/src/search/new/query_graph.rs
@@ -1,7 +1,6 @@
 use super::interner::{FixedSizeInterner, Interned};
 use super::query_term::{
-    self, number_of_typos_allowed, LocatedQueryTerm, LocatedQueryTermSubset, NTypoTermSubset,
-    QueryTermSubset,
+    self, number_of_typos_allowed, LocatedQueryTerm, LocatedQueryTermSubset, QueryTermSubset,
 };
 use super::small_bitmap::SmallBitmap;
 use super::SearchContext;
@@ -107,12 +106,7 @@ impl QueryGraph {
             let new_node_idx = add_node(
                 &mut nodes_data,
                 QueryNodeData::Term(LocatedQueryTermSubset {
-                    term_subset: QueryTermSubset {
-                        original: Interned::from_raw(term_idx as u16),
-                        zero_typo_subset: NTypoTermSubset::All,
-                        one_typo_subset: NTypoTermSubset::All,
-                        two_typo_subset: NTypoTermSubset::All,
-                    },
+                    term_subset: QueryTermSubset::full(Interned::from_raw(term_idx as u16)),
                     positions: terms[term_idx].positions.clone(),
                     term_ids: term_idx as u8..=term_idx as u8,
                 }),
@@ -126,12 +120,7 @@ impl QueryGraph {
                 let ngram_idx = add_node(
                     &mut nodes_data,
                     QueryNodeData::Term(LocatedQueryTermSubset {
-                        term_subset: QueryTermSubset {
-                            original: ngram.value,
-                            zero_typo_subset: NTypoTermSubset::All,
-                            one_typo_subset: NTypoTermSubset::All,
-                            two_typo_subset: NTypoTermSubset::All,
-                        },
+                        term_subset: QueryTermSubset::full(ngram.value),
                         positions: ngram.positions,
                         term_ids: term_idx as u8 - 1..=term_idx as u8,
                     }),
@@ -146,12 +135,7 @@ impl QueryGraph {
                 let ngram_idx = add_node(
                     &mut nodes_data,
                     QueryNodeData::Term(LocatedQueryTermSubset {
-                        term_subset: QueryTermSubset {
-                            original: ngram.value,
-                            zero_typo_subset: NTypoTermSubset::All,
-                            one_typo_subset: NTypoTermSubset::All,
-                            two_typo_subset: NTypoTermSubset::All,
-                        },
+                        term_subset: QueryTermSubset::full(ngram.value),
                         positions: ngram.positions,
                         term_ids: term_idx as u8 - 2..=term_idx as u8,
                     }),
@@ -329,7 +313,7 @@ impl QueryGraph {
         let mut at_least_one_phrase = false;
         for (node_id, node) in self.nodes.iter() {
             let QueryNodeData::Term(t) = &node.data else { continue };
-            if ctx.term_interner.get(t.term_subset.original).zero_typo.phrase.is_some() {
+            if t.term_subset.original_phrase(ctx).is_some() {
                 at_least_one_phrase = true;
                 continue;
             }
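Note for reviewers: the three hunks above replace the same open-coded struct literal with the new `QueryTermSubset::full` constructor defined later in this patch. A minimal sketch of why the constructor is nicer, using stand-in types (toy `u16`/`u32` ids, not code from this patch):

    use std::collections::BTreeSet;

    #[derive(Clone, Debug, PartialEq)]
    enum NTypoTermSubset {
        All,
        Subset { words: BTreeSet<u32> },
        Nothing,
    }

    #[derive(Clone, Debug)]
    struct QueryTermSubset {
        original: u16,
        zero_typo_subset: NTypoTermSubset,
        one_typo_subset: NTypoTermSubset,
        two_typo_subset: NTypoTermSubset,
    }

    impl QueryTermSubset {
        // One constructor instead of three copies of the same literal.
        fn full(original: u16) -> Self {
            Self {
                original,
                zero_typo_subset: NTypoTermSubset::All,
                one_typo_subset: NTypoTermSubset::All,
                two_typo_subset: NTypoTermSubset::All,
            }
        }
    }

    fn main() {
        let node = QueryTermSubset::full(3);
        assert_eq!(node.zero_typo_subset, NTypoTermSubset::All);
        println!("{node:?}");
    }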
diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs
deleted file mode 100644
index 15e106e06..000000000
--- a/milli/src/search/new/query_term.rs
+++ /dev/null
@@ -1,1008 +0,0 @@
-use std::borrow::Cow;
-use std::collections::BTreeSet;
-use std::ops::{ControlFlow, RangeInclusive};
-
-use charabia::normalizer::NormalizedTokenIter;
-use charabia::{SeparatorKind, TokenKind};
-use fst::automaton::Str;
-use fst::{Automaton, IntoStreamer, Streamer};
-use heed::types::DecodeIgnore;
-use heed::BytesDecode;
-use itertools::Itertools;
-
-use super::interner::{DedupInterner, Interned};
-use super::{limits, SearchContext};
-use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union};
-use crate::search::{build_dfa, get_first};
-use crate::{CboRoaringBitmapLenCodec, Result, MAX_WORD_LENGTH};
-
-/// A phrase in the user's search query, consisting of several words
-/// that must appear side-by-side in the search results.
-#[derive(Default, Clone, PartialEq, Eq, Hash)]
-pub struct Phrase {
-    pub words: Vec<Option<Interned<String>>>,
-}
-impl Phrase {
-    pub fn description(&self, interner: &DedupInterner<String>) -> String {
-        self.words.iter().flatten().map(|w| interner.get(*w)).join(" ")
-    }
-}
-
-#[derive(Clone, PartialEq, Eq, Hash)]
-pub enum Lazy<T> {
-    Uninit,
-    Init(T),
-}
-impl<T> Lazy<T> {
-    pub fn is_init(&self) -> bool {
-        match self {
-            Lazy::Uninit => false,
-            Lazy::Init(_) => true,
-        }
-    }
-    pub fn is_uninit(&self) -> bool {
-        match self {
-            Lazy::Uninit => true,
-            Lazy::Init(_) => false,
-        }
-    }
-}
-
-#[derive(Debug, Clone, PartialEq, Eq, Hash)]
-pub enum NTypoTermSubset {
-    All,
-    Subset {
-        words: BTreeSet<Interned<String>>,
-        phrases: BTreeSet<Interned<Phrase>>,
-        // TODO: prefixes: BTreeSet<Interned<String>>,
-    },
-    Nothing,
-}
-
-impl NTypoTermSubset {
-    pub fn contains_word(&self, word: Interned<String>) -> bool {
-        match self {
-            NTypoTermSubset::All => true,
-            NTypoTermSubset::Subset { words, phrases: _ } => words.contains(&word),
-            NTypoTermSubset::Nothing => false,
-        }
-    }
-    pub fn contains_phrase(&self, phrase: Interned<Phrase>) -> bool {
-        match self {
-            NTypoTermSubset::All => true,
-            NTypoTermSubset::Subset { words: _, phrases } => phrases.contains(&phrase),
-            NTypoTermSubset::Nothing => false,
-        }
-    }
-    pub fn is_empty(&self) -> bool {
-        match self {
-            NTypoTermSubset::All => false,
-            NTypoTermSubset::Subset { words, phrases } => words.is_empty() && phrases.is_empty(),
-            NTypoTermSubset::Nothing => true,
-        }
-    }
-    pub fn union(&mut self, other: &Self) {
-        match self {
-            Self::All => {}
-            Self::Subset { words, phrases } => match other {
-                Self::All => {
-                    *self = Self::All;
-                }
-                Self::Subset { words: w2, phrases: p2 } => {
-                    words.extend(w2);
-                    phrases.extend(p2);
-                }
-                Self::Nothing => {}
-            },
-            Self::Nothing => {
-                *self = other.clone();
-            }
-        }
-    }
-    pub fn intersect(&mut self, other: &Self) {
-        match self {
-            Self::All => *self = other.clone(),
-            Self::Subset { words, phrases } => match other {
-                Self::All => {}
-                Self::Subset { words: w2, phrases: p2 } => {
-                    let mut ws = BTreeSet::new();
-                    for w in words.intersection(w2) {
-                        ws.insert(*w);
-                    }
-                    let mut ps = BTreeSet::new();
-                    for p in phrases.intersection(p2) {
-                        ps.insert(*p);
-                    }
-                    *words = ws;
-                    *phrases = ps;
-                }
-                Self::Nothing => *self = Self::Nothing,
-            },
-            Self::Nothing => {}
-        }
-    }
-}
-
-#[derive(Debug, Clone, PartialEq, Eq, Hash)]
-pub struct QueryTermSubset {
-    pub original: Interned<QueryTerm>,
-    pub zero_typo_subset: NTypoTermSubset,
-    pub one_typo_subset: NTypoTermSubset,
-    pub two_typo_subset: NTypoTermSubset,
-}
-
-#[derive(Clone, PartialEq, Eq, Hash)]
-pub struct LocatedQueryTermSubset {
-    pub term_subset: QueryTermSubset,
-    pub positions: RangeInclusive<u16>,
-    pub term_ids: RangeInclusive<u8>,
-}
-
-impl QueryTermSubset {
-    pub fn empty(for_term: Interned<QueryTerm>) -> Self {
-        Self {
-            original: for_term,
-            zero_typo_subset: NTypoTermSubset::Nothing,
-            one_typo_subset: NTypoTermSubset::Nothing,
-            two_typo_subset: NTypoTermSubset::Nothing,
-        }
-    }
-    pub fn full(for_term: Interned<QueryTerm>) -> Self {
-        Self {
-            original: for_term,
-            zero_typo_subset: NTypoTermSubset::All,
-            one_typo_subset: NTypoTermSubset::All,
-            two_typo_subset: NTypoTermSubset::All,
-        }
-    }
-
-    pub fn union(&mut self, other: &Self) {
-        assert!(self.original == other.original);
-        self.zero_typo_subset.union(&other.zero_typo_subset);
-        self.one_typo_subset.union(&other.one_typo_subset);
-        self.two_typo_subset.union(&other.two_typo_subset);
-    }
-    pub fn intersect(&mut self, other: &Self) {
-        assert!(self.original == other.original);
-        self.zero_typo_subset.intersect(&other.zero_typo_subset);
-        self.one_typo_subset.intersect(&other.one_typo_subset);
-        self.two_typo_subset.intersect(&other.two_typo_subset);
-    }
-
-    pub fn use_prefix_db(&self, ctx: &SearchContext) -> Option<Interned<String>> {
-        let original = ctx.term_interner.get(self.original);
-        let Some(use_prefix_db) = original.zero_typo.use_prefix_db else {
-            return None
-        };
-        match &self.zero_typo_subset {
-            NTypoTermSubset::All => Some(use_prefix_db),
-            NTypoTermSubset::Subset { words, phrases: _ } => {
-                // TODO: use a subset of prefix words instead
-                if words.contains(&use_prefix_db) {
-                    Some(use_prefix_db)
-                } else {
-                    None
-                }
-            }
-            NTypoTermSubset::Nothing => None,
-        }
-    }
-    pub fn all_single_words_except_prefix_db(
-        &self,
-        ctx: &mut SearchContext,
-    ) -> Result<BTreeSet<Interned<String>>> {
-        let mut result = BTreeSet::default();
-        // TODO: a compute_partially funtion
-        if !self.one_typo_subset.is_empty() || !self.two_typo_subset.is_empty() {
-            self.original.compute_fully_if_needed(ctx)?;
-        }
-
-        let original = ctx.term_interner.get_mut(self.original);
-        if !self.zero_typo_subset.is_empty() {
-            let ZeroTypoTerm { phrase: _, zero_typo, prefix_of, synonyms: _, use_prefix_db: _ } =
-                &original.zero_typo;
-            result.extend(zero_typo.iter().copied());
-            result.extend(prefix_of.iter().copied());
-        };
-
-        match &self.one_typo_subset {
-            NTypoTermSubset::All => {
-                let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else {
-                    panic!()
-                };
-                result.extend(one_typo.iter().copied())
-            }
-            NTypoTermSubset::Subset { words, phrases: _ } => {
-                let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else {
-                    panic!()
-                };
-                result.extend(one_typo.intersection(words));
-            }
-            NTypoTermSubset::Nothing => {}
-        };
-
-        match &self.two_typo_subset {
-            NTypoTermSubset::All => {
-                let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else {
-                    panic!()
-                };
-                result.extend(two_typos.iter().copied());
-            }
-            NTypoTermSubset::Subset { words, phrases: _ } => {
-                let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else {
-                    panic!()
-                };
-                result.extend(two_typos.intersection(words));
-            }
-            NTypoTermSubset::Nothing => {}
-        };
-
-        Ok(result)
-    }
-    pub fn all_phrases(&self, ctx: &mut SearchContext) -> Result<BTreeSet<Interned<Phrase>>> {
-        let mut result = BTreeSet::default();
-
-        if !self.one_typo_subset.is_empty() {
-            // TODO: compute less than fully if possible
-            self.original.compute_fully_if_needed(ctx)?;
-        }
-        let original = ctx.term_interner.get_mut(self.original);
-
-        let ZeroTypoTerm { phrase, zero_typo: _, prefix_of: _, synonyms, use_prefix_db: _ } =
-            &original.zero_typo;
-        result.extend(phrase.iter().copied());
-        result.extend(synonyms.iter().copied());
-
-        if !self.one_typo_subset.is_empty() {
-            let Lazy::Init(OneTypoTerm { split_words, one_typo: _ }) = &original.one_typo else {
-                panic!();
-            };
-            result.extend(split_words.iter().copied());
-        }
-
-        Ok(result)
-    }
-}
-
-impl Interned<QueryTerm> {
-    pub fn compute_fully_if_needed(self, ctx: &mut SearchContext) -> Result<()> {
-        let s = ctx.term_interner.get_mut(self);
-        if s.max_nbr_typos == 0 {
-            s.one_typo = Lazy::Init(OneTypoTerm::default());
-            s.two_typo = Lazy::Init(TwoTypoTerm::default());
-        } else if s.max_nbr_typos == 1 && s.one_typo.is_uninit() {
-            assert!(s.two_typo.is_uninit());
-            self.initialize_one_typo_subterm(ctx)?;
-            let s = ctx.term_interner.get_mut(self);
-            assert!(s.one_typo.is_init());
-            s.two_typo = Lazy::Init(TwoTypoTerm::default());
-        } else if s.max_nbr_typos > 1 && s.two_typo.is_uninit() {
-            assert!(s.two_typo.is_uninit());
-            self.initialize_one_and_two_typo_subterm(ctx)?;
-            let s = ctx.term_interner.get_mut(self);
-            assert!(s.one_typo.is_init() && s.two_typo.is_init());
-        }
-        Ok(())
-    }
-}
-
-#[derive(Clone, PartialEq, Eq, Hash)]
-pub struct QueryTerm {
-    pub original: Interned<String>,
-    pub ngram_words: Option<Vec<Interned<String>>>,
-    pub max_nbr_typos: u8,
-    pub is_prefix: bool,
-    pub zero_typo: ZeroTypoTerm,
-    // May not be computed yet
-    pub one_typo: Lazy<OneTypoTerm>,
-    // May not be computed yet
-    pub two_typo: Lazy<TwoTypoTerm>,
-}
-
-// SubTerms will be in a dedup interner
-#[derive(Default, Clone, PartialEq, Eq, Hash)]
-pub struct ZeroTypoTerm {
-    /// The original phrase, if any
-    pub phrase: Option<Interned<Phrase>>,
-    /// A single word equivalent to the original term, with zero typos
-    pub zero_typo: Option<Interned<String>>,
-    /// All the words that contain the original word as prefix
-    pub prefix_of: BTreeSet<Interned<String>>,
-    /// All the synonyms of the original word or phrase
-    pub synonyms: BTreeSet<Interned<Phrase>>,
-    /// A prefix in the prefix databases matching the original word
-    pub use_prefix_db: Option<Interned<String>>,
-}
-#[derive(Default, Clone, PartialEq, Eq, Hash)]
-pub struct OneTypoTerm {
-    /// The original word split into multiple consecutive words
-    pub split_words: Option<Interned<Phrase>>,
-    /// Words that are 1 typo away from the original word
-    pub one_typo: BTreeSet<Interned<String>>,
-}
-#[derive(Default, Clone, PartialEq, Eq, Hash)]
-pub struct TwoTypoTerm {
-    /// Words that are 2 typos away from the original word
-    pub two_typos: BTreeSet<Interned<String>>,
-}
-
-impl ZeroTypoTerm {
-    fn is_empty(&self) -> bool {
-        let ZeroTypoTerm { phrase, zero_typo, prefix_of, synonyms, use_prefix_db } = self;
-        phrase.is_none()
-            && zero_typo.is_none()
-            && prefix_of.is_empty()
-            && synonyms.is_empty()
-            && use_prefix_db.is_none()
-    }
-}
-impl OneTypoTerm {
-    fn is_empty(&self) -> bool {
-        let OneTypoTerm { split_words, one_typo } = self;
-        one_typo.is_empty() && split_words.is_none()
-    }
-}
-impl TwoTypoTerm {
-    fn is_empty(&self) -> bool {
-        let TwoTypoTerm { two_typos } = self;
-        two_typos.is_empty()
-    }
-}
-
-impl QueryTerm {
-    pub fn is_empty(&self) -> bool {
-        let Lazy::Init(one_typo) = &self.one_typo else {
-            return false;
-        };
-        let Lazy::Init(two_typo) = &self.two_typo else {
-            return false;
-        };
-
-        self.zero_typo.is_empty() && one_typo.is_empty() && two_typo.is_empty()
-    }
-}
-
-pub enum ZeroOrOneTypo {
-    Zero,
-    One,
-}
-
-fn find_zero_typo_prefix_derivations(
-    word_interned: Interned<String>,
-    fst: fst::Set<Cow<'_, [u8]>>,
-    word_interner: &mut DedupInterner<String>,
-    mut visit: impl FnMut(Interned<String>) -> Result<ControlFlow<()>>,
-) -> Result<()> {
-    let word = word_interner.get(word_interned).to_owned();
-    let word = word.as_str();
-    let prefix = Str::new(word).starts_with();
-    let mut stream = fst.search(prefix).into_stream();
-
-    while let Some(derived_word) = stream.next() {
-        let derived_word = std::str::from_utf8(derived_word)?.to_owned();
-        let derived_word_interned = word_interner.insert(derived_word);
-        if derived_word_interned != word_interned {
-            let cf = visit(derived_word_interned)?;
-            if cf.is_break() {
-                break;
-            }
-        }
-    }
-    Ok(())
-}
-
-fn find_zero_one_typo_derivations(
-    ctx: &mut SearchContext,
-    word_interned: Interned<String>,
-    is_prefix: bool,
-    mut visit: impl FnMut(Interned<String>, ZeroOrOneTypo) -> Result<ControlFlow<()>>,
-) -> Result<()> {
-    let fst = ctx.get_words_fst()?;
-    let word = ctx.word_interner.get(word_interned).to_owned();
-    let word = word.as_str();
-
-    let dfa = build_dfa(word, 1, is_prefix);
-    let starts = StartsWith(Str::new(get_first(word)));
-    let mut stream = fst.search_with_state(Intersection(starts, &dfa)).into_stream();
-
-    while let Some((derived_word, state)) = stream.next() {
-        let derived_word = std::str::from_utf8(derived_word)?;
-        let derived_word = ctx.word_interner.insert(derived_word.to_owned());
-        let d = dfa.distance(state.1);
-        match d.to_u8() {
-            0 => {
-                if derived_word != word_interned {
-                    let cf = visit(derived_word, ZeroOrOneTypo::Zero)?;
-                    if cf.is_break() {
-                        break;
-                    }
-                }
-            }
-            1 => {
-                let cf = visit(derived_word, ZeroOrOneTypo::One)?;
-                if cf.is_break() {
-                    break;
-                }
-            }
-            _ => {
-                unreachable!("One typo dfa produced multiple typos")
-            }
-        }
-    }
-    Ok(())
-}
-
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
-pub enum NumberOfTypos {
-    Zero,
-    One,
-    Two,
-}
-fn find_zero_one_two_typo_derivations(
-    word_interned: Interned<String>,
-    is_prefix: bool,
-    fst: fst::Set<Cow<'_, [u8]>>,
-    word_interner: &mut DedupInterner<String>,
-    mut visit: impl FnMut(Interned<String>, NumberOfTypos) -> Result<ControlFlow<()>>,
-) -> Result<()> {
-    let word = word_interner.get(word_interned).to_owned();
-    let word = word.as_str();
-
-    let starts = StartsWith(Str::new(get_first(word)));
-    let first = Intersection(build_dfa(word, 1, is_prefix), Complement(&starts));
-    let second_dfa = build_dfa(word, 2, is_prefix);
-    let second = Intersection(&second_dfa, &starts);
-    let automaton = Union(first, &second);
-
-    let mut stream = fst.search_with_state(automaton).into_stream();
-
-    while let Some((derived_word, state)) = stream.next() {
-        let derived_word = std::str::from_utf8(derived_word)?;
-        let derived_word_interned = word_interner.insert(derived_word.to_owned());
-        // in the case the typo is on the first letter, we know the number of typo
-        // is two
-        if get_first(derived_word) != get_first(word) {
-            let cf = visit(derived_word_interned, NumberOfTypos::Two)?;
-            if cf.is_break() {
-                break;
-            }
-        } else {
-            // Else, we know that it is the second dfa that matched and compute the
-            // correct distance
-            let d = second_dfa.distance((state.1).0);
-            match d.to_u8() {
-                0 => {
-                    if derived_word_interned != word_interned {
-                        let cf = visit(derived_word_interned, NumberOfTypos::Zero)?;
-                        if cf.is_break() {
-                            break;
-                        }
-                    }
-                }
-                1 => {
-                    let cf = visit(derived_word_interned, NumberOfTypos::One)?;
-                    if cf.is_break() {
-                        break;
-                    }
-                }
-                2 => {
-                    let cf = visit(derived_word_interned, NumberOfTypos::Two)?;
-                    if cf.is_break() {
-                        break;
-                    }
-                }
-                _ => unreachable!("2 typos DFA produced a distance greater than 2"),
-            }
-        }
-    }
-    Ok(())
-}
-
-fn partially_initialized_term_from_word(
-    ctx: &mut SearchContext,
-    word: &str,
-    max_typo: u8,
-    is_prefix: bool,
-) -> Result<QueryTerm> {
-    let word_interned = ctx.word_interner.insert(word.to_owned());
-
-    if word.len() > MAX_WORD_LENGTH {
-        return Ok({
-            QueryTerm {
-                original: ctx.word_interner.insert(word.to_owned()),
-                ngram_words: None,
-                is_prefix: false,
-                max_nbr_typos: 0,
-                zero_typo: <_>::default(),
-                one_typo: Lazy::Init(<_>::default()),
-                two_typo: Lazy::Init(<_>::default()),
-            }
-        });
-    }
-
-    let fst = ctx.index.words_fst(ctx.txn)?;
-
-    let use_prefix_db = is_prefix
-        && ctx
-            .index
-            .word_prefix_docids
-            .remap_data_type::<DecodeIgnore>()
-            .get(ctx.txn, word)?
-            .is_some();
-    let use_prefix_db = if use_prefix_db { Some(word_interned) } else { None };
-
-    let mut zero_typo = None;
-    let mut prefix_of = BTreeSet::new();
-
-    if fst.contains(word) {
-        zero_typo = Some(word_interned);
-    }
-
-    if is_prefix && use_prefix_db.is_none() {
-        find_zero_typo_prefix_derivations(
-            word_interned,
-            fst,
-            &mut ctx.word_interner,
-            |derived_word| {
-                if prefix_of.len() < limits::MAX_PREFIX_COUNT {
-                    prefix_of.insert(derived_word);
-                    Ok(ControlFlow::Continue(()))
-                } else {
-                    Ok(ControlFlow::Break(()))
-                }
-            },
-        )?;
-    }
-    let synonyms = ctx.index.synonyms(ctx.txn)?;
-    let mut synonym_word_count = 0;
-    let synonyms = synonyms
-        .get(&vec![word.to_owned()])
-        .cloned()
-        .unwrap_or_default()
-        .into_iter()
-        .take(limits::MAX_SYNONYM_PHRASE_COUNT)
-        .filter_map(|words| {
-            if synonym_word_count + words.len() > limits::MAX_SYNONYM_WORD_COUNT {
-                return None;
-            }
-            synonym_word_count += words.len();
-            let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect();
-            Some(ctx.phrase_interner.insert(Phrase { words }))
-        })
-        .collect();
-    let zero_typo = ZeroTypoTerm { phrase: None, zero_typo, prefix_of, synonyms, use_prefix_db };
-
-    Ok(QueryTerm {
-        original: word_interned,
-        ngram_words: None,
-        max_nbr_typos: max_typo,
-        is_prefix,
-        zero_typo,
-        one_typo: Lazy::Uninit,
-        two_typo: Lazy::Uninit,
-    })
-}
-
-fn find_split_words(ctx: &mut SearchContext, word: &str) -> Result<Option<Interned<Phrase>>> {
-    if let Some((l, r)) = split_best_frequency(ctx, word)? {
-        Ok(Some(ctx.phrase_interner.insert(Phrase { words: vec![Some(l), Some(r)] })))
-    } else {
-        Ok(None)
-    }
-}
-
-impl Interned<QueryTerm> {
-    fn initialize_one_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> {
-        let self_mut = ctx.term_interner.get_mut(self);
-        let QueryTerm { original, is_prefix, one_typo, .. } = self_mut;
-        let original = *original;
-        let is_prefix = *is_prefix;
-        // let original_str = ctx.word_interner.get(*original).to_owned();
-        if one_typo.is_init() {
-            return Ok(());
-        }
-        let mut one_typo_words = BTreeSet::new();
-
-        find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| {
-            match nbr_typos {
-                ZeroOrOneTypo::Zero => {}
-                ZeroOrOneTypo::One => {
-                    if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
-                        one_typo_words.insert(derived_word);
-                    } else {
-                        return Ok(ControlFlow::Break(()));
-                    }
-                }
-            }
-            Ok(ControlFlow::Continue(()))
-        })?;
-        let original_str = ctx.word_interner.get(original).to_owned();
-        let split_words = find_split_words(ctx, original_str.as_str())?;
-        let one_typo = OneTypoTerm { split_words, one_typo: one_typo_words };
-
-        let self_mut = ctx.term_interner.get_mut(self);
-        self_mut.one_typo = Lazy::Init(one_typo);
-
-        Ok(())
-    }
-    fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> {
-        let self_mut = ctx.term_interner.get_mut(self);
-        let QueryTerm { original, is_prefix, two_typo, .. } = self_mut;
-        let original_str = ctx.word_interner.get(*original).to_owned();
-        if two_typo.is_init() {
-            return Ok(());
-        }
-        let mut one_typo_words = BTreeSet::new();
-        let mut two_typo_words = BTreeSet::new();
-
-        find_zero_one_two_typo_derivations(
-            *original,
-            *is_prefix,
-            ctx.index.words_fst(ctx.txn)?,
-            &mut ctx.word_interner,
-            |derived_word, nbr_typos| {
-                if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT
-                    && two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT
-                {
-                    // No chance we will add either one- or two-typo derivations anymore, stop iterating.
-                    return Ok(ControlFlow::Break(()));
-                }
-                match nbr_typos {
-                    NumberOfTypos::Zero => {}
-                    NumberOfTypos::One => {
-                        if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
-                            one_typo_words.insert(derived_word);
-                        }
-                    }
-                    NumberOfTypos::Two => {
-                        if two_typo_words.len() < limits::MAX_TWO_TYPOS_COUNT {
-                            two_typo_words.insert(derived_word);
-                        }
-                    }
-                }
-                Ok(ControlFlow::Continue(()))
-            },
-        )?;
-        let split_words = find_split_words(ctx, original_str.as_str())?;
-        let self_mut = ctx.term_interner.get_mut(self);
-
-        let one_typo = OneTypoTerm { one_typo: one_typo_words, split_words };
-
-        let two_typo = TwoTypoTerm { two_typos: two_typo_words };
-
-        self_mut.one_typo = Lazy::Init(one_typo);
-        self_mut.two_typo = Lazy::Init(two_typo);
-
-        Ok(())
-    }
-}
-
-/// Split the original word into the two words that appear the
-/// most next to each other in the index.
-///
-/// Return `None` if the original word cannot be split.
-fn split_best_frequency(
-    ctx: &mut SearchContext,
-    original: &str,
-) -> Result<Option<(Interned<String>, Interned<String>)>> {
-    let chars = original.char_indices().skip(1);
-    let mut best = None;
-
-    for (i, _) in chars {
-        let (left, right) = original.split_at(i);
-        let left = ctx.word_interner.insert(left.to_owned());
-        let right = ctx.word_interner.insert(right.to_owned());
-
-        if let Some(docid_bytes) = ctx.get_db_word_pair_proximity_docids(left, right, 1)? {
-            let frequency =
-                CboRoaringBitmapLenCodec::bytes_decode(docid_bytes).ok_or(heed::Error::Decoding)?;
-            if best.map_or(true, |(old, _, _)| frequency > old) {
-                best = Some((frequency, left, right));
-            }
-        }
-    }
-
-    Ok(best.map(|(_, left, right)| (left, right)))
-}
-
-impl Interned<QueryTerm> {
-    /// Return the original word from the given query term
-    pub fn original_single_word(self, ctx: &SearchContext) -> Option<Interned<String>> {
-        let self_ = ctx.term_interner.get(self);
-        if self_.ngram_words.is_some() {
-            None
-        } else {
-            Some(self_.original)
-        }
-    }
-}
-
-/// A query term term coupled with its position in the user's search query.
-#[derive(Clone)]
-pub struct LocatedQueryTerm {
-    pub value: Interned<QueryTerm>,
-    pub positions: RangeInclusive<u16>,
-}
-
-impl LocatedQueryTerm {
-    /// Return `true` iff the term is empty
-    pub fn is_empty(&self, interner: &DedupInterner<QueryTerm>) -> bool {
-        interner.get(self.value).is_empty()
-    }
-}
-
-struct PhraseBuilder {
-    words: Vec<Option<Interned<String>>>,
-    start: u16,
-    end: u16,
-}
-
-impl PhraseBuilder {
-    fn empty() -> Self {
-        Self { words: Default::default(), start: u16::MAX, end: u16::MAX }
-    }
-
-    fn is_empty(&self) -> bool {
-        self.words.is_empty()
-    }
-
-    // precondition: token has kind Word or StopWord
-    fn push_word(&mut self, ctx: &mut SearchContext, token: &charabia::Token, position: u16) {
-        if self.is_empty() {
-            self.start = position;
-        }
-        self.end = position;
-        if let TokenKind::StopWord = token.kind {
-            self.words.push(None);
-        } else {
-            // token has kind Word
-            let word = ctx.word_interner.insert(token.lemma().to_string());
-            // TODO: in a phrase, check that every word exists
-            // otherwise return an empty term
-            self.words.push(Some(word));
-        }
-    }
-
-    fn build(self, ctx: &mut SearchContext) -> Option<LocatedQueryTerm> {
-        if self.is_empty() {
-            return None;
-        }
-        Some(LocatedQueryTerm {
-            value: ctx.term_interner.push({
-                let phrase = Phrase { words: self.words };
-                let phrase_desc = phrase.description(&ctx.word_interner);
-                QueryTerm {
-                    original: ctx.word_interner.insert(phrase_desc),
-                    ngram_words: None,
-                    max_nbr_typos: 0,
-                    is_prefix: false,
-                    zero_typo: ZeroTypoTerm {
-                        phrase: Some(ctx.phrase_interner.insert(phrase)),
-                        zero_typo: None,
-                        prefix_of: BTreeSet::default(),
-                        synonyms: BTreeSet::default(),
-                        use_prefix_db: None,
-                    },
-                    one_typo: Lazy::Uninit,
-                    two_typo: Lazy::Uninit,
-                }
-            }),
-            positions: self.start..=self.end,
-        })
-    }
-}
-
-/// Convert the tokenised search query into a list of located query terms.
-// TODO: checking if the positions are correct for phrases, separators, ngrams
-pub fn located_query_terms_from_string(
-    ctx: &mut SearchContext,
-    query: NormalizedTokenIter<&[u8]>,
-    words_limit: Option<usize>,
-) -> Result<Vec<LocatedQueryTerm>> {
-    let nbr_typos = number_of_typos_allowed(ctx)?;
-
-    let mut located_terms = Vec::new();
-
-    let mut phrase: Option<PhraseBuilder> = None;
-
-    let parts_limit = words_limit.unwrap_or(usize::MAX);
-
-    // start with the last position as we will wrap around to position 0 at the beginning of the loop below.
-    let mut position = u16::MAX;
-
-    let mut peekable = query.take(super::limits::MAX_TOKEN_COUNT).peekable();
-    while let Some(token) = peekable.next() {
-        // early return if word limit is exceeded
-        if located_terms.len() >= parts_limit {
-            return Ok(located_terms);
-        }
-
-        match token.kind {
-            TokenKind::Word | TokenKind::StopWord => {
-                // On first loop, goes from u16::MAX to 0, then normal increment.
-                position = position.wrapping_add(1);
-
-                // 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote,
-                // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word,
-                // 3. if the word is the last token of the query we push it as a prefix word.
-                if let Some(phrase) = &mut phrase {
-                    phrase.push_word(ctx, &token, position)
-                } else if peekable.peek().is_some() {
-                    match token.kind {
-                        TokenKind::Word => {
-                            let word = token.lemma();
-                            let term = partially_initialized_term_from_word(
-                                ctx,
-                                word,
-                                nbr_typos(word),
-                                false,
-                            )?;
-                            let located_term = LocatedQueryTerm {
-                                value: ctx.term_interner.push(term),
-                                positions: position..=position,
-                            };
-                            located_terms.push(located_term);
-                        }
-                        TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => {}
-                    }
-                } else {
-                    let word = token.lemma();
-                    let term =
-                        partially_initialized_term_from_word(ctx, word, nbr_typos(word), true)?;
-                    let located_term = LocatedQueryTerm {
-                        value: ctx.term_interner.push(term),
-                        positions: position..=position,
-                    };
-                    located_terms.push(located_term);
-                }
-            }
-            TokenKind::Separator(separator_kind) => {
-                match separator_kind {
-                    SeparatorKind::Hard => {
-                        position += 1;
-                    }
-                    SeparatorKind::Soft => {
-                        position += 0;
-                    }
-                }
-
-                phrase = 'phrase: {
-                    let phrase = phrase.take();
-
-                    // If we have a hard separator inside a phrase, we immediately start a new phrase
-                    let phrase = if separator_kind == SeparatorKind::Hard {
-                        if let Some(phrase) = phrase {
-                            if let Some(located_query_term) = phrase.build(ctx) {
-                                located_terms.push(located_query_term)
-                            }
-                            Some(PhraseBuilder::empty())
-                        } else {
-                            None
-                        }
-                    } else {
-                        phrase
-                    };
-
-                    // We close and start a new phrase depending on the number of double quotes
-                    let mut quote_count = token.lemma().chars().filter(|&s| s == '"').count();
-                    if quote_count == 0 {
-                        break 'phrase phrase;
-                    }
-
-                    // Consume the closing quote and the phrase
-                    if let Some(phrase) = phrase {
-                        // Per the check above, quote_count > 0
-                        quote_count -= 1;
-                        if let Some(located_query_term) = phrase.build(ctx) {
-                            located_terms.push(located_query_term)
-                        }
-                    }
-
-                    // Start new phrase if the token ends with an opening quote
-                    (quote_count % 2 == 1).then_some(PhraseBuilder::empty())
-                };
-            }
-            _ => (),
-        }
-    }
-
-    // If a quote is never closed, we consider all of the end of the query as a phrase.
-    if let Some(phrase) = phrase.take() {
-        if let Some(located_query_term) = phrase.build(ctx) {
-            located_terms.push(located_query_term);
-        }
-    }
-
-    Ok(located_terms)
-}
-
-pub fn number_of_typos_allowed<'ctx>(
-    ctx: &SearchContext<'ctx>,
-) -> Result<impl Fn(&str) -> u8 + 'ctx> {
-    let authorize_typos = ctx.index.authorize_typos(ctx.txn)?;
-    let min_len_one_typo = ctx.index.min_word_len_one_typo(ctx.txn)?;
-    let min_len_two_typos = ctx.index.min_word_len_two_typos(ctx.txn)?;
-
-    // TODO: should `exact_words` also disable prefix search, ngrams, split words, or synonyms?
-    let exact_words = ctx.index.exact_words(ctx.txn)?;
-
-    Ok(Box::new(move |word: &str| {
-        if !authorize_typos
-            || word.len() < min_len_one_typo as usize
-            || exact_words.as_ref().map_or(false, |fst| fst.contains(word))
-        {
-            0
-        } else if word.len() < min_len_two_typos as usize {
-            1
-        } else {
-            2
-        }
-    }))
-}
-
-pub fn make_ngram(
-    ctx: &mut SearchContext,
-    terms: &[LocatedQueryTerm],
-    number_of_typos_allowed: &impl Fn(&str) -> u8,
-) -> Result<Option<LocatedQueryTerm>> {
-    assert!(!terms.is_empty());
-    for t in terms {
-        if ctx.term_interner.get(t.value).zero_typo.phrase.is_some() {
-            return Ok(None);
-        }
-    }
-    for ts in terms.windows(2) {
-        let [t1, t2] = ts else { panic!() };
-        if *t1.positions.end() != t2.positions.start() - 1 {
-            return Ok(None);
-        }
-    }
-    let mut words_interned = vec![];
-    for term in terms {
-        if let Some(original_term_word) = term.value.original_single_word(ctx) {
-            words_interned.push(original_term_word);
-        } else {
-            return Ok(None);
-        }
-    }
-    let words =
-        words_interned.iter().map(|&i| ctx.word_interner.get(i).to_owned()).collect::<Vec<_>>();
-
-    let start = *terms.first().as_ref().unwrap().positions.start();
-    let end = *terms.last().as_ref().unwrap().positions.end();
-    let is_prefix = ctx.term_interner.get(terms.last().as_ref().unwrap().value).is_prefix;
-    let ngram_str = words.join("");
-    if ngram_str.len() > MAX_WORD_LENGTH {
-        return Ok(None);
-    }
-    let ngram_str_interned = ctx.word_interner.insert(ngram_str.clone());
-
-    let max_nbr_typos =
-        number_of_typos_allowed(ngram_str.as_str()).saturating_sub(terms.len() as u8 - 1);
-
-    let mut term = partially_initialized_term_from_word(ctx, &ngram_str, max_nbr_typos, is_prefix)?;
-
-    // Now add the synonyms
-    let index_synonyms = ctx.index.synonyms(ctx.txn)?;
-
-    term.zero_typo.synonyms.extend(
-        index_synonyms.get(&words).cloned().unwrap_or_default().into_iter().map(|words| {
-            let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect();
-            ctx.phrase_interner.insert(Phrase { words })
-        }),
-    );
-
-    let term = QueryTerm {
-        original: ngram_str_interned,
-        ngram_words: Some(words_interned),
-        is_prefix,
-        max_nbr_typos,
-        zero_typo: term.zero_typo,
-        one_typo: Lazy::Uninit,
-        two_typo: Lazy::Uninit,
-    };
-
-    let term = LocatedQueryTerm { value: ctx.term_interner.push(term), positions: start..=end };
-
-    Ok(Some(term))
-}
diff --git a/milli/src/search/new/query_term/compute_derivations.rs b/milli/src/search/new/query_term/compute_derivations.rs
new file mode 100644
index 000000000..f95956fbf
--- /dev/null
+++ b/milli/src/search/new/query_term/compute_derivations.rs
@@ -0,0 +1,380 @@
+use fst::automaton::Str;
+use fst::{Automaton, IntoStreamer, Streamer};
+use heed::types::DecodeIgnore;
+use heed::BytesDecode;
+use std::borrow::Cow;
+use std::collections::BTreeSet;
+use std::ops::ControlFlow;
+
+use super::*;
+use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union};
+use crate::search::new::query_term::TwoTypoTerm;
+use crate::search::new::{limits, SearchContext};
+use crate::search::{build_dfa, get_first};
+use crate::{CboRoaringBitmapLenCodec, Result, MAX_WORD_LENGTH};
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
+pub enum NumberOfTypos {
+    Zero,
+    One,
+    Two,
+}
+
+pub enum ZeroOrOneTypo {
+    Zero,
+    One,
+}
+
+impl Interned<QueryTerm> {
+    pub fn compute_fully_if_needed(self, ctx: &mut SearchContext) -> Result<()> {
+        let s = ctx.term_interner.get_mut(self);
+        if s.max_nbr_typos == 0 {
+            s.one_typo = Lazy::Init(OneTypoTerm::default());
+            s.two_typo = Lazy::Init(TwoTypoTerm::default());
+        } else if s.max_nbr_typos == 1 && s.one_typo.is_uninit() {
+            assert!(s.two_typo.is_uninit());
+            self.initialize_one_typo_subterm(ctx)?;
+            let s = ctx.term_interner.get_mut(self);
+            assert!(s.one_typo.is_init());
+            s.two_typo = Lazy::Init(TwoTypoTerm::default());
+        } else if s.max_nbr_typos > 1 && s.two_typo.is_uninit() {
+            assert!(s.two_typo.is_uninit());
+            self.initialize_one_and_two_typo_subterm(ctx)?;
+            let s = ctx.term_interner.get_mut(self);
+            assert!(s.one_typo.is_init() && s.two_typo.is_init());
+        }
+        Ok(())
+    }
+}
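Note for reviewers: `compute_fully_if_needed` above fills only the `Lazy` slots that `max_nbr_typos` allows and marks the rest as initialized-but-empty, so later accessors can assume `Lazy::Init`. A standalone mirror of that staging logic with toy types (std only, not the patch's code):

    #[derive(Debug, PartialEq)]
    enum Lazy<T> {
        Uninit,
        Init(T),
    }

    #[derive(Debug, Default, PartialEq)]
    struct Derivations(Vec<&'static str>);

    struct Term {
        max_nbr_typos: u8,
        one_typo: Lazy<Derivations>,
        two_typo: Lazy<Derivations>,
    }

    impl Term {
        fn compute_fully_if_needed(&mut self) {
            match self.max_nbr_typos {
                // No typos allowed: both slots become initialized and empty.
                0 => {
                    self.one_typo = Lazy::Init(Derivations::default());
                    self.two_typo = Lazy::Init(Derivations::default());
                }
                // One typo allowed: compute one-typo words, two-typo stays empty.
                1 if matches!(self.one_typo, Lazy::Uninit) => {
                    self.one_typo = Lazy::Init(Derivations(vec!["one-typo word"]));
                    self.two_typo = Lazy::Init(Derivations::default());
                }
                // Two typos allowed: compute both buckets in one pass.
                _ if matches!(self.two_typo, Lazy::Uninit) => {
                    self.one_typo = Lazy::Init(Derivations(vec!["one-typo word"]));
                    self.two_typo = Lazy::Init(Derivations(vec!["two-typo word"]));
                }
                _ => {} // already computed
            }
        }
    }

    fn main() {
        let mut t = Term { max_nbr_typos: 0, one_typo: Lazy::Uninit, two_typo: Lazy::Uninit };
        t.compute_fully_if_needed();
        assert_eq!(t.two_typo, Lazy::Init(Derivations::default()));
    }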
+
+fn find_zero_typo_prefix_derivations(
+    word_interned: Interned<String>,
+    fst: fst::Set<Cow<'_, [u8]>>,
+    word_interner: &mut DedupInterner<String>,
+    mut visit: impl FnMut(Interned<String>) -> Result<ControlFlow<()>>,
+) -> Result<()> {
+    let word = word_interner.get(word_interned).to_owned();
+    let word = word.as_str();
+    let prefix = Str::new(word).starts_with();
+    let mut stream = fst.search(prefix).into_stream();
+
+    while let Some(derived_word) = stream.next() {
+        let derived_word = std::str::from_utf8(derived_word)?.to_owned();
+        let derived_word_interned = word_interner.insert(derived_word);
+        if derived_word_interned != word_interned {
+            let cf = visit(derived_word_interned)?;
+            if cf.is_break() {
+                break;
+            }
+        }
+    }
+    Ok(())
+}
+
+fn find_zero_one_typo_derivations(
+    ctx: &mut SearchContext,
+    word_interned: Interned<String>,
+    is_prefix: bool,
+    mut visit: impl FnMut(Interned<String>, ZeroOrOneTypo) -> Result<ControlFlow<()>>,
+) -> Result<()> {
+    let fst = ctx.get_words_fst()?;
+    let word = ctx.word_interner.get(word_interned).to_owned();
+    let word = word.as_str();
+
+    let dfa = build_dfa(word, 1, is_prefix);
+    let starts = StartsWith(Str::new(get_first(word)));
+    let mut stream = fst.search_with_state(Intersection(starts, &dfa)).into_stream();
+
+    while let Some((derived_word, state)) = stream.next() {
+        let derived_word = std::str::from_utf8(derived_word)?;
+        let derived_word = ctx.word_interner.insert(derived_word.to_owned());
+        let d = dfa.distance(state.1);
+        match d.to_u8() {
+            0 => {
+                if derived_word != word_interned {
+                    let cf = visit(derived_word, ZeroOrOneTypo::Zero)?;
+                    if cf.is_break() {
+                        break;
+                    }
+                }
+            }
+            1 => {
+                let cf = visit(derived_word, ZeroOrOneTypo::One)?;
+                if cf.is_break() {
+                    break;
+                }
+            }
+            _ => {
+                unreachable!("One typo dfa produced multiple typos")
+            }
+        }
+    }
+    Ok(())
+}
+
+fn find_zero_one_two_typo_derivations(
+    word_interned: Interned<String>,
+    is_prefix: bool,
+    fst: fst::Set<Cow<'_, [u8]>>,
+    word_interner: &mut DedupInterner<String>,
+    mut visit: impl FnMut(Interned<String>, NumberOfTypos) -> Result<ControlFlow<()>>,
+) -> Result<()> {
+    let word = word_interner.get(word_interned).to_owned();
+    let word = word.as_str();
+
+    let starts = StartsWith(Str::new(get_first(word)));
+    let first = Intersection(build_dfa(word, 1, is_prefix), Complement(&starts));
+    let second_dfa = build_dfa(word, 2, is_prefix);
+    let second = Intersection(&second_dfa, &starts);
+    let automaton = Union(first, &second);
+
+    let mut stream = fst.search_with_state(automaton).into_stream();
+
+    while let Some((derived_word, state)) = stream.next() {
+        let derived_word = std::str::from_utf8(derived_word)?;
+        let derived_word_interned = word_interner.insert(derived_word.to_owned());
+        // in the case the typo is on the first letter, we know the number of typos
+        // is two
+        if get_first(derived_word) != get_first(word) {
+            let cf = visit(derived_word_interned, NumberOfTypos::Two)?;
+            if cf.is_break() {
+                break;
+            }
+        } else {
+            // Else, we know that it is the second dfa that matched and compute the
+            // correct distance
+            let d = second_dfa.distance((state.1).0);
+            match d.to_u8() {
+                0 => {
+                    if derived_word_interned != word_interned {
+                        let cf = visit(derived_word_interned, NumberOfTypos::Zero)?;
+                        if cf.is_break() {
+                            break;
+                        }
+                    }
+                }
+                1 => {
+                    let cf = visit(derived_word_interned, NumberOfTypos::One)?;
+                    if cf.is_break() {
+                        break;
+                    }
+                }
+                2 => {
+                    let cf = visit(derived_word_interned, NumberOfTypos::Two)?;
+                    if cf.is_break() {
+                        break;
+                    }
+                }
+                _ => unreachable!("2 typos DFA produced a distance greater than 2"),
+            }
+        }
+    }
+    Ok(())
+}
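Note for reviewers: all three finders above drive an FST stream through a visitor that returns `std::ops::ControlFlow`, letting the caller stop the traversal once its buckets (`MAX_ONE_TYPO_COUNT`, `MAX_TWO_TYPOS_COUNT`, ...) are full. A self-contained sketch of that visitor pattern, with a plain slice standing in for the FST stream:

    use std::ops::ControlFlow;

    // Visit candidates until the visitor asks to stop, like the `visit` closures above.
    fn for_each_derivation(candidates: &[&str], mut visit: impl FnMut(&str) -> ControlFlow<()>) {
        for c in candidates {
            if visit(c).is_break() {
                break;
            }
        }
    }

    fn main() {
        const MAX: usize = 2; // stand-in for limits::MAX_ONE_TYPO_COUNT
        let mut collected = Vec::new();
        for_each_derivation(&["hello", "hallo", "hullo", "hell"], |word| {
            if collected.len() < MAX {
                collected.push(word.to_owned());
                ControlFlow::Continue(())
            } else {
                ControlFlow::Break(())
            }
        });
        assert_eq!(collected, ["hello", "hallo"]);
    }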
+
+pub fn partially_initialized_term_from_word(
+    ctx: &mut SearchContext,
+    word: &str,
+    max_typo: u8,
+    is_prefix: bool,
+) -> Result<QueryTerm> {
+    let word_interned = ctx.word_interner.insert(word.to_owned());
+
+    if word.len() > MAX_WORD_LENGTH {
+        return Ok({
+            QueryTerm {
+                original: ctx.word_interner.insert(word.to_owned()),
+                ngram_words: None,
+                is_prefix: false,
+                max_nbr_typos: 0,
+                zero_typo: <_>::default(),
+                one_typo: Lazy::Init(<_>::default()),
+                two_typo: Lazy::Init(<_>::default()),
+            }
+        });
+    }
+
+    let fst = ctx.index.words_fst(ctx.txn)?;
+
+    let use_prefix_db = is_prefix
+        && ctx
+            .index
+            .word_prefix_docids
+            .remap_data_type::<DecodeIgnore>()
+            .get(ctx.txn, word)?
+            .is_some();
+    let use_prefix_db = if use_prefix_db { Some(word_interned) } else { None };
+
+    let mut zero_typo = None;
+    let mut prefix_of = BTreeSet::new();
+
+    if fst.contains(word) {
+        zero_typo = Some(word_interned);
+    }
+
+    if is_prefix && use_prefix_db.is_none() {
+        find_zero_typo_prefix_derivations(
+            word_interned,
+            fst,
+            &mut ctx.word_interner,
+            |derived_word| {
+                if prefix_of.len() < limits::MAX_PREFIX_COUNT {
+                    prefix_of.insert(derived_word);
+                    Ok(ControlFlow::Continue(()))
+                } else {
+                    Ok(ControlFlow::Break(()))
+                }
+            },
+        )?;
+    }
+    let synonyms = ctx.index.synonyms(ctx.txn)?;
+    let mut synonym_word_count = 0;
+    let synonyms = synonyms
+        .get(&vec![word.to_owned()])
+        .cloned()
+        .unwrap_or_default()
+        .into_iter()
+        .take(limits::MAX_SYNONYM_PHRASE_COUNT)
+        .filter_map(|words| {
+            if synonym_word_count + words.len() > limits::MAX_SYNONYM_WORD_COUNT {
+                return None;
+            }
+            synonym_word_count += words.len();
+            let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect();
+            Some(ctx.phrase_interner.insert(Phrase { words }))
+        })
+        .collect();
+    let zero_typo = ZeroTypoTerm { phrase: None, zero_typo, prefix_of, synonyms, use_prefix_db };
+
+    Ok(QueryTerm {
+        original: word_interned,
+        ngram_words: None,
+        max_nbr_typos: max_typo,
+        is_prefix,
+        zero_typo,
+        one_typo: Lazy::Uninit,
+        two_typo: Lazy::Uninit,
+    })
+}
+
+fn find_split_words(ctx: &mut SearchContext, word: &str) -> Result<Option<Interned<Phrase>>> {
+    if let Some((l, r)) = split_best_frequency(ctx, word)? {
+        Ok(Some(ctx.phrase_interner.insert(Phrase { words: vec![Some(l), Some(r)] })))
+    } else {
+        Ok(None)
+    }
+}
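Note for reviewers: `partially_initialized_term_from_word` above caps synonyms with both a phrase budget (`MAX_SYNONYM_PHRASE_COUNT`, via `take`) and a running word budget (`MAX_SYNONYM_WORD_COUNT`, inside `filter_map`). The same shape in isolation, with made-up limits and data:

    fn main() {
        // Stand-ins for limits::MAX_SYNONYM_PHRASE_COUNT / MAX_SYNONYM_WORD_COUNT.
        const MAX_PHRASES: usize = 10;
        const MAX_WORDS: usize = 4;

        let synonyms: Vec<Vec<&str>> =
            vec![vec!["big", "apple"], vec!["new", "york", "city"], vec!["nyc"]];

        let mut word_count = 0;
        let kept: Vec<Vec<&str>> = synonyms
            .into_iter()
            .take(MAX_PHRASES)
            .filter_map(|words| {
                // Skip a phrase that would overflow the global word budget,
                // but keep scanning later (possibly shorter) phrases.
                if word_count + words.len() > MAX_WORDS {
                    return None;
                }
                word_count += words.len();
                Some(words)
            })
            .collect();

        // "new york city" is skipped (2 + 3 > 4), "nyc" still fits.
        assert_eq!(kept, vec![vec!["big", "apple"], vec!["nyc"]]);
    }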
+
+impl Interned<QueryTerm> {
+    fn initialize_one_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> {
+        let self_mut = ctx.term_interner.get_mut(self);
+        let QueryTerm { original, is_prefix, one_typo, .. } = self_mut;
+        let original = *original;
+        let is_prefix = *is_prefix;
+        // let original_str = ctx.word_interner.get(*original).to_owned();
+        if one_typo.is_init() {
+            return Ok(());
+        }
+        let mut one_typo_words = BTreeSet::new();
+
+        find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| {
+            match nbr_typos {
+                ZeroOrOneTypo::Zero => {}
+                ZeroOrOneTypo::One => {
+                    if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
+                        one_typo_words.insert(derived_word);
+                    } else {
+                        return Ok(ControlFlow::Break(()));
+                    }
+                }
+            }
+            Ok(ControlFlow::Continue(()))
+        })?;
+        let original_str = ctx.word_interner.get(original).to_owned();
+        let split_words = find_split_words(ctx, original_str.as_str())?;
+        let one_typo = OneTypoTerm { split_words, one_typo: one_typo_words };
+
+        let self_mut = ctx.term_interner.get_mut(self);
+        self_mut.one_typo = Lazy::Init(one_typo);
+
+        Ok(())
+    }
+    fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> {
+        let self_mut = ctx.term_interner.get_mut(self);
+        let QueryTerm { original, is_prefix, two_typo, .. } = self_mut;
+        let original_str = ctx.word_interner.get(*original).to_owned();
+        if two_typo.is_init() {
+            return Ok(());
+        }
+        let mut one_typo_words = BTreeSet::new();
+        let mut two_typo_words = BTreeSet::new();
+
+        find_zero_one_two_typo_derivations(
+            *original,
+            *is_prefix,
+            ctx.index.words_fst(ctx.txn)?,
+            &mut ctx.word_interner,
+            |derived_word, nbr_typos| {
+                if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT
+                    && two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT
+                {
+                    // No chance we will add either one- or two-typo derivations anymore, stop iterating.
+                    return Ok(ControlFlow::Break(()));
+                }
+                match nbr_typos {
+                    NumberOfTypos::Zero => {}
+                    NumberOfTypos::One => {
+                        if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
+                            one_typo_words.insert(derived_word);
+                        }
+                    }
+                    NumberOfTypos::Two => {
+                        if two_typo_words.len() < limits::MAX_TWO_TYPOS_COUNT {
+                            two_typo_words.insert(derived_word);
+                        }
+                    }
+                }
+                Ok(ControlFlow::Continue(()))
+            },
+        )?;
+        let split_words = find_split_words(ctx, original_str.as_str())?;
+        let self_mut = ctx.term_interner.get_mut(self);
+
+        let one_typo = OneTypoTerm { one_typo: one_typo_words, split_words };
+
+        let two_typo = TwoTypoTerm { two_typos: two_typo_words };
+
+        self_mut.one_typo = Lazy::Init(one_typo);
+        self_mut.two_typo = Lazy::Init(two_typo);
+
+        Ok(())
+    }
+}
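Note for reviewers: `split_best_frequency` (just below) enumerates every split point of the word via `char_indices().skip(1)` and keeps the pair with the highest proximity-1 frequency. The enumeration step on its own, with a made-up frequency lookup instead of the index:

    fn main() {
        let original = "sunflower";

        // Every (left, right) split at a char boundary, in scan order.
        let splits: Vec<(&str, &str)> =
            original.char_indices().skip(1).map(|(i, _)| original.split_at(i)).collect();
        assert_eq!(splits[0], ("s", "unflower"));
        assert_eq!(splits[2], ("sun", "flower"));

        // Pretend lookup of how often `left` is directly followed by `right`.
        let frequency = |left: &str, _right: &str| -> Option<u64> {
            if left == "sun" { Some(42) } else { None }
        };

        let best = splits
            .iter()
            .filter_map(|&(l, r)| frequency(l, r).map(|f| (f, l, r)))
            .max_by_key(|&(f, _, _)| f);
        assert_eq!(best, Some((42, "sun", "flower")));
    }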
+
+/// Split the original word into the two words that appear the
+/// most next to each other in the index.
+///
+/// Return `None` if the original word cannot be split.
+fn split_best_frequency(
+    ctx: &mut SearchContext,
+    original: &str,
+) -> Result<Option<(Interned<String>, Interned<String>)>> {
+    let chars = original.char_indices().skip(1);
+    let mut best = None;
+
+    for (i, _) in chars {
+        let (left, right) = original.split_at(i);
+        let left = ctx.word_interner.insert(left.to_owned());
+        let right = ctx.word_interner.insert(right.to_owned());
+
+        if let Some(docid_bytes) = ctx.get_db_word_pair_proximity_docids(left, right, 1)? {
+            let frequency =
+                CboRoaringBitmapLenCodec::bytes_decode(docid_bytes).ok_or(heed::Error::Decoding)?;
+            if best.map_or(true, |(old, _, _)| frequency > old) {
+                best = Some((frequency, left, right));
+            }
+        }
+    }
+
+    Ok(best.map(|(_, left, right)| (left, right)))
+}
diff --git a/milli/src/search/new/query_term/mod.rs b/milli/src/search/new/query_term/mod.rs
new file mode 100644
index 000000000..50977395b
--- /dev/null
+++ b/milli/src/search/new/query_term/mod.rs
@@ -0,0 +1,331 @@
+mod compute_derivations;
+mod ntypo_subset;
+mod parse_query;
+mod phrase;
+
+use super::interner::{DedupInterner, Interned};
+use super::{limits, SearchContext};
+use crate::Result;
+use std::collections::BTreeSet;
+use std::ops::RangeInclusive;
+
+pub use ntypo_subset::NTypoTermSubset;
+pub use parse_query::{located_query_terms_from_string, make_ngram, number_of_typos_allowed};
+pub use phrase::Phrase;
+
+use compute_derivations::partially_initialized_term_from_word;
+
+/**
+A set of word derivations attached to a location in the search query.
+*/
+#[derive(Clone, PartialEq, Eq, Hash)]
+pub struct LocatedQueryTermSubset {
+    pub term_subset: QueryTermSubset,
+    pub positions: RangeInclusive<u16>,
+    pub term_ids: RangeInclusive<u8>,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct QueryTermSubset {
+    original: Interned<QueryTerm>,
+    zero_typo_subset: NTypoTermSubset,
+    one_typo_subset: NTypoTermSubset,
+    two_typo_subset: NTypoTermSubset,
+}
+
+#[derive(Clone, PartialEq, Eq, Hash)]
+pub struct QueryTerm {
+    original: Interned<String>,
+    ngram_words: Option<Vec<Interned<String>>>,
+    max_nbr_typos: u8,
+    is_prefix: bool,
+    zero_typo: ZeroTypoTerm,
+    // May not be computed yet
+    one_typo: Lazy<OneTypoTerm>,
+    // May not be computed yet
+    two_typo: Lazy<TwoTypoTerm>,
+}
+
+// SubTerms will be in a dedup interner
+#[derive(Default, Clone, PartialEq, Eq, Hash)]
+struct ZeroTypoTerm {
+    /// The original phrase, if any
+    phrase: Option<Interned<Phrase>>,
+    /// A single word equivalent to the original term, with zero typos
+    zero_typo: Option<Interned<String>>,
+    /// All the words that contain the original word as prefix
+    prefix_of: BTreeSet<Interned<String>>,
+    /// All the synonyms of the original word or phrase
+    synonyms: BTreeSet<Interned<Phrase>>,
+    /// A prefix in the prefix databases matching the original word
+    use_prefix_db: Option<Interned<String>>,
+}
+#[derive(Default, Clone, PartialEq, Eq, Hash)]
+struct OneTypoTerm {
+    /// The original word split into multiple consecutive words
+    split_words: Option<Interned<Phrase>>,
+    /// Words that are 1 typo away from the original word
+    one_typo: BTreeSet<Interned<String>>,
+}
+#[derive(Default, Clone, PartialEq, Eq, Hash)]
+struct TwoTypoTerm {
+    /// Words that are 2 typos away from the original word
+    two_typos: BTreeSet<Interned<String>>,
+}
+
+#[derive(Clone, PartialEq, Eq, Hash)]
+pub enum Lazy<T> {
+    Uninit,
+    Init(T),
+}
+impl<T> Lazy<T> {
+    pub fn is_init(&self) -> bool {
+        match self {
+            Lazy::Uninit => false,
+            Lazy::Init(_) => true,
+        }
+    }
+    pub fn is_uninit(&self) -> bool {
+        match self {
+            Lazy::Uninit => true,
+            Lazy::Init(_) => false,
+        }
+    }
+}
+
+impl QueryTermSubset {
+    pub fn empty(for_term: Interned<QueryTerm>) -> Self {
+        Self {
+            original: for_term,
+            zero_typo_subset: NTypoTermSubset::Nothing,
+            one_typo_subset: NTypoTermSubset::Nothing,
+            two_typo_subset: NTypoTermSubset::Nothing,
+        }
+    }
+    pub fn full(for_term: Interned<QueryTerm>) -> Self {
+        Self {
+            original: for_term,
+            zero_typo_subset: NTypoTermSubset::All,
+            one_typo_subset: NTypoTermSubset::All,
+            two_typo_subset: NTypoTermSubset::All,
+        }
+    }
+
+    pub fn union(&mut self, other: &Self) {
+        assert!(self.original == other.original);
+        self.zero_typo_subset.union(&other.zero_typo_subset);
+        self.one_typo_subset.union(&other.one_typo_subset);
+        self.two_typo_subset.union(&other.two_typo_subset);
+    }
+    pub fn intersect(&mut self, other: &Self) {
+        assert!(self.original == other.original);
+        self.zero_typo_subset.intersect(&other.zero_typo_subset);
+        self.one_typo_subset.intersect(&other.one_typo_subset);
+        self.two_typo_subset.intersect(&other.two_typo_subset);
+    }
+
+    pub fn use_prefix_db(&self, ctx: &SearchContext) -> Option<Interned<String>> {
+        let original = ctx.term_interner.get(self.original);
+        let Some(use_prefix_db) = original.zero_typo.use_prefix_db else {
+            return None
+        };
+        match &self.zero_typo_subset {
+            NTypoTermSubset::All => Some(use_prefix_db),
+            NTypoTermSubset::Subset { words, phrases: _ } => {
+                // TODO: use a subset of prefix words instead
+                if words.contains(&use_prefix_db) {
+                    Some(use_prefix_db)
+                } else {
+                    None
+                }
+            }
+            NTypoTermSubset::Nothing => None,
+        }
+    }
+    pub fn all_single_words_except_prefix_db(
+        &self,
+        ctx: &mut SearchContext,
+    ) -> Result<BTreeSet<Interned<String>>> {
+        let mut result = BTreeSet::default();
+        // TODO: a compute_partially function
+        if !self.one_typo_subset.is_empty() || !self.two_typo_subset.is_empty() {
+            self.original.compute_fully_if_needed(ctx)?;
+        }
+
+        let original = ctx.term_interner.get_mut(self.original);
+        if !self.zero_typo_subset.is_empty() {
+            let ZeroTypoTerm { phrase: _, zero_typo, prefix_of, synonyms: _, use_prefix_db: _ } =
+                &original.zero_typo;
+            result.extend(zero_typo.iter().copied());
+            result.extend(prefix_of.iter().copied());
+        };
+
+        match &self.one_typo_subset {
+            NTypoTermSubset::All => {
+                let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else {
+                    panic!()
+                };
+                result.extend(one_typo.iter().copied())
+            }
+            NTypoTermSubset::Subset { words, phrases: _ } => {
+                let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else {
+                    panic!()
+                };
+                result.extend(one_typo.intersection(words));
+            }
+            NTypoTermSubset::Nothing => {}
+        };
+
+        match &self.two_typo_subset {
+            NTypoTermSubset::All => {
+                let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else {
+                    panic!()
+                };
+                result.extend(two_typos.iter().copied());
+            }
+            NTypoTermSubset::Subset { words, phrases: _ } => {
+                let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else {
+                    panic!()
+                };
+                result.extend(two_typos.intersection(words));
+            }
+            NTypoTermSubset::Nothing => {}
+        };
+
+        Ok(result)
+    }
+    pub fn all_phrases(&self, ctx: &mut SearchContext) -> Result<BTreeSet<Interned<Phrase>>> {
+        let mut result = BTreeSet::default();
+
+        if !self.one_typo_subset.is_empty() {
+            // TODO: compute less than fully if possible
+            self.original.compute_fully_if_needed(ctx)?;
+        }
+        let original = ctx.term_interner.get_mut(self.original);
+
+        let ZeroTypoTerm { phrase, zero_typo: _, prefix_of: _, synonyms, use_prefix_db: _ } =
+            &original.zero_typo;
+        result.extend(phrase.iter().copied());
+        result.extend(synonyms.iter().copied());
+
+        if !self.one_typo_subset.is_empty() {
+            let Lazy::Init(OneTypoTerm { split_words, one_typo: _ }) = &original.one_typo else {
+                panic!();
+            };
+            result.extend(split_words.iter().copied());
+        }
+
+        Ok(result)
+    }
+
+    pub fn original_phrase(&self, ctx: &SearchContext) -> Option<Interned<Phrase>> {
+        let t = ctx.term_interner.get(self.original);
+        if let Some(p) = t.zero_typo.phrase {
+            if self.zero_typo_subset.contains_phrase(p) {
+                return Some(p);
+            }
+        }
+        None
+    }
+    pub fn max_nbr_typos(&self, ctx: &SearchContext) -> u8 {
+        let t = ctx.term_interner.get(self.original);
+        match t.max_nbr_typos {
+            0 => 0,
+            1 => {
+                if self.one_typo_subset.is_empty() {
+                    0
+                } else {
+                    1
+                }
+            }
+            2 => {
+                if self.two_typo_subset.is_empty() {
+                    if self.one_typo_subset.is_empty() {
+                        0
+                    } else {
+                        1
+                    }
+                } else {
+                    2
+                }
+            }
+            _ => panic!(),
+        }
+    }
+    pub fn clear_zero_typo_subset(&mut self) {
+        self.zero_typo_subset = NTypoTermSubset::Nothing;
+    }
+    pub fn clear_one_typo_subset(&mut self) {
+        self.one_typo_subset = NTypoTermSubset::Nothing;
+    }
+    pub fn clear_two_typo_subset(&mut self) {
+        self.two_typo_subset = NTypoTermSubset::Nothing;
+    }
+    pub fn description(&self, ctx: &SearchContext) -> String {
+        let t = ctx.term_interner.get(self.original);
+        ctx.word_interner.get(t.original).to_owned()
+    }
+}
+
+impl ZeroTypoTerm {
+    fn is_empty(&self) -> bool {
+        let ZeroTypoTerm { phrase, zero_typo, prefix_of, synonyms, use_prefix_db } = self;
+        phrase.is_none()
+            && zero_typo.is_none()
+            && prefix_of.is_empty()
+            && synonyms.is_empty()
+            && use_prefix_db.is_none()
+    }
+}
+impl OneTypoTerm {
+    fn is_empty(&self) -> bool {
+        let OneTypoTerm { split_words, one_typo } = self;
+        one_typo.is_empty() && split_words.is_none()
+    }
+}
+impl TwoTypoTerm {
+    fn is_empty(&self) -> bool {
+        let TwoTypoTerm { two_typos } = self;
+        two_typos.is_empty()
+    }
+}
+
+impl QueryTerm {
+    fn is_empty(&self) -> bool {
+        let Lazy::Init(one_typo) = &self.one_typo else {
+            return false;
+        };
+        let Lazy::Init(two_typo) = &self.two_typo else {
+            return false;
+        };
+
+        self.zero_typo.is_empty() && one_typo.is_empty() && two_typo.is_empty()
+    }
+}
+
+impl Interned<QueryTerm> {
+    /// Return the original word from the given query term
+    fn original_single_word(self, ctx: &SearchContext) -> Option<Interned<String>> {
+        let self_ = ctx.term_interner.get(self);
+        if self_.ngram_words.is_some() {
+            None
+        } else {
+            Some(self_.original)
+        }
+    }
+}
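Note for reviewers: `max_nbr_typos` above clamps the term's static typo budget by which typo subsets are still populated, so a two-typo term whose two-typo subset was cleared behaves like a one-typo term. A pure mirror of that match with sample checks (not the patch's code):

    fn effective_max_typos(max_nbr_typos: u8, one_empty: bool, two_empty: bool) -> u8 {
        match max_nbr_typos {
            0 => 0,
            1 => {
                if one_empty { 0 } else { 1 }
            }
            2 => {
                if two_empty {
                    if one_empty { 0 } else { 1 }
                } else {
                    2
                }
            }
            _ => panic!("max_nbr_typos is always 0, 1 or 2"),
        }
    }

    fn main() {
        assert_eq!(effective_max_typos(2, false, true), 1);
        assert_eq!(effective_max_typos(2, true, true), 0);
        assert_eq!(effective_max_typos(1, false, false), 1);
    }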
+
+/// A query term coupled with its position in the user's search query.
+#[derive(Clone)]
+pub struct LocatedQueryTerm {
+    pub value: Interned<QueryTerm>,
+    pub positions: RangeInclusive<u16>,
+}
+
+impl LocatedQueryTerm {
+    /// Return `true` iff the term is empty
+    pub fn is_empty(&self, interner: &DedupInterner<QueryTerm>) -> bool {
+        interner.get(self.value).is_empty()
+    }
+}
diff --git a/milli/src/search/new/query_term/ntypo_subset.rs b/milli/src/search/new/query_term/ntypo_subset.rs
new file mode 100644
index 000000000..ad25d73c7
--- /dev/null
+++ b/milli/src/search/new/query_term/ntypo_subset.rs
@@ -0,0 +1,80 @@
+use std::collections::BTreeSet;
+
+use crate::search::new::interner::Interned;
+
+use super::Phrase;
+
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub enum NTypoTermSubset {
+    All,
+    Subset {
+        words: BTreeSet<Interned<String>>,
+        phrases: BTreeSet<Interned<Phrase>>,
+        // TODO: prefixes: BTreeSet<Interned<String>>,
+    },
+    Nothing,
+}
+
+impl NTypoTermSubset {
+    pub fn contains_word(&self, word: Interned<String>) -> bool {
+        match self {
+            NTypoTermSubset::All => true,
+            NTypoTermSubset::Subset { words, phrases: _ } => words.contains(&word),
+            NTypoTermSubset::Nothing => false,
+        }
+    }
+    pub fn contains_phrase(&self, phrase: Interned<Phrase>) -> bool {
+        match self {
+            NTypoTermSubset::All => true,
+            NTypoTermSubset::Subset { words: _, phrases } => phrases.contains(&phrase),
+            NTypoTermSubset::Nothing => false,
+        }
+    }
+    pub fn is_empty(&self) -> bool {
+        match self {
+            NTypoTermSubset::All => false,
+            NTypoTermSubset::Subset { words, phrases } => words.is_empty() && phrases.is_empty(),
+            NTypoTermSubset::Nothing => true,
+        }
+    }
+    pub fn union(&mut self, other: &Self) {
+        match self {
+            Self::All => {}
+            Self::Subset { words, phrases } => match other {
+                Self::All => {
+                    *self = Self::All;
+                }
+                Self::Subset { words: w2, phrases: p2 } => {
+                    words.extend(w2);
+                    phrases.extend(p2);
+                }
+                Self::Nothing => {}
+            },
+            Self::Nothing => {
+                *self = other.clone();
+            }
+        }
+    }
+    pub fn intersect(&mut self, other: &Self) {
+        match self {
+            Self::All => *self = other.clone(),
+            Self::Subset { words, phrases } => match other {
+                Self::All => {}
+                Self::Subset { words: w2, phrases: p2 } => {
+                    let mut ws = BTreeSet::new();
+                    for w in words.intersection(w2) {
+                        ws.insert(*w);
+                    }
+                    let mut ps = BTreeSet::new();
+                    for p in phrases.intersection(p2) {
+                        ps.insert(*p);
+                    }
+                    *words = ws;
+                    *phrases = ps;
+                }
+                Self::Nothing => *self = Self::Nothing,
+            },
+            Self::Nothing => {}
+        }
+    }
+}
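Note for reviewers: `NTypoTermSubset` above is a three-valued set (`All`, explicit `Subset`, `Nothing`) whose union/intersect only do real `BTreeSet` work in the `Subset` x `Subset` case. A trimmed, runnable mirror over plain strings (not the patch's code):

    use std::collections::BTreeSet;

    #[derive(Clone, Debug, PartialEq)]
    enum SetOrAll {
        All,
        Subset(BTreeSet<&'static str>),
        Nothing,
    }

    impl SetOrAll {
        fn intersect(&mut self, other: &Self) {
            match self {
                // All is the identity of intersection.
                Self::All => *self = other.clone(),
                Self::Subset(words) => match other {
                    Self::All => {}
                    Self::Subset(w2) => {
                        *words = words.intersection(w2).copied().collect();
                    }
                    Self::Nothing => *self = Self::Nothing,
                },
                // Nothing absorbs everything.
                Self::Nothing => {}
            }
        }
    }

    fn main() {
        let mut a = SetOrAll::Subset(BTreeSet::from(["cat", "car", "cap"]));
        a.intersect(&SetOrAll::Subset(BTreeSet::from(["car", "cap", "can"])));
        assert_eq!(a, SetOrAll::Subset(BTreeSet::from(["cap", "car"])));

        let mut b = SetOrAll::All;
        b.intersect(&SetOrAll::Nothing);
        assert_eq!(b, SetOrAll::Nothing);
    }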
+    let mut position = u16::MAX;
+
+    let mut peekable = query.take(super::limits::MAX_TOKEN_COUNT).peekable();
+    while let Some(token) = peekable.next() {
+        // early return if word limit is exceeded
+        if located_terms.len() >= parts_limit {
+            return Ok(located_terms);
+        }
+
+        match token.kind {
+            TokenKind::Word | TokenKind::StopWord => {
+                // On first loop, goes from u16::MAX to 0, then normal increment.
+                position = position.wrapping_add(1);
+
+                // 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote,
+                // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word,
+                // 3. if the word is the last token of the query we push it as a prefix word.
+                if let Some(phrase) = &mut phrase {
+                    phrase.push_word(ctx, &token, position)
+                } else if peekable.peek().is_some() {
+                    match token.kind {
+                        TokenKind::Word => {
+                            let word = token.lemma();
+                            let term = partially_initialized_term_from_word(
+                                ctx,
+                                word,
+                                nbr_typos(word),
+                                false,
+                            )?;
+                            let located_term = LocatedQueryTerm {
+                                value: ctx.term_interner.push(term),
+                                positions: position..=position,
+                            };
+                            located_terms.push(located_term);
+                        }
+                        TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => {}
+                    }
+                } else {
+                    let word = token.lemma();
+                    let term =
+                        partially_initialized_term_from_word(ctx, word, nbr_typos(word), true)?;
+                    let located_term = LocatedQueryTerm {
+                        value: ctx.term_interner.push(term),
+                        positions: position..=position,
+                    };
+                    located_terms.push(located_term);
+                }
+            }
+            TokenKind::Separator(separator_kind) => {
+                match separator_kind {
+                    SeparatorKind::Hard => {
+                        position += 1;
+                    }
+                    SeparatorKind::Soft => {
+                        position += 0;
+                    }
+                }
+
+                phrase = 'phrase: {
+                    let phrase = phrase.take();
+
+                    // If we have a hard separator inside a phrase, we immediately start a new phrase
+                    let phrase = if separator_kind == SeparatorKind::Hard {
+                        if let Some(phrase) = phrase {
+                            if let Some(located_query_term) = phrase.build(ctx) {
+                                located_terms.push(located_query_term)
+                            }
+                            Some(PhraseBuilder::empty())
+                        } else {
+                            None
+                        }
+                    } else {
+                        phrase
+                    };
+
+                    // We close and start a new phrase depending on the number of double quotes
+                    let mut quote_count = token.lemma().chars().filter(|&s| s == '"').count();
+                    if quote_count == 0 {
+                        break 'phrase phrase;
+                    }
+
+                    // Consume the closing quote and the phrase
+                    if let Some(phrase) = phrase {
+                        // Per the check above, quote_count > 0
+                        quote_count -= 1;
+                        if let Some(located_query_term) = phrase.build(ctx) {
+                            located_terms.push(located_query_term)
+                        }
+                    }
+
+                    // Start new phrase if the token ends with an opening quote
+                    (quote_count % 2 == 1).then_some(PhraseBuilder::empty())
+                };
+            }
+            _ => (),
+        }
+    }
+
+    // If a quote is never closed, we consider all of the end of the query as a phrase.
+    if let Some(phrase) = phrase.take() {
+        if let Some(located_query_term) = phrase.build(ctx) {
+            located_terms.push(located_query_term);
+        }
+    }
+
+    Ok(located_terms)
+}
+
+pub fn number_of_typos_allowed<'ctx>(
+    ctx: &SearchContext<'ctx>,
+) -> Result<impl Fn(&str) -> u8 + 'ctx> {
+    let authorize_typos = ctx.index.authorize_typos(ctx.txn)?;
+    let min_len_one_typo = ctx.index.min_word_len_one_typo(ctx.txn)?;
+    let min_len_two_typos = ctx.index.min_word_len_two_typos(ctx.txn)?;
+
+    // TODO: should `exact_words` also disable prefix search, ngrams, split words, or synonyms?
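+    // Worked example for the closure below (assuming the default thresholds,
+    // one typo from 5 bytes and two from 9, and an empty `exact_words` list):
+    // "kiwi" gets 0 typos, "kitten" gets 1, "velociraptor" gets 2. A word
+    // listed in `exact_words` always gets 0.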
+    let exact_words = ctx.index.exact_words(ctx.txn)?;
+
+    Ok(Box::new(move |word: &str| {
+        if !authorize_typos
+            || word.len() < min_len_one_typo as usize
+            || exact_words.as_ref().map_or(false, |fst| fst.contains(word))
+        {
+            0
+        } else if word.len() < min_len_two_typos as usize {
+            1
+        } else {
+            2
+        }
+    }))
+}
+
+pub fn make_ngram(
+    ctx: &mut SearchContext,
+    terms: &[LocatedQueryTerm],
+    number_of_typos_allowed: &impl Fn(&str) -> u8,
+) -> Result<Option<LocatedQueryTerm>> {
+    assert!(!terms.is_empty());
+    for t in terms {
+        if ctx.term_interner.get(t.value).zero_typo.phrase.is_some() {
+            return Ok(None);
+        }
+    }
+    for ts in terms.windows(2) {
+        let [t1, t2] = ts else { panic!() };
+        if *t1.positions.end() != t2.positions.start() - 1 {
+            return Ok(None);
+        }
+    }
+    let mut words_interned = vec![];
+    for term in terms {
+        if let Some(original_term_word) = term.value.original_single_word(ctx) {
+            words_interned.push(original_term_word);
+        } else {
+            return Ok(None);
+        }
+    }
+    let words =
+        words_interned.iter().map(|&i| ctx.word_interner.get(i).to_owned()).collect::<Vec<_>>();
+
+    let start = *terms.first().as_ref().unwrap().positions.start();
+    let end = *terms.last().as_ref().unwrap().positions.end();
+    let is_prefix = ctx.term_interner.get(terms.last().as_ref().unwrap().value).is_prefix;
+    let ngram_str = words.join("");
+    if ngram_str.len() > MAX_WORD_LENGTH {
+        return Ok(None);
+    }
+    let ngram_str_interned = ctx.word_interner.insert(ngram_str.clone());
+
+    let max_nbr_typos =
+        number_of_typos_allowed(ngram_str.as_str()).saturating_sub(terms.len() as u8 - 1);
+
+    let mut term = partially_initialized_term_from_word(ctx, &ngram_str, max_nbr_typos, is_prefix)?;
+
+    // Now add the synonyms
+    let index_synonyms = ctx.index.synonyms(ctx.txn)?;
+
+    term.zero_typo.synonyms.extend(
+        index_synonyms.get(&words).cloned().unwrap_or_default().into_iter().map(|words| {
+            let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect();
+            ctx.phrase_interner.insert(Phrase { words })
+        }),
+    );
+
+    let term = QueryTerm {
+        original: ngram_str_interned,
+        ngram_words: Some(words_interned),
+        is_prefix,
+        max_nbr_typos,
+        zero_typo: term.zero_typo,
+        one_typo: Lazy::Uninit,
+        two_typo: Lazy::Uninit,
+    };
+
+    let term = LocatedQueryTerm { value: ctx.term_interner.push(term), positions: start..=end };
+
+    Ok(Some(term))
+}
+
+struct PhraseBuilder {
+    words: Vec<Option<Interned<String>>>,
+    start: u16,
+    end: u16,
+}
+
+impl PhraseBuilder {
+    fn empty() -> Self {
+        Self { words: Default::default(), start: u16::MAX, end: u16::MAX }
+    }
+
+    fn is_empty(&self) -> bool {
+        self.words.is_empty()
+    }
+
+    // precondition: token has kind Word or StopWord
+    fn push_word(&mut self, ctx: &mut SearchContext, token: &charabia::Token, position: u16) {
+        if self.is_empty() {
+            self.start = position;
+        }
+        self.end = position;
+        if let TokenKind::StopWord = token.kind {
+            self.words.push(None);
+        } else {
+            // token has kind Word
+            let word = ctx.word_interner.insert(token.lemma().to_string());
+            // TODO: in a phrase, check that every word exists
+            // otherwise return an empty term
+            self.words.push(Some(word));
+        }
+    }
+
+    fn build(self, ctx: &mut SearchContext) -> Option<LocatedQueryTerm> {
+        if self.is_empty() {
+            return None;
+        }
+        Some(LocatedQueryTerm {
+            value: ctx.term_interner.push({
+                let phrase = ctx.phrase_interner.insert(Phrase { words: self.words });
+                let phrase_desc = phrase.description(ctx);
+                QueryTerm {
+                    original: ctx.word_interner.insert(phrase_desc),
+                    ngram_words: None,
+                    max_nbr_typos: 0,
+                    is_prefix: false,
+                    zero_typo: ZeroTypoTerm {
+                        phrase: Some(phrase),
+                        zero_typo: None,
+                        prefix_of: BTreeSet::default(),
+                        synonyms: BTreeSet::default(),
+                        use_prefix_db: None,
+                    },
+                    one_typo: Lazy::Uninit,
+                    two_typo: Lazy::Uninit,
+                }
+            }),
+            positions: self.start..=self.end,
+        })
+    }
+}
diff --git a/milli/src/search/new/query_term/phrase.rs b/milli/src/search/new/query_term/phrase.rs
new file mode 100644
index 000000000..2ea8e0d39
--- /dev/null
+++ b/milli/src/search/new/query_term/phrase.rs
@@ -0,0 +1,16 @@
+use itertools::Itertools;
+
+use crate::{search::new::interner::Interned, SearchContext};
+
+/// A phrase in the user's search query, consisting of several words
+/// that must appear side-by-side in the search results.
+#[derive(Default, Clone, PartialEq, Eq, Hash)]
+pub struct Phrase {
+    pub words: Vec<Option<Interned<String>>>,
+}
+impl Interned<Phrase> {
+    pub fn description(self, ctx: &SearchContext) -> String {
+        let p = ctx.phrase_interner.get(self);
+        p.words.iter().flatten().map(|w| ctx.word_interner.get(*w)).join(" ")
+    }
+}
diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs
index 81c99fd9a..cfd3f62bf 100644
--- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs
@@ -57,9 +57,7 @@ impl RankingRuleGraphTrait for ProximityGraph {
                 Ok(format!("{cost}: cost"))
             }
             ProximityCondition::Term { term } => {
-                let original_term = ctx.term_interner.get(term.term_subset.original);
-                let original_word = ctx.word_interner.get(original_term.original);
-                Ok(format!("{original_word} : exists"))
+                Ok(format!("{} : exists", term.term_subset.description(ctx)))
             }
         }
     }
diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs
index de02b67a4..5d7e0f874 100644
--- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs
@@ -3,7 +3,7 @@ use roaring::RoaringBitmap;
 use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
 use crate::search::new::interner::{DedupInterner, Interned, MappedInterner};
 use crate::search::new::logger::SearchLogger;
-use crate::search::new::query_term::{LocatedQueryTermSubset, NTypoTermSubset};
+use crate::search::new::query_term::LocatedQueryTermSubset;
 use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
 use crate::search::new::{QueryGraph, QueryNode, SearchContext};
 use crate::Result;
@@ -43,8 +43,7 @@ impl RankingRuleGraphTrait for TypoGraph {
         _from: Option<&LocatedQueryTermSubset>,
         to_term: &LocatedQueryTermSubset,
     ) -> Result<Vec<(u32, Interned<Self::Condition>)>> {
-        let term = to_term; // LocatedQueryTermSubset { term_subset, positions: _, term_ids } = to_term;
-        let original_full_term = ctx.term_interner.get(term.term_subset.original);
+        let term = to_term;
         let mut edges = vec![];
 
         // Ngrams have a base typo cost
         // 2-gram -> equivalent to 1 typo
         // 3-gram -> equivalent to 2 typos
         let base_cost = if term.term_ids.len() == 1 { 0 } else { term.term_ids.len() as u32 };
 
-        for nbr_typos in 0..=original_full_term.max_nbr_typos {
+        for nbr_typos in 0..=term.term_subset.max_nbr_typos(ctx) {
             let mut term = term.clone();
             match nbr_typos {
                 0 => {
-                    term.term_subset.one_typo_subset = NTypoTermSubset::Nothing;
-                    term.term_subset.two_typo_subset = NTypoTermSubset::Nothing;
+                    term.term_subset.clear_one_typo_subset();
+                    term.term_subset.clear_two_typo_subset();
                 }
                 1 => {
-                    term.term_subset.zero_typo_subset = NTypoTermSubset::Nothing;
-                    term.term_subset.two_typo_subset = NTypoTermSubset::Nothing;
+                    term.term_subset.clear_zero_typo_subset();
+                    term.term_subset.clear_two_typo_subset();
                 }
                 2 => {
-                    term.term_subset.zero_typo_subset = NTypoTermSubset::Nothing;
-                    term.term_subset.one_typo_subset = NTypoTermSubset::Nothing;
+                    term.term_subset.clear_zero_typo_subset();
+                    term.term_subset.clear_one_typo_subset();
                 }
                 _ => panic!(),
             };
@@ -92,9 +91,6 @@ impl RankingRuleGraphTrait for TypoGraph {
     fn label_for_condition(ctx: &mut SearchContext, condition: &Self::Condition) -> Result<String> {
         let TypoCondition { term, nbr_typos } = condition;
-        let original_term = ctx.term_interner.get(term.term_subset.original);
-        let original = ctx.word_interner.get(original_term.original);
-
-        Ok(format!("{original}: {nbr_typos}"))
+        Ok(format!("{}: {nbr_typos}", term.term_subset.description(ctx)))
    }
 }
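
Aside (not part of the patch): a minimal sketch of the set algebra behind the new
`NTypoTermSubset` type, as a mental model for the typo-graph changes above. It is a
hypothetical `#[cfg(test)]`-style snippet and assumes `Interned::from_raw` is visible
to the caller, as it already is in `query_graph.rs`:

    use std::collections::BTreeSet;

    let w = Interned::from_raw(0);
    let mut subset = NTypoTermSubset::Subset {
        words: BTreeSet::from([w]),
        phrases: BTreeSet::new(),
    };

    // `All` is the identity element of `intersect`: nothing is removed.
    subset.intersect(&NTypoTermSubset::All);
    assert!(subset.contains_word(w));

    // `Nothing` is the identity element of `union`: nothing is added.
    subset.union(&NTypoTermSubset::Nothing);
    assert!(!subset.is_empty());

    // `Nothing` absorbs under `intersect`; the new `clear_*_typo_subset`
    // helpers reach the same end state by assigning `Nothing` directly.
    subset.intersect(&NTypoTermSubset::Nothing);
    assert!(subset.is_empty());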