diff --git a/milli/src/search/new/limits.rs b/milli/src/search/new/limits.rs new file mode 100644 index 000000000..33a5a4a6c --- /dev/null +++ b/milli/src/search/new/limits.rs @@ -0,0 +1,18 @@ +/// Maximum number of tokens we consider in a single search. +// TODO: Loic, find proper value here so we don't overflow the interner. +pub const MAX_TOKEN_COUNT: usize = 1_000; + +/// Maximum number of prefixes that can be derived from a single word. +pub const MAX_PREFIX_COUNT: usize = 1_000; +/// Maximum number of words that can be derived from a single word with a distance of one to that word. +pub const MAX_ONE_TYPO_COUNT: usize = 150; +/// Maximum number of words that can be derived from a single word with a distance of two to that word. +pub const MAX_TWO_TYPOS_COUNT: usize = 50; + +/// Maximum amount of synonym phrases that can be derived from a single word. +pub const MAX_SYNONYM_PHRASE_COUNT: usize = 50; + +/// Maximum amount of words inside of all the synonym phrases that can be derived from a single word. +/// +/// This limit is meant to gracefully handle the case where a word would have very long phrases as synonyms. +pub const MAX_SYNONYM_WORD_COUNT: usize = 100; diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 588b89123..4061df162 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -2,6 +2,7 @@ mod db_cache; mod distinct; mod graph_based_ranking_rule; mod interner; +mod limits; mod logger; mod query_graph; mod query_term; diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index f0930eb01..cba9e590f 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -254,28 +254,28 @@ impl QueryGraph { } for node_id in self.nodes.indexes() { let node = self.nodes.get(node_id); - let end_position = match &node.data { - QueryNodeData::Term(term) => *term.positions.end(), + let end_prev_term_id = match &node.data { + QueryNodeData::Term(term) => *term.term_ids.end() as i16, QueryNodeData::Start => -1, QueryNodeData::Deleted => continue, QueryNodeData::End => continue, }; let successors = { let mut successors = SmallBitmap::for_interned_values_in(&self.nodes); - let mut min = i8::MAX; + let mut min = i16::MAX; for (node_id, node) in self.nodes.iter() { - let start_position = match &node.data { - QueryNodeData::Term(term) => *term.positions.start(), - QueryNodeData::End => i8::MAX, + let start_next_term_id = match &node.data { + QueryNodeData::Term(term) => *term.term_ids.start() as i16, + QueryNodeData::End => i16::MAX, QueryNodeData::Start => continue, QueryNodeData::Deleted => continue, }; - if start_position <= end_position { + if start_next_term_id <= end_prev_term_id { continue; } - match start_position.cmp(&min) { + match start_next_term_id.cmp(&min) { Ordering::Less => { - min = start_position; + min = start_next_term_id; successors.clear(); successors.insert(node_id); } diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index f0c4c3921..d19ab6135 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -1,7 +1,6 @@ use std::borrow::Cow; use std::collections::BTreeSet; -use std::mem; -use std::ops::RangeInclusive; +use std::ops::{ControlFlow, RangeInclusive}; use charabia::normalizer::NormalizedTokenIter; use charabia::{SeparatorKind, TokenKind}; @@ -12,7 +11,7 @@ use heed::RoTxn; use itertools::Itertools; use super::interner::{DedupInterner, Interned}; -use super::SearchContext; +use super::{limits, SearchContext}; use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union}; use crate::search::{build_dfa, get_first}; use crate::{CboRoaringBitmapLenCodec, Index, Result, MAX_WORD_LENGTH}; @@ -135,7 +134,7 @@ pub struct QueryTermSubset { #[derive(Clone, PartialEq, Eq, Hash)] pub struct LocatedQueryTermSubset { pub term_subset: QueryTermSubset, - pub positions: RangeInclusive, + pub positions: RangeInclusive, pub term_ids: RangeInclusive, } @@ -418,7 +417,7 @@ fn find_zero_typo_prefix_derivations( word_interned: Interned, fst: fst::Set>, word_interner: &mut DedupInterner, - mut visit: impl FnMut(Interned) -> Result<()>, + mut visit: impl FnMut(Interned) -> Result>, ) -> Result<()> { let word = word_interner.get(word_interned).to_owned(); let word = word.as_str(); @@ -429,7 +428,10 @@ fn find_zero_typo_prefix_derivations( let derived_word = std::str::from_utf8(derived_word)?.to_owned(); let derived_word_interned = word_interner.insert(derived_word); if derived_word_interned != word_interned { - visit(derived_word_interned)?; + let cf = visit(derived_word_interned)?; + if cf.is_break() { + break; + } } } Ok(()) @@ -440,7 +442,7 @@ fn find_zero_one_typo_derivations( is_prefix: bool, fst: fst::Set>, word_interner: &mut DedupInterner, - mut visit: impl FnMut(Interned, ZeroOrOneTypo) -> Result<()>, + mut visit: impl FnMut(Interned, ZeroOrOneTypo) -> Result>, ) -> Result<()> { let word = word_interner.get(word_interned).to_owned(); let word = word.as_str(); @@ -448,7 +450,6 @@ fn find_zero_one_typo_derivations( let dfa = build_dfa(word, 1, is_prefix); let starts = StartsWith(Str::new(get_first(word))); let mut stream = fst.search_with_state(Intersection(starts, &dfa)).into_stream(); - // TODO: There may be wayyy too many matches (e.g. in the thousands), how to reduce them? while let Some((derived_word, state)) = stream.next() { let derived_word = std::str::from_utf8(derived_word)?; @@ -457,13 +458,21 @@ fn find_zero_one_typo_derivations( match d.to_u8() { 0 => { if derived_word != word_interned { - visit(derived_word, ZeroOrOneTypo::Zero)?; + let cf = visit(derived_word, ZeroOrOneTypo::Zero)?; + if cf.is_break() { + break; + } } } 1 => { - visit(derived_word, ZeroOrOneTypo::One)?; + let cf = visit(derived_word, ZeroOrOneTypo::One)?; + if cf.is_break() { + break; + } + } + _ => { + unreachable!("One typo dfa produced multiple typos") } - _ => panic!(), } } Ok(()) @@ -480,7 +489,7 @@ fn find_zero_one_two_typo_derivations( is_prefix: bool, fst: fst::Set>, word_interner: &mut DedupInterner, - mut visit: impl FnMut(Interned, NumberOfTypos) -> Result<()>, + mut visit: impl FnMut(Interned, NumberOfTypos) -> Result>, ) -> Result<()> { let word = word_interner.get(word_interned).to_owned(); let word = word.as_str(); @@ -492,7 +501,6 @@ fn find_zero_one_two_typo_derivations( let automaton = Union(first, &second); let mut stream = fst.search_with_state(automaton).into_stream(); - // TODO: There may be wayyy too many matches (e.g. in the thousands), how to reduce them? while let Some((derived_word, state)) = stream.next() { let derived_word = std::str::from_utf8(derived_word)?; @@ -500,7 +508,10 @@ fn find_zero_one_two_typo_derivations( // in the case the typo is on the first letter, we know the number of typo // is two if get_first(derived_word) != get_first(word) { - visit(derived_word_interned, NumberOfTypos::Two)?; + let cf = visit(derived_word_interned, NumberOfTypos::Two)?; + if cf.is_break() { + break; + } } else { // Else, we know that it is the second dfa that matched and compute the // correct distance @@ -508,16 +519,25 @@ fn find_zero_one_two_typo_derivations( match d.to_u8() { 0 => { if derived_word_interned != word_interned { - visit(derived_word_interned, NumberOfTypos::Zero)?; + let cf = visit(derived_word_interned, NumberOfTypos::Zero)?; + if cf.is_break() { + break; + } } } 1 => { - visit(derived_word_interned, NumberOfTypos::One)?; + let cf = visit(derived_word_interned, NumberOfTypos::One)?; + if cf.is_break() { + break; + } } 2 => { - visit(derived_word_interned, NumberOfTypos::Two)?; + let cf = visit(derived_word_interned, NumberOfTypos::Two)?; + if cf.is_break() { + break; + } } - _ => panic!(), + _ => unreachable!("2 typos DFA produced a distance greater than 2"), } } } @@ -560,21 +580,30 @@ fn partially_initialized_term_from_word( fst, &mut ctx.word_interner, |derived_word| { - prefix_of.insert(derived_word); - Ok(()) + if prefix_of.len() < limits::MAX_PREFIX_COUNT { + prefix_of.insert(derived_word); + Ok(ControlFlow::Continue(())) + } else { + Ok(ControlFlow::Break(())) + } }, )?; } let synonyms = ctx.index.synonyms(ctx.txn)?; - + let mut synonym_word_count = 0; let synonyms = synonyms .get(&vec![word.to_owned()]) .cloned() .unwrap_or_default() .into_iter() - .map(|words| { + .take(limits::MAX_SYNONYM_PHRASE_COUNT) + .filter_map(|words| { + if synonym_word_count + words.len() > limits::MAX_SYNONYM_WORD_COUNT { + return None; + } + synonym_word_count += words.len(); let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect(); - ctx.phrase_interner.insert(Phrase { words }) + Some(ctx.phrase_interner.insert(Phrase { words })) }) .collect(); let zero_typo = ZeroTypoTerm { phrase: None, zero_typo, prefix_of, synonyms, use_prefix_db }; @@ -629,10 +658,14 @@ impl QueryTerm { match nbr_typos { ZeroOrOneTypo::Zero => {} ZeroOrOneTypo::One => { - one_typo_words.insert(derived_word); + if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { + one_typo_words.insert(derived_word); + } else { + return Ok(ControlFlow::Break(())); + } } } - Ok(()) + Ok(ControlFlow::Continue(())) }, )?; let split_words = @@ -664,16 +697,26 @@ impl QueryTerm { index.words_fst(txn)?, word_interner, |derived_word, nbr_typos| { + if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT + && two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT + { + // No chance we will add either one- or two-typo derivations anymore, stop iterating. + return Ok(ControlFlow::Break(())); + } match nbr_typos { NumberOfTypos::Zero => {} NumberOfTypos::One => { - one_typo_words.insert(derived_word); + if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { + one_typo_words.insert(derived_word); + } } NumberOfTypos::Two => { - two_typo_words.insert(derived_word); + if two_typo_words.len() < limits::MAX_TWO_TYPOS_COUNT { + two_typo_words.insert(derived_word); + } } } - Ok(()) + Ok(ControlFlow::Continue(())) }, )?; let split_words = @@ -733,13 +776,66 @@ impl QueryTerm { /// A query term term coupled with its position in the user's search query. #[derive(Clone)] pub struct LocatedQueryTerm { - // should the query term subset really be interned? - // possibly, yes pub value: Interned, - pub positions: RangeInclusive, + pub positions: RangeInclusive, +} + +impl LocatedQueryTerm { + /// Return `true` iff the term is empty + pub fn is_empty(&self, interner: &DedupInterner) -> bool { + interner.get(self.value).is_empty() + } +} + +struct PhraseBuilder { + words: Vec>>, + start: u16, + end: u16, +} + +impl PhraseBuilder { + fn empty() -> Self { + Self { words: Default::default(), start: u16::MAX, end: u16::MAX } + } + + fn is_empty(&self) -> bool { + self.words.is_empty() + } + + // precondition: token has kind Word or StopWord + fn push_word(&mut self, ctx: &mut SearchContext, token: &charabia::Token, position: u16) { + if self.is_empty() { + self.start = position; + } + self.end = position; + if let TokenKind::StopWord = token.kind { + self.words.push(None); + } else { + // token has kind Word + let word = ctx.word_interner.insert(token.lemma().to_string()); + // TODO: in a phrase, check that every word exists + // otherwise return an empty term + self.words.push(Some(word)); + } + } + + fn build(self, ctx: &mut SearchContext) -> Option { + if self.is_empty() { + return None; + } + Some(LocatedQueryTerm { + value: ctx.term_interner.push(QueryTerm::phrase( + &mut ctx.word_interner, + &mut ctx.phrase_interner, + Phrase { words: self.words }, + )), + positions: self.start..=self.end, + }) + } } /// Convert the tokenised search query into a list of located query terms. +// TODO: checking if the positions are correct for phrases, separators, ngrams pub fn located_query_terms_from_string( ctx: &mut SearchContext, query: NormalizedTokenIter<&[u8]>, @@ -749,16 +845,14 @@ pub fn located_query_terms_from_string( let mut located_terms = Vec::new(); - let mut phrase = Vec::new(); - let mut quoted = false; + let mut phrase: Option = None; let parts_limit = words_limit.unwrap_or(usize::MAX); - let mut position = -1i8; - let mut phrase_start = -1i8; - let mut phrase_end = -1i8; + // start with the last position as we will wrap around to position 0 at the beginning of the loop below. + let mut position = u16::MAX; - let mut peekable = query.peekable(); + let mut peekable = query.take(super::limits::MAX_TOKEN_COUNT).peekable(); while let Some(token) = peekable.next() { // early return if word limit is exceeded if located_terms.len() >= parts_limit { @@ -767,23 +861,14 @@ pub fn located_query_terms_from_string( match token.kind { TokenKind::Word | TokenKind::StopWord => { - position += 1; + // On first loop, goes from u16::MAX to 0, then normal increment. + position = position.wrapping_add(1); + // 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote, // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word, // 3. if the word is the last token of the query we push it as a prefix word. - if quoted { - phrase_end = position; - if phrase.is_empty() { - phrase_start = position; - } - if let TokenKind::StopWord = token.kind { - phrase.push(None); - } else { - let word = ctx.word_interner.insert(token.lemma().to_string()); - // TODO: in a phrase, check that every word exists - // otherwise return an empty term - phrase.push(Some(word)); - } + if let Some(phrase) = &mut phrase { + phrase.push_word(ctx, &token, position) } else if peekable.peek().is_some() { match token.kind { TokenKind::Word => { @@ -822,40 +907,52 @@ pub fn located_query_terms_from_string( position += 0; } } - let quote_count = token.lemma().chars().filter(|&s| s == '"').count(); - // swap quoted state if we encounter a double quote - if quote_count % 2 != 0 { - quoted = !quoted; - } - // if there is a quote or a hard separator we close the phrase. - if !phrase.is_empty() && (quote_count > 0 || separator_kind == SeparatorKind::Hard) - { - let located_query_term = LocatedQueryTerm { - value: ctx.term_interner.push(QueryTerm::phrase( - &mut ctx.word_interner, - &mut ctx.phrase_interner, - Phrase { words: mem::take(&mut phrase) }, - )), - positions: phrase_start..=phrase_end, + + phrase = 'phrase: { + let phrase = phrase.take(); + + // If we have a hard separator inside a phrase, we immediately start a new phrase + let phrase = if separator_kind == SeparatorKind::Hard { + if let Some(phrase) = phrase { + if let Some(located_query_term) = phrase.build(ctx) { + located_terms.push(located_query_term) + } + Some(PhraseBuilder::empty()) + } else { + None + } + } else { + phrase }; - located_terms.push(located_query_term); - } + + // We close and start a new phrase depending on the number of double quotes + let mut quote_count = token.lemma().chars().filter(|&s| s == '"').count(); + if quote_count == 0 { + break 'phrase phrase; + } + + // Consume the closing quote and the phrase + if let Some(phrase) = phrase { + // Per the check above, quote_count > 0 + quote_count -= 1; + if let Some(located_query_term) = phrase.build(ctx) { + located_terms.push(located_query_term) + } + } + + // Start new phrase if the token ends with an opening quote + (quote_count % 2 == 1).then_some(PhraseBuilder::empty()) + }; } _ => (), } } // If a quote is never closed, we consider all of the end of the query as a phrase. - if !phrase.is_empty() { - let located_query_term = LocatedQueryTerm { - value: ctx.term_interner.push(QueryTerm::phrase( - &mut ctx.word_interner, - &mut ctx.phrase_interner, - Phrase { words: mem::take(&mut phrase) }, - )), - positions: phrase_start..=phrase_end, - }; - located_terms.push(located_query_term); + if let Some(phrase) = phrase.take() { + if let Some(located_query_term) = phrase.build(ctx) { + located_terms.push(located_query_term); + } } Ok(located_terms)