From 3f13608002355ed547590af925f283f6406e590c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Mon, 3 Apr 2023 15:27:49 +0200
Subject: [PATCH 01/21] Fix computation of ngram derivations

---
 milli/src/search/new/query_term.rs | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs
index d19ab6135..9d59b9999 100644
--- a/milli/src/search/new/query_term.rs
+++ b/milli/src/search/new/query_term.rs
@@ -1017,16 +1017,13 @@ pub fn make_ngram(
     if ngram_str.len() > MAX_WORD_LENGTH {
         return Ok(None);
     }
+    let ngram_str_interned = ctx.word_interner.insert(ngram_str.clone());
 
     let max_nbr_typos =
         number_of_typos_allowed(ngram_str.as_str()).saturating_sub(terms.len() as u8 - 1);
 
     let mut term = partially_initialized_term_from_word(ctx, &ngram_str, max_nbr_typos, is_prefix)?;
 
-    // let (_, mut zero_typo, mut one_typo, two_typo) =
-    //     all_subterms_from_word(ctx, &ngram_str, max_nbr_typos, is_prefix)?;
-
-    let original = ctx.word_interner.insert(words.join(" "));
-
     // Now add the synonyms
     let index_synonyms = ctx.index.synonyms(ctx.txn)?;
 
@@ -1038,7 +1035,7 @@ pub fn make_ngram(
     );
 
     let term = QueryTerm {
-        original,
+        original: ngram_str_interned,
         is_multiple_words: true,
         is_prefix,
         max_nbr_typos,

From 4129d657e2f36c173dcf89f4a5aa6f7da052d1f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Tue, 4 Apr 2023 15:01:42 +0200
Subject: [PATCH 02/21] Simplify query_term module a bit

---
 milli/src/search/new/db_cache.rs   |  13 ++
 milli/src/search/new/query_term.rs | 274 ++++++++++++-----------------
 2 files changed, 129 insertions(+), 158 deletions(-)

diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs
index af94108e2..b780af39f 100644
--- a/milli/src/search/new/db_cache.rs
+++ b/milli/src/search/new/db_cache.rs
@@ -1,3 +1,4 @@
+use std::borrow::Cow;
 use std::collections::hash_map::Entry;
 use std::hash::Hash;
 
@@ -24,6 +25,8 @@ pub struct DatabaseCache<'ctx> {
     pub word_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
     pub exact_word_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
     pub word_prefix_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
+
+    pub words_fst: Option<fst::Set<Cow<'ctx, [u8]>>>,
 }
 impl<'ctx> DatabaseCache<'ctx> {
     fn get_value<'v, K1, KC>(
@@ -49,6 +52,16 @@ impl<'ctx> DatabaseCache<'ctx> {
         }
     }
 }
 impl<'ctx> SearchContext<'ctx> {
+    pub fn get_words_fst(&mut self) -> Result<fst::Set<Cow<'ctx, [u8]>>> {
+        if let Some(fst) = self.db_cache.words_fst.clone() {
+            Ok(fst)
+        } else {
+            let fst = self.index.words_fst(self.txn)?;
+            self.db_cache.words_fst = Some(fst.clone());
+            Ok(fst)
+        }
+    }
+
     /// Retrieve or insert the given value in the `word_docids` database.
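+    ///
+    /// (Illustrative usage sketch, not part of the original patch; it assumes a
+    /// `SearchContext` named `ctx` and a word already interned through
+    /// `ctx.word_interner`.)
+    ///
+    /// ```ignore
+    /// let word = ctx.word_interner.insert("horse".to_owned());
+    /// let docids_bytes: Option<&[u8]> = ctx.get_db_word_docids(word)?;
+    /// ```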
pub fn get_db_word_docids(&mut self, word: Interned) -> Result> { DatabaseCache::get_value( diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index 9d59b9999..15e106e06 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -7,14 +7,14 @@ use charabia::{SeparatorKind, TokenKind}; use fst::automaton::Str; use fst::{Automaton, IntoStreamer, Streamer}; use heed::types::DecodeIgnore; -use heed::RoTxn; +use heed::BytesDecode; use itertools::Itertools; use super::interner::{DedupInterner, Interned}; use super::{limits, SearchContext}; use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union}; use crate::search::{build_dfa, get_first}; -use crate::{CboRoaringBitmapLenCodec, Index, Result, MAX_WORD_LENGTH}; +use crate::{CboRoaringBitmapLenCodec, Result, MAX_WORD_LENGTH}; /// A phrase in the user's search query, consisting of several words /// that must appear side-by-side in the search results. @@ -191,18 +191,13 @@ impl QueryTermSubset { &self, ctx: &mut SearchContext, ) -> Result>> { - let original = ctx.term_interner.get_mut(self.original); let mut result = BTreeSet::default(); // TODO: a compute_partially funtion if !self.one_typo_subset.is_empty() || !self.two_typo_subset.is_empty() { - original.compute_fully_if_needed( - ctx.index, - ctx.txn, - &mut ctx.word_interner, - &mut ctx.phrase_interner, - )?; + self.original.compute_fully_if_needed(ctx)?; } + let original = ctx.term_interner.get_mut(self.original); if !self.zero_typo_subset.is_empty() { let ZeroTypoTerm { phrase: _, zero_typo, prefix_of, synonyms: _, use_prefix_db: _ } = &original.zero_typo; @@ -245,18 +240,13 @@ impl QueryTermSubset { Ok(result) } pub fn all_phrases(&self, ctx: &mut SearchContext) -> Result>> { - let original = ctx.term_interner.get_mut(self.original); let mut result = BTreeSet::default(); if !self.one_typo_subset.is_empty() { // TODO: compute less than fully if possible - original.compute_fully_if_needed( - ctx.index, - ctx.txn, - &mut ctx.word_interner, - &mut ctx.phrase_interner, - )?; + self.original.compute_fully_if_needed(ctx)?; } + let original = ctx.term_interner.get_mut(self.original); let ZeroTypoTerm { phrase, zero_typo: _, prefix_of: _, synonyms, use_prefix_db: _ } = &original.zero_typo; @@ -274,26 +264,23 @@ impl QueryTermSubset { } } -impl QueryTerm { - pub fn compute_fully_if_needed( - &mut self, - index: &Index, - txn: &RoTxn, - word_interner: &mut DedupInterner, - phrase_interner: &mut DedupInterner, - ) -> Result<()> { - if self.max_nbr_typos == 0 { - self.one_typo = Lazy::Init(OneTypoTerm::default()); - self.two_typo = Lazy::Init(TwoTypoTerm::default()); - } else if self.max_nbr_typos == 1 && self.one_typo.is_uninit() { - assert!(self.two_typo.is_uninit()); - self.initialize_one_typo_subterm(index, txn, word_interner, phrase_interner)?; - assert!(self.one_typo.is_init()); - self.two_typo = Lazy::Init(TwoTypoTerm::default()); - } else if self.max_nbr_typos > 1 && self.two_typo.is_uninit() { - assert!(self.two_typo.is_uninit()); - self.initialize_one_and_two_typo_subterm(index, txn, word_interner, phrase_interner)?; - assert!(self.one_typo.is_init() && self.two_typo.is_init()); +impl Interned { + pub fn compute_fully_if_needed(self, ctx: &mut SearchContext) -> Result<()> { + let s = ctx.term_interner.get_mut(self); + if s.max_nbr_typos == 0 { + s.one_typo = Lazy::Init(OneTypoTerm::default()); + s.two_typo = Lazy::Init(TwoTypoTerm::default()); + } else if s.max_nbr_typos == 1 && s.one_typo.is_uninit() { + 
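+            // Exactly one typo is allowed here: compute the one-typo derivations
+            // lazily and mark the two-typo set as initialized-but-empty so it is
+            // never computed.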
assert!(s.two_typo.is_uninit()); + self.initialize_one_typo_subterm(ctx)?; + let s = ctx.term_interner.get_mut(self); + assert!(s.one_typo.is_init()); + s.two_typo = Lazy::Init(TwoTypoTerm::default()); + } else if s.max_nbr_typos > 1 && s.two_typo.is_uninit() { + assert!(s.two_typo.is_uninit()); + self.initialize_one_and_two_typo_subterm(ctx)?; + let s = ctx.term_interner.get_mut(self); + assert!(s.one_typo.is_init() && s.two_typo.is_init()); } Ok(()) } @@ -302,7 +289,7 @@ impl QueryTerm { #[derive(Clone, PartialEq, Eq, Hash)] pub struct QueryTerm { pub original: Interned, - pub is_multiple_words: bool, + pub ngram_words: Option>>, pub max_nbr_typos: u8, pub is_prefix: bool, pub zero_typo: ZeroTypoTerm, @@ -363,39 +350,6 @@ impl TwoTypoTerm { } impl QueryTerm { - pub fn phrase( - word_interner: &mut DedupInterner, - phrase_interner: &mut DedupInterner, - phrase: Phrase, - ) -> Self { - Self { - original: word_interner.insert(phrase.description(word_interner)), - is_multiple_words: false, - max_nbr_typos: 0, - is_prefix: false, - zero_typo: ZeroTypoTerm { - phrase: Some(phrase_interner.insert(phrase)), - zero_typo: None, - prefix_of: BTreeSet::default(), - synonyms: BTreeSet::default(), - use_prefix_db: None, - }, - one_typo: Lazy::Uninit, - two_typo: Lazy::Uninit, - } - } - pub fn empty(word_interner: &mut DedupInterner, original: &str) -> Self { - Self { - original: word_interner.insert(original.to_owned()), - is_multiple_words: false, - is_prefix: false, - max_nbr_typos: 0, - zero_typo: <_>::default(), - one_typo: Lazy::Init(<_>::default()), - two_typo: Lazy::Init(<_>::default()), - } - } - pub fn is_empty(&self) -> bool { let Lazy::Init(one_typo) = &self.one_typo else { return false; @@ -438,13 +392,13 @@ fn find_zero_typo_prefix_derivations( } fn find_zero_one_typo_derivations( + ctx: &mut SearchContext, word_interned: Interned, is_prefix: bool, - fst: fst::Set>, - word_interner: &mut DedupInterner, mut visit: impl FnMut(Interned, ZeroOrOneTypo) -> Result>, ) -> Result<()> { - let word = word_interner.get(word_interned).to_owned(); + let fst = ctx.get_words_fst()?; + let word = ctx.word_interner.get(word_interned).to_owned(); let word = word.as_str(); let dfa = build_dfa(word, 1, is_prefix); @@ -453,7 +407,7 @@ fn find_zero_one_typo_derivations( while let Some((derived_word, state)) = stream.next() { let derived_word = std::str::from_utf8(derived_word)?; - let derived_word = word_interner.insert(derived_word.to_owned()); + let derived_word = ctx.word_interner.insert(derived_word.to_owned()); let d = dfa.distance(state.1); match d.to_u8() { 0 => { @@ -553,7 +507,17 @@ fn partially_initialized_term_from_word( let word_interned = ctx.word_interner.insert(word.to_owned()); if word.len() > MAX_WORD_LENGTH { - return Ok(QueryTerm::empty(&mut ctx.word_interner, word)); + return Ok({ + QueryTerm { + original: ctx.word_interner.insert(word.to_owned()), + ngram_words: None, + is_prefix: false, + max_nbr_typos: 0, + zero_typo: <_>::default(), + one_typo: Lazy::Init(<_>::default()), + two_typo: Lazy::Init(<_>::default()), + } + }); } let fst = ctx.index.words_fst(ctx.txn)?; @@ -610,7 +574,7 @@ fn partially_initialized_term_from_word( Ok(QueryTerm { original: word_interned, - is_multiple_words: false, + ngram_words: None, max_nbr_typos: max_typo, is_prefix, zero_typo, @@ -619,72 +583,52 @@ fn partially_initialized_term_from_word( }) } -fn find_split_words( - index: &Index, - txn: &RoTxn, - word_interner: &mut DedupInterner, - phrase_interner: &mut DedupInterner, - word: &str, -) -> Result>> { - let 
split_words = split_best_frequency(index, txn, word)?.map(|(l, r)| { - phrase_interner.insert(Phrase { - words: vec![Some(word_interner.insert(l)), Some(word_interner.insert(r))], - }) - }); - Ok(split_words) +fn find_split_words(ctx: &mut SearchContext, word: &str) -> Result>> { + if let Some((l, r)) = split_best_frequency(ctx, word)? { + Ok(Some(ctx.phrase_interner.insert(Phrase { words: vec![Some(l), Some(r)] }))) + } else { + Ok(None) + } } -impl QueryTerm { - fn initialize_one_typo_subterm( - &mut self, - index: &Index, - txn: &RoTxn, - word_interner: &mut DedupInterner, - phrase_interner: &mut DedupInterner, - ) -> Result<()> { - let QueryTerm { original, is_prefix, one_typo, .. } = self; - let original_str = word_interner.get(*original).to_owned(); +impl Interned { + fn initialize_one_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> { + let self_mut = ctx.term_interner.get_mut(self); + let QueryTerm { original, is_prefix, one_typo, .. } = self_mut; + let original = *original; + let is_prefix = *is_prefix; + // let original_str = ctx.word_interner.get(*original).to_owned(); if one_typo.is_init() { return Ok(()); } let mut one_typo_words = BTreeSet::new(); - find_zero_one_typo_derivations( - *original, - *is_prefix, - index.words_fst(txn)?, - word_interner, - |derived_word, nbr_typos| { - match nbr_typos { - ZeroOrOneTypo::Zero => {} - ZeroOrOneTypo::One => { - if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { - one_typo_words.insert(derived_word); - } else { - return Ok(ControlFlow::Break(())); - } + find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| { + match nbr_typos { + ZeroOrOneTypo::Zero => {} + ZeroOrOneTypo::One => { + if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { + one_typo_words.insert(derived_word); + } else { + return Ok(ControlFlow::Break(())); } } - Ok(ControlFlow::Continue(())) - }, - )?; - let split_words = - find_split_words(index, txn, word_interner, phrase_interner, original_str.as_str())?; + } + Ok(ControlFlow::Continue(())) + })?; + let original_str = ctx.word_interner.get(original).to_owned(); + let split_words = find_split_words(ctx, original_str.as_str())?; let one_typo = OneTypoTerm { split_words, one_typo: one_typo_words }; - self.one_typo = Lazy::Init(one_typo); + let self_mut = ctx.term_interner.get_mut(self); + self_mut.one_typo = Lazy::Init(one_typo); Ok(()) } - fn initialize_one_and_two_typo_subterm( - &mut self, - index: &Index, - txn: &RoTxn, - word_interner: &mut DedupInterner, - phrase_interner: &mut DedupInterner, - ) -> Result<()> { - let QueryTerm { original, is_prefix, two_typo, .. } = self; - let original_str = word_interner.get(*original).to_owned(); + fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> { + let self_mut = ctx.term_interner.get_mut(self); + let QueryTerm { original, is_prefix, two_typo, .. 
} = self_mut; + let original_str = ctx.word_interner.get(*original).to_owned(); if two_typo.is_init() { return Ok(()); } @@ -694,8 +638,8 @@ impl QueryTerm { find_zero_one_two_typo_derivations( *original, *is_prefix, - index.words_fst(txn)?, - word_interner, + ctx.index.words_fst(ctx.txn)?, + &mut ctx.word_interner, |derived_word, nbr_typos| { if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT && two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT @@ -719,14 +663,15 @@ impl QueryTerm { Ok(ControlFlow::Continue(())) }, )?; - let split_words = - find_split_words(index, txn, word_interner, phrase_interner, original_str.as_str())?; + let split_words = find_split_words(ctx, original_str.as_str())?; + let self_mut = ctx.term_interner.get_mut(self); + let one_typo = OneTypoTerm { one_typo: one_typo_words, split_words }; let two_typo = TwoTypoTerm { two_typos: two_typo_words }; - self.one_typo = Lazy::Init(one_typo); - self.two_typo = Lazy::Init(two_typo); + self_mut.one_typo = Lazy::Init(one_typo); + self_mut.two_typo = Lazy::Init(two_typo); Ok(()) } @@ -737,38 +682,37 @@ impl QueryTerm { /// /// Return `None` if the original word cannot be split. fn split_best_frequency( - index: &Index, - txn: &RoTxn, + ctx: &mut SearchContext, original: &str, -) -> Result> { +) -> Result, Interned)>> { let chars = original.char_indices().skip(1); let mut best = None; for (i, _) in chars { let (left, right) = original.split_at(i); + let left = ctx.word_interner.insert(left.to_owned()); + let right = ctx.word_interner.insert(right.to_owned()); - let key = (1, left, right); - let frequency = index - .word_pair_proximity_docids - .remap_data_type::() - .get(txn, &key)? - .unwrap_or(0); - - if frequency != 0 && best.map_or(true, |(old, _, _)| frequency > old) { - best = Some((frequency, left, right)); + if let Some(docid_bytes) = ctx.get_db_word_pair_proximity_docids(left, right, 1)? 
{ + let frequency = + CboRoaringBitmapLenCodec::bytes_decode(docid_bytes).ok_or(heed::Error::Decoding)?; + if best.map_or(true, |(old, _, _)| frequency > old) { + best = Some((frequency, left, right)); + } } } - Ok(best.map(|(_, left, right)| (left.to_owned(), right.to_owned()))) + Ok(best.map(|(_, left, right)| (left, right))) } -impl QueryTerm { +impl Interned { /// Return the original word from the given query term - pub fn original_single_word(&self) -> Option> { - if self.is_multiple_words { + pub fn original_single_word(self, ctx: &SearchContext) -> Option> { + let self_ = ctx.term_interner.get(self); + if self_.ngram_words.is_some() { None } else { - Some(self.original) + Some(self_.original) } } } @@ -824,11 +768,25 @@ impl PhraseBuilder { return None; } Some(LocatedQueryTerm { - value: ctx.term_interner.push(QueryTerm::phrase( - &mut ctx.word_interner, - &mut ctx.phrase_interner, - Phrase { words: self.words }, - )), + value: ctx.term_interner.push({ + let phrase = Phrase { words: self.words }; + let phrase_desc = phrase.description(&ctx.word_interner); + QueryTerm { + original: ctx.word_interner.insert(phrase_desc), + ngram_words: None, + max_nbr_typos: 0, + is_prefix: false, + zero_typo: ZeroTypoTerm { + phrase: Some(ctx.phrase_interner.insert(phrase)), + zero_typo: None, + prefix_of: BTreeSet::default(), + synonyms: BTreeSet::default(), + use_prefix_db: None, + }, + one_typo: Lazy::Uninit, + two_typo: Lazy::Uninit, + } + }), positions: self.start..=self.end, }) } @@ -1001,7 +959,7 @@ pub fn make_ngram( } let mut words_interned = vec![]; for term in terms { - if let Some(original_term_word) = ctx.term_interner.get(term.value).original_single_word() { + if let Some(original_term_word) = term.value.original_single_word(ctx) { words_interned.push(original_term_word); } else { return Ok(None); @@ -1036,7 +994,7 @@ pub fn make_ngram( let term = QueryTerm { original: ngram_str_interned, - is_multiple_words: true, + ngram_words: Some(words_interned), is_prefix, max_nbr_typos, zero_typo: term.zero_typo, From faceb661e301c4dbc0590c3b5c26e3c9870f90f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 4 Apr 2023 15:02:01 +0200 Subject: [PATCH 03/21] Add note that a part of the code needs fixing --- milli/src/search/new/query_graph.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index cba9e590f..1eede33c2 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -431,6 +431,9 @@ impl QueryGraph { let (start_term, dest_term) = node; let end_term = Interned::from_raw(dest_term.into_raw()); let src = if let Some(start_term) = start_term { + // TODO: this is incorrect! should take the intersection + // between the prev node and the start term if they refer to the same + // original query term! 
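+            // (Sketch of the intended fix, not part of this patch: when the node at
+            // `prev_node` and the start term wrap subsets of the same original
+            // query term, the two `QueryTermSubset`s should be combined via
+            // `QueryTermSubset::intersect` instead of linking the nodes as-is.)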
let start_term = Interned::from_raw(start_term.into_raw()); nodes.get_mut(prev_node).successors.insert(start_term); nodes.get_mut(start_term).predecessors.insert(prev_node); From b439d36807f663d29953a1fe7dfc6488524a136b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 4 Apr 2023 15:38:30 +0200 Subject: [PATCH 04/21] Split query_term module into multiple submodules --- milli/src/search/new/logger/detailed.rs | 89 +- milli/src/search/new/logger/mod.rs | 2 + milli/src/search/new/mod.rs | 3 + milli/src/search/new/query_graph.rs | 26 +- milli/src/search/new/query_term.rs | 1008 ----------------- .../new/query_term/compute_derivations.rs | 380 +++++++ milli/src/search/new/query_term/mod.rs | 331 ++++++ .../src/search/new/query_term/ntypo_subset.rs | 80 ++ .../src/search/new/query_term/parse_query.rs | 281 +++++ milli/src/search/new/query_term/phrase.rs | 16 + .../new/ranking_rule_graph/proximity/mod.rs | 4 +- .../search/new/ranking_rule_graph/typo/mod.rs | 24 +- 12 files changed, 1122 insertions(+), 1122 deletions(-) delete mode 100644 milli/src/search/new/query_term.rs create mode 100644 milli/src/search/new/query_term/compute_derivations.rs create mode 100644 milli/src/search/new/query_term/mod.rs create mode 100644 milli/src/search/new/query_term/ntypo_subset.rs create mode 100644 milli/src/search/new/query_term/parse_query.rs create mode 100644 milli/src/search/new/query_term/phrase.rs diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index 3a02950a8..86568d5d2 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -8,9 +8,7 @@ use roaring::RoaringBitmap; use crate::search::new::interner::{Interned, MappedInterner}; use crate::search::new::query_graph::QueryNodeData; -use crate::search::new::query_term::{ - Lazy, LocatedQueryTermSubset, OneTypoTerm, QueryTerm, TwoTypoTerm, ZeroTypoTerm, -}; +use crate::search::new::query_term::LocatedQueryTermSubset; use crate::search::new::ranking_rule_graph::{ DeadEndsCache, Edge, ProximityCondition, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, TypoCondition, TypoGraph, @@ -439,87 +437,26 @@ results.{cur_ranking_rule}{cur_activated_id} {{ positions: _, term_ids: _, }) => { - let QueryTerm { - original, - is_multiple_words: _, - is_prefix: _, - max_nbr_typos, - zero_typo, - one_typo, - two_typo, - } = ctx.term_interner.get(term_subset.original); - - let original = ctx.word_interner.get(*original); writeln!( file, - "{node_idx} : \"{original}\" {{ + "{node_idx} : \"{}\" {{ shape: class - max_nbr_typo: {max_nbr_typos}" + max_nbr_typo: {}", + term_subset.description(ctx), + term_subset.max_nbr_typos(ctx) ) .unwrap(); - let ZeroTypoTerm { phrase, zero_typo, prefix_of, synonyms, use_prefix_db } = - zero_typo; - - for w in zero_typo.iter().copied() { - if term_subset.zero_typo_subset.contains_word(w) { - let w = ctx.word_interner.get(w); - writeln!(file, "\"{w}\" : 0").unwrap(); - } + for w in term_subset.all_single_words_except_prefix_db(ctx).unwrap() { + let w = ctx.word_interner.get(w); + writeln!(file, "{w}: word").unwrap(); } - for w in prefix_of.iter().copied() { - if term_subset.zero_typo_subset.contains_word(w) { - let w = ctx.word_interner.get(w); - writeln!(file, "\"{w}\" : 0P").unwrap(); - } + for p in term_subset.all_phrases(ctx).unwrap() { + writeln!(file, "{}: phrase", p.description(ctx)).unwrap(); } - - if let Some(phrase) = phrase { - if term_subset.zero_typo_subset.contains_phrase(*phrase) { - let phrase = 
ctx.phrase_interner.get(*phrase); - let phrase_str = phrase.description(&ctx.word_interner); - writeln!(file, "\"{phrase_str}\" : phrase").unwrap(); - } - } - - for synonym in synonyms.iter().copied() { - if term_subset.zero_typo_subset.contains_phrase(synonym) { - let phrase = ctx.phrase_interner.get(synonym); - let phrase_str = phrase.description(&ctx.word_interner); - writeln!(file, "\"{phrase_str}\" : synonym").unwrap(); - } - } - if let Some(use_prefix_db) = use_prefix_db { - if term_subset.zero_typo_subset.contains_word(*use_prefix_db) { - let p = ctx.word_interner.get(*use_prefix_db); - writeln!(file, "use prefix DB : {p}").unwrap(); - } - } - if let Lazy::Init(one_typo) = one_typo { - let OneTypoTerm { split_words, one_typo } = one_typo; - - for w in one_typo.iter().copied() { - if term_subset.one_typo_subset.contains_word(w) { - let w = ctx.word_interner.get(w); - writeln!(file, "\"{w}\" : 1").unwrap(); - } - } - if let Some(split_words) = split_words { - if term_subset.one_typo_subset.contains_phrase(*split_words) { - let phrase = ctx.phrase_interner.get(*split_words); - let phrase_str = phrase.description(&ctx.word_interner); - writeln!(file, "\"{phrase_str}\" : split_words").unwrap(); - } - } - } - if let Lazy::Init(two_typo) = two_typo { - let TwoTypoTerm { two_typos } = two_typo; - for w in two_typos.iter().copied() { - if term_subset.two_typo_subset.contains_word(w) { - let w = ctx.word_interner.get(w); - writeln!(file, "\"{w}\" : 2").unwrap(); - } - } + if let Some(w) = term_subset.use_prefix_db(ctx) { + let w = ctx.word_interner.get(w); + writeln!(file, "{w}: prefix db").unwrap(); } writeln!(file, "}}").unwrap(); diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs index 889e811ad..15cb78784 100644 --- a/milli/src/search/new/logger/mod.rs +++ b/milli/src/search/new/logger/mod.rs @@ -1,6 +1,8 @@ // #[cfg(test)] pub mod detailed; +pub mod test_logger; + use roaring::RoaringBitmap; use super::interner::{Interned, MappedInterner}; diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 4d561d25b..4456d693d 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -17,6 +17,9 @@ mod sort; // TODO: documentation + comments mod words; +#[cfg(test)] +mod tests; + use std::collections::HashSet; use charabia::TokenizerBuilder; diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index 1eede33c2..33e178494 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -1,7 +1,6 @@ use super::interner::{FixedSizeInterner, Interned}; use super::query_term::{ - self, number_of_typos_allowed, LocatedQueryTerm, LocatedQueryTermSubset, NTypoTermSubset, - QueryTermSubset, + self, number_of_typos_allowed, LocatedQueryTerm, LocatedQueryTermSubset, QueryTermSubset, }; use super::small_bitmap::SmallBitmap; use super::SearchContext; @@ -107,12 +106,7 @@ impl QueryGraph { let new_node_idx = add_node( &mut nodes_data, QueryNodeData::Term(LocatedQueryTermSubset { - term_subset: QueryTermSubset { - original: Interned::from_raw(term_idx as u16), - zero_typo_subset: NTypoTermSubset::All, - one_typo_subset: NTypoTermSubset::All, - two_typo_subset: NTypoTermSubset::All, - }, + term_subset: QueryTermSubset::full(Interned::from_raw(term_idx as u16)), positions: terms[term_idx].positions.clone(), term_ids: term_idx as u8..=term_idx as u8, }), @@ -126,12 +120,7 @@ impl QueryGraph { let ngram_idx = add_node( &mut nodes_data, QueryNodeData::Term(LocatedQueryTermSubset { - 
term_subset: QueryTermSubset { - original: ngram.value, - zero_typo_subset: NTypoTermSubset::All, - one_typo_subset: NTypoTermSubset::All, - two_typo_subset: NTypoTermSubset::All, - }, + term_subset: QueryTermSubset::full(ngram.value), positions: ngram.positions, term_ids: term_idx as u8 - 1..=term_idx as u8, }), @@ -146,12 +135,7 @@ impl QueryGraph { let ngram_idx = add_node( &mut nodes_data, QueryNodeData::Term(LocatedQueryTermSubset { - term_subset: QueryTermSubset { - original: ngram.value, - zero_typo_subset: NTypoTermSubset::All, - one_typo_subset: NTypoTermSubset::All, - two_typo_subset: NTypoTermSubset::All, - }, + term_subset: QueryTermSubset::full(ngram.value), positions: ngram.positions, term_ids: term_idx as u8 - 2..=term_idx as u8, }), @@ -329,7 +313,7 @@ impl QueryGraph { let mut at_least_one_phrase = false; for (node_id, node) in self.nodes.iter() { let QueryNodeData::Term(t) = &node.data else { continue }; - if ctx.term_interner.get(t.term_subset.original).zero_typo.phrase.is_some() { + if t.term_subset.original_phrase(ctx).is_some() { at_least_one_phrase = true; continue; } diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs deleted file mode 100644 index 15e106e06..000000000 --- a/milli/src/search/new/query_term.rs +++ /dev/null @@ -1,1008 +0,0 @@ -use std::borrow::Cow; -use std::collections::BTreeSet; -use std::ops::{ControlFlow, RangeInclusive}; - -use charabia::normalizer::NormalizedTokenIter; -use charabia::{SeparatorKind, TokenKind}; -use fst::automaton::Str; -use fst::{Automaton, IntoStreamer, Streamer}; -use heed::types::DecodeIgnore; -use heed::BytesDecode; -use itertools::Itertools; - -use super::interner::{DedupInterner, Interned}; -use super::{limits, SearchContext}; -use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union}; -use crate::search::{build_dfa, get_first}; -use crate::{CboRoaringBitmapLenCodec, Result, MAX_WORD_LENGTH}; - -/// A phrase in the user's search query, consisting of several words -/// that must appear side-by-side in the search results. 
-#[derive(Default, Clone, PartialEq, Eq, Hash)] -pub struct Phrase { - pub words: Vec>>, -} -impl Phrase { - pub fn description(&self, interner: &DedupInterner) -> String { - self.words.iter().flatten().map(|w| interner.get(*w)).join(" ") - } -} - -#[derive(Clone, PartialEq, Eq, Hash)] -pub enum Lazy { - Uninit, - Init(T), -} -impl Lazy { - pub fn is_init(&self) -> bool { - match self { - Lazy::Uninit => false, - Lazy::Init(_) => true, - } - } - pub fn is_uninit(&self) -> bool { - match self { - Lazy::Uninit => true, - Lazy::Init(_) => false, - } - } -} - -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub enum NTypoTermSubset { - All, - Subset { - words: BTreeSet>, - phrases: BTreeSet>, - // TODO: prefixes: BTreeSet>, - }, - Nothing, -} - -impl NTypoTermSubset { - pub fn contains_word(&self, word: Interned) -> bool { - match self { - NTypoTermSubset::All => true, - NTypoTermSubset::Subset { words, phrases: _ } => words.contains(&word), - NTypoTermSubset::Nothing => false, - } - } - pub fn contains_phrase(&self, phrase: Interned) -> bool { - match self { - NTypoTermSubset::All => true, - NTypoTermSubset::Subset { words: _, phrases } => phrases.contains(&phrase), - NTypoTermSubset::Nothing => false, - } - } - pub fn is_empty(&self) -> bool { - match self { - NTypoTermSubset::All => false, - NTypoTermSubset::Subset { words, phrases } => words.is_empty() && phrases.is_empty(), - NTypoTermSubset::Nothing => true, - } - } - pub fn union(&mut self, other: &Self) { - match self { - Self::All => {} - Self::Subset { words, phrases } => match other { - Self::All => { - *self = Self::All; - } - Self::Subset { words: w2, phrases: p2 } => { - words.extend(w2); - phrases.extend(p2); - } - Self::Nothing => {} - }, - Self::Nothing => { - *self = other.clone(); - } - } - } - pub fn intersect(&mut self, other: &Self) { - match self { - Self::All => *self = other.clone(), - Self::Subset { words, phrases } => match other { - Self::All => {} - Self::Subset { words: w2, phrases: p2 } => { - let mut ws = BTreeSet::new(); - for w in words.intersection(w2) { - ws.insert(*w); - } - let mut ps = BTreeSet::new(); - for p in phrases.intersection(p2) { - ps.insert(*p); - } - *words = ws; - *phrases = ps; - } - Self::Nothing => *self = Self::Nothing, - }, - Self::Nothing => {} - } - } -} - -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct QueryTermSubset { - pub original: Interned, - pub zero_typo_subset: NTypoTermSubset, - pub one_typo_subset: NTypoTermSubset, - pub two_typo_subset: NTypoTermSubset, -} - -#[derive(Clone, PartialEq, Eq, Hash)] -pub struct LocatedQueryTermSubset { - pub term_subset: QueryTermSubset, - pub positions: RangeInclusive, - pub term_ids: RangeInclusive, -} - -impl QueryTermSubset { - pub fn empty(for_term: Interned) -> Self { - Self { - original: for_term, - zero_typo_subset: NTypoTermSubset::Nothing, - one_typo_subset: NTypoTermSubset::Nothing, - two_typo_subset: NTypoTermSubset::Nothing, - } - } - pub fn full(for_term: Interned) -> Self { - Self { - original: for_term, - zero_typo_subset: NTypoTermSubset::All, - one_typo_subset: NTypoTermSubset::All, - two_typo_subset: NTypoTermSubset::All, - } - } - - pub fn union(&mut self, other: &Self) { - assert!(self.original == other.original); - self.zero_typo_subset.union(&other.zero_typo_subset); - self.one_typo_subset.union(&other.one_typo_subset); - self.two_typo_subset.union(&other.two_typo_subset); - } - pub fn intersect(&mut self, other: &Self) { - assert!(self.original == other.original); - 
self.zero_typo_subset.intersect(&other.zero_typo_subset); - self.one_typo_subset.intersect(&other.one_typo_subset); - self.two_typo_subset.intersect(&other.two_typo_subset); - } - - pub fn use_prefix_db(&self, ctx: &SearchContext) -> Option> { - let original = ctx.term_interner.get(self.original); - let Some(use_prefix_db) = original.zero_typo.use_prefix_db else { - return None - }; - match &self.zero_typo_subset { - NTypoTermSubset::All => Some(use_prefix_db), - NTypoTermSubset::Subset { words, phrases: _ } => { - // TODO: use a subset of prefix words instead - if words.contains(&use_prefix_db) { - Some(use_prefix_db) - } else { - None - } - } - NTypoTermSubset::Nothing => None, - } - } - pub fn all_single_words_except_prefix_db( - &self, - ctx: &mut SearchContext, - ) -> Result>> { - let mut result = BTreeSet::default(); - // TODO: a compute_partially funtion - if !self.one_typo_subset.is_empty() || !self.two_typo_subset.is_empty() { - self.original.compute_fully_if_needed(ctx)?; - } - - let original = ctx.term_interner.get_mut(self.original); - if !self.zero_typo_subset.is_empty() { - let ZeroTypoTerm { phrase: _, zero_typo, prefix_of, synonyms: _, use_prefix_db: _ } = - &original.zero_typo; - result.extend(zero_typo.iter().copied()); - result.extend(prefix_of.iter().copied()); - }; - - match &self.one_typo_subset { - NTypoTermSubset::All => { - let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else { - panic!() - }; - result.extend(one_typo.iter().copied()) - } - NTypoTermSubset::Subset { words, phrases: _ } => { - let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else { - panic!() - }; - result.extend(one_typo.intersection(words)); - } - NTypoTermSubset::Nothing => {} - }; - - match &self.two_typo_subset { - NTypoTermSubset::All => { - let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else { - panic!() - }; - result.extend(two_typos.iter().copied()); - } - NTypoTermSubset::Subset { words, phrases: _ } => { - let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else { - panic!() - }; - result.extend(two_typos.intersection(words)); - } - NTypoTermSubset::Nothing => {} - }; - - Ok(result) - } - pub fn all_phrases(&self, ctx: &mut SearchContext) -> Result>> { - let mut result = BTreeSet::default(); - - if !self.one_typo_subset.is_empty() { - // TODO: compute less than fully if possible - self.original.compute_fully_if_needed(ctx)?; - } - let original = ctx.term_interner.get_mut(self.original); - - let ZeroTypoTerm { phrase, zero_typo: _, prefix_of: _, synonyms, use_prefix_db: _ } = - &original.zero_typo; - result.extend(phrase.iter().copied()); - result.extend(synonyms.iter().copied()); - - if !self.one_typo_subset.is_empty() { - let Lazy::Init(OneTypoTerm { split_words, one_typo: _ }) = &original.one_typo else { - panic!(); - }; - result.extend(split_words.iter().copied()); - } - - Ok(result) - } -} - -impl Interned { - pub fn compute_fully_if_needed(self, ctx: &mut SearchContext) -> Result<()> { - let s = ctx.term_interner.get_mut(self); - if s.max_nbr_typos == 0 { - s.one_typo = Lazy::Init(OneTypoTerm::default()); - s.two_typo = Lazy::Init(TwoTypoTerm::default()); - } else if s.max_nbr_typos == 1 && s.one_typo.is_uninit() { - assert!(s.two_typo.is_uninit()); - self.initialize_one_typo_subterm(ctx)?; - let s = ctx.term_interner.get_mut(self); - assert!(s.one_typo.is_init()); - s.two_typo = Lazy::Init(TwoTypoTerm::default()); - } else if s.max_nbr_typos > 1 && s.two_typo.is_uninit() { - 
assert!(s.two_typo.is_uninit()); - self.initialize_one_and_two_typo_subterm(ctx)?; - let s = ctx.term_interner.get_mut(self); - assert!(s.one_typo.is_init() && s.two_typo.is_init()); - } - Ok(()) - } -} - -#[derive(Clone, PartialEq, Eq, Hash)] -pub struct QueryTerm { - pub original: Interned, - pub ngram_words: Option>>, - pub max_nbr_typos: u8, - pub is_prefix: bool, - pub zero_typo: ZeroTypoTerm, - // May not be computed yet - pub one_typo: Lazy, - // May not be computed yet - pub two_typo: Lazy, -} - -// SubTerms will be in a dedup interner -#[derive(Default, Clone, PartialEq, Eq, Hash)] -pub struct ZeroTypoTerm { - /// The original phrase, if any - pub phrase: Option>, - /// A single word equivalent to the original term, with zero typos - pub zero_typo: Option>, - /// All the words that contain the original word as prefix - pub prefix_of: BTreeSet>, - /// All the synonyms of the original word or phrase - pub synonyms: BTreeSet>, - /// A prefix in the prefix databases matching the original word - pub use_prefix_db: Option>, -} -#[derive(Default, Clone, PartialEq, Eq, Hash)] -pub struct OneTypoTerm { - /// The original word split into multiple consecutive words - pub split_words: Option>, - /// Words that are 1 typo away from the original word - pub one_typo: BTreeSet>, -} -#[derive(Default, Clone, PartialEq, Eq, Hash)] -pub struct TwoTypoTerm { - /// Words that are 2 typos away from the original word - pub two_typos: BTreeSet>, -} - -impl ZeroTypoTerm { - fn is_empty(&self) -> bool { - let ZeroTypoTerm { phrase, zero_typo, prefix_of, synonyms, use_prefix_db } = self; - phrase.is_none() - && zero_typo.is_none() - && prefix_of.is_empty() - && synonyms.is_empty() - && use_prefix_db.is_none() - } -} -impl OneTypoTerm { - fn is_empty(&self) -> bool { - let OneTypoTerm { split_words, one_typo } = self; - one_typo.is_empty() && split_words.is_none() - } -} -impl TwoTypoTerm { - fn is_empty(&self) -> bool { - let TwoTypoTerm { two_typos } = self; - two_typos.is_empty() - } -} - -impl QueryTerm { - pub fn is_empty(&self) -> bool { - let Lazy::Init(one_typo) = &self.one_typo else { - return false; - }; - let Lazy::Init(two_typo) = &self.two_typo else { - return false; - }; - - self.zero_typo.is_empty() && one_typo.is_empty() && two_typo.is_empty() - } -} - -pub enum ZeroOrOneTypo { - Zero, - One, -} - -fn find_zero_typo_prefix_derivations( - word_interned: Interned, - fst: fst::Set>, - word_interner: &mut DedupInterner, - mut visit: impl FnMut(Interned) -> Result>, -) -> Result<()> { - let word = word_interner.get(word_interned).to_owned(); - let word = word.as_str(); - let prefix = Str::new(word).starts_with(); - let mut stream = fst.search(prefix).into_stream(); - - while let Some(derived_word) = stream.next() { - let derived_word = std::str::from_utf8(derived_word)?.to_owned(); - let derived_word_interned = word_interner.insert(derived_word); - if derived_word_interned != word_interned { - let cf = visit(derived_word_interned)?; - if cf.is_break() { - break; - } - } - } - Ok(()) -} - -fn find_zero_one_typo_derivations( - ctx: &mut SearchContext, - word_interned: Interned, - is_prefix: bool, - mut visit: impl FnMut(Interned, ZeroOrOneTypo) -> Result>, -) -> Result<()> { - let fst = ctx.get_words_fst()?; - let word = ctx.word_interner.get(word_interned).to_owned(); - let word = word.as_str(); - - let dfa = build_dfa(word, 1, is_prefix); - let starts = StartsWith(Str::new(get_first(word))); - let mut stream = fst.search_with_state(Intersection(starts, &dfa)).into_stream(); - - while let 
Some((derived_word, state)) = stream.next() { - let derived_word = std::str::from_utf8(derived_word)?; - let derived_word = ctx.word_interner.insert(derived_word.to_owned()); - let d = dfa.distance(state.1); - match d.to_u8() { - 0 => { - if derived_word != word_interned { - let cf = visit(derived_word, ZeroOrOneTypo::Zero)?; - if cf.is_break() { - break; - } - } - } - 1 => { - let cf = visit(derived_word, ZeroOrOneTypo::One)?; - if cf.is_break() { - break; - } - } - _ => { - unreachable!("One typo dfa produced multiple typos") - } - } - } - Ok(()) -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub enum NumberOfTypos { - Zero, - One, - Two, -} -fn find_zero_one_two_typo_derivations( - word_interned: Interned, - is_prefix: bool, - fst: fst::Set>, - word_interner: &mut DedupInterner, - mut visit: impl FnMut(Interned, NumberOfTypos) -> Result>, -) -> Result<()> { - let word = word_interner.get(word_interned).to_owned(); - let word = word.as_str(); - - let starts = StartsWith(Str::new(get_first(word))); - let first = Intersection(build_dfa(word, 1, is_prefix), Complement(&starts)); - let second_dfa = build_dfa(word, 2, is_prefix); - let second = Intersection(&second_dfa, &starts); - let automaton = Union(first, &second); - - let mut stream = fst.search_with_state(automaton).into_stream(); - - while let Some((derived_word, state)) = stream.next() { - let derived_word = std::str::from_utf8(derived_word)?; - let derived_word_interned = word_interner.insert(derived_word.to_owned()); - // in the case the typo is on the first letter, we know the number of typo - // is two - if get_first(derived_word) != get_first(word) { - let cf = visit(derived_word_interned, NumberOfTypos::Two)?; - if cf.is_break() { - break; - } - } else { - // Else, we know that it is the second dfa that matched and compute the - // correct distance - let d = second_dfa.distance((state.1).0); - match d.to_u8() { - 0 => { - if derived_word_interned != word_interned { - let cf = visit(derived_word_interned, NumberOfTypos::Zero)?; - if cf.is_break() { - break; - } - } - } - 1 => { - let cf = visit(derived_word_interned, NumberOfTypos::One)?; - if cf.is_break() { - break; - } - } - 2 => { - let cf = visit(derived_word_interned, NumberOfTypos::Two)?; - if cf.is_break() { - break; - } - } - _ => unreachable!("2 typos DFA produced a distance greater than 2"), - } - } - } - Ok(()) -} - -fn partially_initialized_term_from_word( - ctx: &mut SearchContext, - word: &str, - max_typo: u8, - is_prefix: bool, -) -> Result { - let word_interned = ctx.word_interner.insert(word.to_owned()); - - if word.len() > MAX_WORD_LENGTH { - return Ok({ - QueryTerm { - original: ctx.word_interner.insert(word.to_owned()), - ngram_words: None, - is_prefix: false, - max_nbr_typos: 0, - zero_typo: <_>::default(), - one_typo: Lazy::Init(<_>::default()), - two_typo: Lazy::Init(<_>::default()), - } - }); - } - - let fst = ctx.index.words_fst(ctx.txn)?; - - let use_prefix_db = is_prefix - && ctx - .index - .word_prefix_docids - .remap_data_type::() - .get(ctx.txn, word)? 
- .is_some(); - let use_prefix_db = if use_prefix_db { Some(word_interned) } else { None }; - - let mut zero_typo = None; - let mut prefix_of = BTreeSet::new(); - - if fst.contains(word) { - zero_typo = Some(word_interned); - } - - if is_prefix && use_prefix_db.is_none() { - find_zero_typo_prefix_derivations( - word_interned, - fst, - &mut ctx.word_interner, - |derived_word| { - if prefix_of.len() < limits::MAX_PREFIX_COUNT { - prefix_of.insert(derived_word); - Ok(ControlFlow::Continue(())) - } else { - Ok(ControlFlow::Break(())) - } - }, - )?; - } - let synonyms = ctx.index.synonyms(ctx.txn)?; - let mut synonym_word_count = 0; - let synonyms = synonyms - .get(&vec![word.to_owned()]) - .cloned() - .unwrap_or_default() - .into_iter() - .take(limits::MAX_SYNONYM_PHRASE_COUNT) - .filter_map(|words| { - if synonym_word_count + words.len() > limits::MAX_SYNONYM_WORD_COUNT { - return None; - } - synonym_word_count += words.len(); - let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect(); - Some(ctx.phrase_interner.insert(Phrase { words })) - }) - .collect(); - let zero_typo = ZeroTypoTerm { phrase: None, zero_typo, prefix_of, synonyms, use_prefix_db }; - - Ok(QueryTerm { - original: word_interned, - ngram_words: None, - max_nbr_typos: max_typo, - is_prefix, - zero_typo, - one_typo: Lazy::Uninit, - two_typo: Lazy::Uninit, - }) -} - -fn find_split_words(ctx: &mut SearchContext, word: &str) -> Result>> { - if let Some((l, r)) = split_best_frequency(ctx, word)? { - Ok(Some(ctx.phrase_interner.insert(Phrase { words: vec![Some(l), Some(r)] }))) - } else { - Ok(None) - } -} - -impl Interned { - fn initialize_one_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> { - let self_mut = ctx.term_interner.get_mut(self); - let QueryTerm { original, is_prefix, one_typo, .. } = self_mut; - let original = *original; - let is_prefix = *is_prefix; - // let original_str = ctx.word_interner.get(*original).to_owned(); - if one_typo.is_init() { - return Ok(()); - } - let mut one_typo_words = BTreeSet::new(); - - find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| { - match nbr_typos { - ZeroOrOneTypo::Zero => {} - ZeroOrOneTypo::One => { - if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { - one_typo_words.insert(derived_word); - } else { - return Ok(ControlFlow::Break(())); - } - } - } - Ok(ControlFlow::Continue(())) - })?; - let original_str = ctx.word_interner.get(original).to_owned(); - let split_words = find_split_words(ctx, original_str.as_str())?; - let one_typo = OneTypoTerm { split_words, one_typo: one_typo_words }; - - let self_mut = ctx.term_interner.get_mut(self); - self_mut.one_typo = Lazy::Init(one_typo); - - Ok(()) - } - fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> { - let self_mut = ctx.term_interner.get_mut(self); - let QueryTerm { original, is_prefix, two_typo, .. } = self_mut; - let original_str = ctx.word_interner.get(*original).to_owned(); - if two_typo.is_init() { - return Ok(()); - } - let mut one_typo_words = BTreeSet::new(); - let mut two_typo_words = BTreeSet::new(); - - find_zero_one_two_typo_derivations( - *original, - *is_prefix, - ctx.index.words_fst(ctx.txn)?, - &mut ctx.word_interner, - |derived_word, nbr_typos| { - if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT - && two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT - { - // No chance we will add either one- or two-typo derivations anymore, stop iterating. 
- return Ok(ControlFlow::Break(())); - } - match nbr_typos { - NumberOfTypos::Zero => {} - NumberOfTypos::One => { - if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { - one_typo_words.insert(derived_word); - } - } - NumberOfTypos::Two => { - if two_typo_words.len() < limits::MAX_TWO_TYPOS_COUNT { - two_typo_words.insert(derived_word); - } - } - } - Ok(ControlFlow::Continue(())) - }, - )?; - let split_words = find_split_words(ctx, original_str.as_str())?; - let self_mut = ctx.term_interner.get_mut(self); - - let one_typo = OneTypoTerm { one_typo: one_typo_words, split_words }; - - let two_typo = TwoTypoTerm { two_typos: two_typo_words }; - - self_mut.one_typo = Lazy::Init(one_typo); - self_mut.two_typo = Lazy::Init(two_typo); - - Ok(()) - } -} - -/// Split the original word into the two words that appear the -/// most next to each other in the index. -/// -/// Return `None` if the original word cannot be split. -fn split_best_frequency( - ctx: &mut SearchContext, - original: &str, -) -> Result, Interned)>> { - let chars = original.char_indices().skip(1); - let mut best = None; - - for (i, _) in chars { - let (left, right) = original.split_at(i); - let left = ctx.word_interner.insert(left.to_owned()); - let right = ctx.word_interner.insert(right.to_owned()); - - if let Some(docid_bytes) = ctx.get_db_word_pair_proximity_docids(left, right, 1)? { - let frequency = - CboRoaringBitmapLenCodec::bytes_decode(docid_bytes).ok_or(heed::Error::Decoding)?; - if best.map_or(true, |(old, _, _)| frequency > old) { - best = Some((frequency, left, right)); - } - } - } - - Ok(best.map(|(_, left, right)| (left, right))) -} - -impl Interned { - /// Return the original word from the given query term - pub fn original_single_word(self, ctx: &SearchContext) -> Option> { - let self_ = ctx.term_interner.get(self); - if self_.ngram_words.is_some() { - None - } else { - Some(self_.original) - } - } -} - -/// A query term term coupled with its position in the user's search query. 
-#[derive(Clone)] -pub struct LocatedQueryTerm { - pub value: Interned, - pub positions: RangeInclusive, -} - -impl LocatedQueryTerm { - /// Return `true` iff the term is empty - pub fn is_empty(&self, interner: &DedupInterner) -> bool { - interner.get(self.value).is_empty() - } -} - -struct PhraseBuilder { - words: Vec>>, - start: u16, - end: u16, -} - -impl PhraseBuilder { - fn empty() -> Self { - Self { words: Default::default(), start: u16::MAX, end: u16::MAX } - } - - fn is_empty(&self) -> bool { - self.words.is_empty() - } - - // precondition: token has kind Word or StopWord - fn push_word(&mut self, ctx: &mut SearchContext, token: &charabia::Token, position: u16) { - if self.is_empty() { - self.start = position; - } - self.end = position; - if let TokenKind::StopWord = token.kind { - self.words.push(None); - } else { - // token has kind Word - let word = ctx.word_interner.insert(token.lemma().to_string()); - // TODO: in a phrase, check that every word exists - // otherwise return an empty term - self.words.push(Some(word)); - } - } - - fn build(self, ctx: &mut SearchContext) -> Option { - if self.is_empty() { - return None; - } - Some(LocatedQueryTerm { - value: ctx.term_interner.push({ - let phrase = Phrase { words: self.words }; - let phrase_desc = phrase.description(&ctx.word_interner); - QueryTerm { - original: ctx.word_interner.insert(phrase_desc), - ngram_words: None, - max_nbr_typos: 0, - is_prefix: false, - zero_typo: ZeroTypoTerm { - phrase: Some(ctx.phrase_interner.insert(phrase)), - zero_typo: None, - prefix_of: BTreeSet::default(), - synonyms: BTreeSet::default(), - use_prefix_db: None, - }, - one_typo: Lazy::Uninit, - two_typo: Lazy::Uninit, - } - }), - positions: self.start..=self.end, - }) - } -} - -/// Convert the tokenised search query into a list of located query terms. -// TODO: checking if the positions are correct for phrases, separators, ngrams -pub fn located_query_terms_from_string( - ctx: &mut SearchContext, - query: NormalizedTokenIter<&[u8]>, - words_limit: Option, -) -> Result> { - let nbr_typos = number_of_typos_allowed(ctx)?; - - let mut located_terms = Vec::new(); - - let mut phrase: Option = None; - - let parts_limit = words_limit.unwrap_or(usize::MAX); - - // start with the last position as we will wrap around to position 0 at the beginning of the loop below. - let mut position = u16::MAX; - - let mut peekable = query.take(super::limits::MAX_TOKEN_COUNT).peekable(); - while let Some(token) = peekable.next() { - // early return if word limit is exceeded - if located_terms.len() >= parts_limit { - return Ok(located_terms); - } - - match token.kind { - TokenKind::Word | TokenKind::StopWord => { - // On first loop, goes from u16::MAX to 0, then normal increment. - position = position.wrapping_add(1); - - // 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote, - // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word, - // 3. if the word is the last token of the query we push it as a prefix word. 
- if let Some(phrase) = &mut phrase { - phrase.push_word(ctx, &token, position) - } else if peekable.peek().is_some() { - match token.kind { - TokenKind::Word => { - let word = token.lemma(); - let term = partially_initialized_term_from_word( - ctx, - word, - nbr_typos(word), - false, - )?; - let located_term = LocatedQueryTerm { - value: ctx.term_interner.push(term), - positions: position..=position, - }; - located_terms.push(located_term); - } - TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => {} - } - } else { - let word = token.lemma(); - let term = - partially_initialized_term_from_word(ctx, word, nbr_typos(word), true)?; - let located_term = LocatedQueryTerm { - value: ctx.term_interner.push(term), - positions: position..=position, - }; - located_terms.push(located_term); - } - } - TokenKind::Separator(separator_kind) => { - match separator_kind { - SeparatorKind::Hard => { - position += 1; - } - SeparatorKind::Soft => { - position += 0; - } - } - - phrase = 'phrase: { - let phrase = phrase.take(); - - // If we have a hard separator inside a phrase, we immediately start a new phrase - let phrase = if separator_kind == SeparatorKind::Hard { - if let Some(phrase) = phrase { - if let Some(located_query_term) = phrase.build(ctx) { - located_terms.push(located_query_term) - } - Some(PhraseBuilder::empty()) - } else { - None - } - } else { - phrase - }; - - // We close and start a new phrase depending on the number of double quotes - let mut quote_count = token.lemma().chars().filter(|&s| s == '"').count(); - if quote_count == 0 { - break 'phrase phrase; - } - - // Consume the closing quote and the phrase - if let Some(phrase) = phrase { - // Per the check above, quote_count > 0 - quote_count -= 1; - if let Some(located_query_term) = phrase.build(ctx) { - located_terms.push(located_query_term) - } - } - - // Start new phrase if the token ends with an opening quote - (quote_count % 2 == 1).then_some(PhraseBuilder::empty()) - }; - } - _ => (), - } - } - - // If a quote is never closed, we consider all of the end of the query as a phrase. - if let Some(phrase) = phrase.take() { - if let Some(located_query_term) = phrase.build(ctx) { - located_terms.push(located_query_term); - } - } - - Ok(located_terms) -} - -pub fn number_of_typos_allowed<'ctx>( - ctx: &SearchContext<'ctx>, -) -> Result u8 + 'ctx> { - let authorize_typos = ctx.index.authorize_typos(ctx.txn)?; - let min_len_one_typo = ctx.index.min_word_len_one_typo(ctx.txn)?; - let min_len_two_typos = ctx.index.min_word_len_two_typos(ctx.txn)?; - - // TODO: should `exact_words` also disable prefix search, ngrams, split words, or synonyms? 
- let exact_words = ctx.index.exact_words(ctx.txn)?; - - Ok(Box::new(move |word: &str| { - if !authorize_typos - || word.len() < min_len_one_typo as usize - || exact_words.as_ref().map_or(false, |fst| fst.contains(word)) - { - 0 - } else if word.len() < min_len_two_typos as usize { - 1 - } else { - 2 - } - })) -} - -pub fn make_ngram( - ctx: &mut SearchContext, - terms: &[LocatedQueryTerm], - number_of_typos_allowed: &impl Fn(&str) -> u8, -) -> Result> { - assert!(!terms.is_empty()); - for t in terms { - if ctx.term_interner.get(t.value).zero_typo.phrase.is_some() { - return Ok(None); - } - } - for ts in terms.windows(2) { - let [t1, t2] = ts else { panic!() }; - if *t1.positions.end() != t2.positions.start() - 1 { - return Ok(None); - } - } - let mut words_interned = vec![]; - for term in terms { - if let Some(original_term_word) = term.value.original_single_word(ctx) { - words_interned.push(original_term_word); - } else { - return Ok(None); - } - } - let words = - words_interned.iter().map(|&i| ctx.word_interner.get(i).to_owned()).collect::>(); - - let start = *terms.first().as_ref().unwrap().positions.start(); - let end = *terms.last().as_ref().unwrap().positions.end(); - let is_prefix = ctx.term_interner.get(terms.last().as_ref().unwrap().value).is_prefix; - let ngram_str = words.join(""); - if ngram_str.len() > MAX_WORD_LENGTH { - return Ok(None); - } - let ngram_str_interned = ctx.word_interner.insert(ngram_str.clone()); - - let max_nbr_typos = - number_of_typos_allowed(ngram_str.as_str()).saturating_sub(terms.len() as u8 - 1); - - let mut term = partially_initialized_term_from_word(ctx, &ngram_str, max_nbr_typos, is_prefix)?; - - // Now add the synonyms - let index_synonyms = ctx.index.synonyms(ctx.txn)?; - - term.zero_typo.synonyms.extend( - index_synonyms.get(&words).cloned().unwrap_or_default().into_iter().map(|words| { - let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect(); - ctx.phrase_interner.insert(Phrase { words }) - }), - ); - - let term = QueryTerm { - original: ngram_str_interned, - ngram_words: Some(words_interned), - is_prefix, - max_nbr_typos, - zero_typo: term.zero_typo, - one_typo: Lazy::Uninit, - two_typo: Lazy::Uninit, - }; - - let term = LocatedQueryTerm { value: ctx.term_interner.push(term), positions: start..=end }; - - Ok(Some(term)) -} diff --git a/milli/src/search/new/query_term/compute_derivations.rs b/milli/src/search/new/query_term/compute_derivations.rs new file mode 100644 index 000000000..f95956fbf --- /dev/null +++ b/milli/src/search/new/query_term/compute_derivations.rs @@ -0,0 +1,380 @@ +use fst::automaton::Str; +use fst::{Automaton, IntoStreamer, Streamer}; +use heed::types::DecodeIgnore; +use heed::BytesDecode; +use std::borrow::Cow; +use std::collections::BTreeSet; +use std::ops::ControlFlow; + +use super::*; +use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union}; +use crate::search::new::query_term::TwoTypoTerm; +use crate::search::new::{limits, SearchContext}; +use crate::search::{build_dfa, get_first}; +use crate::{CboRoaringBitmapLenCodec, Result, MAX_WORD_LENGTH}; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum NumberOfTypos { + Zero, + One, + Two, +} + +pub enum ZeroOrOneTypo { + Zero, + One, +} + +impl Interned { + pub fn compute_fully_if_needed(self, ctx: &mut SearchContext) -> Result<()> { + let s = ctx.term_interner.get_mut(self); + if s.max_nbr_typos == 0 { + s.one_typo = Lazy::Init(OneTypoTerm::default()); + s.two_typo = Lazy::Init(TwoTypoTerm::default()); + } else if 
s.max_nbr_typos == 1 && s.one_typo.is_uninit() { + assert!(s.two_typo.is_uninit()); + self.initialize_one_typo_subterm(ctx)?; + let s = ctx.term_interner.get_mut(self); + assert!(s.one_typo.is_init()); + s.two_typo = Lazy::Init(TwoTypoTerm::default()); + } else if s.max_nbr_typos > 1 && s.two_typo.is_uninit() { + assert!(s.two_typo.is_uninit()); + self.initialize_one_and_two_typo_subterm(ctx)?; + let s = ctx.term_interner.get_mut(self); + assert!(s.one_typo.is_init() && s.two_typo.is_init()); + } + Ok(()) + } +} + +fn find_zero_typo_prefix_derivations( + word_interned: Interned, + fst: fst::Set>, + word_interner: &mut DedupInterner, + mut visit: impl FnMut(Interned) -> Result>, +) -> Result<()> { + let word = word_interner.get(word_interned).to_owned(); + let word = word.as_str(); + let prefix = Str::new(word).starts_with(); + let mut stream = fst.search(prefix).into_stream(); + + while let Some(derived_word) = stream.next() { + let derived_word = std::str::from_utf8(derived_word)?.to_owned(); + let derived_word_interned = word_interner.insert(derived_word); + if derived_word_interned != word_interned { + let cf = visit(derived_word_interned)?; + if cf.is_break() { + break; + } + } + } + Ok(()) +} + +fn find_zero_one_typo_derivations( + ctx: &mut SearchContext, + word_interned: Interned, + is_prefix: bool, + mut visit: impl FnMut(Interned, ZeroOrOneTypo) -> Result>, +) -> Result<()> { + let fst = ctx.get_words_fst()?; + let word = ctx.word_interner.get(word_interned).to_owned(); + let word = word.as_str(); + + let dfa = build_dfa(word, 1, is_prefix); + let starts = StartsWith(Str::new(get_first(word))); + let mut stream = fst.search_with_state(Intersection(starts, &dfa)).into_stream(); + + while let Some((derived_word, state)) = stream.next() { + let derived_word = std::str::from_utf8(derived_word)?; + let derived_word = ctx.word_interner.insert(derived_word.to_owned()); + let d = dfa.distance(state.1); + match d.to_u8() { + 0 => { + if derived_word != word_interned { + let cf = visit(derived_word, ZeroOrOneTypo::Zero)?; + if cf.is_break() { + break; + } + } + } + 1 => { + let cf = visit(derived_word, ZeroOrOneTypo::One)?; + if cf.is_break() { + break; + } + } + _ => { + unreachable!("One typo dfa produced multiple typos") + } + } + } + Ok(()) +} + +fn find_zero_one_two_typo_derivations( + word_interned: Interned, + is_prefix: bool, + fst: fst::Set>, + word_interner: &mut DedupInterner, + mut visit: impl FnMut(Interned, NumberOfTypos) -> Result>, +) -> Result<()> { + let word = word_interner.get(word_interned).to_owned(); + let word = word.as_str(); + + let starts = StartsWith(Str::new(get_first(word))); + let first = Intersection(build_dfa(word, 1, is_prefix), Complement(&starts)); + let second_dfa = build_dfa(word, 2, is_prefix); + let second = Intersection(&second_dfa, &starts); + let automaton = Union(first, &second); + + let mut stream = fst.search_with_state(automaton).into_stream(); + + while let Some((derived_word, state)) = stream.next() { + let derived_word = std::str::from_utf8(derived_word)?; + let derived_word_interned = word_interner.insert(derived_word.to_owned()); + // in the case the typo is on the first letter, we know the number of typo + // is two + if get_first(derived_word) != get_first(word) { + let cf = visit(derived_word_interned, NumberOfTypos::Two)?; + if cf.is_break() { + break; + } + } else { + // Else, we know that it is the second dfa that matched and compute the + // correct distance + let d = second_dfa.distance((state.1).0); + match d.to_u8() { + 0 => { + 
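+                        // Distance 0 means the candidate matched with zero typos;
+                        // only report it when it is not the original word itself.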
+                    if derived_word_interned != word_interned {
+                        let cf = visit(derived_word_interned, NumberOfTypos::Zero)?;
+                        if cf.is_break() {
+                            break;
+                        }
+                    }
+                }
+                1 => {
+                    let cf = visit(derived_word_interned, NumberOfTypos::One)?;
+                    if cf.is_break() {
+                        break;
+                    }
+                }
+                2 => {
+                    let cf = visit(derived_word_interned, NumberOfTypos::Two)?;
+                    if cf.is_break() {
+                        break;
+                    }
+                }
+                _ => unreachable!("2 typos DFA produced a distance greater than 2"),
+            }
+        }
+    }
+    Ok(())
+}
+
+pub fn partially_initialized_term_from_word(
+    ctx: &mut SearchContext,
+    word: &str,
+    max_typo: u8,
+    is_prefix: bool,
+) -> Result<QueryTerm> {
+    let word_interned = ctx.word_interner.insert(word.to_owned());
+
+    if word.len() > MAX_WORD_LENGTH {
+        return Ok({
+            QueryTerm {
+                original: ctx.word_interner.insert(word.to_owned()),
+                ngram_words: None,
+                is_prefix: false,
+                max_nbr_typos: 0,
+                zero_typo: <_>::default(),
+                one_typo: Lazy::Init(<_>::default()),
+                two_typo: Lazy::Init(<_>::default()),
+            }
+        });
+    }
+
+    let fst = ctx.index.words_fst(ctx.txn)?;
+
+    let use_prefix_db = is_prefix
+        && ctx
+            .index
+            .word_prefix_docids
+            .remap_data_type::<DecodeIgnore>()
+            .get(ctx.txn, word)?
+            .is_some();
+    let use_prefix_db = if use_prefix_db { Some(word_interned) } else { None };
+
+    let mut zero_typo = None;
+    let mut prefix_of = BTreeSet::new();
+
+    if fst.contains(word) {
+        zero_typo = Some(word_interned);
+    }
+
+    if is_prefix && use_prefix_db.is_none() {
+        find_zero_typo_prefix_derivations(
+            word_interned,
+            fst,
+            &mut ctx.word_interner,
+            |derived_word| {
+                if prefix_of.len() < limits::MAX_PREFIX_COUNT {
+                    prefix_of.insert(derived_word);
+                    Ok(ControlFlow::Continue(()))
+                } else {
+                    Ok(ControlFlow::Break(()))
+                }
+            },
+        )?;
+    }
+    let synonyms = ctx.index.synonyms(ctx.txn)?;
+    let mut synonym_word_count = 0;
+    let synonyms = synonyms
+        .get(&vec![word.to_owned()])
+        .cloned()
+        .unwrap_or_default()
+        .into_iter()
+        .take(limits::MAX_SYNONYM_PHRASE_COUNT)
+        .filter_map(|words| {
+            if synonym_word_count + words.len() > limits::MAX_SYNONYM_WORD_COUNT {
+                return None;
+            }
+            synonym_word_count += words.len();
+            let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect();
+            Some(ctx.phrase_interner.insert(Phrase { words }))
+        })
+        .collect();
+    let zero_typo = ZeroTypoTerm { phrase: None, zero_typo, prefix_of, synonyms, use_prefix_db };
+
+    Ok(QueryTerm {
+        original: word_interned,
+        ngram_words: None,
+        max_nbr_typos: max_typo,
+        is_prefix,
+        zero_typo,
+        one_typo: Lazy::Uninit,
+        two_typo: Lazy::Uninit,
+    })
+}
+
+fn find_split_words(ctx: &mut SearchContext, word: &str) -> Result<Option<Interned<Phrase>>> {
+    if let Some((l, r)) = split_best_frequency(ctx, word)? {
+        Ok(Some(ctx.phrase_interner.insert(Phrase { words: vec![Some(l), Some(r)] })))
+    } else {
+        Ok(None)
+    }
+}
+
+impl Interned<QueryTerm> {
+    fn initialize_one_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> {
+        let self_mut = ctx.term_interner.get_mut(self);
+        let QueryTerm { original, is_prefix, one_typo, .. } = self_mut;
+        let original = *original;
+        let is_prefix = *is_prefix;
+        // let original_str = ctx.word_interner.get(*original).to_owned();
+        if one_typo.is_init() {
+            return Ok(());
+        }
+        let mut one_typo_words = BTreeSet::new();
+
+        find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| {
+            match nbr_typos {
+                ZeroOrOneTypo::Zero => {}
+                ZeroOrOneTypo::One => {
+                    if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
+                        one_typo_words.insert(derived_word);
+                    } else {
+                        return Ok(ControlFlow::Break(()));
+                    }
+                }
+            }
+            Ok(ControlFlow::Continue(()))
+        })?;
+        let original_str = ctx.word_interner.get(original).to_owned();
+        let split_words = find_split_words(ctx, original_str.as_str())?;
+        let one_typo = OneTypoTerm { split_words, one_typo: one_typo_words };
+
+        let self_mut = ctx.term_interner.get_mut(self);
+        self_mut.one_typo = Lazy::Init(one_typo);
+
+        Ok(())
+    }
+    fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> {
+        let self_mut = ctx.term_interner.get_mut(self);
+        let QueryTerm { original, is_prefix, two_typo, .. } = self_mut;
+        let original_str = ctx.word_interner.get(*original).to_owned();
+        if two_typo.is_init() {
+            return Ok(());
+        }
+        let mut one_typo_words = BTreeSet::new();
+        let mut two_typo_words = BTreeSet::new();
+
+        find_zero_one_two_typo_derivations(
+            *original,
+            *is_prefix,
+            ctx.index.words_fst(ctx.txn)?,
+            &mut ctx.word_interner,
+            |derived_word, nbr_typos| {
+                if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT
+                    && two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT
+                {
+                    // No chance we will add either one- or two-typo derivations anymore, stop iterating.
+                    return Ok(ControlFlow::Break(()));
+                }
+                match nbr_typos {
+                    NumberOfTypos::Zero => {}
+                    NumberOfTypos::One => {
+                        if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT {
+                            one_typo_words.insert(derived_word);
+                        }
+                    }
+                    NumberOfTypos::Two => {
+                        if two_typo_words.len() < limits::MAX_TWO_TYPOS_COUNT {
+                            two_typo_words.insert(derived_word);
+                        }
+                    }
+                }
+                Ok(ControlFlow::Continue(()))
+            },
+        )?;
+        let split_words = find_split_words(ctx, original_str.as_str())?;
+        let self_mut = ctx.term_interner.get_mut(self);
+
+        let one_typo = OneTypoTerm { one_typo: one_typo_words, split_words };
+
+        let two_typo = TwoTypoTerm { two_typos: two_typo_words };
+
+        self_mut.one_typo = Lazy::Init(one_typo);
+        self_mut.two_typo = Lazy::Init(two_typo);
+
+        Ok(())
+    }
+}
+
+/// Split the original word into the two words that appear the
+/// most next to each other in the index.
+///
+/// Return `None` if the original word cannot be split.
+fn split_best_frequency(
+    ctx: &mut SearchContext,
+    original: &str,
+) -> Result<Option<(Interned<String>, Interned<String>)>> {
+    let chars = original.char_indices().skip(1);
+    let mut best = None;
+
+    for (i, _) in chars {
+        let (left, right) = original.split_at(i);
+        let left = ctx.word_interner.insert(left.to_owned());
+        let right = ctx.word_interner.insert(right.to_owned());
+
+        if let Some(docid_bytes) = ctx.get_db_word_pair_proximity_docids(left, right, 1)?
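+        // (a pair stored at proximity 1 means the two halves appear directly
+        // next to each other in at least one document)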
+        {
+            let frequency =
+                CboRoaringBitmapLenCodec::bytes_decode(docid_bytes).ok_or(heed::Error::Decoding)?;
+            if best.map_or(true, |(old, _, _)| frequency > old) {
+                best = Some((frequency, left, right));
+            }
+        }
+    }
+
+    Ok(best.map(|(_, left, right)| (left, right)))
+}
diff --git a/milli/src/search/new/query_term/mod.rs b/milli/src/search/new/query_term/mod.rs
new file mode 100644
index 000000000..50977395b
--- /dev/null
+++ b/milli/src/search/new/query_term/mod.rs
@@ -0,0 +1,331 @@
+mod compute_derivations;
+mod ntypo_subset;
+mod parse_query;
+mod phrase;
+
+use super::interner::{DedupInterner, Interned};
+use super::{limits, SearchContext};
+use crate::Result;
+use std::collections::BTreeSet;
+use std::ops::RangeInclusive;
+
+pub use ntypo_subset::NTypoTermSubset;
+pub use parse_query::{located_query_terms_from_string, make_ngram, number_of_typos_allowed};
+pub use phrase::Phrase;
+
+use compute_derivations::partially_initialized_term_from_word;
+
+/**
+A set of word derivations attached to a location in the search query.
+*/
+#[derive(Clone, PartialEq, Eq, Hash)]
+pub struct LocatedQueryTermSubset {
+    pub term_subset: QueryTermSubset,
+    pub positions: RangeInclusive<u16>,
+    pub term_ids: RangeInclusive<u8>,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct QueryTermSubset {
+    original: Interned<QueryTerm>,
+    zero_typo_subset: NTypoTermSubset,
+    one_typo_subset: NTypoTermSubset,
+    two_typo_subset: NTypoTermSubset,
+}
+
+#[derive(Clone, PartialEq, Eq, Hash)]
+pub struct QueryTerm {
+    original: Interned<String>,
+    ngram_words: Option<Vec<Interned<String>>>,
+    max_nbr_typos: u8,
+    is_prefix: bool,
+    zero_typo: ZeroTypoTerm,
+    // May not be computed yet
+    one_typo: Lazy<OneTypoTerm>,
+    // May not be computed yet
+    two_typo: Lazy<TwoTypoTerm>,
+}
+
+// SubTerms will be in a dedup interner
+#[derive(Default, Clone, PartialEq, Eq, Hash)]
+struct ZeroTypoTerm {
+    /// The original phrase, if any
+    phrase: Option<Interned<Phrase>>,
+    /// A single word equivalent to the original term, with zero typos
+    zero_typo: Option<Interned<String>>,
+    /// All the words that contain the original word as prefix
+    prefix_of: BTreeSet<Interned<String>>,
+    /// All the synonyms of the original word or phrase
+    synonyms: BTreeSet<Interned<Phrase>>,
+    /// A prefix in the prefix databases matching the original word
+    use_prefix_db: Option<Interned<String>>,
+}
+#[derive(Default, Clone, PartialEq, Eq, Hash)]
+struct OneTypoTerm {
+    /// The original word split into multiple consecutive words
+    split_words: Option<Interned<Phrase>>,
+    /// Words that are 1 typo away from the original word
+    one_typo: BTreeSet<Interned<String>>,
+}
+#[derive(Default, Clone, PartialEq, Eq, Hash)]
+struct TwoTypoTerm {
+    /// Words that are 2 typos away from the original word
+    two_typos: BTreeSet<Interned<String>>,
+}
+
+#[derive(Clone, PartialEq, Eq, Hash)]
+pub enum Lazy<T> {
+    Uninit,
+    Init(T),
+}
+impl<T> Lazy<T> {
+    pub fn is_init(&self) -> bool {
+        match self {
+            Lazy::Uninit => false,
+            Lazy::Init(_) => true,
+        }
+    }
+    pub fn is_uninit(&self) -> bool {
+        match self {
+            Lazy::Uninit => true,
+            Lazy::Init(_) => false,
+        }
+    }
+}
+
+impl QueryTermSubset {
+    pub fn empty(for_term: Interned<QueryTerm>) -> Self {
+        Self {
+            original: for_term,
+            zero_typo_subset: NTypoTermSubset::Nothing,
+            one_typo_subset: NTypoTermSubset::Nothing,
+            two_typo_subset: NTypoTermSubset::Nothing,
+        }
+    }
+    pub fn full(for_term: Interned<QueryTerm>) -> Self {
+        Self {
+            original: for_term,
+            zero_typo_subset: NTypoTermSubset::All,
+            one_typo_subset: NTypoTermSubset::All,
+            two_typo_subset: NTypoTermSubset::All,
+        }
+    }
+
+    pub fn union(&mut self, other: &Self) {
+        assert!(self.original == other.original);
+        self.zero_typo_subset.union(&other.zero_typo_subset);
+        self.one_typo_subset.union(&other.one_typo_subset);
+        self.two_typo_subset.union(&other.two_typo_subset);
+    }
+    pub fn intersect(&mut self, other: &Self) {
+        assert!(self.original == other.original);
+        self.zero_typo_subset.intersect(&other.zero_typo_subset);
+        self.one_typo_subset.intersect(&other.one_typo_subset);
+        self.two_typo_subset.intersect(&other.two_typo_subset);
+    }
+
+    pub fn use_prefix_db(&self, ctx: &SearchContext) -> Option<Interned<String>> {
+        let original = ctx.term_interner.get(self.original);
+        let Some(use_prefix_db) = original.zero_typo.use_prefix_db else {
+            return None
+        };
+        match &self.zero_typo_subset {
+            NTypoTermSubset::All => Some(use_prefix_db),
+            NTypoTermSubset::Subset { words, phrases: _ } => {
+                // TODO: use a subset of prefix words instead
+                if words.contains(&use_prefix_db) {
+                    Some(use_prefix_db)
+                } else {
+                    None
+                }
+            }
+            NTypoTermSubset::Nothing => None,
+        }
+    }
+    pub fn all_single_words_except_prefix_db(
+        &self,
+        ctx: &mut SearchContext,
+    ) -> Result<BTreeSet<Interned<String>>> {
+        let mut result = BTreeSet::default();
+        // TODO: a compute_partially function
+        if !self.one_typo_subset.is_empty() || !self.two_typo_subset.is_empty() {
+            self.original.compute_fully_if_needed(ctx)?;
+        }
+
+        let original = ctx.term_interner.get_mut(self.original);
+        if !self.zero_typo_subset.is_empty() {
+            let ZeroTypoTerm { phrase: _, zero_typo, prefix_of, synonyms: _, use_prefix_db: _ } =
+                &original.zero_typo;
+            result.extend(zero_typo.iter().copied());
+            result.extend(prefix_of.iter().copied());
+        };
+
+        match &self.one_typo_subset {
+            NTypoTermSubset::All => {
+                let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else {
+                    panic!()
+                };
+                result.extend(one_typo.iter().copied())
+            }
+            NTypoTermSubset::Subset { words, phrases: _ } => {
+                let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else {
+                    panic!()
+                };
+                result.extend(one_typo.intersection(words));
+            }
+            NTypoTermSubset::Nothing => {}
+        };
+
+        match &self.two_typo_subset {
+            NTypoTermSubset::All => {
+                let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else {
+                    panic!()
+                };
+                result.extend(two_typos.iter().copied());
+            }
+            NTypoTermSubset::Subset { words, phrases: _ } => {
+                let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else {
+                    panic!()
+                };
+                result.extend(two_typos.intersection(words));
+            }
+            NTypoTermSubset::Nothing => {}
+        };
+
+        Ok(result)
+    }
+    pub fn all_phrases(&self, ctx: &mut SearchContext) -> Result<BTreeSet<Interned<Phrase>>> {
+        let mut result = BTreeSet::default();
+
+        if !self.one_typo_subset.is_empty() {
+            // TODO: compute less than fully if possible
+            self.original.compute_fully_if_needed(ctx)?;
+        }
+        let original = ctx.term_interner.get_mut(self.original);
+
+        let ZeroTypoTerm { phrase, zero_typo: _, prefix_of: _, synonyms, use_prefix_db: _ } =
+            &original.zero_typo;
+        result.extend(phrase.iter().copied());
+        result.extend(synonyms.iter().copied());
+
+        if !self.one_typo_subset.is_empty() {
+            let Lazy::Init(OneTypoTerm { split_words, one_typo: _ }) = &original.one_typo else {
+                panic!();
+            };
+            result.extend(split_words.iter().copied());
+        }
+
+        Ok(result)
+    }
+
+    pub fn original_phrase(&self, ctx: &SearchContext) -> Option<Interned<Phrase>> {
+        let t = ctx.term_interner.get(self.original);
+        if let Some(p) = t.zero_typo.phrase {
+            if self.zero_typo_subset.contains_phrase(p) {
+                return Some(p);
+            }
+        }
+        None
+    }
+    pub fn max_nbr_typos(&self, ctx: &SearchContext) -> u8 {
+        let t = ctx.term_interner.get(self.original);
+        match t.max_nbr_typos {
+            0 => 0,
+            1 => {
+                if self.one_typo_subset.is_empty() {
+                    0
+                } else {
+                    1
+                }
+            }
+            2 => {
+                if self.two_typo_subset.is_empty() {
+                    if self.one_typo_subset.is_empty() {
+                        0
+                    } else {
+                        1
+                    }
+                } else {
+                    2
+                }
+            }
+            _ => panic!(),
+        }
+    }
+    pub fn clear_zero_typo_subset(&mut self) {
+        self.zero_typo_subset = NTypoTermSubset::Nothing;
+    }
+    pub fn clear_one_typo_subset(&mut self) {
+        self.one_typo_subset = NTypoTermSubset::Nothing;
+    }
+    pub fn clear_two_typo_subset(&mut self) {
+        self.two_typo_subset = NTypoTermSubset::Nothing;
+    }
+    pub fn description(&self, ctx: &SearchContext) -> String {
+        let t = ctx.term_interner.get(self.original);
+        ctx.word_interner.get(t.original).to_owned()
+    }
+}
+
+impl ZeroTypoTerm {
+    fn is_empty(&self) -> bool {
+        let ZeroTypoTerm { phrase, zero_typo, prefix_of, synonyms, use_prefix_db } = self;
+        phrase.is_none()
+            && zero_typo.is_none()
+            && prefix_of.is_empty()
+            && synonyms.is_empty()
+            && use_prefix_db.is_none()
+    }
+}
+impl OneTypoTerm {
+    fn is_empty(&self) -> bool {
+        let OneTypoTerm { split_words, one_typo } = self;
+        one_typo.is_empty() && split_words.is_none()
+    }
+}
+impl TwoTypoTerm {
+    fn is_empty(&self) -> bool {
+        let TwoTypoTerm { two_typos } = self;
+        two_typos.is_empty()
+    }
+}
+
+impl QueryTerm {
+    fn is_empty(&self) -> bool {
+        let Lazy::Init(one_typo) = &self.one_typo else {
+            return false;
+        };
+        let Lazy::Init(two_typo) = &self.two_typo else {
+            return false;
+        };
+
+        self.zero_typo.is_empty() && one_typo.is_empty() && two_typo.is_empty()
+    }
+}
+
+impl Interned<QueryTerm> {
+    /// Return the original word from the given query term
+    fn original_single_word(self, ctx: &SearchContext) -> Option<Interned<String>> {
+        let self_ = ctx.term_interner.get(self);
+        if self_.ngram_words.is_some() {
+            None
+        } else {
+            Some(self_.original)
+        }
+    }
+}
+
+/// A query term coupled with its position in the user's search query.
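+///
+/// The `positions` range covers a single position for a plain word, and the
+/// whole range of word positions for a phrase or an ngram.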
+#[derive(Clone)]
+pub struct LocatedQueryTerm {
+    pub value: Interned<QueryTerm>,
+    pub positions: RangeInclusive<u16>,
+}
+
+impl LocatedQueryTerm {
+    /// Return `true` iff the term is empty
+    pub fn is_empty(&self, interner: &DedupInterner<QueryTerm>) -> bool {
+        interner.get(self.value).is_empty()
+    }
+}
diff --git a/milli/src/search/new/query_term/ntypo_subset.rs b/milli/src/search/new/query_term/ntypo_subset.rs
new file mode 100644
index 000000000..ad25d73c7
--- /dev/null
+++ b/milli/src/search/new/query_term/ntypo_subset.rs
@@ -0,0 +1,80 @@
+use std::collections::BTreeSet;
+
+use crate::search::new::interner::Interned;
+
+use super::Phrase;
+
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub enum NTypoTermSubset {
+    All,
+    Subset {
+        words: BTreeSet<Interned<String>>,
+        phrases: BTreeSet<Interned<Phrase>>,
+        // TODO: prefixes: BTreeSet<Interned<String>>,
+    },
+    Nothing,
+}
+
+impl NTypoTermSubset {
+    pub fn contains_word(&self, word: Interned<String>) -> bool {
+        match self {
+            NTypoTermSubset::All => true,
+            NTypoTermSubset::Subset { words, phrases: _ } => words.contains(&word),
+            NTypoTermSubset::Nothing => false,
+        }
+    }
+    pub fn contains_phrase(&self, phrase: Interned<Phrase>) -> bool {
+        match self {
+            NTypoTermSubset::All => true,
+            NTypoTermSubset::Subset { words: _, phrases } => phrases.contains(&phrase),
+            NTypoTermSubset::Nothing => false,
+        }
+    }
+    pub fn is_empty(&self) -> bool {
+        match self {
+            NTypoTermSubset::All => false,
+            NTypoTermSubset::Subset { words, phrases } => words.is_empty() && phrases.is_empty(),
+            NTypoTermSubset::Nothing => true,
+        }
+    }
+    pub fn union(&mut self, other: &Self) {
+        match self {
+            Self::All => {}
+            Self::Subset { words, phrases } => match other {
+                Self::All => {
+                    *self = Self::All;
+                }
+                Self::Subset { words: w2, phrases: p2 } => {
+                    words.extend(w2);
+                    phrases.extend(p2);
+                }
+                Self::Nothing => {}
+            },
+            Self::Nothing => {
+                *self = other.clone();
+            }
+        }
+    }
+    pub fn intersect(&mut self, other: &Self) {
+        match self {
+            Self::All => *self = other.clone(),
+            Self::Subset { words, phrases } => match other {
+                Self::All => {}
+                Self::Subset { words: w2, phrases: p2 } => {
+                    let mut ws = BTreeSet::new();
+                    for w in words.intersection(w2) {
+                        ws.insert(*w);
+                    }
+                    let mut ps = BTreeSet::new();
+                    for p in phrases.intersection(p2) {
+                        ps.insert(*p);
+                    }
+                    *words = ws;
+                    *phrases = ps;
+                }
+                Self::Nothing => *self = Self::Nothing,
+            },
+            Self::Nothing => {}
+        }
+    }
+}
diff --git a/milli/src/search/new/query_term/parse_query.rs b/milli/src/search/new/query_term/parse_query.rs
new file mode 100644
index 000000000..e0f6d971b
--- /dev/null
+++ b/milli/src/search/new/query_term/parse_query.rs
@@ -0,0 +1,281 @@
+use charabia::{normalizer::NormalizedTokenIter, SeparatorKind, TokenKind};
+
+use crate::{Result, SearchContext, MAX_WORD_LENGTH};
+
+use super::*;
+
+/// Convert the tokenised search query into a list of located query terms.
+// TODO: checking if the positions are correct for phrases, separators, ngrams
+pub fn located_query_terms_from_string(
+    ctx: &mut SearchContext,
+    query: NormalizedTokenIter<&[u8]>,
+    words_limit: Option<usize>,
+) -> Result<Vec<LocatedQueryTerm>> {
+    let nbr_typos = number_of_typos_allowed(ctx)?;
+
+    let mut located_terms = Vec::new();
+
+    let mut phrase: Option<PhraseBuilder> = None;
+
+    let parts_limit = words_limit.unwrap_or(usize::MAX);
+
+    // start with the last position as we will wrap around to position 0 at the beginning of the loop below.
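+    // (`u16::MAX.wrapping_add(1) == 0`, so the first word of the query ends up
+    // at position 0)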
+    let mut position = u16::MAX;
+
+    let mut peekable = query.take(super::limits::MAX_TOKEN_COUNT).peekable();
+    while let Some(token) = peekable.next() {
+        // early return if word limit is exceeded
+        if located_terms.len() >= parts_limit {
+            return Ok(located_terms);
+        }
+
+        match token.kind {
+            TokenKind::Word | TokenKind::StopWord => {
+                // On first loop, goes from u16::MAX to 0, then normal increment.
+                position = position.wrapping_add(1);
+
+                // 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote,
+                // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word,
+                // 3. if the word is the last token of the query we push it as a prefix word.
+                if let Some(phrase) = &mut phrase {
+                    phrase.push_word(ctx, &token, position)
+                } else if peekable.peek().is_some() {
+                    match token.kind {
+                        TokenKind::Word => {
+                            let word = token.lemma();
+                            let term = partially_initialized_term_from_word(
+                                ctx,
+                                word,
+                                nbr_typos(word),
+                                false,
+                            )?;
+                            let located_term = LocatedQueryTerm {
+                                value: ctx.term_interner.push(term),
+                                positions: position..=position,
+                            };
+                            located_terms.push(located_term);
+                        }
+                        TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => {}
+                    }
+                } else {
+                    let word = token.lemma();
+                    let term =
+                        partially_initialized_term_from_word(ctx, word, nbr_typos(word), true)?;
+                    let located_term = LocatedQueryTerm {
+                        value: ctx.term_interner.push(term),
+                        positions: position..=position,
+                    };
+                    located_terms.push(located_term);
+                }
+            }
+            TokenKind::Separator(separator_kind) => {
+                match separator_kind {
+                    SeparatorKind::Hard => {
+                        position += 1;
+                    }
+                    SeparatorKind::Soft => {
+                        position += 0;
+                    }
+                }
+
+                phrase = 'phrase: {
+                    let phrase = phrase.take();
+
+                    // If we have a hard separator inside a phrase, we immediately start a new phrase
+                    let phrase = if separator_kind == SeparatorKind::Hard {
+                        if let Some(phrase) = phrase {
+                            if let Some(located_query_term) = phrase.build(ctx) {
+                                located_terms.push(located_query_term)
+                            }
+                            Some(PhraseBuilder::empty())
+                        } else {
+                            None
+                        }
+                    } else {
+                        phrase
+                    };
+
+                    // We close and start a new phrase depending on the number of double quotes
+                    let mut quote_count = token.lemma().chars().filter(|&s| s == '"').count();
+                    if quote_count == 0 {
+                        break 'phrase phrase;
+                    }
+
+                    // Consume the closing quote and the phrase
+                    if let Some(phrase) = phrase {
+                        // Per the check above, quote_count > 0
+                        quote_count -= 1;
+                        if let Some(located_query_term) = phrase.build(ctx) {
+                            located_terms.push(located_query_term)
+                        }
+                    }
+
+                    // Start new phrase if the token ends with an opening quote
+                    (quote_count % 2 == 1).then_some(PhraseBuilder::empty())
+                };
+            }
+            _ => (),
+        }
+    }
+
+    // If a quote is never closed, we consider all of the end of the query as a phrase.
+    if let Some(phrase) = phrase.take() {
+        if let Some(located_query_term) = phrase.build(ctx) {
+            located_terms.push(located_query_term);
+        }
+    }
+
+    Ok(located_terms)
+}
+
+pub fn number_of_typos_allowed<'ctx>(
+    ctx: &SearchContext<'ctx>,
+) -> Result<impl Fn(&str) -> u8 + 'ctx> {
+    let authorize_typos = ctx.index.authorize_typos(ctx.txn)?;
+    let min_len_one_typo = ctx.index.min_word_len_one_typo(ctx.txn)?;
+    let min_len_two_typos = ctx.index.min_word_len_two_typos(ctx.txn)?;
+
+    // TODO: should `exact_words` also disable prefix search, ngrams, split words, or synonyms?
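+    // e.g. with the default settings (one typo allowed from 5 bytes, two from
+    // 9), "quick" (5 bytes) gets one typo and "languorous" (10 bytes) gets two,
+    // unless the word is in `exact_words`, which forces zero.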
+    let exact_words = ctx.index.exact_words(ctx.txn)?;
+
+    Ok(Box::new(move |word: &str| {
+        if !authorize_typos
+            || word.len() < min_len_one_typo as usize
+            || exact_words.as_ref().map_or(false, |fst| fst.contains(word))
+        {
+            0
+        } else if word.len() < min_len_two_typos as usize {
+            1
+        } else {
+            2
+        }
+    }))
+}
+
+pub fn make_ngram(
+    ctx: &mut SearchContext,
+    terms: &[LocatedQueryTerm],
+    number_of_typos_allowed: &impl Fn(&str) -> u8,
+) -> Result<Option<LocatedQueryTerm>> {
+    assert!(!terms.is_empty());
+    for t in terms {
+        if ctx.term_interner.get(t.value).zero_typo.phrase.is_some() {
+            return Ok(None);
+        }
+    }
+    for ts in terms.windows(2) {
+        let [t1, t2] = ts else { panic!() };
+        if *t1.positions.end() != t2.positions.start() - 1 {
+            return Ok(None);
+        }
+    }
+    let mut words_interned = vec![];
+    for term in terms {
+        if let Some(original_term_word) = term.value.original_single_word(ctx) {
+            words_interned.push(original_term_word);
+        } else {
+            return Ok(None);
+        }
+    }
+    let words =
+        words_interned.iter().map(|&i| ctx.word_interner.get(i).to_owned()).collect::<Vec<_>>();
+
+    let start = *terms.first().as_ref().unwrap().positions.start();
+    let end = *terms.last().as_ref().unwrap().positions.end();
+    let is_prefix = ctx.term_interner.get(terms.last().as_ref().unwrap().value).is_prefix;
+    let ngram_str = words.join("");
+    if ngram_str.len() > MAX_WORD_LENGTH {
+        return Ok(None);
+    }
+    let ngram_str_interned = ctx.word_interner.insert(ngram_str.clone());
+
+    let max_nbr_typos =
+        number_of_typos_allowed(ngram_str.as_str()).saturating_sub(terms.len() as u8 - 1);
+
+    let mut term = partially_initialized_term_from_word(ctx, &ngram_str, max_nbr_typos, is_prefix)?;
+
+    // Now add the synonyms
+    let index_synonyms = ctx.index.synonyms(ctx.txn)?;
+
+    term.zero_typo.synonyms.extend(
+        index_synonyms.get(&words).cloned().unwrap_or_default().into_iter().map(|words| {
+            let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect();
+            ctx.phrase_interner.insert(Phrase { words })
+        }),
+    );
+
+    let term = QueryTerm {
+        original: ngram_str_interned,
+        ngram_words: Some(words_interned),
+        is_prefix,
+        max_nbr_typos,
+        zero_typo: term.zero_typo,
+        one_typo: Lazy::Uninit,
+        two_typo: Lazy::Uninit,
+    };
+
+    let term = LocatedQueryTerm { value: ctx.term_interner.push(term), positions: start..=end };
+
+    Ok(Some(term))
+}
+
+struct PhraseBuilder {
+    words: Vec<Option<Interned<String>>>,
+    start: u16,
+    end: u16,
+}
+
+impl PhraseBuilder {
+    fn empty() -> Self {
+        Self { words: Default::default(), start: u16::MAX, end: u16::MAX }
+    }
+
+    fn is_empty(&self) -> bool {
+        self.words.is_empty()
+    }
+
+    // precondition: token has kind Word or StopWord
+    fn push_word(&mut self, ctx: &mut SearchContext, token: &charabia::Token, position: u16) {
+        if self.is_empty() {
+            self.start = position;
+        }
+        self.end = position;
+        if let TokenKind::StopWord = token.kind {
+            self.words.push(None);
+        } else {
+            // token has kind Word
+            let word = ctx.word_interner.insert(token.lemma().to_string());
+            // TODO: in a phrase, check that every word exists
+            // otherwise return an empty term
+            self.words.push(Some(word));
+        }
+    }
+
+    fn build(self, ctx: &mut SearchContext) -> Option<LocatedQueryTerm> {
+        if self.is_empty() {
+            return None;
+        }
+        Some(LocatedQueryTerm {
+            value: ctx.term_interner.push({
+                let phrase = ctx.phrase_interner.insert(Phrase { words: self.words });
+                let phrase_desc = phrase.description(ctx);
+                QueryTerm {
+                    original: ctx.word_interner.insert(phrase_desc),
+                    ngram_words: None,
+                    max_nbr_typos: 0,
+                    is_prefix: false,
+                    zero_typo: ZeroTypoTerm {
+                        phrase: Some(phrase),
+                        zero_typo: None,
+                        prefix_of: BTreeSet::default(),
+                        synonyms: BTreeSet::default(),
+                        use_prefix_db: None,
+                    },
+                    one_typo: Lazy::Uninit,
+                    two_typo: Lazy::Uninit,
+                }
+            }),
+            positions: self.start..=self.end,
+        })
+    }
+}
diff --git a/milli/src/search/new/query_term/phrase.rs b/milli/src/search/new/query_term/phrase.rs
new file mode 100644
index 000000000..2ea8e0d39
--- /dev/null
+++ b/milli/src/search/new/query_term/phrase.rs
@@ -0,0 +1,16 @@
+use itertools::Itertools;
+
+use crate::{search::new::interner::Interned, SearchContext};
+
+/// A phrase in the user's search query, consisting of several words
+/// that must appear side-by-side in the search results.
+#[derive(Default, Clone, PartialEq, Eq, Hash)]
+pub struct Phrase {
+    pub words: Vec<Option<Interned<String>>>,
+}
+impl Interned<Phrase> {
+    pub fn description(self, ctx: &SearchContext) -> String {
+        let p = ctx.phrase_interner.get(self);
+        p.words.iter().flatten().map(|w| ctx.word_interner.get(*w)).join(" ")
+    }
+}
diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs
index 81c99fd9a..cfd3f62bf 100644
--- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs
@@ -57,9 +57,7 @@ impl RankingRuleGraphTrait for ProximityGraph {
                 Ok(format!("{cost}: cost"))
             }
             ProximityCondition::Term { term } => {
-                let original_term = ctx.term_interner.get(term.term_subset.original);
-                let original_word = ctx.word_interner.get(original_term.original);
-                Ok(format!("{original_word} : exists"))
+                Ok(format!("{} : exists", term.term_subset.description(ctx)))
             }
         }
     }
diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs
index de02b67a4..5d7e0f874 100644
--- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs
@@ -3,7 +3,7 @@ use roaring::RoaringBitmap;
 use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
 use crate::search::new::interner::{DedupInterner, Interned, MappedInterner};
 use crate::search::new::logger::SearchLogger;
-use crate::search::new::query_term::{LocatedQueryTermSubset, NTypoTermSubset};
+use crate::search::new::query_term::LocatedQueryTermSubset;
 use crate::search::new::resolve_query_graph::compute_query_term_subset_docids;
 use crate::search::new::{QueryGraph, QueryNode, SearchContext};
 use crate::Result;
@@ -43,8 +43,7 @@ impl RankingRuleGraphTrait for TypoGraph {
         _from: Option<&LocatedQueryTermSubset>,
         to_term: &LocatedQueryTermSubset,
     ) -> Result<Vec<(u32, Interned<Self::Condition>)>> {
-        let term = to_term; // LocatedQueryTermSubset { term_subset, positions: _, term_ids } = to_term;
-        let original_full_term = ctx.term_interner.get(term.term_subset.original);
+        let term = to_term;
         let mut edges = vec![];
         // Ngrams have a base typo cost
         // 2-gram -> equivalent to 1 typo
         // 3-gram -> equivalent to 2 typos
         let base_cost = if term.term_ids.len() == 1 { 0 } else { term.term_ids.len() as u32 };
@@ -52,20 +51,20 @@ impl RankingRuleGraphTrait for TypoGraph {
-        for nbr_typos in 0..=original_full_term.max_nbr_typos {
+        for nbr_typos in 0..=term.term_subset.max_nbr_typos(ctx) {
             let mut term = term.clone();
             match nbr_typos {
                 0 => {
-                    term.term_subset.one_typo_subset = NTypoTermSubset::Nothing;
-                    term.term_subset.two_typo_subset = NTypoTermSubset::Nothing;
+                    term.term_subset.clear_one_typo_subset();
+                    term.term_subset.clear_two_typo_subset();
                 }
                 1 => {
-                    term.term_subset.zero_typo_subset = NTypoTermSubset::Nothing;
-                    term.term_subset.two_typo_subset = NTypoTermSubset::Nothing;
+                    term.term_subset.clear_zero_typo_subset();
+                    term.term_subset.clear_two_typo_subset();
                 }
                 2 => {
-                    term.term_subset.zero_typo_subset = NTypoTermSubset::Nothing;
-                    term.term_subset.one_typo_subset = NTypoTermSubset::Nothing;
+                    term.term_subset.clear_zero_typo_subset();
+                    term.term_subset.clear_one_typo_subset();
                }
                 _ => panic!(),
             };
@@ -92,9 +91,6 @@ impl RankingRuleGraphTrait for TypoGraph {
     fn label_for_condition(ctx: &mut SearchContext, condition: &Self::Condition) -> Result<String> {
         let TypoCondition { term, nbr_typos } = condition;
-        let original_term = ctx.term_interner.get(term.term_subset.original);
-        let original = ctx.word_interner.get(original_term.original);
-
-        Ok(format!("{original}: {nbr_typos}"))
+        Ok(format!("{}: {nbr_typos}", term.term_subset.description(ctx)))
     }
 }

From 62b9c6fbee82cb0a1bf600457f74a49457f9bde2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Tue, 4 Apr 2023 16:18:22 +0200
Subject: [PATCH 05/21] Add search tests

---
 milli/src/search/new/tests/mod.rs             |   3 +
 .../src/search/new/tests/ngram_split_words.rs | 255 ++++++++++++
 milli/src/search/new/tests/typo.rs            | 363 ++++++++++++++++++
 milli/src/search/new/tests/words_tms.rs       | 266 +++++++++++++
 4 files changed, 887 insertions(+)
 create mode 100644 milli/src/search/new/tests/mod.rs
 create mode 100644 milli/src/search/new/tests/ngram_split_words.rs
 create mode 100644 milli/src/search/new/tests/typo.rs
 create mode 100644 milli/src/search/new/tests/words_tms.rs

diff --git a/milli/src/search/new/tests/mod.rs b/milli/src/search/new/tests/mod.rs
new file mode 100644
index 000000000..eec4c62ec
--- /dev/null
+++ b/milli/src/search/new/tests/mod.rs
@@ -0,0 +1,3 @@
+pub mod ngram_split_words;
+pub mod typo;
+pub mod words_tms;
diff --git a/milli/src/search/new/tests/ngram_split_words.rs b/milli/src/search/new/tests/ngram_split_words.rs
new file mode 100644
index 000000000..06c49274c
--- /dev/null
+++ b/milli/src/search/new/tests/ngram_split_words.rs
@@ -0,0 +1,255 @@
+/*!
+This module tests the following properties:
+
+1. Two consecutive words from a query can be combined into a "2gram"
+2. Three consecutive words from a query can be combined into a "3gram"
+3. A word from the query can be split into two consecutive words (split words)
+4. A 2gram can be split into two words
+5. A 3gram cannot be split into two words
+6. 2grams can contain up to 1 typo
+7. 3grams cannot have typos
+8. 2grams and 3grams can be prefix tolerant
+9. Disabling typo tolerance also disables the split words feature
+10. Disabling typo tolerance does not disable prefix tolerance
+11. Disabling typo tolerance does not disable ngram tolerance
+12. Prefix tolerance is disabled for the last word if a space follows it
+13. Ngrams cannot be formed by combining a phrase and a word or two phrases
+*/
+
+use crate::{index::tests::TempIndex, Criterion, Search, SearchResult, TermsMatchingStrategy};
+
+fn create_index() -> TempIndex {
+    let index = TempIndex::new();
+
+    index
+        .update_settings(|s| {
+            s.set_primary_key("id".to_owned());
+            s.set_searchable_fields(vec!["text".to_owned()]);
+            s.set_criteria(vec![Criterion::Words]);
+        })
+        .unwrap();
+
+    index
+        .add_documents(documents!([
+            {
+                "id": 0,
+                "text": "the sun flowers are pretty"
+            },
+            {
+                "id": 1,
+                "text": "the sun flower is tall"
+            },
+            {
+                "id": 2,
+                "text": "the sunflowers are pretty"
+            },
+            {
+                "id": 3,
+                "text": "the sunflower is tall"
+            }
+        ]))
+        .unwrap();
+    index
+}
+
+#[test]
+fn test_2gram_simple() {
+    let index = create_index();
+    index
+        .update_settings(|s| {
+            s.set_autorize_typos(false);
+        })
+        .unwrap();
+
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("sun flower");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    // will also match documents with "sunflower", thanks to the 2gram
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3]");
+}
+#[test]
+fn test_3gram_simple() {
+    let index = create_index();
+    index
+        .update_settings(|s| {
+            s.set_autorize_typos(false);
+        })
+        .unwrap();
+
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("sun flower s are");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2]");
+}
+
+#[test]
+fn test_2gram_typo() {
+    let index = create_index();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("sun flawer");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3]");
+}
+
+#[test]
+fn test_no_disable_ngrams() {
+    let index = create_index();
+    index
+        .update_settings(|s| {
+            s.set_autorize_typos(false);
+        })
+        .unwrap();
+
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("sun flower ");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    // documents containing `sun flower` or the 2gram `sunflower` exactly
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 3]");
+}
+
+#[test]
+fn test_2gram_prefix() {
+    let index = create_index();
+    index
+        .update_settings(|s| {
+            s.set_autorize_typos(false);
+        })
+        .unwrap();
+
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("sun flow");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    // documents containing words beginning with `sunflow`
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3]");
+}
+
+#[test]
+fn test_3gram_prefix() {
+    let index = create_index();
+    index
+        .update_settings(|s| {
+            s.set_autorize_typos(false);
+        })
+        .unwrap();
+
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("su nf l");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    // documents containing a word beginning with sunfl
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3]");
+}
+
+#[test]
+fn test_split_words() {
+    let index = create_index();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("sunflower ");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    // all the documents with `sun flower`, `sunflower`, or `sunflower` + 1 typo
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 2, 3]");
+}
+
+#[test]
+fn test_disable_split_words() {
+    let index = create_index();
+    index
+        .update_settings(|s| {
+            s.set_autorize_typos(false);
+        })
+        .unwrap();
+
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("sunflower ");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    // no document containing `sun flower`
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3]");
+}
+
+#[test]
+fn test_2gram_split_words() {
+    let index = create_index();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("sunf lower");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    // all the documents with "sunflower", "sun flower", or (sunflower + 1 typo)
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 2, 3]");
+}
+
+#[test]
+fn test_3gram_no_split_words() {
+    let index = create_index();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("sunf lo wer");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    // no document with `sun flower`
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3]");
+}
+
+#[test]
+fn test_3gram_no_typos() {
+    let index = create_index();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("sunf la wer");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]");
+}
+
+#[test]
+fn test_no_ngram_phrases() {
+    let index = create_index();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("\"sun\" flower");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1]");
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("\"sun\" \"flower\"");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1]");
+}
diff --git a/milli/src/search/new/tests/typo.rs b/milli/src/search/new/tests/typo.rs
new file mode 100644
index 000000000..6ac8f5516
--- /dev/null
+++ b/milli/src/search/new/tests/typo.rs
@@ -0,0 +1,363 @@
+/*!
+This module tests the following properties:
+
+1. The `words` ranking rule is typo-tolerant
+2. Typo-tolerance handles missing letters, extra letters, replaced letters, and swapped letters (at least)
+3. Words which are < `min_word_len_one_typo` are not typo tolerant
+4. Words which are >= `min_word_len_one_typo` but < `min_word_len_two_typos` can have one typo
+5.
Words which are >= `min_word_len_two_typos` can have two typos +6. A typo on the first letter of a word counts as two typos +7. Phrases are not typo tolerant +8. 2grams can have 1 typo if they are larger than `min_word_len_two_typos` +9. 3grams are not typo tolerant +10. The `typo` ranking rule assumes the role of the `words` ranking rule implicitly +if `words` doesn't exist before it. +11. The `typo` ranking rule places documents with the same number of typos in the same bucket +12. Prefix tolerance costs nothing according to the typo ranking rule +13. Split words cost 1 typo according to the typo ranking rule +14. Synonyms cost nothing according to the typo ranking rule +*/ + +use std::collections::HashMap; + +use crate::{ + index::tests::TempIndex, Criterion, + Search, SearchResult, TermsMatchingStrategy, +}; + +fn create_index() -> TempIndex { + let index = TempIndex::new(); + + index + .update_settings(|s| { + s.set_primary_key("id".to_owned()); + s.set_searchable_fields(vec!["text".to_owned()]); + s.set_criteria(vec![Criterion::Words]); + }) + .unwrap(); + + index + .add_documents(documents!([ + { + "id": 0, + "text": "the quick brown fox jumps over the lazy dog" + }, + { + "id": 1, + "text": "the quick brown foxes jump over the lazy dog" + }, + { + "id": 2, + "text": "the quick brown fax sends a letter to the dog" + }, + { + "id": 3, + "text": "the quickest brownest fox jumps over the laziest dog" + }, + { + "id": 4, + "text": "a fox doesn't quack, that crown goes to the duck." + }, + { + "id": 5, + "text": "the quicker browner fox jumped over the lazier dog" + }, + { + "id": 6, + "text": "the extravagant fox skyrocketed over the languorous dog" // thanks thesaurus + }, + { + "id": 7, + "text": "the quick brown fox jumps over the lazy" + }, + { + "id": 8, + "text": "the quick brown fox jumps over the" + }, + { + "id": 9, + "text": "the quick brown fox jumps over" + }, + { + "id": 10, + "text": "the quick brown fox jumps" + }, + { + "id": 11, + "text": "the quick brown fox" + }, + { + "id": 12, + "text": "the quick brown" + }, + { + "id": 13, + "text": "the quick" + }, + { + "id": 14, + "text": "netwolk interconections sunflawar" + }, + { + "id": 15, + "text": "network interconnections sunflawer" + }, + { + "id": 16, + "text": "network interconnection sunflower" + }, + { + "id": 17, + "text": "network interconnection sun flower" + }, + { + "id": 18, + "text": "network interconnection sunflowering" + }, + { + "id": 19, + "text": "network interconnection sun flowering" + }, + { + "id": 20, + "text": "network interconnection sunflowar" + }, + { + "id": 21, + "text": "the fast brownish fox jumps over the lackadaisical dog" + }, + { + "id": 22, + "text": "the quick brown fox jumps over the lackadaisical dog" + }, + ])) + .unwrap(); + index +} + +#[test] +fn test_no_typo() { + let index = create_index(); + index + .update_settings(|s| { + s.set_autorize_typos(false); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the quick brown fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. 
} = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
+}
+
+#[test]
+fn test_default_typo() {
+    let index = create_index();
+    let txn = index.read_txn().unwrap();
+
+    let ot = index.min_word_len_one_typo(&txn).unwrap();
+    let tt = index.min_word_len_two_typos(&txn).unwrap();
+    insta::assert_debug_snapshot!(ot, @"5");
+    insta::assert_debug_snapshot!(tt, @"9");
+
+    // 0 typo
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the quick brown fox jumps over the lazy dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
+
+    // 1 typo on one word, replaced letter
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the quack brown fox jumps over the lazy dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
+
+    // 1 typo on one word, missing letter, extra letter
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the quicest brownest fox jummps over the laziest dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3]");
+
+    // 1 typo on one word, swapped letters
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the quikc borwn fox jupms over the lazy dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
+
+    // 1 first letter typo on a word >= 5 but < 9 bytes (counts as 2 typos), replaced letter
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the nuick brown fox jumps over the lazy dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]");
+
+    // 1 first letter typo on a word >= 5 but < 9 bytes (counts as 2 typos), missing letter
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the uick brown fox jumps over the lazy dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]");
+
+    // 1 typo on all words >=5 bytes, replaced letters
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the quack brawn fox junps over the lazy dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
+
+    // 2 typos on words < 9 bytes
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the quckest brawnert fox jumps over the aziest dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]");
+
+    // 2 typos on words >= 9 bytes: missing letters, missing first letter, replaced letters
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the extravant fox kyrocketed over the lamguorout dog");
+    let SearchResult { documents_ids, ..
} = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]"); + + // 2 typos on words >= 9 bytes: 2 extra letters in a single word, swapped letters + extra letter, replaced letters + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the extravaganttt fox sktyrocnketed over the lagnuorrous dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]"); +} + +#[test] +fn test_phrase_no_typo_allowed() { + let index = create_index(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the \"quick brewn\" fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]"); +} + +#[test] +fn test_ngram_typos() { + let index = create_index(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the extra lagant fox skyrocketed over the languorous dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]"); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the ex tra lagant fox skyrocketed over the languorous dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]"); +} +#[test] +fn test_typo_ranking_rule_not_preceded_by_words_ranking_rule() { + let index = create_index(); + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Typo]); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.query("the quick brown fox jumps over the lazy dog"); + let SearchResult { documents_ids: ids_1, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{ids_1:?}"), @"[0, 7, 8, 9, 10, 11, 1, 2, 12, 13, 4, 3, 5, 6, 21]"); + + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Words, Criterion::Typo]); + }) + .unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.query("the quick brown fox jumps over the lazy dog"); + let SearchResult { documents_ids: ids_2, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{ids_2:?}"), @"[0, 7, 8, 9, 10, 11, 1, 2, 12, 13, 4, 3, 5, 6, 21]"); + + assert_eq!(ids_1, ids_2); +} + +#[test] +fn test_typo_bucketing() { + let index = create_index(); + + let txn = index.read_txn().unwrap(); + + // First do the search with just the Words ranking rule + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("network interconnection sunflower"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 15, 16, 17, 18, 20]"); + + // Then with the typo ranking rule + drop(txn); + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Typo]); + }) + .unwrap(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("network interconnection sunflower"); + let SearchResult { documents_ids, .. 
} = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[16, 18, 17, 20, 15, 14]");
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("network interconnection sun flower");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[17, 19, 16, 18, 20, 15]");
+}
+
+#[test]
+fn test_typo_synonyms() {
+    let index = create_index();
+    index
+        .update_settings(|s| {
+            s.set_criteria(vec![Criterion::Typo]);
+
+            let mut synonyms = HashMap::new();
+            synonyms.insert("lackadaisical".to_owned(), vec!["lazy".to_owned()]);
+            synonyms.insert("fast brownish".to_owned(), vec!["quick brown".to_owned()]);
+
+            s.set_synonyms(synonyms);
+        })
+        .unwrap();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the quick brown fox jumps over the lackadaisical dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 0]");
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the fast brownish fox jumps over the lackadaisical dog");
+
+    // TODO: is this correct? interaction of ngrams + synonyms means that the
+    // multi-word synonyms end up having a typo cost. This is probably not what we want.
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 0]");
+}
diff --git a/milli/src/search/new/tests/words_tms.rs b/milli/src/search/new/tests/words_tms.rs
new file mode 100644
index 000000000..8b5c0153f
--- /dev/null
+++ b/milli/src/search/new/tests/words_tms.rs
@@ -0,0 +1,266 @@
+/*!
+This module tests the following properties:
+
+1. The `last` term matching strategy removes terms from the query, starting
+from the end, when no more results match it.
+2. Phrases are never deleted by the `last` term matching strategy
+3. Duplicate words don't affect the ranking of a document according to the `words` ranking rule
+4. The proximity of the first and last word of a phrase to its adjacent terms is taken into
+account by the proximity ranking rule.
+5. Unclosed double quotes still make a phrase
+6. The `all` term matching strategy does not remove any term from the query
+7.
The search is capable of returning no results if no documents match the query +*/ + +use crate::{index::tests::TempIndex, Criterion, Search, SearchResult, TermsMatchingStrategy}; + +fn create_quick_brown_fox_trivial_index() -> TempIndex { + let index = TempIndex::new(); + + index + .update_settings(|s| { + s.set_primary_key("id".to_owned()); + s.set_searchable_fields(vec!["text".to_owned()]); + s.set_criteria(vec![Criterion::Words]); + }) + .unwrap(); + + index + .add_documents(documents!([ + { + "id": 0, + "text": "", + }, + { + "id": 1, + "text": "the", + }, + { + "id": 2, + "text": "the quick", + }, + { + "id": 3, + "text": "the quick brown", + }, + { + "id": 4, + "text": "the quick brown fox", + }, + { + "id": 5, + "text": "the quick brown fox jumps", + }, + { + "id": 6, + "text": "the quick brown fox jumps over", + }, + { + "id": 7, + "text": "the quick brown fox jumps over the", + }, + { + "id": 8, + "text": "the quick brown fox jumps over the lazy", + }, + { + "id": 9, + "text": "the quick brown fox jumps over the lazy dog", + }, + { + "id": 10, + "text": "the brown quick fox jumps over the lazy dog", + }, + { + "id": 11, + "text": "the quick brown fox talks to the lazy and slow dog", + }, + { + "id": 12, + "text": "the quick brown fox talks to the lazy dog", + }, + { + "id": 13, + "text": "the mighty and quick brown fox jumps over the lazy dog", + }, + { + "id": 14, + "text": "the great quick brown fox jumps over the lazy dog", + }, + { + "id": 15, + "text": "this quick brown and very scary fox jumps over the lazy dog", + }, + { + "id": 16, + "text": "this quick brown and scary fox jumps over the lazy dog", + }, + { + "id": 17, + "text": "the quick brown fox jumps over the really lazy dog", + }, + { + "id": 18, + "text": "the brown quick fox jumps over the really lazy dog", + }, + { + "id": 19, + "text": "the brown quick fox immediately jumps over the really lazy dog", + }, + { + "id": 20, + "text": "the brown quick fox immediately jumps over the really lazy blue dog", + }, + { + "id": 21, + "text": "the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.", + }, + { + "id": 22, + "text": "the, quick, brown, fox, jumps, over, the, lazy, dog", + } + ])) + .unwrap(); + index +} + +#[test] +fn test_words_tms_last_simple() { + let index = create_quick_brown_fox_trivial_index(); + + let txn = index.read_txn().unwrap(); + let mut s = Search::new(&txn, &index); + s.query("the quick brown fox jumps over the lazy dog"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + + // 6 and 7 have the same score because "the" appears twice + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 10, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 8, 6, 7, 5, 4, 11, 12, 3]"); + + let mut s = Search::new(&txn, &index); + s.query("extravagant the quick brown fox jumps over the lazy dog"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]"); +} + +#[test] +fn test_words_tms_last_phrase() { + let index = create_quick_brown_fox_trivial_index(); + + let txn = index.read_txn().unwrap(); + let mut s = Search::new(&txn, &index); + s.query("\"the quick brown fox\" jumps over the lazy dog"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); + + // "The quick brown fox" is a phrase, not deleted by this term matching strategy + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 17, 21, 8, 6, 7, 5, 4, 11, 12]"); + + let mut s = Search::new(&txn, &index); + s.query("\"the quick brown fox\" jumps over the \"lazy\" dog"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + + // "lazy" is a phrase, not deleted by this term matching strategy + // but words before it can be deleted + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 17, 21, 8, 11, 12]"); + + let mut s = Search::new(&txn, &index); + s.query("\"the quick brown fox jumps over the lazy dog\""); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + + // The whole query is a phrase, no terms are removed + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9]"); + + let mut s = Search::new(&txn, &index); + s.query("\"the quick brown fox jumps over the lazy dog"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + + // The whole query is still a phrase, even without closing quotes, so no terms are removed + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9]"); +} + +#[test] +fn test_words_proximity_tms_last_simple() { + let index = create_quick_brown_fox_trivial_index(); + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Words, Criterion::Proximity]); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + let mut s = Search::new(&txn, &index); + s.query("the quick brown fox jumps over the lazy dog"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + + // 7 is better than 6 because of the proximity between "the" and its surrounding terms + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 19, 20, 16, 15, 22, 8, 7, 6, 5, 4, 11, 12, 3]"); + + let mut s = Search::new(&txn, &index); + s.query("the brown quick fox jumps over the lazy dog"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + + // 10 is better than 9 because of the proximity between "quick" and "brown" + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 18, 19, 9, 20, 21, 14, 17, 13, 16, 15, 22, 8, 7, 6, 5, 4, 11, 12, 3]"); +} + +#[test] +fn test_words_proximity_tms_last_phrase() { + let index = create_quick_brown_fox_trivial_index(); + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Words, Criterion::Proximity]); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + let mut s = Search::new(&txn, &index); + s.query("the \"quick brown\" fox jumps over the lazy dog"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + + // "quick brown" is a phrase. The proximity of its first and last words + // to their adjacent query words should be taken into account + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 16, 15, 8, 7, 6, 5, 4, 11, 12, 3]"); + + let mut s = Search::new(&txn, &index); + s.query("the \"quick brown\" \"fox jumps\" over the lazy dog"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + + // "quick brown" is a phrase. 
The proximity of its first and last words + // to their adjacent query words should be taken into account. + // The same applies to `fox jumps`. + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 16, 15, 8, 7, 6, 5]"); +} + +#[test] +fn test_words_tms_all() { + let index = create_quick_brown_fox_trivial_index(); + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Words, Criterion::Proximity]); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + let mut s = Search::new(&txn, &index); + s.query("the quick brown fox jumps over the lazy dog"); + s.terms_matching_strategy(TermsMatchingStrategy::All); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 19, 20, 16, 15, 22]"); + + let mut s = Search::new(&txn, &index); + s.query("extravagant"); + s.terms_matching_strategy(TermsMatchingStrategy::All); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]"); +} From 406b8bd2489931afb2373ed60011e73d9530922e Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 4 Apr 2023 17:04:46 +0200 Subject: [PATCH 06/21] Add new db caches --- milli/src/search/new/db_cache.rs | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index af94108e2..effd123be 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -24,6 +24,8 @@ pub struct DatabaseCache<'ctx> { pub word_docids: FxHashMap, Option<&'ctx [u8]>>, pub exact_word_docids: FxHashMap, Option<&'ctx [u8]>>, pub word_prefix_docids: FxHashMap, Option<&'ctx [u8]>>, + pub word_position_docids: FxHashMap<(Interned, u16), Option<&'ctx [u8]>>, + pub word_fid_docids: FxHashMap<(Interned, u16), Option<&'ctx [u8]>>, } impl<'ctx> DatabaseCache<'ctx> { fn get_value<'v, K1, KC>( @@ -128,4 +130,32 @@ impl<'ctx> SearchContext<'ctx> { self.index.prefix_word_pair_proximity_docids.remap_data_type::(), ) } + + pub fn get_db_word_position_docids( + &mut self, + word: Interned, + position: u16, + ) -> Result> { + DatabaseCache::get_value( + self.txn, + (word, position), + &(self.word_interner.get(word).as_str(), position), + &mut self.db_cache.word_position_docids, + self.index.word_position_docids.remap_data_type::(), + ) + } + + pub fn get_db_word_fid_docids( + &mut self, + word: Interned, + fid: u16, + ) -> Result> { + DatabaseCache::get_value( + self.txn, + (word, fid), + &(self.word_interner.get(word).as_str(), fid), + &mut self.db_cache.word_fid_docids, + self.index.word_fid_docids.remap_data_type::(), + ) + } } From ec2f8e804003f9ece9d49f3f616a60152b5c0ed2 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 4 Apr 2023 17:06:07 +0200 Subject: [PATCH 07/21] Rename `is_multiple_words` to `is_ngram` and `zero_typo` to `exact` --- milli/src/search/new/logger/detailed.rs | 4 ++-- milli/src/search/new/query_term.rs | 32 +++++++++++++++---------- 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index 3a02950a8..3c4779ad9 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -441,7 +441,7 @@ results.{cur_ranking_rule}{cur_activated_id} {{ }) => { let QueryTerm { original, - is_multiple_words: _, + is_ngram: _, is_prefix: _, max_nbr_typos, zero_typo, @@ -458,7 +458,7 @@ 
results.{cur_ranking_rule}{cur_activated_id} {{ ) .unwrap(); - let ZeroTypoTerm { phrase, zero_typo, prefix_of, synonyms, use_prefix_db } = + let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db } = zero_typo; for w in zero_typo.iter().copied() { diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index d19ab6135..90b03d194 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -204,8 +204,13 @@ impl QueryTermSubset { } if !self.zero_typo_subset.is_empty() { - let ZeroTypoTerm { phrase: _, zero_typo, prefix_of, synonyms: _, use_prefix_db: _ } = - &original.zero_typo; + let ZeroTypoTerm { + phrase: _, + exact: zero_typo, + prefix_of, + synonyms: _, + use_prefix_db: _, + } = &original.zero_typo; result.extend(zero_typo.iter().copied()); result.extend(prefix_of.iter().copied()); }; @@ -258,7 +263,7 @@ impl QueryTermSubset { )?; } - let ZeroTypoTerm { phrase, zero_typo: _, prefix_of: _, synonyms, use_prefix_db: _ } = + let ZeroTypoTerm { phrase, exact: _, prefix_of: _, synonyms, use_prefix_db: _ } = &original.zero_typo; result.extend(phrase.iter().copied()); result.extend(synonyms.iter().copied()); @@ -302,7 +307,7 @@ impl QueryTerm { #[derive(Clone, PartialEq, Eq, Hash)] pub struct QueryTerm { pub original: Interned, - pub is_multiple_words: bool, + pub is_ngram: bool, pub max_nbr_typos: u8, pub is_prefix: bool, pub zero_typo: ZeroTypoTerm, @@ -318,7 +323,7 @@ pub struct ZeroTypoTerm { /// The original phrase, if any pub phrase: Option>, /// A single word equivalent to the original term, with zero typos - pub zero_typo: Option>, + pub exact: Option>, /// All the words that contain the original word as prefix pub prefix_of: BTreeSet>, /// All the synonyms of the original word or phrase @@ -341,7 +346,7 @@ pub struct TwoTypoTerm { impl ZeroTypoTerm { fn is_empty(&self) -> bool { - let ZeroTypoTerm { phrase, zero_typo, prefix_of, synonyms, use_prefix_db } = self; + let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db } = self; phrase.is_none() && zero_typo.is_none() && prefix_of.is_empty() @@ -370,12 +375,12 @@ impl QueryTerm { ) -> Self { Self { original: word_interner.insert(phrase.description(word_interner)), - is_multiple_words: false, + is_ngram: false, max_nbr_typos: 0, is_prefix: false, zero_typo: ZeroTypoTerm { phrase: Some(phrase_interner.insert(phrase)), - zero_typo: None, + exact: None, prefix_of: BTreeSet::default(), synonyms: BTreeSet::default(), use_prefix_db: None, @@ -387,7 +392,7 @@ impl QueryTerm { pub fn empty(word_interner: &mut DedupInterner, original: &str) -> Self { Self { original: word_interner.insert(original.to_owned()), - is_multiple_words: false, + is_ngram: false, is_prefix: false, max_nbr_typos: 0, zero_typo: <_>::default(), @@ -606,11 +611,12 @@ fn partially_initialized_term_from_word( Some(ctx.phrase_interner.insert(Phrase { words })) }) .collect(); - let zero_typo = ZeroTypoTerm { phrase: None, zero_typo, prefix_of, synonyms, use_prefix_db }; + let zero_typo = + ZeroTypoTerm { phrase: None, exact: zero_typo, prefix_of, synonyms, use_prefix_db }; Ok(QueryTerm { original: word_interned, - is_multiple_words: false, + is_ngram: false, max_nbr_typos: max_typo, is_prefix, zero_typo, @@ -765,7 +771,7 @@ fn split_best_frequency( impl QueryTerm { /// Return the original word from the given query term pub fn original_single_word(&self) -> Option> { - if self.is_multiple_words { + if self.is_ngram { None } else { Some(self.original) @@ -1039,7 +1045,7 
@@ pub fn make_ngram(
     let term = QueryTerm {
         original,
-        is_multiple_words: true,
+        is_ngram: true,
         is_prefix,
         max_nbr_typos,
         zero_typo: term.zero_typo,

From 4d5bc9df4c4f3145ebc72ce73d3e51325f6fba1c Mon Sep 17 00:00:00 2001
From: Louis Dureuil 
Date: Tue, 4 Apr 2023 17:07:26 +0200
Subject: [PATCH 08/21] Increase position by 8 on hard separator when building query terms

---
 milli/src/search/new/query_term.rs                          | 2 +-
 .../index_documents/extract/extract_docid_word_positions.rs | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs
index 90b03d194..005c0a2e3 100644
--- a/milli/src/search/new/query_term.rs
+++ b/milli/src/search/new/query_term.rs
@@ -907,7 +907,7 @@ pub fn located_query_terms_from_string(
                 TokenKind::Separator(separator_kind) => {
                     match separator_kind {
                         SeparatorKind::Hard => {
-                            position += 1;
+                            position += 8;
                         }
                         SeparatorKind::Soft => {
                             position += 0;
diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
index 2d51fcc1a..c362f8f1b 100644
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -153,7 +153,7 @@ fn json_to_string<'a>(value: &'a Value, buffer: &'a mut String) -> Option<&'a st
 /// take an iterator on tokens and compute their relative position depending on separator kinds
 /// if it's an `Hard` separator we add an additional relative proximity of 8 between words,
-/// else we keep the standart proximity of 1 between words.
+/// else we keep the standard proximity of 1 between words.
 fn process_tokens<'a>(
     tokens: impl Iterator<Item = Token<'a>>,
 ) -> impl Iterator<Item = (usize, Token<'a>)> {

From 3951fe22ab72e9d9e44498c9d95ad29a0449a8dc Mon Sep 17 00:00:00 2001
From: Louis Dureuil 
Date: Tue, 4 Apr 2023 17:09:32 +0200
Subject: [PATCH 09/21] Add ExactTerm and helper method

---
 milli/src/search/new/query_term.rs | 37 ++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs
index 005c0a2e3..4e3922980 100644
--- a/milli/src/search/new/query_term.rs
+++ b/milli/src/search/new/query_term.rs
@@ -4,6 +4,7 @@ use std::ops::{ControlFlow, RangeInclusive};
 
 use charabia::normalizer::NormalizedTokenIter;
 use charabia::{SeparatorKind, TokenKind};
+use either::Either;
 use fst::automaton::Str;
 use fst::{Automaton, IntoStreamer, Streamer};
 use heed::types::DecodeIgnore;
@@ -138,7 +139,43 @@ pub struct LocatedQueryTermSubset {
     pub term_ids: RangeInclusive<u8>,
 }
 
+#[derive(Clone, Copy)]
+pub enum ExactTerm {
+    Phrase(Interned<Phrase>),
+    Word(Interned<String>),
+}
+
+impl ExactTerm {
+    pub fn interned_words<'ctx>(
+        &self,
+        ctx: &'ctx SearchContext<'ctx>,
+    ) -> impl Iterator<Item = Option<Interned<String>>> + 'ctx {
+        match *self {
+            ExactTerm::Phrase(phrase) => {
+                let phrase = ctx.phrase_interner.get(phrase);
+                Either::Left(phrase.words.iter().copied())
+            }
+            ExactTerm::Word(word) => Either::Right(std::iter::once(Some(word))),
+        }
+    }
+}
+
 impl QueryTermSubset {
+    pub fn exact_term(&self, ctx: &SearchContext) -> Option<ExactTerm> {
+        let full_query_term = ctx.term_interner.get(self.original);
+        if full_query_term.is_ngram {
+            return None;
+        }
+        // TODO: included in subset
+        if let Some(phrase) = full_query_term.zero_typo.phrase {
+            self.zero_typo_subset.contains_phrase(phrase).then_some(ExactTerm::Phrase(phrase))
+        } else if let Some(word) = full_query_term.zero_typo.exact {
+            self.zero_typo_subset.contains_word(word).then_some(ExactTerm::Word(word))
+        } else {
+            None
+        }
+    }
+
     pub fn empty(for_term: Interned<QueryTerm>) -> Self {
         Self {
             original: for_term,

From 4b4ffb8ec993729fb53467a2899b198a14d320f9 Mon Sep 17 00:00:00 2001
From: Louis Dureuil 
Date: Tue, 4 Apr 2023 17:12:07 +0200
Subject: [PATCH 10/21] Add exactness ranking rules

---
 milli/src/search/new/exact_attribute.rs       | 175 ++++++++++++++++++
 .../search/new/graph_based_ranking_rule.rs    |  10 +-
 milli/src/search/new/mod.rs                   |  12 +-
 .../new/ranking_rule_graph/exactness/mod.rs   | 107 +++++++++++
 .../src/search/new/ranking_rule_graph/mod.rs  |   3 +
 5 files changed, 301 insertions(+), 6 deletions(-)
 create mode 100644 milli/src/search/new/exact_attribute.rs
 create mode 100644 milli/src/search/new/ranking_rule_graph/exactness/mod.rs

diff --git a/milli/src/search/new/exact_attribute.rs b/milli/src/search/new/exact_attribute.rs
new file mode 100644
index 000000000..bb6299e28
--- /dev/null
+++ b/milli/src/search/new/exact_attribute.rs
@@ -0,0 +1,175 @@
+use heed::BytesDecode;
+use roaring::MultiOps;
+
+use super::query_graph::QueryGraph;
+use super::ranking_rules::{RankingRule, RankingRuleOutput};
+use crate::search::new::query_graph::QueryNodeData;
+use crate::search::new::query_term::ExactTerm;
+use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger};
+
+/// FIXME:
+///
+/// - A lot of work done in next_bucket that start_iteration could do.
+/// - Consider calling the graph-based rule directly from this one.
+/// - currently we only handle exact terms, don't forget about prefixes
+/// - some tests
+pub struct ExactAttribute {
+    query_graph: Option<QueryGraph>,
+}
+
+impl ExactAttribute {
+    pub fn new() -> Self {
+        Self { query_graph: None }
+    }
+}
+
+impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute {
+    fn id(&self) -> String {
+        "exact_attribute".to_owned()
+    }
+
+    fn start_iteration(
+        &mut self,
+        _ctx: &mut SearchContext<'ctx>,
+        _logger: &mut dyn SearchLogger<QueryGraph>,
+        _universe: &roaring::RoaringBitmap,
+        query: &QueryGraph,
+    ) -> Result<()> {
+        self.query_graph = Some(query.clone());
+        Ok(())
+    }
+
+    fn next_bucket(
+        &mut self,
+        ctx: &mut SearchContext<'ctx>,
+        _logger: &mut dyn SearchLogger<QueryGraph>,
+        universe: &roaring::RoaringBitmap,
+    ) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
+        // iterate on the nodes of the graph, retain LocatedQueryTermSubset
+        let query_graph = self.query_graph.as_ref().unwrap();
+        let mut exact_term_position_ids: Vec<(ExactTerm, u16, u8)> =
+            Vec::with_capacity(query_graph.nodes.len() as usize);
+        for (_, node) in query_graph.nodes.iter() {
+            match &node.data {
+                QueryNodeData::Term(term) => {
+                    let exact_term = if let Some(exact_term) = term.term_subset.exact_term(ctx) {
+                        exact_term
+                    } else {
+                        // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules
+                        return Ok(Some(RankingRuleOutput {
+                            query: query_graph.clone(),
+                            candidates: universe.clone(),
+                        }));
+                    };
+                    exact_term_position_ids.push((
+                        exact_term,
+                        *term.positions.start(),
+                        *term.term_ids.start(),
+                    ))
+                }
+                QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue,
+            }
+        }
+
+        exact_term_position_ids.sort_by_key(|(_, _, id)| *id);
+        // bail if there is a "hole" (missing word) in remaining query graph
+        let mut previous_id = 0;
+        for (_, _, id) in exact_term_position_ids.iter().copied() {
+            if id < previous_id || id - previous_id > 1 {
+                // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules
+                return Ok(Some(RankingRuleOutput {
+                    query: query_graph.clone(),
+                    candidates: universe.clone(),
+                }));
+            } else {
+                previous_id = id;
+            }
+        }
+
+        // sample query: "sunflower are pretty"
+        // sunflower at pos 0 in attr A
+        // are at pos 1 in attr B
+        // pretty at pos 2 in attr C
+        // We want to eliminate such documents
+
+        // first check that for each term, there exists some attribute that has this term at the correct position
+        //"word-position-docids";
+        let mut candidates = universe.clone();
+        let words_positions: Vec<(Vec<_>, _)> = exact_term_position_ids
+            .iter()
+            .copied()
+            .map(|(term, position, _)| (term.interned_words(ctx).collect(), position))
+            .collect();
+        for (words, position) in &words_positions {
+            if candidates.is_empty() {
+                // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules
+                return Ok(Some(RankingRuleOutput {
+                    query: query_graph.clone(),
+                    candidates: universe.clone(),
+                }));
+            }
+
+            'words: for (offset, word) in words.iter().enumerate() {
+                let offset = offset as u16;
+                let word = if let Some(word) = word {
+                    word
+                } else {
+                    continue 'words;
+                };
+                let word_position_docids = CboRoaringBitmapCodec::bytes_decode(
+                    ctx.get_db_word_position_docids(*word, position + offset)?.unwrap_or_default(),
+                )
+                .unwrap_or_default();
+                candidates &= word_position_docids;
+            }
+        }
+
+        let candidates = candidates;
+
+        if candidates.is_empty() {
+            // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules
+            return Ok(Some(RankingRuleOutput {
+                query: query_graph.clone(),
+                candidates: universe.clone(),
+            }));
+        }
+
+        let searchable_fields_ids = ctx.index.searchable_fields_ids(ctx.txn)?.unwrap_or_default();
+
+        let mut candidates_per_attributes = Vec::with_capacity(searchable_fields_ids.len());
+
+        // then check that there exists at least one attribute that has all of the terms
+        for fid in searchable_fields_ids {
+            let mut intersection = MultiOps::intersection(
+                words_positions
+                    .iter()
+                    .flat_map(|(words, ..)| words.iter())
+                    // ignore stop words in phrases
+                    .flatten()
+                    .map(|word| -> Result<_> {
+                        Ok(ctx
+                            .get_db_word_fid_docids(*word, fid)?
+                            .map(CboRoaringBitmapCodec::bytes_decode)
+                            .unwrap_or_default()
+                            .unwrap_or_default())
+                    }),
+            )?;
+            intersection &= &candidates;
+            if !intersection.is_empty() {
+                candidates_per_attributes.push(intersection);
+            }
+        }
+        // note: we could have "false positives" where different attributes collectively have the
+        // terms in the correct order, while a single attribute has all the terms but in the incorrect order.
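+        // Illustrative sketch of such a false positive (hypothetical documents, not
+        // part of this patch): for the query `quick brown fox`, a document with
+        //   attr A: "quick brown"     -> `quick`@0 and `brown`@1 pass the position check
+        //   attr B: "one two fox"     -> `fox`@2 passes the position check
+        //   attr C: "fox brown quick" -> contains all the terms, so the per-fid check passes
+        // survives both filters even though no single attribute contains the query
+        // exactly and in order.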
+
+        let candidates = MultiOps::union(candidates_per_attributes.into_iter());
+        Ok(Some(RankingRuleOutput { query: query_graph.clone(), candidates }))
+    }
+
+    fn end_iteration(
+        &mut self,
+        _ctx: &mut SearchContext<'ctx>,
+        _logger: &mut dyn SearchLogger<QueryGraph>,
+    ) {
+    }
+}
diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs
index b8c58c726..28b4ed1f4 100644
--- a/milli/src/search/new/graph_based_ranking_rule.rs
+++ b/milli/src/search/new/graph_based_ranking_rule.rs
@@ -44,8 +44,8 @@ use super::interner::{Interned, MappedInterner};
 use super::logger::SearchLogger;
 use super::query_graph::QueryNode;
 use super::ranking_rule_graph::{
-    ConditionDocIdsCache, DeadEndsCache, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait,
-    TypoGraph,
+    ConditionDocIdsCache, DeadEndsCache, ExactnessGraph, ProximityGraph, RankingRuleGraph,
+    RankingRuleGraphTrait, TypoGraph,
 };
 use super::small_bitmap::SmallBitmap;
 use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext};
@@ -65,6 +65,12 @@ impl GraphBasedRankingRule<TypoGraph> {
     pub fn new(terms_matching_strategy: Option<TermsMatchingStrategy>) -> Self {
         Self::new_with_id("typo".to_owned(), terms_matching_strategy)
     }
 }
+pub type Exactness = GraphBasedRankingRule<ExactnessGraph>;
+impl GraphBasedRankingRule<ExactnessGraph> {
+    pub fn new() -> Self {
+        Self::new_with_id("exactness".to_owned(), None)
+    }
+}
 
 /// A generic graph-based ranking rule
 pub struct GraphBasedRankingRule<G: RankingRuleGraphTrait> {
diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs
index 4d561d25b..779e589b3 100644
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@@ -9,8 +9,9 @@ mod query_term;
 mod ranking_rule_graph;
 mod ranking_rules;
 mod resolve_query_graph;
-// TODO: documentation + comments
 mod small_bitmap;
+
+mod exact_attribute;
 // TODO: documentation + comments
 // implementation is currently an adaptation of the previous implementation to fit with the new model
 mod sort;
@@ -33,6 +34,8 @@ use resolve_query_graph::PhraseDocIdsCache;
 use roaring::RoaringBitmap;
 use words::Words;
 
+use self::exact_attribute::ExactAttribute;
+use self::graph_based_ranking_rule::Exactness;
 use self::interner::Interner;
 use self::ranking_rules::{BoxRankingRule, RankingRule};
 use self::resolve_query_graph::compute_query_graph_docids;
@@ -150,7 +153,7 @@ fn get_ranking_rules_for_query_graph_search<'ctx>(
     let mut proximity = false;
     let mut sort = false;
     let attribute = false;
-    let exactness = false;
+    let mut exactness = false;
     let mut asc = HashSet::new();
     let mut desc = HashSet::new();
@@ -211,8 +214,9 @@ fn get_ranking_rules_for_query_graph_search<'ctx>(
             if exactness {
                 continue;
             }
-            // todo!();
-            // exactness = false;
+            ranking_rules.push(Box::new(ExactAttribute::new()));
+            ranking_rules.push(Box::new(Exactness::new()));
+            exactness = true;
         }
         crate::Criterion::Asc(field_name) => {
             if asc.contains(&field_name) {
diff --git a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs
new file mode 100644
index 000000000..a1e19a015
--- /dev/null
+++ b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs
@@ -0,0 +1,107 @@
+use roaring::RoaringBitmap;
+
+use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
+use crate::search::new::interner::{DedupInterner, Interned, MappedInterner};
+use crate::search::new::query_graph::{QueryGraph, QueryNode};
+use crate::search::new::query_term::{ExactTerm, LocatedQueryTermSubset};
+use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger};
+
+/// - Exactness as first ranking rule: TermsMatchingStrategy? Prefer a document that matches 1 word exactly and no other
+///   word over a doc that matches 9 words inexactly but none exactly
+/// - `TermsMatchingStrategy` as a word + exactness optimization: we could consider
+///
+/// "naive vision"
+/// condition from one node to another:
+/// - word exactly present: cost 0
+/// - word typo/ngram/prefix/missing: cost 1, not removed from the query graph; the edge between the two nodes returns the universe without a condition when resolving, and the destination query term is inside
+///
+/// Three strategies:
+/// 1. ExactAttribute: word position / word_fid_docid
+/// 2. AttributeStart:
+/// 3. AttributeContainsExact => implementable via `RankingRuleGraphTrait`
+
+#[derive(Clone, PartialEq, Eq, Hash)]
+pub enum ExactnessCondition {
+    ExactInAttribute(LocatedQueryTermSubset),
+    Skip(LocatedQueryTermSubset),
+}
+
+pub enum ExactnessGraph {}
+
+fn compute_docids(
+    ctx: &mut SearchContext,
+    dest_node: &LocatedQueryTermSubset,
+    universe: &RoaringBitmap,
+) -> Result<RoaringBitmap> {
+    let exact_term = if let Some(exact_term) = dest_node.term_subset.exact_term(ctx) {
+        exact_term
+    } else {
+        return Ok(Default::default());
+    };
+    let mut candidates = match exact_term {
+        ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(phrase)?.clone(),
+        ExactTerm::Word(word) => {
+            if let Some(word_candidates) = ctx.get_db_word_docids(word)? {
+                CboRoaringBitmapCodec::deserialize_from(word_candidates)?
+            } else {
+                return Ok(Default::default());
+            }
+        }
+    };
+    // TODO: synonyms?
+    candidates &= universe;
+    Ok(candidates)
+}
+
+impl RankingRuleGraphTrait for ExactnessGraph {
+    type Condition = ExactnessCondition;
+
+    fn resolve_condition(
+        ctx: &mut SearchContext,
+        condition: &Self::Condition,
+        universe: &RoaringBitmap,
+    ) -> Result<ComputedCondition> {
+        let (docids, dest_node) = match condition {
+            ExactnessCondition::ExactInAttribute(dest_node) => {
+                (compute_docids(ctx, dest_node, universe)?, dest_node)
+            }
+            ExactnessCondition::Skip(dest_node) => (universe.clone(), dest_node),
+        };
+        Ok(ComputedCondition {
+            docids,
+            universe_len: universe.len(),
+            start_term_subset: None,
+            end_term_subset: dest_node.clone(),
+        })
+    }
+
+    fn build_edges(
+        _ctx: &mut SearchContext,
+        conditions_interner: &mut DedupInterner<Self::Condition>,
+        _source_node: Option<&LocatedQueryTermSubset>,
+        dest_node: &LocatedQueryTermSubset,
+    ) -> Result<Vec<(u32, Interned<Self::Condition>)>> {
+        let exact_condition = ExactnessCondition::ExactInAttribute(dest_node.clone());
+        let exact_condition = conditions_interner.insert(exact_condition);
+
+        let skip_condition = ExactnessCondition::Skip(dest_node.clone());
+        let skip_condition = conditions_interner.insert(skip_condition);
+        Ok(vec![(0, exact_condition), (1, skip_condition)])
+    }
+
+    fn log_state(
+        graph: &RankingRuleGraph<Self>,
+        paths: &[Vec<Interned<Self::Condition>>],
+        dead_ends_cache: &DeadEndsCache<Self::Condition>,
+        universe: &RoaringBitmap,
+        costs: &MappedInterner<QueryNode, Vec<u64>>,
+        cost: u64,
+        logger: &mut dyn SearchLogger<QueryGraph>,
+    ) {
+        todo!()
+    }
+
+    fn label_for_condition(ctx: &mut SearchContext, condition: &Self::Condition) -> Result<String> {
+        todo!()
+    }
+}
diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs
index 7c40008c8..936c3e942 100644
--- a/milli/src/search/new/ranking_rule_graph/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/mod.rs
@@ -10,6 +10,8 @@ mod cheapest_paths;
 mod condition_docids_cache;
 mod dead_ends_cache;
+/// Implementation of the `exactness` ranking rule
+mod exactness;
 /// Implementation of the `proximity` ranking rule
 mod proximity;
 /// Implementation of the `typo` ranking rule
 mod typo;
@@ -20,6 
+22,7 @@ use std::hash::Hash; pub use cheapest_paths::PathVisitor; pub use condition_docids_cache::ConditionDocIdsCache; pub use dead_ends_cache::DeadEndsCache; +pub use exactness::{ExactnessCondition, ExactnessGraph}; pub use proximity::{ProximityCondition, ProximityGraph}; use roaring::RoaringBitmap; pub use typo::{TypoCondition, TypoGraph}; From 959e4607bb11c684463ddf1149895d9aecb08a7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 4 Apr 2023 18:02:46 +0200 Subject: [PATCH 11/21] Add more search tests --- milli/src/search/new/distinct.rs | 2 +- milli/src/search/new/mod.rs | 4 +- milli/src/search/new/tests/distinct.rs | 590 ++++++++++++++++++++++++ milli/src/search/new/tests/language.rs | 22 + milli/src/search/new/tests/mod.rs | 25 + milli/src/search/new/tests/proximity.rs | 0 milli/src/search/new/tests/sort.rs | 316 +++++++++++++ 7 files changed, 957 insertions(+), 2 deletions(-) create mode 100644 milli/src/search/new/tests/distinct.rs create mode 100644 milli/src/search/new/tests/language.rs create mode 100644 milli/src/search/new/tests/proximity.rs create mode 100644 milli/src/search/new/tests/sort.rs diff --git a/milli/src/search/new/distinct.rs b/milli/src/search/new/distinct.rs index ad4b46659..7b77adf49 100644 --- a/milli/src/search/new/distinct.rs +++ b/milli/src/search/new/distinct.rs @@ -41,7 +41,7 @@ pub fn apply_distinct_rule( } /// Apply the distinct rule defined by [`apply_distinct_rule`] for a single document id. -fn distinct_single_docid( +pub fn distinct_single_docid( index: &Index, txn: &RoTxn, field_id: u16, diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 4456d693d..e7e38fe89 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -1,3 +1,4 @@ +mod bucket_sort; mod db_cache; mod distinct; mod graph_based_ranking_rule; @@ -31,7 +32,8 @@ pub use logger::detailed::DetailedSearchLogger; pub use logger::{DefaultSearchLogger, SearchLogger}; use query_graph::{QueryGraph, QueryNode}; use query_term::{located_query_terms_from_string, Phrase, QueryTerm}; -use ranking_rules::{bucket_sort, PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait}; +use ranking_rules::{PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait}; +use bucket_sort::bucket_sort; use resolve_query_graph::PhraseDocIdsCache; use roaring::RoaringBitmap; use words::Words; diff --git a/milli/src/search/new/tests/distinct.rs b/milli/src/search/new/tests/distinct.rs new file mode 100644 index 000000000..4073cf585 --- /dev/null +++ b/milli/src/search/new/tests/distinct.rs @@ -0,0 +1,590 @@ +/*! +This module tests the "distinct attribute" feature, and its +interaction with other ranking rules. + +1. no duplicate distinct attributes are ever returned +2. only the best document (according to the search rules) for each distinct value appears in the result +3. 
if a document does not have a distinct attribute, then the distinct rule does not apply to it + +It doesn't test properly: +- combination of distinct + exhaustive_nbr_hits (because we know it's incorrect) +- distinct attributes with arrays (because we know it's incorrect as well) +*/ + +use std::collections::HashSet; + +use big_s::S; +use heed::RoTxn; +use maplit::hashset; + +use crate::{ + index::tests::TempIndex, AscDesc, Criterion, Index, Member, Search, SearchResult, + TermsMatchingStrategy, +}; + +use super::collect_field_values; + +fn create_index() -> TempIndex { + let index = TempIndex::new(); + + index + .update_settings(|s| { + s.set_primary_key("id".to_owned()); + s.set_searchable_fields(vec!["text".to_owned()]); + s.set_sortable_fields(hashset! { S("rank1"), S("letter") }); + s.set_distinct_field("letter".to_owned()); + s.set_criteria(vec![Criterion::Words]); + }) + .unwrap(); + + index + .add_documents(documents!([ + { + "id": 0, + "letter": "A", + "rank1": 0, + "text": "the quick brown fox jamps over the lazy dog", + }, + { + "id": 1, + "letter": "A", + "rank1": 1, + "text": "the quick brown fox jumpes over the lazy dog", + }, + { + "id": 2, + "letter": "B", + "rank1": 0, + "text": "the quick brown foxjumps over the lazy dog", + }, + { + "id": 3, + "letter": "B", + "rank1": 1, + "text": "the quick brown fox jumps over the lazy dog", + }, + { + "id": 4, + "letter": "B", + "rank1": 2, + "text": "the quick brown fox jumps over the lazy", + }, + { + "id": 5, + "letter": "C", + "rank1": 0, + "text": "the quickbrownfox jumps over the lazy", + }, + { + "id": 6, + "letter": "C", + "rank1": 1, + "text": "the quick brown fox jumpss over the lazy", + }, + { + "id": 7, + "letter": "C", + "rank1": 2, + "text": "the quick brown fox jumps over the lazy", + }, + { + "id": 8, + "letter": "D", + "rank1": 0, + "text": "the quick brown fox jumps over the lazy", + }, + { + "id": 9, + "letter": "E", + "rank1": 0, + "text": "the quick brown fox jumps over the lazy", + }, + { + "id": 10, + "letter": "E", + "rank1": 1, + "text": "the quackbrown foxjunps over", + }, + { + "id": 11, + "letter": "E", + "rank1": 2, + "text": "the quicko browno fox junps over", + }, + { + "id": 12, + "letter": "E", + "rank1": 3, + "text": "the quicko browno fox jumps over", + }, + { + "id": 13, + "letter": "E", + "rank1": 4, + "text": "the quick brewn fox jumps over", + }, + { + "id": 14, + "letter": "E", + "rank1": 5, + "text": "the quick brown fox jumps over", + }, + { + "id": 15, + "letter": "F", + "rank1": 0, + "text": "the quick brownf fox jumps over", + }, + { + "id": 16, + "letter": "F", + "rank1": 1, + "text": "the quic brown fox jamps over", + }, + { + "id": 17, + "letter": "F", + "rank1": 2, + "text": "thequick browns fox jimps", + }, + { + "id": 18, + "letter": "G", + "rank1": 0, + "text": "the qick brown fox jumps", + }, + { + "id": 19, + "letter": "G", + "rank1": 1, + "text": "the quick brownfoxjumps", + }, + { + "id": 20, + "letter": "H", + "rank1": 0, + "text": "the quick brow fox jumps", + }, + { + "id": 21, + "letter": "I", + "rank1": 0, + "text": "the quick brown fox jpmps", + }, + { + "id": 22, + "letter": "I", + "rank1": 1, + "text": "the quick brown fox jumps", + }, + { + "id": 23, + "letter": "I", + "rank1": 2, + "text": "the quick", + }, + { + "id": 24, + "rank1": 0, + "text": "the quick", + }, + { + "id": 25, + "rank1": 1, + "text": "the quick brown", + }, + { + "id": 26, + "rank1": 2, + "text": "the quick brown fox", + }, + { + "id": 26, + "rank1": 3, + "text": "the quick brown fox jumps over the 
lazy dog", + }, + ])) + .unwrap(); + index +} + +fn verify_distinct(index: &Index, txn: &RoTxn, docids: &[u32]) -> Vec { + let vs = collect_field_values(index, txn, index.distinct_field(txn).unwrap().unwrap(), docids); + + let mut unique = HashSet::new(); + for v in vs.iter() { + if v == "__does_not_exist__" { + continue; + } + assert!(unique.insert(v.clone())); + } + + vs +} + +#[test] +fn test_distinct_placeholder_no_ranking_rules() { + let index = create_index(); + + let txn = index.read_txn().unwrap(); + + let s = Search::new(&txn, &index); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 5, 8, 9, 15, 18, 20, 21, 24, 25, 26]"); + let distinct_values = verify_distinct(&index, &txn, &documents_ids); + insta::assert_debug_snapshot!(distinct_values, @r###" + [ + "\"A\"", + "\"B\"", + "\"C\"", + "\"D\"", + "\"E\"", + "\"F\"", + "\"G\"", + "\"H\"", + "\"I\"", + "__does_not_exist__", + "__does_not_exist__", + "__does_not_exist__", + ] + "###); +} + +#[test] +fn test_distinct_placeholder_sort() { + let index = create_index(); + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Sort]); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.sort_criteria(vec![AscDesc::Desc(Member::Field(S("rank1")))]); + + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 26, 4, 7, 17, 23, 1, 19, 25, 8, 20, 24]"); + let distinct_values = verify_distinct(&index, &txn, &documents_ids); + insta::assert_debug_snapshot!(distinct_values, @r###" + [ + "\"E\"", + "__does_not_exist__", + "\"B\"", + "\"C\"", + "\"F\"", + "\"I\"", + "\"A\"", + "\"G\"", + "__does_not_exist__", + "\"D\"", + "\"H\"", + "__does_not_exist__", + ] + "###); + let rank_values = collect_field_values(&index, &txn, "rank1", &documents_ids); + insta::assert_debug_snapshot!(rank_values, @r###" + [ + "5", + "3", + "2", + "2", + "2", + "2", + "1", + "1", + "1", + "0", + "0", + "0", + ] + "###); + + let mut s = Search::new(&txn, &index); + s.sort_criteria(vec![AscDesc::Desc(Member::Field(S("letter")))]); + + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 20, 18, 15, 9, 8, 5, 2, 0, 24, 25, 26]"); + let distinct_values = verify_distinct(&index, &txn, &documents_ids); + insta::assert_debug_snapshot!(distinct_values, @r###" + [ + "\"I\"", + "\"H\"", + "\"G\"", + "\"F\"", + "\"E\"", + "\"D\"", + "\"C\"", + "\"B\"", + "\"A\"", + "__does_not_exist__", + "__does_not_exist__", + "__does_not_exist__", + ] + "###); + let rank_values = collect_field_values(&index, &txn, "rank1", &documents_ids); + insta::assert_debug_snapshot!(rank_values, @r###" + [ + "0", + "0", + "0", + "0", + "0", + "0", + "0", + "0", + "0", + "0", + "1", + "3", + ] + "###); + + let mut s = Search::new(&txn, &index); + s.sort_criteria(vec![ + AscDesc::Desc(Member::Field(S("letter"))), + AscDesc::Desc(Member::Field(S("rank1"))), + ]); + + let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[23, 20, 19, 17, 14, 8, 7, 4, 1, 26, 25, 24]"); + let distinct_values = verify_distinct(&index, &txn, &documents_ids); + insta::assert_debug_snapshot!(distinct_values, @r###" + [ + "\"I\"", + "\"H\"", + "\"G\"", + "\"F\"", + "\"E\"", + "\"D\"", + "\"C\"", + "\"B\"", + "\"A\"", + "__does_not_exist__", + "__does_not_exist__", + "__does_not_exist__", + ] + "###); + let rank_values = collect_field_values(&index, &txn, "rank1", &documents_ids); + insta::assert_debug_snapshot!(rank_values, @r###" + [ + "2", + "0", + "1", + "2", + "5", + "0", + "2", + "2", + "1", + "3", + "1", + "0", + ] + "###); +} + +#[test] +fn test_distinct_words() { + let index = create_index(); + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Words]); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.query("the quick brown fox jumps over the lazy dog"); + + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 26, 5, 8, 9, 15, 18, 20, 21, 25, 24]"); + let distinct_values = verify_distinct(&index, &txn, &documents_ids); + insta::assert_debug_snapshot!(distinct_values, @r###" + [ + "\"A\"", + "\"B\"", + "__does_not_exist__", + "\"C\"", + "\"D\"", + "\"E\"", + "\"F\"", + "\"G\"", + "\"H\"", + "\"I\"", + "__does_not_exist__", + "__does_not_exist__", + ] + "###); + let text_values = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(text_values, @r###" + [ + "\"the quick brown fox jamps over the lazy dog\"", + "\"the quick brown foxjumps over the lazy dog\"", + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quickbrownfox jumps over the lazy\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brownf fox jumps over\"", + "\"the qick brown fox jumps\"", + "\"the quick brow fox jumps\"", + "\"the quick brown fox jpmps\"", + "\"the quick brown\"", + "\"the quick\"", + ] + "###); +} + +#[test] +fn test_distinct_sort_words() { + let index = create_index(); + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Sort, Criterion::Words, Criterion::Desc(S("rank1"))]); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.query("the quick brown fox jumps over the lazy dog"); + s.sort_criteria(vec![AscDesc::Desc(Member::Field(S("letter")))]); + + let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[22, 20, 19, 16, 9, 8, 7, 3, 1, 26, 25, 24]"); + let distinct_values = verify_distinct(&index, &txn, &documents_ids); + insta::assert_debug_snapshot!(distinct_values, @r###" + [ + "\"I\"", + "\"H\"", + "\"G\"", + "\"F\"", + "\"E\"", + "\"D\"", + "\"C\"", + "\"B\"", + "\"A\"", + "__does_not_exist__", + "__does_not_exist__", + "__does_not_exist__", + ] + "###); + + let rank_values = collect_field_values(&index, &txn, "rank1", &documents_ids); + insta::assert_debug_snapshot!(rank_values, @r###" + [ + "1", + "0", + "1", + "1", + "0", + "0", + "2", + "1", + "1", + "3", + "1", + "0", + ] + "###); + + let text_values = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(text_values, @r###" + [ + "\"the quick brown fox jumps\"", + "\"the quick brow fox jumps\"", + "\"the quick brownfoxjumps\"", + "\"the quic brown fox jamps over\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quick brown fox jumpes over the lazy dog\"", + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quick brown\"", + "\"the quick\"", + ] + "###); +} + +#[test] +fn test_distinct_all_candidates() { + let index = create_index(); + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Sort]); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.sort_criteria(vec![AscDesc::Desc(Member::Field(S("rank1")))]); + s.exhaustive_number_hits(true); + + let SearchResult { documents_ids, candidates, .. } = s.execute().unwrap(); + let candidates = candidates.iter().collect::>(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 26, 4, 7, 17, 23, 1, 19, 25, 8, 20, 24]"); + // TODO: this is incorrect! + insta::assert_snapshot!(format!("{candidates:?}"), @"[0, 2, 5, 8, 9, 15, 18, 20, 21, 24, 25, 26]"); +} + +#[test] +fn test_distinct_typo() { + let index = create_index(); + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Words, Criterion::Typo]); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.query("the quick brown fox jumps over the lazy dog"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + + let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 26, 0, 7, 8, 9, 15, 22, 18, 20, 25, 24]"); + + let distinct_values = verify_distinct(&index, &txn, &documents_ids); + insta::assert_debug_snapshot!(distinct_values, @r###" + [ + "\"B\"", + "__does_not_exist__", + "\"A\"", + "\"C\"", + "\"D\"", + "\"E\"", + "\"F\"", + "\"I\"", + "\"G\"", + "\"H\"", + "__does_not_exist__", + "__does_not_exist__", + ] + "###); + + let text_values = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(text_values, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quick brown fox jamps over the lazy dog\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brownf fox jumps over\"", + "\"the quick brown fox jumps\"", + "\"the qick brown fox jumps\"", + "\"the quick brow fox jumps\"", + "\"the quick brown\"", + "\"the quick\"", + ] + "###); +} diff --git a/milli/src/search/new/tests/language.rs b/milli/src/search/new/tests/language.rs new file mode 100644 index 000000000..6adad748c --- /dev/null +++ b/milli/src/search/new/tests/language.rs @@ -0,0 +1,22 @@ +use crate::{index::tests::TempIndex, Search, SearchResult}; + +#[test] +fn test_kanji_language_detection() { + let index = TempIndex::new(); + + index + .add_documents(documents!([ + { "id": 0, "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" }, + { "id": 1, "title": "東京のお寿司。" }, + { "id": 2, "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" } + ])) + .unwrap(); + + let txn = index.write_txn().unwrap(); + let mut search = Search::new(&txn, &index); + + search.query("東京"); + let SearchResult { documents_ids, .. } = search.execute().unwrap(); + + assert_eq!(documents_ids, vec![1]); +} diff --git a/milli/src/search/new/tests/mod.rs b/milli/src/search/new/tests/mod.rs index eec4c62ec..0fd5013db 100644 --- a/milli/src/search/new/tests/mod.rs +++ b/milli/src/search/new/tests/mod.rs @@ -1,3 +1,28 @@ +pub mod distinct; +#[cfg(feature = "default")] +pub mod language; pub mod ngram_split_words; +pub mod proximity; +pub mod sort; pub mod typo; pub mod words_tms; + +fn collect_field_values( + index: &crate::Index, + txn: &heed::RoTxn, + fid: &str, + docids: &[u32], +) -> Vec { + let mut values = vec![]; + let fid = index.fields_ids_map(txn).unwrap().id(fid).unwrap(); + for doc in index.documents(txn, docids.iter().copied()).unwrap() { + if let Some(v) = doc.1.get(fid) { + let v: serde_json::Value = serde_json::from_slice(v).unwrap(); + let v = v.to_string(); + values.push(v); + } else { + values.push("__does_not_exist__".to_owned()); + } + } + values +} diff --git a/milli/src/search/new/tests/proximity.rs b/milli/src/search/new/tests/proximity.rs new file mode 100644 index 000000000..e69de29bb diff --git a/milli/src/search/new/tests/sort.rs b/milli/src/search/new/tests/sort.rs new file mode 100644 index 000000000..d3a952a24 --- /dev/null +++ b/milli/src/search/new/tests/sort.rs @@ -0,0 +1,316 @@ +/*! +This module tests the `sort` ranking rule: + +1. an error is returned if the sort ranking rule exists but no fields-to-sort were given at search time +2. an error is returned if the fields-to-sort are not sortable +3. it is possible to add multiple fields-to-sort at search time +4. 
custom sort ranking rules can be added to the settings, they interact with the generic `sort` ranking rule as expected +5. numbers appear before strings +6. documents with either: (1) no value, (2) null, or (3) an object for the field-to-sort appear at the end of the bucket +7. boolean values are translated to strings +8. if a field contains an array, it is sorted by the best value in the array according to the sort rule +*/ + +use big_s::S; +use maplit::hashset; + +use crate::{ + index::tests::TempIndex, search::new::tests::collect_field_values, AscDesc, Criterion, Member, + Search, SearchResult, TermsMatchingStrategy, +}; + +fn create_index() -> TempIndex { + let index = TempIndex::new(); + + index + .update_settings(|s| { + s.set_primary_key("id".to_owned()); + s.set_searchable_fields(vec!["text".to_owned()]); + s.set_sortable_fields(hashset! { S("rank"), S("vague"), S("letter") }); + s.set_criteria(vec![Criterion::Sort]); + }) + .unwrap(); + + index + .add_documents(documents!([ + { + "id": 0, + "letter": "A", + "rank": 0, + "vague": 0, + }, + { + "id": 1, + "letter": "A", + "rank": 1, + "vague": "0", + }, + { + "id": 2, + "letter": "B", + "rank": 0, + "vague": 1, + }, + { + "id": 3, + "letter": "B", + "rank": 1, + "vague": "1", + }, + { + "id": 4, + "letter": "B", + "rank": 2, + "vague": [1, 2], + }, + { + "id": 5, + "letter": "C", + "rank": 0, + "vague": [1, "2"], + }, + { + "id": 6, + "letter": "C", + "rank": 1, + }, + { + "id": 7, + "letter": "C", + "rank": 2, + "vague": null, + }, + { + "id": 8, + "letter": "D", + "rank": 0, + "vague": [null, null, ""] + }, + { + "id": 9, + "letter": "E", + "rank": 0, + "vague": "" + }, + { + "id": 10, + "letter": "E", + "rank": 1, + "vague": { + "sub": 0, + } + }, + { + "id": 11, + "letter": "E", + "rank": 2, + "vague": true, + }, + { + "id": 12, + "letter": "E", + "rank": 3, + "vague": false, + }, + { + "id": 13, + "letter": "E", + "rank": 4, + "vague": 1.5673, + }, + { + "id": 14, + "letter": "E", + "rank": 5, + }, + { + "id": 15, + "letter": "F", + "rank": 0, + }, + { + "id": 16, + "letter": "F", + "rank": 1, + }, + { + "id": 17, + "letter": "F", + "rank": 2, + }, + { + "id": 18, + "letter": "G", + "rank": 0, + }, + { + "id": 19, + "letter": "G", + "rank": 1, + }, + { + "id": 20, + "letter": "H", + "rank": 0, + "vague": true, + }, + { + "id": 21, + "letter": "I", + "rank": 0, + "vague": false, + }, + { + "id": 22, + "letter": "I", + "rank": 1, + "vague": [1.1367, "help", null] + }, + { + "id": 23, + "letter": "I", + "rank": 2, + "vague": [1.2367, "hello"] + }, + ])) + .unwrap(); + index +} + +#[test] +fn test_sort() { + let index = create_index(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.sort_criteria(vec![AscDesc::Desc(Member::Field(S("letter")))]); + + let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 22, 23, 20, 18, 19, 15, 16, 17, 9, 10, 11, 12, 13, 14, 8, 5, 6, 7, 2]"); + + let letter_values = collect_field_values(&index, &txn, "letter", &documents_ids); + insta::assert_debug_snapshot!(letter_values, @r###" + [ + "\"I\"", + "\"I\"", + "\"I\"", + "\"H\"", + "\"G\"", + "\"G\"", + "\"F\"", + "\"F\"", + "\"F\"", + "\"E\"", + "\"E\"", + "\"E\"", + "\"E\"", + "\"E\"", + "\"E\"", + "\"D\"", + "\"C\"", + "\"C\"", + "\"C\"", + "\"B\"", + ] + "###); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.sort_criteria(vec![AscDesc::Desc(Member::Field(S("rank")))]); + + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 13, 12, 4, 7, 11, 17, 23, 1, 3, 6, 10, 16, 19, 22, 0, 2, 5, 8, 9]"); + + let rank_values = collect_field_values(&index, &txn, "rank", &documents_ids); + insta::assert_debug_snapshot!(rank_values, @r###" + [ + "5", + "4", + "3", + "2", + "2", + "2", + "2", + "2", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "0", + "0", + "0", + "0", + "0", + ] + "###); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.sort_criteria(vec![AscDesc::Asc(Member::Field(S("vague")))]); + + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 4, 5, 22, 23, 13, 1, 3, 12, 21, 11, 20, 6, 7, 8, 9, 10, 14, 15]"); + + let vague_values = collect_field_values(&index, &txn, "vague", &documents_ids); + insta::assert_debug_snapshot!(vague_values, @r###" + [ + "0", + "1", + "[1,2]", + "[1,\"2\"]", + "[1.1367,\"help\",null]", + "[1.2367,\"hello\"]", + "1.5673", + "\"0\"", + "\"1\"", + "false", + "false", + "true", + "true", + "__does_not_exist___", + "null", + "[null,null,\"\"]", + "\"\"", + "{\"sub\":0}", + "__does_not_exist___", + "__does_not_exist___", + ] + "###); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.sort_criteria(vec![AscDesc::Desc(Member::Field(S("vague")))]); + + let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 13, 23, 22, 2, 5, 0, 11, 20, 12, 21, 3, 1, 6, 7, 8, 9, 10, 14, 15]"); + + let vague_values = collect_field_values(&index, &txn, "vague", &documents_ids); + insta::assert_debug_snapshot!(vague_values, @r###" + [ + "[1,2]", + "1.5673", + "[1.2367,\"hello\"]", + "[1.1367,\"help\",null]", + "1", + "[1,\"2\"]", + "0", + "true", + "true", + "false", + "false", + "\"1\"", + "\"0\"", + "__does_not_exist___", + "null", + "[null,null,\"\"]", + "\"\"", + "{\"sub\":0}", + "__does_not_exist___", + "__does_not_exist___", + ] + "###); +} From ce328c329d354aa1ae168377eecf800d18cc163a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 4 Apr 2023 18:02:16 +0200 Subject: [PATCH 12/21] Move bucket sort function to its own module and fix a bug --- milli/src/search/new/bucket_sort.rs | 195 ++++++++++++++++++++++++++ milli/src/search/new/ranking_rules.rs | 170 ---------------------- 2 files changed, 195 insertions(+), 170 deletions(-) create mode 100644 milli/src/search/new/bucket_sort.rs diff --git a/milli/src/search/new/bucket_sort.rs b/milli/src/search/new/bucket_sort.rs new file mode 100644 index 000000000..712825c31 --- /dev/null +++ b/milli/src/search/new/bucket_sort.rs @@ -0,0 +1,195 @@ +use roaring::RoaringBitmap; + +use super::logger::SearchLogger; +use super::ranking_rules::{BoxRankingRule, RankingRuleQueryTrait}; +use super::SearchContext; +use crate::search::new::distinct::{apply_distinct_rule, distinct_single_docid, DistinctOutput}; +use crate::Result; + +pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( + ctx: &mut SearchContext<'ctx>, + mut ranking_rules: Vec>, + query: &Q, + universe: &RoaringBitmap, + from: usize, + length: usize, + logger: &mut dyn SearchLogger, +) -> Result> { + logger.initial_query(query); + logger.ranking_rules(&ranking_rules); + logger.initial_universe(universe); + + let distinct_fid = if let Some(field) = ctx.index.distinct_field(ctx.txn)? { + ctx.index.fields_ids_map(ctx.txn)?.id(field) + } else { + None + }; + + if universe.len() < from as u64 { + return Ok(vec![]); + } + if ranking_rules.is_empty() { + if let Some(distinct_fid) = distinct_fid { + let mut excluded = RoaringBitmap::new(); + let mut results = vec![]; + for docid in universe.iter() { + if results.len() >= from + length { + break; + } + if excluded.contains(docid) { + continue; + } + distinct_single_docid(ctx.index, ctx.txn, distinct_fid, docid, &mut excluded)?; + results.push(docid); + } + return Ok(results); + } else { + return Ok(universe.iter().skip(from).take(length).collect()); + }; + } + + let ranking_rules_len = ranking_rules.len(); + + logger.start_iteration_ranking_rule(0, ranking_rules[0].as_ref(), query, universe); + ranking_rules[0].start_iteration(ctx, logger, universe, query)?; + + let mut ranking_rule_universes: Vec = + vec![RoaringBitmap::default(); ranking_rules_len]; + ranking_rule_universes[0] = universe.clone(); + + let mut cur_ranking_rule_index = 0; + + /// Finish iterating over the current ranking rule, yielding + /// control to the parent (or finishing the search if not possible). + /// Update the candidates accordingly and inform the logger. + macro_rules! 
back { + () => { + assert!(ranking_rule_universes[cur_ranking_rule_index].is_empty()); + logger.end_iteration_ranking_rule( + cur_ranking_rule_index, + ranking_rules[cur_ranking_rule_index].as_ref(), + &ranking_rule_universes[cur_ranking_rule_index], + ); + ranking_rule_universes[cur_ranking_rule_index].clear(); + ranking_rules[cur_ranking_rule_index].end_iteration(ctx, logger); + if cur_ranking_rule_index == 0 { + break; + } else { + cur_ranking_rule_index -= 1; + } + }; + } + + let mut results = vec![]; + let mut cur_offset = 0usize; + + /// Add the candidates to the results. Take `distinct`, `from`, `length`, and `cur_offset` + /// into account and inform the logger. + macro_rules! maybe_add_to_results { + ($candidates:expr) => { + // First apply the distinct rule on the candidates, reducing the universes if necessary + let candidates = if let Some(distinct_fid) = distinct_fid { + let DistinctOutput { remaining, excluded } = apply_distinct_rule(ctx, distinct_fid, $candidates)?; + for universe in ranking_rule_universes.iter_mut() { + *universe -= &excluded; + } + remaining + } else { + $candidates.clone() + }; + let len = candidates.len(); + // if the candidates are empty, there is nothing to do; + if !candidates.is_empty() { + // if we still haven't reached the first document to return + if cur_offset < from { + // and if no document from this bucket can be returned + if cur_offset + (candidates.len() as usize) < from { + // then just skip the bucket + logger.skip_bucket_ranking_rule( + cur_ranking_rule_index, + ranking_rules[cur_ranking_rule_index].as_ref(), + &candidates, + ); + } else { + // otherwise, skip some of the documents and add some of the rest, in order of ids + let all_candidates = candidates.iter().collect::>(); + let (skipped_candidates, candidates) = + all_candidates.split_at(from - cur_offset); + logger.skip_bucket_ranking_rule( + cur_ranking_rule_index, + ranking_rules[cur_ranking_rule_index].as_ref(), + &skipped_candidates.into_iter().collect(), + ); + let candidates = candidates + .iter() + .take(length - results.len()) + .copied() + .collect::>(); + logger.add_to_results(&candidates); + results.extend(&candidates); + } + } else { + // if we have passed the offset already, add some of the documents (up to the limit) + let candidates = + candidates.iter().take(length - results.len()).collect::>(); + logger.add_to_results(&candidates); + results.extend(&candidates); + } + } + cur_offset += len as usize; + }; + } + + while results.len() < length { + // The universe for this bucket is zero or one element, so we don't need to sort + // anything, just extend the results and go back to the parent ranking rule. + if ranking_rule_universes[cur_ranking_rule_index].len() <= 1 { + maybe_add_to_results!(&ranking_rule_universes[cur_ranking_rule_index]); + ranking_rule_universes[cur_ranking_rule_index].clear(); + back!(); + continue; + } + + let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(ctx, logger, &ranking_rule_universes[cur_ranking_rule_index])? 
else { + back!(); + continue; + }; + + logger.next_bucket_ranking_rule( + cur_ranking_rule_index, + ranking_rules[cur_ranking_rule_index].as_ref(), + &ranking_rule_universes[cur_ranking_rule_index], + &next_bucket.candidates, + ); + + debug_assert!( + ranking_rule_universes[cur_ranking_rule_index].is_superset(&next_bucket.candidates) + ); + ranking_rule_universes[cur_ranking_rule_index] -= &next_bucket.candidates; + + if cur_ranking_rule_index == ranking_rules_len - 1 + || next_bucket.candidates.len() <= 1 + || cur_offset + (next_bucket.candidates.len() as usize) < from + { + maybe_add_to_results!(&next_bucket.candidates); + continue; + } + + cur_ranking_rule_index += 1; + ranking_rule_universes[cur_ranking_rule_index] = next_bucket.candidates.clone(); + logger.start_iteration_ranking_rule( + cur_ranking_rule_index, + ranking_rules[cur_ranking_rule_index].as_ref(), + &next_bucket.query, + &ranking_rule_universes[cur_ranking_rule_index], + ); + ranking_rules[cur_ranking_rule_index].start_iteration( + ctx, + logger, + &next_bucket.candidates, + &next_bucket.query, + )?; + } + + Ok(results) +} diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs index 9dc6018e6..a771d3768 100644 --- a/milli/src/search/new/ranking_rules.rs +++ b/milli/src/search/new/ranking_rules.rs @@ -2,8 +2,6 @@ use roaring::RoaringBitmap; use super::logger::SearchLogger; use super::{QueryGraph, SearchContext}; -// use crate::search::new::sort::Sort; -use crate::search::new::distinct::{apply_distinct_rule, DistinctOutput}; use crate::Result; /// An internal trait implemented by only [`PlaceholderQuery`] and [`QueryGraph`] @@ -69,171 +67,3 @@ pub struct RankingRuleOutput { /// The allowed candidates for the child ranking rule pub candidates: RoaringBitmap, } - -pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( - ctx: &mut SearchContext<'ctx>, - mut ranking_rules: Vec>, - query: &Q, - universe: &RoaringBitmap, - from: usize, - length: usize, - logger: &mut dyn SearchLogger, -) -> Result> { - logger.initial_query(query); - logger.ranking_rules(&ranking_rules); - logger.initial_universe(universe); - - let distinct_fid = if let Some(field) = ctx.index.distinct_field(ctx.txn)? { - ctx.index.fields_ids_map(ctx.txn)?.id(field) - } else { - None - }; - - if universe.len() < from as u64 { - return Ok(vec![]); - } - - let ranking_rules_len = ranking_rules.len(); - logger.start_iteration_ranking_rule(0, ranking_rules[0].as_ref(), query, universe); - ranking_rules[0].start_iteration(ctx, logger, universe, query)?; - - let mut ranking_rule_universes: Vec = - vec![RoaringBitmap::default(); ranking_rules_len]; - ranking_rule_universes[0] = universe.clone(); - - let mut cur_ranking_rule_index = 0; - - /// Finish iterating over the current ranking rule, yielding - /// control to the parent (or finishing the search if not possible). - /// Update the candidates accordingly and inform the logger. - macro_rules! back { - () => { - assert!(ranking_rule_universes[cur_ranking_rule_index].is_empty()); - logger.end_iteration_ranking_rule( - cur_ranking_rule_index, - ranking_rules[cur_ranking_rule_index].as_ref(), - &ranking_rule_universes[cur_ranking_rule_index], - ); - ranking_rule_universes[cur_ranking_rule_index].clear(); - ranking_rules[cur_ranking_rule_index].end_iteration(ctx, logger); - if cur_ranking_rule_index == 0 { - break; - } else { - cur_ranking_rule_index -= 1; - } - }; - } - - let mut results = vec![]; - let mut cur_offset = 0usize; - - /// Add the candidates to the results. 
Take `distinct`, `from`, `length`, and `cur_offset` - /// into account and inform the logger. - macro_rules! maybe_add_to_results { - ($candidates:expr) => { - // First apply the distinct rule on the candidates, reducing the universes if necessary - let candidates = if let Some(distinct_fid) = distinct_fid { - let DistinctOutput { remaining, excluded } = apply_distinct_rule(ctx, distinct_fid, $candidates)?; - for universe in ranking_rule_universes.iter_mut() { - *universe -= &excluded; - } - remaining - } else { - $candidates.clone() - }; - let len = candidates.len(); - // if the candidates are empty, there is nothing to do; - if !candidates.is_empty() { - // if we still haven't reached the first document to return - if cur_offset < from { - // and if no document from this bucket can be returned - if cur_offset + (candidates.len() as usize) < from { - // then just skip the bucket - logger.skip_bucket_ranking_rule( - cur_ranking_rule_index, - ranking_rules[cur_ranking_rule_index].as_ref(), - &candidates, - ); - } else { - // otherwise, skip some of the documents and add some of the rest, in order of ids - let all_candidates = candidates.iter().collect::>(); - let (skipped_candidates, candidates) = - all_candidates.split_at(from - cur_offset); - logger.skip_bucket_ranking_rule( - cur_ranking_rule_index, - ranking_rules[cur_ranking_rule_index].as_ref(), - &skipped_candidates.into_iter().collect(), - ); - let candidates = candidates - .iter() - .take(length - results.len()) - .copied() - .collect::>(); - logger.add_to_results(&candidates); - results.extend(&candidates); - } - } else { - // if we have passed the offset already, add some of the documents (up to the limit) - let candidates = - candidates.iter().take(length - results.len()).collect::>(); - logger.add_to_results(&candidates); - results.extend(&candidates); - } - } - cur_offset += len as usize; - }; - } - - while results.len() < length { - // The universe for this bucket is zero or one element, so we don't need to sort - // anything, just extend the results and go back to the parent ranking rule. - if ranking_rule_universes[cur_ranking_rule_index].len() <= 1 { - maybe_add_to_results!(&ranking_rule_universes[cur_ranking_rule_index]); - ranking_rule_universes[cur_ranking_rule_index].clear(); - back!(); - continue; - } - - let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(ctx, logger, &ranking_rule_universes[cur_ranking_rule_index])? 
else { - back!(); - continue; - }; - - logger.next_bucket_ranking_rule( - cur_ranking_rule_index, - ranking_rules[cur_ranking_rule_index].as_ref(), - &ranking_rule_universes[cur_ranking_rule_index], - &next_bucket.candidates, - ); - - debug_assert!( - ranking_rule_universes[cur_ranking_rule_index].is_superset(&next_bucket.candidates) - ); - ranking_rule_universes[cur_ranking_rule_index] -= &next_bucket.candidates; - - if cur_ranking_rule_index == ranking_rules_len - 1 - || next_bucket.candidates.len() <= 1 - || cur_offset + (next_bucket.candidates.len() as usize) < from - { - maybe_add_to_results!(&next_bucket.candidates); - continue; - } - - cur_ranking_rule_index += 1; - ranking_rule_universes[cur_ranking_rule_index] = next_bucket.candidates.clone(); - logger.start_iteration_ranking_rule( - cur_ranking_rule_index, - ranking_rules[cur_ranking_rule_index].as_ref(), - &next_bucket.query, - &ranking_rule_universes[cur_ranking_rule_index], - ); - ranking_rules[cur_ranking_rule_index].start_iteration( - ctx, - logger, - &next_bucket.candidates, - &next_bucket.query, - )?; - } - - Ok(results) -} From c69cbec64a516629e22029737b35cf3dae10c8c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 5 Apr 2023 11:20:04 +0200 Subject: [PATCH 13/21] Add more search tests --- milli/src/search/new/tests/language.rs | 2 +- .../src/search/new/tests/ngram_split_words.rs | 141 +++++++- milli/src/search/new/tests/proximity.rs | 317 ++++++++++++++++++ milli/src/search/new/tests/typo.rs | 148 +++++++- milli/src/search/new/tests/words_tms.rs | 185 +++++++++- 5 files changed, 766 insertions(+), 27 deletions(-) diff --git a/milli/src/search/new/tests/language.rs b/milli/src/search/new/tests/language.rs index 6adad748c..e16544fdb 100644 --- a/milli/src/search/new/tests/language.rs +++ b/milli/src/search/new/tests/language.rs @@ -18,5 +18,5 @@ fn test_kanji_language_detection() { search.query("東京"); let SearchResult { documents_ids, .. } = search.execute().unwrap(); - assert_eq!(documents_ids, vec![1]); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1]"); } diff --git a/milli/src/search/new/tests/ngram_split_words.rs b/milli/src/search/new/tests/ngram_split_words.rs index 06c49274c..b78bbe763 100644 --- a/milli/src/search/new/tests/ngram_split_words.rs +++ b/milli/src/search/new/tests/ngram_split_words.rs @@ -16,7 +16,10 @@ This module tests the following properties: 13. Ngrams cannot be formed by combining a phrase and a word or two phrases */ -use crate::{index::tests::TempIndex, Criterion, Search, SearchResult, TermsMatchingStrategy}; +use crate::{ + index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search, + SearchResult, TermsMatchingStrategy, +}; fn create_index() -> TempIndex { let index = TempIndex::new(); @@ -46,6 +49,14 @@ fn create_index() -> TempIndex { { "id": 3, "text": "the sunflower is tall" + }, + { + "id": 4, + "text": "the sunflawer is tall" + }, + { + "id": 5, + "text": "sunflowering is not a verb" } ])) .unwrap(); @@ -67,8 +78,18 @@ fn test_2gram_simple() { s.terms_matching_strategy(TermsMatchingStrategy::All); s.query("sun flower"); let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); - // will also match documents with "sun flower" - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3]"); + // will also match documents with "sunflower" + prefix tolerance + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3, 5]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the sun flowers are pretty\"", + "\"the sun flower is tall\"", + "\"the sunflowers are pretty\"", + "\"the sunflower is tall\"", + "\"sunflowering is not a verb\"", + ] + "###); } #[test] fn test_3gram_simple() { @@ -87,6 +108,13 @@ fn test_3gram_simple() { let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the sun flowers are pretty\"", + "\"the sunflowers are pretty\"", + ] + "###); } #[test] @@ -99,7 +127,18 @@ fn test_2gram_typo() { s.query("sun flawer"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3, 4, 5]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the sun flowers are pretty\"", + "\"the sun flower is tall\"", + "\"the sunflowers are pretty\"", + "\"the sunflower is tall\"", + "\"the sunflawer is tall\"", + "\"sunflowering is not a verb\"", + ] + "###); } #[test] @@ -119,6 +158,13 @@ fn test_no_disable_ngrams() { let SearchResult { documents_ids, .. } = s.execute().unwrap(); // documents containing `sunflower` insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 3]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the sun flower is tall\"", + "\"the sunflower is tall\"", + ] + "###); } #[test] @@ -137,7 +183,17 @@ fn test_2gram_prefix() { s.query("sun flow"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); // documents containing words beginning with `sunflow` - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3, 5]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the sun flowers are pretty\"", + "\"the sun flower is tall\"", + "\"the sunflowers are pretty\"", + "\"the sunflower is tall\"", + "\"sunflowering is not a verb\"", + ] + "###); } #[test] @@ -157,7 +213,16 @@ fn test_3gram_prefix() { let SearchResult { documents_ids, .. } = s.execute().unwrap(); // documents containing a word beginning with sunfl - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 4, 5]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the sunflowers are pretty\"", + "\"the sunflower is tall\"", + "\"the sunflawer is tall\"", + "\"sunflowering is not a verb\"", + ] + "###); } #[test] @@ -170,8 +235,17 @@ fn test_split_words() { s.query("sunflower "); let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); - // all the documents with either `sunflower` or `sun flower` - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 2, 3]"); + // all the documents with either `sunflower` or `sun flower` + eventual typo + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 2, 3, 4]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the sun flower is tall\"", + "\"the sunflowers are pretty\"", + "\"the sunflower is tall\"", + "\"the sunflawer is tall\"", + ] + "###); } #[test] @@ -191,6 +265,12 @@ fn test_disable_split_words() { let SearchResult { documents_ids, .. } = s.execute().unwrap(); // no document containing `sun flower` insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the sunflower is tall\"", + ] + "###); } #[test] @@ -203,8 +283,18 @@ fn test_2gram_split_words() { s.query("sunf lower"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - // all the documents with "sunflower", "sun flower", or (sunflower + 1 typo) - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 2, 3]"); + // all the documents with "sunflower", "sun flower", (sunflower + 1 typo), or (sunflower as prefix) + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 2, 3, 4, 5]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the sun flower is tall\"", + "\"the sunflowers are pretty\"", + "\"the sunflower is tall\"", + "\"the sunflawer is tall\"", + "\"sunflowering is not a verb\"", + ] + "###); } #[test] @@ -218,7 +308,15 @@ fn test_3gram_no_split_words() { let SearchResult { documents_ids, .. } = s.execute().unwrap(); // no document with `sun flower` - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 5]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the sunflowers are pretty\"", + "\"the sunflower is tall\"", + "\"sunflowering is not a verb\"", + ] + "###); } #[test] @@ -231,7 +329,13 @@ fn test_3gram_no_typos() { s.query("sunf la wer"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the sunflawer is tall\"", + ] + "###); } #[test] @@ -245,6 +349,13 @@ fn test_no_ngram_phrases() { let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the sun flowers are pretty\"", + "\"the sun flower is tall\"", + ] + "###); let mut s = Search::new(&txn, &index); s.terms_matching_strategy(TermsMatchingStrategy::All); @@ -252,4 +363,10 @@ fn test_no_ngram_phrases() { let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the sun flower is tall\"", + ] + "###); } diff --git a/milli/src/search/new/tests/proximity.rs b/milli/src/search/new/tests/proximity.rs index e69de29bb..f6e071572 100644 --- a/milli/src/search/new/tests/proximity.rs +++ b/milli/src/search/new/tests/proximity.rs @@ -0,0 +1,317 @@ +/*! +This module tests the Proximity ranking rule: + +1. A proximity of >7 always has the same cost. + +2. Phrase terms can be in proximity to other terms via their start and end words, +but we need to make sure that the phrase exists in the document that meets this +proximity condition. This is especially relevant with split words and synonyms. + +3. An ngram has the same proximity cost as its component words being consecutive. +e.g. `sunflower` equivalent to `sun flower`. + +4. The prefix databases can be used to find the proximity between two words, but +they store fewer proximities than the regular word proximity DB. + +*/ + +use std::collections::HashMap; + +use crate::{ + index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search, + SearchResult, TermsMatchingStrategy, +}; + +fn create_simple_index() -> TempIndex { + let index = TempIndex::new(); + + index + .update_settings(|s| { + s.set_primary_key("id".to_owned()); + s.set_searchable_fields(vec!["text".to_owned()]); + s.set_criteria(vec![Criterion::Words, Criterion::Proximity]); + }) + .unwrap(); + + index + .add_documents(documents!([ + { + "id": 0, + "text": "the very quick dark brown and smart fox did jump over the terribly lazy and small dog" + }, + { + "id": 1, + "text": "the. quick brown fox jumps over the lazy. dog" + }, + { + "id": 2, + "text": "the quick brown fox jumps over the lazy. 
dog" + }, + { + "id": 3, + "text": "dog the quick brown fox jumps over the lazy" + }, + { + "id": 4, + "text": "the quickbrown fox jumps over the lazy dog" + }, + { + "id": 5, + "text": "brown quick fox jumps over the lazy dog" + }, + { + "id": 6, + "text": "the really quick brown fox jumps over the very lazy dog" + }, + { + "id": 7, + "text": "the really quick brown fox jumps over the lazy dog" + }, + { + "id": 8, + "text": "the quick brown fox jumps over the lazy" + }, + { + "id": 9, + "text": "the quack brown fox jumps over the lazy" + }, + { + "id": 9, + "text": "the quack brown fox jumps over the lazy dog" + }, + { + "id": 10, + "text": "the quick brown fox jumps over the lazy dog" + } + ])) + .unwrap(); + index +} + +fn create_edge_cases_index() -> TempIndex { + let index = TempIndex::new(); + + index + .update_settings(|s| { + s.set_primary_key("id".to_owned()); + s.set_searchable_fields(vec!["text".to_owned()]); + s.set_criteria(vec![Criterion::Words, Criterion::Proximity]); + }) + .unwrap(); + + index.add_documents(documents!([ + { + // This document will insert "s" in the prefix database + "id": 0, + "text": " + saa sab sac sae saf sag sah sai saj sak sal sam san sao sap saq sar sasa sat sau sav saw sax say saz + sba sbb sbc sbe sbf sbg sbh sbi sbj sbk sbl sbm sbn sbo sbp sbq sbr sbsb sbt sbu sbv sbw sbx sby sbz + sca scb scc sce scf scg sch sci scj sck scl scm scn sco scp scq scr scsc sct scu scv scw scx scy scz + sda sdb sdc sde sdf sdg sdh sdi sdj sdk sdl sdm sdn sdo sdp sdq sdr sdsd sdt sdu sdv sdw sdx sdy sdz + sea seb sec see sef seg seh sei sej sek sel sem sen seo sep seq ser sese set seu sev sew sex sey sez + sfa sfb sfc sfe sff sfg sfh sfi sfj sfk sfl sfm sfn sfo sfp sfq sfr sfsf sft sfu sfv sfw sfx sfy sfz + sga sgb sgc sge sgf sgg sgh sgi sgj sgk sgl sgm sgn sgo sgp sgq sgr sgsg sgt sgu sgv sgw sgx sgy sgz + ska skb skc ske skf skg skh ski skj skk skl skm skn sko skp skq skr sksk skt sku skv skw skx sky skz + sla slb slc sle slf slg slh sli slj slk sll slm sln slo slp slq slr slsl slt slu slv slw slx sly slz + sma smb smc sme smf smg smh smi smj smk sml smm smn smo smp smq smr smsm smt smu smv smw smx smy smz + sna snb snc sne snf sng snh sni snj snk snl snm snn sno snp snq snr snsn snt snu snv snw snx sny snz + soa sob soc soe sof sog soh soi soj sok sol som son soo sop soq sor soso sot sou sov sow sox soy soz + spa spb spc spe spf spg sph spi spj spk spl spm spn spo spp spq spr spsp spt spu spv spw spx spy spz + sqa sqb sqc sqe sqf sqg sqh sqi sqj sqk sql sqm sqn sqo sqp sqq sqr sqsq sqt squ sqv sqw sqx sqy sqz + sra srb src sre srf srg srh sri srj srk srl srm srn sro srp srq srr srsr srt sru srv srw srx sry srz + ssa ssb ssc sse ssf ssg ssh ssi ssj ssk ssl ssm ssn sso ssp ssq ssr ssss sst ssu ssv ssw ssx ssy ssz + sta stb stc ste stf stg sth sti stj stk stl stm stn sto stp stq str stst stt stu stv stw stx sty stz + " + }, + // The next 5 documents lay out a trap with the split word, phrase search, or synonym `sun flower`. + // If the search query is "sunflower", the split word "Sun Flower" will match some documents. + // If the query is `sunflower wilting`, then we should make sure that + // the proximity condition `flower wilting: prox N` also comes with the condition + // `sun wilting: prox N+1`. TODO: this is not the exact condition we use for now. + // We only check that the phrase `sun flower` exists and `flower wilting: prox N`, which + // is better than nothing but not the best. 
+ { + "id": 1, + "text": "Sun Flower sounds like the title of a painting, maybe about a plant wilting under the heat." + }, + { + "id": 2, + "text": "Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat." + }, + { + "id": 3, + // This document matches the query `sunflower wilting`, but the proximity condition + // between `sunflower` and `wilting` cannot be through the split-word `Sun Flower` + // which would reduce to only `flower` and `wilting` being in proximity. + "text": "A flower wilting under the sun, unlike a sunflower" + }, + { + // This should be the best document for `sunflower wilting` + "id": 4, + "text": "sun flower wilting under the heat" + }, + { + // This is also the best document for `sunflower wilting` + "id": 5, + "text": "sunflower wilting under the heat" + }, + { + // Prox MAX between `best` and `s` prefix + "id": 6, + "text": "this is the best meal I have ever had in such a beautiful summer day" + }, + { + // Prox 5 between `best` and `s` prefix + "id": 7, + "text": "this is the best cooked meal of the summer" + }, + { + // Prox 4 between `best` and `s` prefix + "id": 8, + "text": "this is the best meal of the summer" + }, + { + // Prox 3 between `best` and `s` prefix + "id": 9, + "text": "this is the best meal of summer" + }, + { + // Prox 1 between `best` and `s` prefix + "id": 10, + "text": "this is the best summer meal" + }, + { + // Reverse Prox 3 between `best` and `s` prefix + "id": 11, + "text": "summer x y best" + }, + { + // Reverse Prox 2 between `best` and `s` prefix + "id": 12, + "text": "summer x best" + }, + { + // Reverse Prox 1 between `best` and `s` prefix + "id": 13, + "text": "summer best" + }, + ])).unwrap(); + index +} + +#[test] +fn test_proximity_simple() { + let index = create_simple_index(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the quick brown fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 9, 10, 7, 6, 5, 2, 3, 0, 1]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quickbrown fox jumps over the lazy dog\"", + "\"the quack brown fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the lazy dog\"", + "\"the really quick brown fox jumps over the lazy dog\"", + "\"the really quick brown fox jumps over the very lazy dog\"", + "\"brown quick fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the lazy. dog\"", + "\"dog the quick brown fox jumps over the lazy\"", + "\"the very quick dark brown and smart fox did jump over the terribly lazy and small dog\"", + "\"the. quick brown fox jumps over the lazy. dog\"", + ] + "###); +} + +#[test] +fn test_proximity_split_word() { + let index = create_edge_cases_index(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("sunflower wilting"); + let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 5, 1, 3]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + // TODO: "2" and "4" should be swapped ideally + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"", + "\"sun flower wilting under the heat\"", + "\"sunflower wilting under the heat\"", + "\"Sun Flower sounds like the title of a painting, maybe about a plant wilting under the heat.\"", + "\"A flower wilting under the sun, unlike a sunflower\"", + ] + "###); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("\"sun flower\" wilting"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 1]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + // TODO: "2" and "4" should be swapped ideally + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"", + "\"sun flower wilting under the heat\"", + "\"Sun Flower sounds like the title of a painting, maybe about a plant wilting under the heat.\"", + ] + "###); + drop(txn); + + index + .update_settings(|s| { + let mut syns = HashMap::new(); + syns.insert("xyz".to_owned(), vec!["sun flower".to_owned()]); + s.set_synonyms(syns); + }) + .unwrap(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("xyz wilting"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 1]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + // TODO: "2" and "4" should be swapped ideally + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"", + "\"sun flower wilting under the heat\"", + "\"Sun Flower sounds like the title of a painting, maybe about a plant wilting under the heat.\"", + ] + "###); +} + +#[test] +fn test_proximity_prefix_db() { + let index = create_edge_cases_index(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("best s"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 8, 6, 7, 11]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + + // This test illustrates the loss of precision from using the prefix DB + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"this is the best summer meal\"", + "\"summer best\"", + "\"this is the best meal of summer\"", + "\"summer x best\"", + "\"this is the best meal of the summer\"", + "\"this is the best meal I have ever had in such a beautiful summer day\"", + "\"this is the best cooked meal of the summer\"", + "\"summer x y best\"", + ] + "###); +} diff --git a/milli/src/search/new/tests/typo.rs b/milli/src/search/new/tests/typo.rs index 6ac8f5516..4df340e9b 100644 --- a/milli/src/search/new/tests/typo.rs +++ b/milli/src/search/new/tests/typo.rs @@ -21,8 +21,8 @@ if `words` doesn't exist before it. 
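// The tests below exercise the default typo budget. As an illustrative sketch
// (assuming the default thresholds of one typo from 5 bytes and two typos from
// 9 bytes; `default_typo_budget` is a hypothetical helper, not part of this
// patch), the budget they assert looks like this:
fn default_typo_budget(word: &str) -> u8 {
    match word.len() {
        0..=4 => 0, // "the", "fox": no typo accepted
        5..=8 => 1, // "quick" still matches "quivk" (one replaced letter)
        _ => 2,     // "extravagant" still matches "extravant" (two missing letters)
    }
}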
use std::collections::HashMap; use crate::{ - index::tests::TempIndex, Criterion, - Search, SearchResult, TermsMatchingStrategy, + index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search, + SearchResult, TermsMatchingStrategy, }; fn create_index() -> TempIndex { @@ -130,6 +130,10 @@ fn create_index() -> TempIndex { "id": 22, "text": "the quick brown fox jumps over the lackadaisical dog" }, + { + "id": 23, + "text": "the quivk brown fox jumps over the lazy dog" + }, ])) .unwrap(); index @@ -151,6 +155,12 @@ fn test_no_typo() { s.query("the quick brown fox jumps over the lazy dog"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + ] + "###); } #[test] @@ -168,7 +178,14 @@ fn test_default_typo() { s.terms_matching_strategy(TermsMatchingStrategy::All); s.query("the quick brown fox jumps over the lazy dog"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 23]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quivk brown fox jumps over the lazy dog\"", + ] + "###); // 1 typo on one word, replaced letter let mut s = Search::new(&txn, &index); @@ -176,6 +193,12 @@ fn test_default_typo() { s.query("the quack brown fox jumps over the lazy dog"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + ] + "###); // 1 typo on one word, missing letter, extra letter let mut s = Search::new(&txn, &index); @@ -183,6 +206,12 @@ fn test_default_typo() { s.query("the quicest brownest fox jummps over the laziest dog"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quickest brownest fox jumps over the laziest dog\"", + ] + "###); // 1 typo on one word, swapped letters let mut s = Search::new(&txn, &index); @@ -190,6 +219,12 @@ fn test_default_typo() { s.query("the quikc borwn fox jupms over the lazy dog"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + ] + "###); // 1 first letter typo on a word <5 bytes, replaced letter let mut s = Search::new(&txn, &index); @@ -211,6 +246,12 @@ fn test_default_typo() { s.query("the quack brawn fox junps over the lazy dog"); let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + ] + "###); // 2 typos on words < 9 bytes let mut s = Search::new(&txn, &index); @@ -225,6 +266,12 @@ fn test_default_typo() { s.query("the extravant fox kyrocketed over the lamguorout dog"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the extravagant fox skyrocketed over the languorous dog\"", + ] + "###); // 2 typos on words >= 9 bytes: 2 extra letters in a single word, swapped letters + extra letter, replaced letters let mut s = Search::new(&txn, &index); @@ -232,6 +279,12 @@ fn test_default_typo() { s.query("the extravaganttt fox sktyrocnketed over the lagnuorrous dog"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the extravagant fox skyrocketed over the languorous dog\"", + ] + "###); } #[test] @@ -244,6 +297,8 @@ fn test_phrase_no_typo_allowed() { s.query("the \"quick brewn\" fox jumps over the lazy dog"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @"[]"); } #[test] @@ -256,12 +311,20 @@ fn test_ngram_typos() { s.query("the extra lagant fox skyrocketed over the languorous dog"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the extravagant fox skyrocketed over the languorous dog\"", + ] + "###); let mut s = Search::new(&txn, &index); s.terms_matching_strategy(TermsMatchingStrategy::All); s.query("the ex tra lagant fox skyrocketed over the languorous dog"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @"[]"); } #[test] fn test_typo_ranking_rule_not_preceded_by_words_ranking_rule() { @@ -278,7 +341,29 @@ fn test_typo_ranking_rule_not_preceded_by_words_ranking_rule() { s.terms_matching_strategy(TermsMatchingStrategy::Last); s.query("the quick brown fox jumps over the lazy dog"); let SearchResult { documents_ids: ids_1, .. 
} = s.execute().unwrap(); - insta::assert_snapshot!(format!("{ids_1:?}"), @"[0, 7, 8, 9, 10, 11, 1, 2, 12, 13, 4, 3, 5, 6, 21]"); + insta::assert_snapshot!(format!("{ids_1:?}"), @"[0, 23, 7, 8, 9, 22, 10, 11, 1, 2, 12, 13, 4, 3, 5, 6, 21]"); + let texts = collect_field_values(&index, &txn, "text", &ids_1); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quivk brown fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over the\"", + "\"the quick brown fox jumps over\"", + "\"the quick brown fox jumps over the lackadaisical dog\"", + "\"the quick brown fox jumps\"", + "\"the quick brown fox\"", + "\"the quick brown foxes jump over the lazy dog\"", + "\"the quick brown fax sends a letter to the dog\"", + "\"the quick brown\"", + "\"the quick\"", + "\"a fox doesn't quack, that crown goes to the duck.\"", + "\"the quickest brownest fox jumps over the laziest dog\"", + "\"the quicker browner fox jumped over the lazier dog\"", + "\"the extravagant fox skyrocketed over the languorous dog\"", + "\"the fast brownish fox jumps over the lackadaisical dog\"", + ] + "###); index .update_settings(|s| { @@ -290,7 +375,7 @@ fn test_typo_ranking_rule_not_preceded_by_words_ranking_rule() { s.terms_matching_strategy(TermsMatchingStrategy::Last); s.query("the quick brown fox jumps over the lazy dog"); let SearchResult { documents_ids: ids_2, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{ids_2:?}"), @"[0, 7, 8, 9, 10, 11, 1, 2, 12, 13, 4, 3, 5, 6, 21]"); + insta::assert_snapshot!(format!("{ids_2:?}"), @"[0, 23, 7, 8, 9, 22, 10, 11, 1, 2, 12, 13, 4, 3, 5, 6, 21]"); assert_eq!(ids_1, ids_2); } @@ -307,6 +392,17 @@ fn test_typo_bucketing() { s.query("network interconnection sunflower"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 15, 16, 17, 18, 20]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"netwolk interconections sunflawar\"", + "\"network interconnections sunflawer\"", + "\"network interconnection sunflower\"", + "\"network interconnection sun flower\"", + "\"network interconnection sunflowering\"", + "\"network interconnection sunflowar\"", + ] + "###); // Then with the typo ranking rule drop(txn); @@ -322,12 +418,34 @@ fn test_typo_bucketing() { s.query("network interconnection sunflower"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[16, 18, 17, 20, 15, 14]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"network interconnection sunflower\"", + "\"network interconnection sunflowering\"", + "\"network interconnection sun flower\"", + "\"network interconnection sunflowar\"", + "\"network interconnections sunflawer\"", + "\"netwolk interconections sunflawar\"", + ] + "###); let mut s = Search::new(&txn, &index); s.terms_matching_strategy(TermsMatchingStrategy::All); s.query("network interconnection sun flower"); let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[17, 19, 16, 18, 20, 15]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"network interconnection sun flower\"", + "\"network interconnection sun flowering\"", + "\"network interconnection sunflower\"", + "\"network interconnection sunflowering\"", + "\"network interconnection sunflowar\"", + "\"network interconnections sunflawer\"", + ] + "###); } #[test] @@ -350,7 +468,15 @@ fn test_typo_synonyms() { s.terms_matching_strategy(TermsMatchingStrategy::All); s.query("the quick brown fox jumps over the lackadaisical dog"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 0]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 22, 23]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the lackadaisical dog\"", + "\"the quivk brown fox jumps over the lazy dog\"", + ] + "###); let mut s = Search::new(&txn, &index); s.terms_matching_strategy(TermsMatchingStrategy::All); @@ -359,5 +485,13 @@ fn test_typo_synonyms() { // TODO: is this correct? interaction of ngrams + synonyms means that the // multi-word synonyms end up having a typo cost. This is probably not what we want. let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 0]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 0, 22]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the fast brownish fox jumps over the lackadaisical dog\"", + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the lackadaisical dog\"", + ] + "###); } diff --git a/milli/src/search/new/tests/words_tms.rs b/milli/src/search/new/tests/words_tms.rs index 8b5c0153f..74748ea5a 100644 --- a/milli/src/search/new/tests/words_tms.rs +++ b/milli/src/search/new/tests/words_tms.rs @@ -12,9 +12,12 @@ account by the proximity ranking rule. 7. 
The search is capable of returning no results if no documents match the query */ -use crate::{index::tests::TempIndex, Criterion, Search, SearchResult, TermsMatchingStrategy}; +use crate::{ + index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search, + SearchResult, TermsMatchingStrategy, +}; -fn create_quick_brown_fox_trivial_index() -> TempIndex { +fn create_index() -> TempIndex { let index = TempIndex::new(); index @@ -126,7 +129,7 @@ fn create_quick_brown_fox_trivial_index() -> TempIndex { #[test] fn test_words_tms_last_simple() { - let index = create_quick_brown_fox_trivial_index(); + let index = create_index(); let txn = index.read_txn().unwrap(); let mut s = Search::new(&txn, &index); @@ -136,6 +139,31 @@ fn test_words_tms_last_simple() { // 6 and 7 have the same score because "the" appears twice insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 10, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 8, 6, 7, 5, 4, 11, 12, 3]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"the brown quick fox jumps over the lazy dog\"", + "\"the mighty and quick brown fox jumps over the lazy dog\"", + "\"the great quick brown fox jumps over the lazy dog\"", + "\"this quick brown and very scary fox jumps over the lazy dog\"", + "\"this quick brown and scary fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the really lazy dog\"", + "\"the brown quick fox jumps over the really lazy dog\"", + "\"the brown quick fox immediately jumps over the really lazy dog\"", + "\"the brown quick fox immediately jumps over the really lazy blue dog\"", + "\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.\"", + "\"the, quick, brown, fox, jumps, over, the, lazy, dog\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over\"", + "\"the quick brown fox jumps over the\"", + "\"the quick brown fox jumps\"", + "\"the quick brown fox\"", + "\"the quick brown fox talks to the lazy and slow dog\"", + "\"the quick brown fox talks to the lazy dog\"", + "\"the quick brown\"", + ] + "###); let mut s = Search::new(&txn, &index); s.query("extravagant the quick brown fox jumps over the lazy dog"); @@ -146,7 +174,7 @@ fn test_words_tms_last_simple() { #[test] fn test_words_tms_last_phrase() { - let index = create_quick_brown_fox_trivial_index(); + let index = create_index(); let txn = index.read_txn().unwrap(); let mut s = Search::new(&txn, &index); @@ -156,6 +184,21 @@ fn test_words_tms_last_phrase() { // "The quick brown fox" is a phrase, not deleted by this term matching strategy insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 17, 21, 8, 6, 7, 5, 4, 11, 12]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the really lazy dog\"", + "\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. 
the lazy dog.\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over\"", + "\"the quick brown fox jumps over the\"", + "\"the quick brown fox jumps\"", + "\"the quick brown fox\"", + "\"the quick brown fox talks to the lazy and slow dog\"", + "\"the quick brown fox talks to the lazy dog\"", + ] + "###); let mut s = Search::new(&txn, &index); s.query("\"the quick brown fox\" jumps over the \"lazy\" dog"); @@ -165,6 +208,17 @@ fn test_words_tms_last_phrase() { // "lazy" is a phrase, not deleted by this term matching strategy // but words before it can be deleted insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 17, 21, 8, 11, 12]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the really lazy dog\"", + "\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox talks to the lazy and slow dog\"", + "\"the quick brown fox talks to the lazy dog\"", + ] + "###); let mut s = Search::new(&txn, &index); s.query("\"the quick brown fox jumps over the lazy dog\""); @@ -173,6 +227,12 @@ fn test_words_tms_last_phrase() { // The whole query is a phrase, no terms are removed insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + ] + "###); let mut s = Search::new(&txn, &index); s.query("\"the quick brown fox jumps over the lazy dog"); @@ -181,11 +241,17 @@ fn test_words_tms_last_phrase() { // The whole query is still a phrase, even without closing quotes, so no terms are removed insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + ] + "###); } #[test] fn test_words_proximity_tms_last_simple() { - let index = create_quick_brown_fox_trivial_index(); + let index = create_index(); index .update_settings(|s| { s.set_criteria(vec![Criterion::Words, Criterion::Proximity]); @@ -200,6 +266,31 @@ fn test_words_proximity_tms_last_simple() { // 7 is better than 6 because of the proximity between "the" and its surrounding terms insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 19, 20, 16, 15, 22, 8, 7, 6, 5, 4, 11, 12, 3]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. 
the lazy dog.\"", + "\"the great quick brown fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the really lazy dog\"", + "\"the mighty and quick brown fox jumps over the lazy dog\"", + "\"the brown quick fox jumps over the lazy dog\"", + "\"the brown quick fox jumps over the really lazy dog\"", + "\"the brown quick fox immediately jumps over the really lazy dog\"", + "\"the brown quick fox immediately jumps over the really lazy blue dog\"", + "\"this quick brown and scary fox jumps over the lazy dog\"", + "\"this quick brown and very scary fox jumps over the lazy dog\"", + "\"the, quick, brown, fox, jumps, over, the, lazy, dog\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over the\"", + "\"the quick brown fox jumps over\"", + "\"the quick brown fox jumps\"", + "\"the quick brown fox\"", + "\"the quick brown fox talks to the lazy and slow dog\"", + "\"the quick brown fox talks to the lazy dog\"", + "\"the quick brown\"", + ] + "###); let mut s = Search::new(&txn, &index); s.query("the brown quick fox jumps over the lazy dog"); @@ -208,11 +299,36 @@ fn test_words_proximity_tms_last_simple() { // 10 is better than 9 because of the proximity between "quick" and "brown" insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 18, 19, 9, 20, 21, 14, 17, 13, 16, 15, 22, 8, 7, 6, 5, 4, 11, 12, 3]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the brown quick fox jumps over the lazy dog\"", + "\"the brown quick fox jumps over the really lazy dog\"", + "\"the brown quick fox immediately jumps over the really lazy dog\"", + "\"the quick brown fox jumps over the lazy dog\"", + "\"the brown quick fox immediately jumps over the really lazy blue dog\"", + "\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.\"", + "\"the great quick brown fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the really lazy dog\"", + "\"the mighty and quick brown fox jumps over the lazy dog\"", + "\"this quick brown and scary fox jumps over the lazy dog\"", + "\"this quick brown and very scary fox jumps over the lazy dog\"", + "\"the, quick, brown, fox, jumps, over, the, lazy, dog\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over the\"", + "\"the quick brown fox jumps over\"", + "\"the quick brown fox jumps\"", + "\"the quick brown fox\"", + "\"the quick brown fox talks to the lazy and slow dog\"", + "\"the quick brown fox talks to the lazy dog\"", + "\"the quick brown\"", + ] + "###); } #[test] fn test_words_proximity_tms_last_phrase() { - let index = create_quick_brown_fox_trivial_index(); + let index = create_index(); index .update_settings(|s| { s.set_criteria(vec![Criterion::Words, Criterion::Proximity]); @@ -228,6 +344,26 @@ fn test_words_proximity_tms_last_phrase() { // "quick brown" is a phrase. The proximity of its first and last words // to their adjacent query words should be taken into account insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 16, 15, 8, 7, 6, 5, 4, 11, 12, 3]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. 
the lazy dog.\"", + "\"the great quick brown fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the really lazy dog\"", + "\"the mighty and quick brown fox jumps over the lazy dog\"", + "\"this quick brown and scary fox jumps over the lazy dog\"", + "\"this quick brown and very scary fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over the\"", + "\"the quick brown fox jumps over\"", + "\"the quick brown fox jumps\"", + "\"the quick brown fox\"", + "\"the quick brown fox talks to the lazy and slow dog\"", + "\"the quick brown fox talks to the lazy dog\"", + "\"the quick brown\"", + ] + "###); let mut s = Search::new(&txn, &index); s.query("the \"quick brown\" \"fox jumps\" over the lazy dog"); @@ -238,11 +374,27 @@ fn test_words_proximity_tms_last_phrase() { // to their adjacent query words should be taken into account. // The same applies to `fox jumps`. insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 16, 15, 8, 7, 6, 5]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.\"", + "\"the great quick brown fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the really lazy dog\"", + "\"the mighty and quick brown fox jumps over the lazy dog\"", + "\"this quick brown and scary fox jumps over the lazy dog\"", + "\"this quick brown and very scary fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over the\"", + "\"the quick brown fox jumps over\"", + "\"the quick brown fox jumps\"", + ] + "###); } #[test] fn test_words_tms_all() { - let index = create_quick_brown_fox_trivial_index(); + let index = create_index(); index .update_settings(|s| { s.set_criteria(vec![Criterion::Words, Criterion::Proximity]); @@ -256,6 +408,23 @@ fn test_words_tms_all() { let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 19, 20, 16, 15, 22]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.\"", + "\"the great quick brown fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the really lazy dog\"", + "\"the mighty and quick brown fox jumps over the lazy dog\"", + "\"the brown quick fox jumps over the lazy dog\"", + "\"the brown quick fox jumps over the really lazy dog\"", + "\"the brown quick fox immediately jumps over the really lazy dog\"", + "\"the brown quick fox immediately jumps over the really lazy blue dog\"", + "\"this quick brown and scary fox jumps over the lazy dog\"", + "\"this quick brown and very scary fox jumps over the lazy dog\"", + "\"the, quick, brown, fox, jumps, over, the, lazy, dog\"", + ] + "###); let mut s = Search::new(&txn, &index); s.query("extravagant"); @@ -263,4 +432,6 @@ fn test_words_tms_all() { let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @"[]"); } From 4c8a0179ba1c243791a3b24fae2f37e12e50e8cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 5 Apr 2023 11:30:49 +0200 Subject: [PATCH 14/21] Add more search tests --- milli/src/search/new/tests/proximity.rs | 176 ++++++++++++++++++++++-- 1 file changed, 165 insertions(+), 11 deletions(-) diff --git a/milli/src/search/new/tests/proximity.rs b/milli/src/search/new/tests/proximity.rs index f6e071572..44ff94f1d 100644 --- a/milli/src/search/new/tests/proximity.rs +++ b/milli/src/search/new/tests/proximity.rs @@ -1,17 +1,17 @@ /*! This module tests the Proximity ranking rule: -1. A proximity of >7 always has the same cost. +1. A sprximity of >7 always has the same cost. -2. Phrase terms can be in proximity to other terms via their start and end words, +2. Phrase terms can be in sprximity to other terms via their start and end words, but we need to make sure that the phrase exists in the document that meets this proximity condition. This is especially relevant with split words and synonyms. -3. An ngram has the same proximity cost as its component words being consecutive. +3. An ngram has the same sprximity cost as its component words being consecutive. e.g. `sunflower` equivalent to `sun flower`. -4. The prefix databases can be used to find the proximity between two words, but -they store fewer proximities than the regular word proximity DB. +4. The prefix databases can be used to find the sprximity between two words, but +they store fewer sprximities than the regular word sprximity DB. */ @@ -126,9 +126,9 @@ fn create_edge_cases_index() -> TempIndex { // The next 5 documents lay out a trap with the split word, phrase search, or synonym `sun flower`. // If the search query is "sunflower", the split word "Sun Flower" will match some documents. // If the query is `sunflower wilting`, then we should make sure that - // the proximity condition `flower wilting: prox N` also comes with the condition - // `sun wilting: prox N+1`. TODO: this is not the exact condition we use for now. - // We only check that the phrase `sun flower` exists and `flower wilting: prox N`, which + // the sprximity condition `flower wilting: sprx N` also comes with the condition + // `sun wilting: sprx N+1`. TODO: this is not the exact condition we use for now. + // We only check that the phrase `sun flower` exists and `flower wilting: sprx N`, which // is better than nothing but not the best. { "id": 1, @@ -140,9 +140,9 @@ fn create_edge_cases_index() -> TempIndex { }, { "id": 3, - // This document matches the query `sunflower wilting`, but the proximity condition + // This document matches the query `sunflower wilting`, but the sprximity condition // between `sunflower` and `wilting` cannot be through the split-word `Sun Flower` - // which would reduce to only `flower` and `wilting` being in proximity. + // which would reduce to only `flower` and `wilting` being in sprximity. 
"text": "A flower wilting under the sun, unlike a sunflower" }, { @@ -195,6 +195,69 @@ fn create_edge_cases_index() -> TempIndex { "id": 13, "text": "summer best" }, + { + // This document will insert "win" in the prefix database + "id": 14, + "text": " + winaa winab winac winae winaf winag winah winai winaj winak winal winam winan winao winap winaq winar winasa winat winau winav winaw winax winay winaz + winba winbb winbc winbe winbf winbg winbh winbi winbj winbk winbl winbm winbn winbo winbp winbq winbr winbsb winbt winbu winbv winbw winbx winby winbz + winca wincb wincc wince wincf wincg winch winci wincj winck wincl wincm wincn winco wincp wincq wincr wincsc winct wincu wincv wincw wincx wincy wincz + winda windb windc winde windf windg windh windi windj windk windl windm windn windo windp windq windr windsd windt windu windv windw windx windy windz + winea wineb winec winee winef wineg wineh winei winej winek winel winem winen wineo winep wineq winer winese winet wineu winev winew winex winey winez + winfa winfb winfc winfe winff winfg winfh winfi winfj winfk winfl winfm winfn winfo winfp winfq winfr winfsf winft winfu winfv winfw winfx winfy winfz + winga wingb wingc winge wingf wingg wingh wingi wingj wingk wingl wingm wingn wingo wingp wingq wingr wingsg wingt wingu wingv wingw wingx wingy wingz + winka winkb winkc winke winkf winkg winkh winki winkj winkk winkl winkm winkn winko winkp winkq winkr winksk winkt winku winkv winkw winkx winky winkz + winla winlb winlc winle winlf winlg winlh winli winlj winlk winll winlm winln winlo winlp winlq winlr winlsl winlt winlu winlv winlw winlx winly winlz + winma winmb winmc winme winmf winmg winmh winmi winmj winmk winml winmm winmn winmo winmp winmq winmr winmsm winmt winmu winmv winmw winmx winmy winmz + winna winnb winnc winne winnf winng winnh winni winnj winnk winnl winnm winnn winno winnp winnq winnr winnsn winnt winnu winnv winnw winnx winny winnz + winoa winob winoc winoe winof winog winoh winoi winoj winok winol winom winon winoo winop winoq winor winoso winot winou winov winow winox winoy winoz + winpa winpb winpc winpe winpf winpg winph winpi winpj winpk winpl winpm winpn winpo winpp winpq winpr winpsp winpt winpu winpv winpw winpx winpy winpz + winqa winqb winqc winqe winqf winqg winqh winqi winqj winqk winql winqm winqn winqo winqp winqq winqr winqsq winqt winqu winqv winqw winqx winqy winqz + winra winrb winrc winre winrf winrg winrh winri winrj winrk winrl winrm winrn winro winrp winrq winrr winrsr winrt winru winrv winrw winrx winry winrz + winsa winsb winsc winse winsf winsg winsh winsi winsj winsk winsl winsm winsn winso winsp winsq winsr winsss winst winsu winsv winsw winsx winsy winsz + winta wintb wintc winte wintf wintg winth winti wintj wintk wintl wintm wintn winto wintp wintq wintr wintst wintt wintu wintv wintw wintx winty wintz + " + }, + { + // Prox MAX between `best` and `win` prefix + "id": 15, + "text": "this is the best meal I have ever had in such a beautiful winter day" + }, + { + // Prox 5 between `best` and `win` prefix + "id": 16, + "text": "this is the best cooked meal of the winter" + }, + { + // Prox 4 between `best` and `win` prefix + "id": 17, + "text": "this is the best meal of the winter" + }, + { + // Prox 3 between `best` and `win` prefix + "id": 18, + "text": "this is the best meal of winter" + }, + { + // Prox 1 between `best` and `win` prefix + "id": 19, + "text": "this is the best winter meal" + }, + { + // Reverse Prox 3 between `best` and `win` prefix + "id": 20, + "text": "winter x y 
best" + }, + { + // Reverse Prox 2 between `best` and `win` prefix + "id": 21, + "text": "winter x best" + }, + { + // Reverse Prox 1 between `best` and `win` prefix + "id": 22, + "text": "winter best" + }, ])).unwrap(); index } @@ -298,7 +361,7 @@ fn test_proximity_prefix_db() { s.terms_matching_strategy(TermsMatchingStrategy::All); s.query("best s"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 8, 6, 7, 11]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 8, 6, 7, 11, 15]"); let texts = collect_field_values(&index, &txn, "text", &documents_ids); // This test illustrates the loss of precision from using the prefix DB @@ -312,6 +375,97 @@ fn test_proximity_prefix_db() { "\"this is the best meal I have ever had in such a beautiful summer day\"", "\"this is the best cooked meal of the summer\"", "\"summer x y best\"", + "\"this is the best meal I have ever had in such a beautiful winter day\"", + ] + "###); + + // Difference when using the `su` prefix, which is not in the prefix DB + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("best su"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 8, 11, 7, 6, 15]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"this is the best summer meal\"", + "\"summer best\"", + "\"this is the best meal of summer\"", + "\"summer x best\"", + "\"this is the best meal of the summer\"", + "\"summer x y best\"", + "\"this is the best cooked meal of the summer\"", + "\"this is the best meal I have ever had in such a beautiful summer day\"", + "\"this is the best meal I have ever had in such a beautiful winter day\"", + ] + "###); + + // Note that there is a case where a prefix is in the prefix DB but not in the + // **proximity** prefix DB. In that case, its sprximity score will always be + // the maximum. This happens for prefixes that are larger than 2 bytes. + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("best win"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[15, 16, 17, 18, 19, 20, 21, 22]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"this is the best meal I have ever had in such a beautiful winter day\"", + "\"this is the best cooked meal of the winter\"", + "\"this is the best meal of the winter\"", + "\"this is the best meal of winter\"", + "\"this is the best winter meal\"", + "\"winter x y best\"", + "\"winter x best\"", + "\"winter best\"", + ] + "###); + + // Now using `wint`, which is not in the prefix DB: + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("best wint"); + let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 17, 20, 16, 15]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"this is the best winter meal\"", + "\"winter best\"", + "\"this is the best meal of winter\"", + "\"winter x best\"", + "\"this is the best meal of the winter\"", + "\"winter x y best\"", + "\"this is the best cooked meal of the winter\"", + "\"this is the best meal I have ever had in such a beautiful winter day\"", + ] + "###); + + // and using `wi` which is in the prefix DB and proximity prefix DB + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("best wi"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 17, 15, 16, 20]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"this is the best winter meal\"", + "\"winter best\"", + "\"this is the best meal of winter\"", + "\"winter x best\"", + "\"this is the best meal of the winter\"", + "\"this is the best meal I have ever had in such a beautiful winter day\"", + "\"this is the best cooked meal of the winter\"", + "\"winter x y best\"", ] "###); } From 6e50f23896418c5d363a69cc13813fc6bad548f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 5 Apr 2023 13:33:23 +0200 Subject: [PATCH 15/21] Add more search tests --- milli/src/search/new/tests/mod.rs | 2 + milli/src/search/new/tests/proximity.rs | 2 +- milli/src/search/new/tests/proximity_typo.rs | 68 ++++++++++ milli/src/search/new/tests/typo_proximity.rs | 126 +++++++++++++++++++ 4 files changed, 197 insertions(+), 1 deletion(-) create mode 100644 milli/src/search/new/tests/proximity_typo.rs create mode 100644 milli/src/search/new/tests/typo_proximity.rs diff --git a/milli/src/search/new/tests/mod.rs b/milli/src/search/new/tests/mod.rs index 0fd5013db..898276858 100644 --- a/milli/src/search/new/tests/mod.rs +++ b/milli/src/search/new/tests/mod.rs @@ -3,8 +3,10 @@ pub mod distinct; pub mod language; pub mod ngram_split_words; pub mod proximity; +pub mod proximity_typo; pub mod sort; pub mod typo; +pub mod typo_proximity; pub mod words_tms; fn collect_field_values( diff --git a/milli/src/search/new/tests/proximity.rs b/milli/src/search/new/tests/proximity.rs index 44ff94f1d..880f933f0 100644 --- a/milli/src/search/new/tests/proximity.rs +++ b/milli/src/search/new/tests/proximity.rs @@ -1,7 +1,7 @@ /*! This module tests the Proximity ranking rule: -1. A sprximity of >7 always has the same cost. +1. A proximity of >7 always has the same cost. 2. Phrase terms can be in sprximity to other terms via their start and end words, but we need to make sure that the phrase exists in the document that meets this diff --git a/milli/src/search/new/tests/proximity_typo.rs b/milli/src/search/new/tests/proximity_typo.rs new file mode 100644 index 000000000..3bf869d1d --- /dev/null +++ b/milli/src/search/new/tests/proximity_typo.rs @@ -0,0 +1,68 @@ +/*! +This module tests the interactions between the proximity and typo ranking rules. + +The proximity ranking rule should transform the query graph such that it +only contains the word pairs that it used to compute its bucket. 
+*/
+
+use crate::{
+    index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search,
+    SearchResult, TermsMatchingStrategy,
+};
+
+fn create_index() -> TempIndex {
+    let index = TempIndex::new();
+
+    index
+        .update_settings(|s| {
+            s.set_primary_key("id".to_owned());
+            s.set_searchable_fields(vec!["text".to_owned()]);
+            s.set_criteria(vec![Criterion::Words, Criterion::Proximity, Criterion::Typo]);
+        })
+        .unwrap();
+
+    index
+        .add_documents(documents!([
+            // Basic trap.
+            //
+            // We have one document with the perfect word pair: `sommer - holiday`
+            // and another with the perfect word pair: `sommer holidty`.
+            //
+            // The proximity ranking rule will put them both in the same bucket, and it
+            // should narrow down the query graph to make it represent:
+            // EITHER:
+            // sommer + holiday
+            // OR:
+            // sommer + holidty
+            //
+            // Such that the child typo ranking rule does not find any match
+            // for its zero-typo bucket `summer + holiday`, even though both documents
+            // contain these two exact words.
+            {
+                "id": 0,
+                "text": "summer. holiday. sommer holidty"
+            },
+            {
+                "id": 1,
+                "text": "summer. holiday. sommer holiday"
+            },
+
+        ]))
+        .unwrap();
+    index
+}
+
+#[test]
+fn test_trap_basic() {
+    let index = create_index();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("summer holiday");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 0, 3, 2]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    "###);
+}
diff --git a/milli/src/search/new/tests/typo_proximity.rs b/milli/src/search/new/tests/typo_proximity.rs
new file mode 100644
index 000000000..ba6371544
--- /dev/null
+++ b/milli/src/search/new/tests/typo_proximity.rs
@@ -0,0 +1,126 @@
+/*!
+This module tests the interactions between the typo and proximity ranking rules.
+
+The typo ranking rule should transform the query graph such that it only contains
+the combinations of word derivations that it used to compute its bucket.
+
+The proximity ranking rule should then look for proximities only between those specific derivations.
+For example, given the search query `beautiful summer` and the dataset:
+```text
+{ "id": 0, "text": "beautigul summer...... beautiful day in the summer" }
+{ "id": 1, "text": "beautiful summer" }
+```
+Then the document with id `1` should be returned before `0`.
+The proximity ranking rule is not allowed to look for the proximity between `beautigul` and `summer`
+because the typo ranking rule before it only used the derivation `beautiful`.
+*/
+
+use crate::{
+    index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search,
+    SearchResult, TermsMatchingStrategy,
+};
+
+fn create_index() -> TempIndex {
+    let index = TempIndex::new();
+
+    index
+        .update_settings(|s| {
+            s.set_primary_key("id".to_owned());
+            s.set_searchable_fields(vec!["text".to_owned()]);
+            s.set_criteria(vec![Criterion::Words, Criterion::Typo, Criterion::Proximity]);
+        })
+        .unwrap();
+
+    index
+        .add_documents(documents!([
+            // trap explained in the module documentation
+            {
+                "id": 0,
+                "text": "beautigul summer. beautiful x y z summer"
+            },
+            {
+                "id": 1,
+                "text": "beautiful summer"
+            },
+            // the next 2 documents set up a more complicated trap
+            // with the query `beautiful summer`, we will have:
+            // 1. documents with no typos, id 0 and 1
+            // 2. 
documents with 1 typo: id 2 and 3, those are interpreted as EITHER
+            // - id 2: "beautigul + summer" ; OR
+            // - id 3: "beautiful + sommer"
+            // To sort these two documents, the proximity ranking rule must use only the
+            // word pairs: `beautigul -- summer` and `beautiful -- sommer` even though
+            // all variations of `beautiful` and `sommer` were used by the typo ranking rule.
+            {
+                "id": 2,
+                "text": "beautigul sommer. beautigul x summer"
+            },
+            {
+                "id": 3,
+                "text": "beautiful sommer"
+            },
+            // The next two documents lay out an even more complex trap, which the current implementation
+            // fails to handle properly.
+            // With the user query `delicious sweet dessert`, the typo ranking rule will return one bucket of:
+            // - id 4: delicitous + sweet + dessert
+            // - id 5: delicious + sweet + desgert
+            // The word pairs that the proximity ranking rule is allowed to use are
+            // EITHER:
+            // delicitous -- sweet AND sweet -- dessert
+            // OR
+            // delicious -- sweet AND sweet -- desgert
+            // So the word pair to use for the terms `sweet` and `dessert` depends on the
+            // word pairs explored before them.
+            {
+                "id": 4,
+                "text": "delicitous. sweet. dessert. delicitous sweet desgert",
+            },
+            {
+                "id": 5,
+                "text": "delicious. sweet desgert. delicious sweet desgert",
+            },
+        ]))
+        .unwrap();
+    index
+}
+
+#[test]
+fn test_trap_basic_and_complex1() {
+    let index = create_index();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("beautiful summer");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 0, 3, 2]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"beautiful summer\"",
+        "\"beautigul summer. beautiful x y z summer\"",
+        "\"beautiful sommer\"",
+        "\"beautigul sommer. beautigul x summer\"",
+    ]
+    "###);
+}
+
+#[test]
+fn test_trap_complex2() {
+    let index = create_index();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("delicious sweet dessert");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    // TODO: this is incorrect. 5 should appear before 4
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"delicitous. sweet. dessert. delicitous sweet desgert\"",
+        "\"delicious. sweet desgert. 
delicious sweet desgert\"", + ] + "###); +} From b5691802a32fd63697a83882124ecf645ae4df86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 5 Apr 2023 14:10:01 +0200 Subject: [PATCH 16/21] Add new tests and fix construction of query graph from paths --- milli/src/search/new/interner.rs | 3 + milli/src/search/new/query_graph.rs | 101 ++++++++++--------- milli/src/search/new/tests/proximity_typo.rs | 9 +- milli/src/search/new/tests/sort.rs | 12 +-- milli/src/search/new/tests/typo_proximity.rs | 8 +- 5 files changed, 73 insertions(+), 60 deletions(-) diff --git a/milli/src/search/new/interner.rs b/milli/src/search/new/interner.rs index e9bfbef86..ebf18f38c 100644 --- a/milli/src/search/new/interner.rs +++ b/milli/src/search/new/interner.rs @@ -176,6 +176,9 @@ impl Interner { pub fn iter_mut(&mut self) -> impl Iterator, &mut T)> { self.stable_store.iter_mut().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x)) } + pub fn freeze(self) -> FixedSizeInterner { + FixedSizeInterner { stable_store: self.stable_store } + } } /// A store of values of type `T`, each linked to a value of type `From` diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index 33e178494..2662ef730 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -4,7 +4,7 @@ use super::query_term::{ }; use super::small_bitmap::SmallBitmap; use super::SearchContext; -use crate::search::new::interner::DedupInterner; +use crate::search::new::interner::Interner; use crate::Result; use std::cmp::Ordering; use std::collections::BTreeMap; @@ -345,29 +345,13 @@ impl QueryGraph { Build a query graph from a list of paths The paths are composed of source and dest terms. - If the source term is `None`, then the last dest term is used - as the predecessor of the dest term. If the source is Some(_), - then an edge is built between the last dest term and the source, - and between the source and new dest term. - Note that the resulting graph will not correspond to a perfect - representation of the set of paths. 
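+    Terms that sit at the junction of two consecutive path segments are merged:
+    when a segment's source term covers the same query term ids as the previous
+    segment's dest term, their two subsets are intersected into a single node
+    (see the merging loop below).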
    For example, consider the following paths:
     ```txt
     PATH 1 :  a -> b1 -> c1 -> d -> e1
     PATH 2 :  a -> b2 -> c2 -> d -> e2
     ```
     Then the resulting graph will be:
-    ```txt
-              ┌────┐  ┌────┐          ┌────┐
-           ┌──│ b1 │──│ c1 │─┐     ┌──│ e1 │
-    ┌────┐ │  └────┘  └────┘ │ ┌────┐ │  └────┘
-    │ a  │─┤                 ├─│ d  │─┤
-    └────┘ │  ┌────┐  ┌────┐ │ └────┘ │  ┌────┐
-           └──│ b2 │──│ c2 │─┘     └──│ e2 │
-              └────┘  └────┘          └────┘
-    ```
-    which is different from the fully correct representation:
     ```txt
              ┌────┐  ┌────┐   ┌────┐   ┌────┐
           ┌──│ b1 │──│ c1 │───│ d  │───│ e1 │
@@ -383,21 +367,51 @@ pub fn build_from_paths(
         paths: Vec<Vec<(Option<LocatedQueryTermSubset>, LocatedQueryTermSubset)>>,
     ) -> Self {
-        let mut node_data = DedupInterner::default();
-        let root_node = node_data.insert(QueryNodeData::Start);
-        let end_node = node_data.insert(QueryNodeData::End);
+        let mut node_data = Interner::default();
+        let root_node = node_data.push(QueryNodeData::Start);
+        let end_node = node_data.push(QueryNodeData::End);
+
+        let mut paths_with_single_terms = vec![];
+
+        for path in paths {
+            let mut processed_path = vec![];
+            let mut prev_dest_term: Option<LocatedQueryTermSubset> = None;
+            for (start_term, dest_term) in path {
+                if let Some(prev_dest_term) = prev_dest_term.take() {
+                    if let Some(mut start_term) = start_term {
+                        if start_term.term_ids == prev_dest_term.term_ids {
+                            start_term.term_subset.intersect(&prev_dest_term.term_subset);
+                            processed_path.push(start_term);
+                        } else {
+                            processed_path.push(prev_dest_term);
+                            processed_path.push(start_term);
+                        }
+                    } else {
+                        processed_path.push(prev_dest_term);
+                    }
+                } else if let Some(start_term) = start_term {
+                    processed_path.push(start_term);
+                }
+                prev_dest_term = Some(dest_term);
+            }
+            if let Some(prev_dest_term) = prev_dest_term {
+                processed_path.push(prev_dest_term);
+            }
+            paths_with_single_terms.push(processed_path);
+        }
+
+        // TODO: make a prefix tree of the processed paths to avoid uselessly duplicating nodes
 
         let mut paths_with_ids = vec![];
-        for path in paths {
+        for path in paths_with_single_terms {
             let mut path_with_ids = vec![];
-            for node in path {
-                let (start_term, end_term) = node;
-                let src_node_id = start_term.map(|x| node_data.insert(QueryNodeData::Term(x)));
-                let dest_node_id = node_data.insert(QueryNodeData::Term(end_term));
-                path_with_ids.push((src_node_id, dest_node_id));
+            for term in path {
+                let id = node_data.push(QueryNodeData::Term(term));
+                path_with_ids.push(Interned::from_raw(id.into_raw()));
             }
             paths_with_ids.push(path_with_ids);
         }
+
         let nodes_data = node_data.freeze();
         let nodes_data_len = nodes_data.len();
         let mut nodes = nodes_data.map_move(|n| QueryNode {
@@ -406,31 +420,22 @@
             successors: SmallBitmap::new(nodes_data_len),
         });
 
-        let root_node = Interned::from_raw(root_node.into_raw());
-        let end_node = Interned::from_raw(end_node.into_raw());
+        let root_node = Interned::<QueryNode>::from_raw(root_node.into_raw());
+        let end_node = Interned::<QueryNode>::from_raw(end_node.into_raw());
 
         for path in paths_with_ids {
-            let mut prev_node = root_node;
-            for node in path {
-                let (start_term, dest_term) = node;
-                let end_term = Interned::from_raw(dest_term.into_raw());
-                let src = if let Some(start_term) = start_term {
-                    // TODO: this is incorrect! should take the intersection
-                    // between the prev node and the start term if they refer to the same
-                    // original query term!
- let start_term = Interned::from_raw(start_term.into_raw()); - nodes.get_mut(prev_node).successors.insert(start_term); - nodes.get_mut(start_term).predecessors.insert(prev_node); - start_term - } else { - prev_node - }; - nodes.get_mut(src).successors.insert(end_term); - nodes.get_mut(end_term).predecessors.insert(src); - prev_node = end_term; + let mut prev_node_id = root_node; + for node_id in path { + let prev_node = nodes.get_mut(prev_node_id); + prev_node.successors.insert(node_id); + let node = nodes.get_mut(node_id); + node.predecessors.insert(prev_node_id); + prev_node_id = node_id; } - nodes.get_mut(prev_node).successors.insert(end_node); - nodes.get_mut(end_node).predecessors.insert(prev_node); + let prev_node = nodes.get_mut(prev_node_id); + prev_node.successors.insert(end_node); + let node = nodes.get_mut(end_node); + node.predecessors.insert(prev_node_id); } QueryGraph { root_node, end_node, nodes } diff --git a/milli/src/search/new/tests/proximity_typo.rs b/milli/src/search/new/tests/proximity_typo.rs index 3bf869d1d..9f9601e3f 100644 --- a/milli/src/search/new/tests/proximity_typo.rs +++ b/milli/src/search/new/tests/proximity_typo.rs @@ -3,6 +3,8 @@ This module tests the interactions between the proximity and typo ranking rules. The proximity ranking rule should transform the query graph such that it only contains the word pairs that it used to compute its bucket. + +TODO: This is not currently implemented. */ use crate::{ @@ -61,8 +63,13 @@ fn test_trap_basic() { s.terms_matching_strategy(TermsMatchingStrategy::All); s.query("summer holiday"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 0, 3, 2]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1]"); let texts = collect_field_values(&index, &txn, "text", &documents_ids); + // TODO: this is incorrect, 1 should come before 0 insta::assert_debug_snapshot!(texts, @r###" + [ + "\"summer. holiday. sommer holidty\"", + "\"summer. holiday. sommer holiday\"", + ] "###); } diff --git a/milli/src/search/new/tests/sort.rs b/milli/src/search/new/tests/sort.rs index d3a952a24..d2201f55b 100644 --- a/milli/src/search/new/tests/sort.rs +++ b/milli/src/search/new/tests/sort.rs @@ -271,13 +271,13 @@ fn test_sort() { "false", "true", "true", - "__does_not_exist___", + "__does_not_exist__", "null", "[null,null,\"\"]", "\"\"", "{\"sub\":0}", - "__does_not_exist___", - "__does_not_exist___", + "__does_not_exist__", + "__does_not_exist__", ] "###); @@ -304,13 +304,13 @@ fn test_sort() { "false", "\"1\"", "\"0\"", - "__does_not_exist___", + "__does_not_exist__", "null", "[null,null,\"\"]", "\"\"", "{\"sub\":0}", - "__does_not_exist___", - "__does_not_exist___", + "__does_not_exist__", + "__does_not_exist__", ] "###); } diff --git a/milli/src/search/new/tests/typo_proximity.rs b/milli/src/search/new/tests/typo_proximity.rs index ba6371544..220fc69e1 100644 --- a/milli/src/search/new/tests/typo_proximity.rs +++ b/milli/src/search/new/tests/typo_proximity.rs @@ -59,8 +59,7 @@ fn create_index() -> TempIndex { "id": 3, "text": "beautiful sommer" }, - // The next two documents lay out an even more complex trap, which the current implementation - // fails to handle properly. + // The next two documents lay out an even more complex trap. 
        // With the user query `delicious sweet dessert`, the typo ranking rule will return one bucket of:
         // - id 4: delicitous + sweet + dessert
         // - id 5: delicious + sweet + desgert
@@ -114,13 +113,12 @@ fn test_trap_complex2() {
     s.terms_matching_strategy(TermsMatchingStrategy::All);
     s.query("delicious sweet dessert");
     let SearchResult { documents_ids, .. } = s.execute().unwrap();
-    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5]");
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[5, 4]");
     let texts = collect_field_values(&index, &txn, "text", &documents_ids);
-    // TODO: this is incorrect. 5 should appear before 4
     insta::assert_debug_snapshot!(texts, @r###"
     [
-        "\"delicitous. sweet. dessert. delicitous sweet desgert\"",
         "\"delicious. sweet desgert. delicious sweet desgert\"",
+        "\"delicitous. sweet. dessert. delicitous sweet desgert\"",
     ]
     "###);
 }
From 337e75b0e4fa63566ed5ac516f504a723117ecb2 Mon Sep 17 00:00:00 2001
From: Louis Dureuil
Date: Wed, 5 Apr 2023 14:42:51 +0200
Subject: [PATCH 17/21] Exact attribute with state

---
 milli/src/search/new/exact_attribute.rs | 166 +++++++++++++++++-------
 1 file changed, 122 insertions(+), 44 deletions(-)

diff --git a/milli/src/search/new/exact_attribute.rs b/milli/src/search/new/exact_attribute.rs
index bb6299e28..fa837272b 100644
--- a/milli/src/search/new/exact_attribute.rs
+++ b/milli/src/search/new/exact_attribute.rs
@@ -1,5 +1,5 @@
 use heed::BytesDecode;
-use roaring::MultiOps;
+use roaring::{MultiOps, RoaringBitmap};
 
 use super::query_graph::QueryGraph;
 use super::ranking_rules::{RankingRule, RankingRuleOutput};
@@ -7,19 +7,18 @@ use crate::search::new::query_graph::QueryNodeData;
 use crate::search::new::query_term::ExactTerm;
 use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger};
 
-/// FIXME:
-///
-/// - A lot of work done in next_bucket that start_iteration could do.
-/// - Consider calling the graph based rule directly from this one.
-/// - currently we did exact term, don't forget about prefix
-/// - some tests
+/// A ranking rule that produces 3 disjoint buckets:
+///
+/// 1. Documents from the universe whose value is exactly the query.
+/// 2. Documents from the universe not in (1) whose value starts with the query.
+/// 3. Documents from the universe not in (1) or (2).
 pub struct ExactAttribute {
-    query_graph: Option<QueryGraph>,
+    state: State,
 }
 
 impl ExactAttribute {
     pub fn new() -> Self {
-        Self { query_graph: None }
+        Self { state: Default::default() }
     }
 }
 
@@ -30,23 +29,69 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute {
 
     fn start_iteration(
         &mut self,
-        _ctx: &mut SearchContext<'ctx>,
+        ctx: &mut SearchContext<'ctx>,
         _logger: &mut dyn SearchLogger<QueryGraph>,
-        _universe: &roaring::RoaringBitmap,
+        universe: &roaring::RoaringBitmap,
         query: &QueryGraph,
     ) -> Result<()> {
-        self.query_graph = Some(query.clone());
+        self.state = State::start_iteration(ctx, universe, query)?;
+
         Ok(())
     }
 
     fn next_bucket(
         &mut self,
-        ctx: &mut SearchContext<'ctx>,
+        _ctx: &mut SearchContext<'ctx>,
         _logger: &mut dyn SearchLogger<QueryGraph>,
         universe: &roaring::RoaringBitmap,
     ) -> Result<Option<RankingRuleOutput<QueryGraph>>> {
-        // iterate on the nodes of the graph, retain LocatedQueryTermSubset
-        let query_graph = self.query_graph.as_ref().unwrap();
+        let state = std::mem::take(&mut self.state);
+        let (state, output) = State::next(state, universe);
+        self.state = state;
+
+        Ok(output)
+    }
+
+    fn end_iteration(
+        &mut self,
+        _ctx: &mut SearchContext<'ctx>,
+        _logger: &mut dyn SearchLogger<QueryGraph>,
+    ) {
+        self.state = Default::default();
+    }
+}
+
+/// Inner state of the ranking rule.
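+///
+/// Each call to `next` advances the state machine one step: `ExactAttribute`
+/// yields bucket (1) and becomes `AttributeStarts`, which yields bucket (2)
+/// and becomes `Empty`, which then keeps yielding the remaining universe.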
+#[derive(Default)]
+enum State {
+    /// State between two iterations
+    #[default]
+    Uninitialized,
+    /// The next call to `next` will output the documents in the universe that have an attribute that is the exact query
+    ExactAttribute(QueryGraph, Vec<FieldCandidates>),
+    /// The next call to `next` will output the documents in the universe that have an attribute that starts with the exact query,
+    /// but isn't the exact query.
+    AttributeStarts(QueryGraph, Vec<FieldCandidates>),
+    /// The next calls to `next` will output the input universe.
+    Empty(QueryGraph),
+}
+
+/// The candidates sorted by attributes
+///
+/// Each of the bitmaps in a single `FieldCandidates` struct applies to the same field.
+struct FieldCandidates {
+    /// The candidates that start with all the words of the query in the field
+    start_with_exact: RoaringBitmap,
+    /// The candidates that have the same number of words as the query in the field
+    exact_word_count: RoaringBitmap,
+}
+
+impl State {
+    fn start_iteration(
+        ctx: &mut SearchContext<'_>,
+        universe: &RoaringBitmap,
+        query_graph: &QueryGraph,
+    ) -> Result<Self> {
         let mut exact_term_position_ids: Vec<(ExactTerm, u16, u8)> =
             Vec::with_capacity(query_graph.nodes.len() as usize);
         for (_, node) in query_graph.nodes.iter() {
@@ -55,11 +100,7 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute {
             let exact_term = if let Some(exact_term) = term.term_subset.exact_term(ctx) {
                 exact_term
             } else {
-                // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules
-                return Ok(Some(RankingRuleOutput {
-                    query: query_graph.clone(),
-                    candidates: universe.clone(),
-                }));
+                continue;
             };
             exact_term_position_ids.push((
                 exact_term,
@@ -73,14 +114,17 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute {
         exact_term_position_ids.sort_by_key(|(_, _, id)| *id);
 
         // bail if there is a "hole" (missing word) in remaining query graph
+        if let Some((_, _, first_id)) = exact_term_position_ids.first() {
+            if *first_id != 0 {
+                return Ok(State::Empty(query_graph.clone()));
+            }
+        } else {
+            return Ok(State::Empty(query_graph.clone()));
+        }
         let mut previous_id = 0;
         for (_, _, id) in exact_term_position_ids.iter().copied() {
             if id < previous_id || id - previous_id > 1 {
-                // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules
-                return Ok(Some(RankingRuleOutput {
-                    query: query_graph.clone(),
-                    candidates: universe.clone(),
-                }));
+                return Ok(State::Empty(query_graph.clone()));
             } else {
                 previous_id = id;
             }
@@ -102,11 +146,7 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute {
             .collect();
         for (words, position) in &words_positions {
             if candidates.is_empty() {
-                // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules
-                return Ok(Some(RankingRuleOutput {
-                    query: query_graph.clone(),
-                    candidates: universe.clone(),
-                }));
+                return Ok(State::Empty(query_graph.clone()));
             }
 
             'words: for (offset, word) in words.iter().enumerate() {
@@ -116,8 +156,11 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute {
                 } else {
                     continue 'words;
                 };
+                // Note: Since the position is stored bucketed in word_position_docids,
+                // we lose some precision here for queries with long phrases.
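+                // The same `bucketed_position` transform is applied to positions at
+                // indexing time, so the lookup key must be bucketed here as well.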
+ let bucketed_position = crate::bucketed_position(position + offset); let word_position_docids = CboRoaringBitmapCodec::bytes_decode( - ctx.get_db_word_position_docids(*word, position + offset)?.unwrap_or_default(), + ctx.get_db_word_position_docids(*word, bucketed_position)?.unwrap_or_default(), ) .unwrap_or_default(); candidates &= word_position_docids; @@ -127,16 +170,12 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { let candidates = candidates; if candidates.is_empty() { - // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules - return Ok(Some(RankingRuleOutput { - query: query_graph.clone(), - candidates: universe.clone(), - })); + return Ok(State::Empty(query_graph.clone())); } let searchable_fields_ids = ctx.index.searchable_fields_ids(ctx.txn)?.unwrap_or_default(); - let mut candidates_per_attributes = Vec::with_capacity(searchable_fields_ids.len()); + let mut candidates_per_attribute = Vec::with_capacity(searchable_fields_ids.len()); // then check that there exists at least one attribute that has all of the terms for fid in searchable_fields_ids { @@ -156,20 +195,59 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { )?; intersection &= &candidates; if !intersection.is_empty() { - candidates_per_attributes.push(intersection); + let candidates_with_exact_word_count = ctx + .index + .field_id_word_count_docids + .get(ctx.txn, &(fid, exact_term_position_ids.len() as u8))? + .unwrap_or_default(); + candidates_per_attribute.push(FieldCandidates { + start_with_exact: intersection, + exact_word_count: candidates_with_exact_word_count, + }); } } // note we could have "false positives" where there both exist different attributes that collectively // have the terms in the correct order and a single attribute that have all the terms, but in the incorrect order. 
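+        // Keep both bitmaps for each field: `next` intersects them to build the
+        // "exact attribute" bucket and subtracts one from the other to build the
+        // "attribute starts with" bucket.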
- let candidates = MultiOps::union(candidates_per_attributes.into_iter()); - Ok(Some(RankingRuleOutput { query: query_graph.clone(), candidates })) + Ok(State::ExactAttribute(query_graph.clone(), candidates_per_attribute)) } - fn end_iteration( - &mut self, - _ctx: &mut SearchContext<'ctx>, - _logger: &mut dyn SearchLogger, - ) { + fn next( + state: State, + universe: &RoaringBitmap, + ) -> (State, Option>) { + let (state, output) = match state { + State::Uninitialized => (state, None), + State::ExactAttribute(query_graph, candidates_per_attribute) => { + let mut candidates = MultiOps::union(candidates_per_attribute.iter().map( + |FieldCandidates { start_with_exact, exact_word_count }| { + start_with_exact & exact_word_count + }, + )); + candidates &= universe; + ( + State::AttributeStarts(query_graph.clone(), candidates_per_attribute), + Some(RankingRuleOutput { query: query_graph, candidates }), + ) + } + State::AttributeStarts(query_graph, candidates_per_attribute) => { + let mut candidates = MultiOps::union(candidates_per_attribute.into_iter().map( + |FieldCandidates { mut start_with_exact, exact_word_count }| { + start_with_exact -= exact_word_count; + start_with_exact + }, + )); + candidates &= universe; + ( + State::Empty(query_graph.clone()), + Some(RankingRuleOutput { query: query_graph, candidates }), + ) + } + State::Empty(query_graph) => ( + State::Empty(query_graph.clone()), + Some(RankingRuleOutput { query: query_graph, candidates: universe.clone() }), + ), + }; + (state, output) } } From f7ecea142ec3c3d1403ad00969e37d211c861125 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 5 Apr 2023 14:43:16 +0200 Subject: [PATCH 18/21] Fix panics and issues in exactness graph ranking rule --- .../new/ranking_rule_graph/exactness/mod.rs | 48 ++++++++----------- .../extract/extract_word_position_docids.rs | 5 +- 2 files changed, 22 insertions(+), 31 deletions(-) diff --git a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs index a1e19a015..3d558e87b 100644 --- a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs @@ -1,24 +1,11 @@ +use heed::BytesDecode; use roaring::RoaringBitmap; use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait}; use crate::search::new::interner::{DedupInterner, Interned, MappedInterner}; use crate::search::new::query_graph::{QueryGraph, QueryNode}; use crate::search::new::query_term::{ExactTerm, LocatedQueryTermSubset}; -use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger}; - -/// - Exactness as first ranking rule: TermsMatchingStrategy? prefer a document that matches 1 word exactly and no other -/// word than a doc that matches 9 words non exactly but none exactly -/// - `TermsMatchingStrategy` as a word + exactness optimization: we could consider -/// -/// "naive vision" -/// condition from one node to another: -/// - word exactly present: cost 0 -/// - word typo/ngram/prefix/missing: cost 1, not remove from query graph, edge btwn the two nodes, return the universe without condition when resolving, destination query term is inside -/// -/// Three strategies: -/// 1. ExactAttribute: word position / word_fid_docid -/// 2. AttributeStart: -/// 3. 
AttributeContainsExact => implementable via `RankingRuleGraphTrait`
+use crate::{Result, RoaringBitmapCodec, SearchContext, SearchLogger};
 
 #[derive(Clone, PartialEq, Eq, Hash)]
 pub enum ExactnessCondition {
@@ -42,7 +29,7 @@ fn compute_docids(
         ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(phrase)?.clone(),
         ExactTerm::Word(word) => {
             if let Some(word_candidates) = ctx.get_db_word_docids(word)? {
-                CboRoaringBitmapCodec::deserialize_from(word_candidates)?
+                RoaringBitmapCodec::bytes_decode(word_candidates).ok_or(heed::Error::Decoding)?
             } else {
                 return Ok(Default::default());
             }
@@ -86,22 +73,29 @@ impl RankingRuleGraphTrait for ExactnessGraph {
 
         let skip_condition = ExactnessCondition::Skip(dest_node.clone());
         let skip_condition = conditions_interner.insert(skip_condition);
-        Ok(vec![(0, exact_condition), (1, skip_condition)])
+
+        Ok(vec![(0, exact_condition), (dest_node.term_ids.len() as u32, skip_condition)])
     }
 
     fn log_state(
-        graph: &RankingRuleGraph<Self>,
-        paths: &[Vec<Interned<Self::Condition>>],
-        dead_ends_cache: &DeadEndsCache<Self::Condition>,
-        universe: &RoaringBitmap,
-        costs: &MappedInterner<QueryNode, Vec<u64>>,
-        cost: u64,
-        logger: &mut dyn SearchLogger<QueryGraph>,
+        _graph: &RankingRuleGraph<Self>,
+        _paths: &[Vec<Interned<Self::Condition>>],
+        _dead_ends_cache: &DeadEndsCache<Self::Condition>,
+        _universe: &RoaringBitmap,
+        _costs: &MappedInterner<QueryNode, Vec<u64>>,
+        _cost: u64,
+        _logger: &mut dyn SearchLogger<QueryGraph>,
     ) {
-        todo!()
     }
 
-    fn label_for_condition(ctx: &mut SearchContext, condition: &Self::Condition) -> Result<String> {
-        todo!()
+    fn label_for_condition(
+        _ctx: &mut SearchContext,
+        condition: &Self::Condition,
+    ) -> Result<String> {
+        Ok(match condition {
+            ExactnessCondition::ExactInAttribute(_) => "exact",
+            ExactnessCondition::Skip(_) => "skip",
+        }
+        .to_owned())
     }
 }
diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs
index cd3ec691b..eef5089bc 100644
--- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs
+++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs
@@ -7,10 +7,7 @@ use super::helpers::{
 };
 use crate::error::SerializationError;
 use crate::index::db_name::DOCID_WORD_POSITIONS;
-use crate::{
-    absolute_from_relative_position, bucketed_position, relative_from_absolute_position,
-    DocumentId, Result,
-};
+use crate::{bucketed_position, relative_from_absolute_position, DocumentId, Result};
 
 /// Extracts the word positions and the documents ids where this word appear. 
/// From d1ddaa223d39b7ba74fde7f9a72b04662931935f Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 5 Apr 2023 18:05:44 +0200 Subject: [PATCH 19/21] Use correct codec in proximity --- .../ranking_rule_graph/proximity/compute_docids.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index 8496054b7..07bd102ca 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -1,14 +1,17 @@ #![allow(clippy::too_many_arguments)] +use std::collections::BTreeSet; + +use heed::BytesDecode; +use roaring::RoaringBitmap; + use super::ProximityCondition; use crate::search::new::interner::Interned; use crate::search::new::query_term::{Phrase, QueryTermSubset}; use crate::search::new::ranking_rule_graph::ComputedCondition; use crate::search::new::resolve_query_graph::compute_query_term_subset_docids; use crate::search::new::SearchContext; -use crate::{CboRoaringBitmapCodec, Result}; -use roaring::RoaringBitmap; -use std::collections::BTreeSet; +use crate::{CboRoaringBitmapCodec, Result, RoaringBitmapCodec}; pub fn compute_docids( ctx: &mut SearchContext, @@ -90,7 +93,8 @@ pub fn compute_docids( continue; } } else if let Some(lw_bytes) = ctx.get_db_word_docids(left_word)? { - let left_word_docids = CboRoaringBitmapCodec::deserialize_from(lw_bytes)?; + let left_word_docids = + RoaringBitmapCodec::bytes_decode(lw_bytes).ok_or(heed::Error::Decoding)?; if universe.is_disjoint(&left_word_docids) { continue; } From d9460a76f43fa7b35cb2e4d423148c2a0ab174e5 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 5 Apr 2023 14:43:42 +0200 Subject: [PATCH 20/21] Fix word_position_docids indexing --- .../index_documents/extract/extract_word_position_docids.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index eef5089bc..734cf8778 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -39,9 +39,8 @@ pub fn extract_word_fid_and_position_docids( for position in read_u32_ne_bytes(value) { key_buffer.clear(); key_buffer.extend_from_slice(word_bytes); - let (fid, position) = relative_from_absolute_position(position); + let (_fid, position) = relative_from_absolute_position(position); let position = bucketed_position(position); - let position = absolute_from_relative_position(fid, position); key_buffer.extend_from_slice(&position.to_be_bytes()); word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; } From 5440f43fd3be28981933bd55e126ee88ed3324e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 5 Apr 2023 14:55:02 +0200 Subject: [PATCH 21/21] Fix indexing of word_position_docid and fid --- milli/src/snapshot_tests.rs | 8 +++ .../extract/extract_word_fid_docids.rs | 48 ++++++++++++++++ .../extract/extract_word_position_docids.rs | 4 +- .../src/update/index_documents/extract/mod.rs | 17 +++++- milli/src/update/index_documents/mod.rs | 57 +++++++++++++++++++ .../src/update/index_documents/typed_chunk.rs | 12 ++++ 6 files changed, 141 insertions(+), 5 deletions(-) create mode 100644 
milli/src/update/index_documents/extract/extract_word_fid_docids.rs

diff --git a/milli/src/snapshot_tests.rs b/milli/src/snapshot_tests.rs
index f7f1a97e6..eb94c4be9 100644
--- a/milli/src/snapshot_tests.rs
+++ b/milli/src/snapshot_tests.rs
@@ -248,6 +248,11 @@ pub fn snap_word_position_docids(index: &Index) -> String {
         &format!("{word:<16} {position:<6} {}", display_bitmap(&b))
     })
 }
+pub fn snap_word_fid_docids(index: &Index) -> String {
+    make_db_snap_from_iter!(index, word_fid_docids, |((word, fid), b)| {
+        &format!("{word:<16} {fid:<3} {}", display_bitmap(&b))
+    })
+}
 pub fn snap_field_id_word_count_docids(index: &Index) -> String {
     make_db_snap_from_iter!(index, field_id_word_count_docids, |((field_id, word_count), b)| {
         &format!("{field_id:<3} {word_count:<6} {}", display_bitmap(&b))
@@ -477,6 +482,9 @@ macro_rules! full_snap_of_db {
     ($index:ident, word_position_docids) => {{
         $crate::snapshot_tests::snap_word_position_docids(&$index)
     }};
+    ($index:ident, word_fid_docids) => {{
+        $crate::snapshot_tests::snap_word_fid_docids(&$index)
+    }};
     ($index:ident, field_id_word_count_docids) => {{
         $crate::snapshot_tests::snap_field_id_word_count_docids(&$index)
     }};
diff --git a/milli/src/update/index_documents/extract/extract_word_fid_docids.rs b/milli/src/update/index_documents/extract/extract_word_fid_docids.rs
new file mode 100644
index 000000000..72b30cddf
--- /dev/null
+++ b/milli/src/update/index_documents/extract/extract_word_fid_docids.rs
@@ -0,0 +1,48 @@
+use std::fs::File;
+use std::io;
+
+use super::helpers::{
+    create_sorter, merge_cbo_roaring_bitmaps, read_u32_ne_bytes, sorter_into_reader,
+    try_split_array_at, GrenadParameters,
+};
+use crate::error::SerializationError;
+use crate::index::db_name::DOCID_WORD_POSITIONS;
+use crate::{relative_from_absolute_position, DocumentId, Result};
+
+/// Extracts the word, field id, and the documents ids where this word appears at this field id.
+#[logging_timer::time]
+pub fn extract_word_fid_docids<R: io::Read + io::Seek>(
+    docid_word_positions: grenad::Reader<R>,
+    indexer: GrenadParameters,
+) -> Result<grenad::Reader<File>> {
+    let max_memory = indexer.max_memory_by_thread();
+
+    let mut word_fid_docids_sorter = create_sorter(
+        grenad::SortAlgorithm::Unstable,
+        merge_cbo_roaring_bitmaps,
+        indexer.chunk_compression_type,
+        indexer.chunk_compression_level,
+        indexer.max_nb_chunks,
+        max_memory,
+    );
+
+    let mut key_buffer = Vec::new();
+    let mut cursor = docid_word_positions.into_cursor()?;
+    while let Some((key, value)) = cursor.move_on_next()? 
{ + let (document_id_bytes, word_bytes) = try_split_array_at(key) + .ok_or(SerializationError::Decoding { db_name: Some(DOCID_WORD_POSITIONS) })?; + let document_id = DocumentId::from_be_bytes(document_id_bytes); + + for position in read_u32_ne_bytes(value) { + key_buffer.clear(); + key_buffer.extend_from_slice(word_bytes); + let (fid, _) = relative_from_absolute_position(position); + key_buffer.extend_from_slice(&fid.to_be_bytes()); + word_fid_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; + } + } + + let word_fid_docids_reader = sorter_into_reader(word_fid_docids_sorter, indexer)?; + + Ok(word_fid_docids_reader) +} diff --git a/milli/src/update/index_documents/extract/extract_word_position_docids.rs b/milli/src/update/index_documents/extract/extract_word_position_docids.rs index 734cf8778..80a36c308 100644 --- a/milli/src/update/index_documents/extract/extract_word_position_docids.rs +++ b/milli/src/update/index_documents/extract/extract_word_position_docids.rs @@ -14,7 +14,7 @@ use crate::{bucketed_position, relative_from_absolute_position, DocumentId, Resu /// Returns a grenad reader with the list of extracted words at positions and /// documents ids from the given chunk of docid word positions. #[logging_timer::time] -pub fn extract_word_fid_and_position_docids( +pub fn extract_word_position_docids( docid_word_positions: grenad::Reader, indexer: GrenadParameters, ) -> Result> { @@ -39,7 +39,7 @@ pub fn extract_word_fid_and_position_docids( for position in read_u32_ne_bytes(value) { key_buffer.clear(); key_buffer.extend_from_slice(word_bytes); - let (_fid, position) = relative_from_absolute_position(position); + let (_, position) = relative_from_absolute_position(position); let position = bucketed_position(position); key_buffer.extend_from_slice(&position.to_be_bytes()); word_position_docids_sorter.insert(&key_buffer, document_id.to_ne_bytes())?; diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 844efed36..db041de6f 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -5,6 +5,7 @@ mod extract_fid_docid_facet_values; mod extract_fid_word_count_docids; mod extract_geo_points; mod extract_word_docids; +mod extract_word_fid_docids; mod extract_word_pair_proximity_docids; mod extract_word_position_docids; @@ -22,8 +23,9 @@ use self::extract_fid_docid_facet_values::extract_fid_docid_facet_values; use self::extract_fid_word_count_docids::extract_fid_word_count_docids; use self::extract_geo_points::extract_geo_points; use self::extract_word_docids::extract_word_docids; +use self::extract_word_fid_docids::extract_word_fid_docids; use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids; -use self::extract_word_position_docids::extract_word_fid_and_position_docids; +use self::extract_word_position_docids::extract_word_position_docids; use super::helpers::{ as_cloneable_grenad, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn, MergeableReader, @@ -130,14 +132,23 @@ pub(crate) fn data_from_obkv_documents( ); spawn_extraction_task::<_, _, Vec>>( - docid_word_positions_chunks, + docid_word_positions_chunks.clone(), indexer, lmdb_writer_sx.clone(), - extract_word_fid_and_position_docids, + extract_word_position_docids, merge_cbo_roaring_bitmaps, TypedChunk::WordPositionDocids, "word-position-docids", ); + spawn_extraction_task::<_, _, Vec>>( + docid_word_positions_chunks, + indexer, + 
lmdb_writer_sx.clone(), + extract_word_fid_docids, + merge_cbo_roaring_bitmaps, + TypedChunk::WordFidDocids, + "word-fid-docids", + ); spawn_extraction_task::<_, _, Vec>>( docid_fid_facet_strings_chunks, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index ade217beb..235b35fc8 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -2255,4 +2255,61 @@ mod tests { {"id":1,"catto":"jorts"} "###); } + + #[test] + fn test_word_fid_position() { + let index = TempIndex::new(); + + index + .add_documents(documents!([ + {"id": 0, "text": "sun flowers are looking at the sun" }, + {"id": 1, "text": "sun flowers are looking at the sun" }, + {"id": 2, "text": "the sun is shining today" }, + { + "id": 3, + "text": "a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a a a a a a + a a a a a a a a a a a a a a a a a a a a a " + } + ])) + .unwrap(); + + db_snap!(index, word_fid_docids, 1, @"bf3355e493330de036c8823ddd1dbbd9"); + db_snap!(index, word_position_docids, 1, @"896d54b29ed79c4c6f14084f326dcf6f"); + + index + .add_documents(documents!([ + {"id": 4, "text": "sun flowers are looking at the sun" }, + {"id": 5, "text2": "sun flowers are looking at the sun" }, + {"id": 6, "text": "b b b" }, + { + "id": 7, + "text2": "a a a a" + } + ])) + .unwrap(); + + db_snap!(index, word_fid_docids, 2, @"a48d3f88db33f94bc23110a673ea49e4"); + db_snap!(index, word_position_docids, 2, @"3c9e66c6768ae2cf42b46b2c46e46a83"); + + let mut wtxn = index.write_txn().unwrap(); + + // Delete not all of the documents but some of them. + let mut builder = DeleteDocuments::new(&mut wtxn, &index).unwrap(); + builder.strategy(DeletionStrategy::AlwaysHard); + builder.delete_external_id("0"); + builder.delete_external_id("3"); + let result = builder.execute().unwrap(); + println!("{result:?}"); + + wtxn.commit().unwrap(); + + db_snap!(index, word_fid_docids, 3, @"4c2e2a1832e5802796edc1638136d933"); + db_snap!(index, word_position_docids, 3, @"74f556b91d161d997a89468b4da1cb8f"); + db_snap!(index, docid_word_positions, 3, @"5287245332627675740b28bd46e1cde1"); + } } diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index b9b11cfa8..14ba021bd 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -35,6 +35,7 @@ pub(crate) enum TypedChunk { exact_word_docids_reader: grenad::Reader, }, WordPositionDocids(grenad::Reader), + WordFidDocids(grenad::Reader), WordPairProximityDocids(grenad::Reader), FieldIdFacetStringDocids(grenad::Reader), FieldIdFacetNumberDocids(grenad::Reader), @@ -140,6 +141,17 @@ pub(crate) fn write_typed_chunk_into_index( )?; is_merged_database = true; } + TypedChunk::WordFidDocids(word_fid_docids_iter) => { + append_entries_into_database( + word_fid_docids_iter, + &index.word_fid_docids, + wtxn, + index_is_empty, + |value, _buffer| Ok(value), + merge_cbo_roaring_bitmaps, + )?; + is_merged_database = true; + } TypedChunk::FieldIdFacetNumberDocids(facet_id_number_docids_iter) => { let indexer = FacetsUpdate::new(index, FacetType::Number, facet_id_number_docids_iter); indexer.execute(wtxn)?;