diff --git a/milli/src/search/new/bucket_sort.rs b/milli/src/search/new/bucket_sort.rs
new file mode 100644
index 000000000..712825c31
--- /dev/null
+++ b/milli/src/search/new/bucket_sort.rs
@@ -0,0 +1,195 @@
+use roaring::RoaringBitmap;
+
+use super::logger::SearchLogger;
+use super::ranking_rules::{BoxRankingRule, RankingRuleQueryTrait};
+use super::SearchContext;
+use crate::search::new::distinct::{apply_distinct_rule, distinct_single_docid, DistinctOutput};
+use crate::Result;
+
+pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
+    ctx: &mut SearchContext<'ctx>,
+    mut ranking_rules: Vec<BoxRankingRule<'ctx, Q>>,
+    query: &Q,
+    universe: &RoaringBitmap,
+    from: usize,
+    length: usize,
+    logger: &mut dyn SearchLogger<Q>,
+) -> Result<Vec<u32>> {
+    logger.initial_query(query);
+    logger.ranking_rules(&ranking_rules);
+    logger.initial_universe(universe);
+
+    let distinct_fid = if let Some(field) = ctx.index.distinct_field(ctx.txn)? {
+        ctx.index.fields_ids_map(ctx.txn)?.id(field)
+    } else {
+        None
+    };
+
+    if universe.len() < from as u64 {
+        return Ok(vec![]);
+    }
+    if ranking_rules.is_empty() {
+        if let Some(distinct_fid) = distinct_fid {
+            let mut excluded = RoaringBitmap::new();
+            let mut results = vec![];
+            for docid in universe.iter() {
+                if results.len() >= from + length {
+                    break;
+                }
+                if excluded.contains(docid) {
+                    continue;
+                }
+                distinct_single_docid(ctx.index, ctx.txn, distinct_fid, docid, &mut excluded)?;
+                results.push(docid);
+            }
+            return Ok(results);
+        } else {
+            return Ok(universe.iter().skip(from).take(length).collect());
+        };
+    }
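// ---------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the "no ranking rules +
// distinct attribute" fast path above is an exclude-set deduplication.
// A minimal standalone model, with a HashSet of distinct values standing
// in for the `excluded` bitmap that `distinct_single_docid` fills with
// every docid sharing the current document's distinct value:
//
//     use std::collections::HashSet;
//
//     fn first_distinct(
//         docids: impl IntoIterator<Item = u32>,
//         distinct_value: impl Fn(u32) -> u64,
//         limit: usize,
//     ) -> Vec<u32> {
//         let mut seen = HashSet::new();
//         let mut results = Vec::new();
//         for docid in docids {
//             if results.len() >= limit {
//                 break;
//             }
//             // keep only the first docid seen for each distinct value
//             if seen.insert(distinct_value(docid)) {
//                 results.push(docid);
//             }
//         }
//         results
//     }
// ---------------------------------------------------------------------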
+
+    let ranking_rules_len = ranking_rules.len();
+
+    logger.start_iteration_ranking_rule(0, ranking_rules[0].as_ref(), query, universe);
+    ranking_rules[0].start_iteration(ctx, logger, universe, query)?;
+
+    let mut ranking_rule_universes: Vec<RoaringBitmap> =
+        vec![RoaringBitmap::default(); ranking_rules_len];
+    ranking_rule_universes[0] = universe.clone();
+
+    let mut cur_ranking_rule_index = 0;
+
+    /// Finish iterating over the current ranking rule, yielding
+    /// control to the parent (or finishing the search if not possible).
+    /// Update the candidates accordingly and inform the logger.
+    macro_rules! back {
+        () => {
+            assert!(ranking_rule_universes[cur_ranking_rule_index].is_empty());
+            logger.end_iteration_ranking_rule(
+                cur_ranking_rule_index,
+                ranking_rules[cur_ranking_rule_index].as_ref(),
+                &ranking_rule_universes[cur_ranking_rule_index],
+            );
+            ranking_rule_universes[cur_ranking_rule_index].clear();
+            ranking_rules[cur_ranking_rule_index].end_iteration(ctx, logger);
+            if cur_ranking_rule_index == 0 {
+                break;
+            } else {
+                cur_ranking_rule_index -= 1;
+            }
+        };
+    }
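// ---------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the `back!` macro pops the
// cascade back to the parent ranking rule. Ignoring `from`/`length`, the
// distinct rule, and the logger, the loop below behaves like this
// recursive model, in which each rule splits its parent's bucket into
// finer buckets and the last rule's buckets are emitted in docid order:
//
//     fn cascade(rules: &[fn(&[u32]) -> Vec<Vec<u32>>], universe: &[u32], out: &mut Vec<u32>) {
//         match rules.split_first() {
//             // no rule left: emit the bucket as-is
//             None => out.extend_from_slice(universe),
//             // otherwise refine the bucket and recurse on each sub-bucket
//             Some((rule, rest)) => {
//                 for bucket in rule(universe) {
//                     cascade(rest, &bucket, out);
//                 }
//             }
//         }
//     }
//
// The real code is iterative (an explicit rule index plus one universe
// per rule) so that the search can stop as soon as `length` documents
// have been collected.
// ---------------------------------------------------------------------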
+
+    let mut results = vec![];
+    let mut cur_offset = 0usize;
+
+    /// Add the candidates to the results. Take `distinct`, `from`, `length`, and `cur_offset`
+    /// into account and inform the logger.
+    macro_rules! maybe_add_to_results {
+        ($candidates:expr) => {
+            // First apply the distinct rule on the candidates, reducing the universes if necessary
+            let candidates = if let Some(distinct_fid) = distinct_fid {
+                let DistinctOutput { remaining, excluded } =
+                    apply_distinct_rule(ctx, distinct_fid, $candidates)?;
+                for universe in ranking_rule_universes.iter_mut() {
+                    *universe -= &excluded;
+                }
+                remaining
+            } else {
+                $candidates.clone()
+            };
+            let len = candidates.len();
+            // if the candidates are empty, there is nothing to do
+            if !candidates.is_empty() {
+                // if we still haven't reached the first document to return
+                if cur_offset < from {
+                    // and if no document from this bucket can be returned
+                    if cur_offset + (candidates.len() as usize) < from {
+                        // then just skip the bucket
+                        logger.skip_bucket_ranking_rule(
+                            cur_ranking_rule_index,
+                            ranking_rules[cur_ranking_rule_index].as_ref(),
+                            &candidates,
+                        );
+                    } else {
+                        // otherwise, skip some of the documents and add some of the rest, in order of ids
+                        let all_candidates = candidates.iter().collect::<Vec<_>>();
+                        let (skipped_candidates, candidates) =
+                            all_candidates.split_at(from - cur_offset);
+                        logger.skip_bucket_ranking_rule(
+                            cur_ranking_rule_index,
+                            ranking_rules[cur_ranking_rule_index].as_ref(),
+                            &skipped_candidates.into_iter().collect(),
+                        );
+                        let candidates = candidates
+                            .iter()
+                            .take(length - results.len())
+                            .copied()
+                            .collect::<Vec<_>>();
+                        logger.add_to_results(&candidates);
+                        results.extend(&candidates);
+                    }
+                } else {
+                    // if we have passed the offset already, add some of the documents (up to the limit)
+                    let candidates =
+                        candidates.iter().take(length - results.len()).collect::<Vec<_>>();
+                    logger.add_to_results(&candidates);
+                    results.extend(&candidates);
+                }
+            }
+            cur_offset += len as usize;
+        };
+    }
+
+    while results.len() < length {
+        // The universe for this bucket is zero or one element, so we don't need to sort
+        // anything, just extend the results and go back to the parent ranking rule.
+        if ranking_rule_universes[cur_ranking_rule_index].len() <= 1 {
+            maybe_add_to_results!(&ranking_rule_universes[cur_ranking_rule_index]);
+            ranking_rule_universes[cur_ranking_rule_index].clear();
+            back!();
+            continue;
+        }
+
+        let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(
+            ctx,
+            logger,
+            &ranking_rule_universes[cur_ranking_rule_index],
+        )? else {
+            back!();
+            continue;
+        };
+
+        logger.next_bucket_ranking_rule(
+            cur_ranking_rule_index,
+            ranking_rules[cur_ranking_rule_index].as_ref(),
+            &ranking_rule_universes[cur_ranking_rule_index],
+            &next_bucket.candidates,
+        );
+
+        debug_assert!(
+            ranking_rule_universes[cur_ranking_rule_index].is_superset(&next_bucket.candidates)
+        );
+        ranking_rule_universes[cur_ranking_rule_index] -= &next_bucket.candidates;
+
+        if cur_ranking_rule_index == ranking_rules_len - 1
+            || next_bucket.candidates.len() <= 1
+            || cur_offset + (next_bucket.candidates.len() as usize) < from
+        {
+            maybe_add_to_results!(&next_bucket.candidates);
+            continue;
+        }
+
+        cur_ranking_rule_index += 1;
+        ranking_rule_universes[cur_ranking_rule_index] = next_bucket.candidates.clone();
+        logger.start_iteration_ranking_rule(
+            cur_ranking_rule_index,
+            ranking_rules[cur_ranking_rule_index].as_ref(),
+            &next_bucket.query,
+            &ranking_rule_universes[cur_ranking_rule_index],
+        );
+        ranking_rules[cur_ranking_rule_index].start_iteration(
+            ctx,
+            logger,
+            &next_bucket.candidates,
+            &next_bucket.query,
+        )?;
+    }
+
+    Ok(results)
+}
diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs
index effd123be..a0dde4686 100644
--- a/milli/src/search/new/db_cache.rs
+++ b/milli/src/search/new/db_cache.rs
@@ -1,3 +1,4 @@
+use std::borrow::Cow;
 use std::collections::hash_map::Entry;
 use std::hash::Hash;
 
@@ -24,6 +25,8 @@ pub struct DatabaseCache<'ctx> {
     pub word_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
     pub exact_word_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
     pub word_prefix_docids: FxHashMap<Interned<String>, Option<&'ctx [u8]>>,
+
+    pub words_fst: Option<fst::Set<Cow<'ctx, [u8]>>>,
     pub word_position_docids: FxHashMap<(Interned<String>, u16), Option<&'ctx [u8]>>,
     pub word_fid_docids: FxHashMap<(Interned<String>, u16), Option<&'ctx [u8]>>,
 }
@@ -51,6 +54,16 @@ impl<'ctx> DatabaseCache<'ctx> {
     }
 }
 impl<'ctx> SearchContext<'ctx> {
+    pub fn get_words_fst(&mut self) -> Result<fst::Set<Cow<'ctx, [u8]>>> {
+        if let Some(fst) = self.db_cache.words_fst.clone() {
+            Ok(fst)
+        } else {
+            let fst = self.index.words_fst(self.txn)?;
+            self.db_cache.words_fst = Some(fst.clone());
+            Ok(fst)
+        }
+    }
+
     /// Retrieve or insert the given value in the `word_docids` database.
     pub fn get_db_word_docids(&mut self, word: Interned<String>) -> Result<Option<&'ctx [u8]>> {
         DatabaseCache::get_value(
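// ---------------------------------------------------------------------
// Illustrative note, not part of the patch: `get_words_fst` memoizes the
// words FST for the lifetime of the search context. Cloning an
// `fst::Set<Cow<'ctx, [u8]>>` is cheap when the bytes are borrowed from
// the LMDB page (`Cow::Borrowed`), so repeated lookups avoid re-reading
// the database. A hypothetical call site, shown only to demonstrate the
// caching behaviour:
//
//     fn word_is_in_index(ctx: &mut SearchContext<'_>, word: &str) -> Result<bool> {
//         let fst = ctx.get_words_fst()?; // reads LMDB only on the first call
//         Ok(fst.contains(word))
//     }
// ---------------------------------------------------------------------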
diff --git a/milli/src/search/new/distinct.rs b/milli/src/search/new/distinct.rs
index ad4b46659..7b77adf49 100644
--- a/milli/src/search/new/distinct.rs
+++ b/milli/src/search/new/distinct.rs
@@ -41,7 +41,7 @@ pub fn apply_distinct_rule(
 }
 
 /// Apply the distinct rule defined by [`apply_distinct_rule`] for a single document id.
-fn distinct_single_docid(
+pub fn distinct_single_docid(
     index: &Index,
     txn: &RoTxn,
     field_id: u16,
diff --git a/milli/src/search/new/interner.rs b/milli/src/search/new/interner.rs
index e9bfbef86..ebf18f38c 100644
--- a/milli/src/search/new/interner.rs
+++ b/milli/src/search/new/interner.rs
@@ -176,6 +176,9 @@ impl<T> Interner<T> {
     pub fn iter_mut(&mut self) -> impl Iterator<Item = (Interned<T>, &mut T)> {
         self.stable_store.iter_mut().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x))
     }
+    pub fn freeze(self) -> FixedSizeInterner<T> {
+        FixedSizeInterner { stable_store: self.stable_store }
+    }
 }
 
 /// A store of values of type `T`, each linked to a value of type `From`
diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs
index 3c4779ad9..86568d5d2 100644
--- a/milli/src/search/new/logger/detailed.rs
+++ b/milli/src/search/new/logger/detailed.rs
@@ -8,9 +8,7 @@ use roaring::RoaringBitmap;
 
 use crate::search::new::interner::{Interned, MappedInterner};
 use crate::search::new::query_graph::QueryNodeData;
-use crate::search::new::query_term::{
-    Lazy, LocatedQueryTermSubset, OneTypoTerm, QueryTerm, TwoTypoTerm, ZeroTypoTerm,
-};
+use crate::search::new::query_term::LocatedQueryTermSubset;
 use crate::search::new::ranking_rule_graph::{
     DeadEndsCache, Edge, ProximityCondition, ProximityGraph, RankingRuleGraph,
     RankingRuleGraphTrait, TypoCondition, TypoGraph,
@@ -439,87 +437,26 @@ results.{cur_ranking_rule}{cur_activated_id} {{
             positions: _,
             term_ids: _,
         }) => {
-            let QueryTerm {
-                original,
-                is_ngram: _,
-                is_prefix: _,
-                max_nbr_typos,
-                zero_typo,
-                one_typo,
-                two_typo,
-            } = ctx.term_interner.get(term_subset.original);
-
-            let original = ctx.word_interner.get(*original);
             writeln!(
                 file,
-                "{node_idx} : \"{original}\" {{
+                "{node_idx} : \"{}\" {{
 shape: class
-max_nbr_typo: {max_nbr_typos}"
+max_nbr_typo: {}",
+                term_subset.description(ctx),
+                term_subset.max_nbr_typos(ctx)
             )
             .unwrap();
 
-            let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db } =
-                zero_typo;
-
-            for w in zero_typo.iter().copied() {
-                if term_subset.zero_typo_subset.contains_word(w) {
-                    let w = ctx.word_interner.get(w);
-                    writeln!(file, "\"{w}\" : 0").unwrap();
-                }
+            for w in term_subset.all_single_words_except_prefix_db(ctx).unwrap() {
+                let w = ctx.word_interner.get(w);
+                writeln!(file, "{w}: word").unwrap();
             }
-            for w in prefix_of.iter().copied() {
-                if term_subset.zero_typo_subset.contains_word(w) {
-                    let w = ctx.word_interner.get(w);
-                    writeln!(file, "\"{w}\" : 0P").unwrap();
-                }
+            for p in term_subset.all_phrases(ctx).unwrap() {
+                writeln!(file, "{}: phrase", p.description(ctx)).unwrap();
             }
-
-            if let Some(phrase) = phrase {
-                if term_subset.zero_typo_subset.contains_phrase(*phrase) {
-                    let phrase = ctx.phrase_interner.get(*phrase);
-                    let phrase_str = phrase.description(&ctx.word_interner);
-                    writeln!(file, "\"{phrase_str}\" : phrase").unwrap();
-                }
-            }
-
-            for synonym in synonyms.iter().copied() {
-                if term_subset.zero_typo_subset.contains_phrase(synonym) {
-                    let phrase = ctx.phrase_interner.get(synonym);
-                    let phrase_str = phrase.description(&ctx.word_interner);
-                    writeln!(file, "\"{phrase_str}\" : synonym").unwrap();
-                }
-            }
-            if let Some(use_prefix_db) = use_prefix_db {
-                if term_subset.zero_typo_subset.contains_word(*use_prefix_db) {
-                    let p = ctx.word_interner.get(*use_prefix_db);
-                    writeln!(file, "use prefix DB : {p}").unwrap();
-                }
-            }
-            if let Lazy::Init(one_typo) = one_typo {
-                let OneTypoTerm { split_words, one_typo } = one_typo;
-
-                for w in
one_typo.iter().copied() { - if term_subset.one_typo_subset.contains_word(w) { - let w = ctx.word_interner.get(w); - writeln!(file, "\"{w}\" : 1").unwrap(); - } - } - if let Some(split_words) = split_words { - if term_subset.one_typo_subset.contains_phrase(*split_words) { - let phrase = ctx.phrase_interner.get(*split_words); - let phrase_str = phrase.description(&ctx.word_interner); - writeln!(file, "\"{phrase_str}\" : split_words").unwrap(); - } - } - } - if let Lazy::Init(two_typo) = two_typo { - let TwoTypoTerm { two_typos } = two_typo; - for w in two_typos.iter().copied() { - if term_subset.two_typo_subset.contains_word(w) { - let w = ctx.word_interner.get(w); - writeln!(file, "\"{w}\" : 2").unwrap(); - } - } + if let Some(w) = term_subset.use_prefix_db(ctx) { + let w = ctx.word_interner.get(w); + writeln!(file, "{w}: prefix db").unwrap(); } writeln!(file, "}}").unwrap(); diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs index 889e811ad..15cb78784 100644 --- a/milli/src/search/new/logger/mod.rs +++ b/milli/src/search/new/logger/mod.rs @@ -1,6 +1,8 @@ // #[cfg(test)] pub mod detailed; +pub mod test_logger; + use roaring::RoaringBitmap; use super::interner::{Interned, MappedInterner}; diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 779e589b3..b307b2434 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -1,3 +1,4 @@ +mod bucket_sort; mod db_cache; mod distinct; mod graph_based_ranking_rule; @@ -18,6 +19,9 @@ mod sort; // TODO: documentation + comments mod words; +#[cfg(test)] +mod tests; + use std::collections::HashSet; use charabia::TokenizerBuilder; @@ -29,7 +33,8 @@ pub use logger::detailed::DetailedSearchLogger; pub use logger::{DefaultSearchLogger, SearchLogger}; use query_graph::{QueryGraph, QueryNode}; use query_term::{located_query_terms_from_string, Phrase, QueryTerm}; -use ranking_rules::{bucket_sort, PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait}; +use ranking_rules::{PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait}; +use bucket_sort::bucket_sort; use resolve_query_graph::PhraseDocIdsCache; use roaring::RoaringBitmap; use words::Words; diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index cba9e590f..2662ef730 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -1,11 +1,10 @@ use super::interner::{FixedSizeInterner, Interned}; use super::query_term::{ - self, number_of_typos_allowed, LocatedQueryTerm, LocatedQueryTermSubset, NTypoTermSubset, - QueryTermSubset, + self, number_of_typos_allowed, LocatedQueryTerm, LocatedQueryTermSubset, QueryTermSubset, }; use super::small_bitmap::SmallBitmap; use super::SearchContext; -use crate::search::new::interner::DedupInterner; +use crate::search::new::interner::Interner; use crate::Result; use std::cmp::Ordering; use std::collections::BTreeMap; @@ -107,12 +106,7 @@ impl QueryGraph { let new_node_idx = add_node( &mut nodes_data, QueryNodeData::Term(LocatedQueryTermSubset { - term_subset: QueryTermSubset { - original: Interned::from_raw(term_idx as u16), - zero_typo_subset: NTypoTermSubset::All, - one_typo_subset: NTypoTermSubset::All, - two_typo_subset: NTypoTermSubset::All, - }, + term_subset: QueryTermSubset::full(Interned::from_raw(term_idx as u16)), positions: terms[term_idx].positions.clone(), term_ids: term_idx as u8..=term_idx as u8, }), @@ -126,12 +120,7 @@ impl QueryGraph { let ngram_idx = add_node( &mut nodes_data, 
QueryNodeData::Term(LocatedQueryTermSubset { - term_subset: QueryTermSubset { - original: ngram.value, - zero_typo_subset: NTypoTermSubset::All, - one_typo_subset: NTypoTermSubset::All, - two_typo_subset: NTypoTermSubset::All, - }, + term_subset: QueryTermSubset::full(ngram.value), positions: ngram.positions, term_ids: term_idx as u8 - 1..=term_idx as u8, }), @@ -146,12 +135,7 @@ impl QueryGraph { let ngram_idx = add_node( &mut nodes_data, QueryNodeData::Term(LocatedQueryTermSubset { - term_subset: QueryTermSubset { - original: ngram.value, - zero_typo_subset: NTypoTermSubset::All, - one_typo_subset: NTypoTermSubset::All, - two_typo_subset: NTypoTermSubset::All, - }, + term_subset: QueryTermSubset::full(ngram.value), positions: ngram.positions, term_ids: term_idx as u8 - 2..=term_idx as u8, }), @@ -329,7 +313,7 @@ impl QueryGraph { let mut at_least_one_phrase = false; for (node_id, node) in self.nodes.iter() { let QueryNodeData::Term(t) = &node.data else { continue }; - if ctx.term_interner.get(t.term_subset.original).zero_typo.phrase.is_some() { + if t.term_subset.original_phrase(ctx).is_some() { at_least_one_phrase = true; continue; } @@ -361,29 +345,13 @@ impl QueryGraph { Build a query graph from a list of paths The paths are composed of source and dest terms. - If the source term is `None`, then the last dest term is used - as the predecessor of the dest term. If the source is Some(_), - then an edge is built between the last dest term and the source, - and between the source and new dest term. - Note that the resulting graph will not correspond to a perfect - representation of the set of paths. For example, consider the following paths: ```txt PATH 1 : a -> b1 -> c1 -> d -> e1 PATH 2 : a -> b2 -> c2 -> d -> e2 ``` Then the resulting graph will be: - ```txt - ┌────┐ ┌────┐ ┌────┐ - ┌──│ b1 │──│ c1 │─┐ ┌──│ e1 │ - ┌────┐ │ └────┘ └────┘ │ ┌────┐ │ └────┘ - │ a │─┤ ├─│ d │─┤ - └────┘ │ ┌────┐ ┌────┐ │ └────┘ │ ┌────┐ - └──│ b2 │──│ c2 │─┘ └──│ e2 │ - └────┘ └────┘ └────┘ - ``` - which is different from the fully correct representation: ```txt ┌────┐ ┌────┐ ┌────┐ ┌────┐ ┌──│ b1 │──│ c1 │───│ d │───│ e1 │ @@ -399,21 +367,51 @@ impl QueryGraph { pub fn build_from_paths( paths: Vec, LocatedQueryTermSubset)>>, ) -> Self { - let mut node_data = DedupInterner::default(); - let root_node = node_data.insert(QueryNodeData::Start); - let end_node = node_data.insert(QueryNodeData::End); + let mut node_data = Interner::default(); + let root_node = node_data.push(QueryNodeData::Start); + let end_node = node_data.push(QueryNodeData::End); + + let mut paths_with_single_terms = vec![]; + + for path in paths { + let mut processed_path = vec![]; + let mut prev_dest_term: Option = None; + for (start_term, dest_term) in path { + if let Some(prev_dest_term) = prev_dest_term.take() { + if let Some(mut start_term) = start_term { + if start_term.term_ids == prev_dest_term.term_ids { + start_term.term_subset.intersect(&prev_dest_term.term_subset); + processed_path.push(start_term); + } else { + processed_path.push(prev_dest_term); + processed_path.push(start_term); + } + } else { + processed_path.push(prev_dest_term); + } + } else if let Some(start_term) = start_term { + processed_path.push(start_term); + } + prev_dest_term = Some(dest_term); + } + if let Some(prev_dest_term) = prev_dest_term { + processed_path.push(prev_dest_term); + } + paths_with_single_terms.push(processed_path); + } + + // TODO: make a prefix tree of the processed paths to avoid uselessly duplicating nodes let mut paths_with_ids = vec![]; 
- for path in paths { + for path in paths_with_single_terms { let mut path_with_ids = vec![]; - for node in path { - let (start_term, end_term) = node; - let src_node_id = start_term.map(|x| node_data.insert(QueryNodeData::Term(x))); - let dest_node_id = node_data.insert(QueryNodeData::Term(end_term)); - path_with_ids.push((src_node_id, dest_node_id)); + for term in path { + let id = node_data.push(QueryNodeData::Term(term)); + path_with_ids.push(Interned::from_raw(id.into_raw())); } paths_with_ids.push(path_with_ids); } + let nodes_data = node_data.freeze(); let nodes_data_len = nodes_data.len(); let mut nodes = nodes_data.map_move(|n| QueryNode { @@ -422,28 +420,22 @@ impl QueryGraph { successors: SmallBitmap::new(nodes_data_len), }); - let root_node = Interned::from_raw(root_node.into_raw()); - let end_node = Interned::from_raw(end_node.into_raw()); + let root_node = Interned::::from_raw(root_node.into_raw()); + let end_node = Interned::::from_raw(end_node.into_raw()); for path in paths_with_ids { - let mut prev_node = root_node; - for node in path { - let (start_term, dest_term) = node; - let end_term = Interned::from_raw(dest_term.into_raw()); - let src = if let Some(start_term) = start_term { - let start_term = Interned::from_raw(start_term.into_raw()); - nodes.get_mut(prev_node).successors.insert(start_term); - nodes.get_mut(start_term).predecessors.insert(prev_node); - start_term - } else { - prev_node - }; - nodes.get_mut(src).successors.insert(end_term); - nodes.get_mut(end_term).predecessors.insert(src); - prev_node = end_term; + let mut prev_node_id = root_node; + for node_id in path { + let prev_node = nodes.get_mut(prev_node_id); + prev_node.successors.insert(node_id); + let node = nodes.get_mut(node_id); + node.predecessors.insert(prev_node_id); + prev_node_id = node_id; } - nodes.get_mut(prev_node).successors.insert(end_node); - nodes.get_mut(end_node).predecessors.insert(prev_node); + let prev_node = nodes.get_mut(prev_node_id); + prev_node.successors.insert(end_node); + let node = nodes.get_mut(end_node); + node.predecessors.insert(prev_node_id); } QueryGraph { root_node, end_node, nodes } diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs deleted file mode 100644 index 4e3922980..000000000 --- a/milli/src/search/new/query_term.rs +++ /dev/null @@ -1,1096 +0,0 @@ -use std::borrow::Cow; -use std::collections::BTreeSet; -use std::ops::{ControlFlow, RangeInclusive}; - -use charabia::normalizer::NormalizedTokenIter; -use charabia::{SeparatorKind, TokenKind}; -use either::Either; -use fst::automaton::Str; -use fst::{Automaton, IntoStreamer, Streamer}; -use heed::types::DecodeIgnore; -use heed::RoTxn; -use itertools::Itertools; - -use super::interner::{DedupInterner, Interned}; -use super::{limits, SearchContext}; -use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union}; -use crate::search::{build_dfa, get_first}; -use crate::{CboRoaringBitmapLenCodec, Index, Result, MAX_WORD_LENGTH}; - -/// A phrase in the user's search query, consisting of several words -/// that must appear side-by-side in the search results. 
-#[derive(Default, Clone, PartialEq, Eq, Hash)] -pub struct Phrase { - pub words: Vec>>, -} -impl Phrase { - pub fn description(&self, interner: &DedupInterner) -> String { - self.words.iter().flatten().map(|w| interner.get(*w)).join(" ") - } -} - -#[derive(Clone, PartialEq, Eq, Hash)] -pub enum Lazy { - Uninit, - Init(T), -} -impl Lazy { - pub fn is_init(&self) -> bool { - match self { - Lazy::Uninit => false, - Lazy::Init(_) => true, - } - } - pub fn is_uninit(&self) -> bool { - match self { - Lazy::Uninit => true, - Lazy::Init(_) => false, - } - } -} - -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub enum NTypoTermSubset { - All, - Subset { - words: BTreeSet>, - phrases: BTreeSet>, - // TODO: prefixes: BTreeSet>, - }, - Nothing, -} - -impl NTypoTermSubset { - pub fn contains_word(&self, word: Interned) -> bool { - match self { - NTypoTermSubset::All => true, - NTypoTermSubset::Subset { words, phrases: _ } => words.contains(&word), - NTypoTermSubset::Nothing => false, - } - } - pub fn contains_phrase(&self, phrase: Interned) -> bool { - match self { - NTypoTermSubset::All => true, - NTypoTermSubset::Subset { words: _, phrases } => phrases.contains(&phrase), - NTypoTermSubset::Nothing => false, - } - } - pub fn is_empty(&self) -> bool { - match self { - NTypoTermSubset::All => false, - NTypoTermSubset::Subset { words, phrases } => words.is_empty() && phrases.is_empty(), - NTypoTermSubset::Nothing => true, - } - } - pub fn union(&mut self, other: &Self) { - match self { - Self::All => {} - Self::Subset { words, phrases } => match other { - Self::All => { - *self = Self::All; - } - Self::Subset { words: w2, phrases: p2 } => { - words.extend(w2); - phrases.extend(p2); - } - Self::Nothing => {} - }, - Self::Nothing => { - *self = other.clone(); - } - } - } - pub fn intersect(&mut self, other: &Self) { - match self { - Self::All => *self = other.clone(), - Self::Subset { words, phrases } => match other { - Self::All => {} - Self::Subset { words: w2, phrases: p2 } => { - let mut ws = BTreeSet::new(); - for w in words.intersection(w2) { - ws.insert(*w); - } - let mut ps = BTreeSet::new(); - for p in phrases.intersection(p2) { - ps.insert(*p); - } - *words = ws; - *phrases = ps; - } - Self::Nothing => *self = Self::Nothing, - }, - Self::Nothing => {} - } - } -} - -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct QueryTermSubset { - pub original: Interned, - pub zero_typo_subset: NTypoTermSubset, - pub one_typo_subset: NTypoTermSubset, - pub two_typo_subset: NTypoTermSubset, -} - -#[derive(Clone, PartialEq, Eq, Hash)] -pub struct LocatedQueryTermSubset { - pub term_subset: QueryTermSubset, - pub positions: RangeInclusive, - pub term_ids: RangeInclusive, -} - -#[derive(Clone, Copy)] -pub enum ExactTerm { - Phrase(Interned), - Word(Interned), -} - -impl ExactTerm { - pub fn interned_words<'ctx>( - &self, - ctx: &'ctx SearchContext<'ctx>, - ) -> impl Iterator>> + 'ctx { - match *self { - ExactTerm::Phrase(phrase) => { - let phrase = ctx.phrase_interner.get(phrase); - Either::Left(phrase.words.iter().copied()) - } - ExactTerm::Word(word) => Either::Right(std::iter::once(Some(word))), - } - } -} - -impl QueryTermSubset { - pub fn exact_term(&self, ctx: &SearchContext) -> Option { - let full_query_term = ctx.term_interner.get(self.original); - if full_query_term.is_ngram { - return None; - } - // TODO: included in subset - if let Some(phrase) = full_query_term.zero_typo.phrase { - self.zero_typo_subset.contains_phrase(phrase).then_some(ExactTerm::Phrase(phrase)) - } else if let Some(word) = 
full_query_term.zero_typo.exact { - self.zero_typo_subset.contains_word(word).then_some(ExactTerm::Word(word)) - } else { - None - } - } - - pub fn empty(for_term: Interned) -> Self { - Self { - original: for_term, - zero_typo_subset: NTypoTermSubset::Nothing, - one_typo_subset: NTypoTermSubset::Nothing, - two_typo_subset: NTypoTermSubset::Nothing, - } - } - pub fn full(for_term: Interned) -> Self { - Self { - original: for_term, - zero_typo_subset: NTypoTermSubset::All, - one_typo_subset: NTypoTermSubset::All, - two_typo_subset: NTypoTermSubset::All, - } - } - - pub fn union(&mut self, other: &Self) { - assert!(self.original == other.original); - self.zero_typo_subset.union(&other.zero_typo_subset); - self.one_typo_subset.union(&other.one_typo_subset); - self.two_typo_subset.union(&other.two_typo_subset); - } - pub fn intersect(&mut self, other: &Self) { - assert!(self.original == other.original); - self.zero_typo_subset.intersect(&other.zero_typo_subset); - self.one_typo_subset.intersect(&other.one_typo_subset); - self.two_typo_subset.intersect(&other.two_typo_subset); - } - - pub fn use_prefix_db(&self, ctx: &SearchContext) -> Option> { - let original = ctx.term_interner.get(self.original); - let Some(use_prefix_db) = original.zero_typo.use_prefix_db else { - return None - }; - match &self.zero_typo_subset { - NTypoTermSubset::All => Some(use_prefix_db), - NTypoTermSubset::Subset { words, phrases: _ } => { - // TODO: use a subset of prefix words instead - if words.contains(&use_prefix_db) { - Some(use_prefix_db) - } else { - None - } - } - NTypoTermSubset::Nothing => None, - } - } - pub fn all_single_words_except_prefix_db( - &self, - ctx: &mut SearchContext, - ) -> Result>> { - let original = ctx.term_interner.get_mut(self.original); - let mut result = BTreeSet::default(); - // TODO: a compute_partially funtion - if !self.one_typo_subset.is_empty() || !self.two_typo_subset.is_empty() { - original.compute_fully_if_needed( - ctx.index, - ctx.txn, - &mut ctx.word_interner, - &mut ctx.phrase_interner, - )?; - } - - if !self.zero_typo_subset.is_empty() { - let ZeroTypoTerm { - phrase: _, - exact: zero_typo, - prefix_of, - synonyms: _, - use_prefix_db: _, - } = &original.zero_typo; - result.extend(zero_typo.iter().copied()); - result.extend(prefix_of.iter().copied()); - }; - - match &self.one_typo_subset { - NTypoTermSubset::All => { - let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else { - panic!() - }; - result.extend(one_typo.iter().copied()) - } - NTypoTermSubset::Subset { words, phrases: _ } => { - let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else { - panic!() - }; - result.extend(one_typo.intersection(words)); - } - NTypoTermSubset::Nothing => {} - }; - - match &self.two_typo_subset { - NTypoTermSubset::All => { - let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else { - panic!() - }; - result.extend(two_typos.iter().copied()); - } - NTypoTermSubset::Subset { words, phrases: _ } => { - let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else { - panic!() - }; - result.extend(two_typos.intersection(words)); - } - NTypoTermSubset::Nothing => {} - }; - - Ok(result) - } - pub fn all_phrases(&self, ctx: &mut SearchContext) -> Result>> { - let original = ctx.term_interner.get_mut(self.original); - let mut result = BTreeSet::default(); - - if !self.one_typo_subset.is_empty() { - // TODO: compute less than fully if possible - original.compute_fully_if_needed( - ctx.index, - ctx.txn, - &mut 
ctx.word_interner, - &mut ctx.phrase_interner, - )?; - } - - let ZeroTypoTerm { phrase, exact: _, prefix_of: _, synonyms, use_prefix_db: _ } = - &original.zero_typo; - result.extend(phrase.iter().copied()); - result.extend(synonyms.iter().copied()); - - if !self.one_typo_subset.is_empty() { - let Lazy::Init(OneTypoTerm { split_words, one_typo: _ }) = &original.one_typo else { - panic!(); - }; - result.extend(split_words.iter().copied()); - } - - Ok(result) - } -} - -impl QueryTerm { - pub fn compute_fully_if_needed( - &mut self, - index: &Index, - txn: &RoTxn, - word_interner: &mut DedupInterner, - phrase_interner: &mut DedupInterner, - ) -> Result<()> { - if self.max_nbr_typos == 0 { - self.one_typo = Lazy::Init(OneTypoTerm::default()); - self.two_typo = Lazy::Init(TwoTypoTerm::default()); - } else if self.max_nbr_typos == 1 && self.one_typo.is_uninit() { - assert!(self.two_typo.is_uninit()); - self.initialize_one_typo_subterm(index, txn, word_interner, phrase_interner)?; - assert!(self.one_typo.is_init()); - self.two_typo = Lazy::Init(TwoTypoTerm::default()); - } else if self.max_nbr_typos > 1 && self.two_typo.is_uninit() { - assert!(self.two_typo.is_uninit()); - self.initialize_one_and_two_typo_subterm(index, txn, word_interner, phrase_interner)?; - assert!(self.one_typo.is_init() && self.two_typo.is_init()); - } - Ok(()) - } -} - -#[derive(Clone, PartialEq, Eq, Hash)] -pub struct QueryTerm { - pub original: Interned, - pub is_ngram: bool, - pub max_nbr_typos: u8, - pub is_prefix: bool, - pub zero_typo: ZeroTypoTerm, - // May not be computed yet - pub one_typo: Lazy, - // May not be computed yet - pub two_typo: Lazy, -} - -// SubTerms will be in a dedup interner -#[derive(Default, Clone, PartialEq, Eq, Hash)] -pub struct ZeroTypoTerm { - /// The original phrase, if any - pub phrase: Option>, - /// A single word equivalent to the original term, with zero typos - pub exact: Option>, - /// All the words that contain the original word as prefix - pub prefix_of: BTreeSet>, - /// All the synonyms of the original word or phrase - pub synonyms: BTreeSet>, - /// A prefix in the prefix databases matching the original word - pub use_prefix_db: Option>, -} -#[derive(Default, Clone, PartialEq, Eq, Hash)] -pub struct OneTypoTerm { - /// The original word split into multiple consecutive words - pub split_words: Option>, - /// Words that are 1 typo away from the original word - pub one_typo: BTreeSet>, -} -#[derive(Default, Clone, PartialEq, Eq, Hash)] -pub struct TwoTypoTerm { - /// Words that are 2 typos away from the original word - pub two_typos: BTreeSet>, -} - -impl ZeroTypoTerm { - fn is_empty(&self) -> bool { - let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db } = self; - phrase.is_none() - && zero_typo.is_none() - && prefix_of.is_empty() - && synonyms.is_empty() - && use_prefix_db.is_none() - } -} -impl OneTypoTerm { - fn is_empty(&self) -> bool { - let OneTypoTerm { split_words, one_typo } = self; - one_typo.is_empty() && split_words.is_none() - } -} -impl TwoTypoTerm { - fn is_empty(&self) -> bool { - let TwoTypoTerm { two_typos } = self; - two_typos.is_empty() - } -} - -impl QueryTerm { - pub fn phrase( - word_interner: &mut DedupInterner, - phrase_interner: &mut DedupInterner, - phrase: Phrase, - ) -> Self { - Self { - original: word_interner.insert(phrase.description(word_interner)), - is_ngram: false, - max_nbr_typos: 0, - is_prefix: false, - zero_typo: ZeroTypoTerm { - phrase: Some(phrase_interner.insert(phrase)), - exact: None, - prefix_of: 
BTreeSet::default(), - synonyms: BTreeSet::default(), - use_prefix_db: None, - }, - one_typo: Lazy::Uninit, - two_typo: Lazy::Uninit, - } - } - pub fn empty(word_interner: &mut DedupInterner, original: &str) -> Self { - Self { - original: word_interner.insert(original.to_owned()), - is_ngram: false, - is_prefix: false, - max_nbr_typos: 0, - zero_typo: <_>::default(), - one_typo: Lazy::Init(<_>::default()), - two_typo: Lazy::Init(<_>::default()), - } - } - - pub fn is_empty(&self) -> bool { - let Lazy::Init(one_typo) = &self.one_typo else { - return false; - }; - let Lazy::Init(two_typo) = &self.two_typo else { - return false; - }; - - self.zero_typo.is_empty() && one_typo.is_empty() && two_typo.is_empty() - } -} - -pub enum ZeroOrOneTypo { - Zero, - One, -} - -fn find_zero_typo_prefix_derivations( - word_interned: Interned, - fst: fst::Set>, - word_interner: &mut DedupInterner, - mut visit: impl FnMut(Interned) -> Result>, -) -> Result<()> { - let word = word_interner.get(word_interned).to_owned(); - let word = word.as_str(); - let prefix = Str::new(word).starts_with(); - let mut stream = fst.search(prefix).into_stream(); - - while let Some(derived_word) = stream.next() { - let derived_word = std::str::from_utf8(derived_word)?.to_owned(); - let derived_word_interned = word_interner.insert(derived_word); - if derived_word_interned != word_interned { - let cf = visit(derived_word_interned)?; - if cf.is_break() { - break; - } - } - } - Ok(()) -} - -fn find_zero_one_typo_derivations( - word_interned: Interned, - is_prefix: bool, - fst: fst::Set>, - word_interner: &mut DedupInterner, - mut visit: impl FnMut(Interned, ZeroOrOneTypo) -> Result>, -) -> Result<()> { - let word = word_interner.get(word_interned).to_owned(); - let word = word.as_str(); - - let dfa = build_dfa(word, 1, is_prefix); - let starts = StartsWith(Str::new(get_first(word))); - let mut stream = fst.search_with_state(Intersection(starts, &dfa)).into_stream(); - - while let Some((derived_word, state)) = stream.next() { - let derived_word = std::str::from_utf8(derived_word)?; - let derived_word = word_interner.insert(derived_word.to_owned()); - let d = dfa.distance(state.1); - match d.to_u8() { - 0 => { - if derived_word != word_interned { - let cf = visit(derived_word, ZeroOrOneTypo::Zero)?; - if cf.is_break() { - break; - } - } - } - 1 => { - let cf = visit(derived_word, ZeroOrOneTypo::One)?; - if cf.is_break() { - break; - } - } - _ => { - unreachable!("One typo dfa produced multiple typos") - } - } - } - Ok(()) -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub enum NumberOfTypos { - Zero, - One, - Two, -} -fn find_zero_one_two_typo_derivations( - word_interned: Interned, - is_prefix: bool, - fst: fst::Set>, - word_interner: &mut DedupInterner, - mut visit: impl FnMut(Interned, NumberOfTypos) -> Result>, -) -> Result<()> { - let word = word_interner.get(word_interned).to_owned(); - let word = word.as_str(); - - let starts = StartsWith(Str::new(get_first(word))); - let first = Intersection(build_dfa(word, 1, is_prefix), Complement(&starts)); - let second_dfa = build_dfa(word, 2, is_prefix); - let second = Intersection(&second_dfa, &starts); - let automaton = Union(first, &second); - - let mut stream = fst.search_with_state(automaton).into_stream(); - - while let Some((derived_word, state)) = stream.next() { - let derived_word = std::str::from_utf8(derived_word)?; - let derived_word_interned = word_interner.insert(derived_word.to_owned()); - // in the case the typo is on the first letter, we know the number of typo - 
// is two - if get_first(derived_word) != get_first(word) { - let cf = visit(derived_word_interned, NumberOfTypos::Two)?; - if cf.is_break() { - break; - } - } else { - // Else, we know that it is the second dfa that matched and compute the - // correct distance - let d = second_dfa.distance((state.1).0); - match d.to_u8() { - 0 => { - if derived_word_interned != word_interned { - let cf = visit(derived_word_interned, NumberOfTypos::Zero)?; - if cf.is_break() { - break; - } - } - } - 1 => { - let cf = visit(derived_word_interned, NumberOfTypos::One)?; - if cf.is_break() { - break; - } - } - 2 => { - let cf = visit(derived_word_interned, NumberOfTypos::Two)?; - if cf.is_break() { - break; - } - } - _ => unreachable!("2 typos DFA produced a distance greater than 2"), - } - } - } - Ok(()) -} - -fn partially_initialized_term_from_word( - ctx: &mut SearchContext, - word: &str, - max_typo: u8, - is_prefix: bool, -) -> Result { - let word_interned = ctx.word_interner.insert(word.to_owned()); - - if word.len() > MAX_WORD_LENGTH { - return Ok(QueryTerm::empty(&mut ctx.word_interner, word)); - } - - let fst = ctx.index.words_fst(ctx.txn)?; - - let use_prefix_db = is_prefix - && ctx - .index - .word_prefix_docids - .remap_data_type::() - .get(ctx.txn, word)? - .is_some(); - let use_prefix_db = if use_prefix_db { Some(word_interned) } else { None }; - - let mut zero_typo = None; - let mut prefix_of = BTreeSet::new(); - - if fst.contains(word) { - zero_typo = Some(word_interned); - } - - if is_prefix && use_prefix_db.is_none() { - find_zero_typo_prefix_derivations( - word_interned, - fst, - &mut ctx.word_interner, - |derived_word| { - if prefix_of.len() < limits::MAX_PREFIX_COUNT { - prefix_of.insert(derived_word); - Ok(ControlFlow::Continue(())) - } else { - Ok(ControlFlow::Break(())) - } - }, - )?; - } - let synonyms = ctx.index.synonyms(ctx.txn)?; - let mut synonym_word_count = 0; - let synonyms = synonyms - .get(&vec![word.to_owned()]) - .cloned() - .unwrap_or_default() - .into_iter() - .take(limits::MAX_SYNONYM_PHRASE_COUNT) - .filter_map(|words| { - if synonym_word_count + words.len() > limits::MAX_SYNONYM_WORD_COUNT { - return None; - } - synonym_word_count += words.len(); - let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect(); - Some(ctx.phrase_interner.insert(Phrase { words })) - }) - .collect(); - let zero_typo = - ZeroTypoTerm { phrase: None, exact: zero_typo, prefix_of, synonyms, use_prefix_db }; - - Ok(QueryTerm { - original: word_interned, - is_ngram: false, - max_nbr_typos: max_typo, - is_prefix, - zero_typo, - one_typo: Lazy::Uninit, - two_typo: Lazy::Uninit, - }) -} - -fn find_split_words( - index: &Index, - txn: &RoTxn, - word_interner: &mut DedupInterner, - phrase_interner: &mut DedupInterner, - word: &str, -) -> Result>> { - let split_words = split_best_frequency(index, txn, word)?.map(|(l, r)| { - phrase_interner.insert(Phrase { - words: vec![Some(word_interner.insert(l)), Some(word_interner.insert(r))], - }) - }); - Ok(split_words) -} - -impl QueryTerm { - fn initialize_one_typo_subterm( - &mut self, - index: &Index, - txn: &RoTxn, - word_interner: &mut DedupInterner, - phrase_interner: &mut DedupInterner, - ) -> Result<()> { - let QueryTerm { original, is_prefix, one_typo, .. 
} = self; - let original_str = word_interner.get(*original).to_owned(); - if one_typo.is_init() { - return Ok(()); - } - let mut one_typo_words = BTreeSet::new(); - - find_zero_one_typo_derivations( - *original, - *is_prefix, - index.words_fst(txn)?, - word_interner, - |derived_word, nbr_typos| { - match nbr_typos { - ZeroOrOneTypo::Zero => {} - ZeroOrOneTypo::One => { - if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { - one_typo_words.insert(derived_word); - } else { - return Ok(ControlFlow::Break(())); - } - } - } - Ok(ControlFlow::Continue(())) - }, - )?; - let split_words = - find_split_words(index, txn, word_interner, phrase_interner, original_str.as_str())?; - let one_typo = OneTypoTerm { split_words, one_typo: one_typo_words }; - - self.one_typo = Lazy::Init(one_typo); - - Ok(()) - } - fn initialize_one_and_two_typo_subterm( - &mut self, - index: &Index, - txn: &RoTxn, - word_interner: &mut DedupInterner, - phrase_interner: &mut DedupInterner, - ) -> Result<()> { - let QueryTerm { original, is_prefix, two_typo, .. } = self; - let original_str = word_interner.get(*original).to_owned(); - if two_typo.is_init() { - return Ok(()); - } - let mut one_typo_words = BTreeSet::new(); - let mut two_typo_words = BTreeSet::new(); - - find_zero_one_two_typo_derivations( - *original, - *is_prefix, - index.words_fst(txn)?, - word_interner, - |derived_word, nbr_typos| { - if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT - && two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT - { - // No chance we will add either one- or two-typo derivations anymore, stop iterating. - return Ok(ControlFlow::Break(())); - } - match nbr_typos { - NumberOfTypos::Zero => {} - NumberOfTypos::One => { - if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { - one_typo_words.insert(derived_word); - } - } - NumberOfTypos::Two => { - if two_typo_words.len() < limits::MAX_TWO_TYPOS_COUNT { - two_typo_words.insert(derived_word); - } - } - } - Ok(ControlFlow::Continue(())) - }, - )?; - let split_words = - find_split_words(index, txn, word_interner, phrase_interner, original_str.as_str())?; - let one_typo = OneTypoTerm { one_typo: one_typo_words, split_words }; - - let two_typo = TwoTypoTerm { two_typos: two_typo_words }; - - self.one_typo = Lazy::Init(one_typo); - self.two_typo = Lazy::Init(two_typo); - - Ok(()) - } -} - -/// Split the original word into the two words that appear the -/// most next to each other in the index. -/// -/// Return `None` if the original word cannot be split. -fn split_best_frequency( - index: &Index, - txn: &RoTxn, - original: &str, -) -> Result> { - let chars = original.char_indices().skip(1); - let mut best = None; - - for (i, _) in chars { - let (left, right) = original.split_at(i); - - let key = (1, left, right); - let frequency = index - .word_pair_proximity_docids - .remap_data_type::() - .get(txn, &key)? - .unwrap_or(0); - - if frequency != 0 && best.map_or(true, |(old, _, _)| frequency > old) { - best = Some((frequency, left, right)); - } - } - - Ok(best.map(|(_, left, right)| (left.to_owned(), right.to_owned()))) -} - -impl QueryTerm { - /// Return the original word from the given query term - pub fn original_single_word(&self) -> Option> { - if self.is_ngram { - None - } else { - Some(self.original) - } - } -} - -/// A query term term coupled with its position in the user's search query. 
-#[derive(Clone)] -pub struct LocatedQueryTerm { - pub value: Interned, - pub positions: RangeInclusive, -} - -impl LocatedQueryTerm { - /// Return `true` iff the term is empty - pub fn is_empty(&self, interner: &DedupInterner) -> bool { - interner.get(self.value).is_empty() - } -} - -struct PhraseBuilder { - words: Vec>>, - start: u16, - end: u16, -} - -impl PhraseBuilder { - fn empty() -> Self { - Self { words: Default::default(), start: u16::MAX, end: u16::MAX } - } - - fn is_empty(&self) -> bool { - self.words.is_empty() - } - - // precondition: token has kind Word or StopWord - fn push_word(&mut self, ctx: &mut SearchContext, token: &charabia::Token, position: u16) { - if self.is_empty() { - self.start = position; - } - self.end = position; - if let TokenKind::StopWord = token.kind { - self.words.push(None); - } else { - // token has kind Word - let word = ctx.word_interner.insert(token.lemma().to_string()); - // TODO: in a phrase, check that every word exists - // otherwise return an empty term - self.words.push(Some(word)); - } - } - - fn build(self, ctx: &mut SearchContext) -> Option { - if self.is_empty() { - return None; - } - Some(LocatedQueryTerm { - value: ctx.term_interner.push(QueryTerm::phrase( - &mut ctx.word_interner, - &mut ctx.phrase_interner, - Phrase { words: self.words }, - )), - positions: self.start..=self.end, - }) - } -} - -/// Convert the tokenised search query into a list of located query terms. -// TODO: checking if the positions are correct for phrases, separators, ngrams -pub fn located_query_terms_from_string( - ctx: &mut SearchContext, - query: NormalizedTokenIter<&[u8]>, - words_limit: Option, -) -> Result> { - let nbr_typos = number_of_typos_allowed(ctx)?; - - let mut located_terms = Vec::new(); - - let mut phrase: Option = None; - - let parts_limit = words_limit.unwrap_or(usize::MAX); - - // start with the last position as we will wrap around to position 0 at the beginning of the loop below. - let mut position = u16::MAX; - - let mut peekable = query.take(super::limits::MAX_TOKEN_COUNT).peekable(); - while let Some(token) = peekable.next() { - // early return if word limit is exceeded - if located_terms.len() >= parts_limit { - return Ok(located_terms); - } - - match token.kind { - TokenKind::Word | TokenKind::StopWord => { - // On first loop, goes from u16::MAX to 0, then normal increment. - position = position.wrapping_add(1); - - // 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote, - // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word, - // 3. if the word is the last token of the query we push it as a prefix word. 
- if let Some(phrase) = &mut phrase { - phrase.push_word(ctx, &token, position) - } else if peekable.peek().is_some() { - match token.kind { - TokenKind::Word => { - let word = token.lemma(); - let term = partially_initialized_term_from_word( - ctx, - word, - nbr_typos(word), - false, - )?; - let located_term = LocatedQueryTerm { - value: ctx.term_interner.push(term), - positions: position..=position, - }; - located_terms.push(located_term); - } - TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => {} - } - } else { - let word = token.lemma(); - let term = - partially_initialized_term_from_word(ctx, word, nbr_typos(word), true)?; - let located_term = LocatedQueryTerm { - value: ctx.term_interner.push(term), - positions: position..=position, - }; - located_terms.push(located_term); - } - } - TokenKind::Separator(separator_kind) => { - match separator_kind { - SeparatorKind::Hard => { - position += 8; - } - SeparatorKind::Soft => { - position += 0; - } - } - - phrase = 'phrase: { - let phrase = phrase.take(); - - // If we have a hard separator inside a phrase, we immediately start a new phrase - let phrase = if separator_kind == SeparatorKind::Hard { - if let Some(phrase) = phrase { - if let Some(located_query_term) = phrase.build(ctx) { - located_terms.push(located_query_term) - } - Some(PhraseBuilder::empty()) - } else { - None - } - } else { - phrase - }; - - // We close and start a new phrase depending on the number of double quotes - let mut quote_count = token.lemma().chars().filter(|&s| s == '"').count(); - if quote_count == 0 { - break 'phrase phrase; - } - - // Consume the closing quote and the phrase - if let Some(phrase) = phrase { - // Per the check above, quote_count > 0 - quote_count -= 1; - if let Some(located_query_term) = phrase.build(ctx) { - located_terms.push(located_query_term) - } - } - - // Start new phrase if the token ends with an opening quote - (quote_count % 2 == 1).then_some(PhraseBuilder::empty()) - }; - } - _ => (), - } - } - - // If a quote is never closed, we consider all of the end of the query as a phrase. - if let Some(phrase) = phrase.take() { - if let Some(located_query_term) = phrase.build(ctx) { - located_terms.push(located_query_term); - } - } - - Ok(located_terms) -} - -pub fn number_of_typos_allowed<'ctx>( - ctx: &SearchContext<'ctx>, -) -> Result u8 + 'ctx> { - let authorize_typos = ctx.index.authorize_typos(ctx.txn)?; - let min_len_one_typo = ctx.index.min_word_len_one_typo(ctx.txn)?; - let min_len_two_typos = ctx.index.min_word_len_two_typos(ctx.txn)?; - - // TODO: should `exact_words` also disable prefix search, ngrams, split words, or synonyms? 
- let exact_words = ctx.index.exact_words(ctx.txn)?; - - Ok(Box::new(move |word: &str| { - if !authorize_typos - || word.len() < min_len_one_typo as usize - || exact_words.as_ref().map_or(false, |fst| fst.contains(word)) - { - 0 - } else if word.len() < min_len_two_typos as usize { - 1 - } else { - 2 - } - })) -} - -pub fn make_ngram( - ctx: &mut SearchContext, - terms: &[LocatedQueryTerm], - number_of_typos_allowed: &impl Fn(&str) -> u8, -) -> Result> { - assert!(!terms.is_empty()); - for t in terms { - if ctx.term_interner.get(t.value).zero_typo.phrase.is_some() { - return Ok(None); - } - } - for ts in terms.windows(2) { - let [t1, t2] = ts else { panic!() }; - if *t1.positions.end() != t2.positions.start() - 1 { - return Ok(None); - } - } - let mut words_interned = vec![]; - for term in terms { - if let Some(original_term_word) = ctx.term_interner.get(term.value).original_single_word() { - words_interned.push(original_term_word); - } else { - return Ok(None); - } - } - let words = - words_interned.iter().map(|&i| ctx.word_interner.get(i).to_owned()).collect::>(); - - let start = *terms.first().as_ref().unwrap().positions.start(); - let end = *terms.last().as_ref().unwrap().positions.end(); - let is_prefix = ctx.term_interner.get(terms.last().as_ref().unwrap().value).is_prefix; - let ngram_str = words.join(""); - if ngram_str.len() > MAX_WORD_LENGTH { - return Ok(None); - } - - let max_nbr_typos = - number_of_typos_allowed(ngram_str.as_str()).saturating_sub(terms.len() as u8 - 1); - - let mut term = partially_initialized_term_from_word(ctx, &ngram_str, max_nbr_typos, is_prefix)?; - - // let (_, mut zero_typo, mut one_typo, two_typo) = - // all_subterms_from_word(ctx, &ngram_str, max_nbr_typos, is_prefix)?; - let original = ctx.word_interner.insert(words.join(" ")); - - // Now add the synonyms - let index_synonyms = ctx.index.synonyms(ctx.txn)?; - - term.zero_typo.synonyms.extend( - index_synonyms.get(&words).cloned().unwrap_or_default().into_iter().map(|words| { - let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect(); - ctx.phrase_interner.insert(Phrase { words }) - }), - ); - - let term = QueryTerm { - original, - is_ngram: true, - is_prefix, - max_nbr_typos, - zero_typo: term.zero_typo, - one_typo: Lazy::Uninit, - two_typo: Lazy::Uninit, - }; - - let term = LocatedQueryTerm { value: ctx.term_interner.push(term), positions: start..=end }; - - Ok(Some(term)) -} diff --git a/milli/src/search/new/query_term/compute_derivations.rs b/milli/src/search/new/query_term/compute_derivations.rs new file mode 100644 index 000000000..695c57f08 --- /dev/null +++ b/milli/src/search/new/query_term/compute_derivations.rs @@ -0,0 +1,381 @@ +use fst::automaton::Str; +use fst::{Automaton, IntoStreamer, Streamer}; +use heed::types::DecodeIgnore; +use heed::BytesDecode; +use std::borrow::Cow; +use std::collections::BTreeSet; +use std::ops::ControlFlow; + +use super::*; +use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union}; +use crate::search::new::query_term::TwoTypoTerm; +use crate::search::new::{limits, SearchContext}; +use crate::search::{build_dfa, get_first}; +use crate::{CboRoaringBitmapLenCodec, Result, MAX_WORD_LENGTH}; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum NumberOfTypos { + Zero, + One, + Two, +} + +pub enum ZeroOrOneTypo { + Zero, + One, +} + +impl Interned { + pub fn compute_fully_if_needed(self, ctx: &mut SearchContext) -> Result<()> { + let s = ctx.term_interner.get_mut(self); + if s.max_nbr_typos == 0 { + s.one_typo = 
Lazy::Init(OneTypoTerm::default()); + s.two_typo = Lazy::Init(TwoTypoTerm::default()); + } else if s.max_nbr_typos == 1 && s.one_typo.is_uninit() { + assert!(s.two_typo.is_uninit()); + self.initialize_one_typo_subterm(ctx)?; + let s = ctx.term_interner.get_mut(self); + assert!(s.one_typo.is_init()); + s.two_typo = Lazy::Init(TwoTypoTerm::default()); + } else if s.max_nbr_typos > 1 && s.two_typo.is_uninit() { + assert!(s.two_typo.is_uninit()); + self.initialize_one_and_two_typo_subterm(ctx)?; + let s = ctx.term_interner.get_mut(self); + assert!(s.one_typo.is_init() && s.two_typo.is_init()); + } + Ok(()) + } +} + +fn find_zero_typo_prefix_derivations( + word_interned: Interned, + fst: fst::Set>, + word_interner: &mut DedupInterner, + mut visit: impl FnMut(Interned) -> Result>, +) -> Result<()> { + let word = word_interner.get(word_interned).to_owned(); + let word = word.as_str(); + let prefix = Str::new(word).starts_with(); + let mut stream = fst.search(prefix).into_stream(); + + while let Some(derived_word) = stream.next() { + let derived_word = std::str::from_utf8(derived_word)?.to_owned(); + let derived_word_interned = word_interner.insert(derived_word); + if derived_word_interned != word_interned { + let cf = visit(derived_word_interned)?; + if cf.is_break() { + break; + } + } + } + Ok(()) +} + +fn find_zero_one_typo_derivations( + ctx: &mut SearchContext, + word_interned: Interned, + is_prefix: bool, + mut visit: impl FnMut(Interned, ZeroOrOneTypo) -> Result>, +) -> Result<()> { + let fst = ctx.get_words_fst()?; + let word = ctx.word_interner.get(word_interned).to_owned(); + let word = word.as_str(); + + let dfa = build_dfa(word, 1, is_prefix); + let starts = StartsWith(Str::new(get_first(word))); + let mut stream = fst.search_with_state(Intersection(starts, &dfa)).into_stream(); + + while let Some((derived_word, state)) = stream.next() { + let derived_word = std::str::from_utf8(derived_word)?; + let derived_word = ctx.word_interner.insert(derived_word.to_owned()); + let d = dfa.distance(state.1); + match d.to_u8() { + 0 => { + if derived_word != word_interned { + let cf = visit(derived_word, ZeroOrOneTypo::Zero)?; + if cf.is_break() { + break; + } + } + } + 1 => { + let cf = visit(derived_word, ZeroOrOneTypo::One)?; + if cf.is_break() { + break; + } + } + _ => { + unreachable!("One typo dfa produced multiple typos") + } + } + } + Ok(()) +} + +fn find_zero_one_two_typo_derivations( + word_interned: Interned, + is_prefix: bool, + fst: fst::Set>, + word_interner: &mut DedupInterner, + mut visit: impl FnMut(Interned, NumberOfTypos) -> Result>, +) -> Result<()> { + let word = word_interner.get(word_interned).to_owned(); + let word = word.as_str(); + + let starts = StartsWith(Str::new(get_first(word))); + let first = Intersection(build_dfa(word, 1, is_prefix), Complement(&starts)); + let second_dfa = build_dfa(word, 2, is_prefix); + let second = Intersection(&second_dfa, &starts); + let automaton = Union(first, &second); + + let mut stream = fst.search_with_state(automaton).into_stream(); + + while let Some((derived_word, state)) = stream.next() { + let derived_word = std::str::from_utf8(derived_word)?; + let derived_word_interned = word_interner.insert(derived_word.to_owned()); + // in the case the typo is on the first letter, we know the number of typo + // is two + if get_first(derived_word) != get_first(word) { + let cf = visit(derived_word_interned, NumberOfTypos::Two)?; + if cf.is_break() { + break; + } + } else { + // Else, we know that it is the second dfa that matched and compute 
the + // correct distance + let d = second_dfa.distance((state.1).0); + match d.to_u8() { + 0 => { + if derived_word_interned != word_interned { + let cf = visit(derived_word_interned, NumberOfTypos::Zero)?; + if cf.is_break() { + break; + } + } + } + 1 => { + let cf = visit(derived_word_interned, NumberOfTypos::One)?; + if cf.is_break() { + break; + } + } + 2 => { + let cf = visit(derived_word_interned, NumberOfTypos::Two)?; + if cf.is_break() { + break; + } + } + _ => unreachable!("2 typos DFA produced a distance greater than 2"), + } + } + } + Ok(()) +} + +pub fn partially_initialized_term_from_word( + ctx: &mut SearchContext, + word: &str, + max_typo: u8, + is_prefix: bool, +) -> Result { + let word_interned = ctx.word_interner.insert(word.to_owned()); + + if word.len() > MAX_WORD_LENGTH { + return Ok({ + QueryTerm { + original: ctx.word_interner.insert(word.to_owned()), + ngram_words: None, + is_prefix: false, + max_nbr_typos: 0, + zero_typo: <_>::default(), + one_typo: Lazy::Init(<_>::default()), + two_typo: Lazy::Init(<_>::default()), + } + }); + } + + let fst = ctx.index.words_fst(ctx.txn)?; + + let use_prefix_db = is_prefix + && ctx + .index + .word_prefix_docids + .remap_data_type::() + .get(ctx.txn, word)? + .is_some(); + let use_prefix_db = if use_prefix_db { Some(word_interned) } else { None }; + + let mut zero_typo = None; + let mut prefix_of = BTreeSet::new(); + + if fst.contains(word) { + zero_typo = Some(word_interned); + } + + if is_prefix && use_prefix_db.is_none() { + find_zero_typo_prefix_derivations( + word_interned, + fst, + &mut ctx.word_interner, + |derived_word| { + if prefix_of.len() < limits::MAX_PREFIX_COUNT { + prefix_of.insert(derived_word); + Ok(ControlFlow::Continue(())) + } else { + Ok(ControlFlow::Break(())) + } + }, + )?; + } + let synonyms = ctx.index.synonyms(ctx.txn)?; + let mut synonym_word_count = 0; + let synonyms = synonyms + .get(&vec![word.to_owned()]) + .cloned() + .unwrap_or_default() + .into_iter() + .take(limits::MAX_SYNONYM_PHRASE_COUNT) + .filter_map(|words| { + if synonym_word_count + words.len() > limits::MAX_SYNONYM_WORD_COUNT { + return None; + } + synonym_word_count += words.len(); + let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect(); + Some(ctx.phrase_interner.insert(Phrase { words })) + }) + .collect(); + let zero_typo = + ZeroTypoTerm { phrase: None, exact: zero_typo, prefix_of, synonyms, use_prefix_db }; + + Ok(QueryTerm { + original: word_interned, + ngram_words: None, + max_nbr_typos: max_typo, + is_prefix, + zero_typo, + one_typo: Lazy::Uninit, + two_typo: Lazy::Uninit, + }) +} + +fn find_split_words(ctx: &mut SearchContext, word: &str) -> Result>> { + if let Some((l, r)) = split_best_frequency(ctx, word)? { + Ok(Some(ctx.phrase_interner.insert(Phrase { words: vec![Some(l), Some(r)] }))) + } else { + Ok(None) + } +} + +impl Interned { + fn initialize_one_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> { + let self_mut = ctx.term_interner.get_mut(self); + let QueryTerm { original, is_prefix, one_typo, .. 
} = self_mut; + let original = *original; + let is_prefix = *is_prefix; + // let original_str = ctx.word_interner.get(*original).to_owned(); + if one_typo.is_init() { + return Ok(()); + } + let mut one_typo_words = BTreeSet::new(); + + find_zero_one_typo_derivations(ctx, original, is_prefix, |derived_word, nbr_typos| { + match nbr_typos { + ZeroOrOneTypo::Zero => {} + ZeroOrOneTypo::One => { + if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { + one_typo_words.insert(derived_word); + } else { + return Ok(ControlFlow::Break(())); + } + } + } + Ok(ControlFlow::Continue(())) + })?; + let original_str = ctx.word_interner.get(original).to_owned(); + let split_words = find_split_words(ctx, original_str.as_str())?; + let one_typo = OneTypoTerm { split_words, one_typo: one_typo_words }; + + let self_mut = ctx.term_interner.get_mut(self); + self_mut.one_typo = Lazy::Init(one_typo); + + Ok(()) + } + fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> { + let self_mut = ctx.term_interner.get_mut(self); + let QueryTerm { original, is_prefix, two_typo, .. } = self_mut; + let original_str = ctx.word_interner.get(*original).to_owned(); + if two_typo.is_init() { + return Ok(()); + } + let mut one_typo_words = BTreeSet::new(); + let mut two_typo_words = BTreeSet::new(); + + find_zero_one_two_typo_derivations( + *original, + *is_prefix, + ctx.index.words_fst(ctx.txn)?, + &mut ctx.word_interner, + |derived_word, nbr_typos| { + if one_typo_words.len() >= limits::MAX_ONE_TYPO_COUNT + && two_typo_words.len() >= limits::MAX_TWO_TYPOS_COUNT + { + // No chance we will add either one- or two-typo derivations anymore, stop iterating. + return Ok(ControlFlow::Break(())); + } + match nbr_typos { + NumberOfTypos::Zero => {} + NumberOfTypos::One => { + if one_typo_words.len() < limits::MAX_ONE_TYPO_COUNT { + one_typo_words.insert(derived_word); + } + } + NumberOfTypos::Two => { + if two_typo_words.len() < limits::MAX_TWO_TYPOS_COUNT { + two_typo_words.insert(derived_word); + } + } + } + Ok(ControlFlow::Continue(())) + }, + )?; + let split_words = find_split_words(ctx, original_str.as_str())?; + let self_mut = ctx.term_interner.get_mut(self); + + let one_typo = OneTypoTerm { one_typo: one_typo_words, split_words }; + + let two_typo = TwoTypoTerm { two_typos: two_typo_words }; + + self_mut.one_typo = Lazy::Init(one_typo); + self_mut.two_typo = Lazy::Init(two_typo); + + Ok(()) + } +} + +/// Split the original word into the two words that appear the +/// most next to each other in the index. +/// +/// Return `None` if the original word cannot be split. +fn split_best_frequency( + ctx: &mut SearchContext, + original: &str, +) -> Result, Interned)>> { + let chars = original.char_indices().skip(1); + let mut best = None; + + for (i, _) in chars { + let (left, right) = original.split_at(i); + let left = ctx.word_interner.insert(left.to_owned()); + let right = ctx.word_interner.insert(right.to_owned()); + + if let Some(docid_bytes) = ctx.get_db_word_pair_proximity_docids(left, right, 1)? 
{ + let frequency = + CboRoaringBitmapLenCodec::bytes_decode(docid_bytes).ok_or(heed::Error::Decoding)?; + if best.map_or(true, |(old, _, _)| frequency > old) { + best = Some((frequency, left, right)); + } + } + } + + Ok(best.map(|(_, left, right)| (left, right))) +} diff --git a/milli/src/search/new/query_term/mod.rs b/milli/src/search/new/query_term/mod.rs new file mode 100644 index 000000000..896c70e1b --- /dev/null +++ b/milli/src/search/new/query_term/mod.rs @@ -0,0 +1,370 @@ +mod compute_derivations; +mod ntypo_subset; +mod parse_query; +mod phrase; + +use super::interner::{DedupInterner, Interned}; +use super::{limits, SearchContext}; +use crate::Result; +use std::collections::BTreeSet; +use std::ops::RangeInclusive; + +use either::Either; +pub use ntypo_subset::NTypoTermSubset; +pub use parse_query::{located_query_terms_from_string, make_ngram, number_of_typos_allowed}; +pub use phrase::Phrase; + +use compute_derivations::partially_initialized_term_from_word; + +/// A set of word derivations attached to a location in the search query. +#[derive(Clone, PartialEq, Eq, Hash)] +pub struct LocatedQueryTermSubset { + pub term_subset: QueryTermSubset, + pub positions: RangeInclusive, + pub term_ids: RangeInclusive, +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct QueryTermSubset { + original: Interned, + zero_typo_subset: NTypoTermSubset, + one_typo_subset: NTypoTermSubset, + two_typo_subset: NTypoTermSubset, +} + +#[derive(Clone, PartialEq, Eq, Hash)] +pub struct QueryTerm { + original: Interned, + ngram_words: Option>>, + max_nbr_typos: u8, + is_prefix: bool, + zero_typo: ZeroTypoTerm, + // May not be computed yet + one_typo: Lazy, + // May not be computed yet + two_typo: Lazy, +} + +// SubTerms will be in a dedup interner +#[derive(Default, Clone, PartialEq, Eq, Hash)] +struct ZeroTypoTerm { + /// The original phrase, if any + phrase: Option>, + /// A single word equivalent to the original term, with zero typos + exact: Option>, + /// All the words that contain the original word as prefix + prefix_of: BTreeSet>, + /// All the synonyms of the original word or phrase + synonyms: BTreeSet>, + /// A prefix in the prefix databases matching the original word + use_prefix_db: Option>, +} +#[derive(Default, Clone, PartialEq, Eq, Hash)] +struct OneTypoTerm { + /// The original word split into multiple consecutive words + split_words: Option>, + /// Words that are 1 typo away from the original word + one_typo: BTreeSet>, +} +#[derive(Default, Clone, PartialEq, Eq, Hash)] +struct TwoTypoTerm { + /// Words that are 2 typos away from the original word + two_typos: BTreeSet>, +} + +#[derive(Clone, PartialEq, Eq, Hash)] +pub enum Lazy { + Uninit, + Init(T), +} +impl Lazy { + pub fn is_init(&self) -> bool { + match self { + Lazy::Uninit => false, + Lazy::Init(_) => true, + } + } + pub fn is_uninit(&self) -> bool { + match self { + Lazy::Uninit => true, + Lazy::Init(_) => false, + } + } +} + +#[derive(Clone, Copy)] +pub enum ExactTerm { + Phrase(Interned), + Word(Interned), +} + +impl ExactTerm { + pub fn interned_words<'ctx>( + &self, + ctx: &'ctx SearchContext<'ctx>, + ) -> impl Iterator>> + 'ctx { + match *self { + ExactTerm::Phrase(phrase) => { + let phrase = ctx.phrase_interner.get(phrase); + Either::Left(phrase.words.iter().copied()) + } + ExactTerm::Word(word) => Either::Right(std::iter::once(Some(word))), + } + } +} + +impl QueryTermSubset { + pub fn exact_term(&self, ctx: &SearchContext) -> Option { + let full_query_term = ctx.term_interner.get(self.original); + if 
full_query_term.ngram_words.is_some() {
+            return None;
+        }
+        // TODO: included in subset
+        if let Some(phrase) = full_query_term.zero_typo.phrase {
+            self.zero_typo_subset.contains_phrase(phrase).then_some(ExactTerm::Phrase(phrase))
+        } else if let Some(word) = full_query_term.zero_typo.exact {
+            self.zero_typo_subset.contains_word(word).then_some(ExactTerm::Word(word))
+        } else {
+            None
+        }
+    }
+
+    pub fn empty(for_term: Interned<QueryTerm>) -> Self {
+        Self {
+            original: for_term,
+            zero_typo_subset: NTypoTermSubset::Nothing,
+            one_typo_subset: NTypoTermSubset::Nothing,
+            two_typo_subset: NTypoTermSubset::Nothing,
+        }
+    }
+    pub fn full(for_term: Interned<QueryTerm>) -> Self {
+        Self {
+            original: for_term,
+            zero_typo_subset: NTypoTermSubset::All,
+            one_typo_subset: NTypoTermSubset::All,
+            two_typo_subset: NTypoTermSubset::All,
+        }
+    }
+
+    pub fn union(&mut self, other: &Self) {
+        assert!(self.original == other.original);
+        self.zero_typo_subset.union(&other.zero_typo_subset);
+        self.one_typo_subset.union(&other.one_typo_subset);
+        self.two_typo_subset.union(&other.two_typo_subset);
+    }
+    pub fn intersect(&mut self, other: &Self) {
+        assert!(self.original == other.original);
+        self.zero_typo_subset.intersect(&other.zero_typo_subset);
+        self.one_typo_subset.intersect(&other.one_typo_subset);
+        self.two_typo_subset.intersect(&other.two_typo_subset);
+    }
+
+    pub fn use_prefix_db(&self, ctx: &SearchContext) -> Option<Interned<String>> {
+        let original = ctx.term_interner.get(self.original);
+        let Some(use_prefix_db) = original.zero_typo.use_prefix_db else {
+            return None
+        };
+        match &self.zero_typo_subset {
+            NTypoTermSubset::All => Some(use_prefix_db),
+            NTypoTermSubset::Subset { words, phrases: _ } => {
+                // TODO: use a subset of prefix words instead
+                if words.contains(&use_prefix_db) {
+                    Some(use_prefix_db)
+                } else {
+                    None
+                }
+            }
+            NTypoTermSubset::Nothing => None,
+        }
+    }
+    pub fn all_single_words_except_prefix_db(
+        &self,
+        ctx: &mut SearchContext,
+    ) -> Result<BTreeSet<Interned<String>>> {
+        let mut result = BTreeSet::default();
+        // TODO: a compute_partially function
+        if !self.one_typo_subset.is_empty() || !self.two_typo_subset.is_empty() {
+            self.original.compute_fully_if_needed(ctx)?;
+        }
+
+        let original = ctx.term_interner.get_mut(self.original);
+        if !self.zero_typo_subset.is_empty() {
+            let ZeroTypoTerm {
+                phrase: _,
+                exact: zero_typo,
+                prefix_of,
+                synonyms: _,
+                use_prefix_db: _,
+            } = &original.zero_typo;
+            result.extend(zero_typo.iter().copied());
+            result.extend(prefix_of.iter().copied());
+        };
+
+        match &self.one_typo_subset {
+            NTypoTermSubset::All => {
+                let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else {
+                    panic!()
+                };
+                result.extend(one_typo.iter().copied())
+            }
+            NTypoTermSubset::Subset { words, phrases: _ } => {
+                let Lazy::Init(OneTypoTerm { split_words: _, one_typo }) = &original.one_typo else {
+                    panic!()
+                };
+                result.extend(one_typo.intersection(words));
+            }
+            NTypoTermSubset::Nothing => {}
+        };
+
+        match &self.two_typo_subset {
+            NTypoTermSubset::All => {
+                let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else {
+                    panic!()
+                };
+                result.extend(two_typos.iter().copied());
+            }
+            NTypoTermSubset::Subset { words, phrases: _ } => {
+                let Lazy::Init(TwoTypoTerm { two_typos }) = &original.two_typo else {
+                    panic!()
+                };
+                result.extend(two_typos.intersection(words));
+            }
+            NTypoTermSubset::Nothing => {}
+        };
+
+        Ok(result)
+    }
+    pub fn all_phrases(&self, ctx: &mut SearchContext) -> Result<BTreeSet<Interned<Phrase>>> {
+        let mut result = BTreeSet::default();
+
+        if !self.one_typo_subset.is_empty() {
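+            // Of the lazily computed buckets, only the one-typo bucket can
+            // contribute a phrase, namely the split-words derivation (e.g.
+            // "sunflower" reinterpreted as the phrase "sun flower"). The
+            // two-typo bucket only ever contains single words, which is why
+            // `two_typo_subset` is not consulted before forcing initialization.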
+ // TODO: compute less than fully if possible + self.original.compute_fully_if_needed(ctx)?; + } + let original = ctx.term_interner.get_mut(self.original); + + let ZeroTypoTerm { phrase, exact: _, prefix_of: _, synonyms, use_prefix_db: _ } = + &original.zero_typo; + result.extend(phrase.iter().copied()); + result.extend(synonyms.iter().copied()); + + if !self.one_typo_subset.is_empty() { + let Lazy::Init(OneTypoTerm { split_words, one_typo: _ }) = &original.one_typo else { + panic!(); + }; + result.extend(split_words.iter().copied()); + } + + Ok(result) + } + + pub fn original_phrase(&self, ctx: &SearchContext) -> Option> { + let t = ctx.term_interner.get(self.original); + if let Some(p) = t.zero_typo.phrase { + if self.zero_typo_subset.contains_phrase(p) { + return Some(p); + } + } + None + } + pub fn max_nbr_typos(&self, ctx: &SearchContext) -> u8 { + let t = ctx.term_interner.get(self.original); + match t.max_nbr_typos { + 0 => 0, + 1 => { + if self.one_typo_subset.is_empty() { + 0 + } else { + 1 + } + } + 2 => { + if self.two_typo_subset.is_empty() { + if self.one_typo_subset.is_empty() { + 0 + } else { + 1 + } + } else { + 2 + } + } + _ => panic!(), + } + } + pub fn clear_zero_typo_subset(&mut self) { + self.zero_typo_subset = NTypoTermSubset::Nothing; + } + pub fn clear_one_typo_subset(&mut self) { + self.one_typo_subset = NTypoTermSubset::Nothing; + } + pub fn clear_two_typo_subset(&mut self) { + self.two_typo_subset = NTypoTermSubset::Nothing; + } + pub fn description(&self, ctx: &SearchContext) -> String { + let t = ctx.term_interner.get(self.original); + ctx.word_interner.get(t.original).to_owned() + } +} + +impl ZeroTypoTerm { + fn is_empty(&self) -> bool { + let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db } = self; + phrase.is_none() + && zero_typo.is_none() + && prefix_of.is_empty() + && synonyms.is_empty() + && use_prefix_db.is_none() + } +} +impl OneTypoTerm { + fn is_empty(&self) -> bool { + let OneTypoTerm { split_words, one_typo } = self; + one_typo.is_empty() && split_words.is_none() + } +} +impl TwoTypoTerm { + fn is_empty(&self) -> bool { + let TwoTypoTerm { two_typos } = self; + two_typos.is_empty() + } +} + +impl QueryTerm { + fn is_empty(&self) -> bool { + let Lazy::Init(one_typo) = &self.one_typo else { + return false; + }; + let Lazy::Init(two_typo) = &self.two_typo else { + return false; + }; + + self.zero_typo.is_empty() && one_typo.is_empty() && two_typo.is_empty() + } +} + +impl Interned { + /// Return the original word from the given query term + fn original_single_word(self, ctx: &SearchContext) -> Option> { + let self_ = ctx.term_interner.get(self); + if self_.ngram_words.is_some() { + None + } else { + Some(self_.original) + } + } +} + +/// A query term coupled with its position in the user's search query. 
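+///
+/// For example, for the two-word query `quick brown`, one might get the
+/// following (an illustrative sketch only; these types are crate-private and
+/// the `*_term` bindings are hypothetical):
+///
+/// ```ignore
+/// let quick = LocatedQueryTerm { value: quick_term, positions: 0..=0 };
+/// let brown = LocatedQueryTerm { value: brown_term, positions: 1..=1 };
+/// // a `quickbrown` 2gram built from both terms spans both positions:
+/// let ngram = LocatedQueryTerm { value: quickbrown_term, positions: 0..=1 };
+/// ```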
+#[derive(Clone)] +pub struct LocatedQueryTerm { + pub value: Interned, + pub positions: RangeInclusive, +} + +impl LocatedQueryTerm { + /// Return `true` iff the term is empty + pub fn is_empty(&self, interner: &DedupInterner) -> bool { + interner.get(self.value).is_empty() + } +} diff --git a/milli/src/search/new/query_term/ntypo_subset.rs b/milli/src/search/new/query_term/ntypo_subset.rs new file mode 100644 index 000000000..ad25d73c7 --- /dev/null +++ b/milli/src/search/new/query_term/ntypo_subset.rs @@ -0,0 +1,80 @@ +use std::collections::BTreeSet; + +use crate::search::new::interner::Interned; + +use super::Phrase; + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum NTypoTermSubset { + All, + Subset { + words: BTreeSet>, + phrases: BTreeSet>, + // TODO: prefixes: BTreeSet>, + }, + Nothing, +} + +impl NTypoTermSubset { + pub fn contains_word(&self, word: Interned) -> bool { + match self { + NTypoTermSubset::All => true, + NTypoTermSubset::Subset { words, phrases: _ } => words.contains(&word), + NTypoTermSubset::Nothing => false, + } + } + pub fn contains_phrase(&self, phrase: Interned) -> bool { + match self { + NTypoTermSubset::All => true, + NTypoTermSubset::Subset { words: _, phrases } => phrases.contains(&phrase), + NTypoTermSubset::Nothing => false, + } + } + pub fn is_empty(&self) -> bool { + match self { + NTypoTermSubset::All => false, + NTypoTermSubset::Subset { words, phrases } => words.is_empty() && phrases.is_empty(), + NTypoTermSubset::Nothing => true, + } + } + pub fn union(&mut self, other: &Self) { + match self { + Self::All => {} + Self::Subset { words, phrases } => match other { + Self::All => { + *self = Self::All; + } + Self::Subset { words: w2, phrases: p2 } => { + words.extend(w2); + phrases.extend(p2); + } + Self::Nothing => {} + }, + Self::Nothing => { + *self = other.clone(); + } + } + } + pub fn intersect(&mut self, other: &Self) { + match self { + Self::All => *self = other.clone(), + Self::Subset { words, phrases } => match other { + Self::All => {} + Self::Subset { words: w2, phrases: p2 } => { + let mut ws = BTreeSet::new(); + for w in words.intersection(w2) { + ws.insert(*w); + } + let mut ps = BTreeSet::new(); + for p in phrases.intersection(p2) { + ps.insert(*p); + } + *words = ws; + *phrases = ps; + } + Self::Nothing => *self = Self::Nothing, + }, + Self::Nothing => {} + } + } +} diff --git a/milli/src/search/new/query_term/parse_query.rs b/milli/src/search/new/query_term/parse_query.rs new file mode 100644 index 000000000..7bcbeeff4 --- /dev/null +++ b/milli/src/search/new/query_term/parse_query.rs @@ -0,0 +1,281 @@ +use charabia::{normalizer::NormalizedTokenIter, SeparatorKind, TokenKind}; + +use crate::{Result, SearchContext, MAX_WORD_LENGTH}; + +use super::*; + +/// Convert the tokenised search query into a list of located query terms. +// TODO: checking if the positions are correct for phrases, separators, ngrams +pub fn located_query_terms_from_string( + ctx: &mut SearchContext, + query: NormalizedTokenIter<&[u8]>, + words_limit: Option, +) -> Result> { + let nbr_typos = number_of_typos_allowed(ctx)?; + + let mut located_terms = Vec::new(); + + let mut phrase: Option = None; + + let parts_limit = words_limit.unwrap_or(usize::MAX); + + // start with the last position as we will wrap around to position 0 at the beginning of the loop below. 
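+    // Concretely: `u16::MAX.wrapping_add(1) == 0`, so the first word of the
+    // query receives position 0 without a special case for the first iteration.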
+ let mut position = u16::MAX; + + let mut peekable = query.take(super::limits::MAX_TOKEN_COUNT).peekable(); + while let Some(token) = peekable.next() { + // early return if word limit is exceeded + if located_terms.len() >= parts_limit { + return Ok(located_terms); + } + + match token.kind { + TokenKind::Word | TokenKind::StopWord => { + // On first loop, goes from u16::MAX to 0, then normal increment. + position = position.wrapping_add(1); + + // 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote, + // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word, + // 3. if the word is the last token of the query we push it as a prefix word. + if let Some(phrase) = &mut phrase { + phrase.push_word(ctx, &token, position) + } else if peekable.peek().is_some() { + match token.kind { + TokenKind::Word => { + let word = token.lemma(); + let term = partially_initialized_term_from_word( + ctx, + word, + nbr_typos(word), + false, + )?; + let located_term = LocatedQueryTerm { + value: ctx.term_interner.push(term), + positions: position..=position, + }; + located_terms.push(located_term); + } + TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => {} + } + } else { + let word = token.lemma(); + let term = + partially_initialized_term_from_word(ctx, word, nbr_typos(word), true)?; + let located_term = LocatedQueryTerm { + value: ctx.term_interner.push(term), + positions: position..=position, + }; + located_terms.push(located_term); + } + } + TokenKind::Separator(separator_kind) => { + match separator_kind { + SeparatorKind::Hard => { + position += 1; + } + SeparatorKind::Soft => { + position += 0; + } + } + + phrase = 'phrase: { + let phrase = phrase.take(); + + // If we have a hard separator inside a phrase, we immediately start a new phrase + let phrase = if separator_kind == SeparatorKind::Hard { + if let Some(phrase) = phrase { + if let Some(located_query_term) = phrase.build(ctx) { + located_terms.push(located_query_term) + } + Some(PhraseBuilder::empty()) + } else { + None + } + } else { + phrase + }; + + // We close and start a new phrase depending on the number of double quotes + let mut quote_count = token.lemma().chars().filter(|&s| s == '"').count(); + if quote_count == 0 { + break 'phrase phrase; + } + + // Consume the closing quote and the phrase + if let Some(phrase) = phrase { + // Per the check above, quote_count > 0 + quote_count -= 1; + if let Some(located_query_term) = phrase.build(ctx) { + located_terms.push(located_query_term) + } + } + + // Start new phrase if the token ends with an opening quote + (quote_count % 2 == 1).then_some(PhraseBuilder::empty()) + }; + } + _ => (), + } + } + + // If a quote is never closed, we consider all of the end of the query as a phrase. + if let Some(phrase) = phrase.take() { + if let Some(located_query_term) = phrase.build(ctx) { + located_terms.push(located_query_term); + } + } + + Ok(located_terms) +} + +pub fn number_of_typos_allowed<'ctx>( + ctx: &SearchContext<'ctx>, +) -> Result u8 + 'ctx> { + let authorize_typos = ctx.index.authorize_typos(ctx.txn)?; + let min_len_one_typo = ctx.index.min_word_len_one_typo(ctx.txn)?; + let min_len_two_typos = ctx.index.min_word_len_two_typos(ctx.txn)?; + + // TODO: should `exact_words` also disable prefix search, ngrams, split words, or synonyms? 
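+    // For instance, with the default thresholds (one typo allowed from 5 bytes,
+    // two typos from 9 bytes), the returned closure gives "kitn" a budget of
+    // 0 typos, "kitten" a budget of 1, and "kittenfoods" a budget of 2, while
+    // any word listed in `exact_words` always gets 0.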
+ let exact_words = ctx.index.exact_words(ctx.txn)?; + + Ok(Box::new(move |word: &str| { + if !authorize_typos + || word.len() < min_len_one_typo as usize + || exact_words.as_ref().map_or(false, |fst| fst.contains(word)) + { + 0 + } else if word.len() < min_len_two_typos as usize { + 1 + } else { + 2 + } + })) +} + +pub fn make_ngram( + ctx: &mut SearchContext, + terms: &[LocatedQueryTerm], + number_of_typos_allowed: &impl Fn(&str) -> u8, +) -> Result> { + assert!(!terms.is_empty()); + for t in terms { + if ctx.term_interner.get(t.value).zero_typo.phrase.is_some() { + return Ok(None); + } + } + for ts in terms.windows(2) { + let [t1, t2] = ts else { panic!() }; + if *t1.positions.end() != t2.positions.start() - 1 { + return Ok(None); + } + } + let mut words_interned = vec![]; + for term in terms { + if let Some(original_term_word) = term.value.original_single_word(ctx) { + words_interned.push(original_term_word); + } else { + return Ok(None); + } + } + let words = + words_interned.iter().map(|&i| ctx.word_interner.get(i).to_owned()).collect::>(); + + let start = *terms.first().as_ref().unwrap().positions.start(); + let end = *terms.last().as_ref().unwrap().positions.end(); + let is_prefix = ctx.term_interner.get(terms.last().as_ref().unwrap().value).is_prefix; + let ngram_str = words.join(""); + if ngram_str.len() > MAX_WORD_LENGTH { + return Ok(None); + } + let ngram_str_interned = ctx.word_interner.insert(ngram_str.clone()); + + let max_nbr_typos = + number_of_typos_allowed(ngram_str.as_str()).saturating_sub(terms.len() as u8 - 1); + + let mut term = partially_initialized_term_from_word(ctx, &ngram_str, max_nbr_typos, is_prefix)?; + + // Now add the synonyms + let index_synonyms = ctx.index.synonyms(ctx.txn)?; + + term.zero_typo.synonyms.extend( + index_synonyms.get(&words).cloned().unwrap_or_default().into_iter().map(|words| { + let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect(); + ctx.phrase_interner.insert(Phrase { words }) + }), + ); + + let term = QueryTerm { + original: ngram_str_interned, + ngram_words: Some(words_interned), + is_prefix, + max_nbr_typos, + zero_typo: term.zero_typo, + one_typo: Lazy::Uninit, + two_typo: Lazy::Uninit, + }; + + let term = LocatedQueryTerm { value: ctx.term_interner.push(term), positions: start..=end }; + + Ok(Some(term)) +} + +struct PhraseBuilder { + words: Vec>>, + start: u16, + end: u16, +} + +impl PhraseBuilder { + fn empty() -> Self { + Self { words: Default::default(), start: u16::MAX, end: u16::MAX } + } + + fn is_empty(&self) -> bool { + self.words.is_empty() + } + + // precondition: token has kind Word or StopWord + fn push_word(&mut self, ctx: &mut SearchContext, token: &charabia::Token, position: u16) { + if self.is_empty() { + self.start = position; + } + self.end = position; + if let TokenKind::StopWord = token.kind { + self.words.push(None); + } else { + // token has kind Word + let word = ctx.word_interner.insert(token.lemma().to_string()); + // TODO: in a phrase, check that every word exists + // otherwise return an empty term + self.words.push(Some(word)); + } + } + + fn build(self, ctx: &mut SearchContext) -> Option { + if self.is_empty() { + return None; + } + Some(LocatedQueryTerm { + value: ctx.term_interner.push({ + let phrase = ctx.phrase_interner.insert(Phrase { words: self.words }); + let phrase_desc = phrase.description(ctx); + QueryTerm { + original: ctx.word_interner.insert(phrase_desc), + ngram_words: None, + max_nbr_typos: 0, + is_prefix: false, + zero_typo: ZeroTypoTerm { + phrase: 
Some(phrase), + exact: None, + prefix_of: BTreeSet::default(), + synonyms: BTreeSet::default(), + use_prefix_db: None, + }, + one_typo: Lazy::Uninit, + two_typo: Lazy::Uninit, + } + }), + positions: self.start..=self.end, + }) + } +} diff --git a/milli/src/search/new/query_term/phrase.rs b/milli/src/search/new/query_term/phrase.rs new file mode 100644 index 000000000..2ea8e0d39 --- /dev/null +++ b/milli/src/search/new/query_term/phrase.rs @@ -0,0 +1,16 @@ +use itertools::Itertools; + +use crate::{search::new::interner::Interned, SearchContext}; + +/// A phrase in the user's search query, consisting of several words +/// that must appear side-by-side in the search results. +#[derive(Default, Clone, PartialEq, Eq, Hash)] +pub struct Phrase { + pub words: Vec>>, +} +impl Interned { + pub fn description(self, ctx: &SearchContext) -> String { + let p = ctx.phrase_interner.get(self); + p.words.iter().flatten().map(|w| ctx.word_interner.get(*w)).join(" ") + } +} diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index 81c99fd9a..cfd3f62bf 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -57,9 +57,7 @@ impl RankingRuleGraphTrait for ProximityGraph { Ok(format!("{cost}: cost")) } ProximityCondition::Term { term } => { - let original_term = ctx.term_interner.get(term.term_subset.original); - let original_word = ctx.word_interner.get(original_term.original); - Ok(format!("{original_word} : exists")) + Ok(format!("{} : exists", term.term_subset.description(ctx))) } } } diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index de02b67a4..5d7e0f874 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -3,7 +3,7 @@ use roaring::RoaringBitmap; use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait}; use crate::search::new::interner::{DedupInterner, Interned, MappedInterner}; use crate::search::new::logger::SearchLogger; -use crate::search::new::query_term::{LocatedQueryTermSubset, NTypoTermSubset}; +use crate::search::new::query_term::LocatedQueryTermSubset; use crate::search::new::resolve_query_graph::compute_query_term_subset_docids; use crate::search::new::{QueryGraph, QueryNode, SearchContext}; use crate::Result; @@ -43,8 +43,7 @@ impl RankingRuleGraphTrait for TypoGraph { _from: Option<&LocatedQueryTermSubset>, to_term: &LocatedQueryTermSubset, ) -> Result)>> { - let term = to_term; // LocatedQueryTermSubset { term_subset, positions: _, term_ids } = to_term; - let original_full_term = ctx.term_interner.get(term.term_subset.original); + let term = to_term; let mut edges = vec![]; // Ngrams have a base typo cost @@ -52,20 +51,20 @@ impl RankingRuleGraphTrait for TypoGraph { // 3-gram -> equivalent to 2 typos let base_cost = if term.term_ids.len() == 1 { 0 } else { term.term_ids.len() as u32 }; - for nbr_typos in 0..=original_full_term.max_nbr_typos { + for nbr_typos in 0..=term.term_subset.max_nbr_typos(ctx) { let mut term = term.clone(); match nbr_typos { 0 => { - term.term_subset.one_typo_subset = NTypoTermSubset::Nothing; - term.term_subset.two_typo_subset = NTypoTermSubset::Nothing; + term.term_subset.clear_one_typo_subset(); + term.term_subset.clear_two_typo_subset(); } 1 => { - term.term_subset.zero_typo_subset = NTypoTermSubset::Nothing; - 
term.term_subset.two_typo_subset = NTypoTermSubset::Nothing; + term.term_subset.clear_zero_typo_subset(); + term.term_subset.clear_two_typo_subset(); } 2 => { - term.term_subset.zero_typo_subset = NTypoTermSubset::Nothing; - term.term_subset.one_typo_subset = NTypoTermSubset::Nothing; + term.term_subset.clear_zero_typo_subset(); + term.term_subset.clear_one_typo_subset(); } _ => panic!(), }; @@ -92,9 +91,6 @@ impl RankingRuleGraphTrait for TypoGraph { fn label_for_condition(ctx: &mut SearchContext, condition: &Self::Condition) -> Result { let TypoCondition { term, nbr_typos } = condition; - let original_term = ctx.term_interner.get(term.term_subset.original); - let original = ctx.word_interner.get(original_term.original); - - Ok(format!("{original}: {nbr_typos}")) + Ok(format!("{}: {nbr_typos}", term.term_subset.description(ctx))) } } diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs index 9dc6018e6..a771d3768 100644 --- a/milli/src/search/new/ranking_rules.rs +++ b/milli/src/search/new/ranking_rules.rs @@ -2,8 +2,6 @@ use roaring::RoaringBitmap; use super::logger::SearchLogger; use super::{QueryGraph, SearchContext}; -// use crate::search::new::sort::Sort; -use crate::search::new::distinct::{apply_distinct_rule, DistinctOutput}; use crate::Result; /// An internal trait implemented by only [`PlaceholderQuery`] and [`QueryGraph`] @@ -69,171 +67,3 @@ pub struct RankingRuleOutput { /// The allowed candidates for the child ranking rule pub candidates: RoaringBitmap, } - -pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( - ctx: &mut SearchContext<'ctx>, - mut ranking_rules: Vec>, - query: &Q, - universe: &RoaringBitmap, - from: usize, - length: usize, - logger: &mut dyn SearchLogger, -) -> Result> { - logger.initial_query(query); - logger.ranking_rules(&ranking_rules); - logger.initial_universe(universe); - - let distinct_fid = if let Some(field) = ctx.index.distinct_field(ctx.txn)? { - ctx.index.fields_ids_map(ctx.txn)?.id(field) - } else { - None - }; - - if universe.len() < from as u64 { - return Ok(vec![]); - } - - let ranking_rules_len = ranking_rules.len(); - logger.start_iteration_ranking_rule(0, ranking_rules[0].as_ref(), query, universe); - ranking_rules[0].start_iteration(ctx, logger, universe, query)?; - - let mut ranking_rule_universes: Vec = - vec![RoaringBitmap::default(); ranking_rules_len]; - ranking_rule_universes[0] = universe.clone(); - - let mut cur_ranking_rule_index = 0; - - /// Finish iterating over the current ranking rule, yielding - /// control to the parent (or finishing the search if not possible). - /// Update the candidates accordingly and inform the logger. - macro_rules! back { - () => { - assert!(ranking_rule_universes[cur_ranking_rule_index].is_empty()); - logger.end_iteration_ranking_rule( - cur_ranking_rule_index, - ranking_rules[cur_ranking_rule_index].as_ref(), - &ranking_rule_universes[cur_ranking_rule_index], - ); - ranking_rule_universes[cur_ranking_rule_index].clear(); - ranking_rules[cur_ranking_rule_index].end_iteration(ctx, logger); - if cur_ranking_rule_index == 0 { - break; - } else { - cur_ranking_rule_index -= 1; - } - }; - } - - let mut results = vec![]; - let mut cur_offset = 0usize; - - /// Add the candidates to the results. Take `distinct`, `from`, `length`, and `cur_offset` - /// into account and inform the logger. - macro_rules! 
maybe_add_to_results { - ($candidates:expr) => { - // First apply the distinct rule on the candidates, reducing the universes if necessary - let candidates = if let Some(distinct_fid) = distinct_fid { - let DistinctOutput { remaining, excluded } = apply_distinct_rule(ctx, distinct_fid, $candidates)?; - for universe in ranking_rule_universes.iter_mut() { - *universe -= &excluded; - } - remaining - } else { - $candidates.clone() - }; - let len = candidates.len(); - // if the candidates are empty, there is nothing to do; - if !candidates.is_empty() { - // if we still haven't reached the first document to return - if cur_offset < from { - // and if no document from this bucket can be returned - if cur_offset + (candidates.len() as usize) < from { - // then just skip the bucket - logger.skip_bucket_ranking_rule( - cur_ranking_rule_index, - ranking_rules[cur_ranking_rule_index].as_ref(), - &candidates, - ); - } else { - // otherwise, skip some of the documents and add some of the rest, in order of ids - let all_candidates = candidates.iter().collect::>(); - let (skipped_candidates, candidates) = - all_candidates.split_at(from - cur_offset); - logger.skip_bucket_ranking_rule( - cur_ranking_rule_index, - ranking_rules[cur_ranking_rule_index].as_ref(), - &skipped_candidates.into_iter().collect(), - ); - let candidates = candidates - .iter() - .take(length - results.len()) - .copied() - .collect::>(); - logger.add_to_results(&candidates); - results.extend(&candidates); - } - } else { - // if we have passed the offset already, add some of the documents (up to the limit) - let candidates = - candidates.iter().take(length - results.len()).collect::>(); - logger.add_to_results(&candidates); - results.extend(&candidates); - } - } - cur_offset += len as usize; - }; - } - - while results.len() < length { - // The universe for this bucket is zero or one element, so we don't need to sort - // anything, just extend the results and go back to the parent ranking rule. - if ranking_rule_universes[cur_ranking_rule_index].len() <= 1 { - maybe_add_to_results!(&ranking_rule_universes[cur_ranking_rule_index]); - ranking_rule_universes[cur_ranking_rule_index].clear(); - back!(); - continue; - } - - let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(ctx, logger, &ranking_rule_universes[cur_ranking_rule_index])? 
else { - back!(); - continue; - }; - - logger.next_bucket_ranking_rule( - cur_ranking_rule_index, - ranking_rules[cur_ranking_rule_index].as_ref(), - &ranking_rule_universes[cur_ranking_rule_index], - &next_bucket.candidates, - ); - - debug_assert!( - ranking_rule_universes[cur_ranking_rule_index].is_superset(&next_bucket.candidates) - ); - ranking_rule_universes[cur_ranking_rule_index] -= &next_bucket.candidates; - - if cur_ranking_rule_index == ranking_rules_len - 1 - || next_bucket.candidates.len() <= 1 - || cur_offset + (next_bucket.candidates.len() as usize) < from - { - maybe_add_to_results!(&next_bucket.candidates); - continue; - } - - cur_ranking_rule_index += 1; - ranking_rule_universes[cur_ranking_rule_index] = next_bucket.candidates.clone(); - logger.start_iteration_ranking_rule( - cur_ranking_rule_index, - ranking_rules[cur_ranking_rule_index].as_ref(), - &next_bucket.query, - &ranking_rule_universes[cur_ranking_rule_index], - ); - ranking_rules[cur_ranking_rule_index].start_iteration( - ctx, - logger, - &next_bucket.candidates, - &next_bucket.query, - )?; - } - - Ok(results) -} diff --git a/milli/src/search/new/tests/distinct.rs b/milli/src/search/new/tests/distinct.rs new file mode 100644 index 000000000..4073cf585 --- /dev/null +++ b/milli/src/search/new/tests/distinct.rs @@ -0,0 +1,590 @@ +/*! +This module tests the "distinct attribute" feature, and its +interaction with other ranking rules. + +1. no duplicate distinct attributes are ever returned +2. only the best document (according to the search rules) for each distinct value appears in the result +3. if a document does not have a distinct attribute, then the distinct rule does not apply to it + +It doesn't test properly: +- combination of distinct + exhaustive_nbr_hits (because we know it's incorrect) +- distinct attributes with arrays (because we know it's incorrect as well) +*/ + +use std::collections::HashSet; + +use big_s::S; +use heed::RoTxn; +use maplit::hashset; + +use crate::{ + index::tests::TempIndex, AscDesc, Criterion, Index, Member, Search, SearchResult, + TermsMatchingStrategy, +}; + +use super::collect_field_values; + +fn create_index() -> TempIndex { + let index = TempIndex::new(); + + index + .update_settings(|s| { + s.set_primary_key("id".to_owned()); + s.set_searchable_fields(vec!["text".to_owned()]); + s.set_sortable_fields(hashset! 
{ S("rank1"), S("letter") }); + s.set_distinct_field("letter".to_owned()); + s.set_criteria(vec![Criterion::Words]); + }) + .unwrap(); + + index + .add_documents(documents!([ + { + "id": 0, + "letter": "A", + "rank1": 0, + "text": "the quick brown fox jamps over the lazy dog", + }, + { + "id": 1, + "letter": "A", + "rank1": 1, + "text": "the quick brown fox jumpes over the lazy dog", + }, + { + "id": 2, + "letter": "B", + "rank1": 0, + "text": "the quick brown foxjumps over the lazy dog", + }, + { + "id": 3, + "letter": "B", + "rank1": 1, + "text": "the quick brown fox jumps over the lazy dog", + }, + { + "id": 4, + "letter": "B", + "rank1": 2, + "text": "the quick brown fox jumps over the lazy", + }, + { + "id": 5, + "letter": "C", + "rank1": 0, + "text": "the quickbrownfox jumps over the lazy", + }, + { + "id": 6, + "letter": "C", + "rank1": 1, + "text": "the quick brown fox jumpss over the lazy", + }, + { + "id": 7, + "letter": "C", + "rank1": 2, + "text": "the quick brown fox jumps over the lazy", + }, + { + "id": 8, + "letter": "D", + "rank1": 0, + "text": "the quick brown fox jumps over the lazy", + }, + { + "id": 9, + "letter": "E", + "rank1": 0, + "text": "the quick brown fox jumps over the lazy", + }, + { + "id": 10, + "letter": "E", + "rank1": 1, + "text": "the quackbrown foxjunps over", + }, + { + "id": 11, + "letter": "E", + "rank1": 2, + "text": "the quicko browno fox junps over", + }, + { + "id": 12, + "letter": "E", + "rank1": 3, + "text": "the quicko browno fox jumps over", + }, + { + "id": 13, + "letter": "E", + "rank1": 4, + "text": "the quick brewn fox jumps over", + }, + { + "id": 14, + "letter": "E", + "rank1": 5, + "text": "the quick brown fox jumps over", + }, + { + "id": 15, + "letter": "F", + "rank1": 0, + "text": "the quick brownf fox jumps over", + }, + { + "id": 16, + "letter": "F", + "rank1": 1, + "text": "the quic brown fox jamps over", + }, + { + "id": 17, + "letter": "F", + "rank1": 2, + "text": "thequick browns fox jimps", + }, + { + "id": 18, + "letter": "G", + "rank1": 0, + "text": "the qick brown fox jumps", + }, + { + "id": 19, + "letter": "G", + "rank1": 1, + "text": "the quick brownfoxjumps", + }, + { + "id": 20, + "letter": "H", + "rank1": 0, + "text": "the quick brow fox jumps", + }, + { + "id": 21, + "letter": "I", + "rank1": 0, + "text": "the quick brown fox jpmps", + }, + { + "id": 22, + "letter": "I", + "rank1": 1, + "text": "the quick brown fox jumps", + }, + { + "id": 23, + "letter": "I", + "rank1": 2, + "text": "the quick", + }, + { + "id": 24, + "rank1": 0, + "text": "the quick", + }, + { + "id": 25, + "rank1": 1, + "text": "the quick brown", + }, + { + "id": 26, + "rank1": 2, + "text": "the quick brown fox", + }, + { + "id": 26, + "rank1": 3, + "text": "the quick brown fox jumps over the lazy dog", + }, + ])) + .unwrap(); + index +} + +fn verify_distinct(index: &Index, txn: &RoTxn, docids: &[u32]) -> Vec { + let vs = collect_field_values(index, txn, index.distinct_field(txn).unwrap().unwrap(), docids); + + let mut unique = HashSet::new(); + for v in vs.iter() { + if v == "__does_not_exist__" { + continue; + } + assert!(unique.insert(v.clone())); + } + + vs +} + +#[test] +fn test_distinct_placeholder_no_ranking_rules() { + let index = create_index(); + + let txn = index.read_txn().unwrap(); + + let s = Search::new(&txn, &index); + let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 5, 8, 9, 15, 18, 20, 21, 24, 25, 26]"); + let distinct_values = verify_distinct(&index, &txn, &documents_ids); + insta::assert_debug_snapshot!(distinct_values, @r###" + [ + "\"A\"", + "\"B\"", + "\"C\"", + "\"D\"", + "\"E\"", + "\"F\"", + "\"G\"", + "\"H\"", + "\"I\"", + "__does_not_exist__", + "__does_not_exist__", + "__does_not_exist__", + ] + "###); +} + +#[test] +fn test_distinct_placeholder_sort() { + let index = create_index(); + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Sort]); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.sort_criteria(vec![AscDesc::Desc(Member::Field(S("rank1")))]); + + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 26, 4, 7, 17, 23, 1, 19, 25, 8, 20, 24]"); + let distinct_values = verify_distinct(&index, &txn, &documents_ids); + insta::assert_debug_snapshot!(distinct_values, @r###" + [ + "\"E\"", + "__does_not_exist__", + "\"B\"", + "\"C\"", + "\"F\"", + "\"I\"", + "\"A\"", + "\"G\"", + "__does_not_exist__", + "\"D\"", + "\"H\"", + "__does_not_exist__", + ] + "###); + let rank_values = collect_field_values(&index, &txn, "rank1", &documents_ids); + insta::assert_debug_snapshot!(rank_values, @r###" + [ + "5", + "3", + "2", + "2", + "2", + "2", + "1", + "1", + "1", + "0", + "0", + "0", + ] + "###); + + let mut s = Search::new(&txn, &index); + s.sort_criteria(vec![AscDesc::Desc(Member::Field(S("letter")))]); + + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 20, 18, 15, 9, 8, 5, 2, 0, 24, 25, 26]"); + let distinct_values = verify_distinct(&index, &txn, &documents_ids); + insta::assert_debug_snapshot!(distinct_values, @r###" + [ + "\"I\"", + "\"H\"", + "\"G\"", + "\"F\"", + "\"E\"", + "\"D\"", + "\"C\"", + "\"B\"", + "\"A\"", + "__does_not_exist__", + "__does_not_exist__", + "__does_not_exist__", + ] + "###); + let rank_values = collect_field_values(&index, &txn, "rank1", &documents_ids); + insta::assert_debug_snapshot!(rank_values, @r###" + [ + "0", + "0", + "0", + "0", + "0", + "0", + "0", + "0", + "0", + "0", + "1", + "3", + ] + "###); + + let mut s = Search::new(&txn, &index); + s.sort_criteria(vec![ + AscDesc::Desc(Member::Field(S("letter"))), + AscDesc::Desc(Member::Field(S("rank1"))), + ]); + + let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[23, 20, 19, 17, 14, 8, 7, 4, 1, 26, 25, 24]"); + let distinct_values = verify_distinct(&index, &txn, &documents_ids); + insta::assert_debug_snapshot!(distinct_values, @r###" + [ + "\"I\"", + "\"H\"", + "\"G\"", + "\"F\"", + "\"E\"", + "\"D\"", + "\"C\"", + "\"B\"", + "\"A\"", + "__does_not_exist__", + "__does_not_exist__", + "__does_not_exist__", + ] + "###); + let rank_values = collect_field_values(&index, &txn, "rank1", &documents_ids); + insta::assert_debug_snapshot!(rank_values, @r###" + [ + "2", + "0", + "1", + "2", + "5", + "0", + "2", + "2", + "1", + "3", + "1", + "0", + ] + "###); +} + +#[test] +fn test_distinct_words() { + let index = create_index(); + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Words]); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.query("the quick brown fox jumps over the lazy dog"); + + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 26, 5, 8, 9, 15, 18, 20, 21, 25, 24]"); + let distinct_values = verify_distinct(&index, &txn, &documents_ids); + insta::assert_debug_snapshot!(distinct_values, @r###" + [ + "\"A\"", + "\"B\"", + "__does_not_exist__", + "\"C\"", + "\"D\"", + "\"E\"", + "\"F\"", + "\"G\"", + "\"H\"", + "\"I\"", + "__does_not_exist__", + "__does_not_exist__", + ] + "###); + let text_values = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(text_values, @r###" + [ + "\"the quick brown fox jamps over the lazy dog\"", + "\"the quick brown foxjumps over the lazy dog\"", + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quickbrownfox jumps over the lazy\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brownf fox jumps over\"", + "\"the qick brown fox jumps\"", + "\"the quick brow fox jumps\"", + "\"the quick brown fox jpmps\"", + "\"the quick brown\"", + "\"the quick\"", + ] + "###); +} + +#[test] +fn test_distinct_sort_words() { + let index = create_index(); + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Sort, Criterion::Words, Criterion::Desc(S("rank1"))]); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.query("the quick brown fox jumps over the lazy dog"); + s.sort_criteria(vec![AscDesc::Desc(Member::Field(S("letter")))]); + + let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[22, 20, 19, 16, 9, 8, 7, 3, 1, 26, 25, 24]"); + let distinct_values = verify_distinct(&index, &txn, &documents_ids); + insta::assert_debug_snapshot!(distinct_values, @r###" + [ + "\"I\"", + "\"H\"", + "\"G\"", + "\"F\"", + "\"E\"", + "\"D\"", + "\"C\"", + "\"B\"", + "\"A\"", + "__does_not_exist__", + "__does_not_exist__", + "__does_not_exist__", + ] + "###); + + let rank_values = collect_field_values(&index, &txn, "rank1", &documents_ids); + insta::assert_debug_snapshot!(rank_values, @r###" + [ + "1", + "0", + "1", + "1", + "0", + "0", + "2", + "1", + "1", + "3", + "1", + "0", + ] + "###); + + let text_values = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(text_values, @r###" + [ + "\"the quick brown fox jumps\"", + "\"the quick brow fox jumps\"", + "\"the quick brownfoxjumps\"", + "\"the quic brown fox jamps over\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quick brown fox jumpes over the lazy dog\"", + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quick brown\"", + "\"the quick\"", + ] + "###); +} + +#[test] +fn test_distinct_all_candidates() { + let index = create_index(); + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Sort]); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.sort_criteria(vec![AscDesc::Desc(Member::Field(S("rank1")))]); + s.exhaustive_number_hits(true); + + let SearchResult { documents_ids, candidates, .. } = s.execute().unwrap(); + let candidates = candidates.iter().collect::>(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 26, 4, 7, 17, 23, 1, 19, 25, 8, 20, 24]"); + // TODO: this is incorrect! + insta::assert_snapshot!(format!("{candidates:?}"), @"[0, 2, 5, 8, 9, 15, 18, 20, 21, 24, 25, 26]"); +} + +#[test] +fn test_distinct_typo() { + let index = create_index(); + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Words, Criterion::Typo]); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.query("the quick brown fox jumps over the lazy dog"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + + let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 26, 0, 7, 8, 9, 15, 22, 18, 20, 25, 24]"); + + let distinct_values = verify_distinct(&index, &txn, &documents_ids); + insta::assert_debug_snapshot!(distinct_values, @r###" + [ + "\"B\"", + "__does_not_exist__", + "\"A\"", + "\"C\"", + "\"D\"", + "\"E\"", + "\"F\"", + "\"I\"", + "\"G\"", + "\"H\"", + "__does_not_exist__", + "__does_not_exist__", + ] + "###); + + let text_values = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(text_values, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quick brown fox jamps over the lazy dog\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brownf fox jumps over\"", + "\"the quick brown fox jumps\"", + "\"the qick brown fox jumps\"", + "\"the quick brow fox jumps\"", + "\"the quick brown\"", + "\"the quick\"", + ] + "###); +} diff --git a/milli/src/search/new/tests/language.rs b/milli/src/search/new/tests/language.rs new file mode 100644 index 000000000..e16544fdb --- /dev/null +++ b/milli/src/search/new/tests/language.rs @@ -0,0 +1,22 @@ +use crate::{index::tests::TempIndex, Search, SearchResult}; + +#[test] +fn test_kanji_language_detection() { + let index = TempIndex::new(); + + index + .add_documents(documents!([ + { "id": 0, "title": "The quick (\"brown\") fox can't jump 32.3 feet, right? Brr, it's 29.3°F!" }, + { "id": 1, "title": "東京のお寿司。" }, + { "id": 2, "title": "הַשּׁוּעָל הַמָּהִיר (״הַחוּם״) לֹא יָכוֹל לִקְפֹּץ 9.94 מֶטְרִים, נָכוֹן? ברר, 1.5°C- בַּחוּץ!" } + ])) + .unwrap(); + + let txn = index.write_txn().unwrap(); + let mut search = Search::new(&txn, &index); + + search.query("東京"); + let SearchResult { documents_ids, .. } = search.execute().unwrap(); + + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1]"); +} diff --git a/milli/src/search/new/tests/mod.rs b/milli/src/search/new/tests/mod.rs new file mode 100644 index 000000000..898276858 --- /dev/null +++ b/milli/src/search/new/tests/mod.rs @@ -0,0 +1,30 @@ +pub mod distinct; +#[cfg(feature = "default")] +pub mod language; +pub mod ngram_split_words; +pub mod proximity; +pub mod proximity_typo; +pub mod sort; +pub mod typo; +pub mod typo_proximity; +pub mod words_tms; + +fn collect_field_values( + index: &crate::Index, + txn: &heed::RoTxn, + fid: &str, + docids: &[u32], +) -> Vec { + let mut values = vec![]; + let fid = index.fields_ids_map(txn).unwrap().id(fid).unwrap(); + for doc in index.documents(txn, docids.iter().copied()).unwrap() { + if let Some(v) = doc.1.get(fid) { + let v: serde_json::Value = serde_json::from_slice(v).unwrap(); + let v = v.to_string(); + values.push(v); + } else { + values.push("__does_not_exist__".to_owned()); + } + } + values +} diff --git a/milli/src/search/new/tests/ngram_split_words.rs b/milli/src/search/new/tests/ngram_split_words.rs new file mode 100644 index 000000000..b78bbe763 --- /dev/null +++ b/milli/src/search/new/tests/ngram_split_words.rs @@ -0,0 +1,372 @@ +/*! +This module tests the following properties: + +1. Two consecutive words from a query can be combined into a "2gram" +2. Three consecutive words from a query can be combined into a "3gram" +3. A word from the query can be split into two consecutive words (split words) +4. A 2gram can be split into two words +5. A 3gram cannot be split into two words +6. 
2grams can contain up to 1 typo
+7. 3grams cannot have typos
+8. 2grams and 3grams can be prefix tolerant
+9. Disabling typo tolerance also disables the split words feature
+10. Disabling typo tolerance does not disable prefix tolerance
+11. Disabling typo tolerance does not disable ngram tolerance
+12. Prefix tolerance is disabled for the last word if a space follows it
+13. Ngrams cannot be formed by combining a phrase and a word or two phrases
+*/
+
+use crate::{
+    index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search,
+    SearchResult, TermsMatchingStrategy,
+};
+
+fn create_index() -> TempIndex {
+    let index = TempIndex::new();
+
+    index
+        .update_settings(|s| {
+            s.set_primary_key("id".to_owned());
+            s.set_searchable_fields(vec!["text".to_owned()]);
+            s.set_criteria(vec![Criterion::Words]);
+        })
+        .unwrap();
+
+    index
+        .add_documents(documents!([
+            {
+                "id": 0,
+                "text": "the sun flowers are pretty"
+            },
+            {
+                "id": 1,
+                "text": "the sun flower is tall"
+            },
+            {
+                "id": 2,
+                "text": "the sunflowers are pretty"
+            },
+            {
+                "id": 3,
+                "text": "the sunflower is tall"
+            },
+            {
+                "id": 4,
+                "text": "the sunflawer is tall"
+            },
+            {
+                "id": 5,
+                "text": "sunflowering is not a verb"
+            }
+        ]))
+        .unwrap();
+    index
+}
+
+#[test]
+fn test_2gram_simple() {
+    let index = create_index();
+    index
+        .update_settings(|s| {
+            s.set_autorize_typos(false);
+        })
+        .unwrap();
+
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("sun flower");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    // will also match documents with "sunflower" + prefix tolerance
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3, 5]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"the sun flowers are pretty\"",
+        "\"the sun flower is tall\"",
+        "\"the sunflowers are pretty\"",
+        "\"the sunflower is tall\"",
+        "\"sunflowering is not a verb\"",
+    ]
+    "###);
+}
+#[test]
+fn test_3gram_simple() {
+    let index = create_index();
+    index
+        .update_settings(|s| {
+            s.set_autorize_typos(false);
+        })
+        .unwrap();
+
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("sun flower s are");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"the sun flowers are pretty\"",
+        "\"the sunflowers are pretty\"",
+    ]
+    "###);
+}
+
+#[test]
+fn test_2gram_typo() {
+    let index = create_index();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("sun flawer");
+    let SearchResult { documents_ids, ..
} = s.execute().unwrap(); + + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3, 4, 5]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the sun flowers are pretty\"", + "\"the sun flower is tall\"", + "\"the sunflowers are pretty\"", + "\"the sunflower is tall\"", + "\"the sunflawer is tall\"", + "\"sunflowering is not a verb\"", + ] + "###); +} + +#[test] +fn test_no_disable_ngrams() { + let index = create_index(); + index + .update_settings(|s| { + s.set_autorize_typos(false); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("sun flower "); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + // documents containing `sunflower` + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 3]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the sun flower is tall\"", + "\"the sunflower is tall\"", + ] + "###); +} + +#[test] +fn test_2gram_prefix() { + let index = create_index(); + index + .update_settings(|s| { + s.set_autorize_typos(false); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("sun flow"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + // documents containing words beginning with `sunflow` + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3, 5]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the sun flowers are pretty\"", + "\"the sun flower is tall\"", + "\"the sunflowers are pretty\"", + "\"the sunflower is tall\"", + "\"sunflowering is not a verb\"", + ] + "###); +} + +#[test] +fn test_3gram_prefix() { + let index = create_index(); + index + .update_settings(|s| { + s.set_autorize_typos(false); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("su nf l"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + + // documents containing a word beginning with sunfl + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 4, 5]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the sunflowers are pretty\"", + "\"the sunflower is tall\"", + "\"the sunflawer is tall\"", + "\"sunflowering is not a verb\"", + ] + "###); +} + +#[test] +fn test_split_words() { + let index = create_index(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("sunflower "); + let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); + + // all the documents with either `sunflower` or `sun flower` + eventual typo + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 2, 3, 4]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the sun flower is tall\"", + "\"the sunflowers are pretty\"", + "\"the sunflower is tall\"", + "\"the sunflawer is tall\"", + ] + "###); +} + +#[test] +fn test_disable_split_words() { + let index = create_index(); + index + .update_settings(|s| { + s.set_autorize_typos(false); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("sunflower "); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + // no document containing `sun flower` + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the sunflower is tall\"", + ] + "###); +} + +#[test] +fn test_2gram_split_words() { + let index = create_index(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("sunf lower"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + + // all the documents with "sunflower", "sun flower", (sunflower + 1 typo), or (sunflower as prefix) + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 2, 3, 4, 5]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the sun flower is tall\"", + "\"the sunflowers are pretty\"", + "\"the sunflower is tall\"", + "\"the sunflawer is tall\"", + "\"sunflowering is not a verb\"", + ] + "###); +} + +#[test] +fn test_3gram_no_split_words() { + let index = create_index(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("sunf lo wer"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + + // no document with `sun flower` + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 5]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the sunflowers are pretty\"", + "\"the sunflower is tall\"", + "\"sunflowering is not a verb\"", + ] + "###); +} + +#[test] +fn test_3gram_no_typos() { + let index = create_index(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("sunf la wer"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the sunflawer is tall\"", + ] + "###); +} + +#[test] +fn test_no_ngram_phrases() { + let index = create_index(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("\"sun\" flower"); + let SearchResult { documents_ids, .. 
} = s.execute().unwrap();
+
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"the sun flowers are pretty\"",
+        "\"the sun flower is tall\"",
+    ]
+    "###);
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("\"sun\" \"flower\"");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"the sun flower is tall\"",
+    ]
+    "###);
+}
diff --git a/milli/src/search/new/tests/proximity.rs b/milli/src/search/new/tests/proximity.rs
new file mode 100644
index 000000000..880f933f0
--- /dev/null
+++ b/milli/src/search/new/tests/proximity.rs
@@ -0,0 +1,471 @@
+/*!
+This module tests the Proximity ranking rule:
+
+1. A proximity of >7 always has the same cost.
+
+2. Phrase terms can be in proximity to other terms via their start and end words,
+but we need to make sure that the phrase exists in the document that meets this
+proximity condition. This is especially relevant with split words and synonyms.
+
+3. An ngram has the same proximity cost as its component words being consecutive,
+e.g. `sunflower` is equivalent to `sun flower`.
+
+4. The prefix databases can be used to find the proximity between two words, but
+they store fewer proximities than the regular word proximity DB.
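+
+As a rough illustration of points (1) and (3) (a sketch only, not the exact cost
+formula used by the implementation), for a document containing "the quick brown fox":
+
+```text
+quick .. brown  -> proximity 1 (adjacent, in order)
+quick .. fox    -> proximity 2
+two words more than 7 positions apart -> always the same, maximal cost
+sun flower      -> the ngram `sunflower` gets the cost of `sun` and `flower`
+                   being consecutive, i.e. proximity 1
+```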
+
+*/
+
+use std::collections::HashMap;
+
+use crate::{
+    index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search,
+    SearchResult, TermsMatchingStrategy,
+};
+
+fn create_simple_index() -> TempIndex {
+    let index = TempIndex::new();
+
+    index
+        .update_settings(|s| {
+            s.set_primary_key("id".to_owned());
+            s.set_searchable_fields(vec!["text".to_owned()]);
+            s.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
+        })
+        .unwrap();
+
+    index
+        .add_documents(documents!([
+            {
+                "id": 0,
+                "text": "the very quick dark brown and smart fox did jump over the terribly lazy and small dog"
+            },
+            {
+                "id": 1,
+                "text": "the. quick brown fox jumps over the lazy. dog"
+            },
+            {
+                "id": 2,
+                "text": "the quick brown fox jumps over the lazy. dog"
+            },
+            {
+                "id": 3,
+                "text": "dog the quick brown fox jumps over the lazy"
+            },
+            {
+                "id": 4,
+                "text": "the quickbrown fox jumps over the lazy dog"
+            },
+            {
+                "id": 5,
+                "text": "brown quick fox jumps over the lazy dog"
+            },
+            {
+                "id": 6,
+                "text": "the really quick brown fox jumps over the very lazy dog"
+            },
+            {
+                "id": 7,
+                "text": "the really quick brown fox jumps over the lazy dog"
+            },
+            {
+                "id": 8,
+                "text": "the quick brown fox jumps over the lazy"
+            },
+            {
+                "id": 9,
+                "text": "the quack brown fox jumps over the lazy"
+            },
+            {
+                // NOTE: `id` is the primary key, so this document overwrites the previous one
+                "id": 9,
+                "text": "the quack brown fox jumps over the lazy dog"
+            },
+            {
+                "id": 10,
+                "text": "the quick brown fox jumps over the lazy dog"
+            }
+        ]))
+        .unwrap();
+    index
+}
+
+fn create_edge_cases_index() -> TempIndex {
+    let index = TempIndex::new();
+
+    index
+        .update_settings(|s| {
+            s.set_primary_key("id".to_owned());
+            s.set_searchable_fields(vec!["text".to_owned()]);
+            s.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
+        })
+        .unwrap();
+
+    index.add_documents(documents!([
+        {
+            // This document will insert "s" in the prefix database
+            "id": 0,
+            "text": "
+            saa sab sac sae saf sag sah sai saj sak sal sam san sao sap saq sar sasa sat sau sav saw sax say saz
+            sba sbb sbc sbe sbf sbg sbh sbi sbj sbk sbl sbm sbn sbo sbp sbq sbr sbsb sbt sbu sbv sbw sbx sby sbz
+            sca scb scc sce scf scg sch sci scj sck scl scm scn sco scp scq scr scsc sct scu scv scw scx scy scz
+            sda sdb sdc sde sdf sdg sdh sdi sdj sdk sdl sdm sdn sdo sdp sdq sdr sdsd sdt sdu sdv sdw sdx sdy sdz
+            sea seb sec see sef seg seh sei sej sek sel sem sen seo sep seq ser sese set seu sev sew sex sey sez
+            sfa sfb sfc sfe sff sfg sfh sfi sfj sfk sfl sfm sfn sfo sfp sfq sfr sfsf sft sfu sfv sfw sfx sfy sfz
+            sga sgb sgc sge sgf sgg sgh sgi sgj sgk sgl sgm sgn sgo sgp sgq sgr sgsg sgt sgu sgv sgw sgx sgy sgz
+            ska skb skc ske skf skg skh ski skj skk skl skm skn sko skp skq skr sksk skt sku skv skw skx sky skz
+            sla slb slc sle slf slg slh sli slj slk sll slm sln slo slp slq slr slsl slt slu slv slw slx sly slz
+            sma smb smc sme smf smg smh smi smj smk sml smm smn smo smp smq smr smsm smt smu smv smw smx smy smz
+            sna snb snc sne snf sng snh sni snj snk snl snm snn sno snp snq snr snsn snt snu snv snw snx sny snz
+            soa sob soc soe sof sog soh soi soj sok sol som son soo sop soq sor soso sot sou sov sow sox soy soz
+            spa spb spc spe spf spg sph spi spj spk spl spm spn spo spp spq spr spsp spt spu spv spw spx spy spz
+            sqa sqb sqc sqe sqf sqg sqh sqi sqj sqk sql sqm sqn sqo sqp sqq sqr sqsq sqt squ sqv sqw sqx sqy sqz
+            sra srb src sre srf srg srh sri srj srk srl srm srn sro srp srq srr srsr srt sru srv srw srx sry srz
+            ssa ssb ssc sse ssf ssg ssh ssi ssj ssk ssl ssm ssn sso ssp ssq ssr ssss sst ssu ssv ssw ssx ssy ssz
+            sta stb stc ste stf stg sth sti stj stk stl stm stn sto stp stq str stst stt stu stv stw stx sty stz
+            "
+        },
+        // The next 5 documents lay out a trap with the split word, phrase search, or synonym `sun flower`.
+        // If the search query is "sunflower", the split word "Sun Flower" will match some documents.
+        // If the query is `sunflower wilting`, then we should make sure that
+        // the proximity condition `flower wilting: prox N` also comes with the condition
+        // `sun wilting: prox N+1`. TODO: this is not the exact condition we use for now.
+        // We only check that the phrase `sun flower` exists and `flower wilting: prox N`, which
+        // is better than nothing but not the best.
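+        // For instance, for the query `sunflower wilting`, documents 4 and 5 below should
+        // ideally rank first; `test_proximity_split_word` (and its TODO) shows the order
+        // currently returned.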
+        {
+            "id": 1,
+            "text": "Sun Flower sounds like the title of a painting, maybe about a plant wilting under the heat."
+        },
+        {
+            "id": 2,
+            "text": "Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat."
+        },
+        {
+            "id": 3,
+            // This document matches the query `sunflower wilting`, but the proximity condition
+            // between `sunflower` and `wilting` cannot go through the split-word `Sun Flower`,
+            // which would reduce to only `flower` and `wilting` being in proximity.
+            "text": "A flower wilting under the sun, unlike a sunflower"
+        },
+        {
+            // This should be the best document for `sunflower wilting`
+            "id": 4,
+            "text": "sun flower wilting under the heat"
+        },
+        {
+            // This is also the best document for `sunflower wilting`
+            "id": 5,
+            "text": "sunflower wilting under the heat"
+        },
+        {
+            // Prox MAX between `best` and `s` prefix
+            "id": 6,
+            "text": "this is the best meal I have ever had in such a beautiful summer day"
+        },
+        {
+            // Prox 5 between `best` and `s` prefix
+            "id": 7,
+            "text": "this is the best cooked meal of the summer"
+        },
+        {
+            // Prox 4 between `best` and `s` prefix
+            "id": 8,
+            "text": "this is the best meal of the summer"
+        },
+        {
+            // Prox 3 between `best` and `s` prefix
+            "id": 9,
+            "text": "this is the best meal of summer"
+        },
+        {
+            // Prox 1 between `best` and `s` prefix
+            "id": 10,
+            "text": "this is the best summer meal"
+        },
+        {
+            // Reverse Prox 3 between `best` and `s` prefix
+            "id": 11,
+            "text": "summer x y best"
+        },
+        {
+            // Reverse Prox 2 between `best` and `s` prefix
+            "id": 12,
+            "text": "summer x best"
+        },
+        {
+            // Reverse Prox 1 between `best` and `s` prefix
+            "id": 13,
+            "text": "summer best"
+        },
+        {
+            // This document will insert "win" in the prefix database
+            "id": 14,
+            "text": "
+            winaa winab winac winae winaf winag winah winai winaj winak winal winam winan winao winap winaq winar winasa winat winau winav winaw winax winay winaz
+            winba winbb winbc winbe winbf winbg winbh winbi winbj winbk winbl winbm winbn winbo winbp winbq winbr winbsb winbt winbu winbv winbw winbx winby winbz
+            winca wincb wincc wince wincf wincg winch winci wincj winck wincl wincm wincn winco wincp wincq wincr wincsc winct wincu wincv wincw wincx wincy wincz
+            winda windb windc winde windf windg windh windi windj windk windl windm windn windo windp windq windr windsd windt windu windv windw windx windy windz
+            winea wineb winec winee winef wineg wineh winei winej winek winel winem winen wineo winep wineq winer winese winet wineu winev winew winex winey winez
+            winfa winfb winfc winfe winff winfg winfh winfi winfj winfk winfl winfm winfn winfo winfp winfq winfr winfsf winft winfu winfv winfw winfx winfy winfz
+            winga wingb wingc winge wingf wingg wingh wingi wingj wingk wingl wingm wingn wingo wingp wingq wingr wingsg wingt wingu wingv wingw wingx wingy wingz
+            winka winkb winkc winke winkf winkg winkh winki winkj winkk winkl winkm winkn winko winkp winkq winkr winksk winkt winku winkv winkw winkx winky winkz
+            winla winlb winlc winle winlf winlg winlh winli winlj winlk winll winlm winln winlo winlp winlq winlr winlsl winlt winlu winlv winlw winlx winly winlz
+            winma winmb winmc winme winmf winmg winmh winmi winmj winmk winml winmm winmn winmo winmp winmq winmr winmsm winmt winmu winmv winmw winmx winmy winmz
+            winna winnb winnc winne winnf winng winnh winni winnj winnk winnl winnm winnn winno winnp winnq winnr winnsn winnt winnu winnv winnw winnx winny winnz
+            winoa winob winoc winoe winof winog winoh winoi winoj winok winol winom winon
winoo winop winoq winor winoso winot winou winov winow winox winoy winoz + winpa winpb winpc winpe winpf winpg winph winpi winpj winpk winpl winpm winpn winpo winpp winpq winpr winpsp winpt winpu winpv winpw winpx winpy winpz + winqa winqb winqc winqe winqf winqg winqh winqi winqj winqk winql winqm winqn winqo winqp winqq winqr winqsq winqt winqu winqv winqw winqx winqy winqz + winra winrb winrc winre winrf winrg winrh winri winrj winrk winrl winrm winrn winro winrp winrq winrr winrsr winrt winru winrv winrw winrx winry winrz + winsa winsb winsc winse winsf winsg winsh winsi winsj winsk winsl winsm winsn winso winsp winsq winsr winsss winst winsu winsv winsw winsx winsy winsz + winta wintb wintc winte wintf wintg winth winti wintj wintk wintl wintm wintn winto wintp wintq wintr wintst wintt wintu wintv wintw wintx winty wintz + " + }, + { + // Prox MAX between `best` and `win` prefix + "id": 15, + "text": "this is the best meal I have ever had in such a beautiful winter day" + }, + { + // Prox 5 between `best` and `win` prefix + "id": 16, + "text": "this is the best cooked meal of the winter" + }, + { + // Prox 4 between `best` and `win` prefix + "id": 17, + "text": "this is the best meal of the winter" + }, + { + // Prox 3 between `best` and `win` prefix + "id": 18, + "text": "this is the best meal of winter" + }, + { + // Prox 1 between `best` and `win` prefix + "id": 19, + "text": "this is the best winter meal" + }, + { + // Reverse Prox 3 between `best` and `win` prefix + "id": 20, + "text": "winter x y best" + }, + { + // Reverse Prox 2 between `best` and `win` prefix + "id": 21, + "text": "winter x best" + }, + { + // Reverse Prox 1 between `best` and `win` prefix + "id": 22, + "text": "winter best" + }, + ])).unwrap(); + index +} + +#[test] +fn test_proximity_simple() { + let index = create_simple_index(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the quick brown fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 9, 10, 7, 6, 5, 2, 3, 0, 1]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quickbrown fox jumps over the lazy dog\"", + "\"the quack brown fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the lazy dog\"", + "\"the really quick brown fox jumps over the lazy dog\"", + "\"the really quick brown fox jumps over the very lazy dog\"", + "\"brown quick fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the lazy. dog\"", + "\"dog the quick brown fox jumps over the lazy\"", + "\"the very quick dark brown and smart fox did jump over the terribly lazy and small dog\"", + "\"the. quick brown fox jumps over the lazy. dog\"", + ] + "###); +} + +#[test] +fn test_proximity_split_word() { + let index = create_edge_cases_index(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("sunflower wilting"); + let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 5, 1, 3]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + // TODO: "2" and "4" should be swapped ideally + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"", + "\"sun flower wilting under the heat\"", + "\"sunflower wilting under the heat\"", + "\"Sun Flower sounds like the title of a painting, maybe about a plant wilting under the heat.\"", + "\"A flower wilting under the sun, unlike a sunflower\"", + ] + "###); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("\"sun flower\" wilting"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 1]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + // TODO: "2" and "4" should be swapped ideally + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"", + "\"sun flower wilting under the heat\"", + "\"Sun Flower sounds like the title of a painting, maybe about a plant wilting under the heat.\"", + ] + "###); + drop(txn); + + index + .update_settings(|s| { + let mut syns = HashMap::new(); + syns.insert("xyz".to_owned(), vec!["sun flower".to_owned()]); + s.set_synonyms(syns); + }) + .unwrap(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("xyz wilting"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 1]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + // TODO: "2" and "4" should be swapped ideally + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"", + "\"sun flower wilting under the heat\"", + "\"Sun Flower sounds like the title of a painting, maybe about a plant wilting under the heat.\"", + ] + "###); +} + +#[test] +fn test_proximity_prefix_db() { + let index = create_edge_cases_index(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("best s"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 8, 6, 7, 11, 15]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + + // This test illustrates the loss of precision from using the prefix DB + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"this is the best summer meal\"", + "\"summer best\"", + "\"this is the best meal of summer\"", + "\"summer x best\"", + "\"this is the best meal of the summer\"", + "\"this is the best meal I have ever had in such a beautiful summer day\"", + "\"this is the best cooked meal of the summer\"", + "\"summer x y best\"", + "\"this is the best meal I have ever had in such a beautiful winter day\"", + ] + "###); + + // Difference when using the `su` prefix, which is not in the prefix DB + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("best su"); + let SearchResult { documents_ids, .. 
} = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 13, 9, 12, 8, 11, 7, 6, 15]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"this is the best summer meal\"",
+        "\"summer best\"",
+        "\"this is the best meal of summer\"",
+        "\"summer x best\"",
+        "\"this is the best meal of the summer\"",
+        "\"summer x y best\"",
+        "\"this is the best cooked meal of the summer\"",
+        "\"this is the best meal I have ever had in such a beautiful summer day\"",
+        "\"this is the best meal I have ever had in such a beautiful winter day\"",
+    ]
+    "###);
+
+    // Note that there is a case where a prefix is in the prefix DB but not in the
+    // **proximity** prefix DB. In that case, its proximity score will always be
+    // the maximum. This happens for prefixes that are larger than 2 bytes.
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("best win");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[15, 16, 17, 18, 19, 20, 21, 22]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"this is the best meal I have ever had in such a beautiful winter day\"",
+        "\"this is the best cooked meal of the winter\"",
+        "\"this is the best meal of the winter\"",
+        "\"this is the best meal of winter\"",
+        "\"this is the best winter meal\"",
+        "\"winter x y best\"",
+        "\"winter x best\"",
+        "\"winter best\"",
+    ]
+    "###);
+
+    // Now using `wint`, which is not in the prefix DB:
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("best wint");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 17, 20, 16, 15]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"this is the best winter meal\"",
+        "\"winter best\"",
+        "\"this is the best meal of winter\"",
+        "\"winter x best\"",
+        "\"this is the best meal of the winter\"",
+        "\"winter x y best\"",
+        "\"this is the best cooked meal of the winter\"",
+        "\"this is the best meal I have ever had in such a beautiful winter day\"",
+    ]
+    "###);
+
+    // and using `wi` which is in the prefix DB and proximity prefix DB
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("best wi");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 22, 18, 21, 17, 15, 16, 20]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"this is the best winter meal\"",
+        "\"winter best\"",
+        "\"this is the best meal of winter\"",
+        "\"winter x best\"",
+        "\"this is the best meal of the winter\"",
+        "\"this is the best meal I have ever had in such a beautiful winter day\"",
+        "\"this is the best cooked meal of the winter\"",
+        "\"winter x y best\"",
+    ]
+    "###);
+}
diff --git a/milli/src/search/new/tests/proximity_typo.rs b/milli/src/search/new/tests/proximity_typo.rs
new file mode 100644
index 000000000..9f9601e3f
--- /dev/null
+++ b/milli/src/search/new/tests/proximity_typo.rs
@@ -0,0 +1,75 @@
+/*!
+This module tests the interactions between the proximity and typo ranking rules.
+
+The proximity ranking rule should transform the query graph such that it
+only contains the word pairs that it used to compute its bucket.
+
+TODO: This is not currently implemented.
+*/
+
+use crate::{
+    index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search,
+    SearchResult, TermsMatchingStrategy,
+};
+
+fn create_index() -> TempIndex {
+    let index = TempIndex::new();
+
+    index
+        .update_settings(|s| {
+            s.set_primary_key("id".to_owned());
+            s.set_searchable_fields(vec!["text".to_owned()]);
+            s.set_criteria(vec![Criterion::Words, Criterion::Proximity, Criterion::Typo]);
+        })
+        .unwrap();
+
+    index
+        .add_documents(documents!([
+            // Basic trap.
+            //
+            // We have one document with the perfect word pair `sommer - holiday`
+            // and another with the perfect word pair `sommer - holidty`.
+            //
+            // The proximity ranking rule will put them both in the same bucket, and it
+            // should narrow the query graph down so that it represents:
+            // EITHER:
+            //    sommer + holiday
+            // OR:
+            //    sommer + holidty
+            //
+            // so that the child typo ranking rule does not find any match
+            // for its zero-typo bucket `summer + holiday`, even though both documents
+            // contain these two exact words.
+            {
+                "id": 0,
+                "text": "summer. holiday. sommer holidty"
+            },
+            {
+                "id": 1,
+                "text": "summer. holiday. sommer holiday"
+            },
+
+        ]))
+        .unwrap();
+    index
+}
+
+#[test]
+fn test_trap_basic() {
+    let index = create_index();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("summer holiday");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    // TODO: this is incorrect, 1 should come before 0
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"summer. holiday. sommer holidty\"",
+        "\"summer. holiday. sommer holiday\"",
+    ]
+    "###);
+}
diff --git a/milli/src/search/new/tests/sort.rs b/milli/src/search/new/tests/sort.rs
new file mode 100644
index 000000000..d2201f55b
--- /dev/null
+++ b/milli/src/search/new/tests/sort.rs
@@ -0,0 +1,316 @@
+/*!
+This module tests the `sort` ranking rule:
+
+1. an error is returned if the sort ranking rule exists but no fields-to-sort were given at search time
+2. an error is returned if the fields-to-sort are not sortable
+3. it is possible to add multiple fields-to-sort at search time
+4. custom sort ranking rules can be added to the settings; they interact with the generic `sort` ranking rule as expected
+5. numbers appear before strings
+6. documents with either: (1) no value, (2) null, or (3) an object for the field-to-sort appear at the end of the bucket
+7. boolean values are translated to strings
+8. if a field contains an array, it is sorted by the best value in the array according to the sort rule
+*/
+
+use big_s::S;
+use maplit::hashset;
+
+use crate::{
+    index::tests::TempIndex, search::new::tests::collect_field_values, AscDesc, Criterion, Member,
+    Search, SearchResult, TermsMatchingStrategy,
+};
+
+fn create_index() -> TempIndex {
+    let index = TempIndex::new();
+
+    index
+        .update_settings(|s| {
+            s.set_primary_key("id".to_owned());
+            s.set_searchable_fields(vec!["text".to_owned()]);
+            s.set_sortable_fields(hashset!
{ S("rank"), S("vague"), S("letter") }); + s.set_criteria(vec![Criterion::Sort]); + }) + .unwrap(); + + index + .add_documents(documents!([ + { + "id": 0, + "letter": "A", + "rank": 0, + "vague": 0, + }, + { + "id": 1, + "letter": "A", + "rank": 1, + "vague": "0", + }, + { + "id": 2, + "letter": "B", + "rank": 0, + "vague": 1, + }, + { + "id": 3, + "letter": "B", + "rank": 1, + "vague": "1", + }, + { + "id": 4, + "letter": "B", + "rank": 2, + "vague": [1, 2], + }, + { + "id": 5, + "letter": "C", + "rank": 0, + "vague": [1, "2"], + }, + { + "id": 6, + "letter": "C", + "rank": 1, + }, + { + "id": 7, + "letter": "C", + "rank": 2, + "vague": null, + }, + { + "id": 8, + "letter": "D", + "rank": 0, + "vague": [null, null, ""] + }, + { + "id": 9, + "letter": "E", + "rank": 0, + "vague": "" + }, + { + "id": 10, + "letter": "E", + "rank": 1, + "vague": { + "sub": 0, + } + }, + { + "id": 11, + "letter": "E", + "rank": 2, + "vague": true, + }, + { + "id": 12, + "letter": "E", + "rank": 3, + "vague": false, + }, + { + "id": 13, + "letter": "E", + "rank": 4, + "vague": 1.5673, + }, + { + "id": 14, + "letter": "E", + "rank": 5, + }, + { + "id": 15, + "letter": "F", + "rank": 0, + }, + { + "id": 16, + "letter": "F", + "rank": 1, + }, + { + "id": 17, + "letter": "F", + "rank": 2, + }, + { + "id": 18, + "letter": "G", + "rank": 0, + }, + { + "id": 19, + "letter": "G", + "rank": 1, + }, + { + "id": 20, + "letter": "H", + "rank": 0, + "vague": true, + }, + { + "id": 21, + "letter": "I", + "rank": 0, + "vague": false, + }, + { + "id": 22, + "letter": "I", + "rank": 1, + "vague": [1.1367, "help", null] + }, + { + "id": 23, + "letter": "I", + "rank": 2, + "vague": [1.2367, "hello"] + }, + ])) + .unwrap(); + index +} + +#[test] +fn test_sort() { + let index = create_index(); + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.sort_criteria(vec![AscDesc::Desc(Member::Field(S("letter")))]); + + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 22, 23, 20, 18, 19, 15, 16, 17, 9, 10, 11, 12, 13, 14, 8, 5, 6, 7, 2]"); + + let letter_values = collect_field_values(&index, &txn, "letter", &documents_ids); + insta::assert_debug_snapshot!(letter_values, @r###" + [ + "\"I\"", + "\"I\"", + "\"I\"", + "\"H\"", + "\"G\"", + "\"G\"", + "\"F\"", + "\"F\"", + "\"F\"", + "\"E\"", + "\"E\"", + "\"E\"", + "\"E\"", + "\"E\"", + "\"E\"", + "\"D\"", + "\"C\"", + "\"C\"", + "\"C\"", + "\"B\"", + ] + "###); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.sort_criteria(vec![AscDesc::Desc(Member::Field(S("rank")))]); + + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 13, 12, 4, 7, 11, 17, 23, 1, 3, 6, 10, 16, 19, 22, 0, 2, 5, 8, 9]"); + + let rank_values = collect_field_values(&index, &txn, "rank", &documents_ids); + insta::assert_debug_snapshot!(rank_values, @r###" + [ + "5", + "4", + "3", + "2", + "2", + "2", + "2", + "2", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "0", + "0", + "0", + "0", + "0", + ] + "###); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.sort_criteria(vec![AscDesc::Asc(Member::Field(S("vague")))]); + + let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 2, 4, 5, 22, 23, 13, 1, 3, 12, 21, 11, 20, 6, 7, 8, 9, 10, 14, 15]"); + + let vague_values = collect_field_values(&index, &txn, "vague", &documents_ids); + insta::assert_debug_snapshot!(vague_values, @r###" + [ + "0", + "1", + "[1,2]", + "[1,\"2\"]", + "[1.1367,\"help\",null]", + "[1.2367,\"hello\"]", + "1.5673", + "\"0\"", + "\"1\"", + "false", + "false", + "true", + "true", + "__does_not_exist__", + "null", + "[null,null,\"\"]", + "\"\"", + "{\"sub\":0}", + "__does_not_exist__", + "__does_not_exist__", + ] + "###); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.sort_criteria(vec![AscDesc::Desc(Member::Field(S("vague")))]); + + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 13, 23, 22, 2, 5, 0, 11, 20, 12, 21, 3, 1, 6, 7, 8, 9, 10, 14, 15]"); + + let vague_values = collect_field_values(&index, &txn, "vague", &documents_ids); + insta::assert_debug_snapshot!(vague_values, @r###" + [ + "[1,2]", + "1.5673", + "[1.2367,\"hello\"]", + "[1.1367,\"help\",null]", + "1", + "[1,\"2\"]", + "0", + "true", + "true", + "false", + "false", + "\"1\"", + "\"0\"", + "__does_not_exist__", + "null", + "[null,null,\"\"]", + "\"\"", + "{\"sub\":0}", + "__does_not_exist__", + "__does_not_exist__", + ] + "###); +} diff --git a/milli/src/search/new/tests/typo.rs b/milli/src/search/new/tests/typo.rs new file mode 100644 index 000000000..4df340e9b --- /dev/null +++ b/milli/src/search/new/tests/typo.rs @@ -0,0 +1,497 @@ +/*! +This module tests the following properties: + +1. The `words` ranking rule is typo-tolerant +2. Typo-tolerance handles missing letters, extra letters, replaced letters, and swapped letters (at least) +3. Words which are < `min_word_len_one_typo` are not typo tolerant +4. Words which are >= `min_word_len_one_typo` but < `min_word_len_two_typos` can have one typo +5. Words which are >= `min_word_len_two_typos` can have two typos +6. A typo on the first letter of a word counts as two typos +7. Phrases are not typo tolerant +8. 2grams can have 1 typo if they are larger than `min_word_len_two_typos` +9. 3grams are not typo tolerant +10. The `typo` ranking rule assumes the role of the `words` ranking rule implicitly +if `words` doesn't exist before it. +11. The `typo` ranking rule places documents with the same number of typos in the same bucket +12. Prefix tolerance costs nothing according to the typo ranking rule +13. Split words cost 1 typo according to the typo ranking rule +14. 
Synonyms cost nothing according to the typo ranking rule +*/ + +use std::collections::HashMap; + +use crate::{ + index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search, + SearchResult, TermsMatchingStrategy, +}; + +fn create_index() -> TempIndex { + let index = TempIndex::new(); + + index + .update_settings(|s| { + s.set_primary_key("id".to_owned()); + s.set_searchable_fields(vec!["text".to_owned()]); + s.set_criteria(vec![Criterion::Words]); + }) + .unwrap(); + + index + .add_documents(documents!([ + { + "id": 0, + "text": "the quick brown fox jumps over the lazy dog" + }, + { + "id": 1, + "text": "the quick brown foxes jump over the lazy dog" + }, + { + "id": 2, + "text": "the quick brown fax sends a letter to the dog" + }, + { + "id": 3, + "text": "the quickest brownest fox jumps over the laziest dog" + }, + { + "id": 4, + "text": "a fox doesn't quack, that crown goes to the duck." + }, + { + "id": 5, + "text": "the quicker browner fox jumped over the lazier dog" + }, + { + "id": 6, + "text": "the extravagant fox skyrocketed over the languorous dog" // thanks thesaurus + }, + { + "id": 7, + "text": "the quick brown fox jumps over the lazy" + }, + { + "id": 8, + "text": "the quick brown fox jumps over the" + }, + { + "id": 9, + "text": "the quick brown fox jumps over" + }, + { + "id": 10, + "text": "the quick brown fox jumps" + }, + { + "id": 11, + "text": "the quick brown fox" + }, + { + "id": 12, + "text": "the quick brown" + }, + { + "id": 13, + "text": "the quick" + }, + { + "id": 14, + "text": "netwolk interconections sunflawar" + }, + { + "id": 15, + "text": "network interconnections sunflawer" + }, + { + "id": 16, + "text": "network interconnection sunflower" + }, + { + "id": 17, + "text": "network interconnection sun flower" + }, + { + "id": 18, + "text": "network interconnection sunflowering" + }, + { + "id": 19, + "text": "network interconnection sun flowering" + }, + { + "id": 20, + "text": "network interconnection sunflowar" + }, + { + "id": 21, + "text": "the fast brownish fox jumps over the lackadaisical dog" + }, + { + "id": 22, + "text": "the quick brown fox jumps over the lackadaisical dog" + }, + { + "id": 23, + "text": "the quivk brown fox jumps over the lazy dog" + }, + ])) + .unwrap(); + index +} + +#[test] +fn test_no_typo() { + let index = create_index(); + index + .update_settings(|s| { + s.set_autorize_typos(false); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the quick brown fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + ] + "###); +} + +#[test] +fn test_default_typo() { + let index = create_index(); + let txn = index.read_txn().unwrap(); + + let ot = index.min_word_len_one_typo(&txn).unwrap(); + let tt = index.min_word_len_two_typos(&txn).unwrap(); + insta::assert_debug_snapshot!(ot, @"5"); + insta::assert_debug_snapshot!(tt, @"9"); + + // 0 typo + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::All); + s.query("the quick brown fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. 
} = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 23]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"the quick brown fox jumps over the lazy dog\"",
+        "\"the quivk brown fox jumps over the lazy dog\"",
+    ]
+    "###);
+
+    // 1 typo on one word, replaced letter
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the quack brown fox jumps over the lazy dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"the quick brown fox jumps over the lazy dog\"",
+    ]
+    "###);
+
+    // 1 typo on one word, missing letter, extra letter
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the quicest brownest fox jummps over the laziest dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"the quickest brownest fox jumps over the laziest dog\"",
+    ]
+    "###);
+
+    // 1 typo on one word, swapped letters
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the quikc borwn fox jupms over the lazy dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"the quick brown fox jumps over the lazy dog\"",
+    ]
+    "###);
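+
+    // A reminder of the typo budget used in the queries below (from the defaults
+    // asserted at the top of this test and the module documentation):
+    //   - a word of at least 5 bytes may have 1 typo
+    //   - a word of at least 9 bytes may have up to 2 typos
+    //   - a typo on the first letter of a word counts as two typos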
+
+    // 1 first-letter typo (counts as 2 typos) on a word < 9 bytes, replaced letter
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the nuick brown fox jumps over the lazy dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]");
+
+    // 1 first-letter typo (counts as 2 typos) on a word < 9 bytes, missing letter
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the uick brown fox jumps over the lazy dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]");
+
+    // 1 typo on all words >=5 bytes, replaced letters
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the quack brawn fox junps over the lazy dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"the quick brown fox jumps over the lazy dog\"",
+    ]
+    "###);
+
+    // 2 typos on words < 9 bytes
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the quckest brawnert fox jumps over the aziest dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]");
+
+    // 2 typos on words >= 9 bytes: missing letters, missing first letter, replaced letters
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the extravant fox kyrocketed over the lamguorout dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"the extravagant fox skyrocketed over the languorous dog\"",
+    ]
+    "###);
+
+    // 2 typos on words >= 9 bytes: 2 extra letters in a single word, swapped letters + extra letter, replaced letters
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the extravaganttt fox sktyrocnketed over the lagnuorrous dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"the extravagant fox skyrocketed over the languorous dog\"",
+    ]
+    "###);
+}
+
+#[test]
+fn test_phrase_no_typo_allowed() {
+    let index = create_index();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the \"quick brewn\" fox jumps over the lazy dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @"[]");
+}
+
+#[test]
+fn test_ngram_typos() {
+    let index = create_index();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the extra lagant fox skyrocketed over the languorous dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"the extravagant fox skyrocketed over the languorous dog\"",
+    ]
+    "###);
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the ex tra lagant fox skyrocketed over the languorous dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @"[]");
+}
+
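+// Property 10 of the module documentation: the `typo` ranking rule implicitly assumes
+// the role of the `words` ranking rule when `words` does not precede it, so the two
+// configurations below are expected to return exactly the same results.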
+#[test]
+fn test_typo_ranking_rule_not_preceded_by_words_ranking_rule() {
+    let index = create_index();
+    index
+        .update_settings(|s| {
+            s.set_criteria(vec![Criterion::Typo]);
+        })
+        .unwrap();
+
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::Last);
+    s.query("the quick brown fox jumps over the lazy dog");
+    let SearchResult { documents_ids: ids_1, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{ids_1:?}"), @"[0, 23, 7, 8, 9, 22, 10, 11, 1, 2, 12, 13, 4, 3, 5, 6, 21]");
+    let texts = collect_field_values(&index, &txn, "text", &ids_1);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"the quick brown fox jumps over the lazy dog\"",
+        "\"the quivk brown fox jumps over the lazy dog\"",
+        "\"the quick brown fox jumps over the lazy\"",
+        "\"the quick brown fox jumps over the\"",
+        "\"the quick brown fox jumps over\"",
+        "\"the quick brown fox jumps over the lackadaisical dog\"",
+        "\"the quick brown fox jumps\"",
+        "\"the quick brown fox\"",
+        "\"the quick brown foxes jump over the lazy dog\"",
+        "\"the quick brown fax sends a letter to the dog\"",
+        "\"the quick brown\"",
+        "\"the quick\"",
+        "\"a fox doesn't quack, that crown goes to the duck.\"",
+        "\"the quickest brownest fox jumps over the laziest dog\"",
+        "\"the quicker browner fox jumped over the lazier dog\"",
+        "\"the extravagant fox skyrocketed over the languorous dog\"",
+        "\"the fast brownish fox jumps over the lackadaisical dog\"",
+    ]
+    "###);
+
+    // Re-open the transaction so that the settings update below is visible to the search
+    drop(txn);
+    index
+        .update_settings(|s| {
+            s.set_criteria(vec![Criterion::Words, Criterion::Typo]);
+        })
+        .unwrap();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::Last);
+    s.query("the quick brown fox jumps over the lazy dog");
+    let SearchResult { documents_ids: ids_2, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{ids_2:?}"), @"[0, 23, 7, 8, 9, 22, 10, 11, 1, 2, 12, 13, 4, 3, 5, 6, 21]");
+
+    assert_eq!(ids_1, ids_2);
+}
+
+#[test]
+fn test_typo_bucketing() {
+    let index = create_index();
+
+    let txn = index.read_txn().unwrap();
+
+    // First do the search with just the Words ranking rule
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("network interconnection sunflower");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 15, 16, 17, 18, 20]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"netwolk interconections sunflawar\"",
+        "\"network interconnections sunflawer\"",
+        "\"network interconnection sunflower\"",
+        "\"network interconnection sun flower\"",
+        "\"network interconnection sunflowering\"",
+        "\"network interconnection sunflowar\"",
+    ]
+    "###);
+
+    // Then with the typo ranking rule
+    drop(txn);
+    index
+        .update_settings(|s| {
+            s.set_criteria(vec![Criterion::Typo]);
+        })
+        .unwrap();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("network interconnection sunflower");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[16, 18, 17, 20, 15, 14]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"network interconnection sunflower\"",
+        "\"network interconnection sunflowering\"",
+        "\"network interconnection sun flower\"",
+        "\"network interconnection sunflowar\"",
+        "\"network interconnections sunflawer\"",
+        "\"netwolk interconections sunflawar\"",
+    ]
+    "###);
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("network interconnection sun flower");
+    let SearchResult { documents_ids, ..
} = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[17, 19, 16, 18, 20, 15]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"network interconnection sun flower\"",
+        "\"network interconnection sun flowering\"",
+        "\"network interconnection sunflower\"",
+        "\"network interconnection sunflowering\"",
+        "\"network interconnection sunflowar\"",
+        "\"network interconnections sunflawer\"",
+    ]
+    "###);
+}
+
+#[test]
+fn test_typo_synonyms() {
+    let index = create_index();
+    index
+        .update_settings(|s| {
+            s.set_criteria(vec![Criterion::Typo]);
+
+            let mut synonyms = HashMap::new();
+            synonyms.insert("lackadaisical".to_owned(), vec!["lazy".to_owned()]);
+            synonyms.insert("fast brownish".to_owned(), vec!["quick brown".to_owned()]);
+
+            s.set_synonyms(synonyms);
+        })
+        .unwrap();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the quick brown fox jumps over the lackadaisical dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 22, 23]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"the quick brown fox jumps over the lazy dog\"",
+        "\"the quick brown fox jumps over the lackadaisical dog\"",
+        "\"the quivk brown fox jumps over the lazy dog\"",
+    ]
+    "###);
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("the fast brownish fox jumps over the lackadaisical dog");
+
+    // TODO: is this correct? interaction of ngrams + synonyms means that the
+    // multi-word synonyms end up having a typo cost. This is probably not what we want.
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 0, 22]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"the fast brownish fox jumps over the lackadaisical dog\"",
+        "\"the quick brown fox jumps over the lazy dog\"",
+        "\"the quick brown fox jumps over the lackadaisical dog\"",
+    ]
+    "###);
+}
diff --git a/milli/src/search/new/tests/typo_proximity.rs b/milli/src/search/new/tests/typo_proximity.rs
new file mode 100644
index 000000000..220fc69e1
--- /dev/null
+++ b/milli/src/search/new/tests/typo_proximity.rs
@@ -0,0 +1,124 @@
+/*!
+This module tests the interactions between the typo and proximity ranking rules.
+
+The typo ranking rule should transform the query graph such that it only contains
+the combinations of word derivations that it used to compute its bucket.
+
+The proximity ranking rule should then look for proximities only between those specific derivations.
+For example, given the search query `beautiful summer` and the dataset:
+```text
+{ "id": 0, "text": "beautigul summer...... beautiful day in the summer" }
+{ "id": 1, "text": "beautiful summer" }
+```
+Then the document with id `1` should be returned before `0`.
+The proximity ranking rule is not allowed to look for the proximity between `beautigul` and `summer`
+because the typo ranking rule before it only used the derivation `beautiful`.
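+
+A rough sketch of what happens on this example (an illustration, not the exact buckets
+computed by the implementation):
+```text
+zero-typo bucket (derivation `beautiful`): documents 0 and 1
+proximity inside that bucket may only measure `beautiful .. summer`:
+  document 1: "beautiful summer"            -> proximity 1
+  document 0: "beautiful day in the summer" -> proximity 4
+it must not use `beautigul .. summer`, which would give document 0 a proximity of 1
+```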
+*/
+
+use crate::{
+    index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search,
+    SearchResult, TermsMatchingStrategy,
+};
+
+fn create_index() -> TempIndex {
+    let index = TempIndex::new();
+
+    index
+        .update_settings(|s| {
+            s.set_primary_key("id".to_owned());
+            s.set_searchable_fields(vec!["text".to_owned()]);
+            s.set_criteria(vec![Criterion::Words, Criterion::Typo, Criterion::Proximity]);
+        })
+        .unwrap();
+
+    index
+        .add_documents(documents!([
+            // trap explained in the module documentation
+            {
+                "id": 0,
+                "text": "beautigul summer. beautiful x y z summer"
+            },
+            {
+                "id": 1,
+                "text": "beautiful summer"
+            },
+            // the next 2 documents set up a more complicated trap
+            // with the query `beautiful summer`, we will have:
+            // 1. documents with no typos, id 0 and 1
+            // 2. documents with 1 typo: id 2 and 3, which are interpreted as EITHER
+            //    - id 2: "beautigul + summer" ; OR
+            //    - id 3: "beautiful + sommer"
+            // To sort these two documents, the proximity ranking rule must use only the
+            // word pairs `beautigul -- summer` and `beautiful -- sommer`, even though
+            // all variations of `beautiful` and `sommer` were used by the typo ranking rule.
+            {
+                "id": 2,
+                "text": "beautigul sommer. beautigul x summer"
+            },
+            {
+                "id": 3,
+                "text": "beautiful sommer"
+            },
+            // The next two documents lay out an even more complex trap.
+            // With the user query `delicious sweet dessert`, the typo ranking rule will return one bucket of:
+            // - id 4: delicitous + sweet + dessert
+            // - id 5: delicious + sweet + desgert
+            // The word pairs that the proximity ranking rule is allowed to use are
+            // EITHER:
+            //    delicitous -- sweet AND sweet -- dessert
+            // OR:
+            //    delicious -- sweet AND sweet -- desgert
+            // So the word pair to use for the terms `sweet` and `dessert` depends on the
+            // word pairs explored before them.
+            {
+                "id": 4,
+                "text": "delicitous. sweet. dessert. delicitous sweet desgert",
+            },
+            {
+                "id": 5,
+                "text": "delicious. sweet desgert. delicious sweet desgert",
+            },
+        ]))
+        .unwrap();
+    index
+}
+
+#[test]
+fn test_trap_basic_and_complex1() {
+    let index = create_index();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("beautiful summer");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 0, 3, 2]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"beautiful summer\"",
+        "\"beautigul summer. beautiful x y z summer\"",
+        "\"beautiful sommer\"",
+        "\"beautigul sommer. beautigul x summer\"",
+    ]
+    "###);
+}
+
+#[test]
+fn test_trap_complex2() {
+    let index = create_index();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("delicious sweet dessert");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[5, 4]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"delicious. sweet desgert. delicious sweet desgert\"",
+        "\"delicitous. sweet. dessert. delicitous sweet desgert\"",
+    ]
+    "###);
+}
diff --git a/milli/src/search/new/tests/words_tms.rs b/milli/src/search/new/tests/words_tms.rs
new file mode 100644
index 000000000..74748ea5a
--- /dev/null
+++ b/milli/src/search/new/tests/words_tms.rs
@@ -0,0 +1,437 @@
+/*!
+This module tests the following properties:
+
+1. The `last` term matching strategy removes terms from the query, starting from the end,
+when no more documents match the longer query.
+2. Phrases are never deleted by the `last` term matching strategy
+3. Duplicate words don't affect the ranking of a document according to the `words` ranking rule
+4. The proximity of the first and last word of a phrase to its adjacent terms is taken into
+account by the proximity ranking rule.
+5. Unclosed double quotes still make a phrase
+6. The `all` term matching strategy does not remove any term from the query
+7. The search is capable of returning no results if no documents match the query
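+
+As a rough illustration of (1), with the query `the quick brown fox`, the `last` strategy
+buckets documents by trying the following sub-queries, in order:
+```text
+the quick brown fox
+the quick brown
+the quick
+the
+```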
+*/
+
+use crate::{
+    index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search,
+    SearchResult, TermsMatchingStrategy,
+};
+
+fn create_index() -> TempIndex {
+    let index = TempIndex::new();
+
+    index
+        .update_settings(|s| {
+            s.set_primary_key("id".to_owned());
+            s.set_searchable_fields(vec!["text".to_owned()]);
+            s.set_criteria(vec![Criterion::Words]);
+        })
+        .unwrap();
+
+    index
+        .add_documents(documents!([
+            {
+                "id": 0,
+                "text": "",
+            },
+            {
+                "id": 1,
+                "text": "the",
+            },
+            {
+                "id": 2,
+                "text": "the quick",
+            },
+            {
+                "id": 3,
+                "text": "the quick brown",
+            },
+            {
+                "id": 4,
+                "text": "the quick brown fox",
+            },
+            {
+                "id": 5,
+                "text": "the quick brown fox jumps",
+            },
+            {
+                "id": 6,
+                "text": "the quick brown fox jumps over",
+            },
+            {
+                "id": 7,
+                "text": "the quick brown fox jumps over the",
+            },
+            {
+                "id": 8,
+                "text": "the quick brown fox jumps over the lazy",
+            },
+            {
+                "id": 9,
+                "text": "the quick brown fox jumps over the lazy dog",
+            },
+            {
+                "id": 10,
+                "text": "the brown quick fox jumps over the lazy dog",
+            },
+            {
+                "id": 11,
+                "text": "the quick brown fox talks to the lazy and slow dog",
+            },
+            {
+                "id": 12,
+                "text": "the quick brown fox talks to the lazy dog",
+            },
+            {
+                "id": 13,
+                "text": "the mighty and quick brown fox jumps over the lazy dog",
+            },
+            {
+                "id": 14,
+                "text": "the great quick brown fox jumps over the lazy dog",
+            },
+            {
+                "id": 15,
+                "text": "this quick brown and very scary fox jumps over the lazy dog",
+            },
+            {
+                "id": 16,
+                "text": "this quick brown and scary fox jumps over the lazy dog",
+            },
+            {
+                "id": 17,
+                "text": "the quick brown fox jumps over the really lazy dog",
+            },
+            {
+                "id": 18,
+                "text": "the brown quick fox jumps over the really lazy dog",
+            },
+            {
+                "id": 19,
+                "text": "the brown quick fox immediately jumps over the really lazy dog",
+            },
+            {
+                "id": 20,
+                "text": "the brown quick fox immediately jumps over the really lazy blue dog",
+            },
+            {
+                "id": 21,
+                "text": "the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.",
+            },
+            {
+                "id": 22,
+                "text": "the, quick, brown, fox, jumps, over, the, lazy, dog",
+            }
+        ]))
+        .unwrap();
+    index
+}
+
+#[test]
+fn test_words_tms_last_simple() {
+    let index = create_index();
+
+    let txn = index.read_txn().unwrap();
+    let mut s = Search::new(&txn, &index);
+    s.query("the quick brown fox jumps over the lazy dog");
+    s.terms_matching_strategy(TermsMatchingStrategy::Last);
+    let SearchResult { documents_ids, ..
} = s.execute().unwrap(); + + // 6 and 7 have the same score because "the" appears twice + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 10, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 8, 6, 7, 5, 4, 11, 12, 3]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"the brown quick fox jumps over the lazy dog\"", + "\"the mighty and quick brown fox jumps over the lazy dog\"", + "\"the great quick brown fox jumps over the lazy dog\"", + "\"this quick brown and very scary fox jumps over the lazy dog\"", + "\"this quick brown and scary fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the really lazy dog\"", + "\"the brown quick fox jumps over the really lazy dog\"", + "\"the brown quick fox immediately jumps over the really lazy dog\"", + "\"the brown quick fox immediately jumps over the really lazy blue dog\"", + "\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.\"", + "\"the, quick, brown, fox, jumps, over, the, lazy, dog\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over\"", + "\"the quick brown fox jumps over the\"", + "\"the quick brown fox jumps\"", + "\"the quick brown fox\"", + "\"the quick brown fox talks to the lazy and slow dog\"", + "\"the quick brown fox talks to the lazy dog\"", + "\"the quick brown\"", + ] + "###); + + let mut s = Search::new(&txn, &index); + s.query("extravagant the quick brown fox jumps over the lazy dog"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]"); +} + +#[test] +fn test_words_tms_last_phrase() { + let index = create_index(); + + let txn = index.read_txn().unwrap(); + let mut s = Search::new(&txn, &index); + s.query("\"the quick brown fox\" jumps over the lazy dog"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + + // "The quick brown fox" is a phrase, not deleted by this term matching strategy + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 17, 21, 8, 6, 7, 5, 4, 11, 12]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the really lazy dog\"", + "\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over\"", + "\"the quick brown fox jumps over the\"", + "\"the quick brown fox jumps\"", + "\"the quick brown fox\"", + "\"the quick brown fox talks to the lazy and slow dog\"", + "\"the quick brown fox talks to the lazy dog\"", + ] + "###); + + let mut s = Search::new(&txn, &index); + s.query("\"the quick brown fox\" jumps over the \"lazy\" dog"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. 
} = s.execute().unwrap(); + + // "lazy" is a phrase, not deleted by this term matching strategy + // but words before it can be deleted + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 17, 21, 8, 11, 12]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the really lazy dog\"", + "\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox talks to the lazy and slow dog\"", + "\"the quick brown fox talks to the lazy dog\"", + ] + "###); + + let mut s = Search::new(&txn, &index); + s.query("\"the quick brown fox jumps over the lazy dog\""); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + + // The whole query is a phrase, no terms are removed + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + ] + "###); + + let mut s = Search::new(&txn, &index); + s.query("\"the quick brown fox jumps over the lazy dog"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + + // The whole query is still a phrase, even without closing quotes, so no terms are removed + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + ] + "###); +} + +#[test] +fn test_words_proximity_tms_last_simple() { + let index = create_index(); + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Words, Criterion::Proximity]); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + let mut s = Search::new(&txn, &index); + s.query("the quick brown fox jumps over the lazy dog"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + + // 7 is better than 6 because of the proximity between "the" and its surrounding terms + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 19, 20, 16, 15, 22, 8, 7, 6, 5, 4, 11, 12, 3]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. 
+
+#[test]
+fn test_words_proximity_tms_last_simple() {
+    let index = create_index();
+    index
+        .update_settings(|s| {
+            s.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
+        })
+        .unwrap();
+
+    let txn = index.read_txn().unwrap();
+    let mut s = Search::new(&txn, &index);
+    s.query("the quick brown fox jumps over the lazy dog");
+    s.terms_matching_strategy(TermsMatchingStrategy::Last);
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    // 7 is better than 6 because of the proximity between "the" and its surrounding terms
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 19, 20, 16, 15, 22, 8, 7, 6, 5, 4, 11, 12, 3]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"the quick brown fox jumps over the lazy dog\"",
+        "\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.\"",
+        "\"the great quick brown fox jumps over the lazy dog\"",
+        "\"the quick brown fox jumps over the really lazy dog\"",
+        "\"the mighty and quick brown fox jumps over the lazy dog\"",
+        "\"the brown quick fox jumps over the lazy dog\"",
+        "\"the brown quick fox jumps over the really lazy dog\"",
+        "\"the brown quick fox immediately jumps over the really lazy dog\"",
+        "\"the brown quick fox immediately jumps over the really lazy blue dog\"",
+        "\"this quick brown and scary fox jumps over the lazy dog\"",
+        "\"this quick brown and very scary fox jumps over the lazy dog\"",
+        "\"the, quick, brown, fox, jumps, over, the, lazy, dog\"",
+        "\"the quick brown fox jumps over the lazy\"",
+        "\"the quick brown fox jumps over the\"",
+        "\"the quick brown fox jumps over\"",
+        "\"the quick brown fox jumps\"",
+        "\"the quick brown fox\"",
+        "\"the quick brown fox talks to the lazy and slow dog\"",
+        "\"the quick brown fox talks to the lazy dog\"",
+        "\"the quick brown\"",
+    ]
+    "###);
+
+    let mut s = Search::new(&txn, &index);
+    s.query("the brown quick fox jumps over the lazy dog");
+    s.terms_matching_strategy(TermsMatchingStrategy::Last);
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    // 10 is better than 9 because of the proximity between "quick" and "brown"
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[10, 18, 19, 9, 20, 21, 14, 17, 13, 16, 15, 22, 8, 7, 6, 5, 4, 11, 12, 3]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"the brown quick fox jumps over the lazy dog\"",
+        "\"the brown quick fox jumps over the really lazy dog\"",
+        "\"the brown quick fox immediately jumps over the really lazy dog\"",
+        "\"the quick brown fox jumps over the lazy dog\"",
+        "\"the brown quick fox immediately jumps over the really lazy blue dog\"",
+        "\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.\"",
+        "\"the great quick brown fox jumps over the lazy dog\"",
+        "\"the quick brown fox jumps over the really lazy dog\"",
+        "\"the mighty and quick brown fox jumps over the lazy dog\"",
+        "\"this quick brown and scary fox jumps over the lazy dog\"",
+        "\"this quick brown and very scary fox jumps over the lazy dog\"",
+        "\"the, quick, brown, fox, jumps, over, the, lazy, dog\"",
+        "\"the quick brown fox jumps over the lazy\"",
+        "\"the quick brown fox jumps over the\"",
+        "\"the quick brown fox jumps over\"",
+        "\"the quick brown fox jumps\"",
+        "\"the quick brown fox\"",
+        "\"the quick brown fox talks to the lazy and slow dog\"",
+        "\"the quick brown fox talks to the lazy dog\"",
+        "\"the quick brown\"",
+    ]
+    "###);
+}
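+
+// Explanatory note: for the proximity rule a phrase behaves as an indivisible
+// unit. The distance between the words inside "quick brown" is fixed by the
+// phrase itself, so only the distances between the phrase's edge words and the
+// neighboring query words ("the" and "fox") should affect the ranking.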
+
+#[test]
+fn test_words_proximity_tms_last_phrase() {
+    let index = create_index();
+    index
+        .update_settings(|s| {
+            s.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
+        })
+        .unwrap();
+
+    let txn = index.read_txn().unwrap();
+    let mut s = Search::new(&txn, &index);
+    s.query("the \"quick brown\" fox jumps over the lazy dog");
+    s.terms_matching_strategy(TermsMatchingStrategy::Last);
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    // "quick brown" is a phrase. The proximity of its first and last words
+    // to their adjacent query words should be taken into account
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 16, 15, 8, 7, 6, 5, 4, 11, 12, 3]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"the quick brown fox jumps over the lazy dog\"",
+        "\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.\"",
+        "\"the great quick brown fox jumps over the lazy dog\"",
+        "\"the quick brown fox jumps over the really lazy dog\"",
+        "\"the mighty and quick brown fox jumps over the lazy dog\"",
+        "\"this quick brown and scary fox jumps over the lazy dog\"",
+        "\"this quick brown and very scary fox jumps over the lazy dog\"",
+        "\"the quick brown fox jumps over the lazy\"",
+        "\"the quick brown fox jumps over the\"",
+        "\"the quick brown fox jumps over\"",
+        "\"the quick brown fox jumps\"",
+        "\"the quick brown fox\"",
+        "\"the quick brown fox talks to the lazy and slow dog\"",
+        "\"the quick brown fox talks to the lazy dog\"",
+        "\"the quick brown\"",
+    ]
+    "###);
+
+    let mut s = Search::new(&txn, &index);
+    s.query("the \"quick brown\" \"fox jumps\" over the lazy dog");
+    s.terms_matching_strategy(TermsMatchingStrategy::Last);
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    // "quick brown" is a phrase. The proximity of its first and last words
+    // to their adjacent query words should be taken into account.
+    // The same applies to "fox jumps".
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 16, 15, 8, 7, 6, 5]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"the quick brown fox jumps over the lazy dog\"",
+        "\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.\"",
+        "\"the great quick brown fox jumps over the lazy dog\"",
+        "\"the quick brown fox jumps over the really lazy dog\"",
+        "\"the mighty and quick brown fox jumps over the lazy dog\"",
+        "\"this quick brown and scary fox jumps over the lazy dog\"",
+        "\"this quick brown and very scary fox jumps over the lazy dog\"",
+        "\"the quick brown fox jumps over the lazy\"",
+        "\"the quick brown fox jumps over the\"",
+        "\"the quick brown fox jumps over\"",
+        "\"the quick brown fox jumps\"",
+    ]
+    "###);
+}
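+
+// Explanatory note: `TermsMatchingStrategy::All` never removes terms from the
+// query, so only documents containing all of the query terms are returned; the
+// lower-word-count buckets seen in the `Last` tests above disappear.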
+
+#[test]
+fn test_words_tms_all() {
+    let index = create_index();
+    index
+        .update_settings(|s| {
+            s.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
+        })
+        .unwrap();
+
+    let txn = index.read_txn().unwrap();
+    let mut s = Search::new(&txn, &index);
+    s.query("the quick brown fox jumps over the lazy dog");
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 21, 14, 17, 13, 10, 18, 19, 20, 16, 15, 22]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"the quick brown fox jumps over the lazy dog\"",
+        "\"the quick brown. quick brown fox. brown fox jumps. fox jumps over. over the lazy. the lazy dog.\"",
+        "\"the great quick brown fox jumps over the lazy dog\"",
+        "\"the quick brown fox jumps over the really lazy dog\"",
+        "\"the mighty and quick brown fox jumps over the lazy dog\"",
+        "\"the brown quick fox jumps over the lazy dog\"",
+        "\"the brown quick fox jumps over the really lazy dog\"",
+        "\"the brown quick fox immediately jumps over the really lazy dog\"",
+        "\"the brown quick fox immediately jumps over the really lazy blue dog\"",
+        "\"this quick brown and scary fox jumps over the lazy dog\"",
+        "\"this quick brown and very scary fox jumps over the lazy dog\"",
+        "\"the, quick, brown, fox, jumps, over, the, lazy, dog\"",
+    ]
+    "###);
+
+    let mut s = Search::new(&txn, &index);
+    s.query("extravagant");
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @"[]");
+}
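+
+// Note on the last query above: under `All` there is no fallback to queries with
+// fewer terms, so "extravagant", which appears in no document of the corpus,
+// yields an empty result just as it did under `Last`.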