diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs
index e9435f180..f5f32fecf 100644
--- a/milli/src/search/facet/facet_distribution.rs
+++ b/milli/src/search/facet/facet_distribution.rs
@@ -73,7 +73,7 @@ impl<'a> FacetDistribution<'a> {
         let distribution_prelength = distribution.len();
         let db = self.index.field_id_docid_facet_f64s;
-        for docid in candidates.into_iter() {
+        for docid in candidates {
             key_buffer.truncate(mem::size_of::<FieldId>());
             key_buffer.extend_from_slice(&docid.to_be_bytes());
 
             let iter = db
@@ -97,7 +97,7 @@ impl<'a> FacetDistribution<'a> {
         let mut key_buffer: Vec<_> = field_id.to_be_bytes().to_vec();
         let db = self.index.field_id_docid_facet_strings;
-        'outer: for docid in candidates.into_iter() {
+        'outer: for docid in candidates {
             key_buffer.truncate(mem::size_of::<FieldId>());
             key_buffer.extend_from_slice(&docid.to_be_bytes());
 
             let iter = db
@@ -505,7 +505,7 @@ mod tests {
 
         let map = FacetDistribution::new(&txn, &index)
             .facets(std::iter::once("colour"))
-            .candidates((0..10_000).into_iter().collect())
+            .candidates((0..10_000).collect())
             .execute()
             .unwrap();
 
@@ -513,7 +513,7 @@ mod tests {
 
         let map = FacetDistribution::new(&txn, &index)
            .facets(std::iter::once("colour"))
-            .candidates((0..5_000).into_iter().collect())
+            .candidates((0..5_000).collect())
             .execute()
             .unwrap();
 
@@ -521,7 +521,7 @@ mod tests {
 
         let map = FacetDistribution::new(&txn, &index)
             .facets(std::iter::once("colour"))
-            .candidates((0..5_000).into_iter().collect())
+            .candidates((0..5_000).collect())
             .execute()
             .unwrap();
 
@@ -529,7 +529,7 @@ mod tests {
 
         let map = FacetDistribution::new(&txn, &index)
             .facets(std::iter::once("colour"))
-            .candidates((0..5_000).into_iter().collect())
+            .candidates((0..5_000).collect())
             .max_values_per_facet(1)
             .execute()
             .unwrap();
@@ -546,7 +546,7 @@ mod tests {
             .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
             .unwrap();
 
-        let facet_values = (0..1000).into_iter().map(|x| format!("{x:x}")).collect::<Vec<_>>();
+        let facet_values = (0..1000).map(|x| format!("{x:x}")).collect::<Vec<_>>();
 
         let mut documents = vec![];
         for i in 0..10_000 {
@@ -582,7 +582,7 @@ mod tests {
 
         let map = FacetDistribution::new(&txn, &index)
             .facets(std::iter::once("colour"))
-            .candidates((0..10_000).into_iter().collect())
+            .candidates((0..10_000).collect())
             .execute()
             .unwrap();
 
@@ -590,7 +590,7 @@ mod tests {
 
         let map = FacetDistribution::new(&txn, &index)
             .facets(std::iter::once("colour"))
-            .candidates((0..5_000).into_iter().collect())
+            .candidates((0..5_000).collect())
             .execute()
             .unwrap();
 
@@ -606,7 +606,7 @@ mod tests {
             .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
             .unwrap();
 
-        let facet_values = (0..1000).into_iter().collect::<Vec<_>>();
+        let facet_values = (0..1000).collect::<Vec<_>>();
 
         let mut documents = vec![];
         for i in 0..1000 {
@@ -634,7 +634,7 @@ mod tests {
 
         let map = FacetDistribution::new(&txn, &index)
             .facets(std::iter::once("colour"))
-            .candidates((0..1000).into_iter().collect())
+            .candidates((0..1000).collect())
             .compute_stats()
             .unwrap();
 
@@ -642,7 +642,7 @@ mod tests {
 
         let map = FacetDistribution::new(&txn, &index)
             .facets(std::iter::once("colour"))
-            .candidates((217..777).into_iter().collect())
+            .candidates((217..777).collect())
             .compute_stats()
             .unwrap();
 
@@ -658,7 +658,7 @@ mod tests {
             .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
             .unwrap();
 
-        let facet_values = (0..1000).into_iter().collect::<Vec<_>>();
+        let facet_values = (0..1000).collect::<Vec<_>>();
 
         let mut documents = vec![];
         for i in 0..1000 {
@@ -686,7 +686,7 @@ mod tests {
 
         let map = FacetDistribution::new(&txn, &index)
             .facets(std::iter::once("colour"))
-            .candidates((0..1000).into_iter().collect())
+            .candidates((0..1000).collect())
             .compute_stats()
             .unwrap();
 
@@ -694,7 +694,7 @@ mod tests {
 
         let map = FacetDistribution::new(&txn, &index)
             .facets(std::iter::once("colour"))
-            .candidates((217..777).into_iter().collect())
+            .candidates((217..777).collect())
             .compute_stats()
             .unwrap();
 
@@ -710,7 +710,7 @@ mod tests {
             .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
             .unwrap();
 
-        let facet_values = (0..1000).into_iter().collect::<Vec<_>>();
+        let facet_values = (0..1000).collect::<Vec<_>>();
 
         let mut documents = vec![];
         for i in 0..1000 {
@@ -738,7 +738,7 @@ mod tests {
 
         let map = FacetDistribution::new(&txn, &index)
             .facets(std::iter::once("colour"))
-            .candidates((0..1000).into_iter().collect())
+            .candidates((0..1000).collect())
             .compute_stats()
             .unwrap();
 
@@ -746,7 +746,7 @@ mod tests {
 
         let map = FacetDistribution::new(&txn, &index)
             .facets(std::iter::once("colour"))
-            .candidates((217..777).into_iter().collect())
+            .candidates((217..777).collect())
             .compute_stats()
             .unwrap();
 
@@ -762,7 +762,7 @@ mod tests {
             .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") }))
             .unwrap();
 
-        let facet_values = (0..1000).into_iter().collect::<Vec<_>>();
+        let facet_values = (0..1000).collect::<Vec<_>>();
 
         let mut documents = vec![];
        for i in 0..1000 {
@@ -794,7 +794,7 @@ mod tests {
 
         let map = FacetDistribution::new(&txn, &index)
             .facets(std::iter::once("colour"))
-            .candidates((0..1000).into_iter().collect())
+            .candidates((0..1000).collect())
             .compute_stats()
             .unwrap();
 
@@ -802,7 +802,7 @@ mod tests {
 
         let map = FacetDistribution::new(&txn, &index)
             .facets(std::iter::once("colour"))
-            .candidates((217..777).into_iter().collect())
+            .candidates((217..777).collect())
             .compute_stats()
             .unwrap();
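Note: every hunk in `facet_distribution.rs` above is the same mechanical cleanup. A `for` loop already desugars through `IntoIterator`, and ranges are already iterators, so the explicit `.into_iter()` calls were no-ops (clippy reports the loop form as `explicit_into_iter_loop`). A minimal sketch of the equivalence, outside this codebase (`roaring` is a real dependency of milli; the function name is made up):

```rust
use roaring::RoaringBitmap;

// Consumes `candidates` exactly as `for docid in candidates.into_iter()`
// would: the `for` loop itself calls `IntoIterator::into_iter`.
fn sum_of_candidates(candidates: RoaringBitmap) -> u64 {
    let mut sum = 0u64;
    for docid in candidates {
        sum += u64::from(docid);
    }
    sum
}

fn main() {
    // Ranges implement `Iterator` directly, so `(0..10_000).into_iter()`
    // before `collect()` added nothing either.
    let candidates: RoaringBitmap = (0..10_000).collect();
    assert_eq!(sum_of_candidates(candidates), 10_000 * 9_999 / 2);
}
```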
{ S("colour") })) .unwrap(); - let facet_values = (0..1000).into_iter().collect::>(); + let facet_values = (0..1000).collect::>(); let mut documents = vec![]; for i in 0..1000 { @@ -686,7 +686,7 @@ mod tests { let map = FacetDistribution::new(&txn, &index) .facets(std::iter::once("colour")) - .candidates((0..1000).into_iter().collect()) + .candidates((0..1000).collect()) .compute_stats() .unwrap(); @@ -694,7 +694,7 @@ mod tests { let map = FacetDistribution::new(&txn, &index) .facets(std::iter::once("colour")) - .candidates((217..777).into_iter().collect()) + .candidates((217..777).collect()) .compute_stats() .unwrap(); @@ -710,7 +710,7 @@ mod tests { .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") })) .unwrap(); - let facet_values = (0..1000).into_iter().collect::>(); + let facet_values = (0..1000).collect::>(); let mut documents = vec![]; for i in 0..1000 { @@ -738,7 +738,7 @@ mod tests { let map = FacetDistribution::new(&txn, &index) .facets(std::iter::once("colour")) - .candidates((0..1000).into_iter().collect()) + .candidates((0..1000).collect()) .compute_stats() .unwrap(); @@ -746,7 +746,7 @@ mod tests { let map = FacetDistribution::new(&txn, &index) .facets(std::iter::once("colour")) - .candidates((217..777).into_iter().collect()) + .candidates((217..777).collect()) .compute_stats() .unwrap(); @@ -762,7 +762,7 @@ mod tests { .update_settings(|settings| settings.set_filterable_fields(hashset! { S("colour") })) .unwrap(); - let facet_values = (0..1000).into_iter().collect::>(); + let facet_values = (0..1000).collect::>(); let mut documents = vec![]; for i in 0..1000 { @@ -794,7 +794,7 @@ mod tests { let map = FacetDistribution::new(&txn, &index) .facets(std::iter::once("colour")) - .candidates((0..1000).into_iter().collect()) + .candidates((0..1000).collect()) .compute_stats() .unwrap(); @@ -802,7 +802,7 @@ mod tests { let map = FacetDistribution::new(&txn, &index) .facets(std::iter::once("colour")) - .candidates((217..777).into_iter().collect()) + .candidates((217..777).collect()) .compute_stats() .unwrap(); diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index bb3c75343..d355b981a 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -142,7 +142,7 @@ mod tests { let indexes = [get_simple_index(), get_random_looking_index()]; for (i, index) in indexes.iter().enumerate() { let txn = index.env.read_txn().unwrap(); - let candidates = (0..=255).into_iter().collect::(); + let candidates = (0..=255).collect::(); let mut results = String::new(); iterate_over_facet_distribution( &txn, @@ -166,7 +166,7 @@ mod tests { let indexes = [get_simple_index(), get_random_looking_index()]; for (i, index) in indexes.iter().enumerate() { let txn = index.env.read_txn().unwrap(); - let candidates = (0..=255).into_iter().collect::(); + let candidates = (0..=255).collect::(); let mut results = String::new(); let mut nbr_facets = 0; iterate_over_facet_distribution( diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index b1ab6f71f..26854bc1a 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -410,7 +410,7 @@ mod tests { let mut results = String::new(); - for i in (0..=255).into_iter().rev() { + for i in (0..=255).rev() { let i = i as f64; let start = Bound::Included(i); let end = Bound::Included(255.); @@ -431,7 +431,7 @@ mod 
diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs
index f59b884de..892401c08 100644
--- a/milli/src/search/facet/facet_sort_ascending.rs
+++ b/milli/src/search/facet/facet_sort_ascending.rs
@@ -132,7 +132,7 @@ mod tests {
         let indexes = [get_simple_index(), get_random_looking_index()];
         for (i, index) in indexes.iter().enumerate() {
             let txn = index.env.read_txn().unwrap();
-            let candidates = (200..=300).into_iter().collect::<RoaringBitmap>();
+            let candidates = (200..=300).collect::<RoaringBitmap>();
             let mut results = String::new();
             let iter = ascending_facet_sort(&txn, index.content, 0, candidates).unwrap();
             for el in iter {
@@ -154,7 +154,7 @@ mod tests {
         ];
         for (i, index) in indexes.iter().enumerate() {
             let txn = index.env.read_txn().unwrap();
-            let candidates = (200..=300).into_iter().collect::<RoaringBitmap>();
+            let candidates = (200..=300).collect::<RoaringBitmap>();
             let mut results = String::new();
             let iter = ascending_facet_sort(&txn, index.content, 0, candidates.clone()).unwrap();
             for el in iter {
diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs
index 454b12859..549f50f0a 100644
--- a/milli/src/search/facet/facet_sort_descending.rs
+++ b/milli/src/search/facet/facet_sort_descending.rs
@@ -142,7 +142,7 @@ mod tests {
         ];
         for (i, index) in indexes.iter().enumerate() {
             let txn = index.env.read_txn().unwrap();
-            let candidates = (200..=300).into_iter().collect::<RoaringBitmap>();
+            let candidates = (200..=300).collect::<RoaringBitmap>();
             let mut results = String::new();
             let db = index.content.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>();
             let iter = descending_facet_sort(&txn, db, 0, candidates).unwrap();
@@ -165,7 +165,7 @@ mod tests {
         ];
         for (i, index) in indexes.iter().enumerate() {
             let txn = index.env.read_txn().unwrap();
-            let candidates = (200..=300).into_iter().collect::<RoaringBitmap>();
+            let candidates = (200..=300).collect::<RoaringBitmap>();
             let mut results = String::new();
             let db = index.content.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>();
             let iter = descending_facet_sort(&txn, db, 0, candidates.clone()).unwrap();
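Note: the sort tests build their candidate sets by collecting a range straight into a `RoaringBitmap`, which works because `RoaringBitmap` implements `FromIterator<u32>`; the removed `.into_iter()` was a no-op for the same reason as above. A sketch under that assumption (`candidate_window` is a made-up name):

```rust
use roaring::RoaringBitmap;

// A plain `collect()` on the inclusive range builds the 101-element
// candidate set the sort tests iterate over.
fn candidate_window() -> RoaringBitmap {
    (200..=300).collect::<RoaringBitmap>()
}

fn main() {
    let candidates = candidate_window();
    assert_eq!(candidates.len(), 101);
    assert!(candidates.contains(250));
}
```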
diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs
index 1015b01cb..46829b986 100644
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@@ -1,21 +1,14 @@
 pub use self::facet::{FacetDistribution, Filter, DEFAULT_VALUES_PER_FACET};
-use self::fst_utils::{Complement, Intersection, StartsWith, Union};
 pub use self::matches::{
     FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWord, MatchingWords,
 };
 use crate::{
     execute_search, AscDesc, DefaultSearchLogger, DocumentId, Index, Result, SearchContext,
 };
-use fst::automaton::Str;
-use fst::{Automaton, IntoStreamer, Streamer};
 use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA};
 use once_cell::sync::Lazy;
 use roaring::bitmap::RoaringBitmap;
-use std::borrow::Cow;
-use std::collections::hash_map::{Entry, HashMap};
 use std::fmt;
-use std::result::Result as StdResult;
-use std::str::Utf8Error;
 
 // Building these factories is not free.
 static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));
@@ -26,7 +19,6 @@ pub mod facet;
 mod fst_utils;
 mod matches;
 pub mod new;
-mod query_tree;
 
 pub struct Search<'a> {
     query: Option<String>,
@@ -200,70 +192,6 @@ impl Default for TermsMatchingStrategy {
     }
 }
 
-pub type WordDerivationsCache = HashMap<(String, bool, u8), Vec<(String, u8)>>;
-
-pub fn word_derivations<'c>(
-    word: &str,
-    is_prefix: bool,
-    max_typo: u8,
-    fst: &fst::Set<Cow<[u8]>>,
-    cache: &'c mut WordDerivationsCache,
-) -> StdResult<&'c [(String, u8)], Utf8Error> {
-    match cache.entry((word.to_string(), is_prefix, max_typo)) {
-        Entry::Occupied(entry) => Ok(entry.into_mut()),
-        Entry::Vacant(entry) => {
-            // println!("word derivations {word} {is_prefix} {max_typo}");
-            let mut derived_words = Vec::new();
-            if max_typo == 0 {
-                if is_prefix {
-                    let prefix = Str::new(word).starts_with();
-                    let mut stream = fst.search(prefix).into_stream();
-
-                    while let Some(word) = stream.next() {
-                        let word = std::str::from_utf8(word)?;
-                        derived_words.push((word.to_string(), 0));
-                    }
-                } else if fst.contains(word) {
-                    derived_words.push((word.to_string(), 0));
-                }
-            } else if max_typo == 1 {
-                let dfa = build_dfa(word, 1, is_prefix);
-                let starts = StartsWith(Str::new(get_first(word)));
-                let mut stream = fst.search_with_state(Intersection(starts, &dfa)).into_stream();
-
-                while let Some((word, state)) = stream.next() {
-                    let word = std::str::from_utf8(word)?;
-                    let d = dfa.distance(state.1);
-                    derived_words.push((word.to_string(), d.to_u8()));
-                }
-            } else {
-                let starts = StartsWith(Str::new(get_first(word)));
-                let first = Intersection(build_dfa(word, 1, is_prefix), Complement(&starts));
-                let second_dfa = build_dfa(word, 2, is_prefix);
-                let second = Intersection(&second_dfa, &starts);
-                let automaton = Union(first, &second);
-
-                let mut stream = fst.search_with_state(automaton).into_stream();
-
-                while let Some((found_word, state)) = stream.next() {
-                    let found_word = std::str::from_utf8(found_word)?;
-                    // in the case the typo is on the first letter, we know the number of typo
-                    // is two
-                    if get_first(found_word) != get_first(word) {
-                        derived_words.push((found_word.to_string(), 2));
-                    } else {
-                        // Else, we know that it is the second dfa that matched and compute the
-                        // correct distance
-                        let d = second_dfa.distance((state.1).0);
-                        derived_words.push((found_word.to_string(), d.to_u8()));
-                    }
-                }
-            }
-            Ok(entry.insert(derived_words))
-        }
-    }
-}
-
 fn get_first(s: &str) -> &str {
     match s.chars().next() {
         Some(c) => &s[..c.len_utf8()],
@@ -337,66 +265,66 @@ mod test {
         assert!(!search.is_typo_authorized().unwrap());
     }
 
-    #[test]
-    fn test_one_typos_tolerance() {
-        let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
-        let mut cache = HashMap::new();
-        let found = word_derivations("zealend", false, 1, &fst, &mut cache).unwrap();
+    // #[test]
+    // fn test_one_typos_tolerance() {
+    //     let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
+    //     let mut cache = HashMap::new();
+    //     let found = word_derivations("zealend", false, 1, &fst, &mut cache).unwrap();
 
-        assert_eq!(found, &[("zealand".to_string(), 1)]);
-    }
+    //     assert_eq!(found, &[("zealand".to_string(), 1)]);
+    // }
 
-    #[test]
-    fn test_one_typos_first_letter() {
-        let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap();
-        let mut cache = HashMap::new();
-        let found = word_derivations("sealand", false, 1, &fst, &mut cache).unwrap();
+    // #[test]
+
// fn test_one_typos_first_letter() { + // let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); + // let mut cache = HashMap::new(); + // let found = word_derivations("sealand", false, 1, &fst, &mut cache).unwrap(); - assert_eq!(found, &[]); - } + // assert_eq!(found, &[]); + // } - #[test] - fn test_two_typos_tolerance() { - let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); - let mut cache = HashMap::new(); - let found = word_derivations("zealemd", false, 2, &fst, &mut cache).unwrap(); + // #[test] + // fn test_two_typos_tolerance() { + // let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); + // let mut cache = HashMap::new(); + // let found = word_derivations("zealemd", false, 2, &fst, &mut cache).unwrap(); - assert_eq!(found, &[("zealand".to_string(), 2)]); - } + // assert_eq!(found, &[("zealand".to_string(), 2)]); + // } - #[test] - fn test_two_typos_first_letter() { - let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); - let mut cache = HashMap::new(); - let found = word_derivations("sealand", false, 2, &fst, &mut cache).unwrap(); + // #[test] + // fn test_two_typos_first_letter() { + // let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); + // let mut cache = HashMap::new(); + // let found = word_derivations("sealand", false, 2, &fst, &mut cache).unwrap(); - assert_eq!(found, &[("zealand".to_string(), 2)]); - } + // assert_eq!(found, &[("zealand".to_string(), 2)]); + // } - #[test] - fn test_prefix() { - let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); - let mut cache = HashMap::new(); - let found = word_derivations("ze", true, 0, &fst, &mut cache).unwrap(); + // #[test] + // fn test_prefix() { + // let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); + // let mut cache = HashMap::new(); + // let found = word_derivations("ze", true, 0, &fst, &mut cache).unwrap(); - assert_eq!(found, &[("zealand".to_string(), 0)]); - } + // assert_eq!(found, &[("zealand".to_string(), 0)]); + // } - #[test] - fn test_bad_prefix() { - let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); - let mut cache = HashMap::new(); - let found = word_derivations("se", true, 0, &fst, &mut cache).unwrap(); + // #[test] + // fn test_bad_prefix() { + // let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); + // let mut cache = HashMap::new(); + // let found = word_derivations("se", true, 0, &fst, &mut cache).unwrap(); - assert_eq!(found, &[]); - } + // assert_eq!(found, &[]); + // } - #[test] - fn test_prefix_with_typo() { - let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); - let mut cache = HashMap::new(); - let found = word_derivations("zae", true, 1, &fst, &mut cache).unwrap(); + // #[test] + // fn test_prefix_with_typo() { + // let fst = fst::Set::from_iter(["zealand"].iter()).unwrap().map_data(Cow::Owned).unwrap(); + // let mut cache = HashMap::new(); + // let found = word_derivations("zae", true, 1, &fst, &mut cache).unwrap(); - assert_eq!(found, &[("zealand".to_string(), 1)]); - } + // assert_eq!(found, &[("zealand".to_string(), 1)]); + // } } diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 15c895583..45cad378a 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -15,10 +15,9 @@ mod 
sort; // TODO: documentation + comments mod words; -// #[cfg(test)] use std::collections::{BTreeSet, HashSet}; -use charabia::{Tokenize, TokenizerBuilder}; +use charabia::TokenizerBuilder; use db_cache::DatabaseCache; use graph_based_ranking_rule::{Proximity, Typo}; use heed::RoTxn; @@ -254,7 +253,7 @@ pub fn execute_search( } let tokenizer = tokbuilder.build(); - let tokens = tokenizer.tokenize(&query); + let tokens = tokenizer.tokenize(query); let query_terms = located_query_terms_from_string(ctx, tokens, words_limit)?; let graph = QueryGraph::from_query(ctx, query_terms)?; diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs deleted file mode 100755 index 1b1a42c1c..000000000 --- a/milli/src/search/query_tree.rs +++ /dev/null @@ -1,1435 +0,0 @@ -use std::borrow::Cow; -use std::cmp::max; -use std::collections::hash_map::Entry; -use std::collections::HashMap; -use std::hash::Hash; -use std::rc::Rc; -use std::{fmt, mem}; - -use charabia::normalizer::NormalizedTokenIter; -use charabia::{SeparatorKind, TokenKind}; -use roaring::RoaringBitmap; -use slice_group_by::GroupBy; - -use crate::search::matches::matching_words::{MatchingWord, PrimitiveWordId}; -use crate::search::TermsMatchingStrategy; -use crate::{CboRoaringBitmapLenCodec, Index, MatchingWords, Result}; - -type IsOptionalWord = bool; -type IsPrefix = bool; - -#[derive(Clone, PartialEq, Eq, Hash)] -pub enum Operation { - And(Vec), - // series of consecutive non prefix and exact words - // `None` means a stop word. - Phrase(Vec>), - Or(IsOptionalWord, Vec), - Query(Query), -} - -impl fmt::Debug for Operation { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - fn pprint_tree(f: &mut fmt::Formatter<'_>, op: &Operation, depth: usize) -> fmt::Result { - match op { - Operation::And(children) => { - writeln!(f, "{:1$}AND", "", depth * 2)?; - children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1)) - } - Operation::Phrase(children) => { - writeln!(f, "{:2$}PHRASE {:?}", "", children, depth * 2) - } - Operation::Or(true, children) => { - writeln!(f, "{:1$}OR(WORD)", "", depth * 2)?; - children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1)) - } - Operation::Or(false, children) => { - writeln!(f, "{:1$}OR", "", depth * 2)?; - children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1)) - } - Operation::Query(query) => writeln!(f, "{:2$}{:?}", "", query, depth * 2), - } - } - - pprint_tree(f, self, 0) - } -} - -impl Operation { - fn and(mut ops: Vec) -> Self { - if ops.len() == 1 { - ops.pop().unwrap() - } else { - Self::And(ops) - } - } - - pub fn or(word_branch: IsOptionalWord, mut ops: Vec) -> Self { - if ops.len() == 1 { - ops.pop().unwrap() - } else { - let ops = ops - .into_iter() - .flat_map(|o| match o { - Operation::Or(wb, children) if wb == word_branch => children, - op => vec![op], - }) - .collect(); - Self::Or(word_branch, ops) - } - } - - fn phrase(mut words: Vec>) -> Self { - if words.len() == 1 { - if let Some(word) = words.pop().unwrap() { - Self::Query(Query { prefix: false, kind: QueryKind::exact(word) }) - } else { - Self::Phrase(words) - } - } else { - Self::Phrase(words) - } - } - - pub fn query(&self) -> Option<&Query> { - match self { - Operation::Query(query) => Some(query), - _ => None, - } - } -} - -#[derive(Clone, Eq, PartialEq, Hash)] -pub struct Query { - pub prefix: IsPrefix, - pub kind: QueryKind, -} - -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub enum QueryKind { - Tolerant { typo: u8, word: String }, - Exact { original_typo: u8, word: String }, -} - -impl 
QueryKind { - pub fn exact(word: String) -> Self { - QueryKind::Exact { original_typo: 0, word } - } - - pub fn tolerant(typo: u8, word: String) -> Self { - QueryKind::Tolerant { typo, word } - } - - pub fn typo(&self) -> u8 { - match self { - QueryKind::Tolerant { typo, .. } => *typo, - QueryKind::Exact { original_typo, .. } => *original_typo, - } - } - - pub fn word(&self) -> &str { - match self { - QueryKind::Tolerant { word, .. } => word, - QueryKind::Exact { word, .. } => word, - } - } -} - -impl fmt::Debug for Query { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - let Query { prefix, kind } = self; - let prefix = if *prefix { String::from("Prefix") } else { String::default() }; - match kind { - QueryKind::Exact { word, .. } => { - f.debug_struct(&(prefix + "Exact")).field("word", &word).finish() - } - QueryKind::Tolerant { typo, word } => f - .debug_struct(&(prefix + "Tolerant")) - .field("word", &word) - .field("max typo", &typo) - .finish(), - } - } -} - -trait Context { - fn word_docids(&self, word: &str) -> heed::Result>; - fn synonyms>(&self, words: &[S]) -> heed::Result>>>; - fn word_documents_count(&self, word: &str) -> heed::Result> { - match self.word_docids(word)? { - Some(rb) => Ok(Some(rb.len())), - None => Ok(None), - } - } - /// Returns the minimum word len for 1 and 2 typos. - fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>; - fn exact_words(&self) -> Option<&fst::Set>>; - fn word_pair_frequency( - &self, - left_word: &str, - right_word: &str, - proximity: u8, - ) -> heed::Result>; -} - -/// The query tree builder is the interface to build a query tree. -pub struct QueryTreeBuilder<'a> { - rtxn: &'a heed::RoTxn<'a>, - index: &'a Index, - terms_matching_strategy: TermsMatchingStrategy, - authorize_typos: bool, - words_limit: Option, - exact_words: Option>>, -} - -impl<'a> Context for QueryTreeBuilder<'a> { - fn word_docids(&self, word: &str) -> heed::Result> { - self.index.word_docids.get(self.rtxn, word) - } - - fn synonyms>(&self, words: &[S]) -> heed::Result>>> { - self.index.words_synonyms(self.rtxn, words) - } - - fn word_documents_count(&self, word: &str) -> heed::Result> { - self.index.word_documents_count(self.rtxn, word) - } - - fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> { - let one = self.index.min_word_len_one_typo(self.rtxn)?; - let two = self.index.min_word_len_two_typos(self.rtxn)?; - Ok((one, two)) - } - - fn exact_words(&self) -> Option<&fst::Set>> { - self.exact_words.as_ref() - } - - fn word_pair_frequency( - &self, - left_word: &str, - right_word: &str, - proximity: u8, - ) -> heed::Result> { - let key = (proximity, left_word, right_word); - self.index - .word_pair_proximity_docids - .remap_data_type::() - .get(self.rtxn, &key) - } -} - -impl<'a> QueryTreeBuilder<'a> { - /// Create a `QueryTreeBuilder` from a heed ReadOnly transaction `rtxn` - /// and an Index `index`. - pub fn new(rtxn: &'a heed::RoTxn<'a>, index: &'a Index) -> Result { - Ok(Self { - rtxn, - index, - terms_matching_strategy: TermsMatchingStrategy::default(), - authorize_typos: true, - words_limit: None, - exact_words: index.exact_words(rtxn)?, - }) - } - - /// if `terms_matching_strategy` is set to `All` the query tree will be - /// generated forcing all query words to be present in each matching documents - /// (the criterion `words` will be ignored). 
- /// default value if not called: `Last` - pub fn terms_matching_strategy( - &mut self, - terms_matching_strategy: TermsMatchingStrategy, - ) -> &mut Self { - self.terms_matching_strategy = terms_matching_strategy; - self - } - - /// if `authorize_typos` is set to `false` the query tree will be generated - /// forcing all query words to match documents without any typo - /// (the criterion `typo` will be ignored). - /// default value if not called: `true` - pub fn authorize_typos(&mut self, authorize_typos: bool) -> &mut Self { - self.authorize_typos = authorize_typos; - self - } - - /// Limit words and phrases that will be taken for query building. - /// Any beyond `words_limit` will be ignored. - pub fn words_limit(&mut self, words_limit: usize) -> &mut Self { - self.words_limit = Some(words_limit); - self - } - - /// Build the query tree: - /// - if `terms_matching_strategy` is set to `All` the query tree will be - /// generated forcing all query words to be present in each matching documents - /// (the criterion `words` will be ignored) - /// - if `authorize_typos` is set to `false` the query tree will be generated - /// forcing all query words to match documents without any typo - /// (the criterion `typo` will be ignored) - pub fn build>( - &self, - query: NormalizedTokenIter, - ) -> Result> { - let primitive_query = create_primitive_query(query, self.words_limit); - if !primitive_query.is_empty() { - let qt = create_query_tree( - self, - self.terms_matching_strategy, - self.authorize_typos, - &primitive_query, - )?; - let matching_words = - create_matching_words(self, self.authorize_typos, &primitive_query)?; - Ok(Some((qt, primitive_query, matching_words))) - } else { - Ok(None) - } - } -} - -/// Split the word depending on the frequency of pairs near together in the database documents. -fn split_best_frequency<'a>( - ctx: &impl Context, - word: &'a str, -) -> heed::Result> { - let chars = word.char_indices().skip(1); - let mut best = None; - - for (i, _) in chars { - let (left, right) = word.split_at(i); - - let pair_freq = ctx.word_pair_frequency(left, right, 1)?.unwrap_or(0); - - if pair_freq != 0 && best.map_or(true, |(old, _, _)| pair_freq > old) { - best = Some((pair_freq, left, right)); - } - } - - Ok(best.map(|(_, left, right)| (left, right))) -} - -#[derive(Clone)] -pub struct TypoConfig<'a> { - pub max_typos: u8, - pub word_len_one_typo: u8, - pub word_len_two_typo: u8, - pub exact_words: Option<&'a fst::Set>>, -} - -/// Return the `QueryKind` of a word depending on `authorize_typos` -/// and the provided word length. 
-fn typos(word: String, authorize_typos: bool, config: TypoConfig) -> QueryKind { - if authorize_typos && !config.exact_words.map_or(false, |s| s.contains(&word)) { - let count = word.chars().count().min(u8::MAX as usize) as u8; - if count < config.word_len_one_typo { - QueryKind::exact(word) - } else if count < config.word_len_two_typo { - QueryKind::tolerant(1.min(config.max_typos), word) - } else { - QueryKind::tolerant(2.min(config.max_typos), word) - } - } else { - QueryKind::exact(word) - } -} - -/// Fetch synonyms from the `Context` for the provided word -/// and create the list of operations for the query tree -fn synonyms(ctx: &impl Context, word: &[&str]) -> heed::Result>> { - let synonyms = ctx.synonyms(word)?; - Ok(synonyms.map(|synonyms| { - synonyms - .into_iter() - .map(|synonym| { - if synonym.len() == 1 { - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact(synonym[0].clone()), - }) - } else { - Operation::Phrase(synonym.into_iter().map(Some).collect()) - } - }) - .collect() - })) -} - -/// Main function that creates the final query tree from the primitive query. -fn create_query_tree( - ctx: &impl Context, - terms_matching_strategy: TermsMatchingStrategy, - authorize_typos: bool, - query: &[PrimitiveQueryPart], -) -> Result { - /// Matches on the `PrimitiveQueryPart` and create an operation from it. - fn resolve_primitive_part( - ctx: &impl Context, - authorize_typos: bool, - part: PrimitiveQueryPart, - ) -> Result { - match part { - // 1. try to split word in 2 - // 2. try to fetch synonyms - // 3. create an operation containing the word - // 4. wrap all in an OR operation - PrimitiveQueryPart::Word(word, prefix) => { - let mut children = synonyms(ctx, &[&word])?.unwrap_or_default(); - if let Some((left, right)) = split_best_frequency(ctx, &word)? { - children.push(Operation::Phrase(vec![ - Some(left.to_string()), - Some(right.to_string()), - ])); - } - let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?; - let exact_words = ctx.exact_words(); - let config = - TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words }; - children.push(Operation::Query(Query { - prefix, - kind: typos(word, authorize_typos, config), - })); - Ok(Operation::or(false, children)) - } - // create a CONSECUTIVE operation wrapping all word in the phrase - PrimitiveQueryPart::Phrase(words) => Ok(Operation::phrase(words)), - } - } - - /// Create all ngrams 1..=3 generating query tree branches. 
- fn ngrams( - ctx: &impl Context, - authorize_typos: bool, - query: &[PrimitiveQueryPart], - any_words: bool, - ) -> Result { - const MAX_NGRAM: usize = 3; - let mut op_children = Vec::new(); - - for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase())) { - let mut or_op_children = Vec::new(); - - for ngram in 1..=MAX_NGRAM.min(sub_query.len()) { - if let Some(group) = sub_query.get(..ngram) { - let mut and_op_children = Vec::new(); - let tail = &sub_query[ngram..]; - let is_last = tail.is_empty(); - - match group { - [part] => { - let operation = - resolve_primitive_part(ctx, authorize_typos, part.clone())?; - and_op_children.push(operation); - } - words => { - let is_prefix = words.last().map_or(false, |part| part.is_prefix()); - let words: Vec<_> = words - .iter() - .filter_map(|part| { - if let PrimitiveQueryPart::Word(word, _) = part { - Some(word.as_str()) - } else { - None - } - }) - .collect(); - let mut operations = synonyms(ctx, &words)?.unwrap_or_default(); - let concat = words.concat(); - let (word_len_one_typo, word_len_two_typo) = - ctx.min_word_len_for_typo()?; - let exact_words = ctx.exact_words(); - let config = TypoConfig { - max_typos: 1, - word_len_one_typo, - word_len_two_typo, - exact_words, - }; - let query = Query { - prefix: is_prefix, - kind: typos(concat, authorize_typos, config), - }; - operations.push(Operation::Query(query)); - and_op_children.push(Operation::or(false, operations)); - } - } - - if !is_last { - let ngrams = ngrams(ctx, authorize_typos, tail, any_words)?; - and_op_children.push(ngrams); - } - - if any_words { - or_op_children.push(Operation::or(false, and_op_children)); - } else { - or_op_children.push(Operation::and(and_op_children)); - } - } - } - op_children.push(Operation::or(false, or_op_children)); - } - - if any_words { - Ok(Operation::or(false, op_children)) - } else { - Ok(Operation::and(op_children)) - } - } - - let number_phrases = query.iter().filter(|p| p.is_phrase()).count(); - let remove_count = query.len() - max(number_phrases, 1); - if remove_count == 0 { - return ngrams(ctx, authorize_typos, query, false); - } - - let mut operation_children = Vec::new(); - let mut query = query.to_vec(); - for _ in 0..=remove_count { - let pos = match terms_matching_strategy { - TermsMatchingStrategy::All => return ngrams(ctx, authorize_typos, &query, false), - TermsMatchingStrategy::Last => query - .iter() - .enumerate() - .filter(|(_, part)| !part.is_phrase()) - .last() - .map(|(pos, _)| pos), - }; - - // compute and push the current branch on the front - operation_children.insert(0, ngrams(ctx, authorize_typos, &query, false)?); - // remove word from query before creating an new branch - match pos { - Some(pos) => query.remove(pos), - None => break, - }; - } - - Ok(Operation::or(true, operation_children)) -} - -#[derive(Default, Debug)] -struct MatchingWordCache { - all: Vec>, - map: HashMap<(String, u8, bool), Rc>, -} -impl MatchingWordCache { - fn insert(&mut self, word: String, typo: u8, prefix: bool) -> Option> { - match self.map.entry((word.clone(), typo, prefix)) { - Entry::Occupied(idx) => Some(idx.get().clone()), - Entry::Vacant(vacant) => { - let matching_word = Rc::new(MatchingWord::new(word, typo, prefix)?); - self.all.push(matching_word.clone()); - vacant.insert(matching_word.clone()); - Some(matching_word) - } - } - // To deactivate the cache, for testing purposes, use the following instead: - // let matching_word = Rc::new(MatchingWord::new(word, typo, prefix)?); - // 
self.all.push(matching_word.clone()); - // Some(matching_word) - } -} - -/// Main function that matchings words used for crop and highlight. -fn create_matching_words( - ctx: &impl Context, - authorize_typos: bool, - query: &[PrimitiveQueryPart], -) -> Result { - /// Matches on the `PrimitiveQueryPart` and create matchings words from it. - fn resolve_primitive_part( - ctx: &impl Context, - authorize_typos: bool, - part: PrimitiveQueryPart, - matching_words: &mut Vec<(Vec>, Vec)>, - matching_word_cache: &mut MatchingWordCache, - id: PrimitiveWordId, - ) -> Result<()> { - match part { - // 1. try to split word in 2 - // 2. try to fetch synonyms - PrimitiveQueryPart::Word(word, prefix) => { - if let Some(synonyms) = ctx.synonyms(&[word.as_str()])? { - for synonym in synonyms { - // Require that all words of the synonym have a corresponding MatchingWord - // before adding any of its words to the matching_words result. - if let Some(synonym_matching_words) = synonym - .into_iter() - .map(|word| matching_word_cache.insert(word, 0, false)) - .collect() - { - matching_words.push((synonym_matching_words, vec![id])); - } - } - } - - if let Some((left, right)) = split_best_frequency(ctx, &word)? { - // Require that both left and right words have a corresponding MatchingWord - // before adding them to the matching_words result - if let Some(left) = matching_word_cache.insert(left.to_string(), 0, false) { - if let Some(right) = matching_word_cache.insert(right.to_string(), 0, false) - { - matching_words.push((vec![left, right], vec![id])); - } - } - } - - let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?; - let exact_words = ctx.exact_words(); - let config = - TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words }; - - let matching_word = match typos(word, authorize_typos, config) { - QueryKind::Exact { word, .. } => matching_word_cache.insert(word, 0, prefix), - QueryKind::Tolerant { typo, word } => { - matching_word_cache.insert(word, typo, prefix) - } - }; - if let Some(matching_word) = matching_word { - matching_words.push((vec![matching_word], vec![id])); - } - } - // create a CONSECUTIVE matchings words wrapping all word in the phrase - PrimitiveQueryPart::Phrase(words) => { - let ids: Vec<_> = (0..words.len()).map(|i| id + i as PrimitiveWordId).collect(); - // Require that all words of the phrase have a corresponding MatchingWord - // before adding any of them to the matching_words result - if let Some(phrase_matching_words) = words - .into_iter() - .flatten() - .map(|w| matching_word_cache.insert(w, 0, false)) - .collect() - { - matching_words.push((phrase_matching_words, ids)); - } - } - } - - Ok(()) - } - - /// Create all ngrams 1..=3 generating query tree branches. 
- fn ngrams( - ctx: &impl Context, - authorize_typos: bool, - query: &[PrimitiveQueryPart], - matching_words: &mut Vec<(Vec>, Vec)>, - matching_word_cache: &mut MatchingWordCache, - mut id: PrimitiveWordId, - ) -> Result<()> { - const MAX_NGRAM: usize = 3; - - for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase())) { - for ngram in 1..=MAX_NGRAM.min(sub_query.len()) { - if let Some(group) = sub_query.get(..ngram) { - let tail = &sub_query[ngram..]; - let is_last = tail.is_empty(); - - match group { - [part] => { - resolve_primitive_part( - ctx, - authorize_typos, - part.clone(), - matching_words, - matching_word_cache, - id, - )?; - } - words => { - let is_prefix = words.last().map_or(false, |part| part.is_prefix()); - let words: Vec<_> = words - .iter() - .filter_map(|part| { - if let PrimitiveQueryPart::Word(word, _) = part { - Some(word.as_str()) - } else { - None - } - }) - .collect(); - let ids: Vec<_> = - (0..words.len()).map(|i| id + i as PrimitiveWordId).collect(); - - if let Some(synonyms) = ctx.synonyms(&words)? { - for synonym in synonyms { - if let Some(synonym) = synonym - .into_iter() - .map(|syn| matching_word_cache.insert(syn, 0, false)) - .collect() - { - matching_words.push((synonym, ids.clone())); - } - } - } - let word = words.concat(); - let (word_len_one_typo, word_len_two_typo) = - ctx.min_word_len_for_typo()?; - let exact_words = ctx.exact_words(); - let config = TypoConfig { - max_typos: 1, - word_len_one_typo, - word_len_two_typo, - exact_words, - }; - let matching_word = match typos(word, authorize_typos, config) { - QueryKind::Exact { word, .. } => { - matching_word_cache.insert(word, 0, is_prefix) - } - QueryKind::Tolerant { typo, word } => { - matching_word_cache.insert(word, typo, is_prefix) - } - }; - if let Some(matching_word) = matching_word { - matching_words.push((vec![matching_word], ids)); - } - } - } - - if !is_last { - ngrams( - ctx, - authorize_typos, - tail, - matching_words, - matching_word_cache, - id + 1, - )?; - } - } - } - id += sub_query.iter().map(|x| x.len() as PrimitiveWordId).sum::(); - } - - Ok(()) - } - - let mut matching_word_cache = MatchingWordCache::default(); - let mut matching_words = Vec::new(); - ngrams(ctx, authorize_typos, query, &mut matching_words, &mut matching_word_cache, 0)?; - MatchingWords::new(matching_words) -} - -pub type PrimitiveQuery = Vec; - -#[derive(Debug, Clone)] -pub enum PrimitiveQueryPart { - Phrase(Vec>), - Word(String, IsPrefix), -} - -impl PrimitiveQueryPart { - fn is_phrase(&self) -> bool { - matches!(self, Self::Phrase(_)) - } - - fn is_prefix(&self) -> bool { - matches!(self, Self::Word(_, is_prefix) if *is_prefix) - } - - fn len(&self) -> usize { - match self { - Self::Phrase(words) => words.len(), - Self::Word(_, _) => 1, - } - } -} - -/// Create primitive query from tokenized query string, -/// the primitive query is an intermediate state to build the query tree. -fn create_primitive_query( - query: NormalizedTokenIter, - words_limit: Option, -) -> PrimitiveQuery -where - A: AsRef<[u8]>, -{ - let mut primitive_query = Vec::new(); - let mut phrase = Vec::new(); - let mut quoted = false; - - let parts_limit = words_limit.unwrap_or(usize::MAX); - - let mut peekable = query.peekable(); - while let Some(token) = peekable.next() { - // early return if word limit is exceeded - if primitive_query.len() >= parts_limit { - return primitive_query; - } - - match token.kind { - TokenKind::Word | TokenKind::StopWord => { - // 1. 
if the word is quoted we push it in a phrase-buffer waiting for the ending quote, - // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word, - // 3. if the word is the last token of the query we push it as a prefix word. - if quoted { - if let TokenKind::StopWord = token.kind { - phrase.push(None) - } else { - phrase.push(Some(token.lemma().to_string())); - } - } else if peekable.peek().is_some() { - if let TokenKind::StopWord = token.kind { - } else { - primitive_query - .push(PrimitiveQueryPart::Word(token.lemma().to_string(), false)); - } - } else { - primitive_query.push(PrimitiveQueryPart::Word(token.lemma().to_string(), true)); - } - } - TokenKind::Separator(separator_kind) => { - let quote_count = token.lemma().chars().filter(|&s| s == '"').count(); - // swap quoted state if we encounter a double quote - if quote_count % 2 != 0 { - quoted = !quoted; - } - // if there is a quote or a hard separator we close the phrase. - if quote_count > 0 || separator_kind == SeparatorKind::Hard { - let phrase = mem::take(&mut phrase); - - // if the phrase only contains stop words, we don't keep it in the query. - if phrase.iter().any(|w| w.is_some()) { - primitive_query.push(PrimitiveQueryPart::Phrase(phrase)); - } - } - } - _ => (), - } - } - - // If a quote is never closed, we consider all of the end of the query as a phrase. - if phrase.iter().any(|w| w.is_some()) { - primitive_query.push(PrimitiveQueryPart::Phrase(mem::take(&mut phrase))); - } - - primitive_query -} - -/// Returns the maximum number of typos that this Operation allows. -pub fn maximum_typo(operation: &Operation) -> usize { - use Operation::{And, Or, Phrase, Query}; - match operation { - Or(_, ops) => ops.iter().map(maximum_typo).max().unwrap_or(0), - And(ops) => ops.iter().map(maximum_typo).sum::(), - Query(q) => q.kind.typo() as usize, - // no typo allowed in phrases - Phrase(_) => 0, - } -} - -/// Returns the maximum proximity that this Operation allows. 
-pub fn maximum_proximity(operation: &Operation) -> usize { - use Operation::{And, Or, Phrase, Query}; - match operation { - Or(_, ops) => ops.iter().map(maximum_proximity).max().unwrap_or(0), - And(ops) => { - ops.iter().map(maximum_proximity).sum::() + ops.len().saturating_sub(1) * 7 - } - Query(_) | Phrase(_) => 0, - } -} - -#[cfg(test)] -mod test { - use std::collections::HashMap; - - use charabia::Tokenize; - use maplit::hashmap; - use rand::rngs::StdRng; - use rand::{Rng, SeedableRng}; - - use super::*; - use crate::index::tests::TempIndex; - use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; - - #[derive(Debug)] - struct TestContext { - synonyms: HashMap, Vec>>, - postings: HashMap, - exact_words: Option>>, - } - - impl TestContext { - fn build>( - &self, - terms_matching_strategy: TermsMatchingStrategy, - authorize_typos: bool, - words_limit: Option, - query: NormalizedTokenIter, - ) -> Result> { - let primitive_query = create_primitive_query(query, words_limit); - if !primitive_query.is_empty() { - let qt = create_query_tree( - self, - terms_matching_strategy, - authorize_typos, - &primitive_query, - )?; - Ok(Some((qt, primitive_query))) - } else { - Ok(None) - } - } - } - - impl Context for TestContext { - fn word_docids(&self, word: &str) -> heed::Result> { - Ok(self.postings.get(word).cloned()) - } - - fn synonyms>(&self, words: &[S]) -> heed::Result>>> { - let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect(); - Ok(self.synonyms.get(&words).cloned()) - } - - fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> { - Ok((DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS)) - } - - fn exact_words(&self) -> Option<&fst::Set>> { - self.exact_words.as_ref() - } - - fn word_pair_frequency( - &self, - left_word: &str, - right_word: &str, - _proximity: u8, - ) -> heed::Result> { - match self.word_docids(&format!("{} {}", left_word, right_word))? { - Some(rb) => Ok(Some(rb.len())), - None => Ok(None), - } - } - } - - impl Default for TestContext { - fn default() -> TestContext { - let mut rng = StdRng::seed_from_u64(102); - let rng = &mut rng; - - fn random_postings(rng: &mut R, len: usize) -> RoaringBitmap { - let mut values = Vec::::with_capacity(len); - while values.len() != len { - values.push(rng.gen()); - } - values.sort_unstable(); - RoaringBitmap::from_sorted_iter(values.into_iter()).unwrap() - } - - let exact_words = fst::SetBuilder::new(Vec::new()).unwrap().into_inner().unwrap(); - let exact_words = - Some(fst::Set::new(exact_words).unwrap().map_data(Cow::Owned).unwrap()); - - TestContext { - synonyms: hashmap! { - vec![String::from("hello")] => vec![ - vec![String::from("hi")], - vec![String::from("good"), String::from("morning")], - ], - vec![String::from("world")] => vec![ - vec![String::from("earth")], - vec![String::from("nature")], - ], - // new york city - vec![String::from("nyc")] => vec![ - vec![String::from("new"), String::from("york")], - vec![String::from("new"), String::from("york"), String::from("city")], - ], - vec![String::from("new"), String::from("york")] => vec![ - vec![String::from("nyc")], - vec![String::from("new"), String::from("york"), String::from("city")], - ], - vec![String::from("new"), String::from("york"), String::from("city")] => vec![ - vec![String::from("nyc")], - vec![String::from("new"), String::from("york")], - ], - }, - postings: hashmap! 
{ - String::from("hello") => random_postings(rng, 1500), - String::from("hi") => random_postings(rng, 4000), - String::from("word") => random_postings(rng, 2500), - String::from("split") => random_postings(rng, 400), - String::from("ngrams") => random_postings(rng, 1400), - String::from("world") => random_postings(rng, 15_000), - String::from("earth") => random_postings(rng, 8000), - String::from("2021") => random_postings(rng, 100), - String::from("2020") => random_postings(rng, 500), - String::from("is") => random_postings(rng, 50_000), - String::from("this") => random_postings(rng, 50_000), - String::from("good") => random_postings(rng, 1250), - String::from("morning") => random_postings(rng, 125), - String::from("word split") => random_postings(rng, 5000), - String::from("quick brownfox") => random_postings(rng, 7000), - String::from("quickbrown fox") => random_postings(rng, 8000), - }, - exact_words, - } - } - } - - #[test] - fn prefix() { - let query = "hey friends"; - let tokens = query.tokenize(); - - let (query_tree, _) = TestContext::default() - .build(TermsMatchingStrategy::All, true, None, tokens) - .unwrap() - .unwrap(); - - insta::assert_debug_snapshot!(query_tree, @r###" - OR - AND - Exact { word: "hey" } - PrefixTolerant { word: "friends", max typo: 1 } - PrefixTolerant { word: "heyfriends", max typo: 1 } - "###); - } - - #[test] - fn no_prefix() { - let query = "hey friends "; - let tokens = query.tokenize(); - - let (query_tree, _) = TestContext::default() - .build(TermsMatchingStrategy::All, true, None, tokens) - .unwrap() - .unwrap(); - - insta::assert_debug_snapshot!(query_tree, @r###" - OR - AND - Exact { word: "hey" } - Tolerant { word: "friends", max typo: 1 } - Tolerant { word: "heyfriends", max typo: 1 } - "###); - } - - #[test] - fn synonyms() { - let query = "hello world "; - let tokens = query.tokenize(); - - let (query_tree, _) = TestContext::default() - .build(TermsMatchingStrategy::All, true, None, tokens) - .unwrap() - .unwrap(); - - insta::assert_debug_snapshot!(query_tree, @r###" - OR - AND - OR - Exact { word: "hi" } - PHRASE [Some("good"), Some("morning")] - Tolerant { word: "hello", max typo: 1 } - OR - Exact { word: "earth" } - Exact { word: "nature" } - Tolerant { word: "world", max typo: 1 } - Tolerant { word: "helloworld", max typo: 1 } - "###); - } - - #[test] - fn simple_synonyms() { - let query = "nyc"; - let tokens = query.tokenize(); - - let (query_tree, _) = TestContext::default() - .build(TermsMatchingStrategy::Last, true, None, tokens) - .unwrap() - .unwrap(); - - insta::assert_debug_snapshot!(query_tree, @r###" - OR - PHRASE [Some("new"), Some("york")] - PHRASE [Some("new"), Some("york"), Some("city")] - PrefixExact { word: "nyc" } - "###); - } - - #[test] - fn complex_synonyms() { - let query = "new york city "; - let tokens = query.tokenize(); - - let (query_tree, _) = TestContext::default() - .build(TermsMatchingStrategy::All, true, None, tokens) - .unwrap() - .unwrap(); - - insta::assert_debug_snapshot!(query_tree, @r###" - OR - AND - Exact { word: "new" } - OR - AND - Exact { word: "york" } - Exact { word: "city" } - Tolerant { word: "yorkcity", max typo: 1 } - AND - OR - Exact { word: "nyc" } - PHRASE [Some("new"), Some("york"), Some("city")] - Tolerant { word: "newyork", max typo: 1 } - Exact { word: "city" } - Exact { word: "nyc" } - PHRASE [Some("new"), Some("york")] - Tolerant { word: "newyorkcity", max typo: 1 } - "###); - } - - #[test] - fn ngrams() { - let query = "n grams "; - let tokens = query.tokenize(); - - let 
(query_tree, _) = TestContext::default() - .build(TermsMatchingStrategy::All, true, None, tokens) - .unwrap() - .unwrap(); - - insta::assert_debug_snapshot!(query_tree, @r###" - OR - AND - Exact { word: "n" } - Tolerant { word: "grams", max typo: 1 } - Tolerant { word: "ngrams", max typo: 1 } - "###); - } - - #[test] - fn word_split() { - let query = "wordsplit fish "; - let tokens = query.tokenize(); - - let (query_tree, _) = TestContext::default() - .build(TermsMatchingStrategy::All, true, None, tokens) - .unwrap() - .unwrap(); - - insta::assert_debug_snapshot!(query_tree, @r###" - OR - AND - OR - PHRASE [Some("word"), Some("split")] - Tolerant { word: "wordsplit", max typo: 2 } - Exact { word: "fish" } - Tolerant { word: "wordsplitfish", max typo: 1 } - "###); - } - - #[test] - fn word_split_choose_pair_with_max_freq() { - let query = "quickbrownfox"; - let tokens = query.tokenize(); - - let (query_tree, _) = TestContext::default() - .build(TermsMatchingStrategy::All, true, None, tokens) - .unwrap() - .unwrap(); - - insta::assert_debug_snapshot!(query_tree, @r###" - OR - PHRASE [Some("quickbrown"), Some("fox")] - PrefixTolerant { word: "quickbrownfox", max typo: 2 } - "###); - } - - #[test] - fn phrase() { - let query = "\"hey friends\" \" \" \"wooop"; - let tokens = query.tokenize(); - - let (query_tree, _) = TestContext::default() - .build(TermsMatchingStrategy::All, true, None, tokens) - .unwrap() - .unwrap(); - - insta::assert_debug_snapshot!(query_tree, @r###" - AND - PHRASE [Some("hey"), Some("friends")] - Exact { word: "wooop" } - "###); - } - - #[test] - fn phrase_2() { - // https://github.com/meilisearch/meilisearch/issues/2722 - let query = "coco \"harry\""; - let tokens = query.tokenize(); - - let (query_tree, _) = TestContext::default() - .build(TermsMatchingStrategy::default(), true, None, tokens) - .unwrap() - .unwrap(); - - insta::assert_debug_snapshot!(query_tree, @r###" - OR(WORD) - Exact { word: "harry" } - AND - Exact { word: "coco" } - Exact { word: "harry" } - "###); - } - - #[test] - fn phrase_with_hard_separator() { - let query = "\"hey friends. 
wooop wooop\""; - let tokens = query.tokenize(); - - let (query_tree, _) = TestContext::default() - .build(TermsMatchingStrategy::All, true, None, tokens) - .unwrap() - .unwrap(); - - insta::assert_debug_snapshot!(query_tree, @r###" - AND - PHRASE [Some("hey"), Some("friends")] - PHRASE [Some("wooop"), Some("wooop")] - "###); - } - - #[test] - fn optional_word() { - let query = "hey my friend "; - let tokens = query.tokenize(); - - let (query_tree, _) = TestContext::default() - .build(TermsMatchingStrategy::default(), true, None, tokens) - .unwrap() - .unwrap(); - - insta::assert_debug_snapshot!(query_tree, @r###" - OR(WORD) - Exact { word: "hey" } - OR - AND - Exact { word: "hey" } - Exact { word: "my" } - Tolerant { word: "heymy", max typo: 1 } - OR - AND - Exact { word: "hey" } - OR - AND - Exact { word: "my" } - Tolerant { word: "friend", max typo: 1 } - Tolerant { word: "myfriend", max typo: 1 } - AND - Tolerant { word: "heymy", max typo: 1 } - Tolerant { word: "friend", max typo: 1 } - Tolerant { word: "heymyfriend", max typo: 1 } - "###); - } - - #[test] - fn optional_word_phrase() { - let query = "\"hey my\""; - let tokens = query.tokenize(); - - let (query_tree, _) = TestContext::default() - .build(TermsMatchingStrategy::default(), true, None, tokens) - .unwrap() - .unwrap(); - - insta::assert_debug_snapshot!(query_tree, @r###" - PHRASE [Some("hey"), Some("my")] - "###); - } - - #[test] - fn optional_word_multiple_phrases() { - let query = r#""hey" my good "friend""#; - let tokens = query.tokenize(); - - let (query_tree, _) = TestContext::default() - .build(TermsMatchingStrategy::default(), true, None, tokens) - .unwrap() - .unwrap(); - - insta::assert_debug_snapshot!(query_tree, @r###" - OR(WORD) - AND - Exact { word: "hey" } - Exact { word: "friend" } - AND - Exact { word: "hey" } - Exact { word: "my" } - Exact { word: "friend" } - AND - Exact { word: "hey" } - OR - AND - Exact { word: "my" } - Exact { word: "good" } - Tolerant { word: "mygood", max typo: 1 } - Exact { word: "friend" } - "###); - } - - #[test] - fn no_typo() { - let query = "hey friends "; - let tokens = query.tokenize(); - - let (query_tree, _) = TestContext::default() - .build(TermsMatchingStrategy::All, false, None, tokens) - .unwrap() - .unwrap(); - - insta::assert_debug_snapshot!(query_tree, @r###" - OR - AND - Exact { word: "hey" } - Exact { word: "friends" } - Exact { word: "heyfriends" } - "###); - } - - #[test] - fn words_limit() { - let query = "\"hey my\" good friend"; - let tokens = query.tokenize(); - - let (query_tree, _) = TestContext::default() - .build(TermsMatchingStrategy::All, false, Some(2), tokens) - .unwrap() - .unwrap(); - - insta::assert_debug_snapshot!(query_tree, @r###" - AND - PHRASE [Some("hey"), Some("my")] - Exact { word: "good" } - "###); - } - - #[test] - fn test_min_word_len_typo() { - let exact_words = fst::Set::from_iter([b""]).unwrap().map_data(Cow::Owned).unwrap(); - let config = TypoConfig { - max_typos: 2, - word_len_one_typo: 5, - word_len_two_typo: 7, - exact_words: Some(&exact_words), - }; - - assert_eq!( - typos("hello".to_string(), true, config.clone()), - QueryKind::Tolerant { typo: 1, word: "hello".to_string() } - ); - - assert_eq!( - typos("hell".to_string(), true, config.clone()), - QueryKind::exact("hell".to_string()) - ); - - assert_eq!( - typos("verylongword".to_string(), true, config.clone()), - QueryKind::Tolerant { typo: 2, word: "verylongword".to_string() } - ); - } - - #[test] - fn test_dont_create_matching_word_for_long_words() { - let index = 
TempIndex::new(); - let rtxn = index.read_txn().unwrap(); - let query = "what a supercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocioussupercalifragilisticexpialidocious house"; - let mut builder = QueryTreeBuilder::new(&rtxn, &index).unwrap(); - builder.words_limit(10); - let (_, _, matching_words) = builder.build(query.tokenize()).unwrap().unwrap(); - insta::assert_snapshot!(format!("{matching_words:?}"), @r###" - [ - ([MatchingWord { word: "house", typo: 1, prefix: true }], [3]) - ([MatchingWord { word: "house", typo: 1, prefix: true }], [2]) - ([MatchingWord { word: "whata", typo: 1, prefix: false }], [0, 1]) - ([MatchingWord { word: "house", typo: 1, prefix: true }], [2]) - ([MatchingWord { word: "house", typo: 1, prefix: true }], [1]) - ([MatchingWord { word: "what", typo: 0, prefix: false }], [0]) - ([MatchingWord { word: "a", typo: 0, prefix: false }], [1]) - ] - "###); - } - - #[test] - fn disable_typo_on_word() { - let query = "goodbye"; - let tokens = query.tokenize(); - - let exact_words = fst::Set::from_iter(Some("goodbye")).unwrap().into_fst().into_inner(); - let exact_words = Some(fst::Set::new(exact_words).unwrap().map_data(Cow::Owned).unwrap()); - let context = TestContext { exact_words, ..Default::default() }; - let (query_tree, _) = - context.build(TermsMatchingStrategy::All, true, Some(2), tokens).unwrap().unwrap(); - - assert!(matches!( - query_tree, - Operation::Query(Query { prefix: true, kind: QueryKind::Exact { .. } }) - )); - } - - // The memory usage test below is disabled because `cargo test` runs multiple tests in parallel, - // which invalidates the measurements of memory usage. Nevertheless, it is a useful test to run - // manually from time to time, so I kept it here, commented-out. 
- - // use std::alloc::{GlobalAlloc, System}; - // use std::sync::atomic::{self, AtomicI64}; - // - // #[global_allocator] - // static ALLOC: CountingAlloc = - // CountingAlloc { resident: AtomicI64::new(0), allocated: AtomicI64::new(0) }; - // - // pub struct CountingAlloc { - // pub resident: AtomicI64, - // pub allocated: AtomicI64, - // } - // unsafe impl GlobalAlloc for CountingAlloc { - // unsafe fn alloc(&self, layout: std::alloc::Layout) -> *mut u8 { - // self.allocated.fetch_add(layout.size() as i64, atomic::Ordering::Relaxed); - // self.resident.fetch_add(layout.size() as i64, atomic::Ordering::Relaxed); - // - // System.alloc(layout) - // } - // - // unsafe fn dealloc(&self, ptr: *mut u8, layout: std::alloc::Layout) { - // self.resident.fetch_sub(layout.size() as i64, atomic::Ordering::Relaxed); - // System.dealloc(ptr, layout) - // } - // } - // - // #[test] - // fn memory_usage_of_ten_word_query() { - // let resident_before = ALLOC.resident.load(atomic::Ordering::SeqCst); - // let allocated_before = ALLOC.allocated.load(atomic::Ordering::SeqCst); - // - // let index = TempIndex::new(); - // let rtxn = index.read_txn().unwrap(); - // let query = "a beautiful summer house by the beach overlooking what seems"; - // let mut builder = QueryTreeBuilder::new(&rtxn, &index).unwrap(); - // builder.words_limit(10); - // let x = builder.build(query.tokenize()).unwrap().unwrap(); - // let resident_after = ALLOC.resident.load(atomic::Ordering::SeqCst); - // let allocated_after = ALLOC.allocated.load(atomic::Ordering::SeqCst); - // - // // Weak check on the memory usage - // // Don't keep more than 5MB. (Arguably 5MB is already too high) - // assert!(resident_after - resident_before < 5_000_000); - // // Don't allocate more than 10MB. - // assert!(allocated_after - allocated_before < 10_000_000); - // - // // Use these snapshots to measure the exact memory usage. - // // The values below were correct at the time I wrote them. - // // insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"4486950"); - // // insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"7107502"); - // - // // Note, with the matching word cache deactivated, the memory usage was: - // // insta::assert_snapshot!(format!("{}", resident_after - resident_before), @"91248697"); - // // insta::assert_snapshot!(format!("{}", allocated_after - allocated_before), @"125697588"); - // // or about 20x more resident memory (90MB vs 4.5MB) - // - // // Use x - // let _x = x; - // } -}
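Note (editorial aside, not part of this diff): tests that are only disabled because they must run in isolation, like the memory-usage test above, could alternatively stay compiled behind `#[ignore]`, so `cargo test -- --ignored` can still run them on demand. That option is unavailable for the commented-out `word_derivations` tests, whose subject this PR deletes. A sketch of the idea:

```rust
// `#[ignore]` keeps the test type-checking but skips it by default;
// the reason string shows up in `cargo test` output.
#[test]
#[ignore = "run manually: parallel test runs invalidate memory measurements"]
fn memory_usage_smoke_test() {
    // Placeholder body; the real measurements live in the commented-out
    // `memory_usage_of_ten_word_query` above.
}
```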