From 3bb1e35adac89c9b1e371dcd7b82372063b520ce Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Mon, 4 Apr 2022 18:56:59 +0200
Subject: [PATCH] Fix match count

---
 milli/src/search/matches/matching_words.rs | 339 ++++++++++++---------
 milli/src/search/matches/mod.rs            | 169 +++++-----
 milli/src/search/mod.rs                    |  17 +-
 milli/src/search/query_tree.rs             | 175 ++++++++++-
 4 files changed, 469 insertions(+), 231 deletions(-)

diff --git a/milli/src/search/matches/matching_words.rs b/milli/src/search/matches/matching_words.rs
index 48f6fe809..274634554 100644
--- a/milli/src/search/matches/matching_words.rs
+++ b/milli/src/search/matches/matching_words.rs
@@ -1,12 +1,12 @@
 use std::cmp::{min, Reverse};
-use std::collections::{BTreeMap, HashMap};
+use std::collections::BTreeMap;
+use std::fmt;
 use std::ops::{Index, IndexMut};
 
 use levenshtein_automata::{Distance, DFA};
 use meilisearch_tokenizer::Token;
 
 use crate::search::build_dfa;
-use crate::search::query_tree::{Operation, Query};
 
 type IsPrefix = bool;
 
@@ -14,83 +14,129 @@ type IsPrefix = bool;
 /// referencing words that match the given query tree.
 #[derive(Default)]
 pub struct MatchingWords {
-    dfas: Vec<(DFA, String, u8, IsPrefix, usize)>,
+    inner: Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>,
 }
 
 impl MatchingWords {
-    pub fn from_query_tree(tree: &Operation) -> Self {
-        // fetch matchable words from the query tree
-        let mut dfas: Vec<_> = fetch_queries(tree)
-            .into_iter()
-            // create DFAs for each word
-            .map(|((w, t, p), id)| (build_dfa(w, t, p), w.to_string(), t, p, id))
-            .collect();
-        // Sort word by len in DESC order prioritizing the longuest word,
+    pub fn new(mut matching_words: Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>) -> Self {
+        // Sort words by length in DESC order, prioritizing the longest matches,
         // in order to highlight the longest part of the matched word.
-        dfas.sort_unstable_by_key(|(_dfa, query_word, _typo, _is_prefix, _id)| {
-            Reverse(query_word.len())
-        });
-        Self { dfas }
+        matching_words.sort_unstable_by_key(|(mw, _)| Reverse((mw.len(), mw[0].word.len())));
+
+        Self { inner: matching_words }
    }
 
-    /// Returns the number of matching bytes if the word matches one of the query words.
-    pub fn matching_bytes(&self, word_to_highlight: &Token) -> Option<usize> {
-        self.matching_bytes_with_id(word_to_highlight).map(|(len, _)| len)
-    }
-
-    pub fn matching_bytes_with_id(&self, word_to_highlight: &Token) -> Option<(usize, usize)> {
-        self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix, id)| {
-            match dfa.eval(word_to_highlight.text()) {
-                Distance::Exact(t) if t <= *typo => {
-                    if *is_prefix {
-                        let len = bytes_to_highlight(word_to_highlight.text(), query_word);
-                        Some((word_to_highlight.num_chars_from_bytes(len), *id))
-                    } else {
-                        Some((
-                            word_to_highlight.num_chars_from_bytes(word_to_highlight.text().len()),
-                            *id,
-                        ))
-                    }
-                }
-                _otherwise => None,
-            }
-        })
+    pub fn match_token<'a, 'b>(&'a self, token: &'b Token<'b>) -> MatchesIter<'a, 'b> {
+        MatchesIter { inner: Box::new(self.inner.iter()), token }
     }
 }
 
-/// Lists all words which can be considered as a match for the query tree.
-fn fetch_queries(tree: &Operation) -> HashMap<(&str, u8, IsPrefix), usize> {
-    fn resolve_ops<'a>(
-        tree: &'a Operation,
-        out: &mut HashMap<(&'a str, u8, IsPrefix), usize>,
-        id: &mut usize,
-    ) {
-        match tree {
-            Operation::Or(_, ops) | Operation::And(ops) => {
-                ops.as_slice().iter().for_each(|op| resolve_ops(op, out, id));
-            }
-            Operation::Query(Query { prefix, kind }) => {
-                let typo = if kind.is_exact() { 0 } else { kind.typo() };
-                out.entry((kind.word(), typo, *prefix)).or_insert_with(|| {
-                    *id += 1;
-                    *id
-                });
-            }
-            Operation::Phrase(words) => {
-                for word in words {
-                    out.entry((word, 0, false)).or_insert_with(|| {
-                        *id += 1;
-                        *id
-                    });
-                }
-            }
-        }
-    }
-
-    let mut queries = HashMap::new();
-    let mut id = 0;
-    resolve_ops(tree, &mut queries, &mut id);
-    queries
-}
+pub struct MatchesIter<'a, 'b> {
+    inner: Box<dyn Iterator<Item = &'a (Vec<MatchingWord>, Vec<PrimitiveWordId>)> + 'a>,
+    token: &'b Token<'b>,
+}
+
+impl<'a> Iterator for MatchesIter<'a, '_> {
+    type Item = MatchType<'a>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        match self.inner.next() {
+            Some((matching_words, ids)) => match matching_words[0].match_token(&self.token) {
+                Some(char_len) => {
+                    if matching_words.len() > 1 {
+                        Some(MatchType::Partial(PartialMatch {
+                            matching_words: &matching_words[1..],
+                            ids,
+                            char_len,
+                        }))
+                    } else {
+                        Some(MatchType::Full { char_len, ids })
+                    }
+                }
+                None => self.next(),
+            },
+            None => None,
+        }
+    }
+}
+
+pub type PrimitiveWordId = u8;
+pub struct MatchingWord {
+    pub dfa: DFA,
+    pub word: String,
+    pub typo: u8,
+    pub prefix: IsPrefix,
+}
+
+impl fmt::Debug for MatchingWord {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("MatchingWord")
+            .field("word", &self.word)
+            .field("typo", &self.typo)
+            .field("prefix", &self.prefix)
+            .finish()
+    }
+}
+
+impl PartialEq for MatchingWord {
+    fn eq(&self, other: &Self) -> bool {
+        self.prefix == other.prefix && self.typo == other.typo && self.word == other.word
+    }
+}
+
+impl MatchingWord {
+    pub fn new(word: String, typo: u8, prefix: IsPrefix) -> Self {
+        let dfa = build_dfa(&word, typo, prefix);
+
+        Self { dfa, word, typo, prefix }
+    }
+
+    pub fn match_token(&self, token: &Token) -> Option<usize> {
+        match self.dfa.eval(token.text()) {
+            Distance::Exact(t) if t <= self.typo => {
+                if self.prefix {
+                    let len = bytes_to_highlight(token.text(), &self.word);
+                    Some(token.num_chars_from_bytes(len))
+                } else {
+                    Some(token.num_chars_from_bytes(token.text().len()))
+                }
+            }
+            _otherwise => None,
+        }
+    }
+}
+
+#[derive(Debug, PartialEq)]
+pub enum MatchType<'a> {
+    Full { char_len: usize, ids: &'a [PrimitiveWordId] },
+    Partial(PartialMatch<'a>),
+}
+
+#[derive(Debug, PartialEq)]
+pub struct PartialMatch<'a> {
+    matching_words: &'a [MatchingWord],
+    ids: &'a [PrimitiveWordId],
+    char_len: usize,
+}
+
+impl<'a> PartialMatch<'a> {
+    pub fn match_token(self, token: &Token) -> Option<MatchType<'a>> {
+        self.matching_words[0].match_token(token).map(|char_len| {
+            if self.matching_words.len() > 1 {
+                MatchType::Partial(PartialMatch {
+                    matching_words: &self.matching_words[1..],
+                    ids: self.ids,
+                    char_len,
+                })
+            } else {
+                MatchType::Full { char_len, ids: self.ids }
+            }
+        })
+    }
+
+    pub fn char_len(&self) -> usize {
+        self.char_len
+    }
+}
 
 // A simple wrapper around vec so we can get contiguous but index it like it's a 2D array.
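
A usage sketch of the API above, for review purposes (it assumes the test imports visible below, `std::borrow::Cow` and `meilisearch_tokenizer::TokenKind`; the `new`/`york` entry and its ids are illustrative, not taken from the patch): a multi-word entry yields a `MatchType::Partial` on its first word, and the returned `PartialMatch` must be completed on the following tokens before it counts as a full match covering all of its ids.

    let matching_words = MatchingWords::new(vec![(
        vec![
            MatchingWord::new("new".to_string(), 0, false),
            MatchingWord::new("york".to_string(), 0, true),
        ],
        vec![0, 1],
    )]);

    let new_token = Token {
        kind: TokenKind::Word,
        word: Cow::Borrowed("new"),
        byte_start: 0,
        char_index: 0,
        byte_end: "new".len(),
        char_map: None,
    };

    match matching_words.match_token(&new_token).next() {
        // "new" alone only partially matches the ["new", "york"] entry...
        Some(MatchType::Partial(partial)) => {
            let york_token = Token {
                kind: TokenKind::Word,
                word: Cow::Borrowed("york"),
                byte_start: 0,
                char_index: 0,
                byte_end: "york".len(),
                char_map: None,
            };
            // ...and becomes a full match covering both ids once the next
            // token matches the remaining word.
            assert_eq!(
                partial.match_token(&york_token),
                Some(MatchType::Full { char_len: 4, ids: &[0, 1] })
            );
        }
        other => panic!("expected a partial match, got {:?}", other),
    }
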
@@ -203,7 +249,6 @@ mod tests { use meilisearch_tokenizer::TokenKind; use super::*; - use crate::search::query_tree::{Operation, Query, QueryKind}; use crate::MatchingWords; #[test] @@ -271,102 +316,104 @@ mod tests { #[test] fn matching_words() { - let query_tree = Operation::Or( - false, - vec![Operation::And(vec![ - Operation::Query(Query { - prefix: true, - kind: QueryKind::exact("split".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("this".to_string()), - }), - Operation::Query(Query { - prefix: true, - kind: QueryKind::tolerant(1, "world".to_string()), - }), - ])], - ); + let matching_words = vec![ + (vec![MatchingWord::new("split".to_string(), 1, true)], vec![0]), + (vec![MatchingWord::new("this".to_string(), 0, false)], vec![1]), + (vec![MatchingWord::new("world".to_string(), 1, true)], vec![2]), + ]; - let matching_words = MatchingWords::from_query_tree(&query_tree); + let matching_words = MatchingWords::new(matching_words); assert_eq!( - matching_words.matching_bytes(&Token { - kind: TokenKind::Word, - word: Cow::Borrowed("word"), - byte_start: 0, - char_index: 0, - byte_end: "word".len(), - char_map: None, - }), - Some(3) + matching_words + .match_token(&Token { + kind: TokenKind::Word, + word: Cow::Borrowed("word"), + byte_start: 0, + char_index: 0, + byte_end: "word".len(), + char_map: None, + }) + .next(), + Some(MatchType::Full { char_len: 3, ids: &[2] }) ); assert_eq!( - matching_words.matching_bytes(&Token { - kind: TokenKind::Word, - word: Cow::Borrowed("nyc"), - byte_start: 0, - char_index: 0, - byte_end: "nyc".len(), - char_map: None, - }), + matching_words + .match_token(&Token { + kind: TokenKind::Word, + word: Cow::Borrowed("nyc"), + byte_start: 0, + char_index: 0, + byte_end: "nyc".len(), + char_map: None, + }) + .next(), None ); assert_eq!( - matching_words.matching_bytes(&Token { - kind: TokenKind::Word, - word: Cow::Borrowed("world"), - byte_start: 0, - char_index: 0, - byte_end: "world".len(), - char_map: None, - }), - Some(5) + matching_words + .match_token(&Token { + kind: TokenKind::Word, + word: Cow::Borrowed("world"), + byte_start: 0, + char_index: 0, + byte_end: "world".len(), + char_map: None, + }) + .next(), + Some(MatchType::Full { char_len: 5, ids: &[2] }) ); assert_eq!( - matching_words.matching_bytes(&Token { - kind: TokenKind::Word, - word: Cow::Borrowed("splitted"), - byte_start: 0, - char_index: 0, - byte_end: "splitted".len(), - char_map: None, - }), - Some(5) + matching_words + .match_token(&Token { + kind: TokenKind::Word, + word: Cow::Borrowed("splitted"), + byte_start: 0, + char_index: 0, + byte_end: "splitted".len(), + char_map: None, + }) + .next(), + Some(MatchType::Full { char_len: 5, ids: &[0] }) ); assert_eq!( - matching_words.matching_bytes(&Token { - kind: TokenKind::Word, - word: Cow::Borrowed("thisnew"), - byte_start: 0, - char_index: 0, - byte_end: "thisnew".len(), - char_map: None, - }), + matching_words + .match_token(&Token { + kind: TokenKind::Word, + word: Cow::Borrowed("thisnew"), + byte_start: 0, + char_index: 0, + byte_end: "thisnew".len(), + char_map: None, + }) + .next(), None ); assert_eq!( - matching_words.matching_bytes(&Token { - kind: TokenKind::Word, - word: Cow::Borrowed("borld"), - byte_start: 0, - char_index: 0, - byte_end: "borld".len(), - char_map: None, - }), - Some(5) + matching_words + .match_token(&Token { + kind: TokenKind::Word, + word: Cow::Borrowed("borld"), + byte_start: 0, + char_index: 0, + byte_end: "borld".len(), + char_map: None, + }) + .next(), + 
+            Some(MatchType::Full { char_len: 5, ids: &[2] })
         );
 
         assert_eq!(
-            matching_words.matching_bytes(&Token {
-                kind: TokenKind::Word,
-                word: Cow::Borrowed("wordsplit"),
-                byte_start: 0,
-                char_index: 0,
-                byte_end: "wordsplit".len(),
-                char_map: None,
-            }),
-            Some(4)
+            matching_words
+                .match_token(&Token {
+                    kind: TokenKind::Word,
+                    word: Cow::Borrowed("wordsplit"),
+                    byte_start: 0,
+                    char_index: 0,
+                    byte_end: "wordsplit".len(),
+                    char_map: None,
+                })
+                .next(),
+            Some(MatchType::Full { char_len: 4, ids: &[2] })
         );
     }
 }
diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs
index c6b89f9ec..a99798a9b 100644
--- a/milli/src/search/matches/mod.rs
+++ b/milli/src/search/matches/mod.rs
@@ -1,11 +1,10 @@
 use std::borrow::Cow;
 
 pub use matching_words::MatchingWords;
+use matching_words::{MatchType, PrimitiveWordId};
 use meilisearch_tokenizer::token::{SeparatorKind, Token};
 
-use crate::search::query_tree::Operation;
-
-mod matching_words;
+pub mod matching_words;
 
 const DEFAULT_CROP_SIZE: usize = 10;
 const DEFAULT_CROP_MARKER: &'static str = "…";
@@ -21,18 +20,6 @@ pub struct MatcherBuilder {
 }
 
 impl MatcherBuilder {
-    pub fn from_query_tree(query_tree: &Operation) -> Self {
-        let matching_words = MatchingWords::from_query_tree(query_tree);
-
-        Self {
-            matching_words,
-            crop_size: DEFAULT_CROP_SIZE,
-            crop_marker: None,
-            highlight_prefix: None,
-            highlight_suffix: None,
-        }
-    }
-
     pub fn from_matching_words(matching_words: MatchingWords) -> Self {
         Self {
             matching_words,
@@ -93,8 +80,8 @@ impl MatcherBuilder {
 #[derive(Clone, Debug)]
 pub struct Match {
     match_len: usize,
-    // id of the query word that matches.
-    id: usize,
+    // ids of the query words that match.
+    ids: Vec<PrimitiveWordId>,
     // position of the word in the whole text.
     word_position: usize,
     // position of the token in the whole text.
@@ -123,10 +110,72 @@ impl<'t> Matcher<'t, '_> {
         let mut matches = Vec::new();
         let mut word_position = 0;
         let mut token_position = 0;
-        for token in self.tokens {
+        while let Some(token) = self.tokens.get(token_position) {
             if token.is_separator().is_none() {
-                if let Some((match_len, id)) = self.matching_words.matching_bytes_with_id(&token) {
-                    matches.push(Match { match_len, id, word_position, token_position });
-                }
+                'matches: for match_type in self.matching_words.match_token(&token) {
+                    match match_type {
+                        MatchType::Full { char_len, ids } => {
+                            matches.push(Match {
+                                match_len: char_len,
+                                ids: ids.to_vec(),
+                                word_position,
+                                token_position,
+                            });
+                            // stop on the first match
+                            break;
+                        }
+                        MatchType::Partial(mut partial) => {
+                            let mut potential_matches =
+                                vec![(token_position, word_position, partial.char_len())];
+                            let mut t_position = 1;
+                            let mut w_position = 1;
+                            'partials: for token in &self.tokens[token_position + 1..] {
+                                if token.is_separator().is_none() {
+                                    partial = match partial.match_token(&token) {
+                                        Some(MatchType::Partial(partial)) => {
+                                            potential_matches.push((
+                                                token_position + t_position,
+                                                word_position + w_position,
+                                                partial.char_len(),
+                                            ));
+                                            partial
+                                        }
+                                        // the partial match is now full, we keep these matches and advance the positions
+                                        Some(MatchType::Full { char_len, ids }) => {
+                                            let iter = potential_matches.into_iter().map(
+                                                |(token_position, word_position, match_len)| {
+                                                    Match {
+                                                        match_len,
+                                                        ids: ids.to_vec(),
+                                                        word_position,
+                                                        token_position,
+                                                    }
+                                                },
+                                            );
+
+                                            matches.extend(iter);
+
+                                            word_position += w_position;
+                                            token_position += t_position;
+
+                                            matches.push(Match {
+                                                match_len: char_len,
+                                                ids: ids.to_vec(),
+                                                word_position,
+                                                token_position,
+                                            });
+
+                                            break 'matches;
+                                        }
+                                        // no match, continue to the next match.
+                                        None => break 'partials,
+                                    };
+                                    w_position += 1;
+                                }
+                                t_position += 1;
+                            }
+                        }
+                    }
+                }
                 word_position += 1;
             }
 
@@ -229,7 +278,7 @@ impl<'t> Matcher<'t, '_> {
     }
 
     fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) {
-        let mut ids = Vec::with_capacity(matches.len());
+        let mut ids: Vec<PrimitiveWordId> = Vec::with_capacity(matches.len());
         let mut order_score = 0;
         let mut distance_score = 0;
 
@@ -237,7 +286,7 @@ impl<'t> Matcher<'t, '_> {
         while let Some(m) = iter.next() {
             if let Some(next_match) = iter.peek() {
                 // if matches are ordered
-                if next_match.id > m.id {
+                if next_match.ids.iter().min() > m.ids.iter().min() {
                     order_score += 1;
                 }
 
@@ -245,7 +294,7 @@ impl<'t> Matcher<'t, '_> {
                 distance_score -= (next_match.word_position - m.word_position).min(7) as i16;
             }
 
-            ids.push(m.id);
+            ids.extend(m.ids.iter());
         }
 
         ids.sort_unstable();
@@ -348,7 +397,8 @@ impl<'t> Matcher<'t, '_> {
                     .char_indices()
                     .enumerate()
                     .find(|(i, _)| *i == m.match_len)
-                    .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start);
+                    .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start)
+                    .min(token.byte_end);
                 formatted.push(self.highlight_prefix);
                 formatted.push(&self.text[token.byte_start..highlight_byte_index]);
                 formatted.push(self.highlight_suffix);
@@ -386,33 +436,23 @@ mod tests {
     use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
 
     use super::*;
-    use crate::search::query_tree::{Query, QueryKind};
+    use crate::search::matches::matching_words::MatchingWord;
 
-    fn query_tree() -> Operation {
-        Operation::Or(
-            false,
-            vec![Operation::And(vec![
-                Operation::Query(Query {
-                    prefix: true,
-                    kind: QueryKind::exact("split".to_string()),
-                }),
-                Operation::Query(Query {
-                    prefix: false,
-                    kind: QueryKind::exact("the".to_string()),
-                }),
-                Operation::Query(Query {
-                    prefix: true,
-                    kind: QueryKind::tolerant(1, "world".to_string()),
-                }),
-            ])],
-        )
+    fn matching_words() -> MatchingWords {
+        let matching_words = vec![
+            (vec![MatchingWord::new("split".to_string(), 0, false)], vec![0]),
+            (vec![MatchingWord::new("the".to_string(), 0, false)], vec![1]),
+            (vec![MatchingWord::new("world".to_string(), 1, true)], vec![2]),
+        ];
+
+        MatchingWords::new(matching_words)
     }
 
     #[test]
     fn format_identity() {
-        let query_tree = query_tree();
+        let matching_words = matching_words();
 
-        let builder = MatcherBuilder::from_query_tree(&query_tree);
+        let builder = MatcherBuilder::from_matching_words(matching_words);
         let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
 
         let highlight = false;
@@ -445,9 +485,9 @@ mod tests {
     #[test]
     fn format_highlight() {
-        let query_tree = query_tree();
+        let matching_words = matching_words();
 
-        let builder = MatcherBuilder::from_query_tree(&query_tree);
+        let builder = MatcherBuilder::from_matching_words(matching_words);
         let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
 
         let highlight = true;
@@ -497,21 +537,14 @@ mod tests {
     #[test]
     fn highlight_unicode() {
-        let query_tree = Operation::Or(
-            false,
-            vec![Operation::And(vec![
-                Operation::Query(Query {
-                    prefix: true,
-                    kind: QueryKind::tolerant(1, "wessfalia".to_string()),
-                }),
-                Operation::Query(Query {
-                    prefix: true,
-                    kind: QueryKind::tolerant(1, "world".to_string()),
-                }),
-            ])],
-        );
+        let matching_words = vec![
+            (vec![MatchingWord::new("wessfali".to_string(), 1, true)], vec![0]),
+            (vec![MatchingWord::new("world".to_string(), 1, true)], vec![1]),
+        ];
 
-        let builder = MatcherBuilder::from_query_tree(&query_tree);
+        let matching_words = MatchingWords::new(matching_words);
+
+        let builder = MatcherBuilder::from_matching_words(matching_words);
         let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
 
         let highlight = true;
@@ -539,14 +572,14 @@ fn highlight_unicode() {
         let tokens: Vec<_> = analyzed.tokens().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // no crop should return complete text with highlighted matches.
-        assert_eq!(&matcher.format(highlight, crop), "<em>Westfália</em>");
+        assert_eq!(&matcher.format(highlight, crop), "<em>Westfáli</em>a");
     }
 
     #[test]
     fn format_crop() {
-        let query_tree = query_tree();
+        let matching_words = matching_words();
 
-        let builder = MatcherBuilder::from_query_tree(&query_tree);
+        let builder = MatcherBuilder::from_matching_words(matching_words);
         let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
 
         let highlight = false;
@@ -657,9 +690,9 @@ mod tests {
     #[test]
     fn format_highlight_crop() {
-        let query_tree = query_tree();
+        let matching_words = matching_words();
 
-        let builder = MatcherBuilder::from_query_tree(&query_tree);
+        let builder = MatcherBuilder::from_matching_words(matching_words);
         let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
 
         let highlight = true;
@@ -724,9 +757,9 @@ mod tests {
     #[test]
     fn smaller_crop_size() {
         //! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
-        let query_tree = query_tree();
+        let matching_words = matching_words();
 
-        let mut builder = MatcherBuilder::from_query_tree(&query_tree);
+        let mut builder = MatcherBuilder::from_matching_words(matching_words);
         let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
 
         let highlight = false;
diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs
index 8804d9151..2b025f269 100644
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@@ -114,7 +114,7 @@ impl<'a> Search<'a> {
     pub fn execute(&self) -> Result<SearchResult> {
         // We create the query tree by splitting the query into tokens.
         let before = Instant::now();
-        let (query_tree, primitive_query) = match self.query.as_ref() {
+        let (query_tree, primitive_query, matching_words) = match self.query.as_ref() {
             Some(query) => {
                 let mut builder = QueryTreeBuilder::new(self.rtxn, self.index);
                 builder.optional_words(self.optional_words);
@@ -132,9 +132,11 @@ impl<'a> Search<'a> {
                 let analyzer = Analyzer::new(config);
                 let result = analyzer.analyze(query);
                 let tokens = result.tokens();
-                builder.build(tokens)?.map_or((None, None), |(qt, pq)| (Some(qt), Some(pq)))
+                builder
+                    .build(tokens)?
+                    .map_or((None, None, None), |(qt, pq, mw)| (Some(qt), Some(pq), Some(mw)))
             }
-            None => (None, None),
+            None => (None, None, None),
         };
 
         debug!("query tree: {:?} took {:.02?}", query_tree, before.elapsed());
@@ -148,11 +150,6 @@ impl<'a> Search<'a> {
 
         debug!("facet candidates: {:?} took {:.02?}", filtered_candidates, before.elapsed());
 
-        let matching_words = match query_tree.as_ref() {
-            Some(query_tree) => MatchingWords::from_query_tree(&query_tree),
-            None => MatchingWords::default(),
-        };
-
         // We check that we are allowed to use the sort criteria, we check
         // that they are declared in the sortable fields.
         if let Some(sort_criteria) = &self.sort_criteria {
@@ -193,13 +190,13 @@ impl<'a> Search<'a> {
         )?;
 
         match self.index.distinct_field(self.rtxn)? {
-            None => self.perform_sort(NoopDistinct, matching_words, criteria),
+            None => self.perform_sort(NoopDistinct, matching_words.unwrap_or_default(), criteria),
             Some(name) => {
                 let field_ids_map = self.index.fields_ids_map(self.rtxn)?;
                 match field_ids_map.id(name) {
                     Some(fid) => {
                         let distinct = FacetDistinct::new(fid, self.index, self.rtxn);
-                        self.perform_sort(distinct, matching_words, criteria)
+                        self.perform_sort(distinct, matching_words.unwrap_or_default(), criteria)
                     }
                     None => Ok(SearchResult::default()),
                 }
diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index 4eccae8ce..a45034a3b 100644
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -8,7 +8,8 @@ use meilisearch_tokenizer::TokenKind;
 use roaring::RoaringBitmap;
 use slice_group_by::GroupBy;
 
-use crate::{Index, Result};
+use crate::search::matches::matching_words::{MatchingWord, PrimitiveWordId};
+use crate::{Index, MatchingWords, Result};
 
 type IsOptionalWord = bool;
 type IsPrefix = bool;
@@ -233,7 +234,10 @@ impl<'a> QueryTreeBuilder<'a> {
     /// - if `authorize_typos` is set to `false` the query tree will be generated
     /// forcing all query words to match documents without any typo
     /// (the criterion `typo` will be ignored)
-    pub fn build(&self, query: TokenStream) -> Result<Option<(Operation, PrimitiveQuery)>> {
+    pub fn build(
+        &self,
+        query: TokenStream,
+    ) -> Result<Option<(Operation, PrimitiveQuery, MatchingWords)>> {
         let stop_words = self.index.stop_words(self.rtxn)?;
         let primitive_query = create_primitive_query(query, stop_words, self.words_limit);
         if !primitive_query.is_empty() {
@@ -243,7 +247,9 @@ impl<'a> QueryTreeBuilder<'a> {
                 self.authorize_typos,
                 &primitive_query,
             )?;
-            Ok(Some((qt, primitive_query)))
+            let matching_words =
+                create_matching_words(self, self.authorize_typos, &primitive_query)?;
+            Ok(Some((qt, primitive_query, matching_words)))
         } else {
             Ok(None)
         }
     }
 }
 
 /// Split the word depending on the frequency of subwords in the database documents.
-fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result<Option<Operation>> {
+fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result<Option<(String, String)>> {
     let chars = word.char_indices().skip(1);
     let mut best = None;
@@ -267,7 +273,7 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result<Option<
         let mut children = synonyms(ctx, &[&word])?.unwrap_or_default();
 
-        if let Some(child) = split_best_frequency(ctx, &word)? {
-            children.push(child);
+        if let Some((left, right)) = split_best_frequency(ctx, &word)? {
+            children.push(Operation::Phrase(vec![left, right]));
         }
         let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
         let exact_words = ctx.exact_words()?;
@@ -464,6 +470,154 @@ fn create_query_tree(
     }
 }
 
+/// Main function that creates the matching words used for crop and highlight.
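+/// Each returned entry pairs the derived `MatchingWord`s with the ids of the
+/// primitive query words they cover, so one derived word can count as several
+/// matched query words. Hypothetical illustration (not taken from the tests):
+/// for the query `new york`, the entries are roughly `("new", [0])`,
+/// `("york", [1])` and the 2-gram `("newyork", [0, 1])`, so a document
+/// containing `newyork` counts as matching two words of the query.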
+fn create_matching_words(
+    ctx: &impl Context,
+    authorize_typos: bool,
+    query: &[PrimitiveQueryPart],
+) -> Result<MatchingWords> {
+    /// Matches on the `PrimitiveQueryPart` and creates matching words from it.
+    fn resolve_primitive_part(
+        ctx: &impl Context,
+        authorize_typos: bool,
+        part: PrimitiveQueryPart,
+        matching_words: &mut Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>,
+        id: PrimitiveWordId,
+    ) -> Result<()> {
+        match part {
+            // 1. try to split the word in 2
+            // 2. try to fetch synonyms
+            PrimitiveQueryPart::Word(word, prefix) => {
+                if let Some(synonyms) = ctx.synonyms(&[word.as_str()])? {
+                    for synonym in synonyms {
+                        let synonym = synonym
+                            .into_iter()
+                            .map(|syn| MatchingWord::new(syn.to_string(), 0, false))
+                            .collect();
+                        matching_words.push((synonym, vec![id]));
+                    }
+                }
+
+                if let Some((left, right)) = split_best_frequency(ctx, &word)? {
+                    let left = MatchingWord::new(left, 0, false);
+                    let right = MatchingWord::new(right, 0, false);
+                    matching_words.push((vec![left, right], vec![id]));
+                }
+
+                let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
+                let exact_words = ctx.exact_words()?;
+                let config =
+                    TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words };
+
+                let matching_word = match typos(word, authorize_typos, config) {
+                    QueryKind::Exact { word, .. } => MatchingWord::new(word, 0, prefix),
+                    QueryKind::Tolerant { typo, word } => MatchingWord::new(word, typo, prefix),
+                };
+                matching_words.push((vec![matching_word], vec![id]));
+            }
+            // create CONSECUTIVE matching words wrapping all the words in the phrase
+            PrimitiveQueryPart::Phrase(words) => {
+                let ids: Vec<_> =
+                    (0..words.len()).into_iter().map(|i| id + i as PrimitiveWordId).collect();
+                let words =
+                    words.into_iter().map(|w| MatchingWord::new(w.to_string(), 0, false)).collect();
+                matching_words.push((words, ids));
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Create all ngrams 1..=3 generating query tree branches.
+    fn ngrams(
+        ctx: &impl Context,
+        authorize_typos: bool,
+        query: &[PrimitiveQueryPart],
+        matching_words: &mut Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>,
+        mut id: PrimitiveWordId,
+    ) -> Result<()> {
+        const MAX_NGRAM: usize = 3;
+
+        for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase())) {
+            for ngram in 1..=MAX_NGRAM.min(sub_query.len()) {
+                if let Some(group) = sub_query.get(..ngram) {
+                    let tail = &sub_query[ngram..];
+                    let is_last = tail.is_empty();
+
+                    match group {
+                        [part] => {
+                            resolve_primitive_part(
+                                ctx,
+                                authorize_typos,
+                                part.clone(),
+                                matching_words,
+                                id,
+                            )?;
+                        }
+                        words => {
+                            let is_prefix = words.last().map_or(false, |part| part.is_prefix());
+                            let words: Vec<_> = words
+                                .iter()
+                                .filter_map(|part| {
+                                    if let PrimitiveQueryPart::Word(word, _) = part {
+                                        Some(word.as_str())
+                                    } else {
+                                        None
+                                    }
+                                })
+                                .collect();
+                            let ids: Vec<_> = (0..words.len())
+                                .into_iter()
+                                .map(|i| id + i as PrimitiveWordId)
+                                .collect();
+
+                            if let Some(synonyms) = ctx.synonyms(&words)? {
+                                for synonym in synonyms {
+                                    let synonym = synonym
+                                        .into_iter()
+                                        .map(|syn| MatchingWord::new(syn.to_string(), 0, false))
+                                        .collect();
+                                    matching_words.push((synonym, ids.clone()));
+                                }
+                            }
+                            let word = words.concat();
+                            let (word_len_one_typo, word_len_two_typo) =
+                                ctx.min_word_len_for_typo()?;
+                            let exact_words = ctx.exact_words()?;
+                            let config = TypoConfig {
+                                max_typos: 1,
+                                word_len_one_typo,
+                                word_len_two_typo,
+                                exact_words,
+                            };
+                            let matching_word = match typos(word, authorize_typos, config) {
+                                QueryKind::Exact { word, .. } => {
+                                    MatchingWord::new(word, 0, is_prefix)
+                                }
+                                QueryKind::Tolerant { typo, word } => {
+                                    MatchingWord::new(word, typo, is_prefix)
+                                }
+                            };
+                            matching_words.push((vec![matching_word], ids));
+                        }
+                    }
+
+                    if !is_last {
+                        ngrams(ctx, authorize_typos, tail, matching_words, id + 1)?;
+                    }
+                }
+            }
+            id += sub_query.iter().map(|x| x.len() as PrimitiveWordId).sum::<PrimitiveWordId>();
+        }
+
+        Ok(())
+    }
+
+    let mut matching_words = Vec::new();
+    ngrams(ctx, authorize_typos, query, &mut matching_words, 0)?;
+    Ok(MatchingWords::new(matching_words))
+}
+
 pub type PrimitiveQuery = Vec<PrimitiveQueryPart>;
 
 #[derive(Debug, Clone)]
@@ -480,6 +634,13 @@ impl PrimitiveQueryPart {
     fn is_prefix(&self) -> bool {
         matches!(self, Self::Word(_, is_prefix) if *is_prefix)
     }
+
+    fn len(&self) -> usize {
+        match self {
+            Self::Phrase(words) => words.len(),
+            Self::Word(_, _) => 1,
+        }
+    }
 }
 
 /// Create primitive query from tokenized query string,