From d96e72e5dc2c01305d41fd3cb927ff77696f698f Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 22 Mar 2022 15:22:14 +0100 Subject: [PATCH 01/21] Create formater with some tests --- .../search/{ => matches}/matching_words.rs | 48 +- milli/src/search/matches/mod.rs | 434 ++++++++++++++++++ milli/src/search/mod.rs | 4 +- 3 files changed, 469 insertions(+), 17 deletions(-) rename milli/src/search/{ => matches}/matching_words.rs (89%) create mode 100644 milli/src/search/matches/mod.rs diff --git a/milli/src/search/matching_words.rs b/milli/src/search/matches/matching_words.rs similarity index 89% rename from milli/src/search/matching_words.rs rename to milli/src/search/matches/matching_words.rs index 67bdefb37..48f6fe809 100644 --- a/milli/src/search/matching_words.rs +++ b/milli/src/search/matches/matching_words.rs @@ -1,11 +1,11 @@ use std::cmp::{min, Reverse}; -use std::collections::{BTreeMap, HashSet}; +use std::collections::{BTreeMap, HashMap}; use std::ops::{Index, IndexMut}; use levenshtein_automata::{Distance, DFA}; use meilisearch_tokenizer::Token; -use super::build_dfa; +use crate::search::build_dfa; use crate::search::query_tree::{Operation, Query}; type IsPrefix = bool; @@ -14,7 +14,7 @@ type IsPrefix = bool; /// referencing words that match the given query tree. #[derive(Default)] pub struct MatchingWords { - dfas: Vec<(DFA, String, u8, IsPrefix)>, + dfas: Vec<(DFA, String, u8, IsPrefix, usize)>, } impl MatchingWords { @@ -23,11 +23,11 @@ impl MatchingWords { let mut dfas: Vec<_> = fetch_queries(tree) .into_iter() // create DFAs for each word - .map(|(w, t, p)| (build_dfa(w, t, p), w.to_string(), t, p)) + .map(|((w, t, p), id)| (build_dfa(w, t, p), w.to_string(), t, p, id)) .collect(); // Sort word by len in DESC order prioritizing the longuest word, // in order to highlight the longuest part of the matched word. - dfas.sort_unstable_by_key(|(_dfa, query_word, _typo, _is_prefix)| { + dfas.sort_unstable_by_key(|(_dfa, query_word, _typo, _is_prefix, _id)| { Reverse(query_word.len()) }); Self { dfas } @@ -35,14 +35,21 @@ impl MatchingWords { /// Returns the number of matching bytes if the word matches one of the query words. pub fn matching_bytes(&self, word_to_highlight: &Token) -> Option { - self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix)| { + self.matching_bytes_with_id(word_to_highlight).map(|(len, _)| len) + } + + pub fn matching_bytes_with_id(&self, word_to_highlight: &Token) -> Option<(usize, usize)> { + self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix, id)| { match dfa.eval(word_to_highlight.text()) { Distance::Exact(t) if t <= *typo => { if *is_prefix { let len = bytes_to_highlight(word_to_highlight.text(), query_word); - Some(word_to_highlight.num_chars_from_bytes(len)) + Some((word_to_highlight.num_chars_from_bytes(len), *id)) } else { - Some(word_to_highlight.num_chars_from_bytes(word_to_highlight.text().len())) + Some(( + word_to_highlight.num_chars_from_bytes(word_to_highlight.text().len()), + *id, + )) } } _otherwise => None, @@ -52,26 +59,37 @@ impl MatchingWords { } /// Lists all words which can be considered as a match for the query tree. -fn fetch_queries(tree: &Operation) -> HashSet<(&str, u8, IsPrefix)> { - fn resolve_ops<'a>(tree: &'a Operation, out: &mut HashSet<(&'a str, u8, IsPrefix)>) { +fn fetch_queries(tree: &Operation) -> HashMap<(&str, u8, IsPrefix), usize> { + fn resolve_ops<'a>( + tree: &'a Operation, + out: &mut HashMap<(&'a str, u8, IsPrefix), usize>, + id: &mut usize, + ) { match tree { Operation::Or(_, ops) | Operation::And(ops) => { - ops.as_slice().iter().for_each(|op| resolve_ops(op, out)); + ops.as_slice().iter().for_each(|op| resolve_ops(op, out, id)); } Operation::Query(Query { prefix, kind }) => { let typo = if kind.is_exact() { 0 } else { kind.typo() }; - out.insert((kind.word(), typo, *prefix)); + out.entry((kind.word(), typo, *prefix)).or_insert_with(|| { + *id += 1; + *id + }); } Operation::Phrase(words) => { for word in words { - out.insert((word, 0, false)); + out.entry((word, 0, false)).or_insert_with(|| { + *id += 1; + *id + }); } } } } - let mut queries = HashSet::new(); - resolve_ops(tree, &mut queries); + let mut queries = HashMap::new(); + let mut id = 0; + resolve_ops(tree, &mut queries, &mut id); queries } diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs new file mode 100644 index 000000000..0ebf6305f --- /dev/null +++ b/milli/src/search/matches/mod.rs @@ -0,0 +1,434 @@ +use std::borrow::Cow; + +use matching_words::MatchingWords; +use meilisearch_tokenizer::token::SeparatorKind; +use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token}; + +use crate::search::query_tree::Operation; + +pub mod matching_words; + +const DEFAULT_CROP_SIZE: usize = 10; +const DEFAULT_CROP_MARKER: &'static str = "…"; +const DEFAULT_HIGHLIGHT_PREFIX: &'static str = ""; +const DEFAULT_HIGHLIGHT_SUFFIX: &'static str = ""; + +pub struct MatcherBuilder { + matching_words: MatchingWords, + crop_size: usize, + crop_marker: Option, + highlight_prefix: Option, + highlight_suffix: Option, +} + +impl MatcherBuilder { + pub fn from_query_tree(query_tree: &Operation) -> Self { + let matching_words = MatchingWords::from_query_tree(query_tree); + + Self { + matching_words, + crop_size: DEFAULT_CROP_SIZE, + crop_marker: None, + highlight_prefix: None, + highlight_suffix: None, + } + } + + pub fn crop_size(&mut self, word_count: usize) -> &Self { + self.crop_size = word_count; + self + } + + pub fn crop_marker(&mut self, marker: String) -> &Self { + self.crop_marker = Some(marker); + self + } + + pub fn highlight_prefix(&mut self, prefix: String) -> &Self { + self.highlight_prefix = Some(prefix); + self + } + + pub fn highlight_suffix(&mut self, suffix: String) -> &Self { + self.highlight_suffix = Some(suffix); + self + } + + pub fn build<'t, 'm>(&'m self, tokens: &'t [Token], text: &'t str) -> Matcher<'t, 'm> { + let crop_marker = match &self.crop_marker { + Some(marker) => marker.as_str(), + None => &DEFAULT_CROP_MARKER, + }; + + let highlight_prefix = match &self.highlight_prefix { + Some(marker) => marker.as_str(), + None => &DEFAULT_HIGHLIGHT_PREFIX, + }; + let highlight_suffix = match &self.highlight_suffix { + Some(marker) => marker.as_str(), + None => &DEFAULT_HIGHLIGHT_SUFFIX, + }; + Matcher { + text, + tokens, + matching_words: &self.matching_words, + crop_size: self.crop_size, + crop_marker, + highlight_prefix, + highlight_suffix, + matches: None, + } + } +} + +// impl Default for MatcherBuilder { +// fn default() -> Self { +// Self { +// crop_size: DEFAULT_CROP_SIZE, +// crop_marker: None, +// highlight_prefix: None, +// highlight_suffix: None, +// } +// } +// } + +pub struct Match<'t> { + token: &'t Token<'t>, + match_len: usize, + // id of the query word that matches. + id: usize, + // position of the word in the whole text. + position: usize, +} + +pub struct MatchBounds { + start: usize, + length: usize, +} + +impl<'t> From<&Match<'t>> for MatchBounds { + fn from(m: &Match) -> Self { + MatchBounds { start: m.token.byte_start, length: m.match_len } + } +} + +pub struct Matcher<'t, 'm> { + text: &'t str, + tokens: &'t [Token<'t>], + matching_words: &'m MatchingWords, + crop_size: usize, + crop_marker: &'m str, + highlight_prefix: &'m str, + highlight_suffix: &'m str, + matches: Option>>, +} + +impl<'t> Matcher<'t, '_> { + fn compute_matches(&mut self) -> &mut Self { + let mut matches = Vec::new(); + let mut position = 0; + for token in self.tokens { + match token.is_separator() { + Some(SeparatorKind::Hard) => position += 7, + None => { + if let Some((match_len, id)) = + self.matching_words.matching_bytes_with_id(&token) + { + matches.push(Match { token, match_len, id, position }); + } + position += 1; + } + _otherwise => {} + } + } + + self.matches = Some(matches); + self + } + + pub fn matches(&mut self) -> Vec { + match &self.matches { + None => self.compute_matches().matches(), + Some(matches) => matches.iter().map(MatchBounds::from).collect(), + } + } + + fn crop_bounds(&self, matches: &[Match<'t>]) -> (usize, usize) { + let byte_end = self + .tokens + .iter() + .filter(|t| t.is_separator().is_none()) + .enumerate() + .take_while(|(i, _)| *i < self.crop_size) + .last() + .map_or(self.text.len(), |(_, t)| t.byte_end); + + (0, byte_end) + } + + pub fn format(&mut self, highlight: bool, crop: bool) -> Cow<'t, str> { + if !highlight && !crop { + // compute matches is not needed if no highlight or crop is requested. + Cow::Borrowed(self.text) + } else { + match &self.matches { + Some(matches) => { + let (byte_start, byte_end) = + if crop { self.crop_bounds(matches) } else { (0, self.text.len()) }; + + let mut formatted = Vec::new(); + + // push crop marker if it's not the start of the text. + if byte_start > 0 && !self.crop_marker.is_empty() { + formatted.push(self.crop_marker); + } + + let mut byte_index = byte_start; + + if highlight { + // insert highlight markers around matches. + for m in matches + .iter() + .skip_while(|m| m.token.byte_start < byte_start) + .take_while(|m| m.token.byte_start < byte_end) + { + if byte_index < m.token.byte_start { + formatted.push(&self.text[byte_index..m.token.byte_start]); + } + + formatted.push(self.highlight_prefix); + formatted.push(&self.text[m.token.byte_start..m.token.byte_end]); + formatted.push(self.highlight_suffix); + + byte_index = m.token.byte_end; + } + } + + // push the rest of the text between last match and the end of crop. + if byte_index < byte_end { + formatted.push(&self.text[byte_index..byte_end]); + } + + // push crop marker if it's not the end of the text. + if byte_end < self.text.len() && !self.crop_marker.is_empty() { + formatted.push(self.crop_marker); + } + + if formatted.len() == 1 { + // avoid concatenating if there is already 1 slice. + Cow::Borrowed(&self.text[byte_start..byte_end]) + } else { + Cow::Owned(formatted.concat()) + } + } + None => self.compute_matches().format(highlight, crop), + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::search::query_tree::{Query, QueryKind}; + + fn query_tree() -> Operation { + Operation::Or( + false, + vec![Operation::And(vec![ + Operation::Query(Query { + prefix: true, + kind: QueryKind::exact("split".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("the".to_string()), + }), + Operation::Query(Query { + prefix: true, + kind: QueryKind::tolerant(1, "world".to_string()), + }), + ])], + ) + } + + #[test] + fn format_identity() { + let query_tree = query_tree(); + + let builder = MatcherBuilder::from_query_tree(&query_tree); + let analyzer = Analyzer::new(AnalyzerConfig::>::default()); + + let highlight = false; + let crop = false; + + // Text without any match. + let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no crop and no highlight should return complete text. + assert_eq!(&matcher.format(highlight, crop), &text); + + // Text containing all matches. + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no crop and no highlight should return complete text. + assert_eq!(&matcher.format(highlight, crop), &text); + + // Text containing some matches. + let text = "Natalie risk her future to build a world with the boy she loves."; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no crop and no highlight should return complete text. + assert_eq!(&matcher.format(highlight, crop), &text); + } + + #[test] + fn format_highlight() { + let query_tree = query_tree(); + + let builder = MatcherBuilder::from_query_tree(&query_tree); + let analyzer = Analyzer::new(AnalyzerConfig::>::default()); + + let highlight = true; + let crop = false; + + // Text without any match. + let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no crop should return complete text, because there is no matches. + assert_eq!(&matcher.format(highlight, crop), &text); + + // Text containing all matches. + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no crop should return complete text with highlighted matches. + assert_eq!(&matcher.format(highlight, crop), "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World"); + + // Text containing some matches. + let text = "Natalie risk her future to build a world with the boy she loves."; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no crop should return complete text with highlighted matches. + assert_eq!( + &matcher.format(highlight, crop), + "Natalie risk her future to build a world with the boy she loves." + ); + } + + #[test] + fn format_crop() { + let query_tree = query_tree(); + + let builder = MatcherBuilder::from_query_tree(&query_tree); + let analyzer = Analyzer::new(AnalyzerConfig::>::default()); + + let highlight = false; + let crop = true; + + // Text without any match. + let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no highlight should return 10 first words with a marker at the end. + assert_eq!( + &matcher.format(highlight, crop), + "A quick brown fox can not jump 32 feet, right…" + ); + + // Text containing all matches. + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no highlight should return 10 last words with a marker at the start. + assert_eq!( + &matcher.format(highlight, crop), + "…she loves. Emily Henry: The Love That Split The World" + ); + + // Text containing some matches. + let text = "Natalie risk her future to build a world with the boy she loves."; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no highlight should return 10 last words with a marker at the start. + assert_eq!( + &matcher.format(highlight, crop), + "…future to build a world with the boy she loves." + ); + + // Text containing a match unordered and a match ordered. + let text = "The world split void void void void void void void void void split the world void void"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // crop should return 10 last words with a marker at the start. + assert_eq!( + &matcher.format(highlight, crop), + "…void void void void void split the world void void" + ); + } + + #[test] + fn format_highlight_crop() { + let query_tree = query_tree(); + + let builder = MatcherBuilder::from_query_tree(&query_tree); + let analyzer = Analyzer::new(AnalyzerConfig::>::default()); + + let highlight = true; + let crop = true; + + // Text without any match. + let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // both should return 10 first words with a marker at the end. + assert_eq!( + &matcher.format(highlight, crop), + "A quick brown fox can not jump 32 feet, right…" + ); + + // Text containing all matches. + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // both should return 10 last words with a marker at the start and highlighted matches. + assert_eq!(&matcher.format(highlight, crop), "…she loves. Emily Henry: The Love That Split The World"); + + // Text containing some matches. + let text = "Natalie risk her future to build a world with the boy she loves."; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // both should return 10 last words with a marker at the start and highlighted matches. + assert_eq!( + &matcher.format(highlight, crop), + "…future to build a world with the boy she loves." + ); + + // Text containing a match unordered and a match ordered. + let text = "The world split void void void void void void void void void split the world void void"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // crop should return 10 last words with a marker at the start. + assert_eq!( + &matcher.format(highlight, crop), + "…void void void void void split the world void void" + ); + } +} diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 0d33d9042..a80e520a0 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -17,7 +17,7 @@ use roaring::bitmap::RoaringBitmap; pub use self::facet::{FacetDistribution, FacetNumberIter, Filter}; use self::fst_utils::{Complement, Intersection, StartsWith, Union}; -pub use self::matching_words::MatchingWords; +pub use self::matches::matching_words::MatchingWords; use self::query_tree::QueryTreeBuilder; use crate::error::UserError; use crate::search::criteria::r#final::{Final, FinalResult}; @@ -32,7 +32,7 @@ mod criteria; mod distinct; mod facet; mod fst_utils; -mod matching_words; +mod matches; mod query_tree; pub struct Search<'a> { From 3be179080321308baa4c4e82f741fefcb61e5869 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 28 Mar 2022 15:57:05 +0200 Subject: [PATCH 02/21] Add crop algorithm with naive match algorithm --- milli/src/search/matches/mod.rs | 199 +++++++++++++++++++++++--------- 1 file changed, 144 insertions(+), 55 deletions(-) diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index 0ebf6305f..fb3ab9c37 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -92,13 +92,15 @@ impl MatcherBuilder { // } // } -pub struct Match<'t> { - token: &'t Token<'t>, +#[derive(Clone)] +pub struct Match { match_len: usize, // id of the query word that matches. id: usize, // position of the word in the whole text. - position: usize, + word_position: usize, + // position of the token in the whole text. + token_position: usize, } pub struct MatchBounds { @@ -106,12 +108,6 @@ pub struct MatchBounds { length: usize, } -impl<'t> From<&Match<'t>> for MatchBounds { - fn from(m: &Match) -> Self { - MatchBounds { start: m.token.byte_start, length: m.match_len } - } -} - pub struct Matcher<'t, 'm> { text: &'t str, tokens: &'t [Token<'t>], @@ -120,26 +116,22 @@ pub struct Matcher<'t, 'm> { crop_marker: &'m str, highlight_prefix: &'m str, highlight_suffix: &'m str, - matches: Option>>, + matches: Option>, } impl<'t> Matcher<'t, '_> { fn compute_matches(&mut self) -> &mut Self { let mut matches = Vec::new(); - let mut position = 0; + let mut word_position = 0; + let mut token_position = 0; for token in self.tokens { - match token.is_separator() { - Some(SeparatorKind::Hard) => position += 7, - None => { - if let Some((match_len, id)) = - self.matching_words.matching_bytes_with_id(&token) - { - matches.push(Match { token, match_len, id, position }); - } - position += 1; + if token.is_separator().is_none() { + if let Some((match_len, id)) = self.matching_words.matching_bytes_with_id(&token) { + matches.push(Match { match_len, id, word_position, token_position }); } - _otherwise => {} + word_position += 1; } + token_position += 1; } self.matches = Some(matches); @@ -149,21 +141,104 @@ impl<'t> Matcher<'t, '_> { pub fn matches(&mut self) -> Vec { match &self.matches { None => self.compute_matches().matches(), - Some(matches) => matches.iter().map(MatchBounds::from).collect(), + Some(matches) => matches + .iter() + .map(|m| MatchBounds { + start: self.tokens[m.token_position].byte_start, + length: m.match_len, + }) + .collect(), } } - fn crop_bounds(&self, matches: &[Match<'t>]) -> (usize, usize) { - let byte_end = self - .tokens - .iter() - .filter(|t| t.is_separator().is_none()) - .enumerate() - .take_while(|(i, _)| *i < self.crop_size) - .last() - .map_or(self.text.len(), |(_, t)| t.byte_end); + fn crop_around(&self, matches: &[Match]) -> (usize, usize) { + let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0); + let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0); + let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0); + let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0); - (0, byte_end) + // TODO: buggy if no match and fisrt token is a sepparator + let mut remaining_words = + self.crop_size + first_match_word_position - last_match_word_position - 1; + let mut first_token_position = first_match_token_position; + let mut last_token_position = last_match_token_position; + + while remaining_words > 0 { + match ( + first_token_position.checked_sub(1).and_then(|i| self.tokens.get(i)), + last_token_position.checked_add(1).and_then(|i| self.tokens.get(i)), + ) { + (Some(ft), Some(lt)) => { + match (ft.is_separator(), lt.is_separator()) { + // if they are both separators and are the same kind then advance both + (Some(f_kind), Some(s_kind)) => { + if f_kind == s_kind { + first_token_position -= 1; + last_token_position += 1; + } else if f_kind == SeparatorKind::Hard { + last_token_position += 1; + } else { + first_token_position -= 1; + } + } + // left is a word, advance left + (None, Some(_)) => { + first_token_position -= 1; + remaining_words -= 1; + } + // right is a word, advance right + (Some(_), None) => { + last_token_position += 1; + remaining_words -= 1; + } + // both are words, advance left then right if remaining_word > 0 + (None, None) => { + first_token_position -= 1; + remaining_words -= 1; + + if remaining_words > 0 { + last_token_position += 1; + remaining_words -= 1; + } + } + } + } + (Some(ft), None) => { + first_token_position -= 1; + if ft.is_separator().is_none() { + remaining_words -= 1; + } + } + (None, Some(lt)) => { + last_token_position += 1; + if lt.is_separator().is_none() { + remaining_words -= 1; + } + } + (None, None) => break, + } + } + + // if tokens after the end of the window are separators, + // then add them to the window in order to keep context in cropped text. + while let Some(_separator_kind) = last_token_position + .checked_add(1) + .and_then(|i| self.tokens.get(i)) + .and_then(|t| t.is_separator()) + { + last_token_position += 1; + } + + (self.tokens[first_token_position].byte_start, self.tokens[last_token_position].byte_end) + } + + fn crop_bounds(&self, matches: &[Match]) -> (usize, usize) { + match matches { + // at least 2 matches + [first, last, ..] => self.crop_around(&[first.clone()][..]), + // less than 2 matches + _ => self.crop_around(matches), + } } pub fn format(&mut self, highlight: bool, crop: bool) -> Cow<'t, str> { @@ -187,20 +262,23 @@ impl<'t> Matcher<'t, '_> { if highlight { // insert highlight markers around matches. + let tokens = self.tokens; for m in matches .iter() - .skip_while(|m| m.token.byte_start < byte_start) - .take_while(|m| m.token.byte_start < byte_end) + .skip_while(|m| tokens[m.token_position].byte_start < byte_start) + .take_while(|m| tokens[m.token_position].byte_start < byte_end) { - if byte_index < m.token.byte_start { - formatted.push(&self.text[byte_index..m.token.byte_start]); + let token = &tokens[m.token_position]; + + if byte_index < token.byte_start { + formatted.push(&self.text[byte_index..token.byte_start]); } formatted.push(self.highlight_prefix); - formatted.push(&self.text[m.token.byte_start..m.token.byte_end]); + formatted.push(&self.text[token.byte_start..token.byte_end]); formatted.push(self.highlight_suffix); - byte_index = m.token.byte_end; + byte_index = token.byte_end; } } @@ -271,7 +349,7 @@ mod tests { assert_eq!(&matcher.format(highlight, crop), &text); // Text containing all matches. - let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World"; + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); @@ -306,12 +384,12 @@ mod tests { assert_eq!(&matcher.format(highlight, crop), &text); // Text containing all matches. - let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World"; + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. - assert_eq!(&matcher.format(highlight, crop), "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World"); + assert_eq!(&matcher.format(highlight, crop), "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."); // Text containing some matches. let text = "Natalie risk her future to build a world with the boy she loves."; @@ -343,18 +421,18 @@ mod tests { // no highlight should return 10 first words with a marker at the end. assert_eq!( &matcher.format(highlight, crop), - "A quick brown fox can not jump 32 feet, right…" + "A quick brown fox can not jump 32 feet, right? …" ); - // Text containing all matches. - let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World"; + // Test phrase propagation + let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it."; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); - // no highlight should return 10 last words with a marker at the start. + // should crop the phrase instead of croping around the match. assert_eq!( &matcher.format(highlight, crop), - "…she loves. Emily Henry: The Love That Split The World" + "…Split The World is a book written by Emily Henry. …" ); // Text containing some matches. @@ -368,6 +446,17 @@ mod tests { "…future to build a world with the boy she loves." ); + // Text containing all matches. + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no highlight should return 10 last words with a marker at the start. + assert_eq!( + &matcher.format(highlight, crop), + "…she loves. Emily Henry: The Love That Split The World." + ); + // Text containing a match unordered and a match ordered. let text = "The world split void void void void void void void void void split the world void void"; let analyzed = analyzer.analyze(&text); @@ -398,17 +487,9 @@ mod tests { // both should return 10 first words with a marker at the end. assert_eq!( &matcher.format(highlight, crop), - "A quick brown fox can not jump 32 feet, right…" + "A quick brown fox can not jump 32 feet, right? …" ); - // Text containing all matches. - let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World"; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); - let mut matcher = builder.build(&tokens[..], text); - // both should return 10 last words with a marker at the start and highlighted matches. - assert_eq!(&matcher.format(highlight, crop), "…she loves. Emily Henry: The Love That Split The World"); - // Text containing some matches. let text = "Natalie risk her future to build a world with the boy she loves."; let analyzed = analyzer.analyze(&text); @@ -420,6 +501,14 @@ mod tests { "…future to build a world with the boy she loves." ); + // Text containing all matches. + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // both should return 10 last words with a marker at the start and highlighted matches. + assert_eq!(&matcher.format(highlight, crop), "…she loves. Emily Henry: The Love That Split The World."); + // Text containing a match unordered and a match ordered. let text = "The world split void void void void void void void void void split the world void void"; let analyzed = analyzer.analyze(&text); From 844f546a8b3713df444dce8a0c016e0827be0067 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 28 Mar 2022 18:17:50 +0200 Subject: [PATCH 03/21] Add matches algorithm V1 --- milli/src/search/matches/mod.rs | 109 +++++++++++++++++++++++++++++--- 1 file changed, 100 insertions(+), 9 deletions(-) diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index fb3ab9c37..9266992d0 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -92,7 +92,7 @@ impl MatcherBuilder { // } // } -#[derive(Clone)] +#[derive(Clone, Debug)] pub struct Match { match_len: usize, // id of the query word that matches. @@ -103,6 +103,7 @@ pub struct Match { token_position: usize, } +#[derive(Clone, Debug)] pub struct MatchBounds { start: usize, length: usize, @@ -151,7 +152,7 @@ impl<'t> Matcher<'t, '_> { } } - fn crop_around(&self, matches: &[Match]) -> (usize, usize) { + fn token_crop_bounds(&self, matches: &[Match]) -> (usize, usize) { let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0); let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0); let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0); @@ -229,16 +230,84 @@ impl<'t> Matcher<'t, '_> { last_token_position += 1; } - (self.tokens[first_token_position].byte_start, self.tokens[last_token_position].byte_end) + (first_token_position, last_token_position) + } + + fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) { + let mut ids = Vec::with_capacity(matches.len()); + let mut order_score = 0; + let mut distance_score = 0; + + let mut iter = matches.iter().peekable(); + while let Some(m) = iter.next() { + if let Some(next_match) = iter.peek() { + // if matches are ordered + if next_match.id > m.id { + order_score += 1; + } + + // compute distance between matches + distance_score -= (next_match.word_position - m.word_position).min(7) as i16; + } + + ids.push(m.id); + } + + ids.sort_unstable(); + ids.dedup(); + let uniq_score = ids.len() as i16; + + // rank by unique match count, then by distance between matches, then by ordered match count. + (uniq_score, distance_score, order_score) + } + + fn find_best_match_interval<'a>(&self, matches: &'a [Match]) -> &'a [Match] { + if matches.len() > 1 { + let mut best_interval = (0, 1); + let mut best_interval_score = self.match_interval_score(&matches[0..=1]); + let mut interval_first = 0; + let mut interval_last = 1; + for (index, next_match) in matches.iter().enumerate().skip(2) { + // if next match would make interval gross more than crop_size + if next_match.word_position - matches[interval_first].word_position > self.crop_size + { + let interval_score = + self.match_interval_score(&matches[interval_first..=interval_last]); + + // keep interval if it's the best + if interval_score > best_interval_score { + best_interval = (interval_first, interval_last); + best_interval_score = interval_score; + } + + // advance start of the interval while interval is longer than crop_size + while next_match.word_position - matches[interval_first].word_position + > self.crop_size + { + interval_first += 1; + } + } + interval_last = index; + } + + let interval_score = + self.match_interval_score(&matches[interval_first..=interval_last]); + if interval_score > best_interval_score { + best_interval = (interval_first, interval_last); + } + + &matches[best_interval.0..=best_interval.1] + } else { + matches + } } fn crop_bounds(&self, matches: &[Match]) -> (usize, usize) { - match matches { - // at least 2 matches - [first, last, ..] => self.crop_around(&[first.clone()][..]), - // less than 2 matches - _ => self.crop_around(matches), - } + let match_interval = self.find_best_match_interval(matches); + + let (first_token_position, last_token_position) = self.token_crop_bounds(match_interval); + + (self.tokens[first_token_position].byte_start, self.tokens[last_token_position].byte_end) } pub fn format(&mut self, highlight: bool, crop: bool) -> Cow<'t, str> { @@ -467,6 +536,28 @@ mod tests { &matcher.format(highlight, crop), "…void void void void void split the world void void" ); + + // Text containing matches with diferent density. + let text = "split void the void void world void void void void void void void void void void split the world void void"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // crop should return 10 last words with a marker at the start. + assert_eq!( + &matcher.format(highlight, crop), + "…void void void void void split the world void void" + ); + + // Text containing matches with same word. + let text = "split split split split split split void void void void void void void void void void split the world void void"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // crop should return 10 last words with a marker at the start. + assert_eq!( + &matcher.format(highlight, crop), + "…void void void void void split the world void void" + ); } #[test] From 4428cb5909a8e936ff4d80fa5414fa63c9b8773d Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 29 Mar 2022 14:51:02 +0200 Subject: [PATCH 04/21] Add some tests and fix some corner cases --- milli/src/search/matches/mod.rs | 118 +++++++++++++++++++++++++++++--- 1 file changed, 109 insertions(+), 9 deletions(-) diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index 9266992d0..680dbdffc 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -158,9 +158,13 @@ impl<'t> Matcher<'t, '_> { let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0); let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0); - // TODO: buggy if no match and fisrt token is a sepparator + // TODO: buggy if no match and first token is a sepparator let mut remaining_words = - self.crop_size + first_match_word_position - last_match_word_position - 1; + self.crop_size + first_match_word_position - last_match_word_position; + // if first token is a word, then remove 1 to remaining_words. + if let Some(None) = self.tokens.get(first_match_token_position).map(|t| t.is_separator()) { + remaining_words -= 1; + } let mut first_token_position = first_match_token_position; let mut last_token_position = last_match_token_position; @@ -204,18 +208,21 @@ impl<'t> Matcher<'t, '_> { } } } + // the end of the text is reached, advance left. (Some(ft), None) => { first_token_position -= 1; if ft.is_separator().is_none() { remaining_words -= 1; } } + // the start of the text is reached, advance right. (None, Some(lt)) => { last_token_position += 1; if lt.is_separator().is_none() { remaining_words -= 1; } } + // no more token to add. (None, None) => break, } } @@ -263,13 +270,14 @@ impl<'t> Matcher<'t, '_> { fn find_best_match_interval<'a>(&self, matches: &'a [Match]) -> &'a [Match] { if matches.len() > 1 { - let mut best_interval = (0, 1); - let mut best_interval_score = self.match_interval_score(&matches[0..=1]); + let mut best_interval = (0, 0); + let mut best_interval_score = self.match_interval_score(&matches[0..=0]); let mut interval_first = 0; - let mut interval_last = 1; - for (index, next_match) in matches.iter().enumerate().skip(2) { + let mut interval_last = 0; + for (index, next_match) in matches.iter().enumerate().skip(1) { // if next match would make interval gross more than crop_size - if next_match.word_position - matches[interval_first].word_position > self.crop_size + if next_match.word_position - matches[interval_first].word_position + >= self.crop_size { let interval_score = self.match_interval_score(&matches[interval_first..=interval_last]); @@ -282,7 +290,7 @@ impl<'t> Matcher<'t, '_> { // advance start of the interval while interval is longer than crop_size while next_match.word_position - matches[interval_first].word_position - > self.crop_size + >= self.crop_size { interval_first += 1; } @@ -307,10 +315,15 @@ impl<'t> Matcher<'t, '_> { let (first_token_position, last_token_position) = self.token_crop_bounds(match_interval); - (self.tokens[first_token_position].byte_start, self.tokens[last_token_position].byte_end) + let byte_start = self.tokens.get(first_token_position).map_or(0, |t| t.byte_start); + let byte_end = self.tokens.get(last_token_position).map_or(byte_start, |t| t.byte_end); + (byte_start, byte_end) } pub fn format(&mut self, highlight: bool, crop: bool) -> Cow<'t, str> { + // If 0 it will be considered null and thus not crop the field + // https://github.com/meilisearch/specifications/pull/120#discussion_r836536295 + let crop = crop && self.crop_size > 0; if !highlight && !crop { // compute matches is not needed if no highlight or crop is requested. Cow::Borrowed(self.text) @@ -444,6 +457,20 @@ mod tests { let highlight = true; let crop = false; + // empty text. + let text = ""; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + assert_eq!(&matcher.format(highlight, crop), ""); + + // text containing only separators. + let text = ":-)"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + assert_eq!(&matcher.format(highlight, crop), ":-)"); + // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; let analyzed = analyzer.analyze(&text); @@ -482,6 +509,20 @@ mod tests { let highlight = false; let crop = true; + // empty text. + let text = ""; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + assert_eq!(&matcher.format(highlight, crop), ""); + + // text containing only separators. + let text = ":-)"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + assert_eq!(&matcher.format(highlight, crop), ":-)"); + // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; let analyzed = analyzer.analyze(&text); @@ -493,6 +534,17 @@ mod tests { "A quick brown fox can not jump 32 feet, right? …" ); + // Text without any match starting by a separator. + let text = "(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no highlight should return 10 first words with a marker at the end. + assert_eq!( + &matcher.format(highlight, crop), + "(A quick brown fox can not jump 32 feet, right? …" + ); + // Test phrase propagation let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it."; let analyzed = analyzer.analyze(&text); @@ -570,6 +622,20 @@ mod tests { let highlight = true; let crop = true; + // empty text. + let text = ""; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + assert_eq!(&matcher.format(highlight, crop), ""); + + // text containing only separators. + let text = ":-)"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + assert_eq!(&matcher.format(highlight, crop), ":-)"); + // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; let analyzed = analyzer.analyze(&text); @@ -611,4 +677,38 @@ mod tests { "…void void void void void split the world void void" ); } + + #[test] + fn smaller_crop_size() { + //! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295 + let query_tree = query_tree(); + + let mut builder = MatcherBuilder::from_query_tree(&query_tree); + let analyzer = Analyzer::new(AnalyzerConfig::>::default()); + + let highlight = false; + let crop = true; + + let text = "void void split the world void void."; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + + // set a smaller crop size + builder.crop_size(2); + let mut matcher = builder.build(&tokens[..], text); + // because crop size < query size, partially format matches. + assert_eq!(&matcher.format(highlight, crop), "…split the …"); + + // set a smaller crop size + builder.crop_size(1); + let mut matcher = builder.build(&tokens[..], text); + // because crop size < query size, partially format matches. + assert_eq!(&matcher.format(highlight, crop), "…split …"); + + // set a smaller crop size + builder.crop_size(0); + let mut matcher = builder.build(&tokens[..], text); + // because crop size is 0, crop is ignored. + assert_eq!(&matcher.format(highlight, crop), "void void split the world void void."); + } } From 734d0899d341a23f6bb3e49efc2da7828b46fa63 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 29 Mar 2022 14:57:21 +0200 Subject: [PATCH 05/21] Publish Matcher --- milli/src/search/matches/mod.rs | 4 ++-- milli/src/search/mod.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index 680dbdffc..2169c54ab 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -1,12 +1,12 @@ use std::borrow::Cow; -use matching_words::MatchingWords; +pub use matching_words::MatchingWords; use meilisearch_tokenizer::token::SeparatorKind; use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token}; use crate::search::query_tree::Operation; -pub mod matching_words; +mod matching_words; const DEFAULT_CROP_SIZE: usize = 10; const DEFAULT_CROP_MARKER: &'static str = "…"; diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index a80e520a0..752ae236b 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -17,7 +17,7 @@ use roaring::bitmap::RoaringBitmap; pub use self::facet::{FacetDistribution, FacetNumberIter, Filter}; use self::fst_utils::{Complement, Intersection, StartsWith, Union}; -pub use self::matches::matching_words::MatchingWords; +pub use self::matches::{Matcher, MatcherBuilder, MatchingWords}; use self::query_tree::QueryTreeBuilder; use crate::error::UserError; use crate::search::criteria::r#final::{Final, FinalResult}; From 29c5f76d7f389b3bfb02f00b88aa1b98318eecce Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 30 Mar 2022 10:50:23 +0200 Subject: [PATCH 06/21] Use new matcher in http-ui --- http-ui/src/main.rs | 41 +++++++++++---------------------- milli/src/lib.rs | 4 +++- milli/src/search/matches/mod.rs | 10 ++++++++ 3 files changed, 26 insertions(+), 29 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 26c1034eb..fdfc04af9 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -25,7 +25,7 @@ use milli::update::{ ClearDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting, }; use milli::{ - obkv_to_json, CompressionType, Filter as MilliFilter, FilterCondition, Index, MatchingWords, + obkv_to_json, CompressionType, Filter as MilliFilter, FilterCondition, Index, MatcherBuilder, SearchResult, SortError, }; use once_cell::sync::OnceCell; @@ -152,43 +152,25 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { Self { analyzer } } - fn highlight_value(&self, value: Value, matching_words: &MatchingWords) -> Value { + fn highlight_value(&self, value: Value, matcher_builder: &MatcherBuilder) -> Value { match value { Value::Null => Value::Null, Value::Bool(boolean) => Value::Bool(boolean), Value::Number(number) => Value::Number(number), Value::String(old_string) => { - let mut string = String::new(); let analyzed = self.analyzer.analyze(&old_string); - for (word, token) in analyzed.reconstruct() { - if token.is_word() { - match matching_words.matching_bytes(&token) { - Some(chars_to_highlight) => { - let mut chars = word.chars(); + let analyzed: Vec<_> = analyzed.tokens().collect(); + let mut matcher = matcher_builder.build(&analyzed[..], &old_string); - string.push_str(""); - // push the part to highlight - string.extend(chars.by_ref().take(chars_to_highlight)); - string.push_str(""); - // push the suffix after highlight - string.extend(chars); - } - // no highlight - None => string.push_str(word), - } - } else { - string.push_str(word); - } - } - Value::String(string) + Value::String(matcher.format(true, true).to_string()) } Value::Array(values) => Value::Array( - values.into_iter().map(|v| self.highlight_value(v, matching_words)).collect(), + values.into_iter().map(|v| self.highlight_value(v, matcher_builder)).collect(), ), Value::Object(object) => Value::Object( object .into_iter() - .map(|(k, v)| (k, self.highlight_value(v, matching_words))) + .map(|(k, v)| (k, self.highlight_value(v, matcher_builder))) .collect(), ), } @@ -197,14 +179,14 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { fn highlight_record( &self, object: &mut Map, - matching_words: &MatchingWords, + matcher_builder: &MatcherBuilder, attributes_to_highlight: &HashSet, ) { // TODO do we need to create a string for element that are not and needs to be highlight? for (key, value) in object.iter_mut() { if attributes_to_highlight.contains(key) { let old_value = mem::take(value); - *value = self.highlight_value(old_value, matching_words); + *value = self.highlight_value(old_value, matcher_builder); } } } @@ -819,12 +801,15 @@ async fn main() -> anyhow::Result<()> { let stop_words = fst::Set::default(); let highlighter = Highlighter::new(&stop_words); + let mut matcher_builder = MatcherBuilder::from_matching_words(matching_words); + matcher_builder.highlight_prefix("".to_string()); + matcher_builder.highlight_suffix("".to_string()); for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() { let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap(); if !disable_highlighting { highlighter.highlight_record( &mut object, - &matching_words, + &matcher_builder, &attributes_to_highlight, ); } diff --git a/milli/src/lib.rs b/milli/src/lib.rs index ba2bd9b0f..9a9ec428c 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -36,7 +36,9 @@ pub use self::heed_codec::{ RoaringBitmapLenCodec, StrBEU32Codec, StrStrU8Codec, }; pub use self::index::Index; -pub use self::search::{FacetDistribution, Filter, MatchingWords, Search, SearchResult}; +pub use self::search::{ + FacetDistribution, Filter, MatcherBuilder, MatchingWords, Search, SearchResult, +}; pub type Result = std::result::Result; diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index 2169c54ab..aeaa8196e 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -34,6 +34,16 @@ impl MatcherBuilder { } } + pub fn from_matching_words(matching_words: MatchingWords) -> Self { + Self { + matching_words, + crop_size: DEFAULT_CROP_SIZE, + crop_marker: None, + highlight_prefix: None, + highlight_suffix: None, + } + } + pub fn crop_size(&mut self, word_count: usize) -> &Self { self.crop_size = word_count; self From bd30ee97b8c67c32c264bbeaad7114bb5367caf2 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 30 Mar 2022 14:00:06 +0200 Subject: [PATCH 07/21] Keep separators at start of the croped string --- milli/src/search/matches/mod.rs | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index aeaa8196e..9ab1ef50f 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -247,6 +247,15 @@ impl<'t> Matcher<'t, '_> { last_token_position += 1; } + // same for start + while let Some(_separator_kind) = first_token_position + .checked_sub(1) + .and_then(|i| self.tokens.get(i)) + .and_then(|t| t.is_separator()) + { + first_token_position -= 1; + } + (first_token_position, last_token_position) } @@ -563,7 +572,7 @@ mod tests { // should crop the phrase instead of croping around the match. assert_eq!( &matcher.format(highlight, crop), - "…Split The World is a book written by Emily Henry. …" + "…. Split The World is a book written by Emily Henry. …" ); // Text containing some matches. @@ -574,7 +583,7 @@ mod tests { // no highlight should return 10 last words with a marker at the start. assert_eq!( &matcher.format(highlight, crop), - "…future to build a world with the boy she loves." + "… future to build a world with the boy she loves." ); // Text containing all matches. @@ -585,7 +594,7 @@ mod tests { // no highlight should return 10 last words with a marker at the start. assert_eq!( &matcher.format(highlight, crop), - "…she loves. Emily Henry: The Love That Split The World." + "… she loves. Emily Henry: The Love That Split The World." ); // Text containing a match unordered and a match ordered. @@ -596,7 +605,7 @@ mod tests { // crop should return 10 last words with a marker at the start. assert_eq!( &matcher.format(highlight, crop), - "…void void void void void split the world void void" + "… void void void void void split the world void void" ); // Text containing matches with diferent density. @@ -607,7 +616,7 @@ mod tests { // crop should return 10 last words with a marker at the start. assert_eq!( &matcher.format(highlight, crop), - "…void void void void void split the world void void" + "… void void void void void split the world void void" ); // Text containing matches with same word. @@ -618,7 +627,7 @@ mod tests { // crop should return 10 last words with a marker at the start. assert_eq!( &matcher.format(highlight, crop), - "…void void void void void split the world void void" + "… void void void void void split the world void void" ); } @@ -665,7 +674,7 @@ mod tests { // both should return 10 last words with a marker at the start and highlighted matches. assert_eq!( &matcher.format(highlight, crop), - "…future to build a world with the boy she loves." + "… future to build a world with the boy she loves." ); // Text containing all matches. @@ -674,7 +683,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // both should return 10 last words with a marker at the start and highlighted matches. - assert_eq!(&matcher.format(highlight, crop), "…she loves. Emily Henry: The Love That Split The World."); + assert_eq!(&matcher.format(highlight, crop), "… she loves. Emily Henry: The Love That Split The World."); // Text containing a match unordered and a match ordered. let text = "The world split void void void void void void void void void split the world void void"; @@ -684,7 +693,7 @@ mod tests { // crop should return 10 last words with a marker at the start. assert_eq!( &matcher.format(highlight, crop), - "…void void void void void split the world void void" + "… void void void void void split the world void void" ); } @@ -707,13 +716,13 @@ mod tests { builder.crop_size(2); let mut matcher = builder.build(&tokens[..], text); // because crop size < query size, partially format matches. - assert_eq!(&matcher.format(highlight, crop), "…split the …"); + assert_eq!(&matcher.format(highlight, crop), "… split the …"); // set a smaller crop size builder.crop_size(1); let mut matcher = builder.build(&tokens[..], text); // because crop size < query size, partially format matches. - assert_eq!(&matcher.format(highlight, crop), "…split …"); + assert_eq!(&matcher.format(highlight, crop), "… split …"); // set a smaller crop size builder.crop_size(0); From 6dc345bc531015c565b8640ca1a4576cbf9e4486 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 30 Mar 2022 15:15:14 +0200 Subject: [PATCH 08/21] Test and Fix prefix highlight --- milli/src/search/matches/mod.rs | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index 9ab1ef50f..816f5e273 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -376,8 +376,10 @@ impl<'t> Matcher<'t, '_> { } formatted.push(self.highlight_prefix); - formatted.push(&self.text[token.byte_start..token.byte_end]); + formatted.push(&self.text[token.byte_start..][..m.match_len]); formatted.push(self.highlight_suffix); + formatted + .push(&self.text[token.byte_start + m.match_len..token.byte_end]); byte_index = token.byte_end; } @@ -516,6 +518,17 @@ mod tests { &matcher.format(highlight, crop), "Natalie risk her future to build a world with the boy she loves." ); + + // Text containing some matches by prefix. + let text = "Natalie risk her future to build a worldle with the boy she loves."; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no crop should return complete text with highlighted matches. + assert_eq!( + &matcher.format(highlight, crop), + "Natalie risk her future to build a worldle with the boy she loves." + ); } #[test] From b3f0f39106fba1c596aefca61877fd8ca1395d8b Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 30 Mar 2022 15:22:18 +0200 Subject: [PATCH 09/21] Make some cleaning --- milli/src/lib.rs | 2 +- milli/src/search/matches/mod.rs | 18 +++--------------- milli/src/search/mod.rs | 2 +- 3 files changed, 5 insertions(+), 17 deletions(-) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 9a9ec428c..6cbb9f126 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -37,7 +37,7 @@ pub use self::heed_codec::{ }; pub use self::index::Index; pub use self::search::{ - FacetDistribution, Filter, MatcherBuilder, MatchingWords, Search, SearchResult, + FacetDistribution, Filter, MatchBounds, MatcherBuilder, MatchingWords, Search, SearchResult, }; pub type Result = std::result::Result; diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index 816f5e273..e66ba781c 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -1,8 +1,7 @@ use std::borrow::Cow; pub use matching_words::MatchingWords; -use meilisearch_tokenizer::token::SeparatorKind; -use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token}; +use meilisearch_tokenizer::token::{SeparatorKind, Token}; use crate::search::query_tree::Operation; @@ -91,17 +90,6 @@ impl MatcherBuilder { } } -// impl Default for MatcherBuilder { -// fn default() -> Self { -// Self { -// crop_size: DEFAULT_CROP_SIZE, -// crop_marker: None, -// highlight_prefix: None, -// highlight_suffix: None, -// } -// } -// } - #[derive(Clone, Debug)] pub struct Match { match_len: usize, @@ -115,8 +103,8 @@ pub struct Match { #[derive(Clone, Debug)] pub struct MatchBounds { - start: usize, - length: usize, + pub start: usize, + pub length: usize, } pub struct Matcher<'t, 'm> { diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 752ae236b..8804d9151 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -17,7 +17,7 @@ use roaring::bitmap::RoaringBitmap; pub use self::facet::{FacetDistribution, FacetNumberIter, Filter}; use self::fst_utils::{Complement, Intersection, StartsWith, Union}; -pub use self::matches::{Matcher, MatcherBuilder, MatchingWords}; +pub use self::matches::{MatchBounds, Matcher, MatcherBuilder, MatchingWords}; use self::query_tree::QueryTreeBuilder; use crate::error::UserError; use crate::search::criteria::r#final::{Final, FinalResult}; From a93cd8c61c0b6b4f096e65e06de76aaaf81fde52 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 30 Mar 2022 15:43:49 +0200 Subject: [PATCH 10/21] Fix prefix highlight with special chars --- milli/src/search/matches/mod.rs | 60 ++++++++++++++++++++++++++++----- 1 file changed, 51 insertions(+), 9 deletions(-) diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index e66ba781c..a4c29ce66 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -363,11 +363,15 @@ impl<'t> Matcher<'t, '_> { formatted.push(&self.text[byte_index..token.byte_start]); } + let highlight_byte_index = self.text[token.byte_start..] + .char_indices() + .enumerate() + .find(|(i, _)| *i == m.match_len) + .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start); formatted.push(self.highlight_prefix); - formatted.push(&self.text[token.byte_start..][..m.match_len]); + formatted.push(&self.text[token.byte_start..highlight_byte_index]); formatted.push(self.highlight_suffix); - formatted - .push(&self.text[token.byte_start + m.match_len..token.byte_end]); + formatted.push(&self.text[highlight_byte_index..token.byte_end]); byte_index = token.byte_end; } @@ -398,6 +402,8 @@ impl<'t> Matcher<'t, '_> { #[cfg(test)] mod tests { + use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; + use super::*; use crate::search::query_tree::{Query, QueryKind}; @@ -506,17 +512,53 @@ mod tests { &matcher.format(highlight, crop), "Natalie risk her future to build a world with the boy she loves." ); + } - // Text containing some matches by prefix. - let text = "Natalie risk her future to build a worldle with the boy she loves."; + #[test] + fn highlight_unicode() { + let query_tree = Operation::Or( + false, + vec![Operation::And(vec![ + Operation::Query(Query { + prefix: true, + kind: QueryKind::tolerant(1, "wessfalia".to_string()), + }), + Operation::Query(Query { + prefix: true, + kind: QueryKind::tolerant(1, "world".to_string()), + }), + ])], + ); + + let builder = MatcherBuilder::from_query_tree(&query_tree); + let analyzer = Analyzer::new(AnalyzerConfig::>::default()); + + let highlight = true; + let crop = false; + + // Text containing prefix match. + let text = "Ŵôřlḑôle"; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. - assert_eq!( - &matcher.format(highlight, crop), - "Natalie risk her future to build a worldle with the boy she loves." - ); + assert_eq!(&matcher.format(highlight, crop), "Ŵôřlḑôle"); + + // Text containing unicode match. + let text = "Ŵôřlḑ"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no crop should return complete text with highlighted matches. + assert_eq!(&matcher.format(highlight, crop), "Ŵôřlḑ"); + + // Text containing unicode match. + let text = "Westfália"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no crop should return complete text with highlighted matches. + assert_eq!(&matcher.format(highlight, crop), "Westfália"); } #[test] From 56e0edd62119d345e67755a4f8a94ad2f03de5cc Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 30 Mar 2022 17:22:58 +0200 Subject: [PATCH 11/21] Put crop markers direclty around words --- milli/src/search/matches/mod.rs | 47 ++++++++++----------------------- 1 file changed, 14 insertions(+), 33 deletions(-) diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index a4c29ce66..c6b89f9ec 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -225,25 +225,6 @@ impl<'t> Matcher<'t, '_> { } } - // if tokens after the end of the window are separators, - // then add them to the window in order to keep context in cropped text. - while let Some(_separator_kind) = last_token_position - .checked_add(1) - .and_then(|i| self.tokens.get(i)) - .and_then(|t| t.is_separator()) - { - last_token_position += 1; - } - - // same for start - while let Some(_separator_kind) = first_token_position - .checked_sub(1) - .and_then(|i| self.tokens.get(i)) - .and_then(|t| t.is_separator()) - { - first_token_position -= 1; - } - (first_token_position, last_token_position) } @@ -593,7 +574,7 @@ mod tests { // no highlight should return 10 first words with a marker at the end. assert_eq!( &matcher.format(highlight, crop), - "A quick brown fox can not jump 32 feet, right? …" + "A quick brown fox can not jump 32 feet, right…" ); // Text without any match starting by a separator. @@ -604,7 +585,7 @@ mod tests { // no highlight should return 10 first words with a marker at the end. assert_eq!( &matcher.format(highlight, crop), - "(A quick brown fox can not jump 32 feet, right? …" + "(A quick brown fox can not jump 32 feet, right…" ); // Test phrase propagation @@ -615,7 +596,7 @@ mod tests { // should crop the phrase instead of croping around the match. assert_eq!( &matcher.format(highlight, crop), - "…. Split The World is a book written by Emily Henry. …" + "…Split The World is a book written by Emily Henry…" ); // Text containing some matches. @@ -626,7 +607,7 @@ mod tests { // no highlight should return 10 last words with a marker at the start. assert_eq!( &matcher.format(highlight, crop), - "… future to build a world with the boy she loves." + "…future to build a world with the boy she loves…" ); // Text containing all matches. @@ -637,7 +618,7 @@ mod tests { // no highlight should return 10 last words with a marker at the start. assert_eq!( &matcher.format(highlight, crop), - "… she loves. Emily Henry: The Love That Split The World." + "…she loves. Emily Henry: The Love That Split The World." ); // Text containing a match unordered and a match ordered. @@ -648,7 +629,7 @@ mod tests { // crop should return 10 last words with a marker at the start. assert_eq!( &matcher.format(highlight, crop), - "… void void void void void split the world void void" + "…void void void void void split the world void void" ); // Text containing matches with diferent density. @@ -659,7 +640,7 @@ mod tests { // crop should return 10 last words with a marker at the start. assert_eq!( &matcher.format(highlight, crop), - "… void void void void void split the world void void" + "…void void void void void split the world void void" ); // Text containing matches with same word. @@ -670,7 +651,7 @@ mod tests { // crop should return 10 last words with a marker at the start. assert_eq!( &matcher.format(highlight, crop), - "… void void void void void split the world void void" + "…void void void void void split the world void void" ); } @@ -706,7 +687,7 @@ mod tests { // both should return 10 first words with a marker at the end. assert_eq!( &matcher.format(highlight, crop), - "A quick brown fox can not jump 32 feet, right? …" + "A quick brown fox can not jump 32 feet, right…" ); // Text containing some matches. @@ -717,7 +698,7 @@ mod tests { // both should return 10 last words with a marker at the start and highlighted matches. assert_eq!( &matcher.format(highlight, crop), - "… future to build a world with the boy she loves." + "…future to build a world with the boy she loves…" ); // Text containing all matches. @@ -726,7 +707,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // both should return 10 last words with a marker at the start and highlighted matches. - assert_eq!(&matcher.format(highlight, crop), "… she loves. Emily Henry: The Love That Split The World."); + assert_eq!(&matcher.format(highlight, crop), "…she loves. Emily Henry: The Love That Split The World."); // Text containing a match unordered and a match ordered. let text = "The world split void void void void void void void void void split the world void void"; @@ -736,7 +717,7 @@ mod tests { // crop should return 10 last words with a marker at the start. assert_eq!( &matcher.format(highlight, crop), - "… void void void void void split the world void void" + "…void void void void void split the world void void" ); } @@ -759,13 +740,13 @@ mod tests { builder.crop_size(2); let mut matcher = builder.build(&tokens[..], text); // because crop size < query size, partially format matches. - assert_eq!(&matcher.format(highlight, crop), "… split the …"); + assert_eq!(&matcher.format(highlight, crop), "…split the…"); // set a smaller crop size builder.crop_size(1); let mut matcher = builder.build(&tokens[..], text); // because crop size < query size, partially format matches. - assert_eq!(&matcher.format(highlight, crop), "… split …"); + assert_eq!(&matcher.format(highlight, crop), "…split…"); // set a smaller crop size builder.crop_size(0); From 3bb1e35adac89c9b1e371dcd7b82372063b520ce Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 4 Apr 2022 18:56:59 +0200 Subject: [PATCH 12/21] Fix match count --- milli/src/search/matches/matching_words.rs | 339 ++++++++++++--------- milli/src/search/matches/mod.rs | 169 +++++----- milli/src/search/mod.rs | 17 +- milli/src/search/query_tree.rs | 175 ++++++++++- 4 files changed, 469 insertions(+), 231 deletions(-) diff --git a/milli/src/search/matches/matching_words.rs b/milli/src/search/matches/matching_words.rs index 48f6fe809..274634554 100644 --- a/milli/src/search/matches/matching_words.rs +++ b/milli/src/search/matches/matching_words.rs @@ -1,12 +1,12 @@ use std::cmp::{min, Reverse}; -use std::collections::{BTreeMap, HashMap}; +use std::collections::BTreeMap; +use std::fmt; use std::ops::{Index, IndexMut}; use levenshtein_automata::{Distance, DFA}; use meilisearch_tokenizer::Token; use crate::search::build_dfa; -use crate::search::query_tree::{Operation, Query}; type IsPrefix = bool; @@ -14,83 +14,129 @@ type IsPrefix = bool; /// referencing words that match the given query tree. #[derive(Default)] pub struct MatchingWords { - dfas: Vec<(DFA, String, u8, IsPrefix, usize)>, + inner: Vec<(Vec, Vec)>, } impl MatchingWords { - pub fn from_query_tree(tree: &Operation) -> Self { - // fetch matchable words from the query tree - let mut dfas: Vec<_> = fetch_queries(tree) - .into_iter() - // create DFAs for each word - .map(|((w, t, p), id)| (build_dfa(w, t, p), w.to_string(), t, p, id)) - .collect(); - // Sort word by len in DESC order prioritizing the longuest word, + pub fn new(mut matching_words: Vec<(Vec, Vec)>) -> Self { + // Sort word by len in DESC order prioritizing the longuest matches, // in order to highlight the longuest part of the matched word. - dfas.sort_unstable_by_key(|(_dfa, query_word, _typo, _is_prefix, _id)| { - Reverse(query_word.len()) - }); - Self { dfas } + matching_words.sort_unstable_by_key(|(mw, _)| Reverse((mw.len(), mw[0].word.len()))); + + Self { inner: matching_words } } - /// Returns the number of matching bytes if the word matches one of the query words. - pub fn matching_bytes(&self, word_to_highlight: &Token) -> Option { - self.matching_bytes_with_id(word_to_highlight).map(|(len, _)| len) - } - - pub fn matching_bytes_with_id(&self, word_to_highlight: &Token) -> Option<(usize, usize)> { - self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix, id)| { - match dfa.eval(word_to_highlight.text()) { - Distance::Exact(t) if t <= *typo => { - if *is_prefix { - let len = bytes_to_highlight(word_to_highlight.text(), query_word); - Some((word_to_highlight.num_chars_from_bytes(len), *id)) - } else { - Some(( - word_to_highlight.num_chars_from_bytes(word_to_highlight.text().len()), - *id, - )) - } - } - _otherwise => None, - } - }) + pub fn match_token<'a, 'b>(&'a self, token: &'b Token<'b>) -> MatchesIter<'a, 'b> { + MatchesIter { inner: Box::new(self.inner.iter()), token } } } -/// Lists all words which can be considered as a match for the query tree. -fn fetch_queries(tree: &Operation) -> HashMap<(&str, u8, IsPrefix), usize> { - fn resolve_ops<'a>( - tree: &'a Operation, - out: &mut HashMap<(&'a str, u8, IsPrefix), usize>, - id: &mut usize, - ) { - match tree { - Operation::Or(_, ops) | Operation::And(ops) => { - ops.as_slice().iter().for_each(|op| resolve_ops(op, out, id)); - } - Operation::Query(Query { prefix, kind }) => { - let typo = if kind.is_exact() { 0 } else { kind.typo() }; - out.entry((kind.word(), typo, *prefix)).or_insert_with(|| { - *id += 1; - *id - }); - } - Operation::Phrase(words) => { - for word in words { - out.entry((word, 0, false)).or_insert_with(|| { - *id += 1; - *id - }); +pub struct MatchesIter<'a, 'b> { + inner: Box, Vec)> + 'a>, + token: &'b Token<'b>, +} + +impl<'a> Iterator for MatchesIter<'a, '_> { + type Item = MatchType<'a>; + + fn next(&mut self) -> Option { + match self.inner.next() { + Some((matching_words, ids)) => match matching_words[0].match_token(&self.token) { + Some(char_len) => { + if matching_words.len() > 1 { + Some(MatchType::Partial(PartialMatch { + matching_words: &matching_words[1..], + ids, + char_len, + })) + } else { + Some(MatchType::Full { char_len, ids }) + } } - } + None => self.next(), + }, + None => None, } } +} - let mut queries = HashMap::new(); - let mut id = 0; - resolve_ops(tree, &mut queries, &mut id); - queries +pub type PrimitiveWordId = u8; +pub struct MatchingWord { + pub dfa: DFA, + pub word: String, + pub typo: u8, + pub prefix: IsPrefix, +} + +impl fmt::Debug for MatchingWord { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("MatchingWord") + .field("word", &self.word) + .field("typo", &self.typo) + .field("prefix", &self.prefix) + .finish() + } +} + +impl PartialEq for MatchingWord { + fn eq(&self, other: &Self) -> bool { + self.prefix == other.prefix && self.typo == other.typo && self.word == other.word + } +} + +impl MatchingWord { + pub fn new(word: String, typo: u8, prefix: IsPrefix) -> Self { + let dfa = build_dfa(&word, typo, prefix); + + Self { dfa, word, typo, prefix } + } + + pub fn match_token(&self, token: &Token) -> Option { + match self.dfa.eval(token.text()) { + Distance::Exact(t) if t <= self.typo => { + if self.prefix { + let len = bytes_to_highlight(token.text(), &self.word); + Some(token.num_chars_from_bytes(len)) + } else { + Some(token.num_chars_from_bytes(token.text().len())) + } + } + _otherwise => None, + } + } +} + +#[derive(Debug, PartialEq)] +pub enum MatchType<'a> { + Full { char_len: usize, ids: &'a [PrimitiveWordId] }, + Partial(PartialMatch<'a>), +} + +#[derive(Debug, PartialEq)] +pub struct PartialMatch<'a> { + matching_words: &'a [MatchingWord], + ids: &'a [PrimitiveWordId], + char_len: usize, +} + +impl<'a> PartialMatch<'a> { + pub fn match_token(self, token: &Token) -> Option> { + self.matching_words[0].match_token(token).map(|char_len| { + if self.matching_words.len() > 1 { + MatchType::Partial(PartialMatch { + matching_words: &self.matching_words[1..], + ids: self.ids, + char_len, + }) + } else { + MatchType::Full { char_len, ids: self.ids } + } + }) + } + + pub fn char_len(&self) -> usize { + self.char_len + } } // A simple wrapper around vec so we can get contiguous but index it like it's 2D array. @@ -203,7 +249,6 @@ mod tests { use meilisearch_tokenizer::TokenKind; use super::*; - use crate::search::query_tree::{Operation, Query, QueryKind}; use crate::MatchingWords; #[test] @@ -271,102 +316,104 @@ mod tests { #[test] fn matching_words() { - let query_tree = Operation::Or( - false, - vec![Operation::And(vec![ - Operation::Query(Query { - prefix: true, - kind: QueryKind::exact("split".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("this".to_string()), - }), - Operation::Query(Query { - prefix: true, - kind: QueryKind::tolerant(1, "world".to_string()), - }), - ])], - ); + let matching_words = vec![ + (vec![MatchingWord::new("split".to_string(), 1, true)], vec![0]), + (vec![MatchingWord::new("this".to_string(), 0, false)], vec![1]), + (vec![MatchingWord::new("world".to_string(), 1, true)], vec![2]), + ]; - let matching_words = MatchingWords::from_query_tree(&query_tree); + let matching_words = MatchingWords::new(matching_words); assert_eq!( - matching_words.matching_bytes(&Token { - kind: TokenKind::Word, - word: Cow::Borrowed("word"), - byte_start: 0, - char_index: 0, - byte_end: "word".len(), - char_map: None, - }), - Some(3) + matching_words + .match_token(&Token { + kind: TokenKind::Word, + word: Cow::Borrowed("word"), + byte_start: 0, + char_index: 0, + byte_end: "word".len(), + char_map: None, + }) + .next(), + Some(MatchType::Full { char_len: 3, ids: &[2] }) ); assert_eq!( - matching_words.matching_bytes(&Token { - kind: TokenKind::Word, - word: Cow::Borrowed("nyc"), - byte_start: 0, - char_index: 0, - byte_end: "nyc".len(), - char_map: None, - }), + matching_words + .match_token(&Token { + kind: TokenKind::Word, + word: Cow::Borrowed("nyc"), + byte_start: 0, + char_index: 0, + byte_end: "nyc".len(), + char_map: None, + }) + .next(), None ); assert_eq!( - matching_words.matching_bytes(&Token { - kind: TokenKind::Word, - word: Cow::Borrowed("world"), - byte_start: 0, - char_index: 0, - byte_end: "world".len(), - char_map: None, - }), - Some(5) + matching_words + .match_token(&Token { + kind: TokenKind::Word, + word: Cow::Borrowed("world"), + byte_start: 0, + char_index: 0, + byte_end: "world".len(), + char_map: None, + }) + .next(), + Some(MatchType::Full { char_len: 5, ids: &[2] }) ); assert_eq!( - matching_words.matching_bytes(&Token { - kind: TokenKind::Word, - word: Cow::Borrowed("splitted"), - byte_start: 0, - char_index: 0, - byte_end: "splitted".len(), - char_map: None, - }), - Some(5) + matching_words + .match_token(&Token { + kind: TokenKind::Word, + word: Cow::Borrowed("splitted"), + byte_start: 0, + char_index: 0, + byte_end: "splitted".len(), + char_map: None, + }) + .next(), + Some(MatchType::Full { char_len: 5, ids: &[0] }) ); assert_eq!( - matching_words.matching_bytes(&Token { - kind: TokenKind::Word, - word: Cow::Borrowed("thisnew"), - byte_start: 0, - char_index: 0, - byte_end: "thisnew".len(), - char_map: None, - }), + matching_words + .match_token(&Token { + kind: TokenKind::Word, + word: Cow::Borrowed("thisnew"), + byte_start: 0, + char_index: 0, + byte_end: "thisnew".len(), + char_map: None, + }) + .next(), None ); assert_eq!( - matching_words.matching_bytes(&Token { - kind: TokenKind::Word, - word: Cow::Borrowed("borld"), - byte_start: 0, - char_index: 0, - byte_end: "borld".len(), - char_map: None, - }), - Some(5) + matching_words + .match_token(&Token { + kind: TokenKind::Word, + word: Cow::Borrowed("borld"), + byte_start: 0, + char_index: 0, + byte_end: "borld".len(), + char_map: None, + }) + .next(), + Some(MatchType::Full { char_len: 5, ids: &[2] }) ); assert_eq!( - matching_words.matching_bytes(&Token { - kind: TokenKind::Word, - word: Cow::Borrowed("wordsplit"), - byte_start: 0, - char_index: 0, - byte_end: "wordsplit".len(), - char_map: None, - }), - Some(4) + matching_words + .match_token(&Token { + kind: TokenKind::Word, + word: Cow::Borrowed("wordsplit"), + byte_start: 0, + char_index: 0, + byte_end: "wordsplit".len(), + char_map: None, + }) + .next(), + Some(MatchType::Full { char_len: 4, ids: &[2] }) ); } } diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index c6b89f9ec..a99798a9b 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -1,11 +1,10 @@ use std::borrow::Cow; pub use matching_words::MatchingWords; +use matching_words::{MatchType, PrimitiveWordId}; use meilisearch_tokenizer::token::{SeparatorKind, Token}; -use crate::search::query_tree::Operation; - -mod matching_words; +pub mod matching_words; const DEFAULT_CROP_SIZE: usize = 10; const DEFAULT_CROP_MARKER: &'static str = "…"; @@ -21,18 +20,6 @@ pub struct MatcherBuilder { } impl MatcherBuilder { - pub fn from_query_tree(query_tree: &Operation) -> Self { - let matching_words = MatchingWords::from_query_tree(query_tree); - - Self { - matching_words, - crop_size: DEFAULT_CROP_SIZE, - crop_marker: None, - highlight_prefix: None, - highlight_suffix: None, - } - } - pub fn from_matching_words(matching_words: MatchingWords) -> Self { Self { matching_words, @@ -93,8 +80,8 @@ impl MatcherBuilder { #[derive(Clone, Debug)] pub struct Match { match_len: usize, - // id of the query word that matches. - id: usize, + // ids of the query words that matches. + ids: Vec, // position of the word in the whole text. word_position: usize, // position of the token in the whole text. @@ -123,10 +110,72 @@ impl<'t> Matcher<'t, '_> { let mut matches = Vec::new(); let mut word_position = 0; let mut token_position = 0; - for token in self.tokens { + while let Some(token) = self.tokens.get(token_position) { if token.is_separator().is_none() { - if let Some((match_len, id)) = self.matching_words.matching_bytes_with_id(&token) { - matches.push(Match { match_len, id, word_position, token_position }); + 'matches: for match_type in self.matching_words.match_token(&token) { + match match_type { + MatchType::Full { char_len, ids } => { + matches.push(Match { + match_len: char_len, + ids: ids.to_vec(), + word_position, + token_position, + }); + // stop on the first match + break; + } + MatchType::Partial(mut partial) => { + let mut potential_matches = + vec![(token_position, word_position, partial.char_len())]; + let mut t_position = 1; + let mut w_position = 1; + 'partials: for token in &self.tokens[token_position + 1..] { + if token.is_separator().is_none() { + partial = match partial.match_token(&token) { + Some(MatchType::Partial(partial)) => { + potential_matches.push(( + token_position + t_position, + word_position + w_position, + partial.char_len(), + )); + partial + } + // partial match is now full, we keep this matches and we advance positions + Some(MatchType::Full { char_len, ids }) => { + let iter = potential_matches.into_iter().map( + |(token_position, word_position, match_len)| { + Match { + match_len, + ids: ids.to_vec(), + word_position, + token_position, + } + }, + ); + + matches.extend(iter); + + word_position += w_position; + token_position += t_position; + + matches.push(Match { + match_len: char_len, + ids: ids.to_vec(), + word_position, + token_position, + }); + + break 'matches; + } + // no match, continue to next match. + None => break 'partials, + }; + w_position += 1; + } + t_position += 1; + } + } + } } word_position += 1; } @@ -229,7 +278,7 @@ impl<'t> Matcher<'t, '_> { } fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) { - let mut ids = Vec::with_capacity(matches.len()); + let mut ids: Vec = Vec::with_capacity(matches.len()); let mut order_score = 0; let mut distance_score = 0; @@ -237,7 +286,7 @@ impl<'t> Matcher<'t, '_> { while let Some(m) = iter.next() { if let Some(next_match) = iter.peek() { // if matches are ordered - if next_match.id > m.id { + if next_match.ids.iter().min() > m.ids.iter().min() { order_score += 1; } @@ -245,7 +294,7 @@ impl<'t> Matcher<'t, '_> { distance_score -= (next_match.word_position - m.word_position).min(7) as i16; } - ids.push(m.id); + ids.extend(m.ids.iter()); } ids.sort_unstable(); @@ -348,7 +397,8 @@ impl<'t> Matcher<'t, '_> { .char_indices() .enumerate() .find(|(i, _)| *i == m.match_len) - .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start); + .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start) + .min(token.byte_end); formatted.push(self.highlight_prefix); formatted.push(&self.text[token.byte_start..highlight_byte_index]); formatted.push(self.highlight_suffix); @@ -386,33 +436,23 @@ mod tests { use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use super::*; - use crate::search::query_tree::{Query, QueryKind}; + use crate::search::matches::matching_words::MatchingWord; - fn query_tree() -> Operation { - Operation::Or( - false, - vec![Operation::And(vec![ - Operation::Query(Query { - prefix: true, - kind: QueryKind::exact("split".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("the".to_string()), - }), - Operation::Query(Query { - prefix: true, - kind: QueryKind::tolerant(1, "world".to_string()), - }), - ])], - ) + fn matching_words() -> MatchingWords { + let matching_words = vec![ + (vec![MatchingWord::new("split".to_string(), 0, false)], vec![0]), + (vec![MatchingWord::new("the".to_string(), 0, false)], vec![1]), + (vec![MatchingWord::new("world".to_string(), 1, true)], vec![2]), + ]; + + MatchingWords::new(matching_words) } #[test] fn format_identity() { - let query_tree = query_tree(); + let matching_words = matching_words(); - let builder = MatcherBuilder::from_query_tree(&query_tree); + let builder = MatcherBuilder::from_matching_words(matching_words); let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let highlight = false; @@ -445,9 +485,9 @@ mod tests { #[test] fn format_highlight() { - let query_tree = query_tree(); + let matching_words = matching_words(); - let builder = MatcherBuilder::from_query_tree(&query_tree); + let builder = MatcherBuilder::from_matching_words(matching_words); let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let highlight = true; @@ -497,21 +537,14 @@ mod tests { #[test] fn highlight_unicode() { - let query_tree = Operation::Or( - false, - vec![Operation::And(vec![ - Operation::Query(Query { - prefix: true, - kind: QueryKind::tolerant(1, "wessfalia".to_string()), - }), - Operation::Query(Query { - prefix: true, - kind: QueryKind::tolerant(1, "world".to_string()), - }), - ])], - ); + let matching_words = vec![ + (vec![MatchingWord::new("wessfali".to_string(), 1, true)], vec![0]), + (vec![MatchingWord::new("world".to_string(), 1, true)], vec![1]), + ]; - let builder = MatcherBuilder::from_query_tree(&query_tree); + let matching_words = MatchingWords::new(matching_words); + + let builder = MatcherBuilder::from_matching_words(matching_words); let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let highlight = true; @@ -539,14 +572,14 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. - assert_eq!(&matcher.format(highlight, crop), "Westfália"); + assert_eq!(&matcher.format(highlight, crop), "Westfália"); } #[test] fn format_crop() { - let query_tree = query_tree(); + let matching_words = matching_words(); - let builder = MatcherBuilder::from_query_tree(&query_tree); + let builder = MatcherBuilder::from_matching_words(matching_words); let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let highlight = false; @@ -657,9 +690,9 @@ mod tests { #[test] fn format_highlight_crop() { - let query_tree = query_tree(); + let matching_words = matching_words(); - let builder = MatcherBuilder::from_query_tree(&query_tree); + let builder = MatcherBuilder::from_matching_words(matching_words); let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let highlight = true; @@ -724,9 +757,9 @@ mod tests { #[test] fn smaller_crop_size() { //! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295 - let query_tree = query_tree(); + let matching_words = matching_words(); - let mut builder = MatcherBuilder::from_query_tree(&query_tree); + let mut builder = MatcherBuilder::from_matching_words(matching_words); let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let highlight = false; diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 8804d9151..2b025f269 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -114,7 +114,7 @@ impl<'a> Search<'a> { pub fn execute(&self) -> Result { // We create the query tree by spliting the query into tokens. let before = Instant::now(); - let (query_tree, primitive_query) = match self.query.as_ref() { + let (query_tree, primitive_query, matching_words) = match self.query.as_ref() { Some(query) => { let mut builder = QueryTreeBuilder::new(self.rtxn, self.index); builder.optional_words(self.optional_words); @@ -132,9 +132,11 @@ impl<'a> Search<'a> { let analyzer = Analyzer::new(config); let result = analyzer.analyze(query); let tokens = result.tokens(); - builder.build(tokens)?.map_or((None, None), |(qt, pq)| (Some(qt), Some(pq))) + builder + .build(tokens)? + .map_or((None, None, None), |(qt, pq, mw)| (Some(qt), Some(pq), Some(mw))) } - None => (None, None), + None => (None, None, None), }; debug!("query tree: {:?} took {:.02?}", query_tree, before.elapsed()); @@ -148,11 +150,6 @@ impl<'a> Search<'a> { debug!("facet candidates: {:?} took {:.02?}", filtered_candidates, before.elapsed()); - let matching_words = match query_tree.as_ref() { - Some(query_tree) => MatchingWords::from_query_tree(&query_tree), - None => MatchingWords::default(), - }; - // We check that we are allowed to use the sort criteria, we check // that they are declared in the sortable fields. if let Some(sort_criteria) = &self.sort_criteria { @@ -193,13 +190,13 @@ impl<'a> Search<'a> { )?; match self.index.distinct_field(self.rtxn)? { - None => self.perform_sort(NoopDistinct, matching_words, criteria), + None => self.perform_sort(NoopDistinct, matching_words.unwrap_or_default(), criteria), Some(name) => { let field_ids_map = self.index.fields_ids_map(self.rtxn)?; match field_ids_map.id(name) { Some(fid) => { let distinct = FacetDistinct::new(fid, self.index, self.rtxn); - self.perform_sort(distinct, matching_words, criteria) + self.perform_sort(distinct, matching_words.unwrap_or_default(), criteria) } None => Ok(SearchResult::default()), } diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 4eccae8ce..a45034a3b 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -8,7 +8,8 @@ use meilisearch_tokenizer::TokenKind; use roaring::RoaringBitmap; use slice_group_by::GroupBy; -use crate::{Index, Result}; +use crate::search::matches::matching_words::{MatchingWord, PrimitiveWordId}; +use crate::{Index, MatchingWords, Result}; type IsOptionalWord = bool; type IsPrefix = bool; @@ -233,7 +234,10 @@ impl<'a> QueryTreeBuilder<'a> { /// - if `authorize_typos` is set to `false` the query tree will be generated /// forcing all query words to match documents without any typo /// (the criterion `typo` will be ignored) - pub fn build(&self, query: TokenStream) -> Result> { + pub fn build( + &self, + query: TokenStream, + ) -> Result> { let stop_words = self.index.stop_words(self.rtxn)?; let primitive_query = create_primitive_query(query, stop_words, self.words_limit); if !primitive_query.is_empty() { @@ -243,7 +247,9 @@ impl<'a> QueryTreeBuilder<'a> { self.authorize_typos, &primitive_query, )?; - Ok(Some((qt, primitive_query))) + let matching_words = + create_matching_words(self, self.authorize_typos, &primitive_query)?; + Ok(Some((qt, primitive_query, matching_words))) } else { Ok(None) } @@ -251,7 +257,7 @@ impl<'a> QueryTreeBuilder<'a> { } /// Split the word depending on the frequency of subwords in the database documents. -fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result> { +fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result> { let chars = word.char_indices().skip(1); let mut best = None; @@ -267,7 +273,7 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result { let mut children = synonyms(ctx, &[&word])?.unwrap_or_default(); - if let Some(child) = split_best_frequency(ctx, &word)? { - children.push(child); + if let Some((left, right)) = split_best_frequency(ctx, &word)? { + children.push(Operation::Phrase(vec![left, right])); } let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?; let exact_words = ctx.exact_words()?; @@ -464,6 +470,154 @@ fn create_query_tree( } } +/// Main function that matchings words used for crop and highlight. +fn create_matching_words( + ctx: &impl Context, + authorize_typos: bool, + query: &[PrimitiveQueryPart], +) -> Result { + /// Matches on the `PrimitiveQueryPart` and create matchings words from it. + fn resolve_primitive_part( + ctx: &impl Context, + authorize_typos: bool, + part: PrimitiveQueryPart, + matching_words: &mut Vec<(Vec, Vec)>, + id: PrimitiveWordId, + ) -> Result<()> { + match part { + // 1. try to split word in 2 + // 2. try to fetch synonyms + PrimitiveQueryPart::Word(word, prefix) => { + if let Some(synonyms) = ctx.synonyms(&[word.as_str()])? { + for synonym in synonyms { + let synonym = synonym + .into_iter() + .map(|syn| MatchingWord::new(syn.to_string(), 0, false)) + .collect(); + matching_words.push((synonym, vec![id])); + } + } + + if let Some((left, right)) = split_best_frequency(ctx, &word)? { + let left = MatchingWord::new(left, 0, false); + let right = MatchingWord::new(right, 0, false); + matching_words.push((vec![left, right], vec![id])); + } + + let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?; + let exact_words = ctx.exact_words()?; + let config = + TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words }; + + let matching_word = match typos(word, authorize_typos, config) { + QueryKind::Exact { word, .. } => MatchingWord::new(word, 0, prefix), + QueryKind::Tolerant { typo, word } => MatchingWord::new(word, typo, prefix), + }; + matching_words.push((vec![matching_word], vec![id])); + } + // create a CONSECUTIVE matchings words wrapping all word in the phrase + PrimitiveQueryPart::Phrase(words) => { + let ids: Vec<_> = + (0..words.len()).into_iter().map(|i| id + i as PrimitiveWordId).collect(); + let words = + words.into_iter().map(|w| MatchingWord::new(w.to_string(), 0, false)).collect(); + matching_words.push((words, ids)); + } + } + + Ok(()) + } + + /// Create all ngrams 1..=3 generating query tree branches. + fn ngrams( + ctx: &impl Context, + authorize_typos: bool, + query: &[PrimitiveQueryPart], + matching_words: &mut Vec<(Vec, Vec)>, + mut id: PrimitiveWordId, + ) -> Result<()> { + const MAX_NGRAM: usize = 3; + + for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase())) { + for ngram in 1..=MAX_NGRAM.min(sub_query.len()) { + if let Some(group) = sub_query.get(..ngram) { + let tail = &sub_query[ngram..]; + let is_last = tail.is_empty(); + + match group { + [part] => { + resolve_primitive_part( + ctx, + authorize_typos, + part.clone(), + matching_words, + id, + )?; + } + words => { + let is_prefix = words.last().map_or(false, |part| part.is_prefix()); + let words: Vec<_> = words + .iter() + .filter_map(|part| { + if let PrimitiveQueryPart::Word(word, _) = part { + Some(word.as_str()) + } else { + None + } + }) + .collect(); + let ids: Vec<_> = (0..words.len()) + .into_iter() + .map(|i| id + i as PrimitiveWordId) + .collect(); + + if let Some(synonyms) = ctx.synonyms(&words)? { + for synonym in synonyms { + let synonym = synonym + .into_iter() + .map(|syn| MatchingWord::new(syn.to_string(), 0, false)) + .collect(); + matching_words.push((synonym, ids.clone())); + } + } + let word = words.concat(); + let (word_len_one_typo, word_len_two_typo) = + ctx.min_word_len_for_typo()?; + let exact_words = ctx.exact_words()?; + let config = TypoConfig { + max_typos: 1, + word_len_one_typo, + word_len_two_typo, + exact_words, + }; + let matching_word = match typos(word, authorize_typos, config) { + QueryKind::Exact { word, .. } => { + MatchingWord::new(word, 0, is_prefix) + } + QueryKind::Tolerant { typo, word } => { + MatchingWord::new(word, typo, is_prefix) + } + }; + matching_words.push((vec![matching_word], ids)); + } + } + + if !is_last { + ngrams(ctx, authorize_typos, tail, matching_words, id + 1)?; + } + } + } + id += sub_query.iter().map(|x| x.len() as PrimitiveWordId).sum::(); + } + + Ok(()) + } + + let mut matching_words = Vec::new(); + ngrams(ctx, authorize_typos, query, &mut matching_words, 0)?; + Ok(MatchingWords::new(matching_words)) +} + pub type PrimitiveQuery = Vec; #[derive(Debug, Clone)] @@ -480,6 +634,13 @@ impl PrimitiveQueryPart { fn is_prefix(&self) -> bool { matches!(self, Self::Word(_, is_prefix) if *is_prefix) } + + fn len(&self) -> usize { + match self { + Self::Phrase(words) => words.len(), + Self::Word(_, _) => 1, + } + } } /// Create primitive query from tokenized query string, From fa7d3a37c0d86d8b3129071889e6bc3e4746a26d Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 5 Apr 2022 17:35:52 +0200 Subject: [PATCH 13/21] Make some cleaning and add comments --- milli/src/search/matches/mod.rs | 180 +++++++++++++++++++++----------- 1 file changed, 117 insertions(+), 63 deletions(-) diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index a99798a9b..993ee1f2b 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -4,6 +4,8 @@ pub use matching_words::MatchingWords; use matching_words::{MatchType, PrimitiveWordId}; use meilisearch_tokenizer::token::{SeparatorKind, Token}; +use crate::search::matches::matching_words::PartialMatch; + pub mod matching_words; const DEFAULT_CROP_SIZE: usize = 10; @@ -106,14 +108,80 @@ pub struct Matcher<'t, 'm> { } impl<'t> Matcher<'t, '_> { + /// Iterates over tokens and save any of them that matches the query. fn compute_matches(&mut self) -> &mut Self { + fn compute_partial_match( + mut partial: PartialMatch, + tokens: &[Token], + token_position: &mut usize, + word_position: &mut usize, + matches: &mut Vec, + ) -> bool { + let mut potential_matches = vec![(*token_position, *word_position, partial.char_len())]; + let mut t_position = 1; + let mut w_position = 1; + for token in &tokens[*token_position + 1..] { + if token.is_separator().is_none() { + partial = match partial.match_token(&token) { + // token matches the partial match, but the match is not full, + // we temporarly save the current token then we try to match the next one. + Some(MatchType::Partial(partial)) => { + potential_matches.push(( + *token_position + t_position, + *word_position + w_position, + partial.char_len(), + )); + partial + } + // partial match is now full, we keep this matches and we advance positions + Some(MatchType::Full { char_len, ids }) => { + // save previously matched tokens as matches. + let iter = potential_matches.into_iter().map( + |(token_position, word_position, match_len)| Match { + match_len, + ids: ids.to_vec(), + word_position, + token_position, + }, + ); + matches.extend(iter); + + // move word and token positions after the end of the match. + *word_position += w_position; + *token_position += t_position; + + // save the token that closes the partial match as a match. + matches.push(Match { + match_len: char_len, + ids: ids.to_vec(), + word_position: *word_position, + token_position: *token_position, + }); + + // the match is complete, we return true. + return true; + } + // no match, continue to next match. + None => break, + }; + w_position += 1; + } + t_position += 1; + } + + // the match is not complete, we return false. + false + } + let mut matches = Vec::new(); let mut word_position = 0; let mut token_position = 0; while let Some(token) = self.tokens.get(token_position) { if token.is_separator().is_none() { - 'matches: for match_type in self.matching_words.match_token(&token) { + for match_type in self.matching_words.match_token(&token) { match match_type { + // we match, we save the current token as a match, + // then we continue the rest of the tokens. MatchType::Full { char_len, ids } => { matches.push(Match { match_len: char_len, @@ -121,58 +189,20 @@ impl<'t> Matcher<'t, '_> { word_position, token_position, }); - // stop on the first match break; } - MatchType::Partial(mut partial) => { - let mut potential_matches = - vec![(token_position, word_position, partial.char_len())]; - let mut t_position = 1; - let mut w_position = 1; - 'partials: for token in &self.tokens[token_position + 1..] { - if token.is_separator().is_none() { - partial = match partial.match_token(&token) { - Some(MatchType::Partial(partial)) => { - potential_matches.push(( - token_position + t_position, - word_position + w_position, - partial.char_len(), - )); - partial - } - // partial match is now full, we keep this matches and we advance positions - Some(MatchType::Full { char_len, ids }) => { - let iter = potential_matches.into_iter().map( - |(token_position, word_position, match_len)| { - Match { - match_len, - ids: ids.to_vec(), - word_position, - token_position, - } - }, - ); - - matches.extend(iter); - - word_position += w_position; - token_position += t_position; - - matches.push(Match { - match_len: char_len, - ids: ids.to_vec(), - word_position, - token_position, - }); - - break 'matches; - } - // no match, continue to next match. - None => break 'partials, - }; - w_position += 1; - } - t_position += 1; + // we match partially, iterate over next tokens to check if we can complete the match. + MatchType::Partial(partial) => { + // if match is completed, we break the matching loop over the current token, + // then we continue the rest of the tokens. + if compute_partial_match( + partial, + &self.tokens, + &mut token_position, + &mut word_position, + &mut matches, + ) { + break; } } } @@ -186,6 +216,7 @@ impl<'t> Matcher<'t, '_> { self } + /// Returns boundaries of the words that match the query. pub fn matches(&mut self) -> Vec { match &self.matches { None => self.compute_matches().matches(), @@ -199,30 +230,37 @@ impl<'t> Matcher<'t, '_> { } } + /// Returns token position of the window to crop around. fn token_crop_bounds(&self, matches: &[Match]) -> (usize, usize) { + // if there is no match, we start from the beginning of the string by default. let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0); let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0); let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0); let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0); - // TODO: buggy if no match and first token is a sepparator + // matches needs to be counted in the crop len. let mut remaining_words = self.crop_size + first_match_word_position - last_match_word_position; // if first token is a word, then remove 1 to remaining_words. if let Some(None) = self.tokens.get(first_match_token_position).map(|t| t.is_separator()) { remaining_words -= 1; } + + // we start from matches positions, then we expand the window in both sides. let mut first_token_position = first_match_token_position; let mut last_token_position = last_match_token_position; - while remaining_words > 0 { match ( + // try to expand left first_token_position.checked_sub(1).and_then(|i| self.tokens.get(i)), + // try to expand right last_token_position.checked_add(1).and_then(|i| self.tokens.get(i)), ) { + // we can expand both sides. (Some(ft), Some(lt)) => { match (ft.is_separator(), lt.is_separator()) { - // if they are both separators and are the same kind then advance both + // if they are both separators and are the same kind then advance both, + // or expand in the soft separator separator side. (Some(f_kind), Some(s_kind)) => { if f_kind == s_kind { first_token_position -= 1; @@ -233,17 +271,18 @@ impl<'t> Matcher<'t, '_> { first_token_position -= 1; } } - // left is a word, advance left + // if one of the tokens is a word, we expend in the side of the word. + // left is a word, advance left. (None, Some(_)) => { first_token_position -= 1; remaining_words -= 1; } - // right is a word, advance right + // right is a word, advance right. (Some(_), None) => { last_token_position += 1; remaining_words -= 1; } - // both are words, advance left then right if remaining_word > 0 + // both are words, advance left then right if remaining_word > 0. (None, None) => { first_token_position -= 1; remaining_words -= 1; @@ -277,6 +316,10 @@ impl<'t> Matcher<'t, '_> { (first_token_position, last_token_position) } + /// Compute the score of a match interval: + /// 1) count unique matches + /// 2) calculate distance between matches + /// 3) count ordered matches fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) { let mut ids: Vec = Vec::with_capacity(matches.len()); let mut order_score = 0; @@ -305,14 +348,20 @@ impl<'t> Matcher<'t, '_> { (uniq_score, distance_score, order_score) } + /// Returns the matches interval where the score computed by match_interval_score is maximal. fn find_best_match_interval<'a>(&self, matches: &'a [Match]) -> &'a [Match] { + // we compute the matches interval if we have at least 2 matches. if matches.len() > 1 { + // positions of the first and the last match of the best matches interval in `matches`. let mut best_interval = (0, 0); let mut best_interval_score = self.match_interval_score(&matches[0..=0]); + // current interval positions. let mut interval_first = 0; let mut interval_last = 0; for (index, next_match) in matches.iter().enumerate().skip(1) { - // if next match would make interval gross more than crop_size + // if next match would make interval gross more than crop_size, + // we compare the current interval with the best one, + // then we increase `interval_first` until next match can be added. if next_match.word_position - matches[interval_first].word_position >= self.crop_size { @@ -325,7 +374,7 @@ impl<'t> Matcher<'t, '_> { best_interval_score = interval_score; } - // advance start of the interval while interval is longer than crop_size + // advance start of the interval while interval is longer than crop_size. while next_match.word_position - matches[interval_first].word_position >= self.crop_size { @@ -335,6 +384,7 @@ impl<'t> Matcher<'t, '_> { interval_last = index; } + // compute the last interval score and compare it to the best one. let interval_score = self.match_interval_score(&matches[interval_first..=interval_last]); if interval_score > best_interval_score { @@ -347,6 +397,7 @@ impl<'t> Matcher<'t, '_> { } } + /// Returns the bounds in byte index of the crop window. fn crop_bounds(&self, matches: &[Match]) -> (usize, usize) { let match_interval = self.find_best_match_interval(matches); @@ -357,12 +408,13 @@ impl<'t> Matcher<'t, '_> { (byte_start, byte_end) } + // Returns the formatted version of the original text. pub fn format(&mut self, highlight: bool, crop: bool) -> Cow<'t, str> { // If 0 it will be considered null and thus not crop the field // https://github.com/meilisearch/specifications/pull/120#discussion_r836536295 let crop = crop && self.crop_size > 0; if !highlight && !crop { - // compute matches is not needed if no highlight or crop is requested. + // compute matches is not needed if no highlight nor crop is requested. Cow::Borrowed(self.text) } else { match &self.matches { @@ -397,12 +449,14 @@ impl<'t> Matcher<'t, '_> { .char_indices() .enumerate() .find(|(i, _)| *i == m.match_len) - .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start) - .min(token.byte_end); + .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start); formatted.push(self.highlight_prefix); formatted.push(&self.text[token.byte_start..highlight_byte_index]); formatted.push(self.highlight_suffix); - formatted.push(&self.text[highlight_byte_index..token.byte_end]); + // if it's a prefix highlight, we put the end of the word after the highlight marker. + if highlight_byte_index < token.byte_end { + formatted.push(&self.text[highlight_byte_index..token.byte_end]); + } byte_index = token.byte_end; } From b1905dfa2409a8c7aa5a08431b51ad6c81cc73bd Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 7 Apr 2022 17:05:44 +0200 Subject: [PATCH 14/21] Make split_best_frequency returns references instead of owned data --- milli/src/search/query_tree.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index a45034a3b..f8dd82a57 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -257,7 +257,10 @@ impl<'a> QueryTreeBuilder<'a> { } /// Split the word depending on the frequency of subwords in the database documents. -fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result> { +fn split_best_frequency<'a>( + ctx: &impl Context, + word: &'a str, +) -> heed::Result> { let chars = word.char_indices().skip(1); let mut best = None; @@ -273,7 +276,7 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result { let mut children = synonyms(ctx, &[&word])?.unwrap_or_default(); if let Some((left, right)) = split_best_frequency(ctx, &word)? { - children.push(Operation::Phrase(vec![left, right])); + children.push(Operation::Phrase(vec![left.to_string(), right.to_string()])); } let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?; let exact_words = ctx.exact_words()?; @@ -499,8 +502,8 @@ fn create_matching_words( } if let Some((left, right)) = split_best_frequency(ctx, &word)? { - let left = MatchingWord::new(left, 0, false); - let right = MatchingWord::new(right, 0, false); + let left = MatchingWord::new(left.to_string(), 0, false); + let right = MatchingWord::new(right.to_string(), 0, false); matching_words.push((vec![left, right], vec![id])); } From c8ed1675a75b40195a1371f1ce3bd8a4c8cdbac2 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 7 Apr 2022 17:32:13 +0200 Subject: [PATCH 15/21] Add some documentation --- milli/src/search/matches/matching_words.rs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/milli/src/search/matches/matching_words.rs b/milli/src/search/matches/matching_words.rs index 274634554..84b47bba5 100644 --- a/milli/src/search/matches/matching_words.rs +++ b/milli/src/search/matches/matching_words.rs @@ -26,11 +26,14 @@ impl MatchingWords { Self { inner: matching_words } } + /// Returns an iterator over terms that match or partially match the given token. pub fn match_token<'a, 'b>(&'a self, token: &'b Token<'b>) -> MatchesIter<'a, 'b> { MatchesIter { inner: Box::new(self.inner.iter()), token } } } +/// Iterator over terms that match the given token, +/// This allow to lazily evaluate matches. pub struct MatchesIter<'a, 'b> { inner: Box, Vec)> + 'a>, token: &'b Token<'b>, @@ -60,7 +63,10 @@ impl<'a> Iterator for MatchesIter<'a, '_> { } } +/// Id of a matching term corespounding to a word written by the end user. pub type PrimitiveWordId = u8; + +/// Structure used to match a specific term. pub struct MatchingWord { pub dfa: DFA, pub word: String, @@ -91,6 +97,7 @@ impl MatchingWord { Self { dfa, word, typo, prefix } } + /// Returns the lenght in chars of the match in case of the token matches the term. pub fn match_token(&self, token: &Token) -> Option { match self.dfa.eval(token.text()) { Distance::Exact(t) if t <= self.typo => { @@ -106,12 +113,17 @@ impl MatchingWord { } } +/// A given token can partially match a query word for several reasons: +/// - split words +/// - multi-word synonyms +/// In these cases we need to match consecutively several tokens to consider that the match is full. #[derive(Debug, PartialEq)] pub enum MatchType<'a> { Full { char_len: usize, ids: &'a [PrimitiveWordId] }, Partial(PartialMatch<'a>), } +/// Structure helper to match several tokens in a row in order to complete a partial match. #[derive(Debug, PartialEq)] pub struct PartialMatch<'a> { matching_words: &'a [MatchingWord], @@ -120,6 +132,10 @@ pub struct PartialMatch<'a> { } impl<'a> PartialMatch<'a> { + /// Returns: + /// - None if the given token breaks the partial match + /// - Partial if the given token matches the partial match but doesn't complete it + /// - Full if the given token completes the partial match pub fn match_token(self, token: &Token) -> Option> { self.matching_words[0].match_token(token).map(|char_len| { if self.matching_words.len() > 1 { From a769e09dfa57a5e1fe107f3ab1d6e830a9b939fa Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 7 Apr 2022 20:15:14 +0200 Subject: [PATCH 16/21] Make token_crop_bounds more rust idiomatic --- milli/src/search/matches/mod.rs | 71 +++++++++++++++------------------ 1 file changed, 32 insertions(+), 39 deletions(-) diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index 993ee1f2b..d6e7dcc37 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -230,7 +230,7 @@ impl<'t> Matcher<'t, '_> { } } - /// Returns token position of the window to crop around. + /// Returns the bounds in byte index of the crop window. fn token_crop_bounds(&self, matches: &[Match]) -> (usize, usize) { // if there is no match, we start from the beginning of the string by default. let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0); @@ -241,70 +241,64 @@ impl<'t> Matcher<'t, '_> { // matches needs to be counted in the crop len. let mut remaining_words = self.crop_size + first_match_word_position - last_match_word_position; - // if first token is a word, then remove 1 to remaining_words. - if let Some(None) = self.tokens.get(first_match_token_position).map(|t| t.is_separator()) { - remaining_words -= 1; - } - // we start from matches positions, then we expand the window in both sides. - let mut first_token_position = first_match_token_position; - let mut last_token_position = last_match_token_position; + let mut before_tokens = self.tokens[..first_match_token_position].iter().rev().peekable(); + let mut after_tokens = self.tokens[last_match_token_position..].iter().peekable(); + while remaining_words > 0 { - match ( - // try to expand left - first_token_position.checked_sub(1).and_then(|i| self.tokens.get(i)), - // try to expand right - last_token_position.checked_add(1).and_then(|i| self.tokens.get(i)), - ) { + let before_token = before_tokens.peek().map(|t| t.is_separator()); + let after_token = after_tokens.peek().map(|t| t.is_separator()); + + match (before_token, after_token) { // we can expand both sides. - (Some(ft), Some(lt)) => { - match (ft.is_separator(), lt.is_separator()) { + (Some(before_token), Some(after_token)) => { + match (before_token, after_token) { // if they are both separators and are the same kind then advance both, // or expand in the soft separator separator side. - (Some(f_kind), Some(s_kind)) => { - if f_kind == s_kind { - first_token_position -= 1; - last_token_position += 1; - } else if f_kind == SeparatorKind::Hard { - last_token_position += 1; + (Some(before_token_kind), Some(after_token_kind)) => { + if before_token_kind == after_token_kind { + before_tokens.next(); + after_tokens.next(); + } else if before_token_kind == SeparatorKind::Hard { + after_tokens.next(); } else { - first_token_position -= 1; + before_tokens.next(); } } // if one of the tokens is a word, we expend in the side of the word. // left is a word, advance left. (None, Some(_)) => { - first_token_position -= 1; + before_tokens.next(); remaining_words -= 1; } // right is a word, advance right. (Some(_), None) => { - last_token_position += 1; + after_tokens.next(); remaining_words -= 1; } // both are words, advance left then right if remaining_word > 0. (None, None) => { - first_token_position -= 1; + before_tokens.next(); remaining_words -= 1; if remaining_words > 0 { - last_token_position += 1; + after_tokens.next(); remaining_words -= 1; } } } } // the end of the text is reached, advance left. - (Some(ft), None) => { - first_token_position -= 1; - if ft.is_separator().is_none() { + (Some(before_token), None) => { + before_tokens.next(); + if before_token.is_none() { remaining_words -= 1; } } // the start of the text is reached, advance right. - (None, Some(lt)) => { - last_token_position += 1; - if lt.is_separator().is_none() { + (None, Some(after_token)) => { + after_tokens.next(); + if after_token.is_none() { remaining_words -= 1; } } @@ -313,7 +307,10 @@ impl<'t> Matcher<'t, '_> { } } - (first_token_position, last_token_position) + let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end); + let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start); + + (crop_byte_start, crop_byte_end) } /// Compute the score of a match interval: @@ -401,11 +398,7 @@ impl<'t> Matcher<'t, '_> { fn crop_bounds(&self, matches: &[Match]) -> (usize, usize) { let match_interval = self.find_best_match_interval(matches); - let (first_token_position, last_token_position) = self.token_crop_bounds(match_interval); - - let byte_start = self.tokens.get(first_token_position).map_or(0, |t| t.byte_start); - let byte_end = self.tokens.get(last_token_position).map_or(byte_start, |t| t.byte_end); - (byte_start, byte_end) + self.token_crop_bounds(match_interval) } // Returns the formatted version of the original text. From a16de5de84935e6663767ff9912bcd0ad0d98af2 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Fri, 8 Apr 2022 11:20:41 +0200 Subject: [PATCH 17/21] Symplify format and remove intermediate function --- milli/src/search/matches/mod.rs | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index d6e7dcc37..71ff2f1b3 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -231,7 +231,7 @@ impl<'t> Matcher<'t, '_> { } /// Returns the bounds in byte index of the crop window. - fn token_crop_bounds(&self, matches: &[Match]) -> (usize, usize) { + fn crop_bounds(&self, matches: &[Match]) -> (usize, usize) { // if there is no match, we start from the beginning of the string by default. let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0); let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0); @@ -394,13 +394,6 @@ impl<'t> Matcher<'t, '_> { } } - /// Returns the bounds in byte index of the crop window. - fn crop_bounds(&self, matches: &[Match]) -> (usize, usize) { - let match_interval = self.find_best_match_interval(matches); - - self.token_crop_bounds(match_interval) - } - // Returns the formatted version of the original text. pub fn format(&mut self, highlight: bool, crop: bool) -> Cow<'t, str> { // If 0 it will be considered null and thus not crop the field @@ -412,6 +405,9 @@ impl<'t> Matcher<'t, '_> { } else { match &self.matches { Some(matches) => { + let matches = + if crop { self.find_best_match_interval(matches) } else { matches }; + let (byte_start, byte_end) = if crop { self.crop_bounds(matches) } else { (0, self.text.len()) }; @@ -427,11 +423,7 @@ impl<'t> Matcher<'t, '_> { if highlight { // insert highlight markers around matches. let tokens = self.tokens; - for m in matches - .iter() - .skip_while(|m| tokens[m.token_position].byte_start < byte_start) - .take_while(|m| tokens[m.token_position].byte_start < byte_end) - { + for m in matches { let token = &tokens[m.token_position]; if byte_index < token.byte_start { From 011f8210eddb7cf4bbfedc3aa07e9c5cb73206de Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 11 Apr 2022 16:46:45 +0200 Subject: [PATCH 18/21] Make compute_matches more rust idiomatic --- milli/src/search/matches/mod.rs | 207 ++++++++++++++++++++------------ 1 file changed, 127 insertions(+), 80 deletions(-) diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index 71ff2f1b3..04e552c8d 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -110,63 +110,53 @@ pub struct Matcher<'t, 'm> { impl<'t> Matcher<'t, '_> { /// Iterates over tokens and save any of them that matches the query. fn compute_matches(&mut self) -> &mut Self { - fn compute_partial_match( + fn compute_partial_match<'a>( mut partial: PartialMatch, - tokens: &[Token], - token_position: &mut usize, - word_position: &mut usize, + token_position: usize, + word_position: usize, + words_positions: &mut impl Iterator)>, matches: &mut Vec, ) -> bool { - let mut potential_matches = vec![(*token_position, *word_position, partial.char_len())]; - let mut t_position = 1; - let mut w_position = 1; - for token in &tokens[*token_position + 1..] { - if token.is_separator().is_none() { - partial = match partial.match_token(&token) { - // token matches the partial match, but the match is not full, - // we temporarly save the current token then we try to match the next one. - Some(MatchType::Partial(partial)) => { - potential_matches.push(( - *token_position + t_position, - *word_position + w_position, - partial.char_len(), - )); - partial - } - // partial match is now full, we keep this matches and we advance positions - Some(MatchType::Full { char_len, ids }) => { - // save previously matched tokens as matches. - let iter = potential_matches.into_iter().map( - |(token_position, word_position, match_len)| Match { - match_len, - ids: ids.to_vec(), - word_position, - token_position, - }, - ); - matches.extend(iter); + let mut potential_matches = Vec::new(); - // move word and token positions after the end of the match. - *word_position += w_position; - *token_position += t_position; + // Add first match to potential matches. + potential_matches.push((token_position, word_position, partial.char_len())); - // save the token that closes the partial match as a match. - matches.push(Match { - match_len: char_len, + for (token_position, word_position, word) in words_positions { + partial = match partial.match_token(&word) { + // token matches the partial match, but the match is not full, + // we temporarly save the current token then we try to match the next one. + Some(MatchType::Partial(partial)) => { + potential_matches.push((token_position, word_position, partial.char_len())); + partial + } + // partial match is now full, we keep this matches and we advance positions + Some(MatchType::Full { char_len, ids }) => { + // save previously matched tokens as matches. + let iter = potential_matches.into_iter().map( + |(token_position, word_position, match_len)| Match { + match_len, ids: ids.to_vec(), - word_position: *word_position, - token_position: *token_position, - }); + word_position, + token_position, + }, + ); + matches.extend(iter); - // the match is complete, we return true. - return true; - } - // no match, continue to next match. - None => break, - }; - w_position += 1; - } - t_position += 1; + // save the token that closes the partial match as a match. + matches.push(Match { + match_len: char_len, + ids: ids.to_vec(), + word_position, + token_position, + }); + + // the match is complete, we return true. + return true; + } + // no match, continue to next match. + None => break, + }; } // the match is not complete, we return false. @@ -174,42 +164,54 @@ impl<'t> Matcher<'t, '_> { } let mut matches = Vec::new(); - let mut word_position = 0; - let mut token_position = 0; - while let Some(token) = self.tokens.get(token_position) { - if token.is_separator().is_none() { - for match_type in self.matching_words.match_token(&token) { - match match_type { - // we match, we save the current token as a match, + + let mut words_positions = self + .tokens + .iter() + .scan((0, 0), |(token_position, word_position), token| { + let current_token_position = *token_position; + let current_word_position = *word_position; + *token_position += 1; + if token.is_separator().is_none() { + *word_position += 1; + } + + Some((current_token_position, current_word_position, token)) + }) + .filter(|(_, _, token)| token.is_separator().is_none()); + + while let Some((token_position, word_position, word)) = words_positions.next() { + for match_type in self.matching_words.match_token(word) { + match match_type { + // we match, we save the current token as a match, + // then we continue the rest of the tokens. + MatchType::Full { char_len, ids } => { + matches.push(Match { + match_len: char_len, + ids: ids.to_vec(), + word_position, + token_position, + }); + break; + } + // we match partially, iterate over next tokens to check if we can complete the match. + MatchType::Partial(partial) => { + // if match is completed, we break the matching loop over the current token, // then we continue the rest of the tokens. - MatchType::Full { char_len, ids } => { - matches.push(Match { - match_len: char_len, - ids: ids.to_vec(), - word_position, - token_position, - }); + let mut wp = words_positions.clone(); + if compute_partial_match( + partial, + token_position, + word_position, + &mut wp, + &mut matches, + ) { + words_positions = wp; break; } - // we match partially, iterate over next tokens to check if we can complete the match. - MatchType::Partial(partial) => { - // if match is completed, we break the matching loop over the current token, - // then we continue the rest of the tokens. - if compute_partial_match( - partial, - &self.tokens, - &mut token_position, - &mut word_position, - &mut matches, - ) { - break; - } - } } } - word_position += 1; } - token_position += 1; } self.matches = Some(matches); @@ -826,4 +828,49 @@ mod tests { // because crop size is 0, crop is ignored. assert_eq!(&matcher.format(highlight, crop), "void void split the world void void."); } + + #[test] + fn partial_matches() { + let matching_words = vec![ + (vec![MatchingWord::new("the".to_string(), 0, false)], vec![0]), + ( + vec![ + MatchingWord::new("t".to_string(), 0, false), + MatchingWord::new("he".to_string(), 0, false), + ], + vec![0], + ), + (vec![MatchingWord::new("door".to_string(), 0, false)], vec![1]), + ( + vec![ + MatchingWord::new("do".to_string(), 0, false), + MatchingWord::new("or".to_string(), 0, false), + ], + vec![1], + ), + (vec![MatchingWord::new("do".to_string(), 0, false)], vec![2]), + ]; + + let matching_words = MatchingWords::new(matching_words); + + let mut builder = MatcherBuilder::from_matching_words(matching_words); + builder.highlight_prefix("_".to_string()); + builder.highlight_suffix("_".to_string()); + let analyzer = Analyzer::new(AnalyzerConfig::>::default()); + + let highlight = true; + let crop = false; + + let text = "the do or die can't be he do and or isn't he"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + + let mut matcher = builder.build(&tokens[..], text); + assert_eq!( + &matcher.format(highlight, crop), + "_the_ _do_ _or_ die can't be he _do_ and or isn'_t_ _he_", + "matches: {:?}", + &matcher.matches + ); + } } From 827cedcd15b9943d52194d8ea17bfc154dfeebf4 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 12 Apr 2022 13:42:14 +0200 Subject: [PATCH 19/21] Add format option structure --- http-ui/src/main.rs | 8 +- milli/src/lib.rs | 3 +- milli/src/search/matches/mod.rs | 163 +++++++++++++++----------------- milli/src/search/mod.rs | 2 +- 4 files changed, 85 insertions(+), 91 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index fdfc04af9..adf7f1788 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -25,8 +25,8 @@ use milli::update::{ ClearDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting, }; use milli::{ - obkv_to_json, CompressionType, Filter as MilliFilter, FilterCondition, Index, MatcherBuilder, - SearchResult, SortError, + obkv_to_json, CompressionType, Filter as MilliFilter, FilterCondition, FormatOptions, Index, + MatcherBuilder, SearchResult, SortError, }; use once_cell::sync::OnceCell; use serde::{Deserialize, Serialize}; @@ -162,7 +162,9 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { let analyzed: Vec<_> = analyzed.tokens().collect(); let mut matcher = matcher_builder.build(&analyzed[..], &old_string); - Value::String(matcher.format(true, true).to_string()) + let format_options = FormatOptions { highlight: true, crop: Some(10) }; + + Value::String(matcher.format(format_options).to_string()) } Value::Array(values) => Value::Array( values.into_iter().map(|v| self.highlight_value(v, matcher_builder)).collect(), diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 6cbb9f126..793079563 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -37,7 +37,8 @@ pub use self::heed_codec::{ }; pub use self::index::Index; pub use self::search::{ - FacetDistribution, Filter, MatchBounds, MatcherBuilder, MatchingWords, Search, SearchResult, + FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWords, Search, + SearchResult, }; pub type Result = std::result::Result; diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index 04e552c8d..65ff0a255 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -8,14 +8,12 @@ use crate::search::matches::matching_words::PartialMatch; pub mod matching_words; -const DEFAULT_CROP_SIZE: usize = 10; const DEFAULT_CROP_MARKER: &'static str = "…"; const DEFAULT_HIGHLIGHT_PREFIX: &'static str = ""; const DEFAULT_HIGHLIGHT_SUFFIX: &'static str = ""; pub struct MatcherBuilder { matching_words: MatchingWords, - crop_size: usize, crop_marker: Option, highlight_prefix: Option, highlight_suffix: Option, @@ -23,18 +21,7 @@ pub struct MatcherBuilder { impl MatcherBuilder { pub fn from_matching_words(matching_words: MatchingWords) -> Self { - Self { - matching_words, - crop_size: DEFAULT_CROP_SIZE, - crop_marker: None, - highlight_prefix: None, - highlight_suffix: None, - } - } - - pub fn crop_size(&mut self, word_count: usize) -> &Self { - self.crop_size = word_count; - self + Self { matching_words, crop_marker: None, highlight_prefix: None, highlight_suffix: None } } pub fn crop_marker(&mut self, marker: String) -> &Self { @@ -70,7 +57,6 @@ impl MatcherBuilder { text, tokens, matching_words: &self.matching_words, - crop_size: self.crop_size, crop_marker, highlight_prefix, highlight_suffix, @@ -79,6 +65,18 @@ impl MatcherBuilder { } } +#[derive(Copy, Clone, Default)] +pub struct FormatOptions { + pub highlight: bool, + pub crop: Option, +} + +impl FormatOptions { + pub fn merge(self, other: Self) -> Self { + Self { highlight: self.highlight || other.highlight, crop: self.crop.or(other.crop) } + } +} + #[derive(Clone, Debug)] pub struct Match { match_len: usize, @@ -100,7 +98,6 @@ pub struct Matcher<'t, 'm> { text: &'t str, tokens: &'t [Token<'t>], matching_words: &'m MatchingWords, - crop_size: usize, crop_marker: &'m str, highlight_prefix: &'m str, highlight_suffix: &'m str, @@ -233,7 +230,7 @@ impl<'t> Matcher<'t, '_> { } /// Returns the bounds in byte index of the crop window. - fn crop_bounds(&self, matches: &[Match]) -> (usize, usize) { + fn crop_bounds(&self, matches: &[Match], crop_size: usize) -> (usize, usize) { // if there is no match, we start from the beginning of the string by default. let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0); let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0); @@ -241,8 +238,7 @@ impl<'t> Matcher<'t, '_> { let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0); // matches needs to be counted in the crop len. - let mut remaining_words = - self.crop_size + first_match_word_position - last_match_word_position; + let mut remaining_words = crop_size + first_match_word_position - last_match_word_position; let mut before_tokens = self.tokens[..first_match_token_position].iter().rev().peekable(); let mut after_tokens = self.tokens[last_match_token_position..].iter().peekable(); @@ -348,7 +344,7 @@ impl<'t> Matcher<'t, '_> { } /// Returns the matches interval where the score computed by match_interval_score is maximal. - fn find_best_match_interval<'a>(&self, matches: &'a [Match]) -> &'a [Match] { + fn find_best_match_interval<'a>(&self, matches: &'a [Match], crop_size: usize) -> &'a [Match] { // we compute the matches interval if we have at least 2 matches. if matches.len() > 1 { // positions of the first and the last match of the best matches interval in `matches`. @@ -361,9 +357,7 @@ impl<'t> Matcher<'t, '_> { // if next match would make interval gross more than crop_size, // we compare the current interval with the best one, // then we increase `interval_first` until next match can be added. - if next_match.word_position - matches[interval_first].word_position - >= self.crop_size - { + if next_match.word_position - matches[interval_first].word_position >= crop_size { let interval_score = self.match_interval_score(&matches[interval_first..=interval_last]); @@ -375,7 +369,7 @@ impl<'t> Matcher<'t, '_> { // advance start of the interval while interval is longer than crop_size. while next_match.word_position - matches[interval_first].word_position - >= self.crop_size + >= crop_size { interval_first += 1; } @@ -397,21 +391,24 @@ impl<'t> Matcher<'t, '_> { } // Returns the formatted version of the original text. - pub fn format(&mut self, highlight: bool, crop: bool) -> Cow<'t, str> { - // If 0 it will be considered null and thus not crop the field - // https://github.com/meilisearch/specifications/pull/120#discussion_r836536295 - let crop = crop && self.crop_size > 0; - if !highlight && !crop { + pub fn format(&mut self, format_options: FormatOptions) -> Cow<'t, str> { + if !format_options.highlight && format_options.crop.is_none() { // compute matches is not needed if no highlight nor crop is requested. Cow::Borrowed(self.text) } else { match &self.matches { Some(matches) => { - let matches = - if crop { self.find_best_match_interval(matches) } else { matches }; + let matches = match format_options.crop { + Some(crop_size) if crop_size > 0 => { + self.find_best_match_interval(matches, crop_size) + } + _ => matches, + }; - let (byte_start, byte_end) = - if crop { self.crop_bounds(matches) } else { (0, self.text.len()) }; + let (byte_start, byte_end) = match format_options.crop { + Some(crop_size) if crop_size > 0 => self.crop_bounds(matches, crop_size), + _ => (0, self.text.len()), + }; let mut formatted = Vec::new(); @@ -422,7 +419,7 @@ impl<'t> Matcher<'t, '_> { let mut byte_index = byte_start; - if highlight { + if format_options.highlight { // insert highlight markers around matches. let tokens = self.tokens; for m in matches { @@ -466,7 +463,7 @@ impl<'t> Matcher<'t, '_> { Cow::Owned(formatted.concat()) } } - None => self.compute_matches().format(highlight, crop), + None => self.compute_matches().format(format_options), } } } @@ -496,8 +493,7 @@ mod tests { let builder = MatcherBuilder::from_matching_words(matching_words); let analyzer = Analyzer::new(AnalyzerConfig::>::default()); - let highlight = false; - let crop = false; + let format_options = FormatOptions { highlight: false, crop: None }; // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; @@ -505,7 +501,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop and no highlight should return complete text. - assert_eq!(&matcher.format(highlight, crop), &text); + assert_eq!(&matcher.format(format_options.clone()), &text); // Text containing all matches. let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; @@ -513,7 +509,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop and no highlight should return complete text. - assert_eq!(&matcher.format(highlight, crop), &text); + assert_eq!(&matcher.format(format_options.clone()), &text); // Text containing some matches. let text = "Natalie risk her future to build a world with the boy she loves."; @@ -521,7 +517,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop and no highlight should return complete text. - assert_eq!(&matcher.format(highlight, crop), &text); + assert_eq!(&matcher.format(format_options.clone()), &text); } #[test] @@ -531,22 +527,21 @@ mod tests { let builder = MatcherBuilder::from_matching_words(matching_words); let analyzer = Analyzer::new(AnalyzerConfig::>::default()); - let highlight = true; - let crop = false; + let format_options = FormatOptions { highlight: true, crop: None }; // empty text. let text = ""; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); - assert_eq!(&matcher.format(highlight, crop), ""); + assert_eq!(&matcher.format(format_options.clone()), ""); // text containing only separators. let text = ":-)"; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); - assert_eq!(&matcher.format(highlight, crop), ":-)"); + assert_eq!(&matcher.format(format_options.clone()), ":-)"); // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; @@ -554,7 +549,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text, because there is no matches. - assert_eq!(&matcher.format(highlight, crop), &text); + assert_eq!(&matcher.format(format_options.clone()), &text); // Text containing all matches. let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; @@ -562,7 +557,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. - assert_eq!(&matcher.format(highlight, crop), "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."); + assert_eq!(&matcher.format(format_options.clone()), "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."); // Text containing some matches. let text = "Natalie risk her future to build a world with the boy she loves."; @@ -571,7 +566,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options.clone()), "Natalie risk her future to build a world with the boy she loves." ); } @@ -588,8 +583,7 @@ mod tests { let builder = MatcherBuilder::from_matching_words(matching_words); let analyzer = Analyzer::new(AnalyzerConfig::>::default()); - let highlight = true; - let crop = false; + let format_options = FormatOptions { highlight: true, crop: None }; // Text containing prefix match. let text = "Ŵôřlḑôle"; @@ -597,7 +591,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. - assert_eq!(&matcher.format(highlight, crop), "Ŵôřlḑôle"); + assert_eq!(&matcher.format(format_options.clone()), "Ŵôřlḑôle"); // Text containing unicode match. let text = "Ŵôřlḑ"; @@ -605,7 +599,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. - assert_eq!(&matcher.format(highlight, crop), "Ŵôřlḑ"); + assert_eq!(&matcher.format(format_options.clone()), "Ŵôřlḑ"); // Text containing unicode match. let text = "Westfália"; @@ -613,7 +607,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. - assert_eq!(&matcher.format(highlight, crop), "Westfália"); + assert_eq!(&matcher.format(format_options.clone()), "Westfália"); } #[test] @@ -623,22 +617,21 @@ mod tests { let builder = MatcherBuilder::from_matching_words(matching_words); let analyzer = Analyzer::new(AnalyzerConfig::>::default()); - let highlight = false; - let crop = true; + let format_options = FormatOptions { highlight: false, crop: Some(10) }; // empty text. let text = ""; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); - assert_eq!(&matcher.format(highlight, crop), ""); + assert_eq!(&matcher.format(format_options.clone()), ""); // text containing only separators. let text = ":-)"; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); - assert_eq!(&matcher.format(highlight, crop), ":-)"); + assert_eq!(&matcher.format(format_options.clone()), ":-)"); // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; @@ -647,7 +640,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // no highlight should return 10 first words with a marker at the end. assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options.clone()), "A quick brown fox can not jump 32 feet, right…" ); @@ -658,7 +651,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // no highlight should return 10 first words with a marker at the end. assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options.clone()), "(A quick brown fox can not jump 32 feet, right…" ); @@ -669,7 +662,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // should crop the phrase instead of croping around the match. assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options.clone()), "…Split The World is a book written by Emily Henry…" ); @@ -680,7 +673,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // no highlight should return 10 last words with a marker at the start. assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options.clone()), "…future to build a world with the boy she loves…" ); @@ -691,7 +684,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // no highlight should return 10 last words with a marker at the start. assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options.clone()), "…she loves. Emily Henry: The Love That Split The World." ); @@ -702,7 +695,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // crop should return 10 last words with a marker at the start. assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options.clone()), "…void void void void void split the world void void" ); @@ -713,7 +706,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // crop should return 10 last words with a marker at the start. assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options.clone()), "…void void void void void split the world void void" ); @@ -724,7 +717,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // crop should return 10 last words with a marker at the start. assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options.clone()), "…void void void void void split the world void void" ); } @@ -736,22 +729,21 @@ mod tests { let builder = MatcherBuilder::from_matching_words(matching_words); let analyzer = Analyzer::new(AnalyzerConfig::>::default()); - let highlight = true; - let crop = true; + let format_options = FormatOptions { highlight: true, crop: Some(10) }; // empty text. let text = ""; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); - assert_eq!(&matcher.format(highlight, crop), ""); + assert_eq!(&matcher.format(format_options.clone()), ""); // text containing only separators. let text = ":-)"; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); - assert_eq!(&matcher.format(highlight, crop), ":-)"); + assert_eq!(&matcher.format(format_options.clone()), ":-)"); // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; @@ -760,7 +752,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // both should return 10 first words with a marker at the end. assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options.clone()), "A quick brown fox can not jump 32 feet, right…" ); @@ -771,7 +763,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // both should return 10 last words with a marker at the start and highlighted matches. assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options.clone()), "…future to build a world with the boy she loves…" ); @@ -781,7 +773,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // both should return 10 last words with a marker at the start and highlighted matches. - assert_eq!(&matcher.format(highlight, crop), "…she loves. Emily Henry: The Love That Split The World."); + assert_eq!(&matcher.format(format_options.clone()), "…she loves. Emily Henry: The Love That Split The World."); // Text containing a match unordered and a match ordered. let text = "The world split void void void void void void void void void split the world void void"; @@ -790,7 +782,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // crop should return 10 last words with a marker at the start. assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options.clone()), "…void void void void void split the world void void" ); } @@ -800,33 +792,33 @@ mod tests { //! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295 let matching_words = matching_words(); - let mut builder = MatcherBuilder::from_matching_words(matching_words); + let builder = MatcherBuilder::from_matching_words(matching_words); let analyzer = Analyzer::new(AnalyzerConfig::>::default()); - let highlight = false; - let crop = true; - let text = "void void split the world void void."; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); // set a smaller crop size - builder.crop_size(2); + let format_options = FormatOptions { highlight: false, crop: Some(2) }; + let mut matcher = builder.build(&tokens[..], text); // because crop size < query size, partially format matches. - assert_eq!(&matcher.format(highlight, crop), "…split the…"); + assert_eq!(&matcher.format(format_options), "…split the…"); // set a smaller crop size - builder.crop_size(1); + let format_options = FormatOptions { highlight: false, crop: Some(1) }; + let mut matcher = builder.build(&tokens[..], text); // because crop size < query size, partially format matches. - assert_eq!(&matcher.format(highlight, crop), "…split…"); + assert_eq!(&matcher.format(format_options), "…split…"); + + // set crop size to 0 + let format_options = FormatOptions { highlight: false, crop: Some(0) }; - // set a smaller crop size - builder.crop_size(0); let mut matcher = builder.build(&tokens[..], text); // because crop size is 0, crop is ignored. - assert_eq!(&matcher.format(highlight, crop), "void void split the world void void."); + assert_eq!(&matcher.format(format_options), "void void split the world void void."); } #[test] @@ -858,8 +850,7 @@ mod tests { builder.highlight_suffix("_".to_string()); let analyzer = Analyzer::new(AnalyzerConfig::>::default()); - let highlight = true; - let crop = false; + let format_options = FormatOptions { highlight: true, crop: None }; let text = "the do or die can't be he do and or isn't he"; let analyzed = analyzer.analyze(&text); @@ -867,7 +858,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); assert_eq!( - &matcher.format(highlight, crop), + &matcher.format(format_options), "_the_ _do_ _or_ die can't be he _do_ and or isn'_t_ _he_", "matches: {:?}", &matcher.matches diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 2b025f269..a9712d261 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -17,7 +17,7 @@ use roaring::bitmap::RoaringBitmap; pub use self::facet::{FacetDistribution, FacetNumberIter, Filter}; use self::fst_utils::{Complement, Intersection, StartsWith, Union}; -pub use self::matches::{MatchBounds, Matcher, MatcherBuilder, MatchingWords}; +pub use self::matches::{FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWords}; use self::query_tree::QueryTreeBuilder; use crate::error::UserError; use crate::search::criteria::r#final::{Final, FinalResult}; From 5809d3ae0d3c7b86d4c65ccabd689038cc3b0bc7 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 12 Apr 2022 16:31:58 +0200 Subject: [PATCH 20/21] Add first benchmarks on formatting --- benchmarks/Cargo.toml | 4 ++ benchmarks/benches/formatting.rs | 68 ++++++++++++++++++++++++++++++++ milli/src/lib.rs | 4 +- milli/src/search/matches/mod.rs | 6 +-- milli/src/search/mod.rs | 4 +- 5 files changed, 79 insertions(+), 7 deletions(-) create mode 100644 benchmarks/benches/formatting.rs diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 0cac5e017..0dbbd6d6f 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -39,3 +39,7 @@ harness = false [[bench]] name = "indexing" harness = false + +[[bench]] +name = "formatting" +harness = false diff --git a/benchmarks/benches/formatting.rs b/benchmarks/benches/formatting.rs new file mode 100644 index 000000000..5045df268 --- /dev/null +++ b/benchmarks/benches/formatting.rs @@ -0,0 +1,68 @@ +use criterion::{criterion_group, criterion_main}; +use milli::tokenizer::{Analyzer, AnalyzerConfig}; +use milli::{FormatOptions, MatcherBuilder, MatchingWord, MatchingWords}; + +#[cfg(target_os = "linux")] +#[global_allocator] +static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; + +struct Conf<'a> { + name: &'a str, + text: &'a str, + matching_words: MatcherBuilder, +} + +fn bench_formatting(c: &mut criterion::Criterion) { + #[rustfmt::skip] + let confs = &[ + Conf { + name: "'the door d'", + text: r#"He used to do the door sounds in "Star Trek" with his mouth, phssst, phssst. The MD-11 passenger and cargo doors also tend to behave like electromagnetic apertures, because the doors do not have continuous electrical contact with the door frames around the door perimeter. But Theodor said that the doors don't work."#, + matching_words: MatcherBuilder::from_matching_words(MatchingWords::new(vec![ + (vec![MatchingWord::new("t".to_string(), 0, false), MatchingWord::new("he".to_string(), 0, false)], vec![0]), + (vec![MatchingWord::new("the".to_string(), 0, false)], vec![0]), + (vec![MatchingWord::new("door".to_string(), 1, false)], vec![1]), + (vec![MatchingWord::new("do".to_string(), 0, false), MatchingWord::new("or".to_string(), 0, false)], vec![0]), + (vec![MatchingWord::new("thedoor".to_string(), 1, false)], vec![0, 1]), + (vec![MatchingWord::new("d".to_string(), 0, true)], vec![2]), + (vec![MatchingWord::new("thedoord".to_string(), 1, true)], vec![0, 1, 2]), + (vec![MatchingWord::new("doord".to_string(), 1, true)], vec![1, 2]), + ])), + }, + ]; + + let format_options = &[ + FormatOptions { highlight: false, crop: None }, + FormatOptions { highlight: true, crop: None }, + FormatOptions { highlight: false, crop: Some(10) }, + FormatOptions { highlight: true, crop: Some(10) }, + FormatOptions { highlight: false, crop: Some(20) }, + FormatOptions { highlight: true, crop: Some(20) }, + ]; + + for option in format_options { + let highlight = if option.highlight { "highlight" } else { "no-highlight" }; + + let name = match option.crop { + Some(size) => format!("{}-crop({})", highlight, size), + None => format!("{}-no-crop", highlight), + }; + + let mut group = c.benchmark_group(&name); + for conf in confs { + group.bench_function(conf.name, |b| { + b.iter(|| { + let analyzer = Analyzer::new(AnalyzerConfig::>::default()); + let analyzed = analyzer.analyze(&conf.text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = conf.matching_words.build(&tokens[..], conf.text); + matcher.format(option.clone()); + }) + }); + } + group.finish(); + } +} + +criterion_group!(benches, bench_formatting); +criterion_main!(benches); diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 793079563..6f5d4abe8 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -37,8 +37,8 @@ pub use self::heed_codec::{ }; pub use self::index::Index; pub use self::search::{ - FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWords, Search, - SearchResult, + FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWord, + MatchingWords, Search, SearchResult, }; pub type Result = std::result::Result; diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index 65ff0a255..ad4f6cd69 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -1,11 +1,9 @@ use std::borrow::Cow; -pub use matching_words::MatchingWords; -use matching_words::{MatchType, PrimitiveWordId}; +use matching_words::{MatchType, PartialMatch, PrimitiveWordId}; +pub use matching_words::{MatchingWord, MatchingWords}; use meilisearch_tokenizer::token::{SeparatorKind, Token}; -use crate::search::matches::matching_words::PartialMatch; - pub mod matching_words; const DEFAULT_CROP_MARKER: &'static str = "…"; diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index a9712d261..979b2fd7a 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -17,7 +17,9 @@ use roaring::bitmap::RoaringBitmap; pub use self::facet::{FacetDistribution, FacetNumberIter, Filter}; use self::fst_utils::{Complement, Intersection, StartsWith, Union}; -pub use self::matches::{FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWords}; +pub use self::matches::{ + FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWord, MatchingWords, +}; use self::query_tree::QueryTreeBuilder; use crate::error::UserError; use crate::search::criteria::r#final::{Final, FinalResult}; From f1115e274ff4fc055d15c10f2cb8517d6b34e84b Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 19 Apr 2022 10:35:50 +0200 Subject: [PATCH 21/21] Use Copy impl of FormatOption instead of clonning --- milli/src/search/matches/mod.rs | 54 ++++++++++++++++----------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs index ad4f6cd69..c7812aa77 100644 --- a/milli/src/search/matches/mod.rs +++ b/milli/src/search/matches/mod.rs @@ -499,7 +499,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop and no highlight should return complete text. - assert_eq!(&matcher.format(format_options.clone()), &text); + assert_eq!(&matcher.format(format_options), &text); // Text containing all matches. let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; @@ -507,7 +507,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop and no highlight should return complete text. - assert_eq!(&matcher.format(format_options.clone()), &text); + assert_eq!(&matcher.format(format_options), &text); // Text containing some matches. let text = "Natalie risk her future to build a world with the boy she loves."; @@ -515,7 +515,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop and no highlight should return complete text. - assert_eq!(&matcher.format(format_options.clone()), &text); + assert_eq!(&matcher.format(format_options), &text); } #[test] @@ -532,14 +532,14 @@ mod tests { let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); - assert_eq!(&matcher.format(format_options.clone()), ""); + assert_eq!(&matcher.format(format_options), ""); // text containing only separators. let text = ":-)"; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); - assert_eq!(&matcher.format(format_options.clone()), ":-)"); + assert_eq!(&matcher.format(format_options), ":-)"); // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; @@ -547,7 +547,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text, because there is no matches. - assert_eq!(&matcher.format(format_options.clone()), &text); + assert_eq!(&matcher.format(format_options), &text); // Text containing all matches. let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; @@ -555,7 +555,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. - assert_eq!(&matcher.format(format_options.clone()), "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."); + assert_eq!(&matcher.format(format_options), "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."); // Text containing some matches. let text = "Natalie risk her future to build a world with the boy she loves."; @@ -564,7 +564,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. assert_eq!( - &matcher.format(format_options.clone()), + &matcher.format(format_options), "Natalie risk her future to build a world with the boy she loves." ); } @@ -589,7 +589,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. - assert_eq!(&matcher.format(format_options.clone()), "Ŵôřlḑôle"); + assert_eq!(&matcher.format(format_options), "Ŵôřlḑôle"); // Text containing unicode match. let text = "Ŵôřlḑ"; @@ -597,7 +597,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. - assert_eq!(&matcher.format(format_options.clone()), "Ŵôřlḑ"); + assert_eq!(&matcher.format(format_options), "Ŵôřlḑ"); // Text containing unicode match. let text = "Westfália"; @@ -605,7 +605,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. - assert_eq!(&matcher.format(format_options.clone()), "Westfália"); + assert_eq!(&matcher.format(format_options), "Westfália"); } #[test] @@ -622,14 +622,14 @@ mod tests { let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); - assert_eq!(&matcher.format(format_options.clone()), ""); + assert_eq!(&matcher.format(format_options), ""); // text containing only separators. let text = ":-)"; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); - assert_eq!(&matcher.format(format_options.clone()), ":-)"); + assert_eq!(&matcher.format(format_options), ":-)"); // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; @@ -638,7 +638,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // no highlight should return 10 first words with a marker at the end. assert_eq!( - &matcher.format(format_options.clone()), + &matcher.format(format_options), "A quick brown fox can not jump 32 feet, right…" ); @@ -649,7 +649,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // no highlight should return 10 first words with a marker at the end. assert_eq!( - &matcher.format(format_options.clone()), + &matcher.format(format_options), "(A quick brown fox can not jump 32 feet, right…" ); @@ -660,7 +660,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // should crop the phrase instead of croping around the match. assert_eq!( - &matcher.format(format_options.clone()), + &matcher.format(format_options), "…Split The World is a book written by Emily Henry…" ); @@ -671,7 +671,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // no highlight should return 10 last words with a marker at the start. assert_eq!( - &matcher.format(format_options.clone()), + &matcher.format(format_options), "…future to build a world with the boy she loves…" ); @@ -682,7 +682,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // no highlight should return 10 last words with a marker at the start. assert_eq!( - &matcher.format(format_options.clone()), + &matcher.format(format_options), "…she loves. Emily Henry: The Love That Split The World." ); @@ -693,7 +693,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // crop should return 10 last words with a marker at the start. assert_eq!( - &matcher.format(format_options.clone()), + &matcher.format(format_options), "…void void void void void split the world void void" ); @@ -704,7 +704,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // crop should return 10 last words with a marker at the start. assert_eq!( - &matcher.format(format_options.clone()), + &matcher.format(format_options), "…void void void void void split the world void void" ); @@ -715,7 +715,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // crop should return 10 last words with a marker at the start. assert_eq!( - &matcher.format(format_options.clone()), + &matcher.format(format_options), "…void void void void void split the world void void" ); } @@ -734,14 +734,14 @@ mod tests { let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); - assert_eq!(&matcher.format(format_options.clone()), ""); + assert_eq!(&matcher.format(format_options), ""); // text containing only separators. let text = ":-)"; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); - assert_eq!(&matcher.format(format_options.clone()), ":-)"); + assert_eq!(&matcher.format(format_options), ":-)"); // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; @@ -750,7 +750,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // both should return 10 first words with a marker at the end. assert_eq!( - &matcher.format(format_options.clone()), + &matcher.format(format_options), "A quick brown fox can not jump 32 feet, right…" ); @@ -761,7 +761,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // both should return 10 last words with a marker at the start and highlighted matches. assert_eq!( - &matcher.format(format_options.clone()), + &matcher.format(format_options), "…future to build a world with the boy she loves…" ); @@ -771,7 +771,7 @@ mod tests { let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // both should return 10 last words with a marker at the start and highlighted matches. - assert_eq!(&matcher.format(format_options.clone()), "…she loves. Emily Henry: The Love That Split The World."); + assert_eq!(&matcher.format(format_options), "…she loves. Emily Henry: The Love That Split The World."); // Text containing a match unordered and a match ordered. let text = "The world split void void void void void void void void void split the world void void"; @@ -780,7 +780,7 @@ mod tests { let mut matcher = builder.build(&tokens[..], text); // crop should return 10 last words with a marker at the start. assert_eq!( - &matcher.format(format_options.clone()), + &matcher.format(format_options), "…void void void void void split the world void void" ); }