From d96e72e5dc2c01305d41fd3cb927ff77696f698f Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 22 Mar 2022 15:22:14 +0100 Subject: [PATCH] Create formater with some tests --- .../search/{ => matches}/matching_words.rs | 48 +- milli/src/search/matches/mod.rs | 434 ++++++++++++++++++ milli/src/search/mod.rs | 4 +- 3 files changed, 469 insertions(+), 17 deletions(-) rename milli/src/search/{ => matches}/matching_words.rs (89%) create mode 100644 milli/src/search/matches/mod.rs diff --git a/milli/src/search/matching_words.rs b/milli/src/search/matches/matching_words.rs similarity index 89% rename from milli/src/search/matching_words.rs rename to milli/src/search/matches/matching_words.rs index 67bdefb37..48f6fe809 100644 --- a/milli/src/search/matching_words.rs +++ b/milli/src/search/matches/matching_words.rs @@ -1,11 +1,11 @@ use std::cmp::{min, Reverse}; -use std::collections::{BTreeMap, HashSet}; +use std::collections::{BTreeMap, HashMap}; use std::ops::{Index, IndexMut}; use levenshtein_automata::{Distance, DFA}; use meilisearch_tokenizer::Token; -use super::build_dfa; +use crate::search::build_dfa; use crate::search::query_tree::{Operation, Query}; type IsPrefix = bool; @@ -14,7 +14,7 @@ type IsPrefix = bool; /// referencing words that match the given query tree. #[derive(Default)] pub struct MatchingWords { - dfas: Vec<(DFA, String, u8, IsPrefix)>, + dfas: Vec<(DFA, String, u8, IsPrefix, usize)>, } impl MatchingWords { @@ -23,11 +23,11 @@ impl MatchingWords { let mut dfas: Vec<_> = fetch_queries(tree) .into_iter() // create DFAs for each word - .map(|(w, t, p)| (build_dfa(w, t, p), w.to_string(), t, p)) + .map(|((w, t, p), id)| (build_dfa(w, t, p), w.to_string(), t, p, id)) .collect(); // Sort word by len in DESC order prioritizing the longuest word, // in order to highlight the longuest part of the matched word. - dfas.sort_unstable_by_key(|(_dfa, query_word, _typo, _is_prefix)| { + dfas.sort_unstable_by_key(|(_dfa, query_word, _typo, _is_prefix, _id)| { Reverse(query_word.len()) }); Self { dfas } @@ -35,14 +35,21 @@ impl MatchingWords { /// Returns the number of matching bytes if the word matches one of the query words. pub fn matching_bytes(&self, word_to_highlight: &Token) -> Option { - self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix)| { + self.matching_bytes_with_id(word_to_highlight).map(|(len, _)| len) + } + + pub fn matching_bytes_with_id(&self, word_to_highlight: &Token) -> Option<(usize, usize)> { + self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix, id)| { match dfa.eval(word_to_highlight.text()) { Distance::Exact(t) if t <= *typo => { if *is_prefix { let len = bytes_to_highlight(word_to_highlight.text(), query_word); - Some(word_to_highlight.num_chars_from_bytes(len)) + Some((word_to_highlight.num_chars_from_bytes(len), *id)) } else { - Some(word_to_highlight.num_chars_from_bytes(word_to_highlight.text().len())) + Some(( + word_to_highlight.num_chars_from_bytes(word_to_highlight.text().len()), + *id, + )) } } _otherwise => None, @@ -52,26 +59,37 @@ impl MatchingWords { } /// Lists all words which can be considered as a match for the query tree. -fn fetch_queries(tree: &Operation) -> HashSet<(&str, u8, IsPrefix)> { - fn resolve_ops<'a>(tree: &'a Operation, out: &mut HashSet<(&'a str, u8, IsPrefix)>) { +fn fetch_queries(tree: &Operation) -> HashMap<(&str, u8, IsPrefix), usize> { + fn resolve_ops<'a>( + tree: &'a Operation, + out: &mut HashMap<(&'a str, u8, IsPrefix), usize>, + id: &mut usize, + ) { match tree { Operation::Or(_, ops) | Operation::And(ops) => { - ops.as_slice().iter().for_each(|op| resolve_ops(op, out)); + ops.as_slice().iter().for_each(|op| resolve_ops(op, out, id)); } Operation::Query(Query { prefix, kind }) => { let typo = if kind.is_exact() { 0 } else { kind.typo() }; - out.insert((kind.word(), typo, *prefix)); + out.entry((kind.word(), typo, *prefix)).or_insert_with(|| { + *id += 1; + *id + }); } Operation::Phrase(words) => { for word in words { - out.insert((word, 0, false)); + out.entry((word, 0, false)).or_insert_with(|| { + *id += 1; + *id + }); } } } } - let mut queries = HashSet::new(); - resolve_ops(tree, &mut queries); + let mut queries = HashMap::new(); + let mut id = 0; + resolve_ops(tree, &mut queries, &mut id); queries } diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs new file mode 100644 index 000000000..0ebf6305f --- /dev/null +++ b/milli/src/search/matches/mod.rs @@ -0,0 +1,434 @@ +use std::borrow::Cow; + +use matching_words::MatchingWords; +use meilisearch_tokenizer::token::SeparatorKind; +use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token}; + +use crate::search::query_tree::Operation; + +pub mod matching_words; + +const DEFAULT_CROP_SIZE: usize = 10; +const DEFAULT_CROP_MARKER: &'static str = "…"; +const DEFAULT_HIGHLIGHT_PREFIX: &'static str = ""; +const DEFAULT_HIGHLIGHT_SUFFIX: &'static str = ""; + +pub struct MatcherBuilder { + matching_words: MatchingWords, + crop_size: usize, + crop_marker: Option, + highlight_prefix: Option, + highlight_suffix: Option, +} + +impl MatcherBuilder { + pub fn from_query_tree(query_tree: &Operation) -> Self { + let matching_words = MatchingWords::from_query_tree(query_tree); + + Self { + matching_words, + crop_size: DEFAULT_CROP_SIZE, + crop_marker: None, + highlight_prefix: None, + highlight_suffix: None, + } + } + + pub fn crop_size(&mut self, word_count: usize) -> &Self { + self.crop_size = word_count; + self + } + + pub fn crop_marker(&mut self, marker: String) -> &Self { + self.crop_marker = Some(marker); + self + } + + pub fn highlight_prefix(&mut self, prefix: String) -> &Self { + self.highlight_prefix = Some(prefix); + self + } + + pub fn highlight_suffix(&mut self, suffix: String) -> &Self { + self.highlight_suffix = Some(suffix); + self + } + + pub fn build<'t, 'm>(&'m self, tokens: &'t [Token], text: &'t str) -> Matcher<'t, 'm> { + let crop_marker = match &self.crop_marker { + Some(marker) => marker.as_str(), + None => &DEFAULT_CROP_MARKER, + }; + + let highlight_prefix = match &self.highlight_prefix { + Some(marker) => marker.as_str(), + None => &DEFAULT_HIGHLIGHT_PREFIX, + }; + let highlight_suffix = match &self.highlight_suffix { + Some(marker) => marker.as_str(), + None => &DEFAULT_HIGHLIGHT_SUFFIX, + }; + Matcher { + text, + tokens, + matching_words: &self.matching_words, + crop_size: self.crop_size, + crop_marker, + highlight_prefix, + highlight_suffix, + matches: None, + } + } +} + +// impl Default for MatcherBuilder { +// fn default() -> Self { +// Self { +// crop_size: DEFAULT_CROP_SIZE, +// crop_marker: None, +// highlight_prefix: None, +// highlight_suffix: None, +// } +// } +// } + +pub struct Match<'t> { + token: &'t Token<'t>, + match_len: usize, + // id of the query word that matches. + id: usize, + // position of the word in the whole text. + position: usize, +} + +pub struct MatchBounds { + start: usize, + length: usize, +} + +impl<'t> From<&Match<'t>> for MatchBounds { + fn from(m: &Match) -> Self { + MatchBounds { start: m.token.byte_start, length: m.match_len } + } +} + +pub struct Matcher<'t, 'm> { + text: &'t str, + tokens: &'t [Token<'t>], + matching_words: &'m MatchingWords, + crop_size: usize, + crop_marker: &'m str, + highlight_prefix: &'m str, + highlight_suffix: &'m str, + matches: Option>>, +} + +impl<'t> Matcher<'t, '_> { + fn compute_matches(&mut self) -> &mut Self { + let mut matches = Vec::new(); + let mut position = 0; + for token in self.tokens { + match token.is_separator() { + Some(SeparatorKind::Hard) => position += 7, + None => { + if let Some((match_len, id)) = + self.matching_words.matching_bytes_with_id(&token) + { + matches.push(Match { token, match_len, id, position }); + } + position += 1; + } + _otherwise => {} + } + } + + self.matches = Some(matches); + self + } + + pub fn matches(&mut self) -> Vec { + match &self.matches { + None => self.compute_matches().matches(), + Some(matches) => matches.iter().map(MatchBounds::from).collect(), + } + } + + fn crop_bounds(&self, matches: &[Match<'t>]) -> (usize, usize) { + let byte_end = self + .tokens + .iter() + .filter(|t| t.is_separator().is_none()) + .enumerate() + .take_while(|(i, _)| *i < self.crop_size) + .last() + .map_or(self.text.len(), |(_, t)| t.byte_end); + + (0, byte_end) + } + + pub fn format(&mut self, highlight: bool, crop: bool) -> Cow<'t, str> { + if !highlight && !crop { + // compute matches is not needed if no highlight or crop is requested. + Cow::Borrowed(self.text) + } else { + match &self.matches { + Some(matches) => { + let (byte_start, byte_end) = + if crop { self.crop_bounds(matches) } else { (0, self.text.len()) }; + + let mut formatted = Vec::new(); + + // push crop marker if it's not the start of the text. + if byte_start > 0 && !self.crop_marker.is_empty() { + formatted.push(self.crop_marker); + } + + let mut byte_index = byte_start; + + if highlight { + // insert highlight markers around matches. + for m in matches + .iter() + .skip_while(|m| m.token.byte_start < byte_start) + .take_while(|m| m.token.byte_start < byte_end) + { + if byte_index < m.token.byte_start { + formatted.push(&self.text[byte_index..m.token.byte_start]); + } + + formatted.push(self.highlight_prefix); + formatted.push(&self.text[m.token.byte_start..m.token.byte_end]); + formatted.push(self.highlight_suffix); + + byte_index = m.token.byte_end; + } + } + + // push the rest of the text between last match and the end of crop. + if byte_index < byte_end { + formatted.push(&self.text[byte_index..byte_end]); + } + + // push crop marker if it's not the end of the text. + if byte_end < self.text.len() && !self.crop_marker.is_empty() { + formatted.push(self.crop_marker); + } + + if formatted.len() == 1 { + // avoid concatenating if there is already 1 slice. + Cow::Borrowed(&self.text[byte_start..byte_end]) + } else { + Cow::Owned(formatted.concat()) + } + } + None => self.compute_matches().format(highlight, crop), + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::search::query_tree::{Query, QueryKind}; + + fn query_tree() -> Operation { + Operation::Or( + false, + vec![Operation::And(vec![ + Operation::Query(Query { + prefix: true, + kind: QueryKind::exact("split".to_string()), + }), + Operation::Query(Query { + prefix: false, + kind: QueryKind::exact("the".to_string()), + }), + Operation::Query(Query { + prefix: true, + kind: QueryKind::tolerant(1, "world".to_string()), + }), + ])], + ) + } + + #[test] + fn format_identity() { + let query_tree = query_tree(); + + let builder = MatcherBuilder::from_query_tree(&query_tree); + let analyzer = Analyzer::new(AnalyzerConfig::>::default()); + + let highlight = false; + let crop = false; + + // Text without any match. + let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no crop and no highlight should return complete text. + assert_eq!(&matcher.format(highlight, crop), &text); + + // Text containing all matches. + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no crop and no highlight should return complete text. + assert_eq!(&matcher.format(highlight, crop), &text); + + // Text containing some matches. + let text = "Natalie risk her future to build a world with the boy she loves."; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no crop and no highlight should return complete text. + assert_eq!(&matcher.format(highlight, crop), &text); + } + + #[test] + fn format_highlight() { + let query_tree = query_tree(); + + let builder = MatcherBuilder::from_query_tree(&query_tree); + let analyzer = Analyzer::new(AnalyzerConfig::>::default()); + + let highlight = true; + let crop = false; + + // Text without any match. + let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no crop should return complete text, because there is no matches. + assert_eq!(&matcher.format(highlight, crop), &text); + + // Text containing all matches. + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no crop should return complete text with highlighted matches. + assert_eq!(&matcher.format(highlight, crop), "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World"); + + // Text containing some matches. + let text = "Natalie risk her future to build a world with the boy she loves."; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no crop should return complete text with highlighted matches. + assert_eq!( + &matcher.format(highlight, crop), + "Natalie risk her future to build a world with the boy she loves." + ); + } + + #[test] + fn format_crop() { + let query_tree = query_tree(); + + let builder = MatcherBuilder::from_query_tree(&query_tree); + let analyzer = Analyzer::new(AnalyzerConfig::>::default()); + + let highlight = false; + let crop = true; + + // Text without any match. + let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no highlight should return 10 first words with a marker at the end. + assert_eq!( + &matcher.format(highlight, crop), + "A quick brown fox can not jump 32 feet, right…" + ); + + // Text containing all matches. + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no highlight should return 10 last words with a marker at the start. + assert_eq!( + &matcher.format(highlight, crop), + "…she loves. Emily Henry: The Love That Split The World" + ); + + // Text containing some matches. + let text = "Natalie risk her future to build a world with the boy she loves."; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no highlight should return 10 last words with a marker at the start. + assert_eq!( + &matcher.format(highlight, crop), + "…future to build a world with the boy she loves." + ); + + // Text containing a match unordered and a match ordered. + let text = "The world split void void void void void void void void void split the world void void"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // crop should return 10 last words with a marker at the start. + assert_eq!( + &matcher.format(highlight, crop), + "…void void void void void split the world void void" + ); + } + + #[test] + fn format_highlight_crop() { + let query_tree = query_tree(); + + let builder = MatcherBuilder::from_query_tree(&query_tree); + let analyzer = Analyzer::new(AnalyzerConfig::>::default()); + + let highlight = true; + let crop = true; + + // Text without any match. + let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // both should return 10 first words with a marker at the end. + assert_eq!( + &matcher.format(highlight, crop), + "A quick brown fox can not jump 32 feet, right…" + ); + + // Text containing all matches. + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // both should return 10 last words with a marker at the start and highlighted matches. + assert_eq!(&matcher.format(highlight, crop), "…she loves. Emily Henry: The Love That Split The World"); + + // Text containing some matches. + let text = "Natalie risk her future to build a world with the boy she loves."; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // both should return 10 last words with a marker at the start and highlighted matches. + assert_eq!( + &matcher.format(highlight, crop), + "…future to build a world with the boy she loves." + ); + + // Text containing a match unordered and a match ordered. + let text = "The world split void void void void void void void void void split the world void void"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // crop should return 10 last words with a marker at the start. + assert_eq!( + &matcher.format(highlight, crop), + "…void void void void void split the world void void" + ); + } +} diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 0d33d9042..a80e520a0 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -17,7 +17,7 @@ use roaring::bitmap::RoaringBitmap; pub use self::facet::{FacetDistribution, FacetNumberIter, Filter}; use self::fst_utils::{Complement, Intersection, StartsWith, Union}; -pub use self::matching_words::MatchingWords; +pub use self::matches::matching_words::MatchingWords; use self::query_tree::QueryTreeBuilder; use crate::error::UserError; use crate::search::criteria::r#final::{Final, FinalResult}; @@ -32,7 +32,7 @@ mod criteria; mod distinct; mod facet; mod fst_utils; -mod matching_words; +mod matches; mod query_tree; pub struct Search<'a> {