diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 0cac5e017..0dbbd6d6f 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -39,3 +39,7 @@ harness = false [[bench]] name = "indexing" harness = false + +[[bench]] +name = "formatting" +harness = false diff --git a/benchmarks/benches/formatting.rs b/benchmarks/benches/formatting.rs new file mode 100644 index 000000000..5045df268 --- /dev/null +++ b/benchmarks/benches/formatting.rs @@ -0,0 +1,68 @@ +use criterion::{criterion_group, criterion_main}; +use milli::tokenizer::{Analyzer, AnalyzerConfig}; +use milli::{FormatOptions, MatcherBuilder, MatchingWord, MatchingWords}; + +#[cfg(target_os = "linux")] +#[global_allocator] +static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; + +struct Conf<'a> { + name: &'a str, + text: &'a str, + matching_words: MatcherBuilder, +} + +fn bench_formatting(c: &mut criterion::Criterion) { + #[rustfmt::skip] + let confs = &[ + Conf { + name: "'the door d'", + text: r#"He used to do the door sounds in "Star Trek" with his mouth, phssst, phssst. The MD-11 passenger and cargo doors also tend to behave like electromagnetic apertures, because the doors do not have continuous electrical contact with the door frames around the door perimeter. But Theodor said that the doors don't work."#, + matching_words: MatcherBuilder::from_matching_words(MatchingWords::new(vec![ + (vec![MatchingWord::new("t".to_string(), 0, false), MatchingWord::new("he".to_string(), 0, false)], vec![0]), + (vec![MatchingWord::new("the".to_string(), 0, false)], vec![0]), + (vec![MatchingWord::new("door".to_string(), 1, false)], vec![1]), + (vec![MatchingWord::new("do".to_string(), 0, false), MatchingWord::new("or".to_string(), 0, false)], vec![0]), + (vec![MatchingWord::new("thedoor".to_string(), 1, false)], vec![0, 1]), + (vec![MatchingWord::new("d".to_string(), 0, true)], vec![2]), + (vec![MatchingWord::new("thedoord".to_string(), 1, true)], vec![0, 1, 2]), + (vec![MatchingWord::new("doord".to_string(), 1, true)], vec![1, 2]), + ])), + }, + ]; + + let format_options = &[ + FormatOptions { highlight: false, crop: None }, + FormatOptions { highlight: true, crop: None }, + FormatOptions { highlight: false, crop: Some(10) }, + FormatOptions { highlight: true, crop: Some(10) }, + FormatOptions { highlight: false, crop: Some(20) }, + FormatOptions { highlight: true, crop: Some(20) }, + ]; + + for option in format_options { + let highlight = if option.highlight { "highlight" } else { "no-highlight" }; + + let name = match option.crop { + Some(size) => format!("{}-crop({})", highlight, size), + None => format!("{}-no-crop", highlight), + }; + + let mut group = c.benchmark_group(&name); + for conf in confs { + group.bench_function(conf.name, |b| { + b.iter(|| { + let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default()); + let analyzed = analyzer.analyze(&conf.text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = conf.matching_words.build(&tokens[..], conf.text); + matcher.format(option.clone()); + }) + }); + } + group.finish(); + } +} + +criterion_group!(benches, bench_formatting); +criterion_main!(benches); diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 7a3ed8ebe..641f82046 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -25,8 +25,8 @@ use milli::update::{ ClearDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting, }; use milli::{ - obkv_to_json, CompressionType, Filter as MilliFilter, FilterCondition, Index, MatchingWords, - SearchResult,
SortError, + obkv_to_json, CompressionType, Filter as MilliFilter, FilterCondition, FormatOptions, Index, + MatcherBuilder, SearchResult, SortError, }; use once_cell::sync::OnceCell; use serde::{Deserialize, Serialize}; @@ -152,43 +152,27 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { Self { analyzer } } - fn highlight_value(&self, value: Value, matching_words: &MatchingWords) -> Value { + fn highlight_value(&self, value: Value, matcher_builder: &MatcherBuilder) -> Value { match value { Value::Null => Value::Null, Value::Bool(boolean) => Value::Bool(boolean), Value::Number(number) => Value::Number(number), Value::String(old_string) => { - let mut string = String::new(); let analyzed = self.analyzer.analyze(&old_string); - for (word, token) in analyzed.reconstruct() { - if token.is_word() { - match matching_words.matching_bytes(&token) { - Some(chars_to_highlight) => { - let mut chars = word.chars(); + let analyzed: Vec<_> = analyzed.tokens().collect(); + let mut matcher = matcher_builder.build(&analyzed[..], &old_string); - string.push_str("<mark>"); - // push the part to highlight - string.extend(chars.by_ref().take(chars_to_highlight)); - string.push_str("</mark>"); - // push the suffix after highlight - string.extend(chars); - } - // no highlight - None => string.push_str(word), - } - } else { - string.push_str(word); - } - } - Value::String(string) + let format_options = FormatOptions { highlight: true, crop: Some(10) }; + + Value::String(matcher.format(format_options).to_string()) } Value::Array(values) => Value::Array( - values.into_iter().map(|v| self.highlight_value(v, matching_words)).collect(), + values.into_iter().map(|v| self.highlight_value(v, matcher_builder)).collect(), ), Value::Object(object) => Value::Object( object .into_iter() - .map(|(k, v)| (k, self.highlight_value(v, matching_words))) + .map(|(k, v)| (k, self.highlight_value(v, matcher_builder))) .collect(), ), } @@ -197,14 +181,14 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { fn highlight_record( &self, object: &mut Map<String, Value>, - matching_words: &MatchingWords, + matcher_builder: &MatcherBuilder, attributes_to_highlight: &HashSet<String>, ) { // TODO do we need to create a string for element that are not and needs to be highlight?
for (key, value) in object.iter_mut() { if attributes_to_highlight.contains(key) { let old_value = mem::take(value); - *value = self.highlight_value(old_value, matching_words); + *value = self.highlight_value(old_value, matcher_builder); } } } @@ -819,12 +803,15 @@ async fn main() -> anyhow::Result<()> { let stop_words = fst::Set::default(); let highlighter = Highlighter::new(&stop_words); + let mut matcher_builder = MatcherBuilder::from_matching_words(matching_words); + matcher_builder.highlight_prefix("<mark>".to_string()); + matcher_builder.highlight_suffix("</mark>".to_string()); for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() { let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap(); if !disable_highlighting { highlighter.highlight_record( &mut object, - &matching_words, + &matcher_builder, &attributes_to_highlight, ); } diff --git a/milli/src/lib.rs b/milli/src/lib.rs index ec28dbb1b..e718dccae 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -36,7 +36,10 @@ pub use self::heed_codec::{ RoaringBitmapLenCodec, StrBEU32Codec, StrStrU8Codec, }; pub use self::index::Index; -pub use self::search::{FacetDistribution, Filter, MatchingWords, Search, SearchResult}; +pub use self::search::{ + FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWord, + MatchingWords, Search, SearchResult, +}; pub type Result<T> = std::result::Result<T, Error>; diff --git a/milli/src/search/matches/matching_words.rs b/milli/src/search/matches/matching_words.rs new file mode 100644 index 000000000..84b47bba5 --- /dev/null +++ b/milli/src/search/matches/matching_words.rs @@ -0,0 +1,435 @@ +use std::cmp::{min, Reverse}; +use std::collections::BTreeMap; +use std::fmt; +use std::ops::{Index, IndexMut}; + +use levenshtein_automata::{Distance, DFA}; +use meilisearch_tokenizer::Token; + +use crate::search::build_dfa; + +type IsPrefix = bool; + +/// Structure created from a query tree +/// referencing words that match the given query tree. +#[derive(Default)] +pub struct MatchingWords { + inner: Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>, +} + +impl MatchingWords { + pub fn new(mut matching_words: Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>) -> Self { + // Sort words by length in DESC order, prioritizing the longest matches, + // in order to highlight the longest part of the matched word. + matching_words.sort_unstable_by_key(|(mw, _)| Reverse((mw.len(), mw[0].word.len()))); + + Self { inner: matching_words } + } + + /// Returns an iterator over terms that match or partially match the given token. + pub fn match_token<'a, 'b>(&'a self, token: &'b Token<'b>) -> MatchesIter<'a, 'b> { + MatchesIter { inner: Box::new(self.inner.iter()), token } + } +} + +/// Iterator over terms that match the given token. +/// This allows matches to be evaluated lazily. +pub struct MatchesIter<'a, 'b> { + inner: Box<dyn Iterator<Item = &'a (Vec<MatchingWord>, Vec<PrimitiveWordId>)> + 'a>, + token: &'b Token<'b>, +} + +impl<'a> Iterator for MatchesIter<'a, '_> { + type Item = MatchType<'a>; + + fn next(&mut self) -> Option<Self::Item> { + match self.inner.next() { + Some((matching_words, ids)) => match matching_words[0].match_token(&self.token) { + Some(char_len) => { + if matching_words.len() > 1 { + Some(MatchType::Partial(PartialMatch { + matching_words: &matching_words[1..], + ids, + char_len, + })) + } else { + Some(MatchType::Full { char_len, ids }) + } + } + None => self.next(), + }, + None => None, + } + } +} + +/// Id of a matching term corresponding to a word written by the end user. +pub type PrimitiveWordId = u8; + +/// Structure used to match a specific term.
+pub struct MatchingWord { + pub dfa: DFA, + pub word: String, + pub typo: u8, + pub prefix: IsPrefix, +} + +impl fmt::Debug for MatchingWord { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("MatchingWord") + .field("word", &self.word) + .field("typo", &self.typo) + .field("prefix", &self.prefix) + .finish() + } +} + +impl PartialEq for MatchingWord { + fn eq(&self, other: &Self) -> bool { + self.prefix == other.prefix && self.typo == other.typo && self.word == other.word + } +} + +impl MatchingWord { + pub fn new(word: String, typo: u8, prefix: IsPrefix) -> Self { + let dfa = build_dfa(&word, typo, prefix); + + Self { dfa, word, typo, prefix } + } + + /// Returns the length in chars of the match if the token matches the term. + pub fn match_token(&self, token: &Token) -> Option<usize> { + match self.dfa.eval(token.text()) { + Distance::Exact(t) if t <= self.typo => { + if self.prefix { + let len = bytes_to_highlight(token.text(), &self.word); + Some(token.num_chars_from_bytes(len)) + } else { + Some(token.num_chars_from_bytes(token.text().len())) + } + } + _otherwise => None, + } + } +} + +/// A given token can partially match a query word for several reasons: +/// - split words +/// - multi-word synonyms +/// In these cases we need to match consecutively several tokens to consider that the match is full. +#[derive(Debug, PartialEq)] +pub enum MatchType<'a> { + Full { char_len: usize, ids: &'a [PrimitiveWordId] }, + Partial(PartialMatch<'a>), +} + +/// Helper structure to match several tokens in a row in order to complete a partial match. +#[derive(Debug, PartialEq)] +pub struct PartialMatch<'a> { + matching_words: &'a [MatchingWord], + ids: &'a [PrimitiveWordId], + char_len: usize, +} + +impl<'a> PartialMatch<'a> { + /// Returns: + /// - None if the given token breaks the partial match + /// - Partial if the given token matches the partial match but doesn't complete it + /// - Full if the given token completes the partial match + pub fn match_token(self, token: &Token) -> Option<MatchType<'a>> { + self.matching_words[0].match_token(token).map(|char_len| { + if self.matching_words.len() > 1 { + MatchType::Partial(PartialMatch { + matching_words: &self.matching_words[1..], + ids: self.ids, + char_len, + }) + } else { + MatchType::Full { char_len, ids: self.ids } + } + }) + } + + pub fn char_len(&self) -> usize { + self.char_len + } +} + +// A simple wrapper around a vec so we can keep the data contiguous but index it like a 2D array. +struct N2Array<T> { + y_size: usize, + buf: Vec<T>, +} + +impl<T: Clone> N2Array<T> { + fn new(x: usize, y: usize, value: T) -> N2Array<T> { + N2Array { y_size: y, buf: vec![value; x * y] } + } +} + +impl<T> Index<(usize, usize)> for N2Array<T> { + type Output = T; + + #[inline] + fn index(&self, (x, y): (usize, usize)) -> &T { + &self.buf[(x * self.y_size) + y] + } +} + +impl<T> IndexMut<(usize, usize)> for N2Array<T> { + #[inline] + fn index_mut(&mut self, (x, y): (usize, usize)) -> &mut T { + &mut self.buf[(x * self.y_size) + y] + } +} + +/// Returns the number of **bytes** we want to highlight in the `source` word.
+/// Basically we want to highlight as many characters as possible in the source until it has too many +/// typos (= 2) +/// The algorithm is a modified +/// [Damerau-Levenshtein](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) +fn bytes_to_highlight(source: &str, target: &str) -> usize { + let n = source.chars().count(); + let m = target.chars().count(); + + if n == 0 { + return 0; + } + // since we allow two typos we can send two characters even if it's completely wrong + if m < 3 { + return source.chars().take(m).map(|c| c.len_utf8()).sum(); + } + if n == m && source == target { + return source.len(); + } + + let inf = n + m; + let mut matrix = N2Array::new(n + 2, m + 2, 0); + + matrix[(0, 0)] = inf; + for i in 0..=n { + matrix[(i + 1, 0)] = inf; + matrix[(i + 1, 1)] = i; + } + for j in 0..=m { + matrix[(0, j + 1)] = inf; + matrix[(1, j + 1)] = j; + } + + let mut last_row = BTreeMap::new(); + + for (row, char_s) in source.chars().enumerate() { + let mut last_match_col = 0; + let row = row + 1; + + for (col, char_t) in target.chars().enumerate() { + let col = col + 1; + let last_match_row = *last_row.get(&char_t).unwrap_or(&0); + let cost = if char_s == char_t { 0 } else { 1 }; + + let dist_add = matrix[(row, col + 1)] + 1; + let dist_del = matrix[(row + 1, col)] + 1; + let dist_sub = matrix[(row, col)] + cost; + let dist_trans = matrix[(last_match_row, last_match_col)] + + (row - last_match_row - 1) + + 1 + + (col - last_match_col - 1); + let dist = min(min(dist_add, dist_del), min(dist_sub, dist_trans)); + matrix[(row + 1, col + 1)] = dist; + + if cost == 0 { + last_match_col = col; + } + } + + last_row.insert(char_s, row); + } + + let mut minimum = (u32::max_value(), 0); + for x in 0..=m { + let dist = matrix[(n + 1, x + 1)] as u32; + if dist < minimum.0 { + minimum = (dist, x); + } + } + + // everything was done character-wise and now we want to return a number of bytes + source.chars().take(minimum.1).map(|c| c.len_utf8()).sum() +} + +#[cfg(test)] +mod tests { + use std::borrow::Cow; + use std::str::from_utf8; + + use meilisearch_tokenizer::TokenKind; + + use super::*; + use crate::MatchingWords; + + #[test] + fn test_bytes_to_highlight() { + struct TestBytesToHighlight { + query: &'static str, + text: &'static str, + length: usize, + } + let tests = [ + TestBytesToHighlight { query: "bip", text: "bip", length: "bip".len() }, + TestBytesToHighlight { query: "bip", text: "boup", length: "bip".len() }, + TestBytesToHighlight { + query: "Levenshtein", + text: "Levenshtein", + length: "Levenshtein".len(), + }, + // we get to the end of our word with only one typo + TestBytesToHighlight { + query: "Levenste", + text: "Levenshtein", + length: "Levenste".len(), + }, + // we get our third and last authorized typo right on the last character + TestBytesToHighlight { + query: "Levenstein", + text: "Levenshte", + length: "Levenste".len(), + }, + // we get to the end of our word with only two typos at the beginning + TestBytesToHighlight { + query: "Bavenshtein", + text: "Levenshtein", + length: "Bavenshtein".len(), + }, + TestBytesToHighlight { + query: "Альфа", text: "Альфой", length: "Альф".len() + }, + TestBytesToHighlight { + query: "Go💼", text: "Go💼od luck.", length: "Go💼".len() + }, + TestBytesToHighlight { + query: "Go💼od", text: "Go💼od luck.", length: "Go💼od".len() + }, + TestBytesToHighlight { + query: "chäräcters", + text: "chäräcters", + length: "chäräcters".len(), + }, + TestBytesToHighlight { query: "ch", text: "chäräcters", length: "ch".len() },
TestBytesToHighlight { query: "chär", text: "chäräcters", length: "chär".len() }, + ]; + + for test in &tests { + let length = bytes_to_highlight(test.text, test.query); + assert_eq!(length, test.length, r#"lenght between: "{}" "{}""#, test.query, test.text); + assert!( + from_utf8(&test.query.as_bytes()[..length]).is_ok(), + r#"converting {}[..{}] to an utf8 str failed"#, + test.query, + length + ); + } + } + + #[test] + fn matching_words() { + let matching_words = vec![ + (vec![MatchingWord::new("split".to_string(), 1, true)], vec![0]), + (vec![MatchingWord::new("this".to_string(), 0, false)], vec![1]), + (vec![MatchingWord::new("world".to_string(), 1, true)], vec![2]), + ]; + + let matching_words = MatchingWords::new(matching_words); + + assert_eq!( + matching_words + .match_token(&Token { + kind: TokenKind::Word, + word: Cow::Borrowed("word"), + byte_start: 0, + char_index: 0, + byte_end: "word".len(), + char_map: None, + }) + .next(), + Some(MatchType::Full { char_len: 3, ids: &[2] }) + ); + assert_eq!( + matching_words + .match_token(&Token { + kind: TokenKind::Word, + word: Cow::Borrowed("nyc"), + byte_start: 0, + char_index: 0, + byte_end: "nyc".len(), + char_map: None, + }) + .next(), + None + ); + assert_eq!( + matching_words + .match_token(&Token { + kind: TokenKind::Word, + word: Cow::Borrowed("world"), + byte_start: 0, + char_index: 0, + byte_end: "world".len(), + char_map: None, + }) + .next(), + Some(MatchType::Full { char_len: 5, ids: &[2] }) + ); + assert_eq!( + matching_words + .match_token(&Token { + kind: TokenKind::Word, + word: Cow::Borrowed("splitted"), + byte_start: 0, + char_index: 0, + byte_end: "splitted".len(), + char_map: None, + }) + .next(), + Some(MatchType::Full { char_len: 5, ids: &[0] }) + ); + assert_eq!( + matching_words + .match_token(&Token { + kind: TokenKind::Word, + word: Cow::Borrowed("thisnew"), + byte_start: 0, + char_index: 0, + byte_end: "thisnew".len(), + char_map: None, + }) + .next(), + None + ); + assert_eq!( + matching_words + .match_token(&Token { + kind: TokenKind::Word, + word: Cow::Borrowed("borld"), + byte_start: 0, + char_index: 0, + byte_end: "borld".len(), + char_map: None, + }) + .next(), + Some(MatchType::Full { char_len: 5, ids: &[2] }) + ); + assert_eq!( + matching_words + .match_token(&Token { + kind: TokenKind::Word, + word: Cow::Borrowed("wordsplit"), + byte_start: 0, + char_index: 0, + byte_end: "wordsplit".len(), + char_map: None, + }) + .next(), + Some(MatchType::Full { char_len: 4, ids: &[2] }) + ); + } +} diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs new file mode 100644 index 000000000..c7812aa77 --- /dev/null +++ b/milli/src/search/matches/mod.rs @@ -0,0 +1,865 @@ +use std::borrow::Cow; + +use matching_words::{MatchType, PartialMatch, PrimitiveWordId}; +pub use matching_words::{MatchingWord, MatchingWords}; +use meilisearch_tokenizer::token::{SeparatorKind, Token}; + +pub mod matching_words; + +const DEFAULT_CROP_MARKER: &'static str = "…"; +const DEFAULT_HIGHLIGHT_PREFIX: &'static str = ""; +const DEFAULT_HIGHLIGHT_SUFFIX: &'static str = ""; + +pub struct MatcherBuilder { + matching_words: MatchingWords, + crop_marker: Option, + highlight_prefix: Option, + highlight_suffix: Option, +} + +impl MatcherBuilder { + pub fn from_matching_words(matching_words: MatchingWords) -> Self { + Self { matching_words, crop_marker: None, highlight_prefix: None, highlight_suffix: None } + } + + pub fn crop_marker(&mut self, marker: String) -> &Self { + self.crop_marker = Some(marker); + self + } + + 
pub fn highlight_prefix(&mut self, prefix: String) -> &Self { + self.highlight_prefix = Some(prefix); + self + } + + pub fn highlight_suffix(&mut self, suffix: String) -> &Self { + self.highlight_suffix = Some(suffix); + self + } + + pub fn build<'t, 'm>(&'m self, tokens: &'t [Token], text: &'t str) -> Matcher<'t, 'm> { + let crop_marker = match &self.crop_marker { + Some(marker) => marker.as_str(), + None => &DEFAULT_CROP_MARKER, + }; + + let highlight_prefix = match &self.highlight_prefix { + Some(marker) => marker.as_str(), + None => &DEFAULT_HIGHLIGHT_PREFIX, + }; + let highlight_suffix = match &self.highlight_suffix { + Some(marker) => marker.as_str(), + None => &DEFAULT_HIGHLIGHT_SUFFIX, + }; + Matcher { + text, + tokens, + matching_words: &self.matching_words, + crop_marker, + highlight_prefix, + highlight_suffix, + matches: None, + } + } +} + +#[derive(Copy, Clone, Default)] +pub struct FormatOptions { + pub highlight: bool, + pub crop: Option<usize>, +} + +impl FormatOptions { + pub fn merge(self, other: Self) -> Self { + Self { highlight: self.highlight || other.highlight, crop: self.crop.or(other.crop) } + } +} + +#[derive(Clone, Debug)] +pub struct Match { + match_len: usize, + // ids of the query words that match. + ids: Vec<PrimitiveWordId>, + // position of the word in the whole text. + word_position: usize, + // position of the token in the whole text. + token_position: usize, +} + +#[derive(Clone, Debug)] +pub struct MatchBounds { + pub start: usize, + pub length: usize, +} + +pub struct Matcher<'t, 'm> { + text: &'t str, + tokens: &'t [Token<'t>], + matching_words: &'m MatchingWords, + crop_marker: &'m str, + highlight_prefix: &'m str, + highlight_suffix: &'m str, + matches: Option<Vec<Match>>, +} + +impl<'t> Matcher<'t, '_> { + /// Iterates over tokens and saves any of them that match the query. + fn compute_matches(&mut self) -> &mut Self { + fn compute_partial_match<'a>( + mut partial: PartialMatch, + token_position: usize, + word_position: usize, + words_positions: &mut impl Iterator<Item = (usize, usize, &'a Token<'a>)>, + matches: &mut Vec<Match>, + ) -> bool { + let mut potential_matches = Vec::new(); + + // Add first match to potential matches. + potential_matches.push((token_position, word_position, partial.char_len())); + + for (token_position, word_position, word) in words_positions { + partial = match partial.match_token(&word) { + // token matches the partial match, but the match is not full, + // we temporarily save the current token then we try to match the next one. + Some(MatchType::Partial(partial)) => { + potential_matches.push((token_position, word_position, partial.char_len())); + partial + } + // partial match is now full, we keep these matches and advance the positions + Some(MatchType::Full { char_len, ids }) => { + // save previously matched tokens as matches. + let iter = potential_matches.into_iter().map( + |(token_position, word_position, match_len)| Match { + match_len, + ids: ids.to_vec(), + word_position, + token_position, + }, + ); + matches.extend(iter); + + // save the token that closes the partial match as a match. + matches.push(Match { + match_len: char_len, + ids: ids.to_vec(), + word_position, + token_position, + }); + + // the match is complete, we return true. + return true; + } + // no match, abort the partial match. + None => break, + }; + } + + // the match is not complete, we return false.
+ false + } + + let mut matches = Vec::new(); + + let mut words_positions = self + .tokens + .iter() + .scan((0, 0), |(token_position, word_position), token| { + let current_token_position = *token_position; + let current_word_position = *word_position; + *token_position += 1; + if token.is_separator().is_none() { + *word_position += 1; + } + + Some((current_token_position, current_word_position, token)) + }) + .filter(|(_, _, token)| token.is_separator().is_none()); + + while let Some((token_position, word_position, word)) = words_positions.next() { + for match_type in self.matching_words.match_token(word) { + match match_type { + // we match, we save the current token as a match, + // then we continue the rest of the tokens. + MatchType::Full { char_len, ids } => { + matches.push(Match { + match_len: char_len, + ids: ids.to_vec(), + word_position, + token_position, + }); + break; + } + // we match partially, iterate over next tokens to check if we can complete the match. + MatchType::Partial(partial) => { + // if match is completed, we break the matching loop over the current token, + // then we continue the rest of the tokens. + let mut wp = words_positions.clone(); + if compute_partial_match( + partial, + token_position, + word_position, + &mut wp, + &mut matches, + ) { + words_positions = wp; + break; + } + } + } + } + } + + self.matches = Some(matches); + self + } + + /// Returns boundaries of the words that match the query. + pub fn matches(&mut self) -> Vec<MatchBounds> { + match &self.matches { + None => self.compute_matches().matches(), + Some(matches) => matches + .iter() + .map(|m| MatchBounds { + start: self.tokens[m.token_position].byte_start, + length: m.match_len, + }) + .collect(), + } + } + + /// Returns the bounds in byte index of the crop window. + fn crop_bounds(&self, matches: &[Match], crop_size: usize) -> (usize, usize) { + // if there is no match, we start from the beginning of the string by default. + let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0); + let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0); + let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0); + let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0); + + // matches need to be counted in the crop length. + let mut remaining_words = crop_size + first_match_word_position - last_match_word_position; + + let mut before_tokens = self.tokens[..first_match_token_position].iter().rev().peekable(); + let mut after_tokens = self.tokens[last_match_token_position..].iter().peekable(); + + while remaining_words > 0 { + let before_token = before_tokens.peek().map(|t| t.is_separator()); + let after_token = after_tokens.peek().map(|t| t.is_separator()); + + match (before_token, after_token) { + // we can expand both sides. + (Some(before_token), Some(after_token)) => { + match (before_token, after_token) { + // if they are both separators and are the same kind then advance both, + // or expand on the soft separator side. + (Some(before_token_kind), Some(after_token_kind)) => { + if before_token_kind == after_token_kind { + before_tokens.next(); + after_tokens.next(); + } else if before_token_kind == SeparatorKind::Hard { + after_tokens.next(); + } else { + before_tokens.next(); + } + } + // if one of the tokens is a word, we expand on the side of the word. + // left is a word, advance left.
+ (None, Some(_)) => { + before_tokens.next(); + remaining_words -= 1; + } + // right is a word, advance right. + (Some(_), None) => { + after_tokens.next(); + remaining_words -= 1; + } + // both are words, advance left then right if remaining_word > 0. + (None, None) => { + before_tokens.next(); + remaining_words -= 1; + + if remaining_words > 0 { + after_tokens.next(); + remaining_words -= 1; + } + } + } + } + // the end of the text is reached, advance left. + (Some(before_token), None) => { + before_tokens.next(); + if before_token.is_none() { + remaining_words -= 1; + } + } + // the start of the text is reached, advance right. + (None, Some(after_token)) => { + after_tokens.next(); + if after_token.is_none() { + remaining_words -= 1; + } + } + // no more token to add. + (None, None) => break, + } + } + + let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end); + let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start); + + (crop_byte_start, crop_byte_end) + } + + /// Compute the score of a match interval: + /// 1) count unique matches + /// 2) calculate distance between matches + /// 3) count ordered matches + fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) { + let mut ids: Vec<PrimitiveWordId> = Vec::with_capacity(matches.len()); + let mut order_score = 0; + let mut distance_score = 0; + + let mut iter = matches.iter().peekable(); + while let Some(m) = iter.next() { + if let Some(next_match) = iter.peek() { + // if matches are ordered + if next_match.ids.iter().min() > m.ids.iter().min() { + order_score += 1; + } + + // compute distance between matches + distance_score -= (next_match.word_position - m.word_position).min(7) as i16; + } + + ids.extend(m.ids.iter()); + } + + ids.sort_unstable(); + ids.dedup(); + let uniq_score = ids.len() as i16; + + // rank by unique match count, then by distance between matches, then by ordered match count. + (uniq_score, distance_score, order_score) + } + + /// Returns the match interval where the score computed by match_interval_score is maximal. + fn find_best_match_interval<'a>(&self, matches: &'a [Match], crop_size: usize) -> &'a [Match] { + // we compute the match interval if we have at least 2 matches. + if matches.len() > 1 { + // positions of the first and the last match of the best match interval in `matches`. + let mut best_interval = (0, 0); + let mut best_interval_score = self.match_interval_score(&matches[0..=0]); + // current interval positions. + let mut interval_first = 0; + let mut interval_last = 0; + for (index, next_match) in matches.iter().enumerate().skip(1) { + // if the next match would make the interval grow to more than crop_size, + // we compare the current interval with the best one, + // then we increase `interval_first` until the next match can be added. + if next_match.word_position - matches[interval_first].word_position >= crop_size { + let interval_score = + self.match_interval_score(&matches[interval_first..=interval_last]); + + // keep interval if it's the best + if interval_score > best_interval_score { + best_interval = (interval_first, interval_last); + best_interval_score = interval_score; + } + + // advance start of the interval while interval is longer than crop_size. + while next_match.word_position - matches[interval_first].word_position + >= crop_size + { + interval_first += 1; + } + } + interval_last = index; + } + + // compute the last interval score and compare it to the best one.
+ let interval_score = + self.match_interval_score(&matches[interval_first..=interval_last]); + if interval_score > best_interval_score { + best_interval = (interval_first, interval_last); + } + + &matches[best_interval.0..=best_interval.1] + } else { + matches + } + } + + // Returns the formatted version of the original text. + pub fn format(&mut self, format_options: FormatOptions) -> Cow<'t, str> { + if !format_options.highlight && format_options.crop.is_none() { + // computing matches is not needed if neither highlight nor crop is requested. + Cow::Borrowed(self.text) + } else { + match &self.matches { + Some(matches) => { + let matches = match format_options.crop { + Some(crop_size) if crop_size > 0 => { + self.find_best_match_interval(matches, crop_size) + } + _ => matches, + }; + + let (byte_start, byte_end) = match format_options.crop { + Some(crop_size) if crop_size > 0 => self.crop_bounds(matches, crop_size), + _ => (0, self.text.len()), + }; + + let mut formatted = Vec::new(); + + // push crop marker if it's not the start of the text. + if byte_start > 0 && !self.crop_marker.is_empty() { + formatted.push(self.crop_marker); + } + + let mut byte_index = byte_start; + + if format_options.highlight { + // insert highlight markers around matches. + let tokens = self.tokens; + for m in matches { + let token = &tokens[m.token_position]; + + if byte_index < token.byte_start { + formatted.push(&self.text[byte_index..token.byte_start]); + } + + let highlight_byte_index = self.text[token.byte_start..] + .char_indices() + .enumerate() + .find(|(i, _)| *i == m.match_len) + .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start); + formatted.push(self.highlight_prefix); + formatted.push(&self.text[token.byte_start..highlight_byte_index]); + formatted.push(self.highlight_suffix); + // if it's a prefix highlight, we put the end of the word after the highlight marker. + if highlight_byte_index < token.byte_end { + formatted.push(&self.text[highlight_byte_index..token.byte_end]); + } + + byte_index = token.byte_end; + } + } + + // push the rest of the text between last match and the end of crop. + if byte_index < byte_end { + formatted.push(&self.text[byte_index..byte_end]); + } + + // push crop marker if it's not the end of the text. + if byte_end < self.text.len() && !self.crop_marker.is_empty() { + formatted.push(self.crop_marker); + } + + if formatted.len() == 1 { + // avoid concatenating if there is already 1 slice. + Cow::Borrowed(&self.text[byte_start..byte_end]) + } else { + Cow::Owned(formatted.concat()) + } + } + None => self.compute_matches().format(format_options), + } + } + } +} + +#[cfg(test)] +mod tests { + use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; + + use super::*; + use crate::search::matches::matching_words::MatchingWord; + + fn matching_words() -> MatchingWords { + let matching_words = vec![ + (vec![MatchingWord::new("split".to_string(), 0, false)], vec![0]), + (vec![MatchingWord::new("the".to_string(), 0, false)], vec![1]), + (vec![MatchingWord::new("world".to_string(), 1, true)], vec![2]), + ]; + + MatchingWords::new(matching_words) + } + + #[test] + fn format_identity() { + let matching_words = matching_words(); + + let builder = MatcherBuilder::from_matching_words(matching_words); + let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default()); + + let format_options = FormatOptions { highlight: false, crop: None }; + + // Text without any match. + let text = "A quick brown fox can not jump 32 feet, right?
Brr, it is cold!"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no crop and no highlight should return complete text. + assert_eq!(&matcher.format(format_options), &text); + + // Text containing all matches. + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no crop and no highlight should return complete text. + assert_eq!(&matcher.format(format_options), &text); + + // Text containing some matches. + let text = "Natalie risk her future to build a world with the boy she loves."; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no crop and no highlight should return complete text. + assert_eq!(&matcher.format(format_options), &text); + } + + #[test] + fn format_highlight() { + let matching_words = matching_words(); + + let builder = MatcherBuilder::from_matching_words(matching_words); + let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default()); + + let format_options = FormatOptions { highlight: true, crop: None }; + + // empty text. + let text = ""; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + assert_eq!(&matcher.format(format_options), ""); + + // text containing only separators. + let text = ":-)"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + assert_eq!(&matcher.format(format_options), ":-)"); + + // Text without any match. + let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no crop should return complete text, because there are no matches. + assert_eq!(&matcher.format(format_options), &text); + + // Text containing all matches. + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no crop should return complete text with highlighted matches. + assert_eq!(&matcher.format(format_options), "Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."); + + // Text containing some matches. + let text = "Natalie risk her future to build a world with the boy she loves."; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no crop should return complete text with highlighted matches. + assert_eq!( + &matcher.format(format_options), + "Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves."
+ ); + } + + #[test] + fn highlight_unicode() { + let matching_words = vec![ + (vec![MatchingWord::new("wessfali".to_string(), 1, true)], vec![0]), + (vec![MatchingWord::new("world".to_string(), 1, true)], vec![1]), + ]; + + let matching_words = MatchingWords::new(matching_words); + + let builder = MatcherBuilder::from_matching_words(matching_words); + let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default()); + + let format_options = FormatOptions { highlight: true, crop: None }; + + // Text containing prefix match. + let text = "Ŵôřlḑôle"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no crop should return complete text with highlighted matches. + assert_eq!(&matcher.format(format_options), "<em>Ŵôřlḑ</em>ôle"); + + // Text containing unicode match. + let text = "Ŵôřlḑ"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no crop should return complete text with highlighted matches. + assert_eq!(&matcher.format(format_options), "<em>Ŵôřlḑ</em>"); + + // Text containing unicode match. + let text = "Westfália"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no crop should return complete text with highlighted matches. + assert_eq!(&matcher.format(format_options), "<em>Westfáli</em>a"); + } + + #[test] + fn format_crop() { + let matching_words = matching_words(); + + let builder = MatcherBuilder::from_matching_words(matching_words); + let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default()); + + let format_options = FormatOptions { highlight: false, crop: Some(10) }; + + // empty text. + let text = ""; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + assert_eq!(&matcher.format(format_options), ""); + + // text containing only separators. + let text = ":-)"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + assert_eq!(&matcher.format(format_options), ":-)"); + + // Text without any match. + let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no highlight should return 10 first words with a marker at the end. + assert_eq!( + &matcher.format(format_options), + "A quick brown fox can not jump 32 feet, right…" + ); + + // Text without any match starting by a separator. + let text = "(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no highlight should return 10 first words with a marker at the end. + assert_eq!( + &matcher.format(format_options), + "(A quick brown fox can not jump 32 feet, right…" + ); + + // Test phrase propagation + let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it."; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // should crop the phrase instead of cropping around the match.
+ assert_eq!( + &matcher.format(format_options), + "…Split The World is a book written by Emily Henry…" + ); + + // Text containing some matches. + let text = "Natalie risk her future to build a world with the boy she loves."; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no highlight should return 10 last words with a marker at the start. + assert_eq!( + &matcher.format(format_options), + "…future to build a world with the boy she loves…" + ); + + // Text containing all matches. + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no highlight should return 10 last words with a marker at the start. + assert_eq!( + &matcher.format(format_options), + "…she loves. Emily Henry: The Love That Split The World." + ); + + // Text containing a match unordered and a match ordered. + let text = "The world split void void void void void void void void void split the world void void"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // crop should return 10 last words with a marker at the start. + assert_eq!( + &matcher.format(format_options), + "…void void void void void split the world void void" + ); + + // Text containing matches with different density. + let text = "split void the void void world void void void void void void void void void void split the world void void"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // crop should return 10 last words with a marker at the start. + assert_eq!( + &matcher.format(format_options), + "…void void void void void split the world void void" + ); + + // Text containing matches with same word. + let text = "split split split split split split void void void void void void void void void void split the world void void"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // crop should return 10 last words with a marker at the start. + assert_eq!( + &matcher.format(format_options), + "…void void void void void split the world void void" + ); + } + + #[test] + fn format_highlight_crop() { + let matching_words = matching_words(); + + let builder = MatcherBuilder::from_matching_words(matching_words); + let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default()); + + let format_options = FormatOptions { highlight: true, crop: Some(10) }; + + // empty text. + let text = ""; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + assert_eq!(&matcher.format(format_options), ""); + + // text containing only separators. + let text = ":-)"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + assert_eq!(&matcher.format(format_options), ":-)"); + + // Text without any match. + let text = "A quick brown fox can not jump 32 feet, right?
Brr, it is cold!"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // both should return 10 first words with a marker at the end. + assert_eq!( + &matcher.format(format_options), + "A quick brown fox can not jump 32 feet, right…" + ); + + // Text containing some matches. + let text = "Natalie risk her future to build a world with the boy she loves."; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // both should return 10 last words with a marker at the start and highlighted matches. + assert_eq!( + &matcher.format(format_options), + "…future to build a <em>world</em> with <em>the</em> boy she loves…" + ); + + // Text containing all matches. + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // both should return 10 last words with a marker at the start and highlighted matches. + assert_eq!(&matcher.format(format_options), "…she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."); + + // Text containing a match unordered and a match ordered. + let text = "The world split void void void void void void void void void split the world void void"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // crop should return 10 last words with a marker at the start. + assert_eq!( + &matcher.format(format_options), + "…void void void void void <em>split</em> <em>the</em> <em>world</em> void void" + ); + } + + #[test] + fn smaller_crop_size() { + //! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295 + let matching_words = matching_words(); + + let builder = MatcherBuilder::from_matching_words(matching_words); + let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default()); + + let text = "void void split the world void void."; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + + // set a smaller crop size + let format_options = FormatOptions { highlight: false, crop: Some(2) }; + + let mut matcher = builder.build(&tokens[..], text); + // because crop size < query size, partially format matches. + assert_eq!(&matcher.format(format_options), "…split the…"); + + // set a smaller crop size + let format_options = FormatOptions { highlight: false, crop: Some(1) }; + + let mut matcher = builder.build(&tokens[..], text); + // because crop size < query size, partially format matches. + assert_eq!(&matcher.format(format_options), "…split…"); + + // set crop size to 0 + let format_options = FormatOptions { highlight: false, crop: Some(0) }; + + let mut matcher = builder.build(&tokens[..], text); + // because crop size is 0, crop is ignored.
+ assert_eq!(&matcher.format(format_options), "void void split the world void void."); + } + + #[test] + fn partial_matches() { + let matching_words = vec![ + (vec![MatchingWord::new("the".to_string(), 0, false)], vec![0]), + ( + vec![ + MatchingWord::new("t".to_string(), 0, false), + MatchingWord::new("he".to_string(), 0, false), + ], + vec![0], + ), + (vec![MatchingWord::new("door".to_string(), 0, false)], vec![1]), + ( + vec![ + MatchingWord::new("do".to_string(), 0, false), + MatchingWord::new("or".to_string(), 0, false), + ], + vec![1], + ), + (vec![MatchingWord::new("do".to_string(), 0, false)], vec![2]), + ]; + + let matching_words = MatchingWords::new(matching_words); + + let mut builder = MatcherBuilder::from_matching_words(matching_words); + builder.highlight_prefix("_".to_string()); + builder.highlight_suffix("_".to_string()); + let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default()); + + let format_options = FormatOptions { highlight: true, crop: None }; + + let text = "the do or die can't be he do and or isn't he"; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + + let mut matcher = builder.build(&tokens[..], text); + assert_eq!( + &matcher.format(format_options), + "_the_ _do_ _or_ die can't be he _do_ and or isn'_t_ _he_", + "matches: {:?}", + &matcher.matches + ); + } +} diff --git a/milli/src/search/matching_words.rs b/milli/src/search/matching_words.rs deleted file mode 100644 index 67bdefb37..000000000 --- a/milli/src/search/matching_words.rs +++ /dev/null @@ -1,354 +0,0 @@ -use std::cmp::{min, Reverse}; -use std::collections::{BTreeMap, HashSet}; -use std::ops::{Index, IndexMut}; - -use levenshtein_automata::{Distance, DFA}; -use meilisearch_tokenizer::Token; - -use super::build_dfa; -use crate::search::query_tree::{Operation, Query}; - -type IsPrefix = bool; - -/// Structure created from a query tree -/// referencing words that match the given query tree. -#[derive(Default)] -pub struct MatchingWords { - dfas: Vec<(DFA, String, u8, IsPrefix)>, -} - -impl MatchingWords { - pub fn from_query_tree(tree: &Operation) -> Self { - // fetch matchable words from the query tree - let mut dfas: Vec<_> = fetch_queries(tree) - .into_iter() - // create DFAs for each word - .map(|(w, t, p)| (build_dfa(w, t, p), w.to_string(), t, p)) - .collect(); - // Sort word by len in DESC order prioritizing the longuest word, - // in order to highlight the longuest part of the matched word. - dfas.sort_unstable_by_key(|(_dfa, query_word, _typo, _is_prefix)| { - Reverse(query_word.len()) - }); - Self { dfas } - } - - /// Returns the number of matching bytes if the word matches one of the query words. - pub fn matching_bytes(&self, word_to_highlight: &Token) -> Option<usize> { - self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix)| { - match dfa.eval(word_to_highlight.text()) { - Distance::Exact(t) if t <= *typo => { - if *is_prefix { - let len = bytes_to_highlight(word_to_highlight.text(), query_word); - Some(word_to_highlight.num_chars_from_bytes(len)) - } else { - Some(word_to_highlight.num_chars_from_bytes(word_to_highlight.text().len())) - } - } - _otherwise => None, - } - }) - } -} - -/// Lists all words which can be considered as a match for the query tree.
-fn fetch_queries(tree: &Operation) -> HashSet<(&str, u8, IsPrefix)> { - fn resolve_ops<'a>(tree: &'a Operation, out: &mut HashSet<(&'a str, u8, IsPrefix)>) { - match tree { - Operation::Or(_, ops) | Operation::And(ops) => { - ops.as_slice().iter().for_each(|op| resolve_ops(op, out)); - } - Operation::Query(Query { prefix, kind }) => { - let typo = if kind.is_exact() { 0 } else { kind.typo() }; - out.insert((kind.word(), typo, *prefix)); - } - Operation::Phrase(words) => { - for word in words { - out.insert((word, 0, false)); - } - } - } - } - - let mut queries = HashSet::new(); - resolve_ops(tree, &mut queries); - queries -} - -// A simple wrapper around vec so we can get contiguous but index it like it's 2D array. -struct N2Array<T> { - y_size: usize, - buf: Vec<T>, -} - -impl<T: Clone> N2Array<T> { - fn new(x: usize, y: usize, value: T) -> N2Array<T> { - N2Array { y_size: y, buf: vec![value; x * y] } - } -} - -impl<T> Index<(usize, usize)> for N2Array<T> { - type Output = T; - - #[inline] - fn index(&self, (x, y): (usize, usize)) -> &T { - &self.buf[(x * self.y_size) + y] - } -} - -impl<T> IndexMut<(usize, usize)> for N2Array<T> { - #[inline] - fn index_mut(&mut self, (x, y): (usize, usize)) -> &mut T { - &mut self.buf[(x * self.y_size) + y] - } -} - -/// Returns the number of **bytes** we want to highlight in the `source` word. -/// Basically we want to highlight as much characters as possible in the source until it has too much -/// typos (= 2) -/// The algorithm is a modified -/// [Damerau-Levenshtein](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) -fn bytes_to_highlight(source: &str, target: &str) -> usize { - let n = source.chars().count(); - let m = target.chars().count(); - - if n == 0 { - return 0; - } - // since we allow two typos we can send two characters even if it's completely wrong - if m < 3 { - return source.chars().take(m).map(|c| c.len_utf8()).sum(); - } - if n == m && source == target { - return source.len(); - } - - let inf = n + m; - let mut matrix = N2Array::new(n + 2, m + 2, 0); - - matrix[(0, 0)] = inf; - for i in 0..=n { - matrix[(i + 1, 0)] = inf; - matrix[(i + 1, 1)] = i; - } - for j in 0..=m { - matrix[(0, j + 1)] = inf; - matrix[(1, j + 1)] = j; - } - - let mut last_row = BTreeMap::new(); - - for (row, char_s) in source.chars().enumerate() { - let mut last_match_col = 0; - let row = row + 1; - - for (col, char_t) in target.chars().enumerate() { - let col = col + 1; - let last_match_row = *last_row.get(&char_t).unwrap_or(&0); - let cost = if char_s == char_t { 0 } else { 1 }; - - let dist_add = matrix[(row, col + 1)] + 1; - let dist_del = matrix[(row + 1, col)] + 1; - let dist_sub = matrix[(row, col)] + cost; - let dist_trans = matrix[(last_match_row, last_match_col)] - + (row - last_match_row - 1) - + 1 - + (col - last_match_col - 1); - let dist = min(min(dist_add, dist_del), min(dist_sub, dist_trans)); - matrix[(row + 1, col + 1)] = dist; - - if cost == 0 { - last_match_col = col; - } - } - - last_row.insert(char_s, row); - } - - let mut minimum = (u32::max_value(), 0); - for x in 0..=m { - let dist = matrix[(n + 1, x + 1)] as u32; - if dist < minimum.0 { - minimum = (dist, x); - } - } - - // everything was done characters wise and now we want to returns a number of bytes - source.chars().take(minimum.1).map(|c| c.len_utf8()).sum() -} - -#[cfg(test)] -mod tests { - use std::borrow::Cow; - use std::str::from_utf8; - - use meilisearch_tokenizer::TokenKind; - - use super::*; - use crate::search::query_tree::{Operation, Query, QueryKind}; - use crate::MatchingWords; - 
#[test] - fn test_bytes_to_highlight() { - struct TestBytesToHighlight { - query: &'static str, - text: &'static str, - length: usize, - } - let tests = [ - TestBytesToHighlight { query: "bip", text: "bip", length: "bip".len() }, - TestBytesToHighlight { query: "bip", text: "boup", length: "bip".len() }, - TestBytesToHighlight { - query: "Levenshtein", - text: "Levenshtein", - length: "Levenshtein".len(), - }, - // we get to the end of our word with only one typo - TestBytesToHighlight { - query: "Levenste", - text: "Levenshtein", - length: "Levenste".len(), - }, - // we get our third and last authorized typo right on the last character - TestBytesToHighlight { - query: "Levenstein", - text: "Levenshte", - length: "Levenste".len(), - }, - // we get to the end of our word with only two typos at the beginning - TestBytesToHighlight { - query: "Bavenshtein", - text: "Levenshtein", - length: "Bavenshtein".len(), - }, - TestBytesToHighlight { - query: "Альфа", text: "Альфой", length: "Альф".len() - }, - TestBytesToHighlight { - query: "Go💼", text: "Go💼od luck.", length: "Go💼".len() - }, - TestBytesToHighlight { - query: "Go💼od", text: "Go💼od luck.", length: "Go💼od".len() - }, - TestBytesToHighlight { - query: "chäräcters", - text: "chäräcters", - length: "chäräcters".len(), - }, - TestBytesToHighlight { query: "ch", text: "chäräcters", length: "ch".len() }, - TestBytesToHighlight { query: "chär", text: "chäräcters", length: "chär".len() }, - ]; - - for test in &tests { - let length = bytes_to_highlight(test.text, test.query); - assert_eq!(length, test.length, r#"lenght between: "{}" "{}""#, test.query, test.text); - assert!( - from_utf8(&test.query.as_bytes()[..length]).is_ok(), - r#"converting {}[..{}] to an utf8 str failed"#, - test.query, - length - ); - } - } - - #[test] - fn matching_words() { - let query_tree = Operation::Or( - false, - vec![Operation::And(vec![ - Operation::Query(Query { - prefix: true, - kind: QueryKind::exact("split".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("this".to_string()), - }), - Operation::Query(Query { - prefix: true, - kind: QueryKind::tolerant(1, "world".to_string()), - }), - ])], - ); - - let matching_words = MatchingWords::from_query_tree(&query_tree); - - assert_eq!( - matching_words.matching_bytes(&Token { - kind: TokenKind::Word, - word: Cow::Borrowed("word"), - byte_start: 0, - char_index: 0, - byte_end: "word".len(), - char_map: None, - }), - Some(3) - ); - assert_eq!( - matching_words.matching_bytes(&Token { - kind: TokenKind::Word, - word: Cow::Borrowed("nyc"), - byte_start: 0, - char_index: 0, - byte_end: "nyc".len(), - char_map: None, - }), - None - ); - assert_eq!( - matching_words.matching_bytes(&Token { - kind: TokenKind::Word, - word: Cow::Borrowed("world"), - byte_start: 0, - char_index: 0, - byte_end: "world".len(), - char_map: None, - }), - Some(5) - ); - assert_eq!( - matching_words.matching_bytes(&Token { - kind: TokenKind::Word, - word: Cow::Borrowed("splitted"), - byte_start: 0, - char_index: 0, - byte_end: "splitted".len(), - char_map: None, - }), - Some(5) - ); - assert_eq!( - matching_words.matching_bytes(&Token { - kind: TokenKind::Word, - word: Cow::Borrowed("thisnew"), - byte_start: 0, - char_index: 0, - byte_end: "thisnew".len(), - char_map: None, - }), - None - ); - assert_eq!( - matching_words.matching_bytes(&Token { - kind: TokenKind::Word, - word: Cow::Borrowed("borld"), - byte_start: 0, - char_index: 0, - byte_end: "borld".len(), - char_map: None, - }), - Some(5) - ); - 
assert_eq!( - matching_words.matching_bytes(&Token { - kind: TokenKind::Word, - word: Cow::Borrowed("wordsplit"), - byte_start: 0, - char_index: 0, - byte_end: "wordsplit".len(), - char_map: None, - }), - Some(4) - ); - } -} diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index dcb2e0803..979ee1e6e 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -17,7 +17,9 @@ use roaring::bitmap::RoaringBitmap; pub use self::facet::{FacetDistribution, FacetNumberIter, Filter}; use self::fst_utils::{Complement, Intersection, StartsWith, Union}; -pub use self::matching_words::MatchingWords; +pub use self::matches::{ + FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWord, MatchingWords, +}; use self::query_tree::QueryTreeBuilder; use crate::error::UserError; use crate::search::criteria::r#final::{Final, FinalResult}; @@ -32,7 +34,7 @@ mod criteria; mod distinct; mod facet; mod fst_utils; -mod matching_words; +mod matches; mod query_tree; pub struct Search<'a> { @@ -114,7 +116,7 @@ impl<'a> Search<'a> { pub fn execute(&self) -> Result<SearchResult> { // We create the query tree by spliting the query into tokens. let before = Instant::now(); - let (query_tree, primitive_query) = match self.query.as_ref() { + let (query_tree, primitive_query, matching_words) = match self.query.as_ref() { Some(query) => { let mut builder = QueryTreeBuilder::new(self.rtxn, self.index); builder.optional_words(self.optional_words); @@ -132,9 +134,11 @@ impl<'a> Search<'a> { let analyzer = Analyzer::new(config); let result = analyzer.analyze(query); let tokens = result.tokens(); - builder.build(tokens)?.map_or((None, None), |(qt, pq)| (Some(qt), Some(pq))) + builder + .build(tokens)? + .map_or((None, None, None), |(qt, pq, mw)| (Some(qt), Some(pq), Some(mw))) } - None => (None, None), + None => (None, None, None), }; debug!("query tree: {:?} took {:.02?}", query_tree, before.elapsed()); @@ -148,11 +152,6 @@ impl<'a> Search<'a> { debug!("facet candidates: {:?} took {:.02?}", filtered_candidates, before.elapsed()); - let matching_words = match query_tree.as_ref() { - Some(query_tree) => MatchingWords::from_query_tree(&query_tree), - None => MatchingWords::default(), - }; - // We check that we are allowed to use the sort criteria, we check // that they are declared in the sortable fields. if let Some(sort_criteria) = &self.sort_criteria { @@ -193,13 +192,13 @@ impl<'a> Search<'a> { )?; match self.index.distinct_field(self.rtxn)?
diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs
index dcb2e0803..979ee1e6e 100644
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@@ -17,7 +17,9 @@ use roaring::bitmap::RoaringBitmap;
 
 pub use self::facet::{FacetDistribution, FacetNumberIter, Filter};
 use self::fst_utils::{Complement, Intersection, StartsWith, Union};
-pub use self::matching_words::MatchingWords;
+pub use self::matches::{
+    FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWord, MatchingWords,
+};
 use self::query_tree::QueryTreeBuilder;
 use crate::error::UserError;
 use crate::search::criteria::r#final::{Final, FinalResult};
@@ -32,7 +34,7 @@ mod criteria;
 mod distinct;
 mod facet;
 mod fst_utils;
-mod matching_words;
+mod matches;
 mod query_tree;
 
 pub struct Search<'a> {
@@ -114,7 +116,7 @@ impl<'a> Search<'a> {
     pub fn execute(&self) -> Result<SearchResult> {
         // We create the query tree by spliting the query into tokens.
         let before = Instant::now();
-        let (query_tree, primitive_query) = match self.query.as_ref() {
+        let (query_tree, primitive_query, matching_words) = match self.query.as_ref() {
             Some(query) => {
                 let mut builder = QueryTreeBuilder::new(self.rtxn, self.index);
                 builder.optional_words(self.optional_words);
@@ -132,9 +134,11 @@ impl<'a> Search<'a> {
                 let analyzer = Analyzer::new(config);
                 let result = analyzer.analyze(query);
                 let tokens = result.tokens();
-                builder.build(tokens)?.map_or((None, None), |(qt, pq)| (Some(qt), Some(pq)))
+                builder
+                    .build(tokens)?
+                    .map_or((None, None, None), |(qt, pq, mw)| (Some(qt), Some(pq), Some(mw)))
             }
-            None => (None, None),
+            None => (None, None, None),
         };
 
         debug!("query tree: {:?} took {:.02?}", query_tree, before.elapsed());
@@ -148,11 +152,6 @@ impl<'a> Search<'a> {
 
         debug!("facet candidates: {:?} took {:.02?}", filtered_candidates, before.elapsed());
 
-        let matching_words = match query_tree.as_ref() {
-            Some(query_tree) => MatchingWords::from_query_tree(&query_tree),
-            None => MatchingWords::default(),
-        };
-
         // We check that we are allowed to use the sort criteria, we check
         // that they are declared in the sortable fields.
         if let Some(sort_criteria) = &self.sort_criteria {
@@ -193,13 +192,13 @@ impl<'a> Search<'a> {
         )?;
 
         match self.index.distinct_field(self.rtxn)? {
-            None => self.perform_sort(NoopDistinct, matching_words, criteria),
+            None => self.perform_sort(NoopDistinct, matching_words.unwrap_or_default(), criteria),
             Some(name) => {
                 let field_ids_map = self.index.fields_ids_map(self.rtxn)?;
                 match field_ids_map.id(name) {
                     Some(fid) => {
                         let distinct = FacetDistinct::new(fid, self.index, self.rtxn);
-                        self.perform_sort(distinct, matching_words, criteria)
+                        self.perform_sort(distinct, matching_words.unwrap_or_default(), criteria)
                     }
                     None => Ok(SearchResult::default()),
                 }
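On the consumer side, `execute` now forwards the `MatchingWords` built during query-tree construction (falling back to `MatchingWords::default()` when there is no query) instead of re-deriving them from the query tree afterwards. Downstream code can then hand the whole highlight-and-crop job to a matcher. A minimal sketch of that flow, assuming the `MatcherBuilder` and `FormatOptions` re-exported above together with milli's bundled tokenizer (the function name and the chosen options are illustrative):

```rust
use milli::tokenizer::{Analyzer, AnalyzerConfig};
use milli::{FormatOptions, MatcherBuilder, MatchingWords};

// Highlight the matching words in `text`, cropping to a ~10-word window
// around the best match.
fn format_text(matching_words: MatchingWords, text: &str) -> String {
    let builder = MatcherBuilder::from_matching_words(matching_words);
    let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
    let analyzed = analyzer.analyze(text);
    let tokens: Vec<_> = analyzed.tokens().collect();
    let mut matcher = builder.build(&tokens[..], text);
    matcher.format(FormatOptions { highlight: true, crop: Some(10) }).to_string()
}
```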
diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index ff9d3f4e9..2db4e06d5 100644
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -8,7 +8,8 @@ use meilisearch_tokenizer::TokenKind;
 use roaring::RoaringBitmap;
 use slice_group_by::GroupBy;
 
-use crate::{Index, Result};
+use crate::search::matches::matching_words::{MatchingWord, PrimitiveWordId};
+use crate::{Index, MatchingWords, Result};
 
 type IsOptionalWord = bool;
 type IsPrefix = bool;
@@ -233,7 +234,10 @@ impl<'a> QueryTreeBuilder<'a> {
     /// - if `authorize_typos` is set to `false` the query tree will be generated
     /// forcing all query words to match documents without any typo
     /// (the criterion `typo` will be ignored)
-    pub fn build(&self, query: TokenStream) -> Result<Option<(Operation, PrimitiveQuery)>> {
+    pub fn build(
+        &self,
+        query: TokenStream,
+    ) -> Result<Option<(Operation, PrimitiveQuery, MatchingWords)>> {
         let stop_words = self.index.stop_words(self.rtxn)?;
         let primitive_query = create_primitive_query(query, stop_words, self.words_limit);
         if !primitive_query.is_empty() {
@@ -243,7 +247,9 @@ impl<'a> QueryTreeBuilder<'a> {
                 self.authorize_typos,
                 &primitive_query,
             )?;
-            Ok(Some((qt, primitive_query)))
+            let matching_words =
+                create_matching_words(self, self.authorize_typos, &primitive_query)?;
+            Ok(Some((qt, primitive_query, matching_words)))
         } else {
             Ok(None)
         }
@@ -251,7 +257,10 @@
 }
 
 /// Split the word depending on the frequency of subwords in the database documents.
-fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result<Option<Operation>> {
+fn split_best_frequency<'a>(
+    ctx: &impl Context,
+    word: &'a str,
+) -> heed::Result<Option<(&'a str, &'a str)>> {
     let chars = word.char_indices().skip(1);
     let mut best = None;
 
@@ -267,7 +276,7 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result<Option<Operation>> {
                 let mut children = synonyms(ctx, &[&word])?.unwrap_or_default();
-                if let Some(child) = split_best_frequency(ctx, &word)? {
-                    children.push(child);
+                if let Some((left, right)) = split_best_frequency(ctx, &word)? {
+                    children.push(Operation::Phrase(vec![left.to_string(), right.to_string()]));
                 }
                 let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
                 let exact_words = ctx.exact_words()?;
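`split_best_frequency` now returns the raw `(left, right)` pair instead of eagerly wrapping it in an `Operation::Phrase`: the query-tree branch above rebuilds the phrase operation itself, while `create_matching_words` below reuses the same pair as two consecutive `MatchingWord`s. The selection logic is unchanged from the context lines: try every character boundary and keep the most frequent pair. A self-contained sketch of that logic, with a closure standing in for the index lookup (the real function reads frequencies through `ctx` and can fail, hence its `heed::Result` wrapper):

```rust
/// Toy stand-in for `split_best_frequency`: try every split point and keep
/// the (left, right) pair whose frequency in the documents is highest.
fn split_best_frequency<'a>(
    pair_frequency: impl Fn(&str, &str) -> u64, // stands in for the `ctx` lookup
    word: &'a str,
) -> Option<(&'a str, &'a str)> {
    let mut best = None;
    // Skip index 0 so both halves stay non-empty; `char_indices` keeps the
    // split on UTF-8 character boundaries.
    for (i, _) in word.char_indices().skip(1) {
        let (left, right) = word.split_at(i);
        let frequency = pair_frequency(left, right);
        if frequency != 0 && best.map_or(true, |(old, _, _)| frequency > old) {
            best = Some((frequency, left, right));
        }
    }
    best.map(|(_, left, right)| (left, right))
}
```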
@@ -464,6 +473,154 @@ fn create_query_tree(
     }
 }
 
+/// Main function that creates the matching words used for crop and highlight.
+fn create_matching_words(
+    ctx: &impl Context,
+    authorize_typos: bool,
+    query: &[PrimitiveQueryPart],
+) -> Result<MatchingWords> {
+    /// Matches on the `PrimitiveQueryPart` and creates matching words from it.
+    fn resolve_primitive_part(
+        ctx: &impl Context,
+        authorize_typos: bool,
+        part: PrimitiveQueryPart,
+        matching_words: &mut Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>,
+        id: PrimitiveWordId,
+    ) -> Result<()> {
+        match part {
+            // 1. try to split word in 2
+            // 2. try to fetch synonyms
+            PrimitiveQueryPart::Word(word, prefix) => {
+                if let Some(synonyms) = ctx.synonyms(&[word.as_str()])? {
+                    for synonym in synonyms {
+                        let synonym = synonym
+                            .into_iter()
+                            .map(|syn| MatchingWord::new(syn.to_string(), 0, false))
+                            .collect();
+                        matching_words.push((synonym, vec![id]));
+                    }
+                }
+
+                if let Some((left, right)) = split_best_frequency(ctx, &word)? {
+                    let left = MatchingWord::new(left.to_string(), 0, false);
+                    let right = MatchingWord::new(right.to_string(), 0, false);
+                    matching_words.push((vec![left, right], vec![id]));
+                }
+
+                let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
+                let exact_words = ctx.exact_words()?;
+                let config =
+                    TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words };
+
+                let matching_word = match typos(word, authorize_typos, config) {
+                    QueryKind::Exact { word, .. } => MatchingWord::new(word, 0, prefix),
+                    QueryKind::Tolerant { typo, word } => MatchingWord::new(word, typo, prefix),
+                };
+                matching_words.push((vec![matching_word], vec![id]));
+            }
+            // create CONSECUTIVE matching words wrapping all the words of the phrase
+            PrimitiveQueryPart::Phrase(words) => {
+                let ids: Vec<_> =
+                    (0..words.len()).into_iter().map(|i| id + i as PrimitiveWordId).collect();
+                let words =
+                    words.into_iter().map(|w| MatchingWord::new(w.to_string(), 0, false)).collect();
+                matching_words.push((words, ids));
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Create all ngrams 1..=3 generating query tree branches.
+    fn ngrams(
+        ctx: &impl Context,
+        authorize_typos: bool,
+        query: &[PrimitiveQueryPart],
+        matching_words: &mut Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>,
+        mut id: PrimitiveWordId,
+    ) -> Result<()> {
+        const MAX_NGRAM: usize = 3;
+
+        for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase())) {
+            for ngram in 1..=MAX_NGRAM.min(sub_query.len()) {
+                if let Some(group) = sub_query.get(..ngram) {
+                    let tail = &sub_query[ngram..];
+                    let is_last = tail.is_empty();
+
+                    match group {
+                        [part] => {
+                            resolve_primitive_part(
+                                ctx,
+                                authorize_typos,
+                                part.clone(),
+                                matching_words,
+                                id,
+                            )?;
+                        }
+                        words => {
+                            let is_prefix = words.last().map_or(false, |part| part.is_prefix());
+                            let words: Vec<_> = words
+                                .iter()
+                                .filter_map(|part| {
+                                    if let PrimitiveQueryPart::Word(word, _) = part {
+                                        Some(word.as_str())
+                                    } else {
+                                        None
+                                    }
+                                })
+                                .collect();
+                            let ids: Vec<_> = (0..words.len())
+                                .into_iter()
+                                .map(|i| id + i as PrimitiveWordId)
+                                .collect();
+
+                            if let Some(synonyms) = ctx.synonyms(&words)? {
+                                for synonym in synonyms {
+                                    let synonym = synonym
+                                        .into_iter()
+                                        .map(|syn| MatchingWord::new(syn.to_string(), 0, false))
+                                        .collect();
+                                    matching_words.push((synonym, ids.clone()));
+                                }
+                            }
+                            let word = words.concat();
+                            let (word_len_one_typo, word_len_two_typo) =
+                                ctx.min_word_len_for_typo()?;
+                            let exact_words = ctx.exact_words()?;
+                            let config = TypoConfig {
+                                max_typos: 1,
+                                word_len_one_typo,
+                                word_len_two_typo,
+                                exact_words,
+                            };
+                            let matching_word = match typos(word, authorize_typos, config) {
+                                QueryKind::Exact { word, .. } => {
+                                    MatchingWord::new(word, 0, is_prefix)
+                                }
+                                QueryKind::Tolerant { typo, word } => {
+                                    MatchingWord::new(word, typo, is_prefix)
+                                }
+                            };
+                            matching_words.push((vec![matching_word], ids));
+                        }
+                    }
+
+                    if !is_last {
+                        ngrams(ctx, authorize_typos, tail, matching_words, id + 1)?;
+                    }
+                }
+            }
+            id += sub_query.iter().map(|x| x.len() as PrimitiveWordId).sum::<PrimitiveWordId>();
+        }
+
+        Ok(())
+    }
+
+    let mut matching_words = Vec::new();
+    ngrams(ctx, authorize_typos, query, &mut matching_words, 0)?;
+    Ok(MatchingWords::new(matching_words))
+}
+
 pub type PrimitiveQuery = Vec<PrimitiveQueryPart>;
 
 #[derive(Debug, Clone)]
@@ -480,6 +637,13 @@ impl PrimitiveQueryPart {
     fn is_prefix(&self) -> bool {
         matches!(self, Self::Word(_, is_prefix) if *is_prefix)
     }
+
+    fn len(&self) -> usize {
+        match self {
+            Self::Phrase(words) => words.len(),
+            Self::Word(_, _) => 1,
+        }
+    }
 }
 
 /// Create primitive query from tokenized query string,