From 13b7c826c14140283494a8c85407cf8cc197f184 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 6 Apr 2023 12:15:37 +0200 Subject: [PATCH 1/9] add new highlighter --- .../src/search/new/matches/matching_words.rs | 334 +++++++ milli/src/search/new/matches/mod.rs | 848 ++++++++++++++++++ milli/src/search/new/mod.rs | 1 + 3 files changed, 1183 insertions(+) create mode 100644 milli/src/search/new/matches/matching_words.rs create mode 100644 milli/src/search/new/matches/mod.rs diff --git a/milli/src/search/new/matches/matching_words.rs b/milli/src/search/new/matches/matching_words.rs new file mode 100644 index 000000000..a47a08c68 --- /dev/null +++ b/milli/src/search/new/matches/matching_words.rs @@ -0,0 +1,334 @@ +use std::cmp::Reverse; +use std::ops::RangeInclusive; + +use charabia::Token; + +use super::super::interner::Interned; +use super::super::query_term::{ + Lazy, LocatedQueryTerm, OneTypoTerm, QueryTerm, TwoTypoTerm, ZeroTypoTerm, +}; +use super::super::{DedupInterner, Phrase}; +use crate::SearchContext; + +pub struct LocatedMatchingPhrase { + pub value: Interned, + pub positions: RangeInclusive, +} + +pub struct LocatedMatchingWords { + pub value: Vec>, + pub positions: RangeInclusive, + pub is_prefix: bool, +} + +/// Structure created from a query tree +/// referencing words that match the given query tree. +pub struct MatchingWords<'ctx> { + word_interner: &'ctx DedupInterner, + phrase_interner: &'ctx DedupInterner, + phrases: Vec, + words: Vec, +} + +/// Extract and centralize the different phrases and words to match stored in a QueryTerm. +fn extract_matching_terms(term: &QueryTerm) -> (Vec>, Vec>) { + let mut matching_words = Vec::new(); + let mut matching_phrases = Vec::new(); + + // the structure is exhaustively extracted to ensure that no field is missing. 
+ let QueryTerm { + original: _, + is_multiple_words: _, + max_nbr_typos: _, + is_prefix: _, + zero_typo, + one_typo, + two_typo, + } = term; + + // the structure is exhaustively extracted to ensure that no field is missing. + let ZeroTypoTerm { phrase, zero_typo, prefix_of: _, synonyms, use_prefix_db: _ } = zero_typo; + + // zero typo + if let Some(phrase) = phrase { + matching_phrases.push(*phrase); + } + if let Some(zero_typo) = zero_typo { + matching_words.push(*zero_typo); + } + for synonym in synonyms { + matching_phrases.push(*synonym); + } + + // one typo + // the structure is exhaustively extracted to ensure that no field is missing. + if let Lazy::Init(OneTypoTerm { split_words, one_typo }) = one_typo { + if let Some(split_words) = split_words { + matching_phrases.push(*split_words); + } + for one_typo in one_typo { + matching_words.push(*one_typo); + } + } + + // two typos + // the structure is exhaustively extracted to ensure that no field is missing. + if let Lazy::Init(TwoTypoTerm { two_typos }) = two_typo { + for two_typos in two_typos { + matching_words.push(*two_typos); + } + } + + (matching_phrases, matching_words) +} + +impl<'ctx> MatchingWords<'ctx> { + pub fn new(ctx: &'ctx SearchContext, located_terms: Vec) -> Self { + let mut phrases = Vec::new(); + let mut words = Vec::new(); + + // Extract and centralize the different phrases and words to match stored in a QueryTerm using extract_matching_terms + // and wrap them in dedicated structures. 
+ for located_term in located_terms { + let term = ctx.term_interner.get(located_term.value); + let (matching_phrases, matching_words) = extract_matching_terms(term); + + for matching_phrase in matching_phrases { + phrases.push(LocatedMatchingPhrase { + value: matching_phrase, + positions: located_term.positions.clone(), + }); + } + words.push(LocatedMatchingWords { + value: matching_words, + positions: located_term.positions.clone(), + is_prefix: term.is_prefix, + }); + } + + // Sort word to put prefixes at the bottom prioritizing the exact matches. + words.sort_unstable_by_key(|lmw| (lmw.is_prefix, Reverse(lmw.positions.len()))); + + Self { + phrases, + words, + word_interner: &ctx.word_interner, + phrase_interner: &ctx.phrase_interner, + } + } + + /// Returns an iterator over terms that match or partially match the given token. + pub fn match_token<'b>(&'ctx self, token: &'b Token<'b>) -> MatchesIter<'ctx, 'b> { + MatchesIter { matching_words: self, phrases: Box::new(self.phrases.iter()), token } + } + + /// Try to match the token with one of the located_words. + fn match_unique_words(&'ctx self, token: &Token) -> Option> { + for located_words in &self.words { + for word in &located_words.value { + let word = self.word_interner.get(*word); + // if the word is a prefix we match using starts_with. + if located_words.is_prefix && token.lemma().starts_with(word) { + let char_len = token.original_lengths(word.len()).0; + let ids = &located_words.positions; + return Some(MatchType::Full { char_len, ids }); + // else we exact match the token. + } else if token.lemma() == word { + let char_len = token.char_end - token.char_start; + let ids = &located_words.positions; + return Some(MatchType::Full { char_len, ids }); + } + } + } + + None + } +} + +/// Iterator over terms that match the given token, +/// This allow to lazily evaluate matches. 
+pub struct MatchesIter<'a, 'b> { + matching_words: &'a MatchingWords<'a>, + phrases: Box + 'a>, + token: &'b Token<'b>, +} + +impl<'a> Iterator for MatchesIter<'a, '_> { + type Item = MatchType<'a>; + + fn next(&mut self) -> Option { + match self.phrases.next() { + // Try to match all the phrases first. + Some(located_phrase) => { + let phrase = self.matching_words.phrase_interner.get(located_phrase.value); + + // create a PartialMatch struct to make it compute the first match + // instead of duplicating the code. + let ids = &located_phrase.positions; + // collect the references of words from the interner. + let words = phrase + .words + .iter() + .map(|word| { + word.map(|word| self.matching_words.word_interner.get(word).as_str()) + }) + .collect(); + let partial = PartialMatch { matching_words: words, ids, char_len: 0 }; + + partial.match_token(self.token).or_else(|| self.next()) + } + // If no phrase matches, try to match unique words. + None => self.matching_words.match_unique_words(self.token), + } + } +} + +/// Id of a matching term corresponding to a word written by the end user. +pub type WordId = u16; + +/// A given token can partially match a query word for several reasons: +/// - split words +/// - multi-word synonyms +/// In these cases we need to match consecutively several tokens to consider that the match is full. +#[derive(Debug, PartialEq)] +pub enum MatchType<'a> { + Full { char_len: usize, ids: &'a RangeInclusive }, + Partial(PartialMatch<'a>), +} + +/// Structure helper to match several tokens in a row in order to complete a partial match. 
+#[derive(Debug, PartialEq)] +pub struct PartialMatch<'a> { + matching_words: Vec>, + ids: &'a RangeInclusive, + char_len: usize, +} + +impl<'a> PartialMatch<'a> { + /// Returns: + /// - None if the given token breaks the partial match + /// - Partial if the given token matches the partial match but doesn't complete it + /// - Full if the given token completes the partial match + pub fn match_token(self, token: &Token) -> Option> { + let Self { mut matching_words, ids, .. } = self; + + let is_matching = match matching_words.first()? { + Some(word) => &token.lemma() == word, + // a None value in the phrase corresponds to a stop word, + // the value is considered a match if the current token is categorized as a stop word. + None => token.is_stopword(), + }; + + let char_len = token.char_end - token.char_start; + // if there are remaining words to match in the phrase and the current token is matching, + // return a new Partial match allowing the highlighter to continue. + if is_matching && matching_words.len() > 1 { + matching_words.remove(0); + Some(MatchType::Partial(PartialMatch { matching_words, ids, char_len })) + // if there is no remaining word to match in the phrase and the current token is matching, + // return a Full match. + } else if is_matching { + Some(MatchType::Full { char_len, ids }) + // if the current token doesn't match, return None to break the match sequence. 
+ } else { + None + } + } + + pub fn char_len(&self) -> usize { + self.char_len + } +} + +#[cfg(test)] +pub(crate) mod tests { + use std::borrow::Cow; + + use charabia::{TokenKind, TokenizerBuilder}; + + use super::super::super::located_query_terms_from_string; + use super::*; + use crate::index::tests::TempIndex; + + pub(crate) fn temp_index_with_documents() -> TempIndex { + let temp_index = TempIndex::new(); + temp_index + .add_documents(documents!([ + { "id": 1, "name": "split this world westfali westfalia the" }, + ])) + .unwrap(); + temp_index + } + + #[test] + fn matching_words() { + let temp_index = temp_index_with_documents(); + let rtxn = temp_index.read_txn().unwrap(); + let mut ctx = SearchContext::new(&temp_index, &rtxn); + let tokenizer = TokenizerBuilder::new().build(); + let tokens = tokenizer.tokenize("split this world"); + let query_terms = located_query_terms_from_string(&mut ctx, tokens, None).unwrap(); + let matching_words = MatchingWords::new(&ctx, query_terms); + + assert_eq!( + matching_words + .match_token(&Token { + kind: TokenKind::Word, + lemma: Cow::Borrowed("split"), + char_end: "split".chars().count(), + byte_end: "split".len(), + ..Default::default() + }) + .next(), + Some(MatchType::Full { char_len: 5, ids: &(0..=0) }) + ); + assert_eq!( + matching_words + .match_token(&Token { + kind: TokenKind::Word, + lemma: Cow::Borrowed("nyc"), + char_end: "nyc".chars().count(), + byte_end: "nyc".len(), + ..Default::default() + }) + .next(), + None + ); + assert_eq!( + matching_words + .match_token(&Token { + kind: TokenKind::Word, + lemma: Cow::Borrowed("world"), + char_end: "world".chars().count(), + byte_end: "world".len(), + ..Default::default() + }) + .next(), + Some(MatchType::Full { char_len: 5, ids: &(2..=2) }) + ); + assert_eq!( + matching_words + .match_token(&Token { + kind: TokenKind::Word, + lemma: Cow::Borrowed("worlded"), + char_end: "worlded".chars().count(), + byte_end: "worlded".len(), + ..Default::default() + }) + .next(), + 
Some(MatchType::Full { char_len: 5, ids: &(2..=2) }) + ); + assert_eq!( + matching_words + .match_token(&Token { + kind: TokenKind::Word, + lemma: Cow::Borrowed("thisnew"), + char_end: "thisnew".chars().count(), + byte_end: "thisnew".len(), + ..Default::default() + }) + .next(), + None + ); + } +} diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs new file mode 100644 index 000000000..33d0591a6 --- /dev/null +++ b/milli/src/search/new/matches/mod.rs @@ -0,0 +1,848 @@ +use std::borrow::Cow; + +use charabia::{SeparatorKind, Token, Tokenizer}; +use matching_words::{MatchType, MatchingWords, PartialMatch, WordId}; +use serde::Serialize; + +use super::query_term::LocatedQueryTerm; +use crate::SearchContext; + +pub mod matching_words; + +const DEFAULT_CROP_MARKER: &str = "…"; +const DEFAULT_HIGHLIGHT_PREFIX: &str = ""; +const DEFAULT_HIGHLIGHT_SUFFIX: &str = ""; + +/// Structure used to build a Matcher allowing to customize formating tags. +pub struct MatcherBuilder<'a, 'ctx, A> { + matching_words: MatchingWords<'ctx>, + tokenizer: Tokenizer<'a, 'a, A>, + crop_marker: Option, + highlight_prefix: Option, + highlight_suffix: Option, +} + +impl<'a, 'ctx, A> MatcherBuilder<'a, 'ctx, A> { + pub fn new( + ctx: &'ctx SearchContext, + located_terms: Vec, + tokenizer: Tokenizer<'a, 'a, A>, + ) -> Self { + let matching_words = MatchingWords::new(ctx, located_terms); + Self { + matching_words, + tokenizer, + crop_marker: None, + highlight_prefix: None, + highlight_suffix: None, + } + } + + pub fn crop_marker(&mut self, marker: String) -> &Self { + self.crop_marker = Some(marker); + self + } + + pub fn highlight_prefix(&mut self, prefix: String) -> &Self { + self.highlight_prefix = Some(prefix); + self + } + + pub fn highlight_suffix(&mut self, suffix: String) -> &Self { + self.highlight_suffix = Some(suffix); + self + } + + pub fn build<'t, 'm>(&'m self, text: &'t str) -> Matcher<'t, 'm, A> { + let crop_marker = match &self.crop_marker { + 
Some(marker) => marker.as_str(), + None => DEFAULT_CROP_MARKER, + }; + + let highlight_prefix = match &self.highlight_prefix { + Some(marker) => marker.as_str(), + None => DEFAULT_HIGHLIGHT_PREFIX, + }; + let highlight_suffix = match &self.highlight_suffix { + Some(marker) => marker.as_str(), + None => DEFAULT_HIGHLIGHT_SUFFIX, + }; + Matcher { + text, + matching_words: &self.matching_words, + tokenizer: &self.tokenizer, + crop_marker, + highlight_prefix, + highlight_suffix, + matches: None, + } + } +} + +#[derive(Copy, Clone, Default)] +pub struct FormatOptions { + pub highlight: bool, + pub crop: Option, +} + +impl FormatOptions { + pub fn merge(self, other: Self) -> Self { + Self { highlight: self.highlight || other.highlight, crop: self.crop.or(other.crop) } + } +} + +#[derive(Clone, Debug)] +pub struct Match { + match_len: usize, + // ids of the query words that matches. + ids: Vec, + // position of the word in the whole text. + word_position: usize, + // position of the token in the whole text. + token_position: usize, +} + +#[derive(Serialize, Debug, Clone, PartialEq, Eq)] +pub struct MatchBounds { + pub start: usize, + pub length: usize, +} + +/// Structure used to analize a string, compute words that match, +/// and format the source string, returning a highlighted and cropped sub-string. +pub struct Matcher<'t, 'm, A> { + text: &'t str, + matching_words: &'m MatchingWords<'m>, + tokenizer: &'m Tokenizer<'m, 'm, A>, + crop_marker: &'m str, + highlight_prefix: &'m str, + highlight_suffix: &'m str, + matches: Option<(Vec>, Vec)>, +} + +impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> { + /// Iterates over tokens and save any of them that matches the query. + fn compute_matches(&mut self) -> &mut Self { + /// some words are counted as matches only if they are close together and in the good order, + /// compute_partial_match peek into next words to validate if the match is complete. 
+ fn compute_partial_match<'a>( + mut partial: PartialMatch, + token_position: usize, + word_position: usize, + words_positions: &mut impl Iterator)>, + matches: &mut Vec, + ) -> bool { + let mut potential_matches = vec![(token_position, word_position, partial.char_len())]; + + for (token_position, word_position, word) in words_positions { + partial = match partial.match_token(word) { + // token matches the partial match, but the match is not full, + // we temporarily save the current token then we try to match the next one. + Some(MatchType::Partial(partial)) => { + potential_matches.push((token_position, word_position, partial.char_len())); + partial + } + // partial match is now full, we keep these matches and we advance positions + Some(MatchType::Full { char_len, ids }) => { + let ids: Vec<_> = ids.clone().into_iter().collect(); + // save previously matched tokens as matches. + let iter = potential_matches.into_iter().map( + |(token_position, word_position, match_len)| Match { + match_len, + ids: ids.clone(), + word_position, + token_position, + }, + ); + matches.extend(iter); + + // save the token that closes the partial match as a match. + matches.push(Match { + match_len: char_len, + ids, + word_position, + token_position, + }); + + // the match is complete, we return true. + return true; + } + // no match, continue to next match. + None => break, + }; + } + + // the match is not complete, we return false. 
+ false + } + + let tokens: Vec<_> = self.tokenizer.tokenize(self.text).collect(); + let mut matches = Vec::new(); + + let mut words_positions = tokens + .iter() + .scan((0, 0), |(token_position, word_position), token| { + let current_token_position = *token_position; + let current_word_position = *word_position; + *token_position += 1; + if !token.is_separator() { + *word_position += 1; + } + + Some((current_token_position, current_word_position, token)) + }) + .filter(|(_, _, token)| !token.is_separator()); + + while let Some((token_position, word_position, word)) = words_positions.next() { + for match_type in self.matching_words.match_token(word) { + match match_type { + // we match, we save the current token as a match, + // then we continue the rest of the tokens. + MatchType::Full { char_len, ids } => { + let ids: Vec<_> = ids.clone().into_iter().collect(); + matches.push(Match { + match_len: char_len, + ids, + word_position, + token_position, + }); + break; + } + // we match partially, iterate over next tokens to check if we can complete the match. + MatchType::Partial(partial) => { + // if match is completed, we break the matching loop over the current token, + // then we continue the rest of the tokens. + let mut wp = words_positions.clone(); + if compute_partial_match( + partial, + token_position, + word_position, + &mut wp, + &mut matches, + ) { + words_positions = wp; + break; + } + } + } + } + } + + self.matches = Some((tokens, matches)); + self + } + + /// Returns boundaries of the words that match the query. + pub fn matches(&mut self) -> Vec { + match &self.matches { + None => self.compute_matches().matches(), + Some((tokens, matches)) => matches + .iter() + .map(|m| MatchBounds { + start: tokens[m.token_position].byte_start, + length: m.match_len, + }) + .collect(), + } + } + + /// Returns the bounds in byte index of the crop window. 
+ fn crop_bounds(&self, tokens: &[Token], matches: &[Match], crop_size: usize) -> (usize, usize) { + // if there is no match, we start from the beginning of the string by default. + let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0); + let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0); + let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0); + let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0); + + // matches needs to be counted in the crop len. + let mut remaining_words = crop_size + first_match_word_position - last_match_word_position; + + // create the initial state of the crop window: 2 iterators starting from the matches positions, + // a reverse iterator starting from the first match token position and going towards the beginning of the text, + let mut before_tokens = tokens[..first_match_token_position].iter().rev().peekable(); + // an iterator starting from the last match token position and going towards the end of the text. + let mut after_tokens = tokens[last_match_token_position..].iter().peekable(); + + // grows the crop window peeking in both directions + // until the window contains the good number of words: + while remaining_words > 0 { + let before_token = before_tokens.peek().map(|t| t.separator_kind()); + let after_token = after_tokens.peek().map(|t| t.separator_kind()); + + match (before_token, after_token) { + // we can expand both sides. + (Some(before_token), Some(after_token)) => { + match (before_token, after_token) { + // if they are both separators and are the same kind then advance both, + // or expand in the soft separator separator side. + (Some(before_token_kind), Some(after_token_kind)) => { + if before_token_kind == after_token_kind { + before_tokens.next(); + + // this avoid having an ending separator before crop marker. 
+ if remaining_words > 1 { + after_tokens.next(); + } + } else if before_token_kind == SeparatorKind::Hard { + after_tokens.next(); + } else { + before_tokens.next(); + } + } + // if one of the tokens is a word, we expand on the side of the word. + // left is a word, advance left. + (None, Some(_)) => { + before_tokens.next(); + remaining_words -= 1; + } + // right is a word, advance right. + (Some(_), None) => { + after_tokens.next(); + remaining_words -= 1; + } + // both are words, advance left then right if remaining_words > 0. + (None, None) => { + before_tokens.next(); + remaining_words -= 1; + + if remaining_words > 0 { + after_tokens.next(); + remaining_words -= 1; + } + } + } + } + // the end of the text is reached, advance left. + (Some(before_token), None) => { + before_tokens.next(); + if before_token.is_none() { + remaining_words -= 1; + } + } + // the start of the text is reached, advance right. + (None, Some(after_token)) => { + after_tokens.next(); + if after_token.is_none() { + remaining_words -= 1; + } + } + // no more token to add. + (None, None) => break, + } + } + + // finally, keep the byte index of each bound of the crop window. 
+ let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end); + let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start); + + (crop_byte_start, crop_byte_end) + } + + /// Compute the score of a match interval: + /// 1) count unique matches + /// 2) calculate distance between matches + /// 3) count ordered matches + fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) { + let mut ids: Vec = Vec::with_capacity(matches.len()); + let mut order_score = 0; + let mut distance_score = 0; + + let mut iter = matches.iter().peekable(); + while let Some(m) = iter.next() { + if let Some(next_match) = iter.peek() { + // if matches are ordered + if next_match.ids.iter().min() > m.ids.iter().min() { + order_score += 1; + } + + // compute distance between matches + distance_score -= (next_match.word_position - m.word_position).min(7) as i16; + } + + ids.extend(m.ids.iter()); + } + + ids.sort_unstable(); + ids.dedup(); + let uniq_score = ids.len() as i16; + + // rank by unique match count, then by distance between matches, then by ordered match count. + (uniq_score, distance_score, order_score) + } + + /// Returns the matches interval where the score computed by match_interval_score is the best. + fn find_best_match_interval<'a>(&self, matches: &'a [Match], crop_size: usize) -> &'a [Match] { + // we compute the matches interval if we have at least 2 matches. + if matches.len() > 1 { + // positions of the first and the last match of the best matches interval in `matches`. + let mut best_interval = (0, 0); + let mut best_interval_score = self.match_interval_score(&matches[0..=0]); + // current interval positions. + let mut interval_first = 0; + let mut interval_last = 0; + for (index, next_match) in matches.iter().enumerate().skip(1) { + // if next match would make the interval grow beyond crop_size, + // we compare the current interval with the best one, + // then we increase `interval_first` until next match can be added. 
+ if next_match.word_position - matches[interval_first].word_position >= crop_size { + let interval_score = + self.match_interval_score(&matches[interval_first..=interval_last]); + + // keep interval if it's the best + if interval_score > best_interval_score { + best_interval = (interval_first, interval_last); + best_interval_score = interval_score; + } + + // advance start of the interval while interval is longer than crop_size. + while next_match.word_position - matches[interval_first].word_position + >= crop_size + { + interval_first += 1; + } + } + interval_last = index; + } + + // compute the last interval score and compare it to the best one. + let interval_score = + self.match_interval_score(&matches[interval_first..=interval_last]); + if interval_score > best_interval_score { + best_interval = (interval_first, interval_last); + } + + &matches[best_interval.0..=best_interval.1] + } else { + matches + } + } + + // Returns the formatted version of the original text. + pub fn format(&mut self, format_options: FormatOptions) -> Cow<'t, str> { + if !format_options.highlight && format_options.crop.is_none() { + // compute matches is not needed if no highlight nor crop is requested. + Cow::Borrowed(self.text) + } else { + match &self.matches { + Some((tokens, matches)) => { + // If the text has to be cropped, + // compute the best interval to crop around. + let matches = match format_options.crop { + Some(crop_size) if crop_size > 0 => { + self.find_best_match_interval(matches, crop_size) + } + _ => matches, + }; + + // If the text has to be cropped, + // crop around the best interval. + let (byte_start, byte_end) = match format_options.crop { + Some(crop_size) if crop_size > 0 => { + self.crop_bounds(tokens, matches, crop_size) + } + _ => (0, self.text.len()), + }; + + let mut formatted = Vec::new(); + + // push crop marker if it's not the start of the text. 
+ if byte_start > 0 && !self.crop_marker.is_empty() { + formatted.push(self.crop_marker); + } + + let mut byte_index = byte_start; + + if format_options.highlight { + // insert highlight markers around matches. + for m in matches { + let token = &tokens[m.token_position]; + + if byte_index < token.byte_start { + formatted.push(&self.text[byte_index..token.byte_start]); + } + + let highlight_byte_index = self.text[token.byte_start..] + .char_indices() + .enumerate() + .find(|(i, _)| *i == m.match_len) + .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start); + formatted.push(self.highlight_prefix); + formatted.push(&self.text[token.byte_start..highlight_byte_index]); + formatted.push(self.highlight_suffix); + // if it's a prefix highlight, we put the end of the word after the highlight marker. + if highlight_byte_index < token.byte_end { + formatted.push(&self.text[highlight_byte_index..token.byte_end]); + } + + byte_index = token.byte_end; + } + } + + // push the rest of the text between last match and the end of crop. + if byte_index < byte_end { + formatted.push(&self.text[byte_index..byte_end]); + } + + // push crop marker if it's not the end of the text. + if byte_end < self.text.len() && !self.crop_marker.is_empty() { + formatted.push(self.crop_marker); + } + + if formatted.len() == 1 { + // avoid concatenating if there is already 1 slice. 
+ Cow::Borrowed(&self.text[byte_start..byte_end]) + } else { + Cow::Owned(formatted.concat()) + } + } + None => self.compute_matches().format(format_options), + } + } + } +} + +#[cfg(test)] +mod tests { + use charabia::TokenizerBuilder; + use matching_words::tests::temp_index_with_documents; + + use super::super::located_query_terms_from_string; + use super::*; + + impl<'a, 'ctx> MatcherBuilder<'a, 'ctx, &[u8]> { + pub fn new_test(ctx: &'ctx mut SearchContext, query: &'a str) -> Self { + let tokenizer = TokenizerBuilder::new().build(); + let tokens = tokenizer.tokenize(query); + let query_terms = located_query_terms_from_string(ctx, tokens, None).unwrap(); + Self::new(ctx, query_terms, TokenizerBuilder::new().build()) + } + } + + #[test] + fn format_identity() { + let temp_index = temp_index_with_documents(); + let rtxn = temp_index.read_txn().unwrap(); + let mut ctx = SearchContext::new(&temp_index, &rtxn); + let builder = MatcherBuilder::new_test(&mut ctx, "split the world"); + + let format_options = FormatOptions { highlight: false, crop: None }; + + // Text without any match. + let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; + let mut matcher = builder.build(text); + // no crop and no highlight should return complete text. + assert_eq!(&matcher.format(format_options), &text); + + // Text containing all matches. + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; + let mut matcher = builder.build(text); + // no crop and no highlight should return complete text. + assert_eq!(&matcher.format(format_options), &text); + + // Text containing some matches. + let text = "Natalie risk her future to build a world with the boy she loves."; + let mut matcher = builder.build(text); + // no crop and no highlight should return complete text. 
+ assert_eq!(&matcher.format(format_options), &text); + } + + #[test] + fn format_highlight() { + let temp_index = temp_index_with_documents(); + let rtxn = temp_index.read_txn().unwrap(); + let mut ctx = SearchContext::new(&temp_index, &rtxn); + let builder = MatcherBuilder::new_test(&mut ctx, "split the world"); + + let format_options = FormatOptions { highlight: true, crop: None }; + + // empty text. + let text = ""; + let mut matcher = builder.build(text); + assert_eq!(&matcher.format(format_options), ""); + + // text containing only separators. + let text = ":-)"; + let mut matcher = builder.build(text); + assert_eq!(&matcher.format(format_options), ":-)"); + + // Text without any match. + let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; + let mut matcher = builder.build(text); + // no crop should return complete text, because there is no matches. + assert_eq!(&matcher.format(format_options), &text); + + // Text containing all matches. + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; + let mut matcher = builder.build(text); + // no crop should return complete text with highlighted matches. + insta::assert_snapshot!( + matcher.format(format_options), + @"Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World." + ); + + // Text containing some matches. + let text = "Natalie risk her future to build a world with the boy she loves."; + let mut matcher = builder.build(text); + // no crop should return complete text with highlighted matches. + insta::assert_snapshot!( + matcher.format(format_options), + @"Natalie risk her future to build a world with the boy she loves." 
+ ); + } + + #[test] + fn highlight_unicode() { + let temp_index = temp_index_with_documents(); + let rtxn = temp_index.read_txn().unwrap(); + let mut ctx = SearchContext::new(&temp_index, &rtxn); + let builder = MatcherBuilder::new_test(&mut ctx, "world"); + let format_options = FormatOptions { highlight: true, crop: None }; + + // Text containing prefix match. + let text = "Ŵôřlḑôle"; + let mut matcher = builder.build(text); + // no crop should return complete text with highlighted matches. + insta::assert_snapshot!( + matcher.format(format_options), + @"Ŵôřlḑôle" + ); + + // Text containing unicode match. + let text = "Ŵôřlḑ"; + let mut matcher = builder.build(text); + // no crop should return complete text with highlighted matches. + insta::assert_snapshot!( + matcher.format(format_options), + @"Ŵôřlḑ" + ); + + let builder = MatcherBuilder::new_test(&mut ctx, "westfali"); + let format_options = FormatOptions { highlight: true, crop: None }; + + // Text containing unicode match. + let text = "Westfália"; + let mut matcher = builder.build(text); + // no crop should return complete text with highlighted matches. + insta::assert_snapshot!( + matcher.format(format_options), + @"Westfália" + ); + } + + #[test] + fn format_crop() { + let temp_index = temp_index_with_documents(); + let rtxn = temp_index.read_txn().unwrap(); + let mut ctx = SearchContext::new(&temp_index, &rtxn); + let builder = MatcherBuilder::new_test(&mut ctx, "split the world"); + + let format_options = FormatOptions { highlight: false, crop: Some(10) }; + + // empty text. + let text = ""; + let mut matcher = builder.build(text); + insta::assert_snapshot!( + matcher.format(format_options), + @"" + ); + + // text containing only separators. + let text = ":-)"; + let mut matcher = builder.build(text); + insta::assert_snapshot!( + matcher.format(format_options), + @":-)" + ); + + // Text without any match. + let text = "A quick brown fox can not jump 32 feet, right? 
Brr, it is cold!"; + let mut matcher = builder.build(text); + // no highlight should return 10 first words with a marker at the end. + insta::assert_snapshot!( + matcher.format(format_options), + @"A quick brown fox can not jump 32 feet, right…" + ); + + // Text without any match starting by a separator. + let text = "(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)"; + let mut matcher = builder.build(text); + // no highlight should return 10 first words with a marker at the end. + insta::assert_snapshot!( + matcher.format(format_options), + @"(A quick brown fox can not jump 32 feet, right…" + ); + + // Test phrase propagation + let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it."; + let mut matcher = builder.build(text); + // should crop the phrase instead of croping around the match. + insta::assert_snapshot!( + matcher.format(format_options), + @"… Split The World is a book written by Emily Henry…" + ); + + // Text containing some matches. + let text = "Natalie risk her future to build a world with the boy she loves."; + let mut matcher = builder.build(text); + // no highlight should return 10 last words with a marker at the start. + insta::assert_snapshot!( + matcher.format(format_options), + @"…future to build a world with the boy she loves…" + ); + + // Text containing all matches. + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; + let mut matcher = builder.build(text); + // no highlight should return 10 last words with a marker at the start. + insta::assert_snapshot!( + matcher.format(format_options), + @"…she loves. Emily Henry: The Love That Split The World." + ); + + // Text containing a match unordered and a match ordered. 
+ let text = "The world split void void void void void void void void void split the world void void"; + let mut matcher = builder.build(text); + // crop should return 10 last words with a marker at the start. + insta::assert_snapshot!( + matcher.format(format_options), + @"…void void void void void split the world void void" + ); + + // Text containing matches with diferent density. + let text = "split void the void void world void void void void void void void void void void split the world void void"; + let mut matcher = builder.build(text); + // crop should return 10 last words with a marker at the start. + insta::assert_snapshot!( + matcher.format(format_options), + @"…void void void void void split the world void void" + ); + + // Text containing matches with same word. + let text = "split split split split split split void void void void void void void void void void split the world void void"; + let mut matcher = builder.build(text); + // crop should return 10 last words with a marker at the start. + insta::assert_snapshot!( + matcher.format(format_options), + @"…void void void void void split the world void void" + ); + } + + #[test] + fn format_highlight_crop() { + let temp_index = temp_index_with_documents(); + let rtxn = temp_index.read_txn().unwrap(); + let mut ctx = SearchContext::new(&temp_index, &rtxn); + let builder = MatcherBuilder::new_test(&mut ctx, "split the world"); + + let format_options = FormatOptions { highlight: true, crop: Some(10) }; + + // empty text. + let text = ""; + let mut matcher = builder.build(text); + insta::assert_snapshot!( + matcher.format(format_options), + @"" + ); + + // text containing only separators. + let text = ":-)"; + let mut matcher = builder.build(text); + insta::assert_snapshot!( + matcher.format(format_options), + @":-)" + ); + + // Text without any match. + let text = "A quick brown fox can not jump 32 feet, right? 
Brr, it is cold!"; + let mut matcher = builder.build(text); + // both should return 10 first words with a marker at the end. + insta::assert_snapshot!( + matcher.format(format_options), + @"A quick brown fox can not jump 32 feet, right…" + ); + + // Text containing some matches. + let text = "Natalie risk her future to build a world with the boy she loves."; + let mut matcher = builder.build(text); + // both should return 10 last words with a marker at the start and highlighted matches. + insta::assert_snapshot!( + matcher.format(format_options), + @"…future to build a world with the boy she loves…" + ); + + // Text containing all matches. + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; + let mut matcher = builder.build(text); + // both should return 10 last words with a marker at the start and highlighted matches. + insta::assert_snapshot!( + matcher.format(format_options), + @"…she loves. Emily Henry: The Love That Split The World." + ); + + // Text containing a match unordered and a match ordered. + let text = "The world split void void void void void void void void void split the world void void"; + let mut matcher = builder.build(text); + // crop should return 10 last words with a marker at the start. + insta::assert_snapshot!( + matcher.format(format_options), + @"…void void void void void split the world void void" + ); + } + + #[test] + fn smaller_crop_size() { + //! 
testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295 + let temp_index = temp_index_with_documents(); + let rtxn = temp_index.read_txn().unwrap(); + let mut ctx = SearchContext::new(&temp_index, &rtxn); + let builder = MatcherBuilder::new_test(&mut ctx, "split the world"); + + let text = "void void split the world void void."; + + // set a smaller crop size + let format_options = FormatOptions { highlight: false, crop: Some(2) }; + let mut matcher = builder.build(text); + // because crop size < query size, partially format matches. + insta::assert_snapshot!( + matcher.format(format_options), + @"…split the…" + ); + + // set a smaller crop size + let format_options = FormatOptions { highlight: false, crop: Some(1) }; + let mut matcher = builder.build(text); + // because crop size < query size, partially format matches. + insta::assert_snapshot!( + matcher.format(format_options), + @"…split…" + ); + + // set crop size to 0 + let format_options = FormatOptions { highlight: false, crop: Some(0) }; + let mut matcher = builder.build(text); + // because crop size is 0, crop is ignored. + insta::assert_snapshot!( + matcher.format(format_options), + @"void void split the world void void." 
+ ); + } + + #[test] + fn partial_matches() { + let temp_index = temp_index_with_documents(); + let rtxn = temp_index.read_txn().unwrap(); + let mut ctx = SearchContext::new(&temp_index, &rtxn); + let mut builder = MatcherBuilder::new_test(&mut ctx, "the \"t he\" door \"do or\""); + builder.highlight_prefix("_".to_string()); + builder.highlight_suffix("_".to_string()); + + let format_options = FormatOptions { highlight: true, crop: None }; + + let text = "the do or die can't be he do and or isn't he"; + let mut matcher = builder.build(text); + insta::assert_snapshot!( + matcher.format(format_options), + @"_the_ _do_ _or_ die can't be he do and or isn'_t_ _he_" + ); + } +} diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 4d561d25b..ef7e61ee1 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -4,6 +4,7 @@ mod graph_based_ranking_rule; mod interner; mod limits; mod logger; +mod matches; mod query_graph; mod query_term; mod ranking_rule_graph; From ebe23b04c9eaa2784f83a3718853534380ddc3b1 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 6 Apr 2023 12:28:28 +0200 Subject: [PATCH 2/9] Make the matcher consume the search context --- .../src/search/new/matches/matching_words.rs | 22 ++++----- milli/src/search/new/matches/mod.rs | 47 ++++++++++--------- 2 files changed, 35 insertions(+), 34 deletions(-) diff --git a/milli/src/search/new/matches/matching_words.rs b/milli/src/search/new/matches/matching_words.rs index a47a08c68..e737dc942 100644 --- a/milli/src/search/new/matches/matching_words.rs +++ b/milli/src/search/new/matches/matching_words.rs @@ -23,9 +23,9 @@ pub struct LocatedMatchingWords { /// Structure created from a query tree /// referencing words that match the given query tree. 
-pub struct MatchingWords<'ctx> { - word_interner: &'ctx DedupInterner, - phrase_interner: &'ctx DedupInterner, +pub struct MatchingWords { + word_interner: DedupInterner, + phrase_interner: DedupInterner, phrases: Vec, words: Vec, } @@ -82,8 +82,8 @@ fn extract_matching_terms(term: &QueryTerm) -> (Vec>, Vec MatchingWords<'ctx> { - pub fn new(ctx: &'ctx SearchContext, located_terms: Vec) -> Self { +impl MatchingWords { + pub fn new(ctx: SearchContext, located_terms: Vec) -> Self { let mut phrases = Vec::new(); let mut words = Vec::new(); @@ -112,18 +112,18 @@ impl<'ctx> MatchingWords<'ctx> { Self { phrases, words, - word_interner: &ctx.word_interner, - phrase_interner: &ctx.phrase_interner, + word_interner: ctx.word_interner, + phrase_interner: ctx.phrase_interner, } } /// Returns an iterator over terms that match or partially match the given token. - pub fn match_token<'b>(&'ctx self, token: &'b Token<'b>) -> MatchesIter<'ctx, 'b> { + pub fn match_token<'a, 'b>(&'a self, token: &'b Token<'b>) -> MatchesIter<'a, 'b> { MatchesIter { matching_words: self, phrases: Box::new(self.phrases.iter()), token } } /// Try to match the token with one of the located_words. - fn match_unique_words(&'ctx self, token: &Token) -> Option> { + fn match_unique_words<'a>(&'a self, token: &Token) -> Option> { for located_words in &self.words { for word in &located_words.value { let word = self.word_interner.get(*word); @@ -148,7 +148,7 @@ impl<'ctx> MatchingWords<'ctx> { /// Iterator over terms that match the given token, /// This allow to lazily evaluate matches. 
pub struct MatchesIter<'a, 'b> { - matching_words: &'a MatchingWords<'a>, + matching_words: &'a MatchingWords, phrases: Box + 'a>, token: &'b Token<'b>, } @@ -268,7 +268,7 @@ pub(crate) mod tests { let tokenizer = TokenizerBuilder::new().build(); let tokens = tokenizer.tokenize("split this world"); let query_terms = located_query_terms_from_string(&mut ctx, tokens, None).unwrap(); - let matching_words = MatchingWords::new(&ctx, query_terms); + let matching_words = MatchingWords::new(ctx, query_terms); assert_eq!( matching_words diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 33d0591a6..9b73c2098 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -14,17 +14,17 @@ const DEFAULT_HIGHLIGHT_PREFIX: &str = ""; const DEFAULT_HIGHLIGHT_SUFFIX: &str = ""; /// Structure used to build a Matcher allowing to customize formating tags. -pub struct MatcherBuilder<'a, 'ctx, A> { - matching_words: MatchingWords<'ctx>, +pub struct MatcherBuilder<'a, A> { + matching_words: MatchingWords, tokenizer: Tokenizer<'a, 'a, A>, crop_marker: Option, highlight_prefix: Option, highlight_suffix: Option, } -impl<'a, 'ctx, A> MatcherBuilder<'a, 'ctx, A> { +impl<'a, A> MatcherBuilder<'a, A> { pub fn new( - ctx: &'ctx SearchContext, + ctx: SearchContext, located_terms: Vec, tokenizer: Tokenizer<'a, 'a, A>, ) -> Self { @@ -112,7 +112,7 @@ pub struct MatchBounds { /// and format the source string, returning a highlighted and cropped sub-string. 
pub struct Matcher<'t, 'm, A> { text: &'t str, - matching_words: &'m MatchingWords<'m>, + matching_words: &'m MatchingWords, tokenizer: &'m Tokenizer<'m, 'm, A>, crop_marker: &'m str, highlight_prefix: &'m str, @@ -509,11 +509,11 @@ mod tests { use super::super::located_query_terms_from_string; use super::*; - impl<'a, 'ctx> MatcherBuilder<'a, 'ctx, &[u8]> { - pub fn new_test(ctx: &'ctx mut SearchContext, query: &'a str) -> Self { + impl<'a, 'ctx> MatcherBuilder<'a, &[u8]> { + pub fn new_test(mut ctx: SearchContext, query: &'a str) -> Self { let tokenizer = TokenizerBuilder::new().build(); let tokens = tokenizer.tokenize(query); - let query_terms = located_query_terms_from_string(ctx, tokens, None).unwrap(); + let query_terms = located_query_terms_from_string(&mut ctx, tokens, None).unwrap(); Self::new(ctx, query_terms, TokenizerBuilder::new().build()) } } @@ -522,8 +522,8 @@ mod tests { fn format_identity() { let temp_index = temp_index_with_documents(); let rtxn = temp_index.read_txn().unwrap(); - let mut ctx = SearchContext::new(&temp_index, &rtxn); - let builder = MatcherBuilder::new_test(&mut ctx, "split the world"); + let ctx = SearchContext::new(&temp_index, &rtxn); + let builder = MatcherBuilder::new_test(ctx, "split the world"); let format_options = FormatOptions { highlight: false, crop: None }; @@ -550,8 +550,8 @@ mod tests { fn format_highlight() { let temp_index = temp_index_with_documents(); let rtxn = temp_index.read_txn().unwrap(); - let mut ctx = SearchContext::new(&temp_index, &rtxn); - let builder = MatcherBuilder::new_test(&mut ctx, "split the world"); + let ctx = SearchContext::new(&temp_index, &rtxn); + let builder = MatcherBuilder::new_test(ctx, "split the world"); let format_options = FormatOptions { highlight: true, crop: None }; @@ -594,8 +594,8 @@ mod tests { fn highlight_unicode() { let temp_index = temp_index_with_documents(); let rtxn = temp_index.read_txn().unwrap(); - let mut ctx = SearchContext::new(&temp_index, &rtxn); - let 
builder = MatcherBuilder::new_test(&mut ctx, "world"); + let ctx = SearchContext::new(&temp_index, &rtxn); + let builder = MatcherBuilder::new_test(ctx, "world"); let format_options = FormatOptions { highlight: true, crop: None }; // Text containing prefix match. @@ -616,7 +616,8 @@ mod tests { @"Ŵôřlḑ" ); - let builder = MatcherBuilder::new_test(&mut ctx, "westfali"); + let ctx = SearchContext::new(&temp_index, &rtxn); + let builder = MatcherBuilder::new_test(ctx, "westfali"); let format_options = FormatOptions { highlight: true, crop: None }; // Text containing unicode match. @@ -633,8 +634,8 @@ mod tests { fn format_crop() { let temp_index = temp_index_with_documents(); let rtxn = temp_index.read_txn().unwrap(); - let mut ctx = SearchContext::new(&temp_index, &rtxn); - let builder = MatcherBuilder::new_test(&mut ctx, "split the world"); + let ctx = SearchContext::new(&temp_index, &rtxn); + let builder = MatcherBuilder::new_test(ctx, "split the world"); let format_options = FormatOptions { highlight: false, crop: Some(10) }; @@ -731,8 +732,8 @@ mod tests { fn format_highlight_crop() { let temp_index = temp_index_with_documents(); let rtxn = temp_index.read_txn().unwrap(); - let mut ctx = SearchContext::new(&temp_index, &rtxn); - let builder = MatcherBuilder::new_test(&mut ctx, "split the world"); + let ctx = SearchContext::new(&temp_index, &rtxn); + let builder = MatcherBuilder::new_test(ctx, "split the world"); let format_options = FormatOptions { highlight: true, crop: Some(10) }; @@ -794,8 +795,8 @@ mod tests { //! 
testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295 let temp_index = temp_index_with_documents(); let rtxn = temp_index.read_txn().unwrap(); - let mut ctx = SearchContext::new(&temp_index, &rtxn); - let builder = MatcherBuilder::new_test(&mut ctx, "split the world"); + let ctx = SearchContext::new(&temp_index, &rtxn); + let builder = MatcherBuilder::new_test(ctx, "split the world"); let text = "void void split the world void void."; @@ -831,8 +832,8 @@ mod tests { fn partial_matches() { let temp_index = temp_index_with_documents(); let rtxn = temp_index.read_txn().unwrap(); - let mut ctx = SearchContext::new(&temp_index, &rtxn); - let mut builder = MatcherBuilder::new_test(&mut ctx, "the \"t he\" door \"do or\""); + let ctx = SearchContext::new(&temp_index, &rtxn); + let mut builder = MatcherBuilder::new_test(ctx, "the \"t he\" door \"do or\""); builder.highlight_prefix("_".to_string()); builder.highlight_suffix("_".to_string()); From 9c5f64769a38d766fb96c350ed7396cc57cefe3f Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 6 Apr 2023 13:58:56 +0200 Subject: [PATCH 3/9] Integrate the new Highlighter in the search --- milli/src/lib.rs | 4 +- milli/src/search/matches/matching_words.rs | 3 +- milli/src/search/mod.rs | 8 ++- .../src/search/new/matches/matching_words.rs | 36 +++++++++++++ milli/src/search/new/matches/mod.rs | 13 ++--- milli/src/search/new/mod.rs | 54 ++++++++++--------- 6 files changed, 77 insertions(+), 41 deletions(-) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index eb63c3904..13e23a5bd 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -97,8 +97,8 @@ pub use self::heed_codec::{ }; pub use self::index::Index; pub use self::search::{ - FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWord, - MatchingWords, Search, SearchResult, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET, + FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWords, Search, + 
SearchResult, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET, }; pub type Result = std::result::Result; diff --git a/milli/src/search/matches/matching_words.rs b/milli/src/search/matches/matching_words.rs index 22241c457..5ccf0286f 100644 --- a/milli/src/search/matches/matching_words.rs +++ b/milli/src/search/matches/matching_words.rs @@ -289,8 +289,7 @@ mod tests { use charabia::TokenKind; - use super::*; - use crate::MatchingWords; + use super::{MatchingWords, *}; #[test] fn test_bytes_to_highlight() { diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 08803b73f..3683a5cf0 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -5,9 +5,7 @@ use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; pub use self::facet::{FacetDistribution, Filter, DEFAULT_VALUES_PER_FACET}; -pub use self::matches::{ - FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWord, MatchingWords, -}; +pub use self::new::matches::{FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWords}; use crate::{ execute_search, AscDesc, DefaultSearchLogger, DocumentId, Index, Result, SearchContext, }; @@ -109,9 +107,9 @@ impl<'a> Search<'a> { } pub fn execute(&self) -> Result { - let mut ctx = SearchContext::new(self.index, self.rtxn); + let ctx = SearchContext::new(self.index, self.rtxn); execute_search( - &mut ctx, + ctx, &self.query, self.terms_matching_strategy, self.exhaustive_number_hits, diff --git a/milli/src/search/new/matches/matching_words.rs b/milli/src/search/new/matches/matching_words.rs index e737dc942..4ca04884a 100644 --- a/milli/src/search/new/matches/matching_words.rs +++ b/milli/src/search/new/matches/matching_words.rs @@ -1,4 +1,5 @@ use std::cmp::Reverse; +use std::fmt; use std::ops::RangeInclusive; use charabia::Token; @@ -23,6 +24,7 @@ pub struct LocatedMatchingWords { /// Structure created from a query tree /// referencing words that match the given query tree. 
+#[derive(Default)] pub struct MatchingWords { word_interner: DedupInterner, phrase_interner: DedupInterner, @@ -240,6 +242,40 @@ impl<'a> PartialMatch<'a> { } } +impl fmt::Debug for MatchingWords { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + let MatchingWords { word_interner, phrase_interner, phrases, words } = self; + + let phrases: Vec<_> = phrases + .iter() + .map(|p| { + ( + phrase_interner + .get(p.value) + .words + .iter() + .map(|w| w.map_or("STOP_WORD", |w| word_interner.get(w))) + .collect::>() + .join(" "), + p.positions.clone(), + ) + }) + .collect(); + + let words: Vec<_> = words + .iter() + .flat_map(|w| { + w.value + .iter() + .map(|s| (word_interner.get(*s), w.positions.clone(), w.is_prefix)) + .collect::>() + }) + .collect(); + + f.debug_struct("MatchingWords").field("phrases", &phrases).field("words", &words).finish() + } +} + #[cfg(test)] pub(crate) mod tests { use std::borrow::Cow; diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 9b73c2098..2a9596902 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -1,7 +1,8 @@ use std::borrow::Cow; use charabia::{SeparatorKind, Token, Tokenizer}; -use matching_words::{MatchType, MatchingWords, PartialMatch, WordId}; +pub use matching_words::MatchingWords; +use matching_words::{MatchType, PartialMatch, WordId}; use serde::Serialize; use super::query_term::LocatedQueryTerm; @@ -23,12 +24,7 @@ pub struct MatcherBuilder<'a, A> { } impl<'a, A> MatcherBuilder<'a, A> { - pub fn new( - ctx: SearchContext, - located_terms: Vec, - tokenizer: Tokenizer<'a, 'a, A>, - ) -> Self { - let matching_words = MatchingWords::new(ctx, located_terms); + pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'a, 'a, A>) -> Self { Self { matching_words, tokenizer, @@ -514,7 +510,8 @@ mod tests { let tokenizer = TokenizerBuilder::new().build(); let tokens = tokenizer.tokenize(query); let query_terms = 
located_query_terms_from_string(&mut ctx, tokens, None).unwrap(); - Self::new(ctx, query_terms, TokenizerBuilder::new().build()) + let matching_words = MatchingWords::new(ctx, query_terms); + Self::new(matching_words, TokenizerBuilder::new().build()) } } diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index ef7e61ee1..0bb454c06 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -4,7 +4,7 @@ mod graph_based_ranking_rule; mod interner; mod limits; mod logger; -mod matches; +pub mod matches; mod query_graph; mod query_term; mod ranking_rule_graph; @@ -271,7 +271,7 @@ fn resolve_sort_criteria<'ctx, Query: RankingRuleQueryTrait>( #[allow(clippy::too_many_arguments)] pub fn execute_search( - ctx: &mut SearchContext, + mut ctx: SearchContext, query: &Option, terms_matching_strategy: TermsMatchingStrategy, exhaustive_number_hits: bool, @@ -284,21 +284,22 @@ pub fn execute_search( query_graph_logger: &mut dyn SearchLogger, ) -> Result { let mut universe = if let Some(filters) = filters { - filters.evaluate(ctx.txn, ctx.index)? + filters.evaluate(&mut ctx.txn, &mut ctx.index)? } else { - ctx.index.documents_ids(ctx.txn)? + ctx.index.documents_ids(&mut ctx.txn)? }; + let mut located_query_terms = None; let documents_ids = if let Some(query) = query { // We make sure that the analyzer is aware of the stop words // this ensures that the query builder is able to properly remove them. 
let mut tokbuilder = TokenizerBuilder::new(); - let stop_words = ctx.index.stop_words(ctx.txn)?; + let stop_words = &mut ctx.index.stop_words(&mut ctx.txn)?; if let Some(ref stop_words) = stop_words { tokbuilder.stop_words(stop_words); } - let script_lang_map = ctx.index.script_language(ctx.txn)?; + let script_lang_map = &mut ctx.index.script_language(&mut ctx.txn)?; if !script_lang_map.is_empty() { tokbuilder.allow_list(&script_lang_map); } @@ -306,27 +307,31 @@ pub fn execute_search( let tokenizer = tokbuilder.build(); let tokens = tokenizer.tokenize(query); - let query_terms = located_query_terms_from_string(ctx, tokens, words_limit)?; - let graph = QueryGraph::from_query(ctx, &query_terms)?; + let query_terms = located_query_terms_from_string(&mut ctx, tokens, words_limit)?; + let graph = QueryGraph::from_query(&mut ctx, &query_terms)?; + located_query_terms = Some(query_terms); - check_sort_criteria(ctx, sort_criteria.as_ref())?; + check_sort_criteria(&mut ctx, sort_criteria.as_ref())?; universe = resolve_maximally_reduced_query_graph( - ctx, + &mut ctx, &universe, &graph, terms_matching_strategy, query_graph_logger, )?; - let ranking_rules = - get_ranking_rules_for_query_graph_search(ctx, sort_criteria, terms_matching_strategy)?; + let ranking_rules = get_ranking_rules_for_query_graph_search( + &mut ctx, + sort_criteria, + terms_matching_strategy, + )?; - bucket_sort(ctx, ranking_rules, &graph, &universe, from, length, query_graph_logger)? + bucket_sort(&mut ctx, ranking_rules, &graph, &universe, from, length, query_graph_logger)? } else { - let ranking_rules = get_ranking_rules_for_placeholder_search(ctx, sort_criteria)?; + let ranking_rules = get_ranking_rules_for_placeholder_search(&mut ctx, sort_criteria)?; bucket_sort( - ctx, + &mut ctx, ranking_rules, &PlaceholderQuery, &universe, @@ -340,19 +345,20 @@ pub fn execute_search( // is requested and a distinct attribute is set. 
let mut candidates = universe; if exhaustive_number_hits { - if let Some(f) = ctx.index.distinct_field(ctx.txn)? { - if let Some(distinct_fid) = ctx.index.fields_ids_map(ctx.txn)?.id(f) { - candidates = apply_distinct_rule(ctx, distinct_fid, &candidates)?.remaining; + if let Some(f) = &mut ctx.index.distinct_field(&mut ctx.txn)? { + if let Some(distinct_fid) = ctx.index.fields_ids_map(&mut ctx.txn)?.id(f) { + candidates = apply_distinct_rule(&mut ctx, distinct_fid, &candidates)?.remaining; } } } - Ok(SearchResult { - // TODO: correct matching words - matching_words: MatchingWords::default(), - candidates, - documents_ids, - }) + // consume context and located_query_terms to build MatchingWords. + let matching_words = match located_query_terms { + Some(located_query_terms) => MatchingWords::new(ctx, located_query_terms), + None => MatchingWords::default(), + }; + + Ok(SearchResult { matching_words, candidates, documents_ids }) } fn check_sort_criteria(ctx: &SearchContext, sort_criteria: Option<&Vec>) -> Result<()> { From a1148c09c2b6e56f23428a04530a81edc9c657e8 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 6 Apr 2023 14:00:21 +0200 Subject: [PATCH 4/9] remove old matcher --- milli/src/search/matches/matching_words.rs | 457 ----------- milli/src/search/matches/mod.rs | 865 --------------------- milli/src/search/mod.rs | 1 - 3 files changed, 1323 deletions(-) delete mode 100644 milli/src/search/matches/matching_words.rs delete mode 100644 milli/src/search/matches/mod.rs diff --git a/milli/src/search/matches/matching_words.rs b/milli/src/search/matches/matching_words.rs deleted file mode 100644 index 5ccf0286f..000000000 --- a/milli/src/search/matches/matching_words.rs +++ /dev/null @@ -1,457 +0,0 @@ -use std::cmp::{min, Reverse}; -use std::collections::BTreeMap; -use std::fmt; -use std::ops::{Index, IndexMut}; -use std::rc::Rc; - -use charabia::Token; -use levenshtein_automata::{Distance, DFA}; - -use crate::error::InternalError; -use crate::search::build_dfa; 
-use crate::MAX_WORD_LENGTH; - -type IsPrefix = bool; - -/// Structure created from a query tree -/// referencing words that match the given query tree. -#[derive(Default)] -pub struct MatchingWords { - inner: Vec<(Vec>, Vec)>, -} - -impl fmt::Debug for MatchingWords { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - writeln!(f, "[")?; - for (matching_words, primitive_word_id) in self.inner.iter() { - writeln!(f, "({matching_words:?}, {primitive_word_id:?})")?; - } - writeln!(f, "]")?; - Ok(()) - } -} - -impl MatchingWords { - pub fn new( - mut matching_words: Vec<(Vec>, Vec)>, - ) -> crate::Result { - // if one of the matching_words vec doesn't contain a word. - if matching_words.iter().any(|(mw, _)| mw.is_empty()) { - return Err(InternalError::InvalidMatchingWords.into()); - } - - // Sort word by len in DESC order prioritizing the longuest matches, - // in order to highlight the longuest part of the matched word. - matching_words.sort_unstable_by_key(|(mw, _)| Reverse((mw.len(), mw[0].word.len()))); - - Ok(Self { inner: matching_words }) - } - - /// Returns an iterator over terms that match or partially match the given token. - pub fn match_token<'a, 'b>(&'a self, token: &'b Token<'b>) -> MatchesIter<'a, 'b> { - MatchesIter { inner: Box::new(self.inner.iter()), token } - } -} - -/// Iterator over terms that match the given token, -/// This allow to lazily evaluate matches. 
-pub struct MatchesIter<'a, 'b> { - #[allow(clippy::type_complexity)] - inner: Box>, Vec)> + 'a>, - token: &'b Token<'b>, -} - -impl<'a> Iterator for MatchesIter<'a, '_> { - type Item = MatchType<'a>; - - fn next(&mut self) -> Option { - match self.inner.next() { - Some((matching_words, ids)) => match matching_words[0].match_token(self.token) { - Some(char_len) => { - if matching_words.len() > 1 { - Some(MatchType::Partial(PartialMatch { - matching_words: &matching_words[1..], - ids, - char_len, - })) - } else { - Some(MatchType::Full { char_len, ids }) - } - } - None => self.next(), - }, - None => None, - } - } -} - -/// Id of a matching term corespounding to a word written by the end user. -pub type PrimitiveWordId = u8; - -/// Structure used to match a specific term. -pub struct MatchingWord { - pub dfa: DFA, - pub word: String, - pub typo: u8, - pub prefix: IsPrefix, -} - -impl fmt::Debug for MatchingWord { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("MatchingWord") - .field("word", &self.word) - .field("typo", &self.typo) - .field("prefix", &self.prefix) - .finish() - } -} - -impl PartialEq for MatchingWord { - fn eq(&self, other: &Self) -> bool { - self.prefix == other.prefix && self.typo == other.typo && self.word == other.word - } -} - -impl MatchingWord { - pub fn new(word: String, typo: u8, prefix: IsPrefix) -> Option { - if word.len() > MAX_WORD_LENGTH { - return None; - } - let dfa = build_dfa(&word, typo, prefix); - - Some(Self { dfa, word, typo, prefix }) - } - - /// Returns the lenght in chars of the match in case of the token matches the term. 
- pub fn match_token(&self, token: &Token) -> Option { - match self.dfa.eval(token.lemma()) { - Distance::Exact(t) if t <= self.typo => { - if self.prefix { - let len = bytes_to_highlight(token.lemma(), &self.word); - Some(token.original_lengths(len).0) - } else { - Some(token.original_lengths(token.lemma().len()).0) - } - } - _otherwise => None, - } - } -} - -/// A given token can partially match a query word for several reasons: -/// - split words -/// - multi-word synonyms -/// In these cases we need to match consecutively several tokens to consider that the match is full. -#[derive(Debug, PartialEq)] -pub enum MatchType<'a> { - Full { char_len: usize, ids: &'a [PrimitiveWordId] }, - Partial(PartialMatch<'a>), -} - -/// Structure helper to match several tokens in a row in order to complete a partial match. -#[derive(Debug, PartialEq)] -pub struct PartialMatch<'a> { - matching_words: &'a [Rc], - ids: &'a [PrimitiveWordId], - char_len: usize, -} - -impl<'a> PartialMatch<'a> { - /// Returns: - /// - None if the given token breaks the partial match - /// - Partial if the given token matches the partial match but doesn't complete it - /// - Full if the given token completes the partial match - pub fn match_token(self, token: &Token) -> Option> { - self.matching_words[0].match_token(token).map(|char_len| { - if self.matching_words.len() > 1 { - MatchType::Partial(PartialMatch { - matching_words: &self.matching_words[1..], - ids: self.ids, - char_len, - }) - } else { - MatchType::Full { char_len, ids: self.ids } - } - }) - } - - pub fn char_len(&self) -> usize { - self.char_len - } -} - -// A simple wrapper around vec so we can get contiguous but index it like it's 2D array. 
-struct N2Array { - y_size: usize, - buf: Vec, -} - -impl N2Array { - fn new(x: usize, y: usize, value: T) -> N2Array { - N2Array { y_size: y, buf: vec![value; x * y] } - } -} - -impl Index<(usize, usize)> for N2Array { - type Output = T; - - #[inline] - fn index(&self, (x, y): (usize, usize)) -> &T { - &self.buf[(x * self.y_size) + y] - } -} - -impl IndexMut<(usize, usize)> for N2Array { - #[inline] - fn index_mut(&mut self, (x, y): (usize, usize)) -> &mut T { - &mut self.buf[(x * self.y_size) + y] - } -} - -/// Returns the number of **bytes** we want to highlight in the `source` word. -/// Basically we want to highlight as much characters as possible in the source until it has too much -/// typos (= 2) -/// The algorithm is a modified -/// [Damerau-Levenshtein](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) -fn bytes_to_highlight(source: &str, target: &str) -> usize { - let n = source.chars().count(); - let m = target.chars().count(); - - if n == 0 { - return 0; - } - // since we allow two typos we can send two characters even if it's completely wrong - if m < 3 { - return source.chars().take(m).map(|c| c.len_utf8()).sum(); - } - if n == m && source == target { - return source.len(); - } - - let inf = n + m; - let mut matrix = N2Array::new(n + 2, m + 2, 0); - - matrix[(0, 0)] = inf; - for i in 0..=n { - matrix[(i + 1, 0)] = inf; - matrix[(i + 1, 1)] = i; - } - for j in 0..=m { - matrix[(0, j + 1)] = inf; - matrix[(1, j + 1)] = j; - } - - let mut last_row = BTreeMap::new(); - - for (row, char_s) in source.chars().enumerate() { - let mut last_match_col = 0; - let row = row + 1; - - for (col, char_t) in target.chars().enumerate() { - let col = col + 1; - let last_match_row = *last_row.get(&char_t).unwrap_or(&0); - let cost = usize::from(char_s != char_t); - - let dist_add = matrix[(row, col + 1)] + 1; - let dist_del = matrix[(row + 1, col)] + 1; - let dist_sub = matrix[(row, col)] + cost; - let dist_trans = matrix[(last_match_row, 
last_match_col)] - + (row - last_match_row - 1) - + 1 - + (col - last_match_col - 1); - let dist = min(min(dist_add, dist_del), min(dist_sub, dist_trans)); - matrix[(row + 1, col + 1)] = dist; - - if cost == 0 { - last_match_col = col; - } - } - - last_row.insert(char_s, row); - } - - let mut minimum = (u32::max_value(), 0); - for x in 0..=m { - let dist = matrix[(n + 1, x + 1)] as u32; - if dist < minimum.0 { - minimum = (dist, x); - } - } - - // everything was done characters wise and now we want to returns a number of bytes - source.chars().take(minimum.1).map(|c| c.len_utf8()).sum() -} - -#[cfg(test)] -mod tests { - use std::borrow::Cow; - use std::str::from_utf8; - - use charabia::TokenKind; - - use super::{MatchingWords, *}; - - #[test] - fn test_bytes_to_highlight() { - struct TestBytesToHighlight { - query: &'static str, - text: &'static str, - length: usize, - } - let tests = [ - TestBytesToHighlight { query: "bip", text: "bip", length: "bip".len() }, - TestBytesToHighlight { query: "bip", text: "boup", length: "bip".len() }, - TestBytesToHighlight { - query: "Levenshtein", - text: "Levenshtein", - length: "Levenshtein".len(), - }, - // we get to the end of our word with only one typo - TestBytesToHighlight { - query: "Levenste", - text: "Levenshtein", - length: "Levenste".len(), - }, - // we get our third and last authorized typo right on the last character - TestBytesToHighlight { - query: "Levenstein", - text: "Levenshte", - length: "Levenste".len(), - }, - // we get to the end of our word with only two typos at the beginning - TestBytesToHighlight { - query: "Bavenshtein", - text: "Levenshtein", - length: "Bavenshtein".len(), - }, - TestBytesToHighlight { - query: "Альфа", text: "Альфой", length: "Альф".len() - }, - TestBytesToHighlight { - query: "Go💼", text: "Go💼od luck.", length: "Go💼".len() - }, - TestBytesToHighlight { - query: "Go💼od", text: "Go💼od luck.", length: "Go💼od".len() - }, - TestBytesToHighlight { - query: "chäräcters", - text: 
"chäräcters", - length: "chäräcters".len(), - }, - TestBytesToHighlight { query: "ch", text: "chäräcters", length: "ch".len() }, - TestBytesToHighlight { query: "chär", text: "chäräcters", length: "chär".len() }, - ]; - - for test in &tests { - let length = bytes_to_highlight(test.text, test.query); - assert_eq!(length, test.length, r#"lenght between: "{}" "{}""#, test.query, test.text); - assert!( - from_utf8(&test.query.as_bytes()[..length]).is_ok(), - r#"converting {}[..{}] to an utf8 str failed"#, - test.query, - length - ); - } - } - - #[test] - fn matching_words() { - let all = vec![ - Rc::new(MatchingWord::new("split".to_string(), 1, true).unwrap()), - Rc::new(MatchingWord::new("this".to_string(), 0, false).unwrap()), - Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()), - ]; - let matching_words = vec![ - (vec![all[0].clone()], vec![0]), - (vec![all[1].clone()], vec![1]), - (vec![all[2].clone()], vec![2]), - ]; - - let matching_words = MatchingWords::new(matching_words).unwrap(); - - assert_eq!( - matching_words - .match_token(&Token { - kind: TokenKind::Word, - lemma: Cow::Borrowed("word"), - char_end: "word".chars().count(), - byte_end: "word".len(), - ..Default::default() - }) - .next(), - Some(MatchType::Full { char_len: 3, ids: &[2] }) - ); - assert_eq!( - matching_words - .match_token(&Token { - kind: TokenKind::Word, - lemma: Cow::Borrowed("nyc"), - char_end: "nyc".chars().count(), - byte_end: "nyc".len(), - ..Default::default() - }) - .next(), - None - ); - assert_eq!( - matching_words - .match_token(&Token { - kind: TokenKind::Word, - lemma: Cow::Borrowed("world"), - char_end: "world".chars().count(), - byte_end: "world".len(), - ..Default::default() - }) - .next(), - Some(MatchType::Full { char_len: 5, ids: &[2] }) - ); - assert_eq!( - matching_words - .match_token(&Token { - kind: TokenKind::Word, - lemma: Cow::Borrowed("splitted"), - char_end: "splitted".chars().count(), - byte_end: "splitted".len(), - ..Default::default() - }) - 
.next(), - Some(MatchType::Full { char_len: 5, ids: &[0] }) - ); - assert_eq!( - matching_words - .match_token(&Token { - kind: TokenKind::Word, - lemma: Cow::Borrowed("thisnew"), - char_end: "thisnew".chars().count(), - byte_end: "thisnew".len(), - ..Default::default() - }) - .next(), - None - ); - assert_eq!( - matching_words - .match_token(&Token { - kind: TokenKind::Word, - lemma: Cow::Borrowed("borld"), - char_end: "borld".chars().count(), - byte_end: "borld".len(), - ..Default::default() - }) - .next(), - Some(MatchType::Full { char_len: 5, ids: &[2] }) - ); - assert_eq!( - matching_words - .match_token(&Token { - kind: TokenKind::Word, - lemma: Cow::Borrowed("wordsplit"), - char_end: "wordsplit".chars().count(), - byte_end: "wordsplit".len(), - ..Default::default() - }) - .next(), - Some(MatchType::Full { char_len: 4, ids: &[2] }) - ); - } -} diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs deleted file mode 100644 index c634ae297..000000000 --- a/milli/src/search/matches/mod.rs +++ /dev/null @@ -1,865 +0,0 @@ -use std::borrow::Cow; - -use charabia::{SeparatorKind, Token, Tokenizer}; -use matching_words::{MatchType, PartialMatch, PrimitiveWordId}; -pub use matching_words::{MatchingWord, MatchingWords}; -use serde::Serialize; - -pub mod matching_words; - -const DEFAULT_CROP_MARKER: &str = "…"; -const DEFAULT_HIGHLIGHT_PREFIX: &str = ""; -const DEFAULT_HIGHLIGHT_SUFFIX: &str = ""; - -/// Structure used to build a Matcher allowing to customize formating tags. 
-pub struct MatcherBuilder<'a, A> { - matching_words: MatchingWords, - tokenizer: Tokenizer<'a, 'a, A>, - crop_marker: Option, - highlight_prefix: Option, - highlight_suffix: Option, -} - -impl<'a, A> MatcherBuilder<'a, A> { - pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'a, 'a, A>) -> Self { - Self { - matching_words, - tokenizer, - crop_marker: None, - highlight_prefix: None, - highlight_suffix: None, - } - } - - pub fn crop_marker(&mut self, marker: String) -> &Self { - self.crop_marker = Some(marker); - self - } - - pub fn highlight_prefix(&mut self, prefix: String) -> &Self { - self.highlight_prefix = Some(prefix); - self - } - - pub fn highlight_suffix(&mut self, suffix: String) -> &Self { - self.highlight_suffix = Some(suffix); - self - } - - pub fn build<'t, 'm>(&'m self, text: &'t str) -> Matcher<'t, 'm, A> { - let crop_marker = match &self.crop_marker { - Some(marker) => marker.as_str(), - None => DEFAULT_CROP_MARKER, - }; - - let highlight_prefix = match &self.highlight_prefix { - Some(marker) => marker.as_str(), - None => DEFAULT_HIGHLIGHT_PREFIX, - }; - let highlight_suffix = match &self.highlight_suffix { - Some(marker) => marker.as_str(), - None => DEFAULT_HIGHLIGHT_SUFFIX, - }; - Matcher { - text, - matching_words: &self.matching_words, - tokenizer: &self.tokenizer, - crop_marker, - highlight_prefix, - highlight_suffix, - matches: None, - } - } -} - -#[derive(Copy, Clone, Default)] -pub struct FormatOptions { - pub highlight: bool, - pub crop: Option, -} - -impl FormatOptions { - pub fn merge(self, other: Self) -> Self { - Self { highlight: self.highlight || other.highlight, crop: self.crop.or(other.crop) } - } -} - -#[derive(Clone, Debug)] -pub struct Match { - match_len: usize, - // ids of the query words that matches. - ids: Vec, - // position of the word in the whole text. - word_position: usize, - // position of the token in the whole text. 
- token_position: usize, -} - -#[derive(Serialize, Debug, Clone, PartialEq, Eq)] -pub struct MatchBounds { - pub start: usize, - pub length: usize, -} - -/// Structure used to analize a string, compute words that match, -/// and format the source string, returning a highlighted and cropped sub-string. -pub struct Matcher<'t, 'm, A> { - text: &'t str, - matching_words: &'m MatchingWords, - tokenizer: &'m Tokenizer<'m, 'm, A>, - crop_marker: &'m str, - highlight_prefix: &'m str, - highlight_suffix: &'m str, - matches: Option<(Vec>, Vec)>, -} - -impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> { - /// Iterates over tokens and save any of them that matches the query. - fn compute_matches(&mut self) -> &mut Self { - /// some words are counted as matches only if they are close together and in the good order, - /// compute_partial_match peek into next words to validate if the match is complete. - fn compute_partial_match<'a>( - mut partial: PartialMatch, - token_position: usize, - word_position: usize, - words_positions: &mut impl Iterator)>, - matches: &mut Vec, - ) -> bool { - let mut potential_matches = vec![(token_position, word_position, partial.char_len())]; - - for (token_position, word_position, word) in words_positions { - partial = match partial.match_token(word) { - // token matches the partial match, but the match is not full, - // we temporarly save the current token then we try to match the next one. - Some(MatchType::Partial(partial)) => { - potential_matches.push((token_position, word_position, partial.char_len())); - partial - } - // partial match is now full, we keep this matches and we advance positions - Some(MatchType::Full { char_len, ids }) => { - // save previously matched tokens as matches. 
- let iter = potential_matches.into_iter().map( - |(token_position, word_position, match_len)| Match { - match_len, - ids: ids.to_vec(), - word_position, - token_position, - }, - ); - matches.extend(iter); - - // save the token that closes the partial match as a match. - matches.push(Match { - match_len: char_len, - ids: ids.to_vec(), - word_position, - token_position, - }); - - // the match is complete, we return true. - return true; - } - // no match, continue to next match. - None => break, - }; - } - - // the match is not complete, we return false. - false - } - - let tokens: Vec<_> = self.tokenizer.tokenize(self.text).collect(); - let mut matches = Vec::new(); - - let mut words_positions = tokens - .iter() - .scan((0, 0), |(token_position, word_position), token| { - let current_token_position = *token_position; - let current_word_position = *word_position; - *token_position += 1; - if !token.is_separator() { - *word_position += 1; - } - - Some((current_token_position, current_word_position, token)) - }) - .filter(|(_, _, token)| !token.is_separator()); - - while let Some((token_position, word_position, word)) = words_positions.next() { - for match_type in self.matching_words.match_token(word) { - match match_type { - // we match, we save the current token as a match, - // then we continue the rest of the tokens. - MatchType::Full { char_len, ids } => { - matches.push(Match { - match_len: char_len, - ids: ids.to_vec(), - word_position, - token_position, - }); - break; - } - // we match partially, iterate over next tokens to check if we can complete the match. - MatchType::Partial(partial) => { - // if match is completed, we break the matching loop over the current token, - // then we continue the rest of the tokens. 
- let mut wp = words_positions.clone(); - if compute_partial_match( - partial, - token_position, - word_position, - &mut wp, - &mut matches, - ) { - words_positions = wp; - break; - } - } - } - } - } - - self.matches = Some((tokens, matches)); - self - } - - /// Returns boundaries of the words that match the query. - pub fn matches(&mut self) -> Vec { - match &self.matches { - None => self.compute_matches().matches(), - Some((tokens, matches)) => matches - .iter() - .map(|m| MatchBounds { - start: tokens[m.token_position].byte_start, - length: m.match_len, - }) - .collect(), - } - } - - /// Returns the bounds in byte index of the crop window. - fn crop_bounds(&self, tokens: &[Token], matches: &[Match], crop_size: usize) -> (usize, usize) { - // if there is no match, we start from the beginning of the string by default. - let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0); - let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0); - let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0); - let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0); - - // matches needs to be counted in the crop len. - let mut remaining_words = crop_size + first_match_word_position - last_match_word_position; - - // create the initial state of the crop window: 2 iterators starting from the matches positions, - // a reverse iterator starting from the first match token position and going towards the beginning of the text, - let mut before_tokens = tokens[..first_match_token_position].iter().rev().peekable(); - // an iterator starting from the last match token position and going towards the end of the text. 
- let mut after_tokens = tokens[last_match_token_position..].iter().peekable(); - - // grows the crop window peeking in both directions - // until the window contains the good number of words: - while remaining_words > 0 { - let before_token = before_tokens.peek().map(|t| t.separator_kind()); - let after_token = after_tokens.peek().map(|t| t.separator_kind()); - - match (before_token, after_token) { - // we can expand both sides. - (Some(before_token), Some(after_token)) => { - match (before_token, after_token) { - // if they are both separators and are the same kind then advance both, - // or expand in the soft separator separator side. - (Some(before_token_kind), Some(after_token_kind)) => { - if before_token_kind == after_token_kind { - before_tokens.next(); - - // this avoid having an ending separator before crop marker. - if remaining_words > 1 { - after_tokens.next(); - } - } else if before_token_kind == SeparatorKind::Hard { - after_tokens.next(); - } else { - before_tokens.next(); - } - } - // if one of the tokens is a word, we expend in the side of the word. - // left is a word, advance left. - (None, Some(_)) => { - before_tokens.next(); - remaining_words -= 1; - } - // right is a word, advance right. - (Some(_), None) => { - after_tokens.next(); - remaining_words -= 1; - } - // both are words, advance left then right if remaining_word > 0. - (None, None) => { - before_tokens.next(); - remaining_words -= 1; - - if remaining_words > 0 { - after_tokens.next(); - remaining_words -= 1; - } - } - } - } - // the end of the text is reached, advance left. - (Some(before_token), None) => { - before_tokens.next(); - if before_token.is_none() { - remaining_words -= 1; - } - } - // the start of the text is reached, advance right. - (None, Some(after_token)) => { - after_tokens.next(); - if after_token.is_none() { - remaining_words -= 1; - } - } - // no more token to add. 
- (None, None) => break, - } - } - - // finally, keep the byte index of each bound of the crop window. - let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end); - let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start); - - (crop_byte_start, crop_byte_end) - } - - /// Compute the score of a match interval: - /// 1) count unique matches - /// 2) calculate distance between matches - /// 3) count ordered matches - fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) { - let mut ids: Vec = Vec::with_capacity(matches.len()); - let mut order_score = 0; - let mut distance_score = 0; - - let mut iter = matches.iter().peekable(); - while let Some(m) = iter.next() { - if let Some(next_match) = iter.peek() { - // if matches are ordered - if next_match.ids.iter().min() > m.ids.iter().min() { - order_score += 1; - } - - // compute distance between matches - distance_score -= (next_match.word_position - m.word_position).min(7) as i16; - } - - ids.extend(m.ids.iter()); - } - - ids.sort_unstable(); - ids.dedup(); - let uniq_score = ids.len() as i16; - - // rank by unique match count, then by distance between matches, then by ordered match count. - (uniq_score, distance_score, order_score) - } - - /// Returns the matches interval where the score computed by match_interval_score is the best. - fn find_best_match_interval<'a>(&self, matches: &'a [Match], crop_size: usize) -> &'a [Match] { - // we compute the matches interval if we have at least 2 matches. - if matches.len() > 1 { - // positions of the first and the last match of the best matches interval in `matches`. - let mut best_interval = (0, 0); - let mut best_interval_score = self.match_interval_score(&matches[0..=0]); - // current interval positions. 
- let mut interval_first = 0; - let mut interval_last = 0; - for (index, next_match) in matches.iter().enumerate().skip(1) { - // if next match would make interval gross more than crop_size, - // we compare the current interval with the best one, - // then we increase `interval_first` until next match can be added. - if next_match.word_position - matches[interval_first].word_position >= crop_size { - let interval_score = - self.match_interval_score(&matches[interval_first..=interval_last]); - - // keep interval if it's the best - if interval_score > best_interval_score { - best_interval = (interval_first, interval_last); - best_interval_score = interval_score; - } - - // advance start of the interval while interval is longer than crop_size. - while next_match.word_position - matches[interval_first].word_position - >= crop_size - { - interval_first += 1; - } - } - interval_last = index; - } - - // compute the last interval score and compare it to the best one. - let interval_score = - self.match_interval_score(&matches[interval_first..=interval_last]); - if interval_score > best_interval_score { - best_interval = (interval_first, interval_last); - } - - &matches[best_interval.0..=best_interval.1] - } else { - matches - } - } - - // Returns the formatted version of the original text. - pub fn format(&mut self, format_options: FormatOptions) -> Cow<'t, str> { - if !format_options.highlight && format_options.crop.is_none() { - // compute matches is not needed if no highlight nor crop is requested. - Cow::Borrowed(self.text) - } else { - match &self.matches { - Some((tokens, matches)) => { - // If the text has to be cropped, - // compute the best interval to crop around. - let matches = match format_options.crop { - Some(crop_size) if crop_size > 0 => { - self.find_best_match_interval(matches, crop_size) - } - _ => matches, - }; - - // If the text has to be cropped, - // crop around the best interval. 
- let (byte_start, byte_end) = match format_options.crop { - Some(crop_size) if crop_size > 0 => { - self.crop_bounds(tokens, matches, crop_size) - } - _ => (0, self.text.len()), - }; - - let mut formatted = Vec::new(); - - // push crop marker if it's not the start of the text. - if byte_start > 0 && !self.crop_marker.is_empty() { - formatted.push(self.crop_marker); - } - - let mut byte_index = byte_start; - - if format_options.highlight { - // insert highlight markers around matches. - for m in matches { - let token = &tokens[m.token_position]; - - if byte_index < token.byte_start { - formatted.push(&self.text[byte_index..token.byte_start]); - } - - let highlight_byte_index = self.text[token.byte_start..] - .char_indices() - .enumerate() - .find(|(i, _)| *i == m.match_len) - .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start); - formatted.push(self.highlight_prefix); - formatted.push(&self.text[token.byte_start..highlight_byte_index]); - formatted.push(self.highlight_suffix); - // if it's a prefix highlight, we put the end of the word after the highlight marker. - if highlight_byte_index < token.byte_end { - formatted.push(&self.text[highlight_byte_index..token.byte_end]); - } - - byte_index = token.byte_end; - } - } - - // push the rest of the text between last match and the end of crop. - if byte_index < byte_end { - formatted.push(&self.text[byte_index..byte_end]); - } - - // push crop marker if it's not the end of the text. - if byte_end < self.text.len() && !self.crop_marker.is_empty() { - formatted.push(self.crop_marker); - } - - if formatted.len() == 1 { - // avoid concatenating if there is already 1 slice. 
- Cow::Borrowed(&self.text[byte_start..byte_end]) - } else { - Cow::Owned(formatted.concat()) - } - } - None => self.compute_matches().format(format_options), - } - } - } -} - -#[cfg(test)] -mod tests { - use std::rc::Rc; - - use charabia::TokenizerBuilder; - - use super::*; - use crate::search::matches::matching_words::MatchingWord; - - fn matching_words() -> MatchingWords { - let all = vec![ - Rc::new(MatchingWord::new("split".to_string(), 0, false).unwrap()), - Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap()), - Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()), - ]; - let matching_words = vec![ - (vec![all[0].clone()], vec![0]), - (vec![all[1].clone()], vec![1]), - (vec![all[2].clone()], vec![2]), - ]; - - MatchingWords::new(matching_words).unwrap() - } - - impl MatcherBuilder<'_, Vec> { - pub fn from_matching_words(matching_words: MatchingWords) -> Self { - Self::new(matching_words, TokenizerBuilder::default().build()) - } - } - - #[test] - fn format_identity() { - let matching_words = matching_words(); - - let builder = MatcherBuilder::from_matching_words(matching_words); - - let format_options = FormatOptions { highlight: false, crop: None }; - - // Text without any match. - let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; - let mut matcher = builder.build(text); - // no crop and no highlight should return complete text. - assert_eq!(&matcher.format(format_options), &text); - - // Text containing all matches. - let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; - let mut matcher = builder.build(text); - // no crop and no highlight should return complete text. - assert_eq!(&matcher.format(format_options), &text); - - // Text containing some matches. - let text = "Natalie risk her future to build a world with the boy she loves."; - let mut matcher = builder.build(text); - // no crop and no highlight should return complete text. 
- assert_eq!(&matcher.format(format_options), &text); - } - - #[test] - fn format_highlight() { - let matching_words = matching_words(); - - let builder = MatcherBuilder::from_matching_words(matching_words); - - let format_options = FormatOptions { highlight: true, crop: None }; - - // empty text. - let text = ""; - let mut matcher = builder.build(text); - assert_eq!(&matcher.format(format_options), ""); - - // text containing only separators. - let text = ":-)"; - let mut matcher = builder.build(text); - assert_eq!(&matcher.format(format_options), ":-)"); - - // Text without any match. - let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; - let mut matcher = builder.build(text); - // no crop should return complete text, because there is no matches. - assert_eq!(&matcher.format(format_options), &text); - - // Text containing all matches. - let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; - let mut matcher = builder.build(text); - // no crop should return complete text with highlighted matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World." - ); - - // Text containing some matches. - let text = "Natalie risk her future to build a world with the boy she loves."; - let mut matcher = builder.build(text); - // no crop should return complete text with highlighted matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"Natalie risk her future to build a world with the boy she loves." 
- ); - } - - #[test] - fn highlight_unicode() { - let all = vec![ - Rc::new(MatchingWord::new("wessfali".to_string(), 1, true).unwrap()), - Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()), - ]; - let matching_words = vec![(vec![all[0].clone()], vec![0]), (vec![all[1].clone()], vec![1])]; - - let matching_words = MatchingWords::new(matching_words).unwrap(); - - let builder = MatcherBuilder::from_matching_words(matching_words); - - let format_options = FormatOptions { highlight: true, crop: None }; - - // Text containing prefix match. - let text = "Ŵôřlḑôle"; - let mut matcher = builder.build(text); - // no crop should return complete text with highlighted matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"Ŵôřlḑôle" - ); - - // Text containing unicode match. - let text = "Ŵôřlḑ"; - let mut matcher = builder.build(text); - // no crop should return complete text with highlighted matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"Ŵôřlḑ" - ); - - // Text containing unicode match. - let text = "Westfália"; - let mut matcher = builder.build(text); - // no crop should return complete text with highlighted matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"Westfália" - ); - } - - #[test] - fn format_crop() { - let matching_words = matching_words(); - - let builder = MatcherBuilder::from_matching_words(matching_words); - - let format_options = FormatOptions { highlight: false, crop: Some(10) }; - - // empty text. - let text = ""; - let mut matcher = builder.build(text); - insta::assert_snapshot!( - matcher.format(format_options), - @"" - ); - - // text containing only separators. - let text = ":-)"; - let mut matcher = builder.build(text); - insta::assert_snapshot!( - matcher.format(format_options), - @":-)" - ); - - // Text without any match. - let text = "A quick brown fox can not jump 32 feet, right? 
Brr, it is cold!"; - let mut matcher = builder.build(text); - // no highlight should return 10 first words with a marker at the end. - insta::assert_snapshot!( - matcher.format(format_options), - @"A quick brown fox can not jump 32 feet, right…" - ); - - // Text without any match starting by a separator. - let text = "(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)"; - let mut matcher = builder.build(text); - // no highlight should return 10 first words with a marker at the end. - insta::assert_snapshot!( - matcher.format(format_options), - @"(A quick brown fox can not jump 32 feet, right…" - ); - - // Test phrase propagation - let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it."; - let mut matcher = builder.build(text); - // should crop the phrase instead of croping around the match. - insta::assert_snapshot!( - matcher.format(format_options), - @"… Split The World is a book written by Emily Henry…" - ); - - // Text containing some matches. - let text = "Natalie risk her future to build a world with the boy she loves."; - let mut matcher = builder.build(text); - // no highlight should return 10 last words with a marker at the start. - insta::assert_snapshot!( - matcher.format(format_options), - @"…future to build a world with the boy she loves…" - ); - - // Text containing all matches. - let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; - let mut matcher = builder.build(text); - // no highlight should return 10 last words with a marker at the start. - insta::assert_snapshot!( - matcher.format(format_options), - @"…she loves. Emily Henry: The Love That Split The World." - ); - - // Text containing a match unordered and a match ordered. 
- let text = "The world split void void void void void void void void void split the world void void"; - let mut matcher = builder.build(text); - // crop should return 10 last words with a marker at the start. - insta::assert_snapshot!( - matcher.format(format_options), - @"…void void void void void split the world void void" - ); - - // Text containing matches with diferent density. - let text = "split void the void void world void void void void void void void void void void split the world void void"; - let mut matcher = builder.build(text); - // crop should return 10 last words with a marker at the start. - insta::assert_snapshot!( - matcher.format(format_options), - @"…void void void void void split the world void void" - ); - - // Text containing matches with same word. - let text = "split split split split split split void void void void void void void void void void split the world void void"; - let mut matcher = builder.build(text); - // crop should return 10 last words with a marker at the start. - insta::assert_snapshot!( - matcher.format(format_options), - @"…void void void void void split the world void void" - ); - } - - #[test] - fn format_highlight_crop() { - let matching_words = matching_words(); - - let builder = MatcherBuilder::from_matching_words(matching_words); - - let format_options = FormatOptions { highlight: true, crop: Some(10) }; - - // empty text. - let text = ""; - let mut matcher = builder.build(text); - insta::assert_snapshot!( - matcher.format(format_options), - @"" - ); - - // text containing only separators. - let text = ":-)"; - let mut matcher = builder.build(text); - insta::assert_snapshot!( - matcher.format(format_options), - @":-)" - ); - - // Text without any match. - let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; - let mut matcher = builder.build(text); - // both should return 10 first words with a marker at the end. 
- insta::assert_snapshot!( - matcher.format(format_options), - @"A quick brown fox can not jump 32 feet, right…" - ); - - // Text containing some matches. - let text = "Natalie risk her future to build a world with the boy she loves."; - let mut matcher = builder.build(text); - // both should return 10 last words with a marker at the start and highlighted matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"…future to build a world with the boy she loves…" - ); - - // Text containing all matches. - let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; - let mut matcher = builder.build(text); - // both should return 10 last words with a marker at the start and highlighted matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"…she loves. Emily Henry: The Love That Split The World." - ); - - // Text containing a match unordered and a match ordered. - let text = "The world split void void void void void void void void void split the world void void"; - let mut matcher = builder.build(text); - // crop should return 10 last words with a marker at the start. - insta::assert_snapshot!( - matcher.format(format_options), - @"…void void void void void split the world void void" - ); - } - - #[test] - fn smaller_crop_size() { - //! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295 - let matching_words = matching_words(); - - let builder = MatcherBuilder::from_matching_words(matching_words); - - let text = "void void split the world void void."; - - // set a smaller crop size - let format_options = FormatOptions { highlight: false, crop: Some(2) }; - let mut matcher = builder.build(text); - // because crop size < query size, partially format matches. 
- insta::assert_snapshot!( - matcher.format(format_options), - @"…split the…" - ); - - // set a smaller crop size - let format_options = FormatOptions { highlight: false, crop: Some(1) }; - let mut matcher = builder.build(text); - // because crop size < query size, partially format matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"…split…" - ); - - // set crop size to 0 - let format_options = FormatOptions { highlight: false, crop: Some(0) }; - let mut matcher = builder.build(text); - // because crop size is 0, crop is ignored. - insta::assert_snapshot!( - matcher.format(format_options), - @"void void split the world void void." - ); - } - - #[test] - fn partial_matches() { - let all = vec![ - Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap()), - Rc::new(MatchingWord::new("t".to_string(), 0, false).unwrap()), - Rc::new(MatchingWord::new("he".to_string(), 0, false).unwrap()), - Rc::new(MatchingWord::new("door".to_string(), 0, false).unwrap()), - Rc::new(MatchingWord::new("do".to_string(), 0, false).unwrap()), - Rc::new(MatchingWord::new("or".to_string(), 0, false).unwrap()), - ]; - let matching_words = vec![ - (vec![all[0].clone()], vec![0]), - (vec![all[1].clone(), all[2].clone()], vec![0]), - (vec![all[3].clone()], vec![1]), - (vec![all[4].clone(), all[5].clone()], vec![1]), - (vec![all[4].clone()], vec![2]), - ]; - - let matching_words = MatchingWords::new(matching_words).unwrap(); - - let mut builder = MatcherBuilder::from_matching_words(matching_words); - builder.highlight_prefix("_".to_string()); - builder.highlight_suffix("_".to_string()); - - let format_options = FormatOptions { highlight: true, crop: None }; - - let text = "the do or die can't be he do and or isn't he"; - let mut matcher = builder.build(text); - insta::assert_snapshot!( - matcher.format(format_options), - @"_the_ _do_ _or_ die can't be he _do_ and or isn'_t_ _he_" - ); - } -} diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 
3683a5cf0..3e372e551 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -17,7 +17,6 @@ static LEVDIST2: Lazy = Lazy::new(|| LevBuilder::new(2, true)); pub mod facet; mod fst_utils; -mod matches; pub mod new; pub struct Search<'a> { From ae17c62e24583cbc12d79701937803d42d7707a1 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 6 Apr 2023 14:07:18 +0200 Subject: [PATCH 5/9] Remove warnings --- milli/src/search/new/matches/mod.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 2a9596902..2b87963ab 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -5,9 +5,6 @@ pub use matching_words::MatchingWords; use matching_words::{MatchType, PartialMatch, WordId}; use serde::Serialize; -use super::query_term::LocatedQueryTerm; -use crate::SearchContext; - pub mod matching_words; const DEFAULT_CROP_MARKER: &str = "…"; @@ -504,6 +501,7 @@ mod tests { use super::super::located_query_terms_from_string; use super::*; + use crate::SearchContext; impl<'a, 'ctx> MatcherBuilder<'a, &[u8]> { pub fn new_test(mut ctx: SearchContext, query: &'a str) -> Self { From 47f6a3ad3df3b1e6beff46821d4f0ad906cef1c6 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 6 Apr 2023 15:02:23 +0200 Subject: [PATCH 6/9] Take into account that a logger need the search context --- milli/src/search/mod.rs | 38 ++++++++++++++--------- milli/src/search/new/mod.rs | 61 +++++++++++++++++-------------------- 2 files changed, 52 insertions(+), 47 deletions(-) diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 3e372e551..c4dfdd6b3 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -6,6 +6,7 @@ use roaring::bitmap::RoaringBitmap; pub use self::facet::{FacetDistribution, Filter, DEFAULT_VALUES_PER_FACET}; pub use self::new::matches::{FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWords}; +use 
self::new::PartialSearchResult; use crate::{ execute_search, AscDesc, DefaultSearchLogger, DocumentId, Index, Result, SearchContext, }; @@ -106,20 +107,29 @@ impl<'a> Search<'a> { } pub fn execute(&self) -> Result { - let ctx = SearchContext::new(self.index, self.rtxn); - execute_search( - ctx, - &self.query, - self.terms_matching_strategy, - self.exhaustive_number_hits, - &self.filter, - &self.sort_criteria, - self.offset, - self.limit, - Some(self.words_limit), - &mut DefaultSearchLogger, - &mut DefaultSearchLogger, - ) + let mut ctx = SearchContext::new(self.index, self.rtxn); + let PartialSearchResult { located_query_terms, candidates, documents_ids } = + execute_search( + &mut ctx, + &self.query, + self.terms_matching_strategy, + self.exhaustive_number_hits, + &self.filter, + &self.sort_criteria, + self.offset, + self.limit, + Some(self.words_limit), + &mut DefaultSearchLogger, + &mut DefaultSearchLogger, + )?; + + // consume context and located_query_terms to build MatchingWords. + let matching_words = match located_query_terms { + Some(located_query_terms) => MatchingWords::new(ctx, located_query_terms), + None => MatchingWords::default(), + }; + + Ok(SearchResult { matching_words, candidates, documents_ids }) } } diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 0bb454c06..fc4d3b64c 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -28,7 +28,7 @@ use interner::DedupInterner; pub use logger::detailed::DetailedSearchLogger; pub use logger::{DefaultSearchLogger, SearchLogger}; use query_graph::{QueryGraph, QueryNode}; -use query_term::{located_query_terms_from_string, Phrase, QueryTerm}; +use query_term::{located_query_terms_from_string, LocatedQueryTerm, Phrase, QueryTerm}; use ranking_rules::{bucket_sort, PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait}; use resolve_query_graph::PhraseDocIdsCache; use roaring::RoaringBitmap; @@ -39,10 +39,7 @@ use self::ranking_rules::{BoxRankingRule, 
RankingRule}; use self::resolve_query_graph::compute_query_graph_docids; use self::sort::Sort; use crate::search::new::distinct::apply_distinct_rule; -use crate::{ - AscDesc, Filter, Index, MatchingWords, Member, Result, SearchResult, TermsMatchingStrategy, - UserError, -}; +use crate::{AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError}; /// A structure used throughout the execution of a search query. pub struct SearchContext<'ctx> { @@ -54,6 +51,7 @@ pub struct SearchContext<'ctx> { pub term_interner: Interner, pub phrase_docids: PhraseDocIdsCache, } + impl<'ctx> SearchContext<'ctx> { pub fn new(index: &'ctx Index, txn: &'ctx RoTxn<'ctx>) -> Self { Self { @@ -271,7 +269,7 @@ fn resolve_sort_criteria<'ctx, Query: RankingRuleQueryTrait>( #[allow(clippy::too_many_arguments)] pub fn execute_search( - mut ctx: SearchContext, + ctx: &mut SearchContext, query: &Option, terms_matching_strategy: TermsMatchingStrategy, exhaustive_number_hits: bool, @@ -282,11 +280,11 @@ pub fn execute_search( words_limit: Option, placeholder_search_logger: &mut dyn SearchLogger, query_graph_logger: &mut dyn SearchLogger, -) -> Result { +) -> Result { let mut universe = if let Some(filters) = filters { - filters.evaluate(&mut ctx.txn, &mut ctx.index)? + filters.evaluate(ctx.txn, ctx.index)? } else { - ctx.index.documents_ids(&mut ctx.txn)? + ctx.index.documents_ids(ctx.txn)? }; let mut located_query_terms = None; @@ -294,12 +292,12 @@ pub fn execute_search( // We make sure that the analyzer is aware of the stop words // this ensures that the query builder is able to properly remove them. 
let mut tokbuilder = TokenizerBuilder::new(); - let stop_words = &mut ctx.index.stop_words(&mut ctx.txn)?; + let stop_words = ctx.index.stop_words(ctx.txn)?; if let Some(ref stop_words) = stop_words { tokbuilder.stop_words(stop_words); } - let script_lang_map = &mut ctx.index.script_language(&mut ctx.txn)?; + let script_lang_map = ctx.index.script_language(ctx.txn)?; if !script_lang_map.is_empty() { tokbuilder.allow_list(&script_lang_map); } @@ -307,31 +305,28 @@ pub fn execute_search( let tokenizer = tokbuilder.build(); let tokens = tokenizer.tokenize(query); - let query_terms = located_query_terms_from_string(&mut ctx, tokens, words_limit)?; - let graph = QueryGraph::from_query(&mut ctx, &query_terms)?; + let query_terms = located_query_terms_from_string(ctx, tokens, words_limit)?; + let graph = QueryGraph::from_query(ctx, &query_terms)?; located_query_terms = Some(query_terms); - check_sort_criteria(&mut ctx, sort_criteria.as_ref())?; + check_sort_criteria(ctx, sort_criteria.as_ref())?; universe = resolve_maximally_reduced_query_graph( - &mut ctx, + ctx, &universe, &graph, terms_matching_strategy, query_graph_logger, )?; - let ranking_rules = get_ranking_rules_for_query_graph_search( - &mut ctx, - sort_criteria, - terms_matching_strategy, - )?; + let ranking_rules = + get_ranking_rules_for_query_graph_search(ctx, sort_criteria, terms_matching_strategy)?; - bucket_sort(&mut ctx, ranking_rules, &graph, &universe, from, length, query_graph_logger)? + bucket_sort(ctx, ranking_rules, &graph, &universe, from, length, query_graph_logger)? } else { - let ranking_rules = get_ranking_rules_for_placeholder_search(&mut ctx, sort_criteria)?; + let ranking_rules = get_ranking_rules_for_placeholder_search(ctx, sort_criteria)?; bucket_sort( - &mut ctx, + ctx, ranking_rules, &PlaceholderQuery, &universe, @@ -345,20 +340,14 @@ pub fn execute_search( // is requested and a distinct attribute is set. 
let mut candidates = universe; if exhaustive_number_hits { - if let Some(f) = &mut ctx.index.distinct_field(&mut ctx.txn)? { - if let Some(distinct_fid) = ctx.index.fields_ids_map(&mut ctx.txn)?.id(f) { - candidates = apply_distinct_rule(&mut ctx, distinct_fid, &candidates)?.remaining; + if let Some(f) = ctx.index.distinct_field(ctx.txn)? { + if let Some(distinct_fid) = ctx.index.fields_ids_map(ctx.txn)?.id(f) { + candidates = apply_distinct_rule(ctx, distinct_fid, &candidates)?.remaining; } } } - // consume context and located_query_terms to build MatchingWords. - let matching_words = match located_query_terms { - Some(located_query_terms) => MatchingWords::new(ctx, located_query_terms), - None => MatchingWords::default(), - }; - - Ok(SearchResult { matching_words, candidates, documents_ids }) + Ok(PartialSearchResult { located_query_terms, candidates, documents_ids }) } fn check_sort_criteria(ctx: &SearchContext, sort_criteria: Option<&Vec>) -> Result<()> { @@ -402,3 +391,9 @@ fn check_sort_criteria(ctx: &SearchContext, sort_criteria: Option<&Vec> Ok(()) } + +pub struct PartialSearchResult { + pub located_query_terms: Option>, + pub candidates: RoaringBitmap, + pub documents_ids: Vec, +} From 1ba8a40d61ffa363a7c08ec084c863c8780970d6 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 6 Apr 2023 15:10:16 +0200 Subject: [PATCH 7/9] Remove formating benchmark because they can't be isoloated easily anymore --- benchmarks/Cargo.toml | 4 -- benchmarks/benches/formatting.rs | 67 -------------------------------- 2 files changed, 71 deletions(-) delete mode 100644 benchmarks/benches/formatting.rs diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index 5203a7601..f0ed054df 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -48,7 +48,3 @@ harness = false [[bench]] name = "indexing" harness = false - -[[bench]] -name = "formatting" -harness = false diff --git a/benchmarks/benches/formatting.rs b/benchmarks/benches/formatting.rs deleted file mode 
100644 index 2e0fa0ce7..000000000 --- a/benchmarks/benches/formatting.rs +++ /dev/null @@ -1,67 +0,0 @@ -use std::rc::Rc; - -use criterion::{criterion_group, criterion_main}; -use milli::tokenizer::TokenizerBuilder; -use milli::{FormatOptions, MatcherBuilder, MatchingWord, MatchingWords}; - -#[global_allocator] -static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc; - -struct Conf<'a> { - name: &'a str, - text: &'a str, - matching_words: MatcherBuilder<'a, Vec>, -} - -fn bench_formatting(c: &mut criterion::Criterion) { - #[rustfmt::skip] - let confs = &[ - Conf { - name: "'the door d'", - text: r#"He used to do the door sounds in "Star Trek" with his mouth, phssst, phssst. The MD-11 passenger and cargo doors also tend to behave like electromagnetic apertures, because the doors do not have continuous electrical contact with the door frames around the door perimeter. But Theodor said that the doors don't work."#, - matching_words: MatcherBuilder::new(MatchingWords::new(vec![ - (vec![Rc::new(MatchingWord::new("t".to_string(), 0, false).unwrap()), Rc::new(MatchingWord::new("he".to_string(), 0, false).unwrap())], vec![0]), - (vec![Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap())], vec![0]), - (vec![Rc::new(MatchingWord::new("door".to_string(), 1, false).unwrap())], vec![1]), - (vec![Rc::new(MatchingWord::new("do".to_string(), 0, false).unwrap()), Rc::new(MatchingWord::new("or".to_string(), 0, false).unwrap())], vec![0]), - (vec![Rc::new(MatchingWord::new("thedoor".to_string(), 1, false).unwrap())], vec![0, 1]), - (vec![Rc::new(MatchingWord::new("d".to_string(), 0, true).unwrap())], vec![2]), - (vec![Rc::new(MatchingWord::new("thedoord".to_string(), 1, true).unwrap())], vec![0, 1, 2]), - (vec![Rc::new(MatchingWord::new("doord".to_string(), 1, true).unwrap())], vec![1, 2]), - ] - ).unwrap(), TokenizerBuilder::default().build()), - }, - ]; - - let format_options = &[ - FormatOptions { highlight: false, crop: None }, - FormatOptions { highlight: true, crop: 
None }, - FormatOptions { highlight: false, crop: Some(10) }, - FormatOptions { highlight: true, crop: Some(10) }, - FormatOptions { highlight: false, crop: Some(20) }, - FormatOptions { highlight: true, crop: Some(20) }, - ]; - - for option in format_options { - let highlight = if option.highlight { "highlight" } else { "no-highlight" }; - - let name = match option.crop { - Some(size) => format!("{}-crop({})", highlight, size), - None => format!("{}-no-crop", highlight), - }; - - let mut group = c.benchmark_group(&name); - for conf in confs { - group.bench_function(conf.name, |b| { - b.iter(|| { - let mut matcher = conf.matching_words.build(conf.text); - matcher.format(*option); - }) - }); - } - group.finish(); - } -} - -criterion_group!(benches, bench_formatting); -criterion_main!(benches); From ba8dcc2d78dc6aca5adfdee3be9ef0804a37b372 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 6 Apr 2023 15:50:47 +0200 Subject: [PATCH 8/9] Fix clippy --- milli/src/search/new/matches/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 2b87963ab..8dded0cab 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -503,7 +503,7 @@ mod tests { use super::*; use crate::SearchContext; - impl<'a, 'ctx> MatcherBuilder<'a, &[u8]> { + impl<'a> MatcherBuilder<'a, &[u8]> { pub fn new_test(mut ctx: SearchContext, query: &'a str) -> Self { let tokenizer = TokenizerBuilder::new().build(); let tokens = tokenizer.tokenize(query); From f7e7f438f89e40890fb3f2964c239ec609a0e508 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Thu, 6 Apr 2023 17:22:31 +0200 Subject: [PATCH 9/9] Patch prefix match --- milli/src/search/new/matches/matching_words.rs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/milli/src/search/new/matches/matching_words.rs b/milli/src/search/new/matches/matching_words.rs index 4ca04884a..d5d1b6906 100644 --- 
a/milli/src/search/new/matches/matching_words.rs +++ b/milli/src/search/new/matches/matching_words.rs @@ -20,6 +20,7 @@ pub struct LocatedMatchingWords { pub value: Vec>, pub positions: RangeInclusive, pub is_prefix: bool, + pub original_char_count: usize, } /// Structure created from a query tree @@ -101,10 +102,12 @@ impl MatchingWords { positions: located_term.positions.clone(), }); } + words.push(LocatedMatchingWords { value: matching_words, positions: located_term.positions.clone(), is_prefix: term.is_prefix, + original_char_count: ctx.word_interner.get(term.original).chars().count(), }); } @@ -131,7 +134,11 @@ impl MatchingWords { let word = self.word_interner.get(*word); // if the word is a prefix we match using starts_with. if located_words.is_prefix && token.lemma().starts_with(word) { - let char_len = token.original_lengths(word.len()).0; + let Some((char_index, c)) = word.char_indices().take(located_words.original_char_count).last() else { + continue; + }; + let prefix_length = char_index + c.len_utf8(); + let char_len = token.original_lengths(prefix_length).0; let ids = &located_words.positions; return Some(MatchType::Full { char_len, ids }); // else we exact match the token.