From 8221c94e7f5666c73944cc5f57211a0eb4035b59 Mon Sep 17 00:00:00 2001 From: "F. Levi" <55688616+flevi29@users.noreply.github.com> Date: Thu, 3 Oct 2024 15:37:51 +0300 Subject: [PATCH] Split into multiple files, refactor --- .../search/new/matches/best_match_interval.rs | 139 ++++++++++ milli/src/search/new/matches/match.rs | 62 +++++ milli/src/search/new/matches/mod.rs | 244 +----------------- .../search/new/matches/simple_token_kind.rs | 15 ++ 4 files changed, 230 insertions(+), 230 deletions(-) create mode 100644 milli/src/search/new/matches/best_match_interval.rs create mode 100644 milli/src/search/new/matches/match.rs create mode 100644 milli/src/search/new/matches/simple_token_kind.rs diff --git a/milli/src/search/new/matches/best_match_interval.rs b/milli/src/search/new/matches/best_match_interval.rs new file mode 100644 index 000000000..a6497f351 --- /dev/null +++ b/milli/src/search/new/matches/best_match_interval.rs @@ -0,0 +1,139 @@ +use super::matching_words::WordId; +use super::{Match, MatchPosition}; + +struct MatchIntervalWithScore { + interval: [usize; 2], + score: [i16; 3], +} + +// count score for phrases +fn tally_phrase_scores(fwp: &usize, lwp: &usize, order_score: &mut i16, distance_score: &mut i16) { + let words_in_phrase_minus_one = (lwp - fwp) as i16; + // will always be ordered, so +1 for each space between words + *order_score += words_in_phrase_minus_one; + // distance will always be 1, so -1 for each space between words + *distance_score -= words_in_phrase_minus_one; +} + +/// Compute the score of a match interval: +/// 1) count unique matches +/// 2) calculate distance between matches +/// 3) count ordered matches +fn get_interval_score(matches: &[Match]) -> [i16; 3] { + let mut ids: Vec = Vec::with_capacity(matches.len()); + let mut order_score = 0; + let mut distance_score = 0; + + let mut iter = matches.iter().peekable(); + while let Some(m) = iter.next() { + if let Some(next_match) = iter.peek() { + // if matches are ordered + if next_match.ids.iter().min() > m.ids.iter().min() { + order_score += 1; + } + + let m_last_word_pos = match m.position { + MatchPosition::Word { word_position, .. } => word_position, + MatchPosition::Phrase { word_positions: [fwp, lwp], .. } => { + tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score); + lwp + } + }; + let next_match_first_word_pos = next_match.get_first_word_pos(); + + // compute distance between matches + distance_score -= (next_match_first_word_pos - m_last_word_pos).min(7) as i16; + } else if let MatchPosition::Phrase { word_positions: [fwp, lwp], .. } = m.position { + // in case last match is a phrase, count score for its words + tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score); + } + + ids.extend(m.ids.iter()); + } + + ids.sort_unstable(); + ids.dedup(); + let uniq_score = ids.len() as i16; + + // rank by unique match count, then by distance between matches, then by ordered match count. + [uniq_score, distance_score, order_score] +} + +/// Returns the first and last match where the score computed by match_interval_score is the best. +pub fn find_best_match_interval(matches: &[Match], crop_size: usize) -> [&Match; 2] { + if matches.is_empty() { + panic!("`matches` should not be empty at this point"); + } + + // positions of the first and the last match of the best matches interval in `matches`. + let mut best_interval: Option = None; + + let mut save_best_interval = |interval_first, interval_last| { + let interval_score = get_interval_score(&matches[interval_first..=interval_last]); + let is_interval_score_better = &best_interval + .as_ref() + .map_or(true, |MatchIntervalWithScore { score, .. }| interval_score > *score); + + if *is_interval_score_better { + best_interval = Some(MatchIntervalWithScore { + interval: [interval_first, interval_last], + score: interval_score, + }); + } + }; + + // we compute the matches interval if we have at least 2 matches. + // current interval positions. + let mut interval_first = 0; + let mut interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos(); + + for (index, next_match) in matches.iter().enumerate() { + // if next match would make interval gross more than crop_size, + // we compare the current interval with the best one, + // then we increase `interval_first` until next match can be added. + let next_match_last_word_pos = next_match.get_last_word_pos(); + + // if the next match would mean that we pass the crop size window, + // we take the last valid match, that didn't pass this boundry, which is `index` - 1, + // and calculate a score for it, and check if it's better than our best so far + if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size { + // if index is 0 there is no last viable match + if index != 0 { + let interval_last = index - 1; + // keep interval if it's the best + save_best_interval(interval_first, interval_last); + } + + // advance start of the interval while interval is longer than crop_size. + loop { + interval_first += 1; + if interval_first == matches.len() { + interval_first -= 1; + break; + } + + interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos(); + + if interval_first_match_first_word_pos > next_match_last_word_pos + || next_match_last_word_pos - interval_first_match_first_word_pos < crop_size + { + break; + } + } + } + } + + // compute the last interval score and compare it to the best one. + let interval_last = matches.len() - 1; + // if it's the last match with itself, we need to make sure it's + // not a phrase longer than the crop window + if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size { + save_best_interval(interval_first, interval_last); + } + + // if none of the matches fit the criteria above, default to the first one + best_interval.map_or( + [&matches[0], &matches[0]], + |MatchIntervalWithScore { interval: [first, last], .. }| [&matches[first], &matches[last]], + ) +} diff --git a/milli/src/search/new/matches/match.rs b/milli/src/search/new/matches/match.rs new file mode 100644 index 000000000..cc08b006c --- /dev/null +++ b/milli/src/search/new/matches/match.rs @@ -0,0 +1,62 @@ +use super::matching_words::WordId; + +#[derive(Clone, Debug)] +pub enum MatchPosition { + Word { + // position of the word in the whole text. + word_position: usize, + // position of the token in the whole text. + token_position: usize, + }, + Phrase { + // position of the first and last word in the phrase in the whole text. + word_positions: [usize; 2], + // position of the first and last token in the phrase in the whole text. + token_positions: [usize; 2], + }, +} + +#[derive(Clone, Debug)] +pub struct Match { + pub match_len: usize, + // ids of the query words that matches. + pub ids: Vec, + pub position: MatchPosition, +} + +impl Match { + pub(super) fn get_first_word_pos(&self) -> usize { + match self.position { + MatchPosition::Word { word_position, .. } => word_position, + MatchPosition::Phrase { word_positions: [fwp, _], .. } => fwp, + } + } + + pub(super) fn get_last_word_pos(&self) -> usize { + match self.position { + MatchPosition::Word { word_position, .. } => word_position, + MatchPosition::Phrase { word_positions: [_, lwp], .. } => lwp, + } + } + + pub(super) fn get_first_token_pos(&self) -> usize { + match self.position { + MatchPosition::Word { token_position, .. } => token_position, + MatchPosition::Phrase { token_positions: [ftp, _], .. } => ftp, + } + } + + pub(super) fn get_last_token_pos(&self) -> usize { + match self.position { + MatchPosition::Word { token_position, .. } => token_position, + MatchPosition::Phrase { token_positions: [_, ltp], .. } => ltp, + } + } + + pub(super) fn get_word_count(&self) -> usize { + match self.position { + MatchPosition::Word { .. } => 1, + MatchPosition::Phrase { word_positions: [fwp, lwp], .. } => lwp - fwp + 1, + } + } +} diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index f8d60ef54..3df361702 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -1,12 +1,16 @@ -use std::borrow::Cow; +mod best_match_interval; +mod r#match; +mod matching_words; +mod simple_token_kind; -use charabia::{Language, SeparatorKind, Token, TokenKind, Tokenizer}; +use charabia::{Language, SeparatorKind, Token, Tokenizer}; use either::Either; pub use matching_words::MatchingWords; -use matching_words::{MatchType, PartialMatch, WordId}; +use matching_words::{MatchType, PartialMatch}; +use r#match::{Match, MatchPosition}; use serde::Serialize; - -pub mod matching_words; +use simple_token_kind::SimpleTokenKind; +use std::borrow::Cow; const DEFAULT_CROP_MARKER: &str = "…"; const DEFAULT_HIGHLIGHT_PREFIX: &str = ""; @@ -94,228 +98,12 @@ impl FormatOptions { } } -#[derive(Clone, Debug)] -pub enum MatchPosition { - Word { - // position of the word in the whole text. - word_position: usize, - // position of the token in the whole text. - token_position: usize, - }, - Phrase { - // position of the first and last word in the phrase in the whole text. - word_positions: (usize, usize), - // position of the first and last token in the phrase in the whole text. - token_positions: (usize, usize), - }, -} - -#[derive(Clone, Debug)] -pub struct Match { - match_len: usize, - // ids of the query words that matches. - ids: Vec, - position: MatchPosition, -} - -impl Match { - fn get_first_word_pos(&self) -> usize { - match self.position { - MatchPosition::Word { word_position, .. } => word_position, - MatchPosition::Phrase { word_positions: (fwp, _), .. } => fwp, - } - } - - fn get_last_word_pos(&self) -> usize { - match self.position { - MatchPosition::Word { word_position, .. } => word_position, - MatchPosition::Phrase { word_positions: (_, lwp), .. } => lwp, - } - } - - fn get_first_token_pos(&self) -> usize { - match self.position { - MatchPosition::Word { token_position, .. } => token_position, - MatchPosition::Phrase { token_positions: (ftp, _), .. } => ftp, - } - } - - fn get_last_token_pos(&self) -> usize { - match self.position { - MatchPosition::Word { token_position, .. } => token_position, - MatchPosition::Phrase { token_positions: (_, ltp), .. } => ltp, - } - } - - fn get_word_count(&self) -> usize { - match self.position { - MatchPosition::Word { .. } => 1, - MatchPosition::Phrase { word_positions: (fwp, lwp), .. } => lwp - fwp + 1, - } - } -} - #[derive(Serialize, Debug, Clone, PartialEq, Eq)] pub struct MatchBounds { pub start: usize, pub length: usize, } -enum SimpleTokenKind { - Separator(SeparatorKind), - NotSeparator, -} - -impl SimpleTokenKind { - fn new(token: &&Token<'_>) -> Self { - match token.kind { - TokenKind::Separator(separaor_kind) => Self::Separator(separaor_kind), - _ => Self::NotSeparator, - } - } -} - -#[derive(PartialEq, PartialOrd)] -struct MatchIntervalScore(i16, i16, i16); - -impl MatchIntervalScore { - /// Compute the score of a match interval: - /// 1) count unique matches - /// 2) calculate distance between matches - /// 3) count ordered matches - fn new(matches: &[Match]) -> Self { - let mut ids: Vec = Vec::with_capacity(matches.len()); - let mut order_score = 0; - let mut distance_score = 0; - - // count score for phrases - fn tally_phrase_scores( - fwp: &usize, - lwp: &usize, - order_score: &mut i16, - distance_score: &mut i16, - ) { - let words_in_phrase_minus_one = (lwp - fwp) as i16; - // will always be ordered, so +1 for each space between words - *order_score += words_in_phrase_minus_one; - // distance will always be 1, so -1 for each space between words - *distance_score -= words_in_phrase_minus_one; - } - - let mut iter = matches.iter().peekable(); - while let Some(m) = iter.next() { - if let Some(next_match) = iter.peek() { - // if matches are ordered - if next_match.ids.iter().min() > m.ids.iter().min() { - order_score += 1; - } - - let m_last_word_pos = match m.position { - MatchPosition::Word { word_position, .. } => word_position, - MatchPosition::Phrase { word_positions: (fwp, lwp), .. } => { - tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score); - lwp - } - }; - let next_match_first_word_pos = next_match.get_first_word_pos(); - - // compute distance between matches - distance_score -= (next_match_first_word_pos - m_last_word_pos).min(7) as i16; - } else if let MatchPosition::Phrase { word_positions: (fwp, lwp), .. } = m.position { - // in case last match is a phrase, count score for its words - tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score); - } - - ids.extend(m.ids.iter()); - } - - ids.sort_unstable(); - ids.dedup(); - let uniq_score = ids.len() as i16; - - // rank by unique match count, then by distance between matches, then by ordered match count. - Self(uniq_score, distance_score, order_score) - } -} - -struct MatchIntervalWithScore { - interval: (usize, usize), - score: MatchIntervalScore, -} - -impl MatchIntervalWithScore { - /// Returns the matches interval where the score computed by match_interval_score is the best. - fn find_best_match_interval(matches: &[Match], crop_size: usize) -> &[Match] { - if matches.len() <= 1 { - return matches; - } - - // positions of the first and the last match of the best matches interval in `matches`. - let mut best_interval: Option = None; - - let mut save_best_interval = |interval_first, interval_last| { - let interval_score = MatchIntervalScore::new(&matches[interval_first..=interval_last]); - let is_interval_score_better = - &best_interval.as_ref().map_or(true, |Self { score, .. }| interval_score > *score); - - if *is_interval_score_better { - best_interval = - Some(Self { interval: (interval_first, interval_last), score: interval_score }); - } - }; - - // we compute the matches interval if we have at least 2 matches. - // current interval positions. - let mut interval_first = 0; - let mut interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos(); - - for (index, next_match) in matches.iter().enumerate() { - // if next match would make interval gross more than crop_size, - // we compare the current interval with the best one, - // then we increase `interval_first` until next match can be added. - let next_match_last_word_pos = next_match.get_last_word_pos(); - - // if the next match would mean that we pass the crop size window, - // we take the last valid match, that didn't pass this boundry, which is `index` - 1, - // and calculate a score for it, and check if it's better than our best so far - if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size { - // if index is 0 there is no last viable match - if index != 0 { - let interval_last = index - 1; - // keep interval if it's the best - save_best_interval(interval_first, interval_last); - } - - // advance start of the interval while interval is longer than crop_size. - loop { - interval_first += 1; - interval_first_match_first_word_pos = - matches[interval_first].get_first_word_pos(); - - if interval_first_match_first_word_pos > next_match_last_word_pos - || next_match_last_word_pos - interval_first_match_first_word_pos - < crop_size - { - break; - } - } - } - } - - // compute the last interval score and compare it to the best one. - let interval_last = matches.len() - 1; - // if it's the last match with itself, we need to make sure it's - // not a phrase longer than the crop window - if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size { - save_best_interval(interval_first, interval_last); - } - - // if none of the matches fit the criteria above, default to the first one - let best_interval = best_interval.map_or((0, 0), |v| v.interval); - &matches[best_interval.0..=best_interval.1] - } -} - /// Structure used to analyze a string, compute words that match, /// and format the source string, returning a highlighted and cropped sub-string. pub struct Matcher<'t, 'tokenizer, 'b, 'lang> { @@ -355,8 +143,8 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { match_len: word.char_end - *first_word_char_start, ids: ids.clone().collect(), position: MatchPosition::Phrase { - word_positions: (first_word_position, word_position), - token_positions: (first_token_position, token_position), + word_positions: [first_word_position, word_position], + token_positions: [first_token_position, token_position], }, }); @@ -450,15 +238,14 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { matches: &[Match], crop_size: usize, ) -> (usize, usize) { - // if there is no match, we start from the beginning of the string by default. let ( mut remaining_words, is_iterating_forward, before_tokens_starting_index, after_tokens_starting_index, ) = if !matches.is_empty() { - let matches_first = matches.first().unwrap(); - let matches_last = matches.last().unwrap(); + let [matches_first, matches_last] = + best_match_interval::find_best_match_interval(matches, crop_size); let matches_size = matches_last.get_last_word_pos() - matches_first.get_first_word_pos() + 1; @@ -600,9 +387,6 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { // crop around the best interval. let (byte_start, byte_end) = match format_options.crop { Some(crop_size) if crop_size > 0 => { - let matches = MatchIntervalWithScore::find_best_match_interval( - matches, crop_size, - ); self.crop_bounds(tokens, matches, crop_size) } _ => (0, self.text.len()), @@ -625,7 +409,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { let token = &tokens[token_position]; (&token.byte_start, &token.byte_end) } - MatchPosition::Phrase { token_positions: (ftp, ltp), .. } => { + MatchPosition::Phrase { token_positions: [ftp, ltp], .. } => { (&tokens[ftp].byte_start, &tokens[ltp].byte_end) } }; diff --git a/milli/src/search/new/matches/simple_token_kind.rs b/milli/src/search/new/matches/simple_token_kind.rs new file mode 100644 index 000000000..b34a8c985 --- /dev/null +++ b/milli/src/search/new/matches/simple_token_kind.rs @@ -0,0 +1,15 @@ +use charabia::{SeparatorKind, Token, TokenKind}; + +pub enum SimpleTokenKind { + Separator(SeparatorKind), + NotSeparator, +} + +impl SimpleTokenKind { + pub fn new(token: &&Token<'_>) -> Self { + match token.kind { + TokenKind::Separator(separaor_kind) => Self::Separator(separaor_kind), + _ => Self::NotSeparator, + } + } +}