Refactor, handle more cases for phrases

2024-11-25 19:45:05 +08:00 · 2024-09-30 21:24:41 +03:00 · 2024-09-30 21:24:41 +03:00 · eabc14c268
commit eabc14c268
parent 00ccf53ffa
2 changed files with 291 additions and 208 deletions
--- a/milli/src/search/new/matches/matching_words.rs
+++ b/milli/src/search/new/matches/matching_words.rs
@ -181,7 +181,7 @@ impl<'a> PartialMatch<'a> {
        // return a new Partial match allowing the highlighter to continue.
        if is_matching && matching_words.len() > 1 {
            matching_words.remove(0);
-            Some(MatchType::Partial(PartialMatch { matching_words, ids, char_len }))
+            Some(MatchType::Partial(Self { matching_words, ids, char_len }))
        // if there is no remaining word to match in the phrase and the current token is matching,
        // return a Full match.
        } else if is_matching {
--- a/milli/src/search/new/matches/mod.rs
+++ b/milli/src/search/new/matches/mod.rs
@ -1,6 +1,6 @@
 use std::borrow::Cow;
-use charabia::{Language, SeparatorKind, Token, Tokenizer};
+use charabia::{Language, SeparatorKind, Token, TokenKind, Tokenizer};
 pub use matching_words::MatchingWords;
 use matching_words::{MatchType, PartialMatch, WordId};
 use serde::Serialize;
@ -145,6 +145,13 @@ impl Match {
            MatchPosition::Phrase { token_positions: (_, ltp), .. } => ltp,
        }
    }
    /// Returns how many words this match spans.
    ///
    /// A `Word` position always counts as a single word; a `Phrase` position
    /// counts every word between its first and last word positions, both included.
    fn get_word_count(&self) -> usize {
        match self.position {
            MatchPosition::Phrase { word_positions: (first_word, last_word), .. } => {
                // inclusive range, hence the `+ 1`.
                let span = last_word - first_word;
                span + 1
            }
            MatchPosition::Word { .. } => 1,
        }
    }
 }
 #[derive(Serialize, Debug, Clone, PartialEq, Eq)]
@ -153,6 +160,27 @@ pub struct MatchBounds {
    pub length: usize,
 }
 /// Simplified view of a token's kind: either a separator (carrying its
 /// `SeparatorKind`) or anything that is not a separator.
 enum SimpleTokenKind {
    /// The token is a separator of the wrapped kind.
    Separator(SeparatorKind),
    /// The token is anything other than a separator.
    NotSeparator,
 }
 impl SimpleTokenKind {
    /// Classifies `token` as a separator (keeping its separator kind) or as a
    /// non-separator for every other token kind.
    ///
    /// Takes a `&&Token` so that it can be handed directly to `Option::map`
    /// on the value returned by peeking a peekable iterator over `&Token`.
    fn get(token: &&Token<'_>) -> Self {
        if let TokenKind::Separator(separator_kind) = token.kind {
            Self::Separator(separator_kind)
        } else {
            Self::NotSeparator
        }
    }
    /// Returns `true` when this kind is `NotSeparator`, `false` for any separator.
    fn is_not_separator(&self) -> bool {
        matches!(self, Self::NotSeparator)
    }
 }
 /// Structure used to analyze a string, compute words that match,
 /// and format the source string, returning a highlighted and cropped sub-string.
 pub struct Matcher<'t, 'tokenizer, 'b, 'lang> {
@ -287,95 +315,130 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
        crop_size: usize,
    ) -> (usize, usize) {
        // if there is no match, we start from the beginning of the string by default.
-        let first_match_word_position =
+        let first_match_first_word_position =
            matches.first().map(|m| m.get_first_word_pos()).unwrap_or(0);
-        let first_match_token_position =
+        let first_match_first_token_position =
            matches.first().map(|m| m.get_first_token_pos()).unwrap_or(0);
-        let last_match_word_position = matches.last().map(|m| m.get_last_word_pos()).unwrap_or(0);
+        let last_match_last_word_position =
-        let last_match_token_position = matches.last().map(|m| m.get_last_token_pos()).unwrap_or(0);
+            matches.last().map(|m| m.get_last_word_pos()).unwrap_or(0);
        let last_match_last_token_position =
            matches.last().map(|m| m.get_last_token_pos()).unwrap_or(0);
-        // matches needs to be counted in the crop len.
+        let matches_window_len =
-        let mut remaining_words = crop_size + first_match_word_position - last_match_word_position;
+            last_match_last_word_position - first_match_first_word_position + 1;
-        // create the initial state of the crop window: 2 iterators starting from the matches positions,
+        if crop_size >= matches_window_len {
-        // a reverse iterator starting from the first match token position and going towards the beginning of the text,
+            // matches needs to be counted in the crop len.
-        let mut before_tokens = tokens[..first_match_token_position].iter().rev().peekable();
+            let mut remaining_words = crop_size - matches_window_len;
        // an iterator starting from the last match token position and going towards the end of the text.
        let mut after_tokens = tokens[last_match_token_position..].iter().peekable();
-        // grows the crop window peeking in both directions
+            // create the initial state of the crop window: 2 iterators starting from the matches positions,
-        // until the window contains the good number of words:
+            // a reverse iterator starting from the first match token position and going towards the beginning of the text,
-        while remaining_words > 0 {
+            let mut before_tokens =
-            let before_token = before_tokens.peek().map(|t| t.separator_kind());
+                tokens[..first_match_first_token_position].iter().rev().peekable();
-            let after_token = after_tokens.peek().map(|t| t.separator_kind());
+            // an iterator starting from the last match token position and going towards the end of the text.
            let mut after_tokens = tokens[last_match_last_token_position + 1..].iter().peekable();
-            match (before_token, after_token) {
+            // grows the crop window peeking in both directions
-                // we can expand both sides.
+            // until the window contains the good number of words:
-                (Some(before_token), Some(after_token)) => {
+            while remaining_words > 0 {
-                    match (before_token, after_token) {
+                let before_token_kind = before_tokens.peek().map(SimpleTokenKind::get);
-                        // if they are both separators and are the same kind then advance both,
+                let after_token_kind = after_tokens.peek().map(SimpleTokenKind::get);
                        // or expand in the soft separator separator side.
                        (Some(before_token_kind), Some(after_token_kind)) => {
                            if before_token_kind == after_token_kind {
                                before_tokens.next();
-                                // this avoid having an ending separator before crop marker.
+                match (before_token_kind, after_token_kind) {
-                                if remaining_words > 1 {
+                    // we can expand both sides.
                    (Some(before_token_kind), Some(after_token_kind)) => {
                        match (before_token_kind, after_token_kind) {
                            // if they are both separators and are the same kind then advance both,
                            // or expand on the soft separator side.
                            (
                                SimpleTokenKind::Separator(before_token_separator_kind),
                                SimpleTokenKind::Separator(after_token_separator_kind),
                            ) => {
                                if before_token_separator_kind == after_token_separator_kind {
                                    before_tokens.next();
                                    // this avoid having an ending separator before crop marker.
                                    if remaining_words > 1 {
                                        after_tokens.next();
                                    }
                                } else if let SeparatorKind::Hard = before_token_separator_kind {
                                    after_tokens.next();
                                } else {
                                    before_tokens.next();
                                }
                            } else if before_token_kind == SeparatorKind::Hard {
                                after_tokens.next();
                            } else {
                                before_tokens.next();
                            }
-                        }
+                            // if one of the tokens is a word, we expand on the side of the word.
-                        // if one of the tokens is a word, we expend in the side of the word.
+                            // left is a word, advance left.
-                        // left is a word, advance left.
+                            (SimpleTokenKind::NotSeparator, SimpleTokenKind::Separator(_)) => {
-                        (None, Some(_)) => {
+                                before_tokens.next();
-                            before_tokens.next();
+                                remaining_words -= 1;
-                            remaining_words -= 1;
+                            }
-                        }
+                            // right is a word, advance right.
-                        // right is a word, advance right.
+                            (SimpleTokenKind::Separator(_), SimpleTokenKind::NotSeparator) => {
                        (Some(_), None) => {
                            after_tokens.next();
                            remaining_words -= 1;
                        }
                        // both are words, advance left then right if remaining_word > 0.
                        (None, None) => {
                            before_tokens.next();
                            remaining_words -= 1;
                            if remaining_words > 0 {
                                after_tokens.next();
                                remaining_words -= 1;
                            }
                            // both are words, advance left then right if remaining_word > 0.
                            (SimpleTokenKind::NotSeparator, SimpleTokenKind::NotSeparator) => {
                                before_tokens.next();
                                remaining_words -= 1;
                                if remaining_words > 0 {
                                    after_tokens.next();
                                    remaining_words -= 1;
                                }
                            }
                        }
                    }
-                }
+                    // the end of the text is reached, advance left.
-                // the end of the text is reached, advance left.
+                    (Some(before_token_kind), None) => {
-                (Some(before_token), None) => {
+                        before_tokens.next();
-                    before_tokens.next();
+                        if let SimpleTokenKind::NotSeparator = before_token_kind {
-                    if before_token.is_none() {
+                            remaining_words -= 1;
-                        remaining_words -= 1;
+                        }
                    }
-                }
+                    // the start of the text is reached, advance right.
-                // the start of the text is reached, advance right.
+                    (None, Some(after_token_kind)) => {
-                (None, Some(after_token)) => {
+                        after_tokens.next();
-                    after_tokens.next();
+                        if let SimpleTokenKind::NotSeparator = after_token_kind {
-                    if after_token.is_none() {
+                            remaining_words -= 1;
-                        remaining_words -= 1;
+                        }
                    }
                    // no more token to add.
                    (None, None) => break,
                }
                // no more token to add.
                (None, None) => break,
            }
            // finally, keep the byte index of each bound of the crop window.
            let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end);
            let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start);
            (crop_byte_start, crop_byte_end)
        } else {
            // there's one match? and it's longer than the crop window, so we have to advance inward
            let mut remaining_extra_words = matches_window_len - crop_size;
            let mut tokens_from_end =
                tokens[..=last_match_last_token_position].iter().rev().peekable();
            while remaining_extra_words > 0 {
                let token_from_end_kind =
                    tokens_from_end.peek().map(SimpleTokenKind::get).expect("TODO");
                if token_from_end_kind.is_not_separator() {
                    remaining_extra_words -= 1;
                }
                tokens_from_end.next();
            }
            let crop_byte_start = if first_match_first_token_position > 0 {
                &tokens[first_match_first_token_position - 1].byte_end
            } else {
                &0
            };
            let crop_byte_end = tokens_from_end.next().map(|t| t.byte_start).expect("TODO");
            (*crop_byte_start, crop_byte_end)
        }
        // finally, keep the byte index of each bound of the crop window.
        let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end);
        let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start);
        (crop_byte_start, crop_byte_end)
    }
    /// Compute the score of a match interval:
@ -416,11 +479,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                        lwp
                    }
                };
-
+                let next_match_first_word_pos = next_match.get_first_word_pos();
                let next_match_first_word_pos = match next_match.position {
                    MatchPosition::Word { word_position, .. } => word_position,
                    MatchPosition::Phrase { word_positions: (fwp, _), .. } => fwp,
                };
                // compute distance between matches
                distance_score -= (next_match_first_word_pos - m_last_word_pos).min(7) as i16;
@ -443,72 +502,96 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
    /// Returns the matches interval where the score computed by match_interval_score is the best.
    fn find_best_match_interval<'a>(&self, matches: &'a [Match], crop_size: usize) -> &'a [Match] {
        let matches_len = matches.len();
        if matches_len <= 1 {
            return matches;
        }
        // positions of the first and the last match of the best matches interval in `matches`.
        struct BestInterval {
            interval: (usize, usize),
            score: (i16, i16, i16),
        }
        fn save_best_interval(
            best_interval: &mut Option<BestInterval>,
            interval_first: usize,
            interval_last: usize,
            interval_score: (i16, i16, i16),
        ) {
            if let Some(best_interval) = best_interval {
                if interval_score > best_interval.score {
                    best_interval.interval = (interval_first, interval_last);
                    best_interval.score = interval_score;
                }
            } else {
                *best_interval = Some(BestInterval {
                    interval: (interval_first, interval_last),
                    score: interval_score,
                });
            }
        }
        let mut best_interval: Option<BestInterval> = None;
        // we compute the matches interval if we have at least 2 matches.
-        if matches_len > 1 {
+        // current interval positions.
-            // current interval positions.
+        let mut interval_first = 0;
-            let mut interval_first = 0;
+        let mut interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos();
            // positions of the first and the last match of the best matches interval in `matches`.
            let mut best_interval = (0, 0);
            let mut best_interval_score = self.match_interval_score(&matches[0..=0]);
-            let mut index = 1;
+        for (index, next_match) in matches.iter().enumerate() {
-            while index < matches_len - 1 {
+            // if next match would make interval grow more than crop_size,
-                let next_match = &matches[index];
+            // we compare the current interval with the best one,
            // then we increase `interval_first` until next match can be added.
            let next_match_last_word_pos = next_match.get_last_word_pos();
-                // if next match would make interval gross more than crop_size,
+            // if the next match would mean that we pass the crop size window,
-                // we compare the current interval with the best one,
+            // we take the last valid match, that didn't pass this boundary, which is `index` - 1,
-                // then we increase `interval_first` until next match can be added.
+            // and calculate a score for it, and check if it's better than our best so far
-                let next_match_last_word_pos = next_match.get_last_word_pos();
+            if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size {
-                let interval_first_match_first_word_pos =
+                // if index is 0 there is no last viable match
-                    matches[interval_first].get_first_word_pos();
+                if index != 0 {
                    let interval_last = index - 1;
                    let interval_score =
                        self.match_interval_score(&matches[interval_first..=interval_last]);
-                // if the next match would mean that we pass the crop size window,
+                    // keep interval if it's the best
-                // we take the last valid match, that didn't pass this boundry, which is `index` - 1,
+                    save_best_interval(
-                // and calculate a score for it, and check if it's better than our best so far
+                        &mut best_interval,
-                if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size {
+                        interval_first,
-                    // skip for 1, because it would result in the same as our very first interval score
+                        interval_last,
-                    if index != 1 {
+                        interval_score,
-                        let interval_last = index - 1;
+                    );
                        let interval_score =
                            self.match_interval_score(&matches[interval_first..=interval_last]);
                        // keep interval if it's the best
                        if interval_score > best_interval_score {
                            best_interval = (interval_first, interval_last);
                            best_interval_score = interval_score;
                        }
                    }
                    // advance start of the interval while interval is longer than crop_size.
                    loop {
                        interval_first += 1;
                        let interval_first_match_first_word_pos =
                            matches[interval_first].get_first_word_pos();
                        if next_match_last_word_pos - interval_first_match_first_word_pos
                            < crop_size
                        {
                            break;
                        }
                    }
                }
-                index += 1;
+                // advance start of the interval while interval is longer than crop_size.
-            }
+                loop {
                    interval_first += 1;
                    interval_first_match_first_word_pos =
                        matches[interval_first].get_first_word_pos();
-            // compute the last interval score and compare it to the best one.
+                    if interval_first_match_first_word_pos > next_match_last_word_pos
-            let interval_last = matches_len - 1;
+                        || next_match_last_word_pos - interval_first_match_first_word_pos
                            < crop_size
                    {
                        break;
                    }
                }
            }
        }
        // compute the last interval score and compare it to the best one.
        let interval_last = matches_len - 1;
        // if it's the last match with itself, we need to make sure it's
        // not a phrase longer than the crop window
        if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size {
            let interval_score =
                self.match_interval_score(&matches[interval_first..=interval_last]);
-            if interval_score > best_interval_score {
+            save_best_interval(&mut best_interval, interval_first, interval_last, interval_score);
                best_interval = (interval_first, interval_last);
            }
            &matches[best_interval.0..=best_interval.1]
        } else {
            matches
        }
        // if none of the matches fit the criteria above, default to the first one
        let best_interval = best_interval.map_or((0, 0), |v| v.interval);
        &matches[best_interval.0..=best_interval.1]
    }
    // Returns the formatted version of the original text.
@ -928,98 +1011,98 @@ mod tests {
        let format_options = FormatOptions { highlight: true, crop: Some(10) };
-        // let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\"");
+        let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\"");
-        // let mut matcher = builder.build(text, None);
+        let mut matcher = builder.build(text, None);
-        // // should return 10 words with a marker at the start as well the end, and the highlighted matches.
+        // should return 10 words with a marker at the start as well the end, and the highlighted matches.
-        // insta::assert_snapshot!(
+        insta::assert_snapshot!(
-        //     matcher.format(format_options),
+            matcher.format(format_options),
-        //     @"…the power to split <em>the world</em> between those who embraced…"
+            @"…the power to split <em>the world</em> between those who embraced…"
-        // );
+        );
-        // let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"power to\" \"and those\"");
+        let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"power to\" \"and those\"");
-        // let mut matcher = builder.build(text, None);
+        let mut matcher = builder.build(text, None);
-        // // should highlight "those" and the phrase "and those".
+        // should highlight "those" and the phrase "and those".
-        // insta::assert_snapshot!(
+        insta::assert_snapshot!(
-        //     matcher.format(format_options),
+            matcher.format(format_options),
-        //     @"…groundbreaking invention had the <em>power to</em> split the world between…"
+            @"…groundbreaking invention had the <em>power to</em> split the world between…"
-        // );
+        );
        // let builder = MatcherBuilder::new_test(
        //     &rtxn,
        //     &temp_index,
        //     "\"The groundbreaking invention had the power to split the world\"",
        // );
        // let mut matcher = builder.build(text, None);
        // insta::assert_snapshot!(
        //     matcher.format(format_options),
        //     @"<em>The groundbreaking invention had the power to split the world</em>…"
        // );
        let builder = MatcherBuilder::new_test(
            &rtxn,
            &temp_index,
-            "\"The groundbreaking invention had the power to split the world between\"",
+            "\"The groundbreaking invention had the power to split the world\"",
        );
        let mut matcher = builder.build(text, None);
        insta::assert_snapshot!(
            matcher.format(format_options),
-            @"The groundbreaking invention had the power to split the world …"
+            @"<em>The groundbreaking invention had the power to split the world</em>…"
        );
-        // let builder = MatcherBuilder::new_test(
+        let builder = MatcherBuilder::new_test(
-        //     &rtxn,
+            &rtxn,
-        //     &temp_index,
+            &temp_index,
-        //     "\"The groundbreaking invention\" \"embraced progress and those who resisted change!\"",
+            "\"The groundbreaking invention had the power to split the world between those\"",
-        // );
+        );
-        // let mut matcher = builder.build(text, None);
+        let mut matcher = builder.build(text, None);
-        // insta::assert_snapshot!(
+        insta::assert_snapshot!(
-        //     matcher.format(format_options),
+            matcher.format(format_options),
-        //     @"…between those who <em>embraced progress and those who resisted change</em>…"
+            @"The groundbreaking invention had the power to split the world…"
-        // );
+        );
-        // let builder = MatcherBuilder::new_test(
+        let builder = MatcherBuilder::new_test(
-        //     &rtxn,
+            &rtxn,
-        //     &temp_index,
+            &temp_index,
-        //     "\"The groundbreaking invention\" \"split the world between those\"",
+            "\"The groundbreaking invention\" \"embraced progress and those who resisted change!\"",
-        // );
+        );
-        // let mut matcher = builder.build(text, None);
+        let mut matcher = builder.build(text, None);
-        // insta::assert_snapshot!(
+        insta::assert_snapshot!(
-        //     matcher.format(format_options),
+            matcher.format(format_options),
-        //     @"…the power to <em>split the world between those</em> who embraced…"
+            @"…between those who <em>embraced progress and those who resisted change</em>…"
-        // );
+        );
-        // let builder = MatcherBuilder::new_test(
+        let builder = MatcherBuilder::new_test(
-        //     &rtxn,
+            &rtxn,
-        //     &temp_index,
+            &temp_index,
-        //     "\"groundbreaking invention\" \"split the world between\"",
+            "\"The groundbreaking invention\" \"split the world between those\"",
-        // );
+        );
-        // let mut matcher = builder.build(text, None);
+        let mut matcher = builder.build(text, None);
-        // insta::assert_snapshot!(
+        insta::assert_snapshot!(
-        //     matcher.format(format_options),
+            matcher.format(format_options),
-        //     @"…<em>groundbreaking invention</em> had the power to <em>split the world between</em>…"
+            @"…the power to <em>split the world between those</em> who embraced…"
-        // );
+        );
-        // let builder = MatcherBuilder::new_test(
+        let builder = MatcherBuilder::new_test(
-        //     &rtxn,
+            &rtxn,
-        //     &temp_index,
+            &temp_index,
-        //     "\"groundbreaking invention\" \"had the power to split the world between those\"",
+            "\"groundbreaking invention\" \"split the world between\"",
-        // );
+        );
-        // let mut matcher = builder.build(text, None);
+        let mut matcher = builder.build(text, None);
-        // insta::assert_snapshot!(
+        insta::assert_snapshot!(
-        //     matcher.format(format_options),
+            matcher.format(format_options),
-        //     @"…invention <em>had the power to split the world between those</em>…"
+            @"…<em>groundbreaking invention</em> had the power to <em>split the world between</em>…"
-        // );
+        );
-        // let builder = MatcherBuilder::new_test(
+        let builder = MatcherBuilder::new_test(
-        //     &rtxn,
+            &rtxn,
-        //     &temp_index,
+            &temp_index,
-        //     "\"The groundbreaking invention\" \"had the power to split the world between those\"",
+            "\"groundbreaking invention\" \"had the power to split the world between those\"",
-        // );
+        );
-        // let mut matcher = builder.build(text, None);
+        let mut matcher = builder.build(text, None);
-        // insta::assert_snapshot!(
+        insta::assert_snapshot!(
-        //     matcher.format(format_options),
+            matcher.format(format_options),
-        //     @"…invention <em>had the power to split the world between those</em>…"
+            @"…invention <em>had the power to split the world between those</em>…"
-        // );
+        );
        let builder = MatcherBuilder::new_test(
            &rtxn,
            &temp_index,
            "\"The groundbreaking invention\" \"had the power to split the world between those\"",
        );
        let mut matcher = builder.build(text, None);
        insta::assert_snapshot!(
            matcher.format(format_options),
            @"…invention <em>had the power to split the world between those</em>…"
        );
    }
    #[test]