Refactor, handle more cases for phrases

2024-11-22 10:07:40 +08:00 · 2024-09-30 21:24:41 +03:00 · 2024-09-30 21:24:41 +03:00 · eabc14c268
commit eabc14c268
parent 00ccf53ffa
2 changed files with 291 additions and 208 deletions
--- a/milli/src/search/new/matches/matching_words.rs
+++ b/milli/src/search/new/matches/matching_words.rs
@ -181,7 +181,7 @@ impl<'a> PartialMatch<'a> {
        // return a new Partial match allowing the highlighter to continue.
        if is_matching && matching_words.len() > 1 {
            matching_words.remove(0);
-            Some(MatchType::Partial(PartialMatch { matching_words, ids, char_len }))
+            Some(MatchType::Partial(Self { matching_words, ids, char_len }))
        // if there is no remaining word to match in the phrase and the current token is matching,
        // return a Full match.
        } else if is_matching {
--- a/milli/src/search/new/matches/mod.rs
+++ b/milli/src/search/new/matches/mod.rs
@ -1,6 +1,6 @@
 use std::borrow::Cow;

-use charabia::{Language, SeparatorKind, Token, Tokenizer};
+use charabia::{Language, SeparatorKind, Token, TokenKind, Tokenizer};
 pub use matching_words::MatchingWords;
 use matching_words::{MatchType, PartialMatch, WordId};
 use serde::Serialize;
@ -145,6 +145,13 @@ impl Match {
            MatchPosition::Phrase { token_positions: (_, ltp), .. } => ltp,
        }
    }
+
+    /// Returns how many words this match spans: `1` for a single-word match,
+    /// or the inclusive distance between the first and last word positions of a phrase.
+    fn get_word_count(&self) -> usize {
+        match self.position {
+            MatchPosition::Phrase { word_positions: (first_word_pos, last_word_pos), .. } => {
+                last_word_pos - first_word_pos + 1
+            }
+            MatchPosition::Word { .. } => 1,
+        }
+    }
 }

 #[derive(Serialize, Debug, Clone, PartialEq, Eq)]
@ -153,6 +160,27 @@ pub struct MatchBounds {
    pub length: usize,
 }

+/// Simplified view of a token's kind: either a separator (together with its
+/// `SeparatorKind`) or anything that is not a separator.
+enum SimpleTokenKind {
+    /// The token is a separator of the given kind.
+    Separator(SeparatorKind),
+    /// The token is of any kind other than a separator.
+    NotSeparator,
+}
+
+impl SimpleTokenKind {
+    /// Classifies `token` by its `kind`: `TokenKind::Separator` keeps its separator kind,
+    /// every other token kind collapses to `NotSeparator`.
+    ///
+    /// The double reference matches the item yielded by `peek()` on an iterator over
+    /// token references, so this function can be passed directly to `Option::map`.
+    fn get(token: &&Token<'_>) -> Self {
+        match token.kind {
+            TokenKind::Separator(separator_kind) => Self::Separator(separator_kind),
+            _ => Self::NotSeparator,
+        }
+    }
+
+    /// Returns `true` when this kind is `NotSeparator`, `false` for any separator.
+    fn is_not_separator(&self) -> bool {
+        matches!(self, Self::NotSeparator)
+    }
+}
+
 /// Structure used to analyze a string, compute words that match,
 /// and format the source string, returning a highlighted and cropped sub-string.
 pub struct Matcher<'t, 'tokenizer, 'b, 'lang> {
@ -287,43 +315,53 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
        crop_size: usize,
    ) -> (usize, usize) {
        // if there is no match, we start from the beginning of the string by default.
-        let first_match_word_position =
+        let first_match_first_word_position =
            matches.first().map(|m| m.get_first_word_pos()).unwrap_or(0);
-        let first_match_token_position =
+        let first_match_first_token_position =
            matches.first().map(|m| m.get_first_token_pos()).unwrap_or(0);
-        let last_match_word_position = matches.last().map(|m| m.get_last_word_pos()).unwrap_or(0);
-        let last_match_token_position = matches.last().map(|m| m.get_last_token_pos()).unwrap_or(0);
+        let last_match_last_word_position =
+            matches.last().map(|m| m.get_last_word_pos()).unwrap_or(0);
+        let last_match_last_token_position =
+            matches.last().map(|m| m.get_last_token_pos()).unwrap_or(0);

+        let matches_window_len =
+            last_match_last_word_position - first_match_first_word_position + 1;
+
+        if crop_size >= matches_window_len {
            // matches needs to be counted in the crop len.
-        let mut remaining_words = crop_size + first_match_word_position - last_match_word_position;
+            let mut remaining_words = crop_size - matches_window_len;

            // create the initial state of the crop window: 2 iterators starting from the matches positions,
            // a reverse iterator starting from the first match token position and going towards the beginning of the text,
-        let mut before_tokens = tokens[..first_match_token_position].iter().rev().peekable();
+            let mut before_tokens =
+                tokens[..first_match_first_token_position].iter().rev().peekable();
            // an iterator starting from the last match token position and going towards the end of the text.
-        let mut after_tokens = tokens[last_match_token_position..].iter().peekable();
+            let mut after_tokens = tokens[last_match_last_token_position + 1..].iter().peekable();

            // grows the crop window peeking in both directions
            // until the window contains the good number of words:
            while remaining_words > 0 {
-            let before_token = before_tokens.peek().map(|t| t.separator_kind());
-            let after_token = after_tokens.peek().map(|t| t.separator_kind());
+                let before_token_kind = before_tokens.peek().map(SimpleTokenKind::get);
+                let after_token_kind = after_tokens.peek().map(SimpleTokenKind::get);

-            match (before_token, after_token) {
+                match (before_token_kind, after_token_kind) {
                    // we can expand both sides.
-                (Some(before_token), Some(after_token)) => {
-                    match (before_token, after_token) {
+                    (Some(before_token_kind), Some(after_token_kind)) => {
+                        match (before_token_kind, after_token_kind) {
                            // if they are both separators and are the same kind then advance both,
                            // or expand on the soft separator side.
-                        (Some(before_token_kind), Some(after_token_kind)) => {
-                            if before_token_kind == after_token_kind {
+                            (
+                                SimpleTokenKind::Separator(before_token_separator_kind),
+                                SimpleTokenKind::Separator(after_token_separator_kind),
+                            ) => {
+                                if before_token_separator_kind == after_token_separator_kind {
                                    before_tokens.next();

                                    // this avoids having an ending separator before crop marker.
                                    if remaining_words > 1 {
                                        after_tokens.next();
                                    }
-                            } else if before_token_kind == SeparatorKind::Hard {
+                                } else if let SeparatorKind::Hard = before_token_separator_kind {
                                    after_tokens.next();
                                } else {
                                    before_tokens.next();
@ -331,17 +369,17 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                            }
                            // if one of the tokens is a word, we expand on the side of the word.
                            // left is a word, advance left.
-                        (None, Some(_)) => {
+                            (SimpleTokenKind::NotSeparator, SimpleTokenKind::Separator(_)) => {
                                before_tokens.next();
                                remaining_words -= 1;
                            }
                            // right is a word, advance right.
-                        (Some(_), None) => {
+                            (SimpleTokenKind::Separator(_), SimpleTokenKind::NotSeparator) => {
                                after_tokens.next();
                                remaining_words -= 1;
                            }
                            // both are words, advance left then right if remaining_word > 0.
-                        (None, None) => {
+                            (SimpleTokenKind::NotSeparator, SimpleTokenKind::NotSeparator) => {
                                before_tokens.next();
                                remaining_words -= 1;

@ -353,16 +391,16 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                        }
                    }
                    // the end of the text is reached, advance left.
-                (Some(before_token), None) => {
+                    (Some(before_token_kind), None) => {
                        before_tokens.next();
-                    if before_token.is_none() {
+                        if let SimpleTokenKind::NotSeparator = before_token_kind {
                            remaining_words -= 1;
                        }
                    }
                    // the start of the text is reached, advance right.
-                (None, Some(after_token)) => {
+                    (None, Some(after_token_kind)) => {
                        after_tokens.next();
-                    if after_token.is_none() {
+                        if let SimpleTokenKind::NotSeparator = after_token_kind {
                            remaining_words -= 1;
                        }
                    }
@ -376,6 +414,31 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
            let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start);

            (crop_byte_start, crop_byte_end)
+        } else {
+            // the matches span more words than the crop window (e.g. a single long phrase match),
+            // so we have to shrink the window inward, dropping words from its end
+            let mut remaining_extra_words = matches_window_len - crop_size;
+            let mut tokens_from_end =
+                tokens[..=last_match_last_token_position].iter().rev().peekable();
+
+            while remaining_extra_words > 0 {
+                let token_from_end_kind =
+                    tokens_from_end.peek().map(SimpleTokenKind::get).expect("TODO");
+                if token_from_end_kind.is_not_separator() {
+                    remaining_extra_words -= 1;
+                }
+
+                tokens_from_end.next();
+            }
+
+            let crop_byte_start = if first_match_first_token_position > 0 {
+                &tokens[first_match_first_token_position - 1].byte_end
+            } else {
+                &0
+            };
+            let crop_byte_end = tokens_from_end.next().map(|t| t.byte_start).expect("TODO");
+
+            (*crop_byte_start, crop_byte_end)
+        }
    }

    /// Compute the score of a match interval:
@ -416,11 +479,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                        lwp
                    }
                };
-
-                let next_match_first_word_pos = match next_match.position {
-                    MatchPosition::Word { word_position, .. } => word_position,
-                    MatchPosition::Phrase { word_positions: (fwp, _), .. } => fwp,
-                };
+                let next_match_first_word_pos = next_match.get_first_word_pos();

                // compute distance between matches
                distance_score -= (next_match_first_word_pos - m_last_word_pos).min(7) as i16;
@ -443,72 +502,96 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
    /// Returns the matches interval where the score computed by match_interval_score is the best.
    fn find_best_match_interval<'a>(&self, matches: &'a [Match], crop_size: usize) -> &'a [Match] {
        let matches_len = matches.len();
+        if matches_len <= 1 {
+            return matches;
+        }
+
+        // positions of the first and the last match of the best matches interval in `matches`.
+        struct BestInterval {
+            interval: (usize, usize),
+            score: (i16, i16, i16),
+        }
+
+        fn save_best_interval(
+            best_interval: &mut Option<BestInterval>,
+            interval_first: usize,
+            interval_last: usize,
+            interval_score: (i16, i16, i16),
+        ) {
+            if let Some(best_interval) = best_interval {
+                if interval_score > best_interval.score {
+                    best_interval.interval = (interval_first, interval_last);
+                    best_interval.score = interval_score;
+                }
+            } else {
+                *best_interval = Some(BestInterval {
+                    interval: (interval_first, interval_last),
+                    score: interval_score,
+                });
+            }
+        }
+
+        let mut best_interval: Option<BestInterval> = None;

        // we compute the matches interval if we have at least 2 matches.
-        if matches_len > 1 {
        // current interval positions.
        let mut interval_first = 0;
-            // positions of the first and the last match of the best matches interval in `matches`.
-            let mut best_interval = (0, 0);
-            let mut best_interval_score = self.match_interval_score(&matches[0..=0]);
-
-            let mut index = 1;
-            while index < matches_len - 1 {
-                let next_match = &matches[index];
+        let mut interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos();

+        for (index, next_match) in matches.iter().enumerate() {
            // if next match would make interval grow more than crop_size,
            // we compare the current interval with the best one,
            // then we increase `interval_first` until next match can be added.
            let next_match_last_word_pos = next_match.get_last_word_pos();
-                let interval_first_match_first_word_pos =
-                    matches[interval_first].get_first_word_pos();

            // if the next match would mean that we pass the crop size window,
            // we take the last valid match, that didn't pass this boundary, which is `index` - 1,
            // and calculate a score for it, and check if it's better than our best so far
            if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size {
-                    // skip for 1, because it would result in the same as our very first interval score
-                    if index != 1 {
+                // if index is 0 there is no last viable match
+                if index != 0 {
                    let interval_last = index - 1;
                    let interval_score =
                        self.match_interval_score(&matches[interval_first..=interval_last]);

                    // keep interval if it's the best
-                        if interval_score > best_interval_score {
-                            best_interval = (interval_first, interval_last);
-                            best_interval_score = interval_score;
-                        }
+                    save_best_interval(
+                        &mut best_interval,
+                        interval_first,
+                        interval_last,
+                        interval_score,
+                    );
                }

                // advance start of the interval while interval is longer than crop_size.
                loop {
                    interval_first += 1;
-                        let interval_first_match_first_word_pos =
+                    interval_first_match_first_word_pos =
                        matches[interval_first].get_first_word_pos();

-                        if next_match_last_word_pos - interval_first_match_first_word_pos
+                    if interval_first_match_first_word_pos > next_match_last_word_pos
+                        || next_match_last_word_pos - interval_first_match_first_word_pos
                            < crop_size
                    {
                        break;
                    }
                }
            }
-
-                index += 1;
        }

        // compute the last interval score and compare it to the best one.
        let interval_last = matches_len - 1;
+        // if it's the last match with itself, we need to make sure it's
+        // not a phrase longer than the crop window
+        if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size {
            let interval_score =
                self.match_interval_score(&matches[interval_first..=interval_last]);
-            if interval_score > best_interval_score {
-                best_interval = (interval_first, interval_last);
+            save_best_interval(&mut best_interval, interval_first, interval_last, interval_score);
        }

+        // if none of the matches fit the criteria above, default to the first one
+        let best_interval = best_interval.map_or((0, 0), |v| v.interval);
        &matches[best_interval.0..=best_interval.1]
-        } else {
-            matches
-        }
    }

    // Returns the formatted version of the original text.
@ -928,98 +1011,98 @@ mod tests {

        let format_options = FormatOptions { highlight: true, crop: Some(10) };

-        // let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\"");
-        // let mut matcher = builder.build(text, None);
-        // // should return 10 words with a marker at the start as well the end, and the highlighted matches.
-        // insta::assert_snapshot!(
-        //     matcher.format(format_options),
-        //     @"…the power to split <em>the world</em> between those who embraced…"
-        // );
+        let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\"");
+        let mut matcher = builder.build(text, None);
+        // should return 10 words with a marker at the start as well as the end, and the highlighted matches.
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…the power to split <em>the world</em> between those who embraced…"
+        );

-        // let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"power to\" \"and those\"");
-        // let mut matcher = builder.build(text, None);
-        // // should highlight "those" and the phrase "and those".
-        // insta::assert_snapshot!(
-        //     matcher.format(format_options),
-        //     @"…groundbreaking invention had the <em>power to</em> split the world between…"
-        // );
-
-        // let builder = MatcherBuilder::new_test(
-        //     &rtxn,
-        //     &temp_index,
-        //     "\"The groundbreaking invention had the power to split the world\"",
-        // );
-        // let mut matcher = builder.build(text, None);
-        // insta::assert_snapshot!(
-        //     matcher.format(format_options),
-        //     @"<em>The groundbreaking invention had the power to split the world</em>…"
-        // );
+        let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"power to\" \"and those\"");
+        let mut matcher = builder.build(text, None);
+        // should highlight "those" and the phrase "and those".
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…groundbreaking invention had the <em>power to</em> split the world between…"
+        );

        let builder = MatcherBuilder::new_test(
            &rtxn,
            &temp_index,
-            "\"The groundbreaking invention had the power to split the world between\"",
+            "\"The groundbreaking invention had the power to split the world\"",
        );
        let mut matcher = builder.build(text, None);
        insta::assert_snapshot!(
            matcher.format(format_options),
-            @"The groundbreaking invention had the power to split the world …"
+            @"<em>The groundbreaking invention had the power to split the world</em>…"
        );

-        // let builder = MatcherBuilder::new_test(
-        //     &rtxn,
-        //     &temp_index,
-        //     "\"The groundbreaking invention\" \"embraced progress and those who resisted change!\"",
-        // );
-        // let mut matcher = builder.build(text, None);
-        // insta::assert_snapshot!(
-        //     matcher.format(format_options),
-        //     @"…between those who <em>embraced progress and those who resisted change</em>…"
-        // );
+        let builder = MatcherBuilder::new_test(
+            &rtxn,
+            &temp_index,
+            "\"The groundbreaking invention had the power to split the world between those\"",
+        );
+        let mut matcher = builder.build(text, None);
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"The groundbreaking invention had the power to split the world…"
+        );

-        // let builder = MatcherBuilder::new_test(
-        //     &rtxn,
-        //     &temp_index,
-        //     "\"The groundbreaking invention\" \"split the world between those\"",
-        // );
-        // let mut matcher = builder.build(text, None);
-        // insta::assert_snapshot!(
-        //     matcher.format(format_options),
-        //     @"…the power to <em>split the world between those</em> who embraced…"
-        // );
+        let builder = MatcherBuilder::new_test(
+            &rtxn,
+            &temp_index,
+            "\"The groundbreaking invention\" \"embraced progress and those who resisted change!\"",
+        );
+        let mut matcher = builder.build(text, None);
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…between those who <em>embraced progress and those who resisted change</em>…"
+        );

-        // let builder = MatcherBuilder::new_test(
-        //     &rtxn,
-        //     &temp_index,
-        //     "\"groundbreaking invention\" \"split the world between\"",
-        // );
-        // let mut matcher = builder.build(text, None);
-        // insta::assert_snapshot!(
-        //     matcher.format(format_options),
-        //     @"…<em>groundbreaking invention</em> had the power to <em>split the world between</em>…"
-        // );
+        let builder = MatcherBuilder::new_test(
+            &rtxn,
+            &temp_index,
+            "\"The groundbreaking invention\" \"split the world between those\"",
+        );
+        let mut matcher = builder.build(text, None);
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…the power to <em>split the world between those</em> who embraced…"
+        );

-        // let builder = MatcherBuilder::new_test(
-        //     &rtxn,
-        //     &temp_index,
-        //     "\"groundbreaking invention\" \"had the power to split the world between those\"",
-        // );
-        // let mut matcher = builder.build(text, None);
-        // insta::assert_snapshot!(
-        //     matcher.format(format_options),
-        //     @"…invention <em>had the power to split the world between those</em>…"
-        // );
+        let builder = MatcherBuilder::new_test(
+            &rtxn,
+            &temp_index,
+            "\"groundbreaking invention\" \"split the world between\"",
+        );
+        let mut matcher = builder.build(text, None);
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…<em>groundbreaking invention</em> had the power to <em>split the world between</em>…"
+        );

-        // let builder = MatcherBuilder::new_test(
-        //     &rtxn,
-        //     &temp_index,
-        //     "\"The groundbreaking invention\" \"had the power to split the world between those\"",
-        // );
-        // let mut matcher = builder.build(text, None);
-        // insta::assert_snapshot!(
-        //     matcher.format(format_options),
-        //     @"…invention <em>had the power to split the world between those</em>…"
-        // );
+        let builder = MatcherBuilder::new_test(
+            &rtxn,
+            &temp_index,
+            "\"groundbreaking invention\" \"had the power to split the world between those\"",
+        );
+        let mut matcher = builder.build(text, None);
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…invention <em>had the power to split the world between those</em>…"
+        );
+
+        let builder = MatcherBuilder::new_test(
+            &rtxn,
+            &temp_index,
+            "\"The groundbreaking invention\" \"had the power to split the world between those\"",
+        );
+        let mut matcher = builder.build(text, None);
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…invention <em>had the power to split the world between those</em>…"
+        );
    }

    #[test]