Add some tests and fix some corner cases

2025-01-18 17:11:15 +08:00 · 2022-03-29 14:51:02 +02:00 · 2022-03-29 14:51:02 +02:00 · 4428cb5909
commit 4428cb5909
parent 844f546a8b
1 changed files with 109 additions and 9 deletions
--- a/milli/src/search/matches/mod.rs
+++ b/milli/src/search/matches/mod.rs
@ -158,9 +158,13 @@ impl<'t> Matcher<'t, '_> {
        let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0);
        let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0);

-        // TODO: buggy if no match and fisrt token is a sepparator
+        // TODO: buggy if no match and first token is a separator
        let mut remaining_words =
-            self.crop_size + first_match_word_position - last_match_word_position - 1;
+            self.crop_size + first_match_word_position - last_match_word_position;
+        // if the first token is a word, then subtract 1 from remaining_words.
+        if let Some(None) = self.tokens.get(first_match_token_position).map(|t| t.is_separator()) {
+            remaining_words -= 1;
+        }
        let mut first_token_position = first_match_token_position;
        let mut last_token_position = last_match_token_position;

@ -204,18 +208,21 @@ impl<'t> Matcher<'t, '_> {
                        }
                    }
                }
+                // the end of the text is reached, advance left.
                (Some(ft), None) => {
                    first_token_position -= 1;
                    if ft.is_separator().is_none() {
                        remaining_words -= 1;
                    }
                }
+                // the start of the text is reached, advance right.
                (None, Some(lt)) => {
                    last_token_position += 1;
                    if lt.is_separator().is_none() {
                        remaining_words -= 1;
                    }
                }
+                // no more tokens to add.
                (None, None) => break,
            }
        }
@ -263,13 +270,14 @@ impl<'t> Matcher<'t, '_> {

    fn find_best_match_interval<'a>(&self, matches: &'a [Match]) -> &'a [Match] {
        if matches.len() > 1 {
-            let mut best_interval = (0, 1);
-            let mut best_interval_score = self.match_interval_score(&matches[0..=1]);
+            let mut best_interval = (0, 0);
+            let mut best_interval_score = self.match_interval_score(&matches[0..=0]);
            let mut interval_first = 0;
-            let mut interval_last = 1;
-            for (index, next_match) in matches.iter().enumerate().skip(2) {
+            let mut interval_last = 0;
+            for (index, next_match) in matches.iter().enumerate().skip(1) {
                // if next match would make interval gross more than crop_size
-                if next_match.word_position - matches[interval_first].word_position > self.crop_size
+                if next_match.word_position - matches[interval_first].word_position
+                    >= self.crop_size
                {
                    let interval_score =
                        self.match_interval_score(&matches[interval_first..=interval_last]);
@ -282,7 +290,7 @@ impl<'t> Matcher<'t, '_> {

                    // advance start of the interval while interval is longer than crop_size
                    while next_match.word_position - matches[interval_first].word_position
-                        > self.crop_size
+                        >= self.crop_size
                    {
                        interval_first += 1;
                    }
@ -307,10 +315,15 @@ impl<'t> Matcher<'t, '_> {

        let (first_token_position, last_token_position) = self.token_crop_bounds(match_interval);

-        (self.tokens[first_token_position].byte_start, self.tokens[last_token_position].byte_end)
+        let byte_start = self.tokens.get(first_token_position).map_or(0, |t| t.byte_start);
+        let byte_end = self.tokens.get(last_token_position).map_or(byte_start, |t| t.byte_end);
+        (byte_start, byte_end)
    }

    pub fn format(&mut self, highlight: bool, crop: bool) -> Cow<'t, str> {
+        // If crop_size is 0, it is treated as null and the field is not cropped.
+        // https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
+        let crop = crop && self.crop_size > 0;
        if !highlight && !crop {
            // compute matches is not needed if no highlight or crop is requested.
            Cow::Borrowed(self.text)
@ -444,6 +457,20 @@ mod tests {
        let highlight = true;
        let crop = false;

+        // empty text.
+        let text = "";
+        let analyzed = analyzer.analyze(&text);
+        let tokens: Vec<_> = analyzed.tokens().collect();
+        let mut matcher = builder.build(&tokens[..], text);
+        assert_eq!(&matcher.format(highlight, crop), "");
+
+        // text containing only separators.
+        let text = ":-)";
+        let analyzed = analyzer.analyze(&text);
+        let tokens: Vec<_> = analyzed.tokens().collect();
+        let mut matcher = builder.build(&tokens[..], text);
+        assert_eq!(&matcher.format(highlight, crop), ":-)");
+
        // Text without any match.
        let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
        let analyzed = analyzer.analyze(&text);
@ -482,6 +509,20 @@ mod tests {
        let highlight = false;
        let crop = true;

+        // empty text.
+        let text = "";
+        let analyzed = analyzer.analyze(&text);
+        let tokens: Vec<_> = analyzed.tokens().collect();
+        let mut matcher = builder.build(&tokens[..], text);
+        assert_eq!(&matcher.format(highlight, crop), "");
+
+        // text containing only separators.
+        let text = ":-)";
+        let analyzed = analyzer.analyze(&text);
+        let tokens: Vec<_> = analyzed.tokens().collect();
+        let mut matcher = builder.build(&tokens[..], text);
+        assert_eq!(&matcher.format(highlight, crop), ":-)");
+
        // Text without any match.
        let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
        let analyzed = analyzer.analyze(&text);
@ -493,6 +534,17 @@ mod tests {
            "A quick brown fox can not jump 32 feet, right? …"
        );

+        // Text without any match, starting with a separator.
+        let text = "(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)";
+        let analyzed = analyzer.analyze(&text);
+        let tokens: Vec<_> = analyzed.tokens().collect();
+        let mut matcher = builder.build(&tokens[..], text);
+        // no highlight should return 10 first words with a marker at the end.
+        assert_eq!(
+            &matcher.format(highlight, crop),
+            "(A quick brown fox can not jump 32 feet, right? …"
+        );
+
        // Test phrase propagation
        let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it.";
        let analyzed = analyzer.analyze(&text);
@ -570,6 +622,20 @@ mod tests {
        let highlight = true;
        let crop = true;

+        // empty text.
+        let text = "";
+        let analyzed = analyzer.analyze(&text);
+        let tokens: Vec<_> = analyzed.tokens().collect();
+        let mut matcher = builder.build(&tokens[..], text);
+        assert_eq!(&matcher.format(highlight, crop), "");
+
+        // text containing only separators.
+        let text = ":-)";
+        let analyzed = analyzer.analyze(&text);
+        let tokens: Vec<_> = analyzed.tokens().collect();
+        let mut matcher = builder.build(&tokens[..], text);
+        assert_eq!(&matcher.format(highlight, crop), ":-)");
+
        // Text without any match.
        let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
        let analyzed = analyzer.analyze(&text);
@ -611,4 +677,38 @@ mod tests {
            "…void void void void void <em>split</em> <em>the</em> <em>world</em> void void"
        );
    }
+
+    #[test]
+    fn smaller_crop_size() {
+        //! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
+        let query_tree = query_tree();
+
+        let mut builder = MatcherBuilder::from_query_tree(&query_tree);
+        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
+
+        let highlight = false;
+        let crop = true;
+
+        let text = "void void split the world void void.";
+        let analyzed = analyzer.analyze(&text);
+        let tokens: Vec<_> = analyzed.tokens().collect();
+
+        // shrink the crop size to 2 words.
+        builder.crop_size(2);
+        let mut matcher = builder.build(&tokens[..], text);
+        // because crop size < query size, only part of the matches is kept in the cropped output.
+        assert_eq!(&matcher.format(highlight, crop), "…split the …");
+
+        // shrink the crop size further, to a single word.
+        builder.crop_size(1);
+        let mut matcher = builder.build(&tokens[..], text);
+        // because crop size < query size, only the first matched word is kept.
+        assert_eq!(&matcher.format(highlight, crop), "…split …");
+
+        // set the crop size to 0.
+        builder.crop_size(0);
+        let mut matcher = builder.build(&tokens[..], text);
+        // because crop size is 0, cropping is disabled and the whole text is returned.
+        assert_eq!(&matcher.format(highlight, crop), "void void split the world void void.");
+    }
 }