From 9f62149b94a307de19510abd77e302aee6f9b149 Mon Sep 17 00:00:00 2001
From: many <maxime@meilisearch.com>
Date: Thu, 1 Jul 2021 19:03:28 +0200
Subject: [PATCH] Fix matching length in matching_words

---
 milli/src/search/matching_words.rs | 36 +++++++++++++-----------------
 1 file changed, 16 insertions(+), 20 deletions(-)
diff --git a/milli/src/search/matching_words.rs b/milli/src/search/matching_words.rs
index 291378b43..37754a782 100644
--- a/milli/src/search/matching_words.rs
+++ b/milli/src/search/matching_words.rs
@@ -108,7 +108,8 @@ impl<T> IndexMut<(usize, usize)> for N2Array<T> {
 /// The algorithm is a modified
 /// [Damerau-Levenshtein](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance)
 fn bytes_to_highlight(source: &str, target: &str) -> usize {
-    let (n, m) = (source.chars().count(), target.chars().count());
+    let n = source.chars().count();
+    let m = target.chars().count();
 
     if n == 0 {
         return 0;
@@ -125,11 +126,11 @@ fn bytes_to_highlight(source: &str, target: &str) -> usize {
     let mut matrix = N2Array::new(n + 2, m + 2, 0);
 
     matrix[(0, 0)] = inf;
-    for i in 0..n + 1 {
+    for i in 0..=n {
         matrix[(i + 1, 0)] = inf;
         matrix[(i + 1, 1)] = i;
     }
-    for j in 0..m + 1 {
+    for j in 0..=m {
         matrix[(0, j + 1)] = inf;
         matrix[(1, j + 1)] = j;
     }
@@ -163,16 +164,16 @@ fn bytes_to_highlight(source: &str, target: &str) -> usize {
         last_row.insert(char_s, row);
     }
 
-    let mut minimum = 2;
-    for x in 0..=n {
-        let min_dist = (0..=m).map(|y| matrix[(x + 1, y + 1)]).min().unwrap();
-        if min_dist <= 2 {
-            minimum = x;
+    let mut minimum = (u32::max_value(), 0);
+    for x in 0..=m {
+        let dist = matrix[(n + 1, x + 1)] as u32;
+        if dist < minimum.0 {
+            minimum = (dist, x);
         }
     }
 
     // everything was done characters wise and now we want to returns a number of bytes
-    source.chars().take(minimum).map(|c| c.len_utf8()).sum()
+    source.chars().take(minimum.1).map(|c| c.len_utf8()).sum()
 }
 
 #[cfg(test)]
@@ -208,7 +209,7 @@ mod tests {
             TestBytesToHighlight {
                 query: "Levenstein",
                 text: "Levenshte",
-                length: "Levenstei".len(),
+                length: "Levenste".len(),
             },
             // we get to the end of our word with only two typos at the beginning
             TestBytesToHighlight {
@@ -216,13 +217,8 @@ mod tests {
                 text: "Levenshtein",
                 length: "Bavenshtein".len(),
             },
-            // Since we calculate a distance char by char we are supposed to have only two mistakes
-            // here. That would've not be the case if we were computing the distance bytes per bytes
-            TestBytesToHighlight { query: "Båve", text: "Chiøt", length: "Bå".len() },
-            TestBytesToHighlight { query: "💪🙂🍤", text: "plouf", length: "💪🙂".len() },
-            TestBytesToHighlight { query: "clôu¿i", text: "bloubi", length: "clôu".len() },
             TestBytesToHighlight {
-                query: "Альфа", text: "Альфой", length: "Альфа".len()
+                query: "Альфа", text: "Альфой", length: "Альф".len()
             },
             TestBytesToHighlight {
                 query: "Go💼", text: "Go💼od luck.", length: "Go💼".len()
@@ -240,7 +236,7 @@ mod tests {
         ];
 
         for test in &tests {
-            let length = bytes_to_highlight(test.query, test.text);
+            let length = bytes_to_highlight(test.text, test.query);
             assert_eq!(length, test.length, r#"lenght between: "{}" "{}""#, test.query, test.text);
             assert!(
                 from_utf8(&test.query.as_bytes()[..length]).is_ok(),
@@ -273,12 +269,12 @@ mod tests {
 
         let matching_words = MatchingWords::from_query_tree(&query_tree);
 
-        assert_eq!(matching_words.matching_bytes("word"), Some(4));
+        assert_eq!(matching_words.matching_bytes("word"), Some(3));
         assert_eq!(matching_words.matching_bytes("nyc"), None);
         assert_eq!(matching_words.matching_bytes("world"), Some(5));
-        assert_eq!(matching_words.matching_bytes("splitted"), Some(7));
+        assert_eq!(matching_words.matching_bytes("splitted"), Some(5));
         assert_eq!(matching_words.matching_bytes("thisnew"), None);
         assert_eq!(matching_words.matching_bytes("borld"), Some(5));
-        assert_eq!(matching_words.matching_bytes("wordsplit"), Some(5));
+        assert_eq!(matching_words.matching_bytes("wordsplit"), Some(4));
     }
 }