426: Fix search highlight for non-unicode chars r=ManyTheFish a=Samyak2

# Pull Request

## What does this PR do?
Fixes https://github.com/meilisearch/MeiliSearch/issues/1480

## PR checklist
Please check if your PR fulfills the following requirements:
- [x] Does this PR fix an existing issue?
- [x] Have you read the contributing guidelines?
- [x] Have you made sure that the title is accurate and descriptive of the changes?

## Changes

The `matching_bytes` function now takes a `&Token` and:
- gets the number of bytes to highlight (unchanged).
- uses `Token.num_chars_from_bytes` to turn that byte count into the number of grapheme clusters to highlight.

In essence, the `matching_bytes` function now returns the number of matching grapheme clusters instead of bytes.
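
To illustrate the conversion, here is a minimal standalone sketch; `chars_from_bytes` is a hypothetical helper that mirrors what `Token.num_chars_from_bytes` does for a token's text:

```rust
/// Hypothetical helper: convert a matched *byte* length into a matched
/// *character* count, so a highlight never ends in the middle of a
/// multi-byte character.
fn chars_from_bytes(text: &str, num_bytes: usize) -> usize {
    text.char_indices().take_while(|(byte_idx, _)| *byte_idx < num_bytes).count()
}

fn main() {
    // In "héllo", "é" is 2 bytes in UTF-8, so byte and char counts diverge:
    // the first 3 bytes cover "hé", which is only 2 chars.
    assert_eq!(chars_from_bytes("héllo", 3), 2);
    assert_eq!(chars_from_bytes("hello", 3), 3);
}
```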

Added proper highlighting in the HTTP UI (see the sketch after this list):
- requires a dependency on `unicode-segmentation` to extract grapheme clusters from tokens
- the `<mark>` tag is put around only the matched part
    - before this change, the entire word was highlighted even if only a part of it matched
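
As a rough illustration of the grapheme-aware highlighting (a sketch, not the actual http-ui code; it assumes `unicode-segmentation = "1"` in `Cargo.toml`, and `highlight_prefix` is an invented name):

```rust
use unicode_segmentation::UnicodeSegmentation;

/// Wrap only the first `n` grapheme clusters of `word` in <mark> tags,
/// leaving the unmatched suffix outside the highlight.
fn highlight_prefix(word: &str, n: usize) -> String {
    let mut graphemes = word.graphemes(true);
    let mut out = String::from("<mark>");
    // the matched part goes inside the tag...
    out.extend(graphemes.by_ref().take(n));
    out.push_str("</mark>");
    // ...and the rest of the word stays outside it
    out.extend(graphemes);
    out
}

fn main() {
    // only the "word" prefix of "wordsplit" matched
    assert_eq!(highlight_prefix("wordsplit", 4), "<mark>word</mark>split");
}
```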

## Questions

Since `matching_bytes` no longer returns a byte count but a grapheme-cluster count, should it be renamed to something like `matching_chars` or `matching_graphemes`? Would that break the API?

Thank you very much `@ManyTheFish` for helping 😄 

Co-authored-by: Samyak S Sarnayak <samyak201@gmail.com>
commit 4c516c00da (bors[bot], 2022-01-17 13:39:00 +00:00, committed by GitHub)
4 changed files with 101 additions and 21 deletions

http-ui/Cargo.toml

@@ -10,7 +10,7 @@ anyhow = "1.0.38"
 byte-unit = { version = "4.0.9", default-features = false, features = ["std"] }
 crossbeam-channel = "0.5.0"
 heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" }
-meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.6" }
+meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.7" }
 memmap2 = "0.5.0"
 milli = { path = "../milli" }
 once_cell = "1.5.2"

http-ui/src/main.rs

@@ -160,13 +160,19 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
                 let analyzed = self.analyzer.analyze(&old_string);
                 for (word, token) in analyzed.reconstruct() {
                     if token.is_word() {
-                        let to_highlight = matching_words.matching_bytes(token.text()).is_some();
-                        if to_highlight {
-                            string.push_str("<mark>")
-                        }
-                        string.push_str(word);
-                        if to_highlight {
-                            string.push_str("</mark>")
+                        match matching_words.matching_bytes(&token) {
+                            Some(chars_to_highlight) => {
+                                let mut chars = word.chars();
+
+                                string.push_str("<mark>");
+                                // push the part to highlight
+                                string.extend(chars.by_ref().take(chars_to_highlight));
+                                string.push_str("</mark>");
+                                // push the suffix after highlight
+                                string.extend(chars);
+                            }
+                            // no highlight
+                            None => string.push_str(word),
                         }
                     } else {
                         string.push_str(word);

milli/Cargo.toml

@@ -22,7 +22,7 @@ heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1", default-fe
 human_format = "1.0.3"
 levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
 linked-hash-map = "0.5.4"
-meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.6" }
+meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.7" }
 memmap2 = "0.5.0"
 obkv = "0.2.0"
 once_cell = "1.5.2"

milli/src/search/matching_words.rs

@@ -3,6 +3,7 @@ use std::collections::{BTreeMap, HashSet};
 use std::ops::{Index, IndexMut};
 
 use levenshtein_automata::{Distance, DFA};
+use meilisearch_tokenizer::Token;
 
 use super::build_dfa;
 use crate::search::query_tree::{Operation, Query};
@@ -33,15 +34,15 @@ impl MatchingWords {
     }
 
     /// Returns the number of matching bytes if the word matches one of the query words.
-    pub fn matching_bytes(&self, word_to_highlight: &str) -> Option<usize> {
+    pub fn matching_bytes(&self, word_to_highlight: &Token) -> Option<usize> {
         self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix)| {
-            match dfa.eval(word_to_highlight) {
+            match dfa.eval(word_to_highlight.text()) {
                 Distance::Exact(t) if t <= *typo => {
                     if *is_prefix {
-                        let len = bytes_to_highlight(word_to_highlight, query_word);
-                        Some(len)
+                        let len = bytes_to_highlight(word_to_highlight.text(), query_word);
+                        Some(word_to_highlight.num_chars_from_bytes(len))
                     } else {
-                        Some(word_to_highlight.len())
+                        Some(word_to_highlight.num_chars_from_bytes(word_to_highlight.text().len()))
                     }
                 }
                 _otherwise => None,
@@ -178,8 +179,11 @@ fn bytes_to_highlight(source: &str, target: &str) -> usize {
 #[cfg(test)]
 mod tests {
     use std::borrow::Cow;
+    use std::str::from_utf8;
+
+    use meilisearch_tokenizer::TokenKind;
 
     use super::*;
     use crate::search::query_tree::{Operation, Query, QueryKind};
     use crate::MatchingWords;
@@ -269,12 +273,82 @@ mod tests {
         let matching_words = MatchingWords::from_query_tree(&query_tree);
 
-        assert_eq!(matching_words.matching_bytes("word"), Some(3));
-        assert_eq!(matching_words.matching_bytes("nyc"), None);
-        assert_eq!(matching_words.matching_bytes("world"), Some(5));
-        assert_eq!(matching_words.matching_bytes("splitted"), Some(5));
-        assert_eq!(matching_words.matching_bytes("thisnew"), None);
-        assert_eq!(matching_words.matching_bytes("borld"), Some(5));
-        assert_eq!(matching_words.matching_bytes("wordsplit"), Some(4));
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("word"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "word".len(),
+                char_map: None,
+            }),
+            Some(3)
+        );
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("nyc"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "nyc".len(),
+                char_map: None,
+            }),
+            None
+        );
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("world"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "world".len(),
+                char_map: None,
+            }),
+            Some(5)
+        );
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("splitted"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "splitted".len(),
+                char_map: None,
+            }),
+            Some(5)
+        );
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("thisnew"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "thisnew".len(),
+                char_map: None,
+            }),
+            None
+        );
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("borld"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "borld".len(),
+                char_map: None,
+            }),
+            Some(5)
+        );
+        assert_eq!(
+            matching_words.matching_bytes(&Token {
+                kind: TokenKind::Word,
+                word: Cow::Borrowed("wordsplit"),
+                byte_start: 0,
+                char_index: 0,
+                byte_end: "wordsplit".len(),
+                char_map: None,
+            }),
+            Some(4)
+        );
     }
 }