Fix search highlight for non-ASCII chars

The `matching_bytes` function now takes a `&Token` and:
- gets the number of bytes to highlight (unchanged);
- uses `Token.num_graphemes_from_bytes` to get the number of grapheme clusters to highlight.

In essence, the `matching_bytes` function now returns the number of matching grapheme clusters instead of bytes. Should this function be renamed then?

Added proper highlighting in the HTTP UI:
- requires a dependency on `unicode-segmentation` to extract grapheme clusters from tokens
- the `<mark>` tag is put around only the matched part
- before this change, the entire word was highlighted even if only a part of it matched
2024-11-27 04:25:06 +08:00 · 2021-12-17 22:53:34 +05:30 · 2021-12-17 22:53:34 +05:30 · 30247d70cd
commit 30247d70cd
parent 559e019de1
3 changed files with 26 additions and 12 deletions
--- a/http-ui/Cargo.toml
+++ b/http-ui/Cargo.toml
@@ -17,6 +17,7 @@ once_cell = "1.5.2"
 rayon = "1.5.0"
 structopt = { version = "0.3.21", default-features = false, features = ["wrap_help"] }
 tempfile = "3.2.0"
+unicode-segmentation = "1.6.0"

 # http server
 askama = "0.10.5"
--- a/http-ui/src/main.rs
+++ b/http-ui/src/main.rs
@@ -34,6 +34,7 @@ use structopt::StructOpt;
 use tokio::fs::File as TFile;
 use tokio::io::AsyncWriteExt;
 use tokio::sync::broadcast;
+use unicode_segmentation::UnicodeSegmentation;
 use warp::filters::ws::Message;
 use warp::http::Response;
 use warp::Filter;
@@ -160,13 +161,21 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
                let analyzed = self.analyzer.analyze(&old_string);
                for (word, token) in analyzed.reconstruct() {
                    if token.is_word() {
-                        let to_highlight = matching_words.matching_bytes(token.text()).is_some();
-                        if to_highlight {
-                            string.push_str("<mark>")
-                        }
+                        let chars_to_highlight = matching_words.matching_bytes(&token).unwrap_or(0);
+                        if chars_to_highlight > 0 {
+                            let graphemes = word.graphemes(true);
+                            let chars = graphemes.clone().into_iter();
+
+                            string.push_str("<mark>");
+                            string.push_str(
+                                chars.take(chars_to_highlight).collect::<String>().as_str(),
+                            );
+                            string.push_str("</mark>");
+
+                            let chars = graphemes.into_iter().skip(chars_to_highlight);
+                            string.push_str(chars.collect::<String>().as_str());
+                        } else {
                            string.push_str(word);
-                        if to_highlight {
-                            string.push_str("</mark>")
                        }
                    } else {
                        string.push_str(word);
--- a/milli/src/search/matching_words.rs
+++ b/milli/src/search/matching_words.rs
@@ -3,6 +3,7 @@ use std::collections::{BTreeMap, HashSet};
 use std::ops::{Index, IndexMut};

 use levenshtein_automata::{Distance, DFA};
+use meilisearch_tokenizer::Token;

 use super::build_dfa;
 use crate::search::query_tree::{Operation, Query};
@@ -33,15 +34,18 @@ impl MatchingWords {
    }

    /// Returns the number of matching bytes if the word matches one of the query words.
-    pub fn matching_bytes(&self, word_to_highlight: &str) -> Option<usize> {
+    pub fn matching_bytes(&self, word_to_highlight: &Token) -> Option<usize> {
        self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix)| {
-            match dfa.eval(word_to_highlight) {
+            match dfa.eval(word_to_highlight.text()) {
                Distance::Exact(t) if t <= *typo => {
                    if *is_prefix {
-                        let len = bytes_to_highlight(word_to_highlight, query_word);
-                        Some(len)
+                        let len = bytes_to_highlight(word_to_highlight.text(), query_word);
+                        Some(word_to_highlight.num_graphemes_from_bytes(len))
                    } else {
-                        Some(word_to_highlight.len())
+                        Some(
+                            word_to_highlight
+                                .num_graphemes_from_bytes(word_to_highlight.text().len()),
+                        )
                    }
                }
                _otherwise => None,