From 30247d70cd3da8f85c960d14f5af291257328a2e Mon Sep 17 00:00:00 2001 From: Samyak S Sarnayak <samyak201@gmail.com> Date: Fri, 17 Dec 2021 22:53:34 +0530 Subject: [PATCH] Fix search highlight for non-unicode chars The `matching_bytes` function takes a `&Token` now and: - gets the number of bytes to highlight (unchanged). - uses `Token.num_graphemes_from_bytes` to get the number of grapheme clusters to highlight. In essence, the `matching_bytes` function returns the number of matching grapheme clusters instead of bytes. Should this function be renamed then? Added proper highlighting in the HTTP UI: - requires dependency on `unicode-segmentation` to extract grapheme clusters from tokens - `<mark>` tag is put around only the matched part - before this change, the entire word was highlighted even if only a part of it matched --- http-ui/Cargo.toml | 1 + http-ui/src/main.rs | 23 ++++++++++++++++------- milli/src/search/matching_words.rs | 14 +++++++++----- 3 files changed, 26 insertions(+), 12 deletions(-) diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 593dba3e5..79c784fdd 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -17,6 +17,7 @@ once_cell = "1.5.2" rayon = "1.5.0" structopt = { version = "0.3.21", default-features = false, features = ["wrap_help"] } tempfile = "3.2.0" +unicode-segmentation = "1.6.0" # http server askama = "0.10.5" diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 75a9012c6..386f10cb4 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -34,6 +34,7 @@ use structopt::StructOpt; use tokio::fs::File as TFile; use tokio::io::AsyncWriteExt; use tokio::sync::broadcast; +use unicode_segmentation::UnicodeSegmentation; use warp::filters::ws::Message; use warp::http::Response; use warp::Filter; @@ -160,13 +161,21 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { let analyzed = self.analyzer.analyze(&old_string); for (word, token) in analyzed.reconstruct() { if token.is_word() { - let to_highlight = 
matching_words.matching_bytes(token.text()).is_some(); - if to_highlight { - string.push_str("<mark>") - } - string.push_str(word); - if to_highlight { - string.push_str("</mark>") + let chars_to_highlight = matching_words.matching_bytes(&token).unwrap_or(0); + if chars_to_highlight > 0 { + let graphemes = word.graphemes(true); + let chars = graphemes.clone().into_iter(); + + string.push_str("<mark>"); + string.push_str( + chars.take(chars_to_highlight).collect::<String>().as_str(), + ); + string.push_str("</mark>"); + + let chars = graphemes.into_iter().skip(chars_to_highlight); + string.push_str(chars.collect::<String>().as_str()); + } else { + string.push_str(word); } } else { string.push_str(word); diff --git a/milli/src/search/matching_words.rs b/milli/src/search/matching_words.rs index 37754a782..b22335658 100644 --- a/milli/src/search/matching_words.rs +++ b/milli/src/search/matching_words.rs @@ -3,6 +3,7 @@ use std::collections::{BTreeMap, HashSet}; use std::ops::{Index, IndexMut}; use levenshtein_automata::{Distance, DFA}; +use meilisearch_tokenizer::Token; use super::build_dfa; use crate::search::query_tree::{Operation, Query}; @@ -33,15 +34,18 @@ impl MatchingWords { } /// Returns the number of matching bytes if the word matches one of the query words. - pub fn matching_bytes(&self, word_to_highlight: &str) -> Option<usize> { + pub fn matching_bytes(&self, word_to_highlight: &Token) -> Option<usize> { self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix)| { - match dfa.eval(word_to_highlight) { + match dfa.eval(word_to_highlight.text()) { Distance::Exact(t) if t <= *typo => { if *is_prefix { - let len = bytes_to_highlight(word_to_highlight, query_word); - Some(len) + let len = bytes_to_highlight(word_to_highlight.text(), query_word); + Some(word_to_highlight.num_graphemes_from_bytes(len)) } else { - Some(word_to_highlight.len()) + Some( + word_to_highlight + .num_graphemes_from_bytes(word_to_highlight.text().len()), + ) } } _otherwise => None,