From acfe31151ebfbd6a7d8e86eaf2878d9652d0a16e Mon Sep 17 00:00:00 2001 From: many Date: Thu, 1 Jul 2021 14:49:22 +0200 Subject: [PATCH] Hotfix panic for unicode characters When the highlight bound is in the middle of a character or if we are out of bounds, we highlight the complete matching word. note: we should enhance the tokenizer and the Highlighter to match char indices. Fix #1368 --- meilisearch-http/src/index/search.rs | 87 ++++++++++++++++++++++++++-- 1 file changed, 81 insertions(+), 6 deletions(-) diff --git a/meilisearch-http/src/index/search.rs b/meilisearch-http/src/index/search.rs index 2d8095559..55fa75938 100644 --- a/meilisearch-http/src/index/search.rs +++ b/meilisearch-http/src/index/search.rs @@ -580,13 +580,23 @@ impl<'a, A: AsRef<[u8]>> Formatter<'a, A> { // Matcher::match since the call is expensive. if format_options.highlight && token.is_word() { if let Some(length) = matcher.matches(token.text()) { - if format_options.highlight { - out.push_str(&self.marks.0); - out.push_str(&word[..length]); - out.push_str(&self.marks.1); - out.push_str(&word[length..]); - return out; + match word.get(..length).zip(word.get(length..)) { + Some((head, tail)) => { + out.push_str(&self.marks.0); + out.push_str(head); + out.push_str(&self.marks.1); + out.push_str(tail); + } + // if the match length falls in the middle of a character + // or is out of the word bounds, the word cannot be split: + // we highlight the complete word. + None => { + out.push_str(&self.marks.0); + out.push_str(&word); + out.push_str(&self.marks.1); + } } + return out; } } out.push_str(word); @@ -741,6 +751,71 @@ mod test { assert_eq!(value["author"], "J. R. R. 
Tolkien"); } + /// Regression test for https://github.com/meilisearch/MeiliSearch/issues/1368: formatting a highlighted word that contains a multi-byte character. NOTE(review): both highlight marks are empty strings as written, so the title assertion cannot show where the marks land - confirm the intended marks. + #[test] + fn formatted_with_highlight_in_unicode_word() { + let stop_words = fst::Set::default(); + let mut config = AnalyzerConfig::default(); + config.stop_words(&stop_words); + let analyzer = Analyzer::new(config); + let formatter = Formatter::new(&analyzer, (String::from(""), String::from(""))); + + let mut fields = FieldsIdsMap::new(); + let title = fields.insert("title").unwrap(); + let author = fields.insert("author").unwrap(); + + let mut buf = Vec::new(); + let mut obkv = obkv::KvWriter::new(&mut buf); + obkv.insert( + title, + Value::String("Go💼od luck.".into()).to_string().as_bytes(), + ) + .unwrap(); + obkv.finish().unwrap(); + obkv = obkv::KvWriter::new(&mut buf); + obkv.insert( + author, + Value::String("JacobLey".into()).to_string().as_bytes(), + ) + .unwrap(); + obkv.finish().unwrap(); + + let obkv = obkv::KvReader::new(&buf); + + let mut formatted_options = BTreeMap::new(); + formatted_options.insert( + title, + FormatOptions { + highlight: true, + crop: None, + }, + ); + formatted_options.insert( + author, + FormatOptions { + highlight: false, + crop: None, + }, + ); + + let mut matching_words = BTreeMap::new(); + // emojis are deunicoded during tokenization, so the matching word spells out the emoji name and its match length (11) is presumably larger than the 8 bytes of the original "Go💼od" token - verify against the tokenizer + // TODO Tokenizer should remove spaces after deunicode + matching_words.insert("gobriefcase od", Some(11)); + + let value = format_fields( + &fields, + obkv, + &formatter, + &matching_words, + &formatted_options, + ) + .unwrap(); + + assert_eq!(value["title"], "Go💼od luck."); + assert_eq!(value["author"], "JacobLey"); + } + #[test] fn formatted_with_crop_2() { let stop_words = fst::Set::default();