From edcb4c60ba0bc416152bdfd931598bfa0df87467 Mon Sep 17 00:00:00 2001 From: "F. Levi" <55688616+flevi29@users.noreply.github.com> Date: Thu, 12 Sep 2024 09:44:37 +0300 Subject: [PATCH] Change Matcher so that phrases are counted as one instead of word by word --- milli/src/search/new/matches/mod.rs | 45 +++++++++++------------------ 1 file changed, 17 insertions(+), 28 deletions(-) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 4688b8f32..6ddb81c6a 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -132,37 +132,21 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { mut partial: PartialMatch<'a>, token_position: usize, word_position: usize, + first_word_char_start: &usize, words_positions: &mut impl Iterator)>, matches: &mut Vec, ) -> bool { - let mut potential_matches = vec![(token_position, word_position, partial.char_len())]; - - for (token_position, word_position, word) in words_positions { + for (_, _, word) in words_positions { partial = match partial.match_token(word) { // token matches the partial match, but the match is not full, // we temporarily save the current token then we try to match the next one. - Some(MatchType::Partial(partial)) => { - potential_matches.push((token_position, word_position, partial.char_len())); - partial - } + Some(MatchType::Partial(partial)) => partial, // partial match is now full, we keep this matches and we advance positions - Some(MatchType::Full { char_len, ids }) => { - let ids: Vec<_> = ids.clone().collect(); - // save previously matched tokens as matches. - let iter = potential_matches.into_iter().map( - |(token_position, word_position, match_len)| Match { - match_len, - ids: ids.clone(), - word_position, - token_position, - }, - ); - matches.extend(iter); - + Some(MatchType::Full { ids, .. }) => { // save the token that closes the partial match as a match. matches.push(Match { - match_len: char_len, - ids, + match_len: word.char_end - first_word_char_start, + ids: ids.clone().collect(), word_position, token_position, }); @@ -221,6 +205,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { partial, token_position, word_position, + &word.char_start, &mut wp, &mut matches, ) { @@ -472,15 +457,17 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { .enumerate() .find(|(i, _)| *i == m.match_len) .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start); + formatted.push(self.highlight_prefix); formatted.push(&self.text[token.byte_start..highlight_byte_index]); formatted.push(self.highlight_suffix); + // if it's a prefix highlight, we put the end of the word after the highlight marker. if highlight_byte_index < token.byte_end { formatted.push(&self.text[highlight_byte_index..token.byte_end]); } - byte_index = token.byte_end; + byte_index = token.byte_start + m.match_len; } } @@ -821,22 +808,24 @@ mod tests { fn format_highlight_crop_phrase_query() { //! testing: https://github.com/meilisearch/meilisearch/issues/3975 let temp_index = TempIndex::new(); + + let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!"; temp_index .add_documents(documents!([ - { "id": 1, "text": "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!" } + { "id": 1, "text": text } ])) .unwrap(); + let rtxn = temp_index.read_txn().unwrap(); let format_options = FormatOptions { highlight: true, crop: Some(10) }; - let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!"; let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\""); let mut matcher = builder.build(text, None); // should return 10 words with a marker at the start as well the end, and the highlighted matches. insta::assert_snapshot!( matcher.format(format_options), - @"…had the power to split the world between those who…" + @"…had the power to split the world between those who…" ); let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "those \"and those\""); @@ -844,7 +833,7 @@ mod tests { // should highlight "those" and the phrase "and those". insta::assert_snapshot!( matcher.format(format_options), - @"…world between those who embraced progress and those who resisted…" + @"…world between those who embraced progress and those who resisted…" ); } @@ -900,7 +889,7 @@ mod tests { let mut matcher = builder.build(text, None); insta::assert_snapshot!( matcher.format(format_options), - @"_the_ _do_ _or_ die can't be he do and or isn'_t_ _he_" + @"_the_ _do or_ die can't be he do and or isn'_t he_" ); } }