mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-22 18:17:39 +08:00
Change Matcher so that phrases are counted as one instead of word by word
This commit is contained in:
parent
02c2b660f8
commit
edcb4c60ba
@ -132,37 +132,21 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
|
|||||||
mut partial: PartialMatch<'a>,
|
mut partial: PartialMatch<'a>,
|
||||||
token_position: usize,
|
token_position: usize,
|
||||||
word_position: usize,
|
word_position: usize,
|
||||||
|
first_word_char_start: &usize,
|
||||||
words_positions: &mut impl Iterator<Item = (usize, usize, &'a Token<'a>)>,
|
words_positions: &mut impl Iterator<Item = (usize, usize, &'a Token<'a>)>,
|
||||||
matches: &mut Vec<Match>,
|
matches: &mut Vec<Match>,
|
||||||
) -> bool {
|
) -> bool {
|
||||||
let mut potential_matches = vec![(token_position, word_position, partial.char_len())];
|
for (_, _, word) in words_positions {
|
||||||
|
|
||||||
for (token_position, word_position, word) in words_positions {
|
|
||||||
partial = match partial.match_token(word) {
|
partial = match partial.match_token(word) {
|
||||||
// token matches the partial match, but the match is not full,
|
// token matches the partial match, but the match is not full,
|
||||||
// we temporarily save the current token then we try to match the next one.
|
// we temporarily save the current token then we try to match the next one.
|
||||||
Some(MatchType::Partial(partial)) => {
|
Some(MatchType::Partial(partial)) => partial,
|
||||||
potential_matches.push((token_position, word_position, partial.char_len()));
|
|
||||||
partial
|
|
||||||
}
|
|
||||||
// partial match is now full, we keep this matches and we advance positions
|
// partial match is now full, we keep this matches and we advance positions
|
||||||
Some(MatchType::Full { char_len, ids }) => {
|
Some(MatchType::Full { ids, .. }) => {
|
||||||
let ids: Vec<_> = ids.clone().collect();
|
|
||||||
// save previously matched tokens as matches.
|
|
||||||
let iter = potential_matches.into_iter().map(
|
|
||||||
|(token_position, word_position, match_len)| Match {
|
|
||||||
match_len,
|
|
||||||
ids: ids.clone(),
|
|
||||||
word_position,
|
|
||||||
token_position,
|
|
||||||
},
|
|
||||||
);
|
|
||||||
matches.extend(iter);
|
|
||||||
|
|
||||||
// save the token that closes the partial match as a match.
|
// save the token that closes the partial match as a match.
|
||||||
matches.push(Match {
|
matches.push(Match {
|
||||||
match_len: char_len,
|
match_len: word.char_end - first_word_char_start,
|
||||||
ids,
|
ids: ids.clone().collect(),
|
||||||
word_position,
|
word_position,
|
||||||
token_position,
|
token_position,
|
||||||
});
|
});
|
||||||
@ -221,6 +205,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
|
|||||||
partial,
|
partial,
|
||||||
token_position,
|
token_position,
|
||||||
word_position,
|
word_position,
|
||||||
|
&word.char_start,
|
||||||
&mut wp,
|
&mut wp,
|
||||||
&mut matches,
|
&mut matches,
|
||||||
) {
|
) {
|
||||||
@ -472,15 +457,17 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
|
|||||||
.enumerate()
|
.enumerate()
|
||||||
.find(|(i, _)| *i == m.match_len)
|
.find(|(i, _)| *i == m.match_len)
|
||||||
.map_or(token.byte_end, |(_, (i, _))| i + token.byte_start);
|
.map_or(token.byte_end, |(_, (i, _))| i + token.byte_start);
|
||||||
|
|
||||||
formatted.push(self.highlight_prefix);
|
formatted.push(self.highlight_prefix);
|
||||||
formatted.push(&self.text[token.byte_start..highlight_byte_index]);
|
formatted.push(&self.text[token.byte_start..highlight_byte_index]);
|
||||||
formatted.push(self.highlight_suffix);
|
formatted.push(self.highlight_suffix);
|
||||||
|
|
||||||
// if it's a prefix highlight, we put the end of the word after the highlight marker.
|
// if it's a prefix highlight, we put the end of the word after the highlight marker.
|
||||||
if highlight_byte_index < token.byte_end {
|
if highlight_byte_index < token.byte_end {
|
||||||
formatted.push(&self.text[highlight_byte_index..token.byte_end]);
|
formatted.push(&self.text[highlight_byte_index..token.byte_end]);
|
||||||
}
|
}
|
||||||
|
|
||||||
byte_index = token.byte_end;
|
byte_index = token.byte_start + m.match_len;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -821,22 +808,24 @@ mod tests {
|
|||||||
fn format_highlight_crop_phrase_query() {
|
fn format_highlight_crop_phrase_query() {
|
||||||
//! testing: https://github.com/meilisearch/meilisearch/issues/3975
|
//! testing: https://github.com/meilisearch/meilisearch/issues/3975
|
||||||
let temp_index = TempIndex::new();
|
let temp_index = TempIndex::new();
|
||||||
|
|
||||||
|
let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!";
|
||||||
temp_index
|
temp_index
|
||||||
.add_documents(documents!([
|
.add_documents(documents!([
|
||||||
{ "id": 1, "text": "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!" }
|
{ "id": 1, "text": text }
|
||||||
]))
|
]))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let rtxn = temp_index.read_txn().unwrap();
|
let rtxn = temp_index.read_txn().unwrap();
|
||||||
|
|
||||||
let format_options = FormatOptions { highlight: true, crop: Some(10) };
|
let format_options = FormatOptions { highlight: true, crop: Some(10) };
|
||||||
let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!";
|
|
||||||
|
|
||||||
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\"");
|
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\"");
|
||||||
let mut matcher = builder.build(text, None);
|
let mut matcher = builder.build(text, None);
|
||||||
// should return 10 words with a marker at the start as well the end, and the highlighted matches.
|
// should return 10 words with a marker at the start as well the end, and the highlighted matches.
|
||||||
insta::assert_snapshot!(
|
insta::assert_snapshot!(
|
||||||
matcher.format(format_options),
|
matcher.format(format_options),
|
||||||
@"…had the power to split <em>the</em> <em>world</em> between those who…"
|
@"…had the power to split <em>the world</em> between those who…"
|
||||||
);
|
);
|
||||||
|
|
||||||
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "those \"and those\"");
|
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "those \"and those\"");
|
||||||
@ -844,7 +833,7 @@ mod tests {
|
|||||||
// should highlight "those" and the phrase "and those".
|
// should highlight "those" and the phrase "and those".
|
||||||
insta::assert_snapshot!(
|
insta::assert_snapshot!(
|
||||||
matcher.format(format_options),
|
matcher.format(format_options),
|
||||||
@"…world between <em>those</em> who embraced progress <em>and</em> <em>those</em> who resisted…"
|
@"…world between <em>those</em> who embraced progress <em>and those</em> who resisted…"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -900,7 +889,7 @@ mod tests {
|
|||||||
let mut matcher = builder.build(text, None);
|
let mut matcher = builder.build(text, None);
|
||||||
insta::assert_snapshot!(
|
insta::assert_snapshot!(
|
||||||
matcher.format(format_options),
|
matcher.format(format_options),
|
||||||
@"_the_ _do_ _or_ die can't be he do and or isn'_t_ _he_"
|
@"_the_ _do or_ die can't be he do and or isn'_t he_"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user