Mirror of https://github.com/meilisearch/meilisearch.git
Improve changes to Matcher

This commit is contained in:
parent edcb4c60ba
commit e7af499314
```diff
@@ -93,15 +93,28 @@ impl FormatOptions {
     }
 }
 
+#[derive(Clone, Debug)]
+pub enum MatchPosition {
+    Word {
+        // position of the word in the whole text.
+        word_position: usize,
+        // position of the token in the whole text.
+        token_position: usize,
+    },
+    Phrase {
+        // position of the first and last word in the phrase in the whole text.
+        word_positions: (usize, usize),
+        // position of the first and last token in the phrase in the whole text.
+        token_positions: (usize, usize),
+    },
+}
+
 #[derive(Clone, Debug)]
 pub struct Match {
     match_len: usize,
     // ids of the query words that matches.
     ids: Vec<WordId>,
-    // position of the word in the whole text.
-    word_position: usize,
-    // position of the token in the whole text.
-    token_position: usize,
+    position: MatchPosition,
 }
 
 #[derive(Serialize, Debug, Clone, PartialEq, Eq)]
```
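For orientation, a minimal self-contained sketch of the two variants introduced above. The enum is copied from the diff (field comments trimmed); all position numbers are invented for illustration:

```rust
// Copied from the diff above; comments trimmed for brevity.
#[derive(Clone, Copy, Debug)]
pub enum MatchPosition {
    Word { word_position: usize, token_position: usize },
    Phrase { word_positions: (usize, usize), token_positions: (usize, usize) },
}

fn main() {
    // A single-word match: one word index and one token index into the text.
    let single = MatchPosition::Word { word_position: 1, token_position: 2 };

    // A phrase match such as "new york": first and last word indices, plus
    // first and last token indices (tokens also count separators).
    let phrase = MatchPosition::Phrase { word_positions: (3, 4), token_positions: (6, 8) };

    println!("{single:?}");
    println!("{phrase:?}");
}
```

Folding the four loose `word_position`/`token_position` fields of `Match` into one enum guarantees that a phrase match always carries both of its endpoints.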
```diff
@@ -130,13 +143,13 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
     /// compute_partial_match peek into next words to validate if the match is complete.
     fn compute_partial_match<'a>(
         mut partial: PartialMatch<'a>,
-        token_position: usize,
-        word_position: usize,
+        first_token_position: usize,
+        first_word_position: usize,
         first_word_char_start: &usize,
         words_positions: &mut impl Iterator<Item = (usize, usize, &'a Token<'a>)>,
         matches: &mut Vec<Match>,
     ) -> bool {
-        for (_, _, word) in words_positions {
+        for (token_position, word_position, word) in words_positions {
             partial = match partial.match_token(word) {
                 // token matches the partial match, but the match is not full,
                 // we temporarily save the current token then we try to match the next one.
```
```diff
@@ -145,10 +158,12 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                 Some(MatchType::Full { ids, .. }) => {
                     // save the token that closes the partial match as a match.
                     matches.push(Match {
-                        match_len: word.char_end - first_word_char_start,
+                        match_len: word.char_end - *first_word_char_start,
                         ids: ids.clone().collect(),
-                        word_position,
-                        token_position,
+                        position: MatchPosition::Phrase {
+                            word_positions: (first_word_position, word_position),
+                            token_positions: (first_token_position, token_position),
+                        },
                     });
 
                     // the match is complete, we return true.
```
```diff
@@ -191,8 +206,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                     matches.push(Match {
                         match_len: char_len,
                         ids,
-                        word_position,
-                        token_position,
+                        position: MatchPosition::Word { word_position, token_position },
                     });
                     break;
                 }
```
```diff
@@ -228,13 +242,47 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
             Some((tokens, matches)) => matches
                 .iter()
                 .map(|m| MatchBounds {
-                    start: tokens[m.token_position].byte_start,
+                    start: tokens[match m.position {
+                        MatchPosition::Word { token_position, .. } => token_position,
+                        MatchPosition::Phrase {
+                            token_positions: (first_token_position, _),
+                            ..
+                        } => first_token_position,
+                    }]
+                    .byte_start,
                     length: m.match_len,
                 })
                 .collect(),
         }
     }
 
+    // @TODO: This should be improved, looks nasty
+    fn get_match_pos(&self, m: &Match, is_first: bool, is_word: bool) -> usize {
+        match m.position {
+            MatchPosition::Word { word_position, token_position } => {
+                if is_word {
+                    word_position
+                } else {
+                    token_position
+                }
+            }
+            MatchPosition::Phrase { word_positions: (wpf, wpl), token_positions: (tpf, tpl) } => {
+                if is_word {
+                    if is_first {
+                        return wpf;
+                    } else {
+                        return wpl;
+                    }
+                }
+                if is_first {
+                    tpf
+                } else {
+                    tpl
+                }
+            }
+        }
+    }
+
     /// Returns the bounds in byte index of the crop window.
     fn crop_bounds(
         &self,
```
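The `get_match_pos` helper dispatches on two booleans, and the `// @TODO` in the diff itself calls the result nasty. A standalone sketch of the same dispatch (enum as above, positions invented), written with a tuple match as one conceivable cleanup:

```rust
#[derive(Clone, Copy, Debug)]
enum MatchPosition {
    Word { word_position: usize, token_position: usize },
    Phrase { word_positions: (usize, usize), token_positions: (usize, usize) },
}

// Same selection logic as `get_match_pos`: `is_first` picks the phrase endpoint,
// `is_word` picks between word and token numbering.
fn get_match_pos(position: MatchPosition, is_first: bool, is_word: bool) -> usize {
    match position {
        MatchPosition::Word { word_position, token_position } => {
            if is_word { word_position } else { token_position }
        }
        MatchPosition::Phrase { word_positions: (wpf, wpl), token_positions: (tpf, tpl) } => {
            match (is_first, is_word) {
                (true, true) => wpf,
                (false, true) => wpl,
                (true, false) => tpf,
                (false, false) => tpl,
            }
        }
    }
}

fn main() {
    let phrase = MatchPosition::Phrase { word_positions: (3, 4), token_positions: (6, 8) };
    assert_eq!(get_match_pos(phrase, true, true), 3); // first word position
    assert_eq!(get_match_pos(phrase, false, false), 8); // last token position

    let word = MatchPosition::Word { word_position: 1, token_position: 2 };
    assert_eq!(get_match_pos(word, true, false), 2); // `is_first` is irrelevant for `Word`
}
```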
```diff
@@ -243,10 +291,14 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
         crop_size: usize,
     ) -> (usize, usize) {
         // if there is no match, we start from the beginning of the string by default.
-        let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0);
-        let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0);
-        let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0);
-        let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0);
+        let first_match_word_position =
+            matches.first().map(|m| self.get_match_pos(m, true, true)).unwrap_or(0);
+        let first_match_token_position =
+            matches.first().map(|m| self.get_match_pos(m, true, false)).unwrap_or(0);
+        let last_match_word_position =
+            matches.last().map(|m| self.get_match_pos(m, false, true)).unwrap_or(0);
+        let last_match_token_position =
+            matches.last().map(|m| self.get_match_pos(m, false, false)).unwrap_or(0);
 
         // matches needs to be counted in the crop len.
         let mut remaining_words = crop_size + first_match_word_position - last_match_word_position;
```
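The `remaining_words` line budgets the crop window: `crop_size + first - last` is just `crop_size - (last - first)`, i.e. the words spanned by the matches themselves are counted against the crop length. A quick check with invented positions:

```rust
fn main() {
    let crop_size = 10;
    let first_match_word_position = 4;
    let last_match_word_position = 7;

    // The match span covers 7 - 4 = 3 words, leaving 10 - 3 = 7 words
    // to distribute before and after the matches.
    let remaining_words = crop_size + first_match_word_position - last_match_word_position;
    assert_eq!(remaining_words, 7);
}
```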
```diff
@@ -350,7 +402,9 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
             }
 
             // compute distance between matches
-            distance_score -= (next_match.word_position - m.word_position).min(7) as i16;
+            distance_score -= (self.get_match_pos(next_match, true, true)
+                - self.get_match_pos(m, true, true))
+            .min(7) as i16;
         }
 
         ids.extend(m.ids.iter());
```
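The distance penalty between consecutive matches is capped at 7, so one very large gap cannot dominate the interval score. A small sketch with invented positions:

```rust
fn main() {
    let mut distance_score: i16 = 0;

    // Adjacent matches (positions 5 and 6) cost 1; the distant pair (10, 42)
    // would cost 32 but is capped at 7.
    for (current, next) in [(5_usize, 6_usize), (10, 42)] {
        distance_score -= (next - current).min(7) as i16;
    }

    assert_eq!(distance_score, -8);
}
```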
```diff
@@ -378,7 +432,12 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                 // if next match would make interval gross more than crop_size,
                 // we compare the current interval with the best one,
                 // then we increase `interval_first` until next match can be added.
-                if next_match.word_position - matches[interval_first].word_position >= crop_size {
+                let next_match_word_position = self.get_match_pos(next_match, true, true);
+
+                if next_match_word_position
+                    - self.get_match_pos(&matches[interval_first], false, true)
+                    >= crop_size
+                {
                     let interval_score =
                         self.match_interval_score(&matches[interval_first..=interval_last]);
 
```
```diff
@@ -389,10 +448,15 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                     }
 
                     // advance start of the interval while interval is longer than crop_size.
-                    while next_match.word_position - matches[interval_first].word_position
-                        >= crop_size
-                    {
+                    loop {
                         interval_first += 1;
+
+                        if next_match_word_position
+                            - self.get_match_pos(&matches[interval_first], false, true)
+                            < crop_size
+                        {
+                            break;
+                        }
                     }
                 }
                 interval_last = index;
```
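The `while` became a `loop` with an explicit `break` because the bound has to be re-read through `self.get_match_pos` after each bump of `interval_first`. Modulo that helper call, the effect is the plain advance below (positions and crop size invented):

```rust
fn main() {
    // First word positions of the matches currently in the interval (invented).
    let match_positions = [2_usize, 5, 9, 14];
    let next_match_word_position = 20;
    let crop_size = 10;

    // Advance the interval start until the next match fits in the crop window.
    let mut interval_first = 0;
    while next_match_word_position - match_positions[interval_first] >= crop_size {
        interval_first += 1;
    }

    assert_eq!(interval_first, 3); // 20 - 14 = 6 < 10, so match 3 opens the window
}
```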
```diff
@@ -441,33 +505,41 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
         if format_options.highlight {
             // insert highlight markers around matches.
             for m in matches {
-                let token = &tokens[m.token_position];
+                let (current_byte_start, current_byte_end) = match m.position {
+                    MatchPosition::Word { token_position, .. } => {
+                        let token = &tokens[token_position];
+                        (&token.byte_start, &token.byte_end)
+                    }
+                    MatchPosition::Phrase { token_positions: (ftp, ltp), .. } => {
+                        (&tokens[ftp].byte_start, &tokens[ltp].byte_end)
+                    }
+                };
 
                 // skip matches out of the crop window.
-                if token.byte_start < byte_start || token.byte_end > byte_end {
+                if *current_byte_start < byte_start || *current_byte_end > byte_end {
                     continue;
                 }
 
-                if byte_index < token.byte_start {
-                    formatted.push(&self.text[byte_index..token.byte_start]);
+                if byte_index < *current_byte_start {
+                    formatted.push(&self.text[byte_index..*current_byte_start]);
                 }
 
-                let highlight_byte_index = self.text[token.byte_start..]
+                let highlight_byte_index = self.text[*current_byte_start..]
                     .char_indices()
                     .enumerate()
                     .find(|(i, _)| *i == m.match_len)
-                    .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start);
+                    .map_or(*current_byte_end, |(_, (i, _))| i + *current_byte_start);
 
                 formatted.push(self.highlight_prefix);
-                formatted.push(&self.text[token.byte_start..highlight_byte_index]);
+                formatted.push(&self.text[*current_byte_start..highlight_byte_index]);
                 formatted.push(self.highlight_suffix);
 
                 // if it's a prefix highlight, we put the end of the word after the highlight marker.
-                if highlight_byte_index < token.byte_end {
-                    formatted.push(&self.text[highlight_byte_index..token.byte_end]);
+                if highlight_byte_index < *current_byte_end {
+                    formatted.push(&self.text[highlight_byte_index..*current_byte_end]);
                 }
 
-                byte_index = token.byte_start + m.match_len;
+                byte_index = *current_byte_end;
             }
         }
 
```
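With phrase matches, highlight bounds can no longer come from a single token: a `Word` match still uses its own token, while a `Phrase` match runs from the first token's `byte_start` to the last token's `byte_end`. A sketch with an invented token table:

```rust
// Byte offsets into the original text (invented for the example).
struct Token {
    byte_start: usize,
    byte_end: usize,
}

fn main() {
    let text = "hello new york";
    let tokens = [
        Token { byte_start: 0, byte_end: 5 },   // "hello"
        Token { byte_start: 6, byte_end: 9 },   // "new"
        Token { byte_start: 10, byte_end: 14 }, // "york"
    ];

    // Word match on "new": bounds of that one token.
    let word_bounds = (tokens[1].byte_start, tokens[1].byte_end);
    assert_eq!(&text[word_bounds.0..word_bounds.1], "new");

    // Phrase match on "new york" over tokens 1..=2: start of the first token,
    // end of the last, so the markers wrap the separator as well.
    let phrase_bounds = (tokens[1].byte_start, tokens[2].byte_end);
    assert_eq!(&text[phrase_bounds.0..phrase_bounds.1], "new york");
}
```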