Improve changes to Matcher

This commit is contained in:
F. Levi 2024-09-12 16:58:13 +03:00
parent edcb4c60ba
commit e7af499314

View File

@ -94,14 +94,27 @@ impl FormatOptions {
}
#[derive(Clone, Debug)]
pub struct Match {
match_len: usize,
// ids of the query words that matches.
ids: Vec<WordId>,
pub enum MatchPosition {
Word {
// position of the word in the whole text.
word_position: usize,
// position of the token in the whole text.
token_position: usize,
},
Phrase {
// position of the first and last word in the phrase in the whole text.
word_positions: (usize, usize),
// position of the first and last token in the phrase in the whole text.
token_positions: (usize, usize),
},
}
#[derive(Clone, Debug)]
pub struct Match {
match_len: usize,
// ids of the query words that matches.
ids: Vec<WordId>,
position: MatchPosition,
}
#[derive(Serialize, Debug, Clone, PartialEq, Eq)]
@ -130,13 +143,13 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
/// compute_partial_match peek into next words to validate if the match is complete.
fn compute_partial_match<'a>(
mut partial: PartialMatch<'a>,
token_position: usize,
word_position: usize,
first_token_position: usize,
first_word_position: usize,
first_word_char_start: &usize,
words_positions: &mut impl Iterator<Item = (usize, usize, &'a Token<'a>)>,
matches: &mut Vec<Match>,
) -> bool {
for (_, _, word) in words_positions {
for (token_position, word_position, word) in words_positions {
partial = match partial.match_token(word) {
// token matches the partial match, but the match is not full,
// we temporarily save the current token then we try to match the next one.
@ -145,10 +158,12 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
Some(MatchType::Full { ids, .. }) => {
// save the token that closes the partial match as a match.
matches.push(Match {
match_len: word.char_end - first_word_char_start,
match_len: word.char_end - *first_word_char_start,
ids: ids.clone().collect(),
word_position,
token_position,
position: MatchPosition::Phrase {
word_positions: (first_word_position, word_position),
token_positions: (first_token_position, token_position),
},
});
// the match is complete, we return true.
@ -191,8 +206,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
matches.push(Match {
match_len: char_len,
ids,
word_position,
token_position,
position: MatchPosition::Word { word_position, token_position },
});
break;
}
@ -228,13 +242,47 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
Some((tokens, matches)) => matches
.iter()
.map(|m| MatchBounds {
start: tokens[m.token_position].byte_start,
start: tokens[match m.position {
MatchPosition::Word { token_position, .. } => token_position,
MatchPosition::Phrase {
token_positions: (first_token_position, _),
..
} => first_token_position,
}]
.byte_start,
length: m.match_len,
})
.collect(),
}
}
// @TODO: This should be improved, looks nasty
fn get_match_pos(&self, m: &Match, is_first: bool, is_word: bool) -> usize {
match m.position {
MatchPosition::Word { word_position, token_position } => {
if is_word {
word_position
} else {
token_position
}
}
MatchPosition::Phrase { word_positions: (wpf, wpl), token_positions: (tpf, tpl) } => {
if is_word {
if is_first {
return wpf;
} else {
return wpl;
}
}
if is_first {
tpf
} else {
tpl
}
}
}
}
/// Returns the bounds in byte index of the crop window.
fn crop_bounds(
&self,
@ -243,10 +291,14 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
crop_size: usize,
) -> (usize, usize) {
// if there is no match, we start from the beginning of the string by default.
let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0);
let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0);
let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0);
let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0);
let first_match_word_position =
matches.first().map(|m| self.get_match_pos(m, true, true)).unwrap_or(0);
let first_match_token_position =
matches.first().map(|m| self.get_match_pos(m, true, false)).unwrap_or(0);
let last_match_word_position =
matches.last().map(|m| self.get_match_pos(m, false, true)).unwrap_or(0);
let last_match_token_position =
matches.last().map(|m| self.get_match_pos(m, false, false)).unwrap_or(0);
// matches needs to be counted in the crop len.
let mut remaining_words = crop_size + first_match_word_position - last_match_word_position;
@ -350,7 +402,9 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
}
// compute distance between matches
distance_score -= (next_match.word_position - m.word_position).min(7) as i16;
distance_score -= (self.get_match_pos(next_match, true, true)
- self.get_match_pos(m, true, true))
.min(7) as i16;
}
ids.extend(m.ids.iter());
@ -378,7 +432,12 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
// if next match would make interval gross more than crop_size,
// we compare the current interval with the best one,
// then we increase `interval_first` until next match can be added.
if next_match.word_position - matches[interval_first].word_position >= crop_size {
let next_match_word_position = self.get_match_pos(next_match, true, true);
if next_match_word_position
- self.get_match_pos(&matches[interval_first], false, true)
>= crop_size
{
let interval_score =
self.match_interval_score(&matches[interval_first..=interval_last]);
@ -389,10 +448,15 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
}
// advance start of the interval while interval is longer than crop_size.
while next_match.word_position - matches[interval_first].word_position
>= crop_size
{
loop {
interval_first += 1;
if next_match_word_position
- self.get_match_pos(&matches[interval_first], false, true)
< crop_size
{
break;
}
}
}
interval_last = index;
@ -441,33 +505,41 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
if format_options.highlight {
// insert highlight markers around matches.
for m in matches {
let token = &tokens[m.token_position];
let (current_byte_start, current_byte_end) = match m.position {
MatchPosition::Word { token_position, .. } => {
let token = &tokens[token_position];
(&token.byte_start, &token.byte_end)
}
MatchPosition::Phrase { token_positions: (ftp, ltp), .. } => {
(&tokens[ftp].byte_start, &tokens[ltp].byte_end)
}
};
// skip matches out of the crop window.
if token.byte_start < byte_start || token.byte_end > byte_end {
if *current_byte_start < byte_start || *current_byte_end > byte_end {
continue;
}
if byte_index < token.byte_start {
formatted.push(&self.text[byte_index..token.byte_start]);
if byte_index < *current_byte_start {
formatted.push(&self.text[byte_index..*current_byte_start]);
}
let highlight_byte_index = self.text[token.byte_start..]
let highlight_byte_index = self.text[*current_byte_start..]
.char_indices()
.enumerate()
.find(|(i, _)| *i == m.match_len)
.map_or(token.byte_end, |(_, (i, _))| i + token.byte_start);
.map_or(*current_byte_end, |(_, (i, _))| i + *current_byte_start);
formatted.push(self.highlight_prefix);
formatted.push(&self.text[token.byte_start..highlight_byte_index]);
formatted.push(&self.text[*current_byte_start..highlight_byte_index]);
formatted.push(self.highlight_suffix);
// if it's a prefix highlight, we put the end of the word after the highlight marker.
if highlight_byte_index < token.byte_end {
formatted.push(&self.text[highlight_byte_index..token.byte_end]);
if highlight_byte_index < *current_byte_end {
formatted.push(&self.text[highlight_byte_index..*current_byte_end]);
}
byte_index = token.byte_start + m.match_len;
byte_index = *current_byte_end;
}
}