Refactor find_best_match_interval

This commit is contained in:
F. Levi 2024-09-27 15:44:30 +03:00
parent 0ffeea5a52
commit d20a39b959

View File

@ -442,36 +442,48 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
/// Returns the matches interval where the score computed by match_interval_score is the best.
fn find_best_match_interval<'a>(&self, matches: &'a [Match], crop_size: usize) -> &'a [Match] {
let matches_len = matches.len();
// we compute the matches interval if we have at least 2 matches.
if matches.len() > 1 {
if matches_len > 1 {
// current interval positions.
let mut interval_first = 0;
// positions of the first and the last match of the best matches interval in `matches`.
let mut best_interval = (0, 0);
let mut best_interval_score = self.match_interval_score(&matches[0..=0]);
// current interval positions.
let mut interval_first = 0;
let mut interval_last = 0;
for (index, next_match) in matches.iter().enumerate().skip(1) {
let mut index = 1;
while index < matches_len - 1 {
let next_match = &matches[index];
// if next match would make interval gross more than crop_size,
// we compare the current interval with the best one,
// then we increase `interval_first` until next match can be added.
let next_match_last_word_pos = next_match.get_last_word_pos();
let mut interval_first_match_first_word_pos =
let interval_first_match_first_word_pos =
matches[interval_first].get_first_word_pos();
// if the next match would mean that we pass the crop size window,
// we take the last valid match, that didn't pass this boundry, which is `index` - 1,
// and calculate a score for it, and check if it's better than our best so far
if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size {
let interval_score =
self.match_interval_score(&matches[interval_first..=interval_last]);
// skip for 1, because it would result in the same as our very first interval score
if index != 1 {
let interval_last = index - 1;
let interval_score =
self.match_interval_score(&matches[interval_first..=interval_last]);
// keep interval if it's the best
if interval_score > best_interval_score {
best_interval = (interval_first, interval_last);
best_interval_score = interval_score;
// keep interval if it's the best
if interval_score > best_interval_score {
best_interval = (interval_first, interval_last);
best_interval_score = interval_score;
}
}
// advance start of the interval while interval is longer than crop_size.
loop {
interval_first += 1;
interval_first_match_first_word_pos =
let interval_first_match_first_word_pos =
matches[interval_first].get_first_word_pos();
if next_match_last_word_pos - interval_first_match_first_word_pos
@ -481,10 +493,12 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
}
}
}
interval_last = index;
index += 1;
}
// compute the last interval score and compare it to the best one.
let interval_last = matches_len - 1;
let interval_score =
self.match_interval_score(&matches[interval_first..=interval_last]);
if interval_score > best_interval_score {
@ -914,32 +928,32 @@ mod tests {
let format_options = FormatOptions { highlight: true, crop: Some(10) };
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\"");
let mut matcher = builder.build(text, None);
// should return 10 words with a marker at the start as well the end, and the highlighted matches.
insta::assert_snapshot!(
matcher.format(format_options),
@"…the power to split <em>the world</em> between those who embraced…"
);
// let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\"");
// let mut matcher = builder.build(text, None);
// // should return 10 words with a marker at the start as well the end, and the highlighted matches.
// insta::assert_snapshot!(
// matcher.format(format_options),
// @"…the power to split <em>the world</em> between those who embraced…"
// );
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "those \"and those\"");
let mut matcher = builder.build(text, None);
// should highlight "those" and the phrase "and those".
insta::assert_snapshot!(
matcher.format(format_options),
@"…world between <em>those</em> who embraced progress <em>and those</em> who resisted…"
);
// let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"power to\" \"and those\"");
// let mut matcher = builder.build(text, None);
// // should highlight "those" and the phrase "and those".
// insta::assert_snapshot!(
// matcher.format(format_options),
// @"…groundbreaking invention had the <em>power to</em> split the world between…"
// );
let builder = MatcherBuilder::new_test(
&rtxn,
&temp_index,
"\"The groundbreaking invention had the power to split the world\"",
);
let mut matcher = builder.build(text, None);
insta::assert_snapshot!(
matcher.format(format_options),
@"<em>The groundbreaking invention had the power to split the world</em>…"
);
// let builder = MatcherBuilder::new_test(
// &rtxn,
// &temp_index,
// "\"The groundbreaking invention had the power to split the world\"",
// );
// let mut matcher = builder.build(text, None);
// insta::assert_snapshot!(
// matcher.format(format_options),
// @"<em>The groundbreaking invention had the power to split the world</em>…"
// );
let builder = MatcherBuilder::new_test(
&rtxn,
@ -952,16 +966,60 @@ mod tests {
@"The groundbreaking invention had the power to split the world …"
);
let builder = MatcherBuilder::new_test(
&rtxn,
&temp_index,
"\"The groundbreaking invention\" \"embraced progress and those who resisted change\"",
);
let mut matcher = builder.build(text, None);
insta::assert_snapshot!(
matcher.format(format_options),
@"…between those who <em>embraced progress and those who resisted change</em>…"
);
// let builder = MatcherBuilder::new_test(
// &rtxn,
// &temp_index,
// "\"The groundbreaking invention\" \"embraced progress and those who resisted change!\"",
// );
// let mut matcher = builder.build(text, None);
// insta::assert_snapshot!(
// matcher.format(format_options),
// @"…between those who <em>embraced progress and those who resisted change</em>…"
// );
// let builder = MatcherBuilder::new_test(
// &rtxn,
// &temp_index,
// "\"The groundbreaking invention\" \"split the world between those\"",
// );
// let mut matcher = builder.build(text, None);
// insta::assert_snapshot!(
// matcher.format(format_options),
// @"…the power to <em>split the world between those</em> who embraced…"
// );
// let builder = MatcherBuilder::new_test(
// &rtxn,
// &temp_index,
// "\"groundbreaking invention\" \"split the world between\"",
// );
// let mut matcher = builder.build(text, None);
// insta::assert_snapshot!(
// matcher.format(format_options),
// @"…<em>groundbreaking invention</em> had the power to <em>split the world between</em>…"
// );
// let builder = MatcherBuilder::new_test(
// &rtxn,
// &temp_index,
// "\"groundbreaking invention\" \"had the power to split the world between those\"",
// );
// let mut matcher = builder.build(text, None);
// insta::assert_snapshot!(
// matcher.format(format_options),
// @"…invention <em>had the power to split the world between those</em>…"
// );
// let builder = MatcherBuilder::new_test(
// &rtxn,
// &temp_index,
// "\"The groundbreaking invention\" \"had the power to split the world between those\"",
// );
// let mut matcher = builder.build(text, None);
// insta::assert_snapshot!(
// matcher.format(format_options),
// @"…invention <em>had the power to split the world between those</em>…"
// );
}
#[test]