Fix and refactor crop_bounds

This commit is contained in:
F. Levi 2024-10-03 10:40:14 +03:00
parent 37a9d64c44
commit 40336ce87d

View File

@ -1,6 +1,7 @@
use std::borrow::Cow;
use charabia::{Language, SeparatorKind, Token, TokenKind, Tokenizer};
use either::Either;
pub use matching_words::MatchingWords;
use matching_words::{MatchType, PartialMatch, WordId};
use serde::Serialize;
@ -450,45 +451,70 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
crop_size: usize,
) -> (usize, usize) {
// if there is no match, we start from the beginning of the string by default.
let (matches_size, first_match_first_token_position, last_match_last_token_position) =
if !matches.is_empty() {
let (
mut remaining_words,
is_iterating_forward,
before_tokens_starting_index,
after_tokens_starting_index,
) = if !matches.is_empty() {
let matches_first = matches.first().unwrap();
let matches_last = matches.last().unwrap();
(
matches_last.get_last_word_pos() - matches_first.get_first_word_pos() + 1,
matches_first.get_first_token_pos(),
matches_last.get_last_token_pos(),
)
let matches_size =
matches_last.get_last_word_pos() - matches_first.get_first_word_pos() + 1;
let is_crop_size_gte_match_size = crop_size >= matches_size;
let is_iterating_forward = matches_size == 0 || is_crop_size_gte_match_size;
let remaining_words = if is_crop_size_gte_match_size {
crop_size - matches_size
} else {
(0, 0, 0)
// when the matches size is greater than the crop size (which implies there is only one match),
// we count words backwards instead: the extra words fall outside of the crop window,
// so they have to be removed from it
matches_size - crop_size
};
if crop_size >= matches_size {
// matches needs to be counted in the crop len.
let mut remaining_words = crop_size - matches_size;
let last_match_last_token_position_plus_one = last_match_last_token_position + 1;
let after_tokens_starting_index = if matches_size == 0 {
0
} else if last_match_last_token_position_plus_one < tokens.len() {
} else {
let last_match_last_token_position_plus_one = matches_last.get_last_token_pos() + 1;
if last_match_last_token_position_plus_one < tokens.len() {
last_match_last_token_position_plus_one
} else {
tokens.len()
// we have matched the end of possible tokens, there's nothing to advance
tokens.len() - 1
}
};
(
remaining_words,
is_iterating_forward,
if is_iterating_forward { matches_first.get_first_token_pos() } else { 0 },
after_tokens_starting_index,
)
} else {
(crop_size, true, 0, 0)
};
// create the initial state of the crop window: 2 iterators starting from the matches positions,
// a reverse iterator starting from the first match token position and going towards the beginning of the text,
let mut before_tokens =
tokens[..first_match_first_token_position].iter().rev().peekable();
// an iterator starting from the last match token position and going towards the end of the text.
let mut after_tokens = tokens[after_tokens_starting_index..].iter().peekable();
let mut before_tokens = tokens[..before_tokens_starting_index].iter().rev().peekable();
// an iterator ...
let mut after_tokens = if is_iterating_forward {
// ... starting from the last match token position and going towards the end of the text.
Either::Left(tokens[after_tokens_starting_index..].iter().peekable())
} else {
// ... starting from the last match token position and going towards the start of the text.
Either::Right(tokens[..=after_tokens_starting_index].iter().rev().peekable())
};
// grows the crop window peeking in both directions
// until the window contains the right number of words:
while remaining_words > 0 {
let before_token_kind = before_tokens.peek().map(SimpleTokenKind::new);
let after_token_kind = after_tokens.peek().map(SimpleTokenKind::new);
let after_token_kind =
after_tokens.as_mut().either(|v| v.peek(), |v| v.peek()).map(SimpleTokenKind::new);
match (before_token_kind, after_token_kind) {
// we can expand both sides.
@ -507,8 +533,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
if remaining_words > 1 {
after_tokens.next();
}
} else if matches!(before_token_separator_kind, SeparatorKind::Hard)
{
} else if matches!(before_token_separator_kind, SeparatorKind::Hard) {
after_tokens.next();
} else {
before_tokens.next();
@ -561,36 +586,6 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start);
(crop_byte_start, crop_byte_end)
} else {
// there's one match and it's longer than the crop window, so we have to advance inward
let mut remaining_extra_words = matches_size - crop_size;
let mut tokens_from_end =
tokens[..=last_match_last_token_position].iter().rev().peekable();
while remaining_extra_words > 0 {
let token_from_end_kind = tokens_from_end
.peek()
.map(SimpleTokenKind::new)
.expect("Expected iterator to not reach end");
if matches!(token_from_end_kind, SimpleTokenKind::NotSeparator) {
remaining_extra_words -= 1;
}
tokens_from_end.next();
}
let crop_byte_start = if first_match_first_token_position > 0 {
&tokens[first_match_first_token_position - 1].byte_end
} else {
&0
};
let crop_byte_end = tokens_from_end
.next()
.map(|t| t.byte_start)
.expect("Expected iterator to not reach end");
(*crop_byte_start, crop_byte_end)
}
}
// Returns the formatted version of the original text.