diff --git a/milli/src/search/new/matches/match.rs b/milli/src/search/new/matches/match.rs
index cc08b006c..2eef4d5a6 100644
--- a/milli/src/search/new/matches/match.rs
+++ b/milli/src/search/new/matches/match.rs
@@ -18,7 +18,7 @@ pub enum MatchPosition {
 
 #[derive(Clone, Debug)]
 pub struct Match {
-    pub match_len: usize,
+    pub char_count: usize,
     // ids of the query words that matches.
     pub ids: Vec<WordId>,
     pub position: MatchPosition,
diff --git a/milli/src/search/new/matches/matching_words.rs b/milli/src/search/new/matches/matching_words.rs
index e4d2785ca..1f30a17ad 100644
--- a/milli/src/search/new/matches/matching_words.rs
+++ b/milli/src/search/new/matches/matching_words.rs
@@ -86,14 +86,17 @@ impl MatchingWords {
                         continue;
                     };
                     let prefix_length = char_index + c.len_utf8();
-                    let char_len = token.original_lengths(prefix_length).0;
+                    let (char_count, byte_len) = token.original_lengths(prefix_length);
                     let ids = &located_words.positions;
-                    return Some(MatchType::Full { char_len, ids });
+                    return Some(MatchType::Full { ids, char_count, byte_len });
                 // else we exact match the token.
                 } else if token.lemma() == word {
-                    let char_len = token.char_end - token.char_start;
                     let ids = &located_words.positions;
-                    return Some(MatchType::Full { char_len, ids });
+                    return Some(MatchType::Full {
+                        char_count: token.char_end - token.char_start,
+                        byte_len: token.byte_end - token.byte_start,
+                        ids,
+                    });
                 }
             }
         }
@@ -149,7 +152,7 @@ pub type WordId = u16;
 /// In these cases we need to match consecutively several tokens to consider that the match is full.
 #[derive(Debug, PartialEq)]
 pub enum MatchType<'a> {
-    Full { char_len: usize, ids: &'a RangeInclusive<WordId> },
+    Full { char_count: usize, byte_len: usize, ids: &'a RangeInclusive<WordId> },
     Partial(PartialMatch<'a>),
 }
 
@@ -183,7 +186,11 @@ impl<'a> PartialMatch<'a> {
         // if there is no remaining word to match in the phrase and the current token is matching,
         // return a Full match.
         } else if is_matching {
-            Some(MatchType::Full { char_len: token.char_end - token.char_start, ids })
+            Some(MatchType::Full {
+                char_count: token.char_end - token.char_start,
+                byte_len: token.byte_end - token.byte_start,
+                ids,
+            })
         // if the current token doesn't match, return None to break the match sequence.
         } else {
             None
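Worth spelling out why `MatchType::Full` now carries both fields: character counts and byte lengths diverge as soon as a token contains multi-byte UTF-8, so a single `char_len` cannot serve as a slice offset. A minimal standalone sketch of the distinction, using only `std` (not the charabia API):

```rust
fn main() {
    let token = "Löwe"; // 'ö' takes two bytes in UTF-8
    let char_count = token.chars().count();
    let byte_len = token.len();
    assert_eq!(char_count, 4);
    assert_eq!(byte_len, 5);
    // Using char_count as a byte offset would cut a highlight one byte
    // short, or panic by slicing inside a char boundary, hence both fields.
}
```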
@@ -270,7 +277,7 @@ pub(crate) mod tests {
                     ..Default::default()
                 })
                 .next(),
-            Some(MatchType::Full { char_len: 5, ids: &(0..=0) })
+            Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(0..=0) })
         );
         assert_eq!(
             matching_words
@@ -294,7 +301,7 @@ pub(crate) mod tests {
                     ..Default::default()
                 })
                 .next(),
-            Some(MatchType::Full { char_len: 5, ids: &(2..=2) })
+            Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(2..=2) })
         );
         assert_eq!(
             matching_words
@@ -306,7 +313,7 @@ pub(crate) mod tests {
                     ..Default::default()
                 })
                 .next(),
-            Some(MatchType::Full { char_len: 5, ids: &(2..=2) })
+            Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(2..=2) })
         );
         assert_eq!(
             matching_words
diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs
index ac0fb7e7b..80e3ec7b2 100644
--- a/milli/src/search/new/matches/mod.rs
+++ b/milli/src/search/new/matches/mod.rs
@@ -10,7 +10,10 @@
 use matching_words::{MatchType, PartialMatch};
 use r#match::{Match, MatchPosition};
 use serde::Serialize;
 use simple_token_kind::SimpleTokenKind;
-use std::borrow::Cow;
+use std::{
+    borrow::Cow,
+    cmp::{max, min},
+};
 const DEFAULT_CROP_MARKER: &str = "…";
 const DEFAULT_HIGHLIGHT_PREFIX: &str = "<em>";
@@ -139,7 +142,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                     Some(MatchType::Full { ids, .. }) => {
                         // save the token that closes the partial match as a match.
                         matches.push(Match {
-                            match_len: word.char_end - *first_word_char_start,
+                            char_count: word.char_end - *first_word_char_start,
                             ids: ids.clone().collect(),
                             position: MatchPosition::Phrase {
                                 word_positions: [first_word_position, word_position],
@@ -182,10 +185,10 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                 match match_type {
                     // we match, we save the current token as a match,
                     // then we continue the rest of the tokens.
-                    MatchType::Full { char_len, ids } => {
+                    MatchType::Full { ids, char_count, .. } => {
                         let ids: Vec<_> = ids.clone().collect();
                         matches.push(Match {
-                            match_len: char_len,
+                            char_count,
                             ids,
                             position: MatchPosition::Word { word_position, token_position },
                         });
@@ -224,19 +227,15 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                 .iter()
                 .map(|m| MatchBounds {
                     start: tokens[m.get_first_token_pos()].byte_start,
-                    length: m.match_len,
+                    // TODO: Why is this in chars, while start is in bytes?
+                    length: m.char_count,
                 })
                 .collect(),
         }
     }
 
     /// Returns the bounds in byte index of the crop window.
-    fn crop_bounds(
-        &self,
-        tokens: &[Token<'_>],
-        matches: &[Match],
-        crop_size: usize,
-    ) -> (usize, usize) {
+    fn crop_bounds(&self, tokens: &[Token<'_>], matches: &[Match], crop_size: usize) -> [usize; 2] {
         let (
             mut remaining_words,
             is_iterating_forward,
@@ -371,7 +370,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
         let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end);
         let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start);
 
-        (crop_byte_start, crop_byte_end)
+        [crop_byte_start, crop_byte_end]
     }
 
     // Returns the formatted version of the original text.
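The `TODO` above flags a real asymmetry in `MatchBounds`: `start` is a byte offset while `length` counts chars. Until that is resolved, a consumer has to bridge the two units itself. A hypothetical helper (not part of this diff) could do it the same way `format` does below, by walking `char_indices`:

```rust
use std::ops::Range;

/// Hypothetical helper: resolve a (byte start, char count) pair, as exposed
/// by `MatchBounds`, into a byte range over the original text.
fn byte_range(text: &str, byte_start: usize, char_count: usize) -> Range<usize> {
    let byte_end = text[byte_start..]
        .char_indices()
        .nth(char_count)
        .map_or(text.len(), |(i, _)| byte_start + i);
    byte_start..byte_end
}

fn main() {
    let text = "café au lait";
    // 4 chars starting at byte 0 ("café") span 5 bytes, since 'é' is 2 bytes.
    assert_eq!(byte_range(text, 0, 4), 0..5);
}
```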
@@ -382,78 +381,87 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
         } else {
             match &self.matches {
                 Some((tokens, matches)) => {
-                    // If the text has to be cropped,
-                    // crop around the best interval.
-                    let (byte_start, byte_end) = match format_options.crop {
+                    // If the text has to be cropped, crop around the best interval.
+                    let [crop_byte_start, crop_byte_end] = match format_options.crop {
                         Some(crop_size) if crop_size > 0 => {
                             self.crop_bounds(tokens, matches, crop_size)
                         }
-                        _ => (0, self.text.len()),
+                        _ => [0, self.text.len()],
                     };
 
                     let mut formatted = Vec::new();
 
                     // push crop marker if it's not the start of the text.
-                    if byte_start > 0 && !self.crop_marker.is_empty() {
+                    if crop_byte_start > 0 && !self.crop_marker.is_empty() {
                         formatted.push(self.crop_marker);
                     }
 
-                    let mut byte_index = byte_start;
+                    let mut byte_index = crop_byte_start;
 
                     if format_options.highlight {
                         // insert highlight markers around matches.
                         for m in matches {
-                            let (current_byte_start, current_byte_end) = match m.position {
+                            let [m_byte_start, m_byte_end] = match m.position {
                                 MatchPosition::Word { token_position, .. } => {
                                     let token = &tokens[token_position];
-                                    (&token.byte_start, &token.byte_end)
+                                    [&token.byte_start, &token.byte_end]
                                 }
                                 MatchPosition::Phrase { token_positions: [ftp, ltp], .. } => {
-                                    (&tokens[ftp].byte_start, &tokens[ltp].byte_end)
+                                    [&tokens[ftp].byte_start, &tokens[ltp].byte_end]
                                 }
                             };
 
-                            // skip matches out of the crop window.
-                            if *current_byte_start < byte_start || *current_byte_end > byte_end {
+                            // skip matches out of the crop window
+                            if *m_byte_end < crop_byte_start || *m_byte_start > crop_byte_end {
                                 continue;
                             }
 
-                            if byte_index < *current_byte_start {
-                                formatted.push(&self.text[byte_index..*current_byte_start]);
+                            // adjust start and end to the crop window size
+                            let [m_byte_start, m_byte_end] = [
+                                max(m_byte_start, &crop_byte_start),
+                                min(m_byte_end, &crop_byte_end),
+                            ];
+
+                            // push text that is positioned before our matches
+                            if byte_index < *m_byte_start {
+                                formatted.push(&self.text[byte_index..*m_byte_start]);
                             }
 
-                            let highlight_byte_index = self.text[*current_byte_start..]
-                                .char_indices()
-                                .enumerate()
-                                .find(|(i, _)| *i == m.match_len)
-                                .map_or(*current_byte_end, |(_, (i, _))| i + *current_byte_start);
-                            formatted.push(self.highlight_prefix);
-                            formatted.push(&self.text[*current_byte_start..highlight_byte_index]);
+
+                            formatted.push(self.highlight_prefix);
+
+                            // TODO: This is additional work done, charabia::token::Token byte_len
+                            // should already get us the original byte length, however, that doesn't work as
+                            // it's supposed to, investigate why
+                            let highlight_byte_index = self.text[*m_byte_start..]
+                                .char_indices()
+                                .nth(m.char_count)
+                                .map_or(*m_byte_end, |(i, _)| min(i + *m_byte_start, *m_byte_end));
+                            formatted.push(&self.text[*m_byte_start..highlight_byte_index]);
+
                             formatted.push(self.highlight_suffix);
                             // if it's a prefix highlight, we put the end of the word after the highlight marker.
-                            if highlight_byte_index < *current_byte_end {
-                                formatted.push(&self.text[highlight_byte_index..*current_byte_end]);
+                            if highlight_byte_index < *m_byte_end {
+                                formatted.push(&self.text[highlight_byte_index..*m_byte_end]);
                            }
 
-                            byte_index = *current_byte_end;
+                            byte_index = *m_byte_end;
                         }
                     }
 
                     // push the rest of the text between last match and the end of crop.
-                    if byte_index < byte_end {
-                        formatted.push(&self.text[byte_index..byte_end]);
+                    if byte_index < crop_byte_end {
+                        formatted.push(&self.text[byte_index..crop_byte_end]);
                     }
 
                     // push crop marker if it's not the end of the text.
-                    if byte_end < self.text.len() && !self.crop_marker.is_empty() {
+                    if crop_byte_end < self.text.len() && !self.crop_marker.is_empty() {
                         formatted.push(self.crop_marker);
                     }
 
                     if formatted.len() == 1 {
                         // avoid concatenating if there is already 1 slice.
-                        Cow::Borrowed(&self.text[byte_start..byte_end])
+                        Cow::Borrowed(&self.text[crop_byte_start..crop_byte_end])
                     } else {
                         Cow::Owned(formatted.concat())
                     }
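The behavioral change in this hunk: a match that only partially overlaps the crop window used to be dropped by the old `< byte_start || > byte_end` test; now only matches that lie entirely outside the window are skipped, and partial overlaps are clamped to the window with `max`/`min`. A self-contained sketch of that rule (same logic, lifted out of the matcher):

```rust
use std::cmp::{max, min};

/// Clamp a match's byte range to the crop window; `None` means the match
/// lies entirely outside the window and is skipped, mirroring the
/// `continue` in the loop above.
fn clamp_to_window(m: [usize; 2], window: [usize; 2]) -> Option<[usize; 2]> {
    let ([m_start, m_end], [w_start, w_end]) = (m, window);
    if m_end < w_start || m_start > w_end {
        return None;
    }
    Some([max(m_start, w_start), min(m_end, w_end)])
}

fn main() {
    assert_eq!(clamp_to_window([8, 20], [10, 30]), Some([10, 20])); // trimmed
    assert_eq!(clamp_to_window([0, 5], [10, 30]), None); // fully before
}
```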
@@ -825,8 +833,7 @@ mod tests {
         let mut matcher = builder.build(text, None);
         insta::assert_snapshot!(
             matcher.format(format_options),
-            // @TODO: Should probably highlight it all, even if it didn't fit the whole phrase
-            @"The groundbreaking invention had the power to split the world…"
+            @"The groundbreaking invention had the power to split the world…"
         );
 
         let builder = MatcherBuilder::new_test(
@@ -837,7 +844,7 @@
         let mut matcher = builder.build(text, None);
         insta::assert_snapshot!(
             matcher.format(format_options),
-            // @TODO: Should probably include end of string in this case?
+            // TODO: Should include exclamation mark without crop markers
             @"…between those who embraced progress and those who resisted change…"
         );
 
@@ -860,8 +867,7 @@
         let mut matcher = builder.build(text, None);
         insta::assert_snapshot!(
             matcher.format(format_options),
-            // @TODO: "invention" should be highlighted as well
-            @"…invention had the power to split the world between those…"
+            @"…invention had the power to split the world between those…"
         );
     }
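For context, these assertions use insta's inline snapshots: the string after `@` is the stored snapshot, and `cargo insta review` rewrites it in place whenever the matcher's output changes, which is why resolved `@TODO`s show up here as snapshot updates. A minimal sketch of the mechanism (test name and string are illustrative, assuming the `insta` dev-dependency):

```rust
#[cfg(test)]
mod example {
    #[test]
    fn inline_snapshot_shape() {
        let formatted = "…between those who embraced progress…";
        // On mismatch, insta reports a diff and offers to update the `@` literal.
        insta::assert_snapshot!(formatted, @"…between those who embraced progress…");
    }
}
```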