Add format option structure

2024-11-23 10:37:41 +08:00 · 2022-04-12 13:42:14 +02:00 · 2022-04-12 13:42:14 +02:00 · 827cedcd15
commit 827cedcd15
parent 011f8210ed
4 changed files with 85 additions and 91 deletions
--- a/http-ui/src/main.rs
+++ b/http-ui/src/main.rs
@ -25,8 +25,8 @@ use milli::update::{
    ClearDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting,
 };
 use milli::{
-    obkv_to_json, CompressionType, Filter as MilliFilter, FilterCondition, Index, MatcherBuilder,
+    obkv_to_json, CompressionType, Filter as MilliFilter, FilterCondition, FormatOptions, Index,
-    SearchResult, SortError,
+    MatcherBuilder, SearchResult, SortError,
 };
 use once_cell::sync::OnceCell;
 use serde::{Deserialize, Serialize};
@ -162,7 +162,9 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
                let analyzed: Vec<_> = analyzed.tokens().collect();
                let mut matcher = matcher_builder.build(&analyzed[..], &old_string);
-                Value::String(matcher.format(true, true).to_string())
+                let format_options = FormatOptions { highlight: true, crop: Some(10) };
                Value::String(matcher.format(format_options).to_string())
            }
            Value::Array(values) => Value::Array(
                values.into_iter().map(|v| self.highlight_value(v, matcher_builder)).collect(),
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@ -37,7 +37,8 @@ pub use self::heed_codec::{
 };
 pub use self::index::Index;
 pub use self::search::{
-    FacetDistribution, Filter, MatchBounds, MatcherBuilder, MatchingWords, Search, SearchResult,
+    FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWords, Search,
    SearchResult,
 };
 pub type Result<T> = std::result::Result<T, error::Error>;
--- a/milli/src/search/matches/mod.rs
+++ b/milli/src/search/matches/mod.rs
@ -8,14 +8,12 @@ use crate::search::matches::matching_words::PartialMatch;
 pub mod matching_words;
 const DEFAULT_CROP_SIZE: usize = 10;
 const DEFAULT_CROP_MARKER: &'static str = "…";
 const DEFAULT_HIGHLIGHT_PREFIX: &'static str = "<em>";
 const DEFAULT_HIGHLIGHT_SUFFIX: &'static str = "</em>";
 pub struct MatcherBuilder {
    matching_words: MatchingWords,
    crop_size: usize,
    crop_marker: Option<String>,
    highlight_prefix: Option<String>,
    highlight_suffix: Option<String>,
@ -23,18 +21,7 @@ pub struct MatcherBuilder {
 impl MatcherBuilder {
    pub fn from_matching_words(matching_words: MatchingWords) -> Self {
-        Self {
+        Self { matching_words, crop_marker: None, highlight_prefix: None, highlight_suffix: None }
            matching_words,
            crop_size: DEFAULT_CROP_SIZE,
            crop_marker: None,
            highlight_prefix: None,
            highlight_suffix: None,
        }
    }
    pub fn crop_size(&mut self, word_count: usize) -> &Self {
        self.crop_size = word_count;
        self
    }
    pub fn crop_marker(&mut self, marker: String) -> &Self {
@ -70,7 +57,6 @@ impl MatcherBuilder {
            text,
            tokens,
            matching_words: &self.matching_words,
            crop_size: self.crop_size,
            crop_marker,
            highlight_prefix,
            highlight_suffix,
@ -79,6 +65,18 @@ impl MatcherBuilder {
    }
 }
 #[derive(Copy, Clone, Default)]
 pub struct FormatOptions {
    pub highlight: bool,
    pub crop: Option<usize>,
 }
 impl FormatOptions {
    pub fn merge(self, other: Self) -> Self {
        Self { highlight: self.highlight || other.highlight, crop: self.crop.or(other.crop) }
    }
 }
 #[derive(Clone, Debug)]
 pub struct Match {
    match_len: usize,
@ -100,7 +98,6 @@ pub struct Matcher<'t, 'm> {
    text: &'t str,
    tokens: &'t [Token<'t>],
    matching_words: &'m MatchingWords,
    crop_size: usize,
    crop_marker: &'m str,
    highlight_prefix: &'m str,
    highlight_suffix: &'m str,
@ -233,7 +230,7 @@ impl<'t> Matcher<'t, '_> {
    }
    /// Returns the bounds in byte index of the crop window.
-    fn crop_bounds(&self, matches: &[Match]) -> (usize, usize) {
+    fn crop_bounds(&self, matches: &[Match], crop_size: usize) -> (usize, usize) {
        // if there is no match, we start from the beginning of the string by default.
        let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0);
        let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0);
@ -241,8 +238,7 @@ impl<'t> Matcher<'t, '_> {
        let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0);
        // matches needs to be counted in the crop len.
-        let mut remaining_words =
+        let mut remaining_words = crop_size + first_match_word_position - last_match_word_position;
            self.crop_size + first_match_word_position - last_match_word_position;
        let mut before_tokens = self.tokens[..first_match_token_position].iter().rev().peekable();
        let mut after_tokens = self.tokens[last_match_token_position..].iter().peekable();
@ -348,7 +344,7 @@ impl<'t> Matcher<'t, '_> {
    }
    /// Returns the matches interval where the score computed by match_interval_score is maximal.
-    fn find_best_match_interval<'a>(&self, matches: &'a [Match]) -> &'a [Match] {
+    fn find_best_match_interval<'a>(&self, matches: &'a [Match], crop_size: usize) -> &'a [Match] {
        // we compute the matches interval if we have at least 2 matches.
        if matches.len() > 1 {
            // positions of the first and the last match of the best matches interval in `matches`.
@ -361,9 +357,7 @@ impl<'t> Matcher<'t, '_> {
                // if next match would make interval gross more than crop_size,
                // we compare the current interval with the best one,
                // then we increase `interval_first` until next match can be added.
-                if next_match.word_position - matches[interval_first].word_position
+                if next_match.word_position - matches[interval_first].word_position >= crop_size {
                    >= self.crop_size
                {
                    let interval_score =
                        self.match_interval_score(&matches[interval_first..=interval_last]);
@ -375,7 +369,7 @@ impl<'t> Matcher<'t, '_> {
                    // advance start of the interval while interval is longer than crop_size.
                    while next_match.word_position - matches[interval_first].word_position
-                        >= self.crop_size
+                        >= crop_size
                    {
                        interval_first += 1;
                    }
@ -397,21 +391,24 @@ impl<'t> Matcher<'t, '_> {
    }
    // Returns the formatted version of the original text.
-    pub fn format(&mut self, highlight: bool, crop: bool) -> Cow<'t, str> {
+    pub fn format(&mut self, format_options: FormatOptions) -> Cow<'t, str> {
-        // If 0 it will be considered null and thus not crop the field
+        if !format_options.highlight && format_options.crop.is_none() {
        // https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
        let crop = crop && self.crop_size > 0;
        if !highlight && !crop {
            // compute matches is not needed if no highlight nor crop is requested.
            Cow::Borrowed(self.text)
        } else {
            match &self.matches {
                Some(matches) => {
-                    let matches =
+                    let matches = match format_options.crop {
-                        if crop { self.find_best_match_interval(matches) } else { matches };
+                        Some(crop_size) if crop_size > 0 => {
                            self.find_best_match_interval(matches, crop_size)
                        }
                        _ => matches,
                    };
-                    let (byte_start, byte_end) =
+                    let (byte_start, byte_end) = match format_options.crop {
-                        if crop { self.crop_bounds(matches) } else { (0, self.text.len()) };
+                        Some(crop_size) if crop_size > 0 => self.crop_bounds(matches, crop_size),
                        _ => (0, self.text.len()),
                    };
                    let mut formatted = Vec::new();
@ -422,7 +419,7 @@ impl<'t> Matcher<'t, '_> {
                    let mut byte_index = byte_start;
-                    if highlight {
+                    if format_options.highlight {
                        // insert highlight markers around matches.
                        let tokens = self.tokens;
                        for m in matches {
@ -466,7 +463,7 @@ impl<'t> Matcher<'t, '_> {
                        Cow::Owned(formatted.concat())
                    }
                }
-                None => self.compute_matches().format(highlight, crop),
+                None => self.compute_matches().format(format_options),
            }
        }
    }
@ -496,8 +493,7 @@ mod tests {
        let builder = MatcherBuilder::from_matching_words(matching_words);
        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let highlight = false;
+        let format_options = FormatOptions { highlight: false, crop: None };
        let crop = false;
        // Text without any match.
        let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
@ -505,7 +501,7 @@ mod tests {
        let tokens: Vec<_> = analyzed.tokens().collect();
        let mut matcher = builder.build(&tokens[..], text);
        // no crop and no highlight should return complete text.
-        assert_eq!(&matcher.format(highlight, crop), &text);
+        assert_eq!(&matcher.format(format_options.clone()), &text);
        // Text containing all matches.
        let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
@ -513,7 +509,7 @@ mod tests {
        let tokens: Vec<_> = analyzed.tokens().collect();
        let mut matcher = builder.build(&tokens[..], text);
        // no crop and no highlight should return complete text.
-        assert_eq!(&matcher.format(highlight, crop), &text);
+        assert_eq!(&matcher.format(format_options.clone()), &text);
        // Text containing some matches.
        let text = "Natalie risk her future to build a world with the boy she loves.";
@ -521,7 +517,7 @@ mod tests {
        let tokens: Vec<_> = analyzed.tokens().collect();
        let mut matcher = builder.build(&tokens[..], text);
        // no crop and no highlight should return complete text.
-        assert_eq!(&matcher.format(highlight, crop), &text);
+        assert_eq!(&matcher.format(format_options.clone()), &text);
    }
    #[test]
@ -531,22 +527,21 @@ mod tests {
        let builder = MatcherBuilder::from_matching_words(matching_words);
        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let highlight = true;
+        let format_options = FormatOptions { highlight: true, crop: None };
        let crop = false;
        // empty text.
        let text = "";
        let analyzed = analyzer.analyze(&text);
        let tokens: Vec<_> = analyzed.tokens().collect();
        let mut matcher = builder.build(&tokens[..], text);
-        assert_eq!(&matcher.format(highlight, crop), "");
+        assert_eq!(&matcher.format(format_options.clone()), "");
        // text containing only separators.
        let text = ":-)";
        let analyzed = analyzer.analyze(&text);
        let tokens: Vec<_> = analyzed.tokens().collect();
        let mut matcher = builder.build(&tokens[..], text);
-        assert_eq!(&matcher.format(highlight, crop), ":-)");
+        assert_eq!(&matcher.format(format_options.clone()), ":-)");
        // Text without any match.
        let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
@ -554,7 +549,7 @@ mod tests {
        let tokens: Vec<_> = analyzed.tokens().collect();
        let mut matcher = builder.build(&tokens[..], text);
        // no crop should return complete text, because there is no matches.
-        assert_eq!(&matcher.format(highlight, crop), &text);
+        assert_eq!(&matcher.format(format_options.clone()), &text);
        // Text containing all matches.
        let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
@ -562,7 +557,7 @@ mod tests {
        let tokens: Vec<_> = analyzed.tokens().collect();
        let mut matcher = builder.build(&tokens[..], text);
        // no crop should return complete text with highlighted matches.
-        assert_eq!(&matcher.format(highlight, crop), "Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>.");
+        assert_eq!(&matcher.format(format_options.clone()), "Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>.");
        // Text containing some matches.
        let text = "Natalie risk her future to build a world with the boy she loves.";
@ -571,7 +566,7 @@ mod tests {
        let mut matcher = builder.build(&tokens[..], text);
        // no crop should return complete text with highlighted matches.
        assert_eq!(
-            &matcher.format(highlight, crop),
+            &matcher.format(format_options.clone()),
            "Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves."
        );
    }
@ -588,8 +583,7 @@ mod tests {
        let builder = MatcherBuilder::from_matching_words(matching_words);
        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let highlight = true;
+        let format_options = FormatOptions { highlight: true, crop: None };
        let crop = false;
        // Text containing prefix match.
        let text = "Ŵôřlḑôle";
@ -597,7 +591,7 @@ mod tests {
        let tokens: Vec<_> = analyzed.tokens().collect();
        let mut matcher = builder.build(&tokens[..], text);
        // no crop should return complete text with highlighted matches.
-        assert_eq!(&matcher.format(highlight, crop), "<em>Ŵôřlḑ</em>ôle");
+        assert_eq!(&matcher.format(format_options.clone()), "<em>Ŵôřlḑ</em>ôle");
        // Text containing unicode match.
        let text = "Ŵôřlḑ";
@ -605,7 +599,7 @@ mod tests {
        let tokens: Vec<_> = analyzed.tokens().collect();
        let mut matcher = builder.build(&tokens[..], text);
        // no crop should return complete text with highlighted matches.
-        assert_eq!(&matcher.format(highlight, crop), "<em>Ŵôřlḑ</em>");
+        assert_eq!(&matcher.format(format_options.clone()), "<em>Ŵôřlḑ</em>");
        // Text containing unicode match.
        let text = "Westfália";
@ -613,7 +607,7 @@ mod tests {
        let tokens: Vec<_> = analyzed.tokens().collect();
        let mut matcher = builder.build(&tokens[..], text);
        // no crop should return complete text with highlighted matches.
-        assert_eq!(&matcher.format(highlight, crop), "<em>Westfáli</em>a");
+        assert_eq!(&matcher.format(format_options.clone()), "<em>Westfáli</em>a");
    }
    #[test]
@ -623,22 +617,21 @@ mod tests {
        let builder = MatcherBuilder::from_matching_words(matching_words);
        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let highlight = false;
+        let format_options = FormatOptions { highlight: false, crop: Some(10) };
        let crop = true;
        // empty text.
        let text = "";
        let analyzed = analyzer.analyze(&text);
        let tokens: Vec<_> = analyzed.tokens().collect();
        let mut matcher = builder.build(&tokens[..], text);
-        assert_eq!(&matcher.format(highlight, crop), "");
+        assert_eq!(&matcher.format(format_options.clone()), "");
        // text containing only separators.
        let text = ":-)";
        let analyzed = analyzer.analyze(&text);
        let tokens: Vec<_> = analyzed.tokens().collect();
        let mut matcher = builder.build(&tokens[..], text);
-        assert_eq!(&matcher.format(highlight, crop), ":-)");
+        assert_eq!(&matcher.format(format_options.clone()), ":-)");
        // Text without any match.
        let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
@ -647,7 +640,7 @@ mod tests {
        let mut matcher = builder.build(&tokens[..], text);
        // no highlight should return 10 first words with a marker at the end.
        assert_eq!(
-            &matcher.format(highlight, crop),
+            &matcher.format(format_options.clone()),
            "A quick brown fox can not jump 32 feet, right…"
        );
@ -658,7 +651,7 @@ mod tests {
        let mut matcher = builder.build(&tokens[..], text);
        // no highlight should return 10 first words with a marker at the end.
        assert_eq!(
-            &matcher.format(highlight, crop),
+            &matcher.format(format_options.clone()),
            "(A quick brown fox can not jump 32 feet, right…"
        );
@ -669,7 +662,7 @@ mod tests {
        let mut matcher = builder.build(&tokens[..], text);
        // should crop the phrase instead of croping around the match.
        assert_eq!(
-            &matcher.format(highlight, crop),
+            &matcher.format(format_options.clone()),
            "…Split The World is a book written by Emily Henry…"
        );
@ -680,7 +673,7 @@ mod tests {
        let mut matcher = builder.build(&tokens[..], text);
        // no highlight should return 10 last words with a marker at the start.
        assert_eq!(
-            &matcher.format(highlight, crop),
+            &matcher.format(format_options.clone()),
            "…future to build a world with the boy she loves…"
        );
@ -691,7 +684,7 @@ mod tests {
        let mut matcher = builder.build(&tokens[..], text);
        // no highlight should return 10 last words with a marker at the start.
        assert_eq!(
-            &matcher.format(highlight, crop),
+            &matcher.format(format_options.clone()),
            "…she loves. Emily Henry: The Love That Split The World."
        );
@ -702,7 +695,7 @@ mod tests {
        let mut matcher = builder.build(&tokens[..], text);
        // crop should return 10 last words with a marker at the start.
        assert_eq!(
-            &matcher.format(highlight, crop),
+            &matcher.format(format_options.clone()),
            "…void void void void void split the world void void"
        );
@ -713,7 +706,7 @@ mod tests {
        let mut matcher = builder.build(&tokens[..], text);
        // crop should return 10 last words with a marker at the start.
        assert_eq!(
-            &matcher.format(highlight, crop),
+            &matcher.format(format_options.clone()),
            "…void void void void void split the world void void"
        );
@ -724,7 +717,7 @@ mod tests {
        let mut matcher = builder.build(&tokens[..], text);
        // crop should return 10 last words with a marker at the start.
        assert_eq!(
-            &matcher.format(highlight, crop),
+            &matcher.format(format_options.clone()),
            "…void void void void void split the world void void"
        );
    }
@ -736,22 +729,21 @@ mod tests {
        let builder = MatcherBuilder::from_matching_words(matching_words);
        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let highlight = true;
+        let format_options = FormatOptions { highlight: true, crop: Some(10) };
        let crop = true;
        // empty text.
        let text = "";
        let analyzed = analyzer.analyze(&text);
        let tokens: Vec<_> = analyzed.tokens().collect();
        let mut matcher = builder.build(&tokens[..], text);
-        assert_eq!(&matcher.format(highlight, crop), "");
+        assert_eq!(&matcher.format(format_options.clone()), "");
        // text containing only separators.
        let text = ":-)";
        let analyzed = analyzer.analyze(&text);
        let tokens: Vec<_> = analyzed.tokens().collect();
        let mut matcher = builder.build(&tokens[..], text);
-        assert_eq!(&matcher.format(highlight, crop), ":-)");
+        assert_eq!(&matcher.format(format_options.clone()), ":-)");
        // Text without any match.
        let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
@ -760,7 +752,7 @@ mod tests {
        let mut matcher = builder.build(&tokens[..], text);
        // both should return 10 first words with a marker at the end.
        assert_eq!(
-            &matcher.format(highlight, crop),
+            &matcher.format(format_options.clone()),
            "A quick brown fox can not jump 32 feet, right…"
        );
@ -771,7 +763,7 @@ mod tests {
        let mut matcher = builder.build(&tokens[..], text);
        // both should return 10 last words with a marker at the start and highlighted matches.
        assert_eq!(
-            &matcher.format(highlight, crop),
+            &matcher.format(format_options.clone()),
            "…future to build a <em>world</em> with <em>the</em> boy she loves…"
        );
@ -781,7 +773,7 @@ mod tests {
        let tokens: Vec<_> = analyzed.tokens().collect();
        let mut matcher = builder.build(&tokens[..], text);
        // both should return 10 last words with a marker at the start and highlighted matches.
-        assert_eq!(&matcher.format(highlight, crop), "…she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>.");
+        assert_eq!(&matcher.format(format_options.clone()), "…she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>.");
        // Text containing a match unordered and a match ordered.
        let text = "The world split void void void void void void void void void split the world void void";
@ -790,7 +782,7 @@ mod tests {
        let mut matcher = builder.build(&tokens[..], text);
        // crop should return 10 last words with a marker at the start.
        assert_eq!(
-            &matcher.format(highlight, crop),
+            &matcher.format(format_options.clone()),
            "…void void void void void <em>split</em> <em>the</em> <em>world</em> void void"
        );
    }
@ -800,33 +792,33 @@ mod tests {
        //! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
        let matching_words = matching_words();
-        let mut builder = MatcherBuilder::from_matching_words(matching_words);
+        let builder = MatcherBuilder::from_matching_words(matching_words);
        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
        let highlight = false;
        let crop = true;
        let text = "void void split the world void void.";
        let analyzed = analyzer.analyze(&text);
        let tokens: Vec<_> = analyzed.tokens().collect();
        // set a smaller crop size
-        builder.crop_size(2);
+        let format_options = FormatOptions { highlight: false, crop: Some(2) };
        let mut matcher = builder.build(&tokens[..], text);
        // because crop size < query size, partially format matches.
-        assert_eq!(&matcher.format(highlight, crop), "…split the…");
+        assert_eq!(&matcher.format(format_options), "…split the…");
        // set a smaller crop size
-        builder.crop_size(1);
+        let format_options = FormatOptions { highlight: false, crop: Some(1) };
        let mut matcher = builder.build(&tokens[..], text);
        // because crop size < query size, partially format matches.
-        assert_eq!(&matcher.format(highlight, crop), "…split…");
+        assert_eq!(&matcher.format(format_options), "…split…");
        // set  crop size to 0
        let format_options = FormatOptions { highlight: false, crop: Some(0) };
        // set a smaller crop size
        builder.crop_size(0);
        let mut matcher = builder.build(&tokens[..], text);
        // because crop size is 0, crop is ignored.
-        assert_eq!(&matcher.format(highlight, crop), "void void split the world void void.");
+        assert_eq!(&matcher.format(format_options), "void void split the world void void.");
    }
    #[test]
@ -858,8 +850,7 @@ mod tests {
        builder.highlight_suffix("_".to_string());
        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let highlight = true;
+        let format_options = FormatOptions { highlight: true, crop: None };
        let crop = false;
        let text = "the do or die can't be he do and or isn't he";
        let analyzed = analyzer.analyze(&text);
@ -867,7 +858,7 @@ mod tests {
        let mut matcher = builder.build(&tokens[..], text);
        assert_eq!(
-            &matcher.format(highlight, crop),
+            &matcher.format(format_options),
            "_the_ _do_ _or_ die can't be he _do_ and or isn'_t_ _he_",
            "matches: {:?}",
            &matcher.matches
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@ -17,7 +17,7 @@ use roaring::bitmap::RoaringBitmap;
 pub use self::facet::{FacetDistribution, FacetNumberIter, Filter};
 use self::fst_utils::{Complement, Intersection, StartsWith, Union};
-pub use self::matches::{MatchBounds, Matcher, MatcherBuilder, MatchingWords};
+pub use self::matches::{FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWords};
 use self::query_tree::QueryTreeBuilder;
 use crate::error::UserError;
 use crate::search::criteria::r#final::{Final, FinalResult};