Create formatter with some tests

2024-11-23 10:37:41 +08:00 · 2022-03-22 15:22:14 +01:00 · 2022-03-22 15:22:14 +01:00 · d96e72e5dc
commit d96e72e5dc
parent 900825bac0
3 changed files with 469 additions and 17 deletions
--- a/milli/src/search/matches/matching_words.rs
+++ b/milli/src/search/matches/matching_words.rs
@ -1,11 +1,11 @@
 use std::cmp::{min, Reverse};
-use std::collections::{BTreeMap, HashSet};
+use std::collections::{BTreeMap, HashMap};
 use std::ops::{Index, IndexMut};
 use levenshtein_automata::{Distance, DFA};
 use meilisearch_tokenizer::Token;
-use super::build_dfa;
+use crate::search::build_dfa;
 use crate::search::query_tree::{Operation, Query};
 type IsPrefix = bool;
@ -14,7 +14,7 @@ type IsPrefix = bool;
 /// referencing words that match the given query tree.
 #[derive(Default)]
 pub struct MatchingWords {
-    dfas: Vec<(DFA, String, u8, IsPrefix)>,
+    dfas: Vec<(DFA, String, u8, IsPrefix, usize)>,
 }
 impl MatchingWords {
@ -23,11 +23,11 @@ impl MatchingWords {
        let mut dfas: Vec<_> = fetch_queries(tree)
            .into_iter()
            // create DFAs for each word
-            .map(|(w, t, p)| (build_dfa(w, t, p), w.to_string(), t, p))
+            .map(|((w, t, p), id)| (build_dfa(w, t, p), w.to_string(), t, p, id))
            .collect();
        // Sort words by length in DESC order, prioritizing the longest word,
        // in order to highlight the longest part of the matched word.
-        dfas.sort_unstable_by_key(|(_dfa, query_word, _typo, _is_prefix)| {
+        dfas.sort_unstable_by_key(|(_dfa, query_word, _typo, _is_prefix, _id)| {
            Reverse(query_word.len())
        });
        Self { dfas }
@ -35,14 +35,21 @@ impl MatchingWords {
    /// Returns the number of matching bytes if the word matches one of the query words.
    pub fn matching_bytes(&self, word_to_highlight: &Token) -> Option<usize> {
-        self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix)| {
+        self.matching_bytes_with_id(word_to_highlight).map(|(len, _)| len)
    }
    pub fn matching_bytes_with_id(&self, word_to_highlight: &Token) -> Option<(usize, usize)> {
        self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix, id)| {
            match dfa.eval(word_to_highlight.text()) {
                Distance::Exact(t) if t <= *typo => {
                    if *is_prefix {
                        let len = bytes_to_highlight(word_to_highlight.text(), query_word);
-                        Some(word_to_highlight.num_chars_from_bytes(len))
+                        Some((word_to_highlight.num_chars_from_bytes(len), *id))
                    } else {
-                        Some(word_to_highlight.num_chars_from_bytes(word_to_highlight.text().len()))
+                        Some((
                            word_to_highlight.num_chars_from_bytes(word_to_highlight.text().len()),
                            *id,
                        ))
                    }
                }
                _otherwise => None,
@ -52,26 +59,37 @@ impl MatchingWords {
 }
 /// Lists all words which can be considered as a match for the query tree.
-fn fetch_queries(tree: &Operation) -> HashSet<(&str, u8, IsPrefix)> {
+fn fetch_queries(tree: &Operation) -> HashMap<(&str, u8, IsPrefix), usize> {
-    fn resolve_ops<'a>(tree: &'a Operation, out: &mut HashSet<(&'a str, u8, IsPrefix)>) {
+    fn resolve_ops<'a>(
        tree: &'a Operation,
        out: &mut HashMap<(&'a str, u8, IsPrefix), usize>,
        id: &mut usize,
    ) {
        match tree {
            Operation::Or(_, ops) | Operation::And(ops) => {
-                ops.as_slice().iter().for_each(|op| resolve_ops(op, out));
+                ops.as_slice().iter().for_each(|op| resolve_ops(op, out, id));
            }
            Operation::Query(Query { prefix, kind }) => {
                let typo = if kind.is_exact() { 0 } else { kind.typo() };
-                out.insert((kind.word(), typo, *prefix));
+                out.entry((kind.word(), typo, *prefix)).or_insert_with(|| {
                    *id += 1;
                    *id
                });
            }
            Operation::Phrase(words) => {
                for word in words {
-                    out.insert((word, 0, false));
+                    out.entry((word, 0, false)).or_insert_with(|| {
                        *id += 1;
                        *id
                    });
                }
            }
        }
    }
-    let mut queries = HashSet::new();
+    let mut queries = HashMap::new();
-    resolve_ops(tree, &mut queries);
+    let mut id = 0;
    resolve_ops(tree, &mut queries, &mut id);
    queries
 }
--- a/milli/src/search/matches/mod.rs
+++ b/milli/src/search/matches/mod.rs
@ -0,0 +1,434 @@
 use std::borrow::Cow;
 use matching_words::MatchingWords;
 use meilisearch_tokenizer::token::SeparatorKind;
 use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token};
 use crate::search::query_tree::Operation;
 pub mod matching_words;
 /// Number of words kept when cropping, unless overridden on the builder.
 const DEFAULT_CROP_SIZE: usize = 10;
 /// Marker pushed where text has been cropped out, unless overridden on the builder.
 const DEFAULT_CROP_MARKER: &str = "…";
 /// Text pushed right before a highlighted match, unless overridden on the builder.
 const DEFAULT_HIGHLIGHT_PREFIX: &str = "<em>";
 /// Text pushed right after a highlighted match, unless overridden on the builder.
 const DEFAULT_HIGHLIGHT_SUFFIX: &str = "</em>";
 /// Holds the matching words of a query together with the formatting options
 /// (crop size, crop marker, highlight prefix/suffix) that `build` hands over to
 /// every `Matcher` it creates.
 pub struct MatcherBuilder {
    /// Words extracted from the query tree, used to decide which tokens match.
    matching_words: MatchingWords,
    /// Number of words to keep when cropping.
    crop_size: usize,
    /// Custom crop marker; `None` means the default marker is used.
    crop_marker: Option<String>,
    /// Custom highlight prefix; `None` means the default prefix is used.
    highlight_prefix: Option<String>,
    /// Custom highlight suffix; `None` means the default suffix is used.
    highlight_suffix: Option<String>,
 }
 impl MatcherBuilder {
    /// Creates a builder whose matching words come from `query_tree`.
    ///
    /// The crop size starts at `DEFAULT_CROP_SIZE`; the crop marker and the
    /// highlight prefix/suffix are left unset so that `build` falls back to
    /// their defaults.
    pub fn from_query_tree(query_tree: &Operation) -> Self {
        Self {
            matching_words: MatchingWords::from_query_tree(query_tree),
            crop_size: DEFAULT_CROP_SIZE,
            crop_marker: None,
            highlight_prefix: None,
            highlight_suffix: None,
        }
    }

    /// Sets the number of words kept when cropping.
    ///
    /// NOTE(review): like the other setters, this hands back a shared reference,
    /// so another `&mut self` setter cannot be chained directly on the result.
    pub fn crop_size(&mut self, word_count: usize) -> &Self {
        self.crop_size = word_count;
        self
    }

    /// Overrides the marker pushed where the text has been cropped.
    pub fn crop_marker(&mut self, marker: String) -> &Self {
        self.crop_marker = Some(marker);
        self
    }

    /// Overrides the text pushed right before each highlighted match.
    pub fn highlight_prefix(&mut self, prefix: String) -> &Self {
        self.highlight_prefix = Some(prefix);
        self
    }

    /// Overrides the text pushed right after each highlighted match.
    pub fn highlight_suffix(&mut self, suffix: String) -> &Self {
        self.highlight_suffix = Some(suffix);
        self
    }

    /// Creates a `Matcher` over `text` and its `tokens`.
    ///
    /// Every option that was not overridden falls back to its `DEFAULT_*`
    /// constant. The returned matcher borrows the builder's matching words and
    /// markers, and starts without any computed matches.
    pub fn build<'t, 'm>(&'m self, tokens: &'t [Token], text: &'t str) -> Matcher<'t, 'm> {
        Matcher {
            text,
            tokens,
            matching_words: &self.matching_words,
            crop_size: self.crop_size,
            crop_marker: self.crop_marker.as_deref().unwrap_or(DEFAULT_CROP_MARKER),
            highlight_prefix: self.highlight_prefix.as_deref().unwrap_or(DEFAULT_HIGHLIGHT_PREFIX),
            highlight_suffix: self.highlight_suffix.as_deref().unwrap_or(DEFAULT_HIGHLIGHT_SUFFIX),
            matches: None,
        }
    }
 }
 // impl Default for MatcherBuilder {
 //     fn default() -> Self {
 //         Self {
 //             crop_size: DEFAULT_CROP_SIZE,
 //             crop_marker: None,
 //             highlight_prefix: None,
 //             highlight_suffix: None,
 //         }
 //     }
 // }
 /// A token of the text that was accepted by the matching words.
 pub struct Match<'t> {
    // the matching token itself.
    token: &'t Token<'t>,
    // length returned by `MatchingWords::matching_bytes_with_id` for this token.
    // NOTE(review): that value goes through `num_chars_from_bytes`, while
    // `MatchBounds` pairs it with a byte offset — confirm the intended unit.
    match_len: usize,
    // id of the query word that matches.
    id: usize,
    // position of the word in the whole text.
    position: usize,
 }
 /// Location of a match, as returned by `Matcher::matches`.
 pub struct MatchBounds {
    // `byte_start` of the matching token.
    start: usize,
    // `match_len` of the originating `Match`.
    length: usize,
 }
 impl<'t> From<&Match<'t>> for MatchBounds {
    fn from(m: &Match) -> Self {
        MatchBounds { start: m.token.byte_start, length: m.match_len }
    }
 }
 /// Finds the query matches of one text and formats it (highlight and/or crop).
 ///
 /// `'t` is the lifetime of the text and its tokens, `'m` the lifetime of the
 /// data borrowed from the `MatcherBuilder`.
 pub struct Matcher<'t, 'm> {
    // text to format.
    text: &'t str,
    // tokens of `text`; their byte offsets are used to slice `text`.
    tokens: &'t [Token<'t>],
    // words a token is checked against to decide whether it is a match.
    matching_words: &'m MatchingWords,
    // number of words kept when cropping.
    crop_size: usize,
    // marker pushed where the text has been cropped.
    crop_marker: &'m str,
    // text pushed right before a highlighted match.
    highlight_prefix: &'m str,
    // text pushed right after a highlighted match.
    highlight_suffix: &'m str,
    // matches found in `tokens`; `None` until they have been computed.
    matches: Option<Vec<Match<'t>>>,
 }
 impl<'t> Matcher<'t, '_> {
    /// Walks over all the tokens and stores the ones that match a query word.
    ///
    /// The word position grows by 1 for each non-separator token and by 7 for
    /// each hard separator; other separators leave it untouched. Any previously
    /// stored matches are replaced.
    fn compute_matches(&mut self) -> &mut Self {
        let mut found = Vec::new();
        let mut word_position = 0;

        for token in self.tokens {
            match token.is_separator() {
                // a word: keep it if it matches, then move on to the next position.
                None => {
                    let hit = self.matching_words.matching_bytes_with_id(token);
                    if let Some((match_len, id)) = hit {
                        found.push(Match { token, match_len, id, position: word_position });
                    }
                    word_position += 1;
                }
                Some(SeparatorKind::Hard) => word_position += 7,
                Some(_other) => {}
            }
        }

        self.matches = Some(found);
        self
    }

    /// Returns the bounds of every match, computing the matches first if that
    /// has not been done yet.
    pub fn matches(&mut self) -> Vec<MatchBounds> {
        if self.matches.is_none() {
            self.compute_matches();
        }

        self.matches.iter().flatten().map(MatchBounds::from).collect()
    }

    /// Returns the `(byte_start, byte_end)` window of the text kept when cropping.
    ///
    /// The window always starts at byte 0 and ends with the `crop_size`-th
    /// non-separator token, or at the end of the text when no such token is
    /// selected. `matches` is currently not taken into account.
    fn crop_bounds(&self, matches: &[Match<'t>]) -> (usize, usize) {
        let last_kept_word =
            self.tokens.iter().filter(|t| t.is_separator().is_none()).take(self.crop_size).last();

        let byte_end = match last_kept_word {
            Some(token) => token.byte_end,
            None => self.text.len(),
        };

        (0, byte_end)
    }

    /// Returns the text, cropped and/or with its matches highlighted.
    ///
    /// When neither option is requested the text is returned borrowed, without
    /// computing any match. A highlighted match wraps the whole matching token
    /// between the highlight prefix and suffix. A non-empty crop marker is pushed
    /// on each side of the window that does not reach the corresponding end of
    /// the text.
    pub fn format(&mut self, highlight: bool, crop: bool) -> Cow<'t, str> {
        if !(highlight || crop) {
            // compute matches is not needed if no highlight or crop is requested.
            return Cow::Borrowed(self.text);
        }

        if self.matches.is_none() {
            self.compute_matches();
        }
        let matches = self.matches.as_deref().unwrap_or_default();

        let (byte_start, byte_end) = if crop {
            self.crop_bounds(matches)
        } else {
            (0, self.text.len())
        };

        let has_marker = !self.crop_marker.is_empty();
        let mut pieces = Vec::new();

        // the window does not start with the text: signal it with the crop marker.
        if has_marker && byte_start > 0 {
            pieces.push(self.crop_marker);
        }

        // end of the part of the text that has already been pushed.
        let mut cursor = byte_start;

        if highlight {
            // only the matches starting inside of the window are highlighted.
            let visible = matches
                .iter()
                .skip_while(|m| m.token.byte_start < byte_start)
                .take_while(|m| m.token.byte_start < byte_end);

            for m in visible {
                let (start, end) = (m.token.byte_start, m.token.byte_end);
                // text between the previous match (or the window start) and this one.
                if cursor < start {
                    pieces.push(&self.text[cursor..start]);
                }
                pieces.push(self.highlight_prefix);
                pieces.push(&self.text[start..end]);
                pieces.push(self.highlight_suffix);
                cursor = end;
            }
        }

        // what remains between the last match and the end of the window.
        if cursor < byte_end {
            pieces.push(&self.text[cursor..byte_end]);
        }

        // the window does not end with the text: signal it with the crop marker.
        if has_marker && byte_end < self.text.len() {
            pieces.push(self.crop_marker);
        }

        match pieces.len() {
            // a single slice does not need any concatenation.
            1 => Cow::Borrowed(&self.text[byte_start..byte_end]),
            _ => Cow::Owned(pieces.concat()),
        }
    }
 }
 // Tests of `Matcher::format` for every combination of the `highlight` and
 // `crop` flags, on texts containing none, all or some of the query words.
 #[cfg(test)]
 mod tests {
    use super::*;
    use crate::search::query_tree::{Query, QueryKind};
    /// Builds the query tree shared by all the tests: an `Or` holding a single
    /// `And` of "split" (exact, prefix), "the" (exact, not prefix) and
    /// "world" (tolerant with 1 typo, prefix).
    fn query_tree() -> Operation {
        Operation::Or(
            false,
            vec![Operation::And(vec![
                Operation::Query(Query {
                    prefix: true,
                    kind: QueryKind::exact("split".to_string()),
                }),
                Operation::Query(Query {
                    prefix: false,
                    kind: QueryKind::exact("the".to_string()),
                }),
                Operation::Query(Query {
                    prefix: true,
                    kind: QueryKind::tolerant(1, "world".to_string()),
                }),
            ])],
        )
    }
    /// Without highlight and without crop, the text must come back unchanged.
    #[test]
    fn format_identity() {
        let query_tree = query_tree();
        let builder = MatcherBuilder::from_query_tree(&query_tree);
        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
        let highlight = false;
        let crop = false;
        // Text without any match.
        let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
        let analyzed = analyzer.analyze(&text);
        let tokens: Vec<_> = analyzed.tokens().collect();
        let mut matcher = builder.build(&tokens[..], text);
        // no crop and no highlight should return complete text.
        assert_eq!(&matcher.format(highlight, crop), &text);
        // Text containing all matches.
        let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World";
        let analyzed = analyzer.analyze(&text);
        let tokens: Vec<_> = analyzed.tokens().collect();
        let mut matcher = builder.build(&tokens[..], text);
        // no crop and no highlight should return complete text.
        assert_eq!(&matcher.format(highlight, crop), &text);
        // Text containing some matches.
        let text = "Natalie risk her future to build a world with the boy she loves.";
        let analyzed = analyzer.analyze(&text);
        let tokens: Vec<_> = analyzed.tokens().collect();
        let mut matcher = builder.build(&tokens[..], text);
        // no crop and no highlight should return complete text.
        assert_eq!(&matcher.format(highlight, crop), &text);
    }
    /// With highlight only, the complete text must come back with every match
    /// wrapped in the default `<em>`/`</em>` markers.
    #[test]
    fn format_highlight() {
        let query_tree = query_tree();
        let builder = MatcherBuilder::from_query_tree(&query_tree);
        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
        let highlight = true;
        let crop = false;
        // Text without any match.
        let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
        let analyzed = analyzer.analyze(&text);
        let tokens: Vec<_> = analyzed.tokens().collect();
        let mut matcher = builder.build(&tokens[..], text);
        // no crop should return complete text, because there are no matches.
        assert_eq!(&matcher.format(highlight, crop), &text);
        // Text containing all matches.
        let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World";
        let analyzed = analyzer.analyze(&text);
        let tokens: Vec<_> = analyzed.tokens().collect();
        let mut matcher = builder.build(&tokens[..], text);
        // no crop should return complete text with highlighted matches.
        assert_eq!(&matcher.format(highlight, crop), "Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>");
        // Text containing some matches.
        let text = "Natalie risk her future to build a world with the boy she loves.";
        let analyzed = analyzer.analyze(&text);
        let tokens: Vec<_> = analyzed.tokens().collect();
        let mut matcher = builder.build(&tokens[..], text);
        // no crop should return complete text with highlighted matches.
        assert_eq!(
            &matcher.format(highlight, crop),
            "Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves."
        );
    }
    /// With crop only, 10 words (the default crop size) are expected, with the
    /// default crop marker on each cropped side.
    ///
    /// NOTE(review): several expectations below start with the crop marker,
    /// while `crop_bounds` always returns a window starting at byte 0 and
    /// `format` only pushes a leading marker when the window starts after
    /// byte 0 — confirm whether these assertions are expected to pass yet.
    #[test]
    fn format_crop() {
        let query_tree = query_tree();
        let builder = MatcherBuilder::from_query_tree(&query_tree);
        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
        let highlight = false;
        let crop = true;
        // Text without any match.
        let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
        let analyzed = analyzer.analyze(&text);
        let tokens: Vec<_> = analyzed.tokens().collect();
        let mut matcher = builder.build(&tokens[..], text);
        // no highlight should return 10 first words with a marker at the end.
        assert_eq!(
            &matcher.format(highlight, crop),
            "A quick brown fox can not jump 32 feet, right…"
        );
        // Text containing all matches.
        let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World";
        let analyzed = analyzer.analyze(&text);
        let tokens: Vec<_> = analyzed.tokens().collect();
        let mut matcher = builder.build(&tokens[..], text);
        // no highlight should return 10 last words with a marker at the start.
        assert_eq!(
            &matcher.format(highlight, crop),
            "…she loves. Emily Henry: The Love That Split The World"
        );
        // Text containing some matches.
        let text = "Natalie risk her future to build a world with the boy she loves.";
        let analyzed = analyzer.analyze(&text);
        let tokens: Vec<_> = analyzed.tokens().collect();
        let mut matcher = builder.build(&tokens[..], text);
        // no highlight should return 10 last words with a marker at the start.
        assert_eq!(
            &matcher.format(highlight, crop),
            "…future to build a world with the boy she loves."
        );
        // Text containing a match unordered and a match ordered.
        let text = "The world split void void void void void void void void void split the world void void";
        let analyzed = analyzer.analyze(&text);
        let tokens: Vec<_> = analyzed.tokens().collect();
        let mut matcher = builder.build(&tokens[..], text);
        // crop should return 10 last words with a marker at the start.
        assert_eq!(
            &matcher.format(highlight, crop),
            "…void void void void void split the world void void"
        );
    }
    /// With both highlight and crop, the expected output is the cropped text
    /// of `format_crop` with every match inside of it highlighted.
    ///
    /// NOTE(review): same remark as for `format_crop` — the expectations that
    /// start with the crop marker require a window starting after byte 0,
    /// which `crop_bounds` never returns; confirm.
    #[test]
    fn format_highlight_crop() {
        let query_tree = query_tree();
        let builder = MatcherBuilder::from_query_tree(&query_tree);
        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
        let highlight = true;
        let crop = true;
        // Text without any match.
        let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
        let analyzed = analyzer.analyze(&text);
        let tokens: Vec<_> = analyzed.tokens().collect();
        let mut matcher = builder.build(&tokens[..], text);
        // both should return 10 first words with a marker at the end.
        assert_eq!(
            &matcher.format(highlight, crop),
            "A quick brown fox can not jump 32 feet, right…"
        );
        // Text containing all matches.
        let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World";
        let analyzed = analyzer.analyze(&text);
        let tokens: Vec<_> = analyzed.tokens().collect();
        let mut matcher = builder.build(&tokens[..], text);
        // both should return 10 last words with a marker at the start and highlighted matches.
        assert_eq!(&matcher.format(highlight, crop), "…she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>");
        // Text containing some matches.
        let text = "Natalie risk her future to build a world with the boy she loves.";
        let analyzed = analyzer.analyze(&text);
        let tokens: Vec<_> = analyzed.tokens().collect();
        let mut matcher = builder.build(&tokens[..], text);
        // both should return 10 last words with a marker at the start and highlighted matches.
        assert_eq!(
            &matcher.format(highlight, crop),
            "…future to build a <em>world</em> with <em>the</em> boy she loves."
        );
        // Text containing a match unordered and a match ordered.
        let text = "The world split void void void void void void void void void split the world void void";
        let analyzed = analyzer.analyze(&text);
        let tokens: Vec<_> = analyzed.tokens().collect();
        let mut matcher = builder.build(&tokens[..], text);
        // both should return 10 last words with a marker at the start and highlighted matches.
        assert_eq!(
            &matcher.format(highlight, crop),
            "…void void void void void <em>split</em> <em>the</em> <em>world</em> void void"
        );
    }
 }
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@ -17,7 +17,7 @@ use roaring::bitmap::RoaringBitmap;
 pub use self::facet::{FacetDistribution, FacetNumberIter, Filter};
 use self::fst_utils::{Complement, Intersection, StartsWith, Union};
-pub use self::matching_words::MatchingWords;
+pub use self::matches::matching_words::MatchingWords;
 use self::query_tree::QueryTreeBuilder;
 use crate::error::UserError;
 use crate::search::criteria::r#final::{Final, FinalResult};
@ -32,7 +32,7 @@ mod criteria;
 mod distinct;
 mod facet;
 mod fst_utils;
-mod matching_words;
+mod matches;
 mod query_tree;
 pub struct Search<'a> {