Fix match count

2024-11-27 04:25:06 +08:00 · 2022-04-04 18:56:59 +02:00 · 2022-04-04 18:56:59 +02:00 · 3bb1e35ada
commit 3bb1e35ada
parent 56e0edd621
4 changed files with 469 additions and 231 deletions
--- a/milli/src/search/matches/matching_words.rs
+++ b/milli/src/search/matches/matching_words.rs
@ -1,12 +1,12 @@
 use std::cmp::{min, Reverse};
-use std::collections::{BTreeMap, HashMap};
+use std::collections::BTreeMap;
+use std::fmt;
 use std::ops::{Index, IndexMut};

 use levenshtein_automata::{Distance, DFA};
 use meilisearch_tokenizer::Token;

 use crate::search::build_dfa;
-use crate::search::query_tree::{Operation, Query};

 type IsPrefix = bool;

@ -14,83 +14,129 @@ type IsPrefix = bool;
 /// referencing words that match the given query tree.
 #[derive(Default)]
 pub struct MatchingWords {
-    dfas: Vec<(DFA, String, u8, IsPrefix, usize)>,
+    inner: Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>,
 }

 impl MatchingWords {
-    pub fn from_query_tree(tree: &Operation) -> Self {
-        // fetch matchable words from the query tree
-        let mut dfas: Vec<_> = fetch_queries(tree)
-            .into_iter()
-            // create DFAs for each word
-            .map(|((w, t, p), id)| (build_dfa(w, t, p), w.to_string(), t, p, id))
-            .collect();
-        // Sort word by len in DESC order prioritizing the longuest word,
+    pub fn new(mut matching_words: Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>) -> Self {
+        // Sort word by len in DESC order prioritizing the longuest matches,
        // in order to highlight the longuest part of the matched word.
-        dfas.sort_unstable_by_key(|(_dfa, query_word, _typo, _is_prefix, _id)| {
-            Reverse(query_word.len())
-        });
-        Self { dfas }
+        matching_words.sort_unstable_by_key(|(mw, _)| Reverse((mw.len(), mw[0].word.len())));
+
+        Self { inner: matching_words }
    }

-    /// Returns the number of matching bytes if the word matches one of the query words.
-    pub fn matching_bytes(&self, word_to_highlight: &Token) -> Option<usize> {
-        self.matching_bytes_with_id(word_to_highlight).map(|(len, _)| len)
+    pub fn match_token<'a, 'b>(&'a self, token: &'b Token<'b>) -> MatchesIter<'a, 'b> {
+        MatchesIter { inner: Box::new(self.inner.iter()), token }
+    }
 }

-    pub fn matching_bytes_with_id(&self, word_to_highlight: &Token) -> Option<(usize, usize)> {
-        self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix, id)| {
-            match dfa.eval(word_to_highlight.text()) {
-                Distance::Exact(t) if t <= *typo => {
-                    if *is_prefix {
-                        let len = bytes_to_highlight(word_to_highlight.text(), query_word);
-                        Some((word_to_highlight.num_chars_from_bytes(len), *id))
+pub struct MatchesIter<'a, 'b> {
+    inner: Box<dyn Iterator<Item = &'a (Vec<MatchingWord>, Vec<PrimitiveWordId>)> + 'a>,
+    token: &'b Token<'b>,
+}
+
+impl<'a> Iterator for MatchesIter<'a, '_> {
+    type Item = MatchType<'a>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        match self.inner.next() {
+            Some((matching_words, ids)) => match matching_words[0].match_token(&self.token) {
+                Some(char_len) => {
+                    if matching_words.len() > 1 {
+                        Some(MatchType::Partial(PartialMatch {
+                            matching_words: &matching_words[1..],
+                            ids,
+                            char_len,
+                        }))
                    } else {
-                        Some((
-                            word_to_highlight.num_chars_from_bytes(word_to_highlight.text().len()),
-                            *id,
-                        ))
+                        Some(MatchType::Full { char_len, ids })
+                    }
+                }
+                None => self.next(),
+            },
+            None => None,
+        }
+    }
+}
+
+pub type PrimitiveWordId = u8;
+pub struct MatchingWord {
+    pub dfa: DFA,
+    pub word: String,
+    pub typo: u8,
+    pub prefix: IsPrefix,
+}
+
+impl fmt::Debug for MatchingWord {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        f.debug_struct("MatchingWord")
+            .field("word", &self.word)
+            .field("typo", &self.typo)
+            .field("prefix", &self.prefix)
+            .finish()
+    }
+}
+
+impl PartialEq for MatchingWord {
+    fn eq(&self, other: &Self) -> bool {
+        self.prefix == other.prefix && self.typo == other.typo && self.word == other.word
+    }
+}
+
+impl MatchingWord {
+    pub fn new(word: String, typo: u8, prefix: IsPrefix) -> Self {
+        let dfa = build_dfa(&word, typo, prefix);
+
+        Self { dfa, word, typo, prefix }
+    }
+
+    pub fn match_token(&self, token: &Token) -> Option<usize> {
+        match self.dfa.eval(token.text()) {
+            Distance::Exact(t) if t <= self.typo => {
+                if self.prefix {
+                    let len = bytes_to_highlight(token.text(), &self.word);
+                    Some(token.num_chars_from_bytes(len))
+                } else {
+                    Some(token.num_chars_from_bytes(token.text().len()))
                }
            }
            _otherwise => None,
        }
+    }
+}
+
+#[derive(Debug, PartialEq)]
+pub enum MatchType<'a> {
+    Full { char_len: usize, ids: &'a [PrimitiveWordId] },
+    Partial(PartialMatch<'a>),
+}
+
+#[derive(Debug, PartialEq)]
+pub struct PartialMatch<'a> {
+    matching_words: &'a [MatchingWord],
+    ids: &'a [PrimitiveWordId],
+    char_len: usize,
+}
+
+impl<'a> PartialMatch<'a> {
+    pub fn match_token(self, token: &Token) -> Option<MatchType<'a>> {
+        self.matching_words[0].match_token(token).map(|char_len| {
+            if self.matching_words.len() > 1 {
+                MatchType::Partial(PartialMatch {
+                    matching_words: &self.matching_words[1..],
+                    ids: self.ids,
+                    char_len,
+                })
+            } else {
+                MatchType::Full { char_len, ids: self.ids }
+            }
        })
    }
-}

-/// Lists all words which can be considered as a match for the query tree.
-fn fetch_queries(tree: &Operation) -> HashMap<(&str, u8, IsPrefix), usize> {
-    fn resolve_ops<'a>(
-        tree: &'a Operation,
-        out: &mut HashMap<(&'a str, u8, IsPrefix), usize>,
-        id: &mut usize,
-    ) {
-        match tree {
-            Operation::Or(_, ops) | Operation::And(ops) => {
-                ops.as_slice().iter().for_each(|op| resolve_ops(op, out, id));
+    pub fn char_len(&self) -> usize {
+        self.char_len
    }
-            Operation::Query(Query { prefix, kind }) => {
-                let typo = if kind.is_exact() { 0 } else { kind.typo() };
-                out.entry((kind.word(), typo, *prefix)).or_insert_with(|| {
-                    *id += 1;
-                    *id
-                });
-            }
-            Operation::Phrase(words) => {
-                for word in words {
-                    out.entry((word, 0, false)).or_insert_with(|| {
-                        *id += 1;
-                        *id
-                    });
-                }
-            }
-        }
-    }
-
-    let mut queries = HashMap::new();
-    let mut id = 0;
-    resolve_ops(tree, &mut queries, &mut id);
-    queries
 }

 // A simple wrapper around vec so we can get contiguous but index it like it's 2D array.
@ -203,7 +249,6 @@ mod tests {
    use meilisearch_tokenizer::TokenKind;

    use super::*;
-    use crate::search::query_tree::{Operation, Query, QueryKind};
    use crate::MatchingWords;

    #[test]
@ -271,102 +316,104 @@ mod tests {

    #[test]
    fn matching_words() {
-        let query_tree = Operation::Or(
-            false,
-            vec![Operation::And(vec![
-                Operation::Query(Query {
-                    prefix: true,
-                    kind: QueryKind::exact("split".to_string()),
-                }),
-                Operation::Query(Query {
-                    prefix: false,
-                    kind: QueryKind::exact("this".to_string()),
-                }),
-                Operation::Query(Query {
-                    prefix: true,
-                    kind: QueryKind::tolerant(1, "world".to_string()),
-                }),
-            ])],
-        );
+        let matching_words = vec![
+            (vec![MatchingWord::new("split".to_string(), 1, true)], vec![0]),
+            (vec![MatchingWord::new("this".to_string(), 0, false)], vec![1]),
+            (vec![MatchingWord::new("world".to_string(), 1, true)], vec![2]),
+        ];

-        let matching_words = MatchingWords::from_query_tree(&query_tree);
+        let matching_words = MatchingWords::new(matching_words);

        assert_eq!(
-            matching_words.matching_bytes(&Token {
+            matching_words
+                .match_token(&Token {
                    kind: TokenKind::Word,
                    word: Cow::Borrowed("word"),
                    byte_start: 0,
                    char_index: 0,
                    byte_end: "word".len(),
                    char_map: None,
-            }),
-            Some(3)
+                })
+                .next(),
+            Some(MatchType::Full { char_len: 3, ids: &[2] })
        );
        assert_eq!(
-            matching_words.matching_bytes(&Token {
+            matching_words
+                .match_token(&Token {
                    kind: TokenKind::Word,
                    word: Cow::Borrowed("nyc"),
                    byte_start: 0,
                    char_index: 0,
                    byte_end: "nyc".len(),
                    char_map: None,
-            }),
+                })
+                .next(),
            None
        );
        assert_eq!(
-            matching_words.matching_bytes(&Token {
+            matching_words
+                .match_token(&Token {
                    kind: TokenKind::Word,
                    word: Cow::Borrowed("world"),
                    byte_start: 0,
                    char_index: 0,
                    byte_end: "world".len(),
                    char_map: None,
-            }),
-            Some(5)
+                })
+                .next(),
+            Some(MatchType::Full { char_len: 5, ids: &[2] })
        );
        assert_eq!(
-            matching_words.matching_bytes(&Token {
+            matching_words
+                .match_token(&Token {
                    kind: TokenKind::Word,
                    word: Cow::Borrowed("splitted"),
                    byte_start: 0,
                    char_index: 0,
                    byte_end: "splitted".len(),
                    char_map: None,
-            }),
-            Some(5)
+                })
+                .next(),
+            Some(MatchType::Full { char_len: 5, ids: &[0] })
        );
        assert_eq!(
-            matching_words.matching_bytes(&Token {
+            matching_words
+                .match_token(&Token {
                    kind: TokenKind::Word,
                    word: Cow::Borrowed("thisnew"),
                    byte_start: 0,
                    char_index: 0,
                    byte_end: "thisnew".len(),
                    char_map: None,
-            }),
+                })
+                .next(),
            None
        );
        assert_eq!(
-            matching_words.matching_bytes(&Token {
+            matching_words
+                .match_token(&Token {
                    kind: TokenKind::Word,
                    word: Cow::Borrowed("borld"),
                    byte_start: 0,
                    char_index: 0,
                    byte_end: "borld".len(),
                    char_map: None,
-            }),
-            Some(5)
+                })
+                .next(),
+            Some(MatchType::Full { char_len: 5, ids: &[2] })
        );
        assert_eq!(
-            matching_words.matching_bytes(&Token {
+            matching_words
+                .match_token(&Token {
                    kind: TokenKind::Word,
                    word: Cow::Borrowed("wordsplit"),
                    byte_start: 0,
                    char_index: 0,
                    byte_end: "wordsplit".len(),
                    char_map: None,
-            }),
-            Some(4)
+                })
+                .next(),
+            Some(MatchType::Full { char_len: 4, ids: &[2] })
        );
    }
 }
--- a/milli/src/search/matches/mod.rs
+++ b/milli/src/search/matches/mod.rs
@ -1,11 +1,10 @@
 use std::borrow::Cow;

 pub use matching_words::MatchingWords;
+use matching_words::{MatchType, PrimitiveWordId};
 use meilisearch_tokenizer::token::{SeparatorKind, Token};

-use crate::search::query_tree::Operation;
-
-mod matching_words;
+pub mod matching_words;

 const DEFAULT_CROP_SIZE: usize = 10;
 const DEFAULT_CROP_MARKER: &'static str = "…";
@ -21,18 +20,6 @@ pub struct MatcherBuilder {
 }

 impl MatcherBuilder {
-    pub fn from_query_tree(query_tree: &Operation) -> Self {
-        let matching_words = MatchingWords::from_query_tree(query_tree);
-
-        Self {
-            matching_words,
-            crop_size: DEFAULT_CROP_SIZE,
-            crop_marker: None,
-            highlight_prefix: None,
-            highlight_suffix: None,
-        }
-    }
-
    pub fn from_matching_words(matching_words: MatchingWords) -> Self {
        Self {
            matching_words,
@ -93,8 +80,8 @@ impl MatcherBuilder {
 #[derive(Clone, Debug)]
 pub struct Match {
    match_len: usize,
-    // id of the query word that matches.
-    id: usize,
+    // ids of the query words that match.
+    ids: Vec<PrimitiveWordId>,
    // position of the word in the whole text.
    word_position: usize,
    // position of the token in the whole text.
@ -123,10 +110,72 @@ impl<'t> Matcher<'t, '_> {
        let mut matches = Vec::new();
        let mut word_position = 0;
        let mut token_position = 0;
-        for token in self.tokens {
+        while let Some(token) = self.tokens.get(token_position) {
            if token.is_separator().is_none() {
-                if let Some((match_len, id)) = self.matching_words.matching_bytes_with_id(&token) {
-                    matches.push(Match { match_len, id, word_position, token_position });
+                'matches: for match_type in self.matching_words.match_token(&token) {
+                    match match_type {
+                        MatchType::Full { char_len, ids } => {
+                            matches.push(Match {
+                                match_len: char_len,
+                                ids: ids.to_vec(),
+                                word_position,
+                                token_position,
+                            });
+                            // stop on the first match
+                            break;
+                        }
+                        MatchType::Partial(mut partial) => {
+                            let mut potential_matches =
+                                vec![(token_position, word_position, partial.char_len())];
+                            let mut t_position = 1;
+                            let mut w_position = 1;
+                            'partials: for token in &self.tokens[token_position + 1..] {
+                                if token.is_separator().is_none() {
+                                    partial = match partial.match_token(&token) {
+                                        Some(MatchType::Partial(partial)) => {
+                                            potential_matches.push((
+                                                token_position + t_position,
+                                                word_position + w_position,
+                                                partial.char_len(),
+                                            ));
+                                            partial
+                                        }
+                                        // partial match is now full, we keep these matches and we advance positions
+                                        Some(MatchType::Full { char_len, ids }) => {
+                                            let iter = potential_matches.into_iter().map(
+                                                |(token_position, word_position, match_len)| {
+                                                    Match {
+                                                        match_len,
+                                                        ids: ids.to_vec(),
+                                                        word_position,
+                                                        token_position,
+                                                    }
+                                                },
+                                            );
+
+                                            matches.extend(iter);
+
+                                            word_position += w_position;
+                                            token_position += t_position;
+
+                                            matches.push(Match {
+                                                match_len: char_len,
+                                                ids: ids.to_vec(),
+                                                word_position,
+                                                token_position,
+                                            });
+
+                                            break 'matches;
+                                        }
+                                        // no match, continue to next match.
+                                        None => break 'partials,
+                                    };
+                                    w_position += 1;
+                                }
+                                t_position += 1;
+                            }
+                        }
+                    }
                }
                word_position += 1;
            }
@ -229,7 +278,7 @@ impl<'t> Matcher<'t, '_> {
    }

    fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) {
-        let mut ids = Vec::with_capacity(matches.len());
+        let mut ids: Vec<PrimitiveWordId> = Vec::with_capacity(matches.len());
        let mut order_score = 0;
        let mut distance_score = 0;

@ -237,7 +286,7 @@ impl<'t> Matcher<'t, '_> {
        while let Some(m) = iter.next() {
            if let Some(next_match) = iter.peek() {
                // if matches are ordered
-                if next_match.id > m.id {
+                if next_match.ids.iter().min() > m.ids.iter().min() {
                    order_score += 1;
                }

@ -245,7 +294,7 @@ impl<'t> Matcher<'t, '_> {
                distance_score -= (next_match.word_position - m.word_position).min(7) as i16;
            }

-            ids.push(m.id);
+            ids.extend(m.ids.iter());
        }

        ids.sort_unstable();
@ -348,7 +397,8 @@ impl<'t> Matcher<'t, '_> {
                                .char_indices()
                                .enumerate()
                                .find(|(i, _)| *i == m.match_len)
-                                .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start);
+                                .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start)
+                                .min(token.byte_end);
                            formatted.push(self.highlight_prefix);
                            formatted.push(&self.text[token.byte_start..highlight_byte_index]);
                            formatted.push(self.highlight_suffix);
@ -386,33 +436,23 @@ mod tests {
    use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};

    use super::*;
-    use crate::search::query_tree::{Query, QueryKind};
+    use crate::search::matches::matching_words::MatchingWord;

-    fn query_tree() -> Operation {
-        Operation::Or(
-            false,
-            vec![Operation::And(vec![
-                Operation::Query(Query {
-                    prefix: true,
-                    kind: QueryKind::exact("split".to_string()),
-                }),
-                Operation::Query(Query {
-                    prefix: false,
-                    kind: QueryKind::exact("the".to_string()),
-                }),
-                Operation::Query(Query {
-                    prefix: true,
-                    kind: QueryKind::tolerant(1, "world".to_string()),
-                }),
-            ])],
-        )
+    fn matching_words() -> MatchingWords {
+        let matching_words = vec![
+            (vec![MatchingWord::new("split".to_string(), 0, false)], vec![0]),
+            (vec![MatchingWord::new("the".to_string(), 0, false)], vec![1]),
+            (vec![MatchingWord::new("world".to_string(), 1, true)], vec![2]),
+        ];
+
+        MatchingWords::new(matching_words)
    }

    #[test]
    fn format_identity() {
-        let query_tree = query_tree();
+        let matching_words = matching_words();

-        let builder = MatcherBuilder::from_query_tree(&query_tree);
+        let builder = MatcherBuilder::from_matching_words(matching_words);
        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());

        let highlight = false;
@ -445,9 +485,9 @@ mod tests {

    #[test]
    fn format_highlight() {
-        let query_tree = query_tree();
+        let matching_words = matching_words();

-        let builder = MatcherBuilder::from_query_tree(&query_tree);
+        let builder = MatcherBuilder::from_matching_words(matching_words);
        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());

        let highlight = true;
@ -497,21 +537,14 @@ mod tests {

    #[test]
    fn highlight_unicode() {
-        let query_tree = Operation::Or(
-            false,
-            vec![Operation::And(vec![
-                Operation::Query(Query {
-                    prefix: true,
-                    kind: QueryKind::tolerant(1, "wessfalia".to_string()),
-                }),
-                Operation::Query(Query {
-                    prefix: true,
-                    kind: QueryKind::tolerant(1, "world".to_string()),
-                }),
-            ])],
-        );
+        let matching_words = vec![
+            (vec![MatchingWord::new("wessfali".to_string(), 1, true)], vec![0]),
+            (vec![MatchingWord::new("world".to_string(), 1, true)], vec![1]),
+        ];

-        let builder = MatcherBuilder::from_query_tree(&query_tree);
+        let matching_words = MatchingWords::new(matching_words);
+
+        let builder = MatcherBuilder::from_matching_words(matching_words);
        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());

        let highlight = true;
@ -539,14 +572,14 @@ mod tests {
        let tokens: Vec<_> = analyzed.tokens().collect();
        let mut matcher = builder.build(&tokens[..], text);
        // no crop should return complete text with highlighted matches.
-        assert_eq!(&matcher.format(highlight, crop), "<em>Westfália</em>");
+        assert_eq!(&matcher.format(highlight, crop), "<em>Westfáli</em>a");
    }

    #[test]
    fn format_crop() {
-        let query_tree = query_tree();
+        let matching_words = matching_words();

-        let builder = MatcherBuilder::from_query_tree(&query_tree);
+        let builder = MatcherBuilder::from_matching_words(matching_words);
        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());

        let highlight = false;
@ -657,9 +690,9 @@ mod tests {

    #[test]
    fn format_highlight_crop() {
-        let query_tree = query_tree();
+        let matching_words = matching_words();

-        let builder = MatcherBuilder::from_query_tree(&query_tree);
+        let builder = MatcherBuilder::from_matching_words(matching_words);
        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());

        let highlight = true;
@ -724,9 +757,9 @@ mod tests {
    #[test]
    fn smaller_crop_size() {
        //! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
-        let query_tree = query_tree();
+        let matching_words = matching_words();

-        let mut builder = MatcherBuilder::from_query_tree(&query_tree);
+        let mut builder = MatcherBuilder::from_matching_words(matching_words);
        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());

        let highlight = false;
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@ -114,7 +114,7 @@ impl<'a> Search<'a> {
    pub fn execute(&self) -> Result<SearchResult> {
        // We create the query tree by spliting the query into tokens.
        let before = Instant::now();
-        let (query_tree, primitive_query) = match self.query.as_ref() {
+        let (query_tree, primitive_query, matching_words) = match self.query.as_ref() {
            Some(query) => {
                let mut builder = QueryTreeBuilder::new(self.rtxn, self.index);
                builder.optional_words(self.optional_words);
@ -132,9 +132,11 @@ impl<'a> Search<'a> {
                let analyzer = Analyzer::new(config);
                let result = analyzer.analyze(query);
                let tokens = result.tokens();
-                builder.build(tokens)?.map_or((None, None), |(qt, pq)| (Some(qt), Some(pq)))
+                builder
+                    .build(tokens)?
+                    .map_or((None, None, None), |(qt, pq, mw)| (Some(qt), Some(pq), Some(mw)))
            }
-            None => (None, None),
+            None => (None, None, None),
        };

        debug!("query tree: {:?} took {:.02?}", query_tree, before.elapsed());
@ -148,11 +150,6 @@ impl<'a> Search<'a> {

        debug!("facet candidates: {:?} took {:.02?}", filtered_candidates, before.elapsed());

-        let matching_words = match query_tree.as_ref() {
-            Some(query_tree) => MatchingWords::from_query_tree(&query_tree),
-            None => MatchingWords::default(),
-        };
-
        // We check that we are allowed to use the sort criteria, we check
        // that they are declared in the sortable fields.
        if let Some(sort_criteria) = &self.sort_criteria {
@ -193,13 +190,13 @@ impl<'a> Search<'a> {
        )?;

        match self.index.distinct_field(self.rtxn)? {
-            None => self.perform_sort(NoopDistinct, matching_words, criteria),
+            None => self.perform_sort(NoopDistinct, matching_words.unwrap_or_default(), criteria),
            Some(name) => {
                let field_ids_map = self.index.fields_ids_map(self.rtxn)?;
                match field_ids_map.id(name) {
                    Some(fid) => {
                        let distinct = FacetDistinct::new(fid, self.index, self.rtxn);
-                        self.perform_sort(distinct, matching_words, criteria)
+                        self.perform_sort(distinct, matching_words.unwrap_or_default(), criteria)
                    }
                    None => Ok(SearchResult::default()),
                }
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@ -8,7 +8,8 @@ use meilisearch_tokenizer::TokenKind;
 use roaring::RoaringBitmap;
 use slice_group_by::GroupBy;

-use crate::{Index, Result};
+use crate::search::matches::matching_words::{MatchingWord, PrimitiveWordId};
+use crate::{Index, MatchingWords, Result};

 type IsOptionalWord = bool;
 type IsPrefix = bool;
@ -233,7 +234,10 @@ impl<'a> QueryTreeBuilder<'a> {
    /// - if `authorize_typos` is set to `false` the query tree will be generated
    ///   forcing all query words to match documents without any typo
    ///   (the criterion `typo` will be ignored)
-    pub fn build(&self, query: TokenStream) -> Result<Option<(Operation, PrimitiveQuery)>> {
+    pub fn build(
+        &self,
+        query: TokenStream,
+    ) -> Result<Option<(Operation, PrimitiveQuery, MatchingWords)>> {
        let stop_words = self.index.stop_words(self.rtxn)?;
        let primitive_query = create_primitive_query(query, stop_words, self.words_limit);
        if !primitive_query.is_empty() {
@ -243,7 +247,9 @@ impl<'a> QueryTreeBuilder<'a> {
                self.authorize_typos,
                &primitive_query,
            )?;
-            Ok(Some((qt, primitive_query)))
+            let matching_words =
+                create_matching_words(self, self.authorize_typos, &primitive_query)?;
+            Ok(Some((qt, primitive_query, matching_words)))
        } else {
            Ok(None)
        }
@ -251,7 +257,7 @@ impl<'a> QueryTreeBuilder<'a> {
 }

 /// Split the word depending on the frequency of subwords in the database documents.
-fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result<Option<Operation>> {
+fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result<Option<(String, String)>> {
    let chars = word.char_indices().skip(1);
    let mut best = None;

@ -267,7 +273,7 @@ fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result<Option<O
        }
    }

-    Ok(best.map(|(_, left, right)| Operation::Phrase(vec![left.to_string(), right.to_string()])))
+    Ok(best.map(|(_, left, right)| (left.to_string(), right.to_string())))
 }

 #[derive(Clone)]
@ -336,8 +342,8 @@ fn create_query_tree(
            // 4. wrap all in an OR operation
            PrimitiveQueryPart::Word(word, prefix) => {
                let mut children = synonyms(ctx, &[&word])?.unwrap_or_default();
-                if let Some(child) = split_best_frequency(ctx, &word)? {
-                    children.push(child);
+                if let Some((left, right)) = split_best_frequency(ctx, &word)? {
+                    children.push(Operation::Phrase(vec![left, right]));
                }
                let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
                let exact_words = ctx.exact_words()?;
@ -464,6 +470,154 @@ fn create_query_tree(
    }
 }

+/// Builds the [`MatchingWords`] used for crop and highlight from the primitive query.
+///
+/// Every pushed entry pairs the consecutive words that must be found in a document with
+/// the ids of the query words they stand for. Ids are assigned in query order starting
+/// from 0, and each word of a phrase consumes one id.
+fn create_matching_words(
+    ctx: &impl Context,
+    authorize_typos: bool,
+    query: &[PrimitiveQueryPart],
+) -> Result<MatchingWords> {
+    /// Matches on the `PrimitiveQueryPart` and creates the matching words derived from it.
+    ///
+    /// `id` is the id of the first query word covered by `part`.
+    fn resolve_primitive_part(
+        ctx: &impl Context,
+        authorize_typos: bool,
+        part: PrimitiveQueryPart,
+        matching_words: &mut Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>,
+        id: PrimitiveWordId,
+    ) -> Result<()> {
+        match part {
+            // 1. fetch the synonyms of the word
+            // 2. try to split the word in 2
+            // 3. add the word itself, with the typo budget computed by `typos`
+            PrimitiveQueryPart::Word(word, prefix) => {
+                if let Some(synonyms) = ctx.synonyms(&[word.as_str()])? {
+                    for synonym in synonyms {
+                        let synonym = synonym
+                            .into_iter()
+                            .map(|syn| MatchingWord::new(syn.to_string(), 0, false))
+                            .collect();
+                        matching_words.push((synonym, vec![id]));
+                    }
+                }
+
+                if let Some((left, right)) = split_best_frequency(ctx, &word)? {
+                    let left = MatchingWord::new(left, 0, false);
+                    let right = MatchingWord::new(right, 0, false);
+                    matching_words.push((vec![left, right], vec![id]));
+                }
+
+                let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
+                let exact_words = ctx.exact_words()?;
+                let config =
+                    TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words };
+
+                let matching_word = match typos(word, authorize_typos, config) {
+                    QueryKind::Exact { word, .. } => MatchingWord::new(word, 0, prefix),
+                    QueryKind::Tolerant { typo, word } => MatchingWord::new(word, typo, prefix),
+                };
+                matching_words.push((vec![matching_word], vec![id]));
+            }
+            // create a single entry of CONSECUTIVE matching words wrapping all the words of
+            // the phrase (no typo, no prefix), one id per word
+            PrimitiveQueryPart::Phrase(words) => {
+                let ids: Vec<_> = (0..words.len()).map(|i| id + i as PrimitiveWordId).collect();
+                let words =
+                    words.into_iter().map(|w| MatchingWord::new(w.to_string(), 0, false)).collect();
+                matching_words.push((words, ids));
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Creates the matching words of all the ngrams 1..=3 of the query.
+    ///
+    /// The query is cut into groups of consecutive parts in which a phrase always stands
+    /// alone. For each group, the ngrams starting at its first part are resolved, then the
+    /// function recurses on the parts following each ngram. `id` is the id of the first
+    /// word of `query`.
+    fn ngrams(
+        ctx: &impl Context,
+        authorize_typos: bool,
+        query: &[PrimitiveQueryPart],
+        matching_words: &mut Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>,
+        mut id: PrimitiveWordId,
+    ) -> Result<()> {
+        const MAX_NGRAM: usize = 3;
+
+        for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase())) {
+            for ngram in 1..=MAX_NGRAM.min(sub_query.len()) {
+                if let Some(group) = sub_query.get(..ngram) {
+                    let tail = &sub_query[ngram..];
+                    let is_last = tail.is_empty();
+
+                    match group {
+                        [part] => {
+                            resolve_primitive_part(
+                                ctx,
+                                authorize_typos,
+                                part.clone(),
+                                matching_words,
+                                id,
+                            )?;
+                        }
+                        words => {
+                            // only the last word of the ngram can make it a prefix
+                            let is_prefix = words.last().map_or(false, |part| part.is_prefix());
+                            let words: Vec<_> = words
+                                .iter()
+                                .filter_map(|part| {
+                                    if let PrimitiveQueryPart::Word(word, _) = part {
+                                        Some(word.as_str())
+                                    } else {
+                                        None
+                                    }
+                                })
+                                .collect();
+                            let ids: Vec<_> =
+                                (0..words.len()).map(|i| id + i as PrimitiveWordId).collect();
+
+                            if let Some(synonyms) = ctx.synonyms(&words)? {
+                                for synonym in synonyms {
+                                    let synonym = synonym
+                                        .into_iter()
+                                        .map(|syn| MatchingWord::new(syn.to_string(), 0, false))
+                                        .collect();
+                                    matching_words.push((synonym, ids.clone()));
+                                }
+                            }
+                            // the ngram itself is matched as the concatenation of its words,
+                            // with `max_typos` lowered to 1
+                            let word = words.concat();
+                            let (word_len_one_typo, word_len_two_typo) =
+                                ctx.min_word_len_for_typo()?;
+                            let exact_words = ctx.exact_words()?;
+                            let config = TypoConfig {
+                                max_typos: 1,
+                                word_len_one_typo,
+                                word_len_two_typo,
+                                exact_words,
+                            };
+                            let matching_word = match typos(word, authorize_typos, config) {
+                                QueryKind::Exact { word, .. } => {
+                                    MatchingWord::new(word, 0, is_prefix)
+                                }
+                                QueryKind::Tolerant { typo, word } => {
+                                    MatchingWord::new(word, typo, is_prefix)
+                                }
+                            };
+                            matching_words.push((vec![matching_word], ids));
+                        }
+                    }
+
+                    if !is_last {
+                        // The tail starts `ngram` words after `id`: a group followed by a
+                        // non-empty tail only contains single words (a phrase is always
+                        // alone in its group), so each part of the ngram consumed one id.
+                        let tail_id = id + ngram as PrimitiveWordId;
+                        ngrams(ctx, authorize_typos, tail, matching_words, tail_id)?;
+                    }
+                }
+            }
+            // skip the ids consumed by this group before handling the next one
+            id += sub_query.iter().map(|x| x.len() as PrimitiveWordId).sum::<PrimitiveWordId>();
+        }
+
+        Ok(())
+    }
+
+    let mut matching_words = Vec::new();
+    ngrams(ctx, authorize_typos, query, &mut matching_words, 0)?;
+    Ok(MatchingWords::new(matching_words))
+}
+
 pub type PrimitiveQuery = Vec<PrimitiveQueryPart>;

 #[derive(Debug, Clone)]
@ -480,6 +634,13 @@ impl PrimitiveQueryPart {
    fn is_prefix(&self) -> bool {
        matches!(self, Self::Word(_, is_prefix) if *is_prefix)
    }
+
+    /// Returns the number of words held by this part: a single word counts for one,
+    /// a phrase counts for each of its words.
+    fn len(&self) -> usize {
+        match self {
+            Self::Word(..) => 1,
+            Self::Phrase(phrase) => phrase.len(),
+        }
+    }
 }

 /// Create primitive query from tokenized query string,