meilisearch/milli/src/search/matching_words.rs

use std::cmp::{min, Reverse};
use std::collections::{BTreeMap, HashSet};
use std::ops::{Index, IndexMut};

use levenshtein_automata::{Distance, DFA};
use meilisearch_tokenizer::Token;

use super::build_dfa;
use crate::search::query_tree::{Operation, Query};

/// Flag stored next to each query word. `fetch_queries` copies it from `Query::prefix`
/// (and uses `false` for phrase words); when it is set, `MatchingWords::matching_bytes`
/// reports only the part computed by `bytes_to_highlight` instead of the whole token.
type IsPrefix = bool;

/// Structure created from a query tree
/// referencing words that match the given query tree.
#[derive(Default)]
pub struct MatchingWords {
    /// One entry per matchable query word: the DFA built for it, the word itself,
    /// the number of typos accepted and whether the word is a prefix.
    /// `from_query_tree` stores the entries longest word first.
    dfas: Vec<(DFA, String, u8, IsPrefix)>,
}

impl MatchingWords {
    /// Builds the matcher from every word referenced by `tree`.
    ///
    /// Words are ordered by descending byte length so that the longest query word
    /// is tried first and the longest part of a matched word gets highlighted.
    pub fn from_query_tree(tree: &Operation) -> Self {
        // Fetch the matchable words from the query tree.
        let mut queries: Vec<_> = fetch_queries(tree).into_iter().collect();

        // The words come out of a `HashSet`, whose iteration order is arbitrary.
        // Sorting on the length alone would leave words of equal length in a random
        // order, and `matching_bytes` returns the first entry that matches. The
        // remaining tuple fields are used as tie-breakers: the triples are unique,
        // so the resulting order is total and identical on every run.
        queries.sort_unstable_by_key(|&(word, typo, is_prefix)| {
            (Reverse(word.len()), word, typo, is_prefix)
        });

        // Create the DFA of each word, keeping the sorted order.
        let dfas = queries
            .into_iter()
            .map(|(word, typo, is_prefix)| {
                (build_dfa(word, typo, is_prefix), word.to_string(), typo, is_prefix)
            })
            .collect();

        Self { dfas }
    }

    /// Returns how much of `word_to_highlight` matches, or `None` if the token matches
    /// none of the query words.
    ///
    /// The first query word (longest first) whose DFA accepts the token within its typo
    /// budget decides the result: a prefix word contributes the byte length computed by
    /// `bytes_to_highlight`, any other word contributes the whole token. That byte length
    /// is then passed through `Token::num_chars_from_bytes`, so despite the method name
    /// the returned value is the one produced by that conversion, not a raw byte count.
    pub fn matching_bytes(&self, word_to_highlight: &Token) -> Option<usize> {
        let text = word_to_highlight.text();

        self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix)| match dfa.eval(text) {
            Distance::Exact(t) if t <= *typo => {
                let len = if *is_prefix { bytes_to_highlight(text, query_word) } else { text.len() };
                Some(word_to_highlight.num_chars_from_bytes(len))
            }
            _otherwise => None,
        })
    }
}

/// Collects every `(word, typo, is_prefix)` triple that counts as a match for the query tree.
///
/// `Or` and `And` nodes are only containers and contribute their children. A `Query`
/// node contributes its word with a typo count of `0` when the kind is exact and
/// `kind.typo()` otherwise. Each word of a `Phrase` is added as an exact, non-prefix word.
fn fetch_queries(tree: &Operation) -> HashSet<(&str, u8, IsPrefix)> {
    let mut queries: HashSet<(&str, u8, IsPrefix)> = HashSet::new();
    // Work list of nodes still to visit; the result is a set, so visiting order is irrelevant.
    let mut pending = vec![tree];

    while let Some(operation) = pending.pop() {
        match operation {
            Operation::Or(_, children) | Operation::And(children) => {
                pending.extend(children.iter());
            }
            Operation::Query(Query { prefix, kind }) => {
                let typo = if kind.is_exact() { 0 } else { kind.typo() };
                queries.insert((kind.word(), typo, *prefix));
            }
            Operation::Phrase(words) => {
                for word in words {
                    queries.insert((word, 0, false));
                }
            }
        }
    }

    queries
}

/// Thin wrapper around a single contiguous `Vec` that is addressed with `(x, y)` pairs,
/// as if it were a 2D array.
struct N2Array<T> {
    /// Number of cells skipped for each step of the first coordinate.
    y_size: usize,
    /// Backing storage, holding `x * y` cells.
    buf: Vec<T>,
}

impl<T> N2Array<T> {
    /// Translates an `(x, y)` pair into its position in the backing storage.
    #[inline]
    fn offset(&self, x: usize, y: usize) -> usize {
        (x * self.y_size) + y
    }
}

impl<T: Clone> N2Array<T> {
    /// Creates an array of `x * y` cells, each one a clone of `value`.
    fn new(x: usize, y: usize, value: T) -> N2Array<T> {
        let buf = vec![value; x * y];
        Self { y_size: y, buf }
    }
}

impl<T> Index<(usize, usize)> for N2Array<T> {
    type Output = T;

    #[inline]
    fn index(&self, (x, y): (usize, usize)) -> &T {
        let at = self.offset(x, y);
        &self.buf[at]
    }
}

impl<T> IndexMut<(usize, usize)> for N2Array<T> {
    #[inline]
    fn index_mut(&mut self, (x, y): (usize, usize)) -> &mut T {
        let at = self.offset(x, y);
        &mut self.buf[at]
    }
}

/// Returns the number of **bytes** to highlight at the start of the `source` word.
///
/// The comparison is done character by character with a modified
/// [Damerau-Levenshtein](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance):
/// the full distance matrix between `source` and `target` is filled, then the row of the
/// complete `source` is scanned for the shortest `target` prefix with the smallest
/// distance. That many leading characters of `source` are highlighted, and their total
/// UTF-8 length is returned, so the result always falls on a character boundary of `source`.
fn bytes_to_highlight(source: &str, target: &str) -> usize {
    // Byte length of the first `chars` characters of `source` (all of it when it is shorter).
    let prefix_bytes = |chars: usize| {
        source.char_indices().nth(chars).map_or(source.len(), |(byte_idx, _)| byte_idx)
    };

    let source_len = source.chars().count();
    let target_len = target.chars().count();

    if source_len == 0 {
        return 0;
    }
    // Two typos are allowed, so up to two characters are highlighted even when they are
    // completely wrong.
    if target_len < 3 {
        return prefix_bytes(target_len);
    }
    if source == target {
        return source.len();
    }

    // Larger than any real distance; fills the guard row and column of the matrix.
    let unreachable = source_len + target_len;
    let mut costs = N2Array::new(source_len + 2, target_len + 2, 0);

    costs[(0, 0)] = unreachable;
    for row in 1..=source_len + 1 {
        costs[(row, 0)] = unreachable;
        costs[(row, 1)] = row - 1;
    }
    for col in 1..=target_len + 1 {
        costs[(0, col)] = unreachable;
        costs[(1, col)] = col - 1;
    }

    // Last (1-based) row in which each character of `source` has been seen so far.
    let mut last_row_of: BTreeMap<char, usize> = BTreeMap::new();

    for (row, source_char) in (1..).zip(source.chars()) {
        // Last (1-based) column of the current row where both characters were equal.
        let mut last_match_col = 0;

        for (col, target_char) in (1..).zip(target.chars()) {
            let last_match_row = last_row_of.get(&target_char).copied().unwrap_or(0);
            let is_match = source_char == target_char;

            let addition = costs[(row, col + 1)] + 1;
            let deletion = costs[(row + 1, col)] + 1;
            let substitution = costs[(row, col)] + usize::from(!is_match);
            let transposition = costs[(last_match_row, last_match_col)]
                + (row - last_match_row - 1)
                + 1
                + (col - last_match_col - 1);

            costs[(row + 1, col + 1)] =
                min(min(addition, deletion), min(substitution, transposition));

            if is_match {
                last_match_col = col;
            }
        }

        last_row_of.insert(source_char, row);
    }

    // `min_by_key` keeps the first minimum, i.e. the shortest prefix among equal distances.
    let best_len = (0..=target_len)
        .min_by_key(|&prefix_len| costs[(source_len + 1, prefix_len + 1)])
        .unwrap_or(0);

    // Everything above was counted in characters; the caller wants a number of bytes.
    prefix_bytes(best_len)
}

#[cfg(test)]
mod tests {
    use std::borrow::Cow;
    use std::str::from_utf8;

    use meilisearch_tokenizer::TokenKind;

    use super::*;
    use crate::search::query_tree::{Operation, Query, QueryKind};
    use crate::MatchingWords;

    /// Checks the byte length returned by `bytes_to_highlight` for a set of
    /// `(query, text)` pairs, and that this length is a valid UTF-8 boundary of the text.
    #[test]
    fn test_bytes_to_highlight() {
        struct TestBytesToHighlight {
            query: &'static str,
            text: &'static str,
            length: usize,
        }
        let tests = [
            TestBytesToHighlight { query: "bip", text: "bip", length: "bip".len() },
            TestBytesToHighlight { query: "bip", text: "boup", length: "bip".len() },
            TestBytesToHighlight {
                query: "Levenshtein",
                text: "Levenshtein",
                length: "Levenshtein".len(),
            },
            // we get to the end of our word with only one typo
            TestBytesToHighlight {
                query: "Levenste",
                text: "Levenshtein",
                length: "Levenste".len(),
            },
            // we get our third and last authorized typo right on the last character
            TestBytesToHighlight {
                query: "Levenstein",
                text: "Levenshte",
                length: "Levenste".len(),
            },
            // we get to the end of our word with only two typos at the beginning
            TestBytesToHighlight {
                query: "Bavenshtein",
                text: "Levenshtein",
                length: "Bavenshtein".len(),
            },
            TestBytesToHighlight {
                query: "Альфа", text: "Альфой", length: "Альф".len()
            },
            TestBytesToHighlight {
                query: "Go💼", text: "Go💼od luck.", length: "Go💼".len()
            },
            TestBytesToHighlight {
                query: "Go💼od", text: "Go💼od luck.", length: "Go💼od".len()
            },
            TestBytesToHighlight {
                query: "chäräcters",
                text: "chäräcters",
                length: "chäräcters".len(),
            },
            TestBytesToHighlight { query: "ch", text: "chäräcters", length: "ch".len() },
            TestBytesToHighlight { query: "chär", text: "chäräcters", length: "chär".len() },
        ];

        for test in &tests {
            let length = bytes_to_highlight(test.text, test.query);
            assert_eq!(length, test.length, r#"lenght between: "{}" "{}""#, test.query, test.text);
            // `length` counts bytes of `test.text` (the `source` argument), so the
            // boundary check has to slice the text and not the query.
            assert!(
                from_utf8(&test.text.as_bytes()[..length]).is_ok(),
                r#"converting {}[..{}] to an utf8 str failed"#,
                test.text,
                length
            );
        }
    }

    /// Checks `MatchingWords::matching_bytes` against a tree holding the exact prefix
    /// "split", the exact word "this" and the prefix "world" tolerating one typo.
    #[test]
    fn matching_words() {
        let query_tree = Operation::Or(
            false,
            vec![Operation::And(vec![
                Operation::Query(Query {
                    prefix: true,
                    kind: QueryKind::exact("split".to_string()),
                }),
                Operation::Query(Query {
                    prefix: false,
                    kind: QueryKind::exact("this".to_string()),
                }),
                Operation::Query(Query {
                    prefix: true,
                    kind: QueryKind::tolerant(1, "world".to_string()),
                }),
            ])],
        );

        let matching_words = MatchingWords::from_query_tree(&query_tree);

        // Builds a word token spanning the whole of `text`.
        let word_token = |text: &'static str| Token {
            kind: TokenKind::Word,
            word: Cow::Borrowed(text),
            byte_start: 0,
            char_index: 0,
            byte_end: text.len(),
            char_map: None,
        };

        // (token text, expected result of `matching_bytes`)
        let cases = [
            ("word", Some(3)),
            ("nyc", None),
            ("world", Some(5)),
            ("splitted", Some(5)),
            ("thisnew", None),
            ("borld", Some(5)),
            ("wordsplit", Some(4)),
        ];

        for &(text, expected) in &cases {
            assert_eq!(
                matching_words.matching_bytes(&word_token(text)),
                expected,
                "token: {}",
                text
            );
        }
    }
}
Make the MatchingWords return the number of matching bytes 2021-06-01 00:22:29 +08:00			`use std::cmp::{min, Reverse};`
format the whole project 2021-06-17 00:33:33 +08:00			`use std::collections::{BTreeMap, HashSet};`
Make the MatchingWords return the number of matching bytes 2021-06-01 00:22:29 +08:00			`use std::ops::{Index, IndexMut};`

format the whole project 2021-06-17 00:33:33 +08:00			`use levenshtein_automata::{Distance, DFA};`
Fix search highlight for non-unicode chars The `matching_bytes` function takes a `&Token` now and: - gets the number of bytes to highlight (unchanged). - uses `Token.num_graphemes_from_bytes` to get the number of grapheme clusters to highlight. In essence, the `matching_bytes` function returns the number of matching grapheme clusters instead of bytes. Should this function be renamed then? Added proper highlighting in the HTTP UI: - requires dependency on `unicode-segmentation` to extract grapheme clusters from tokens - `<mark>` tag is put around only the matched part - before this change, the entire word was highlighted even if only a part of it matched 2021-12-18 01:23:34 +08:00			`use meilisearch_tokenizer::Token;`
Make the MatchingWords return the number of matching bytes 2021-06-01 00:22:29 +08:00
			`use super::build_dfa;`
format the whole project 2021-06-17 00:33:33 +08:00			`use crate::search::query_tree::{Operation, Query};`
Make the MatchingWords return the number of matching bytes 2021-06-01 00:22:29 +08:00
			`type IsPrefix = bool;`

Resolve PR comments 2021-06-01 17:48:56 +08:00			`/// Structure created from a query tree`
			`/// referencing words that match the given query tree.`
Make the MatchingWords return the number of matching bytes 2021-06-01 00:22:29 +08:00			`#[derive(Default)]`
			`pub struct MatchingWords {`
			`dfas: Vec<(DFA, String, u8, IsPrefix)>,`
			`}`

			`impl MatchingWords {`
			`pub fn from_query_tree(tree: &Operation) -> Self {`
Resolve PR comments 2021-06-01 17:48:56 +08:00			`// fetch matchable words from the query tree`
Make the MatchingWords return the number of matching bytes 2021-06-01 00:22:29 +08:00			`let mut dfas: Vec<_> = fetch_queries(tree)`
			`.into_iter()`
Resolve PR comments 2021-06-01 17:48:56 +08:00			`// create DFAs for each word`
Make the MatchingWords return the number of matching bytes 2021-06-01 00:22:29 +08:00			`.map(\|(w, t, p)\| (build_dfa(w, t, p), w.to_string(), t, p))`
			`.collect();`
Resolve PR comments 2021-06-01 17:48:56 +08:00			`// Sort word by len in DESC order prioritizing the longuest word,`
			`// in order to highlight the longuest part of the matched word.`
format the whole project 2021-06-17 00:33:33 +08:00			`dfas.sort_unstable_by_key(\|(_dfa, query_word, _typo, _is_prefix)\| {`
			`Reverse(query_word.len())`
			`});`
Make the MatchingWords return the number of matching bytes 2021-06-01 00:22:29 +08:00			`Self { dfas }`
			`}`

Resolve PR comments 2021-06-01 17:48:56 +08:00			`/// Returns the number of matching bytes if the word matches one of the query words.`
Fix search highlight for non-unicode chars The `matching_bytes` function takes a `&Token` now and: - gets the number of bytes to highlight (unchanged). - uses `Token.num_graphemes_from_bytes` to get the number of grapheme clusters to highlight. In essence, the `matching_bytes` function returns the number of matching grapheme clusters instead of bytes. Should this function be renamed then? Added proper highlighting in the HTTP UI: - requires dependency on `unicode-segmentation` to extract grapheme clusters from tokens - `<mark>` tag is put around only the matched part - before this change, the entire word was highlighted even if only a part of it matched 2021-12-18 01:23:34 +08:00			`pub fn matching_bytes(&self, word_to_highlight: &Token) -> Option<usize> {`
re-implement the Damerau-Levenshtein used for the highlighting 2021-06-29 21:06:03 +08:00			`self.dfas.iter().find_map(\|(dfa, query_word, typo, is_prefix)\| {`
Fix search highlight for non-unicode chars The `matching_bytes` function takes a `&Token` now and: - gets the number of bytes to highlight (unchanged). - uses `Token.num_graphemes_from_bytes` to get the number of grapheme clusters to highlight. In essence, the `matching_bytes` function returns the number of matching grapheme clusters instead of bytes. Should this function be renamed then? Added proper highlighting in the HTTP UI: - requires dependency on `unicode-segmentation` to extract grapheme clusters from tokens - `<mark>` tag is put around only the matched part - before this change, the entire word was highlighted even if only a part of it matched 2021-12-18 01:23:34 +08:00			`match dfa.eval(word_to_highlight.text()) {`
re-implement the Damerau-Levenshtein used for the highlighting 2021-06-29 21:06:03 +08:00			`Distance::Exact(t) if t <= *typo => {`
			`if *is_prefix {`
Fix search highlight for non-unicode chars The `matching_bytes` function takes a `&Token` now and: - gets the number of bytes to highlight (unchanged). - uses `Token.num_graphemes_from_bytes` to get the number of grapheme clusters to highlight. In essence, the `matching_bytes` function returns the number of matching grapheme clusters instead of bytes. Should this function be renamed then? Added proper highlighting in the HTTP UI: - requires dependency on `unicode-segmentation` to extract grapheme clusters from tokens - `<mark>` tag is put around only the matched part - before this change, the entire word was highlighted even if only a part of it matched 2021-12-18 01:23:34 +08:00			`let len = bytes_to_highlight(word_to_highlight.text(), query_word);`
Fix highlight by replacing num_graphemes_from_bytes num_graphemes_from_bytes has been renamed in the tokenizer to num_chars_from_bytes. Highlight now works correctly! 2022-01-17 15:32:55 +08:00			`Some(word_to_highlight.num_chars_from_bytes(len))`
re-implement the Damerau-Levenshtein used for the highlighting 2021-06-29 21:06:03 +08:00			`} else {`
Run cargo fmt on matching_words.rs 2022-01-17 15:34:33 +08:00			`Some(word_to_highlight.num_chars_from_bytes(word_to_highlight.text().len()))`
re-implement the Damerau-Levenshtein used for the highlighting 2021-06-29 21:06:03 +08:00			`}`
Make the MatchingWords return the number of matching bytes 2021-06-01 00:22:29 +08:00			`}`
re-implement the Damerau-Levenshtein used for the highlighting 2021-06-29 21:06:03 +08:00			`_otherwise => None,`
format the whole project 2021-06-17 00:33:33 +08:00			`}`
Make the MatchingWords return the number of matching bytes 2021-06-01 00:22:29 +08:00			`})`
			`}`
			`}`

			`/// Lists all words which can be considered as a match for the query tree.`
			`fn fetch_queries(tree: &Operation) -> HashSet<(&str, u8, IsPrefix)> {`
			`fn resolve_ops<'a>(tree: &'a Operation, out: &mut HashSet<(&'a str, u8, IsPrefix)>) {`
			`match tree {`
Replace Consecutive by Phrase in query tree Replace Consecutive by Phrase in query tree in order to remove theorical bugs, due of the Consecutive enum type. 2021-06-09 23:28:12 +08:00			`Operation::Or(_, ops) \| Operation::And(ops) => {`
Make the MatchingWords return the number of matching bytes 2021-06-01 00:22:29 +08:00			`ops.as_slice().iter().for_each(\|op\| resolve_ops(op, out));`
format the whole project 2021-06-17 00:33:33 +08:00			`}`
Make the MatchingWords return the number of matching bytes 2021-06-01 00:22:29 +08:00			`Operation::Query(Query { prefix, kind }) => {`
			`let typo = if kind.is_exact() { 0 } else { kind.typo() };`
			`out.insert((kind.word(), typo, *prefix));`
format the whole project 2021-06-17 00:33:33 +08:00			`}`
Replace Consecutive by Phrase in query tree Replace Consecutive by Phrase in query tree in order to remove theorical bugs, due of the Consecutive enum type. 2021-06-09 23:28:12 +08:00			`Operation::Phrase(words) => {`
			`for word in words {`
			`out.insert((word, 0, false));`
			`}`
			`}`
Make the MatchingWords return the number of matching bytes 2021-06-01 00:22:29 +08:00			`}`
			`}`

			`let mut queries = HashSet::new();`
			`resolve_ops(tree, &mut queries);`
			`queries`
			`}`

			`// A simple wrapper around vec so we can get contiguous but index it like it's 2D array.`
			`struct N2Array<T> {`
			`y_size: usize,`
			`buf: Vec<T>,`
			`}`

			`impl<T: Clone> N2Array<T> {`
			`fn new(x: usize, y: usize, value: T) -> N2Array<T> {`
format the whole project 2021-06-17 00:33:33 +08:00			`N2Array { y_size: y, buf: vec![value; x * y] }`
Make the MatchingWords return the number of matching bytes 2021-06-01 00:22:29 +08:00			`}`
			`}`

			`impl<T> Index<(usize, usize)> for N2Array<T> {`
			`type Output = T;`

			`#[inline]`
			`fn index(&self, (x, y): (usize, usize)) -> &T {`
			`&self.buf[(x * self.y_size) + y]`
			`}`
			`}`

			`impl<T> IndexMut<(usize, usize)> for N2Array<T> {`
			`#[inline]`
			`fn index_mut(&mut self, (x, y): (usize, usize)) -> &mut T {`
			`&mut self.buf[(x * self.y_size) + y]`
			`}`
			`}`

re-implement the Damerau-Levenshtein used for the highlighting 2021-06-29 21:06:03 +08:00			/// Returns the number of bytes we want to highlight in the `source` word.
			`/// Basically we want to highlight as much characters as possible in the source until it has too much`
			`/// typos (= 2)`
			`/// The algorithm is a modified`
			`/// [Damerau-Levenshtein](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance)`
			`fn bytes_to_highlight(source: &str, target: &str) -> usize {`
Fix matching lenghth in matching_words 2021-07-02 01:03:28 +08:00			`let n = source.chars().count();`
			`let m = target.chars().count();`
Make the MatchingWords return the number of matching bytes 2021-06-01 00:22:29 +08:00
			`if n == 0 {`
re-implement the Damerau-Levenshtein used for the highlighting 2021-06-29 21:06:03 +08:00			`return 0;`
Make the MatchingWords return the number of matching bytes 2021-06-01 00:22:29 +08:00			`}`
re-implement the Damerau-Levenshtein used for the highlighting 2021-06-29 21:06:03 +08:00			`// since we allow two typos we can send two characters even if it's completely wrong`
			`if m < 3 {`
			`return source.chars().take(m).map(\|c\| c.len_utf8()).sum();`
Make the MatchingWords return the number of matching bytes 2021-06-01 00:22:29 +08:00			`}`
			`if n == m && source == target {`
re-implement the Damerau-Levenshtein used for the highlighting 2021-06-29 21:06:03 +08:00			`return source.len();`
Make the MatchingWords return the number of matching bytes 2021-06-01 00:22:29 +08:00			`}`

			`let inf = n + m;`
			`let mut matrix = N2Array::new(n + 2, m + 2, 0);`

			`matrix[(0, 0)] = inf;`
Fix matching lenghth in matching_words 2021-07-02 01:03:28 +08:00			`for i in 0..=n {`
Make the MatchingWords return the number of matching bytes 2021-06-01 00:22:29 +08:00			`matrix[(i + 1, 0)] = inf;`
			`matrix[(i + 1, 1)] = i;`
			`}`
Fix matching lenghth in matching_words 2021-07-02 01:03:28 +08:00			`for j in 0..=m {`
Make the MatchingWords return the number of matching bytes 2021-06-01 00:22:29 +08:00			`matrix[(0, j + 1)] = inf;`
			`matrix[(1, j + 1)] = j;`
			`}`

			`let mut last_row = BTreeMap::new();`

re-implement the Damerau-Levenshtein used for the highlighting 2021-06-29 21:06:03 +08:00			`for (row, char_s) in source.chars().enumerate() {`
Make the MatchingWords return the number of matching bytes 2021-06-01 00:22:29 +08:00			`let mut last_match_col = 0;`
			`let row = row + 1;`

re-implement the Damerau-Levenshtein used for the highlighting 2021-06-29 21:06:03 +08:00			`for (col, char_t) in target.chars().enumerate() {`
Make the MatchingWords return the number of matching bytes 2021-06-01 00:22:29 +08:00			`let col = col + 1;`
			`let last_match_row = *last_row.get(&char_t).unwrap_or(&0);`
			`let cost = if char_s == char_t { 0 } else { 1 };`

			`let dist_add = matrix[(row, col + 1)] + 1;`
			`let dist_del = matrix[(row + 1, col)] + 1;`
			`let dist_sub = matrix[(row, col)] + cost;`
			`let dist_trans = matrix[(last_match_row, last_match_col)]`
			`+ (row - last_match_row - 1)`
			`+ 1`
			`+ (col - last_match_col - 1);`
			`let dist = min(min(dist_add, dist_del), min(dist_sub, dist_trans));`
			`matrix[(row + 1, col + 1)] = dist;`

			`if cost == 0 {`
			`last_match_col = col;`
			`}`
			`}`

			`last_row.insert(char_s, row);`
			`}`

Fix matching lenghth in matching_words 2021-07-02 01:03:28 +08:00			`let mut minimum = (u32::max_value(), 0);`
			`for x in 0..=m {`
			`let dist = matrix[(n + 1, x + 1)] as u32;`
			`if dist < minimum.0 {`
			`minimum = (dist, x);`
Make the MatchingWords return the number of matching bytes 2021-06-01 00:22:29 +08:00			`}`
			`}`

re-implement the Damerau-Levenshtein used for the highlighting 2021-06-29 21:06:03 +08:00			`// everything was done characters wise and now we want to returns a number of bytes`
Fix matching lenghth in matching_words 2021-07-02 01:03:28 +08:00			`source.chars().take(minimum.1).map(\|c\| c.len_utf8()).sum()`
Make the MatchingWords return the number of matching bytes 2021-06-01 00:22:29 +08:00			`}`

			`#[cfg(test)]`
			`mod tests {`
Fix matching_words tests to compile successfully The tests still fail due to a bug in https://github.com/meilisearch/tokenizer/pull/59 2021-12-18 01:56:06 +08:00			`use std::borrow::Cow;`
re-implement the Damerau-Levenshtein used for the highlighting 2021-06-29 21:06:03 +08:00			`use std::str::from_utf8;`

Fix matching_words tests to compile successfully The tests still fail due to a bug in https://github.com/meilisearch/tokenizer/pull/59 2021-12-18 01:56:06 +08:00			`use meilisearch_tokenizer::TokenKind;`

Make the MatchingWords return the number of matching bytes 2021-06-01 00:22:29 +08:00			`use super::*;`
			`use crate::search::query_tree::{Operation, Query, QueryKind};`
format the whole project 2021-06-17 00:33:33 +08:00			`use crate::MatchingWords;`
Make the MatchingWords return the number of matching bytes 2021-06-01 00:22:29 +08:00
			`#[test]`
re-implement the Damerau-Levenshtein used for the highlighting 2021-06-29 21:06:03 +08:00			`fn test_bytes_to_highlight() {`
			`struct TestBytesToHighlight {`
			`query: &'static str,`
			`text: &'static str,`
			`length: usize,`
			`}`
			`let tests = [`
			`TestBytesToHighlight { query: "bip", text: "bip", length: "bip".len() },`
			`TestBytesToHighlight { query: "bip", text: "boup", length: "bip".len() },`
			`TestBytesToHighlight {`
			`query: "Levenshtein",`
			`text: "Levenshtein",`
			`length: "Levenshtein".len(),`
			`},`
			`// we get to the end of our word with only one typo`
			`TestBytesToHighlight {`
			`query: "Levenste",`
			`text: "Levenshtein",`
			`length: "Levenste".len(),`
			`},`
			`// we get our third and last authorized typo right on the last character`
			`TestBytesToHighlight {`
			`query: "Levenstein",`
			`text: "Levenshte",`
Fix matching lenghth in matching_words 2021-07-02 01:03:28 +08:00			`length: "Levenste".len(),`
re-implement the Damerau-Levenshtein used for the highlighting 2021-06-29 21:06:03 +08:00			`},`
			`// we get to the end of our word with only two typos at the beginning`
			`TestBytesToHighlight {`
			`query: "Bavenshtein",`
			`text: "Levenshtein",`
			`length: "Bavenshtein".len(),`
			`},`
add more tests 2021-06-29 22:18:53 +08:00			`TestBytesToHighlight {`
Fix matching lenghth in matching_words 2021-07-02 01:03:28 +08:00			`query: "Альфа", text: "Альфой", length: "Альф".len()`
add more tests 2021-06-29 22:18:53 +08:00			`},`
			`TestBytesToHighlight {`
			`query: "Go💼", text: "Go💼od luck.", length: "Go💼".len()`
			`},`
			`TestBytesToHighlight {`
			`query: "Go💼od", text: "Go💼od luck.", length: "Go💼od".len()`
			`},`
			`TestBytesToHighlight {`
			`query: "chäräcters",`
			`text: "chäräcters",`
			`length: "chäräcters".len(),`
			`},`
			`TestBytesToHighlight { query: "ch", text: "chäräcters", length: "ch".len() },`
			`TestBytesToHighlight { query: "chär", text: "chäräcters", length: "chär".len() },`
re-implement the Damerau-Levenshtein used for the highlighting 2021-06-29 21:06:03 +08:00			`];`

			`for test in &tests {`
Fix matching lenghth in matching_words 2021-07-02 01:03:28 +08:00			`let length = bytes_to_highlight(test.text, test.query);`
re-implement the Damerau-Levenshtein used for the highlighting 2021-06-29 21:06:03 +08:00			`assert_eq!(length, test.length, r#"lenght between: "{}" "{}""#, test.query, test.text);`
			`assert!(`
			`from_utf8(&test.query.as_bytes()[..length]).is_ok(),`
			`r#"converting {}[..{}] to an utf8 str failed"#,`
			`test.query,`
			`length`
			`);`
			`}`
Make the MatchingWords return the number of matching bytes 2021-06-01 00:22:29 +08:00			`}`

			`#[test]`
			`fn matching_words() {`
format the whole project 2021-06-17 00:33:33 +08:00			`let query_tree = Operation::Or(`
			`false,`
			`vec![Operation::And(vec![`
			`Operation::Query(Query {`
			`prefix: true,`
			`kind: QueryKind::exact("split".to_string()),`
			`}),`
			`Operation::Query(Query {`
			`prefix: false,`
			`kind: QueryKind::exact("this".to_string()),`
			`}),`
			`Operation::Query(Query {`
			`prefix: true,`
			`kind: QueryKind::tolerant(1, "world".to_string()),`
			`}),`
			`])],`
			`);`
Make the MatchingWords return the number of matching bytes 2021-06-01 00:22:29 +08:00
			`let matching_words = MatchingWords::from_query_tree(&query_tree);`

Run cargo fmt on matching_words.rs 2022-01-17 15:34:33 +08:00			`assert_eq!(`
			`matching_words.matching_bytes(&Token {`
			`kind: TokenKind::Word,`
			`word: Cow::Borrowed("word"),`
			`byte_start: 0,`
			`char_index: 0,`
			`byte_end: "word".len(),`
			`char_map: None,`
			`}),`
			`Some(3)`
			`);`
			`assert_eq!(`
			`matching_words.matching_bytes(&Token {`
			`kind: TokenKind::Word,`
			`word: Cow::Borrowed("nyc"),`
			`byte_start: 0,`
			`char_index: 0,`
			`byte_end: "nyc".len(),`
			`char_map: None,`
			`}),`
			`None`
			`);`
			`assert_eq!(`
			`matching_words.matching_bytes(&Token {`
			`kind: TokenKind::Word,`
			`word: Cow::Borrowed("world"),`
			`byte_start: 0,`
			`char_index: 0,`
			`byte_end: "world".len(),`
			`char_map: None,`
			`}),`
			`Some(5)`
			`);`
			`assert_eq!(`
			`matching_words.matching_bytes(&Token {`
			`kind: TokenKind::Word,`
			`word: Cow::Borrowed("splitted"),`
			`byte_start: 0,`
			`char_index: 0,`
			`byte_end: "splitted".len(),`
			`char_map: None,`
			`}),`
			`Some(5)`
			`);`
			`assert_eq!(`
			`matching_words.matching_bytes(&Token {`
			`kind: TokenKind::Word,`
			`word: Cow::Borrowed("thisnew"),`
			`byte_start: 0,`
			`char_index: 0,`
			`byte_end: "thisnew".len(),`
			`char_map: None,`
			`}),`
			`None`
			`);`
			`assert_eq!(`
			`matching_words.matching_bytes(&Token {`
			`kind: TokenKind::Word,`
			`word: Cow::Borrowed("borld"),`
			`byte_start: 0,`
			`char_index: 0,`
			`byte_end: "borld".len(),`
			`char_map: None,`
			`}),`
			`Some(5)`
			`);`
			`assert_eq!(`
			`matching_words.matching_bytes(&Token {`
			`kind: TokenKind::Word,`
			`word: Cow::Borrowed("wordsplit"),`
			`byte_start: 0,`
			`char_index: 0,`
			`byte_end: "wordsplit".len(),`
			`char_map: None,`
			`}),`
			`Some(4)`
			`);`
Make the MatchingWords return the number of matching bytes 2021-06-01 00:22:29 +08:00			`}`
			`}`