2021-06-01 00:22:29 +08:00
|
|
|
use std::cmp::{min, Reverse};
|
2021-06-17 00:33:33 +08:00
|
|
|
use std::collections::{BTreeMap, HashSet};
|
2021-06-01 00:22:29 +08:00
|
|
|
use std::ops::{Index, IndexMut};
|
|
|
|
|
2021-06-17 00:33:33 +08:00
|
|
|
use levenshtein_automata::{Distance, DFA};
|
2021-12-18 01:23:34 +08:00
|
|
|
use meilisearch_tokenizer::Token;
|
2021-06-01 00:22:29 +08:00
|
|
|
|
|
|
|
use super::build_dfa;
|
2021-06-17 00:33:33 +08:00
|
|
|
use crate::search::query_tree::{Operation, Query};
|
2021-06-01 00:22:29 +08:00
|
|
|
|
|
|
|
/// Flag stored next to each query word; it is copied from `Query::prefix`
/// (words coming from a phrase always use `false`).
type IsPrefix = bool;
|
|
|
|
|
2021-06-01 17:48:56 +08:00
|
|
|
/// Structure created from a query tree
|
|
|
|
/// referencing words that match the given query tree.
|
2021-06-01 00:22:29 +08:00
|
|
|
#[derive(Default)]
|
|
|
|
pub struct MatchingWords {
|
|
|
|
dfas: Vec<(DFA, String, u8, IsPrefix)>,
|
|
|
|
}
|
|
|
|
|
|
|
|
impl MatchingWords {
|
|
|
|
pub fn from_query_tree(tree: &Operation) -> Self {
|
2021-06-01 17:48:56 +08:00
|
|
|
// fetch matchable words from the query tree
|
2021-06-01 00:22:29 +08:00
|
|
|
let mut dfas: Vec<_> = fetch_queries(tree)
|
|
|
|
.into_iter()
|
2021-06-01 17:48:56 +08:00
|
|
|
// create DFAs for each word
|
2021-06-01 00:22:29 +08:00
|
|
|
.map(|(w, t, p)| (build_dfa(w, t, p), w.to_string(), t, p))
|
|
|
|
.collect();
|
2021-06-01 17:48:56 +08:00
|
|
|
// Sort word by len in DESC order prioritizing the longuest word,
|
|
|
|
// in order to highlight the longuest part of the matched word.
|
2021-06-17 00:33:33 +08:00
|
|
|
dfas.sort_unstable_by_key(|(_dfa, query_word, _typo, _is_prefix)| {
|
|
|
|
Reverse(query_word.len())
|
|
|
|
});
|
2021-06-01 00:22:29 +08:00
|
|
|
Self { dfas }
|
|
|
|
}
|
|
|
|
|
2021-06-01 17:48:56 +08:00
|
|
|
/// Returns the number of matching bytes if the word matches one of the query words.
|
2021-12-18 01:23:34 +08:00
|
|
|
pub fn matching_bytes(&self, word_to_highlight: &Token) -> Option<usize> {
|
2021-06-29 21:06:03 +08:00
|
|
|
self.dfas.iter().find_map(|(dfa, query_word, typo, is_prefix)| {
|
2021-12-18 01:23:34 +08:00
|
|
|
match dfa.eval(word_to_highlight.text()) {
|
2021-06-29 21:06:03 +08:00
|
|
|
Distance::Exact(t) if t <= *typo => {
|
|
|
|
if *is_prefix {
|
2021-12-18 01:23:34 +08:00
|
|
|
let len = bytes_to_highlight(word_to_highlight.text(), query_word);
|
2022-01-17 15:32:55 +08:00
|
|
|
Some(word_to_highlight.num_chars_from_bytes(len))
|
2021-06-29 21:06:03 +08:00
|
|
|
} else {
|
2022-01-17 15:34:33 +08:00
|
|
|
Some(word_to_highlight.num_chars_from_bytes(word_to_highlight.text().len()))
|
2021-06-29 21:06:03 +08:00
|
|
|
}
|
2021-06-01 00:22:29 +08:00
|
|
|
}
|
2021-06-29 21:06:03 +08:00
|
|
|
_otherwise => None,
|
2021-06-17 00:33:33 +08:00
|
|
|
}
|
2021-06-01 00:22:29 +08:00
|
|
|
})
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Lists all words which can be considered as a match for the query tree.
|
|
|
|
fn fetch_queries(tree: &Operation) -> HashSet<(&str, u8, IsPrefix)> {
|
|
|
|
fn resolve_ops<'a>(tree: &'a Operation, out: &mut HashSet<(&'a str, u8, IsPrefix)>) {
|
|
|
|
match tree {
|
2021-06-09 23:28:12 +08:00
|
|
|
Operation::Or(_, ops) | Operation::And(ops) => {
|
2021-06-01 00:22:29 +08:00
|
|
|
ops.as_slice().iter().for_each(|op| resolve_ops(op, out));
|
2021-06-17 00:33:33 +08:00
|
|
|
}
|
2021-06-01 00:22:29 +08:00
|
|
|
Operation::Query(Query { prefix, kind }) => {
|
|
|
|
let typo = if kind.is_exact() { 0 } else { kind.typo() };
|
|
|
|
out.insert((kind.word(), typo, *prefix));
|
2021-06-17 00:33:33 +08:00
|
|
|
}
|
2021-06-09 23:28:12 +08:00
|
|
|
Operation::Phrase(words) => {
|
|
|
|
for word in words {
|
|
|
|
out.insert((word, 0, false));
|
|
|
|
}
|
|
|
|
}
|
2021-06-01 00:22:29 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
let mut queries = HashSet::new();
|
|
|
|
resolve_ops(tree, &mut queries);
|
|
|
|
queries
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Flat, contiguous buffer that is addressed with `(x, y)` pairs as if it were a 2D array.
///
/// Cell `(x, y)` is stored at offset `x * width + y`. Only the final offset is checked
/// against the buffer length, so callers must keep `y` below the width themselves.
struct N2Array<T> {
    /// Number of cells in one row (the `y` dimension).
    width: usize,
    /// Storage for all the cells, one row after the other.
    cells: Vec<T>,
}

impl<T: Clone> N2Array<T> {
    /// Creates an array of `x` rows and `y` columns with every cell set to `value`.
    fn new(x: usize, y: usize, value: T) -> N2Array<T> {
        let cells = vec![value; x * y];
        Self { width: y, cells }
    }
}

impl<T> N2Array<T> {
    /// Translates a 2D coordinate into its position inside the flat buffer.
    #[inline]
    fn offset(&self, x: usize, y: usize) -> usize {
        (x * self.width) + y
    }
}

impl<T> Index<(usize, usize)> for N2Array<T> {
    type Output = T;

    /// Borrows the cell at `(x, y)`; panics when the offset falls outside the buffer.
    #[inline]
    fn index(&self, (x, y): (usize, usize)) -> &T {
        let at = self.offset(x, y);
        &self.cells[at]
    }
}

impl<T> IndexMut<(usize, usize)> for N2Array<T> {
    /// Mutably borrows the cell at `(x, y)`; panics when the offset falls outside the buffer.
    #[inline]
    fn index_mut(&mut self, (x, y): (usize, usize)) -> &mut T {
        let at = self.offset(x, y);
        &mut self.cells[at]
    }
}
|
|
|
|
|
2021-06-29 21:06:03 +08:00
|
|
|
/// Returns the number of **bytes** we want to highlight in the `source` word.
|
|
|
|
/// Basically we want to highlight as much characters as possible in the source until it has too much
|
|
|
|
/// typos (= 2)
|
|
|
|
/// The algorithm is a modified
|
|
|
|
/// [Damerau-Levenshtein](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance)
|
|
|
|
fn bytes_to_highlight(source: &str, target: &str) -> usize {
|
2021-07-02 01:03:28 +08:00
|
|
|
let n = source.chars().count();
|
|
|
|
let m = target.chars().count();
|
2021-06-01 00:22:29 +08:00
|
|
|
|
|
|
|
if n == 0 {
|
2021-06-29 21:06:03 +08:00
|
|
|
return 0;
|
2021-06-01 00:22:29 +08:00
|
|
|
}
|
2021-06-29 21:06:03 +08:00
|
|
|
// since we allow two typos we can send two characters even if it's completely wrong
|
|
|
|
if m < 3 {
|
|
|
|
return source.chars().take(m).map(|c| c.len_utf8()).sum();
|
2021-06-01 00:22:29 +08:00
|
|
|
}
|
|
|
|
if n == m && source == target {
|
2021-06-29 21:06:03 +08:00
|
|
|
return source.len();
|
2021-06-01 00:22:29 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
let inf = n + m;
|
|
|
|
let mut matrix = N2Array::new(n + 2, m + 2, 0);
|
|
|
|
|
|
|
|
matrix[(0, 0)] = inf;
|
2021-07-02 01:03:28 +08:00
|
|
|
for i in 0..=n {
|
2021-06-01 00:22:29 +08:00
|
|
|
matrix[(i + 1, 0)] = inf;
|
|
|
|
matrix[(i + 1, 1)] = i;
|
|
|
|
}
|
2021-07-02 01:03:28 +08:00
|
|
|
for j in 0..=m {
|
2021-06-01 00:22:29 +08:00
|
|
|
matrix[(0, j + 1)] = inf;
|
|
|
|
matrix[(1, j + 1)] = j;
|
|
|
|
}
|
|
|
|
|
|
|
|
let mut last_row = BTreeMap::new();
|
|
|
|
|
2021-06-29 21:06:03 +08:00
|
|
|
for (row, char_s) in source.chars().enumerate() {
|
2021-06-01 00:22:29 +08:00
|
|
|
let mut last_match_col = 0;
|
|
|
|
let row = row + 1;
|
|
|
|
|
2021-06-29 21:06:03 +08:00
|
|
|
for (col, char_t) in target.chars().enumerate() {
|
2021-06-01 00:22:29 +08:00
|
|
|
let col = col + 1;
|
|
|
|
let last_match_row = *last_row.get(&char_t).unwrap_or(&0);
|
|
|
|
let cost = if char_s == char_t { 0 } else { 1 };
|
|
|
|
|
|
|
|
let dist_add = matrix[(row, col + 1)] + 1;
|
|
|
|
let dist_del = matrix[(row + 1, col)] + 1;
|
|
|
|
let dist_sub = matrix[(row, col)] + cost;
|
|
|
|
let dist_trans = matrix[(last_match_row, last_match_col)]
|
|
|
|
+ (row - last_match_row - 1)
|
|
|
|
+ 1
|
|
|
|
+ (col - last_match_col - 1);
|
|
|
|
let dist = min(min(dist_add, dist_del), min(dist_sub, dist_trans));
|
|
|
|
matrix[(row + 1, col + 1)] = dist;
|
|
|
|
|
|
|
|
if cost == 0 {
|
|
|
|
last_match_col = col;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
last_row.insert(char_s, row);
|
|
|
|
}
|
|
|
|
|
2021-07-02 01:03:28 +08:00
|
|
|
let mut minimum = (u32::max_value(), 0);
|
|
|
|
for x in 0..=m {
|
|
|
|
let dist = matrix[(n + 1, x + 1)] as u32;
|
|
|
|
if dist < minimum.0 {
|
|
|
|
minimum = (dist, x);
|
2021-06-01 00:22:29 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-06-29 21:06:03 +08:00
|
|
|
// everything was done characters wise and now we want to returns a number of bytes
|
2021-07-02 01:03:28 +08:00
|
|
|
source.chars().take(minimum.1).map(|c| c.len_utf8()).sum()
|
2021-06-01 00:22:29 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
#[cfg(test)]
mod tests {
    use std::borrow::Cow;
    use std::str::from_utf8;

    use meilisearch_tokenizer::TokenKind;

    use super::*;
    use crate::search::query_tree::{Operation, Query, QueryKind};
    use crate::MatchingWords;

    /// Checks the byte counts returned by `bytes_to_highlight` and that each of them
    /// falls on a character boundary of the highlighted text.
    #[test]
    fn test_bytes_to_highlight() {
        struct TestBytesToHighlight {
            query: &'static str,
            text: &'static str,
            length: usize,
        }
        let tests = [
            TestBytesToHighlight { query: "bip", text: "bip", length: "bip".len() },
            TestBytesToHighlight { query: "bip", text: "boup", length: "bip".len() },
            TestBytesToHighlight {
                query: "Levenshtein",
                text: "Levenshtein",
                length: "Levenshtein".len(),
            },
            // we get to the end of our word with only one typo
            TestBytesToHighlight {
                query: "Levenste",
                text: "Levenshtein",
                length: "Levenste".len(),
            },
            // we get our third and last authorized typo right on the last character
            TestBytesToHighlight {
                query: "Levenstein",
                text: "Levenshte",
                length: "Levenste".len(),
            },
            // we get to the end of our word with only two typos at the beginning
            TestBytesToHighlight {
                query: "Bavenshtein",
                text: "Levenshtein",
                length: "Bavenshtein".len(),
            },
            TestBytesToHighlight { query: "Альфа", text: "Альфой", length: "Альф".len() },
            TestBytesToHighlight { query: "Go💼", text: "Go💼od luck.", length: "Go💼".len() },
            TestBytesToHighlight {
                query: "Go💼od",
                text: "Go💼od luck.",
                length: "Go💼od".len(),
            },
            TestBytesToHighlight {
                query: "chäräcters",
                text: "chäräcters",
                length: "chäräcters".len(),
            },
            TestBytesToHighlight { query: "ch", text: "chäräcters", length: "ch".len() },
            TestBytesToHighlight { query: "chär", text: "chäräcters", length: "chär".len() },
        ];

        for test in &tests {
            let length = bytes_to_highlight(test.text, test.query);
            assert_eq!(length, test.length, r#"lenght between: "{}" "{}""#, test.query, test.text);
            // `length` counts bytes of the text being highlighted (the first argument above),
            // so the character-boundary check has to slice the text, not the query.
            assert!(
                from_utf8(&test.text.as_bytes()[..length]).is_ok(),
                r#"converting {}[..{}] to an utf8 str failed"#,
                test.text,
                length
            );
        }
    }

    /// Checks `MatchingWords::matching_bytes` against a small query tree mixing exact,
    /// tolerant and prefix queries.
    #[test]
    fn matching_words() {
        let query_tree = Operation::Or(
            false,
            vec![Operation::And(vec![
                Operation::Query(Query {
                    prefix: true,
                    kind: QueryKind::exact("split".to_string()),
                }),
                Operation::Query(Query {
                    prefix: false,
                    kind: QueryKind::exact("this".to_string()),
                }),
                Operation::Query(Query {
                    prefix: true,
                    kind: QueryKind::tolerant(1, "world".to_string()),
                }),
            ])],
        );

        let matching_words = MatchingWords::from_query_tree(&query_tree);

        // Builds a word token that starts at offset zero and carries no char map.
        let word_token = |text: &'static str| Token {
            kind: TokenKind::Word,
            word: Cow::Borrowed(text),
            byte_start: 0,
            char_index: 0,
            byte_end: text.len(),
            char_map: None,
        };

        let cases = [
            ("word", Some(3)),
            ("nyc", None),
            ("world", Some(5)),
            ("splitted", Some(5)),
            ("thisnew", None),
            ("borld", Some(5)),
            ("wordsplit", Some(4)),
        ];

        for &(text, expected) in &cases {
            assert_eq!(
                matching_words.matching_bytes(&word_token(text)),
                expected,
                "unexpected highlight length for token {:?}",
                text
            );
        }
    }
}
|