Merge branch 'search-refactor-highlighter' into search-refactor-highlighter-merged

Loïc Lecrenier 2023-04-11 12:22:34 +02:00
commit e7bb8c940f
8 changed files with 470 additions and 631 deletions

View File

@@ -48,7 +48,3 @@ harness = false
 [[bench]]
 name = "indexing"
 harness = false
-
-[[bench]]
-name = "formatting"
-harness = false

View File

@@ -1,67 +0,0 @@
use std::rc::Rc;
use criterion::{criterion_group, criterion_main};
use milli::tokenizer::TokenizerBuilder;
use milli::{FormatOptions, MatcherBuilder, MatchingWord, MatchingWords};
#[global_allocator]
static ALLOC: mimalloc::MiMalloc = mimalloc::MiMalloc;
struct Conf<'a> {
name: &'a str,
text: &'a str,
matching_words: MatcherBuilder<'a, Vec<u8>>,
}
fn bench_formatting(c: &mut criterion::Criterion) {
#[rustfmt::skip]
let confs = &[
Conf {
name: "'the door d'",
text: r#"He used to do the door sounds in "Star Trek" with his mouth, phssst, phssst. The MD-11 passenger and cargo doors also tend to behave like electromagnetic apertures, because the doors do not have continuous electrical contact with the door frames around the door perimeter. But Theodor said that the doors don't work."#,
matching_words: MatcherBuilder::new(MatchingWords::new(vec![
(vec![Rc::new(MatchingWord::new("t".to_string(), 0, false).unwrap()), Rc::new(MatchingWord::new("he".to_string(), 0, false).unwrap())], vec![0]),
(vec![Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap())], vec![0]),
(vec![Rc::new(MatchingWord::new("door".to_string(), 1, false).unwrap())], vec![1]),
(vec![Rc::new(MatchingWord::new("do".to_string(), 0, false).unwrap()), Rc::new(MatchingWord::new("or".to_string(), 0, false).unwrap())], vec![0]),
(vec![Rc::new(MatchingWord::new("thedoor".to_string(), 1, false).unwrap())], vec![0, 1]),
(vec![Rc::new(MatchingWord::new("d".to_string(), 0, true).unwrap())], vec![2]),
(vec![Rc::new(MatchingWord::new("thedoord".to_string(), 1, true).unwrap())], vec![0, 1, 2]),
(vec![Rc::new(MatchingWord::new("doord".to_string(), 1, true).unwrap())], vec![1, 2]),
]
).unwrap(), TokenizerBuilder::default().build()),
},
];
let format_options = &[
FormatOptions { highlight: false, crop: None },
FormatOptions { highlight: true, crop: None },
FormatOptions { highlight: false, crop: Some(10) },
FormatOptions { highlight: true, crop: Some(10) },
FormatOptions { highlight: false, crop: Some(20) },
FormatOptions { highlight: true, crop: Some(20) },
];
for option in format_options {
let highlight = if option.highlight { "highlight" } else { "no-highlight" };
let name = match option.crop {
Some(size) => format!("{}-crop({})", highlight, size),
None => format!("{}-no-crop", highlight),
};
let mut group = c.benchmark_group(&name);
for conf in confs {
group.bench_function(conf.name, |b| {
b.iter(|| {
let mut matcher = conf.matching_words.build(conf.text);
matcher.format(*option);
})
});
}
group.finish();
}
}
criterion_group!(benches, bench_formatting);
criterion_main!(benches);
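Aside: with MatchingWord removed, a MatchingWords can no longer be assembled by hand the way this benchmark did; it is now derived from a SearchContext and the located query terms (see the new matching_words.rs further down), which is why the benchmark is deleted rather than ported. For reference, a hypothetical sketch of an equivalent benchmark against the new API — it assumes the crate-internal helpers temp_index_with_documents and located_query_terms_from_string were reachable from a bench target, which they are not in this commit:

use charabia::TokenizerBuilder;
use criterion::{criterion_group, criterion_main, Criterion};
use milli::{FormatOptions, MatcherBuilder, MatchingWords, SearchContext};

fn bench_formatting(c: &mut Criterion) {
    // Assumption: an index seeded with documents, as in the new unit tests.
    let index = temp_index_with_documents();
    let rtxn = index.read_txn().unwrap();
    let mut ctx = SearchContext::new(&index, &rtxn);

    // Build MatchingWords from an actual query string instead of hand-built DFAs.
    let tokenizer = TokenizerBuilder::new().build();
    let tokens = tokenizer.tokenize("the door d");
    // Assumption: this helper is crate-internal in this commit.
    let query_terms = located_query_terms_from_string(&mut ctx, tokens, None).unwrap();
    let matching_words = MatchingWords::new(ctx, query_terms);
    let builder = MatcherBuilder::new(matching_words, TokenizerBuilder::new().build());

    let format_options = FormatOptions { highlight: true, crop: Some(10) };
    c.bench_function("highlight-crop(10)", |b| {
        b.iter(|| {
            let mut matcher = builder.build("He used to do the door sounds with his mouth.");
            matcher.format(format_options);
        })
    });
}

criterion_group!(benches, bench_formatting);
criterion_main!(benches);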

View File

@@ -98,8 +98,8 @@ pub use self::heed_codec::{
 };
 pub use self::index::Index;
 pub use self::search::{
-    FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWord,
-    MatchingWords, Search, SearchResult, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
+    FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWords, Search,
+    SearchResult, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
 };
 
 pub type Result<T> = std::result::Result<T, error::Error>;
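Downstream of this change, MatchingWord is no longer part of milli's public API; highlighting is driven entirely through MatchingWords and MatcherBuilder. A downstream import that still compiles after this commit looks like:

use milli::{FormatOptions, MatchBounds, MatcherBuilder, MatchingWords, Search, SearchResult};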

View File

@@ -1,458 +0,0 @@
use std::cmp::{min, Reverse};
use std::collections::BTreeMap;
use std::fmt;
use std::ops::{Index, IndexMut};
use std::rc::Rc;
use charabia::Token;
use levenshtein_automata::{Distance, DFA};
use crate::error::InternalError;
use crate::search::build_dfa;
use crate::MAX_WORD_LENGTH;
type IsPrefix = bool;
/// Structure created from a query tree,
/// referencing the words that match it.
#[derive(Default)]
pub struct MatchingWords {
inner: Vec<(Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)>,
}
impl fmt::Debug for MatchingWords {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
writeln!(f, "[")?;
for (matching_words, primitive_word_id) in self.inner.iter() {
writeln!(f, "({matching_words:?}, {primitive_word_id:?})")?;
}
writeln!(f, "]")?;
Ok(())
}
}
impl MatchingWords {
pub fn new(
mut matching_words: Vec<(Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)>,
) -> crate::Result<Self> {
// fail if one of the matching_words vecs doesn't contain any word.
if matching_words.iter().any(|(mw, _)| mw.is_empty()) {
return Err(InternalError::InvalidMatchingWords.into());
}
// Sort words by length in descending order, prioritizing the longest matches,
// in order to highlight the longest part of the matched word.
matching_words.sort_unstable_by_key(|(mw, _)| Reverse((mw.len(), mw[0].word.len())));
Ok(Self { inner: matching_words })
}
/// Returns an iterator over terms that match or partially match the given token.
pub fn match_token<'a, 'b>(&'a self, token: &'b Token<'b>) -> MatchesIter<'a, 'b> {
MatchesIter { inner: Box::new(self.inner.iter()), token }
}
}
/// Iterator over terms that match the given token;
/// it allows matches to be evaluated lazily.
pub struct MatchesIter<'a, 'b> {
#[allow(clippy::type_complexity)]
inner: Box<dyn Iterator<Item = &'a (Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)> + 'a>,
token: &'b Token<'b>,
}
impl<'a> Iterator for MatchesIter<'a, '_> {
type Item = MatchType<'a>;
fn next(&mut self) -> Option<Self::Item> {
match self.inner.next() {
Some((matching_words, ids)) => match matching_words[0].match_token(self.token) {
Some(char_len) => {
if matching_words.len() > 1 {
Some(MatchType::Partial(PartialMatch {
matching_words: &matching_words[1..],
ids,
char_len,
}))
} else {
Some(MatchType::Full { char_len, ids })
}
}
None => self.next(),
},
None => None,
}
}
}
/// Id of a matching term corresponding to a word written by the end user.
pub type PrimitiveWordId = u8;
/// Structure used to match a specific term.
pub struct MatchingWord {
pub dfa: DFA,
pub word: String,
pub typo: u8,
pub prefix: IsPrefix,
}
impl fmt::Debug for MatchingWord {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("MatchingWord")
.field("word", &self.word)
.field("typo", &self.typo)
.field("prefix", &self.prefix)
.finish()
}
}
impl PartialEq for MatchingWord {
fn eq(&self, other: &Self) -> bool {
self.prefix == other.prefix && self.typo == other.typo && self.word == other.word
}
}
impl MatchingWord {
pub fn new(word: String, typo: u8, prefix: IsPrefix) -> Option<Self> {
if word.len() > MAX_WORD_LENGTH {
return None;
}
let dfa = build_dfa(&word, typo, prefix);
Some(Self { dfa, word, typo, prefix })
}
/// Returns the length in chars of the match if the token matches the term.
pub fn match_token(&self, token: &Token) -> Option<usize> {
match self.dfa.eval(token.lemma()) {
Distance::Exact(t) if t <= self.typo => {
if self.prefix {
let len = bytes_to_highlight(token.lemma(), &self.word);
Some(token.original_lengths(len).0)
} else {
Some(token.original_lengths(token.lemma().len()).0)
}
}
_otherwise => None,
}
}
}
/// A given token can partially match a query word for several reasons:
/// - split words
/// - multi-word synonyms
/// In these cases we need to match several consecutive tokens to consider that the match is full.
#[derive(Debug, PartialEq)]
pub enum MatchType<'a> {
Full { char_len: usize, ids: &'a [PrimitiveWordId] },
Partial(PartialMatch<'a>),
}
/// Structure helper to match several tokens in a row in order to complete a partial match.
#[derive(Debug, PartialEq)]
pub struct PartialMatch<'a> {
matching_words: &'a [Rc<MatchingWord>],
ids: &'a [PrimitiveWordId],
char_len: usize,
}
impl<'a> PartialMatch<'a> {
/// Returns:
/// - None if the given token breaks the partial match
/// - Partial if the given token matches the partial match but doesn't complete it
/// - Full if the given token completes the partial match
pub fn match_token(self, token: &Token) -> Option<MatchType<'a>> {
self.matching_words[0].match_token(token).map(|char_len| {
if self.matching_words.len() > 1 {
MatchType::Partial(PartialMatch {
matching_words: &self.matching_words[1..],
ids: self.ids,
char_len,
})
} else {
MatchType::Full { char_len, ids: self.ids }
}
})
}
pub fn char_len(&self) -> usize {
self.char_len
}
}
// A simple wrapper around a Vec so the data stays contiguous, but can be indexed like a 2D array.
struct N2Array<T> {
y_size: usize,
buf: Vec<T>,
}
impl<T: Clone> N2Array<T> {
fn new(x: usize, y: usize, value: T) -> N2Array<T> {
N2Array { y_size: y, buf: vec![value; x * y] }
}
}
impl<T> Index<(usize, usize)> for N2Array<T> {
type Output = T;
#[inline]
fn index(&self, (x, y): (usize, usize)) -> &T {
&self.buf[(x * self.y_size) + y]
}
}
impl<T> IndexMut<(usize, usize)> for N2Array<T> {
#[inline]
fn index_mut(&mut self, (x, y): (usize, usize)) -> &mut T {
&mut self.buf[(x * self.y_size) + y]
}
}
/// Returns the number of **bytes** we want to highlight in the `source` word.
/// Basically, we want to highlight as many characters as possible in the source until it has too many
/// typos (= 2).
/// The algorithm is a modified
/// [Damerau-Levenshtein](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance)
fn bytes_to_highlight(source: &str, target: &str) -> usize {
let n = source.chars().count();
let m = target.chars().count();
if n == 0 {
return 0;
}
// since we allow two typos, we can send two characters even if they're completely wrong
if m < 3 {
return source.chars().take(m).map(|c| c.len_utf8()).sum();
}
if n == m && source == target {
return source.len();
}
let inf = n + m;
let mut matrix = N2Array::new(n + 2, m + 2, 0);
matrix[(0, 0)] = inf;
for i in 0..=n {
matrix[(i + 1, 0)] = inf;
matrix[(i + 1, 1)] = i;
}
for j in 0..=m {
matrix[(0, j + 1)] = inf;
matrix[(1, j + 1)] = j;
}
let mut last_row = BTreeMap::new();
for (row, char_s) in source.chars().enumerate() {
let mut last_match_col = 0;
let row = row + 1;
for (col, char_t) in target.chars().enumerate() {
let col = col + 1;
let last_match_row = *last_row.get(&char_t).unwrap_or(&0);
let cost = usize::from(char_s != char_t);
let dist_add = matrix[(row, col + 1)] + 1;
let dist_del = matrix[(row + 1, col)] + 1;
let dist_sub = matrix[(row, col)] + cost;
let dist_trans = matrix[(last_match_row, last_match_col)]
+ (row - last_match_row - 1)
+ 1
+ (col - last_match_col - 1);
let dist = min(min(dist_add, dist_del), min(dist_sub, dist_trans));
matrix[(row + 1, col + 1)] = dist;
if cost == 0 {
last_match_col = col;
}
}
last_row.insert(char_s, row);
}
let mut minimum = (u32::max_value(), 0);
for x in 0..=m {
let dist = matrix[(n + 1, x + 1)] as u32;
if dist < minimum.0 {
minimum = (dist, x);
}
}
// everything was done character-wise, and now we want to return a number of bytes
source.chars().take(minimum.1).map(|c| c.len_utf8()).sum()
}
#[cfg(test)]
mod tests {
use std::borrow::Cow;
use std::str::from_utf8;
use charabia::TokenKind;
use super::*;
use crate::MatchingWords;
#[test]
fn test_bytes_to_highlight() {
struct TestBytesToHighlight {
query: &'static str,
text: &'static str,
length: usize,
}
let tests = [
TestBytesToHighlight { query: "bip", text: "bip", length: "bip".len() },
TestBytesToHighlight { query: "bip", text: "boup", length: "bip".len() },
TestBytesToHighlight {
query: "Levenshtein",
text: "Levenshtein",
length: "Levenshtein".len(),
},
// we get to the end of our word with only one typo
TestBytesToHighlight {
query: "Levenste",
text: "Levenshtein",
length: "Levenste".len(),
},
// we get our third and last authorized typo right on the last character
TestBytesToHighlight {
query: "Levenstein",
text: "Levenshte",
length: "Levenste".len(),
},
// we get to the end of our word with only two typos at the beginning
TestBytesToHighlight {
query: "Bavenshtein",
text: "Levenshtein",
length: "Bavenshtein".len(),
},
TestBytesToHighlight {
query: "Альфа", text: "Альфой", length: "Альф".len()
},
TestBytesToHighlight {
query: "Go💼", text: "Go💼od luck.", length: "Go💼".len()
},
TestBytesToHighlight {
query: "Go💼od", text: "Go💼od luck.", length: "Go💼od".len()
},
TestBytesToHighlight {
query: "chäräcters",
text: "chäräcters",
length: "chäräcters".len(),
},
TestBytesToHighlight { query: "ch", text: "chäräcters", length: "ch".len() },
TestBytesToHighlight { query: "chär", text: "chäräcters", length: "chär".len() },
];
for test in &tests {
let length = bytes_to_highlight(test.text, test.query);
assert_eq!(length, test.length, r#"length between: "{}" "{}""#, test.query, test.text);
assert!(
from_utf8(&test.query.as_bytes()[..length]).is_ok(),
r#"converting {}[..{}] to an utf8 str failed"#,
test.query,
length
);
}
}
#[test]
fn matching_words() {
let all = vec![
Rc::new(MatchingWord::new("split".to_string(), 1, true).unwrap()),
Rc::new(MatchingWord::new("this".to_string(), 0, false).unwrap()),
Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()),
];
let matching_words = vec![
(vec![all[0].clone()], vec![0]),
(vec![all[1].clone()], vec![1]),
(vec![all[2].clone()], vec![2]),
];
let matching_words = MatchingWords::new(matching_words).unwrap();
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("word"),
char_end: "word".chars().count(),
byte_end: "word".len(),
..Default::default()
})
.next(),
Some(MatchType::Full { char_len: 3, ids: &[2] })
);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("nyc"),
char_end: "nyc".chars().count(),
byte_end: "nyc".len(),
..Default::default()
})
.next(),
None
);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("world"),
char_end: "world".chars().count(),
byte_end: "world".len(),
..Default::default()
})
.next(),
Some(MatchType::Full { char_len: 5, ids: &[2] })
);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("splitted"),
char_end: "splitted".chars().count(),
byte_end: "splitted".len(),
..Default::default()
})
.next(),
Some(MatchType::Full { char_len: 5, ids: &[0] })
);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("thisnew"),
char_end: "thisnew".chars().count(),
byte_end: "thisnew".len(),
..Default::default()
})
.next(),
None
);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("borld"),
char_end: "borld".chars().count(),
byte_end: "borld".len(),
..Default::default()
})
.next(),
Some(MatchType::Full { char_len: 5, ids: &[2] })
);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("wordsplit"),
char_end: "wordsplit".chars().count(),
byte_end: "wordsplit".len(),
..Default::default()
})
.next(),
Some(MatchType::Full { char_len: 4, ids: &[2] })
);
}
}

View File

@@ -5,9 +5,8 @@ use once_cell::sync::Lazy;
 use roaring::bitmap::RoaringBitmap;
 
 pub use self::facet::{FacetDistribution, Filter, DEFAULT_VALUES_PER_FACET};
-pub use self::matches::{
-    FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWord, MatchingWords,
-};
+pub use self::new::matches::{FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWords};
+use self::new::PartialSearchResult;
 use crate::{
     execute_search, AscDesc, DefaultSearchLogger, DocumentId, Index, Result, SearchContext,
 };
@@ -19,7 +18,6 @@ static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true));
 
 pub mod facet;
 mod fst_utils;
-mod matches;
 pub mod new;
 
 pub struct Search<'a> {
@@ -110,6 +108,7 @@ impl<'a> Search<'a> {
     pub fn execute(&self) -> Result<SearchResult> {
         let mut ctx = SearchContext::new(self.index, self.rtxn);
+        let PartialSearchResult { located_query_terms, candidates, documents_ids } =
             execute_search(
                 &mut ctx,
                 &self.query,
@@ -122,7 +121,15 @@ impl<'a> Search<'a> {
                 Some(self.words_limit),
                 &mut DefaultSearchLogger,
                 &mut DefaultSearchLogger,
-            )
+            )?;
+
+        // consume context and located_query_terms to build MatchingWords.
+        let matching_words = match located_query_terms {
+            Some(located_query_terms) => MatchingWords::new(ctx, located_query_terms),
+            None => MatchingWords::default(),
+        };
+
+        Ok(SearchResult { matching_words, candidates, documents_ids })
     }
 }

View File

@@ -0,0 +1,377 @@
use std::cmp::Reverse;
use std::fmt;
use std::ops::RangeInclusive;
use charabia::Token;
use super::super::interner::Interned;
use super::super::query_term::{
Lazy, LocatedQueryTerm, OneTypoTerm, QueryTerm, TwoTypoTerm, ZeroTypoTerm,
};
use super::super::{DedupInterner, Phrase};
use crate::SearchContext;
pub struct LocatedMatchingPhrase {
pub value: Interned<Phrase>,
pub positions: RangeInclusive<WordId>,
}
pub struct LocatedMatchingWords {
pub value: Vec<Interned<String>>,
pub positions: RangeInclusive<WordId>,
pub is_prefix: bool,
pub original_char_count: usize,
}
/// Structure created from a query tree,
/// referencing the words that match it.
#[derive(Default)]
pub struct MatchingWords {
word_interner: DedupInterner<String>,
phrase_interner: DedupInterner<Phrase>,
phrases: Vec<LocatedMatchingPhrase>,
words: Vec<LocatedMatchingWords>,
}
/// Extract and centralize the different phrases and words to match that are stored in a QueryTerm.
fn extract_matching_terms(term: &QueryTerm) -> (Vec<Interned<Phrase>>, Vec<Interned<String>>) {
let mut matching_words = Vec::new();
let mut matching_phrases = Vec::new();
// the structure is exhaustively extracted to ensure that no field is missing.
let QueryTerm {
original: _,
is_multiple_words: _,
max_nbr_typos: _,
is_prefix: _,
zero_typo,
one_typo,
two_typo,
} = term;
// the structure is exhaustively extracted to ensure that no field is missing.
let ZeroTypoTerm { phrase, zero_typo, prefix_of: _, synonyms, use_prefix_db: _ } = zero_typo;
// zero typo
if let Some(phrase) = phrase {
matching_phrases.push(*phrase);
}
if let Some(zero_typo) = zero_typo {
matching_words.push(*zero_typo);
}
for synonym in synonyms {
matching_phrases.push(*synonym);
}
// one typo
// the structure is exhaustively extracted to ensure that no field is missing.
if let Lazy::Init(OneTypoTerm { split_words, one_typo }) = one_typo {
if let Some(split_words) = split_words {
matching_phrases.push(*split_words);
}
for one_typo in one_typo {
matching_words.push(*one_typo);
}
}
// two typos
// the structure is exhaustively extracted to ensure that no field is missing.
if let Lazy::Init(TwoTypoTerm { two_typos }) = two_typo {
for two_typos in two_typos {
matching_words.push(*two_typos);
}
}
(matching_phrases, matching_words)
}
impl MatchingWords {
pub fn new(ctx: SearchContext, located_terms: Vec<LocatedQueryTerm>) -> Self {
let mut phrases = Vec::new();
let mut words = Vec::new();
// Extract and centralize the different phrases and words to match stored in a QueryTerm using extract_matching_terms
// and wrap them in dedicated structures.
for located_term in located_terms {
let term = ctx.term_interner.get(located_term.value);
let (matching_phrases, matching_words) = extract_matching_terms(term);
for matching_phrase in matching_phrases {
phrases.push(LocatedMatchingPhrase {
value: matching_phrase,
positions: located_term.positions.clone(),
});
}
words.push(LocatedMatchingWords {
value: matching_words,
positions: located_term.positions.clone(),
is_prefix: term.is_prefix,
original_char_count: ctx.word_interner.get(term.original).chars().count(),
});
}
// Sort words to put prefixes at the bottom, prioritizing exact matches.
words.sort_unstable_by_key(|lmw| (lmw.is_prefix, Reverse(lmw.positions.len())));
Self {
phrases,
words,
word_interner: ctx.word_interner,
phrase_interner: ctx.phrase_interner,
}
}
/// Returns an iterator over terms that match or partially match the given token.
pub fn match_token<'a, 'b>(&'a self, token: &'b Token<'b>) -> MatchesIter<'a, 'b> {
MatchesIter { matching_words: self, phrases: Box::new(self.phrases.iter()), token }
}
/// Try to match the token with one of the located_words.
fn match_unique_words<'a>(&'a self, token: &Token) -> Option<MatchType<'a>> {
for located_words in &self.words {
for word in &located_words.value {
let word = self.word_interner.get(*word);
// if the word is a prefix we match using starts_with.
if located_words.is_prefix && token.lemma().starts_with(word) {
let Some((char_index, c)) = word.char_indices().take(located_words.original_char_count).last() else {
continue;
};
let prefix_length = char_index + c.len_utf8();
let char_len = token.original_lengths(prefix_length).0;
let ids = &located_words.positions;
return Some(MatchType::Full { char_len, ids });
// else we exact match the token.
} else if token.lemma() == word {
let char_len = token.char_end - token.char_start;
let ids = &located_words.positions;
return Some(MatchType::Full { char_len, ids });
}
}
}
None
}
}
/// Iterator over terms that match the given token;
/// it allows matches to be evaluated lazily.
pub struct MatchesIter<'a, 'b> {
matching_words: &'a MatchingWords,
phrases: Box<dyn Iterator<Item = &'a LocatedMatchingPhrase> + 'a>,
token: &'b Token<'b>,
}
impl<'a> Iterator for MatchesIter<'a, '_> {
type Item = MatchType<'a>;
fn next(&mut self) -> Option<Self::Item> {
match self.phrases.next() {
// Try to match all the phrases first.
Some(located_phrase) => {
let phrase = self.matching_words.phrase_interner.get(located_phrase.value);
// create a PartialMatch struct to compute the first match,
// instead of duplicating the code.
let ids = &located_phrase.positions;
// collect the references of words from the interner.
let words = phrase
.words
.iter()
.map(|word| {
word.map(|word| self.matching_words.word_interner.get(word).as_str())
})
.collect();
let partial = PartialMatch { matching_words: words, ids, char_len: 0 };
partial.match_token(self.token).or_else(|| self.next())
}
// If no phrase matches, try to match unique words.
None => self.matching_words.match_unique_words(self.token),
}
}
}
/// Id of a matching term corresponding to a word written by the end user.
pub type WordId = u16;
/// A given token can partially match a query word for several reasons:
/// - split words
/// - multi-word synonyms
/// In these cases we need to match several consecutive tokens to consider that the match is full.
#[derive(Debug, PartialEq)]
pub enum MatchType<'a> {
Full { char_len: usize, ids: &'a RangeInclusive<WordId> },
Partial(PartialMatch<'a>),
}
/// Structure helper to match several tokens in a row in order to complete a partial match.
#[derive(Debug, PartialEq)]
pub struct PartialMatch<'a> {
matching_words: Vec<Option<&'a str>>,
ids: &'a RangeInclusive<WordId>,
char_len: usize,
}
impl<'a> PartialMatch<'a> {
/// Returns:
/// - None if the given token breaks the partial match
/// - Partial if the given token matches the partial match but doesn't complete it
/// - Full if the given token completes the partial match
pub fn match_token(self, token: &Token) -> Option<MatchType<'a>> {
let Self { mut matching_words, ids, .. } = self;
let is_matching = match matching_words.first()? {
Some(word) => &token.lemma() == word,
// a None value in the phrase corresponds to a stop word,
// the value is considered a match if the current token is categorized as a stop word.
None => token.is_stopword(),
};
let char_len = token.char_end - token.char_start;
// if there are remaining words to match in the phrase and the current token is matching,
// return a new Partial match allowing the highlighter to continue.
if is_matching && matching_words.len() > 1 {
matching_words.remove(0);
Some(MatchType::Partial(PartialMatch { matching_words, ids, char_len }))
// if there is no remaining word to match in the phrase and the current token is matching,
// return a Full match.
} else if is_matching {
Some(MatchType::Full { char_len, ids })
// if the current token doesn't match, return None to break the match sequence.
} else {
None
}
}
pub fn char_len(&self) -> usize {
self.char_len
}
}
impl fmt::Debug for MatchingWords {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let MatchingWords { word_interner, phrase_interner, phrases, words } = self;
let phrases: Vec<_> = phrases
.iter()
.map(|p| {
(
phrase_interner
.get(p.value)
.words
.iter()
.map(|w| w.map_or("STOP_WORD", |w| word_interner.get(w)))
.collect::<Vec<_>>()
.join(" "),
p.positions.clone(),
)
})
.collect();
let words: Vec<_> = words
.iter()
.flat_map(|w| {
w.value
.iter()
.map(|s| (word_interner.get(*s), w.positions.clone(), w.is_prefix))
.collect::<Vec<_>>()
})
.collect();
f.debug_struct("MatchingWords").field("phrases", &phrases).field("words", &words).finish()
}
}
#[cfg(test)]
pub(crate) mod tests {
use std::borrow::Cow;
use charabia::{TokenKind, TokenizerBuilder};
use super::super::super::located_query_terms_from_string;
use super::*;
use crate::index::tests::TempIndex;
pub(crate) fn temp_index_with_documents() -> TempIndex {
let temp_index = TempIndex::new();
temp_index
.add_documents(documents!([
{ "id": 1, "name": "split this world westfali westfalia the" },
]))
.unwrap();
temp_index
}
#[test]
fn matching_words() {
let temp_index = temp_index_with_documents();
let rtxn = temp_index.read_txn().unwrap();
let mut ctx = SearchContext::new(&temp_index, &rtxn);
let tokenizer = TokenizerBuilder::new().build();
let tokens = tokenizer.tokenize("split this world");
let query_terms = located_query_terms_from_string(&mut ctx, tokens, None).unwrap();
let matching_words = MatchingWords::new(ctx, query_terms);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("split"),
char_end: "split".chars().count(),
byte_end: "split".len(),
..Default::default()
})
.next(),
Some(MatchType::Full { char_len: 5, ids: &(0..=0) })
);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("nyc"),
char_end: "nyc".chars().count(),
byte_end: "nyc".len(),
..Default::default()
})
.next(),
None
);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("world"),
char_end: "world".chars().count(),
byte_end: "world".len(),
..Default::default()
})
.next(),
Some(MatchType::Full { char_len: 5, ids: &(2..=2) })
);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("worlded"),
char_end: "worlded".chars().count(),
byte_end: "worlded".len(),
..Default::default()
})
.next(),
Some(MatchType::Full { char_len: 5, ids: &(2..=2) })
);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("thisnew"),
char_end: "thisnew".chars().count(),
byte_end: "thisnew".len(),
..Default::default()
})
.next(),
None
);
}
}
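Note that the tests above only exercise MatchType::Full. For phrases and split words, match_token yields a MatchType::Partial, and the caller is expected to feed the following tokens into PartialMatch::match_token until it returns Full or breaks. A minimal sketch of that loop, assuming matching_words was built from a phrase query such as "split this" and tokenizer is a charabia tokenizer:

// Sketch: walking a phrase match token by token; this is what the Matcher in
// the next file does with its potential_matches buffer.
let mut tokens = tokenizer.tokenize("split this world");
let first = tokens.next().unwrap();
match matching_words.match_token(&first).next() {
    // The phrase has more words: keep feeding tokens until Full or None.
    Some(MatchType::Partial(partial)) => {
        let second = tokens.next().unwrap();
        match partial.match_token(&second) {
            Some(MatchType::Full { .. }) => { /* phrase fully matched */ }
            Some(MatchType::Partial(_)) => { /* longer phrase: keep going */ }
            None => { /* this token broke the phrase: abandon the candidate */ }
        }
    }
    Some(MatchType::Full { .. }) => { /* single word matched on its own */ }
    None => { /* no match for this token */ }
}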

View File

@@ -1,8 +1,8 @@
 use std::borrow::Cow;
 
 use charabia::{SeparatorKind, Token, Tokenizer};
-use matching_words::{MatchType, PartialMatch, PrimitiveWordId};
-pub use matching_words::{MatchingWord, MatchingWords};
+pub use matching_words::MatchingWords;
+use matching_words::{MatchType, PartialMatch, WordId};
 use serde::Serialize;
 
 pub mod matching_words;
@@ -88,7 +88,7 @@ impl FormatOptions {
 pub struct Match {
     match_len: usize,
     // ids of the query words that matches.
-    ids: Vec<PrimitiveWordId>,
+    ids: Vec<WordId>,
     // position of the word in the whole text.
     word_position: usize,
     // position of the token in the whole text.
@@ -137,11 +137,12 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
                     }
                     // partial match is now full, we keep this matches and we advance positions
                     Some(MatchType::Full { char_len, ids }) => {
+                        let ids: Vec<_> = ids.clone().into_iter().collect();
                         // save previously matched tokens as matches.
                         let iter = potential_matches.into_iter().map(
                             |(token_position, word_position, match_len)| Match {
                                 match_len,
-                                ids: ids.to_vec(),
+                                ids: ids.clone(),
                                 word_position,
                                 token_position,
                             },
@@ -151,7 +152,7 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
                         // save the token that closes the partial match as a match.
                         matches.push(Match {
                             match_len: char_len,
-                            ids: ids.to_vec(),
+                            ids,
                             word_position,
                             token_position,
                         });
@@ -191,9 +192,10 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
                     // we match, we save the current token as a match,
                     // then we continue the rest of the tokens.
                     MatchType::Full { char_len, ids } => {
+                        let ids: Vec<_> = ids.clone().into_iter().collect();
                         matches.push(Match {
                             match_len: char_len,
-                            ids: ids.to_vec(),
+                            ids,
                             word_position,
                             token_position,
                         });
@@ -334,7 +336,7 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
     /// 2) calculate distance between matches
     /// 3) count ordered matches
     fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) {
-        let mut ids: Vec<PrimitiveWordId> = Vec::with_capacity(matches.len());
+        let mut ids: Vec<WordId> = Vec::with_capacity(matches.len());
         let mut order_score = 0;
         let mut distance_score = 0;
@@ -494,39 +496,29 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
 #[cfg(test)]
 mod tests {
-    use std::rc::Rc;
-
     use charabia::TokenizerBuilder;
+    use matching_words::tests::temp_index_with_documents;
+
+    use super::super::located_query_terms_from_string;
     use super::*;
-    use crate::search::matches::matching_words::MatchingWord;
+    use crate::SearchContext;
 
-    fn matching_words() -> MatchingWords {
-        let all = vec![
-            Rc::new(MatchingWord::new("split".to_string(), 0, false).unwrap()),
-            Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap()),
-            Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()),
-        ];
-        let matching_words = vec![
-            (vec![all[0].clone()], vec![0]),
-            (vec![all[1].clone()], vec![1]),
-            (vec![all[2].clone()], vec![2]),
-        ];
-
-        MatchingWords::new(matching_words).unwrap()
-    }
-
-    impl MatcherBuilder<'_, Vec<u8>> {
-        pub fn from_matching_words(matching_words: MatchingWords) -> Self {
-            Self::new(matching_words, TokenizerBuilder::default().build())
+    impl<'a> MatcherBuilder<'a, &[u8]> {
+        pub fn new_test(mut ctx: SearchContext, query: &'a str) -> Self {
+            let tokenizer = TokenizerBuilder::new().build();
+            let tokens = tokenizer.tokenize(query);
+            let query_terms = located_query_terms_from_string(&mut ctx, tokens, None).unwrap();
+            let matching_words = MatchingWords::new(ctx, query_terms);
+            Self::new(matching_words, TokenizerBuilder::new().build())
         }
     }
 
     #[test]
     fn format_identity() {
-        let matching_words = matching_words();
-
-        let builder = MatcherBuilder::from_matching_words(matching_words);
+        let temp_index = temp_index_with_documents();
+        let rtxn = temp_index.read_txn().unwrap();
+        let ctx = SearchContext::new(&temp_index, &rtxn);
+        let builder = MatcherBuilder::new_test(ctx, "split the world");
 
         let format_options = FormatOptions { highlight: false, crop: None };
@@ -551,9 +543,10 @@ mod tests {
     #[test]
     fn format_highlight() {
-        let matching_words = matching_words();
-
-        let builder = MatcherBuilder::from_matching_words(matching_words);
+        let temp_index = temp_index_with_documents();
+        let rtxn = temp_index.read_txn().unwrap();
+        let ctx = SearchContext::new(&temp_index, &rtxn);
+        let builder = MatcherBuilder::new_test(ctx, "split the world");
 
         let format_options = FormatOptions { highlight: true, crop: None };
@@ -594,16 +587,10 @@ mod tests {
     #[test]
     fn highlight_unicode() {
-        let all = vec![
-            Rc::new(MatchingWord::new("wessfali".to_string(), 1, true).unwrap()),
-            Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()),
-        ];
-        let matching_words = vec![(vec![all[0].clone()], vec![0]), (vec![all[1].clone()], vec![1])];
-
-        let matching_words = MatchingWords::new(matching_words).unwrap();
-
-        let builder = MatcherBuilder::from_matching_words(matching_words);
+        let temp_index = temp_index_with_documents();
+        let rtxn = temp_index.read_txn().unwrap();
+        let ctx = SearchContext::new(&temp_index, &rtxn);
+        let builder = MatcherBuilder::new_test(ctx, "world");
 
         let format_options = FormatOptions { highlight: true, crop: None };
 
         // Text containing prefix match.
@@ -624,6 +611,10 @@ mod tests {
             @"<em>Ŵôřlḑ</em>"
         );
 
+        let ctx = SearchContext::new(&temp_index, &rtxn);
+        let builder = MatcherBuilder::new_test(ctx, "westfali");
+        let format_options = FormatOptions { highlight: true, crop: None };
+
         // Text containing unicode match.
         let text = "Westfália";
         let mut matcher = builder.build(text);
@@ -636,9 +627,10 @@ mod tests {
     #[test]
     fn format_crop() {
-        let matching_words = matching_words();
-
-        let builder = MatcherBuilder::from_matching_words(matching_words);
+        let temp_index = temp_index_with_documents();
+        let rtxn = temp_index.read_txn().unwrap();
+        let ctx = SearchContext::new(&temp_index, &rtxn);
+        let builder = MatcherBuilder::new_test(ctx, "split the world");
 
         let format_options = FormatOptions { highlight: false, crop: Some(10) };
@@ -733,9 +725,10 @@ mod tests {
     #[test]
     fn format_highlight_crop() {
-        let matching_words = matching_words();
-
-        let builder = MatcherBuilder::from_matching_words(matching_words);
+        let temp_index = temp_index_with_documents();
+        let rtxn = temp_index.read_txn().unwrap();
+        let ctx = SearchContext::new(&temp_index, &rtxn);
+        let builder = MatcherBuilder::new_test(ctx, "split the world");
 
         let format_options = FormatOptions { highlight: true, crop: Some(10) };
@@ -795,9 +788,10 @@ mod tests {
     #[test]
     fn smaller_crop_size() {
         //! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
-        let matching_words = matching_words();
-
-        let builder = MatcherBuilder::from_matching_words(matching_words);
+        let temp_index = temp_index_with_documents();
+        let rtxn = temp_index.read_txn().unwrap();
+        let ctx = SearchContext::new(&temp_index, &rtxn);
+        let builder = MatcherBuilder::new_test(ctx, "split the world");
 
         let text = "void void split the world void void.";
@@ -831,25 +825,10 @@ mod tests {
     #[test]
     fn partial_matches() {
-        let all = vec![
-            Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap()),
-            Rc::new(MatchingWord::new("t".to_string(), 0, false).unwrap()),
-            Rc::new(MatchingWord::new("he".to_string(), 0, false).unwrap()),
-            Rc::new(MatchingWord::new("door".to_string(), 0, false).unwrap()),
-            Rc::new(MatchingWord::new("do".to_string(), 0, false).unwrap()),
-            Rc::new(MatchingWord::new("or".to_string(), 0, false).unwrap()),
-        ];
-        let matching_words = vec![
-            (vec![all[0].clone()], vec![0]),
-            (vec![all[1].clone(), all[2].clone()], vec![0]),
-            (vec![all[3].clone()], vec![1]),
-            (vec![all[4].clone(), all[5].clone()], vec![1]),
-            (vec![all[4].clone()], vec![2]),
-        ];
-
-        let matching_words = MatchingWords::new(matching_words).unwrap();
-
-        let mut builder = MatcherBuilder::from_matching_words(matching_words);
+        let temp_index = temp_index_with_documents();
+        let rtxn = temp_index.read_txn().unwrap();
+        let ctx = SearchContext::new(&temp_index, &rtxn);
+        let mut builder = MatcherBuilder::new_test(ctx, "the \"t he\" door \"do or\"");
         builder.highlight_prefix("_".to_string());
         builder.highlight_suffix("_".to_string());
@@ -859,7 +838,7 @@ mod tests {
         let mut matcher = builder.build(text);
         insta::assert_snapshot!(
            matcher.format(format_options),
-            @"_the_ _do_ _or_ die can't be he _do_ and or isn'_t_ _he_"
+            @"_the_ _do_ _or_ die can't be he do and or isn'_t_ _he_"
        );
     }
 }

View File

@@ -5,6 +5,7 @@ mod graph_based_ranking_rule;
 mod interner;
 mod limits;
 mod logger;
+pub mod matches;
 mod query_graph;
 mod query_term;
 mod ranking_rule_graph;
@@ -33,8 +34,8 @@ use interner::DedupInterner;
 pub use logger::detailed::DetailedSearchLogger;
 pub use logger::{DefaultSearchLogger, SearchLogger};
 use query_graph::{QueryGraph, QueryNode};
-use query_term::{located_query_terms_from_string, Phrase, QueryTerm};
-use ranking_rules::{PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait};
+use query_term::{located_query_terms_from_string, LocatedQueryTerm, Phrase, QueryTerm};
+use ranking_rules::{bucket_sort, PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait};
 use resolve_query_graph::PhraseDocIdsCache;
 use roaring::RoaringBitmap;
 use words::Words;
@@ -47,10 +48,7 @@ use self::ranking_rules::{BoxRankingRule, RankingRule};
 use self::resolve_query_graph::compute_query_graph_docids;
 use self::sort::Sort;
 use crate::search::new::distinct::apply_distinct_rule;
-use crate::{
-    AscDesc, Filter, Index, MatchingWords, Member, Result, SearchResult, TermsMatchingStrategy,
-    UserError,
-};
+use crate::{AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError};
 
 /// A structure used throughout the execution of a search query.
 pub struct SearchContext<'ctx> {
@@ -62,6 +60,7 @@ pub struct SearchContext<'ctx> {
     pub term_interner: Interner<QueryTerm>,
     pub phrase_docids: PhraseDocIdsCache,
 }
+
 impl<'ctx> SearchContext<'ctx> {
     pub fn new(index: &'ctx Index, txn: &'ctx RoTxn<'ctx>) -> Self {
         Self {
@@ -291,13 +290,14 @@ pub fn execute_search(
     words_limit: Option<usize>,
     placeholder_search_logger: &mut dyn SearchLogger<PlaceholderQuery>,
     query_graph_logger: &mut dyn SearchLogger<QueryGraph>,
-) -> Result<SearchResult> {
+) -> Result<PartialSearchResult> {
     let mut universe = if let Some(filters) = filters {
         filters.evaluate(ctx.txn, ctx.index)?
     } else {
         ctx.index.documents_ids(ctx.txn)?
     };
 
+    let mut located_query_terms = None;
     let bucket_sort_output = if let Some(query) = query {
         // We make sure that the analyzer is aware of the stop words
        // this ensures that the query builder is able to properly remove them.
@@ -317,6 +317,7 @@ pub fn execute_search(
         let query_terms = located_query_terms_from_string(ctx, tokens, words_limit)?;
         let graph = QueryGraph::from_query(ctx, &query_terms)?;
+        located_query_terms = Some(query_terms);
 
         check_sort_criteria(ctx, sort_criteria.as_ref())?;
@@ -357,9 +358,7 @@ pub fn execute_search(
         }
     }
 
-    Ok(SearchResult {
-        // TODO: correct matching words
-        matching_words: MatchingWords::default(),
+    Ok(PartialSearchResult {
         candidates: all_candidates,
         documents_ids: docids,
+        located_query_terms,
     })
@@ -406,3 +405,9 @@ fn check_sort_criteria(ctx: &SearchContext, sort_criteria: Option<&Vec<AscDesc>>
     Ok(())
 }
+
+pub struct PartialSearchResult {
+    pub located_query_terms: Option<Vec<LocatedQueryTerm>>,
+    pub candidates: RoaringBitmap,
+    pub documents_ids: Vec<DocumentId>,
+}
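PartialSearchResult is what makes the deferred highlighting work: execute_search no longer fabricates a placeholder MatchingWords::default() (the old TODO above), and because MatchingWords::new consumes the SearchContext (taking ownership of its word and phrase interners), the conversion can only happen once the search itself is done with the context. The caller-side pattern, as Search::execute now does it earlier in this commit:

// Mirrors the change to Search::execute above; argument list elided.
let PartialSearchResult { located_query_terms, candidates, documents_ids } =
    execute_search(&mut ctx, /* query, filters, sort criteria, loggers... */)?;

// ctx is moved into MatchingWords here, so this must come after the search.
let matching_words = match located_query_terms {
    Some(terms) => MatchingWords::new(ctx, terms),
    None => MatchingWords::default(),
};
Ok(SearchResult { matching_words, candidates, documents_ids })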