// Source: meilisearch/milli/src/search/query_tree.rs

use std::borrow::Cow;
use std::cmp::min;
use std::{cmp, fmt, mem};

use charabia::classifier::ClassifiedTokenIter;
use charabia::{SeparatorKind, TokenKind};
use fst::Set;
use roaring::RoaringBitmap;
use slice_group_by::GroupBy;

use crate::search::matches::matching_words::{MatchingWord, PrimitiveWordId};
use crate::search::TermsMatchingStrategy;
use crate::{Index, MatchingWords, Result};
/// Flag carried by `Operation::Or`; `true` is rendered as `OR(WORD)` by the `Debug` impl.
type IsOptionalWord = bool;
/// Flag telling whether a query word is a prefix.
type IsPrefix = bool;
/// A node of the query tree.
#[derive(Clone, PartialEq, Eq, Hash)]
pub enum Operation {
    /// Conjunction of sub-operations (printed as `AND`).
    And(Vec<Operation>),
    /// Series of consecutive non-prefix and exact words.
    Phrase(Vec<String>),
    /// Disjunction of sub-operations. The flag is `true` for the branches that
    /// `create_query_tree` builds while relaxing the query (printed as `OR(WORD)`),
    /// and `false` otherwise (printed as `OR`).
    Or(IsOptionalWord, Vec<Operation>),
    /// A single word query (leaf of the tree).
    Query(Query),
}
/// Pretty-prints the query tree, one node per line, indenting each level by
/// two spaces.
impl fmt::Debug for Operation {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        /// Recursively writes `op` and its children; `depth` is the nesting level.
        ///
        /// The `{:N$}` placeholders pad an empty string to `depth * 2` columns,
        /// which produces the indentation.
        fn pprint_tree(f: &mut fmt::Formatter<'_>, op: &Operation, depth: usize) -> fmt::Result {
            match op {
                Operation::And(children) => {
                    writeln!(f, "{:1$}AND", "", depth * 2)?;
                    children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
                }
                Operation::Phrase(children) => {
                    writeln!(f, "{:2$}PHRASE {:?}", "", children, depth * 2)
                }
                Operation::Or(true, children) => {
                    writeln!(f, "{:1$}OR(WORD)", "", depth * 2)?;
                    children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
                }
                Operation::Or(false, children) => {
                    writeln!(f, "{:1$}OR", "", depth * 2)?;
                    children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
                }
                Operation::Query(query) => writeln!(f, "{:2$}{:?}", "", query, depth * 2),
            }
        }

        pprint_tree(f, self, 0)
    }
}
impl Operation {
fn and(mut ops: Vec<Self>) -> Self {
if ops.len() == 1 {
ops.pop().unwrap()
} else {
Self::And(ops)
}
}
pub fn or(word_branch: IsOptionalWord, mut ops: Vec<Self>) -> Self {
if ops.len() == 1 {
ops.pop().unwrap()
} else {
2022-08-18 23:36:08 +08:00
let ops = ops
.into_iter()
.flat_map(|o| match o {
Operation::Or(wb, children) if wb == word_branch => children,
op => vec![op],
})
.collect();
Self::Or(word_branch, ops)
}
}
fn phrase(mut words: Vec<String>) -> Self {
if words.len() == 1 {
Self::Query(Query { prefix: false, kind: QueryKind::exact(words.pop().unwrap()) })
} else {
Self::Phrase(words)
}
}
2021-02-23 00:17:01 +08:00
pub fn query(&self) -> Option<&Query> {
match self {
Operation::Query(query) => Some(query),
_ => None,
}
}
}
/// A single word of the query tree.
#[derive(Clone, Eq, PartialEq, Hash)]
pub struct Query {
    /// Whether the word is a prefix (the `Debug` output is then prefixed with `Prefix`).
    pub prefix: IsPrefix,
    /// The word itself together with its typo tolerance.
    pub kind: QueryKind,
}
/// The word of a `Query` and how many typos it accepts.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum QueryKind {
    /// The word may be matched with up to `typo` typos.
    Tolerant { typo: u8, word: String },
    /// The word must be matched exactly; `original_typo` is the value reported by
    /// [`QueryKind::typo`] (set to `0` by [`QueryKind::exact`]).
    Exact { original_typo: u8, word: String },
}

impl QueryKind {
    /// Creates an `Exact` kind with `original_typo` set to `0`.
    pub fn exact(word: String) -> Self {
        QueryKind::Exact { original_typo: 0, word }
    }

    /// Creates a `Tolerant` kind accepting up to `typo` typos.
    pub fn tolerant(typo: u8, word: String) -> Self {
        QueryKind::Tolerant { typo, word }
    }

    /// Returns `typo` for `Tolerant` and `original_typo` for `Exact`.
    pub fn typo(&self) -> u8 {
        match self {
            QueryKind::Tolerant { typo, .. } => *typo,
            QueryKind::Exact { original_typo, .. } => *original_typo,
        }
    }

    /// Returns the word held by either variant.
    pub fn word(&self) -> &str {
        match self {
            QueryKind::Tolerant { word, .. } | QueryKind::Exact { word, .. } => word,
        }
    }
}
/// Renders a `Query` as `Exact { word }` or `Tolerant { word, max typo }`,
/// with the struct name prefixed by `Prefix` when `prefix` is set.
impl fmt::Debug for Query {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Query { prefix, kind } = self;
        let prefix = if *prefix { String::from("Prefix") } else { String::default() };
        match kind {
            QueryKind::Exact { word, .. } => {
                f.debug_struct(&(prefix + "Exact")).field("word", &word).finish()
            }
            QueryKind::Tolerant { typo, word } => f
                .debug_struct(&(prefix + "Tolerant"))
                .field("word", &word)
                .field("max typo", &typo)
                .finish(),
        }
    }
}
trait Context {
fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>>;
fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
match self.word_docids(word)? {
Some(rb) => Ok(Some(rb.len())),
None => Ok(None),
}
}
/// Returns the minimum word len for 1 and 2 typos.
fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>;
2022-05-24 18:14:55 +08:00
fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>>;
}
/// The query tree builder is the interface to build a query tree.
pub struct QueryTreeBuilder<'a> {
rtxn: &'a heed::RoTxn<'a>,
index: &'a Index,
terms_matching_strategy: TermsMatchingStrategy,
authorize_typos: bool,
2021-04-14 01:10:58 +08:00
words_limit: Option<usize>,
2022-05-24 15:43:17 +08:00
exact_words: Option<fst::Set<Cow<'a, [u8]>>>,
}
impl<'a> Context for QueryTreeBuilder<'a> {
fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>> {
self.index.word_docids.get(self.rtxn, word)
}
fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
2021-04-07 16:53:57 +08:00
self.index.words_synonyms(self.rtxn, words)
}
2021-04-07 16:53:57 +08:00
fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
self.index.word_documents_count(self.rtxn, word)
}
fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> {
let one = self.index.min_word_len_one_typo(&self.rtxn)?;
let two = self.index.min_word_len_two_typos(&self.rtxn)?;
Ok((one, two))
}
2022-03-21 23:25:15 +08:00
2022-05-24 18:14:55 +08:00
fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>> {
self.exact_words.as_ref()
2022-03-21 23:25:15 +08:00
}
}
impl<'a> QueryTreeBuilder<'a> {
/// Create a `QueryTreeBuilder` from a heed ReadOnly transaction `rtxn`
/// and an Index `index`.
2022-05-24 15:43:17 +08:00
pub fn new(rtxn: &'a heed::RoTxn<'a>, index: &'a Index) -> Result<Self> {
Ok(Self {
rtxn,
index,
terms_matching_strategy: TermsMatchingStrategy::default(),
2022-05-24 15:43:17 +08:00
authorize_typos: true,
words_limit: None,
2022-05-24 20:15:33 +08:00
exact_words: index.exact_words(rtxn)?,
2022-05-24 15:43:17 +08:00
})
}
/// if `terms_matching_strategy` is set to `All` the query tree will be
/// generated forcing all query words to be present in each matching documents
/// (the criterion `words` will be ignored).
/// default value if not called: `Last`
pub fn terms_matching_strategy(
&mut self,
terms_matching_strategy: TermsMatchingStrategy,
) -> &mut Self {
self.terms_matching_strategy = terms_matching_strategy;
self
}
/// if `authorize_typos` is set to `false` the query tree will be generated
/// forcing all query words to match documents without any typo
/// (the criterion `typo` will be ignored).
/// default value if not called: `true`
pub fn authorize_typos(&mut self, authorize_typos: bool) -> &mut Self {
self.authorize_typos = authorize_typos;
self
}
2021-04-14 01:10:58 +08:00
/// Limit words and phrases that will be taken for query building.
/// Any beyond `words_limit` will be ignored.
pub fn words_limit(&mut self, words_limit: usize) -> &mut Self {
self.words_limit = Some(words_limit);
self
}
/// Build the query tree:
/// - if `terms_matching_strategy` is set to `All` the query tree will be
/// generated forcing all query words to be present in each matching documents
/// (the criterion `words` will be ignored)
/// - if `authorize_typos` is set to `false` the query tree will be generated
/// forcing all query words to match documents without any typo
/// (the criterion `typo` will be ignored)
2022-06-02 21:47:28 +08:00
pub fn build<A: AsRef<[u8]>>(
2022-04-05 00:56:59 +08:00
&self,
2022-06-02 21:47:28 +08:00
query: ClassifiedTokenIter<A>,
2022-04-05 00:56:59 +08:00
) -> Result<Option<(Operation, PrimitiveQuery, MatchingWords)>> {
let stop_words = self.index.stop_words(self.rtxn)?;
2021-04-14 01:10:58 +08:00
let primitive_query = create_primitive_query(query, stop_words, self.words_limit);
if !primitive_query.is_empty() {
2021-06-17 00:33:33 +08:00
let qt = create_query_tree(
self,
self.terms_matching_strategy,
2021-06-17 00:33:33 +08:00
self.authorize_typos,
&primitive_query,
)?;
2022-04-05 00:56:59 +08:00
let matching_words =
create_matching_words(self, self.authorize_typos, &primitive_query)?;
Ok(Some((qt, primitive_query, matching_words)))
} else {
Ok(None)
}
}
}
/// Finds the best way to cut `word` in two, based on how frequent each half is
/// in the database documents.
///
/// Every char boundary after the first character is tried. A cut is scored by
/// the smaller document count of its two halves; cuts where one half is unknown
/// (count of zero) are discarded. The first cut reaching the highest score wins.
/// Returns `None` when no cut qualifies.
fn split_best_frequency<'a>(
    ctx: &impl Context,
    word: &'a str,
) -> heed::Result<Option<(&'a str, &'a str)>> {
    let mut winner: Option<(u64, &'a str, &'a str)> = None;

    for (cut, _) in word.char_indices().skip(1) {
        let (head, tail) = word.split_at(cut);
        let head_count = ctx.word_documents_count(head)?.unwrap_or(0);
        let tail_count = ctx.word_documents_count(tail)?.unwrap_or(0);
        let score = head_count.min(tail_count);

        // a half that appears in no document disqualifies the cut
        if score == 0 {
            continue;
        }

        let improves = match winner {
            Some((current, _, _)) => score > current,
            None => true,
        };
        if improves {
            winner = Some((score, head, tail));
        }
    }

    Ok(winner.map(|(_, head, tail)| (head, tail)))
}
2022-03-31 19:50:18 +08:00
#[derive(Clone)]
2022-03-21 23:25:15 +08:00
pub struct TypoConfig<'a> {
pub max_typos: u8,
pub word_len_one_typo: u8,
pub word_len_two_typo: u8,
2022-05-24 18:14:55 +08:00
pub exact_words: Option<&'a fst::Set<Cow<'a, [u8]>>>,
}
/// Return the `QueryKind` of a word depending on `authorize_typos`
/// and the provided word length.
2022-03-21 23:25:15 +08:00
fn typos<'a>(word: String, authorize_typos: bool, config: TypoConfig<'a>) -> QueryKind {
2022-05-24 20:15:33 +08:00
if authorize_typos && !config.exact_words.map_or(false, |s| s.contains(&word)) {
let count = word.chars().count().min(u8::MAX as usize) as u8;
2022-04-01 00:37:43 +08:00
if count < config.word_len_one_typo {
QueryKind::exact(word)
2022-04-01 00:37:43 +08:00
} else if count < config.word_len_two_typo {
QueryKind::tolerant(1.min(config.max_typos), word)
} else {
QueryKind::tolerant(2.min(config.max_typos), word)
}
} else {
QueryKind::exact(word)
}
}
/// Fetch synonyms from the `Context` for the provided word
/// and create the list of operations for the query tree
fn synonyms(ctx: &impl Context, word: &[&str]) -> heed::Result<Option<Vec<Operation>>> {
let synonyms = ctx.synonyms(word)?;
Ok(synonyms.map(|synonyms| {
2021-06-17 00:33:33 +08:00
synonyms
.into_iter()
.map(|synonym| {
let words = synonym
.into_iter()
.map(|word| {
Operation::Query(Query { prefix: false, kind: QueryKind::exact(word) })
})
.collect();
Operation::and(words)
})
.collect()
}))
}
/// Main function that creates the final query tree from the primitive query.
fn create_query_tree(
ctx: &impl Context,
terms_matching_strategy: TermsMatchingStrategy,
authorize_typos: bool,
2021-05-04 19:44:55 +08:00
query: &[PrimitiveQueryPart],
2021-06-17 00:33:33 +08:00
) -> Result<Operation> {
/// Matches on the `PrimitiveQueryPart` and create an operation from it.
fn resolve_primitive_part(
ctx: &impl Context,
authorize_typos: bool,
part: PrimitiveQueryPart,
2021-06-17 00:33:33 +08:00
) -> Result<Operation> {
match part {
// 1. try to split word in 2
// 2. try to fetch synonyms
// 3. create an operation containing the word
// 4. wrap all in an OR operation
PrimitiveQueryPart::Word(word, prefix) => {
let mut children = synonyms(ctx, &[&word])?.unwrap_or_default();
2022-04-05 00:56:59 +08:00
if let Some((left, right)) = split_best_frequency(ctx, &word)? {
children.push(Operation::Phrase(vec![left.to_string(), right.to_string()]));
}
let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
2022-05-24 15:43:17 +08:00
let exact_words = ctx.exact_words();
2022-03-21 23:25:15 +08:00
let config =
TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words };
2022-01-21 01:34:54 +08:00
children.push(Operation::Query(Query {
prefix,
kind: typos(word, authorize_typos, config),
2022-01-21 01:34:54 +08:00
}));
Ok(Operation::or(false, children))
2021-06-17 00:33:33 +08:00
}
// create a CONSECUTIVE operation wrapping all word in the phrase
2021-06-17 00:33:33 +08:00
PrimitiveQueryPart::Phrase(words) => Ok(Operation::phrase(words)),
}
}
/// Create all ngrams 1..=3 generating query tree branches.
fn ngrams(
ctx: &impl Context,
authorize_typos: bool,
query: &[PrimitiveQueryPart],
2022-08-18 23:36:08 +08:00
any_words: bool,
2021-06-17 00:33:33 +08:00
) -> Result<Operation> {
const MAX_NGRAM: usize = 3;
let mut op_children = Vec::new();
for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase())) {
let mut or_op_children = Vec::new();
for ngram in 1..=MAX_NGRAM.min(sub_query.len()) {
if let Some(group) = sub_query.get(..ngram) {
let mut and_op_children = Vec::new();
let tail = &sub_query[ngram..];
let is_last = tail.is_empty();
match group {
[part] => {
2021-06-17 00:33:33 +08:00
let operation =
resolve_primitive_part(ctx, authorize_typos, part.clone())?;
and_op_children.push(operation);
2021-06-17 00:33:33 +08:00
}
words => {
let is_prefix = words.last().map_or(false, |part| part.is_prefix());
2021-06-17 00:33:33 +08:00
let words: Vec<_> = words
.iter()
.filter_map(|part| {
if let PrimitiveQueryPart::Word(word, _) = part {
Some(word.as_str())
} else {
None
}
})
.collect();
let mut operations = synonyms(ctx, &words)?.unwrap_or_default();
let concat = words.concat();
let (word_len_one_typo, word_len_two_typo) =
ctx.min_word_len_for_typo()?;
2022-05-24 15:43:17 +08:00
let exact_words = ctx.exact_words();
2022-03-21 23:25:15 +08:00
let config = TypoConfig {
max_typos: 1,
word_len_one_typo,
word_len_two_typo,
exact_words,
};
2022-02-03 01:45:11 +08:00
let query = Query {
prefix: is_prefix,
kind: typos(concat, authorize_typos, config),
2022-02-03 01:45:11 +08:00
};
Revert "Integrate the stop_words in the querytree" This reverts commit 12fb509d8470e6d0c3a424756c9838a1efe306d2. We revert this commit because it's causing the bug #150. The initial algorithm we implemented for the stop_words was: 1. remove the stop_words from the dataset 2. keep the stop_words in the query to see if we can generate new words by integrating typos or if the word was a prefix => This was causing the bug since, in the case of “The hobbit”, we were **always** looking for something starting with “t he” or “th e” instead of ignoring the word completely. For now we are going to fix the bug by completely ignoring the stop_words in the query. This could cause another problem were someone mistyped a normal word and ended up typing a stop_word. For example imagine someone searching for the music “Won't he do it”. If that person misplace one space and write “Won' the do it” then we will loose a part of the request. One fix would be to update our query tree to something like that: --------------------- OR OR TOLERANT hobbit # the first option is to ignore the stop_word AND CONSECUTIVE # the second option is to do as we are doing EXACT t # currently EXACT he TOLERANT hobbit --------------------- This would increase drastically the size of our query tree on request with a lot of stop_words. For example think of “The Lord Of The Rings”. For now whatsoever we decided we were going to ignore this problem and consider that it doesn't reduce too much the relevancy of the search to do that while it improves the performances.
2021-04-08 21:12:37 +08:00
operations.push(Operation::Query(query));
and_op_children.push(Operation::or(false, operations));
}
}
if !is_last {
2022-08-18 23:36:08 +08:00
let ngrams = ngrams(ctx, authorize_typos, tail, any_words)?;
and_op_children.push(ngrams);
}
2022-08-18 23:36:08 +08:00
if any_words {
or_op_children.push(Operation::or(false, and_op_children));
} else {
or_op_children.push(Operation::and(and_op_children));
}
}
}
op_children.push(Operation::or(false, or_op_children));
}
2022-08-18 23:36:08 +08:00
if any_words {
Ok(Operation::or(false, op_children))
} else {
Ok(Operation::and(op_children))
}
}
2022-08-18 23:36:08 +08:00
let number_phrases = query.iter().filter(|p| p.is_phrase()).count();
let remove_count = query.len() - min(number_phrases, 1);
if remove_count == 0 {
return ngrams(ctx, authorize_typos, query, false);
}
2022-08-18 23:36:08 +08:00
let mut operation_children = Vec::new();
let mut query = query.to_vec();
2022-08-18 23:56:06 +08:00
for _ in 0..remove_count {
let pos = match terms_matching_strategy {
2022-08-18 23:36:08 +08:00
TermsMatchingStrategy::All => return ngrams(ctx, authorize_typos, &query, false),
TermsMatchingStrategy::Any => {
let operation = Operation::Or(
true,
vec![
// branch allowing matching documents to contains any query word.
ngrams(ctx, authorize_typos, &query, true)?,
// branch forcing matching documents to contains all the query words,
// keeping this documents of the top of the resulted list.
ngrams(ctx, authorize_typos, &query, false)?,
],
);
return Ok(operation);
}
TermsMatchingStrategy::Last => query
2021-06-17 00:33:33 +08:00
.iter()
2022-08-18 23:36:08 +08:00
.enumerate()
.filter(|(_, part)| !part.is_phrase())
.last()
.map(|(pos, _)| pos),
TermsMatchingStrategy::First => {
query.iter().enumerate().find(|(_, part)| !part.is_phrase()).map(|(pos, _)| pos)
}
TermsMatchingStrategy::Size => query
.iter()
.enumerate()
.filter(|(_, part)| !part.is_phrase())
.min_by_key(|(_, part)| match part {
PrimitiveQueryPart::Word(s, _) => s.len(),
_ => unreachable!(),
})
.map(|(pos, _)| pos),
TermsMatchingStrategy::Frequency => query
.iter()
.enumerate()
.filter(|(_, part)| !part.is_phrase())
.max_by_key(|(_, part)| match part {
PrimitiveQueryPart::Word(s, _) => {
ctx.word_documents_count(s).unwrap_or_default().unwrap_or(u64::max_value())
2021-06-17 00:33:33 +08:00
}
2022-08-18 23:36:08 +08:00
_ => unreachable!(),
2021-06-17 00:33:33 +08:00
})
2022-08-18 23:36:08 +08:00
.map(|(pos, _)| pos),
};
2022-08-18 23:36:08 +08:00
// compute and push the current branch on the front
operation_children.insert(0, ngrams(ctx, authorize_typos, &query, false)?);
// remove word from query before creating an new branch
match pos {
Some(pos) => query.remove(pos),
None => break,
};
}
2022-08-18 23:56:06 +08:00
Ok(Operation::or(true, operation_children))
}
2022-04-05 00:56:59 +08:00
/// Main function that matchings words used for crop and highlight.
fn create_matching_words(
ctx: &impl Context,
authorize_typos: bool,
query: &[PrimitiveQueryPart],
) -> Result<MatchingWords> {
/// Matches on the `PrimitiveQueryPart` and create matchings words from it.
fn resolve_primitive_part(
ctx: &impl Context,
authorize_typos: bool,
part: PrimitiveQueryPart,
matching_words: &mut Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>,
id: PrimitiveWordId,
) -> Result<()> {
match part {
// 1. try to split word in 2
// 2. try to fetch synonyms
PrimitiveQueryPart::Word(word, prefix) => {
if let Some(synonyms) = ctx.synonyms(&[word.as_str()])? {
for synonym in synonyms {
let synonym = synonym
.into_iter()
.map(|syn| MatchingWord::new(syn.to_string(), 0, false))
.collect();
matching_words.push((synonym, vec![id]));
}
}
if let Some((left, right)) = split_best_frequency(ctx, &word)? {
let left = MatchingWord::new(left.to_string(), 0, false);
let right = MatchingWord::new(right.to_string(), 0, false);
2022-04-05 00:56:59 +08:00
matching_words.push((vec![left, right], vec![id]));
}
let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
2022-05-24 15:43:17 +08:00
let exact_words = ctx.exact_words();
2022-04-05 00:56:59 +08:00
let config =
TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words };
let matching_word = match typos(word, authorize_typos, config) {
QueryKind::Exact { word, .. } => MatchingWord::new(word, 0, prefix),
QueryKind::Tolerant { typo, word } => MatchingWord::new(word, typo, prefix),
};
matching_words.push((vec![matching_word], vec![id]));
}
// create a CONSECUTIVE matchings words wrapping all word in the phrase
PrimitiveQueryPart::Phrase(words) => {
let ids: Vec<_> =
(0..words.len()).into_iter().map(|i| id + i as PrimitiveWordId).collect();
let words =
words.into_iter().map(|w| MatchingWord::new(w.to_string(), 0, false)).collect();
matching_words.push((words, ids));
}
}
Ok(())
}
/// Create all ngrams 1..=3 generating query tree branches.
fn ngrams(
ctx: &impl Context,
authorize_typos: bool,
query: &[PrimitiveQueryPart],
matching_words: &mut Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>,
mut id: PrimitiveWordId,
) -> Result<()> {
const MAX_NGRAM: usize = 3;
for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase())) {
for ngram in 1..=MAX_NGRAM.min(sub_query.len()) {
if let Some(group) = sub_query.get(..ngram) {
let tail = &sub_query[ngram..];
let is_last = tail.is_empty();
match group {
[part] => {
resolve_primitive_part(
ctx,
authorize_typos,
part.clone(),
matching_words,
id,
)?;
}
words => {
let is_prefix = words.last().map_or(false, |part| part.is_prefix());
let words: Vec<_> = words
.iter()
.filter_map(|part| {
if let PrimitiveQueryPart::Word(word, _) = part {
Some(word.as_str())
} else {
None
}
})
.collect();
let ids: Vec<_> = (0..words.len())
.into_iter()
.map(|i| id + i as PrimitiveWordId)
.collect();
if let Some(synonyms) = ctx.synonyms(&words)? {
for synonym in synonyms {
let synonym = synonym
.into_iter()
.map(|syn| MatchingWord::new(syn.to_string(), 0, false))
.collect();
matching_words.push((synonym, ids.clone()));
}
}
let word = words.concat();
let (word_len_one_typo, word_len_two_typo) =
ctx.min_word_len_for_typo()?;
2022-05-24 15:43:17 +08:00
let exact_words = ctx.exact_words();
2022-04-05 00:56:59 +08:00
let config = TypoConfig {
max_typos: 1,
word_len_one_typo,
word_len_two_typo,
exact_words,
};
let matching_word = match typos(word, authorize_typos, config) {
QueryKind::Exact { word, .. } => {
MatchingWord::new(word, 0, is_prefix)
}
QueryKind::Tolerant { typo, word } => {
MatchingWord::new(word, typo, is_prefix)
}
};
matching_words.push((vec![matching_word], ids));
}
}
if !is_last {
ngrams(ctx, authorize_typos, tail, matching_words, id + 1)?;
}
}
}
id += sub_query.iter().map(|x| x.len() as PrimitiveWordId).sum::<PrimitiveWordId>();
}
Ok(())
}
let mut matching_words = Vec::new();
ngrams(ctx, authorize_typos, query, &mut matching_words, 0)?;
Ok(MatchingWords::new(matching_words))
}
2021-05-04 19:44:55 +08:00
/// Intermediate representation of a query: the ordered list of its words and phrases.
pub type PrimitiveQuery = Vec<PrimitiveQueryPart>;
#[derive(Debug, Clone)]
2021-05-04 19:44:55 +08:00
pub enum PrimitiveQueryPart {
Phrase(Vec<String>),
Word(String, IsPrefix),
}
impl PrimitiveQueryPart {
fn is_phrase(&self) -> bool {
matches!(self, Self::Phrase(_))
}
fn is_prefix(&self) -> bool {
matches!(self, Self::Word(_, is_prefix) if *is_prefix)
}
2022-04-05 00:56:59 +08:00
fn len(&self) -> usize {
match self {
Self::Phrase(words) => words.len(),
Self::Word(_, _) => 1,
}
}
}
/// Create primitive query from tokenized query string,
/// the primitive query is an intermediate state to build the query tree.
2022-06-02 21:47:28 +08:00
fn create_primitive_query<A>(
query: ClassifiedTokenIter<A>,
2021-06-17 00:33:33 +08:00
stop_words: Option<Set<&[u8]>>,
words_limit: Option<usize>,
2022-06-02 21:47:28 +08:00
) -> PrimitiveQuery
where
A: AsRef<[u8]>,
{
let mut primitive_query = Vec::new();
let mut phrase = Vec::new();
let mut quoted = false;
2021-04-14 01:10:58 +08:00
let parts_limit = words_limit.unwrap_or(usize::MAX);
let mut peekable = query.peekable();
while let Some(token) = peekable.next() {
2021-04-14 01:10:58 +08:00
// early return if word limit is exceeded
2021-06-17 00:33:33 +08:00
if primitive_query.len() >= parts_limit {
return primitive_query;
}
2021-04-14 01:10:58 +08:00
match token.kind {
2021-06-17 00:33:33 +08:00
TokenKind::Word | TokenKind::StopWord => {
// 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote,
// 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word,
// 3. if the word is the last token of the query we push it as a prefix word.
if quoted {
2022-06-02 21:47:28 +08:00
phrase.push(token.lemma().to_string());
} else if peekable.peek().is_some() {
2022-06-02 21:47:28 +08:00
if !stop_words.as_ref().map_or(false, |swords| swords.contains(token.lemma())) {
2021-06-17 00:33:33 +08:00
primitive_query
2022-06-02 21:47:28 +08:00
.push(PrimitiveQueryPart::Word(token.lemma().to_string(), false));
2021-06-17 00:33:33 +08:00
}
} else {
2022-06-02 21:47:28 +08:00
primitive_query.push(PrimitiveQueryPart::Word(token.lemma().to_string(), true));
}
2021-06-17 00:33:33 +08:00
}
TokenKind::Separator(separator_kind) => {
2022-06-02 21:47:28 +08:00
let quote_count = token.lemma().chars().filter(|&s| s == '"').count();
// swap quoted state if we encounter a double quote
if quote_count % 2 != 0 {
quoted = !quoted;
}
// if there is a quote or a hard separator we close the phrase.
2021-06-17 00:33:33 +08:00
if !phrase.is_empty() && (quote_count > 0 || separator_kind == SeparatorKind::Hard)
{
primitive_query.push(PrimitiveQueryPart::Phrase(mem::take(&mut phrase)));
}
2021-06-17 00:33:33 +08:00
}
_ => (),
}
}
// If a quote is never closed, we consider all of the end of the query as a phrase.
if !phrase.is_empty() {
primitive_query.push(PrimitiveQueryPart::Phrase(mem::take(&mut phrase)));
}
primitive_query
}
/// Returns the maximum number of typos that this Operation allows.
pub fn maximum_typo(operation: &Operation) -> usize {
2021-06-17 00:33:33 +08:00
use Operation::{And, Or, Phrase, Query};
match operation {
Or(_, ops) => ops.iter().map(maximum_typo).max().unwrap_or(0),
And(ops) => ops.iter().map(maximum_typo).sum::<usize>(),
Query(q) => q.kind.typo() as usize,
// no typo allowed in phrases
Phrase(_) => 0,
}
}
/// Returns the maximum proximity that this Operation allows.
pub fn maximum_proximity(operation: &Operation) -> usize {
2021-06-17 00:33:33 +08:00
use Operation::{And, Or, Phrase, Query};
match operation {
Or(_, ops) => ops.iter().map(maximum_proximity).max().unwrap_or(0),
2021-02-24 22:36:57 +08:00
And(ops) => {
2021-06-17 00:33:33 +08:00
ops.iter().map(maximum_proximity).sum::<usize>() + ops.len().saturating_sub(1) * 7
}
Query(_) | Phrase(_) => 0,
}
}
#[cfg(test)]
mod test {
2021-03-02 18:30:48 +08:00
use std::collections::HashMap;
2022-06-02 21:47:28 +08:00
use charabia::Tokenize;
2021-06-01 17:48:56 +08:00
use maplit::hashmap;
2021-06-17 00:33:33 +08:00
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use super::*;
2022-04-01 17:21:51 +08:00
use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
2021-03-02 18:30:48 +08:00
#[derive(Debug)]
struct TestContext {
synonyms: HashMap<Vec<String>, Vec<Vec<String>>>,
postings: HashMap<String, RoaringBitmap>,
2022-05-24 15:43:17 +08:00
exact_words: Option<fst::Set<Cow<'static, [u8]>>>,
}
impl TestContext {
2022-06-02 21:47:28 +08:00
fn build<A: AsRef<[u8]>>(
&self,
terms_matching_strategy: TermsMatchingStrategy,
authorize_typos: bool,
2021-04-14 01:10:58 +08:00
words_limit: Option<usize>,
2022-06-02 21:47:28 +08:00
query: ClassifiedTokenIter<A>,
2021-06-17 00:33:33 +08:00
) -> Result<Option<(Operation, PrimitiveQuery)>> {
2021-04-14 01:10:58 +08:00
let primitive_query = create_primitive_query(query, None, words_limit);
if !primitive_query.is_empty() {
let qt = create_query_tree(
self,
terms_matching_strategy,
authorize_typos,
&primitive_query,
)?;
2021-05-04 19:44:55 +08:00
Ok(Some((qt, primitive_query)))
} else {
Ok(None)
}
}
}
impl Context for TestContext {
fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>> {
Ok(self.postings.get(word).cloned())
}
fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect();
Ok(self.synonyms.get(&words).cloned())
}
fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> {
Ok((DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS))
}
2022-03-21 23:25:15 +08:00
2022-05-24 18:14:55 +08:00
fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>> {
self.exact_words.as_ref()
2022-03-21 23:25:15 +08:00
}
}
impl Default for TestContext {
fn default() -> TestContext {
let mut rng = StdRng::seed_from_u64(102);
let rng = &mut rng;
fn random_postings<R: Rng>(rng: &mut R, len: usize) -> RoaringBitmap {
let mut values = Vec::<u32>::with_capacity(len);
while values.len() != len {
values.push(rng.gen());
}
values.sort_unstable();
RoaringBitmap::from_sorted_iter(values.into_iter()).unwrap()
}
2022-03-22 16:55:49 +08:00
let exact_words = fst::SetBuilder::new(Vec::new()).unwrap().into_inner().unwrap();
2022-05-24 15:43:17 +08:00
let exact_words =
Some(fst::Set::new(exact_words).unwrap().map_data(Cow::Owned).unwrap());
2022-03-22 16:55:49 +08:00
TestContext {
2021-06-17 00:33:33 +08:00
synonyms: hashmap! {
vec![String::from("hello")] => vec![
vec![String::from("hi")],
vec![String::from("good"), String::from("morning")],
],
vec![String::from("world")] => vec![
vec![String::from("earth")],
vec![String::from("nature")],
],
// new york city
vec![String::from("nyc")] => vec![
vec![String::from("new"), String::from("york")],
vec![String::from("new"), String::from("york"), String::from("city")],
],
vec![String::from("new"), String::from("york")] => vec![
vec![String::from("nyc")],
vec![String::from("new"), String::from("york"), String::from("city")],
],
vec![String::from("new"), String::from("york"), String::from("city")] => vec![
vec![String::from("nyc")],
vec![String::from("new"), String::from("york")],
],
},
2021-06-17 00:33:33 +08:00
postings: hashmap! {
String::from("hello") => random_postings(rng, 1500),
String::from("hi") => random_postings(rng, 4000),
String::from("word") => random_postings(rng, 2500),
String::from("split") => random_postings(rng, 400),
String::from("ngrams") => random_postings(rng, 1400),
String::from("world") => random_postings(rng, 15_000),
String::from("earth") => random_postings(rng, 8000),
String::from("2021") => random_postings(rng, 100),
String::from("2020") => random_postings(rng, 500),
String::from("is") => random_postings(rng, 50_000),
String::from("this") => random_postings(rng, 50_000),
String::from("good") => random_postings(rng, 1250),
String::from("morning") => random_postings(rng, 125),
},
2022-03-22 16:55:49 +08:00
exact_words,
}
}
}
#[test]
fn prefix() {
let query = "hey friends";
2022-06-02 21:47:28 +08:00
let tokens = query.tokenize();
2022-08-18 23:36:08 +08:00
let (query_tree, _) = TestContext::default()
.build(TermsMatchingStrategy::All, true, None, tokens)
.unwrap()
.unwrap();
insta::assert_debug_snapshot!(query_tree, @r###"
OR
AND
Exact { word: "hey" }
PrefixTolerant { word: "friends", max typo: 1 }
PrefixTolerant { word: "heyfriends", max typo: 1 }
"###);
}
#[test]
fn no_prefix() {
let query = "hey friends ";
2022-06-02 21:47:28 +08:00
let tokens = query.tokenize();
2022-08-18 23:36:08 +08:00
let (query_tree, _) = TestContext::default()
.build(TermsMatchingStrategy::All, true, None, tokens)
.unwrap()
.unwrap();
insta::assert_debug_snapshot!(query_tree, @r###"
OR
AND
Exact { word: "hey" }
Tolerant { word: "friends", max typo: 1 }
Tolerant { word: "heyfriends", max typo: 1 }
"###);
}
#[test]
fn synonyms() {
let query = "hello world ";
2022-06-02 21:47:28 +08:00
let tokens = query.tokenize();
2022-08-18 23:36:08 +08:00
let (query_tree, _) = TestContext::default()
.build(TermsMatchingStrategy::All, true, None, tokens)
.unwrap()
.unwrap();
insta::assert_debug_snapshot!(query_tree, @r###"
OR
AND
OR
Exact { word: "hi" }
AND
Exact { word: "good" }
Exact { word: "morning" }
Tolerant { word: "hello", max typo: 1 }
OR
Exact { word: "earth" }
Exact { word: "nature" }
Tolerant { word: "world", max typo: 1 }
Tolerant { word: "helloworld", max typo: 1 }
"###);
}
#[test]
fn complex_synonyms() {
let query = "new york city ";
2022-06-02 21:47:28 +08:00
let tokens = query.tokenize();
2022-08-18 23:36:08 +08:00
let (query_tree, _) = TestContext::default()
.build(TermsMatchingStrategy::All, true, None, tokens)
.unwrap()
.unwrap();
insta::assert_debug_snapshot!(query_tree, @r###"
OR
AND
Exact { word: "new" }
OR
AND
Exact { word: "york" }
Exact { word: "city" }
Tolerant { word: "yorkcity", max typo: 1 }
AND
OR
Exact { word: "nyc" }
AND
Exact { word: "new" }
Exact { word: "york" }
Exact { word: "city" }
Tolerant { word: "newyork", max typo: 1 }
Exact { word: "city" }
2022-08-18 23:36:08 +08:00
Exact { word: "nyc" }
AND
Exact { word: "new" }
Exact { word: "york" }
Tolerant { word: "newyorkcity", max typo: 1 }
"###);
}
/// Builds the query tree for `"n grams "` and checks that the expected tree offers
/// both the AND of the two words and their concatenation `ngrams`.
// NOTE(review): the snapshot nesting (two spaces per depth level, as printed by the
// `Debug` impl of `Operation`) was reconstructed; confirm with `cargo insta test`.
#[test]
fn ngrams() {
    let query = "n grams ";
    let tokens = query.tokenize();

    let (query_tree, _) = TestContext::default()
        .build(TermsMatchingStrategy::All, true, None, tokens)
        .unwrap()
        .unwrap();

    insta::assert_debug_snapshot!(query_tree, @r###"
    OR
      AND
        Exact { word: "n" }
        Tolerant { word: "grams", max typo: 1 }
      Tolerant { word: "ngrams", max typo: 1 }
    "###);
}
/// Builds the query tree for `"wordsplit fish "` and checks that the expected tree
/// offers the phrase `["word", "split"]` as an alternative to `wordsplit`.
// NOTE(review): the snapshot nesting (two spaces per depth level, as printed by the
// `Debug` impl of `Operation`) was reconstructed; confirm with `cargo insta test`.
#[test]
fn word_split() {
    let query = "wordsplit fish ";
    let tokens = query.tokenize();

    let (query_tree, _) = TestContext::default()
        .build(TermsMatchingStrategy::All, true, None, tokens)
        .unwrap()
        .unwrap();

    insta::assert_debug_snapshot!(query_tree, @r###"
    OR
      AND
        OR
          PHRASE ["word", "split"]
          Tolerant { word: "wordsplit", max typo: 2 }
        Exact { word: "fish" }
      Tolerant { word: "wordsplitfish", max typo: 1 }
    "###);
}
/// Builds the query tree for a query made of a quoted phrase, a quoted blank and an
/// unterminated quote, and checks that the expected tree is the AND of one PHRASE
/// node and one exact word.
// NOTE(review): the snapshot nesting (two spaces per depth level, as printed by the
// `Debug` impl of `Operation`) was reconstructed; confirm with `cargo insta test`.
#[test]
fn phrase() {
    let query = "\"hey friends\" \" \" \"wooop";
    let tokens = query.tokenize();

    let (query_tree, _) = TestContext::default()
        .build(TermsMatchingStrategy::All, true, None, tokens)
        .unwrap()
        .unwrap();

    insta::assert_debug_snapshot!(query_tree, @r###"
    AND
      PHRASE ["hey", "friends"]
      Exact { word: "wooop" }
    "###);
}
/// Builds the query tree for a single quoted span containing `". "` and checks that
/// the expected tree holds two separate PHRASE nodes joined by AND.
// NOTE(review): the snapshot nesting (two spaces per depth level, as printed by the
// `Debug` impl of `Operation`) was reconstructed; confirm with `cargo insta test`.
#[test]
fn phrase_with_hard_separator() {
    let query = "\"hey friends. wooop wooop\"";
    let tokens = query.tokenize();

    let (query_tree, _) = TestContext::default()
        .build(TermsMatchingStrategy::All, true, None, tokens)
        .unwrap()
        .unwrap();

    insta::assert_debug_snapshot!(query_tree, @r###"
    AND
      PHRASE ["hey", "friends"]
      PHRASE ["wooop", "wooop"]
    "###);
}
/// Builds the query tree for `"hey my friend "` with `TermsMatchingStrategy::default()`
/// and checks that the expected root is an `OR(WORD)` node.
// NOTE(review): the snapshot nesting (two spaces per depth level, as printed by the
// `Debug` impl of `Operation`) was reconstructed from the flat node order, assuming
// the three root branches cover one, two and three query words respectively.
// Confirm with `cargo insta test`.
#[test]
fn optional_word() {
    let query = "hey my friend ";
    let tokens = query.tokenize();

    let (query_tree, _) = TestContext::default()
        .build(TermsMatchingStrategy::default(), true, None, tokens)
        .unwrap()
        .unwrap();

    insta::assert_debug_snapshot!(query_tree, @r###"
    OR(WORD)
      Exact { word: "hey" }
      OR
        AND
          Exact { word: "hey" }
          Exact { word: "my" }
        Tolerant { word: "heymy", max typo: 1 }
      OR
        AND
          Exact { word: "hey" }
          OR
            AND
              Exact { word: "my" }
              Tolerant { word: "friend", max typo: 1 }
            Tolerant { word: "myfriend", max typo: 1 }
        AND
          Tolerant { word: "heymy", max typo: 1 }
          Tolerant { word: "friend", max typo: 1 }
        Tolerant { word: "heymyfriend", max typo: 1 }
    "###);
}
/// Builds the query tree for a single quoted phrase with
/// `TermsMatchingStrategy::default()` and checks that the expected tree is just one
/// PHRASE node (no `OR(WORD)` wrapper).
#[test]
fn optional_word_phrase() {
    let query = "\"hey my\"";
    let tokens = query.tokenize();

    let (query_tree, _) = TestContext::default()
        .build(TermsMatchingStrategy::default(), true, None, tokens)
        .unwrap()
        .unwrap();

    insta::assert_debug_snapshot!(query_tree, @r###"
    PHRASE ["hey", "my"]
    "###);
}
/// Builds the query tree for `"hey" my good "friend"` with
/// `TermsMatchingStrategy::default()` and checks the expected `OR(WORD)` tree, in which
/// the two quoted words appear in every branch.
// NOTE(review): the snapshot nesting (two spaces per depth level, as printed by the
// `Debug` impl of `Operation`) was reconstructed; confirm with `cargo insta test`.
#[test]
fn optional_word_multiple_phrases() {
    let query = r#""hey" my good "friend""#;
    let tokens = query.tokenize();

    let (query_tree, _) = TestContext::default()
        .build(TermsMatchingStrategy::default(), true, None, tokens)
        .unwrap()
        .unwrap();

    insta::assert_debug_snapshot!(query_tree, @r###"
    OR(WORD)
      AND
        Exact { word: "hey" }
        Exact { word: "friend" }
      AND
        Exact { word: "hey" }
        Exact { word: "my" }
        Exact { word: "friend" }
      AND
        Exact { word: "hey" }
        OR
          AND
            Exact { word: "my" }
            Exact { word: "good" }
          Tolerant { word: "mygood", max typo: 1 }
        Exact { word: "friend" }
    "###);
}
/// Builds the query tree for `"hey friends "` with `false` as the second `build`
/// argument (presumably the "authorize typos" flag) and checks that every leaf of the
/// expected tree is `Exact`.
// NOTE(review): the snapshot nesting (two spaces per depth level, as printed by the
// `Debug` impl of `Operation`) was reconstructed; confirm with `cargo insta test`.
#[test]
fn no_typo() {
    let query = "hey friends ";
    let tokens = query.tokenize();

    let (query_tree, _) = TestContext::default()
        .build(TermsMatchingStrategy::All, false, None, tokens)
        .unwrap()
        .unwrap();

    insta::assert_debug_snapshot!(query_tree, @r###"
    OR
      AND
        Exact { word: "hey" }
        Exact { word: "friends" }
      Exact { word: "heyfriends" }
    "###);
}
2021-04-14 01:10:58 +08:00
/// Builds the query tree for `"hey my" good friend` with `Some(2)` as the third `build`
/// argument (presumably the words limit) and checks that the expected tree keeps the
/// phrase and `good` but has no node for `friend`.
// NOTE(review): the snapshot nesting (two spaces per depth level, as printed by the
// `Debug` impl of `Operation`) was reconstructed; confirm with `cargo insta test`.
#[test]
fn words_limit() {
    let query = "\"hey my\" good friend";
    let tokens = query.tokenize();

    let (query_tree, _) = TestContext::default()
        .build(TermsMatchingStrategy::All, false, Some(2), tokens)
        .unwrap()
        .unwrap();

    insta::assert_debug_snapshot!(query_tree, @r###"
    AND
      PHRASE ["hey", "my"]
      Exact { word: "good" }
    "###);
}
2022-03-31 19:50:18 +08:00
/// Checks which `QueryKind` `typos` returns for words of different lengths under a
/// `TypoConfig` with `word_len_one_typo: 5` and `word_len_two_typo: 7`:
/// a 5-letter word gets one typo, a 4-letter word is exact, a 12-letter word gets two.
#[test]
fn test_min_word_len_typo() {
    // The exact-words set only contains the empty string, so none of the words below
    // is in it.
    let exact_words = fst::Set::from_iter([b""]).unwrap().map_data(Cow::Owned).unwrap();
    let config = TypoConfig {
        max_typos: 2,
        word_len_one_typo: 5,
        word_len_two_typo: 7,
        exact_words: Some(&exact_words),
    };

    assert_eq!(
        typos("hello".to_string(), true, config.clone()),
        QueryKind::Tolerant { typo: 1, word: "hello".to_string() }
    );

    assert_eq!(
        typos("hell".to_string(), true, config.clone()),
        QueryKind::exact("hell".to_string())
    );

    assert_eq!(
        typos("verylongword".to_string(), true, config.clone()),
        QueryKind::Tolerant { typo: 2, word: "verylongword".to_string() }
    );
}
2022-03-22 16:55:49 +08:00
2022-03-22 16:55:49 +08:00
/// Puts the query word `goodbye` into the context's `exact_words` set and checks that
/// the resulting query tree is a single prefix query of kind `QueryKind::Exact`.
#[test]
fn disable_typo_on_word() {
    let query = "goodbye";
    let tokens = query.tokenize();

    // Build the set, take its raw bytes, then re-open it over owned (`Cow::Owned`) data.
    let exact_words = fst::Set::from_iter(Some("goodbye")).unwrap().into_fst().into_inner();
    let exact_words = Some(fst::Set::new(exact_words).unwrap().map_data(Cow::Owned).unwrap());
    let context = TestContext { exact_words, ..Default::default() };
    let (query_tree, _) =
        context.build(TermsMatchingStrategy::All, true, Some(2), tokens).unwrap().unwrap();

    assert!(matches!(
        query_tree,
        Operation::Query(Query { prefix: true, kind: QueryKind::Exact { .. } })
    ));
}
}