// Source file: meilisearch/milli/src/search/query_tree.rs
// (Copied from a repository web view: 1286 lines, 47 KiB, Rust.)

use std::{borrow::Cow, cmp, fmt, mem};
use fst::Set;
use meilisearch_tokenizer::token::SeparatorKind;
use meilisearch_tokenizer::tokenizer::TokenStream;
use meilisearch_tokenizer::TokenKind;
use roaring::RoaringBitmap;
use slice_group_by::GroupBy;
use crate::{Index, Result};
type IsOptionalWord = bool;
type IsPrefix = bool;
/// A node of the query tree built by `create_query_tree`.
#[derive(Clone, PartialEq, Eq, Hash)]
pub enum Operation {
/// Conjunction of the child operations.
And(Vec<Operation>),
// series of consecutive non prefix and exact words
Phrase(Vec<String>),
/// Disjunction of the child operations. The `IsOptionalWord` flag is `true`
/// for the branch list produced by the optional-words step (printed as
/// `OR(WORD)` by the `Debug` implementation) and `false` otherwise.
Or(IsOptionalWord, Vec<Operation>),
/// Leaf: a single word query.
Query(Query),
}
impl fmt::Debug for Operation {
    /// Pretty-prints the operation as an indented tree, one node per line.
    ///
    /// Every nesting level adds two spaces of indentation. An `Or` whose
    /// `IsOptionalWord` flag is `true` is printed as `OR(WORD)`, any other
    /// `Or` as `OR`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        /// Writes `op` at the given `depth`, then recurses into its children.
        fn pprint_tree(f: &mut fmt::Formatter<'_>, op: &Operation, depth: usize) -> fmt::Result {
            match op {
                Operation::And(children) => {
                    // `{:1$}` pads the empty string to `depth * 2` columns.
                    writeln!(f, "{:1$}AND", "", depth * 2)?;
                    children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
                }
                Operation::Phrase(children) => {
                    writeln!(f, "{:2$}PHRASE {:?}", "", children, depth * 2)
                }
                Operation::Or(true, children) => {
                    writeln!(f, "{:1$}OR(WORD)", "", depth * 2)?;
                    children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
                }
                Operation::Or(false, children) => {
                    writeln!(f, "{:1$}OR", "", depth * 2)?;
                    children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
                }
                Operation::Query(query) => writeln!(f, "{:2$}{:?}", "", query, depth * 2),
            }
        }

        pprint_tree(f, self, 0)
    }
}
impl Operation {
fn and(mut ops: Vec<Self>) -> Self {
if ops.len() == 1 {
ops.pop().unwrap()
} else {
Self::And(ops)
}
}
pub fn or(word_branch: IsOptionalWord, mut ops: Vec<Self>) -> Self {
if ops.len() == 1 {
ops.pop().unwrap()
} else {
Self::Or(word_branch, ops)
}
}
fn phrase(mut words: Vec<String>) -> Self {
if words.len() == 1 {
Self::Query(Query { prefix: false, kind: QueryKind::exact(words.pop().unwrap()) })
} else {
Self::Phrase(words)
}
}
2021-02-23 00:17:01 +08:00
pub fn query(&self) -> Option<&Query> {
match self {
Operation::Query(query) => Some(query),
_ => None,
}
}
}
/// A single word of the query tree together with the way it must be matched.
#[derive(Clone, Eq, PartialEq, Hash)]
pub struct Query {
/// Whether the word is queried as a prefix.
pub prefix: IsPrefix,
/// The word itself and its typo policy.
pub kind: QueryKind,
}
/// How a query word is matched: with a typo budget, or exactly.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum QueryKind {
    /// The word may be matched with up to `typo` typos.
    Tolerant { typo: u8, word: String },
    /// The word must be matched as is; `original_typo` is the typo count
    /// recorded alongside it (0 when built through [`QueryKind::exact`]).
    Exact { original_typo: u8, word: String },
}

impl QueryKind {
    /// Creates an `Exact` kind with an `original_typo` of 0.
    pub fn exact(word: String) -> Self {
        QueryKind::Exact { original_typo: 0, word }
    }

    /// Creates an `Exact` kind that records the given `original_typo`.
    pub fn exact_with_typo(original_typo: u8, word: String) -> Self {
        QueryKind::Exact { original_typo, word }
    }

    /// Creates a `Tolerant` kind allowing up to `typo` typos.
    pub fn tolerant(typo: u8, word: String) -> Self {
        QueryKind::Tolerant { typo, word }
    }

    /// Returns `true` for the `Tolerant` variant.
    pub fn is_tolerant(&self) -> bool {
        matches!(self, QueryKind::Tolerant { .. })
    }

    /// Returns `true` for the `Exact` variant.
    pub fn is_exact(&self) -> bool {
        matches!(self, QueryKind::Exact { .. })
    }

    /// Returns `typo` for `Tolerant` and `original_typo` for `Exact`.
    pub fn typo(&self) -> u8 {
        match self {
            QueryKind::Tolerant { typo, .. } => *typo,
            QueryKind::Exact { original_typo, .. } => *original_typo,
        }
    }

    /// Returns the word carried by either variant.
    pub fn word(&self) -> &str {
        match self {
            QueryKind::Tolerant { word, .. } | QueryKind::Exact { word, .. } => word,
        }
    }
}
impl fmt::Debug for Query {
    /// Formats the query as a debug struct named `Exact` or `Tolerant`,
    /// with a `Prefix` prefix on the name when `prefix` is set.
    ///
    /// `Exact` shows only the `word` field; `Tolerant` shows `word` and
    /// `max typo`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Query { prefix, kind } = self;
        match kind {
            QueryKind::Exact { word, .. } => {
                // Static names avoid building a `String` on every format call.
                let name = if *prefix { "PrefixExact" } else { "Exact" };
                f.debug_struct(name).field("word", &word).finish()
            }
            QueryKind::Tolerant { typo, word } => {
                let name = if *prefix { "PrefixTolerant" } else { "Tolerant" };
                f.debug_struct(name).field("word", &word).field("max typo", &typo).finish()
            }
        }
    }
}
trait Context {
fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>>;
fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
match self.word_docids(word)? {
Some(rb) => Ok(Some(rb.len())),
None => Ok(None),
}
}
/// Returns the minimum word len for 1 and 2 typos.
fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>;
2022-03-21 23:25:15 +08:00
fn exact_words(&self) -> crate::Result<fst::Set<Cow<[u8]>>>;
}
/// The query tree builder is the interface to build a query tree.
pub struct QueryTreeBuilder<'a> {
    /// Read transaction used for every index lookup.
    rtxn: &'a heed::RoTxn<'a>,
    /// Index the lookups are made against.
    index: &'a Index,
    /// Whether branches dropping trailing words are generated (default `true`).
    optional_words: bool,
    /// Whether words may be matched with typos (default `true`).
    authorize_typos: bool,
    /// Maximum number of query parts kept when set (default `None`).
    words_limit: Option<usize>,
}
impl<'a> Context for QueryTreeBuilder<'a> {
fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>> {
self.index.word_docids.get(self.rtxn, word)
}
fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
2021-04-07 16:53:57 +08:00
self.index.words_synonyms(self.rtxn, words)
}
2021-04-07 16:53:57 +08:00
fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
self.index.word_documents_count(self.rtxn, word)
}
fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> {
let one = self.index.min_word_len_one_typo(&self.rtxn)?;
let two = self.index.min_word_len_two_typos(&self.rtxn)?;
Ok((one, two))
}
2022-03-21 23:25:15 +08:00
fn exact_words(&self) -> crate::Result<fst::Set<Cow<[u8]>>> {
self.index.exact_words(self.rtxn)
}
}
impl<'a> QueryTreeBuilder<'a> {
/// Create a `QueryTreeBuilder` from a heed ReadOnly transaction `rtxn`
/// and an Index `index`.
pub fn new(rtxn: &'a heed::RoTxn<'a>, index: &'a Index) -> Self {
2021-04-14 01:10:58 +08:00
Self { rtxn, index, optional_words: true, authorize_typos: true, words_limit: None }
}
/// if `optional_words` is set to `false` the query tree will be
/// generated forcing all query words to be present in each matching documents
/// (the criterion `words` will be ignored).
/// default value if not called: `true`
pub fn optional_words(&mut self, optional_words: bool) -> &mut Self {
self.optional_words = optional_words;
self
}
/// if `authorize_typos` is set to `false` the query tree will be generated
/// forcing all query words to match documents without any typo
/// (the criterion `typo` will be ignored).
/// default value if not called: `true`
pub fn authorize_typos(&mut self, authorize_typos: bool) -> &mut Self {
self.authorize_typos = authorize_typos;
self
}
2021-04-14 01:10:58 +08:00
/// Limit words and phrases that will be taken for query building.
/// Any beyond `words_limit` will be ignored.
pub fn words_limit(&mut self, words_limit: usize) -> &mut Self {
self.words_limit = Some(words_limit);
self
}
/// Build the query tree:
/// - if `optional_words` is set to `false` the query tree will be
/// generated forcing all query words to be present in each matching documents
/// (the criterion `words` will be ignored)
/// - if `authorize_typos` is set to `false` the query tree will be generated
/// forcing all query words to match documents without any typo
/// (the criterion `typo` will be ignored)
pub fn build(&self, query: TokenStream) -> Result<Option<(Operation, PrimitiveQuery)>> {
let stop_words = self.index.stop_words(self.rtxn)?;
2021-04-14 01:10:58 +08:00
let primitive_query = create_primitive_query(query, stop_words, self.words_limit);
if !primitive_query.is_empty() {
2021-06-17 00:33:33 +08:00
let qt = create_query_tree(
self,
self.optional_words,
self.authorize_typos,
&primitive_query,
)?;
2021-05-04 19:44:55 +08:00
Ok(Some((qt, primitive_query)))
} else {
Ok(None)
}
}
}
/// Split the word depending on the frequency of subwords in the database documents.
2021-05-10 16:27:18 +08:00
fn split_best_frequency(ctx: &impl Context, word: &str) -> heed::Result<Option<Operation>> {
let chars = word.char_indices().skip(1);
let mut best = None;
for (i, _) in chars {
let (left, right) = word.split_at(i);
let left_freq = ctx.word_documents_count(left)?.unwrap_or(0);
let right_freq = ctx.word_documents_count(right)?.unwrap_or(0);
let min_freq = cmp::min(left_freq, right_freq);
if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
best = Some((min_freq, left, right));
}
}
2021-06-17 00:33:33 +08:00
Ok(best.map(|(_, left, right)| Operation::Phrase(vec![left.to_string(), right.to_string()])))
}
2022-03-31 19:50:18 +08:00
#[derive(Clone)]
2022-03-21 23:25:15 +08:00
pub struct TypoConfig<'a> {
pub max_typos: u8,
pub word_len_one_typo: u8,
pub word_len_two_typo: u8,
2022-03-21 23:25:15 +08:00
pub exact_words: fst::Set<Cow<'a, [u8]>>,
}
/// Return the `QueryKind` of a word depending on `authorize_typos`
/// and the provided word length.
2022-03-21 23:25:15 +08:00
fn typos<'a>(word: String, authorize_typos: bool, config: TypoConfig<'a>) -> QueryKind {
2022-04-04 17:02:43 +08:00
if authorize_typos && !config.exact_words.contains(&word) {
let count = word.chars().count().min(u8::MAX as usize) as u8;
2022-04-01 00:37:43 +08:00
if count < config.word_len_one_typo {
QueryKind::exact(word)
2022-04-01 00:37:43 +08:00
} else if count < config.word_len_two_typo {
QueryKind::tolerant(1.min(config.max_typos), word)
} else {
QueryKind::tolerant(2.min(config.max_typos), word)
}
} else {
QueryKind::exact(word)
}
}
/// Fetch synonyms from the `Context` for the provided word
/// and create the list of operations for the query tree
fn synonyms(ctx: &impl Context, word: &[&str]) -> heed::Result<Option<Vec<Operation>>> {
let synonyms = ctx.synonyms(word)?;
Ok(synonyms.map(|synonyms| {
2021-06-17 00:33:33 +08:00
synonyms
.into_iter()
.map(|synonym| {
let words = synonym
.into_iter()
.map(|word| {
Operation::Query(Query { prefix: false, kind: QueryKind::exact(word) })
})
.collect();
Operation::and(words)
})
.collect()
}))
}
/// Main function that creates the final query tree from the primitive query.
fn create_query_tree(
ctx: &impl Context,
optional_words: bool,
authorize_typos: bool,
2021-05-04 19:44:55 +08:00
query: &[PrimitiveQueryPart],
2021-06-17 00:33:33 +08:00
) -> Result<Operation> {
/// Matches on the `PrimitiveQueryPart` and create an operation from it.
fn resolve_primitive_part(
ctx: &impl Context,
authorize_typos: bool,
part: PrimitiveQueryPart,
2021-06-17 00:33:33 +08:00
) -> Result<Operation> {
match part {
// 1. try to split word in 2
// 2. try to fetch synonyms
// 3. create an operation containing the word
// 4. wrap all in an OR operation
PrimitiveQueryPart::Word(word, prefix) => {
let mut children = synonyms(ctx, &[&word])?.unwrap_or_default();
if let Some(child) = split_best_frequency(ctx, &word)? {
children.push(child);
}
let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
2022-03-21 23:25:15 +08:00
let exact_words = ctx.exact_words()?;
let config =
TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words };
2022-01-21 01:34:54 +08:00
children.push(Operation::Query(Query {
prefix,
kind: typos(word, authorize_typos, config),
2022-01-21 01:34:54 +08:00
}));
Ok(Operation::or(false, children))
2021-06-17 00:33:33 +08:00
}
// create a CONSECUTIVE operation wrapping all word in the phrase
2021-06-17 00:33:33 +08:00
PrimitiveQueryPart::Phrase(words) => Ok(Operation::phrase(words)),
}
}
/// Create all ngrams 1..=3 generating query tree branches.
fn ngrams(
ctx: &impl Context,
authorize_typos: bool,
query: &[PrimitiveQueryPart],
2021-06-17 00:33:33 +08:00
) -> Result<Operation> {
const MAX_NGRAM: usize = 3;
let mut op_children = Vec::new();
for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase())) {
let mut or_op_children = Vec::new();
for ngram in 1..=MAX_NGRAM.min(sub_query.len()) {
if let Some(group) = sub_query.get(..ngram) {
let mut and_op_children = Vec::new();
let tail = &sub_query[ngram..];
let is_last = tail.is_empty();
match group {
[part] => {
2021-06-17 00:33:33 +08:00
let operation =
resolve_primitive_part(ctx, authorize_typos, part.clone())?;
and_op_children.push(operation);
2021-06-17 00:33:33 +08:00
}
words => {
let is_prefix = words.last().map_or(false, |part| part.is_prefix());
2021-06-17 00:33:33 +08:00
let words: Vec<_> = words
.iter()
.filter_map(|part| {
if let PrimitiveQueryPart::Word(word, _) = part {
Some(word.as_str())
} else {
None
}
})
.collect();
let mut operations = synonyms(ctx, &words)?.unwrap_or_default();
let concat = words.concat();
let (word_len_one_typo, word_len_two_typo) =
ctx.min_word_len_for_typo()?;
2022-03-21 23:25:15 +08:00
let exact_words = ctx.exact_words()?;
let config = TypoConfig {
max_typos: 1,
word_len_one_typo,
word_len_two_typo,
exact_words,
};
2022-02-03 01:45:11 +08:00
let query = Query {
prefix: is_prefix,
kind: typos(concat, authorize_typos, config),
2022-02-03 01:45:11 +08:00
};
Revert "Integrate the stop_words in the querytree" This reverts commit 12fb509d8470e6d0c3a424756c9838a1efe306d2. We revert this commit because it's causing the bug #150. The initial algorithm we implemented for the stop_words was: 1. remove the stop_words from the dataset 2. keep the stop_words in the query to see if we can generate new words by integrating typos or if the word was a prefix => This was causing the bug since, in the case of “The hobbit”, we were **always** looking for something starting with “t he” or “th e” instead of ignoring the word completely. For now we are going to fix the bug by completely ignoring the stop_words in the query. This could cause another problem were someone mistyped a normal word and ended up typing a stop_word. For example imagine someone searching for the music “Won't he do it”. If that person misplace one space and write “Won' the do it” then we will loose a part of the request. One fix would be to update our query tree to something like that: --------------------- OR OR TOLERANT hobbit # the first option is to ignore the stop_word AND CONSECUTIVE # the second option is to do as we are doing EXACT t # currently EXACT he TOLERANT hobbit --------------------- This would increase drastically the size of our query tree on request with a lot of stop_words. For example think of “The Lord Of The Rings”. For now whatsoever we decided we were going to ignore this problem and consider that it doesn't reduce too much the relevancy of the search to do that while it improves the performances.
2021-04-08 21:12:37 +08:00
operations.push(Operation::Query(query));
and_op_children.push(Operation::or(false, operations));
}
}
if !is_last {
let ngrams = ngrams(ctx, authorize_typos, tail)?;
and_op_children.push(ngrams);
}
or_op_children.push(Operation::and(and_op_children));
}
}
op_children.push(Operation::or(false, or_op_children));
}
Ok(Operation::and(op_children))
}
/// Create a new branch removing the last non-phrase query parts.
fn optional_word(
ctx: &impl Context,
authorize_typos: bool,
query: PrimitiveQuery,
2021-06-17 00:33:33 +08:00
) -> Result<Operation> {
let number_phrases = query.iter().filter(|p| p.is_phrase()).count();
let mut operation_children = Vec::new();
let start = number_phrases + (number_phrases == 0) as usize;
for len in start..=query.len() {
let mut word_count = len - number_phrases;
2021-06-17 00:33:33 +08:00
let query: Vec<_> = query
.iter()
.filter(|p| {
if p.is_phrase() {
true
} else if word_count != 0 {
word_count -= 1;
true
} else {
false
}
})
.cloned()
.collect();
let ngrams = ngrams(ctx, authorize_typos, &query)?;
operation_children.push(ngrams);
}
Ok(Operation::or(true, operation_children))
}
if optional_words {
2021-05-04 19:44:55 +08:00
optional_word(ctx, authorize_typos, query.to_vec())
} else {
2021-05-04 19:44:55 +08:00
ngrams(ctx, authorize_typos, query)
}
}
2021-05-04 19:44:55 +08:00
pub type PrimitiveQuery = Vec<PrimitiveQueryPart>;
#[derive(Debug, Clone)]
2021-05-04 19:44:55 +08:00
pub enum PrimitiveQueryPart {
Phrase(Vec<String>),
Word(String, IsPrefix),
}
impl PrimitiveQueryPart {
fn is_phrase(&self) -> bool {
matches!(self, Self::Phrase(_))
}
fn is_prefix(&self) -> bool {
matches!(self, Self::Word(_, is_prefix) if *is_prefix)
}
}
/// Create primitive query from tokenized query string,
/// the primitive query is an intermediate state to build the query tree.
2021-06-17 00:33:33 +08:00
fn create_primitive_query(
query: TokenStream,
stop_words: Option<Set<&[u8]>>,
words_limit: Option<usize>,
) -> PrimitiveQuery {
let mut primitive_query = Vec::new();
let mut phrase = Vec::new();
let mut quoted = false;
2021-04-14 01:10:58 +08:00
let parts_limit = words_limit.unwrap_or(usize::MAX);
let mut peekable = query.peekable();
while let Some(token) = peekable.next() {
2021-04-14 01:10:58 +08:00
// early return if word limit is exceeded
2021-06-17 00:33:33 +08:00
if primitive_query.len() >= parts_limit {
return primitive_query;
}
2021-04-14 01:10:58 +08:00
match token.kind {
2021-06-17 00:33:33 +08:00
TokenKind::Word | TokenKind::StopWord => {
// 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote,
// 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word,
// 3. if the word is the last token of the query we push it as a prefix word.
if quoted {
phrase.push(token.word.to_string());
} else if peekable.peek().is_some() {
2021-06-17 00:33:33 +08:00
if !stop_words
.as_ref()
.map_or(false, |swords| swords.contains(token.word.as_ref()))
{
primitive_query
.push(PrimitiveQueryPart::Word(token.word.to_string(), false));
}
} else {
primitive_query.push(PrimitiveQueryPart::Word(token.word.to_string(), true));
}
2021-06-17 00:33:33 +08:00
}
TokenKind::Separator(separator_kind) => {
let quote_count = token.word.chars().filter(|&s| s == '"').count();
// swap quoted state if we encounter a double quote
if quote_count % 2 != 0 {
quoted = !quoted;
}
// if there is a quote or a hard separator we close the phrase.
2021-06-17 00:33:33 +08:00
if !phrase.is_empty() && (quote_count > 0 || separator_kind == SeparatorKind::Hard)
{
primitive_query.push(PrimitiveQueryPart::Phrase(mem::take(&mut phrase)));
}
2021-06-17 00:33:33 +08:00
}
_ => (),
}
}
// If a quote is never closed, we consider all of the end of the query as a phrase.
if !phrase.is_empty() {
primitive_query.push(PrimitiveQueryPart::Phrase(mem::take(&mut phrase)));
}
primitive_query
}
/// Returns the maximum number of typos that this Operation allows.
pub fn maximum_typo(operation: &Operation) -> usize {
2021-06-17 00:33:33 +08:00
use Operation::{And, Or, Phrase, Query};
match operation {
Or(_, ops) => ops.iter().map(maximum_typo).max().unwrap_or(0),
And(ops) => ops.iter().map(maximum_typo).sum::<usize>(),
Query(q) => q.kind.typo() as usize,
// no typo allowed in phrases
Phrase(_) => 0,
}
}
/// Returns the maximum proximity that this Operation allows.
pub fn maximum_proximity(operation: &Operation) -> usize {
2021-06-17 00:33:33 +08:00
use Operation::{And, Or, Phrase, Query};
match operation {
Or(_, ops) => ops.iter().map(maximum_proximity).max().unwrap_or(0),
2021-02-24 22:36:57 +08:00
And(ops) => {
2021-06-17 00:33:33 +08:00
ops.iter().map(maximum_proximity).sum::<usize>() + ops.len().saturating_sub(1) * 7
}
Query(_) | Phrase(_) => 0,
}
}
#[cfg(test)]
mod test {
2021-03-02 18:30:48 +08:00
use std::collections::HashMap;
2021-06-01 17:48:56 +08:00
use maplit::hashmap;
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
2021-06-17 00:33:33 +08:00
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use super::*;
2022-04-01 17:21:51 +08:00
use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
2021-03-02 18:30:48 +08:00
#[derive(Debug)]
struct TestContext {
synonyms: HashMap<Vec<String>, Vec<Vec<String>>>,
postings: HashMap<String, RoaringBitmap>,
2022-03-22 16:55:49 +08:00
// Raw bytes for the exact word fst Set
exact_words: Vec<u8>,
}
impl TestContext {
fn build(
&self,
optional_words: bool,
authorize_typos: bool,
2021-04-14 01:10:58 +08:00
words_limit: Option<usize>,
query: TokenStream,
2021-06-17 00:33:33 +08:00
) -> Result<Option<(Operation, PrimitiveQuery)>> {
2021-04-14 01:10:58 +08:00
let primitive_query = create_primitive_query(query, None, words_limit);
if !primitive_query.is_empty() {
2021-06-17 00:33:33 +08:00
let qt =
create_query_tree(self, optional_words, authorize_typos, &primitive_query)?;
2021-05-04 19:44:55 +08:00
Ok(Some((qt, primitive_query)))
} else {
Ok(None)
}
}
}
impl Context for TestContext {
fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>> {
Ok(self.postings.get(word).cloned())
}
fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect();
Ok(self.synonyms.get(&words).cloned())
}
fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> {
Ok((DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS))
}
2022-03-21 23:25:15 +08:00
fn exact_words(&self) -> crate::Result<fst::Set<Cow<[u8]>>> {
2022-03-22 16:55:49 +08:00
Ok(fst::Set::new(Cow::Borrowed(self.exact_words.as_slice())).unwrap())
2022-03-21 23:25:15 +08:00
}
}
impl Default for TestContext {
fn default() -> TestContext {
let mut rng = StdRng::seed_from_u64(102);
let rng = &mut rng;
fn random_postings<R: Rng>(rng: &mut R, len: usize) -> RoaringBitmap {
let mut values = Vec::<u32>::with_capacity(len);
while values.len() != len {
values.push(rng.gen());
}
values.sort_unstable();
RoaringBitmap::from_sorted_iter(values.into_iter()).unwrap()
}
2022-03-22 16:55:49 +08:00
let exact_words = fst::SetBuilder::new(Vec::new()).unwrap().into_inner().unwrap();
TestContext {
2021-06-17 00:33:33 +08:00
synonyms: hashmap! {
vec![String::from("hello")] => vec![
vec![String::from("hi")],
vec![String::from("good"), String::from("morning")],
],
vec![String::from("world")] => vec![
vec![String::from("earth")],
vec![String::from("nature")],
],
// new york city
vec![String::from("nyc")] => vec![
vec![String::from("new"), String::from("york")],
vec![String::from("new"), String::from("york"), String::from("city")],
],
vec![String::from("new"), String::from("york")] => vec![
vec![String::from("nyc")],
vec![String::from("new"), String::from("york"), String::from("city")],
],
vec![String::from("new"), String::from("york"), String::from("city")] => vec![
vec![String::from("nyc")],
vec![String::from("new"), String::from("york")],
],
},
2021-06-17 00:33:33 +08:00
postings: hashmap! {
String::from("hello") => random_postings(rng, 1500),
String::from("hi") => random_postings(rng, 4000),
String::from("word") => random_postings(rng, 2500),
String::from("split") => random_postings(rng, 400),
String::from("ngrams") => random_postings(rng, 1400),
String::from("world") => random_postings(rng, 15_000),
String::from("earth") => random_postings(rng, 8000),
String::from("2021") => random_postings(rng, 100),
String::from("2020") => random_postings(rng, 500),
String::from("is") => random_postings(rng, 50_000),
String::from("this") => random_postings(rng, 50_000),
String::from("good") => random_postings(rng, 1250),
String::from("morning") => random_postings(rng, 125),
},
2022-03-22 16:55:49 +08:00
exact_words,
}
}
}
/// The last word of a query that does not end with a separator is a prefix,
/// and so is the concatenation that ends with it.
#[test]
fn prefix() {
    /// Wraps a `QueryKind` into a `Query` leaf.
    fn leaf(prefix: bool, kind: QueryKind) -> Operation {
        Operation::Query(Query { prefix, kind })
    }

    let query = "hey friends";
    let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
    let result = analyzer.analyze(query);
    let tokens = result.tokens();

    let expected = Operation::Or(
        false,
        vec![
            Operation::And(vec![
                leaf(false, QueryKind::exact("hey".to_string())),
                leaf(true, QueryKind::tolerant(1, "friends".to_string())),
            ]),
            leaf(true, QueryKind::tolerant(1, "heyfriends".to_string())),
        ],
    );

    let (query_tree, _) =
        TestContext::default().build(false, true, None, tokens).unwrap().unwrap();

    assert_eq!(expected, query_tree);
}
/// A trailing separator prevents the last word from being a prefix.
#[test]
fn no_prefix() {
    /// Non-prefix exact leaf.
    fn exact(word: &str) -> Operation {
        Operation::Query(Query { prefix: false, kind: QueryKind::exact(word.to_string()) })
    }
    /// Non-prefix tolerant leaf.
    fn tolerant(typo: u8, word: &str) -> Operation {
        Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(typo, word.to_string()) })
    }

    let query = "hey friends ";
    let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
    let result = analyzer.analyze(query);
    let tokens = result.tokens();

    let expected = Operation::Or(
        false,
        vec![
            Operation::And(vec![exact("hey"), tolerant(1, "friends")]),
            tolerant(1, "heyfriends"),
        ],
    );

    let (query_tree, _) =
        TestContext::default().build(false, true, None, tokens).unwrap().unwrap();

    assert_eq!(expected, query_tree);
}
/// Single-word synonyms are added as alternatives next to the original word.
#[test]
fn synonyms() {
    /// Non-prefix exact leaf.
    fn exact(word: &str) -> Operation {
        Operation::Query(Query { prefix: false, kind: QueryKind::exact(word.to_string()) })
    }
    /// Non-prefix tolerant leaf.
    fn tolerant(typo: u8, word: &str) -> Operation {
        Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(typo, word.to_string()) })
    }

    let query = "hello world ";
    let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
    let result = analyzer.analyze(query);
    let tokens = result.tokens();

    let expected = Operation::Or(
        false,
        vec![
            Operation::And(vec![
                Operation::Or(
                    false,
                    vec![
                        exact("hi"),
                        Operation::And(vec![exact("good"), exact("morning")]),
                        tolerant(1, "hello"),
                    ],
                ),
                Operation::Or(
                    false,
                    vec![exact("earth"), exact("nature"), tolerant(1, "world")],
                ),
            ]),
            tolerant(1, "helloworld"),
        ],
    );

    let (query_tree, _) =
        TestContext::default().build(false, true, None, tokens).unwrap().unwrap();

    assert_eq!(expected, query_tree);
}
/// Multi-word synonyms are looked up for every ngram of the query.
#[test]
fn complex_synonyms() {
    /// Non-prefix exact leaf.
    fn exact(word: &str) -> Operation {
        Operation::Query(Query { prefix: false, kind: QueryKind::exact(word.to_string()) })
    }
    /// Non-prefix tolerant leaf.
    fn tolerant(typo: u8, word: &str) -> Operation {
        Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(typo, word.to_string()) })
    }

    let query = "new york city ";
    let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
    let result = analyzer.analyze(query);
    let tokens = result.tokens();

    let expected = Operation::Or(
        false,
        vec![
            // 1-gram "new" followed by the ngrams of "york city"
            Operation::And(vec![
                exact("new"),
                Operation::Or(
                    false,
                    vec![
                        Operation::And(vec![exact("york"), exact("city")]),
                        tolerant(1, "yorkcity"),
                    ],
                ),
            ]),
            // 2-gram "new york" followed by "city"
            Operation::And(vec![
                Operation::Or(
                    false,
                    vec![
                        exact("nyc"),
                        Operation::And(vec![exact("new"), exact("york"), exact("city")]),
                        tolerant(1, "newyork"),
                    ],
                ),
                exact("city"),
            ]),
            // 3-gram "new york city"
            Operation::Or(
                false,
                vec![
                    exact("nyc"),
                    Operation::And(vec![exact("new"), exact("york")]),
                    tolerant(1, "newyorkcity"),
                ],
            ),
        ],
    );

    let (query_tree, _) =
        TestContext::default().build(false, true, None, tokens).unwrap().unwrap();

    assert_eq!(expected, query_tree);
}
/// Two consecutive words also produce their concatenation as an alternative.
#[test]
fn ngrams() {
    /// Non-prefix exact leaf.
    fn exact(word: &str) -> Operation {
        Operation::Query(Query { prefix: false, kind: QueryKind::exact(word.to_string()) })
    }
    /// Non-prefix tolerant leaf.
    fn tolerant(typo: u8, word: &str) -> Operation {
        Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(typo, word.to_string()) })
    }

    let query = "n grams ";
    let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
    let result = analyzer.analyze(query);
    let tokens = result.tokens();

    let expected = Operation::Or(
        false,
        vec![
            Operation::And(vec![exact("n"), tolerant(1, "grams")]),
            tolerant(1, "ngrams"),
        ],
    );

    let (query_tree, _) =
        TestContext::default().build(false, true, None, tokens).unwrap().unwrap();

    assert_eq!(expected, query_tree);
}
/// A word whose two halves both exist in the postings is also searched as a phrase.
#[test]
fn word_split() {
    /// Non-prefix exact leaf.
    fn exact(word: &str) -> Operation {
        Operation::Query(Query { prefix: false, kind: QueryKind::exact(word.to_string()) })
    }
    /// Non-prefix tolerant leaf.
    fn tolerant(typo: u8, word: &str) -> Operation {
        Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(typo, word.to_string()) })
    }

    let query = "wordsplit fish ";
    let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
    let result = analyzer.analyze(query);
    let tokens = result.tokens();

    let expected = Operation::Or(
        false,
        vec![
            Operation::And(vec![
                Operation::Or(
                    false,
                    vec![
                        Operation::Phrase(vec!["word".to_string(), "split".to_string()]),
                        tolerant(2, "wordsplit"),
                    ],
                ),
                exact("fish"),
            ]),
            tolerant(1, "wordsplitfish"),
        ],
    );

    let (query_tree, _) =
        TestContext::default().build(false, true, None, tokens).unwrap().unwrap();

    assert_eq!(expected, query_tree);
}
/// Quoted words form phrases; an empty pair of quotes adds nothing and an
/// unclosed quote turns the rest of the query into a phrase.
#[test]
fn phrase() {
    let query = "\"hey friends\" \" \" \"wooop";
    let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
    let result = analyzer.analyze(query);
    let tokens = result.tokens();

    let expected = Operation::And(vec![
        Operation::Phrase(vec!["hey".to_string(), "friends".to_string()]),
        // A one-word phrase is turned into a non-prefix exact query.
        Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }),
    ]);

    let (query_tree, _) =
        TestContext::default().build(false, true, None, tokens).unwrap().unwrap();

    assert_eq!(expected, query_tree);
}
/// A hard separator inside quotes closes the current phrase and starts a new one.
#[test]
fn phrase_with_hard_separator() {
    let query = "\"hey friends. wooop wooop\"";
    let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
    let result = analyzer.analyze(query);
    let tokens = result.tokens();

    let expected = Operation::And(vec![
        Operation::Phrase(vec!["hey".to_string(), "friends".to_string()]),
        Operation::Phrase(vec!["wooop".to_string(), "wooop".to_string()]),
    ]);

    let (query_tree, _) =
        TestContext::default().build(false, true, None, tokens).unwrap().unwrap();

    assert_eq!(expected, query_tree);
}
/// With optional words enabled, one branch is generated per number of
/// leading words kept, from one word up to the whole query.
#[test]
fn optional_word() {
    /// Non-prefix exact leaf.
    fn exact(word: &str) -> Operation {
        Operation::Query(Query { prefix: false, kind: QueryKind::exact(word.to_string()) })
    }
    /// Non-prefix tolerant leaf.
    fn tolerant(typo: u8, word: &str) -> Operation {
        Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(typo, word.to_string()) })
    }

    let query = "hey my friend ";
    let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
    let result = analyzer.analyze(query);
    let tokens = result.tokens();

    let expected = Operation::Or(
        true,
        vec![
            // "hey"
            exact("hey"),
            // "hey my"
            Operation::Or(
                false,
                vec![Operation::And(vec![exact("hey"), exact("my")]), tolerant(1, "heymy")],
            ),
            // "hey my friend"
            Operation::Or(
                false,
                vec![
                    Operation::And(vec![
                        exact("hey"),
                        Operation::Or(
                            false,
                            vec![
                                Operation::And(vec![exact("my"), tolerant(1, "friend")]),
                                tolerant(1, "myfriend"),
                            ],
                        ),
                    ]),
                    Operation::And(vec![tolerant(1, "heymy"), tolerant(1, "friend")]),
                    tolerant(1, "heymyfriend"),
                ],
            ),
        ],
    );

    let (query_tree, _) =
        TestContext::default().build(true, true, None, tokens).unwrap().unwrap();

    assert_eq!(expected, query_tree);
}
#[test]
fn optional_word_phrase() {
    let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
    let analyzed = analyzer.analyze("\"hey my\"");
    let tokens = analyzed.tokens();

    // A fully quoted query is expected to come back as a single phrase.
    let words: Vec<String> = ["hey", "my"].iter().map(|word| word.to_string()).collect();
    let expected = Operation::Phrase(words);

    let (query_tree, _) =
        TestContext::default().build(true, true, None, tokens).unwrap().unwrap();

    assert_eq!(expected, query_tree);
}
#[test]
fn optional_word_multiple_phrases() {
    // Leaf builder: a non-prefix exact query on `word`.
    let exact = |word: &str| {
        Operation::Query(Query { prefix: false, kind: QueryKind::exact(word.to_string()) })
    };

    let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
    let analyzed = analyzer.analyze(r#""hey" my good "friend""#);
    let tokens = analyzed.tokens();

    // Only the two quoted words.
    let quoted_only = Operation::And(vec![exact("hey"), exact("friend")]);

    // Quoted words plus the first unquoted word.
    let with_my = Operation::And(vec![exact("hey"), exact("my"), exact("friend")]);

    // Quoted words plus both unquoted words, which may also be concatenated.
    let my_good = Operation::Or(
        false,
        vec![
            Operation::And(vec![exact("my"), exact("good")]),
            Operation::Query(Query {
                prefix: false,
                kind: QueryKind::tolerant(1, "mygood".to_string()),
            }),
        ],
    );
    let with_my_good = Operation::And(vec![exact("hey"), my_good, exact("friend")]);

    // The outermost `Or` carries `IsOptionalWord = true`.
    let expected = Operation::Or(true, vec![quoted_only, with_my, with_my_good]);

    let (query_tree, _) =
        TestContext::default().build(true, true, None, tokens).unwrap().unwrap();

    assert_eq!(expected, query_tree);
}
#[test]
fn no_typo() {
    // Leaf builder: a non-prefix exact query on `word`.
    let exact = |word: &str| {
        Operation::Query(Query { prefix: false, kind: QueryKind::exact(word.to_string()) })
    };

    let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
    let analyzed = analyzer.analyze("hey friends ");
    let tokens = analyzed.tokens();

    // Every leaf, including the concatenated word, is expected to be exact.
    let split_words = Operation::And(vec![exact("hey"), exact("friends")]);
    let joined_words = exact("heyfriends");
    let expected = Operation::Or(false, vec![split_words, joined_words]);

    // NOTE(review): the second `false` presumably disables typo tolerance;
    // confirm against `TestContext::build`.
    let (query_tree, _) =
        TestContext::default().build(false, false, None, tokens).unwrap().unwrap();

    assert_eq!(expected, query_tree);
}
2021-04-14 01:10:58 +08:00
#[test]
fn words_limit() {
    let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
    let analyzed = analyzer.analyze("\"hey my\" good friend");
    let tokens = analyzed.tokens();

    // With `Some(2)` passed to `build`, the expected tree holds the phrase and
    // "good" only; the trailing "friend" is absent.
    let phrase = Operation::Phrase(vec![String::from("hey"), String::from("my")]);
    let good = Operation::Query(Query {
        prefix: false,
        kind: QueryKind::exact(String::from("good")),
    });
    let expected = Operation::And(vec![phrase, good]);

    let (query_tree, _) =
        TestContext::default().build(false, false, Some(2), tokens).unwrap().unwrap();

    assert_eq!(expected, query_tree);
}
2022-03-31 19:50:18 +08:00
#[test]
fn test_min_word_len_typo() {
    let exact_words = fst::Set::from_iter([b""]).unwrap().map_data(Cow::Owned).unwrap();
    let config =
        TypoConfig { max_typos: 2, word_len_one_typo: 5, word_len_two_typo: 7, exact_words };

    // (input word, expected kind), checked in order against the thresholds above:
    // 5 letters -> one typo, 4 letters -> exact, 12 letters -> two typos.
    let cases = vec![
        ("hello", QueryKind::Tolerant { typo: 1, word: "hello".to_string() }),
        ("hell", QueryKind::exact("hell".to_string())),
        ("verylongword", QueryKind::Tolerant { typo: 2, word: "verylongword".to_string() }),
    ];

    for (word, expected) in cases {
        assert_eq!(typos(word.to_string(), true, config.clone()), expected);
    }
}
2022-03-22 16:55:49 +08:00
#[test]
fn disable_typo_on_word() {
    let query = "goodbye";
    let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
    let result = analyzer.analyze(query);
    let tokens = result.tokens();

    // Register the queried word itself in the context's `exact_words` set.
    let exact_words = fst::Set::from_iter(Some("goodbye")).unwrap().into_fst().into_inner();
    let context = TestContext { exact_words, ..Default::default() };
    let (query_tree, _) = context.build(false, true, Some(2), tokens).unwrap().unwrap();

    // The word must come back as an exact (non-tolerant) prefix query. The pattern
    // binds nothing, so `query_tree` is not moved and can still be printed in the
    // failure message; nothing is printed when the test passes.
    assert!(
        matches!(
            query_tree,
            Operation::Query(Query { prefix: true, kind: QueryKind::Exact { .. } })
        ),
        "expected an exact prefix query, got:\n{:?}",
        query_tree
    );
}
}