2022-04-04 17:52:35 +08:00
|
|
|
use std::borrow::Cow;
|
2022-09-01 18:10:47 +08:00
|
|
|
use std::cmp::max;
|
2022-10-12 15:48:23 +08:00
|
|
|
use std::{fmt, mem};
|
2021-03-03 19:03:31 +08:00
|
|
|
|
2022-06-02 21:47:28 +08:00
|
|
|
use charabia::classifier::ClassifiedTokenIter;
|
|
|
|
use charabia::{SeparatorKind, TokenKind};
|
2021-04-09 03:21:20 +08:00
|
|
|
use fst::Set;
|
2021-03-03 19:03:31 +08:00
|
|
|
use roaring::RoaringBitmap;
|
|
|
|
use slice_group_by::GroupBy;
|
|
|
|
|
2022-04-05 00:56:59 +08:00
|
|
|
use crate::search::matches::matching_words::{MatchingWord, PrimitiveWordId};
|
2022-08-18 23:36:08 +08:00
|
|
|
use crate::search::TermsMatchingStrategy;
|
2022-10-12 15:48:23 +08:00
|
|
|
use crate::{CboRoaringBitmapLenCodec, Index, MatchingWords, Result};
|
2021-03-03 19:03:31 +08:00
|
|
|
|
|
|
|
/// Marks an `Or` node whose branches were produced by making a query word optional.
type IsOptionalWord = bool;

/// Marks a query word that should match as a prefix (typically the last typed word).
type IsPrefix = bool;
|
|
|
|
|
|
|
|
/// A boolean-logic node of the query tree.
#[derive(Clone, PartialEq, Eq, Hash)]
pub enum Operation {
    /// Every child operation must match (intersection).
    And(Vec<Operation>),
    // series of consecutive non prefix and exact words
    // `None` means a stop word.
    Phrase(Vec<Option<String>>),
    /// At least one child must match (union); the flag records whether this
    /// branch comes from making a query word optional.
    Or(IsOptionalWord, Vec<Operation>),
    /// A leaf node: a single word query.
    Query(Query),
}
|
|
|
|
|
|
|
|
impl fmt::Debug for Operation {
|
|
|
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
|
|
fn pprint_tree(f: &mut fmt::Formatter<'_>, op: &Operation, depth: usize) -> fmt::Result {
|
|
|
|
match op {
|
|
|
|
Operation::And(children) => {
|
|
|
|
writeln!(f, "{:1$}AND", "", depth * 2)?;
|
|
|
|
children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
|
2021-06-17 00:33:33 +08:00
|
|
|
}
|
2021-06-09 23:28:12 +08:00
|
|
|
Operation::Phrase(children) => {
|
|
|
|
writeln!(f, "{:2$}PHRASE {:?}", "", children, depth * 2)
|
2021-06-17 00:33:33 +08:00
|
|
|
}
|
2021-03-03 19:03:31 +08:00
|
|
|
Operation::Or(true, children) => {
|
|
|
|
writeln!(f, "{:1$}OR(WORD)", "", depth * 2)?;
|
|
|
|
children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
|
2021-06-17 00:33:33 +08:00
|
|
|
}
|
2021-03-03 19:03:31 +08:00
|
|
|
Operation::Or(false, children) => {
|
|
|
|
writeln!(f, "{:1$}OR", "", depth * 2)?;
|
|
|
|
children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
|
2021-06-17 00:33:33 +08:00
|
|
|
}
|
2021-03-03 19:03:31 +08:00
|
|
|
Operation::Query(query) => writeln!(f, "{:2$}{:?}", "", query, depth * 2),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
pprint_tree(f, self, 0)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl Operation {
|
|
|
|
fn and(mut ops: Vec<Self>) -> Self {
|
|
|
|
if ops.len() == 1 {
|
|
|
|
ops.pop().unwrap()
|
|
|
|
} else {
|
|
|
|
Self::And(ops)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn or(word_branch: IsOptionalWord, mut ops: Vec<Self>) -> Self {
|
|
|
|
if ops.len() == 1 {
|
|
|
|
ops.pop().unwrap()
|
|
|
|
} else {
|
2022-08-18 23:36:08 +08:00
|
|
|
let ops = ops
|
|
|
|
.into_iter()
|
|
|
|
.flat_map(|o| match o {
|
|
|
|
Operation::Or(wb, children) if wb == word_branch => children,
|
|
|
|
op => vec![op],
|
|
|
|
})
|
|
|
|
.collect();
|
2021-03-03 19:03:31 +08:00
|
|
|
Self::Or(word_branch, ops)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-10-26 21:38:06 +08:00
|
|
|
fn phrase(mut words: Vec<Option<String>>) -> Self {
|
2021-06-09 23:28:12 +08:00
|
|
|
if words.len() == 1 {
|
2022-10-26 21:38:06 +08:00
|
|
|
if let Some(word) = words.pop().unwrap() {
|
|
|
|
Self::Query(Query { prefix: false, kind: QueryKind::exact(word) })
|
|
|
|
} else {
|
|
|
|
Self::Phrase(words)
|
|
|
|
}
|
2021-03-03 19:03:31 +08:00
|
|
|
} else {
|
2021-06-09 23:28:12 +08:00
|
|
|
Self::Phrase(words)
|
2021-03-03 19:03:31 +08:00
|
|
|
}
|
|
|
|
}
|
2021-02-23 00:17:01 +08:00
|
|
|
|
|
|
|
pub fn query(&self) -> Option<&Query> {
|
|
|
|
match self {
|
|
|
|
Operation::Query(query) => Some(query),
|
|
|
|
_ => None,
|
|
|
|
}
|
|
|
|
}
|
2021-03-03 19:03:31 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/// A single word query, the leaf of the query tree.
#[derive(Clone, Eq, PartialEq, Hash)]
pub struct Query {
    /// Whether the word should match as a prefix.
    pub prefix: IsPrefix,
    /// How strictly the word must match (exact, or tolerant to typos).
    pub kind: QueryKind,
}
|
|
|
|
|
|
|
|
/// The matching mode of a query word.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum QueryKind {
    /// The word may match with up to `typo` typos.
    Tolerant { typo: u8, word: String },
    /// The word must match exactly; `original_typo` keeps the typo count
    /// the word carried before being forced exact.
    Exact { original_typo: u8, word: String },
}
|
|
|
|
|
|
|
|
impl QueryKind {
|
2021-02-24 17:25:22 +08:00
|
|
|
pub fn exact(word: String) -> Self {
|
2021-03-03 19:03:31 +08:00
|
|
|
QueryKind::Exact { original_typo: 0, word }
|
|
|
|
}
|
|
|
|
|
2021-02-24 17:25:22 +08:00
|
|
|
pub fn tolerant(typo: u8, word: String) -> Self {
|
2021-03-03 19:03:31 +08:00
|
|
|
QueryKind::Tolerant { typo, word }
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn typo(&self) -> u8 {
|
|
|
|
match self {
|
|
|
|
QueryKind::Tolerant { typo, .. } => *typo,
|
|
|
|
QueryKind::Exact { original_typo, .. } => *original_typo,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn word(&self) -> &str {
|
|
|
|
match self {
|
|
|
|
QueryKind::Tolerant { word, .. } => word,
|
|
|
|
QueryKind::Exact { word, .. } => word,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl fmt::Debug for Query {
|
|
|
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
|
|
|
let Query { prefix, kind } = self;
|
|
|
|
let prefix = if *prefix { String::from("Prefix") } else { String::default() };
|
|
|
|
match kind {
|
|
|
|
QueryKind::Exact { word, .. } => {
|
|
|
|
f.debug_struct(&(prefix + "Exact")).field("word", &word).finish()
|
2021-06-17 00:33:33 +08:00
|
|
|
}
|
|
|
|
QueryKind::Tolerant { typo, word } => f
|
|
|
|
.debug_struct(&(prefix + "Tolerant"))
|
|
|
|
.field("word", &word)
|
|
|
|
.field("max typo", &typo)
|
|
|
|
.finish(),
|
2021-03-03 19:03:31 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Read-only access to the index data needed to build a query tree.
/// Abstracted as a trait so tests can supply an in-memory implementation.
trait Context {
    /// The set of document ids containing `word`.
    fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;

    /// The synonyms registered for the multi-word sequence `words`.
    fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>>;

    /// Number of documents containing `word`; default implementation counts
    /// the docids bitmap.
    fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
        match self.word_docids(word)? {
            Some(rb) => Ok(Some(rb.len())),
            None => Ok(None),
        }
    }

    /// Returns the minimum word len for 1 and 2 typos.
    fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)>;

    /// Words that must always be matched exactly (no typo allowed).
    fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>>;

    /// Number of documents where `left_word` and `right_word` appear at the
    /// given `proximity` from each other.
    fn word_pair_frequency(
        &self,
        left_word: &str,
        right_word: &str,
        proximity: u8,
    ) -> heed::Result<Option<u64>>;
}
|
|
|
|
|
|
|
|
/// The query tree builder is the interface to build a query tree.
pub struct QueryTreeBuilder<'a> {
    // Read transaction used for every index lookup.
    rtxn: &'a heed::RoTxn<'a>,
    index: &'a Index,
    // How aggressively query words may be dropped; default: `Last`.
    terms_matching_strategy: TermsMatchingStrategy,
    // Whether typo-tolerant matching is allowed; default: `true`.
    authorize_typos: bool,
    // Optional cap on the number of query parts taken into account.
    words_limit: Option<usize>,
    // Words that must always match exactly, loaded once at construction.
    exact_words: Option<fst::Set<Cow<'a, [u8]>>>,
}
|
|
|
|
|
|
|
|
// The builder implements `Context` by delegating every lookup to the index
// databases through its read transaction.
impl<'a> Context for QueryTreeBuilder<'a> {
    fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>> {
        self.index.word_docids.get(self.rtxn, word)
    }

    fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
        self.index.words_synonyms(self.rtxn, words)
    }

    fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
        self.index.word_documents_count(self.rtxn, word)
    }

    fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> {
        let one = self.index.min_word_len_one_typo(self.rtxn)?;
        let two = self.index.min_word_len_two_typos(self.rtxn)?;
        Ok((one, two))
    }

    fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>> {
        self.exact_words.as_ref()
    }

    fn word_pair_frequency(
        &self,
        left_word: &str,
        right_word: &str,
        proximity: u8,
    ) -> heed::Result<Option<u64>> {
        // Only the bitmap length is needed, so remap the codec to decode the
        // cardinality without materializing the whole bitmap.
        let key = (proximity, left_word, right_word);
        self.index
            .word_pair_proximity_docids
            .remap_data_type::<CboRoaringBitmapLenCodec>()
            .get(self.rtxn, &key)
    }
}
|
|
|
|
|
|
|
|
impl<'a> QueryTreeBuilder<'a> {
    /// Create a `QueryTreeBuilder` from a heed ReadOnly transaction `rtxn`
    /// and an Index `index`.
    pub fn new(rtxn: &'a heed::RoTxn<'a>, index: &'a Index) -> Result<Self> {
        Ok(Self {
            rtxn,
            index,
            terms_matching_strategy: TermsMatchingStrategy::default(),
            authorize_typos: true,
            words_limit: None,
            // Loaded eagerly here so later lookups are allocation-free.
            exact_words: index.exact_words(rtxn)?,
        })
    }

    /// if `terms_matching_strategy` is set to `All` the query tree will be
    /// generated forcing all query words to be present in each matching documents
    /// (the criterion `words` will be ignored).
    /// default value if not called: `Last`
    pub fn terms_matching_strategy(
        &mut self,
        terms_matching_strategy: TermsMatchingStrategy,
    ) -> &mut Self {
        self.terms_matching_strategy = terms_matching_strategy;
        self
    }

    /// if `authorize_typos` is set to `false` the query tree will be generated
    /// forcing all query words to match documents without any typo
    /// (the criterion `typo` will be ignored).
    /// default value if not called: `true`
    pub fn authorize_typos(&mut self, authorize_typos: bool) -> &mut Self {
        self.authorize_typos = authorize_typos;
        self
    }

    /// Limit words and phrases that will be taken for query building.
    /// Any beyond `words_limit` will be ignored.
    pub fn words_limit(&mut self, words_limit: usize) -> &mut Self {
        self.words_limit = Some(words_limit);
        self
    }

    /// Build the query tree:
    /// - if `terms_matching_strategy` is set to `All` the query tree will be
    /// generated forcing all query words to be present in each matching documents
    /// (the criterion `words` will be ignored)
    /// - if `authorize_typos` is set to `false` the query tree will be generated
    /// forcing all query words to match documents without any typo
    /// (the criterion `typo` will be ignored)
    ///
    /// Returns `None` when the tokenized query contains no usable part.
    pub fn build<A: AsRef<[u8]>>(
        &self,
        query: ClassifiedTokenIter<A>,
    ) -> Result<Option<(Operation, PrimitiveQuery, MatchingWords)>> {
        let stop_words = self.index.stop_words(self.rtxn)?;
        // First flatten the token stream into primitive parts (words/phrases).
        let primitive_query = create_primitive_query(query, stop_words, self.words_limit);
        if !primitive_query.is_empty() {
            let qt = create_query_tree(
                self,
                self.terms_matching_strategy,
                self.authorize_typos,
                &primitive_query,
            )?;
            // Matching words are derived from the same primitive query so
            // highlighting stays consistent with the tree.
            let matching_words =
                create_matching_words(self, self.authorize_typos, &primitive_query)?;
            Ok(Some((qt, primitive_query, matching_words)))
        } else {
            Ok(None)
        }
    }
}
|
|
|
|
|
2022-10-12 16:06:48 +08:00
|
|
|
/// Split the word depending on the frequency of pairs near together in the database documents.
|
2022-04-07 23:05:44 +08:00
|
|
|
fn split_best_frequency<'a>(
|
|
|
|
ctx: &impl Context,
|
|
|
|
word: &'a str,
|
2022-10-13 15:44:27 +08:00
|
|
|
) -> heed::Result<Option<(&'a str, &'a str)>> {
|
2021-03-03 19:03:31 +08:00
|
|
|
let chars = word.char_indices().skip(1);
|
|
|
|
let mut best = None;
|
|
|
|
|
|
|
|
for (i, _) in chars {
|
|
|
|
let (left, right) = word.split_at(i);
|
|
|
|
|
2022-10-12 15:48:23 +08:00
|
|
|
let pair_freq = ctx.word_pair_frequency(left, right, 1)?.unwrap_or(0);
|
2021-03-03 19:03:31 +08:00
|
|
|
|
2022-10-12 15:48:23 +08:00
|
|
|
if pair_freq != 0 && best.map_or(true, |(old, _, _)| pair_freq > old) {
|
|
|
|
best = Some((pair_freq, left, right));
|
2021-03-03 19:03:31 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-10-13 15:44:27 +08:00
|
|
|
Ok(best.map(|(_, left, right)| (left, right)))
|
2021-03-03 19:03:31 +08:00
|
|
|
}
|
|
|
|
|
2022-03-31 19:50:18 +08:00
|
|
|
/// Parameters controlling how many typos a word may tolerate.
#[derive(Clone)]
pub struct TypoConfig<'a> {
    /// Hard cap on the typo allowance (e.g. 1 for ngram concatenations).
    pub max_typos: u8,
    /// Minimum word length (in chars) required to allow one typo.
    pub word_len_one_typo: u8,
    /// Minimum word length (in chars) required to allow two typos.
    pub word_len_two_typo: u8,
    /// Words that must always be matched exactly, regardless of length.
    pub exact_words: Option<&'a fst::Set<Cow<'a, [u8]>>>,
}
|
|
|
|
|
2021-03-03 19:03:31 +08:00
|
|
|
/// Return the `QueryKind` of a word depending on `authorize_typos`
|
|
|
|
/// and the provided word length.
|
2022-10-25 18:42:38 +08:00
|
|
|
fn typos(word: String, authorize_typos: bool, config: TypoConfig) -> QueryKind {
|
2022-05-24 20:15:33 +08:00
|
|
|
if authorize_typos && !config.exact_words.map_or(false, |s| s.contains(&word)) {
|
2022-03-21 20:29:59 +08:00
|
|
|
let count = word.chars().count().min(u8::MAX as usize) as u8;
|
2022-04-01 00:37:43 +08:00
|
|
|
if count < config.word_len_one_typo {
|
2022-03-21 20:29:59 +08:00
|
|
|
QueryKind::exact(word)
|
2022-04-01 00:37:43 +08:00
|
|
|
} else if count < config.word_len_two_typo {
|
2022-03-21 20:29:59 +08:00
|
|
|
QueryKind::tolerant(1.min(config.max_typos), word)
|
|
|
|
} else {
|
|
|
|
QueryKind::tolerant(2.min(config.max_typos), word)
|
2021-03-03 19:03:31 +08:00
|
|
|
}
|
|
|
|
} else {
|
|
|
|
QueryKind::exact(word)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-04-10 03:56:20 +08:00
|
|
|
/// Fetch synonyms from the `Context` for the provided word
|
2021-03-03 19:03:31 +08:00
|
|
|
/// and create the list of operations for the query tree
|
2021-04-10 03:56:20 +08:00
|
|
|
fn synonyms(ctx: &impl Context, word: &[&str]) -> heed::Result<Option<Vec<Operation>>> {
|
|
|
|
let synonyms = ctx.synonyms(word)?;
|
2021-03-03 19:03:31 +08:00
|
|
|
|
|
|
|
Ok(synonyms.map(|synonyms| {
|
2021-06-17 00:33:33 +08:00
|
|
|
synonyms
|
|
|
|
.into_iter()
|
|
|
|
.map(|synonym| {
|
|
|
|
let words = synonym
|
|
|
|
.into_iter()
|
|
|
|
.map(|word| {
|
|
|
|
Operation::Query(Query { prefix: false, kind: QueryKind::exact(word) })
|
|
|
|
})
|
|
|
|
.collect();
|
|
|
|
Operation::and(words)
|
|
|
|
})
|
|
|
|
.collect()
|
2021-03-03 19:03:31 +08:00
|
|
|
}))
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Main function that creates the final query tree from the primitive query.
///
/// Depending on `terms_matching_strategy`, less-important words are dropped
/// one at a time and each intermediate query becomes an optional OR branch,
/// so documents matching more words rank higher.
fn create_query_tree(
    ctx: &impl Context,
    terms_matching_strategy: TermsMatchingStrategy,
    authorize_typos: bool,
    query: &[PrimitiveQueryPart],
) -> Result<Operation> {
    /// Matches on the `PrimitiveQueryPart` and create an operation from it.
    fn resolve_primitive_part(
        ctx: &impl Context,
        authorize_typos: bool,
        part: PrimitiveQueryPart,
    ) -> Result<Operation> {
        match part {
            // 1. try to split word in 2
            // 2. try to fetch synonyms
            // 3. create an operation containing the word
            // 4. wrap all in an OR operation
            PrimitiveQueryPart::Word(word, prefix) => {
                let mut children = synonyms(ctx, &[&word])?.unwrap_or_default();
                if let Some((left, right)) = split_best_frequency(ctx, &word)? {
                    children.push(Operation::Phrase(vec![
                        Some(left.to_string()),
                        Some(right.to_string()),
                    ]));
                }
                let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
                let exact_words = ctx.exact_words();
                let config =
                    TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words };
                children.push(Operation::Query(Query {
                    prefix,
                    kind: typos(word, authorize_typos, config),
                }));
                Ok(Operation::or(false, children))
            }
            // create a CONSECUTIVE operation wrapping all words in the phrase
            PrimitiveQueryPart::Phrase(words) => Ok(Operation::phrase(words)),
        }
    }

    /// Create all ngrams 1..=3 generating query tree branches.
    fn ngrams(
        ctx: &impl Context,
        authorize_typos: bool,
        query: &[PrimitiveQueryPart],
        any_words: bool,
    ) -> Result<Operation> {
        const MAX_NGRAM: usize = 3;
        let mut op_children = Vec::new();

        // Group consecutive words together; phrases always form their own group.
        for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase())) {
            let mut or_op_children = Vec::new();

            for ngram in 1..=MAX_NGRAM.min(sub_query.len()) {
                if let Some(group) = sub_query.get(..ngram) {
                    let mut and_op_children = Vec::new();
                    let tail = &sub_query[ngram..];
                    let is_last = tail.is_empty();

                    match group {
                        // unigram: resolve the part on its own
                        [part] => {
                            let operation =
                                resolve_primitive_part(ctx, authorize_typos, part.clone())?;
                            and_op_children.push(operation);
                        }
                        // n-gram (n >= 2): concatenate the words and query the
                        // concatenation with a reduced typo allowance
                        words => {
                            let is_prefix = words.last().map_or(false, |part| part.is_prefix());
                            let words: Vec<_> = words
                                .iter()
                                .filter_map(|part| {
                                    if let PrimitiveQueryPart::Word(word, _) = part {
                                        Some(word.as_str())
                                    } else {
                                        None
                                    }
                                })
                                .collect();
                            let mut operations = synonyms(ctx, &words)?.unwrap_or_default();
                            let concat = words.concat();
                            let (word_len_one_typo, word_len_two_typo) =
                                ctx.min_word_len_for_typo()?;
                            let exact_words = ctx.exact_words();
                            // Only one typo allowed on concatenations.
                            let config = TypoConfig {
                                max_typos: 1,
                                word_len_one_typo,
                                word_len_two_typo,
                                exact_words,
                            };
                            let query = Query {
                                prefix: is_prefix,
                                kind: typos(concat, authorize_typos, config),
                            };
                            operations.push(Operation::Query(query));
                            and_op_children.push(Operation::or(false, operations));
                        }
                    }

                    // Recurse on the remaining parts of the group.
                    if !is_last {
                        let ngrams = ngrams(ctx, authorize_typos, tail, any_words)?;
                        and_op_children.push(ngrams);
                    }

                    if any_words {
                        or_op_children.push(Operation::or(false, and_op_children));
                    } else {
                        or_op_children.push(Operation::and(and_op_children));
                    }
                }
            }
            op_children.push(Operation::or(false, or_op_children));
        }

        if any_words {
            Ok(Operation::or(false, op_children))
        } else {
            Ok(Operation::and(op_children))
        }
    }

    // Phrases are never removed; at least one part always remains.
    let number_phrases = query.iter().filter(|p| p.is_phrase()).count();
    let remove_count = query.len() - max(number_phrases, 1);
    if remove_count == 0 {
        return ngrams(ctx, authorize_typos, query, false);
    }

    let mut operation_children = Vec::new();
    let mut query = query.to_vec();
    for _ in 0..=remove_count {
        // Pick which part to drop next according to the strategy; `All` and
        // `Any` short-circuit and return a complete tree immediately.
        let pos = match terms_matching_strategy {
            TermsMatchingStrategy::All => return ngrams(ctx, authorize_typos, &query, false),
            TermsMatchingStrategy::Any => {
                let operation = Operation::Or(
                    true,
                    vec![
                        // branch allowing matching documents to contain any query word.
                        ngrams(ctx, authorize_typos, &query, true)?,
                        // branch forcing matching documents to contain all the query words,
                        // keeping these documents at the top of the result list.
                        ngrams(ctx, authorize_typos, &query, false)?,
                    ],
                );

                return Ok(operation);
            }
            // drop the last non-phrase word
            TermsMatchingStrategy::Last => query
                .iter()
                .enumerate()
                .filter(|(_, part)| !part.is_phrase())
                .last()
                .map(|(pos, _)| pos),
            // drop the first non-phrase word
            TermsMatchingStrategy::First => {
                query.iter().enumerate().find(|(_, part)| !part.is_phrase()).map(|(pos, _)| pos)
            }
            // drop the shortest non-phrase word
            TermsMatchingStrategy::Size => query
                .iter()
                .enumerate()
                .filter(|(_, part)| !part.is_phrase())
                .min_by_key(|(_, part)| match part {
                    PrimitiveQueryPart::Word(s, _) => s.len(),
                    _ => unreachable!(),
                })
                .map(|(pos, _)| pos),
            // drop the most frequent non-phrase word (unknown words count as max)
            TermsMatchingStrategy::Frequency => query
                .iter()
                .enumerate()
                .filter(|(_, part)| !part.is_phrase())
                .max_by_key(|(_, part)| match part {
                    PrimitiveQueryPart::Word(s, _) => {
                        ctx.word_documents_count(s).unwrap_or_default().unwrap_or(u64::max_value())
                    }
                    _ => unreachable!(),
                })
                .map(|(pos, _)| pos),
        };

        // compute and push the current branch on the front
        operation_children.insert(0, ngrams(ctx, authorize_typos, &query, false)?);
        // remove word from query before creating a new branch
        match pos {
            Some(pos) => query.remove(pos),
            None => break,
        };
    }

    Ok(Operation::or(true, operation_children))
}
|
|
|
|
|
2022-04-05 00:56:59 +08:00
|
|
|
/// Main function that creates the matching words used for crop and highlight.
///
/// Mirrors the structure of `create_query_tree`: each primitive part (and each
/// ngram concatenation) contributes `(MatchingWord, word ids)` pairs.
fn create_matching_words(
    ctx: &impl Context,
    authorize_typos: bool,
    query: &[PrimitiveQueryPart],
) -> Result<MatchingWords> {
    /// Matches on the `PrimitiveQueryPart` and creates matching words from it.
    fn resolve_primitive_part(
        ctx: &impl Context,
        authorize_typos: bool,
        part: PrimitiveQueryPart,
        matching_words: &mut Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>,
        id: PrimitiveWordId,
    ) -> Result<()> {
        match part {
            // 1. try to split word in 2
            // 2. try to fetch synonyms
            PrimitiveQueryPart::Word(word, prefix) => {
                // Synonyms match exactly, without prefix or typos.
                if let Some(synonyms) = ctx.synonyms(&[word.as_str()])? {
                    for synonym in synonyms {
                        let synonym = synonym
                            .into_iter()
                            .map(|syn| MatchingWord::new(syn, 0, false))
                            .collect();
                        matching_words.push((synonym, vec![id]));
                    }
                }

                // Both halves of the best split share the original word's id.
                if let Some((left, right)) = split_best_frequency(ctx, &word)? {
                    let left = MatchingWord::new(left.to_string(), 0, false);
                    let right = MatchingWord::new(right.to_string(), 0, false);
                    matching_words.push((vec![left, right], vec![id]));
                }

                let (word_len_one_typo, word_len_two_typo) = ctx.min_word_len_for_typo()?;
                let exact_words = ctx.exact_words();
                let config =
                    TypoConfig { max_typos: 2, word_len_one_typo, word_len_two_typo, exact_words };

                let matching_word = match typos(word, authorize_typos, config) {
                    QueryKind::Exact { word, .. } => MatchingWord::new(word, 0, prefix),
                    QueryKind::Tolerant { typo, word } => MatchingWord::new(word, typo, prefix),
                };
                matching_words.push((vec![matching_word], vec![id]));
            }
            // create CONSECUTIVE matching words wrapping all words in the phrase
            PrimitiveQueryPart::Phrase(words) => {
                // Stop words (`None`) still consume a word id but are skipped
                // in the matching words list below.
                let ids: Vec<_> =
                    (0..words.len()).into_iter().map(|i| id + i as PrimitiveWordId).collect();
                let words = words
                    .into_iter()
                    .filter_map(|w| w)
                    .map(|w| MatchingWord::new(w, 0, false))
                    .collect();
                matching_words.push((words, ids));
            }
        }

        Ok(())
    }

    /// Create all ngrams 1..=3 generating query tree branches.
    fn ngrams(
        ctx: &impl Context,
        authorize_typos: bool,
        query: &[PrimitiveQueryPart],
        matching_words: &mut Vec<(Vec<MatchingWord>, Vec<PrimitiveWordId>)>,
        mut id: PrimitiveWordId,
    ) -> Result<()> {
        const MAX_NGRAM: usize = 3;

        // Same grouping as in `create_query_tree`: phrases split the groups.
        for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase())) {
            for ngram in 1..=MAX_NGRAM.min(sub_query.len()) {
                if let Some(group) = sub_query.get(..ngram) {
                    let tail = &sub_query[ngram..];
                    let is_last = tail.is_empty();

                    match group {
                        [part] => {
                            resolve_primitive_part(
                                ctx,
                                authorize_typos,
                                part.clone(),
                                matching_words,
                                id,
                            )?;
                        }
                        words => {
                            let is_prefix = words.last().map_or(false, |part| part.is_prefix());
                            let words: Vec<_> = words
                                .iter()
                                .filter_map(|part| {
                                    if let PrimitiveQueryPart::Word(word, _) = part {
                                        Some(word.as_str())
                                    } else {
                                        None
                                    }
                                })
                                .collect();
                            let ids: Vec<_> = (0..words.len())
                                .into_iter()
                                .map(|i| id + i as PrimitiveWordId)
                                .collect();

                            if let Some(synonyms) = ctx.synonyms(&words)? {
                                for synonym in synonyms {
                                    let synonym = synonym
                                        .into_iter()
                                        .map(|syn| MatchingWord::new(syn, 0, false))
                                        .collect();
                                    matching_words.push((synonym, ids.clone()));
                                }
                            }
                            let word = words.concat();
                            let (word_len_one_typo, word_len_two_typo) =
                                ctx.min_word_len_for_typo()?;
                            let exact_words = ctx.exact_words();
                            // Concatenated ngrams only tolerate one typo.
                            let config = TypoConfig {
                                max_typos: 1,
                                word_len_one_typo,
                                word_len_two_typo,
                                exact_words,
                            };
                            let matching_word = match typos(word, authorize_typos, config) {
                                QueryKind::Exact { word, .. } => {
                                    MatchingWord::new(word, 0, is_prefix)
                                }
                                QueryKind::Tolerant { typo, word } => {
                                    MatchingWord::new(word, typo, is_prefix)
                                }
                            };
                            matching_words.push((vec![matching_word], ids));
                        }
                    }

                    if !is_last {
                        ngrams(ctx, authorize_typos, tail, matching_words, id + 1)?;
                    }
                }
            }
            // Advance the id past every primitive word of this group.
            id += sub_query.iter().map(|x| x.len() as PrimitiveWordId).sum::<PrimitiveWordId>();
        }

        Ok(())
    }

    let mut matching_words = Vec::new();
    ngrams(ctx, authorize_typos, query, &mut matching_words, 0)?;
    Ok(MatchingWords::new(matching_words))
}
|
|
|
|
|
2021-05-04 19:44:55 +08:00
|
|
|
/// A flat, intermediate representation of the user query, produced from the
/// tokenized input before the query tree is built.
pub type PrimitiveQuery = Vec<PrimitiveQueryPart>;
|
2021-03-03 19:03:31 +08:00
|
|
|
|
|
|
|
/// One element of a `PrimitiveQuery`.
#[derive(Debug, Clone)]
pub enum PrimitiveQueryPart {
    /// A quoted phrase; `None` entries mark removed stop words.
    Phrase(Vec<Option<String>>),
    /// A single word and whether it should match as a prefix.
    Word(String, IsPrefix),
}
|
|
|
|
|
|
|
|
impl PrimitiveQueryPart {
|
|
|
|
fn is_phrase(&self) -> bool {
|
|
|
|
matches!(self, Self::Phrase(_))
|
|
|
|
}
|
|
|
|
|
|
|
|
fn is_prefix(&self) -> bool {
|
|
|
|
matches!(self, Self::Word(_, is_prefix) if *is_prefix)
|
|
|
|
}
|
2022-04-05 00:56:59 +08:00
|
|
|
|
|
|
|
fn len(&self) -> usize {
|
|
|
|
match self {
|
|
|
|
Self::Phrase(words) => words.len(),
|
|
|
|
Self::Word(_, _) => 1,
|
|
|
|
}
|
|
|
|
}
|
2021-03-03 19:03:31 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Create primitive query from tokenized query string,
/// the primitive query is an intermediate state to build the query tree.
///
/// Tracks a `quoted` state to collect phrase words, honours `words_limit`
/// (counted in parts, phrases included), and marks the final word as prefix.
fn create_primitive_query<A>(
    query: ClassifiedTokenIter<A>,
    stop_words: Option<Set<&[u8]>>,
    words_limit: Option<usize>,
) -> PrimitiveQuery
where
    A: AsRef<[u8]>,
{
    let mut primitive_query = Vec::new();
    // Buffer for the words of the phrase currently being collected.
    let mut phrase = Vec::new();
    let mut quoted = false;

    let parts_limit = words_limit.unwrap_or(usize::MAX);

    // Peekable so the last token can be detected and flagged as a prefix.
    let mut peekable = query.peekable();
    while let Some(token) = peekable.next() {
        // early return if word limit is exceeded
        if primitive_query.len() >= parts_limit {
            return primitive_query;
        }

        match token.kind {
            TokenKind::Word | TokenKind::StopWord => {
                // 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote,
                // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word,
                // 3. if the word is the last token of the query we push it as a prefix word.
                if quoted {
                    // Inside a phrase, stop words keep their position as `None`.
                    if stop_words.as_ref().map_or(false, |swords| swords.contains(token.lemma())) {
                        phrase.push(None)
                    } else {
                        phrase.push(Some(token.lemma().to_string()));
                    }
                } else if peekable.peek().is_some() {
                    // Outside a phrase, stop words are simply dropped.
                    if !stop_words.as_ref().map_or(false, |swords| swords.contains(token.lemma())) {
                        primitive_query
                            .push(PrimitiveQueryPart::Word(token.lemma().to_string(), false));
                    }
                } else {
                    primitive_query.push(PrimitiveQueryPart::Word(token.lemma().to_string(), true));
                }
            }
            TokenKind::Separator(separator_kind) => {
                let quote_count = token.lemma().chars().filter(|&s| s == '"').count();
                // swap quoted state if we encounter a double quote
                if quote_count % 2 != 0 {
                    quoted = !quoted;
                }
                // if there is a quote or a hard separator we close the phrase.
                if !phrase.is_empty() && (quote_count > 0 || separator_kind == SeparatorKind::Hard)
                {
                    primitive_query.push(PrimitiveQueryPart::Phrase(mem::take(&mut phrase)));
                }
            }
            _ => (),
        }
    }

    // If a quote is never closed, we consider all of the end of the query as a phrase.
    if !phrase.is_empty() {
        primitive_query.push(PrimitiveQueryPart::Phrase(mem::take(&mut phrase)));
    }

    primitive_query
}
|
|
|
|
|
2021-02-23 22:50:33 +08:00
|
|
|
/// Returns the maximum number of typos that this Operation allows.
|
|
|
|
pub fn maximum_typo(operation: &Operation) -> usize {
|
2021-06-17 00:33:33 +08:00
|
|
|
use Operation::{And, Or, Phrase, Query};
|
2021-02-23 22:50:33 +08:00
|
|
|
match operation {
|
|
|
|
Or(_, ops) => ops.iter().map(maximum_typo).max().unwrap_or(0),
|
2021-06-09 23:28:12 +08:00
|
|
|
And(ops) => ops.iter().map(maximum_typo).sum::<usize>(),
|
2021-02-23 22:50:33 +08:00
|
|
|
Query(q) => q.kind.typo() as usize,
|
2021-06-09 23:28:12 +08:00
|
|
|
// no typo allowed in phrases
|
|
|
|
Phrase(_) => 0,
|
2021-02-23 22:50:33 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-02-23 22:53:24 +08:00
|
|
|
/// Returns the maximum proximity that this Operation allows.
|
|
|
|
pub fn maximum_proximity(operation: &Operation) -> usize {
|
2021-06-17 00:33:33 +08:00
|
|
|
use Operation::{And, Or, Phrase, Query};
|
2021-02-23 22:53:24 +08:00
|
|
|
match operation {
|
|
|
|
Or(_, ops) => ops.iter().map(maximum_proximity).max().unwrap_or(0),
|
2021-02-24 22:36:57 +08:00
|
|
|
And(ops) => {
|
2021-06-17 00:33:33 +08:00
|
|
|
ops.iter().map(maximum_proximity).sum::<usize>() + ops.len().saturating_sub(1) * 7
|
|
|
|
}
|
2021-06-09 23:28:12 +08:00
|
|
|
Query(_) | Phrase(_) => 0,
|
2021-02-23 22:53:24 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-03-03 19:03:31 +08:00
|
|
|
#[cfg(test)]
|
|
|
|
mod test {
|
2021-03-02 18:30:48 +08:00
|
|
|
use std::collections::HashMap;
|
|
|
|
|
2022-06-02 21:47:28 +08:00
|
|
|
use charabia::Tokenize;
|
2021-06-01 17:48:56 +08:00
|
|
|
use maplit::hashmap;
|
2021-06-17 00:33:33 +08:00
|
|
|
use rand::rngs::StdRng;
|
|
|
|
use rand::{Rng, SeedableRng};
|
2021-03-03 19:03:31 +08:00
|
|
|
|
|
|
|
use super::*;
|
2022-04-01 17:21:51 +08:00
|
|
|
use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS};
|
2021-03-02 18:30:48 +08:00
|
|
|
|
2021-03-03 19:03:31 +08:00
|
|
|
// In-memory stand-in for the index, used by the query-tree tests.
#[derive(Debug)]
struct TestContext {
    // user-defined synonyms: maps a word sequence to its equivalent sequences
    synonyms: HashMap<Vec<String>, Vec<Vec<String>>>,
    // fake inverted index: word (or word pair) -> ids of documents containing it
    postings: HashMap<String, RoaringBitmap>,
    // words on which typo tolerance is disabled
    exact_words: Option<fst::Set<Cow<'static, [u8]>>>,
}
|
|
|
|
|
|
|
|
impl TestContext {
|
2022-06-02 21:47:28 +08:00
|
|
|
fn build<A: AsRef<[u8]>>(
|
2021-03-03 19:03:31 +08:00
|
|
|
&self,
|
2022-08-22 23:37:36 +08:00
|
|
|
terms_matching_strategy: TermsMatchingStrategy,
|
2021-03-03 19:03:31 +08:00
|
|
|
authorize_typos: bool,
|
2021-04-14 01:10:58 +08:00
|
|
|
words_limit: Option<usize>,
|
2022-06-02 21:47:28 +08:00
|
|
|
query: ClassifiedTokenIter<A>,
|
2021-06-17 00:33:33 +08:00
|
|
|
) -> Result<Option<(Operation, PrimitiveQuery)>> {
|
2021-04-14 01:10:58 +08:00
|
|
|
let primitive_query = create_primitive_query(query, None, words_limit);
|
2021-03-03 19:03:31 +08:00
|
|
|
if !primitive_query.is_empty() {
|
2022-08-22 23:37:36 +08:00
|
|
|
let qt = create_query_tree(
|
|
|
|
self,
|
|
|
|
terms_matching_strategy,
|
|
|
|
authorize_typos,
|
|
|
|
&primitive_query,
|
|
|
|
)?;
|
2021-05-04 19:44:55 +08:00
|
|
|
Ok(Some((qt, primitive_query)))
|
2021-03-03 19:03:31 +08:00
|
|
|
} else {
|
|
|
|
Ok(None)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl Context for TestContext {
|
|
|
|
fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>> {
|
|
|
|
Ok(self.postings.get(word).cloned())
|
|
|
|
}
|
|
|
|
|
2021-04-10 03:56:20 +08:00
|
|
|
fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
|
|
|
|
let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect();
|
2021-03-03 19:03:31 +08:00
|
|
|
Ok(self.synonyms.get(&words).cloned())
|
|
|
|
}
|
2022-03-21 20:29:59 +08:00
|
|
|
|
|
|
|
fn min_word_len_for_typo(&self) -> heed::Result<(u8, u8)> {
|
2022-04-01 00:42:10 +08:00
|
|
|
Ok((DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS))
|
2022-03-21 20:29:59 +08:00
|
|
|
}
|
2022-03-21 23:25:15 +08:00
|
|
|
|
2022-05-24 18:14:55 +08:00
|
|
|
fn exact_words(&self) -> Option<&fst::Set<Cow<[u8]>>> {
|
|
|
|
self.exact_words.as_ref()
|
2022-03-21 23:25:15 +08:00
|
|
|
}
|
2022-10-13 15:27:50 +08:00
|
|
|
|
|
|
|
fn word_pair_frequency(
|
|
|
|
&self,
|
|
|
|
left_word: &str,
|
|
|
|
right_word: &str,
|
|
|
|
_proximity: u8,
|
|
|
|
) -> heed::Result<Option<u64>> {
|
|
|
|
match self.word_docids(&format!("{} {}", left_word, right_word))? {
|
|
|
|
Some(rb) => Ok(Some(rb.len())),
|
|
|
|
None => Ok(None),
|
|
|
|
}
|
|
|
|
}
|
2021-03-03 19:03:31 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
impl Default for TestContext {
    fn default() -> TestContext {
        // fixed seed so the generated postings are stable across test runs
        let mut rng = StdRng::seed_from_u64(102);
        let rng = &mut rng;

        // builds a sorted bitmap of `len` pseudo-random document ids
        fn random_postings<R: Rng>(rng: &mut R, len: usize) -> RoaringBitmap {
            let mut values = Vec::<u32>::with_capacity(len);
            while values.len() != len {
                values.push(rng.gen());
            }
            values.sort_unstable();

            RoaringBitmap::from_sorted_iter(values.into_iter()).unwrap()
        }

        // empty exact-words set: typo tolerance stays enabled for every word
        let exact_words = fst::SetBuilder::new(Vec::new()).unwrap().into_inner().unwrap();
        let exact_words =
            Some(fst::Set::new(exact_words).unwrap().map_data(Cow::Owned).unwrap());

        TestContext {
            synonyms: hashmap! {
                vec![String::from("hello")] => vec![
                    vec![String::from("hi")],
                    vec![String::from("good"), String::from("morning")],
                ],
                vec![String::from("world")] => vec![
                    vec![String::from("earth")],
                    vec![String::from("nature")],
                ],
                // new york city
                vec![String::from("nyc")] => vec![
                    vec![String::from("new"), String::from("york")],
                    vec![String::from("new"), String::from("york"), String::from("city")],
                ],
                vec![String::from("new"), String::from("york")] => vec![
                    vec![String::from("nyc")],
                    vec![String::from("new"), String::from("york"), String::from("city")],
                ],
                vec![String::from("new"), String::from("york"), String::from("city")] => vec![
                    vec![String::from("nyc")],
                    vec![String::from("new"), String::from("york")],
                ],
            },
            // NOTE(review): multi-word keys ("word split", …) back `word_pair_frequency`
            postings: hashmap! {
                String::from("hello") => random_postings(rng, 1500),
                String::from("hi") => random_postings(rng, 4000),
                String::from("word") => random_postings(rng, 2500),
                String::from("split") => random_postings(rng, 400),
                String::from("ngrams") => random_postings(rng, 1400),
                String::from("world") => random_postings(rng, 15_000),
                String::from("earth") => random_postings(rng, 8000),
                String::from("2021") => random_postings(rng, 100),
                String::from("2020") => random_postings(rng, 500),
                String::from("is") => random_postings(rng, 50_000),
                String::from("this") => random_postings(rng, 50_000),
                String::from("good") => random_postings(rng, 1250),
                String::from("morning") => random_postings(rng, 125),
                String::from("word split") => random_postings(rng, 5000),
                String::from("quick brownfox") => random_postings(rng, 7000),
                String::from("quickbrown fox") => random_postings(rng, 8000),
            },
            exact_words,
        }
    }
}
|
|
|
|
|
|
|
|
// The last word of an unterminated query is matched as a prefix.
#[test]
fn prefix() {
    let query = "hey friends";
    let tokens = query.tokenize();

    let (query_tree, _) = TestContext::default()
        .build(TermsMatchingStrategy::All, true, None, tokens)
        .unwrap()
        .unwrap();

    insta::assert_debug_snapshot!(query_tree, @r###"
    OR
      AND
        Exact { word: "hey" }
        PrefixTolerant { word: "friends", max typo: 1 }
      PrefixTolerant { word: "heyfriends", max typo: 1 }
    "###);
}
|
|
|
|
|
|
|
|
// A trailing separator means the last word is complete: no prefix matching.
#[test]
fn no_prefix() {
    let query = "hey friends ";
    let tokens = query.tokenize();

    let (query_tree, _) = TestContext::default()
        .build(TermsMatchingStrategy::All, true, None, tokens)
        .unwrap()
        .unwrap();

    insta::assert_debug_snapshot!(query_tree, @r###"
    OR
      AND
        Exact { word: "hey" }
        Tolerant { word: "friends", max typo: 1 }
      Tolerant { word: "heyfriends", max typo: 1 }
    "###);
}
|
|
|
|
|
|
|
|
// Single-word synonyms expand each query word into an OR of its alternatives.
#[test]
fn synonyms() {
    let query = "hello world ";
    let tokens = query.tokenize();

    let (query_tree, _) = TestContext::default()
        .build(TermsMatchingStrategy::All, true, None, tokens)
        .unwrap()
        .unwrap();

    insta::assert_debug_snapshot!(query_tree, @r###"
    OR
      AND
        OR
          Exact { word: "hi" }
          AND
            Exact { word: "good" }
            Exact { word: "morning" }
          Tolerant { word: "hello", max typo: 1 }
        OR
          Exact { word: "earth" }
          Exact { word: "nature" }
          Tolerant { word: "world", max typo: 1 }
      Tolerant { word: "helloworld", max typo: 1 }
    "###);
}
|
|
|
|
|
|
|
|
// Multi-word synonyms ("new york" <-> "nyc" <-> "new york city") expand across
// several query words at once.
#[test]
fn complex_synonyms() {
    let query = "new york city ";
    let tokens = query.tokenize();

    let (query_tree, _) = TestContext::default()
        .build(TermsMatchingStrategy::All, true, None, tokens)
        .unwrap()
        .unwrap();

    insta::assert_debug_snapshot!(query_tree, @r###"
    OR
      AND
        Exact { word: "new" }
        OR
          AND
            Exact { word: "york" }
            Exact { word: "city" }
          Tolerant { word: "yorkcity", max typo: 1 }
      AND
        OR
          Exact { word: "nyc" }
          AND
            Exact { word: "new" }
            Exact { word: "york" }
            Exact { word: "city" }
          Tolerant { word: "newyork", max typo: 1 }
        Exact { word: "city" }
      Exact { word: "nyc" }
      AND
        Exact { word: "new" }
        Exact { word: "york" }
      Tolerant { word: "newyorkcity", max typo: 1 }
    "###);
}
|
|
|
|
|
|
|
|
// Adjacent words are also combined into an ngram alternative ("n"+"grams" -> "ngrams").
#[test]
fn ngrams() {
    let query = "n grams ";
    let tokens = query.tokenize();

    let (query_tree, _) = TestContext::default()
        .build(TermsMatchingStrategy::All, true, None, tokens)
        .unwrap()
        .unwrap();

    insta::assert_debug_snapshot!(query_tree, @r###"
    OR
      AND
        Exact { word: "n" }
        Tolerant { word: "grams", max typo: 1 }
      Tolerant { word: "ngrams", max typo: 1 }
    "###);
}
|
|
|
|
|
|
|
|
// A word may be split in two when the pair exists in the index ("wordsplit" -> "word split").
#[test]
fn word_split() {
    let query = "wordsplit fish ";
    let tokens = query.tokenize();

    let (query_tree, _) = TestContext::default()
        .build(TermsMatchingStrategy::All, true, None, tokens)
        .unwrap()
        .unwrap();

    insta::assert_debug_snapshot!(query_tree, @r###"
    OR
      AND
        OR
          PHRASE ["word", "split"]
          Tolerant { word: "wordsplit", max typo: 2 }
        Exact { word: "fish" }
      Tolerant { word: "wordsplitfish", max typo: 1 }
    "###);
}
|
|
|
|
|
2022-10-12 15:48:23 +08:00
|
|
|
// When several split points are possible, the pair with the highest frequency wins
// ("quickbrown fox": 8000 docs beats "quick brownfox": 7000 docs).
#[test]
fn word_split_choose_pair_with_max_freq() {
    let query = "quickbrownfox";
    let tokens = query.tokenize();

    let (query_tree, _) = TestContext::default()
        .build(TermsMatchingStrategy::All, true, None, tokens)
        .unwrap()
        .unwrap();

    insta::assert_debug_snapshot!(query_tree, @r###"
    OR
      PHRASE ["quickbrown", "fox"]
      PrefixTolerant { word: "quickbrownfox", max typo: 2 }
    "###);
}
|
|
|
|
|
2021-03-03 19:03:31 +08:00
|
|
|
// Quoted words form a phrase; an empty quoted span is dropped and an unclosed
// quote still yields its word.
#[test]
fn phrase() {
    let query = "\"hey friends\" \" \" \"wooop";
    let tokens = query.tokenize();

    let (query_tree, _) = TestContext::default()
        .build(TermsMatchingStrategy::All, true, None, tokens)
        .unwrap()
        .unwrap();

    insta::assert_debug_snapshot!(query_tree, @r###"
    AND
      PHRASE ["hey", "friends"]
      Exact { word: "wooop" }
    "###);
}
|
|
|
|
|
2022-09-01 18:02:10 +08:00
|
|
|
// A single-word phrase stays mandatory while the plain word can be made optional.
#[test]
fn phrase_2() {
    // https://github.com/meilisearch/meilisearch/issues/2722
    let query = "coco \"harry\"";
    let tokens = query.tokenize();

    let (query_tree, _) = TestContext::default()
        .build(TermsMatchingStrategy::default(), true, None, tokens)
        .unwrap()
        .unwrap();

    insta::assert_debug_snapshot!(query_tree, @r###"
    OR(WORD)
      Exact { word: "harry" }
      AND
        Exact { word: "coco" }
        Exact { word: "harry" }
    "###);
}
|
|
|
|
|
2021-06-08 23:29:38 +08:00
|
|
|
// A hard separator (".") inside quotes splits the phrase in two.
#[test]
fn phrase_with_hard_separator() {
    let query = "\"hey friends. wooop wooop\"";
    let tokens = query.tokenize();

    let (query_tree, _) = TestContext::default()
        .build(TermsMatchingStrategy::All, true, None, tokens)
        .unwrap()
        .unwrap();

    insta::assert_debug_snapshot!(query_tree, @r###"
    AND
      PHRASE ["hey", "friends"]
      PHRASE ["wooop", "wooop"]
    "###);
}
|
|
|
|
|
2021-03-03 19:03:31 +08:00
|
|
|
// With the default matching strategy, trailing words become optional:
// the OR(WORD) branches keep growing prefixes of the query.
#[test]
fn optional_word() {
    let query = "hey my friend ";
    let tokens = query.tokenize();

    let (query_tree, _) = TestContext::default()
        .build(TermsMatchingStrategy::default(), true, None, tokens)
        .unwrap()
        .unwrap();

    insta::assert_debug_snapshot!(query_tree, @r###"
    OR(WORD)
      Exact { word: "hey" }
      OR
        AND
          Exact { word: "hey" }
          Exact { word: "my" }
        Tolerant { word: "heymy", max typo: 1 }
      OR
        AND
          Exact { word: "hey" }
          OR
            AND
              Exact { word: "my" }
              Tolerant { word: "friend", max typo: 1 }
            Tolerant { word: "myfriend", max typo: 1 }
        AND
          Tolerant { word: "heymy", max typo: 1 }
          Tolerant { word: "friend", max typo: 1 }
        Tolerant { word: "heymyfriend", max typo: 1 }
    "###);
}
|
|
|
|
|
|
|
|
// A phrase alone is never made optional.
#[test]
fn optional_word_phrase() {
    let query = "\"hey my\"";
    let tokens = query.tokenize();

    let (query_tree, _) = TestContext::default()
        .build(TermsMatchingStrategy::default(), true, None, tokens)
        .unwrap()
        .unwrap();

    insta::assert_debug_snapshot!(query_tree, @r###"
    PHRASE ["hey", "my"]
    "###);
}
|
|
|
|
|
|
|
|
// Quoted words stay mandatory in every branch; only the unquoted middle words
// ("my good") become optional.
#[test]
fn optional_word_multiple_phrases() {
    let query = r#""hey" my good "friend""#;
    let tokens = query.tokenize();

    let (query_tree, _) = TestContext::default()
        .build(TermsMatchingStrategy::default(), true, None, tokens)
        .unwrap()
        .unwrap();

    insta::assert_debug_snapshot!(query_tree, @r###"
    OR(WORD)
      AND
        Exact { word: "hey" }
        Exact { word: "friend" }
      AND
        Exact { word: "hey" }
        Exact { word: "my" }
        Exact { word: "friend" }
      AND
        Exact { word: "hey" }
        OR
          AND
            Exact { word: "my" }
            Exact { word: "good" }
          Tolerant { word: "mygood", max typo: 1 }
        Exact { word: "friend" }
    "###);
}
|
|
|
|
|
|
|
|
// With typos disabled, every query word is matched exactly.
#[test]
fn no_typo() {
    let query = "hey friends ";
    let tokens = query.tokenize();

    let (query_tree, _) = TestContext::default()
        .build(TermsMatchingStrategy::All, false, None, tokens)
        .unwrap()
        .unwrap();

    insta::assert_debug_snapshot!(query_tree, @r###"
    OR
      AND
        Exact { word: "hey" }
        Exact { word: "friends" }
      Exact { word: "heyfriends" }
    "###);
}
|
|
|
|
|
2021-04-14 01:10:58 +08:00
|
|
|
// A words limit of 2 truncates the query after the phrase and the next word.
#[test]
fn words_limit() {
    let query = "\"hey my\" good friend";
    let tokens = query.tokenize();

    let (query_tree, _) = TestContext::default()
        .build(TermsMatchingStrategy::All, false, Some(2), tokens)
        .unwrap()
        .unwrap();

    insta::assert_debug_snapshot!(query_tree, @r###"
    AND
      PHRASE ["hey", "my"]
      Exact { word: "good" }
    "###);
}
|
2022-03-31 19:50:18 +08:00
|
|
|
|
|
|
|
// Typo budget depends on word length: < 5 chars -> 0 typos, >= 5 -> 1, >= 7 -> 2.
#[test]
fn test_min_word_len_typo() {
    let exact_words = fst::Set::from_iter([b""]).unwrap().map_data(Cow::Owned).unwrap();
    let config = TypoConfig {
        max_typos: 2,
        word_len_one_typo: 5,
        word_len_two_typo: 7,
        exact_words: Some(&exact_words),
    };

    assert_eq!(
        typos("hello".to_string(), true, config.clone()),
        QueryKind::Tolerant { typo: 1, word: "hello".to_string() }
    );

    assert_eq!(
        typos("hell".to_string(), true, config.clone()),
        QueryKind::exact("hell".to_string())
    );

    assert_eq!(
        typos("verylongword".to_string(), true, config.clone()),
        QueryKind::Tolerant { typo: 2, word: "verylongword".to_string() }
    );
}
|
2022-03-22 16:55:49 +08:00
|
|
|
|
2022-03-22 16:55:49 +08:00
|
|
|
// A word registered in `exact_words` is matched exactly, even with typos enabled.
#[test]
fn disable_typo_on_word() {
    let query = "goodbye";
    let tokens = query.tokenize();

    let exact_words = fst::Set::from_iter(Some("goodbye")).unwrap().into_fst().into_inner();
    let exact_words = Some(fst::Set::new(exact_words).unwrap().map_data(Cow::Owned).unwrap());
    let context = TestContext { exact_words, ..Default::default() };
    let (query_tree, _) =
        context.build(TermsMatchingStrategy::All, true, Some(2), tokens).unwrap().unwrap();

    assert!(matches!(
        query_tree,
        Operation::Query(Query { prefix: true, kind: QueryKind::Exact { .. } })
    ));
}
|
2021-03-03 19:03:31 +08:00
|
|
|
}
|