Merge pull request #97 from meilisearch/criteria

Introduce all the criteria
This commit is contained in:
Clément Renault 2021-03-03 18:24:22 +01:00 committed by GitHub
commit 2924ed31f3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
16 changed files with 1967 additions and 784 deletions

1
Cargo.lock generated
View File

@ -866,6 +866,7 @@ dependencies = [
"anyhow", "anyhow",
"byte-unit", "byte-unit",
"heed", "heed",
"jemallocator",
"milli", "milli",
"stderrlog", "stderrlog",
"structopt", "structopt",

View File

@ -32,7 +32,7 @@ use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
use milli::facet::FacetValue; use milli::facet::FacetValue;
use milli::update::UpdateIndexingStep::*; use milli::update::UpdateIndexingStep::*;
use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat}; use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat};
use milli::{obkv_to_json, Index, UpdateStore, SearchResult, FacetCondition}; use milli::{obkv_to_json, Index, UpdateStore, SearchResult, MatchingWords, FacetCondition};
static GLOBAL_THREAD_POOL: OnceCell<ThreadPool> = OnceCell::new(); static GLOBAL_THREAD_POOL: OnceCell<ThreadPool> = OnceCell::new();
@ -132,7 +132,7 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
Self { analyzer } Self { analyzer }
} }
fn highlight_value(&self, value: Value, words_to_highlight: &HashSet<String>) -> Value { fn highlight_value(&self, value: Value, matching_words: &MatchingWords) -> Value {
match value { match value {
Value::Null => Value::Null, Value::Null => Value::Null,
Value::Bool(boolean) => Value::Bool(boolean), Value::Bool(boolean) => Value::Bool(boolean),
@ -142,7 +142,7 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
let analyzed = self.analyzer.analyze(&old_string); let analyzed = self.analyzer.analyze(&old_string);
for (word, token) in analyzed.reconstruct() { for (word, token) in analyzed.reconstruct() {
if token.is_word() { if token.is_word() {
let to_highlight = words_to_highlight.contains(token.text()); let to_highlight = matching_words.matches(token.text());
if to_highlight { string.push_str("<mark>") } if to_highlight { string.push_str("<mark>") }
string.push_str(word); string.push_str(word);
if to_highlight { string.push_str("</mark>") } if to_highlight { string.push_str("</mark>") }
@ -154,12 +154,12 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
}, },
Value::Array(values) => { Value::Array(values) => {
Value::Array(values.into_iter() Value::Array(values.into_iter()
.map(|v| self.highlight_value(v, words_to_highlight)) .map(|v| self.highlight_value(v, matching_words))
.collect()) .collect())
}, },
Value::Object(object) => { Value::Object(object) => {
Value::Object(object.into_iter() Value::Object(object.into_iter()
.map(|(k, v)| (k, self.highlight_value(v, words_to_highlight))) .map(|(k, v)| (k, self.highlight_value(v, matching_words)))
.collect()) .collect())
}, },
} }
@ -168,14 +168,14 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
fn highlight_record( fn highlight_record(
&self, &self,
object: &mut Map<String, Value>, object: &mut Map<String, Value>,
words_to_highlight: &HashSet<String>, matching_words: &MatchingWords,
attributes_to_highlight: &HashSet<String>, attributes_to_highlight: &HashSet<String>,
) { ) {
// TODO do we need to create a string for element that are not and needs to be highlight? // TODO do we need to create a string for element that are not and needs to be highlight?
for (key, value) in object.iter_mut() { for (key, value) in object.iter_mut() {
if attributes_to_highlight.contains(key) { if attributes_to_highlight.contains(key) {
let old_value = mem::take(value); let old_value = mem::take(value);
*value = self.highlight_value(old_value, words_to_highlight); *value = self.highlight_value(old_value, matching_words);
} }
} }
} }
@ -722,7 +722,7 @@ async fn main() -> anyhow::Result<()> {
search.facet_condition(condition); search.facet_condition(condition);
} }
let SearchResult { found_words, candidates, documents_ids } = search.execute().unwrap(); let SearchResult { matching_words, candidates, documents_ids } = search.execute().unwrap();
let number_of_candidates = candidates.len(); let number_of_candidates = candidates.len();
let facets = if query.facet_distribution == Some(true) { let facets = if query.facet_distribution == Some(true) {
@ -748,7 +748,7 @@ async fn main() -> anyhow::Result<()> {
for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() { for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() {
let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap(); let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap();
if !disable_highlighting { if !disable_highlighting {
highlighter.highlight_record(&mut object, &found_words, &attributes_to_highlight); highlighter.highlight_record(&mut object, &matching_words, &attributes_to_highlight);
} }
documents.push(object); documents.push(object);

View File

@ -598,7 +598,7 @@ fn export_documents(index: &Index, rtxn: &heed::RoTxn, internal_ids: Vec<u32>) -
let fields_ids_map = index.fields_ids_map(rtxn)?; let fields_ids_map = index.fields_ids_map(rtxn)?;
let displayed_fields: Vec<_> = fields_ids_map.iter().map(|(id, _name)| id).collect(); let displayed_fields: Vec<_> = fields_ids_map.iter().map(|(id, _name)| id).collect();
let iter: Box<Iterator<Item = _>> = if internal_ids.is_empty() { let iter: Box<dyn Iterator<Item = _>> = if internal_ids.is_empty() {
Box::new(index.documents.iter(rtxn)?.map(|result| { Box::new(index.documents.iter(rtxn)?.map(|result| {
result.map(|(_id, obkv)| obkv) result.map(|(_id, obkv)| obkv)
})) }))

View File

@ -3,8 +3,6 @@
mod criterion; mod criterion;
mod external_documents_ids; mod external_documents_ids;
mod fields_ids_map; mod fields_ids_map;
mod mdfs;
mod query_tokens;
mod search; mod search;
mod update_store; mod update_store;
pub mod facet; pub mod facet;
@ -28,7 +26,7 @@ pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, ObkvCodec};
pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec}; pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec};
pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec}; pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec};
pub use self::index::Index; pub use self::index::Index;
pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult}; pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult, MatchingWords};
pub use self::update_store::UpdateStore; pub use self::update_store::UpdateStore;
pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>; pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;

View File

@ -1,163 +0,0 @@
use std::collections::hash_map::Entry::{Occupied, Vacant};
use std::collections::HashMap;
use std::mem;
use roaring::RoaringBitmap;
use crate::Index;
/// A mana depth first search implementation.
///
/// Iterating over it yields `(proximity, docids)` pairs, where `proximity` is the
/// amount of mana that was being spent when `docids` were found.
pub struct Mdfs<'a> {
/// Index whose `word_pair_proximity_docids` database is queried.
index: &'a Index,
/// Read transaction used for every database lookup.
rtxn: &'a heed::RoTxn<'a>,
/// One entry per query word position: a map from word to `(u8, docids)` plus a bitmap.
/// NOTE(review): the `u8` looks like a typo count and the trailing bitmap like the
/// union of the docids; neither is read by the iterator itself, confirm with the caller.
words: &'a [(HashMap<String, (u8, RoaringBitmap)>, RoaringBitmap)],
/// Cache of pair-proximity unions, keyed by `(number of remaining words, proximity)`.
union_cache: HashMap<(usize, u8), RoaringBitmap>,
/// Documents that have not been returned yet.
candidates: RoaringBitmap,
/// Amount of mana tried by the next step; incremented after every successful step.
mana: u32,
/// Upper bound for `mana`: eight units per pair of words.
max_mana: u32,
}
impl<'a> Mdfs<'a> {
    /// Creates a search over `words`, restricted to `candidates`.
    ///
    /// The starting mana is the number of adjacent word pairs (one unit per pair)
    /// and the ceiling is eight units per pair.
    pub fn new(
        index: &'a Index,
        rtxn: &'a heed::RoTxn,
        words: &'a [(HashMap<String, (u8, RoaringBitmap)>, RoaringBitmap)],
        candidates: RoaringBitmap,
    ) -> Mdfs<'a>
    {
        // A list of n words contains n - 1 pairs (windows); zero when the list is empty.
        let pairs = words.len().saturating_sub(1) as u32;

        Mdfs {
            index,
            rtxn,
            words,
            union_cache: HashMap::new(),
            candidates,
            mana: pairs,
            max_mana: pairs * 8,
        }
    }
}
impl<'a> Iterator for Mdfs<'a> {
    type Item = anyhow::Result<(u32, RoaringBitmap)>;

    /// Returns the next non-empty group of documents along with the mana
    /// (total proximity) that was spent to find it.
    fn next(&mut self) -> Option<Self::Item> {
        // With zero or one word there is no pair to evaluate: the only thing
        // we can return is the whole set of candidates, once.
        if self.words.len() <= 1 {
            return if self.candidates.is_empty() {
                None
            } else {
                Some(Ok((0, mem::take(&mut self.candidates))))
            };
        }

        while self.mana <= self.max_mana {
            let mut found = RoaringBitmap::new();
            let outcome = mdfs_step(
                &self.index,
                &self.rtxn,
                self.mana,
                self.words,
                &self.candidates,
                &self.candidates,
                &mut self.union_cache,
                &mut found,
            );

            // An error is reported as-is, the mana is left untouched.
            if let Err(error) = outcome {
                return Some(Err(error));
            }

            // The mana is always increased for the next round.
            let proximity = self.mana;
            self.mana += 1;

            // Nothing found with this amount of mana: try again with more.
            if found.is_empty() {
                continue;
            }

            // The returned documents must never be searched for again.
            self.candidates.difference_with(&found);
            return Some(Ok((proximity, found)));
        }

        None
    }
}
/// Performs one depth-first step: spends part of `mana` on the first pair of `words`
/// and recurses on the remaining words with what is left.
///
/// - `mana`: total proximity that this pair and all of its children must consume.
/// - `words`: the remaining words; the first two form the pair evaluated here
///   (indexing `words[1]` panics if fewer than two words are given).
/// - `candidates`: used as the docids of a pair when the proximity is 8.
/// - `parent_docids`: documents accepted so far; results are intersected with them.
/// - `union_cache`: unions already computed, keyed by `(words.len(), proximity)`.
/// - `answer`: receives the documents found by the last pair of the chain.
///
/// Returns an error only when a database lookup fails.
fn mdfs_step(
index: &Index,
rtxn: &heed::RoTxn,
mana: u32,
words: &[(HashMap<String, (u8, RoaringBitmap)>, RoaringBitmap)],
candidates: &RoaringBitmap,
parent_docids: &RoaringBitmap,
union_cache: &mut HashMap<(usize, u8), RoaringBitmap>,
answer: &mut RoaringBitmap,
) -> anyhow::Result<()>
{
use std::cmp::{min, max};
let (words1, words2) = (&words[0].0, &words[1].0);
let pairs = words_pair_combinations(words1, words2);
let tail = &words[1..];
// Number of pairs that remain after this one.
let nb_children = tail.len() as u32 - 1;
// The minimum amount of mana that you must consume is at least 1 and the
// amount of mana that your children can consume. Because the last child must
// consume the remaining mana, it is mandatory that there not too much at the end.
let min_proximity = max(1, mana.saturating_sub(nb_children * 8)) as u8;
// The maximum amount of mana that you can use is 8 or the remaining amount of
// mana minus your children, as you can't just consume all the mana,
// your children must have at least 1 mana.
let max_proximity = min(8, mana - nb_children) as u8;
for proximity in min_proximity..=max_proximity {
// The cached value is the raw union, before the intersection with `parent_docids`.
let mut docids = match union_cache.entry((words.len(), proximity)) {
Occupied(entry) => entry.get().clone(),
Vacant(entry) => {
let mut docids = RoaringBitmap::new();
if proximity == 8 {
// A proximity of 8 does not query the database: every candidate is accepted.
docids = candidates.clone();
} else {
for (w1, w2) in pairs.iter().cloned() {
let key = (w1, w2, proximity);
if let Some(di) = index.word_pair_proximity_docids.get(rtxn, &key)? {
docids.union_with(&di);
}
}
}
entry.insert(docids).clone()
}
};
// We must be sure that we only return docids that are present in the candidates.
docids.intersect_with(parent_docids);
if !docids.is_empty() {
// `proximity` never exceeds `mana` given the bounds above, so this cannot underflow.
let mana = mana.checked_sub(proximity as u32).unwrap();
if tail.len() < 2 {
// We are the last pair, we return without recuring as we don't have any child.
answer.union_with(&docids);
return Ok(());
} else {
// NOTE: this returns the result of the first proximity that matched,
// larger proximities of this pair are not explored afterwards.
return mdfs_step(index, rtxn, mana, tail, candidates, &docids, union_cache, answer);
}
}
}
Ok(())
}
/// Lists every `(left, right)` combination of words whose documents ids overlap,
/// i.e. the pairs of words that appear together in at least one document.
fn words_pair_combinations<'h>(
    w1: &'h HashMap<String, (u8, RoaringBitmap)>,
    w2: &'h HashMap<String, (u8, RoaringBitmap)>,
) -> Vec<(&'h str, &'h str)>
{
    w1.iter()
        .flat_map(|(left, (_, left_docids))| {
            w2.iter()
                // Words that share no document cannot form a useful pair.
                .filter(move |(_, (_, right_docids))| !left_docids.is_disjoint(right_docids))
                .map(move |(right, _)| (left.as_str(), right.as_str()))
        })
        .collect()
}

View File

@ -1,217 +0,0 @@
use meilisearch_tokenizer::{Token, TokenKind};
/// Tracks whether the tokens currently read are inside or outside double quotes.
#[derive(Debug)]
enum State {
    Free,
    Quoted,
}

impl State {
    /// Toggles between the `Free` and `Quoted` states.
    fn swap(&mut self) {
        *self = match self {
            State::Quoted => State::Free,
            State::Free => State::Quoted,
        };
    }
}
/// A word token of the query, tagged with whether it was found
/// outside (`Free`) or inside (`Quoted`) a pair of double quotes.
#[derive(Debug, PartialEq, Eq)]
pub enum QueryToken<'a> {
/// A word that is not enclosed in double quotes.
Free(Token<'a>),
/// A word read after an opening double quote (the closing quote may be missing).
Quoted(Token<'a>),
}
/// Turns a stream of tokens into a stream of word tokens tagged as quoted or free.
///
/// Every token whose trimmed text is a double quote toggles the quoting state
/// and is dropped; tokens that are not words are dropped as well.
pub fn query_tokens<'a>(mut tokens: impl Iterator<Item = Token<'a>>) -> impl Iterator<Item = QueryToken<'a>> {
    let mut state = State::Free;

    std::iter::from_fn(move || {
        for token in tokens.by_ref() {
            if token.text().trim() == "\"" {
                // The quote check comes first, whatever the kind of the token is.
                state.swap();
            } else if matches!(token.kind(), TokenKind::Word) {
                return Some(match state {
                    State::Quoted => QueryToken::Quoted(token),
                    State::Free => QueryToken::Free(token),
                });
            }
        }

        None
    })
}
// Unit tests of `query_tokens`: each test analyzes a raw query with the default
// analyzer (no stop words) and checks the quoted/free classification of the words.
#[cfg(test)]
mod tests {
use super::*;
use QueryToken::{Quoted, Free};
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
use fst::Set;
// Asserts that a `QueryToken` is of the expected variant and carries the expected text.
macro_rules! assert_eq_query_token {
($test:expr, Quoted($val:literal)) => {
match $test {
Quoted(val) => assert_eq!(val.text(), $val),
Free(val) => panic!("expected Quoted(\"{}\"), found Free(\"{}\")", $val, val.text()),
}
};
($test:expr, Free($val:literal)) => {
match $test {
Quoted(val) => panic!("expected Free(\"{}\"), found Quoted(\"{}\")", $val, val.text()),
Free(val) => assert_eq!(val.text(), $val),
}
};
}
// Empty and whitespace-only queries produce no token.
#[test]
fn empty() {
let stop_words = Set::default();
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
let query = "";
let analyzed = analyzer.analyze(query);
let tokens = analyzed.tokens();
let mut iter = query_tokens(tokens);
assert!(iter.next().is_none());
let query = " ";
let analyzed = analyzer.analyze(query);
let tokens = analyzed.tokens();
let mut iter = query_tokens(tokens);
assert!(iter.next().is_none());
}
// A single word between quotes is quoted.
#[test]
fn one_quoted_string() {
let stop_words = Set::default();
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
let query = "\"hello\"";
let analyzed = analyzer.analyze(query);
let tokens = analyzed.tokens();
let mut iter = query_tokens(tokens);
assert_eq_query_token!(iter.next().unwrap(), Quoted("hello"));
assert!(iter.next().is_none());
}
// An opening quote without its closing counterpart still quotes the word.
#[test]
fn one_pending_quoted_string() {
let stop_words = Set::default();
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
let query = "\"hello";
let analyzed = analyzer.analyze(query);
let tokens = analyzed.tokens();
let mut iter = query_tokens(tokens);
assert_eq_query_token!(iter.next().unwrap(), Quoted("hello"));
assert!(iter.next().is_none());
}
// A single word without quotes is free.
#[test]
fn one_non_quoted_string() {
let stop_words = Set::default();
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
let query = "hello";
let analyzed = analyzer.analyze(query);
let tokens = analyzed.tokens();
let mut iter = query_tokens(tokens);
assert_eq_query_token!(iter.next().unwrap(), Free("hello"));
assert!(iter.next().is_none());
}
// The closing quote ends the quoted part even without a following space.
#[test]
fn quoted_directly_followed_by_free_strings() {
let stop_words = Set::default();
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
let query = "\"hello\"world";
let analyzed = analyzer.analyze(query);
let tokens = analyzed.tokens();
let mut iter = query_tokens(tokens);
assert_eq_query_token!(iter.next().unwrap(), Quoted("hello"));
assert_eq_query_token!(iter.next().unwrap(), Free("world"));
assert!(iter.next().is_none());
}
// The opening quote starts the quoted part even without a preceding space.
#[test]
fn free_directly_followed_by_quoted_strings() {
let stop_words = Set::default();
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
let query = "hello\"world\"";
let analyzed = analyzer.analyze(query);
let tokens = analyzed.tokens();
let mut iter = query_tokens(tokens);
assert_eq_query_token!(iter.next().unwrap(), Free("hello"));
assert_eq_query_token!(iter.next().unwrap(), Quoted("world"));
assert!(iter.next().is_none());
}
// Same as above with a space between the free and the quoted word.
#[test]
fn free_followed_by_quoted_strings() {
let stop_words = Set::default();
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
let query = "hello \"world\"";
let analyzed = analyzer.analyze(query);
let tokens = analyzed.tokens();
let mut iter = query_tokens(tokens);
assert_eq_query_token!(iter.next().unwrap(), Free("hello"));
assert_eq_query_token!(iter.next().unwrap(), Quoted("world"));
assert!(iter.next().is_none());
}
// Separators between and after the words are not returned.
#[test]
fn multiple_spaces_separated_strings() {
let stop_words = Set::default();
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
let query = "hello world ";
let analyzed = analyzer.analyze(query);
let tokens = analyzed.tokens();
let mut iter = query_tokens(tokens);
assert_eq_query_token!(iter.next().unwrap(), Free("hello"));
assert_eq_query_token!(iter.next().unwrap(), Free("world"));
assert!(iter.next().is_none());
}
// The state toggles back and forth over several quoted sections.
#[test]
fn multi_interleaved_quoted_free_strings() {
let stop_words = Set::default();
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
let query = "hello \"world\" coucou \"monde\"";
let analyzed = analyzer.analyze(query);
let tokens = analyzed.tokens();
let mut iter = query_tokens(tokens);
assert_eq_query_token!(iter.next().unwrap(), Free("hello"));
assert_eq_query_token!(iter.next().unwrap(), Quoted("world"));
assert_eq_query_token!(iter.next().unwrap(), Free("coucou"));
assert_eq_query_token!(iter.next().unwrap(), Quoted("monde"));
assert!(iter.next().is_none());
}
// Every word of a multi-word quoted section is quoted.
#[test]
fn multi_quoted_strings() {
let stop_words = Set::default();
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
let query = "\"hello world\" coucou \"monde est beau\"";
let analyzed = analyzer.analyze(query);
let tokens = analyzed.tokens();
let mut iter = query_tokens(tokens);
assert_eq_query_token!(iter.next().unwrap(), Quoted("hello"));
assert_eq_query_token!(iter.next().unwrap(), Quoted("world"));
assert_eq_query_token!(iter.next().unwrap(), Free("coucou"));
assert_eq_query_token!(iter.next().unwrap(), Quoted("monde"));
assert_eq_query_token!(iter.next().unwrap(), Quoted("est"));
assert_eq_query_token!(iter.next().unwrap(), Quoted("beau"));
assert!(iter.next().is_none());
}
// Words of a script without spaces are segmented by the analyzer before classification.
#[test]
fn chinese() {
let stop_words = Set::default();
let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
let query = "汽车男生";
let analyzed = analyzer.analyze(query);
let tokens = analyzed.tokens();
let mut iter = query_tokens(tokens);
assert_eq_query_token!(iter.next().unwrap(), Free("汽车"));
assert_eq_query_token!(iter.next().unwrap(), Free("男生"));
assert!(iter.next().is_none());
}
}

View File

@ -0,0 +1,282 @@
use std::collections::HashMap;
use std::mem::take;
use anyhow::bail;
use itertools::Itertools;
use log::debug;
use ordered_float::OrderedFloat;
use roaring::RoaringBitmap;
use crate::facet::FacetType;
use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetLevelValueI64Codec};
use crate::heed_codec::facet::{FieldDocIdFacetI64Codec, FieldDocIdFacetF64Codec};
use crate::search::criteria::{resolve_query_tree, CriteriaBuilder};
use crate::search::facet::FacetIter;
use crate::search::query_tree::Operation;
use crate::{FieldId, Index};
use super::{Criterion, CriterionResult};
/// A criterion that returns documents bucket by bucket, ordered by the value
/// of a numeric faceted field, either ascending or descending.
pub struct AscDesc<'t> {
/// Index that stores the facet values.
index: &'t Index,
/// Read transaction used for the lookups.
rtxn: &'t heed::RoTxn<'t>,
/// Identifier of the field used to order the documents.
field_id: FieldId,
/// Type of the facet values; `facet_ordered` rejects `FacetType::String`.
facet_type: FacetType,
/// `true` to return the smallest values first, `false` for the largest first.
ascending: bool,
/// Query tree forwarded, untouched, in every returned `CriterionResult`.
query_tree: Option<Operation>,
/// Documents that still have to be ordered and returned.
candidates: RoaringBitmap,
/// Bucket candidates received from the parent, forwarded with the next result.
bucket_candidates: RoaringBitmap,
/// Every document that has a value for `field_id`.
faceted_candidates: RoaringBitmap,
/// Criterion that feeds this one, `None` when this is the first of the chain.
parent: Option<Box<dyn Criterion + 't>>,
}
impl<'t> AscDesc<'t> {
pub fn initial_asc(
index: &'t Index,
rtxn: &'t heed::RoTxn,
query_tree: Option<Operation>,
candidates: Option<RoaringBitmap>,
field_id: FieldId,
facet_type: FacetType,
) -> anyhow::Result<Self>
{
Self::initial(index, rtxn, query_tree, candidates, field_id, facet_type, true)
}
pub fn initial_desc(
index: &'t Index,
rtxn: &'t heed::RoTxn,
query_tree: Option<Operation>,
candidates: Option<RoaringBitmap>,
field_id: FieldId,
facet_type: FacetType,
) -> anyhow::Result<Self>
{
Self::initial(index, rtxn, query_tree, candidates, field_id, facet_type, false)
}
pub fn asc(
index: &'t Index,
rtxn: &'t heed::RoTxn,
parent: Box<dyn Criterion + 't>,
field_id: FieldId,
facet_type: FacetType,
) -> anyhow::Result<Self>
{
Self::new(index, rtxn, parent, field_id, facet_type, true)
}
pub fn desc(
index: &'t Index,
rtxn: &'t heed::RoTxn,
parent: Box<dyn Criterion + 't>,
field_id: FieldId,
facet_type: FacetType,
) -> anyhow::Result<Self>
{
Self::new(index, rtxn, parent, field_id, facet_type, false)
}
fn initial(
index: &'t Index,
rtxn: &'t heed::RoTxn,
query_tree: Option<Operation>,
candidates: Option<RoaringBitmap>,
field_id: FieldId,
facet_type: FacetType,
ascending: bool,
) -> anyhow::Result<Self>
{
let faceted_candidates = index.faceted_documents_ids(rtxn, field_id)?;
let candidates = match &query_tree {
Some(qt) => {
let context = CriteriaBuilder::new(rtxn, index)?;
let mut qt_candidates = resolve_query_tree(&context, qt, &mut HashMap::new())?;
if let Some(candidates) = candidates {
qt_candidates.intersect_with(&candidates);
}
qt_candidates
},
None => candidates.unwrap_or(faceted_candidates.clone()),
};
Ok(AscDesc {
index,
rtxn,
field_id,
facet_type,
ascending,
query_tree,
candidates,
faceted_candidates,
bucket_candidates: RoaringBitmap::new(),
parent: None,
})
}
fn new(
index: &'t Index,
rtxn: &'t heed::RoTxn,
parent: Box<dyn Criterion + 't>,
field_id: FieldId,
facet_type: FacetType,
ascending: bool,
) -> anyhow::Result<Self>
{
Ok(AscDesc {
index,
rtxn,
field_id,
facet_type,
ascending,
query_tree: None,
candidates: RoaringBitmap::new(),
faceted_candidates: index.faceted_documents_ids(rtxn, field_id)?,
bucket_candidates: RoaringBitmap::new(),
parent: Some(parent),
})
}
}
impl<'t> Criterion for AscDesc<'t> {
/// Returns the next bucket of documents: the candidates sharing the best
/// (smallest or largest) facet value, or `None` once everything was returned.
fn next(&mut self) -> anyhow::Result<Option<CriterionResult>> {
// Every arm below returns, this loop never runs more than once per call.
loop {
debug!("Facet {} iteration ({:?})",
if self.ascending { "Asc" } else { "Desc" }, self.candidates,
);
match &mut self.candidates {
// No candidate left to order: flush the current state and refill from the parent.
candidates if candidates.is_empty() => {
// The values taken here are those of the exhausted round,
// `candidates` is therefore an empty bitmap.
let query_tree = self.query_tree.take();
let candidates = take(&mut self.candidates);
let bucket_candidates = take(&mut self.bucket_candidates);
match self.parent.as_mut() {
Some(parent) => {
match parent.next()? {
Some(CriterionResult { query_tree, mut candidates, bucket_candidates }) => {
// Store the parent's result for the following calls,
// only documents that have a facet value can be ordered.
self.query_tree = query_tree;
candidates.intersect_with(&self.faceted_candidates);
self.candidates = candidates;
self.bucket_candidates = bucket_candidates;
},
None => return Ok(None),
}
},
// Without parent there is nothing to refill from: stop once
// there is no query tree nor bucket candidates left to report.
None => if query_tree.is_none() && bucket_candidates.is_empty() {
return Ok(None)
},
}
// Report the values taken above, not the ones just received from the parent.
return Ok(Some(CriterionResult { query_tree, candidates, bucket_candidates }));
},
// Some candidates remain: return those with the best facet value.
candidates => {
// With a parent its bucket candidates are forwarded once,
// otherwise the current candidates are used.
let bucket_candidates = match self.parent {
Some(_) => take(&mut self.bucket_candidates),
None => candidates.clone(),
};
let found_candidates = facet_ordered(
self.index,
self.rtxn,
self.field_id,
self.facet_type,
self.ascending,
candidates.clone(),
)?;
// The returned documents must not be considered by the following calls.
candidates.difference_with(&found_candidates);
return Ok(Some(CriterionResult {
query_tree: self.query_tree.clone(),
candidates: found_candidates,
bucket_candidates,
}));
},
}
}
}
}
fn facet_ordered(
index: &Index,
rtxn: &heed::RoTxn,
field_id: FieldId,
facet_type: FacetType,
ascending: bool,
candidates: RoaringBitmap,
) -> anyhow::Result<RoaringBitmap>
{
match facet_type {
FacetType::Float => {
if candidates.len() <= 1000 {
let db = index.field_id_docid_facet_values.remap_key_type::<FieldDocIdFacetF64Codec>();
let mut docids_values = Vec::with_capacity(candidates.len() as usize);
for docid in candidates.iter() {
let left = (field_id, docid, f64::MIN);
let right = (field_id, docid, f64::MAX);
let mut iter = db.range(rtxn, &(left..=right))?;
let entry = if ascending { iter.next() } else { iter.last() };
if let Some(((_, _, value), ())) = entry.transpose()? {
docids_values.push((docid, OrderedFloat(value)));
}
}
docids_values.sort_unstable_by_key(|(_, value)| *value);
let iter = docids_values.into_iter();
let iter = if ascending {
Box::new(iter) as Box<dyn Iterator<Item = _>>
} else {
Box::new(iter.rev())
};
match iter.group_by(|(_, v)| *v).into_iter().next() {
Some((_, ids)) => Ok(ids.map(|(id, _)| id).into_iter().collect()),
None => Ok(RoaringBitmap::new())
}
} else {
let facet_fn = if ascending {
FacetIter::<f64, FacetLevelValueF64Codec>::new_reducing
} else {
FacetIter::<f64, FacetLevelValueF64Codec>::new_reverse_reducing
};
let mut iter = facet_fn(rtxn, index, field_id, candidates)?;
Ok(iter.next().transpose()?.map(|(_, docids)| docids).unwrap_or_default())
}
},
FacetType::Integer => {
if candidates.len() <= 1000 {
let db = index.field_id_docid_facet_values.remap_key_type::<FieldDocIdFacetI64Codec>();
let mut docids_values = Vec::with_capacity(candidates.len() as usize);
for docid in candidates.iter() {
let left = (field_id, docid, i64::MIN);
let right = (field_id, docid, i64::MAX);
let mut iter = db.range(rtxn, &(left..=right))?;
let entry = if ascending { iter.next() } else { iter.last() };
if let Some(((_, _, value), ())) = entry.transpose()? {
docids_values.push((docid, value));
}
}
docids_values.sort_unstable_by_key(|(_, value)| *value);
let iter = docids_values.into_iter();
let iter = if ascending {
Box::new(iter) as Box<dyn Iterator<Item = _>>
} else {
Box::new(iter.rev())
};
match iter.group_by(|(_, v)| *v).into_iter().next() {
Some((_, ids)) => Ok(ids.map(|(id, _)| id).into_iter().collect()),
None => Ok(RoaringBitmap::new())
}
} else {
let facet_fn = if ascending {
FacetIter::<i64, FacetLevelValueI64Codec>::new_reducing
} else {
FacetIter::<i64, FacetLevelValueI64Codec>::new_reverse_reducing
};
let mut iter = facet_fn(rtxn, index, field_id, candidates)?;
Ok(iter.next().transpose()?.map(|(_, docids)| docids).unwrap_or_default())
}
},
FacetType::String => bail!("criteria facet type must be a number"),
}
}

View File

@ -0,0 +1,113 @@
use std::collections::HashMap;
use std::mem::take;
use log::debug;
use roaring::RoaringBitmap;
use crate::search::query_tree::Operation;
use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context};
/// The last criterion of a chain: it forwards the results of its parent and,
/// when there is nothing to forward, resolves the documents ids by itself.
pub struct Fetcher<'t> {
/// Access to the databases needed to resolve a query tree.
ctx: &'t dyn Context,
/// Query tree to resolve when this criterion must fetch the documents itself.
query_tree: Option<Operation>,
/// Candidates given at creation (`Allowed`), `Forbidden` and empty otherwise.
candidates: Candidates,
/// Criterion whose results are forwarded, `None` when there is no other criterion.
parent: Option<Box<dyn Criterion + 't>>,
/// `true` until the first call to `next`, which resets it to `false`.
should_get_documents_ids: bool,
}
impl<'t> Fetcher<'t> {
    /// Creates a fetcher that is the only criterion of the chain.
    ///
    /// When `candidates` is given, the returned documents are restricted to it.
    pub fn initial(
        ctx: &'t dyn Context,
        query_tree: Option<Operation>,
        candidates: Option<RoaringBitmap>,
    ) -> Self
    {
        let candidates = match candidates {
            Some(allowed) => Candidates::Allowed(allowed),
            None => Candidates::default(),
        };

        Self {
            ctx,
            query_tree,
            candidates,
            parent: None,
            should_get_documents_ids: true,
        }
    }

    /// Creates a fetcher that forwards the results of `parent`.
    pub fn new(
        ctx: &'t dyn Context,
        parent: Box<dyn Criterion + 't>,
    ) -> Self
    {
        Self {
            ctx,
            query_tree: None,
            candidates: Candidates::default(),
            parent: Some(parent),
            should_get_documents_ids: true,
        }
    }
}
impl<'t> Criterion for Fetcher<'t> {
fn next(&mut self) -> anyhow::Result<Option<CriterionResult>> {
use Candidates::{Allowed, Forbidden};
loop {
debug!("Fetcher iteration (should_get_documents_ids: {}) ({:?})",
self.should_get_documents_ids, self.candidates,
);
let should_get_documents_ids = take(&mut self.should_get_documents_ids);
match &mut self.candidates {
Allowed(_) => {
let candidates = take(&mut self.candidates).into_inner();
let candidates = match &self.query_tree {
Some(qt) if should_get_documents_ids => {
let mut docids = resolve_query_tree(self.ctx, &qt, &mut HashMap::new())?;
docids.intersect_with(&candidates);
docids
},
_ => candidates,
};
return Ok(Some(CriterionResult {
query_tree: self.query_tree.take(),
candidates: candidates.clone(),
bucket_candidates: candidates,
}));
},
Forbidden(_) => {
match self.parent.as_mut() {
Some(parent) => {
match parent.next()? {
Some(result) => return Ok(Some(result)),
None => if should_get_documents_ids {
let candidates = match &self.query_tree {
Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new())?,
None => self.ctx.documents_ids()?,
};
return Ok(Some(CriterionResult {
query_tree: self.query_tree.clone(),
candidates: candidates.clone(),
bucket_candidates: candidates,
}));
},
}
},
None => if should_get_documents_ids {
let candidates = match &self.query_tree {
Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new())?,
None => self.ctx.documents_ids()?,
};
return Ok(Some(CriterionResult {
query_tree: self.query_tree.clone(),
candidates: candidates.clone(),
bucket_candidates: candidates,
}));
},
}
return Ok(None);
},
}
}
}
}

View File

@ -0,0 +1,483 @@
use std::collections::HashMap;
use std::borrow::Cow;
use anyhow::{bail, Context as _};
use roaring::RoaringBitmap;
use crate::facet::FacetType;
use crate::search::word_derivations;
use crate::{Index, FieldId};
use super::query_tree::{Operation, Query, QueryKind};
use self::typo::Typo;
use self::words::Words;
use self::asc_desc::AscDesc;
use self::proximity::Proximity;
use self::fetcher::Fetcher;
pub mod typo;
pub mod words;
pub mod asc_desc;
pub mod proximity;
pub mod fetcher;
/// A ranking rule that returns the documents group by group.
pub trait Criterion {
/// Returns the next group of documents, `Ok(None)` when there is nothing left
/// to return, or an error when the underlying lookups fail.
fn next(&mut self) -> anyhow::Result<Option<CriterionResult>>;
}
/// The result of a call to the parent criterion.
#[derive(Debug, Clone, PartialEq)]
pub struct CriterionResult {
/// The query tree that must be used by the children criterion to fetch candidates.
pub query_tree: Option<Operation>,
/// The candidates that this criterion is allowed to return subsets of.
pub candidates: RoaringBitmap,
/// Candidates that come from the current bucket of the initial criterion.
pub bucket_candidates: RoaringBitmap,
}
/// A set of documents ids that is either the only documents that can be
/// returned (`Allowed`) or the documents that must never be returned (`Forbidden`).
#[derive(Debug)]
enum Candidates {
    Allowed(RoaringBitmap),
    Forbidden(RoaringBitmap)
}

impl Candidates {
    /// Extracts the documents ids, whatever the variant is.
    fn into_inner(self) -> RoaringBitmap {
        match self {
            Self::Allowed(docids) | Self::Forbidden(docids) => docids,
        }
    }
}

impl Default for Candidates {
    /// No document is forbidden by default.
    fn default() -> Self {
        Candidates::Forbidden(RoaringBitmap::new())
    }
}
/// The read-only lookups that the criteria need, abstracted away from the index.
/// `CriteriaBuilder` implements it on top of an `Index` and a read transaction.
pub trait Context {
/// Returns the ids of the documents known to the context.
fn documents_ids(&self) -> heed::Result<RoaringBitmap>;
/// Returns the documents ids stored for `word`, if any.
fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
/// Returns the documents ids stored for the prefix `word`, if any.
fn word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
/// Returns the documents ids stored for the `(left, right, proximity)` key, if any.
fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>>;
/// Same lookup as `word_pair_proximity_docids` but in the prefix pairs database.
/// NOTE(review): presumably `right` is the prefix, confirm against the indexer.
fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>>;
/// Returns the FST that contains the words.
fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>>;
/// Tells whether `word` is part of the prefixes known to the context.
fn in_prefix_cache(&self, word: &str) -> bool;
}
/// Builds the chain of criteria of an index and serves as the `Context`
/// that those criteria use to query the databases.
pub struct CriteriaBuilder<'t> {
/// Read transaction used for every lookup.
rtxn: &'t heed::RoTxn<'t>,
/// Index that is searched.
index: &'t Index,
/// Words FST, loaded once when the builder is created.
words_fst: fst::Set<Cow<'t, [u8]>>,
/// Words prefixes FST, loaded once when the builder is created.
words_prefixes_fst: fst::Set<Cow<'t, [u8]>>,
}
impl<'a> Context for CriteriaBuilder<'a> {
    /// Reads the documents ids of the index.
    fn documents_ids(&self) -> heed::Result<RoaringBitmap> {
        self.index.documents_ids(self.rtxn)
    }

    /// Reads the `word_docids` database.
    fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>> {
        self.index.word_docids.get(self.rtxn, &word)
    }

    /// Reads the `word_prefix_docids` database.
    fn word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>> {
        self.index.word_prefix_docids.get(self.rtxn, &word)
    }

    /// Reads the `word_pair_proximity_docids` database.
    fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>> {
        self.index.word_pair_proximity_docids.get(self.rtxn, &(left, right, proximity))
    }

    /// Reads the `word_prefix_pair_proximity_docids` database.
    fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>> {
        self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &(left, right, proximity))
    }

    /// Gives access to the words FST loaded at creation.
    fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>> {
        &self.words_fst
    }

    /// Checks the presence of `word` in the words prefixes FST.
    fn in_prefix_cache(&self, word: &str) -> bool {
        self.words_prefixes_fst.contains(word)
    }
}
impl<'t> CriteriaBuilder<'t> {
    /// Creates a builder bound to the given read transaction and index.
    ///
    /// The words FST and the words-prefixes FST are read from the index once,
    /// here, and stored in the builder.
    ///
    /// # Errors
    ///
    /// Fails when either FST cannot be read from the index.
    pub fn new(rtxn: &'t heed::RoTxn<'t>, index: &'t Index) -> anyhow::Result<Self> {
        Ok(Self {
            rtxn,
            index,
            words_fst: index.words_fst(rtxn)?,
            words_prefixes_fst: index.words_prefixes_fst(rtxn)?,
        })
    }

    /// Assembles the chain of criteria configured on the index and wraps it
    /// into a `Fetcher`.
    ///
    /// Criteria are visited in the order returned by `index.criteria`. The
    /// first recognised one is built with its `initial*` constructor and
    /// receives `query_tree` and `facet_candidates`; each following one wraps
    /// the criterion built so far as its parent. Criterion names that are not
    /// handled here are skipped. When no criterion was built at all, the
    /// fetcher is created directly from the query tree and facet candidates.
    ///
    /// # Errors
    ///
    /// Fails when the index cannot be read, when an `Asc`/`Desc` field is not
    /// registered in the fields ids map or is not faceted, or when building
    /// an `AscDesc` criterion fails.
    pub fn build(
        &'t self,
        mut query_tree: Option<Operation>,
        mut facet_candidates: Option<RoaringBitmap>,
    ) -> anyhow::Result<Fetcher<'t>>
    {
        use crate::criterion::Criterion as Name;

        let fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
        let faceted_fields = self.index.faceted_fields(self.rtxn)?;

        // Resolves a field name into its id and facet type, or explains
        // which of the two lookups failed.
        let facet_field = |field: &str| -> anyhow::Result<(FieldId, FacetType)> {
            let id = fields_ids_map
                .id(field)
                .with_context(|| format!("field {:?} isn't registered", field))?;
            let facet_type = faceted_fields
                .get(field)
                .with_context(|| format!("field {:?} isn't faceted", field))?;
            Ok((id, *facet_type))
        };

        let mut criterion: Option<Box<dyn Criterion>> = None;
        for name in self.index.criteria(self.rtxn)? {
            criterion = Some(match (criterion.take(), name) {
                // A criterion already exists: the new one wraps it.
                (Some(parent), Name::Typo) => Box::new(Typo::new(self, parent)),
                (Some(parent), Name::Words) => Box::new(Words::new(self, parent)),
                (Some(parent), Name::Proximity) => Box::new(Proximity::new(self, parent)),
                (Some(parent), Name::Asc(field)) => {
                    let (id, facet_type) = facet_field(&field)?;
                    Box::new(AscDesc::asc(self.index, self.rtxn, parent, id, facet_type)?)
                },
                (Some(parent), Name::Desc(field)) => {
                    let (id, facet_type) = facet_field(&field)?;
                    Box::new(AscDesc::desc(self.index, self.rtxn, parent, id, facet_type)?)
                },
                // Unhandled criterion: keep the chain as it is.
                (Some(parent), _) => parent,

                // First criterion of the chain: it consumes the query tree
                // and the facet candidates.
                (None, Name::Typo) => {
                    Box::new(Typo::initial(self, query_tree.take(), facet_candidates.take()))
                },
                (None, Name::Words) => {
                    Box::new(Words::initial(self, query_tree.take(), facet_candidates.take()))
                },
                (None, Name::Proximity) => {
                    Box::new(Proximity::initial(self, query_tree.take(), facet_candidates.take()))
                },
                (None, Name::Asc(field)) => {
                    let (id, facet_type) = facet_field(&field)?;
                    Box::new(AscDesc::initial_asc(self.index, self.rtxn, query_tree.take(), facet_candidates.take(), id, facet_type)?)
                },
                (None, Name::Desc(field)) => {
                    let (id, facet_type) = facet_field(&field)?;
                    Box::new(AscDesc::initial_desc(self.index, self.rtxn, query_tree.take(), facet_candidates.take(), id, facet_type)?)
                },
                // Unhandled criterion and nothing built yet: nothing to do.
                (None, _) => continue,
            });
        }

        Ok(match criterion {
            Some(criterion) => Fetcher::new(self, criterion),
            None => Fetcher::initial(self, query_tree, facet_candidates),
        })
    }
}
/// Computes the documents ids matching `query_tree`.
///
/// - `And`: intersection of the results of every sub-operation.
/// - `Consecutive`: intersection, over every adjacent pair of queries, of the
///   documents where the pair appears at proximity 1; any other kind of
///   sub-operation is rejected with an error.
/// - `Or`: union of the results of every sub-operation.
/// - `Query`: result of `query_docids`.
///
/// `cache` is only threaded through the recursion; this function neither
/// reads nor writes it.
pub fn resolve_query_tree<'t>(
    ctx: &'t dyn Context,
    query_tree: &Operation,
    cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
) -> anyhow::Result<RoaringBitmap>
{
    match query_tree {
        Operation::And(ops) => {
            let mut resolved = Vec::new();
            for op in ops {
                resolved.push(resolve_query_tree(ctx, op, cache)?);
            }

            // Start from the smallest bitmap so that the running
            // intersection stays as small as possible.
            resolved.sort_unstable_by_key(|docids| docids.len());
            let mut smallest_first = resolved.into_iter();
            let mut candidates = smallest_first.next().unwrap_or_default();
            for docids in smallest_first {
                candidates.intersect_with(&docids);
            }
            Ok(candidates)
        },
        Operation::Consecutive(ops) => {
            let mut candidates: Option<RoaringBitmap> = None;
            for pair in ops.windows(2) {
                let (left, right) = match (&pair[0], &pair[1]) {
                    (Operation::Query(left), Operation::Query(right)) => (left, right),
                    _ => bail!("invalid consecutive query type"),
                };

                let pair_docids = query_pair_proximity_docids(ctx, left, right, 1)?;
                if pair_docids.is_empty() {
                    // One pair never appears side by side: nothing can match.
                    return Ok(RoaringBitmap::new());
                }

                candidates = Some(match candidates.take() {
                    Some(mut intersection) => {
                        intersection.intersect_with(&pair_docids);
                        intersection
                    },
                    None => pair_docids,
                });
            }
            Ok(candidates.unwrap_or_default())
        },
        Operation::Or(_, ops) => {
            let mut candidates = RoaringBitmap::new();
            for op in ops {
                let branch_docids = resolve_query_tree(ctx, op, cache)?;
                candidates.union_with(&branch_docids);
            }
            Ok(candidates)
        },
        Operation::Query(q) => query_docids(ctx, q),
    }
}
/// Returns the union of the `word_pair_proximity_docids` entries of every
/// `(left, right)` combination taken from `left_words` x `right_words` at the
/// given `proximity`.
///
/// The `u8` attached to each word is ignored. Pairs without an entry
/// contribute nothing.
fn all_word_pair_proximity_docids<T: AsRef<str>, U: AsRef<str>>(
    ctx: &dyn Context,
    left_words: &[(T, u8)],
    right_words: &[(U, u8)],
    proximity: u8
) -> anyhow::Result<RoaringBitmap> {
    let mut merged = RoaringBitmap::new();
    for (left, _) in left_words {
        let left = left.as_ref();
        for (right, _) in right_words {
            if let Some(pair_docids) = ctx.word_pair_proximity_docids(left, right.as_ref(), proximity)? {
                merged.union_with(&pair_docids);
            }
        }
    }
    Ok(merged)
}
/// Returns the documents ids matching a single `query`.
///
/// - Exact, non-prefix: the `word_docids` entry of the word.
/// - Exact, prefix, word present in the prefix cache: the
///   `word_prefix_docids` entry of the word.
/// - Exact, prefix, word absent from the prefix cache: union of the
///   `word_docids` entries of every zero-typo prefix derivation of the word.
/// - Tolerant: union of the `word_docids` entries of every derivation of the
///   word within the query's typo budget.
///
/// Missing database entries count as empty sets.
fn query_docids(ctx: &dyn Context, query: &Query) -> anyhow::Result<RoaringBitmap> {
    // Either answer straight from a single database entry, or collect the
    // list of words whose postings must be merged.
    let derivations = match &query.kind {
        QueryKind::Exact { word, .. } if !query.prefix => {
            return Ok(ctx.word_docids(word)?.unwrap_or_default());
        },
        QueryKind::Exact { word, .. } if ctx.in_prefix_cache(word) => {
            return Ok(ctx.word_prefix_docids(word)?.unwrap_or_default());
        },
        QueryKind::Exact { word, .. } => {
            word_derivations(word, true, 0, ctx.words_fst())?
        },
        QueryKind::Tolerant { typo, word } => {
            word_derivations(word, query.prefix, *typo, ctx.words_fst())?
        },
    };

    let mut docids = RoaringBitmap::new();
    for (derived, _typo) in derivations {
        let derived_docids = ctx.word_docids(&derived)?.unwrap_or_default();
        docids.union_with(&derived_docids);
    }
    Ok(docids)
}
/// Returns the documents ids in which `left` and `right` are found at the
/// given `proximity`.
///
/// For a `proximity` of 8 or more the pair databases are not consulted: the
/// result is simply the intersection of the documents matching each query.
/// Otherwise the lookup depends on the kind of both queries; only the
/// `prefix` flag of the right query is taken into account, and the left
/// query is never derived as a prefix.
fn query_pair_proximity_docids(ctx: &dyn Context, left: &Query, right: &Query, proximity: u8) -> anyhow::Result<RoaringBitmap> {
    if proximity >= 8 {
        let mut candidates = query_docids(ctx, left)?;
        candidates.intersect_with(&query_docids(ctx, right)?);
        return Ok(candidates);
    }

    let prefix = right.prefix;
    match (&left.kind, &right.kind) {
        (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => {
            if !prefix {
                let docids = ctx.word_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?;
                Ok(docids.unwrap_or_default())
            } else if ctx.in_prefix_cache(right) {
                let docids = ctx.word_prefix_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?;
                Ok(docids.unwrap_or_default())
            } else {
                let r_words = word_derivations(right, true, 0, ctx.words_fst())?;
                all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity)
            }
        },
        (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }) => {
            let l_words = word_derivations(left, false, *typo, ctx.words_fst())?;
            if !prefix {
                all_word_pair_proximity_docids(ctx, &l_words, &[(right, 0)], proximity)
            } else if ctx.in_prefix_cache(right) {
                // The right prefix is cached: one lookup per left derivation.
                let mut docids = RoaringBitmap::new();
                for (l_word, _) in l_words {
                    let pair_docids = ctx.word_prefix_pair_proximity_docids(l_word.as_str(), right.as_str(), proximity)?;
                    docids.union_with(&pair_docids.unwrap_or_default());
                }
                Ok(docids)
            } else {
                let r_words = word_derivations(right, true, 0, ctx.words_fst())?;
                all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity)
            }
        },
        (QueryKind::Exact { word: left, .. }, QueryKind::Tolerant { typo, word: right }) => {
            let r_words = word_derivations(right, prefix, *typo, ctx.words_fst())?;
            all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity)
        },
        (QueryKind::Tolerant { typo: l_typo, word: left }, QueryKind::Tolerant { typo: r_typo, word: right }) => {
            let l_words = word_derivations(left, false, *l_typo, ctx.words_fst())?;
            let r_words = word_derivations(right, prefix, *r_typo, ctx.words_fst())?;
            all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity)
        },
    }
}
// Shared test fixtures: an in-memory `Context` backed by hash maps, filled
// with pseudo-random postings generated from a fixed seed.
#[cfg(test)]
pub mod test {
    use maplit::hashmap;
    use rand::{Rng, SeedableRng, rngs::StdRng};
    use super::*;
    use std::collections::HashMap;

    /// Shorthand turning a string slice into an owned `String`.
    fn s(s: &str) -> String { s.to_string() }

    /// In-memory stand-in for the index: every database read by `Context`
    /// is replaced by a hash map.
    pub struct TestContext<'t> {
        words_fst: fst::Set<Cow<'t, [u8]>>,
        word_docids: HashMap<String, RoaringBitmap>,
        word_prefix_docids: HashMap<String, RoaringBitmap>,
        // Keys are (left word, right word, proximity).
        word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>,
        // Keys are (left word, right prefix, proximity).
        word_prefix_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>,
    }

    impl<'a> Context for TestContext<'a> {
        /// Union of the postings of every word of the fixture.
        fn documents_ids(&self) -> heed::Result<RoaringBitmap> {
            Ok(self.word_docids.iter().fold(RoaringBitmap::new(), |acc, (_, docids)| acc | docids))
        }

        fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>> {
            Ok(self.word_docids.get(&word.to_string()).cloned())
        }

        fn word_prefix_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>> {
            Ok(self.word_prefix_docids.get(&word.to_string()).cloned())
        }

        fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>> {
            let key = (left.to_string(), right.to_string(), proximity.into());
            Ok(self.word_pair_proximity_docids.get(&key).cloned())
        }

        fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result<Option<RoaringBitmap>> {
            let key = (left.to_string(), right.to_string(), proximity.into());
            Ok(self.word_prefix_pair_proximity_docids.get(&key).cloned())
        }

        fn words_fst<'t>(&self) -> &'t fst::Set<Cow<[u8]>> {
            &self.words_fst
        }

        /// A prefix is "cached" when it is a key of `word_prefix_docids`.
        fn in_prefix_cache(&self, word: &str) -> bool {
            self.word_prefix_docids.contains_key(&word.to_string())
        }
    }

    impl<'a> Default for TestContext<'a> {
        /// Builds the fixture. The random generator is seeded with a
        /// constant, so the generated postings are the same on every run.
        fn default() -> TestContext<'a> {
            let mut rng = StdRng::seed_from_u64(102);
            let rng = &mut rng;

            // Draws `len` random document ids and stores them in a bitmap.
            fn random_postings<R: Rng>(rng: &mut R, len: usize) -> RoaringBitmap {
                let mut values = Vec::<u32>::with_capacity(len);
                while values.len() != len {
                    values.push(rng.gen());
                }
                values.sort_unstable();

                RoaringBitmap::from_sorted_iter(values.into_iter())
            }

            let word_docids = hashmap!{
                s("hello") => random_postings(rng, 1500),
                s("hi") => random_postings(rng, 4000),
                s("word") => random_postings(rng, 2500),
                s("split") => random_postings(rng, 400),
                s("ngrams") => random_postings(rng, 1400),
                s("world") => random_postings(rng, 15_000),
                s("earth") => random_postings(rng, 8000),
                s("2021") => random_postings(rng, 100),
                s("2020") => random_postings(rng, 500),
                s("is") => random_postings(rng, 50_000),
                s("this") => random_postings(rng, 50_000),
                s("good") => random_postings(rng, 1250),
                s("morning") => random_postings(rng, 125),
            };

            // Each prefix entry is the union of the two words it covers.
            let word_prefix_docids = hashmap!{
                s("h") => &word_docids[&s("hello")] | &word_docids[&s("hi")],
                s("wor") => &word_docids[&s("word")] | &word_docids[&s("world")],
                s("20") => &word_docids[&s("2020")] | &word_docids[&s("2021")],
            };

            // Documents containing both "hello" and "world": the first half
            // is stored at proximity 1, the rest at proximity 4.
            let hello_world = &word_docids[&s("hello")] & &word_docids[&s("world")];
            let hello_world_split = (hello_world.len() / 2) as usize;
            let hello_world_1 = hello_world.iter().take(hello_world_split).collect();
            let hello_world_2 = hello_world.iter().skip(hello_world_split).collect();

            // Documents containing both "hello" and "word": the first half
            // is stored at proximity 4, the next quarter at proximity 6 and
            // the remainder at proximity 7.
            let hello_word = &word_docids[&s("hello")] & &word_docids[&s("word")];
            let hello_word_split = (hello_word.len() / 2) as usize;
            let hello_word_4 = hello_word.iter().take(hello_word_split).collect();
            let hello_word_6 = hello_word.iter().skip(hello_word_split).take(hello_word_split/2).collect();
            let hello_word_7 = hello_word.iter().skip(hello_word_split + hello_word_split/2).collect();

            let word_pair_proximity_docids = hashmap!{
                (s("good"), s("morning"), 1) => &word_docids[&s("good")] & &word_docids[&s("morning")],
                (s("hello"), s("world"), 1) => hello_world_1,
                (s("hello"), s("world"), 4) => hello_world_2,
                (s("this"), s("is"), 1) => &word_docids[&s("this")] & &word_docids[&s("is")],
                (s("is"), s("2021"), 1) => &word_docids[&s("this")] & &word_docids[&s("is")] & &word_docids[&s("2021")],
                (s("is"), s("2020"), 1) => &word_docids[&s("this")] & &word_docids[&s("is")] & (&word_docids[&s("2020")] - &word_docids[&s("2021")]),
                (s("this"), s("2021"), 2) => &word_docids[&s("this")] & &word_docids[&s("is")] & &word_docids[&s("2021")],
                (s("this"), s("2020"), 2) => &word_docids[&s("this")] & &word_docids[&s("is")] & (&word_docids[&s("2020")] - &word_docids[&s("2021")]),
                (s("word"), s("split"), 1) => &word_docids[&s("word")] & &word_docids[&s("split")],
                (s("world"), s("split"), 1) => (&word_docids[&s("world")] & &word_docids[&s("split")]) - &word_docids[&s("word")],
                (s("hello"), s("word"), 4) => hello_word_4,
                (s("hello"), s("word"), 6) => hello_word_6,
                (s("hello"), s("word"), 7) => hello_word_7,
                (s("split"), s("ngrams"), 3) => (&word_docids[&s("split")] & &word_docids[&s("ngrams")]) - &word_docids[&s("word")],
                (s("split"), s("ngrams"), 5) => &word_docids[&s("split")] & &word_docids[&s("ngrams")] & &word_docids[&s("word")],
                (s("this"), s("ngrams"), 1) => (&word_docids[&s("split")] & &word_docids[&s("this")] & &word_docids[&s("ngrams")] ) - &word_docids[&s("word")],
                (s("this"), s("ngrams"), 2) => &word_docids[&s("split")] & &word_docids[&s("this")] & &word_docids[&s("ngrams")] & &word_docids[&s("word")],
            };

            // Prefix pairs are derived from the word pairs above: each entry
            // merges the pairs whose right word starts with the prefix.
            let word_prefix_pair_proximity_docids = hashmap!{
                (s("hello"), s("wor"), 1) => word_pair_proximity_docids.get(&(s("hello"), s("world"), 1)).unwrap().clone(),
                (s("hello"), s("wor"), 4) => word_pair_proximity_docids.get(&(s("hello"), s("world"), 4)).unwrap() | word_pair_proximity_docids.get(&(s("hello"), s("word"), 4)).unwrap(),
                (s("hello"), s("wor"), 6) => word_pair_proximity_docids.get(&(s("hello"), s("word"), 6)).unwrap().clone(),
                (s("hello"), s("wor"), 7) => word_pair_proximity_docids.get(&(s("hello"), s("word"), 7)).unwrap().clone(),
                (s("is"), s("20"), 1) => word_pair_proximity_docids.get(&(s("is"), s("2020"), 1)).unwrap() | word_pair_proximity_docids.get(&(s("is"), s("2021"), 1)).unwrap(),
                (s("this"), s("20"), 2) => word_pair_proximity_docids.get(&(s("this"), s("2020"), 2)).unwrap() | word_pair_proximity_docids.get(&(s("this"), s("2021"), 2)).unwrap(),
            };

            // The keys are sorted before being handed to the FST builder.
            let mut keys = word_docids.keys().collect::<Vec<_>>();
            keys.sort_unstable();
            let words_fst = fst::Set::from_iter(keys).unwrap().map_data(|v| Cow::Owned(v)).unwrap();

            TestContext {
                words_fst,
                word_docids,
                word_prefix_docids,
                word_pair_proximity_docids,
                word_prefix_pair_proximity_docids,
            }
        }
    }
}

View File

@ -0,0 +1,291 @@
use std::collections::HashMap;
use std::mem::take;
use roaring::RoaringBitmap;
use log::debug;
use crate::search::query_tree::{maximum_proximity, Operation, Query};
use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids};
/// Criterion state that hands out candidates one proximity level at a time,
/// starting from proximity 0.
pub struct Proximity<'t> {
    /// Source of the documents ids used to resolve queries.
    ctx: &'t dyn Context,
    /// Query tree currently being processed, paired with the value
    /// `maximum_proximity` computed for it.
    query_tree: Option<(usize, Operation)>,
    /// Proximity level that the next bucket will be computed for.
    proximity: u8,
    /// Documents still allowed to be returned, or documents already returned
    /// and therefore forbidden.
    candidates: Candidates,
    /// Bucket candidates accumulated from the parent criterion and not yet
    /// handed over to the caller.
    bucket_candidates: RoaringBitmap,
    /// Criterion that feeds this one with query trees and candidates, if any.
    parent: Option<Box<dyn Criterion + 't>>,
    /// Intermediate results of the resolution, keyed by operation and
    /// proximity.
    candidates_cache: HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>,
}
impl<'t> Proximity<'t> {
    /// Creates a proximity criterion that sits first in the chain.
    ///
    /// The query tree, when given, is stored along with its maximum
    /// proximity. `candidates`, when given, become the allowed set;
    /// otherwise the default `Candidates` value is used.
    pub fn initial(
        ctx: &'t dyn Context,
        query_tree: Option<Operation>,
        candidates: Option<RoaringBitmap>,
    ) -> Self
    {
        let query_tree = query_tree.map(|op| (maximum_proximity(&op), op));
        let candidates = match candidates {
            Some(docids) => Candidates::Allowed(docids),
            None => Candidates::default(),
        };

        Self {
            ctx,
            query_tree,
            proximity: 0,
            candidates,
            bucket_candidates: RoaringBitmap::new(),
            parent: None,
            candidates_cache: HashMap::new(),
        }
    }

    /// Creates a proximity criterion fed by `parent`: it starts without any
    /// query tree and with the default `Candidates` value.
    pub fn new(ctx: &'t dyn Context, parent: Box<dyn Criterion + 't>) -> Self {
        Self {
            ctx,
            query_tree: None,
            proximity: 0,
            candidates: Candidates::default(),
            bucket_candidates: RoaringBitmap::new(),
            parent: Some(parent),
            candidates_cache: HashMap::new(),
        }
    }
}
impl<'t> Criterion for Proximity<'t> {
    /// Produces the next bucket of candidates.
    ///
    /// Each call with a query tree resolves the documents matching it at the
    /// current proximity level, then moves on to the next level. Once the
    /// level exceeds the maximum stored with the tree, the state is reset and
    /// the parent criterion (if any) is asked for a new query tree and new
    /// candidates. Returns `Ok(None)` when there is no parent, or when the
    /// parent is exhausted.
    fn next(&mut self) -> anyhow::Result<Option<CriterionResult>> {
        use Candidates::{Allowed, Forbidden};
        loop {
            debug!("Proximity at iteration {} (max {:?}) ({:?})",
                self.proximity,
                self.query_tree.as_ref().map(|(mp, _)| mp),
                self.candidates,
            );

            match (&mut self.query_tree, &mut self.candidates) {
                // The allowed set is exhausted: hand over what is left
                // (query tree and pending bucket candidates) and reset both
                // the candidates and the bucket candidates to their default.
                (_, Allowed(candidates)) if candidates.is_empty() => {
                    return Ok(Some(CriterionResult {
                        query_tree: self.query_tree.take().map(|(_, qt)| qt),
                        candidates: take(&mut self.candidates).into_inner(),
                        bucket_candidates: take(&mut self.bucket_candidates),
                    }));
                },
                (Some((max_prox, query_tree)), Allowed(candidates)) => {
                    if self.proximity as usize > *max_prox {
                        // Every proximity level was visited: drop the query
                        // tree and go around the loop again.
                        self.query_tree = None;
                        self.candidates = Candidates::default();
                    } else {
                        let mut new_candidates = resolve_candidates(
                            self.ctx,
                            &query_tree,
                            self.proximity,
                            &mut self.candidates_cache,
                        )?;

                        // Keep only allowed documents, and remove them from
                        // the allowed set so they are returned only once.
                        new_candidates.intersect_with(&candidates);
                        candidates.difference_with(&new_candidates);
                        // NOTE(review): `proximity` is a `u8` while
                        // `max_prox` is a `usize`; this increment looks like
                        // it could overflow if the maximum ever reached 255
                        // — confirm the bound of `maximum_proximity`.
                        self.proximity += 1;

                        // With a parent, forward the bucket candidates it
                        // provided; otherwise this bucket is its own.
                        let bucket_candidates = match self.parent {
                            Some(_) => take(&mut self.bucket_candidates),
                            None => new_candidates.clone(),
                        };

                        return Ok(Some(CriterionResult {
                            query_tree: Some(query_tree.clone()),
                            candidates: new_candidates,
                            bucket_candidates,
                        }));
                    }
                },
                (Some((max_prox, query_tree)), Forbidden(candidates)) => {
                    if self.proximity as usize > *max_prox {
                        self.query_tree = None;
                        self.candidates = Candidates::default();
                    } else {
                        let mut new_candidates = resolve_candidates(
                            self.ctx,
                            &query_tree,
                            self.proximity,
                            &mut self.candidates_cache,
                        )?;

                        // Drop the documents already returned, and remember
                        // the new ones as returned.
                        new_candidates.difference_with(&candidates);
                        candidates.union_with(&new_candidates);
                        self.proximity += 1;

                        let bucket_candidates = match self.parent {
                            Some(_) => take(&mut self.bucket_candidates),
                            None => new_candidates.clone(),
                        };

                        return Ok(Some(CriterionResult {
                            query_tree: Some(query_tree.clone()),
                            candidates: new_candidates,
                            bucket_candidates,
                        }));
                    }
                },
                // No query tree to rank with: return the whole allowed set
                // as a single bucket.
                (None, Allowed(_)) => {
                    let candidates = take(&mut self.candidates).into_inner();
                    return Ok(Some(CriterionResult {
                        query_tree: None,
                        candidates: candidates.clone(),
                        bucket_candidates: candidates,
                    }));
                },
                // Nothing left locally: pull the next query tree and
                // candidates from the parent, or stop.
                (None, Forbidden(_)) => {
                    match self.parent.as_mut() {
                        Some(parent) => {
                            match parent.next()? {
                                Some(CriterionResult { query_tree, candidates, bucket_candidates }) => {
                                    self.query_tree = query_tree.map(|op| (maximum_proximity(&op), op));
                                    self.proximity = 0;
                                    self.candidates = Candidates::Allowed(candidates);
                                    self.bucket_candidates.union_with(&bucket_candidates);
                                },
                                None => return Ok(None),
                            }
                        },
                        None => return Ok(None),
                    }
                },
            }
        }
    }
}
/// Returns the documents ids matching `query_tree` with a total proximity of
/// exactly `proximity`.
///
/// Internally every operation resolves to a list of
/// `(leftmost query, rightmost query, docids)` triples, so that neighbouring
/// operations can be joined through the pair-proximity lookups. `cache`
/// stores those lists per `(operation, proximity)`.
fn resolve_candidates<'t>(
    ctx: &'t dyn Context,
    query_tree: &Operation,
    proximity: u8,
    cache: &mut HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>,
) -> anyhow::Result<RoaringBitmap>
{
    /// Resolves one operation at exactly `proximity`. Single queries and
    /// `Consecutive` groups only produce results at proximity 0.
    fn resolve_operation<'t>(
        ctx: &'t dyn Context,
        query_tree: &Operation,
        proximity: u8,
        cache: &mut HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>,
    ) -> anyhow::Result<Vec<(Query, Query, RoaringBitmap)>>
    {
        match query_tree {
            Operation::And(ops) => mdfs(ctx, ops, proximity, cache),
            Operation::Consecutive(ops) if proximity == 0 => mdfs(ctx, ops, 0, cache),
            Operation::Consecutive(_) => Ok(Vec::new()),
            Operation::Or(_, ops) => {
                let mut output = Vec::new();
                for op in ops {
                    output.extend(resolve_operation(ctx, op, proximity, cache)?);
                }
                Ok(output)
            },
            Operation::Query(q) if proximity == 0 => {
                let candidates = query_docids(ctx, q)?;
                Ok(vec![(q.clone(), q.clone(), candidates)])
            },
            Operation::Query(_) => Ok(Vec::new()),
        }
    }

    /// Joins two neighbouring operations, spreading `proximity` between the
    /// gap separating them and the inside of each operation.
    fn mdfs_pair<'t>(
        ctx: &'t dyn Context,
        left: &Operation,
        right: &Operation,
        proximity: u8,
        cache: &mut HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>,
    ) -> anyhow::Result<Vec<(Query, Query, RoaringBitmap)>>
    {
        /// Every split of `mana` into `(first, rest)` where `first` is at
        /// most `left_max`.
        fn pair_combinations(mana: u8, left_max: u8) -> impl Iterator<Item = (u8, u8)> {
            (0..=mana.min(left_max)).map(move |m| (m, mana - m))
        }

        let pair_max_proximity = 7;

        let mut output = Vec::new();

        for (pair_p, left_right_p) in pair_combinations(proximity, pair_max_proximity) {
            for (left_p, right_p) in pair_combinations(left_right_p, left_right_p) {
                // Make sure both sides are resolved and cached before
                // borrowing their results.
                let left_key = (left.clone(), left_p);
                if !cache.contains_key(&left_key) {
                    let resolved = resolve_operation(ctx, left, left_p, cache)?;
                    cache.insert(left_key.clone(), resolved);
                }

                let right_key = (right.clone(), right_p);
                if !cache.contains_key(&right_key) {
                    let resolved = resolve_operation(ctx, right, right_p, cache)?;
                    cache.insert(right_key.clone(), resolved);
                }

                let lefts = &cache[&left_key];
                let rights = &cache[&right_key];

                for (ll, lr, lcandidates) in lefts {
                    for (rl, rr, rcandidates) in rights {
                        // The gap is measured between the rightmost query of
                        // the left side and the leftmost of the right side.
                        let mut candidates = query_pair_proximity_docids(ctx, lr, rl, pair_p + 1)?;

                        // Intersect with the smaller set first.
                        let (first, second) = if lcandidates.len() < rcandidates.len() {
                            (lcandidates, rcandidates)
                        } else {
                            (rcandidates, lcandidates)
                        };
                        candidates.intersect_with(first);
                        candidates.intersect_with(second);

                        if !candidates.is_empty() {
                            output.push((ll.clone(), rr.clone(), candidates));
                        }
                    }
                }
            }
        }

        Ok(output)
    }

    /// Resolves a sequence of operations at exactly `proximity`, joining
    /// them pairwise from left to right.
    fn mdfs<'t>(
        ctx: &'t dyn Context,
        branches: &[Operation],
        proximity: u8,
        cache: &mut HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>,
    ) -> anyhow::Result<Vec<(Query, Query, RoaringBitmap)>>
    {
        match branches {
            [] => Ok(Vec::new()),
            [only] => resolve_operation(ctx, only, proximity, cache),
            [first, second] => mdfs_pair(ctx, first, second, proximity, cache),
            [first, tail @ ..] => {
                // `tail` starts right after `first` and holds at least two
                // operations here; its head is joined with `first`, then the
                // whole tail is resolved with the remaining proximity.
                let second = &tail[0];
                let mut output = Vec::new();
                for p in 0..=proximity {
                    for (lhead, _, head_candidates) in mdfs_pair(ctx, first, second, p, cache)? {
                        if head_candidates.is_empty() {
                            continue;
                        }
                        for (_, rtail, mut candidates) in mdfs(ctx, tail, proximity - p, cache)? {
                            candidates.intersect_with(&head_candidates);
                            if !candidates.is_empty() {
                                output.push((lhead.clone(), rtail, candidates));
                            }
                        }
                    }
                }
                Ok(output)
            },
        }
    }

    let mut candidates = RoaringBitmap::new();
    for (_, _, docids) in resolve_operation(ctx, query_tree, proximity, cache)? {
        candidates.union_with(&docids);
    }
    Ok(candidates)
}

View File

@ -0,0 +1,482 @@
use std::{borrow::Cow, collections::HashMap, mem::take};
use anyhow::bail;
use log::debug;
use roaring::RoaringBitmap;
use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind};
use crate::search::word_derivations;
use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids};
/// Criterion state that hands out candidates one typo count at a time,
/// starting from zero typos.
pub struct Typo<'t> {
    /// Source of the documents ids and of the words FST.
    ctx: &'t dyn Context,
    /// Query tree currently being processed, paired with the value
    /// `maximum_typo` computed for it.
    query_tree: Option<(usize, Operation)>,
    /// Number of typos that the next bucket will be computed for.
    number_typos: u8,
    /// Documents still allowed to be returned, or documents already returned
    /// and therefore forbidden.
    candidates: Candidates,
    /// Bucket candidates accumulated from the parent criterion and not yet
    /// handed over to the caller.
    bucket_candidates: RoaringBitmap,
    /// Criterion that feeds this one with query trees and candidates, if any.
    parent: Option<Box<dyn Criterion + 't>>,
    /// Resolved documents ids, keyed by operation and number of typos.
    candidates_cache: HashMap<(Operation, u8), RoaringBitmap>,
    /// Word derivations already computed, keyed by (word, prefix flag, typo
    /// budget).
    typo_cache: HashMap<(String, bool, u8), Vec<(String, u8)>>,
}
impl<'t> Typo<'t> {
    /// Creates a typo criterion that sits first in the chain.
    ///
    /// The query tree, when given, is stored along with its maximum number
    /// of typos. `candidates`, when given, become the allowed set; otherwise
    /// the default `Candidates` value is used.
    pub fn initial(
        ctx: &'t dyn Context,
        query_tree: Option<Operation>,
        candidates: Option<RoaringBitmap>,
    ) -> Self
    {
        let query_tree = query_tree.map(|op| (maximum_typo(&op), op));
        let candidates = match candidates {
            Some(docids) => Candidates::Allowed(docids),
            None => Candidates::default(),
        };

        Self {
            ctx,
            query_tree,
            number_typos: 0,
            candidates,
            bucket_candidates: RoaringBitmap::new(),
            parent: None,
            candidates_cache: HashMap::new(),
            typo_cache: HashMap::new(),
        }
    }

    /// Creates a typo criterion fed by `parent`: it starts without any query
    /// tree and with the default `Candidates` value.
    pub fn new(ctx: &'t dyn Context, parent: Box<dyn Criterion + 't>) -> Self {
        Self {
            ctx,
            query_tree: None,
            number_typos: 0,
            candidates: Candidates::default(),
            bucket_candidates: RoaringBitmap::new(),
            parent: Some(parent),
            candidates_cache: HashMap::new(),
            typo_cache: HashMap::new(),
        }
    }
}
impl<'t> Criterion for Typo<'t> {
    /// Produces the next bucket of candidates.
    ///
    /// Each call with a query tree resolves the documents matching it with
    /// exactly `number_typos` typos, then moves on to the next typo count.
    /// Once the count exceeds the maximum stored with the tree, the state is
    /// reset and the parent criterion (if any) is asked for a new query tree
    /// and new candidates. Returns `Ok(None)` when there is no parent, or
    /// when the parent is exhausted.
    fn next(&mut self) -> anyhow::Result<Option<CriterionResult>> {
        use Candidates::{Allowed, Forbidden};
        loop {
            debug!("Typo at iteration {} ({:?})", self.number_typos, self.candidates);

            match (&mut self.query_tree, &mut self.candidates) {
                // The allowed set is exhausted: hand over what is left
                // (query tree and pending bucket candidates) and reset both
                // the candidates and the bucket candidates to their default.
                (_, Allowed(candidates)) if candidates.is_empty() => {
                    return Ok(Some(CriterionResult {
                        query_tree: self.query_tree.take().map(|(_, qt)| qt),
                        candidates: take(&mut self.candidates).into_inner(),
                        bucket_candidates: take(&mut self.bucket_candidates),
                    }));
                },
                (Some((max_typos, query_tree)), Allowed(candidates)) => {
                    if self.number_typos as usize > *max_typos {
                        // Every typo count was visited: drop the query tree
                        // and go around the loop again.
                        self.query_tree = None;
                        self.candidates = Candidates::default();
                    } else {
                        let fst = self.ctx.words_fst();
                        // Below 2 typos an altered copy is used and the
                        // stored tree is left untouched; at 2 typos the
                        // stored tree itself is replaced by its altered
                        // version; above 2 the stored tree is reused as is.
                        let new_query_tree = if self.number_typos < 2 {
                            alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)?
                        } else if self.number_typos == 2 {
                            *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)?;
                            query_tree.clone()
                        } else {
                            query_tree.clone()
                        };

                        let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos, &mut self.candidates_cache)?;
                        // Keep only allowed documents, and remove them from
                        // the allowed set so they are returned only once.
                        new_candidates.intersect_with(&candidates);
                        candidates.difference_with(&new_candidates);
                        self.number_typos += 1;

                        // With a parent, forward the bucket candidates it
                        // provided; otherwise this bucket is its own.
                        let bucket_candidates = match self.parent {
                            Some(_) => take(&mut self.bucket_candidates),
                            None => new_candidates.clone(),
                        };

                        return Ok(Some(CriterionResult {
                            query_tree: Some(new_query_tree),
                            candidates: new_candidates,
                            bucket_candidates,
                        }));
                    }
                },
                (Some((max_typos, query_tree)), Forbidden(candidates)) => {
                    if self.number_typos as usize > *max_typos {
                        self.query_tree = None;
                        self.candidates = Candidates::default();
                    } else {
                        let fst = self.ctx.words_fst();
                        // Same query tree alteration as in the allowed case.
                        let new_query_tree = if self.number_typos < 2 {
                            alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)?
                        } else if self.number_typos == 2 {
                            *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)?;
                            query_tree.clone()
                        } else {
                            query_tree.clone()
                        };

                        let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos, &mut self.candidates_cache)?;
                        // Drop the documents already returned, and remember
                        // the new ones as returned.
                        new_candidates.difference_with(&candidates);
                        candidates.union_with(&new_candidates);
                        self.number_typos += 1;

                        let bucket_candidates = match self.parent {
                            Some(_) => take(&mut self.bucket_candidates),
                            None => new_candidates.clone(),
                        };

                        return Ok(Some(CriterionResult {
                            query_tree: Some(new_query_tree),
                            candidates: new_candidates,
                            bucket_candidates,
                        }));
                    }
                },
                // No query tree to rank with: return the whole allowed set
                // as a single bucket.
                (None, Allowed(_)) => {
                    let candidates = take(&mut self.candidates).into_inner();
                    return Ok(Some(CriterionResult {
                        query_tree: None,
                        candidates: candidates.clone(),
                        bucket_candidates: candidates,
                    }));
                },
                // Nothing left locally: pull the next query tree and
                // candidates from the parent, or stop.
                (None, Forbidden(_)) => {
                    match self.parent.as_mut() {
                        Some(parent) => {
                            match parent.next()? {
                                Some(CriterionResult { query_tree, candidates, bucket_candidates }) => {
                                    self.query_tree = query_tree.map(|op| (maximum_typo(&op), op));
                                    self.number_typos = 0;
                                    self.candidates = Candidates::Allowed(candidates);
                                    self.bucket_candidates.union_with(&bucket_candidates);
                                },
                                None => return Ok(None),
                            }
                        },
                        None => return Ok(None),
                    }
                },
            }
        }
    }
}
/// Rewrites `query_tree` so that it no longer contains any tolerant query.
///
/// With `number_typos == 0` every tolerant query simply becomes the exact
/// query of the same word, keeping its prefix flag. Otherwise every tolerant
/// query is replaced by an `Or` of non-prefix exact queries, one per word
/// derived from the words FST with at most `min(typo, number_typos)` typos.
///
/// Derivations are memoised in `typo_cache` under
/// `(word, prefix flag, typo budget)`.
fn alterate_query_tree(
    words_fst: &fst::Set<Cow<[u8]>>,
    mut query_tree: Operation,
    number_typos: u8,
    typo_cache: &mut HashMap<(String, bool, u8), Vec<(String, u8)>>,
) -> anyhow::Result<Operation>
{
    /// Walks the tree in place, replacing tolerant queries as it goes.
    fn recurse(
        words_fst: &fst::Set<Cow<[u8]>>,
        operation: &mut Operation,
        number_typos: u8,
        typo_cache: &mut HashMap<(String, bool, u8), Vec<(String, u8)>>,
    ) -> anyhow::Result<()>
    {
        match operation {
            Operation::And(ops) | Operation::Consecutive(ops) | Operation::Or(_, ops) => {
                for op in ops.iter_mut() {
                    recurse(words_fst, op, number_typos, typo_cache)?;
                }
                Ok(())
            },
            Operation::Query(q) => {
                // Exact queries are left as they are.
                let (typo, word) = match &q.kind {
                    QueryKind::Tolerant { typo, word } => (*typo, word),
                    QueryKind::Exact { .. } => return Ok(()),
                };

                let replacement = if number_typos == 0 {
                    // No typo allowed: no need to derive anything, the word
                    // itself is the only match.
                    Operation::Query(Query {
                        prefix: q.prefix,
                        kind: QueryKind::Exact { original_typo: 0, word: word.clone() },
                    })
                } else {
                    let typo = typo.min(number_typos);
                    let cache_key = (word.clone(), q.prefix, typo);
                    let derivations = match typo_cache.get(&cache_key) {
                        Some(cached) => cached.clone(),
                        None => {
                            let computed = word_derivations(word, q.prefix, typo, words_fst)?;
                            typo_cache.insert(cache_key, computed.clone());
                            computed
                        },
                    };

                    let queries = derivations.into_iter().map(|(word, typo)| {
                        Operation::Query(Query {
                            prefix: false,
                            kind: QueryKind::Exact { original_typo: typo, word },
                        })
                    }).collect();

                    Operation::or(false, queries)
                };

                *operation = replacement;
                Ok(())
            },
        }
    }

    recurse(words_fst, &mut query_tree, number_typos, typo_cache)?;
    Ok(query_tree)
}
/// Returns the documents ids matching `query_tree` with exactly
/// `number_typos` typos in total.
///
/// `cache` stores the resolved documents ids of `And` branches per
/// `(operation, number of typos)`.
fn resolve_candidates<'t>(
    ctx: &'t dyn Context,
    query_tree: &Operation,
    number_typos: u8,
    cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
) -> anyhow::Result<RoaringBitmap>
{
    /// Resolves one operation with exactly `number_typos` typos. A single
    /// query only matches when its own typo count equals `number_typos`;
    /// `Consecutive` groups are resolved without looking at the typo count.
    fn resolve_operation<'t>(
        ctx: &'t dyn Context,
        query_tree: &Operation,
        number_typos: u8,
        cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
    ) -> anyhow::Result<RoaringBitmap>
    {
        match query_tree {
            Operation::And(ops) => mdfs(ctx, ops, number_typos, cache),
            Operation::Consecutive(ops) => {
                let mut candidates: Option<RoaringBitmap> = None;
                for pair in ops.windows(2) {
                    let (left, right) = match (&pair[0], &pair[1]) {
                        (Operation::Query(left), Operation::Query(right)) => (left, right),
                        _ => bail!("invalid consecutive query type"),
                    };

                    let pair_docids = query_pair_proximity_docids(ctx, left, right, 1)?;
                    if pair_docids.is_empty() {
                        // One pair never appears side by side: no match.
                        return Ok(RoaringBitmap::new());
                    }

                    candidates = Some(match candidates.take() {
                        Some(mut intersection) => {
                            intersection.intersect_with(&pair_docids);
                            intersection
                        },
                        None => pair_docids,
                    });
                }
                Ok(candidates.unwrap_or_default())
            },
            Operation::Or(_, ops) => {
                let mut candidates = RoaringBitmap::new();
                for op in ops {
                    let branch_docids = resolve_operation(ctx, op, number_typos, cache)?;
                    candidates.union_with(&branch_docids);
                }
                Ok(candidates)
            },
            Operation::Query(q) if q.kind.typo() == number_typos => query_docids(ctx, q),
            Operation::Query(_) => Ok(RoaringBitmap::new()),
        }
    }

    /// Resolves `operation` with exactly `typos` typos, going through the
    /// cache: a hit is cloned, a miss is computed then stored.
    fn resolve_cached<'t>(
        ctx: &'t dyn Context,
        operation: &Operation,
        typos: u8,
        cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
    ) -> anyhow::Result<RoaringBitmap>
    {
        let cache_key = (operation.clone(), typos);
        if let Some(cached) = cache.get(&cache_key) {
            return Ok(cached.clone());
        }

        let computed = resolve_operation(ctx, operation, typos, cache)?;
        cache.insert(cache_key, computed.clone());
        Ok(computed)
    }

    /// Resolves a list of `And` branches, trying every way of spending
    /// `mana` typos between the first branch and the remaining ones.
    fn mdfs<'t>(
        ctx: &'t dyn Context,
        branches: &[Operation],
        mana: u8,
        cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
    ) -> anyhow::Result<RoaringBitmap>
    {
        match branches.split_first() {
            None => Ok(RoaringBitmap::new()),
            // Last branch: it must consume all the remaining typos.
            Some((head, [])) => resolve_cached(ctx, head, mana, cache),
            Some((head, tail)) => {
                let mut candidates = RoaringBitmap::new();
                for m in 0..=mana {
                    let mut head_candidates = resolve_cached(ctx, head, m, cache)?;
                    if !head_candidates.is_empty() {
                        let tail_candidates = mdfs(ctx, tail, mana - m, cache)?;
                        head_candidates.intersect_with(&tail_candidates);
                        candidates.union_with(&head_candidates);
                    }
                }
                Ok(candidates)
            },
        }
    }

    resolve_operation(ctx, query_tree, number_typos, cache)
}
#[cfg(test)]
mod test {
use super::*;
use super::super::test::TestContext;
// Without a query tree nor facet candidates, the criterion has nothing to
// return.
#[test]
fn initial_placeholder_no_facets() {
    let context = TestContext::default();

    let mut criteria = Typo::initial(&context, None, None);

    assert!(criteria.next().unwrap().is_none());
}
// With a query tree and no facet candidates, the first two buckets are the
// zero-typo matches, then the one-typo matches.
#[test]
fn initial_query_tree_no_facets() {
    let context = TestContext::default();
    // "split" AND "this" AND "world", the latter tolerating one typo.
    let query_tree = Operation::Or(false, vec![
        Operation::And(vec![
            Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }),
            Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }),
            Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "world".to_string()) }),
        ])
    ]);

    let facet_candidates = None;

    let mut criteria = Typo::initial(&context, Some(query_tree), facet_candidates);

    // First bucket: the tolerant query is turned into the exact "world".
    let candidates_1 = context.word_docids("split").unwrap().unwrap()
        & context.word_docids("this").unwrap().unwrap()
        & context.word_docids("world").unwrap().unwrap();
    let expected_1 = CriterionResult {
        query_tree: Some(Operation::Or(false, vec![
            Operation::And(vec![
                Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }),
                Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }),
                Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }),
            ]),
        ])),
        candidates: candidates_1.clone(),
        bucket_candidates: candidates_1,
    };

    assert_eq!(criteria.next().unwrap(), Some(expected_1));

    // Second bucket: the tolerant query is expanded into "word" (one typo)
    // or "world"; documents containing "world" are not expected again.
    let candidates_2 = (
        context.word_docids("split").unwrap().unwrap()
        & context.word_docids("this").unwrap().unwrap()
        & context.word_docids("word").unwrap().unwrap()
    ) - context.word_docids("world").unwrap().unwrap();
    let expected_2 = CriterionResult {
        query_tree: Some(Operation::Or(false, vec![
            Operation::And(vec![
                Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }),
                Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }),
                Operation::Or(false, vec![
                    Operation::Query(Query { prefix: false, kind: QueryKind::exact_with_typo(1, "word".to_string()) }),
                    Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }),
                ]),
            ]),
        ])),
        candidates: candidates_2.clone(),
        bucket_candidates: candidates_2,
    };

    assert_eq!(criteria.next().unwrap(), Some(expected_2));
}
// Without a query tree, the facet candidates are returned as one single
// bucket.
#[test]
fn initial_placeholder_with_facets() {
    let context = TestContext::default();
    let facet_candidates = context.word_docids("earth").unwrap().unwrap();

    let mut criteria = Typo::initial(&context, None, Some(facet_candidates.clone()));

    // first iteration, returns the facet candidates
    let first = criteria.next().unwrap();
    assert_eq!(first, Some(CriterionResult {
        query_tree: None,
        candidates: facet_candidates.clone(),
        bucket_candidates: facet_candidates,
    }));

    // second iteration, returns None because there is no more things to do
    let second = criteria.next().unwrap();
    assert!(second.is_none());
}
/// With both a query tree and facet candidates, every bucket returned by the
/// typo criterion must be restricted to the facet candidates.
#[test]
fn initial_query_tree_with_facets() {
    let context = TestContext::default();

    // Small helpers to keep the expectations readable.
    let docids = |word: &str| context.word_docids(word).unwrap().unwrap();
    let exact = |word: &str| {
        Operation::Query(Query { prefix: false, kind: QueryKind::exact(word.to_string()) })
    };

    let query_tree = Operation::Or(false, vec![
        Operation::And(vec![
            exact("split"),
            exact("this"),
            Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "world".to_string()) }),
        ])
    ]);
    let facet_candidates = docids("earth");
    let mut criteria = Typo::initial(&context, Some(query_tree), Some(facet_candidates.clone()));

    // First bucket: zero typo allowed, "world" must match exactly.
    let zero_typo_docids = docids("split") & docids("this") & docids("world");
    let zero_typo_faceted = &zero_typo_docids & &facet_candidates;
    let expected_1 = CriterionResult {
        query_tree: Some(Operation::Or(false, vec![
            Operation::And(vec![
                exact("split"),
                exact("this"),
                exact("world"),
            ]),
        ])),
        candidates: zero_typo_faceted.clone(),
        bucket_candidates: zero_typo_faceted,
    };
    assert_eq!(criteria.next().unwrap(), Some(expected_1));

    // Second bucket: one typo allowed, "word" is accepted as a derivation of
    // "world"; documents already matched by "world" are excluded.
    let one_typo_docids = (docids("split") & docids("this") & docids("word")) - docids("world");
    let one_typo_faceted = &one_typo_docids & &facet_candidates;
    let expected_2 = CriterionResult {
        query_tree: Some(Operation::Or(false, vec![
            Operation::And(vec![
                exact("split"),
                exact("this"),
                Operation::Or(false, vec![
                    Operation::Query(Query { prefix: false, kind: QueryKind::exact_with_typo(1, "word".to_string()) }),
                    exact("world"),
                ]),
            ]),
        ])),
        candidates: one_typo_faceted.clone(),
        bucket_candidates: one_typo_faceted,
    };
    assert_eq!(criteria.next().unwrap(), Some(expected_2));
}
}

View File

@ -0,0 +1,128 @@
use std::collections::HashMap;
use std::mem::take;
use log::debug;
use roaring::RoaringBitmap;
use crate::search::query_tree::Operation;
use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context};
pub struct Words<'t> {
ctx: &'t dyn Context,
query_trees: Vec<Operation>,
candidates: Candidates,
bucket_candidates: RoaringBitmap,
parent: Option<Box<dyn Criterion + 't>>,
candidates_cache: HashMap<(Operation, u8), RoaringBitmap>,
}
impl<'t> Words<'t> {
pub fn initial(
ctx: &'t dyn Context,
query_tree: Option<Operation>,
candidates: Option<RoaringBitmap>,
) -> Self
{
Words {
ctx,
query_trees: query_tree.map(explode_query_tree).unwrap_or_default(),
candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed),
bucket_candidates: RoaringBitmap::new(),
parent: None,
candidates_cache: HashMap::default(),
}
}
pub fn new(ctx: &'t dyn Context, parent: Box<dyn Criterion + 't>) -> Self {
Words {
ctx,
query_trees: Vec::default(),
candidates: Candidates::default(),
bucket_candidates: RoaringBitmap::new(),
parent: Some(parent),
candidates_cache: HashMap::default(),
}
}
}
impl<'t> Criterion for Words<'t> {
fn next(&mut self) -> anyhow::Result<Option<CriterionResult>> {
use Candidates::{Allowed, Forbidden};
loop {
debug!("Words at iteration {} ({:?})", self.query_trees.len(), self.candidates);
match (self.query_trees.pop(), &mut self.candidates) {
(query_tree, Allowed(candidates)) if candidates.is_empty() => {
self.query_trees = Vec::new();
return Ok(Some(CriterionResult {
query_tree,
candidates: take(&mut self.candidates).into_inner(),
bucket_candidates: take(&mut self.bucket_candidates),
}));
},
(Some(qt), Allowed(candidates)) => {
let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache)?;
found_candidates.intersect_with(&candidates);
candidates.difference_with(&found_candidates);
let bucket_candidates = match self.parent {
Some(_) => take(&mut self.bucket_candidates),
None => found_candidates.clone(),
};
return Ok(Some(CriterionResult {
query_tree: Some(qt),
candidates: found_candidates,
bucket_candidates,
}));
},
(Some(qt), Forbidden(candidates)) => {
let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache)?;
found_candidates.difference_with(&candidates);
candidates.union_with(&found_candidates);
let bucket_candidates = match self.parent {
Some(_) => take(&mut self.bucket_candidates),
None => found_candidates.clone(),
};
return Ok(Some(CriterionResult {
query_tree: Some(qt),
candidates: found_candidates,
bucket_candidates,
}));
},
(None, Allowed(_)) => {
let candidates = take(&mut self.candidates).into_inner();
return Ok(Some(CriterionResult {
query_tree: None,
candidates: candidates.clone(),
bucket_candidates: candidates,
}));
},
(None, Forbidden(_)) => {
match self.parent.as_mut() {
Some(parent) => {
match parent.next()? {
Some(CriterionResult { query_tree, candidates, bucket_candidates }) => {
self.query_trees = query_tree.map(explode_query_tree).unwrap_or_default();
self.candidates = Candidates::Allowed(candidates);
self.bucket_candidates.union_with(&bucket_candidates);
},
None => return Ok(None),
}
},
None => return Ok(None),
}
},
}
}
}
}
/// Splits a query tree into the list of trees to resolve one by one.
///
/// An `Or` whose flag is `true` is replaced by its branches; any other
/// operation is returned as the single element of the list.
fn explode_query_tree(query_tree: Operation) -> Vec<Operation> {
    if let Operation::Or(true, branches) = query_tree {
        return branches;
    }
    vec![query_tree]
}

View File

@ -1,27 +1,21 @@
use std::borrow::Cow; use std::borrow::Cow;
use std::collections::{HashMap, HashSet};
use std::fmt; use std::fmt;
use std::time::Instant; use std::time::Instant;
use anyhow::{bail, Context};
use fst::{IntoStreamer, Streamer, Set}; use fst::{IntoStreamer, Streamer, Set};
use levenshtein_automata::DFA; use levenshtein_automata::{DFA, LevenshteinAutomatonBuilder as LevBuilder};
use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder;
use log::debug; use log::debug;
use meilisearch_tokenizer::{AnalyzerConfig, Analyzer}; use meilisearch_tokenizer::{AnalyzerConfig, Analyzer};
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use ordered_float::OrderedFloat;
use roaring::bitmap::RoaringBitmap; use roaring::bitmap::RoaringBitmap;
use crate::facet::FacetType; use crate::search::criteria::{Criterion, CriterionResult};
use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetLevelValueI64Codec}; use crate::{Index, DocumentId};
use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec};
use crate::mdfs::Mdfs;
use crate::query_tokens::{query_tokens, QueryToken};
use crate::{Index, FieldId, DocumentId, Criterion};
pub use self::facet::FacetIter;
pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator}; pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator};
pub use self::facet::{FacetIter}; pub use self::query_tree::MatchingWords;
use self::query_tree::QueryTreeBuilder;
// Building these factories is not free. // Building these factories is not free.
static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true)); static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));
@ -30,6 +24,7 @@ static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true));
mod facet; mod facet;
mod query_tree; mod query_tree;
mod criteria;
pub struct Search<'a> { pub struct Search<'a> {
query: Option<String>, query: Option<String>,
@ -65,208 +60,23 @@ impl<'a> Search<'a> {
self self
} }
/// Extracts the query words from the query string and returns, for each of
/// them, the word, whether it is treated as a prefix, and the matching DFA.
///
/// Quoted words and free words of at most 3 bytes accept no typo and are
/// never prefixes. Other words accept 0, 1 or 2 typos depending on their
/// byte length (up to 4, up to 8, more). Only the last word can be a prefix,
/// and only when the query does not end with a whitespace.
/// TODO introduce settings for the number of typos regarding the words lengths.
fn generate_query_dfas(query: &str) -> Vec<(String, bool, DFA)> {
    let stop_words = Set::default();
    let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
    let analyzed = analyzer.analyze(query);
    let words: Vec<_> = query_tokens(analyzed.tokens()).collect();

    let ends_with_whitespace = query.chars().last().map_or(false, char::is_whitespace);
    let number_of_words = words.len();

    let mut dfas = Vec::with_capacity(number_of_words);
    for (index, token) in words.into_iter().enumerate() {
        let (word, quoted) = match token {
            QueryToken::Free(token) => (token.text().to_string(), token.text().len() <= 3),
            QueryToken::Quoted(token) => (token.text().to_string(), true),
        };

        let is_prefix = !quoted && !ends_with_whitespace && index + 1 == number_of_words;

        // Quoted words never tolerate typos, whatever their length.
        let builder = if quoted {
            &LEVDIST0
        } else {
            match word.len() {
                0..=4 => &LEVDIST0,
                5..=8 => &LEVDIST1,
                _ => &LEVDIST2,
            }
        };

        let dfa = match is_prefix {
            true => builder.build_prefix_dfa(&word),
            false => builder.build_dfa(&word),
        };

        dfas.push((word, is_prefix, dfa));
    }

    dfas
}
/// Fetch the words from the given FST related to the given DFAs along with
/// the associated documents ids.
///
/// One entry is returned per DFA, in the same order. Each entry holds the map
/// of the words accepted by the DFA (with their distance to the original word
/// and their documents ids) and the union of all those documents ids.
///
/// # Errors
///
/// Returns an error when a word of the FST is not valid UTF-8, when the
/// database cannot be read, or when a word of the FST has no entry in the
/// `word_docids` database (the FST and the database are then out of sync).
fn fetch_words_docids(
    &self,
    fst: &fst::Set<Cow<[u8]>>,
    dfas: Vec<(String, bool, DFA)>,
) -> anyhow::Result<Vec<(HashMap<String, (u8, RoaringBitmap)>, RoaringBitmap)>>
{
    // A Vec storing all the derived words from the original query words, associated
    // with the distance from the original word and the docids where the words appears.
    let mut derived_words = Vec::<(HashMap::<String, (u8, RoaringBitmap)>, RoaringBitmap)>::with_capacity(dfas.len());

    for (_word, _is_prefix, dfa) in dfas {
        let mut acc_derived_words = HashMap::new();
        let mut unions_docids = RoaringBitmap::new();
        let mut stream = fst.search_with_state(&dfa).into_stream();
        while let Some((word, state)) = stream.next() {
            let word = std::str::from_utf8(word)?;
            // Report an inconsistency between the FST and the database as an
            // error instead of panicking in the middle of a search.
            let docids = self.index.word_docids.get(self.rtxn, word)?
                .with_context(|| format!("word {:?} is in the words FST but has no documents ids", word))?;
            let distance = dfa.distance(state);
            unions_docids.union_with(&docids);
            acc_derived_words.insert(word.to_string(), (distance.to_u8(), docids));
        }
        derived_words.push((acc_derived_words, unions_docids));
    }

    Ok(derived_words)
}
/// Returns the set of docids that contains all of the query words.
///
/// Each entry of `derived_words` carries the union of the docids of the words
/// derived from one query word; the result is the intersection of all those
/// unions, or an empty bitmap when there is no entry.
fn compute_candidates(
    derived_words: &[(HashMap<String, (u8, RoaringBitmap)>, RoaringBitmap)],
) -> RoaringBitmap
{
    // Start from the smallest unions, this way intersections are faster.
    let mut unions: Vec<&RoaringBitmap> = derived_words.iter().map(|(_, docids)| docids).collect();
    unions.sort_unstable_by_key(|docids| docids.len());

    let mut unions = unions.into_iter();
    let mut candidates = match unions.next() {
        Some(smallest) => smallest.clone(),
        None => return RoaringBitmap::new(),
    };
    for docids in unions {
        candidates.intersect_with(docids);
    }

    candidates
}
/// Returns at most `limit` documents ids taken from `documents_ids`, ordered
/// by the value of the facet `field_id` (ascending or descending).
///
/// Two strategies are used for numeric facets: for 1000 documents or less the
/// facet value of each document is read directly and the documents are sorted,
/// otherwise a `FacetIter` is used to walk the facet values in order.
///
/// When fewer than `limit` documents were produced this way, the list is
/// completed with the remaining documents of `documents_ids`.
///
/// # Errors
///
/// Fails when `facet_type` is `FacetType::String` or when a database read fails.
fn facet_ordered(
&self,
field_id: FieldId,
facet_type: FacetType,
ascending: bool,
mut documents_ids: RoaringBitmap,
limit: usize,
) -> anyhow::Result<Vec<DocumentId>>
{
let mut output: Vec<_> = match facet_type {
FacetType::Float => {
// Few candidates: read the facet value of each document and sort.
if documents_ids.len() <= 1000 {
let db = self.index.field_id_docid_facet_values.remap_key_type::<FieldDocIdFacetF64Codec>();
let mut docids_values = Vec::with_capacity(documents_ids.len() as usize);
for docid in documents_ids.iter() {
// The range covers every float value stored for this (field, document) pair.
let left = (field_id, docid, f64::MIN);
let right = (field_id, docid, f64::MAX);
let mut iter = db.range(self.rtxn, &(left..=right))?;
// Keep the first entry of the range when ascending, the last one otherwise.
let entry = if ascending { iter.next() } else { iter.last() };
if let Some(((_, _, value), ())) = entry.transpose()? {
docids_values.push((docid, OrderedFloat(value)));
}
}
docids_values.sort_unstable_by_key(|(_, value)| *value);
let iter = docids_values.into_iter().map(|(id, _)| id);
if ascending {
iter.take(limit).collect()
} else {
iter.rev().take(limit).collect()
}
} else {
// Many candidates: iterate over the facet values in order instead.
let facet_fn = if ascending {
FacetIter::<f64, FacetLevelValueF64Codec>::new_reducing
} else {
FacetIter::<f64, FacetLevelValueF64Codec>::new_reverse_reducing
};
let mut limit_tmp = limit;
let mut output = Vec::new();
for result in facet_fn(self.rtxn, self.index, field_id, documents_ids.clone())? {
let (_val, docids) = result?;
// Stop as soon as the collected groups hold at least `limit` documents.
limit_tmp = limit_tmp.saturating_sub(docids.len() as usize);
output.push(docids);
if limit_tmp == 0 { break }
}
output.into_iter().flatten().take(limit).collect()
}
},
FacetType::Integer => {
// Same two strategies as for floats, with the integer codecs.
if documents_ids.len() <= 1000 {
let db = self.index.field_id_docid_facet_values.remap_key_type::<FieldDocIdFacetI64Codec>();
let mut docids_values = Vec::with_capacity(documents_ids.len() as usize);
for docid in documents_ids.iter() {
let left = (field_id, docid, i64::MIN);
let right = (field_id, docid, i64::MAX);
let mut iter = db.range(self.rtxn, &(left..=right))?;
let entry = if ascending { iter.next() } else { iter.last() };
if let Some(((_, _, value), ())) = entry.transpose()? {
docids_values.push((docid, value));
}
}
docids_values.sort_unstable_by_key(|(_, value)| *value);
let iter = docids_values.into_iter().map(|(id, _)| id);
if ascending {
iter.take(limit).collect()
} else {
iter.rev().take(limit).collect()
}
} else {
let facet_fn = if ascending {
FacetIter::<i64, FacetLevelValueI64Codec>::new_reducing
} else {
FacetIter::<i64, FacetLevelValueI64Codec>::new_reverse_reducing
};
let mut limit_tmp = limit;
let mut output = Vec::new();
for result in facet_fn(self.rtxn, self.index, field_id, documents_ids.clone())? {
let (_val, docids) = result?;
limit_tmp = limit_tmp.saturating_sub(docids.len() as usize);
output.push(docids);
if limit_tmp == 0 { break }
}
output.into_iter().flatten().take(limit).collect()
}
},
FacetType::String => bail!("criteria facet type must be a number"),
};
// if there isn't enough documents to return we try to complete that list
// with documents that are maybe not faceted under this field and therefore
// not returned by the previous facet iteration.
if output.len() < limit {
// Remove what is already returned so that no document appears twice.
output.iter().for_each(|n| { documents_ids.remove(*n); });
let remaining = documents_ids.iter().take(limit - output.len());
output.extend(remaining);
}
Ok(output)
}
pub fn execute(&self) -> anyhow::Result<SearchResult> { pub fn execute(&self) -> anyhow::Result<SearchResult> {
let limit = self.limit; // We create the query tree by splitting the query into tokens.
let fst = self.index.words_fst(self.rtxn)?; let before = Instant::now();
let query_tree = match self.query.as_ref() {
// Construct the DFAs related to the query words. Some(query) => {
let derived_words = match self.query.as_deref().map(Self::generate_query_dfas) { let builder = QueryTreeBuilder::new(self.rtxn, self.index);
Some(dfas) if !dfas.is_empty() => Some(self.fetch_words_docids(&fst, dfas)?), let stop_words = &Set::default();
_otherwise => None, let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
let result = analyzer.analyze(query);
let tokens = result.tokens();
builder.build(tokens)?
},
None => None,
}; };
debug!("query tree: {:?} took {:.02?}", query_tree, before.elapsed());
// We create the original candidates with the facet conditions results. // We create the original candidates with the facet conditions results.
let before = Instant::now(); let before = Instant::now();
let facet_candidates = match &self.facet_condition { let facet_candidates = match &self.facet_condition {
@ -276,100 +86,42 @@ impl<'a> Search<'a> {
debug!("facet candidates: {:?} took {:.02?}", facet_candidates, before.elapsed()); debug!("facet candidates: {:?} took {:.02?}", facet_candidates, before.elapsed());
let order_by_facet = { let matching_words = match query_tree.as_ref() {
let criteria = self.index.criteria(self.rtxn)?; Some(query_tree) => MatchingWords::from_query_tree(&query_tree),
let result = criteria.into_iter().flat_map(|criterion| { None => MatchingWords::default(),
match criterion {
Criterion::Asc(fid) => Some((fid, true)),
Criterion::Desc(fid) => Some((fid, false)),
_ => None
}
}).next();
match result {
Some((attr_name, is_ascending)) => {
let field_id_map = self.index.fields_ids_map(self.rtxn)?;
let fid = field_id_map.id(&attr_name).with_context(|| format!("unknown field: {:?}", attr_name))?;
let faceted_fields = self.index.faceted_fields_ids(self.rtxn)?;
let ftype = *faceted_fields.get(&fid)
.with_context(|| format!("{:?} not found in the faceted fields.", attr_name))
.expect("corrupted data: ");
Some((fid, ftype, is_ascending))
},
None => None,
}
}; };
let before = Instant::now(); let criteria_builder = criteria::CriteriaBuilder::new(self.rtxn, self.index)?;
let (candidates, derived_words) = match (facet_candidates, derived_words) { let mut criteria = criteria_builder.build(query_tree, facet_candidates)?;
(Some(mut facet_candidates), Some(derived_words)) => {
let words_candidates = Self::compute_candidates(&derived_words);
facet_candidates.intersect_with(&words_candidates);
(facet_candidates, derived_words)
},
(None, Some(derived_words)) => {
(Self::compute_candidates(&derived_words), derived_words)
},
(Some(facet_candidates), None) => {
// If the query is not set or results in no DFAs but
// there is some facet conditions we return a placeholder.
let documents_ids = match order_by_facet {
Some((fid, ftype, is_ascending)) => {
self.facet_ordered(fid, ftype, is_ascending, facet_candidates.clone(), limit)?
},
None => facet_candidates.iter().take(limit).collect(),
};
return Ok(SearchResult {
documents_ids,
candidates: facet_candidates,
..Default::default()
})
},
(None, None) => {
// If the query is not set or results in no DFAs we return a placeholder.
let all_docids = self.index.documents_ids(self.rtxn)?;
let documents_ids = match order_by_facet {
Some((fid, ftype, is_ascending)) => {
self.facet_ordered(fid, ftype, is_ascending, all_docids.clone(), limit)?
},
None => all_docids.iter().take(limit).collect(),
};
return Ok(SearchResult { documents_ids, candidates: all_docids,..Default::default() })
},
};
debug!("candidates: {:?} took {:.02?}", candidates, before.elapsed()); let mut offset = self.offset;
let mut limit = self.limit;
let mut documents_ids = Vec::new();
let mut initial_candidates = RoaringBitmap::new();
while let Some(CriterionResult { candidates, bucket_candidates, .. }) = criteria.next()? {
// The mana depth first search is a revised DFS that explore debug!("Number of candidates found {}", candidates.len());
// solutions in the order of their proximities.
let mut mdfs = Mdfs::new(self.index, self.rtxn, &derived_words, candidates.clone());
let mut documents = Vec::new();
// We execute the Mdfs iterator until we find enough documents. let mut len = candidates.len() as usize;
while documents.iter().map(RoaringBitmap::len).sum::<u64>() < limit as u64 { let mut candidates = candidates.into_iter();
match mdfs.next().transpose()? {
Some((proximity, answer)) => { initial_candidates.union_with(&bucket_candidates);
debug!("answer with a proximity of {}: {:?}", proximity, answer);
documents.push(answer); if offset != 0 {
}, candidates.by_ref().skip(offset).for_each(drop);
None => break, offset = offset.saturating_sub(len.min(offset));
} len = len.saturating_sub(len.min(offset));
} }
let found_words = derived_words.into_iter().flat_map(|(w, _)| w).map(|(w, _)| w).collect(); if len != 0 {
let documents_ids = match order_by_facet { documents_ids.extend(candidates.take(limit));
Some((fid, ftype, order)) => { limit = limit.saturating_sub(len.min(limit));
let mut ordered_documents = Vec::new();
for documents_ids in documents {
let docids = self.facet_ordered(fid, ftype, order, documents_ids, limit)?;
ordered_documents.push(docids);
if ordered_documents.iter().map(Vec::len).sum::<usize>() >= limit { break }
} }
ordered_documents.into_iter().flatten().take(limit).collect()
},
None => documents.into_iter().flatten().take(limit).collect(),
};
Ok(SearchResult { found_words, candidates, documents_ids }) if limit == 0 { break }
}
Ok(SearchResult { matching_words, candidates: initial_candidates, documents_ids })
} }
} }
@ -387,28 +139,21 @@ impl fmt::Debug for Search<'_> {
#[derive(Default)] #[derive(Default)]
pub struct SearchResult { pub struct SearchResult {
pub found_words: HashSet<String>, pub matching_words: MatchingWords,
pub candidates: RoaringBitmap, pub candidates: RoaringBitmap,
// TODO those documents ids should be associated with their criteria scores. // TODO those documents ids should be associated with their criteria scores.
pub documents_ids: Vec<DocumentId>, pub documents_ids: Vec<DocumentId>,
} }
pub fn word_typos(word: &str, is_prefix: bool, max_typo: u8, fst: &fst::Set<Cow<[u8]>>) -> anyhow::Result<Vec<(String, u8)>> { pub fn word_derivations(
let dfa = { word: &str,
let lev = match max_typo { is_prefix: bool,
0 => &LEVDIST0, max_typo: u8,
1 => &LEVDIST1, fst: &fst::Set<Cow<[u8]>>,
_ => &LEVDIST2, ) -> anyhow::Result<Vec<(String, u8)>>
}; {
if is_prefix {
lev.build_prefix_dfa(&word)
} else {
lev.build_dfa(&word)
}
};
let mut derived_words = Vec::new(); let mut derived_words = Vec::new();
let dfa = build_dfa(word, max_typo, is_prefix);
let mut stream = fst.search_with_state(&dfa).into_stream(); let mut stream = fst.search_with_state(&dfa).into_stream();
while let Some((word, state)) = stream.next() { while let Some((word, state)) = stream.next() {
@ -419,3 +164,17 @@ pub fn word_typos(word: &str, is_prefix: bool, max_typo: u8, fst: &fst::Set<Cow<
Ok(derived_words) Ok(derived_words)
} }
/// Builds the Levenshtein DFA that matches `word` with the given number of
/// typos, as a prefix DFA when `is_prefix` is set.
///
/// Any `typos` value above 1 uses the builder configured for a distance of 2.
pub fn build_dfa(word: &str, typos: u8, is_prefix: bool) -> DFA {
    let builder = if typos == 0 {
        &LEVDIST0
    } else if typos == 1 {
        &LEVDIST1
    } else {
        &LEVDIST2
    };

    match is_prefix {
        true => builder.build_prefix_dfa(word),
        false => builder.build_dfa(word),
    }
}

View File

@ -1,14 +1,13 @@
#![allow(unused)] use std::collections::HashSet;
use std::borrow::Cow;
use std::collections::BTreeMap;
use std::{fmt, cmp, mem}; use std::{fmt, cmp, mem};
use levenshtein_automata::{DFA, Distance};
use meilisearch_tokenizer::{TokenKind, tokenizer::TokenStream}; use meilisearch_tokenizer::{TokenKind, tokenizer::TokenStream};
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use slice_group_by::GroupBy; use slice_group_by::GroupBy;
use crate::Index; use crate::Index;
use super::build_dfa;
type IsOptionalWord = bool; type IsOptionalWord = bool;
type IsPrefix = bool; type IsPrefix = bool;
@ -81,6 +80,13 @@ impl Operation {
Self::Consecutive(ops) Self::Consecutive(ops)
} }
} }
/// Returns the inner query when this operation is an `Operation::Query`,
/// `None` for any other kind of operation.
pub fn query(&self) -> Option<&Query> {
    if let Operation::Query(query) = self {
        Some(query)
    } else {
        None
    }
}
} }
#[derive(Clone, Eq, PartialEq, Hash)] #[derive(Clone, Eq, PartialEq, Hash)]
@ -96,14 +102,26 @@ pub enum QueryKind {
} }
impl QueryKind { impl QueryKind {
fn exact(word: String) -> Self { pub fn exact(word: String) -> Self {
QueryKind::Exact { original_typo: 0, word } QueryKind::Exact { original_typo: 0, word }
} }
fn tolerant(typo: u8, word: String) -> Self { pub fn exact_with_typo(original_typo: u8, word: String) -> Self {
QueryKind::Exact { original_typo, word }
}
pub fn tolerant(typo: u8, word: String) -> Self {
QueryKind::Tolerant { typo, word } QueryKind::Tolerant { typo, word }
} }
/// Returns `true` when this kind is the `Tolerant` variant.
pub fn is_tolerant(&self) -> bool {
    match self {
        QueryKind::Tolerant { .. } => true,
        _ => false,
    }
}
/// Returns `true` when this kind is the `Exact` variant.
pub fn is_exact(&self) -> bool {
    match self {
        QueryKind::Exact { .. } => true,
        _ => false,
    }
}
pub fn typo(&self) -> u8 { pub fn typo(&self) -> u8 {
match self { match self {
QueryKind::Tolerant { typo, .. } => *typo, QueryKind::Tolerant { typo, .. } => *typo,
@ -266,69 +284,45 @@ fn synonyms(ctx: &impl Context, word: &[&str]) -> heed::Result<Option<Vec<Operat
} }
/// The query tree builder is the interface to build a query tree. /// The query tree builder is the interface to build a query tree.
#[derive(Default)]
pub struct MatchingWords { pub struct MatchingWords {
inner: BTreeMap<String, IsPrefix> dfas: Vec<(DFA, u8)>,
} }
impl MatchingWords { impl MatchingWords {
/// List all words which can be considered as a match for the query tree. /// List all words which can be considered as a match for the query tree.
pub fn from_query_tree(tree: &Operation, fst: &fst::Set<Cow<[u8]>>) -> Self { pub fn from_query_tree(tree: &Operation) -> Self {
Self { inner: fetch_words(tree, fst).into_iter().collect() } Self {
dfas: fetch_queries(tree).into_iter().map(|(w, t, p)| (build_dfa(w, t, p), t)).collect()
}
} }
/// Return true if the word match. /// Return true if the word match.
pub fn is_match(&self, word: &str) -> bool { pub fn matches(&self, word: &str) -> bool {
fn first_char(s: &str) -> Option<&str> { self.dfas.iter().any(|(dfa, typo)| match dfa.eval(word) {
s.chars().next().map(|c| &s[..c.len_utf8()]) Distance::Exact(t) => t <= *typo,
} Distance::AtLeast(_) => false,
})
match first_char(word) {
Some(first) => {
let left = first.to_owned();
let right = word.to_owned();
self.inner.range(left..=right).any(|(w, is_prefix)| *is_prefix || *w == word)
},
None => false
}
} }
} }
type FetchedWords = Vec<(String, IsPrefix)>;
/// Lists all words which can be considered as a match for the query tree. /// Lists all words which can be considered as a match for the query tree.
fn fetch_words(tree: &Operation, fst: &fst::Set<Cow<[u8]>>) -> FetchedWords { fn fetch_queries(tree: &Operation) -> HashSet<(&str, u8, IsPrefix)> {
fn resolve_branch(tree: &[Operation], fst: &fst::Set<Cow<[u8]>>) -> FetchedWords { fn resolve_ops<'a>(tree: &'a Operation, out: &mut HashSet<(&'a str, u8, IsPrefix)>) {
tree.iter().map(|op| resolve_ops(op, fst)).flatten().collect()
}
fn resolve_query(query: &Query, fst: &fst::Set<Cow<[u8]>>) -> FetchedWords {
match query.kind.clone() {
QueryKind::Exact { word, .. } => vec![(word, query.prefix)],
QueryKind::Tolerant { typo, word } => {
if let Ok(words) = super::word_typos(&word, query.prefix, typo, fst) {
words.into_iter().map(|(w, _)| (w, query.prefix)).collect()
} else {
vec![(word, query.prefix)]
}
}
}
}
fn resolve_ops(tree: &Operation, fst: &fst::Set<Cow<[u8]>>) -> FetchedWords {
match tree { match tree {
Operation::Or(_, ops) | Operation::And(ops) | Operation::Consecutive(ops) => { Operation::Or(_, ops) | Operation::And(ops) | Operation::Consecutive(ops) => {
resolve_branch(ops.as_slice(), fst) ops.as_slice().iter().for_each(|op| resolve_ops(op, out));
}, },
Operation::Query(ops) => { Operation::Query(Query { prefix, kind }) => {
resolve_query(ops, fst) let typo = if kind.is_exact() { 0 } else { kind.typo() };
out.insert((kind.word(), typo, *prefix));
}, },
} }
} }
let mut words = resolve_ops(tree, fst); let mut queries = HashSet::new();
words.sort_unstable(); resolve_ops(tree, &mut queries);
words.dedup(); queries
words
} }
/// Main function that creates the final query tree from the primitive query. /// Main function that creates the final query tree from the primitive query.
@ -537,7 +531,10 @@ pub fn maximum_proximity(operation: &Operation) -> usize {
use Operation::{Or, And, Query, Consecutive}; use Operation::{Or, And, Query, Consecutive};
match operation { match operation {
Or(_, ops) => ops.iter().map(maximum_proximity).max().unwrap_or(0), Or(_, ops) => ops.iter().map(maximum_proximity).max().unwrap_or(0),
And(ops) => ops.len().saturating_sub(1) * 8, And(ops) => {
ops.iter().map(maximum_proximity).sum::<usize>()
+ ops.len().saturating_sub(1) * 7
},
Query(_) | Consecutive(_) => 0, Query(_) | Consecutive(_) => 0,
} }
} }
@ -547,7 +544,7 @@ mod test {
use std::collections::HashMap; use std::collections::HashMap;
use fst::Set; use fst::Set;
use maplit::hashmap; use maplit::{hashmap, hashset};
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
use rand::{Rng, SeedableRng, rngs::StdRng}; use rand::{Rng, SeedableRng, rngs::StdRng};
@ -958,26 +955,26 @@ mod test {
let context = TestContext::default(); let context = TestContext::default();
let query_tree = context.build(false, true, tokens).unwrap().unwrap(); let query_tree = context.build(false, true, tokens).unwrap().unwrap();
let expected = vec![ let expected = hashset!{
("city".to_string(), false), ("word", 0, false),
("earth".to_string(), false), ("nyc", 0, false),
("nature".to_string(), false), ("wordsplit", 2, false),
("new".to_string(), false), ("wordsplitnycworld", 2, true),
("nyc".to_string(), false), ("nature", 0, false),
("split".to_string(), false), ("new", 0, false),
("word".to_string(), false), ("city", 0, false),
("word".to_string(), true), ("world", 1, true),
("world".to_string(), true), ("york", 0, false),
("york".to_string(), false), ("split", 0, false),
("nycworld", 1, true),
]; ("earth", 0, false),
("wordsplitnyc", 2, false),
};
let mut keys = context.postings.keys().collect::<Vec<_>>(); let mut keys = context.postings.keys().collect::<Vec<_>>();
keys.sort_unstable(); keys.sort_unstable();
let set = fst::Set::from_iter(keys).unwrap().map_data(|v| Cow::Owned(v)).unwrap();
let words = fetch_words(&query_tree, &set);
let words = fetch_queries(&query_tree);
assert_eq!(expected, words); assert_eq!(expected, words);
} }
} }

View File

@ -13,7 +13,7 @@ use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType};
use heed::BytesEncode; use heed::BytesEncode;
use linked_hash_map::LinkedHashMap; use linked_hash_map::LinkedHashMap;
use log::{debug, info}; use log::{debug, info};
use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind, token::SeparatorKind};
use ordered_float::OrderedFloat; use ordered_float::OrderedFloat;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use serde_json::Value; use serde_json::Value;
@ -274,13 +274,15 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
self.insert_words_pairs_proximities_docids(words_pair_proximities, document_id)?; self.insert_words_pairs_proximities_docids(words_pair_proximities, document_id)?;
// We store document_id associated with all the words the record contains. // We store document_id associated with all the words the record contains.
for (word, _) in words_positions.drain() { for (word, _) in words_positions.iter() {
self.insert_word_docid(&word, document_id)?; self.insert_word_docid(word, document_id)?;
} }
self.documents_writer.insert(document_id.to_be_bytes(), record)?; self.documents_writer.insert(document_id.to_be_bytes(), record)?;
Self::write_docid_word_positions(&mut self.docid_word_positions_writer, document_id, words_positions)?; Self::write_docid_word_positions(&mut self.docid_word_positions_writer, document_id, words_positions)?;
words_positions.clear();
// We store document_id associated with all the field id and values. // We store document_id associated with all the field id and values.
for (field, values) in facet_values.drain() { for (field, values) in facet_values.drain() {
for value in values { for value in values {
@ -471,14 +473,11 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
}; };
let analyzed = self.analyzer.analyze(&content); let analyzed = self.analyzer.analyze(&content);
let tokens = analyzed let tokens = process_tokens(analyzed.tokens());
.tokens()
.filter(|t| t.is_word())
.map(|t| t.text().to_string());
for (pos, word) in tokens.enumerate().take(MAX_POSITION) { for (pos, token) in tokens.take_while(|(pos, _)| *pos < MAX_POSITION) {
let position = (attr as usize * MAX_POSITION + pos) as u32; let position = (attr as usize * MAX_POSITION + pos) as u32;
words_positions.entry(word).or_insert_with(SmallVec32::new).push(position); words_positions.entry(token.text().to_string()).or_insert_with(SmallVec32::new).push(position);
} }
} }
} }
@ -609,6 +608,36 @@ enum FacetValue {
Integer(i64), Integer(i64),
} }
/// Takes an iterator on tokens and computes their relative position depending
/// on the kind of the separators found between them.
///
/// Leading separators are skipped. A word that follows a `Hard` separator is
/// placed 8 positions after the previous word, otherwise the standard
/// distance of 1 is used. A `Soft` separator that comes after a `Hard` one
/// does not cancel it. Only the tokens for which `is_word` returns `true` are
/// yielded, along with their position.
fn process_tokens<'a>(tokens: impl Iterator<Item = Token<'a>>) -> impl Iterator<Item = (usize, Token<'a>)> {
    let without_leading_separators = tokens.skip_while(|token| token.is_separator().is_some());

    let positioned = without_leading_separators.scan((0, None), |state, token| {
        let (position, last_kind) = state;
        let after_hard = *last_kind == Some(TokenKind::Separator(SeparatorKind::Hard));

        match token.kind {
            TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => {
                // The very first token stays at position 0.
                let step = if after_hard {
                    8
                } else if last_kind.is_some() {
                    1
                } else {
                    0
                };
                *position += step;
                *last_kind = Some(token.kind);
            }
            TokenKind::Separator(SeparatorKind::Hard) => {
                *last_kind = Some(token.kind);
            }
            // A soft separator must not hide a hard one seen just before.
            TokenKind::Separator(SeparatorKind::Soft) if !after_hard => {
                *last_kind = Some(token.kind);
            }
            _ => (),
        }

        Some((*position, token))
    });

    positioned.filter(|(_, token)| token.is_word())
}
fn parse_facet_value(ftype: FacetType, value: &Value) -> anyhow::Result<SmallVec8<FacetValue>> { fn parse_facet_value(ftype: FacetType, value: &Value) -> anyhow::Result<SmallVec8<FacetValue>> {
use FacetValue::*; use FacetValue::*;

View File

@ -41,7 +41,7 @@ impl<'t, 'u, 'i> WordsPrefixes<'t, 'u, 'i> {
chunk_fusing_shrink_size: None, chunk_fusing_shrink_size: None,
max_nb_chunks: None, max_nb_chunks: None,
max_memory: None, max_memory: None,
threshold: 0.01, // 1% threshold: 0.1 / 100.0, // 0.1%
max_prefix_length: 4, max_prefix_length: 4,
_update_id: update_id, _update_id: update_id,
} }