diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs
index a3c4b89af..17cb8c47c 100644
--- a/meilisearch-core/src/bucket_sort.rs
+++ b/meilisearch-core/src/bucket_sort.rs
@@ -28,6 +28,7 @@ use crate::distinct_map::{BufferedDistinctMap, DistinctMap};
 use crate::raw_document::RawDocument;
 use crate::{database::MainT, reordered_attrs::ReorderedAttrs};
 use crate::{store, Document, DocumentId, MResult};
+use crate::query_tree::create_query_tree;
 
 pub fn bucket_sort<'c, FI>(
     reader: &heed::RoTxn<MainT>,
@@ -46,6 +47,9 @@
 where
     FI: Fn(DocumentId) -> bool,
 {
+    let operation = create_query_tree(reader, postings_lists_store, synonyms_store, query).unwrap();
+    println!("{:?}", operation);
+
     // We delegate the filter work to the distinct query builder,
     // specifying a distinct rule that has no effect.
     if filter.is_some() {
diff --git a/meilisearch-core/src/lib.rs b/meilisearch-core/src/lib.rs
index 3d2dd4b67..755cb4759 100644
--- a/meilisearch-core/src/lib.rs
+++ b/meilisearch-core/src/lib.rs
@@ -10,6 +10,7 @@ mod error;
 mod levenshtein;
 mod number;
 mod query_builder;
+mod query_tree;
 mod ranked_map;
 mod raw_document;
 mod reordered_attrs;
diff --git a/meilisearch-core/src/query_tree.rs b/meilisearch-core/src/query_tree.rs
new file mode 100644
index 000000000..17bf5f483
--- /dev/null
+++ b/meilisearch-core/src/query_tree.rs
@@ -0,0 +1,354 @@
+use std::borrow::Cow;
+use std::collections::HashMap;
+use std::time::Instant;
+use std::{cmp, fmt, iter::once};
+
+use sdset::{Set, SetBuf, SetOperation};
+use slice_group_by::StrGroupBy;
+use itertools::{EitherOrBoth, merge_join_by};
+
+use crate::database::MainT;
+use crate::{store, DocumentId, DocIndex, MResult};
+
+#[derive(PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub enum Operation {
+    And(Vec<Operation>),
+    Or(Vec<Operation>),
+    Query(Query),
+}
+
+impl fmt::Debug for Operation {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        fn pprint_tree(f: &mut fmt::Formatter<'_>, op: &Operation, depth: usize) -> fmt::Result {
+            match op {
+                Operation::And(children) => {
+                    writeln!(f, "{:1$}AND", "", depth * 2)?;
+                    children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
+                },
+                Operation::Or(children) => {
+                    writeln!(f, "{:1$}OR", "", depth * 2)?;
+                    children.iter().try_for_each(|c| pprint_tree(f, c, depth + 1))
+                },
+                Operation::Query(query) => writeln!(f, "{:2$}{:?}", "", query, depth * 2),
+            }
+        }
+
+        pprint_tree(f, self, 0)
+    }
+}
+
+pub type QueryId = usize;
+
+#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub enum Query {
+    Tolerant(QueryId, String),
+    Exact(QueryId, String),
+    Prefix(QueryId, String),
+    Phrase(QueryId, Vec<String>),
+}
+
+impl Query {
+    fn tolerant(id: QueryId, s: &str) -> Query {
+        Query::Tolerant(id, s.to_string())
+    }
+
+    fn prefix(id: QueryId, s: &str) -> Query {
+        Query::Prefix(id, s.to_string())
+    }
+
+    fn phrase2(id: QueryId, (left, right): (&str, &str)) -> Query {
+        Query::Phrase(id, vec![left.to_owned(), right.to_owned()])
+    }
+}
+
+#[derive(Debug, Default)]
+pub struct PostingsList {
+    docids: SetBuf<DocumentId>,
+    matches: SetBuf<DocIndex>,
+}
+
+#[derive(Debug, Default)]
+pub struct Context {
+    pub synonyms: HashMap<Vec<String>, Vec<Vec<String>>>,
+    pub postings: HashMap<String, PostingsList>,
+}
+
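+/// Tries every split position inside `word` and returns the split
+/// whose less frequent half is the most frequent, keeping only the
+/// splits where both halves appear in the postings lists.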
+fn split_best_frequency<'a>(
+    reader: &heed::RoTxn<MainT>,
+    postings_lists: store::PostingsLists,
+    word: &'a str,
+) -> MResult<Option<(&'a str, &'a str)>>
+{
+    let chars = word.char_indices().skip(1);
+    let mut best = None;
+
+    for (i, _) in chars {
+        let (left, right) = word.split_at(i);
+
+        let left_freq = postings_lists.postings_list(reader, left.as_bytes())?.map(|pl| pl.len()).unwrap_or(0);
+        let right_freq = postings_lists.postings_list(reader, right.as_bytes())?.map(|pl| pl.len()).unwrap_or(0);
+
+        let min_freq = cmp::min(left_freq, right_freq);
+        if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) {
+            best = Some((min_freq, left, right));
+        }
+    }
+
+    Ok(best.map(|(_, l, r)| (l, r)))
+}
+
+fn fetch_synonyms(
+    reader: &heed::RoTxn<MainT>,
+    synonyms: store::Synonyms,
+    words: &[&str],
+) -> MResult<Vec<Vec<String>>>
+{
+    let words = words.join(" "); // TODO ugly
+    // synonyms.synonyms(reader, words.as_bytes()).cloned().unwrap_or_default()
+    Ok(vec![])
+}
+
+fn is_last<I: IntoIterator>(iter: I) -> impl Iterator<Item=(bool, I::Item)> {
+    let mut iter = iter.into_iter().peekable();
+    core::iter::from_fn(move || {
+        iter.next().map(|item| (iter.peek().is_none(), item))
+    })
+}
+
+fn create_operation<I, F>(iter: I, f: F) -> Operation
+where I: IntoIterator<Item=Operation>,
+      F: Fn(Vec<Operation>) -> Operation,
+{
+    let mut iter = iter.into_iter();
+    match (iter.next(), iter.next()) {
+        (Some(first), None) => first,
+        (first, second) => f(first.into_iter().chain(second).chain(iter).collect()),
+    }
+}
+
+const MAX_NGRAM: usize = 3;
+
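+/// Builds the query tree: the query is split into words, every group of
+/// up to MAX_NGRAM consecutive words is considered, and each word or group
+/// is expanded into its alternatives (tolerant/prefix query, synonyms,
+/// best frequency split, concatenation) ORed together, the per-position
+/// operations being ANDed and the ngram interpretations ORed.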
+pub fn create_query_tree(
+    reader: &heed::RoTxn<MainT>,
+    postings_lists: store::PostingsLists,
+    synonyms: store::Synonyms,
+    query: &str,
+) -> MResult<Operation>
+{
+    let query = query.to_lowercase();
+
+    let words = query.linear_group_by_key(char::is_whitespace).map(ToOwned::to_owned);
+    let words = words.filter(|s| !s.contains(char::is_whitespace)).enumerate();
+    let words: Vec<_> = words.collect();
+
+    let mut ngrams = Vec::new();
+    for ngram in 1..=MAX_NGRAM {
+        let ngiter = words.windows(ngram).enumerate().map(|(i, group)| {
+            let before = words[..i].windows(1);
+            let after = words[i + ngram..].windows(1);
+            before.chain(Some(group)).chain(after)
+        });
+
+        for group in ngiter {
+            let mut ops = Vec::new();
+
+            for (is_last, words) in is_last(group) {
+                let mut alts = Vec::new();
+                match words {
+                    [(id, word)] => {
+                        let phrase = split_best_frequency(reader, postings_lists, word)?
+                            .map(|ws| Query::phrase2(*id, ws))
+                            .map(Operation::Query);
+
+                        let synonyms = fetch_synonyms(reader, synonyms, &[word])?.into_iter().map(|alts| {
+                            let iter = alts.into_iter().map(|w| Query::Exact(*id, w)).map(Operation::Query);
+                            create_operation(iter, Operation::And)
+                        });
+
+                        let query = if is_last {
+                            Query::prefix(*id, word)
+                        } else {
+                            Query::tolerant(*id, word)
+                        };
+
+                        alts.push(Operation::Query(query));
+                        alts.extend(synonyms.chain(phrase));
+                    },
+                    words => {
+                        let id = words[0].0;
+                        let words: Vec<_> = words.iter().map(|(_, s)| s.as_str()).collect();
+
+                        for synonym in fetch_synonyms(reader, synonyms, &words)? {
+                            let synonym = synonym.into_iter().map(|s| Operation::Query(Query::Exact(id, s)));
+                            let synonym = create_operation(synonym, Operation::And);
+                            alts.push(synonym);
+                        }
+
+                        let query = if is_last {
+                            Query::Prefix(id, words.concat())
+                        } else {
+                            Query::Exact(id, words.concat())
+                        };
+
+                        alts.push(Operation::Query(query));
+                    }
+                }
+
+                ops.push(create_operation(alts, Operation::Or));
+            }
+
+            ngrams.push(create_operation(ops, Operation::And));
+            if ngram == 1 { break }
+        }
+    }
+
+    Ok(create_operation(ngrams, Operation::Or))
+}
+
+pub struct QueryResult<'q, 'c> {
+    pub docids: Cow<'c, Set<DocumentId>>,
+    pub queries: HashMap<&'q Query, Cow<'c, Set<DocIndex>>>,
+}
+
+pub type Postings<'q, 'c> = HashMap<&'q Query, Cow<'c, Set<DocIndex>>>;
+pub type Cache<'o, 'c> = HashMap<&'o Operation, Cow<'c, Set<DocumentId>>>;
+
+pub fn traverse_query_tree<'a, 'c>(ctx: &'c Context, tree: &'a Operation) -> QueryResult<'a, 'c> {
+    fn execute_and<'o, 'c>(
+        ctx: &'c Context,
+        cache: &mut Cache<'o, 'c>,
+        postings: &mut Postings<'o, 'c>,
+        depth: usize,
+        operations: &'o [Operation],
+    ) -> Cow<'c, Set<DocumentId>>
+    {
+        println!("{:1$}AND", "", depth * 2);
+
+        let before = Instant::now();
+        let mut results = Vec::new();
+
+        for op in operations {
+            if cache.get(op).is_none() {
+                let docids = match op {
+                    Operation::And(ops) => execute_and(ctx, cache, postings, depth + 1, &ops),
+                    Operation::Or(ops) => execute_or(ctx, cache, postings, depth + 1, &ops),
+                    Operation::Query(query) => execute_query(ctx, postings, depth + 1, &query),
+                };
+                cache.insert(op, docids);
+            }
+        }
+
+        for op in operations {
+            if let Some(docids) = cache.get(op) {
+                results.push(docids.as_ref());
+            }
+        }
+
+        let op = sdset::multi::Intersection::new(results);
+        let docids = op.into_set_buf();
+        let docids: Cow<Set<_>> = Cow::Owned(docids);
+
+        println!("{:3$}--- AND fetched {} documents in {:.02?}", "", docids.len(), before.elapsed(), depth * 2);
+
+        docids
+    }
+
+    fn execute_or<'o, 'c>(
+        ctx: &'c Context,
+        cache: &mut Cache<'o, 'c>,
+        postings: &mut Postings<'o, 'c>,
+        depth: usize,
+        operations: &'o [Operation],
+    ) -> Cow<'c, Set<DocumentId>>
+    {
+        println!("{:1$}OR", "", depth * 2);
+
+        let before = Instant::now();
+        let mut ids = Vec::new();
+
+        for op in operations {
+            let docids = match cache.get(op) {
+                Some(docids) => docids,
+                None => {
+                    let docids = match op {
+                        Operation::And(ops) => execute_and(ctx, cache, postings, depth + 1, &ops),
+                        Operation::Or(ops) => execute_or(ctx, cache, postings, depth + 1, &ops),
+                        Operation::Query(query) => execute_query(ctx, postings, depth + 1, &query),
+                    };
+                    cache.entry(op).or_insert(docids)
+                }
+            };
+
+            ids.extend(docids.as_ref());
+        }
+
+        let docids = SetBuf::from_dirty(ids);
+        let docids: Cow<Set<_>> = Cow::Owned(docids);
+
+        println!("{:3$}--- OR fetched {} documents in {:.02?}", "", docids.len(), before.elapsed(), depth * 2);
+
+        docids
+    }
+
+    fn execute_query<'o, 'c>(
+        ctx: &'c Context,
+        postings: &mut Postings<'o, 'c>,
+        depth: usize,
+        query: &'o Query,
+    ) -> Cow<'c, Set<DocumentId>>
+    {
+        let before = Instant::now();
+        let (docids, matches) = match query {
+            Query::Tolerant(_, word) | Query::Exact(_, word) | Query::Prefix(_, word) => {
+                if let Some(PostingsList { docids, matches }) = ctx.postings.get(word) {
+                    (Cow::Borrowed(docids.as_set()), Cow::Borrowed(matches.as_set()))
+                } else {
+                    (Cow::default(), Cow::default())
+                }
+            },
+            Query::Phrase(_, words) => {
+                if let [first, second] = words.as_slice() {
+                    let default = SetBuf::default();
+                    let first = ctx.postings.get(first).map(|pl| &pl.matches).unwrap_or(&default);
+                    let second = ctx.postings.get(second).map(|pl| &pl.matches).unwrap_or(&default);
+
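+                    // Keep only the pairs of matches where the second word
+                    // directly follows the first one in the same attribute of
+                    // the same document (first.word_index + 1 == second.word_index).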
+                    let iter = merge_join_by(first.as_slice(), second.as_slice(), |a, b| {
+                        let x = (a.document_id, a.attribute, (a.word_index as u32) + 1);
+                        let y = (b.document_id, b.attribute, b.word_index as u32);
+                        x.cmp(&y)
+                    });
+
+                    let matches: Vec<_> = iter
+                        .filter_map(EitherOrBoth::both)
+                        .flat_map(|(a, b)| once(*a).chain(Some(*b)))
+                        .collect();
+
+                    let mut docids: Vec<_> = matches.iter().map(|m| m.document_id).collect();
+                    docids.dedup();
+
+                    println!("{:2$}matches {:?}", "", matches, depth * 2);
+
+                    (Cow::Owned(SetBuf::new(docids).unwrap()), Cow::Owned(SetBuf::new(matches).unwrap()))
+                } else {
+                    println!("{:2$}{:?} skipped", "", words, depth * 2);
+                    (Cow::default(), Cow::default())
+                }
+            },
+        };
+
+        println!("{:4$}{:?} fetched {:?} documents in {:.02?}", "", query, docids.len(), before.elapsed(), depth * 2);
+
+        postings.insert(query, matches);
+
+        docids
+    }
+
+    let mut cache = Cache::new();
+    let mut postings = Postings::new();
+
+    let docids = match tree {
+        Operation::And(operations) => execute_and(ctx, &mut cache, &mut postings, 0, &operations),
+        Operation::Or(operations) => execute_or(ctx, &mut cache, &mut postings, 0, &operations),
+        Operation::Query(query) => execute_query(ctx, &mut postings, 0, &query),
+    };
+
+    QueryResult { docids, queries: postings }
+}
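
As a rough illustration of the tree shape (not part of the patch): for a two-word query such as "hello world", and assuming no synonyms and no frequent split are found, create_query_tree produces an OR of the word-by-word interpretation and of the concatenated 2-gram. A minimal, hypothetical sketch, hand-built to mirror that output; it would have to live inside meilisearch-core since the query_tree module is crate-private:

#[cfg(test)]
mod tests {
    use crate::query_tree::{Operation, Query};

    #[test]
    fn two_word_query_tree_shape() {
        let tree = Operation::Or(vec![
            // 1-gram interpretation: both words ANDed, the last one being
            // a prefix query since the user may still be typing it.
            Operation::And(vec![
                Operation::Query(Query::Tolerant(0, "hello".to_string())),
                Operation::Query(Query::Prefix(1, "world".to_string())),
            ]),
            // 2-gram interpretation: the two words concatenated.
            Operation::Query(Query::Prefix(0, "helloworld".to_string())),
        ]);

        // The custom Debug impl pretty prints the tree with indentation.
        println!("{:?}", tree);
    }
}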