From d724a7659e5453fcb5e7c371e15ace420e40afa8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 8 Jan 2020 13:37:22 +0100 Subject: [PATCH] Introduce a query tree context struct --- meilisearch-core/src/bucket_sort.rs | 26 +++++---- meilisearch-core/src/query_tree.rs | 91 +++++++++++------------------ 2 files changed, 48 insertions(+), 69 deletions(-) diff --git a/meilisearch-core/src/bucket_sort.rs b/meilisearch-core/src/bucket_sort.rs index 6b6a89ce9..4d8dfe9c0 100644 --- a/meilisearch-core/src/bucket_sort.rs +++ b/meilisearch-core/src/bucket_sort.rs @@ -29,6 +29,7 @@ use crate::raw_document::RawDocument; use crate::{database::MainT, reordered_attrs::ReorderedAttrs}; use crate::{store, Document, DocumentId, MResult}; use crate::query_tree::{create_query_tree, traverse_query_tree, QueryResult}; +use crate::query_tree::Context as QTContext; pub fn bucket_sort<'c, FI>( reader: &heed::RoTxn, @@ -47,22 +48,23 @@ pub fn bucket_sort<'c, FI>( where FI: Fn(DocumentId) -> bool, { - let operation = create_query_tree(reader, postings_lists_store, synonyms_store, query).unwrap(); - println!("{:?}", operation); - - let words = match unsafe { main_store.static_words_fst(reader)? } { + let words_set = match unsafe { main_store.static_words_fst(reader)? } { Some(words) => words, None => return Ok(Vec::new()), }; - let QueryResult { docids, queries } = - traverse_query_tree( - reader, - &words, - postings_lists_store, - prefix_postings_lists_cache_store, - &operation, - ).unwrap(); + let context = QTContext { + words_set, + synonyms: synonyms_store, + postings_lists: postings_lists_store, + prefix_postings_lists: prefix_postings_lists_cache_store, + }; + + let operation = create_query_tree(reader, &context, query).unwrap(); + println!("{:?}", operation); + + + let QueryResult { docids, queries } = traverse_query_tree(reader, &context, &operation).unwrap(); println!("found {} documents", docids.len()); println!("number of postings {:?}", queries.len()); diff --git a/meilisearch-core/src/query_tree.rs b/meilisearch-core/src/query_tree.rs index a62946ba3..1e6cc1305 100644 --- a/meilisearch-core/src/query_tree.rs +++ b/meilisearch-core/src/query_tree.rs @@ -93,26 +93,22 @@ pub struct PostingsList { matches: SetBuf, } -#[derive(Debug, Default)] pub struct Context { - pub synonyms: HashMap, Vec>>, - pub postings: HashMap, + pub words_set: fst::Set, + pub synonyms: store::Synonyms, + pub postings_lists: store::PostingsLists, + pub prefix_postings_lists: store::PrefixPostingsListsCache, } -fn split_best_frequency<'a>( - reader: &heed::RoTxn, - postings_lists: store::PostingsLists, - word: &'a str, -) -> MResult> -{ +fn split_best_frequency<'a>(reader: &heed::RoTxn, ctx: &Context, word: &'a str) -> MResult> { let chars = word.char_indices().skip(1); let mut best = None; for (i, _) in chars { let (left, right) = word.split_at(i); - let left_freq = postings_lists.postings_list(reader, left.as_bytes())?.map(|pl| pl.len()).unwrap_or(0); - let right_freq = postings_lists.postings_list(reader, right.as_bytes())?.map(|pl| pl.len()).unwrap_or(0); + let left_freq = ctx.postings_lists.postings_list(reader, left.as_bytes())?.map(|pl| pl.len()).unwrap_or(0); + let right_freq = ctx.postings_lists.postings_list(reader, right.as_bytes())?.map(|pl| pl.len()).unwrap_or(0); let min_freq = cmp::min(left_freq, right_freq); if min_freq != 0 && best.map_or(true, |(old, _, _)| min_freq > old) { @@ -123,12 +119,7 @@ fn split_best_frequency<'a>( Ok(best.map(|(_, l, r)| (l, r))) } -fn fetch_synonyms( - reader: &heed::RoTxn, - synonyms: store::Synonyms, - words: &[&str], -) -> MResult>> -{ +fn fetch_synonyms(reader: &heed::RoTxn, ctx: &Context, words: &[&str]) -> MResult>> { let words = words.join(" "); // TODO ugly // synonyms.synonyms(reader, words.as_bytes()).cloned().unwrap_or_default() Ok(vec![]) @@ -154,13 +145,7 @@ where I: IntoIterator, const MAX_NGRAM: usize = 3; -pub fn create_query_tree( - reader: &heed::RoTxn, - postings_lists: store::PostingsLists, - synonyms: store::Synonyms, - query: &str, -) -> MResult -{ +pub fn create_query_tree(reader: &heed::RoTxn, ctx: &Context, query: &str) -> MResult { let query = query.to_lowercase(); let words = query.linear_group_by_key(char::is_whitespace).map(ToOwned::to_owned); @@ -182,11 +167,11 @@ pub fn create_query_tree( let mut alts = Vec::new(); match words { [(id, word)] => { - let phrase = split_best_frequency(reader, postings_lists, word)? + let phrase = split_best_frequency(reader, ctx, word)? .map(|ws| Query::phrase2(*id, is_last, ws)) .map(Operation::Query); - let synonyms = fetch_synonyms(reader, synonyms, &[word])?.into_iter().map(|alts| { + let synonyms = fetch_synonyms(reader, ctx, &[word])?.into_iter().map(|alts| { let iter = alts.into_iter().map(|w| Query::exact(*id, false, &w)).map(Operation::Query); create_operation(iter, Operation::And) }); @@ -200,7 +185,7 @@ pub fn create_query_tree( let id = words[0].0; let words: Vec<_> = words.iter().map(|(_, s)| s.as_str()).collect(); - for synonym in fetch_synonyms(reader, synonyms, &words)? { + for synonym in fetch_synonyms(reader, ctx, &words)? { let synonym = synonym.into_iter().map(|s| Operation::Query(Query::exact(id, false, &s))); let synonym = create_operation(synonym, Operation::And); alts.push(synonym); @@ -232,17 +217,13 @@ pub type Cache<'o, 'c> = HashMap<&'o Operation, SetBuf>; pub fn traverse_query_tree<'o, 'txn>( reader: &'txn heed::RoTxn, - words_set: &fst::Set, - postings_lists: store::PostingsLists, - prefix_postings_lists: store::PrefixPostingsListsCache, + ctx: &Context, tree: &'o Operation, ) -> MResult> { fn execute_and<'o, 'txn>( reader: &'txn heed::RoTxn, - words_set: &fst::Set, - pls: store::PostingsLists, - ppls: store::PrefixPostingsListsCache, + ctx: &Context, cache: &mut Cache<'o, 'txn>, postings: &mut Postings<'o, 'txn>, depth: usize, @@ -257,9 +238,9 @@ pub fn traverse_query_tree<'o, 'txn>( for op in operations { if cache.get(op).is_none() { let docids = match op { - Operation::And(ops) => execute_and(reader, words_set, pls, ppls, cache, postings, depth + 1, &ops)?, - Operation::Or(ops) => execute_or(reader, words_set, pls, ppls, cache, postings, depth + 1, &ops)?, - Operation::Query(query) => execute_query(reader, words_set, pls, ppls, postings, depth + 1, &query)?, + Operation::And(ops) => execute_and(reader, ctx, cache, postings, depth + 1, &ops)?, + Operation::Or(ops) => execute_or(reader, ctx, cache, postings, depth + 1, &ops)?, + Operation::Query(query) => execute_query(reader, ctx, postings, depth + 1, &query)?, }; cache.insert(op, docids); } @@ -281,9 +262,7 @@ pub fn traverse_query_tree<'o, 'txn>( fn execute_or<'o, 'txn>( reader: &'txn heed::RoTxn, - words_set: &fst::Set, - pls: store::PostingsLists, - ppls: store::PrefixPostingsListsCache, + ctx: &Context, cache: &mut Cache<'o, 'txn>, postings: &mut Postings<'o, 'txn>, depth: usize, @@ -300,9 +279,9 @@ pub fn traverse_query_tree<'o, 'txn>( Some(docids) => docids, None => { let docids = match op { - Operation::And(ops) => execute_and(reader, words_set, pls, ppls, cache, postings, depth + 1, &ops)?, - Operation::Or(ops) => execute_or(reader, words_set, pls, ppls, cache, postings, depth + 1, &ops)?, - Operation::Query(query) => execute_query(reader, words_set, pls, ppls, postings, depth + 1, &query)?, + Operation::And(ops) => execute_and(reader, ctx, cache, postings, depth + 1, &ops)?, + Operation::Or(ops) => execute_or(reader, ctx, cache, postings, depth + 1, &ops)?, + Operation::Query(query) => execute_query(reader, ctx, postings, depth + 1, &query)?, }; cache.entry(op).or_insert(docids) } @@ -320,9 +299,7 @@ pub fn traverse_query_tree<'o, 'txn>( fn execute_query<'o, 'txn>( reader: &'txn heed::RoTxn, - words_set: &fst::Set, - pls: store::PostingsLists, - ppls: store::PrefixPostingsListsCache, + ctx: &Context, postings: &mut Postings<'o, 'txn>, depth: usize, query: &'o Query, @@ -335,7 +312,7 @@ pub fn traverse_query_tree<'o, 'txn>( QueryKind::Tolerant(word) => { if *prefix && word.len() == 1 { let prefix = [word.as_bytes()[0], 0, 0, 0]; - let matches = ppls.prefix_postings_list(reader, prefix)?.unwrap_or_default(); + let matches = ctx.prefix_postings_lists.prefix_postings_list(reader, prefix)?.unwrap_or_default(); let before = Instant::now(); let mut docids: Vec<_> = matches.into_iter().map(|m| m.document_id).collect(); @@ -349,14 +326,14 @@ pub fn traverse_query_tree<'o, 'txn>( let byte = word.as_bytes()[0]; let mut stream = if byte == u8::max_value() { - words_set.search(&dfa).ge(&[byte]).into_stream() + ctx.words_set.search(&dfa).ge(&[byte]).into_stream() } else { - words_set.search(&dfa).ge(&[byte]).lt(&[byte + 1]).into_stream() + ctx.words_set.search(&dfa).ge(&[byte]).lt(&[byte + 1]).into_stream() }; let mut docids = Vec::new(); while let Some(input) = stream.next() { - if let Some(matches) = pls.postings_list(reader, input)? { + if let Some(matches) = ctx.postings_lists.postings_list(reader, input)? { docids.extend(matches.iter().map(|d| d.document_id)) } } @@ -374,14 +351,14 @@ pub fn traverse_query_tree<'o, 'txn>( let byte = word.as_bytes()[0]; let mut stream = if byte == u8::max_value() { - words_set.search(&dfa).ge(&[byte]).into_stream() + ctx.words_set.search(&dfa).ge(&[byte]).into_stream() } else { - words_set.search(&dfa).ge(&[byte]).lt(&[byte + 1]).into_stream() + ctx.words_set.search(&dfa).ge(&[byte]).lt(&[byte + 1]).into_stream() }; let mut docids = Vec::new(); while let Some(input) = stream.next() { - if let Some(matches) = pls.postings_list(reader, input)? { + if let Some(matches) = ctx.postings_lists.postings_list(reader, input)? { docids.extend(matches.iter().map(|d| d.document_id)) } } @@ -395,8 +372,8 @@ pub fn traverse_query_tree<'o, 'txn>( QueryKind::Phrase(words) => { // TODO support prefix and non-prefix exact DFA if let [first, second] = words.as_slice() { - let first = pls.postings_list(reader, first.as_bytes())?.unwrap_or_default(); - let second = pls.postings_list(reader, second.as_bytes())?.unwrap_or_default(); + let first = ctx.postings_lists.postings_list(reader, first.as_bytes())?.unwrap_or_default(); + let second = ctx.postings_lists.postings_list(reader, second.as_bytes())?.unwrap_or_default(); let iter = merge_join_by(first.as_slice(), second.as_slice(), |a, b| { let x = (a.document_id, a.attribute, (a.word_index as u32) + 1); @@ -435,9 +412,9 @@ pub fn traverse_query_tree<'o, 'txn>( let mut postings = Postings::new(); let docids = match tree { - Operation::And(ops) => execute_and(reader, words_set, postings_lists, prefix_postings_lists, &mut cache, &mut postings, 0, &ops)?, - Operation::Or(ops) => execute_or(reader, words_set, postings_lists, prefix_postings_lists, &mut cache, &mut postings, 0, &ops)?, - Operation::Query(query) => execute_query(reader, words_set, postings_lists, prefix_postings_lists, &mut postings, 0, &query)?, + Operation::And(ops) => execute_and(reader, ctx, &mut cache, &mut postings, 0, &ops)?, + Operation::Or(ops) => execute_or(reader, ctx, &mut cache, &mut postings, 0, &ops)?, + Operation::Query(query) => execute_query(reader, ctx, &mut postings, 0, &query)?, }; Ok(QueryResult { docids, queries: postings })