diff --git a/milli/src/search/criteria/fetcher.rs b/milli/src/search/criteria/fetcher.rs new file mode 100644 index 000000000..7706ee280 --- /dev/null +++ b/milli/src/search/criteria/fetcher.rs @@ -0,0 +1,107 @@ +use std::collections::HashMap; +use std::mem::take; + +use log::debug; +use roaring::RoaringBitmap; + +use crate::search::query_tree::Operation; +use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context}; + +pub struct Fetcher<'t> { + ctx: &'t dyn Context, + query_tree: Option, + candidates: Candidates, + parent: Option>, + should_get_documents_ids: bool, +} + +impl<'t> Fetcher<'t> { + pub fn initial( + ctx: &'t dyn Context, + query_tree: Option, + candidates: Option, + ) -> Self + { + Fetcher { + ctx, + query_tree, + candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), + parent: None, + should_get_documents_ids: true, + } + } + + pub fn new( + ctx: &'t dyn Context, + parent: Box, + ) -> Self + { + Fetcher { + ctx, + query_tree: None, + candidates: Candidates::default(), + parent: Some(parent), + should_get_documents_ids: true, + } + } +} + +impl<'t> Criterion for Fetcher<'t> { + fn next(&mut self) -> anyhow::Result> { + use Candidates::{Allowed, Forbidden}; + loop { + debug!("Fetcher iteration (should_get_documents_ids: {}) ({:?})", + self.should_get_documents_ids, self.candidates, + ); + + match &mut self.candidates { + Allowed(candidates) => if candidates.is_empty() { + self.candidates = Candidates::default(); + } else { + self.should_get_documents_ids = false; + let candidates = take(&mut self.candidates).into_inner(); + return Ok(Some(CriterionResult { + query_tree: self.query_tree.clone(), + candidates: candidates.clone(), + bucket_candidates: Some(candidates), + })); + }, + Forbidden(_) => { + let should_get_documents_ids = take(&mut self.should_get_documents_ids); + match self.parent.as_mut() { + Some(parent) => { + match parent.next()? { + Some(result) => return Ok(Some(result)), + None => if should_get_documents_ids { + let candidates = match &self.query_tree { + Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new())?, + None => self.ctx.documents_ids()?, + }; + + return Ok(Some(CriterionResult { + query_tree: self.query_tree.clone(), + candidates: candidates.clone(), + bucket_candidates: Some(candidates), + })); + }, + } + }, + None => if should_get_documents_ids { + let candidates = match &self.query_tree { + Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new())?, + None => self.ctx.documents_ids()?, + }; + + return Ok(Some(CriterionResult { + query_tree: self.query_tree.clone(), + candidates: candidates.clone(), + bucket_candidates: Some(candidates), + })); + }, + } + return Ok(None); + }, + } + } + } +} diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 52367ac5f..1845e607a 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -1,16 +1,19 @@ +use std::collections::HashMap; use std::borrow::Cow; +use anyhow::bail; +use roaring::RoaringBitmap; + use crate::Index; use crate::search::word_derivations; -use roaring::RoaringBitmap; - use super::query_tree::{Operation, Query, QueryKind}; pub mod typo; pub mod words; pub mod asc_desc; pub mod proximity; +pub mod fetcher; pub trait Criterion { fn next(&mut self) -> anyhow::Result>; @@ -51,6 +54,7 @@ impl Default for Candidates { } } pub trait Context { + fn documents_ids(&self) -> heed::Result; fn word_docids(&self, word: &str) -> heed::Result>; fn word_prefix_docids(&self, word: &str) -> heed::Result>; fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result>; @@ -66,6 +70,10 @@ pub struct HeedContext<'t> { } impl<'a> Context for HeedContext<'a> { + fn documents_ids(&self) -> heed::Result { + self.index.documents_ids(self.rtxn) + } + fn word_docids(&self, word: &str) -> heed::Result> { self.index.word_docids.get(self.rtxn, &word) } @@ -107,6 +115,80 @@ impl<'t> HeedContext<'t> { } } +pub fn resolve_query_tree<'t>( + ctx: &'t dyn Context, + query_tree: &Operation, + cache: &mut HashMap<(Operation, u8), RoaringBitmap>, +) -> anyhow::Result +{ + fn resolve_operation<'t>( + ctx: &'t dyn Context, + query_tree: &Operation, + cache: &mut HashMap<(Operation, u8), RoaringBitmap>, + ) -> anyhow::Result + { + use Operation::{And, Consecutive, Or, Query}; + + match query_tree { + And(ops) => { + let mut ops = ops.iter().map(|op| { + resolve_operation(ctx, op, cache) + }).collect::>>()?; + + ops.sort_unstable_by_key(|cds| cds.len()); + + let mut candidates = RoaringBitmap::new(); + let mut first_loop = true; + for docids in ops { + if first_loop { + candidates = docids; + first_loop = false; + } else { + candidates.intersect_with(&docids); + } + } + Ok(candidates) + }, + Consecutive(ops) => { + let mut candidates = RoaringBitmap::new(); + let mut first_loop = true; + for slice in ops.windows(2) { + match (&slice[0], &slice[1]) { + (Operation::Query(left), Operation::Query(right)) => { + match query_pair_proximity_docids(ctx, left, right, 1)? { + pair_docids if pair_docids.is_empty() => { + return Ok(RoaringBitmap::new()) + }, + pair_docids if first_loop => { + candidates = pair_docids; + first_loop = false; + }, + pair_docids => { + candidates.intersect_with(&pair_docids); + }, + } + }, + _ => bail!("invalid consecutive query type"), + } + } + Ok(candidates) + }, + Or(_, ops) => { + let mut candidates = RoaringBitmap::new(); + for op in ops { + let docids = resolve_operation(ctx, op, cache)?; + candidates.union_with(&docids); + } + Ok(candidates) + }, + Query(q) => Ok(query_docids(ctx, q)?), + } + } + + resolve_operation(ctx, query_tree, cache) +} + + fn all_word_pair_proximity_docids, U: AsRef>( ctx: &dyn Context, left_words: &[(T, u8)], @@ -218,6 +300,10 @@ pub mod test { } impl<'a> Context for TestContext<'a> { + fn documents_ids(&self) -> heed::Result { + Ok(self.word_docids.iter().fold(RoaringBitmap::new(), |acc, (_, docids)| acc | docids)) + } + fn word_docids(&self, word: &str) -> heed::Result> { Ok(self.word_docids.get(&word.to_string()).cloned()) } diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs index 3b0ecd54a..0913d429d 100644 --- a/milli/src/search/criteria/words.rs +++ b/milli/src/search/criteria/words.rs @@ -1,12 +1,11 @@ use std::collections::HashMap; use std::mem::take; -use anyhow::bail; use log::debug; use roaring::RoaringBitmap; use crate::search::query_tree::Operation; -use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; +use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context}; pub struct Words<'t> { ctx: &'t dyn Context, @@ -62,7 +61,7 @@ impl<'t> Criterion for Words<'t> { self.candidates = Candidates::default(); }, (Some(qt), Allowed(candidates)) => { - let mut found_candidates = resolve_candidates(self.ctx, &qt, &mut self.candidates_cache)?; + let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache)?; found_candidates.intersect_with(&candidates); candidates.difference_with(&found_candidates); @@ -78,7 +77,7 @@ impl<'t> Criterion for Words<'t> { })); }, (Some(qt), Forbidden(candidates)) => { - let mut found_candidates = resolve_candidates(self.ctx, &qt, &mut self.candidates_cache)?; + let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache)?; found_candidates.difference_with(&candidates); candidates.union_with(&found_candidates); @@ -127,76 +126,3 @@ fn explode_query_tree(query_tree: Operation) -> Vec { otherwise => vec![otherwise], } } - -fn resolve_candidates<'t>( - ctx: &'t dyn Context, - query_tree: &Operation, - cache: &mut HashMap<(Operation, u8), RoaringBitmap>, -) -> anyhow::Result -{ - fn resolve_operation<'t>( - ctx: &'t dyn Context, - query_tree: &Operation, - cache: &mut HashMap<(Operation, u8), RoaringBitmap>, - ) -> anyhow::Result - { - use Operation::{And, Consecutive, Or, Query}; - - match query_tree { - And(ops) => { - let mut ops = ops.iter().map(|op| { - resolve_operation(ctx, op, cache) - }).collect::>>()?; - - ops.sort_unstable_by_key(|cds| cds.len()); - - let mut candidates = RoaringBitmap::new(); - let mut first_loop = true; - for docids in ops { - if first_loop { - candidates = docids; - first_loop = false; - } else { - candidates.intersect_with(&docids); - } - } - Ok(candidates) - }, - Consecutive(ops) => { - let mut candidates = RoaringBitmap::new(); - let mut first_loop = true; - for slice in ops.windows(2) { - match (&slice[0], &slice[1]) { - (Operation::Query(left), Operation::Query(right)) => { - match query_pair_proximity_docids(ctx, left, right, 1)? { - pair_docids if pair_docids.is_empty() => { - return Ok(RoaringBitmap::new()) - }, - pair_docids if first_loop => { - candidates = pair_docids; - first_loop = false; - }, - pair_docids => { - candidates.intersect_with(&pair_docids); - }, - } - }, - _ => bail!("invalid consecutive query type"), - } - } - Ok(candidates) - }, - Or(_, ops) => { - let mut candidates = RoaringBitmap::new(); - for op in ops { - let docids = resolve_operation(ctx, op, cache)?; - candidates.union_with(&docids); - } - Ok(candidates) - }, - Query(q) => Ok(query_docids(ctx, q)?), - } - } - - resolve_operation(ctx, query_tree, cache) -} diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index dbb504368..84c6acf3e 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -10,7 +10,7 @@ use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; use crate::search::criteria::{Criterion, CriterionResult}; -use crate::search::criteria::{typo::Typo, words::Words, proximity::Proximity}; +use crate::search::criteria::{typo::Typo, words::Words, proximity::Proximity, fetcher::Fetcher}; use crate::{Index, DocumentId}; pub use self::facet::FacetIter; @@ -92,13 +92,12 @@ impl<'a> Search<'a> { None => MatchingWords::default(), }; - // We are testing the typo criteria but there will be more of them soon. let criteria_ctx = criteria::HeedContext::new(self.rtxn, self.index)?; let typo_criterion = Typo::initial(&criteria_ctx, query_tree, facet_candidates)?; let words_criterion = Words::new(&criteria_ctx, Box::new(typo_criterion))?; let proximity_criterion = Proximity::new(&criteria_ctx, Box::new(words_criterion))?; - // let proximity_criterion = Proximity::initial(&criteria_ctx, query_tree, facet_candidates)?; - let mut criteria = proximity_criterion; + let fetcher_criterion = Fetcher::new(&criteria_ctx, Box::new(proximity_criterion)); + let mut criteria = fetcher_criterion; // // We sort in descending order on a specific field *by hand*, don't do that at home. // let attr_name = "released-timestamp";