From a3f8686fbfdbdb0707561a9b497878fd86f0a0fa Mon Sep 17 00:00:00 2001 From: many Date: Tue, 4 May 2021 13:44:55 +0200 Subject: [PATCH] Introduce exactness criterion --- milli/src/fields_ids_map.rs | 10 + milli/src/search/criteria/exactness.rs | 335 +++++++++++++++++++++++++ milli/src/search/criteria/mod.rs | 40 ++- milli/src/search/mod.rs | 8 +- milli/src/search/query_tree.rs | 46 ++-- 5 files changed, 412 insertions(+), 27 deletions(-) create mode 100644 milli/src/search/criteria/exactness.rs diff --git a/milli/src/fields_ids_map.rs b/milli/src/fields_ids_map.rs index ce79e6e04..6eed9c41f 100644 --- a/milli/src/fields_ids_map.rs +++ b/milli/src/fields_ids_map.rs @@ -65,6 +65,16 @@ impl FieldsIdsMap { pub fn iter(&self) -> impl Iterator { self.ids_names.iter().map(|(id, name)| (*id, name.as_str())) } + + /// Iterate over the ids in the ids order. + pub fn ids<'a>(&'a self) -> impl Iterator + 'a { + self.ids_names.keys().copied() + } + + /// Iterate over the names in the ids order. + pub fn names(&self) -> impl Iterator { + self.ids_names.values().map(AsRef::as_ref) + } } impl Default for FieldsIdsMap { diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs new file mode 100644 index 000000000..a67b9ed3c --- /dev/null +++ b/milli/src/search/criteria/exactness.rs @@ -0,0 +1,335 @@ +use std::{collections::HashMap, mem}; + +use log::debug; +use roaring::RoaringBitmap; +use itertools::Itertools; +use std::ops::BitOr; + +use crate::search::query_tree::{Operation, PrimitiveQueryPart}; +use crate::search::criteria::{ + Context, + Criterion, + CriterionParameters, + CriterionResult, + resolve_query_tree, +}; +use crate::TreeLevel; + +pub struct Exactness<'t> { + ctx: &'t dyn Context<'t>, + query_tree: Option, + state: Option, + bucket_candidates: RoaringBitmap, + parent: Box, + query: Vec, +} + +impl<'t> Exactness<'t> { + pub fn new(ctx: &'t dyn Context<'t>, parent: Box, primitive_query: &[PrimitiveQueryPart]) -> heed::Result { + let 
mut query: Vec<_> = Vec::with_capacity(primitive_query.len()); + for part in primitive_query { + query.push(ExactQueryPart::from_primitive_query_part(ctx, part)?); + } + + Ok(Exactness { + ctx, + query_tree: None, + state: None, + bucket_candidates: RoaringBitmap::new(), + parent, + query, + }) + } +} + +impl<'t> Criterion for Exactness<'t> { + #[logging_timer::time("Exactness::{}")] + fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result> { + // remove excluded candidates when next is called, instead of doing it in the loop. + if let Some(state) = self.state.as_mut() { + state.difference_with(params.excluded_candidates); + } + + loop { + debug!("Exactness for query {:?} at state {:?}", self.query, self.state); + + match self.state.as_mut() { + Some(state) if state.is_empty() => { + // reset state + self.state = None; + self.query_tree = None; + }, + Some(state) => { + let (candidates, state) = resolve_state(self.ctx, mem::take(state), &self.query)?; + self.state = state; + + return Ok(Some(CriterionResult { + query_tree: self.query_tree.clone(), + candidates: Some(candidates), + bucket_candidates: mem::take(&mut self.bucket_candidates), + })); + }, + None => { + match self.parent.next(params)? { + Some(CriterionResult { query_tree: Some(query_tree), candidates, bucket_candidates }) => { + let candidates = match candidates { + Some(candidates) => candidates, + None => resolve_query_tree(self.ctx, &query_tree, &mut HashMap::new(), params.wdcache)?, + }; + self.state = Some(State::new(candidates)); + self.query_tree = Some(query_tree); + self.bucket_candidates |= bucket_candidates; + }, + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + return Ok(Some(CriterionResult { + query_tree, + candidates, + bucket_candidates, + })); + }, + None => return Ok(None), + } + }, + } + } + } +} + +#[derive(Debug)] +enum State { + /// Extract the documents that have an attribute that contains exactly the query. 
+ ExactAttribute(RoaringBitmap), + /// Extract the documents that have an attribute that starts with exactly the query. + AttributeStartsWith(RoaringBitmap), + /// Rank the remaining documents by the number of exact words contained. + ExactWords(RoaringBitmap), + Remainings(Vec), +} + +impl State { + fn new(candidates: RoaringBitmap) -> Self { + Self::ExactAttribute(candidates) + } + + fn difference_with(&mut self, lhs: &RoaringBitmap) { + match self { + Self::ExactAttribute(candidates) | + Self::AttributeStartsWith(candidates) | + Self::ExactWords(candidates) => *candidates -= lhs, + Self::Remainings(candidates_array) => { + candidates_array.iter_mut().for_each(|candidates| *candidates -= lhs); + candidates_array.retain(|candidates| !candidates.is_empty()); + } + } + } + + fn is_empty(&self) -> bool { + match self { + Self::ExactAttribute(candidates) | + Self::AttributeStartsWith(candidates) | + Self::ExactWords(candidates) => candidates.is_empty(), + Self::Remainings(candidates_array) => { + candidates_array.iter().all(RoaringBitmap::is_empty) + } + } + } +} + +impl Default for State { + fn default() -> Self { + Self::Remainings(vec![]) + } +} + +#[logging_timer::time("Exactness::{}")] +fn resolve_state( + ctx: &dyn Context, + state: State, + query: &[ExactQueryPart], +) -> anyhow::Result<(RoaringBitmap, Option)> +{ + use State::*; + match state { + ExactAttribute(mut allowed_candidates) | + AttributeStartsWith(mut allowed_candidates) => { + let mut candidates = RoaringBitmap::new(); + let attributes_ids = ctx.searchable_fields_ids()?; + for id in attributes_ids { + let attribute_candidates_array = attribute_start_with_docids(ctx, id as u32, query)?; + candidates |= intersection_of(attribute_candidates_array.iter().collect()); + } + + // only keep allowed candidates + candidates &= &allowed_candidates; + // remove current candidates from allowed candidates + allowed_candidates -= &candidates; + Ok((candidates, Some(ExactWords(allowed_candidates)))) + }, + 
ExactWords(mut allowed_candidates) => { + let number_of_part = query.len(); + let mut parts_candidates_array = Vec::with_capacity(number_of_part); + + for part in query { + let mut candidates = RoaringBitmap::new(); + use ExactQueryPart::*; + match part { + Synonyms(synonyms) => { + for synonym in synonyms { + if let Some(synonym_candidates) = ctx.word_docids(synonym)? { + candidates |= synonym_candidates; + } + } + }, + // compute intersection on pair of words with a proximity of 0. + Phrase(phrase) => { + let mut bitmaps = Vec::with_capacity(phrase.len().saturating_sub(1)); + for words in phrase.windows(2) { + if let [left, right] = words { + match ctx.word_pair_proximity_docids(left, right, 0)? { + Some(docids) => bitmaps.push(docids), + None => { + bitmaps.clear(); + break + }, + } + } + } + candidates |= intersection_of(bitmaps.iter().collect()); + } + } + parts_candidates_array.push(candidates); + } + + let mut candidates_array = Vec::new(); + + // compute documents that contain all exact words. + let mut all_exact_candidates = intersection_of(parts_candidates_array.iter().collect()); + all_exact_candidates &= &allowed_candidates; + allowed_candidates -= &all_exact_candidates; + + // push the result of combinations of exact words grouped by the number of exact words contained by documents. 
+ for c_count in (1..number_of_part).rev() { + let mut combinations_candidates = parts_candidates_array + .iter() + // create all `c_count` combinations of exact words + .combinations(c_count) + // intersect the candidates of each word in the combination + .map(intersection_of) + // union combinations of `c_count` exact words + .fold(RoaringBitmap::new(), RoaringBitmap::bitor); + // only keep allowed candidates + combinations_candidates &= &allowed_candidates; + // remove current candidates from allowed candidates + allowed_candidates -= &combinations_candidates; + candidates_array.push(combinations_candidates); + } + + // push the remaining allowed candidates as the worst valid candidates + candidates_array.push(allowed_candidates); + // reverse the array to be able to pop candidates from the best to the worst. + candidates_array.reverse(); + + Ok((all_exact_candidates, Some(Remainings(candidates_array)))) + }, + // pop the remaining candidates until none are left + Remainings(mut candidates_array) => { + let candidates = candidates_array.pop().unwrap_or_default(); + if !candidates_array.is_empty() { + Ok((candidates, Some(Remainings(candidates_array)))) + } else { + Ok((candidates, None)) + } + }, + + } +} + +fn attribute_start_with_docids(ctx: &dyn Context, attribute_id: u32, query: &[ExactQueryPart]) -> heed::Result> { + let lowest_level = TreeLevel::min_value(); + let mut attribute_candidates_array = Vec::new(); + // start from the attribute's first position + let mut pos = attribute_id * 1000; + for part in query { + use ExactQueryPart::*; + match part { + Synonyms(synonyms) => { + let mut synonyms_candidates = RoaringBitmap::new(); + for word in synonyms { + let wc = ctx.word_level_position_docids(word, lowest_level, pos, pos)?; + if let Some(word_candidates) = wc { + synonyms_candidates |= word_candidates; + } + } + attribute_candidates_array.push(synonyms_candidates); + pos += 1; + }, + Phrase(phrase) => { + for word in phrase { + let wc = ctx.word_level_position_docids(word, 
lowest_level, pos, pos)?; + if let Some(word_candidates) = wc { + attribute_candidates_array.push(word_candidates); + } + pos += 1; + } + } + } + } + + Ok(attribute_candidates_array) +} + +fn intersection_of(mut to_intersect: Vec<&RoaringBitmap>) -> RoaringBitmap { + match to_intersect.len() { + 0 => RoaringBitmap::new(), + 1 => to_intersect[0].clone(), + 2 => to_intersect[0] & to_intersect[1], + _ => { + to_intersect.sort_unstable_by(|a, b| a.len().cmp(&b.len()).reverse()); + + match to_intersect.pop() { + None => RoaringBitmap::new(), + Some(candidates) => { + let mut candidates = candidates.clone(); + while let Some(bitmap) = to_intersect.pop() { + if candidates.is_empty() { break; } + candidates &= bitmap; + } + + candidates + }, + } + } + } +} + +#[derive(Debug, Clone)] +pub enum ExactQueryPart { + Phrase(Vec), + Synonyms(Vec), +} + +impl ExactQueryPart { + fn from_primitive_query_part(ctx: &dyn Context, part: &PrimitiveQueryPart) -> heed::Result { + let part = match part { + PrimitiveQueryPart::Word(word, _) => { + match ctx.synonyms(word)? { + Some(synonyms) => { + let mut synonyms: Vec<_> = synonyms.into_iter().filter_map(|mut array| { + // keep 1 word synonyms only. 
+ match array.pop() { + Some(word) if array.is_empty() => Some(word), + _ => None, + } + }).collect(); + synonyms.push(word.clone()); + ExactQueryPart::Synonyms(synonyms) + }, + None => ExactQueryPart::Synonyms(vec![word.clone()]), + } + }, + PrimitiveQueryPart::Phrase(phrase) => ExactQueryPart::Phrase(phrase.clone()), + }; + + Ok(part) + } +} diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 164937dec..1c626e183 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -7,9 +7,10 @@ use roaring::RoaringBitmap; use crate::{TreeLevel, search::{word_derivations, WordDerivationsCache}}; use crate::{Index, DocumentId}; -use super::query_tree::{Operation, Query, QueryKind}; +use super::query_tree::{Operation, PrimitiveQuery, PrimitiveQueryPart, Query, QueryKind}; use self::asc_desc::AscDesc; use self::attribute::Attribute; +use self::exactness::Exactness; use self::r#final::Final; use self::initial::Initial; use self::proximity::Proximity; @@ -18,6 +19,7 @@ use self::words::Words; mod asc_desc; mod attribute; +mod exactness; mod initial; mod proximity; mod typo; @@ -81,6 +83,9 @@ pub trait Context<'c> { fn docid_words_positions(&self, docid: DocumentId) -> heed::Result>; fn word_position_iterator(&self, word: &str, level: TreeLevel, in_prefix_cache: bool, left: Option, right: Option) -> heed::Result> + 'c>>; fn word_position_last_level(&self, word: &str, in_prefix_cache: bool) -> heed::Result>; + fn synonyms(&self, word: &str) -> heed::Result>>>; + fn searchable_fields_ids(&self) -> heed::Result>; + fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> Result, heed::Error>; } pub struct CriteriaBuilder<'t> { rtxn: &'t heed::RoTxn<'t>, @@ -170,6 +175,23 @@ impl<'c> Context<'c> for CriteriaBuilder<'c> { Ok(last_level) } + + fn synonyms(&self, word: &str) -> heed::Result>>> { + self.index.words_synonyms(self.rtxn, &[word]) + } + + fn searchable_fields_ids(&self) -> 
heed::Result> { + match self.index.searchable_fields_ids(self.rtxn)? { + Some(searchable_fields_ids) => Ok(searchable_fields_ids), + None => Ok(self.index.fields_ids_map(self.rtxn)?.ids().collect()), + } + + } + + fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> Result, heed::Error> { + let key = (word, level, left, right); + self.index.word_level_position_docids.get(self.rtxn, &key) + } } impl<'t> CriteriaBuilder<'t> { @@ -182,11 +204,14 @@ impl<'t> CriteriaBuilder<'t> { pub fn build( &'t self, query_tree: Option, + primitive_query: Option>, facet_candidates: Option, ) -> anyhow::Result> { use crate::criterion::Criterion as Name; + let primitive_query = primitive_query.unwrap_or_default(); + let mut criterion = Box::new(Initial::new(query_tree, facet_candidates)) as Box; for name in self.index.criteria(&self.rtxn)? { criterion = match name { @@ -194,6 +219,7 @@ impl<'t> CriteriaBuilder<'t> { Name::Words => Box::new(Words::new(self, criterion)), Name::Proximity => Box::new(Proximity::new(self, criterion)), Name::Attribute => Box::new(Attribute::new(self, criterion)), + Name::Exactness => Box::new(Exactness::new(self, criterion, &primitive_query)?), Name::Asc(field) => Box::new(AscDesc::asc(&self.index, &self.rtxn, criterion, field)?), Name::Desc(field) => Box::new(AscDesc::desc(&self.index, &self.rtxn, criterion, field)?), _otherwise => criterion, @@ -455,6 +481,18 @@ pub mod test { fn word_position_last_level(&self, _word: &str, _in_prefix_cache: bool) -> heed::Result> { todo!() } + + fn synonyms(&self, word: &str) -> heed::Result>>> { + todo!() + } + + fn searchable_fields_ids(&self) -> heed::Result> { + todo!() + } + + fn word_level_position_docids(&self, word: &str, level: TreeLevel, left: u32, right: u32) -> Result, heed::Error> { + todo!() + } } impl<'a> Default for TestContext<'a> { diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 4227ab0a6..be107bf72 100644 --- a/milli/src/search/mod.rs +++ 
b/milli/src/search/mod.rs @@ -97,7 +97,7 @@ impl<'a> Search<'a> { pub fn execute(&self) -> anyhow::Result { // We create the query tree by spliting the query into tokens. let before = Instant::now(); - let query_tree = match self.query.as_ref() { + let (query_tree, primitive_query) = match self.query.as_ref() { Some(query) => { let mut builder = QueryTreeBuilder::new(self.rtxn, self.index); builder.optional_words(self.optional_words); @@ -113,9 +113,9 @@ impl<'a> Search<'a> { let analyzer = Analyzer::new(config); let result = analyzer.analyze(query); let tokens = result.tokens(); - builder.build(tokens)? + builder.build(tokens)?.map_or((None, None), |(qt, pq)| (Some(qt), Some(pq))) }, - None => None, + None => (None, None), }; debug!("query tree: {:?} took {:.02?}", query_tree, before.elapsed()); @@ -135,7 +135,7 @@ impl<'a> Search<'a> { }; let criteria_builder = criteria::CriteriaBuilder::new(self.rtxn, self.index)?; - let criteria = criteria_builder.build(query_tree, facet_candidates)?; + let criteria = criteria_builder.build(query_tree, primitive_query, facet_candidates)?; match self.index.distinct_attribute(self.rtxn)? 
{ None => self.perform_sort(NoopDistinct, matching_words, criteria), diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs index 492b98a1e..b74b8af58 100644 --- a/milli/src/search/query_tree.rs +++ b/milli/src/search/query_tree.rs @@ -228,11 +228,12 @@ impl<'a> QueryTreeBuilder<'a> { /// - if `authorize_typos` is set to `false` the query tree will be generated /// forcing all query words to match documents without any typo /// (the criterion `typo` will be ignored) - pub fn build(&self, query: TokenStream) -> anyhow::Result> { + pub fn build(&self, query: TokenStream) -> anyhow::Result> { let stop_words = self.index.stop_words(self.rtxn)?; let primitive_query = create_primitive_query(query, stop_words, self.words_limit); if !primitive_query.is_empty() { - create_query_tree(self, self.optional_words, self.authorize_typos, primitive_query).map(Some) + let qt = create_query_tree(self, self.optional_words, self.authorize_typos, &primitive_query)?; + Ok(Some((qt, primitive_query))) } else { Ok(None) } @@ -340,7 +341,7 @@ fn create_query_tree( ctx: &impl Context, optional_words: bool, authorize_typos: bool, - query: PrimitiveQuery, + query: &[PrimitiveQueryPart], ) -> anyhow::Result { /// Matches on the `PrimitiveQueryPart` and create an operation from it. 
@@ -458,16 +459,16 @@ fn create_query_tree( } if optional_words { - optional_word(ctx, authorize_typos, query) + optional_word(ctx, authorize_typos, query.to_vec()) } else { - ngrams(ctx, authorize_typos, query.as_slice()) + ngrams(ctx, authorize_typos, query) } } -type PrimitiveQuery = Vec; +pub type PrimitiveQuery = Vec; #[derive(Debug, Clone)] -enum PrimitiveQueryPart { +pub enum PrimitiveQueryPart { Phrase(Vec), Word(String, IsPrefix), } @@ -579,11 +580,12 @@ mod test { authorize_typos: bool, words_limit: Option, query: TokenStream, - ) -> anyhow::Result> + ) -> anyhow::Result> { let primitive_query = create_primitive_query(query, None, words_limit); if !primitive_query.is_empty() { - create_query_tree(self, optional_words, authorize_typos, primitive_query).map(Some) + let qt = create_query_tree(self, optional_words, authorize_typos, &primitive_query)?; + Ok(Some((qt, primitive_query))) } else { Ok(None) } @@ -674,7 +676,7 @@ mod test { Operation::Query(Query { prefix: true, kind: QueryKind::tolerant(2, "heyfriends".to_string()) }), ]); - let query_tree = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -694,7 +696,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "heyfriends".to_string()) }), ]); - let query_tree = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -725,7 +727,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "helloworld".to_string()) }), ]); - let query_tree = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, 
query_tree); } @@ -770,7 +772,7 @@ mod test { ]), ]); - let query_tree = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -790,7 +792,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "ngrams".to_string()) }), ]); - let query_tree = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -816,7 +818,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "wordsplitfish".to_string()) }), ]); - let query_tree = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -836,7 +838,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::exact("wooop".to_string()) }), ]); - let query_tree = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default().build(false, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -875,7 +877,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(2, "heymyfriend".to_string()) }), ]), ]); - let query_tree = TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -891,7 +893,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::exact("hey".to_string()) }), Operation::Query(Query { prefix: false, kind: QueryKind::exact("my".to_string()) }), ]); - let query_tree = TestContext::default().build(true, true, None, 
tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -925,7 +927,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::exact("friend".to_string()) }), ]), ]); - let query_tree = TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default().build(true, true, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -944,7 +946,7 @@ mod test { ]), Operation::Query(Query { prefix: false, kind: QueryKind::exact("heyfriends".to_string()) }), ]); - let query_tree = TestContext::default().build(false, false, None, tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default().build(false, false, None, tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); } @@ -957,7 +959,7 @@ mod test { let tokens = result.tokens(); let context = TestContext::default(); - let query_tree = context.build(false, true, None, tokens).unwrap().unwrap(); + let (query_tree, _) = context.build(false, true, None, tokens).unwrap().unwrap(); let expected = hashset!{ ("word", 0, false), @@ -997,7 +999,7 @@ mod test { Operation::Query(Query { prefix: false, kind: QueryKind::exact("good".to_string()) }), ]); - let query_tree = TestContext::default().build(false, false, Some(2), tokens).unwrap().unwrap(); + let (query_tree, _) = TestContext::default().build(false, false, Some(2), tokens).unwrap().unwrap(); assert_eq!(expected, query_tree); }