From f5f5f03ec0fd67973e310c89d2ea369632cec148 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Thu, 23 Mar 2023 09:35:53 +0100
Subject: [PATCH] Remove old criteria code

---
 milli/src/search/criteria/asc_desc.rs        |  569 ----------
 milli/src/search/criteria/attribute.rs       |  710 ------------
 milli/src/search/criteria/exactness.rs       |  766 -------------
 milli/src/search/criteria/final.rs           |   77 --
 milli/src/search/criteria/geo.rs             |  154 ---
 milli/src/search/criteria/initial.rs         |   82 --
 milli/src/search/criteria/mod.rs             | 1049 ------------------
 milli/src/search/criteria/proximity.rs       |  712 ------------
 milli/src/search/criteria/typo.rs            |  493 --------
 milli/src/search/criteria/words.rs           |  106 --
 milli/src/search/distinct/facet_distinct.rs  |  218 ----
 milli/src/search/distinct/mod.rs             |  155 ---
 milli/src/search/distinct/noop_distinct.rs   |   55 -
 milli/src/search/facet/facet_distribution.rs |    4 +-
 milli/src/search/facet/mod.rs                |   36 +-
 milli/src/search/mod.rs                      |  205 +---
 milli/src/search/new/mod.rs                  |   40 +-
 milli/src/search/new/query_term.rs           |    2 +-
 18 files changed, 88 insertions(+), 5345 deletions(-)
 delete mode 100644 milli/src/search/criteria/asc_desc.rs
 delete mode 100644 milli/src/search/criteria/attribute.rs
 delete mode 100644 milli/src/search/criteria/exactness.rs
 delete mode 100644 milli/src/search/criteria/final.rs
 delete mode 100644 milli/src/search/criteria/geo.rs
 delete mode 100644 milli/src/search/criteria/initial.rs
 delete mode 100644 milli/src/search/criteria/mod.rs
 delete mode 100644 milli/src/search/criteria/proximity.rs
 delete mode 100644 milli/src/search/criteria/typo.rs
 delete mode 100644 milli/src/search/criteria/words.rs
 delete mode 100644 milli/src/search/distinct/facet_distinct.rs
 delete mode 100644 milli/src/search/distinct/mod.rs
 delete mode 100644 milli/src/search/distinct/noop_distinct.rs

diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs
deleted file mode 100644
index 378e1c8da..000000000
--- a/milli/src/search/criteria/asc_desc.rs
+++ /dev/null
@@ -1,569 +0,0 @@
-use std::mem::take;
-
-use heed::BytesDecode;
-use itertools::Itertools;
-use log::debug;
-use ordered_float::OrderedFloat;
-use roaring::RoaringBitmap;
-
-use super::{Criterion, CriterionParameters, CriterionResult};
-use crate::facet::FacetType;
-use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec};
-use crate::heed_codec::ByteSliceRefCodec;
-use crate::search::criteria::{resolve_query_tree, CriteriaBuilder, InitialCandidates};
-use crate::search::facet::{ascending_facet_sort, descending_facet_sort};
-use crate::search::query_tree::Operation;
-use crate::search::CriterionImplementationStrategy;
-use crate::{FieldId, Index, Result};
-
-/// Threshold on the number of candidates that makes
-/// the system choose between one algorithm or another.
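-/// (With the `Dynamic` strategy, `facet_ordered` below ranks candidate
-/// sets of at most this size with the iterative algorithm, and larger
-/// sets with the set-based one.)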
-const CANDIDATES_THRESHOLD: u64 = 1000; - -pub struct AscDesc<'t> { - index: &'t Index, - rtxn: &'t heed::RoTxn<'t>, - field_name: String, - field_id: Option, - is_ascending: bool, - query_tree: Option, - candidates: Box> + 't>, - allowed_candidates: RoaringBitmap, - initial_candidates: InitialCandidates, - faceted_candidates: RoaringBitmap, - implementation_strategy: CriterionImplementationStrategy, - parent: Box, -} - -impl<'t> AscDesc<'t> { - pub fn asc( - index: &'t Index, - rtxn: &'t heed::RoTxn, - parent: Box, - field_name: String, - implementation_strategy: CriterionImplementationStrategy, - ) -> Result { - Self::new(index, rtxn, parent, field_name, true, implementation_strategy) - } - - pub fn desc( - index: &'t Index, - rtxn: &'t heed::RoTxn, - parent: Box, - field_name: String, - implementation_strategy: CriterionImplementationStrategy, - ) -> Result { - Self::new(index, rtxn, parent, field_name, false, implementation_strategy) - } - - fn new( - index: &'t Index, - rtxn: &'t heed::RoTxn, - parent: Box, - field_name: String, - is_ascending: bool, - implementation_strategy: CriterionImplementationStrategy, - ) -> Result { - let fields_ids_map = index.fields_ids_map(rtxn)?; - let field_id = fields_ids_map.id(&field_name); - let faceted_candidates = match field_id { - Some(field_id) => { - let number_faceted = - index.faceted_documents_ids(rtxn, field_id, FacetType::Number)?; - let string_faceted = - index.faceted_documents_ids(rtxn, field_id, FacetType::String)?; - number_faceted | string_faceted - } - None => RoaringBitmap::default(), - }; - - Ok(AscDesc { - index, - rtxn, - field_name, - field_id, - is_ascending, - query_tree: None, - candidates: Box::new(std::iter::empty()), - allowed_candidates: RoaringBitmap::new(), - faceted_candidates, - initial_candidates: InitialCandidates::Estimated(RoaringBitmap::new()), - implementation_strategy, - parent, - }) - } -} - -impl<'t> Criterion for AscDesc<'t> { - #[logging_timer::time("AscDesc::{}")] - fn next(&mut self, params: &mut CriterionParameters) -> Result> { - // remove excluded candidates when next is called, instead of doing it in the loop. - self.allowed_candidates -= params.excluded_candidates; - - loop { - debug!( - "Facet {}({}) iteration", - if self.is_ascending { "Asc" } else { "Desc" }, - self.field_name - ); - - match self.candidates.next().transpose()? { - None if !self.allowed_candidates.is_empty() => { - return Ok(Some(CriterionResult { - query_tree: self.query_tree.clone(), - candidates: Some(take(&mut self.allowed_candidates)), - filtered_candidates: None, - initial_candidates: Some(self.initial_candidates.take()), - })); - } - None => match self.parent.next(params)? { - Some(CriterionResult { - query_tree, - candidates, - filtered_candidates, - initial_candidates, - }) => { - self.query_tree = query_tree; - let mut candidates = match (&self.query_tree, candidates) { - (_, Some(candidates)) => candidates, - (Some(qt), None) => { - let context = CriteriaBuilder::new(self.rtxn, self.index)?; - resolve_query_tree(&context, qt, params.wdcache)? 
- } - (None, None) => self.index.documents_ids(self.rtxn)?, - }; - - if let Some(filtered_candidates) = filtered_candidates { - candidates &= filtered_candidates; - } - - match initial_candidates { - Some(initial_candidates) => { - self.initial_candidates |= initial_candidates - } - None => self.initial_candidates.map_inplace(|c| c | &candidates), - } - - if candidates.is_empty() { - continue; - } - - self.allowed_candidates = &candidates - params.excluded_candidates; - self.candidates = match self.field_id { - Some(field_id) => facet_ordered( - self.index, - self.rtxn, - field_id, - self.is_ascending, - candidates & &self.faceted_candidates, - self.implementation_strategy, - )?, - None => Box::new(std::iter::empty()), - }; - } - None => return Ok(None), - }, - Some(mut candidates) => { - candidates -= params.excluded_candidates; - self.allowed_candidates -= &candidates; - return Ok(Some(CriterionResult { - query_tree: self.query_tree.clone(), - candidates: Some(candidates), - filtered_candidates: None, - initial_candidates: Some(self.initial_candidates.take()), - })); - } - } - } - } -} - -fn facet_ordered_iterative<'t>( - index: &'t Index, - rtxn: &'t heed::RoTxn, - field_id: FieldId, - is_ascending: bool, - candidates: RoaringBitmap, -) -> Result> + 't>> { - let number_iter = iterative_facet_number_ordered_iter( - index, - rtxn, - field_id, - is_ascending, - candidates.clone(), - )?; - let string_iter = - iterative_facet_string_ordered_iter(index, rtxn, field_id, is_ascending, candidates)?; - Ok(Box::new(number_iter.chain(string_iter).map(Ok)) as Box>) -} - -fn facet_extreme_value<'t>( - mut extreme_it: impl Iterator> + 't, -) -> Result> { - let extreme_value = - if let Some(extreme_value) = extreme_it.next() { extreme_value } else { return Ok(None) }; - let (_, extreme_value) = extreme_value?; - - Ok(OrderedF64Codec::bytes_decode(extreme_value)) -} - -pub fn facet_min_value<'t>( - index: &'t Index, - rtxn: &'t heed::RoTxn, - field_id: FieldId, - candidates: RoaringBitmap, -) -> Result> { - let db = index.facet_id_f64_docids.remap_key_type::>(); - let it = ascending_facet_sort(rtxn, db, field_id, candidates)?; - facet_extreme_value(it) -} - -pub fn facet_max_value<'t>( - index: &'t Index, - rtxn: &'t heed::RoTxn, - field_id: FieldId, - candidates: RoaringBitmap, -) -> Result> { - let db = index.facet_id_f64_docids.remap_key_type::>(); - let it = descending_facet_sort(rtxn, db, field_id, candidates)?; - facet_extreme_value(it) -} - -fn facet_ordered_set_based<'t>( - index: &'t Index, - rtxn: &'t heed::RoTxn, - field_id: FieldId, - is_ascending: bool, - candidates: RoaringBitmap, -) -> Result> + 't>> { - let number_db = - index.facet_id_f64_docids.remap_key_type::>(); - let string_db = - index.facet_id_string_docids.remap_key_type::>(); - - let (number_iter, string_iter) = if is_ascending { - let number_iter = ascending_facet_sort(rtxn, number_db, field_id, candidates.clone())?; - let string_iter = ascending_facet_sort(rtxn, string_db, field_id, candidates)?; - - (itertools::Either::Left(number_iter), itertools::Either::Left(string_iter)) - } else { - let number_iter = descending_facet_sort(rtxn, number_db, field_id, candidates.clone())?; - let string_iter = descending_facet_sort(rtxn, string_db, field_id, candidates)?; - - (itertools::Either::Right(number_iter), itertools::Either::Right(string_iter)) - }; - - Ok(Box::new(number_iter.chain(string_iter).map(|res| res.map(|(doc_ids, _)| doc_ids)))) -} - -/// Returns an iterator over groups of the given candidates in ascending or descending 
order. -/// -/// It will either use an iterative or a recursive method on the whole facet database depending -/// on the number of candidates to rank. -fn facet_ordered<'t>( - index: &'t Index, - rtxn: &'t heed::RoTxn, - field_id: FieldId, - is_ascending: bool, - candidates: RoaringBitmap, - implementation_strategy: CriterionImplementationStrategy, -) -> Result> + 't>> { - match implementation_strategy { - CriterionImplementationStrategy::OnlyIterative => { - facet_ordered_iterative(index, rtxn, field_id, is_ascending, candidates) - } - CriterionImplementationStrategy::OnlySetBased => { - facet_ordered_set_based(index, rtxn, field_id, is_ascending, candidates) - } - CriterionImplementationStrategy::Dynamic => { - if candidates.len() <= CANDIDATES_THRESHOLD { - facet_ordered_iterative(index, rtxn, field_id, is_ascending, candidates) - } else { - facet_ordered_set_based(index, rtxn, field_id, is_ascending, candidates) - } - } - } -} - -/// Fetch the whole list of candidates facet number values one by one and order them by it. -/// -/// This function is fast when the amount of candidates to rank is small. -fn iterative_facet_number_ordered_iter<'t>( - index: &'t Index, - rtxn: &'t heed::RoTxn, - field_id: FieldId, - is_ascending: bool, - candidates: RoaringBitmap, -) -> Result + 't> { - let mut docids_values = Vec::with_capacity(candidates.len() as usize); - for docid in candidates.iter() { - let left = (field_id, docid, f64::MIN); - let right = (field_id, docid, f64::MAX); - let mut iter = index.field_id_docid_facet_f64s.range(rtxn, &(left..=right))?; - let entry = if is_ascending { iter.next() } else { iter.last() }; - if let Some(((_, _, value), ())) = entry.transpose()? { - docids_values.push((docid, OrderedFloat(value))); - } - } - docids_values.sort_unstable_by_key(|(_, v)| *v); - let iter = docids_values.into_iter(); - let iter = if is_ascending { - Box::new(iter) as Box> - } else { - Box::new(iter.rev()) - }; - - // The itertools GroupBy iterator doesn't provide an owned version, we are therefore - // required to collect the result into an owned collection (a Vec). - // https://github.com/rust-itertools/itertools/issues/499 - #[allow(clippy::needless_collect)] - let vec: Vec<_> = iter - .group_by(|(_, v)| *v) - .into_iter() - .map(|(_, ids)| ids.map(|(id, _)| id).collect()) - .collect(); - - Ok(vec.into_iter()) -} - -/// Fetch the whole list of candidates facet string values one by one and order them by it. -/// -/// This function is fast when the amount of candidates to rank is small. -fn iterative_facet_string_ordered_iter<'t>( - index: &'t Index, - rtxn: &'t heed::RoTxn, - field_id: FieldId, - is_ascending: bool, - candidates: RoaringBitmap, -) -> Result + 't> { - let mut docids_values = Vec::with_capacity(candidates.len() as usize); - for docid in candidates.iter() { - let left = (field_id, docid, ""); - let right = (field_id, docid.saturating_add(1), ""); - // FIXME Doing this means that it will never be possible to retrieve - // the document with id 2^32, not sure this is a real problem. - let mut iter = index.field_id_docid_facet_strings.range(rtxn, &(left..right))?; - let entry = if is_ascending { iter.next() } else { iter.last() }; - if let Some(((_, _, value), _)) = entry.transpose()? 
{ - docids_values.push((docid, value)); - } - } - docids_values.sort_unstable_by_key(|(_, v)| *v); - let iter = docids_values.into_iter(); - let iter = if is_ascending { - Box::new(iter) as Box> - } else { - Box::new(iter.rev()) - }; - - // The itertools GroupBy iterator doesn't provide an owned version, we are therefore - // required to collect the result into an owned collection (a Vec). - // https://github.com/rust-itertools/itertools/issues/499 - #[allow(clippy::needless_collect)] - let vec: Vec<_> = iter - .group_by(|(_, v)| *v) - .into_iter() - .map(|(_, ids)| ids.map(|(id, _)| id).collect()) - .collect(); - - Ok(vec.into_iter()) -} - -#[cfg(test)] -mod tests { - use std::str::FromStr; - - use big_s::S; - use maplit::hashset; - - use crate::index::tests::TempIndex; - use crate::{AscDesc, Criterion, Filter, Search, SearchResult}; - - // Note that in this test, only the iterative sort algorithms are used. Set the CANDIDATES_THESHOLD - // constant to 0 to ensure that the other sort algorithms are also correct. - #[test] - fn sort_criterion_placeholder() { - let index = TempIndex::new(); - - index - .update_settings(|settings| { - settings.set_primary_key("id".to_owned()); - settings - .set_sortable_fields(maplit::hashset! { S("id"), S("mod_10"), S("mod_20") }); - settings.set_criteria(vec![Criterion::Sort]); - }) - .unwrap(); - - let mut docs = vec![]; - for i in 0..100 { - docs.push( - serde_json::json!({ "id": i, "mod_10": format!("{}", i % 10), "mod_20": i % 20 }), - ); - } - - index.add_documents(documents!(docs)).unwrap(); - - let all_ids = (0..100).collect::>(); - - let rtxn = index.read_txn().unwrap(); - - let mut search = Search::new(&rtxn, &index); - search.sort_criteria(vec![AscDesc::from_str("mod_10:desc").unwrap()]); - search.limit(100); - - let SearchResult { mut documents_ids, .. } = search.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 19, 29, 39, 49, 59, 69, 79, 89, 99, 8, 18, 28, 38, 48, 58, 68, 78, 88, 98, 7, 17, 27, 37, 47, 57, 67, 77, 87, 97, 6, 16, 26, 36, 46, 56, 66, 76, 86, 96, 5, 15, 25, 35, 45, 55, 65, 75, 85, 95, 4, 14, 24, 34, 44, 54, 64, 74, 84, 94, 3, 13, 23, 33, 43, 53, 63, 73, 83, 93, 2, 12, 22, 32, 42, 52, 62, 72, 82, 92, 1, 11, 21, 31, 41, 51, 61, 71, 81, 91, 0, 10, 20, 30, 40, 50, 60, 70, 80, 90]"); - documents_ids.sort(); - assert_eq!(all_ids, documents_ids); - - let mut search = Search::new(&rtxn, &index); - search.sort_criteria(vec![ - AscDesc::from_str("mod_10:desc").unwrap(), - AscDesc::from_str("id:desc").unwrap(), - ]); - search.limit(100); - - let SearchResult { mut documents_ids, .. } = search.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[99, 89, 79, 69, 59, 49, 39, 29, 19, 9, 98, 88, 78, 68, 58, 48, 38, 28, 18, 8, 97, 87, 77, 67, 57, 47, 37, 27, 17, 7, 96, 86, 76, 66, 56, 46, 36, 26, 16, 6, 95, 85, 75, 65, 55, 45, 35, 25, 15, 5, 94, 84, 74, 64, 54, 44, 34, 24, 14, 4, 93, 83, 73, 63, 53, 43, 33, 23, 13, 3, 92, 82, 72, 62, 52, 42, 32, 22, 12, 2, 91, 81, 71, 61, 51, 41, 31, 21, 11, 1, 90, 80, 70, 60, 50, 40, 30, 20, 10, 0]"); - documents_ids.sort(); - assert_eq!(all_ids, documents_ids); - - let mut search = Search::new(&rtxn, &index); - search.sort_criteria(vec![ - AscDesc::from_str("mod_10:desc").unwrap(), - AscDesc::from_str("mod_20:asc").unwrap(), - ]); - search.limit(100); - - let SearchResult { mut documents_ids, .. 
} = search.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 29, 49, 69, 89, 19, 39, 59, 79, 99, 8, 28, 48, 68, 88, 18, 38, 58, 78, 98, 7, 27, 47, 67, 87, 17, 37, 57, 77, 97, 6, 26, 46, 66, 86, 16, 36, 56, 76, 96, 5, 25, 45, 65, 85, 15, 35, 55, 75, 95, 4, 24, 44, 64, 84, 14, 34, 54, 74, 94, 3, 23, 43, 63, 83, 13, 33, 53, 73, 93, 2, 22, 42, 62, 82, 12, 32, 52, 72, 92, 1, 21, 41, 61, 81, 11, 31, 51, 71, 91, 0, 20, 40, 60, 80, 10, 30, 50, 70, 90]"); - documents_ids.sort(); - assert_eq!(all_ids, documents_ids); - - let mut search = Search::new(&rtxn, &index); - search.sort_criteria(vec![ - AscDesc::from_str("mod_10:desc").unwrap(), - AscDesc::from_str("mod_20:desc").unwrap(), - ]); - search.limit(100); - - let SearchResult { mut documents_ids, .. } = search.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 39, 59, 79, 99, 9, 29, 49, 69, 89, 18, 38, 58, 78, 98, 8, 28, 48, 68, 88, 17, 37, 57, 77, 97, 7, 27, 47, 67, 87, 16, 36, 56, 76, 96, 6, 26, 46, 66, 86, 15, 35, 55, 75, 95, 5, 25, 45, 65, 85, 14, 34, 54, 74, 94, 4, 24, 44, 64, 84, 13, 33, 53, 73, 93, 3, 23, 43, 63, 83, 12, 32, 52, 72, 92, 2, 22, 42, 62, 82, 11, 31, 51, 71, 91, 1, 21, 41, 61, 81, 10, 30, 50, 70, 90, 0, 20, 40, 60, 80]"); - documents_ids.sort(); - assert_eq!(all_ids, documents_ids); - - let mut search = Search::new(&rtxn, &index); - search.sort_criteria(vec![ - AscDesc::from_str("mod_10:desc").unwrap(), - AscDesc::from_str("mod_20:desc").unwrap(), - AscDesc::from_str("id:desc").unwrap(), - ]); - search.limit(100); - - let SearchResult { mut documents_ids, .. } = search.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[99, 79, 59, 39, 19, 89, 69, 49, 29, 9, 98, 78, 58, 38, 18, 88, 68, 48, 28, 8, 97, 77, 57, 37, 17, 87, 67, 47, 27, 7, 96, 76, 56, 36, 16, 86, 66, 46, 26, 6, 95, 75, 55, 35, 15, 85, 65, 45, 25, 5, 94, 74, 54, 34, 14, 84, 64, 44, 24, 4, 93, 73, 53, 33, 13, 83, 63, 43, 23, 3, 92, 72, 52, 32, 12, 82, 62, 42, 22, 2, 91, 71, 51, 31, 11, 81, 61, 41, 21, 1, 90, 70, 50, 30, 10, 80, 60, 40, 20, 0]"); - documents_ids.sort(); - assert_eq!(all_ids, documents_ids); - } - - // Note that in this test, only the iterative sort algorithms are used. Set the CANDIDATES_THESHOLD - // constant to 0 to ensure that the other sort algorithms are also correct. - #[test] - fn sort_criterion_non_placeholder() { - let index = TempIndex::new(); - - index - .update_settings(|settings| { - settings.set_primary_key("id".to_owned()); - settings.set_filterable_fields(hashset! { S("id"), S("mod_10"), S("mod_20") }); - settings.set_sortable_fields(hashset! { S("id"), S("mod_10"), S("mod_20") }); - settings.set_criteria(vec![Criterion::Sort]); - }) - .unwrap(); - - let mut docs = vec![]; - for i in 0..100 { - docs.push( - serde_json::json!({ "id": i, "mod_10": format!("{}", i % 10), "mod_20": i % 20 }), - ); - } - - index.add_documents(documents!(docs)).unwrap(); - - let rtxn = index.read_txn().unwrap(); - - let mut search = Search::new(&rtxn, &index); - search.filter( - Filter::from_str("mod_10 IN [1, 0, 2] OR mod_20 IN [10, 13] OR id IN [5, 6]") - .unwrap() - .unwrap(), - ); - search.sort_criteria(vec![ - AscDesc::from_str("mod_10:desc").unwrap(), - AscDesc::from_str("mod_20:asc").unwrap(), - AscDesc::from_str("id:desc").unwrap(), - ]); - search.limit(100); - - let SearchResult { mut documents_ids, .. 
} = search.execute().unwrap(); - // The order should be in increasing value of the id modulo 10, followed by increasing value of the id modulo 20, followed by decreasing value of the id - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6, 5, 93, 73, 53, 33, 13, 82, 62, 42, 22, 2, 92, 72, 52, 32, 12, 81, 61, 41, 21, 1, 91, 71, 51, 31, 11, 80, 60, 40, 20, 0, 90, 70, 50, 30, 10]"); - let expected_ids = (0..100) - .filter(|id| { - [1, 0, 2].contains(&(id % 10)) - || [10, 13].contains(&(id % 20)) - || [5, 6].contains(id) - }) - .collect::>(); - documents_ids.sort(); - assert_eq!(expected_ids, documents_ids); - - let mut search = Search::new(&rtxn, &index); - search.filter( - Filter::from_str("mod_10 IN [7, 8, 0] OR mod_20 IN [1, 15, 16] OR id IN [0, 4]") - .unwrap() - .unwrap(), - ); - search.sort_criteria(vec![ - AscDesc::from_str("mod_10:asc").unwrap(), - AscDesc::from_str("mod_20:asc").unwrap(), - AscDesc::from_str("id:desc").unwrap(), - ]); - search.limit(100); - - let SearchResult { mut documents_ids, .. } = search.execute().unwrap(); - // The order should be in increasing value of the id modulo 10, followed by increasing value of the id modulo 20, followed by decreasing value of the id - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[80, 60, 40, 20, 0, 90, 70, 50, 30, 10, 81, 61, 41, 21, 1, 4, 95, 75, 55, 35, 15, 96, 76, 56, 36, 16, 87, 67, 47, 27, 7, 97, 77, 57, 37, 17, 88, 68, 48, 28, 8, 98, 78, 58, 38, 18]"); - let expected_ids = (0..100) - .filter(|id| { - [7, 8, 0].contains(&(id % 10)) - || [1, 15, 16].contains(&(id % 20)) - || [0, 4].contains(id) - }) - .collect::>(); - documents_ids.sort(); - assert_eq!(expected_ids, documents_ids); - - let mut search = Search::new(&rtxn, &index); - search.filter( - Filter::from_str("mod_10 IN [1, 0, 2] OR mod_20 IN [10, 13] OR id IN [5, 6]") - .unwrap() - .unwrap(), - ); - search.sort_criteria(vec![AscDesc::from_str("id:desc").unwrap()]); - search.limit(100); - - let SearchResult { documents_ids, .. } = search.execute().unwrap(); - // The order should be in decreasing value of the id - let mut expected_ids = (0..100) - .filter(|id| { - [1, 0, 2].contains(&(id % 10)) - || [10, 13].contains(&(id % 20)) - || [5, 6].contains(id) - }) - .collect::>(); - expected_ids.sort(); - expected_ids.reverse(); - assert_eq!(expected_ids, documents_ids); - } -} diff --git a/milli/src/search/criteria/attribute.rs b/milli/src/search/criteria/attribute.rs deleted file mode 100644 index 322f6e051..000000000 --- a/milli/src/search/criteria/attribute.rs +++ /dev/null @@ -1,710 +0,0 @@ -use std::cmp::{self, Ordering}; -use std::collections::binary_heap::PeekMut; -use std::collections::{btree_map, BTreeMap, BinaryHeap, HashMap}; -use std::iter::Peekable; -use std::mem::take; - -use roaring::RoaringBitmap; - -use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult}; -use crate::search::criteria::{InitialCandidates, Query}; -use crate::search::query_tree::{Operation, QueryKind}; -use crate::search::{ - build_dfa, word_derivations, CriterionImplementationStrategy, WordDerivationsCache, -}; -use crate::Result; - -/// To be able to divide integers by the number of words in the query -/// we want to find a multiplier that allow us to divide by any number between 1 and 10. -/// We chose the LCM of all numbers between 1 and 10 as the multiplier (https://en.wikipedia.org/wiki/Least_common_multiple). 
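-/// For example, 2520 / 3 == 840, 2520 / 7 == 360 and 2520 / 9 == 280: a rank
-/// scaled by this constant can be divided by any branch length up to 10
-/// without losing integer precision.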
-const LCM_10_FIRST_NUMBERS: u32 = 2520; - -/// Threshold on the number of candidates that will make -/// the system to choose between one algorithm or another. -const CANDIDATES_THRESHOLD: u64 = 500; - -type FlattenedQueryTree = Vec>>; - -pub struct Attribute<'t> { - ctx: &'t dyn Context<'t>, - state: Option<(Operation, FlattenedQueryTree, RoaringBitmap)>, - initial_candidates: InitialCandidates, - parent: Box, - linear_buckets: Option>, - set_buckets: Option>>, - implementation_strategy: CriterionImplementationStrategy, -} - -impl<'t> Attribute<'t> { - pub fn new( - ctx: &'t dyn Context<'t>, - parent: Box, - implementation_strategy: CriterionImplementationStrategy, - ) -> Self { - Attribute { - ctx, - state: None, - initial_candidates: InitialCandidates::Estimated(RoaringBitmap::new()), - parent, - linear_buckets: None, - set_buckets: None, - implementation_strategy, - } - } -} - -impl<'t> Criterion for Attribute<'t> { - #[logging_timer::time("Attribute::{}")] - fn next(&mut self, params: &mut CriterionParameters) -> Result> { - // remove excluded candidates when next is called, instead of doing it in the loop. - if let Some((_, _, allowed_candidates)) = self.state.as_mut() { - *allowed_candidates -= params.excluded_candidates; - } - - loop { - match self.state.take() { - Some((query_tree, _, allowed_candidates)) if allowed_candidates.is_empty() => { - return Ok(Some(CriterionResult { - query_tree: Some(query_tree), - candidates: Some(RoaringBitmap::new()), - filtered_candidates: None, - initial_candidates: Some(self.initial_candidates.take()), - })); - } - Some((query_tree, flattened_query_tree, mut allowed_candidates)) => { - let found_candidates = if matches!( - self.implementation_strategy, - CriterionImplementationStrategy::OnlyIterative - ) || (matches!( - self.implementation_strategy, - CriterionImplementationStrategy::Dynamic - ) && allowed_candidates.len() - < CANDIDATES_THRESHOLD) - { - let linear_buckets = match self.linear_buckets.as_mut() { - Some(linear_buckets) => linear_buckets, - None => { - let new_buckets = initialize_linear_buckets( - self.ctx, - &flattened_query_tree, - &allowed_candidates, - )?; - self.linear_buckets.get_or_insert(new_buckets.into_iter()) - } - }; - - match linear_buckets.next() { - Some((_score, candidates)) => candidates, - None => { - return Ok(Some(CriterionResult { - query_tree: Some(query_tree), - candidates: Some(RoaringBitmap::new()), - filtered_candidates: None, - initial_candidates: Some(self.initial_candidates.take()), - })); - } - } - } else { - let set_buckets = match self.set_buckets.as_mut() { - Some(set_buckets) => set_buckets, - None => { - let new_buckets = initialize_set_buckets( - self.ctx, - &flattened_query_tree, - &allowed_candidates, - params.wdcache, - )?; - self.set_buckets.get_or_insert(new_buckets) - } - }; - - match set_compute_candidates(set_buckets, &allowed_candidates)? { - Some((_score, candidates)) => candidates, - None => { - return Ok(Some(CriterionResult { - query_tree: Some(query_tree), - candidates: Some(allowed_candidates), - filtered_candidates: None, - initial_candidates: Some(self.initial_candidates.take()), - })); - } - } - }; - - allowed_candidates -= &found_candidates; - - self.state = - Some((query_tree.clone(), flattened_query_tree, allowed_candidates)); - - return Ok(Some(CriterionResult { - query_tree: Some(query_tree), - candidates: Some(found_candidates), - filtered_candidates: None, - initial_candidates: Some(self.initial_candidates.take()), - })); - } - None => match self.parent.next(params)? 
{ - Some(CriterionResult { - query_tree: Some(query_tree), - candidates, - filtered_candidates, - initial_candidates, - }) => { - let mut candidates = match candidates { - Some(candidates) => candidates, - None => { - resolve_query_tree(self.ctx, &query_tree, params.wdcache)? - - params.excluded_candidates - } - }; - - if let Some(filtered_candidates) = filtered_candidates { - candidates &= filtered_candidates; - } - - let flattened_query_tree = flatten_query_tree(&query_tree); - - match initial_candidates { - Some(initial_candidates) => { - self.initial_candidates |= initial_candidates - } - None => self.initial_candidates.map_inplace(|c| c | &candidates), - } - - self.state = Some((query_tree, flattened_query_tree, candidates)); - self.linear_buckets = None; - } - Some(CriterionResult { - query_tree: None, - candidates, - filtered_candidates, - initial_candidates, - }) => { - return Ok(Some(CriterionResult { - query_tree: None, - candidates, - filtered_candidates, - initial_candidates, - })); - } - None => return Ok(None), - }, - } - } - } -} - -/// QueryPositionIterator is an Iterator over positions of a Query, -/// It contains iterators over words positions. -struct QueryPositionIterator<'t> { - #[allow(clippy::type_complexity)] - inner: - Vec> + 't>>>, -} - -impl<'t> QueryPositionIterator<'t> { - fn new( - ctx: &'t dyn Context<'t>, - queries: &[Query], - wdcache: &mut WordDerivationsCache, - ) -> Result { - let mut inner = Vec::with_capacity(queries.len()); - for query in queries { - let in_prefix_cache = query.prefix && ctx.in_prefix_cache(query.kind.word()); - match &query.kind { - QueryKind::Exact { word, .. } => { - if !query.prefix || in_prefix_cache { - let word = query.kind.word(); - let iter = ctx.word_position_iterator(word, in_prefix_cache)?; - inner.push(iter.peekable()); - } else { - for (word, _) in word_derivations(word, true, 0, ctx.words_fst(), wdcache)? - { - let iter = ctx.word_position_iterator(word, in_prefix_cache)?; - inner.push(iter.peekable()); - } - } - } - QueryKind::Tolerant { typo, word } => { - for (word, _) in - word_derivations(word, query.prefix, *typo, ctx.words_fst(), wdcache)? - { - let iter = ctx.word_position_iterator(word, in_prefix_cache)?; - inner.push(iter.peekable()); - } - } - }; - } - - Ok(Self { inner }) - } -} - -impl<'t> Iterator for QueryPositionIterator<'t> { - type Item = heed::Result<(u16, RoaringBitmap)>; - - fn next(&mut self) -> Option { - // sort inner words from the closest next position to the farthest next position. - let expected_pos = self - .inner - .iter_mut() - .filter_map(|wli| match wli.peek() { - Some(Ok(((_, pos), _))) => Some(*pos), - _ => None, - }) - .min()?; - - let mut candidates = None; - for wli in self.inner.iter_mut() { - if let Some(Ok(((_, pos), _))) = wli.peek() { - if *pos > expected_pos { - continue; - } - } - - match wli.next() { - Some(Ok((_, docids))) => { - candidates = match candidates.take() { - Some(candidates) => Some(candidates | docids), - None => Some(docids), - } - } - Some(Err(e)) => return Some(Err(e)), - None => continue, - } - } - - candidates.map(|candidates| Ok((expected_pos, candidates))) - } -} - -/// A Branch is represent a possible alternative of the original query and is build with the Query Tree, -/// This branch allows us to iterate over meta-interval of positions. 
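The merge step in `QueryPositionIterator::next` above can be restated as a minimal self-contained sketch (here `Vec<u32>` stands in for `RoaringBitmap`, and `next_merged` is an illustrative name, not an API from this patch):

```rust
use std::iter::Peekable;

/// Among several peekable `(position, docids)` iterators, consume every
/// iterator whose next position equals the smallest pending position and
/// union the document ids they yield.
fn next_merged<I>(iters: &mut [Peekable<I>]) -> Option<(u16, Vec<u32>)>
where
    I: Iterator<Item = (u16, Vec<u32>)>,
{
    // The smallest next position across all inner iterators.
    let expected_pos = iters.iter_mut().filter_map(|it| it.peek().map(|&(pos, _)| pos)).min()?;
    let mut docids = Vec::new();
    for it in iters.iter_mut() {
        // Only advance the iterators currently sitting at `expected_pos`.
        if it.peek().map_or(false, |&(pos, _)| pos == expected_pos) {
            docids.extend(it.next().unwrap().1);
        }
    }
    docids.sort_unstable();
    docids.dedup();
    Some((expected_pos, docids))
}
```

Each call yields one position together with the union of the document ids found at it, which is what the attribute criterion buckets on.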
-struct Branch<'t> { - query_level_iterator: Vec<(u16, RoaringBitmap, Peekable>)>, - last_result: (u16, RoaringBitmap), - branch_size: u16, -} - -impl<'t> Branch<'t> { - fn new( - ctx: &'t dyn Context<'t>, - flatten_branch: &[Vec], - wdcache: &mut WordDerivationsCache, - allowed_candidates: &RoaringBitmap, - ) -> Result { - let mut query_level_iterator = Vec::new(); - for queries in flatten_branch { - let mut qli = QueryPositionIterator::new(ctx, queries, wdcache)?.peekable(); - let (pos, docids) = qli.next().transpose()?.unwrap_or((0, RoaringBitmap::new())); - query_level_iterator.push((pos, docids & allowed_candidates, qli)); - } - - let mut branch = Self { - query_level_iterator, - last_result: (0, RoaringBitmap::new()), - branch_size: flatten_branch.len() as u16, - }; - - branch.update_last_result(); - - Ok(branch) - } - - /// return the next meta-interval of the branch, - /// and update inner interval in order to be ranked by the BinaryHeap. - fn next(&mut self, allowed_candidates: &RoaringBitmap) -> heed::Result { - // update the first query. - let index = self.lowest_iterator_index(); - match self.query_level_iterator.get_mut(index) { - Some((cur_pos, cur_docids, qli)) => match qli.next().transpose()? { - Some((next_pos, next_docids)) => { - *cur_pos = next_pos; - *cur_docids |= next_docids & allowed_candidates; - self.update_last_result(); - Ok(true) - } - None => Ok(false), - }, - None => Ok(false), - } - } - - fn lowest_iterator_index(&mut self) -> usize { - let (index, _) = self - .query_level_iterator - .iter_mut() - .map(|(pos, docids, qli)| { - if docids.is_empty() { - 0 - } else { - match qli.peek() { - Some(result) => { - result.as_ref().map(|(next_pos, _)| *next_pos - *pos).unwrap_or(0) - } - None => u16::MAX, - } - } - }) - .enumerate() - .min_by_key(|(_, diff)| *diff) - .unwrap_or((0, 0)); - - index - } - - fn update_last_result(&mut self) { - let mut result_pos = 0; - let mut result_docids = None; - - for (pos, docids, _qli) in self.query_level_iterator.iter() { - result_pos += pos; - result_docids = result_docids - .take() - .map_or_else(|| Some(docids.clone()), |candidates| Some(candidates & docids)); - } - - // remove last result docids from inner iterators - if let Some(docids) = result_docids.as_ref() { - for (_, query_docids, _) in self.query_level_iterator.iter_mut() { - *query_docids -= docids; - } - } - - self.last_result = (result_pos, result_docids.unwrap_or_default()); - } - - /// return the score of the current inner interval. - fn compute_rank(&self) -> u32 { - // we compute a rank from the position. - let (pos, _) = self.last_result; - pos.saturating_sub((0..self.branch_size).sum()) as u32 * LCM_10_FIRST_NUMBERS - / self.branch_size as u32 - } - - fn cmp(&self, other: &Self) -> Ordering { - let self_rank = self.compute_rank(); - let other_rank = other.compute_rank(); - - // lower rank is better, and because BinaryHeap give the higher ranked branch, we reverse it. 
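-        // (`BinaryHeap` is a max-heap, so this reversed ordering makes
-        // `peek_mut` in `set_compute_candidates` surface the branch with the
-        // lowest, i.e. best, rank first.)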
- self_rank.cmp(&other_rank).reverse() - } -} - -impl<'t> Ord for Branch<'t> { - fn cmp(&self, other: &Self) -> Ordering { - self.cmp(other) - } -} - -impl<'t> PartialOrd for Branch<'t> { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl<'t> PartialEq for Branch<'t> { - fn eq(&self, other: &Self) -> bool { - self.cmp(other) == Ordering::Equal - } -} - -impl<'t> Eq for Branch<'t> {} - -fn initialize_set_buckets<'t>( - ctx: &'t dyn Context<'t>, - branches: &FlattenedQueryTree, - allowed_candidates: &RoaringBitmap, - wdcache: &mut WordDerivationsCache, -) -> Result>> { - let mut heap = BinaryHeap::new(); - for flatten_branch in branches { - let branch = Branch::new(ctx, flatten_branch, wdcache, allowed_candidates)?; - heap.push(branch); - } - - Ok(heap) -} - -fn set_compute_candidates( - branches_heap: &mut BinaryHeap, - allowed_candidates: &RoaringBitmap, -) -> Result> { - let mut final_candidates: Option<(u32, RoaringBitmap)> = None; - let mut allowed_candidates = allowed_candidates.clone(); - - while let Some(mut branch) = branches_heap.peek_mut() { - // if current is worst than best we break to return - // candidates that correspond to the best rank - let branch_rank = branch.compute_rank(); - if let Some((best_rank, _)) = final_candidates { - if branch_rank > best_rank { - break; - } - } - - let candidates = take(&mut branch.last_result.1); - if candidates.is_empty() { - // we don't have candidates, get next interval. - if !branch.next(&allowed_candidates)? { - PeekMut::pop(branch); - } - } else { - allowed_candidates -= &candidates; - final_candidates = match final_candidates.take() { - // we add current candidates to best candidates - Some((best_rank, mut best_candidates)) => { - best_candidates |= candidates; - branch.next(&allowed_candidates)?; - Some((best_rank, best_candidates)) - } - // we take current candidates as best candidates - None => { - branch.next(&allowed_candidates)?; - Some((branch_rank, candidates)) - } - }; - } - } - - Ok(final_candidates) -} - -fn initialize_linear_buckets( - ctx: &dyn Context, - branches: &FlattenedQueryTree, - allowed_candidates: &RoaringBitmap, -) -> Result> { - fn compute_candidate_rank( - branches: &FlattenedQueryTree, - words_positions: HashMap, - ) -> u64 { - let mut min_rank = u64::max_value(); - for branch in branches { - let branch_len = branch.len(); - let mut branch_rank = Vec::with_capacity(branch_len); - for derivates in branch { - let mut position = None; - for Query { prefix, kind } in derivates { - // find the best position of the current word in the document. - let current_position = match kind { - QueryKind::Exact { word, .. } => { - if *prefix { - word_derivations(word, true, 0, &words_positions) - .flat_map(|positions| positions.iter().next()) - .min() - } else { - words_positions - .get(word) - .and_then(|positions| positions.iter().next()) - } - } - QueryKind::Tolerant { typo, word } => { - word_derivations(word, *prefix, *typo, &words_positions) - .flat_map(|positions| positions.iter().next()) - .min() - } - }; - - match (position, current_position) { - (Some(p), Some(cp)) => position = Some(cmp::min(p, cp)), - (None, Some(cp)) => position = Some(cp), - _ => (), - } - } - - // if a position is found, we add it to the branch score, - // otherwise the branch is considered as unfindable in this document and we break. 
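-                // e.g. for a 3-word branch with best positions [2, 3, 5]:
-                // subtracting each word index gives [2, 2, 3] (sum 7), so the
-                // branch rank below is 7 * LCM_10_FIRST_NUMBERS / 3 = 5880.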
- if let Some(position) = position { - branch_rank.push(position as u64); - } else { - branch_rank.clear(); - break; - } - } - - if !branch_rank.is_empty() { - branch_rank.sort_unstable(); - // because several words in same query can't match all a the position 0, - // we substract the word index to the position. - let branch_rank: u64 = - branch_rank.into_iter().enumerate().map(|(i, r)| r - i as u64).sum(); - // here we do the means of the words of the branch - min_rank = - min_rank.min(branch_rank * LCM_10_FIRST_NUMBERS as u64 / branch_len as u64); - } - } - - min_rank - } - - fn word_derivations<'a>( - word: &str, - is_prefix: bool, - max_typo: u8, - words_positions: &'a HashMap, - ) -> impl Iterator { - let dfa = build_dfa(word, max_typo, is_prefix); - words_positions.iter().filter_map(move |(document_word, positions)| { - use levenshtein_automata::Distance; - match dfa.eval(document_word) { - Distance::Exact(_) => Some(positions), - Distance::AtLeast(_) => None, - } - }) - } - - let mut candidates = BTreeMap::new(); - for docid in allowed_candidates { - let words_positions = ctx.docid_words_positions(docid)?; - let rank = compute_candidate_rank(branches, words_positions); - candidates.entry(rank).or_insert_with(RoaringBitmap::new).insert(docid); - } - - Ok(candidates) -} - -// TODO can we keep refs of Query -fn flatten_query_tree(query_tree: &Operation) -> FlattenedQueryTree { - use crate::search::criteria::Operation::{And, Or, Phrase}; - - fn and_recurse(head: &Operation, tail: &[Operation]) -> FlattenedQueryTree { - match tail.split_first() { - Some((thead, tail)) => { - let tail = and_recurse(thead, tail); - let mut out = Vec::new(); - for array in recurse(head) { - for tail_array in &tail { - let mut array = array.clone(); - array.extend(tail_array.iter().cloned()); - out.push(array); - } - } - out - } - None => recurse(head), - } - } - - fn recurse(op: &Operation) -> FlattenedQueryTree { - match op { - And(ops) => ops.split_first().map_or_else(Vec::new, |(h, t)| and_recurse(h, t)), - Or(_, ops) => { - if ops.iter().all(|op| op.query().is_some()) { - vec![vec![ops.iter().flat_map(|op| op.query()).cloned().collect()]] - } else { - ops.iter().flat_map(recurse).collect() - } - } - Phrase(words) => { - let queries = words - .iter() - .filter_map(|w| w.as_ref()) - .map(|word| vec![Query { prefix: false, kind: QueryKind::exact(word.clone()) }]) - .collect(); - vec![queries] - } - Operation::Query(query) => vec![vec![vec![query.clone()]]], - } - } - - recurse(query_tree) -} - -#[cfg(test)] -mod tests { - use big_s::S; - - use super::*; - use crate::search::criteria::QueryKind; - - #[test] - fn simple_flatten_query_tree() { - let query_tree = Operation::Or( - false, - vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythefish")) }), - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("manythe")) }), - Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("fish")) }), - ]), - Operation::And(vec![ - Operation::Query(Query { prefix: false, kind: QueryKind::exact(S("many")) }), - Operation::Or( - false, - vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact(S("thefish")), - }), - Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact(S("the")), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact(S("fish")), - }), - ]), - ], - ), - ]), - ], - ); - let result = flatten_query_tree(&query_tree); - - insta::assert_debug_snapshot!(result, @r###" 
- [ - [ - [ - Exact { - word: "manythefish", - }, - ], - ], - [ - [ - Exact { - word: "manythe", - }, - ], - [ - Exact { - word: "fish", - }, - ], - ], - [ - [ - Exact { - word: "many", - }, - ], - [ - Exact { - word: "thefish", - }, - ], - ], - [ - [ - Exact { - word: "many", - }, - ], - [ - Exact { - word: "the", - }, - ], - [ - Exact { - word: "fish", - }, - ], - ], - ] - "###); - } -} diff --git a/milli/src/search/criteria/exactness.rs b/milli/src/search/criteria/exactness.rs deleted file mode 100644 index 078a9cd6c..000000000 --- a/milli/src/search/criteria/exactness.rs +++ /dev/null @@ -1,766 +0,0 @@ -use std::collections::btree_map::Entry; -use std::collections::BTreeMap; -use std::convert::TryFrom; -use std::mem::take; - -use log::debug; -use roaring::{MultiOps, RoaringBitmap}; - -use crate::search::criteria::{ - resolve_phrase, resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult, - InitialCandidates, -}; -use crate::search::query_tree::{Operation, PrimitiveQueryPart}; -use crate::{absolute_from_relative_position, FieldId, Result}; - -pub struct Exactness<'t> { - ctx: &'t dyn Context<'t>, - query_tree: Option, - state: Option, - initial_candidates: InitialCandidates, - parent: Box, - query: Vec, - cache: Option, -} - -impl<'t> Exactness<'t> { - pub fn new( - ctx: &'t dyn Context<'t>, - parent: Box, - primitive_query: &[PrimitiveQueryPart], - ) -> heed::Result { - let mut query: Vec<_> = Vec::with_capacity(primitive_query.len()); - for part in primitive_query { - query.push(ExactQueryPart::from_primitive_query_part(ctx, part)?); - } - - Ok(Exactness { - ctx, - query_tree: None, - state: None, - initial_candidates: InitialCandidates::Estimated(RoaringBitmap::new()), - parent, - query, - cache: None, - }) - } -} - -impl<'t> Criterion for Exactness<'t> { - #[logging_timer::time("Exactness::{}")] - fn next(&mut self, params: &mut CriterionParameters) -> Result> { - // remove excluded candidates when next is called, instead of doing it in the loop. - if let Some(state) = self.state.as_mut() { - state.difference_with(params.excluded_candidates); - } - loop { - debug!("Exactness at state {:?}", self.state); - - match self.state.as_mut() { - Some(state) if state.is_empty() => { - // reset state - self.state = None; - self.query_tree = None; - // we don't need to reset the combinations cache since it only depends on - // the primitive query, which does not change - } - Some(state) => { - let (candidates, state) = - resolve_state(self.ctx, take(state), &self.query, &mut self.cache)?; - self.state = state; - - return Ok(Some(CriterionResult { - query_tree: self.query_tree.clone(), - candidates: Some(candidates), - filtered_candidates: None, - initial_candidates: Some(self.initial_candidates.take()), - })); - } - None => match self.parent.next(params)? { - Some(CriterionResult { - query_tree: Some(query_tree), - candidates, - filtered_candidates, - initial_candidates, - }) => { - let mut candidates = match candidates { - Some(candidates) => candidates, - None => { - resolve_query_tree(self.ctx, &query_tree, params.wdcache)? 
- - params.excluded_candidates - } - }; - - if let Some(filtered_candidates) = filtered_candidates { - candidates &= filtered_candidates; - } - - match initial_candidates { - Some(initial_candidates) => { - self.initial_candidates |= initial_candidates - } - None => self.initial_candidates.map_inplace(|c| c | &candidates), - } - - self.state = Some(State::new(candidates)); - self.query_tree = Some(query_tree); - } - Some(CriterionResult { - query_tree: None, - candidates, - filtered_candidates, - initial_candidates, - }) => { - return Ok(Some(CriterionResult { - query_tree: None, - candidates, - filtered_candidates, - initial_candidates, - })); - } - None => return Ok(None), - }, - } - } - } -} - -#[derive(Debug)] -enum State { - /// Extract the documents that have an attribute that contains exactly the query. - ExactAttribute(RoaringBitmap), - /// Extract the documents that have an attribute that starts with exactly the query. - AttributeStartsWith(RoaringBitmap), - /// Rank the remaining documents by the number of exact words contained. - ExactWords(RoaringBitmap), - Remainings(Vec), -} - -impl State { - fn new(candidates: RoaringBitmap) -> Self { - Self::ExactAttribute(candidates) - } - - fn difference_with(&mut self, lhs: &RoaringBitmap) { - match self { - Self::ExactAttribute(candidates) - | Self::AttributeStartsWith(candidates) - | Self::ExactWords(candidates) => *candidates -= lhs, - Self::Remainings(candidates_array) => { - candidates_array.iter_mut().for_each(|candidates| *candidates -= lhs); - candidates_array.retain(|candidates| !candidates.is_empty()); - } - } - } - - fn is_empty(&self) -> bool { - match self { - Self::ExactAttribute(candidates) - | Self::AttributeStartsWith(candidates) - | Self::ExactWords(candidates) => candidates.is_empty(), - Self::Remainings(candidates_array) => { - candidates_array.iter().all(RoaringBitmap::is_empty) - } - } - } -} - -impl Default for State { - fn default() -> Self { - Self::Remainings(vec![]) - } -} -#[logging_timer::time("Exactness::{}")] -fn resolve_state( - ctx: &dyn Context, - state: State, - query: &[ExactQueryPart], - cache: &mut Option, -) -> Result<(RoaringBitmap, Option)> { - use State::*; - match state { - ExactAttribute(mut allowed_candidates) => { - let mut candidates = RoaringBitmap::new(); - if let Ok(query_len) = u8::try_from(query.len()) { - let attributes_ids = ctx.searchable_fields_ids()?; - for id in attributes_ids { - if let Some(attribute_allowed_docids) = - ctx.field_id_word_count_docids(id, query_len)? 
- { - let mut attribute_candidates_array = - attribute_start_with_docids(ctx, id, query)?; - attribute_candidates_array.push(attribute_allowed_docids); - - candidates |= MultiOps::intersection(attribute_candidates_array); - } - } - - // only keep allowed candidates - candidates &= &allowed_candidates; - // remove current candidates from allowed candidates - allowed_candidates -= &candidates; - } - - Ok((candidates, Some(AttributeStartsWith(allowed_candidates)))) - } - AttributeStartsWith(mut allowed_candidates) => { - let mut candidates = RoaringBitmap::new(); - let attributes_ids = ctx.searchable_fields_ids()?; - for id in attributes_ids { - let attribute_candidates_array = attribute_start_with_docids(ctx, id, query)?; - candidates |= MultiOps::intersection(attribute_candidates_array); - } - - // only keep allowed candidates - candidates &= &allowed_candidates; - // remove current candidates from allowed candidates - allowed_candidates -= &candidates; - Ok((candidates, Some(ExactWords(allowed_candidates)))) - } - ExactWords(allowed_candidates) => { - // Retrieve the cache if it already exist, otherwise create it. - let owned_cache = if let Some(cache) = cache.take() { - cache - } else { - compute_combinations(ctx, query)? - }; - // The cache contains the sets of documents which contain exactly 1,2,3,.. exact words - // from the query. It cannot be empty. All the candidates in it are disjoint. - - let mut candidates_array = owned_cache.combinations.clone(); - for candidates in candidates_array.iter_mut() { - *candidates &= &allowed_candidates; - } - *cache = Some(owned_cache); - - let best_candidates = candidates_array.pop().unwrap(); - - candidates_array.insert(0, allowed_candidates); - Ok((best_candidates, Some(Remainings(candidates_array)))) - } - // pop remainings candidates until the emptiness - Remainings(mut candidates_array) => { - let candidates = candidates_array.pop().unwrap_or_default(); - if !candidates_array.is_empty() { - Ok((candidates, Some(Remainings(candidates_array)))) - } else { - Ok((candidates, None)) - } - } - } -} - -fn attribute_start_with_docids( - ctx: &dyn Context, - attribute_id: FieldId, - query: &[ExactQueryPart], -) -> heed::Result> { - let mut attribute_candidates_array = Vec::new(); - // start from attribute first position - let mut pos = absolute_from_relative_position(attribute_id, 0); - for part in query { - use ExactQueryPart::*; - match part { - Synonyms(synonyms) => { - let mut synonyms_candidates = RoaringBitmap::new(); - for word in synonyms { - let wc = ctx.word_position_docids(word, pos)?; - if let Some(word_candidates) = wc { - synonyms_candidates |= word_candidates; - } - } - attribute_candidates_array.push(synonyms_candidates); - pos += 1; - } - Phrase(phrase) => { - for word in phrase { - if let Some(word) = word { - let wc = ctx.word_position_docids(word, pos)?; - if let Some(word_candidates) = wc { - attribute_candidates_array.push(word_candidates); - } - } - pos += 1; - } - } - } - } - - Ok(attribute_candidates_array) -} - -#[derive(Debug, Clone)] -pub enum ExactQueryPart { - Phrase(Vec>), - Synonyms(Vec), -} - -impl ExactQueryPart { - fn from_primitive_query_part( - ctx: &dyn Context, - part: &PrimitiveQueryPart, - ) -> heed::Result { - let part = match part { - PrimitiveQueryPart::Word(word, _) => { - match ctx.synonyms(word)? { - Some(synonyms) => { - let mut synonyms: Vec<_> = synonyms - .into_iter() - .filter_map(|mut array| { - // keep 1 word synonyms only. 
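-                                // (popping the only word of a one-element
-                                // array leaves it empty; multi-word synonyms
-                                // therefore fall through to `None`.)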
- match array.pop() { - Some(word) if array.is_empty() => Some(word), - _ => None, - } - }) - .collect(); - synonyms.push(word.clone()); - ExactQueryPart::Synonyms(synonyms) - } - None => ExactQueryPart::Synonyms(vec![word.clone()]), - } - } - PrimitiveQueryPart::Phrase(phrase) => ExactQueryPart::Phrase(phrase.clone()), - }; - - Ok(part) - } -} - -struct ExactWordsCombinationCache { - // index 0 is only 1 word - combinations: Vec, -} - -fn compute_combinations( - ctx: &dyn Context, - query: &[ExactQueryPart], -) -> Result { - let number_of_part = query.len(); - let mut parts_candidates_array = Vec::with_capacity(number_of_part); - for part in query { - let mut candidates = RoaringBitmap::new(); - use ExactQueryPart::*; - match part { - Synonyms(synonyms) => { - for synonym in synonyms { - if let Some(synonym_candidates) = ctx.word_docids(synonym)? { - candidates |= synonym_candidates; - } - } - } - // compute intersection on pair of words with a proximity of 0. - Phrase(phrase) => { - candidates |= resolve_phrase(ctx, phrase)?; - } - } - parts_candidates_array.push(candidates); - } - let combinations = create_disjoint_combinations(parts_candidates_array); - - Ok(ExactWordsCombinationCache { combinations }) -} - -/// Given a list of bitmaps `b0,b1,...,bn` , compute the list of bitmaps `X0,X1,...,Xn` -/// such that `Xi` contains all the elements that are contained in **at least** `i+1` bitmaps among `b0,b1,...,bn`. -/// -/// The returned vector is guaranteed to be of length `n`. It is equal to `vec![X0, X1, ..., Xn]`. -/// -/// ## Implementation -/// -/// We do so by iteratively building a map containing the union of all the different ways to intersect `J` bitmaps among `b0,b1,...,bn`. -/// - The key of the map is the index `i` of the last bitmap in the intersections -/// - The value is the union of all the possible intersections of J bitmaps such that the last bitmap in the intersection is `bi` -/// -/// For example, with the bitmaps `b0,b1,b2,b3`, this map should look like this -/// ```text -/// Map 0: (first iteration, contains all the combinations of 1 bitmap) -/// // What follows are unions of intersection of bitmaps asscociated with the index of their last component -/// 0: [b0] -/// 1: [b1] -/// 2: [b2] -/// 3: [b3] -/// Map 1: (second iteration, combinations of 2 bitmaps) -/// 1: [b0&b1] -/// 2: [b0&b2 | b1&b2] -/// 3: [b0&b3 | b1&b3 | b2&b3] -/// Map 2: (third iteration, combinations of 3 bitmaps) -/// 2: [b0&b1&b2] -/// 3: [b0&b2&b3 | b1&b2&b3] -/// Map 3: (fourth iteration, combinations of 4 bitmaps) -/// 3: [b0&b1&b2&b3] -/// ``` -/// -/// These maps are built one by one from the content of the preceding map. -/// For example, to create Map 2, we look at each line of Map 1, for example: -/// ```text -/// 2: [b0&b2 | b1&b2] -/// ``` -/// And then for each i > 2, we compute `(b0&b2 | b1&b2) & bi = b0&b2&bi | b1&b2&bi` -/// and then add it the new map (Map 3) under the key `i` (if it is not empty): -/// ```text -/// 3: [b0&b2&b3 | b1&b2&b3] -/// 4: [b0&b2&b4 | b1&b2&b4] -/// 5: [b0&b2&b5 | b1&b2&b5] -/// etc. -/// ``` -/// We only keep two maps in memory at any one point. As soon as Map J is built, we flatten Map J-1 into -/// a single bitmap by taking the union of all of its values. This union gives us Xj-1. -/// -/// ## Memory Usage -/// This function is expected to be called on a maximum of 10 bitmaps. The worst case thus happens when -/// 10 identical large bitmaps are given. 
-/// -/// In the context of Meilisearch, let's imagine that we are given 10 bitmaps containing all -/// the document ids. If the dataset contains 16 million documents, then each bitmap will take -/// around 2MB of memory. -/// -/// When creating Map 3, we will have, in memory: -/// 1. The 10 original bitmaps (20MB) -/// 2. X0 : 2MB -/// 3. Map 1, containing 9 bitmaps: 18MB -/// 4. Map 2, containing 8 bitmaps: 16MB -/// 5. X1: 2MB -/// for a total of around 60MB of memory. This roughly represents the maximum memory usage of this function. -/// -/// ## Time complexity -/// Let N be the size of the given list of bitmaps and M the length of each individual bitmap. -/// -/// We need to create N new bitmaps. The most expensive one to create is the second one, where we need to -/// iterate over the N keys of Map 1, and for each of those keys `k_i`, we perform `N-k_i` bitmap unions. -/// Unioning two bitmaps is O(M), and we need to do it O(N^2) times. -/// -/// Therefore the time complexity is O(N^3 * M). -fn create_non_disjoint_combinations(bitmaps: Vec) -> Vec { - let nbr_parts = bitmaps.len(); - if nbr_parts == 1 { - return bitmaps; - } - let mut flattened_levels = vec![]; - let mut last_level: BTreeMap = - bitmaps.clone().into_iter().enumerate().collect(); - - for _ in 2..=nbr_parts { - let mut new_level = BTreeMap::new(); - for (last_part_index, base_combination) in last_level.iter() { - #[allow(clippy::needless_range_loop)] - for new_last_part_index in last_part_index + 1..nbr_parts { - let new_combination = base_combination & &bitmaps[new_last_part_index]; - if !new_combination.is_empty() { - match new_level.entry(new_last_part_index) { - Entry::Occupied(mut b) => { - *b.get_mut() |= new_combination; - } - Entry::Vacant(entry) => { - entry.insert(new_combination); - } - } - } - } - } - // Now flatten the last level to save memory - let flattened_last_level = MultiOps::union(last_level.into_values()); - flattened_levels.push(flattened_last_level); - last_level = new_level; - } - // Flatten the last level - let flattened_last_level = MultiOps::union(last_level.into_values()); - flattened_levels.push(flattened_last_level); - flattened_levels -} - -/// Given a list of bitmaps `b0,b1,...,bn` , compute the list of bitmaps `X0,X1,...,Xn` -/// such that `Xi` contains all the elements that are contained in **exactly** `i+1` bitmaps among `b0,b1,...,bn`. -/// -/// The returned vector is guaranteed to be of length `n`. It is equal to `vec![X0, X1, ..., Xn]`. 
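A small usage sketch of this contract (the bitmap contents and the `demo_disjoint_combinations` name are illustrative, not part of the patch): element 1 below lives in exactly one bitmap, 2 in exactly two, and 3 in all three, so the expected output is `[{1}, {2}, {3}]`.

```rust
use roaring::RoaringBitmap;

fn demo_disjoint_combinations() {
    let b0: RoaringBitmap = [1u32, 2, 3].into_iter().collect();
    let b1: RoaringBitmap = [2u32, 3].into_iter().collect();
    let b2: RoaringBitmap = [3u32].into_iter().collect();

    let xs = create_disjoint_combinations(vec![b0, b1, b2]);

    // X0: in exactly 1 bitmap, X1: in exactly 2, X2: in exactly 3.
    assert_eq!(xs[0].iter().collect::<Vec<u32>>(), vec![1]);
    assert_eq!(xs[1].iter().collect::<Vec<u32>>(), vec![2]);
    assert_eq!(xs[2].iter().collect::<Vec<u32>>(), vec![3]);
}
```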
-fn create_disjoint_combinations(parts_candidates_array: Vec) -> Vec { - let non_disjoint_combinations = create_non_disjoint_combinations(parts_candidates_array); - let mut disjoint_combinations = vec![]; - let mut combinations = non_disjoint_combinations.into_iter().peekable(); - while let Some(mut combination) = combinations.next() { - if let Some(forbidden) = combinations.peek() { - combination -= forbidden; - } - disjoint_combinations.push(combination) - } - - disjoint_combinations -} - -#[cfg(test)] -mod tests { - use big_s::S; - use roaring::RoaringBitmap; - - use crate::index::tests::TempIndex; - use crate::search::criteria::exactness::{ - create_disjoint_combinations, create_non_disjoint_combinations, - }; - use crate::snapshot_tests::display_bitmap; - use crate::{Criterion, SearchResult}; - - #[test] - fn test_exact_words_subcriterion() { - let index = TempIndex::new(); - - index - .update_settings(|settings| { - settings.set_primary_key(S("id")); - settings.set_criteria(vec![Criterion::Exactness]); - }) - .unwrap(); - - index - .add_documents(documents!([ - // not relevant - { "id": "0", "text": "cat good dog bad" }, - // 1 exact word - { "id": "1", "text": "they said: cats arebetter thandogs" }, - // 3 exact words - { "id": "2", "text": "they said: cats arebetter than dogs" }, - // 5 exact words - { "id": "3", "text": "they said: cats are better than dogs" }, - // attribute starts with the exact words - { "id": "4", "text": "cats are better than dogs except on Saturday" }, - // attribute equal to the exact words - { "id": "5", "text": "cats are better than dogs" }, - ])) - .unwrap(); - - let rtxn = index.read_txn().unwrap(); - - let SearchResult { matching_words: _, candidates: _, documents_ids } = - index.search(&rtxn).query("cats are better than dogs").execute().unwrap(); - - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[5, 4, 3, 2, 1]"); - } - - fn print_combinations(rbs: &[RoaringBitmap]) -> String { - let mut s = String::new(); - for rb in rbs { - s.push_str(&format!("{}\n", &display_bitmap(rb))); - } - s - } - - // In these unit tests, the test bitmaps always contain all the multiple of a certain number. - // This makes it easy to check the validity of the results of `create_disjoint_combinations` by - // counting the number of dividers of elements in the returned bitmaps. - fn assert_correct_combinations(combinations: &[RoaringBitmap], dividers: &[u32]) { - for (i, set) in combinations.iter().enumerate() { - let expected_nbr_dividers = i + 1; - for el in set { - let nbr_dividers = dividers.iter().map(|d| usize::from(el % d == 0)).sum::(); - assert_eq!( - nbr_dividers, expected_nbr_dividers, - "{el} is divisible by {nbr_dividers} elements, not {expected_nbr_dividers}." 
- ); - } - } - } - - #[test] - fn compute_combinations_1() { - let b0: RoaringBitmap = (0..).into_iter().map(|x| 2 * x).take_while(|x| *x < 150).collect(); - - let parts_candidates = vec![b0]; - - let combinations = create_disjoint_combinations(parts_candidates); - insta::assert_snapshot!(print_combinations(&combinations), @r###" - [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94, 96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 142, 144, 146, 148, ] - "###); - - assert_correct_combinations(&combinations, &[2]); - } - - #[test] - fn compute_combinations_2() { - let b0: RoaringBitmap = (0..).into_iter().map(|x| 2 * x).take_while(|x| *x < 150).collect(); - let b1: RoaringBitmap = (0..).into_iter().map(|x| 3 * x).take_while(|x| *x < 150).collect(); - - let parts_candidates = vec![b0, b1]; - - let combinations = create_disjoint_combinations(parts_candidates); - insta::assert_snapshot!(print_combinations(&combinations), @r###" - [2, 3, 4, 8, 9, 10, 14, 15, 16, 20, 21, 22, 26, 27, 28, 32, 33, 34, 38, 39, 40, 44, 45, 46, 50, 51, 52, 56, 57, 58, 62, 63, 64, 68, 69, 70, 74, 75, 76, 80, 81, 82, 86, 87, 88, 92, 93, 94, 98, 99, 100, 104, 105, 106, 110, 111, 112, 116, 117, 118, 122, 123, 124, 128, 129, 130, 134, 135, 136, 140, 141, 142, 146, 147, 148, ] - [0, 6, 12, 18, 24, 30, 36, 42, 48, 54, 60, 66, 72, 78, 84, 90, 96, 102, 108, 114, 120, 126, 132, 138, 144, ] - "###); - } - - #[test] - fn compute_combinations_4() { - let b0: RoaringBitmap = (0..).into_iter().map(|x| 2 * x).take_while(|x| *x < 150).collect(); - let b1: RoaringBitmap = (0..).into_iter().map(|x| 3 * x).take_while(|x| *x < 150).collect(); - let b2: RoaringBitmap = (0..).into_iter().map(|x| 5 * x).take_while(|x| *x < 150).collect(); - let b3: RoaringBitmap = (0..).into_iter().map(|x| 7 * x).take_while(|x| *x < 150).collect(); - - let parts_candidates = vec![b0, b1, b2, b3]; - - let combinations = create_disjoint_combinations(parts_candidates); - - insta::assert_snapshot!(print_combinations(&combinations), @r###" - [2, 3, 4, 5, 7, 8, 9, 16, 22, 25, 26, 27, 32, 33, 34, 38, 39, 44, 46, 49, 51, 52, 55, 57, 58, 62, 64, 65, 68, 69, 74, 76, 77, 81, 82, 85, 86, 87, 88, 91, 92, 93, 94, 95, 99, 104, 106, 111, 115, 116, 117, 118, 119, 122, 123, 124, 125, 128, 129, 133, 134, 136, 141, 142, 145, 146, 148, ] - [6, 10, 12, 14, 15, 18, 20, 21, 24, 28, 35, 36, 40, 45, 48, 50, 54, 56, 63, 66, 72, 75, 78, 80, 96, 98, 100, 102, 108, 110, 112, 114, 130, 132, 135, 138, 144, 147, ] - [30, 42, 60, 70, 84, 90, 105, 120, 126, 140, ] - [0, ] - "###); - - // But we also check it programmatically - assert_correct_combinations(&combinations, &[2, 3, 5, 7]); - } - #[test] - fn compute_combinations_4_with_empty_results_at_end() { - let b0: RoaringBitmap = (1..).into_iter().map(|x| 2 * x).take_while(|x| *x < 150).collect(); - let b1: RoaringBitmap = (1..).into_iter().map(|x| 3 * x).take_while(|x| *x < 150).collect(); - let b2: RoaringBitmap = (1..).into_iter().map(|x| 5 * x).take_while(|x| *x < 150).collect(); - let b3: RoaringBitmap = (1..).into_iter().map(|x| 7 * x).take_while(|x| *x < 150).collect(); - - let parts_candidates = vec![b0, b1, b2, b3]; - - let combinations = create_disjoint_combinations(parts_candidates); - - insta::assert_snapshot!(print_combinations(&combinations), @r###" - [2, 3, 4, 5, 7, 8, 9, 16, 22, 25, 26, 27, 32, 33, 34, 38, 39, 44, 46, 49, 51, 52, 
55, 57, 58, 62, 64, 65, 68, 69, 74, 76, 77, 81, 82, 85, 86, 87, 88, 91, 92, 93, 94, 95, 99, 104, 106, 111, 115, 116, 117, 118, 119, 122, 123, 124, 125, 128, 129, 133, 134, 136, 141, 142, 145, 146, 148, ]
- [6, 10, 12, 14, 15, 18, 20, 21, 24, 28, 35, 36, 40, 45, 48, 50, 54, 56, 63, 66, 72, 75, 78, 80, 96, 98, 100, 102, 108, 110, 112, 114, 130, 132, 135, 138, 144, 147, ]
- [30, 42, 60, 70, 84, 90, 105, 120, 126, 140, ]
- []
- "###);
-
- // But we also check it programmatically
- assert_correct_combinations(&combinations, &[2, 3, 5, 7]);
- }
-
- #[test]
- fn compute_combinations_4_with_some_equal_bitmaps() {
- let b0: RoaringBitmap = (0..).into_iter().map(|x| 2 * x).take_while(|x| *x < 150).collect();
- let b1: RoaringBitmap = (0..).into_iter().map(|x| 3 * x).take_while(|x| *x < 150).collect();
- let b2: RoaringBitmap = (0..).into_iter().map(|x| 5 * x).take_while(|x| *x < 150).collect();
- // b3 == b1
- let b3: RoaringBitmap = (0..).into_iter().map(|x| 3 * x).take_while(|x| *x < 150).collect();
-
- let parts_candidates = vec![b0, b1, b2, b3];
-
- let combinations = create_disjoint_combinations(parts_candidates);
-
- insta::assert_snapshot!(print_combinations(&combinations), @r###"
- [2, 4, 5, 8, 14, 16, 22, 25, 26, 28, 32, 34, 35, 38, 44, 46, 52, 55, 56, 58, 62, 64, 65, 68, 74, 76, 82, 85, 86, 88, 92, 94, 95, 98, 104, 106, 112, 115, 116, 118, 122, 124, 125, 128, 134, 136, 142, 145, 146, 148, ]
- [3, 9, 10, 20, 21, 27, 33, 39, 40, 50, 51, 57, 63, 69, 70, 80, 81, 87, 93, 99, 100, 110, 111, 117, 123, 129, 130, 140, 141, 147, ]
- [6, 12, 15, 18, 24, 36, 42, 45, 48, 54, 66, 72, 75, 78, 84, 96, 102, 105, 108, 114, 126, 132, 135, 138, 144, ]
- [0, 30, 60, 90, 120, ]
- "###);
-
- // But we also check it programmatically
- assert_correct_combinations(&combinations, &[2, 3, 5, 3]);
- }
-
- #[test]
- fn compute_combinations_10() {
- let dividers = [2, 3, 5, 7, 11, 6, 15, 35, 18, 14];
- let parts_candidates: Vec<RoaringBitmap> = dividers
- .iter()
- .map(|&divider| {
- (0..).into_iter().map(|x| divider * x).take_while(|x| *x <= 210).collect()
- })
- .collect();
-
- let combinations = create_disjoint_combinations(parts_candidates);
- insta::assert_snapshot!(print_combinations(&combinations), @r###"
- [2, 3, 4, 5, 7, 8, 9, 11, 16, 25, 26, 27, 32, 34, 38, 39, 46, 49, 51, 52, 57, 58, 62, 64, 65, 68, 69, 74, 76, 81, 82, 85, 86, 87, 91, 92, 93, 94, 95, 104, 106, 111, 115, 116, 117, 118, 119, 121, 122, 123, 124, 125, 128, 129, 133, 134, 136, 141, 142, 143, 145, 146, 148, 152, 153, 155, 158, 159, 161, 164, 166, 171, 172, 177, 178, 183, 184, 185, 187, 188, 194, 201, 202, 203, 205, 206, 207, 208, 209, ]
- [10, 20, 21, 22, 33, 40, 44, 50, 55, 63, 77, 80, 88, 99, 100, 130, 147, 160, 170, 176, 189, 190, 200, ]
- [6, 12, 14, 15, 24, 28, 35, 45, 48, 56, 75, 78, 96, 98, 102, 110, 112, 114, 135, 138, 156, 174, 175, 182, 186, 192, 195, 196, 204, ]
- [18, 36, 54, 66, 72, 108, 132, 144, 154, 162, 165, ]
- [30, 42, 60, 70, 84, 105, 120, 140, 150, 168, 198, ]
- [90, 126, 180, ]
- []
- [210, ]
- []
- [0, ]
- "###);
-
- assert_correct_combinations(&combinations, &dividers);
- }
-
- #[test]
- fn compute_combinations_30() {
- let dividers: [u32; 30] = [
- 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4,
- 5,
- ];
- let parts_candidates: Vec<RoaringBitmap> = dividers
- .iter()
- .map(|divider| {
- (0..).into_iter().map(|x| divider * x).take_while(|x| *x <= 100).collect()
- })
- .collect();
-
- let combinations = create_non_disjoint_combinations(parts_candidates.clone());
- insta::assert_snapshot!(print_combinations(&combinations),
@r###" - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ] - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ] - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ] - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ] - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ] - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, ] - [0, 2, 3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 38, 39, 40, 42, 44, 45, 46, 48, 50, 51, 52, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 72, 74, 75, 76, 78, 80, 81, 82, 84, 85, 86, 87, 88, 90, 92, 93, 94, 95, 96, 98, 99, 100, ] - [0, 2, 3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 38, 39, 40, 42, 44, 45, 46, 48, 50, 51, 52, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 72, 74, 75, 76, 78, 80, 81, 82, 84, 85, 86, 87, 88, 90, 92, 93, 94, 95, 96, 98, 99, 100, ] - [0, 2, 3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 38, 39, 40, 42, 44, 45, 46, 48, 50, 51, 52, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 72, 74, 75, 76, 78, 80, 81, 82, 84, 85, 86, 87, 88, 90, 92, 93, 94, 95, 96, 98, 99, 100, ] - [0, 2, 3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 38, 39, 40, 42, 44, 45, 46, 48, 50, 51, 52, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 72, 74, 75, 76, 78, 80, 81, 82, 84, 85, 86, 
87, 88, 90, 92, 93, 94, 95, 96, 98, 99, 100, ]
- [0, 2, 3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 38, 39, 40, 42, 44, 45, 46, 48, 50, 51, 52, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 72, 74, 75, 76, 78, 80, 81, 82, 84, 85, 86, 87, 88, 90, 92, 93, 94, 95, 96, 98, 99, 100, ]
- [0, 2, 3, 4, 5, 6, 8, 9, 10, 12, 14, 15, 16, 18, 20, 21, 22, 24, 25, 26, 27, 28, 30, 32, 33, 34, 35, 36, 38, 39, 40, 42, 44, 45, 46, 48, 50, 51, 52, 54, 55, 56, 57, 58, 60, 62, 63, 64, 65, 66, 68, 69, 70, 72, 74, 75, 76, 78, 80, 81, 82, 84, 85, 86, 87, 88, 90, 92, 93, 94, 95, 96, 98, 99, 100, ]
- [0, 4, 6, 8, 10, 12, 15, 16, 18, 20, 24, 28, 30, 32, 36, 40, 42, 44, 45, 48, 50, 52, 54, 56, 60, 64, 66, 68, 70, 72, 75, 76, 78, 80, 84, 88, 90, 92, 96, 100, ]
- [0, 4, 6, 8, 10, 12, 15, 16, 18, 20, 24, 28, 30, 32, 36, 40, 42, 44, 45, 48, 50, 52, 54, 56, 60, 64, 66, 68, 70, 72, 75, 76, 78, 80, 84, 88, 90, 92, 96, 100, ]
- [0, 4, 6, 8, 10, 12, 15, 16, 18, 20, 24, 28, 30, 32, 36, 40, 42, 44, 45, 48, 50, 52, 54, 56, 60, 64, 66, 68, 70, 72, 75, 76, 78, 80, 84, 88, 90, 92, 96, 100, ]
- [0, 4, 6, 8, 10, 12, 15, 16, 18, 20, 24, 28, 30, 32, 36, 40, 42, 44, 45, 48, 50, 52, 54, 56, 60, 64, 66, 68, 70, 72, 75, 76, 78, 80, 84, 88, 90, 92, 96, 100, ]
- [0, 4, 6, 8, 10, 12, 15, 16, 18, 20, 24, 28, 30, 32, 36, 40, 42, 44, 45, 48, 50, 52, 54, 56, 60, 64, 66, 68, 70, 72, 75, 76, 78, 80, 84, 88, 90, 92, 96, 100, ]
- [0, 4, 6, 8, 10, 12, 15, 16, 18, 20, 24, 28, 30, 32, 36, 40, 42, 44, 45, 48, 50, 52, 54, 56, 60, 64, 66, 68, 70, 72, 75, 76, 78, 80, 84, 88, 90, 92, 96, 100, ]
- [0, 12, 20, 24, 30, 36, 40, 48, 60, 72, 80, 84, 90, 96, 100, ]
- [0, 12, 20, 24, 30, 36, 40, 48, 60, 72, 80, 84, 90, 96, 100, ]
- [0, 12, 20, 24, 30, 36, 40, 48, 60, 72, 80, 84, 90, 96, 100, ]
- [0, 12, 20, 24, 30, 36, 40, 48, 60, 72, 80, 84, 90, 96, 100, ]
- [0, 12, 20, 24, 30, 36, 40, 48, 60, 72, 80, 84, 90, 96, 100, ]
- [0, 12, 20, 24, 30, 36, 40, 48, 60, 72, 80, 84, 90, 96, 100, ]
- [0, 60, ]
- [0, 60, ]
- [0, 60, ]
- [0, 60, ]
- [0, 60, ]
- [0, 60, ]
- "###);
-
- let combinations = create_disjoint_combinations(parts_candidates);
- insta::assert_snapshot!(print_combinations(&combinations), @r###"
- []
- []
- []
- []
- []
- [1, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 49, 53, 59, 61, 67, 71, 73, 77, 79, 83, 89, 91, 97, ]
- []
- []
- []
- []
- []
- [2, 3, 5, 9, 14, 21, 22, 25, 26, 27, 33, 34, 35, 38, 39, 46, 51, 55, 57, 58, 62, 63, 65, 69, 74, 81, 82, 85, 86, 87, 93, 94, 95, 98, 99, ]
- []
- []
- []
- []
- []
- [4, 6, 8, 10, 15, 16, 18, 28, 32, 42, 44, 45, 50, 52, 54, 56, 64, 66, 68, 70, 75, 76, 78, 88, 92, ]
- []
- []
- []
- []
- []
- [12, 20, 24, 30, 36, 40, 48, 72, 80, 84, 90, 96, 100, ]
- []
- []
- []
- []
- []
- [0, 60, ]
- "###);
-
- assert_correct_combinations(&combinations, &dividers);
- }
-}
diff --git a/milli/src/search/criteria/final.rs b/milli/src/search/criteria/final.rs
deleted file mode 100644
index 9f7a147b8..000000000
--- a/milli/src/search/criteria/final.rs
+++ /dev/null
@@ -1,77 +0,0 @@
-use log::debug;
-use roaring::RoaringBitmap;
-
-use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult};
-use crate::search::criteria::InitialCandidates;
-use crate::search::query_tree::Operation;
-use crate::search::WordDerivationsCache;
-use crate::Result;
-
-/// The result of a call to the fetcher.
-#[derive(Debug, Clone, PartialEq)]
-pub struct FinalResult {
- /// The query tree corresponding to the current bucket of the last criterion.
- pub query_tree: Option<Operation>,
- /// The candidates of the current bucket of the last criterion.
- pub candidates: RoaringBitmap,
- /// Candidates that come from the current bucket of the initial criterion.
- pub initial_candidates: InitialCandidates,
-}
-
-pub struct Final<'t> {
- ctx: &'t dyn Context<'t>,
- parent: Box<dyn Criterion + 't>,
- wdcache: WordDerivationsCache,
- returned_candidates: RoaringBitmap,
-}
-
-impl<'t> Final<'t> {
- pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Final<'t> {
- Final {
- ctx,
- parent,
- wdcache: WordDerivationsCache::new(),
- returned_candidates: RoaringBitmap::new(),
- }
- }
-
- #[logging_timer::time("Final::{}")]
- pub fn next(&mut self, excluded_candidates: &RoaringBitmap) -> Result<Option<FinalResult>> {
- debug!("Final iteration");
- let excluded_candidates = &self.returned_candidates | excluded_candidates;
- let mut criterion_parameters = CriterionParameters {
- wdcache: &mut self.wdcache,
- // returned_candidates is merged with excluded_candidates to avoid duplicates
- excluded_candidates: &excluded_candidates,
- };
-
- match self.parent.next(&mut criterion_parameters)? {
- Some(CriterionResult {
- query_tree,
- candidates,
- filtered_candidates,
- initial_candidates,
- }) => {
- let mut candidates = match (candidates, query_tree.as_ref()) {
- (Some(candidates), _) => candidates,
- (None, Some(qt)) => {
- resolve_query_tree(self.ctx, qt, &mut self.wdcache)? - excluded_candidates
- }
- (None, None) => self.ctx.documents_ids()? - excluded_candidates,
- };
-
- if let Some(filtered_candidates) = filtered_candidates {
- candidates &= filtered_candidates;
- }
-
- let initial_candidates = initial_candidates
- .unwrap_or_else(|| InitialCandidates::Estimated(candidates.clone()));
-
- self.returned_candidates |= &candidates;
-
- Ok(Some(FinalResult { query_tree, candidates, initial_candidates }))
- }
- None => Ok(None),
- }
- }
-}
diff --git a/milli/src/search/criteria/geo.rs b/milli/src/search/criteria/geo.rs
deleted file mode 100644
index 0b33e6b2f..000000000
--- a/milli/src/search/criteria/geo.rs
+++ /dev/null
@@ -1,154 +0,0 @@
-use std::iter;
-
-use roaring::RoaringBitmap;
-use rstar::RTree;
-
-use super::{Criterion, CriterionParameters, CriterionResult};
-use crate::search::criteria::{resolve_query_tree, CriteriaBuilder, InitialCandidates};
-use crate::{lat_lng_to_xyz, GeoPoint, Index, Result};
-
-pub struct Geo<'t> {
- index: &'t Index,
- rtxn: &'t heed::RoTxn<'t>,
- ascending: bool,
- parent: Box<dyn Criterion + 't>,
- candidates: Box<dyn Iterator<Item = RoaringBitmap>>,
- allowed_candidates: RoaringBitmap,
- initial_candidates: InitialCandidates,
- rtree: Option<RTree<GeoPoint>>,
- point: [f64; 2],
-}
-
-impl<'t> Geo<'t> {
- pub fn asc(
- index: &'t Index,
- rtxn: &'t heed::RoTxn<'t>,
- parent: Box<dyn Criterion + 't>,
- point: [f64; 2],
- ) -> Result<Self> {
- Self::new(index, rtxn, parent, point, true)
- }
-
- pub fn desc(
- index: &'t Index,
- rtxn: &'t heed::RoTxn<'t>,
- parent: Box<dyn Criterion + 't>,
- point: [f64; 2],
- ) -> Result<Self> {
- Self::new(index, rtxn, parent, point, false)
- }
-
- fn new(
- index: &'t Index,
- rtxn: &'t heed::RoTxn<'t>,
- parent: Box<dyn Criterion + 't>,
- point: [f64; 2],
- ascending: bool,
- ) -> Result<Self> {
- let candidates = Box::new(iter::empty());
- let allowed_candidates = index.geo_faceted_documents_ids(rtxn)?;
- let initial_candidates = InitialCandidates::Estimated(RoaringBitmap::new());
- let rtree = index.geo_rtree(rtxn)?;
-
- Ok(Self {
- index,
- rtxn,
- ascending,
- parent,
- candidates,
- allowed_candidates,
- initial_candidates,
- rtree,
- point,
- })
- }
-}
-
-impl Criterion for Geo<'_> {
- fn next(&mut self, params: &mut CriterionParameters) -> Result<Option<CriterionResult>> {
- let rtree =
self.rtree.as_ref();
-
- loop {
- match self.candidates.next() {
- Some(mut candidates) => {
- candidates -= params.excluded_candidates;
- self.allowed_candidates -= &candidates;
- return Ok(Some(CriterionResult {
- query_tree: None,
- candidates: Some(candidates),
- filtered_candidates: None,
- initial_candidates: Some(self.initial_candidates.clone()),
- }));
- }
- None => match self.parent.next(params)? {
- Some(CriterionResult {
- query_tree,
- candidates,
- filtered_candidates,
- initial_candidates,
- }) => {
- let mut candidates = match (&query_tree, candidates) {
- (_, Some(candidates)) => candidates,
- (Some(qt), None) => {
- let context = CriteriaBuilder::new(self.rtxn, self.index)?;
- resolve_query_tree(&context, qt, params.wdcache)?
- }
- (None, None) => self.index.documents_ids(self.rtxn)?,
- };
-
- if let Some(filtered_candidates) = filtered_candidates {
- candidates &= filtered_candidates;
- }
-
- match initial_candidates {
- Some(initial_candidates) => {
- self.initial_candidates |= initial_candidates
- }
- None => self.initial_candidates.map_inplace(|c| c | &candidates),
- }
-
- if candidates.is_empty() {
- continue;
- }
- self.allowed_candidates = &candidates - params.excluded_candidates;
- self.candidates = match rtree {
- Some(rtree) => geo_point(
- rtree,
- self.allowed_candidates.clone(),
- self.point,
- self.ascending,
- ),
- None => Box::new(std::iter::empty()),
- };
- }
- None => return Ok(None),
- },
- }
- }
- }
-}
-
-fn geo_point(
- rtree: &RTree<GeoPoint>,
- mut candidates: RoaringBitmap,
- point: [f64; 2],
- ascending: bool,
-) -> Box<dyn Iterator<Item = RoaringBitmap>> {
- let point = lat_lng_to_xyz(&point);
-
- let mut results = Vec::new();
- for point in rtree.nearest_neighbor_iter(&point) {
- if candidates.remove(point.data.0) {
- results.push(std::iter::once(point.data.0).collect());
- if candidates.is_empty() {
- break;
- }
- }
- }
-
- if ascending {
- Box::new(results.into_iter())
- } else {
- Box::new(results.into_iter().rev())
- }
-}
diff --git a/milli/src/search/criteria/initial.rs b/milli/src/search/criteria/initial.rs
deleted file mode 100644
index 0826a9f68..000000000
--- a/milli/src/search/criteria/initial.rs
+++ /dev/null
@@ -1,82 +0,0 @@
-use roaring::RoaringBitmap;
-
-use super::{Criterion, CriterionParameters, CriterionResult};
-use crate::search::criteria::{resolve_query_tree, Context, InitialCandidates};
-use crate::search::query_tree::Operation;
-use crate::search::Distinct;
-use crate::Result;
-/// Initial is a mandatory criterion; it is always the first
-/// and is meant to initialize the CriterionResult used by the other criteria.
-/// It behaves like a [Once Iterator](https://doc.rust-lang.org/std/iter/struct.Once.html) and will return Some(CriterionResult) only one time.
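That "only one time" behavior is typically implemented by storing the prepared answer in an `Option` and draining it with `Option::take` (which is exactly what the `Initial` criterion defined next does with its `answer` field). A minimal, standalone sketch of the pattern; the names here are illustrative, not part of milli's API:

// The prepared answer is handed out at most once; later calls yield None,
// just like `std::iter::once`.
struct OnceCriterion<T> {
    answer: Option<T>,
}

impl<T> OnceCriterion<T> {
    fn next(&mut self) -> Option<T> {
        // `Option::take` moves the answer out and leaves `None` behind.
        self.answer.take()
    }
}

fn main() {
    let mut init = OnceCriterion { answer: Some("criterion result") };
    assert_eq!(init.next(), Some("criterion result"));
    assert_eq!(init.next(), None);
}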
-pub struct Initial<'t, D> { - ctx: &'t dyn Context<'t>, - answer: Option, - exhaustive_number_hits: bool, - distinct: Option, -} - -impl<'t, D> Initial<'t, D> { - pub fn new( - ctx: &'t dyn Context<'t>, - query_tree: Option, - filtered_candidates: Option, - exhaustive_number_hits: bool, - distinct: Option, - ) -> Initial { - let answer = CriterionResult { - query_tree, - candidates: None, - filtered_candidates, - initial_candidates: None, - }; - Initial { ctx, answer: Some(answer), exhaustive_number_hits, distinct } - } -} - -impl Criterion for Initial<'_, D> { - #[logging_timer::time("Initial::{}")] - fn next(&mut self, params: &mut CriterionParameters) -> Result> { - self.answer - .take() - .map(|mut answer| { - if self.exhaustive_number_hits { - // resolve the whole query tree to retrieve an exhaustive list of documents matching the query. - let candidates = answer - .query_tree - .as_ref() - .map(|query_tree| resolve_query_tree(self.ctx, query_tree, params.wdcache)) - .transpose()?; - - // then intersect the candidates with the potential filtered candidates. - let mut candidates = match (candidates, answer.filtered_candidates.take()) { - (Some(candidates), Some(filtered)) => candidates & filtered, - (Some(candidates), None) => candidates, - (None, Some(filtered)) => filtered, - (None, None) => self.ctx.documents_ids()?, - }; - - // then remove the potential soft deleted documents. - candidates -= params.excluded_candidates; - - // because the initial_candidates should be an exhaustive count of the matching documents, - // we precompute the distinct attributes. - let initial_candidates = match &mut self.distinct { - Some(distinct) => { - let mut initial_candidates = RoaringBitmap::new(); - for c in distinct.distinct(candidates.clone(), RoaringBitmap::new()) { - initial_candidates.insert(c?); - } - initial_candidates - } - None => candidates.clone(), - }; - - answer.candidates = Some(candidates); - answer.initial_candidates = - Some(InitialCandidates::Exhaustive(initial_candidates)); - } - Ok(answer) - }) - .transpose() - } -} diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs deleted file mode 100644 index 5e491672f..000000000 --- a/milli/src/search/criteria/mod.rs +++ /dev/null @@ -1,1049 +0,0 @@ -use std::borrow::Cow; -use std::collections::HashMap; -use std::mem::take; -use std::ops::{BitOr, BitOrAssign}; - -use roaring::RoaringBitmap; - -use self::asc_desc::AscDesc; -use self::attribute::Attribute; -use self::exactness::Exactness; -use self::initial::Initial; -use self::proximity::Proximity; -use self::r#final::Final; -use self::typo::Typo; -use self::words::Words; -use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind}; -use super::CriterionImplementationStrategy; -use crate::search::criteria::geo::Geo; -use crate::search::{word_derivations, Distinct, WordDerivationsCache}; -use crate::update::{MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB, MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB}; -use crate::{AscDesc as AscDescName, DocumentId, FieldId, Index, Member, Result}; - -mod asc_desc; -pub use asc_desc::{facet_max_value, facet_min_value}; -mod attribute; -mod exactness; -pub mod r#final; -mod geo; -mod initial; -mod proximity; -mod typo; -mod words; - -pub trait Criterion { - fn next(&mut self, params: &mut CriterionParameters) -> Result>; -} - -/// The result of a call to the parent criterion. -#[derive(Debug, Clone, PartialEq)] -pub struct CriterionResult { - /// The query tree that must be used by the children criterion to fetch candidates. 
- query_tree: Option<Operation>,
- /// The candidates that this criterion is allowed to return subsets of;
- /// if None, it is up to the child to compute the candidates itself.
- candidates: Option<RoaringBitmap>,
- /// The candidates, coming from facet filters, that this criterion is allowed to return subsets of.
- filtered_candidates: Option<RoaringBitmap>,
- /// Candidates that come from the current bucket of the initial criterion.
- initial_candidates: Option<InitialCandidates>,
-}
-
-#[derive(Debug, PartialEq)]
-pub struct CriterionParameters<'a> {
- wdcache: &'a mut WordDerivationsCache,
- excluded_candidates: &'a RoaringBitmap,
-}
-
-/// Either a set of candidates that defines the candidates
-/// that are allowed to be returned,
-/// or the candidates that must never be returned.
-#[derive(Debug)]
-enum Candidates {
- Allowed(RoaringBitmap),
- Forbidden(RoaringBitmap),
-}
-
-impl Default for Candidates {
- fn default() -> Self {
- Self::Forbidden(RoaringBitmap::new())
- }
-}
-
-/// Either a set of candidates that defines the estimated set of candidates
-/// that could be returned,
-/// or the Exhaustive set of candidates that will be returned if all possible results are fetched.
-#[derive(Debug, Clone, PartialEq)]
-pub enum InitialCandidates {
- Estimated(RoaringBitmap),
- Exhaustive(RoaringBitmap),
-}
-
-impl InitialCandidates {
- fn take(&mut self) -> Self {
- match self {
- Self::Estimated(c) => Self::Estimated(take(c)),
- Self::Exhaustive(c) => Self::Exhaustive(take(c)),
- }
- }
-
- /// Modify the inner roaring bitmap in place if the set isn't already Exhaustive.
- pub fn map_inplace<F>(&mut self, f: F)
- where
- F: FnOnce(RoaringBitmap) -> RoaringBitmap,
- {
- if let Self::Estimated(c) = self {
- *c = f(take(c))
- }
- }
-
- pub fn into_inner(self) -> RoaringBitmap {
- match self {
- Self::Estimated(c) => c,
- Self::Exhaustive(c) => c,
- }
- }
-}
-
-impl BitOrAssign for InitialCandidates {
- /// Take the union of the inner roaring bitmaps if the set isn't already Exhaustive.
- /// If rhs is Exhaustive and self is not, then rhs replaces self.
- fn bitor_assign(&mut self, rhs: Self) {
- if let Self::Estimated(c) = self {
- *self = match rhs {
- Self::Estimated(rhs) => Self::Estimated(rhs | &*c),
- Self::Exhaustive(rhs) => Self::Exhaustive(rhs),
- }
- }
- }
-}
-
-impl BitOr for InitialCandidates {
- type Output = Self;
-
- /// Take the union of the inner roaring bitmaps if the set isn't already Exhaustive.
- /// If rhs is Exhaustive and self is not, then rhs replaces self.
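The two doc comments above (and the identical ones on `BitOrAssign`) describe a small merge table that the `bitor` implementation just below realizes; the following standalone sketch, written over plain roaring bitmaps rather than milli's actual types, spells the same semantics out:

use roaring::RoaringBitmap;

#[derive(Debug, Clone, PartialEq)]
enum InitialCandidates {
    Estimated(RoaringBitmap),
    Exhaustive(RoaringBitmap),
}

// Estimated sets are unioned; an Exhaustive set is "sticky" and can no
// longer be changed by an estimate.
fn merge(lhs: InitialCandidates, rhs: InitialCandidates) -> InitialCandidates {
    use InitialCandidates::*;
    match (lhs, rhs) {
        (Estimated(l), Estimated(r)) => Estimated(l | r),
        (Estimated(_), Exhaustive(r)) => Exhaustive(r),
        (Exhaustive(l), _) => Exhaustive(l),
    }
}

fn main() {
    let a: RoaringBitmap = (0u32..10).collect();
    let b: RoaringBitmap = (5u32..15).collect();
    // Two estimates merge into a bigger estimate...
    assert_eq!(
        merge(InitialCandidates::Estimated(a.clone()), InitialCandidates::Estimated(b.clone())),
        InitialCandidates::Estimated(&a | &b),
    );
    // ...but an exhaustive set always wins.
    assert_eq!(
        merge(InitialCandidates::Estimated(a), InitialCandidates::Exhaustive(b.clone())),
        InitialCandidates::Exhaustive(b),
    );
}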
- fn bitor(self, rhs: Self) -> Self::Output { - if let Self::Estimated(c) = self { - match rhs { - Self::Estimated(rhs) => Self::Estimated(rhs | c), - Self::Exhaustive(rhs) => Self::Exhaustive(rhs), - } - } else { - self.clone() - } - } -} - -pub trait Context<'c> { - fn documents_ids(&self) -> heed::Result; - fn word_docids(&self, word: &str) -> heed::Result>; - fn exact_word_docids(&self, word: &str) -> heed::Result>; - fn word_prefix_docids(&self, word: &str) -> heed::Result>; - fn exact_word_prefix_docids(&self, word: &str) -> heed::Result>; - - fn word_pair_proximity_docids( - &self, - left: &str, - right: &str, - proximity: u8, - ) -> heed::Result>; - fn word_prefix_pair_proximity_docids( - &self, - left: &str, - right: &str, - proximity: u8, - ) -> heed::Result>; - fn prefix_word_pair_proximity_docids( - &self, - prefix: &str, - right: &str, - proximity: u8, - ) -> heed::Result>; - fn words_fst<'t>(&self) -> &'t fst::Set>; - fn in_prefix_cache(&self, word: &str) -> bool; - fn docid_words_positions( - &self, - docid: DocumentId, - ) -> heed::Result>; - #[allow(clippy::type_complexity)] - fn word_position_iterator( - &self, - word: &str, - in_prefix_cache: bool, - ) -> heed::Result> + 'c>>; - fn synonyms(&self, word: &str) -> heed::Result>>>; - fn searchable_fields_ids(&self) -> Result>; - fn field_id_word_count_docids( - &self, - field_id: FieldId, - word_count: u8, - ) -> heed::Result>; - fn word_position_docids(&self, word: &str, pos: u32) -> heed::Result>; -} - -pub struct CriteriaBuilder<'t> { - rtxn: &'t heed::RoTxn<'t>, - index: &'t Index, - words_fst: fst::Set>, - words_prefixes_fst: fst::Set>, -} - -/// Return the docids for the following word pairs and proximities using [`Context::word_pair_proximity_docids`]. -/// * `left, right, prox` (leftward proximity) -/// * `right, left, prox-1` (rightward proximity) -/// -/// ## Example -/// For a document with the text `the good fox eats the apple`, we have: -/// * `rightward_proximity(the, eats) = 3` -/// * `leftward_proximity(eats, the) = 1` -/// -/// So both the expressions `word_pair_overall_proximity_docids(ctx, the, eats, 3)` -/// and `word_pair_overall_proximity_docids(ctx, the, eats, 2)` would return a bitmap containing -/// the id of this document. -fn word_pair_overall_proximity_docids( - ctx: &dyn Context, - left: &str, - right: &str, - prox: u8, -) -> heed::Result> { - let rightward = ctx.word_pair_proximity_docids(left, right, prox)?; - let leftward = - if prox > 1 { ctx.word_pair_proximity_docids(right, left, prox - 1)? } else { None }; - if let Some(mut all) = rightward { - if let Some(leftward) = leftward { - all |= leftward; - } - Ok(Some(all)) - } else { - Ok(leftward) - } -} - -/// This function works identically to [`word_pair_overall_proximity_docids`] except that the -/// right word is replaced by a prefix string. -/// -/// It will return None if no documents were found or if the prefix does not exist in the -/// `word_prefix_pair_proximity_docids` database. -fn word_prefix_pair_overall_proximity_docids( - ctx: &dyn Context, - left: &str, - prefix: &str, - proximity: u8, -) -> heed::Result> { - // We retrieve the docids for the original and swapped word pairs: - // A: word1 prefix2 proximity - // B: prefix2 word1 proximity-1 - let rightward = ctx.word_prefix_pair_proximity_docids(left, prefix, proximity)?; - - let leftward = if proximity > 1 { - ctx.prefix_word_pair_proximity_docids(prefix, left, proximity - 1)? 
- } else { - None - }; - if let Some(mut all) = rightward { - if let Some(leftward) = leftward { - all |= leftward; - } - Ok(Some(all)) - } else { - Ok(leftward) - } -} - -impl<'c> Context<'c> for CriteriaBuilder<'c> { - fn documents_ids(&self) -> heed::Result { - self.index.documents_ids(self.rtxn) - } - - fn word_docids(&self, word: &str) -> heed::Result> { - self.index.word_docids.get(self.rtxn, word) - } - - fn exact_word_docids(&self, word: &str) -> heed::Result> { - self.index.exact_word_docids.get(self.rtxn, word) - } - - fn word_prefix_docids(&self, word: &str) -> heed::Result> { - self.index.word_prefix_docids.get(self.rtxn, word) - } - - fn exact_word_prefix_docids(&self, word: &str) -> heed::Result> { - self.index.exact_word_prefix_docids.get(self.rtxn, word) - } - - fn word_pair_proximity_docids( - &self, - left: &str, - right: &str, - proximity: u8, - ) -> heed::Result> { - self.index.word_pair_proximity_docids.get(self.rtxn, &(proximity, left, right)) - } - - fn word_prefix_pair_proximity_docids( - &self, - left: &str, - prefix: &str, - proximity: u8, - ) -> heed::Result> { - self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &(proximity, left, prefix)) - } - fn prefix_word_pair_proximity_docids( - &self, - prefix: &str, - right: &str, - proximity: u8, - ) -> heed::Result> { - self.index.prefix_word_pair_proximity_docids.get(self.rtxn, &(proximity, prefix, right)) - } - - fn words_fst<'t>(&self) -> &'t fst::Set> { - &self.words_fst - } - - fn in_prefix_cache(&self, word: &str) -> bool { - self.words_prefixes_fst.contains(word) - } - - fn docid_words_positions( - &self, - docid: DocumentId, - ) -> heed::Result> { - let mut words_positions = HashMap::new(); - for result in self.index.docid_word_positions.prefix_iter(self.rtxn, &(docid, ""))? { - let ((_, word), positions) = result?; - words_positions.insert(word.to_string(), positions); - } - Ok(words_positions) - } - - fn word_position_iterator( - &self, - word: &str, - in_prefix_cache: bool, - ) -> heed::Result> + 'c>> - { - let range = { - let left = u16::min_value(); // TODO: this is wrong - let right = u16::max_value(); // TODO: this is wrong - let left = (word, left); - let right = (word, right); - left..=right - }; - let db = match in_prefix_cache { - true => self.index.word_prefix_position_docids, - false => self.index.word_position_docids, - }; - - Ok(Box::new(db.range(self.rtxn, &range)?)) - } - - fn synonyms(&self, word: &str) -> heed::Result>>> { - self.index.words_synonyms(self.rtxn, &[word]) - } - - fn searchable_fields_ids(&self) -> Result> { - match self.index.searchable_fields_ids(self.rtxn)? 
{ - Some(searchable_fields_ids) => Ok(searchable_fields_ids), - None => Ok(self.index.fields_ids_map(self.rtxn)?.ids().collect()), - } - } - - fn field_id_word_count_docids( - &self, - field_id: FieldId, - word_count: u8, - ) -> heed::Result> { - let key = (field_id, word_count); - self.index.field_id_word_count_docids.get(self.rtxn, &key) - } - - fn word_position_docids(&self, word: &str, pos: u32) -> heed::Result> { - let key = (word, pos as u16); // TODO: this is wrong - self.index.word_position_docids.get(self.rtxn, &key) - } -} - -impl<'t> CriteriaBuilder<'t> { - pub fn new(rtxn: &'t heed::RoTxn<'t>, index: &'t Index) -> Result { - let words_fst = index.words_fst(rtxn)?; - let words_prefixes_fst = index.words_prefixes_fst(rtxn)?; - Ok(Self { rtxn, index, words_fst, words_prefixes_fst }) - } - - #[allow(clippy::too_many_arguments)] - pub fn build( - &'t self, - query_tree: Option, - primitive_query: Option>, - filtered_candidates: Option, - sort_criteria: Option>, - exhaustive_number_hits: bool, - distinct: Option, - implementation_strategy: CriterionImplementationStrategy, - ) -> Result> { - use crate::criterion::Criterion as Name; - - let primitive_query = primitive_query.unwrap_or_default(); - - let mut criterion = Box::new(Initial::new( - self, - query_tree, - filtered_candidates, - exhaustive_number_hits, - distinct, - )) as Box; - for name in self.index.criteria(self.rtxn)? { - criterion = match name { - Name::Words => Box::new(Words::new(self, criterion)), - Name::Typo => Box::new(Typo::new(self, criterion)), - Name::Sort => match sort_criteria { - Some(ref sort_criteria) => { - for asc_desc in sort_criteria { - criterion = match asc_desc { - AscDescName::Asc(Member::Field(field)) => Box::new(AscDesc::asc( - self.index, - self.rtxn, - criterion, - field.to_string(), - implementation_strategy, - )?), - AscDescName::Desc(Member::Field(field)) => Box::new(AscDesc::desc( - self.index, - self.rtxn, - criterion, - field.to_string(), - implementation_strategy, - )?), - AscDescName::Asc(Member::Geo(point)) => { - Box::new(Geo::asc(self.index, self.rtxn, criterion, *point)?) - } - AscDescName::Desc(Member::Geo(point)) => { - Box::new(Geo::desc(self.index, self.rtxn, criterion, *point)?) 
- } - }; - } - criterion - } - None => criterion, - }, - Name::Proximity => { - Box::new(Proximity::new(self, criterion, implementation_strategy)) - } - Name::Attribute => { - Box::new(Attribute::new(self, criterion, implementation_strategy)) - } - Name::Exactness => Box::new(Exactness::new(self, criterion, &primitive_query)?), - Name::Asc(field) => Box::new(AscDesc::asc( - self.index, - self.rtxn, - criterion, - field, - implementation_strategy, - )?), - Name::Desc(field) => Box::new(AscDesc::desc( - self.index, - self.rtxn, - criterion, - field, - implementation_strategy, - )?), - }; - } - - Ok(Final::new(self, criterion)) - } -} - -pub fn resolve_query_tree( - ctx: &dyn Context, - query_tree: &Operation, - wdcache: &mut WordDerivationsCache, -) -> Result { - fn resolve_operation( - ctx: &dyn Context, - query_tree: &Operation, - wdcache: &mut WordDerivationsCache, - ) -> Result { - use Operation::{And, Or, Phrase, Query}; - - match query_tree { - And(ops) => { - let mut ops = ops - .iter() - .map(|op| resolve_operation(ctx, op, wdcache)) - .collect::>>()?; - - ops.sort_unstable_by_key(|cds| cds.len()); - - let mut candidates = RoaringBitmap::new(); - let mut first_loop = true; - for docids in ops { - if first_loop { - candidates = docids; - first_loop = false; - } else { - candidates &= &docids; - } - } - Ok(candidates) - } - Phrase(words) => resolve_phrase(ctx, words), - Or(_, ops) => { - let mut candidates = RoaringBitmap::new(); - for op in ops { - let docids = resolve_operation(ctx, op, wdcache)?; - candidates |= docids; - } - Ok(candidates) - } - Query(q) => Ok(query_docids(ctx, q, wdcache)?), - } - } - - resolve_operation(ctx, query_tree, wdcache) -} - -pub fn resolve_phrase(ctx: &dyn Context, phrase: &[Option]) -> Result { - let mut candidates = RoaringBitmap::new(); - let mut first_iter = true; - let winsize = phrase.len().min(3); - - if phrase.is_empty() { - return Ok(candidates); - } - - for win in phrase.windows(winsize) { - // Get all the documents with the matching distance for each word pairs. - let mut bitmaps = Vec::with_capacity(winsize.pow(2)); - for (offset, s1) in win - .iter() - .enumerate() - .filter_map(|(index, word)| word.as_ref().map(|word| (index, word))) - { - for (dist, s2) in win - .iter() - .skip(offset + 1) - .enumerate() - .filter_map(|(index, word)| word.as_ref().map(|word| (index, word))) - { - if dist == 0 { - match ctx.word_pair_proximity_docids(s1, s2, 1)? { - Some(m) => bitmaps.push(m), - // If there are no document for this pair, there will be no - // results for the phrase query. - None => return Ok(RoaringBitmap::new()), - } - } else { - let mut bitmap = RoaringBitmap::new(); - for dist in 0..=dist { - if let Some(m) = ctx.word_pair_proximity_docids(s1, s2, dist as u8 + 1)? { - bitmap |= m - } - } - if bitmap.is_empty() { - return Ok(bitmap); - } else { - bitmaps.push(bitmap); - } - } - } - } - - // We sort the bitmaps so that we perform the small intersections first, which is faster. 
- bitmaps.sort_unstable_by_key(|a| a.len()); - - for bitmap in bitmaps { - if first_iter { - candidates = bitmap; - first_iter = false; - } else { - candidates &= bitmap; - } - // There will be no match, return early - if candidates.is_empty() { - break; - } - } - } - Ok(candidates) -} - -fn all_word_pair_overall_proximity_docids, U: AsRef>( - ctx: &dyn Context, - left_words: &[(T, u8)], - right_words: &[(U, u8)], - proximity: u8, -) -> Result { - let mut docids = RoaringBitmap::new(); - for (left, _l_typo) in left_words { - for (right, _r_typo) in right_words { - let current_docids = - word_pair_overall_proximity_docids(ctx, left.as_ref(), right.as_ref(), proximity)? - .unwrap_or_default(); - docids |= current_docids; - } - } - Ok(docids) -} - -fn query_docids( - ctx: &dyn Context, - query: &Query, - wdcache: &mut WordDerivationsCache, -) -> Result { - match &query.kind { - QueryKind::Exact { word, original_typo } => { - if query.prefix && ctx.in_prefix_cache(word) { - let mut docids = ctx.word_prefix_docids(word)?.unwrap_or_default(); - // only add the exact docids if the word hasn't been derived - if *original_typo == 0 { - docids |= ctx.exact_word_prefix_docids(word)?.unwrap_or_default(); - } - Ok(docids) - } else if query.prefix { - let words = word_derivations(word, true, 0, ctx.words_fst(), wdcache)?; - let mut docids = RoaringBitmap::new(); - for (word, _typo) in words { - docids |= ctx.word_docids(word)?.unwrap_or_default(); - // only add the exact docids if the word hasn't been derived - if *original_typo == 0 { - docids |= ctx.exact_word_docids(word)?.unwrap_or_default(); - } - } - Ok(docids) - } else { - let mut docids = ctx.word_docids(word)?.unwrap_or_default(); - // only add the exact docids if the word hasn't been derived - if *original_typo == 0 { - docids |= ctx.exact_word_docids(word)?.unwrap_or_default(); - } - Ok(docids) - } - } - QueryKind::Tolerant { typo, word } => { - let words = word_derivations(word, query.prefix, *typo, ctx.words_fst(), wdcache)?; - let mut docids = RoaringBitmap::new(); - for (word, typo) in words { - let mut current_docids = ctx.word_docids(word)?.unwrap_or_default(); - if *typo == 0 { - current_docids |= ctx.exact_word_docids(word)?.unwrap_or_default() - } - docids |= current_docids; - } - Ok(docids) - } - } -} - -fn query_pair_proximity_docids( - ctx: &dyn Context, - left: &Query, - right: &Query, - proximity: u8, - wdcache: &mut WordDerivationsCache, -) -> Result { - if proximity >= 8 { - let mut candidates = query_docids(ctx, left, wdcache)?; - let right_candidates = query_docids(ctx, right, wdcache)?; - candidates &= right_candidates; - return Ok(candidates); - } - - let prefix = right.prefix; - match (&left.kind, &right.kind) { - (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => { - if prefix { - // There are three distinct cases which we need to distinguish regarding the prefix `right`: - // - // 1. `right` is not in any prefix cache because it is not the prefix of many words - // (and thus, it doesn't have many word derivations) - // 2. `right` is in the prefix cache but cannot be found in the "word prefix pair proximity" databases either - // because it is too long or because the given proximity is too high. - // 3. `right` is in the prefix cache and can be found in the "word prefix pair proximity" databases - // - // The three cases are handled as follows: - // 1. We manually retrieve all the word derivations of `right` and check the `word_pair_proximity` - // database for each of them. - // 2. 
It would be too expensive to apply the same strategy as (1), therefore, we "disable" the - // proximity ranking rule for the prefixes of the right word. This is done as follows: - // 1. Only find the documents where left is in proximity to the exact (ie non-prefix) right word - // 2. Otherwise, assume that their proximity in all the documents in which they coexist is >= 8 - // - // 3. Query the prefix proximity databases. - match ( - ctx.in_prefix_cache(right), - right.len() <= MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB - && proximity <= MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB, - ) { - // Case 1: not in prefix cache - (false, _) => { - let r_words = word_derivations(right, true, 0, ctx.words_fst(), wdcache)?; - all_word_pair_overall_proximity_docids( - ctx, - &[(left, 0)], - r_words, - proximity, - ) - } - // Case 2: in prefix cache but either the prefix length or the proximity makes it impossible to - // query the prefix proximity databases. - (true, false) => { - // To "save" the relevancy a little bit, we still find the documents where the - // exact (i.e. non-prefix) right word is in the given proximity to the left word. - Ok(word_pair_overall_proximity_docids( - ctx, - left.as_str(), - right.as_str(), - proximity, - )? - .unwrap_or_default()) - } - // Case 3: in prefix cache, short enough, and proximity is low enough - (true, true) => Ok(word_prefix_pair_overall_proximity_docids( - ctx, - left.as_str(), - right.as_str(), - proximity, - )? - .unwrap_or_default()), - } - } else { - Ok(word_pair_overall_proximity_docids( - ctx, - left.as_str(), - right.as_str(), - proximity, - )? - .unwrap_or_default()) - } - } - (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }) => { - let l_words = - word_derivations(left, false, *typo, ctx.words_fst(), wdcache)?.to_owned(); - if prefix { - // The logic here is almost identical to the one in the previous match branch. - // The difference is that we fetch the docids for each derivation of the left word. - match ( - ctx.in_prefix_cache(right), - right.len() <= MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB - && proximity <= MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB, - ) { - // Case 1: not in prefix cache - (false, _) => { - let mut docids = RoaringBitmap::new(); - let r_words = word_derivations(right, true, 0, ctx.words_fst(), wdcache)?; - for (left, _) in l_words { - docids |= all_word_pair_overall_proximity_docids( - ctx, - &[(left, 0)], - r_words, - proximity, - )?; - } - Ok(docids) - } - // Case 2: in prefix cache but either the prefix length or the proximity makes it impossible to - // query the prefix proximity databases. - (true, false) => { - // To "save" the relevancy a little bit, we still find the documents where the - // exact (i.e. non-prefix) right word is in proximity to any derivation of the left word. - let mut candidates = RoaringBitmap::new(); - for (left, _) in l_words { - candidates |= ctx - .word_pair_proximity_docids(&left, right, proximity)? - .unwrap_or_default(); - } - Ok(candidates) - } - // Case 3: in prefix cache, short enough, and proximity is low enough - (true, true) => { - let mut docids = RoaringBitmap::new(); - for (left, _) in l_words { - docids |= word_prefix_pair_overall_proximity_docids( - ctx, - left.as_str(), - right.as_str(), - proximity, - )? - .unwrap_or_default(); - } - Ok(docids) - } - } - } else { - all_word_pair_overall_proximity_docids(ctx, &l_words, &[(right, 0)], proximity) - } - } - (QueryKind::Exact { word: left, .. 
}, QueryKind::Tolerant { typo, word: right }) => { - let r_words = word_derivations(right, prefix, *typo, ctx.words_fst(), wdcache)?; - all_word_pair_overall_proximity_docids(ctx, &[(left, 0)], r_words, proximity) - } - ( - QueryKind::Tolerant { typo: l_typo, word: left }, - QueryKind::Tolerant { typo: r_typo, word: right }, - ) => { - let l_words = - word_derivations(left, false, *l_typo, ctx.words_fst(), wdcache)?.to_owned(); - let r_words = word_derivations(right, prefix, *r_typo, ctx.words_fst(), wdcache)?; - all_word_pair_overall_proximity_docids(ctx, &l_words, r_words, proximity) - } - } -} - -#[cfg(test)] -pub mod test { - use std::collections::HashMap; - use std::iter; - - use maplit::hashmap; - use rand::rngs::StdRng; - use rand::{Rng, SeedableRng}; - - use super::*; - - fn s(s: &str) -> String { - s.to_string() - } - pub struct TestContext<'t> { - words_fst: fst::Set>, - word_docids: HashMap, - exact_word_docids: HashMap, - word_prefix_docids: HashMap, - exact_word_prefix_docids: HashMap, - word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, - word_prefix_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, - prefix_word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, - docid_words: HashMap>, - } - - impl<'c> Context<'c> for TestContext<'c> { - fn documents_ids(&self) -> heed::Result { - Ok(self.word_docids.iter().fold(RoaringBitmap::new(), |acc, (_, docids)| acc | docids)) - } - - fn word_docids(&self, word: &str) -> heed::Result> { - Ok(self.word_docids.get(&word.to_string()).cloned()) - } - - fn exact_word_docids(&self, word: &str) -> heed::Result> { - Ok(self.exact_word_docids.get(&word.to_string()).cloned()) - } - - fn word_prefix_docids(&self, word: &str) -> heed::Result> { - Ok(self.word_prefix_docids.get(&word.to_string()).cloned()) - } - - fn exact_word_prefix_docids(&self, word: &str) -> heed::Result> { - Ok(self.exact_word_prefix_docids.get(&word.to_string()).cloned()) - } - - fn word_pair_proximity_docids( - &self, - left: &str, - right: &str, - proximity: u8, - ) -> heed::Result> { - let key = (left.to_string(), right.to_string(), proximity.into()); - Ok(self.word_pair_proximity_docids.get(&key).cloned()) - } - - fn word_prefix_pair_proximity_docids( - &self, - word: &str, - prefix: &str, - proximity: u8, - ) -> heed::Result> { - let key = (word.to_string(), prefix.to_string(), proximity.into()); - Ok(self.word_prefix_pair_proximity_docids.get(&key).cloned()) - } - fn prefix_word_pair_proximity_docids( - &self, - prefix: &str, - word: &str, - proximity: u8, - ) -> heed::Result> { - let key = (prefix.to_string(), word.to_string(), proximity.into()); - Ok(self.prefix_word_pair_proximity_docids.get(&key).cloned()) - } - - fn words_fst<'t>(&self) -> &'t fst::Set> { - &self.words_fst - } - - fn in_prefix_cache(&self, word: &str) -> bool { - self.word_prefix_docids.contains_key(&word.to_string()) - } - - fn docid_words_positions( - &self, - docid: DocumentId, - ) -> heed::Result> { - if let Some(docid_words) = self.docid_words.get(&docid) { - Ok(docid_words - .iter() - .enumerate() - .map(|(i, w)| { - let bitmap = RoaringBitmap::from_sorted_iter(iter::once(i as u32)).unwrap(); - (w.clone(), bitmap) - }) - .collect()) - } else { - Ok(HashMap::new()) - } - } - - fn word_position_iterator( - &self, - _word: &str, - _in_prefix_cache: bool, - ) -> heed::Result< - Box> + 'c>, - > { - todo!() - } - - fn synonyms(&self, _word: &str) -> heed::Result>>> { - todo!() - } - - fn searchable_fields_ids(&self) -> Result> { - 
todo!() - } - - fn word_position_docids( - &self, - _word: &str, - _pos: u32, - ) -> heed::Result> { - todo!() - } - - fn field_id_word_count_docids( - &self, - _field_id: FieldId, - _word_count: u8, - ) -> heed::Result> { - todo!() - } - } - - impl<'a> Default for TestContext<'a> { - fn default() -> TestContext<'a> { - let mut rng = StdRng::seed_from_u64(102); - let rng = &mut rng; - - fn random_postings(rng: &mut R, len: usize) -> RoaringBitmap { - let mut values = Vec::::with_capacity(len); - while values.len() != len { - values.push(rng.gen()); - } - values.sort_unstable(); - - RoaringBitmap::from_sorted_iter(values.into_iter()).unwrap() - } - - let word_docids = hashmap! { - s("hello") => random_postings(rng, 1500), - s("hi") => random_postings(rng, 4000), - s("word") => random_postings(rng, 2500), - s("split") => random_postings(rng, 400), - s("ngrams") => random_postings(rng, 1400), - s("world") => random_postings(rng, 15_000), - s("earth") => random_postings(rng, 8000), - s("2021") => random_postings(rng, 100), - s("2020") => random_postings(rng, 500), - s("is") => random_postings(rng, 50_000), - s("this") => random_postings(rng, 50_000), - s("good") => random_postings(rng, 1250), - s("morning") => random_postings(rng, 125), - }; - - let exact_word_docids = HashMap::new(); - - let mut docid_words = HashMap::new(); - for (word, docids) in word_docids.iter() { - for docid in docids { - let words: &mut Vec<_> = docid_words.entry(docid).or_default(); - words.push(word.clone()); - } - } - - let word_prefix_docids = hashmap! { - s("h") => &word_docids[&s("hello")] | &word_docids[&s("hi")], - s("wor") => &word_docids[&s("word")] | &word_docids[&s("world")], - s("20") => &word_docids[&s("2020")] | &word_docids[&s("2021")], - }; - - let exact_word_prefix_docids = HashMap::new(); - - let mut word_pair_proximity_docids = HashMap::new(); - let mut word_prefix_pair_proximity_docids = HashMap::new(); - let mut prefix_word_pair_proximity_docids = HashMap::new(); - - for (lword, lcandidates) in &word_docids { - for (rword, rcandidates) in &word_docids { - if lword == rword { - continue; - } - let candidates = lcandidates & rcandidates; - for candidate in candidates { - if let Some(docid_words) = docid_words.get(&candidate) { - let lposition = docid_words.iter().position(|w| w == lword).unwrap(); - let rposition = docid_words.iter().position(|w| w == rword).unwrap(); - let key = if lposition < rposition { - (s(lword), s(rword), (rposition - lposition) as i32) - } else { - (s(lword), s(rword), (lposition - rposition + 1) as i32) - }; - let docids: &mut RoaringBitmap = - word_pair_proximity_docids.entry(key).or_default(); - docids.push(candidate); - } - } - } - for (pword, pcandidates) in &word_prefix_docids { - if lword.starts_with(pword) { - continue; - } - let candidates = lcandidates & pcandidates; - for candidate in candidates { - if let Some(docid_words) = docid_words.get(&candidate) { - let lposition = docid_words.iter().position(|w| w == lword).unwrap(); - let rposition = - docid_words.iter().position(|w| w.starts_with(pword)).unwrap(); - if lposition < rposition { - let key = (s(lword), s(pword), (rposition - lposition) as i32); - let docids: &mut RoaringBitmap = - word_prefix_pair_proximity_docids.entry(key).or_default(); - docids.push(candidate); - } else { - let key = (s(lword), s(pword), (lposition - rposition) as i32); - let docids: &mut RoaringBitmap = - prefix_word_pair_proximity_docids.entry(key).or_default(); - docids.push(candidate); - }; - } - } - } - } - - let mut keys = 
word_docids.keys().collect::>(); - keys.sort_unstable(); - let words_fst = fst::Set::from_iter(keys).unwrap().map_data(Cow::Owned).unwrap(); - - TestContext { - words_fst, - word_docids, - exact_word_docids, - word_prefix_docids, - exact_word_prefix_docids, - word_pair_proximity_docids, - word_prefix_pair_proximity_docids, - prefix_word_pair_proximity_docids, - docid_words, - } - } - } -} diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs deleted file mode 100644 index 182f9fbea..000000000 --- a/milli/src/search/criteria/proximity.rs +++ /dev/null @@ -1,712 +0,0 @@ -use std::collections::btree_map::{self, BTreeMap}; -use std::collections::hash_map::HashMap; - -use log::debug; -use roaring::RoaringBitmap; -use slice_group_by::GroupBy; - -use super::{ - query_docids, query_pair_proximity_docids, resolve_phrase, resolve_query_tree, Context, - Criterion, CriterionParameters, CriterionResult, -}; -use crate::search::criteria::InitialCandidates; -use crate::search::query_tree::{maximum_proximity, Operation, Query, QueryKind}; -use crate::search::{build_dfa, CriterionImplementationStrategy, WordDerivationsCache}; -use crate::{Position, Result}; - -type Cache = HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>; - -/// Threshold on the number of candidates that will make -/// the system choose between one algorithm or another. -const CANDIDATES_THRESHOLD: u64 = 1000; - -/// Threshold on the number of proximity that will make -/// the system choose between one algorithm or another. -const PROXIMITY_THRESHOLD: u8 = 0; - -pub struct Proximity<'t> { - ctx: &'t dyn Context<'t>, - /// (max_proximity, query_tree, allowed_candidates) - state: Option<(u8, Operation, RoaringBitmap)>, - proximity: u8, - initial_candidates: InitialCandidates, - parent: Box, - candidates_cache: Cache, - plane_sweep_cache: Option>, - implementation_strategy: CriterionImplementationStrategy, -} - -impl<'t> Proximity<'t> { - pub fn new( - ctx: &'t dyn Context<'t>, - parent: Box, - implementation_strategy: CriterionImplementationStrategy, - ) -> Self { - Proximity { - ctx, - state: None, - proximity: 0, - initial_candidates: InitialCandidates::Estimated(RoaringBitmap::new()), - parent, - candidates_cache: Cache::new(), - plane_sweep_cache: None, - implementation_strategy, - } - } -} - -impl<'t> Criterion for Proximity<'t> { - #[logging_timer::time("Proximity::{}")] - fn next(&mut self, params: &mut CriterionParameters) -> Result> { - // remove excluded candidates when next is called, instead of doing it in the loop. 
- if let Some((_, _, allowed_candidates)) = self.state.as_mut() { - *allowed_candidates -= params.excluded_candidates; - } - - loop { - debug!( - "Proximity at iteration {} (max prox {:?}) ({:?})", - self.proximity, - self.state.as_ref().map(|(mp, _, _)| mp), - self.state.as_ref().map(|(_, _, cd)| cd), - ); - - match &mut self.state { - Some((max_prox, _, allowed_candidates)) - if allowed_candidates.is_empty() || self.proximity > *max_prox => - { - self.state = None; // reset state - } - Some((_, query_tree, allowed_candidates)) => { - let mut new_candidates = if matches!( - self.implementation_strategy, - CriterionImplementationStrategy::OnlyIterative - ) || (matches!( - self.implementation_strategy, - CriterionImplementationStrategy::Dynamic - ) && allowed_candidates.len() - <= CANDIDATES_THRESHOLD - && self.proximity > PROXIMITY_THRESHOLD) - { - if let Some(cache) = self.plane_sweep_cache.as_mut() { - match cache.next() { - Some((p, candidates)) => { - self.proximity = p; - candidates - } - None => { - self.state = None; // reset state - continue; - } - } - } else { - let cache = resolve_plane_sweep_candidates( - self.ctx, - query_tree, - allowed_candidates, - )?; - self.plane_sweep_cache = Some(cache.into_iter()); - - continue; - } - } else { - // use set theory based algorithm - resolve_candidates( - self.ctx, - query_tree, - self.proximity, - &mut self.candidates_cache, - params.wdcache, - )? - }; - - new_candidates &= &*allowed_candidates; - *allowed_candidates -= &new_candidates; - self.proximity += 1; - - return Ok(Some(CriterionResult { - query_tree: Some(query_tree.clone()), - candidates: Some(new_candidates), - filtered_candidates: None, - initial_candidates: Some(self.initial_candidates.take()), - })); - } - None => match self.parent.next(params)? { - Some(CriterionResult { - query_tree: Some(query_tree), - candidates, - filtered_candidates, - initial_candidates, - }) => { - let mut candidates = match candidates { - Some(candidates) => candidates, - None => { - resolve_query_tree(self.ctx, &query_tree, params.wdcache)? 
- - params.excluded_candidates - } - }; - - if let Some(filtered_candidates) = filtered_candidates { - candidates &= filtered_candidates; - } - - match initial_candidates { - Some(initial_candidates) => { - self.initial_candidates |= initial_candidates - } - None => self.initial_candidates.map_inplace(|c| c | &candidates), - } - - let maximum_proximity = maximum_proximity(&query_tree); - self.state = Some((maximum_proximity as u8, query_tree, candidates)); - self.proximity = 0; - self.plane_sweep_cache = None; - } - Some(CriterionResult { - query_tree: None, - candidates, - filtered_candidates, - initial_candidates, - }) => { - return Ok(Some(CriterionResult { - query_tree: None, - candidates, - filtered_candidates, - initial_candidates, - })); - } - None => return Ok(None), - }, - } - } - } -} - -fn resolve_candidates( - ctx: &dyn Context, - query_tree: &Operation, - proximity: u8, - cache: &mut Cache, - wdcache: &mut WordDerivationsCache, -) -> Result { - fn resolve_operation( - ctx: &dyn Context, - query_tree: &Operation, - proximity: u8, - cache: &mut Cache, - wdcache: &mut WordDerivationsCache, - ) -> Result> { - use Operation::{And, Or, Phrase}; - - let result = match query_tree { - And(ops) => mdfs(ctx, ops, proximity, cache, wdcache)?, - Phrase(words) => { - if proximity == 0 { - let most_left = words - .iter() - .filter_map(|o| o.as_ref()) - .next() - .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); - let most_right = words - .iter() - .rev() - .filter_map(|o| o.as_ref()) - .next() - .map(|w| Query { prefix: false, kind: QueryKind::exact(w.clone()) }); - - match (most_left, most_right) { - (Some(l), Some(r)) => vec![(l, r, resolve_phrase(ctx, words)?)], - _otherwise => Default::default(), - } - } else { - Default::default() - } - } - Or(_, ops) => { - let mut output = Vec::new(); - for op in ops { - let result = resolve_operation(ctx, op, proximity, cache, wdcache)?; - output.extend(result); - } - output - } - Operation::Query(q) => { - if proximity == 0 { - let candidates = query_docids(ctx, q, wdcache)?; - vec![(q.clone(), q.clone(), candidates)] - } else { - Default::default() - } - } - }; - - Ok(result) - } - - fn mdfs_pair( - ctx: &dyn Context, - left: &Operation, - right: &Operation, - proximity: u8, - cache: &mut Cache, - wdcache: &mut WordDerivationsCache, - ) -> Result> { - fn pair_combinations(mana: u8, left_max: u8) -> impl Iterator { - (0..=mana.min(left_max)).map(move |m| (m, mana - m)) - } - - let pair_max_proximity = 7; - - let mut output = Vec::new(); - - for (pair_p, left_right_p) in pair_combinations(proximity, pair_max_proximity) { - for (left_p, right_p) in pair_combinations(left_right_p, left_right_p) { - let left_key = (left.clone(), left_p); - if !cache.contains_key(&left_key) { - let candidates = resolve_operation(ctx, left, left_p, cache, wdcache)?; - cache.insert(left_key.clone(), candidates); - } - - let right_key = (right.clone(), right_p); - if !cache.contains_key(&right_key) { - let candidates = resolve_operation(ctx, right, right_p, cache, wdcache)?; - cache.insert(right_key.clone(), candidates); - } - - let lefts = cache.get(&left_key).unwrap(); - let rights = cache.get(&right_key).unwrap(); - - for (ll, lr, lcandidates) in lefts { - for (rl, rr, rcandidates) in rights { - let mut candidates = - query_pair_proximity_docids(ctx, lr, rl, pair_p + 1, wdcache)?; - if lcandidates.len() < rcandidates.len() { - candidates &= lcandidates; - candidates &= rcandidates; - } else { - candidates &= rcandidates; - candidates &= lcandidates; - 
}
- if !candidates.is_empty() {
- output.push((ll.clone(), rr.clone(), candidates));
- }
- }
- }
- }
- }
-
- Ok(output)
- }
-
- fn mdfs(
- ctx: &dyn Context,
- branches: &[Operation],
- proximity: u8,
- cache: &mut Cache,
- wdcache: &mut WordDerivationsCache,
- ) -> Result<Vec<(Query, Query, RoaringBitmap)>> {
- // Extract the first two elements, but give back the tail
- // that starts just after the first element.
- let next =
- branches.split_first().map(|(h1, t)| (h1, t.split_first().map(|(h2, _)| (h2, t))));
-
- match next {
- Some((head1, Some((head2, [_])))) => {
- mdfs_pair(ctx, head1, head2, proximity, cache, wdcache)
- }
- Some((head1, Some((head2, tail)))) => {
- let mut output = Vec::new();
- for p in 0..=proximity {
- for (lhead, _, head_candidates) in
- mdfs_pair(ctx, head1, head2, p, cache, wdcache)?
- {
- if !head_candidates.is_empty() {
- for (_, rtail, mut candidates) in
- mdfs(ctx, tail, proximity - p, cache, wdcache)?
- {
- candidates &= &head_candidates;
- if !candidates.is_empty() {
- output.push((lhead.clone(), rtail, candidates));
- }
- }
- }
- }
- }
- Ok(output)
- }
- Some((head1, None)) => resolve_operation(ctx, head1, proximity, cache, wdcache),
- None => Ok(Default::default()),
- }
- }
-
- let mut candidates = RoaringBitmap::new();
- for (_, _, cds) in resolve_operation(ctx, query_tree, proximity, cache, wdcache)? {
- candidates |= cds;
- }
- Ok(candidates)
-}
-
-fn resolve_plane_sweep_candidates(
- ctx: &dyn Context,
- query_tree: &Operation,
- allowed_candidates: &RoaringBitmap,
-) -> Result<BTreeMap<u8, RoaringBitmap>> {
- /// FIXME may be buggy with query like "new new york"
- fn plane_sweep(
- groups_positions: Vec<Vec<(Position, u8, Position)>>,
- consecutive: bool,
- ) -> Result<Vec<(Position, u8, Position)>> {
- fn compute_groups_proximity(
- groups: &[(usize, (Position, u8, Position))],
- consecutive: bool,
- ) -> Option<(Position, u8, Position)> {
- // take the inner proximity of the first group as the initial value
- let (_, (_, mut proximity, _)) = groups.first()?;
- let (_, (left_most_pos, _, _)) = groups.first()?;
- let (_, (_, _, right_most_pos)) =
- groups.iter().max_by_key(|(_, (_, _, right_most_pos))| right_most_pos)?;
-
- for pair in groups.windows(2) {
- if let [(i1, (lpos1, _, rpos1)), (i2, (lpos2, prox2, rpos2))] = pair {
- // if two positions are equal, meaning that they share at least a word, we return None
- if rpos1 == rpos2 || lpos1 == lpos2 || rpos1 == lpos2 || lpos1 == rpos2 {
- return None;
- }
-
- let pair_proximity = {
- // if intervals are disjoint [..].(..)
- if lpos2 > rpos1 {
- lpos2 - rpos1
- }
- // if the second interval is a subset of the first [.(..).]
- else if rpos2 < rpos1 {
- (lpos2 - lpos1).min(rpos1 - rpos2)
- }
- // if intervals overlap [.(..].)
- else {
- (lpos2 - lpos1).min(rpos2 - rpos1)
- }
- };
-
- // if the groups are in the right order (query order), we subtract 1 from the proximity;
- // the proximity is clamped to 7
- let pair_proximity =
- if i1 < i2 { (pair_proximity - 1).min(7) } else { pair_proximity.min(7) };
-
- proximity += pair_proximity as u8 + prox2;
- }
- }
-
- // if the groups must be consecutive, we only accept groups with a proximity of 0
- if !consecutive || proximity == 0 {
- Some((*left_most_pos, proximity, *right_most_pos))
- } else {
- None
- }
- }
-
- let groups_len = groups_positions.len();
-
- let mut groups_positions: Vec<_> =
- groups_positions.into_iter().map(|pos| pos.into_iter()).collect();
-
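// [editor's note] An illustration, not part of the original patch: `plane_sweep`
// keeps one "current" position per word group and repeatedly advances the
// leftmost one, so that every minimal interval containing one position from
// each group is evaluated exactly once. Assuming two groups with positions
// {1, 9} and {2, 5}, the sweep below evaluates the intervals [1, 2] and
// [5, 9], scoring each one with `compute_groups_proximity`.
- // Pop the top elements of each list.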
- let mut current = Vec::with_capacity(groups_len);
- for (i, positions) in groups_positions.iter_mut().enumerate() {
- match positions.next() {
- Some(p) => current.push((i, p)),
- // if a group returns None, it means that the document does not contain all the words,
- // so we return an empty result.
- None => return Ok(Vec::new()),
- }
- }
-
- // Sort the k elements by their positions.
- current.sort_unstable_by_key(|(_, p)| *p);
-
- // Find the leftmost and rightmost groups and their positions.
- let mut leftmost = *current.first().unwrap();
- let mut rightmost = *current.last().unwrap();
-
- let mut output = Vec::new();
- loop {
- // Find the position p of the next element of the leftmost group's list.
- // If the list is empty, break the loop.
- let p = groups_positions[leftmost.0].next().map(|p| (leftmost.0, p));
-
- // Let q be the position of the second group of the interval.
- let q = current[1];
-
- // If p > r, then the interval [l, r] is minimal and
- // we insert it into the heap according to its size.
- if p.map_or(true, |p| p.1 > rightmost.1) {
- if let Some(group) = compute_groups_proximity(&current, consecutive) {
- output.push(group);
- }
- }
-
- let p = match p {
- Some(p) => p,
- None => break,
- };
-
- // Replace the leftmost group P in the interval.
- current[0] = p;
-
- if p.1 > rightmost.1 {
- // if [l, r] is minimal, let r = p and l = q.
- rightmost = p;
- leftmost = q;
- } else {
- // Otherwise, let l = min{p, q}.
- leftmost = if p.1 < q.1 { p } else { q };
- }
-
- // Then update the interval and the order of groups_positions in the interval.
- current.sort_unstable_by_key(|(_, p)| *p);
- }
-
- // Sort the list according to the sizes and the positions.
- output.sort_unstable();
-
- Ok(output)
- }
-
- fn resolve_operation<'a>(
- query_tree: &'a Operation,
- rocache: &mut HashMap<&'a Operation, Vec<(Position, u8, Position)>>,
- words_positions: &HashMap<String, RoaringBitmap>,
- ) -> Result<Vec<(Position, u8, Position)>> {
- use Operation::{And, Or, Phrase};
-
- if let Some(result) = rocache.get(query_tree) {
- return Ok(result.clone());
- }
-
- let result = match query_tree {
- And(ops) => {
- let mut groups_positions = Vec::with_capacity(ops.len());
- for operation in ops {
- let positions = resolve_operation(operation, rocache, words_positions)?;
- groups_positions.push(positions);
- }
- plane_sweep(groups_positions, false)?
- }
- Phrase(words) => {
- let mut groups_positions = Vec::with_capacity(words.len());
-
- // group the stop words together.
- for words in words.linear_group_by_key(Option::is_none) {
- // skip if it's a group of stop words.
- if matches!(words.first(), None | Some(None)) {
- continue;
- }
- // make a consecutive plane-sweep on the subgroup of words.
- let mut subgroup = Vec::with_capacity(words.len());
- for word in words.iter().map(|w| w.as_deref().unwrap()) {
- match words_positions.get(word) {
- Some(positions) => {
- subgroup.push(positions.iter().map(|p| (p, 0, p)).collect())
- }
- None => return Ok(vec![]),
- }
- }
- match subgroup.len() {
- 0 => {}
- 1 => groups_positions.push(subgroup.pop().unwrap()),
- _ => groups_positions.push(plane_sweep(subgroup, true)?),
- }
- }
- match groups_positions.len() {
- 0 => vec![],
- 1 => groups_positions.pop().unwrap(),
- _ => plane_sweep(groups_positions, false)?,
- }
- }
- Or(_, ops) => {
- let mut result = Vec::new();
- for op in ops {
- result.extend(resolve_operation(op, rocache, words_positions)?)
- }
-
- result.sort_unstable();
- result
- }
- Operation::Query(Query { prefix, kind }) => {
- let mut result = Vec::new();
- match kind {
- QueryKind::Exact { word, ..
} => { - if *prefix { - let iter = word_derivations(word, true, 0, words_positions) - .flat_map(|positions| positions.iter().map(|p| (p, 0, p))); - result.extend(iter); - } else if let Some(positions) = words_positions.get(word) { - result.extend(positions.iter().map(|p| (p, 0, p))); - } - } - QueryKind::Tolerant { typo, word } => { - let iter = word_derivations(word, *prefix, *typo, words_positions) - .flat_map(|positions| positions.iter().map(|p| (p, 0, p))); - result.extend(iter); - } - } - - result.sort_unstable(); - result - } - }; - - rocache.insert(query_tree, result.clone()); - Ok(result) - } - - fn word_derivations<'a>( - word: &str, - is_prefix: bool, - max_typo: u8, - words_positions: &'a HashMap, - ) -> impl Iterator { - let dfa = build_dfa(word, max_typo, is_prefix); - words_positions.iter().filter_map(move |(document_word, positions)| { - use levenshtein_automata::Distance; - match dfa.eval(document_word) { - Distance::Exact(_) => Some(positions), - Distance::AtLeast(_) => None, - } - }) - } - - let mut resolve_operation_cache = HashMap::new(); - let mut candidates = BTreeMap::new(); - for docid in allowed_candidates { - let words_positions = ctx.docid_words_positions(docid)?; - resolve_operation_cache.clear(); - let positions = - resolve_operation(query_tree, &mut resolve_operation_cache, &words_positions)?; - let best_proximity = positions.into_iter().min_by_key(|(_, proximity, _)| *proximity); - let best_proximity = best_proximity.map(|(_, proximity, _)| proximity).unwrap_or(7); - candidates.entry(best_proximity).or_insert_with(RoaringBitmap::new).insert(docid); - } - - Ok(candidates) -} - -#[cfg(test)] -mod tests { - use std::io::Cursor; - - use big_s::S; - - use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; - use crate::index::tests::TempIndex; - use crate::{Criterion, CriterionImplementationStrategy, SearchResult}; - - fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec { - let mut documents = Vec::new(); - for prefix in prefixes { - for i in 0..500 { - documents.push( - serde_json::json!({ - "text": format!("{prefix}{i:x}"), - }) - .as_object() - .unwrap() - .clone(), - ) - } - } - documents - } - - #[test] - fn test_proximity_criterion_prefix_handling() { - let mut index = TempIndex::new(); - index.index_documents_config.autogenerate_docids = true; - - index - .update_settings(|settings| { - settings.set_primary_key(S("id")); - settings.set_criteria(vec![ - Criterion::Words, - Criterion::Typo, - Criterion::Proximity, - ]); - }) - .unwrap(); - - let mut documents = DocumentsBatchBuilder::new(Vec::new()); - - for doc in [ - // 0 - serde_json::json!({ "text": "zero is exactly the amount of configuration I want" }), - // 1 - serde_json::json!({ "text": "zero bad configuration" }), - // 2 - serde_json::json!({ "text": "zero configuration" }), - // 3 - serde_json::json!({ "text": "zero config" }), - // 4 - serde_json::json!({ "text": "zero conf" }), - // 5 - serde_json::json!({ "text": "zero bad conf" }), - ] { - documents.append_json_object(doc.as_object().unwrap()).unwrap(); - } - for doc in documents_with_enough_different_words_for_prefixes(&["conf"]) { - documents.append_json_object(&doc).unwrap(); - } - let documents = - DocumentsBatchReader::from_reader(Cursor::new(documents.into_inner().unwrap())) - .unwrap(); - - index.add_documents(documents).unwrap(); - - let rtxn = index.read_txn().unwrap(); - - let SearchResult { matching_words: _, candidates: _, documents_ids } = index - .search(&rtxn) - .query("zero c") - 
.criterion_implementation_strategy(CriterionImplementationStrategy::OnlySetBased)
- .execute()
- .unwrap();
- insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 4, 1, 5, 0]");
-
- let SearchResult { matching_words: _, candidates: _, documents_ids } = index
- .search(&rtxn)
- .query("zero co")
- .criterion_implementation_strategy(CriterionImplementationStrategy::OnlySetBased)
- .execute()
- .unwrap();
- insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 4, 1, 5, 0]");
-
- let SearchResult { matching_words: _, candidates: _, documents_ids } = index
- .search(&rtxn)
- .query("zero con")
- .criterion_implementation_strategy(CriterionImplementationStrategy::OnlySetBased)
- .execute()
- .unwrap();
- // Here search results are degraded because `con` is in the prefix cache but it is too
- // long to be stored in the prefix proximity databases, and we don't want to iterate over
- // all of its word derivations
- insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3, 4, 5]");
-
- let SearchResult { matching_words: _, candidates: _, documents_ids } = index
- .search(&rtxn)
- .criterion_implementation_strategy(CriterionImplementationStrategy::OnlySetBased)
- .query("zero conf")
- .execute()
- .unwrap();
- // Here search results are degraded as well, but we can still rank correctly documents
- // that contain `conf` exactly, and not as a prefix.
- insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5, 0, 1, 2, 3]");
-
- let SearchResult { matching_words: _, candidates: _, documents_ids } = index
- .search(&rtxn)
- .criterion_implementation_strategy(CriterionImplementationStrategy::OnlySetBased)
- .query("zero config")
- .execute()
- .unwrap();
- // `config` is not a common prefix, so the normal methods are used
- insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 1, 0, 4, 5]");
- }
-}
diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs
deleted file mode 100644
index 69a210e7b..000000000
--- a/milli/src/search/criteria/typo.rs
+++ /dev/null
@@ -1,493 +0,0 @@
-use std::borrow::Cow;
-use std::collections::HashMap;
-use std::mem::take;
-
-use log::debug;
-use roaring::RoaringBitmap;
-
-use super::{
- query_docids, resolve_query_tree, Candidates, Context, Criterion, CriterionParameters,
- CriterionResult,
-};
-use crate::search::criteria::{resolve_phrase, InitialCandidates};
-use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind};
-use crate::search::{word_derivations, WordDerivationsCache};
-use crate::Result;
-
-/// Maximum number of typos for a word of any length.
-const MAX_TYPOS_PER_WORD: u8 = 2;
-
-pub struct Typo<'t> {
- ctx: &'t dyn Context<'t>,
- /// (max_typos, query_tree, candidates)
- state: Option<(u8, Operation, Candidates)>,
- typos: u8,
- initial_candidates: Option<InitialCandidates>,
- parent: Box<dyn Criterion + 't>,
- candidates_cache: HashMap<(Operation, u8), RoaringBitmap>,
-}
-
-impl<'t> Typo<'t> {
- pub fn new(ctx: &'t dyn Context<'t>, parent: Box<dyn Criterion + 't>) -> Self {
- Typo {
- ctx,
- state: None,
- typos: 0,
- initial_candidates: None,
- parent,
- candidates_cache: HashMap::new(),
- }
- }
-}
-
-impl<'t> Criterion for Typo<'t> {
- #[logging_timer::time("Typo::{}")]
- fn next(&mut self, params: &mut CriterionParameters) -> Result<Option<CriterionResult>> {
- use Candidates::{Allowed, Forbidden};
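// [editor's note] An aside, not part of the original patch: `Candidates` (from
// the deleted `criteria` module) is a two-variant set representation, roughly:
//
//     enum Candidates {
//         Allowed(RoaringBitmap),   // documents that may still be returned
//         Forbidden(RoaringBitmap), // documents that may no longer be returned
//     }
//
// so removing the excluded documents means `-=` on an `Allowed` set but `|=`
// on a `Forbidden` one, which is exactly what the match below does.
- // remove excluded candidates when next is called, instead of doing it in the loop.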
- match self.state.as_mut() { - Some((_, _, Allowed(candidates))) => *candidates -= params.excluded_candidates, - Some((_, _, Forbidden(candidates))) => *candidates |= params.excluded_candidates, - None => (), - } - - loop { - debug!( - "Typo at iteration {} (max typos {:?}) ({:?})", - self.typos, - self.state.as_ref().map(|(mt, _, _)| mt), - self.state.as_ref().map(|(_, _, cd)| cd), - ); - - match self.state.as_mut() { - Some((max_typos, _, _)) if self.typos > *max_typos => { - self.state = None; // reset state - } - Some((_, _, Allowed(allowed_candidates))) if allowed_candidates.is_empty() => { - self.state = None; // reset state - } - Some((_, query_tree, candidates_authorization)) => { - let fst = self.ctx.words_fst(); - let new_query_tree = match self.typos { - typos if typos < MAX_TYPOS_PER_WORD => alterate_query_tree( - fst, - query_tree.clone(), - self.typos, - params.wdcache, - )?, - MAX_TYPOS_PER_WORD => { - // When typos >= MAX_TYPOS_PER_WORD, no more alteration of the query tree is possible, - // we keep the altered query tree - *query_tree = alterate_query_tree( - fst, - query_tree.clone(), - self.typos, - params.wdcache, - )?; - // we compute the allowed candidates - let query_tree_allowed_candidates = - resolve_query_tree(self.ctx, query_tree, params.wdcache)?; - // we assign the allowed candidates to the candidates authorization. - *candidates_authorization = match take(candidates_authorization) { - Allowed(allowed_candidates) => { - Allowed(query_tree_allowed_candidates & allowed_candidates) - } - Forbidden(forbidden_candidates) => { - Allowed(query_tree_allowed_candidates - forbidden_candidates) - } - }; - query_tree.clone() - } - _otherwise => query_tree.clone(), - }; - - let mut candidates = resolve_candidates( - self.ctx, - &new_query_tree, - self.typos, - &mut self.candidates_cache, - params.wdcache, - )?; - - match candidates_authorization { - Allowed(allowed_candidates) => { - candidates &= &*allowed_candidates; - *allowed_candidates -= &candidates; - } - Forbidden(forbidden_candidates) => { - candidates -= &*forbidden_candidates; - *forbidden_candidates |= &candidates; - } - } - - let initial_candidates = match self.initial_candidates.as_mut() { - Some(initial_candidates) => initial_candidates.take(), - None => InitialCandidates::Estimated(candidates.clone()), - }; - - self.typos += 1; - - return Ok(Some(CriterionResult { - query_tree: Some(new_query_tree), - candidates: Some(candidates), - filtered_candidates: None, - initial_candidates: Some(initial_candidates), - })); - } - None => match self.parent.next(params)? 
{ - Some(CriterionResult { - query_tree: Some(query_tree), - candidates, - filtered_candidates, - initial_candidates, - }) => { - self.initial_candidates = - match (self.initial_candidates.take(), initial_candidates) { - (Some(self_ic), Some(parent_ic)) => Some(self_ic | parent_ic), - (self_ic, parent_ic) => self_ic.or(parent_ic), - }; - - let candidates = match candidates.or(filtered_candidates) { - Some(candidates) => { - Candidates::Allowed(candidates - params.excluded_candidates) - } - None => Candidates::Forbidden(params.excluded_candidates.clone()), - }; - - let maximum_typos = maximum_typo(&query_tree) as u8; - self.state = Some((maximum_typos, query_tree, candidates)); - self.typos = 0; - } - Some(CriterionResult { - query_tree: None, - candidates, - filtered_candidates, - initial_candidates, - }) => { - return Ok(Some(CriterionResult { - query_tree: None, - candidates, - filtered_candidates, - initial_candidates, - })); - } - None => return Ok(None), - }, - } - } - } -} - -/// Modify the query tree by replacing every tolerant query by an Or operation -/// containing all of the corresponding exact words in the words FST. Each tolerant -/// query will only be replaced by exact query with up to `number_typos` maximum typos. -fn alterate_query_tree( - words_fst: &fst::Set>, - mut query_tree: Operation, - number_typos: u8, - wdcache: &mut WordDerivationsCache, -) -> Result { - fn recurse( - words_fst: &fst::Set>, - operation: &mut Operation, - number_typos: u8, - wdcache: &mut WordDerivationsCache, - ) -> Result<()> { - use Operation::{And, Or, Phrase}; - - match operation { - And(ops) | Or(_, ops) => { - ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, wdcache)) - } - // Because Phrases don't allow typos, no alteration can be done. - Phrase(_words) => Ok(()), - Operation::Query(q) => { - if let QueryKind::Tolerant { typo, word } = &q.kind { - // if no typo is allowed we don't call word_derivations function, - // and directly create an Exact query - if number_typos == 0 { - *operation = Operation::Query(Query { - prefix: q.prefix, - kind: QueryKind::Exact { original_typo: 0, word: word.clone() }, - }); - } else { - let typo = *typo.min(&number_typos); - let words = word_derivations(word, q.prefix, typo, words_fst, wdcache)?; - let queries = words - .iter() - .map(|(word, typo)| { - Operation::Query(Query { - prefix: false, - kind: QueryKind::Exact { - original_typo: *typo, - word: word.to_string(), - }, - }) - }) - .collect(); - - *operation = Operation::or(false, queries); - } - } - - Ok(()) - } - } - } - - recurse(words_fst, &mut query_tree, number_typos, wdcache)?; - Ok(query_tree) -} - -fn resolve_candidates( - ctx: &dyn Context, - query_tree: &Operation, - number_typos: u8, - cache: &mut HashMap<(Operation, u8), RoaringBitmap>, - wdcache: &mut WordDerivationsCache, -) -> Result { - fn resolve_operation( - ctx: &dyn Context, - query_tree: &Operation, - number_typos: u8, - cache: &mut HashMap<(Operation, u8), RoaringBitmap>, - wdcache: &mut WordDerivationsCache, - ) -> Result { - use Operation::{And, Or, Phrase, Query}; - - match query_tree { - And(ops) => mdfs(ctx, ops, number_typos, cache, wdcache), - Phrase(words) => resolve_phrase(ctx, words), - Or(_, ops) => { - let mut candidates = RoaringBitmap::new(); - for op in ops { - let docids = resolve_operation(ctx, op, number_typos, cache, wdcache)?; - candidates |= docids; - } - Ok(candidates) - } - Query(q) => { - if q.kind.typo() == number_typos { - Ok(query_docids(ctx, q, wdcache)?) 
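// [editor's note] An aside, not part of the original patch: a `Query` leaf only
// contributes documents when the typo budget equals its own typo cost, and the
// `mdfs` function below enumerates every way of splitting the total budget
// ("mana") across the branches of an `And`: for two branches and mana = 2 it
// tries (0, 2), (1, 1) and (2, 0), intersecting the partial results of each
// split.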
- } else { - Ok(RoaringBitmap::new()) - } - } - } - } - - fn mdfs( - ctx: &dyn Context, - branches: &[Operation], - mana: u8, - cache: &mut HashMap<(Operation, u8), RoaringBitmap>, - wdcache: &mut WordDerivationsCache, - ) -> Result { - match branches.split_first() { - Some((head, [])) => { - let cache_key = (head.clone(), mana); - if let Some(candidates) = cache.get(&cache_key) { - Ok(candidates.clone()) - } else { - let candidates = resolve_operation(ctx, head, mana, cache, wdcache)?; - cache.insert(cache_key, candidates.clone()); - Ok(candidates) - } - } - Some((head, tail)) => { - let mut candidates = RoaringBitmap::new(); - - for m in 0..=mana { - let mut head_candidates = { - let cache_key = (head.clone(), m); - if let Some(candidates) = cache.get(&cache_key) { - candidates.clone() - } else { - let candidates = resolve_operation(ctx, head, m, cache, wdcache)?; - cache.insert(cache_key, candidates.clone()); - candidates - } - }; - if !head_candidates.is_empty() { - let tail_candidates = mdfs(ctx, tail, mana - m, cache, wdcache)?; - head_candidates &= tail_candidates; - candidates |= head_candidates; - } - } - - Ok(candidates) - } - None => Ok(RoaringBitmap::new()), - } - } - - resolve_operation(ctx, query_tree, number_typos, cache, wdcache) -} - -#[cfg(test)] -mod test { - use super::super::initial::Initial; - use super::super::test::TestContext; - use super::*; - use crate::search::NoopDistinct; - - fn display_criteria(mut criteria: Typo, mut parameters: CriterionParameters) -> String { - let mut result = String::new(); - while let Some(criterion) = criteria.next(&mut parameters).unwrap() { - result.push_str(&format!("{criterion:?}\n\n")); - } - result - } - - #[test] - fn initial_placeholder_no_facets() { - let context = TestContext::default(); - let query_tree = None; - let facet_candidates = None; - - let criterion_parameters = CriterionParameters { - wdcache: &mut WordDerivationsCache::new(), - excluded_candidates: &RoaringBitmap::new(), - }; - - let parent = - Initial::::new(&context, query_tree, facet_candidates, false, None); - let criteria = Typo::new(&context, Box::new(parent)); - - let result = display_criteria(criteria, criterion_parameters); - insta::assert_snapshot!(result, @r###" - CriterionResult { query_tree: None, candidates: None, filtered_candidates: None, initial_candidates: None } - - "###); - } - - #[test] - fn initial_query_tree_no_facets() { - let context = TestContext::default(); - let query_tree = Operation::Or( - false, - vec![Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("split".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("this".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "world".to_string()), - }), - ])], - ); - - let facet_candidates = None; - - let criterion_parameters = CriterionParameters { - wdcache: &mut WordDerivationsCache::new(), - excluded_candidates: &RoaringBitmap::new(), - }; - let parent = - Initial::::new(&context, Some(query_tree), facet_candidates, false, None); - let criteria = Typo::new(&context, Box::new(parent)); - - let result = display_criteria(criteria, criterion_parameters); - insta::assert_snapshot!(result, @r###" - CriterionResult { query_tree: Some(OR - AND - Exact { word: "split" } - Exact { word: "this" } - Exact { word: "world" } - ), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, initial_candidates: Some(Estimated(RoaringBitmap<[]>)) } - - CriterionResult { query_tree: 
Some(OR - AND - Exact { word: "split" } - Exact { word: "this" } - OR - Exact { word: "word" } - Exact { word: "world" } - ), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, initial_candidates: Some(Estimated(RoaringBitmap<[]>)) } - - "###); - } - - #[test] - fn initial_placeholder_with_facets() { - let context = TestContext::default(); - let query_tree = None; - let facet_candidates = context.word_docids("earth").unwrap().unwrap(); - - let criterion_parameters = CriterionParameters { - wdcache: &mut WordDerivationsCache::new(), - excluded_candidates: &RoaringBitmap::new(), - }; - let parent = - Initial::::new(&context, query_tree, Some(facet_candidates), false, None); - let criteria = Typo::new(&context, Box::new(parent)); - - let result = display_criteria(criteria, criterion_parameters); - insta::assert_snapshot!(result, @r###" - CriterionResult { query_tree: None, candidates: None, filtered_candidates: Some(RoaringBitmap<8000 values between 986424 and 4294786076>), initial_candidates: None } - - "###); - } - - #[test] - fn initial_query_tree_with_facets() { - let context = TestContext::default(); - let query_tree = Operation::Or( - false, - vec![Operation::And(vec![ - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("split".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::exact("this".to_string()), - }), - Operation::Query(Query { - prefix: false, - kind: QueryKind::tolerant(1, "world".to_string()), - }), - ])], - ); - - let facet_candidates = context.word_docids("earth").unwrap().unwrap(); - - let criterion_parameters = CriterionParameters { - wdcache: &mut WordDerivationsCache::new(), - excluded_candidates: &RoaringBitmap::new(), - }; - let parent = Initial::::new( - &context, - Some(query_tree), - Some(facet_candidates), - false, - None, - ); - let criteria = Typo::new(&context, Box::new(parent)); - - let result = display_criteria(criteria, criterion_parameters); - insta::assert_snapshot!(result, @r###" - CriterionResult { query_tree: Some(OR - AND - Exact { word: "split" } - Exact { word: "this" } - Exact { word: "world" } - ), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, initial_candidates: Some(Estimated(RoaringBitmap<[]>)) } - - CriterionResult { query_tree: Some(OR - AND - Exact { word: "split" } - Exact { word: "this" } - OR - Exact { word: "word" } - Exact { word: "world" } - ), candidates: Some(RoaringBitmap<[]>), filtered_candidates: None, initial_candidates: Some(Estimated(RoaringBitmap<[]>)) } - - "###); - } -} diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs deleted file mode 100644 index 4c5f8b45b..000000000 --- a/milli/src/search/criteria/words.rs +++ /dev/null @@ -1,106 +0,0 @@ -use log::debug; -use roaring::RoaringBitmap; - -use super::{resolve_query_tree, Context, Criterion, CriterionParameters, CriterionResult}; -use crate::search::criteria::InitialCandidates; -use crate::search::query_tree::Operation; -use crate::Result; - -pub struct Words<'t> { - ctx: &'t dyn Context<'t>, - query_trees: Vec, - candidates: Option, - initial_candidates: Option, - filtered_candidates: Option, - parent: Box, -} - -impl<'t> Words<'t> { - pub fn new(ctx: &'t dyn Context<'t>, parent: Box) -> Self { - Words { - ctx, - query_trees: Vec::default(), - candidates: None, - initial_candidates: None, - parent, - filtered_candidates: None, - } - } -} - -impl<'t> Criterion for Words<'t> { - #[logging_timer::time("Words::{}")] - fn next(&mut self, params: &mut 
CriterionParameters) -> Result<Option<CriterionResult>> {
- // remove excluded candidates when next is called, instead of doing it in the loop.
- if let Some(candidates) = self.candidates.as_mut() {
- *candidates -= params.excluded_candidates;
- }
-
- loop {
- debug!("Words at iteration {} ({:?})", self.query_trees.len(), self.candidates);
-
- match self.query_trees.pop() {
- Some(query_tree) => {
- let candidates = match self.candidates.as_mut() {
- Some(allowed_candidates) => {
- let mut candidates =
- resolve_query_tree(self.ctx, &query_tree, params.wdcache)?;
- candidates &= &*allowed_candidates;
- *allowed_candidates -= &candidates;
- Some(candidates)
- }
- None => None,
- };
-
- let initial_candidates = self.initial_candidates.clone();
-
- return Ok(Some(CriterionResult {
- query_tree: Some(query_tree),
- candidates,
- filtered_candidates: self.filtered_candidates.clone(),
- initial_candidates,
- }));
- }
- None => match self.parent.next(params)? {
- Some(CriterionResult {
- query_tree: Some(query_tree),
- candidates,
- filtered_candidates,
- initial_candidates,
- }) => {
- self.query_trees = explode_query_tree(query_tree);
- self.candidates = candidates;
- self.filtered_candidates = filtered_candidates;
-
- self.initial_candidates =
- match (self.initial_candidates.take(), initial_candidates) {
- (Some(self_ic), Some(parent_ic)) => Some(self_ic | parent_ic),
- (self_ic, parent_ic) => self_ic.or(parent_ic),
- };
- }
- Some(CriterionResult {
- query_tree: None,
- candidates,
- filtered_candidates,
- initial_candidates,
- }) => {
- return Ok(Some(CriterionResult {
- query_tree: None,
- candidates,
- filtered_candidates,
- initial_candidates,
- }));
- }
- None => return Ok(None),
- },
- }
- }
- }
-}
-
-fn explode_query_tree(query_tree: Operation) -> Vec<Operation> {
- match query_tree {
- Operation::Or(true, ops) => ops,
- otherwise => vec![otherwise],
- }
-}
diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs
deleted file mode 100644
index 3ed683823..000000000
--- a/milli/src/search/distinct/facet_distinct.rs
+++ /dev/null
@@ -1,218 +0,0 @@
-use std::mem::size_of;
-
-use concat_arrays::concat_arrays;
-use heed::types::{ByteSlice, Str, Unit};
-use roaring::RoaringBitmap;
-
-use super::{Distinct, DocIter};
-use crate::error::InternalError;
-use crate::heed_codec::facet::{FacetGroupKey, *};
-use crate::index::db_name;
-use crate::{DocumentId, FieldId, Index, Result};
-
-const FID_SIZE: usize = size_of::<FieldId>();
-const DOCID_SIZE: usize = size_of::<DocumentId>();
-
-/// A distinct implementer that is backed by facets.
-///
-/// On each iteration, the facet values for the
-/// distinct attribute of the first document are retrieved. The document ids for these facet values
-/// are then retrieved and taken out of the candidates and added to the excluded set. We take
-/// care to keep the document we are currently on, and remove it from the excluded list. The next
-/// iterations will never contain any occurrence of a document with the same distinct value as a
-/// document from previous iterations.
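// [editor's note] A worked example, not part of the original patch: with the
// documents d0 {color: "red"}, d1 {color: "red"} and d2 {color: "blue"}, a
// distinct on `color` over the candidates {d0, d1, d2} yields d0 (excluding
// the other "red" document, d1) and then d2; `into_excluded` finally returns
// the set containing d1.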
-#[derive(Clone)]
-pub struct FacetDistinct<'a> {
- distinct: FieldId,
- index: &'a Index,
- txn: &'a heed::RoTxn<'a>,
-}
-
-impl<'a> FacetDistinct<'a> {
- pub fn new(distinct: FieldId, index: &'a Index, txn: &'a heed::RoTxn<'a>) -> Self {
- Self { distinct, index, txn }
- }
-}
-
-pub struct FacetDistinctIter<'a> {
- candidates: RoaringBitmap,
- distinct: FieldId,
- excluded: RoaringBitmap,
- index: &'a Index,
- iter_offset: usize,
- txn: &'a heed::RoTxn<'a>,
-}
-
-impl<'a> FacetDistinctIter<'a> {
- fn facet_string_docids(&self, key: &str) -> heed::Result<Option<RoaringBitmap>> {
- self.index
- .facet_id_string_docids
- .get(self.txn, &FacetGroupKey { field_id: self.distinct, level: 0, left_bound: key })
- .map(|opt| opt.map(|v| v.bitmap))
- }
-
- fn facet_number_docids(&self, key: f64) -> heed::Result<Option<RoaringBitmap>> {
- // get facet docids on level 0
- self.index
- .facet_id_f64_docids
- .get(self.txn, &FacetGroupKey { field_id: self.distinct, level: 0, left_bound: key })
- .map(|opt| opt.map(|v| v.bitmap))
- }
-
- fn distinct_string(&mut self, id: DocumentId) -> Result<()> {
- let iter = facet_string_values(id, self.distinct, self.index, self.txn)?;
-
- for item in iter {
- let ((_, _, value), _) = item?;
- let facet_docids =
- self.facet_string_docids(value)?.ok_or(InternalError::DatabaseMissingEntry {
- db_name: db_name::FACET_ID_STRING_DOCIDS,
- key: None,
- })?;
- self.excluded |= facet_docids;
- }
-
- self.excluded.remove(id);
-
- Ok(())
- }
-
- fn distinct_number(&mut self, id: DocumentId) -> Result<()> {
- let iter = facet_number_values(id, self.distinct, self.index, self.txn)?;
-
- for item in iter {
- let ((_, _, value), _) = item?;
- let facet_docids =
- self.facet_number_docids(value)?.ok_or(InternalError::DatabaseMissingEntry {
- db_name: db_name::FACET_ID_F64_DOCIDS,
- key: None,
- })?;
- self.excluded |= facet_docids;
- }
-
- self.excluded.remove(id);
-
- Ok(())
- }
-
- /// Performs the next iteration of the facet distinct. This is a convenience method that is
- /// called by the Iterator::next implementation that transposes the result. It makes error
- /// handling easier.
- fn next_inner(&mut self) -> Result<Option<DocumentId>> {
- // The first step is to remove all the excluded documents from our candidates
- self.candidates -= &self.excluded;
-
- let mut candidates_iter = self.candidates.iter().skip(self.iter_offset);
- match candidates_iter.next() {
- Some(id) => {
- // We distinct the document id on its facet strings and facet numbers.
- self.distinct_string(id)?;
- self.distinct_number(id)?;
-
- // The first document of each iteration is kept, since the next call to
- // `difference_with` will filter out all the documents for that facet value. By
- // increasing the offset we make sure to get the first valid value for the next
- // distinct document to keep.
- self.iter_offset += 1;
-
- Ok(Some(id))
- }
- // no more candidates at this offset, return.
- None => Ok(None),
- }
- }
-}
-
-#[allow(clippy::drop_non_drop)]
-fn facet_values_prefix_key(distinct: FieldId, id: DocumentId) -> [u8; FID_SIZE + DOCID_SIZE] {
- concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes())
-}
-
-fn facet_number_values<'a>(
- id: DocumentId,
- distinct: FieldId,
- index: &Index,
- txn: &'a heed::RoTxn,
-) -> Result<heed::RoPrefix<'a, FieldDocIdFacetF64Codec, Unit>> {
- let key = facet_values_prefix_key(distinct, id);
-
- let iter = index
- .field_id_docid_facet_f64s
- .remap_key_type::<ByteSlice>()
- .prefix_iter(txn, &key)?
- .remap_key_type::<FieldDocIdFacetF64Codec>();
-
- Ok(iter)
-}
-
-fn facet_string_values<'a>(
- id: DocumentId,
- distinct: FieldId,
- index: &Index,
- txn: &'a heed::RoTxn,
-) -> Result<heed::RoPrefix<'a, FieldDocIdFacetStringCodec, Str>> {
- let key = facet_values_prefix_key(distinct, id);
-
- let iter = index
- .field_id_docid_facet_strings
- .remap_key_type::<ByteSlice>()
- .prefix_iter(txn, &key)?
- .remap_types::<FieldDocIdFacetStringCodec, Str>();
-
- Ok(iter)
-}
-
-impl Iterator for FacetDistinctIter<'_> {
- type Item = Result<DocumentId>;
-
- fn next(&mut self) -> Option<Self::Item> {
- self.next_inner().transpose()
- }
-}
-
-impl DocIter for FacetDistinctIter<'_> {
- fn into_excluded(self) -> RoaringBitmap {
- self.excluded
- }
-}
-
-impl<'a> Distinct for FacetDistinct<'a> {
- type Iter = FacetDistinctIter<'a>;
-
- fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter {
- FacetDistinctIter {
- candidates,
- distinct: self.distinct,
- excluded,
- index: self.index,
- iter_offset: 0,
- txn: self.txn,
- }
- }
-}
-
-#[cfg(test)]
-mod test {
- use super::super::test::{generate_index, validate_distinct_candidates};
- use super::*;
-
- macro_rules! test_facet_distinct {
- ($name:ident, $distinct:literal) => {
- #[test]
- fn $name() {
- let (index, fid, candidates) = generate_index($distinct);
- let txn = index.read_txn().unwrap();
- let mut map_distinct = FacetDistinct::new(fid, &index, &txn);
- let excluded = RoaringBitmap::new();
- let mut iter = map_distinct.distinct(candidates.clone(), excluded);
- let count = validate_distinct_candidates(iter.by_ref(), fid, &index);
- let excluded = iter.into_excluded();
- assert_eq!(count as u64 + excluded.len(), candidates.len());
- }
- };
- }
-
- test_facet_distinct!(test_string, "txt");
- test_facet_distinct!(test_strings, "txts");
- test_facet_distinct!(test_number, "cat-int");
-}
diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs
deleted file mode 100644
index 12374c1f5..000000000
--- a/milli/src/search/distinct/mod.rs
+++ /dev/null
@@ -1,155 +0,0 @@
-mod facet_distinct;
-mod noop_distinct;
-
-pub use facet_distinct::FacetDistinct;
-pub use noop_distinct::NoopDistinct;
-use roaring::RoaringBitmap;
-
-use crate::{DocumentId, Result};
-
-/// A trait implemented by document iterators that are returned by calls to `Distinct::distinct`.
-/// It provides a way to get back ownership of the excluded set.
-pub trait DocIter: Iterator<Item = Result<DocumentId>> {
- /// Returns ownership of the internal excluded set.
- fn into_excluded(self) -> RoaringBitmap;
-}
-
-/// A trait that is implemented by structs that perform a distinct on `candidates`. Calling distinct
-/// must return an iterator containing only distinct documents, and add the discarded documents to
-/// the excluded set. The excluded set can later be retrieved by calling `DocIter::into_excluded` on
-/// the returned iterator.
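// [editor's note] A hedged usage sketch, not part of the original patch, of how
// an implementor of the trait below is driven (this mirrors the deleted
// `perform_sort` in milli/src/search/mod.rs further down in this patch):
//
//     let mut distinct = FacetDistinct::new(fid, &index, &txn);
//     let mut iter = distinct.distinct(candidates, RoaringBitmap::new());
//     let page: Result<Vec<_>> = iter.by_ref().take(20).collect();
//     let excluded = iter.into_excluded(); // skipped in later iterations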
-pub trait Distinct { - type Iter: DocIter; - - fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter; -} - -#[cfg(test)] -mod test { - use std::collections::HashSet; - use std::io::Cursor; - - use once_cell::sync::Lazy; - use rand::seq::SliceRandom; - use rand::Rng; - use roaring::RoaringBitmap; - use serde_json::{json, Value}; - - use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; - use crate::index::tests::TempIndex; - use crate::index::Index; - use crate::update::{ - IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings, - }; - use crate::{DocumentId, FieldId, BEU32}; - - static JSON: Lazy> = Lazy::new(|| { - let mut rng = rand::thread_rng(); - let num_docs = rng.gen_range(10..30); - - let mut builder = DocumentsBatchBuilder::new(Vec::new()); - let txts = ["Toto", "Titi", "Tata"]; - let cats = (1..10).map(|i| i.to_string()).collect::>(); - let cat_ints = (1..10).collect::>(); - - for i in 0..num_docs { - let txt = txts.choose(&mut rng).unwrap(); - let mut sample_txts = cats.clone(); - sample_txts.shuffle(&mut rng); - - let mut sample_ints = cat_ints.clone(); - sample_ints.shuffle(&mut rng); - - let json = json!({ - "id": i, - "txt": txt, - "cat-int": rng.gen_range(0..3), - "txts": sample_txts[..(rng.gen_range(0..3))], - "cat-ints": sample_ints[..(rng.gen_range(0..3))], - }); - - let object = match json { - Value::Object(object) => object, - _ => panic!(), - }; - - builder.append_json_object(&object).unwrap(); - } - - builder.into_inner().unwrap() - }); - - /// Returns a temporary index populated with random test documents, the FieldId for the - /// distinct attribute, and the RoaringBitmap with the document ids. - pub(crate) fn generate_index(distinct: &str) -> (TempIndex, FieldId, RoaringBitmap) { - let index = TempIndex::new(); - let mut txn = index.write_txn().unwrap(); - - // set distinct and faceted attributes for the index. - let config = IndexerConfig::default(); - let mut update = Settings::new(&mut txn, &index, &config); - update.set_distinct_field(distinct.to_string()); - update.execute(|_| (), || false).unwrap(); - - // add documents to the index - let config = IndexerConfig::default(); - let indexing_config = IndexDocumentsConfig { - update_method: IndexDocumentsMethod::ReplaceDocuments, - ..Default::default() - }; - let addition = - IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| (), || false) - .unwrap(); - - let reader = - crate::documents::DocumentsBatchReader::from_reader(Cursor::new(JSON.as_slice())) - .unwrap(); - - let (addition, user_error) = addition.add_documents(reader).unwrap(); - user_error.unwrap(); - addition.execute().unwrap(); - - let fields_map = index.fields_ids_map(&txn).unwrap(); - let fid = fields_map.id(distinct).unwrap(); - - let documents = DocumentsBatchReader::from_reader(Cursor::new(JSON.as_slice())).unwrap(); - let map = (0..documents.documents_count()).collect(); - - txn.commit().unwrap(); - - (index, fid, map) - } - - /// Checks that all the candidates are distinct, and returns the candidates number. 
- pub(crate) fn validate_distinct_candidates( - candidates: impl Iterator>, - distinct: FieldId, - index: &Index, - ) -> usize { - fn test(seen: &mut HashSet, value: &Value) { - match value { - Value::Null | Value::Object(_) | Value::Bool(_) => (), - Value::Number(_) | Value::String(_) => { - let s = value.to_string(); - assert!(seen.insert(s)); - } - Value::Array(values) => values.iter().for_each(|value| test(seen, value)), - } - } - - let mut seen = HashSet::::new(); - - let txn = index.read_txn().unwrap(); - let mut count = 0; - for candidate in candidates { - count += 1; - let candidate = candidate.unwrap(); - let id = BEU32::new(candidate); - let document = index.documents.get(&txn, &id).unwrap().unwrap(); - let value = document.get(distinct).unwrap(); - let value = serde_json::from_slice(value).unwrap(); - test(&mut seen, &value); - } - count - } -} diff --git a/milli/src/search/distinct/noop_distinct.rs b/milli/src/search/distinct/noop_distinct.rs deleted file mode 100644 index 96a1f7d5d..000000000 --- a/milli/src/search/distinct/noop_distinct.rs +++ /dev/null @@ -1,55 +0,0 @@ -use roaring::bitmap::IntoIter; -use roaring::RoaringBitmap; - -use super::{Distinct, DocIter}; -use crate::{DocumentId, Result}; - -/// A distinct implementer that does not perform any distinct, -/// and simply returns an iterator to the candidates. -pub struct NoopDistinct; - -pub struct NoopDistinctIter { - candidates: IntoIter, - excluded: RoaringBitmap, -} - -impl Iterator for NoopDistinctIter { - type Item = Result; - - fn next(&mut self) -> Option { - self.candidates.next().map(Ok) - } -} - -impl DocIter for NoopDistinctIter { - fn into_excluded(self) -> RoaringBitmap { - self.excluded - } -} - -impl Distinct for NoopDistinct { - type Iter = NoopDistinctIter; - - fn distinct(&mut self, candidates: RoaringBitmap, excluded: RoaringBitmap) -> Self::Iter { - NoopDistinctIter { candidates: candidates.into_iter(), excluded } - } -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_noop() { - let candidates = (1..10).collect(); - let excluded = RoaringBitmap::new(); - let mut iter = NoopDistinct.distinct(candidates, excluded); - assert_eq!( - iter.by_ref().map(Result::unwrap).collect::>(), - (1..10).collect::>() - ); - - let excluded = iter.into_excluded(); - assert!(excluded.is_empty()); - } -} diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 2aae78bb2..e9435f180 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -309,7 +309,7 @@ impl<'a> FacetDistribution<'a> { let mut distribution = BTreeMap::new(); for (fid, name) in fields_ids_map.iter() { if crate::is_faceted(name, &fields) { - let min_value = if let Some(min_value) = crate::search::criteria::facet_min_value( + let min_value = if let Some(min_value) = crate::search::facet::facet_min_value( self.index, self.rtxn, fid, @@ -319,7 +319,7 @@ impl<'a> FacetDistribution<'a> { } else { continue; }; - let max_value = if let Some(max_value) = crate::search::criteria::facet_max_value( + let max_value = if let Some(max_value) = crate::search::facet::facet_max_value( self.index, self.rtxn, fid, diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index c88d4e9e7..51f1bf005 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -2,11 +2,13 @@ pub use facet_sort_ascending::ascending_facet_sort; pub use facet_sort_descending::descending_facet_sort; use heed::types::{ByteSlice, 
DecodeIgnore}; use heed::{BytesDecode, RoTxn}; +use roaring::RoaringBitmap; pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET}; pub use self::filter::{BadGeoError, Filter}; -use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec, OrderedF64Codec}; use crate::heed_codec::ByteSliceRefCodec; +use crate::{Index, Result}; mod facet_distribution; mod facet_distribution_iter; mod facet_range_search; @@ -14,6 +16,38 @@ mod facet_sort_ascending; mod facet_sort_descending; mod filter; +fn facet_extreme_value<'t>( + mut extreme_it: impl Iterator> + 't, +) -> Result> { + let extreme_value = + if let Some(extreme_value) = extreme_it.next() { extreme_value } else { return Ok(None) }; + let (_, extreme_value) = extreme_value?; + + Ok(OrderedF64Codec::bytes_decode(extreme_value)) +} + +pub fn facet_min_value<'t>( + index: &'t Index, + rtxn: &'t heed::RoTxn, + field_id: u16, + candidates: RoaringBitmap, +) -> Result> { + let db = index.facet_id_f64_docids.remap_key_type::>(); + let it = ascending_facet_sort(rtxn, db, field_id, candidates)?; + facet_extreme_value(it) +} + +pub fn facet_max_value<'t>( + index: &'t Index, + rtxn: &'t heed::RoTxn, + field_id: u16, + candidates: RoaringBitmap, +) -> Result> { + let db = index.facet_id_f64_docids.remap_key_type::>(); + let it = descending_facet_sort(rtxn, db, field_id, candidates)?; + facet_extreme_value(it) +} + /// Get the first facet value in the facet database pub(crate) fn get_first_facet_value<'t, BoundCodec>( txn: &'t RoTxn, diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index dc236dd0d..1015b01cb 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -1,38 +1,27 @@ -use std::borrow::Cow; -use std::collections::hash_map::{Entry, HashMap}; -use std::fmt; -use std::mem::take; -use std::result::Result as StdResult; -use std::str::Utf8Error; -use std::time::Instant; - -use charabia::TokenizerBuilder; -use distinct::{Distinct, DocIter, FacetDistinct, NoopDistinct}; -use fst::automaton::Str; -use fst::{Automaton, IntoStreamer, Streamer}; -use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA}; -use log::debug; -use once_cell::sync::Lazy; -use roaring::bitmap::RoaringBitmap; - pub use self::facet::{FacetDistribution, Filter, DEFAULT_VALUES_PER_FACET}; use self::fst_utils::{Complement, Intersection, StartsWith, Union}; pub use self::matches::{ FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWord, MatchingWords, }; -use self::query_tree::QueryTreeBuilder; -use crate::error::UserError; -use crate::search::criteria::r#final::{Final, FinalResult}; -use crate::search::criteria::InitialCandidates; -use crate::{AscDesc, Criterion, DocumentId, Index, Member, Result}; +use crate::{ + execute_search, AscDesc, DefaultSearchLogger, DocumentId, Index, Result, SearchContext, +}; +use fst::automaton::Str; +use fst::{Automaton, IntoStreamer, Streamer}; +use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA}; +use once_cell::sync::Lazy; +use roaring::bitmap::RoaringBitmap; +use std::borrow::Cow; +use std::collections::hash_map::{Entry, HashMap}; +use std::fmt; +use std::result::Result as StdResult; +use std::str::Utf8Error; // Building these factories is not free. 
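// [editor's note] Context, not part of the original patch: `LevBuilder` is the
// `levenshtein_automata` builder for a fixed edit distance (0, 1 or 2, with
// transposition support enabled here). A per-word DFA is derived from it,
// e.g. `LEVDIST1.build_dfa("world")` accepts every word within one typo of
// "world", which is why the three factories are cached in the statics below.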
static LEVDIST0: Lazy = Lazy::new(|| LevBuilder::new(0, true)); static LEVDIST1: Lazy = Lazy::new(|| LevBuilder::new(1, true)); static LEVDIST2: Lazy = Lazy::new(|| LevBuilder::new(2, true)); -mod criteria; -mod distinct; pub mod facet; mod fst_utils; mod matches; @@ -135,162 +124,18 @@ impl<'a> Search<'a> { } pub fn execute(&self) -> Result { - // We create the query tree by spliting the query into tokens. - let before = Instant::now(); - let (query_tree, primitive_query, matching_words) = match self.query.as_ref() { - Some(query) => { - let mut builder = QueryTreeBuilder::new(self.rtxn, self.index)?; - builder.terms_matching_strategy(self.terms_matching_strategy); - - builder.authorize_typos(self.is_typo_authorized()?); - - builder.words_limit(self.words_limit); - // We make sure that the analyzer is aware of the stop words - // this ensures that the query builder is able to properly remove them. - let mut tokbuilder = TokenizerBuilder::new(); - let stop_words = self.index.stop_words(self.rtxn)?; - if let Some(ref stop_words) = stop_words { - tokbuilder.stop_words(stop_words); - } - - let script_lang_map = self.index.script_language(self.rtxn)?; - if !script_lang_map.is_empty() { - tokbuilder.allow_list(&script_lang_map); - } - - let tokenizer = tokbuilder.build(); - let tokens = tokenizer.tokenize(query); - builder - .build(tokens)? - .map_or((None, None, None), |(qt, pq, mw)| (Some(qt), Some(pq), Some(mw))) - } - None => (None, None, None), - }; - - debug!("query tree: {:?} took {:.02?}", query_tree, before.elapsed()); - - // We create the original candidates with the facet conditions results. - let before = Instant::now(); - let filtered_candidates = match &self.filter { - Some(condition) => Some(condition.evaluate(self.rtxn, self.index)?), - None => None, - }; - - debug!("facet candidates: {:?} took {:.02?}", filtered_candidates, before.elapsed()); - - // We check that we are allowed to use the sort criteria, we check - // that they are declared in the sortable fields. - if let Some(sort_criteria) = &self.sort_criteria { - let sortable_fields = self.index.sortable_fields(self.rtxn)?; - for asc_desc in sort_criteria { - match asc_desc.member() { - Member::Field(ref field) if !crate::is_faceted(field, &sortable_fields) => { - return Err(UserError::InvalidSortableAttribute { - field: field.to_string(), - valid_fields: sortable_fields.into_iter().collect(), - })? - } - Member::Geo(_) if !sortable_fields.contains("_geo") => { - return Err(UserError::InvalidSortableAttribute { - field: "_geo".to_string(), - valid_fields: sortable_fields.into_iter().collect(), - })? - } - _ => (), - } - } - } - - // We check that the sort ranking rule exists and throw an - // error if we try to use it and that it doesn't. - let sort_ranking_rule_missing = !self.index.criteria(self.rtxn)?.contains(&Criterion::Sort); - let empty_sort_criteria = self.sort_criteria.as_ref().map_or(true, |s| s.is_empty()); - if sort_ranking_rule_missing && !empty_sort_criteria { - return Err(UserError::SortRankingRuleMissing.into()); - } - - let criteria_builder = criteria::CriteriaBuilder::new(self.rtxn, self.index)?; - - match self.index.distinct_field(self.rtxn)? 
{ - None => { - let criteria = criteria_builder.build::( - query_tree, - primitive_query, - filtered_candidates, - self.sort_criteria.clone(), - self.exhaustive_number_hits, - None, - self.criterion_implementation_strategy, - )?; - self.perform_sort(NoopDistinct, matching_words.unwrap_or_default(), criteria) - } - Some(name) => { - let field_ids_map = self.index.fields_ids_map(self.rtxn)?; - match field_ids_map.id(name) { - Some(fid) => { - let distinct = FacetDistinct::new(fid, self.index, self.rtxn); - - let criteria = criteria_builder.build( - query_tree, - primitive_query, - filtered_candidates, - self.sort_criteria.clone(), - self.exhaustive_number_hits, - Some(distinct.clone()), - self.criterion_implementation_strategy, - )?; - self.perform_sort(distinct, matching_words.unwrap_or_default(), criteria) - } - None => Ok(SearchResult::default()), - } - } - } - } - - fn perform_sort( - &self, - mut distinct: D, - matching_words: MatchingWords, - mut criteria: Final, - ) -> Result { - let mut offset = self.offset; - let mut initial_candidates = InitialCandidates::Estimated(RoaringBitmap::new()); - let mut excluded_candidates = self.index.soft_deleted_documents_ids(self.rtxn)?; - let mut documents_ids = Vec::new(); - - while let Some(FinalResult { candidates, initial_candidates: ic, .. }) = - criteria.next(&excluded_candidates)? - { - debug!("Number of candidates found {}", candidates.len()); - - let excluded = take(&mut excluded_candidates); - let mut candidates = distinct.distinct(candidates, excluded); - - initial_candidates |= ic; - - if offset != 0 { - let discarded = candidates.by_ref().take(offset).count(); - offset = offset.saturating_sub(discarded); - } - - for candidate in candidates.by_ref().take(self.limit - documents_ids.len()) { - documents_ids.push(candidate?); - } - - excluded_candidates |= candidates.into_excluded(); - - if documents_ids.len() == self.limit { - break; - } - } - - initial_candidates.map_inplace(|c| c - excluded_candidates); - - Ok(SearchResult { - matching_words, - candidates: initial_candidates.into_inner(), - documents_ids, - }) + let mut ctx = SearchContext::new(self.index, self.rtxn); + execute_search( + &mut ctx, + &self.query, + self.terms_matching_strategy, + &self.filter, + self.offset, + self.limit, + Some(self.words_limit), + &mut DefaultSearchLogger, + &mut DefaultSearchLogger, + ) } } diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index fff180879..15c895583 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -18,7 +18,7 @@ mod words; // #[cfg(test)] use std::collections::{BTreeSet, HashSet}; -use charabia::Tokenize; +use charabia::{Tokenize, TokenizerBuilder}; use db_cache::DatabaseCache; use graph_based_ranking_rule::{Proximity, Typo}; use heed::RoTxn; @@ -224,32 +224,41 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( #[allow(clippy::too_many_arguments)] pub fn execute_search( ctx: &mut SearchContext, - query: &str, + query: &Option, terms_matching_strategy: TermsMatchingStrategy, - filters: Option, + filters: &Option, from: usize, length: usize, + words_limit: Option, placeholder_search_logger: &mut dyn SearchLogger, query_graph_logger: &mut dyn SearchLogger, ) -> Result { - assert!(!query.is_empty()); - let query_terms = located_query_terms_from_string(ctx, query.tokenize(), None)?; - let graph = QueryGraph::from_query(ctx, query_terms)?; - let mut universe = if let Some(filters) = filters { filters.evaluate(ctx.txn, ctx.index)? } else { ctx.index.documents_ids(ctx.txn)? 
}; - // TODO: other way to tell whether it is a placeholder search - // This way of doing things is not correct because if someone searches - // for a word that does not appear in any document, the word will be removed - // from the graph and thus its number of nodes will be == 2 - // But in that case, we should return no results. - // - // The search is a placeholder search only if there are no tokens? - let documents_ids = if graph.nodes.len() > 2 { + let documents_ids = if let Some(query) = query { + // We make sure that the analyzer is aware of the stop words + // this ensures that the query builder is able to properly remove them. + let mut tokbuilder = TokenizerBuilder::new(); + let stop_words = ctx.index.stop_words(ctx.txn)?; + if let Some(ref stop_words) = stop_words { + tokbuilder.stop_words(stop_words); + } + + let script_lang_map = ctx.index.script_language(ctx.txn)?; + if !script_lang_map.is_empty() { + tokbuilder.allow_list(&script_lang_map); + } + + let tokenizer = tokbuilder.build(); + let tokens = tokenizer.tokenize(&query); + + let query_terms = located_query_terms_from_string(ctx, tokens, words_limit)?; + let graph = QueryGraph::from_query(ctx, query_terms)?; + universe = resolve_maximally_reduced_query_graph( ctx, &universe, @@ -259,6 +268,7 @@ pub fn execute_search( )?; let ranking_rules = get_ranking_rules_for_query_graph_search(ctx, terms_matching_strategy)?; + bucket_sort(ctx, ranking_rules, &graph, &universe, from, length, query_graph_logger)? } else { let ranking_rules = get_ranking_rules_for_placeholder_search(ctx)?; diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index 8591670b8..e239d4669 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -427,7 +427,7 @@ impl LocatedQueryTerm { /// Convert the tokenised search query into a list of located query terms. pub fn located_query_terms_from_string( ctx: &mut SearchContext, - query: NormalizedTokenIter>, + query: NormalizedTokenIter<&[u8]>, words_limit: Option, ) -> Result> { let nbr_typos = number_of_typos_allowed(ctx)?;
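[editor's note] To make the new control flow concrete, here is a hedged sketch,
not part of the patch, of how a caller drives the rewritten `Search::execute`
above; the builder methods are the ones visible in the deleted tests, and error
handling is left to `?`:

    use milli::{Index, Search, SearchResult, TermsMatchingStrategy};

    fn search_zero_conf(index: &Index) -> milli::Result<SearchResult> {
        let rtxn = index.read_txn()?;
        let mut search = Search::new(&rtxn, index);
        search.query("zero configuration");
        search.terms_matching_strategy(TermsMatchingStrategy::Last);
        search.limit(20);
        // `execute` now builds a `SearchContext` and delegates to the new
        // `execute_search` entry point instead of the deleted criteria pipeline.
        search.execute()
    }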