diff --git a/Cargo.lock b/Cargo.lock index 883d836b7..b7f479d2e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -866,6 +866,7 @@ dependencies = [ "anyhow", "byte-unit", "heed", + "jemallocator", "milli", "stderrlog", "structopt", diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 2ce7f8bd1..86f965368 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -32,7 +32,7 @@ use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use milli::facet::FacetValue; use milli::update::UpdateIndexingStep::*; use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat}; -use milli::{obkv_to_json, Index, UpdateStore, SearchResult, FacetCondition}; +use milli::{obkv_to_json, Index, UpdateStore, SearchResult, MatchingWords, FacetCondition}; static GLOBAL_THREAD_POOL: OnceCell = OnceCell::new(); @@ -132,7 +132,7 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { Self { analyzer } } - fn highlight_value(&self, value: Value, words_to_highlight: &HashSet) -> Value { + fn highlight_value(&self, value: Value, matching_words: &MatchingWords) -> Value { match value { Value::Null => Value::Null, Value::Bool(boolean) => Value::Bool(boolean), @@ -142,7 +142,7 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { let analyzed = self.analyzer.analyze(&old_string); for (word, token) in analyzed.reconstruct() { if token.is_word() { - let to_highlight = words_to_highlight.contains(token.text()); + let to_highlight = matching_words.matches(token.text()); if to_highlight { string.push_str("") } string.push_str(word); if to_highlight { string.push_str("") } @@ -154,12 +154,12 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { }, Value::Array(values) => { Value::Array(values.into_iter() - .map(|v| self.highlight_value(v, words_to_highlight)) + .map(|v| self.highlight_value(v, matching_words)) .collect()) }, Value::Object(object) => { Value::Object(object.into_iter() - .map(|(k, v)| (k, self.highlight_value(v, words_to_highlight))) + .map(|(k, v)| (k, self.highlight_value(v, matching_words))) .collect()) }, } @@ -168,14 +168,14 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> { fn highlight_record( &self, object: &mut Map, - words_to_highlight: &HashSet, + matching_words: &MatchingWords, attributes_to_highlight: &HashSet, ) { // TODO do we need to create a string for element that are not and needs to be highlight? 
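// Note on the change above: `matching_words` replaces the old `HashSet` of found
// words. Where `HashSet::contains` tested exact membership only,
// `MatchingWords::matches` also accepts the derivations implied by the query tree
// (typo variants and prefixes). An illustrative sketch, with hypothetical values
// (only `matches` itself comes from this diff):
//
//     found_words.contains("worlds");   // old behaviour: exact set membership only
//     matching_words.matches("worlds"); // may also be true via a typo/prefix derivation of "world"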
for (key, value) in object.iter_mut() { if attributes_to_highlight.contains(key) { let old_value = mem::take(value); - *value = self.highlight_value(old_value, words_to_highlight); + *value = self.highlight_value(old_value, matching_words); } } } @@ -722,7 +722,7 @@ async fn main() -> anyhow::Result<()> { search.facet_condition(condition); } - let SearchResult { found_words, candidates, documents_ids } = search.execute().unwrap(); + let SearchResult { matching_words, candidates, documents_ids } = search.execute().unwrap(); let number_of_candidates = candidates.len(); let facets = if query.facet_distribution == Some(true) { @@ -748,7 +748,7 @@ async fn main() -> anyhow::Result<()> { for (_id, obkv) in index.documents(&rtxn, documents_ids).unwrap() { let mut object = obkv_to_json(&displayed_fields, &fields_ids_map, obkv).unwrap(); if !disable_highlighting { - highlighter.highlight_record(&mut object, &found_words, &attributes_to_highlight); + highlighter.highlight_record(&mut object, &matching_words, &attributes_to_highlight); } documents.push(object); diff --git a/infos/src/main.rs b/infos/src/main.rs index 91157aaad..0d2b7abb5 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -598,7 +598,7 @@ fn export_documents(index: &Index, rtxn: &heed::RoTxn, internal_ids: Vec) - let fields_ids_map = index.fields_ids_map(rtxn)?; let displayed_fields: Vec<_> = fields_ids_map.iter().map(|(id, _name)| id).collect(); - let iter: Box> = if internal_ids.is_empty() { + let iter: Box> = if internal_ids.is_empty() { Box::new(index.documents.iter(rtxn)?.map(|result| { result.map(|(_id, obkv)| obkv) })) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 0fa966ee8..d6a078a1f 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -3,8 +3,6 @@ mod criterion; mod external_documents_ids; mod fields_ids_map; -mod mdfs; -mod query_tokens; mod search; mod update_store; pub mod facet; @@ -28,7 +26,7 @@ pub use self::heed_codec::{BEU32StrCodec, StrStrU8Codec, ObkvCodec}; pub use self::heed_codec::{RoaringBitmapCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec}; pub use self::heed_codec::{RoaringBitmapLenCodec, BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec}; pub use self::index::Index; -pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult}; +pub use self::search::{Search, FacetDistribution, FacetCondition, SearchResult, MatchingWords}; pub use self::update_store::UpdateStore; pub type FastMap4 = HashMap>; diff --git a/milli/src/mdfs.rs b/milli/src/mdfs.rs deleted file mode 100644 index 6beba3c69..000000000 --- a/milli/src/mdfs.rs +++ /dev/null @@ -1,163 +0,0 @@ -use std::collections::hash_map::Entry::{Occupied, Vacant}; -use std::collections::HashMap; -use std::mem; - -use roaring::RoaringBitmap; -use crate::Index; - -/// A mana depth first search implementation. -pub struct Mdfs<'a> { - index: &'a Index, - rtxn: &'a heed::RoTxn<'a>, - words: &'a [(HashMap, RoaringBitmap)], - union_cache: HashMap<(usize, u8), RoaringBitmap>, - candidates: RoaringBitmap, - mana: u32, - max_mana: u32, -} - -impl<'a> Mdfs<'a> { - pub fn new( - index: &'a Index, - rtxn: &'a heed::RoTxn, - words: &'a [(HashMap, RoaringBitmap)], - candidates: RoaringBitmap, - ) -> Mdfs<'a> - { - // Compute the number of pairs (windows) we have for this list of words. 
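// A worked example of the budget below: each adjacent word pair costs a
// proximity between 1 and 8 mana, so for three words (two pairs) the search
// starts at mana = 2 (both pairs at proximity 1) and gives up after
// max_mana = 2 * 8 = 16 (both pairs at the maximum proximity of 8).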
- let mana = words.len().saturating_sub(1) as u32; - let max_mana = mana * 8; - Mdfs { index, rtxn, words, union_cache: HashMap::new(), candidates, mana, max_mana } - } -} - -impl<'a> Iterator for Mdfs<'a> { - type Item = anyhow::Result<(u32, RoaringBitmap)>; - - fn next(&mut self) -> Option { - // If there is less or only one word therefore the only - // possible documents that we can return are the candidates. - if self.words.len() <= 1 { - if self.candidates.is_empty() { return None } - return Some(Ok((0, mem::take(&mut self.candidates)))); - } - - while self.mana <= self.max_mana { - let mut answer = RoaringBitmap::new(); - let result = mdfs_step( - &self.index, - &self.rtxn, - self.mana, - self.words, - &self.candidates, - &self.candidates, - &mut self.union_cache, - &mut answer, - ); - - match result { - Ok(()) => { - // We always increase the mana for the next loop. - let proximity = self.mana; - self.mana += 1; - - // If no documents were found we must not return and continue - // the search with more mana. - if !answer.is_empty() { - - // We remove the answered documents from the list of - // candidates to be sure we don't search for them again. - self.candidates.difference_with(&answer); - - // We return the answer. - return Some(Ok((proximity, answer))); - } - }, - Err(e) => return Some(Err(e)), - } - } - - None - } -} - -fn mdfs_step( - index: &Index, - rtxn: &heed::RoTxn, - mana: u32, - words: &[(HashMap, RoaringBitmap)], - candidates: &RoaringBitmap, - parent_docids: &RoaringBitmap, - union_cache: &mut HashMap<(usize, u8), RoaringBitmap>, - answer: &mut RoaringBitmap, -) -> anyhow::Result<()> -{ - use std::cmp::{min, max}; - - let (words1, words2) = (&words[0].0, &words[1].0); - let pairs = words_pair_combinations(words1, words2); - let tail = &words[1..]; - let nb_children = tail.len() as u32 - 1; - - // The minimum amount of mana that you must consume is at least 1 and the - // amount of mana that your children can consume. Because the last child must - // consume the remaining mana, it is mandatory that there not too much at the end. - let min_proximity = max(1, mana.saturating_sub(nb_children * 8)) as u8; - - // The maximum amount of mana that you can use is 8 or the remaining amount of - // mana minus your children, as you can't just consume all the mana, - // your children must have at least 1 mana. - let max_proximity = min(8, mana - nb_children) as u8; - - for proximity in min_proximity..=max_proximity { - let mut docids = match union_cache.entry((words.len(), proximity)) { - Occupied(entry) => entry.get().clone(), - Vacant(entry) => { - let mut docids = RoaringBitmap::new(); - if proximity == 8 { - docids = candidates.clone(); - } else { - for (w1, w2) in pairs.iter().cloned() { - let key = (w1, w2, proximity); - if let Some(di) = index.word_pair_proximity_docids.get(rtxn, &key)? { - docids.union_with(&di); - } - } - } - entry.insert(docids).clone() - } - }; - - // We must be sure that we only return docids that are present in the candidates. - docids.intersect_with(parent_docids); - - if !docids.is_empty() { - let mana = mana.checked_sub(proximity as u32).unwrap(); - if tail.len() < 2 { - // We are the last pair, we return without recuring as we don't have any child. 
- answer.union_with(&docids); - return Ok(()); - } else { - return mdfs_step(index, rtxn, mana, tail, candidates, &docids, union_cache, answer); - } - } - } - - Ok(()) -} - -fn words_pair_combinations<'h>( - w1: &'h HashMap, - w2: &'h HashMap, -) -> Vec<(&'h str, &'h str)> -{ - let mut pairs = Vec::new(); - for (w1, (_typos, docids1)) in w1 { - for (w2, (_typos, docids2)) in w2 { - if !docids1.is_disjoint(&docids2) { - pairs.push((w1.as_str(), w2.as_str())); - } - } - } - pairs -} diff --git a/milli/src/query_tokens.rs b/milli/src/query_tokens.rs deleted file mode 100644 index 258c90765..000000000 --- a/milli/src/query_tokens.rs +++ /dev/null @@ -1,217 +0,0 @@ -use meilisearch_tokenizer::{Token, TokenKind}; - -#[derive(Debug)] -enum State { - Free, - Quoted, -} - -impl State { - fn swap(&mut self) { - match self { - State::Quoted => *self = State::Free, - State::Free => *self = State::Quoted, - } - } -} - -#[derive(Debug, PartialEq, Eq)] -pub enum QueryToken<'a> { - Free(Token<'a>), - Quoted(Token<'a>), -} - -pub fn query_tokens<'a>(mut tokens: impl Iterator>) -> impl Iterator> { - let mut state = State::Free; - let f = move || { - loop { - let token = tokens.next()?; - match token.kind() { - _ if token.text().trim() == "\"" => state.swap(), - TokenKind::Word => { - let token = match state { - State::Quoted => QueryToken::Quoted(token), - State::Free => QueryToken::Free(token), - }; - return Some(token); - }, - _ => (), - } - } - }; - std::iter::from_fn(f) -} - -#[cfg(test)] -mod tests { - use super::*; - use QueryToken::{Quoted, Free}; - use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; - use fst::Set; - - macro_rules! assert_eq_query_token { - ($test:expr, Quoted($val:literal)) => { - match $test { - Quoted(val) => assert_eq!(val.text(), $val), - Free(val) => panic!("expected Quoted(\"{}\"), found Free(\"{}\")", $val, val.text()), - } - }; - - ($test:expr, Free($val:literal)) => { - match $test { - Quoted(val) => panic!("expected Free(\"{}\"), found Quoted(\"{}\")", $val, val.text()), - Free(val) => assert_eq!(val.text(), $val), - } - }; - } - - #[test] - fn empty() { - let stop_words = Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); - let query = ""; - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let mut iter = query_tokens(tokens); - assert!(iter.next().is_none()); - - let query = " "; - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let mut iter = query_tokens(tokens); - assert!(iter.next().is_none()); - } - - #[test] - fn one_quoted_string() { - let stop_words = Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); - let query = "\"hello\""; - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let mut iter = query_tokens(tokens); - assert_eq_query_token!(iter.next().unwrap(), Quoted("hello")); - assert!(iter.next().is_none()); - } - - #[test] - fn one_pending_quoted_string() { - let stop_words = Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); - let query = "\"hello"; - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let mut iter = query_tokens(tokens); - assert_eq_query_token!(iter.next().unwrap(), Quoted("hello")); - assert!(iter.next().is_none()); - } - - #[test] - fn one_non_quoted_string() { - let stop_words = Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); - 
let query = "hello"; - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let mut iter = query_tokens(tokens); - assert_eq_query_token!(iter.next().unwrap(), Free("hello")); - assert!(iter.next().is_none()); - } - - #[test] - fn quoted_directly_followed_by_free_strings() { - let stop_words = Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); - let query = "\"hello\"world"; - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let mut iter = query_tokens(tokens); - assert_eq_query_token!(iter.next().unwrap(), Quoted("hello")); - assert_eq_query_token!(iter.next().unwrap(), Free("world")); - assert!(iter.next().is_none()); - } - - #[test] - fn free_directly_followed_by_quoted_strings() { - let stop_words = Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); - let query = "hello\"world\""; - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let mut iter = query_tokens(tokens); - assert_eq_query_token!(iter.next().unwrap(), Free("hello")); - assert_eq_query_token!(iter.next().unwrap(), Quoted("world")); - assert!(iter.next().is_none()); - } - - #[test] - fn free_followed_by_quoted_strings() { - let stop_words = Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); - let query = "hello \"world\""; - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let mut iter = query_tokens(tokens); - assert_eq_query_token!(iter.next().unwrap(), Free("hello")); - assert_eq_query_token!(iter.next().unwrap(), Quoted("world")); - assert!(iter.next().is_none()); - } - - #[test] - fn multiple_spaces_separated_strings() { - let stop_words = Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); - let query = "hello world "; - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let mut iter = query_tokens(tokens); - assert_eq_query_token!(iter.next().unwrap(), Free("hello")); - assert_eq_query_token!(iter.next().unwrap(), Free("world")); - assert!(iter.next().is_none()); - } - - #[test] - fn multi_interleaved_quoted_free_strings() { - let stop_words = Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); - let query = "hello \"world\" coucou \"monde\""; - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let mut iter = query_tokens(tokens); - assert_eq_query_token!(iter.next().unwrap(), Free("hello")); - assert_eq_query_token!(iter.next().unwrap(), Quoted("world")); - assert_eq_query_token!(iter.next().unwrap(), Free("coucou")); - assert_eq_query_token!(iter.next().unwrap(), Quoted("monde")); - assert!(iter.next().is_none()); - } - - #[test] - fn multi_quoted_strings() { - let stop_words = Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); - let query = "\"hello world\" coucou \"monde est beau\""; - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let mut iter = query_tokens(tokens); - assert_eq_query_token!(iter.next().unwrap(), Quoted("hello")); - assert_eq_query_token!(iter.next().unwrap(), Quoted("world")); - assert_eq_query_token!(iter.next().unwrap(), Free("coucou")); - assert_eq_query_token!(iter.next().unwrap(), Quoted("monde")); - assert_eq_query_token!(iter.next().unwrap(), Quoted("est")); - assert_eq_query_token!(iter.next().unwrap(), 
Quoted("beau")); - assert!(iter.next().is_none()); - } - - #[test] - fn chinese() { - let stop_words = Set::default(); - let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words)); - let query = "汽车男生"; - let analyzed = analyzer.analyze(query); - let tokens = analyzed.tokens(); - let mut iter = query_tokens(tokens); - assert_eq_query_token!(iter.next().unwrap(), Free("汽车")); - assert_eq_query_token!(iter.next().unwrap(), Free("男生")); - assert!(iter.next().is_none()); - } -} diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs new file mode 100644 index 000000000..193e9c942 --- /dev/null +++ b/milli/src/search/criteria/asc_desc.rs @@ -0,0 +1,282 @@ +use std::collections::HashMap; +use std::mem::take; + +use anyhow::bail; +use itertools::Itertools; +use log::debug; +use ordered_float::OrderedFloat; +use roaring::RoaringBitmap; + +use crate::facet::FacetType; +use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetLevelValueI64Codec}; +use crate::heed_codec::facet::{FieldDocIdFacetI64Codec, FieldDocIdFacetF64Codec}; +use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; +use crate::search::facet::FacetIter; +use crate::search::query_tree::Operation; +use crate::{FieldId, Index}; +use super::{Criterion, CriterionResult}; + +pub struct AscDesc<'t> { + index: &'t Index, + rtxn: &'t heed::RoTxn<'t>, + field_id: FieldId, + facet_type: FacetType, + ascending: bool, + query_tree: Option, + candidates: RoaringBitmap, + bucket_candidates: RoaringBitmap, + faceted_candidates: RoaringBitmap, + parent: Option>, +} + +impl<'t> AscDesc<'t> { + pub fn initial_asc( + index: &'t Index, + rtxn: &'t heed::RoTxn, + query_tree: Option, + candidates: Option, + field_id: FieldId, + facet_type: FacetType, + ) -> anyhow::Result + { + Self::initial(index, rtxn, query_tree, candidates, field_id, facet_type, true) + } + + pub fn initial_desc( + index: &'t Index, + rtxn: &'t heed::RoTxn, + query_tree: Option, + candidates: Option, + field_id: FieldId, + facet_type: FacetType, + ) -> anyhow::Result + { + Self::initial(index, rtxn, query_tree, candidates, field_id, facet_type, false) + } + + pub fn asc( + index: &'t Index, + rtxn: &'t heed::RoTxn, + parent: Box, + field_id: FieldId, + facet_type: FacetType, + ) -> anyhow::Result + { + Self::new(index, rtxn, parent, field_id, facet_type, true) + } + + pub fn desc( + index: &'t Index, + rtxn: &'t heed::RoTxn, + parent: Box, + field_id: FieldId, + facet_type: FacetType, + ) -> anyhow::Result + { + Self::new(index, rtxn, parent, field_id, facet_type, false) + } + + fn initial( + index: &'t Index, + rtxn: &'t heed::RoTxn, + query_tree: Option, + candidates: Option, + field_id: FieldId, + facet_type: FacetType, + ascending: bool, + ) -> anyhow::Result + { + let faceted_candidates = index.faceted_documents_ids(rtxn, field_id)?; + let candidates = match &query_tree { + Some(qt) => { + let context = CriteriaBuilder::new(rtxn, index)?; + let mut qt_candidates = resolve_query_tree(&context, qt, &mut HashMap::new())?; + if let Some(candidates) = candidates { + qt_candidates.intersect_with(&candidates); + } + qt_candidates + }, + None => candidates.unwrap_or(faceted_candidates.clone()), + }; + + Ok(AscDesc { + index, + rtxn, + field_id, + facet_type, + ascending, + query_tree, + candidates, + faceted_candidates, + bucket_candidates: RoaringBitmap::new(), + parent: None, + }) + } + + fn new( + index: &'t Index, + rtxn: &'t heed::RoTxn, + parent: Box, + field_id: FieldId, + facet_type: FacetType, + 
ascending: bool, + ) -> anyhow::Result + { + Ok(AscDesc { + index, + rtxn, + field_id, + facet_type, + ascending, + query_tree: None, + candidates: RoaringBitmap::new(), + faceted_candidates: index.faceted_documents_ids(rtxn, field_id)?, + bucket_candidates: RoaringBitmap::new(), + parent: Some(parent), + }) + } +} + +impl<'t> Criterion for AscDesc<'t> { + fn next(&mut self) -> anyhow::Result> { + loop { + debug!("Facet {} iteration ({:?})", + if self.ascending { "Asc" } else { "Desc" }, self.candidates, + ); + + match &mut self.candidates { + candidates if candidates.is_empty() => { + let query_tree = self.query_tree.take(); + let candidates = take(&mut self.candidates); + let bucket_candidates = take(&mut self.bucket_candidates); + + match self.parent.as_mut() { + Some(parent) => { + match parent.next()? { + Some(CriterionResult { query_tree, mut candidates, bucket_candidates }) => { + self.query_tree = query_tree; + candidates.intersect_with(&self.faceted_candidates); + self.candidates = candidates; + self.bucket_candidates = bucket_candidates; + }, + None => return Ok(None), + } + }, + None => if query_tree.is_none() && bucket_candidates.is_empty() { + return Ok(None) + }, + } + + return Ok(Some(CriterionResult { query_tree, candidates, bucket_candidates })); + }, + candidates => { + let bucket_candidates = match self.parent { + Some(_) => take(&mut self.bucket_candidates), + None => candidates.clone(), + }; + + let found_candidates = facet_ordered( + self.index, + self.rtxn, + self.field_id, + self.facet_type, + self.ascending, + candidates.clone(), + )?; + + candidates.difference_with(&found_candidates); + + return Ok(Some(CriterionResult { + query_tree: self.query_tree.clone(), + candidates: found_candidates, + bucket_candidates, + })); + }, + } + } + } +} + +fn facet_ordered( + index: &Index, + rtxn: &heed::RoTxn, + field_id: FieldId, + facet_type: FacetType, + ascending: bool, + candidates: RoaringBitmap, +) -> anyhow::Result +{ + match facet_type { + FacetType::Float => { + if candidates.len() <= 1000 { + let db = index.field_id_docid_facet_values.remap_key_type::(); + let mut docids_values = Vec::with_capacity(candidates.len() as usize); + for docid in candidates.iter() { + let left = (field_id, docid, f64::MIN); + let right = (field_id, docid, f64::MAX); + let mut iter = db.range(rtxn, &(left..=right))?; + let entry = if ascending { iter.next() } else { iter.last() }; + if let Some(((_, _, value), ())) = entry.transpose()? 
{ + docids_values.push((docid, OrderedFloat(value))); + } + } + docids_values.sort_unstable_by_key(|(_, value)| *value); + let iter = docids_values.into_iter(); + let iter = if ascending { + Box::new(iter) as Box> + } else { + Box::new(iter.rev()) + }; + match iter.group_by(|(_, v)| *v).into_iter().next() { + Some((_, ids)) => Ok(ids.map(|(id, _)| id).into_iter().collect()), + None => Ok(RoaringBitmap::new()) + } + } else { + let facet_fn = if ascending { + FacetIter::::new_reducing + } else { + FacetIter::::new_reverse_reducing + }; + + let mut iter = facet_fn(rtxn, index, field_id, candidates)?; + Ok(iter.next().transpose()?.map(|(_, docids)| docids).unwrap_or_default()) + } + }, + FacetType::Integer => { + if candidates.len() <= 1000 { + let db = index.field_id_docid_facet_values.remap_key_type::(); + let mut docids_values = Vec::with_capacity(candidates.len() as usize); + for docid in candidates.iter() { + let left = (field_id, docid, i64::MIN); + let right = (field_id, docid, i64::MAX); + let mut iter = db.range(rtxn, &(left..=right))?; + let entry = if ascending { iter.next() } else { iter.last() }; + if let Some(((_, _, value), ())) = entry.transpose()? { + docids_values.push((docid, value)); + } + } + docids_values.sort_unstable_by_key(|(_, value)| *value); + let iter = docids_values.into_iter(); + let iter = if ascending { + Box::new(iter) as Box> + } else { + Box::new(iter.rev()) + }; + match iter.group_by(|(_, v)| *v).into_iter().next() { + Some((_, ids)) => Ok(ids.map(|(id, _)| id).into_iter().collect()), + None => Ok(RoaringBitmap::new()) + } + } else { + let facet_fn = if ascending { + FacetIter::::new_reducing + } else { + FacetIter::::new_reverse_reducing + }; + + let mut iter = facet_fn(rtxn, index, field_id, candidates)?; + Ok(iter.next().transpose()?.map(|(_, docids)| docids).unwrap_or_default()) + } + }, + FacetType::String => bail!("criteria facet type must be a number"), + } +} diff --git a/milli/src/search/criteria/fetcher.rs b/milli/src/search/criteria/fetcher.rs new file mode 100644 index 000000000..38fee20d3 --- /dev/null +++ b/milli/src/search/criteria/fetcher.rs @@ -0,0 +1,113 @@ +use std::collections::HashMap; +use std::mem::take; + +use log::debug; +use roaring::RoaringBitmap; + +use crate::search::query_tree::Operation; +use super::{resolve_query_tree, Candidates, Criterion, CriterionResult, Context}; + +pub struct Fetcher<'t> { + ctx: &'t dyn Context, + query_tree: Option, + candidates: Candidates, + parent: Option>, + should_get_documents_ids: bool, +} + +impl<'t> Fetcher<'t> { + pub fn initial( + ctx: &'t dyn Context, + query_tree: Option, + candidates: Option, + ) -> Self + { + Fetcher { + ctx, + query_tree, + candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), + parent: None, + should_get_documents_ids: true, + } + } + + pub fn new( + ctx: &'t dyn Context, + parent: Box, + ) -> Self + { + Fetcher { + ctx, + query_tree: None, + candidates: Candidates::default(), + parent: Some(parent), + should_get_documents_ids: true, + } + } +} + +impl<'t> Criterion for Fetcher<'t> { + fn next(&mut self) -> anyhow::Result> { + use Candidates::{Allowed, Forbidden}; + loop { + debug!("Fetcher iteration (should_get_documents_ids: {}) ({:?})", + self.should_get_documents_ids, self.candidates, + ); + + let should_get_documents_ids = take(&mut self.should_get_documents_ids); + match &mut self.candidates { + Allowed(_) => { + let candidates = take(&mut self.candidates).into_inner(); + let candidates = match &self.query_tree { + Some(qt) if 
should_get_documents_ids => { + let mut docids = resolve_query_tree(self.ctx, &qt, &mut HashMap::new())?; + docids.intersect_with(&candidates); + docids + }, + _ => candidates, + }; + + return Ok(Some(CriterionResult { + query_tree: self.query_tree.take(), + candidates: candidates.clone(), + bucket_candidates: candidates, + })); + }, + Forbidden(_) => { + match self.parent.as_mut() { + Some(parent) => { + match parent.next()? { + Some(result) => return Ok(Some(result)), + None => if should_get_documents_ids { + let candidates = match &self.query_tree { + Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new())?, + None => self.ctx.documents_ids()?, + }; + + return Ok(Some(CriterionResult { + query_tree: self.query_tree.clone(), + candidates: candidates.clone(), + bucket_candidates: candidates, + })); + }, + } + }, + None => if should_get_documents_ids { + let candidates = match &self.query_tree { + Some(qt) => resolve_query_tree(self.ctx, &qt, &mut HashMap::new())?, + None => self.ctx.documents_ids()?, + }; + + return Ok(Some(CriterionResult { + query_tree: self.query_tree.clone(), + candidates: candidates.clone(), + bucket_candidates: candidates, + })); + }, + } + return Ok(None); + }, + } + } + } +} diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs new file mode 100644 index 000000000..0dcaa5a69 --- /dev/null +++ b/milli/src/search/criteria/mod.rs @@ -0,0 +1,483 @@ +use std::collections::HashMap; +use std::borrow::Cow; + +use anyhow::{bail, Context as _}; +use roaring::RoaringBitmap; + +use crate::facet::FacetType; +use crate::search::word_derivations; +use crate::{Index, FieldId}; + +use super::query_tree::{Operation, Query, QueryKind}; +use self::typo::Typo; +use self::words::Words; +use self::asc_desc::AscDesc; +use self::proximity::Proximity; +use self::fetcher::Fetcher; + +pub mod typo; +pub mod words; +pub mod asc_desc; +pub mod proximity; +pub mod fetcher; + +pub trait Criterion { + fn next(&mut self) -> anyhow::Result<Option<CriterionResult>>; +} + +/// The result of a call to the parent criterion. +#[derive(Debug, Clone, PartialEq)] +pub struct CriterionResult { + /// The query tree that must be used by the child criteria to fetch candidates. + pub query_tree: Option<Operation>, + /// The candidates that this criterion is allowed to return subsets of. + pub candidates: RoaringBitmap, + /// Candidates that come from the current bucket of the initial criterion. + pub bucket_candidates: RoaringBitmap, +} + +/// Either a set of candidates that defines the candidates +/// that are allowed to be returned, +/// or the candidates that must never be returned.
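+/// A sketch of the convention (inferred from the `Default` impl below):
+/// `Allowed(docids)` restricts a criterion to exactly that superset, while
+/// `Forbidden(docids)` merely excludes the given documents, so the default,
+/// `Forbidden(RoaringBitmap::new())`, means "nothing is excluded yet".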
+#[derive(Debug)] +enum Candidates { + Allowed(RoaringBitmap), + Forbidden(RoaringBitmap) +} + +impl Candidates { + fn into_inner(self) -> RoaringBitmap { + match self { + Self::Allowed(inner) => inner, + Self::Forbidden(inner) => inner, + } + } +} + +impl Default for Candidates { + fn default() -> Self { + Self::Forbidden(RoaringBitmap::new()) + } +} +pub trait Context { + fn documents_ids(&self) -> heed::Result; + fn word_docids(&self, word: &str) -> heed::Result>; + fn word_prefix_docids(&self, word: &str) -> heed::Result>; + fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result>; + fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result>; + fn words_fst<'t>(&self) -> &'t fst::Set>; + fn in_prefix_cache(&self, word: &str) -> bool; +} +pub struct CriteriaBuilder<'t> { + rtxn: &'t heed::RoTxn<'t>, + index: &'t Index, + words_fst: fst::Set>, + words_prefixes_fst: fst::Set>, +} + +impl<'a> Context for CriteriaBuilder<'a> { + fn documents_ids(&self) -> heed::Result { + self.index.documents_ids(self.rtxn) + } + + fn word_docids(&self, word: &str) -> heed::Result> { + self.index.word_docids.get(self.rtxn, &word) + } + + fn word_prefix_docids(&self, word: &str) -> heed::Result> { + self.index.word_prefix_docids.get(self.rtxn, &word) + } + + fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result> { + let key = (left, right, proximity); + self.index.word_pair_proximity_docids.get(self.rtxn, &key) + } + + fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result> { + let key = (left, right, proximity); + self.index.word_prefix_pair_proximity_docids.get(self.rtxn, &key) + } + + fn words_fst<'t>(&self) -> &'t fst::Set> { + &self.words_fst + } + + fn in_prefix_cache(&self, word: &str) -> bool { + self.words_prefixes_fst.contains(word) + } +} + +impl<'t> CriteriaBuilder<'t> { + pub fn new(rtxn: &'t heed::RoTxn<'t>, index: &'t Index) -> anyhow::Result { + let words_fst = index.words_fst(rtxn)?; + let words_prefixes_fst = index.words_prefixes_fst(rtxn)?; + Ok(Self { rtxn, index, words_fst, words_prefixes_fst }) + } + + pub fn build( + &'t self, + mut query_tree: Option, + mut facet_candidates: Option, + ) -> anyhow::Result> + { + use crate::criterion::Criterion as Name; + + let fields_ids_map = self.index.fields_ids_map(&self.rtxn)?; + let faceted_fields = self.index.faceted_fields(&self.rtxn)?; + let field_id_facet_type = |field: &str| -> anyhow::Result<(FieldId, FacetType)> { + let id = fields_ids_map.id(field).with_context(|| { + format!("field {:?} isn't registered", field) + })?; + let facet_type = faceted_fields.get(field).with_context(|| { + format!("field {:?} isn't faceted", field) + })?; + Ok((id, *facet_type)) + }; + + let mut criterion = None as Option>; + for name in self.index.criteria(&self.rtxn)? { + criterion = Some(match criterion.take() { + Some(father) => match name { + Name::Typo => Box::new(Typo::new(self, father)), + Name::Words => Box::new(Words::new(self, father)), + Name::Proximity => Box::new(Proximity::new(self, father)), + Name::Asc(field) => { + let (id, facet_type) = field_id_facet_type(&field)?; + Box::new(AscDesc::asc(&self.index, &self.rtxn, father, id, facet_type)?) + }, + Name::Desc(field) => { + let (id, facet_type) = field_id_facet_type(&field)?; + Box::new(AscDesc::desc(&self.index, &self.rtxn, father, id, facet_type)?) 
+ }, + _otherwise => father, + }, + None => match name { + Name::Typo => Box::new(Typo::initial(self, query_tree.take(), facet_candidates.take())), + Name::Words => Box::new(Words::initial(self, query_tree.take(), facet_candidates.take())), + Name::Proximity => Box::new(Proximity::initial(self, query_tree.take(), facet_candidates.take())), + Name::Asc(field) => { + let (id, facet_type) = field_id_facet_type(&field)?; + Box::new(AscDesc::initial_asc(&self.index, &self.rtxn, query_tree.take(), facet_candidates.take(), id, facet_type)?) + }, + Name::Desc(field) => { + let (id, facet_type) = field_id_facet_type(&field)?; + Box::new(AscDesc::initial_desc(&self.index, &self.rtxn, query_tree.take(), facet_candidates.take(), id, facet_type)?) + }, + _otherwise => continue, + }, + }); + } + + match criterion { + Some(criterion) => Ok(Fetcher::new(self, criterion)), + None => Ok(Fetcher::initial(self, query_tree, facet_candidates)), + } + } +} + +pub fn resolve_query_tree<'t>( + ctx: &'t dyn Context, + query_tree: &Operation, + cache: &mut HashMap<(Operation, u8), RoaringBitmap>, +) -> anyhow::Result +{ + fn resolve_operation<'t>( + ctx: &'t dyn Context, + query_tree: &Operation, + cache: &mut HashMap<(Operation, u8), RoaringBitmap>, + ) -> anyhow::Result + { + use Operation::{And, Consecutive, Or, Query}; + + match query_tree { + And(ops) => { + let mut ops = ops.iter().map(|op| { + resolve_operation(ctx, op, cache) + }).collect::>>()?; + + ops.sort_unstable_by_key(|cds| cds.len()); + + let mut candidates = RoaringBitmap::new(); + let mut first_loop = true; + for docids in ops { + if first_loop { + candidates = docids; + first_loop = false; + } else { + candidates.intersect_with(&docids); + } + } + Ok(candidates) + }, + Consecutive(ops) => { + let mut candidates = RoaringBitmap::new(); + let mut first_loop = true; + for slice in ops.windows(2) { + match (&slice[0], &slice[1]) { + (Operation::Query(left), Operation::Query(right)) => { + match query_pair_proximity_docids(ctx, left, right, 1)? { + pair_docids if pair_docids.is_empty() => { + return Ok(RoaringBitmap::new()) + }, + pair_docids if first_loop => { + candidates = pair_docids; + first_loop = false; + }, + pair_docids => { + candidates.intersect_with(&pair_docids); + }, + } + }, + _ => bail!("invalid consecutive query type"), + } + } + Ok(candidates) + }, + Or(_, ops) => { + let mut candidates = RoaringBitmap::new(); + for op in ops { + let docids = resolve_operation(ctx, op, cache)?; + candidates.union_with(&docids); + } + Ok(candidates) + }, + Query(q) => Ok(query_docids(ctx, q)?), + } + } + + resolve_operation(ctx, query_tree, cache) +} + + +fn all_word_pair_proximity_docids, U: AsRef>( + ctx: &dyn Context, + left_words: &[(T, u8)], + right_words: &[(U, u8)], + proximity: u8 +) -> anyhow::Result { + let mut docids = RoaringBitmap::new(); + for (left, _l_typo) in left_words { + for (right, _r_typo) in right_words { + let current_docids = ctx.word_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity)?.unwrap_or_default(); + docids.union_with(¤t_docids); + } + } + Ok(docids) +} + +fn query_docids(ctx: &dyn Context, query: &Query) -> anyhow::Result { + match &query.kind { + QueryKind::Exact { word, .. 
} => { + if query.prefix && ctx.in_prefix_cache(&word) { + Ok(ctx.word_prefix_docids(&word)?.unwrap_or_default()) + } else if query.prefix { + let words = word_derivations(&word, true, 0, ctx.words_fst())?; + let mut docids = RoaringBitmap::new(); + for (word, _typo) in words { + let current_docids = ctx.word_docids(&word)?.unwrap_or_default(); + docids.union_with(¤t_docids); + } + Ok(docids) + } else { + Ok(ctx.word_docids(&word)?.unwrap_or_default()) + } + }, + QueryKind::Tolerant { typo, word } => { + let words = word_derivations(&word, query.prefix, *typo, ctx.words_fst())?; + let mut docids = RoaringBitmap::new(); + for (word, _typo) in words { + let current_docids = ctx.word_docids(&word)?.unwrap_or_default(); + docids.union_with(¤t_docids); + } + Ok(docids) + }, + } +} + +fn query_pair_proximity_docids(ctx: &dyn Context, left: &Query, right: &Query, proximity: u8) -> anyhow::Result { + if proximity >= 8 { + let mut candidates = query_docids(ctx, left)?; + let right_candidates = query_docids(ctx, right)?; + candidates.intersect_with(&right_candidates); + return Ok(candidates); + } + + let prefix = right.prefix; + match (&left.kind, &right.kind) { + (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => { + if prefix && ctx.in_prefix_cache(&right) { + Ok(ctx.word_prefix_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?.unwrap_or_default()) + } else if prefix { + let r_words = word_derivations(&right, true, 0, ctx.words_fst())?; + all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) + } else { + Ok(ctx.word_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?.unwrap_or_default()) + } + }, + (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }) => { + let l_words = word_derivations(&left, false, *typo, ctx.words_fst())?; + if prefix && ctx.in_prefix_cache(&right) { + let mut docids = RoaringBitmap::new(); + for (left, _) in l_words { + let current_docids = ctx.word_prefix_pair_proximity_docids(left.as_ref(), right.as_ref(), proximity)?.unwrap_or_default(); + docids.union_with(¤t_docids); + } + Ok(docids) + } else if prefix { + let r_words = word_derivations(&right, true, 0, ctx.words_fst())?; + all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity) + } else { + all_word_pair_proximity_docids(ctx, &l_words, &[(right, 0)], proximity) + } + }, + (QueryKind::Exact { word: left, .. 
}, QueryKind::Tolerant { typo, word: right }) => { + let r_words = word_derivations(&right, prefix, *typo, ctx.words_fst())?; + all_word_pair_proximity_docids(ctx, &[(left, 0)], &r_words, proximity) + }, + (QueryKind::Tolerant { typo: l_typo, word: left }, QueryKind::Tolerant { typo: r_typo, word: right }) => { + let l_words = word_derivations(&left, false, *l_typo, ctx.words_fst())?; + let r_words = word_derivations(&right, prefix, *r_typo, ctx.words_fst())?; + all_word_pair_proximity_docids(ctx, &l_words, &r_words, proximity) + }, + } +} + +#[cfg(test)] +pub mod test { + use maplit::hashmap; + use rand::{Rng, SeedableRng, rngs::StdRng}; + + use super::*; + use std::collections::HashMap; + + fn s(s: &str) -> String { s.to_string() } + pub struct TestContext<'t> { + words_fst: fst::Set>, + word_docids: HashMap, + word_prefix_docids: HashMap, + word_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, + word_prefix_pair_proximity_docids: HashMap<(String, String, i32), RoaringBitmap>, + } + + impl<'a> Context for TestContext<'a> { + fn documents_ids(&self) -> heed::Result { + Ok(self.word_docids.iter().fold(RoaringBitmap::new(), |acc, (_, docids)| acc | docids)) + } + + fn word_docids(&self, word: &str) -> heed::Result> { + Ok(self.word_docids.get(&word.to_string()).cloned()) + } + + fn word_prefix_docids(&self, word: &str) -> heed::Result> { + Ok(self.word_prefix_docids.get(&word.to_string()).cloned()) + } + + fn word_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result> { + let key = (left.to_string(), right.to_string(), proximity.into()); + Ok(self.word_pair_proximity_docids.get(&key).cloned()) + } + + fn word_prefix_pair_proximity_docids(&self, left: &str, right: &str, proximity: u8) -> heed::Result> { + let key = (left.to_string(), right.to_string(), proximity.into()); + Ok(self.word_prefix_pair_proximity_docids.get(&key).cloned()) + } + + fn words_fst<'t>(&self) -> &'t fst::Set> { + &self.words_fst + } + + fn in_prefix_cache(&self, word: &str) -> bool { + self.word_prefix_docids.contains_key(&word.to_string()) + } + } + + impl<'a> Default for TestContext<'a> { + fn default() -> TestContext<'a> { + let mut rng = StdRng::seed_from_u64(102); + let rng = &mut rng; + + fn random_postings(rng: &mut R, len: usize) -> RoaringBitmap { + let mut values = Vec::::with_capacity(len); + while values.len() != len { + values.push(rng.gen()); + } + values.sort_unstable(); + + RoaringBitmap::from_sorted_iter(values.into_iter()) + } + + let word_docids = hashmap!{ + s("hello") => random_postings(rng, 1500), + s("hi") => random_postings(rng, 4000), + s("word") => random_postings(rng, 2500), + s("split") => random_postings(rng, 400), + s("ngrams") => random_postings(rng, 1400), + s("world") => random_postings(rng, 15_000), + s("earth") => random_postings(rng, 8000), + s("2021") => random_postings(rng, 100), + s("2020") => random_postings(rng, 500), + s("is") => random_postings(rng, 50_000), + s("this") => random_postings(rng, 50_000), + s("good") => random_postings(rng, 1250), + s("morning") => random_postings(rng, 125), + }; + + let word_prefix_docids = hashmap!{ + s("h") => &word_docids[&s("hello")] | &word_docids[&s("hi")], + s("wor") => &word_docids[&s("word")] | &word_docids[&s("world")], + s("20") => &word_docids[&s("2020")] | &word_docids[&s("2021")], + }; + + let hello_world = &word_docids[&s("hello")] & &word_docids[&s("world")]; + let hello_world_split = (hello_world.len() / 2) as usize; + let hello_world_1 = 
hello_world.iter().take(hello_world_split).collect(); + let hello_world_2 = hello_world.iter().skip(hello_world_split).collect(); + + let hello_word = &word_docids[&s("hello")] & &word_docids[&s("word")]; + let hello_word_split = (hello_word.len() / 2) as usize; + let hello_word_4 = hello_word.iter().take(hello_word_split).collect(); + let hello_word_6 = hello_word.iter().skip(hello_word_split).take(hello_word_split/2).collect(); + let hello_word_7 = hello_word.iter().skip(hello_word_split + hello_word_split/2).collect(); + let word_pair_proximity_docids = hashmap!{ + (s("good"), s("morning"), 1) => &word_docids[&s("good")] & &word_docids[&s("morning")], + (s("hello"), s("world"), 1) => hello_world_1, + (s("hello"), s("world"), 4) => hello_world_2, + (s("this"), s("is"), 1) => &word_docids[&s("this")] & &word_docids[&s("is")], + (s("is"), s("2021"), 1) => &word_docids[&s("this")] & &word_docids[&s("is")] & &word_docids[&s("2021")], + (s("is"), s("2020"), 1) => &word_docids[&s("this")] & &word_docids[&s("is")] & (&word_docids[&s("2020")] - &word_docids[&s("2021")]), + (s("this"), s("2021"), 2) => &word_docids[&s("this")] & &word_docids[&s("is")] & &word_docids[&s("2021")], + (s("this"), s("2020"), 2) => &word_docids[&s("this")] & &word_docids[&s("is")] & (&word_docids[&s("2020")] - &word_docids[&s("2021")]), + (s("word"), s("split"), 1) => &word_docids[&s("word")] & &word_docids[&s("split")], + (s("world"), s("split"), 1) => (&word_docids[&s("world")] & &word_docids[&s("split")]) - &word_docids[&s("word")], + (s("hello"), s("word"), 4) => hello_word_4, + (s("hello"), s("word"), 6) => hello_word_6, + (s("hello"), s("word"), 7) => hello_word_7, + (s("split"), s("ngrams"), 3) => (&word_docids[&s("split")] & &word_docids[&s("ngrams")]) - &word_docids[&s("word")], + (s("split"), s("ngrams"), 5) => &word_docids[&s("split")] & &word_docids[&s("ngrams")] & &word_docids[&s("word")], + (s("this"), s("ngrams"), 1) => (&word_docids[&s("split")] & &word_docids[&s("this")] & &word_docids[&s("ngrams")] ) - &word_docids[&s("word")], + (s("this"), s("ngrams"), 2) => &word_docids[&s("split")] & &word_docids[&s("this")] & &word_docids[&s("ngrams")] & &word_docids[&s("word")], + }; + + let word_prefix_pair_proximity_docids = hashmap!{ + (s("hello"), s("wor"), 1) => word_pair_proximity_docids.get(&(s("hello"), s("world"), 1)).unwrap().clone(), + (s("hello"), s("wor"), 4) => word_pair_proximity_docids.get(&(s("hello"), s("world"), 4)).unwrap() | word_pair_proximity_docids.get(&(s("hello"), s("word"), 4)).unwrap(), + (s("hello"), s("wor"), 6) => word_pair_proximity_docids.get(&(s("hello"), s("word"), 6)).unwrap().clone(), + (s("hello"), s("wor"), 7) => word_pair_proximity_docids.get(&(s("hello"), s("word"), 7)).unwrap().clone(), + (s("is"), s("20"), 1) => word_pair_proximity_docids.get(&(s("is"), s("2020"), 1)).unwrap() | word_pair_proximity_docids.get(&(s("is"), s("2021"), 1)).unwrap(), + (s("this"), s("20"), 2) => word_pair_proximity_docids.get(&(s("this"), s("2020"), 2)).unwrap() | word_pair_proximity_docids.get(&(s("this"), s("2021"), 2)).unwrap(), + }; + + let mut keys = word_docids.keys().collect::>(); + keys.sort_unstable(); + let words_fst = fst::Set::from_iter(keys).unwrap().map_data(|v| Cow::Owned(v)).unwrap(); + + TestContext { + words_fst, + word_docids, + word_prefix_docids, + word_pair_proximity_docids, + word_prefix_pair_proximity_docids, + } + } + } +} diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs new file mode 100644 index 000000000..b192902c1 --- 
/dev/null +++ b/milli/src/search/criteria/proximity.rs @@ -0,0 +1,291 @@ +use std::collections::HashMap; +use std::mem::take; + +use roaring::RoaringBitmap; +use log::debug; + +use crate::search::query_tree::{maximum_proximity, Operation, Query}; +use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; + +pub struct Proximity<'t> { + ctx: &'t dyn Context, + query_tree: Option<(usize, Operation)>, + proximity: u8, + candidates: Candidates, + bucket_candidates: RoaringBitmap, + parent: Option>, + candidates_cache: HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>, +} + +impl<'t> Proximity<'t> { + pub fn initial( + ctx: &'t dyn Context, + query_tree: Option, + candidates: Option, + ) -> Self + { + Proximity { + ctx, + query_tree: query_tree.map(|op| (maximum_proximity(&op), op)), + proximity: 0, + candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), + bucket_candidates: RoaringBitmap::new(), + parent: None, + candidates_cache: HashMap::new(), + } + } + + pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { + Proximity { + ctx, + query_tree: None, + proximity: 0, + candidates: Candidates::default(), + bucket_candidates: RoaringBitmap::new(), + parent: Some(parent), + candidates_cache: HashMap::new(), + } + } +} + +impl<'t> Criterion for Proximity<'t> { + fn next(&mut self) -> anyhow::Result> { + use Candidates::{Allowed, Forbidden}; + loop { + debug!("Proximity at iteration {} (max {:?}) ({:?})", + self.proximity, + self.query_tree.as_ref().map(|(mp, _)| mp), + self.candidates, + ); + + match (&mut self.query_tree, &mut self.candidates) { + (_, Allowed(candidates)) if candidates.is_empty() => { + return Ok(Some(CriterionResult { + query_tree: self.query_tree.take().map(|(_, qt)| qt), + candidates: take(&mut self.candidates).into_inner(), + bucket_candidates: take(&mut self.bucket_candidates), + })); + }, + (Some((max_prox, query_tree)), Allowed(candidates)) => { + if self.proximity as usize > *max_prox { + self.query_tree = None; + self.candidates = Candidates::default(); + } else { + let mut new_candidates = resolve_candidates( + self.ctx, + &query_tree, + self.proximity, + &mut self.candidates_cache, + )?; + + new_candidates.intersect_with(&candidates); + candidates.difference_with(&new_candidates); + self.proximity += 1; + + let bucket_candidates = match self.parent { + Some(_) => take(&mut self.bucket_candidates), + None => new_candidates.clone(), + }; + + return Ok(Some(CriterionResult { + query_tree: Some(query_tree.clone()), + candidates: new_candidates, + bucket_candidates, + })); + } + }, + (Some((max_prox, query_tree)), Forbidden(candidates)) => { + if self.proximity as usize > *max_prox { + self.query_tree = None; + self.candidates = Candidates::default(); + } else { + let mut new_candidates = resolve_candidates( + self.ctx, + &query_tree, + self.proximity, + &mut self.candidates_cache, + )?; + + new_candidates.difference_with(&candidates); + candidates.union_with(&new_candidates); + self.proximity += 1; + + let bucket_candidates = match self.parent { + Some(_) => take(&mut self.bucket_candidates), + None => new_candidates.clone(), + }; + + return Ok(Some(CriterionResult { + query_tree: Some(query_tree.clone()), + candidates: new_candidates, + bucket_candidates, + })); + } + }, + (None, Allowed(_)) => { + let candidates = take(&mut self.candidates).into_inner(); + return Ok(Some(CriterionResult { + query_tree: None, + candidates: candidates.clone(), + bucket_candidates: candidates, + })); + }, + 
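+ // Note on the arm below: once this criterion has neither a query tree nor
+ // allowed candidates left, it pulls the next bucket from its parent, resets
+ // the proximity counter to 0 and treats the parent's candidates as the new
+ // `Allowed` set; a parent returning `None` ends the iteration.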
(None, Forbidden(_)) => { + match self.parent.as_mut() { + Some(parent) => { + match parent.next()? { + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + self.query_tree = query_tree.map(|op| (maximum_proximity(&op), op)); + self.proximity = 0; + self.candidates = Candidates::Allowed(candidates); + self.bucket_candidates.union_with(&bucket_candidates); + }, + None => return Ok(None), + } + }, + None => return Ok(None), + } + }, + } + } + } +} + +fn resolve_candidates<'t>( + ctx: &'t dyn Context, + query_tree: &Operation, + proximity: u8, + cache: &mut HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>, +) -> anyhow::Result<RoaringBitmap> +{ + fn resolve_operation<'t>( + ctx: &'t dyn Context, + query_tree: &Operation, + proximity: u8, + cache: &mut HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>, + ) -> anyhow::Result<Vec<(Query, Query, RoaringBitmap)>> + { + use Operation::{And, Consecutive, Or, Query}; + + let result = match query_tree { + And(ops) => mdfs(ctx, ops, proximity, cache)?, + Consecutive(ops) => if proximity == 0 { + mdfs(ctx, ops, 0, cache)? + } else { + Default::default() + }, + Or(_, ops) => { + let mut output = Vec::new(); + for op in ops { + let result = resolve_operation(ctx, op, proximity, cache)?; + output.extend(result); + } + output + }, + Query(q) => if proximity == 0 { + let candidates = query_docids(ctx, q)?; + vec![(q.clone(), q.clone(), candidates)] + } else { + Default::default() + }, + }; + + Ok(result) + } + + fn mdfs_pair<'t>( + ctx: &'t dyn Context, + left: &Operation, + right: &Operation, + proximity: u8, + cache: &mut HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>, + ) -> anyhow::Result<Vec<(Query, Query, RoaringBitmap)>> + { + fn pair_combinations(mana: u8, left_max: u8) -> impl Iterator<Item = (u8, u8)> { + (0..=mana.min(left_max)).map(move |m| (m, mana - m)) + } + + let pair_max_proximity = 7; + + let mut output = Vec::new(); + + for (pair_p, left_right_p) in pair_combinations(proximity, pair_max_proximity) { + for (left_p, right_p) in pair_combinations(left_right_p, left_right_p) { + let left_key = (left.clone(), left_p); + if !cache.contains_key(&left_key) { + let candidates = resolve_operation(ctx, left, left_p, cache)?; + cache.insert(left_key.clone(), candidates); + } + + let right_key = (right.clone(), right_p); + if !cache.contains_key(&right_key) { + let candidates = resolve_operation(ctx, right, right_p, cache)?; + cache.insert(right_key.clone(), candidates); + } + + let lefts = cache.get(&left_key).unwrap(); + let rights = cache.get(&right_key).unwrap(); + + for (ll, lr, lcandidates) in lefts { + for (rl, rr, rcandidates) in rights { + let mut candidates = query_pair_proximity_docids(ctx, lr, rl, pair_p + 1)?; + if lcandidates.len() < rcandidates.len() { + candidates.intersect_with(lcandidates); + candidates.intersect_with(rcandidates); + } else { + candidates.intersect_with(rcandidates); + candidates.intersect_with(lcandidates); + } + if !candidates.is_empty() { + output.push((ll.clone(), rr.clone(), candidates)); + } + } + } + } + } + + Ok(output) + } + + fn mdfs<'t>( + ctx: &'t dyn Context, + branches: &[Operation], + proximity: u8, + cache: &mut HashMap<(Operation, u8), Vec<(Query, Query, RoaringBitmap)>>, + ) -> anyhow::Result<Vec<(Query, Query, RoaringBitmap)>> + { + // Extract the first two elements and the tail, + // where the tail starts just after the first element.
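+ // For example, with branches [a, b, c]: head1 = a, head2 = b and
+ // tail = [b, c], so mdfs_pair scores the (a, b) window while the recursive
+ // mdfs call on the tail scores (b, c), sharing b between the two windows
+ // and splitting the proximity budget across them.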
+ let next = branches.split_first().map(|(h1, t)| { + (h1, t.split_first().map(|(h2, _)| (h2, t))) + }); + + match next { + Some((head1, Some((head2, [_])))) => mdfs_pair(ctx, head1, head2, proximity, cache), + Some((head1, Some((head2, tail)))) => { + let mut output = Vec::new(); + for p in 0..=proximity { + for (lhead, _, head_candidates) in mdfs_pair(ctx, head1, head2, p, cache)? { + if !head_candidates.is_empty() { + for (_, rtail, mut candidates) in mdfs(ctx, tail, proximity - p, cache)? { + candidates.intersect_with(&head_candidates); + if !candidates.is_empty() { + output.push((lhead.clone(), rtail, candidates)); + } + } + } + } + } + Ok(output) + }, + Some((head1, None)) => resolve_operation(ctx, head1, proximity, cache), + None => return Ok(Default::default()), + } + } + + let mut candidates = RoaringBitmap::new(); + for (_, _, cds) in resolve_operation(ctx, query_tree, proximity, cache)? { + candidates.union_with(&cds); + } + Ok(candidates) +} diff --git a/milli/src/search/criteria/typo.rs b/milli/src/search/criteria/typo.rs new file mode 100644 index 000000000..a78ac3339 --- /dev/null +++ b/milli/src/search/criteria/typo.rs @@ -0,0 +1,482 @@ +use std::{borrow::Cow, collections::HashMap, mem::take}; + +use anyhow::bail; +use log::debug; +use roaring::RoaringBitmap; + +use crate::search::query_tree::{maximum_typo, Operation, Query, QueryKind}; +use crate::search::word_derivations; +use super::{Candidates, Criterion, CriterionResult, Context, query_docids, query_pair_proximity_docids}; + +pub struct Typo<'t> { + ctx: &'t dyn Context, + query_tree: Option<(usize, Operation)>, + number_typos: u8, + candidates: Candidates, + bucket_candidates: RoaringBitmap, + parent: Option>, + candidates_cache: HashMap<(Operation, u8), RoaringBitmap>, + typo_cache: HashMap<(String, bool, u8), Vec<(String, u8)>>, +} + +impl<'t> Typo<'t> { + pub fn initial( + ctx: &'t dyn Context, + query_tree: Option, + candidates: Option, + ) -> Self + { + Typo { + ctx, + query_tree: query_tree.map(|op| (maximum_typo(&op), op)), + number_typos: 0, + candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed), + bucket_candidates: RoaringBitmap::new(), + parent: None, + candidates_cache: HashMap::new(), + typo_cache: HashMap::new(), + } + } + + pub fn new(ctx: &'t dyn Context, parent: Box) -> Self { + Typo { + ctx, + query_tree: None, + number_typos: 0, + candidates: Candidates::default(), + bucket_candidates: RoaringBitmap::new(), + parent: Some(parent), + candidates_cache: HashMap::new(), + typo_cache: HashMap::new(), + } + } +} + +impl<'t> Criterion for Typo<'t> { + fn next(&mut self) -> anyhow::Result> { + use Candidates::{Allowed, Forbidden}; + loop { + debug!("Typo at iteration {} ({:?})", self.number_typos, self.candidates); + + match (&mut self.query_tree, &mut self.candidates) { + (_, Allowed(candidates)) if candidates.is_empty() => { + return Ok(Some(CriterionResult { + query_tree: self.query_tree.take().map(|(_, qt)| qt), + candidates: take(&mut self.candidates).into_inner(), + bucket_candidates: take(&mut self.bucket_candidates), + })); + }, + (Some((max_typos, query_tree)), Allowed(candidates)) => { + if self.number_typos as usize > *max_typos { + self.query_tree = None; + self.candidates = Candidates::default(); + } else { + let fst = self.ctx.words_fst(); + let new_query_tree = if self.number_typos < 2 { + alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)? 
+ } else if self.number_typos == 2 { + *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)?; + query_tree.clone() + } else { + query_tree.clone() + }; + + let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos, &mut self.candidates_cache)?; + new_candidates.intersect_with(&candidates); + candidates.difference_with(&new_candidates); + self.number_typos += 1; + + let bucket_candidates = match self.parent { + Some(_) => take(&mut self.bucket_candidates), + None => new_candidates.clone(), + }; + + return Ok(Some(CriterionResult { + query_tree: Some(new_query_tree), + candidates: new_candidates, + bucket_candidates, + })); + } + }, + (Some((max_typos, query_tree)), Forbidden(candidates)) => { + if self.number_typos as usize > *max_typos { + self.query_tree = None; + self.candidates = Candidates::default(); + } else { + let fst = self.ctx.words_fst(); + let new_query_tree = if self.number_typos < 2 { + alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)? + } else if self.number_typos == 2 { + *query_tree = alterate_query_tree(&fst, query_tree.clone(), self.number_typos, &mut self.typo_cache)?; + query_tree.clone() + } else { + query_tree.clone() + }; + + let mut new_candidates = resolve_candidates(self.ctx, &new_query_tree, self.number_typos, &mut self.candidates_cache)?; + new_candidates.difference_with(&candidates); + candidates.union_with(&new_candidates); + self.number_typos += 1; + + let bucket_candidates = match self.parent { + Some(_) => take(&mut self.bucket_candidates), + None => new_candidates.clone(), + }; + + return Ok(Some(CriterionResult { + query_tree: Some(new_query_tree), + candidates: new_candidates, + bucket_candidates, + })); + } + }, + (None, Allowed(_)) => { + let candidates = take(&mut self.candidates).into_inner(); + return Ok(Some(CriterionResult { + query_tree: None, + candidates: candidates.clone(), + bucket_candidates: candidates, + })); + }, + (None, Forbidden(_)) => { + match self.parent.as_mut() { + Some(parent) => { + match parent.next()? { + Some(CriterionResult { query_tree, candidates, bucket_candidates }) => { + self.query_tree = query_tree.map(|op| (maximum_typo(&op), op)); + self.number_typos = 0; + self.candidates = Candidates::Allowed(candidates); + self.bucket_candidates.union_with(&bucket_candidates); + }, + None => return Ok(None), + } + }, + None => return Ok(None), + } + }, + } + } + } +} + +/// Modify the query tree by replacing every tolerant query by an Or operation +/// containing all of the corresponding exact words in the words FST. Each tolerant +/// query will only be replaced by exact queries with at most `number_typos` typos.
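+/// For example (mirroring the tests at the bottom of this file), with one typo
+/// allowed and a words FST containing "world" and "word", the query
+/// `Tolerant { typo: 1, word: "world" }` becomes
+/// `Or([Exact("world") /* 0 typos */, Exact("word") /* 1 typo */])`.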
+fn alterate_query_tree(
+    words_fst: &fst::Set<Cow<[u8]>>,
+    mut query_tree: Operation,
+    number_typos: u8,
+    typo_cache: &mut HashMap<(String, bool, u8), Vec<(String, u8)>>,
+) -> anyhow::Result<Operation>
+{
+    fn recurse(
+        words_fst: &fst::Set<Cow<[u8]>>,
+        operation: &mut Operation,
+        number_typos: u8,
+        typo_cache: &mut HashMap<(String, bool, u8), Vec<(String, u8)>>,
+    ) -> anyhow::Result<()>
+    {
+        use Operation::{And, Consecutive, Or};
+
+        match operation {
+            And(ops) | Consecutive(ops) | Or(_, ops) => {
+                ops.iter_mut().try_for_each(|op| recurse(words_fst, op, number_typos, typo_cache))
+            },
+            Operation::Query(q) => {
+                // TODO may be optimized when number_typos == 0
+                if let QueryKind::Tolerant { typo, word } = &q.kind {
+                    // if no typos are allowed we don't call the word_derivations function,
+                    // and instead directly create an Exact query
+                    if number_typos == 0 {
+                        *operation = Operation::Query(Query {
+                            prefix: q.prefix,
+                            kind: QueryKind::Exact { original_typo: 0, word: word.clone() },
+                        });
+                    } else {
+                        let typo = *typo.min(&number_typos);
+                        let cache_key = (word.clone(), q.prefix, typo);
+                        let words = if let Some(derivations) = typo_cache.get(&cache_key) {
+                            derivations.clone()
+                        } else {
+                            let derivations = word_derivations(word, q.prefix, typo, words_fst)?;
+                            typo_cache.insert(cache_key, derivations.clone());
+                            derivations
+                        };
+
+                        let queries = words.into_iter().map(|(word, typo)| {
+                            Operation::Query(Query {
+                                prefix: false,
+                                kind: QueryKind::Exact { original_typo: typo, word },
+                            })
+                        }).collect();
+
+                        *operation = Operation::or(false, queries);
+                    }
+                }
+
+                Ok(())
+            },
+        }
+    }
+
+    recurse(words_fst, &mut query_tree, number_typos, typo_cache)?;
+    Ok(query_tree)
+}
+
+fn resolve_candidates<'t>(
+    ctx: &'t dyn Context,
+    query_tree: &Operation,
+    number_typos: u8,
+    cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
+) -> anyhow::Result<RoaringBitmap>
+{
+    fn resolve_operation<'t>(
+        ctx: &'t dyn Context,
+        query_tree: &Operation,
+        number_typos: u8,
+        cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
+    ) -> anyhow::Result<RoaringBitmap>
+    {
+        use Operation::{And, Consecutive, Or, Query};
+
+        match query_tree {
+            And(ops) => {
+                mdfs(ctx, ops, number_typos, cache)
+            },
+            Consecutive(ops) => {
+                let mut candidates = RoaringBitmap::new();
+                let mut first_loop = true;
+                for slice in ops.windows(2) {
+                    match (&slice[0], &slice[1]) {
+                        (Operation::Query(left), Operation::Query(right)) => {
+                            match query_pair_proximity_docids(ctx, left, right, 1)? {
+                                pair_docids if pair_docids.is_empty() => {
+                                    return Ok(RoaringBitmap::new())
+                                },
+                                pair_docids if first_loop => {
+                                    candidates = pair_docids;
+                                    first_loop = false;
+                                },
+                                pair_docids => {
+                                    candidates.intersect_with(&pair_docids);
+                                },
+                            }
+                        },
+                        _ => bail!("invalid consecutive query type"),
+                    }
+                }
+                Ok(candidates)
+            },
+            Or(_, ops) => {
+                let mut candidates = RoaringBitmap::new();
+                for op in ops {
+                    let docids = resolve_operation(ctx, op, number_typos, cache)?;
+                    candidates.union_with(&docids);
+                }
+                Ok(candidates)
+            },
+            Query(q) => if q.kind.typo() == number_typos {
+                Ok(query_docids(ctx, q)?)
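+                // a query contributes documents only at exactly its own typo level,
+                // which keeps the buckets returned for successive typo counts disjoint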
+            } else {
+                Ok(RoaringBitmap::new())
+            },
+        }
+    }
+
+    fn mdfs<'t>(
+        ctx: &'t dyn Context,
+        branches: &[Operation],
+        mana: u8,
+        cache: &mut HashMap<(Operation, u8), RoaringBitmap>,
+    ) -> anyhow::Result<RoaringBitmap>
+    {
+        match branches.split_first() {
+            Some((head, [])) => {
+                let cache_key = (head.clone(), mana);
+                if let Some(candidates) = cache.get(&cache_key) {
+                    Ok(candidates.clone())
+                } else {
+                    let candidates = resolve_operation(ctx, head, mana, cache)?;
+                    cache.insert(cache_key, candidates.clone());
+                    Ok(candidates)
+                }
+            },
+            Some((head, tail)) => {
+                let mut candidates = RoaringBitmap::new();
+
+                for m in 0..=mana {
+                    let mut head_candidates = {
+                        let cache_key = (head.clone(), m);
+                        if let Some(candidates) = cache.get(&cache_key) {
+                            candidates.clone()
+                        } else {
+                            let candidates = resolve_operation(ctx, head, m, cache)?;
+                            cache.insert(cache_key, candidates.clone());
+                            candidates
+                        }
+                    };
+                    if !head_candidates.is_empty() {
+                        let tail_candidates = mdfs(ctx, tail, mana - m, cache)?;
+                        head_candidates.intersect_with(&tail_candidates);
+                        candidates.union_with(&head_candidates);
+                    }
+                }
+
+                Ok(candidates)
+            },
+            None => Ok(RoaringBitmap::new()),
+        }
+    }
+
+    resolve_operation(ctx, query_tree, number_typos, cache)
+}
+
+#[cfg(test)]
+mod test {
+
+    use super::*;
+    use super::super::test::TestContext;
+
+    #[test]
+    fn initial_placeholder_no_facets() {
+        let context = TestContext::default();
+        let query_tree = None;
+        let facet_candidates = None;
+
+        let mut criteria = Typo::initial(&context, query_tree, facet_candidates);
+
+        assert!(criteria.next().unwrap().is_none());
+    }
+
+    #[test]
+    fn initial_query_tree_no_facets() {
+        let context = TestContext::default();
+        let query_tree = Operation::Or(false, vec![
+            Operation::And(vec![
+                Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }),
+                Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }),
+                Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "world".to_string()) }),
+            ])
+        ]);
+
+        let facet_candidates = None;
+
+        let mut criteria = Typo::initial(&context, Some(query_tree), facet_candidates);
+
+        let candidates_1 = context.word_docids("split").unwrap().unwrap()
+            & context.word_docids("this").unwrap().unwrap()
+            & context.word_docids("world").unwrap().unwrap();
+        let expected_1 = CriterionResult {
+            query_tree: Some(Operation::Or(false, vec![
+                Operation::And(vec![
+                    Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }),
+                    Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }),
+                    Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }),
+                ]),
+            ])),
+            candidates: candidates_1.clone(),
+            bucket_candidates: candidates_1,
+        };
+
+        assert_eq!(criteria.next().unwrap(), Some(expected_1));
+
+        let candidates_2 = (
+            context.word_docids("split").unwrap().unwrap()
+            & context.word_docids("this").unwrap().unwrap()
+            & context.word_docids("word").unwrap().unwrap()
+        ) - context.word_docids("world").unwrap().unwrap();
+        let expected_2 = CriterionResult {
+            query_tree: Some(Operation::Or(false, vec![
+                Operation::And(vec![
+                    Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }),
+                    Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }),
+                    Operation::Or(false, vec![
+                        Operation::Query(Query { prefix: false, kind: QueryKind::exact_with_typo(1, "word".to_string()) }),
+                        Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }),
QueryKind::exact("world".to_string()) }), + ]), + ]), + ])), + candidates: candidates_2.clone(), + bucket_candidates: candidates_2, + }; + + assert_eq!(criteria.next().unwrap(), Some(expected_2)); + } + + #[test] + fn initial_placeholder_with_facets() { + let context = TestContext::default(); + let query_tree = None; + let facet_candidates = context.word_docids("earth").unwrap().unwrap(); + + let mut criteria = Typo::initial(&context, query_tree, Some(facet_candidates.clone())); + + let expected = CriterionResult { + query_tree: None, + candidates: facet_candidates.clone(), + bucket_candidates: facet_candidates, + }; + + // first iteration, returns the facet candidates + assert_eq!(criteria.next().unwrap(), Some(expected)); + + // second iteration, returns None because there is no more things to do + assert!(criteria.next().unwrap().is_none()); + } + + #[test] + fn initial_query_tree_with_facets() { + let context = TestContext::default(); + let query_tree = Operation::Or(false, vec![ + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::tolerant(1, "world".to_string()) }), + ]) + ]); + + let facet_candidates = context.word_docids("earth").unwrap().unwrap(); + + let mut criteria = Typo::initial(&context, Some(query_tree), Some(facet_candidates.clone())); + + let candidates_1 = context.word_docids("split").unwrap().unwrap() + & context.word_docids("this").unwrap().unwrap() + & context.word_docids("world").unwrap().unwrap(); + let expected_1 = CriterionResult { + query_tree: Some(Operation::Or(false, vec![ + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }), + ]), + ])), + candidates: &candidates_1 & &facet_candidates, + bucket_candidates: candidates_1 & &facet_candidates, + }; + + assert_eq!(criteria.next().unwrap(), Some(expected_1)); + + let candidates_2 = ( + context.word_docids("split").unwrap().unwrap() + & context.word_docids("this").unwrap().unwrap() + & context.word_docids("word").unwrap().unwrap() + ) - context.word_docids("world").unwrap().unwrap(); + let expected_2 = CriterionResult { + query_tree: Some(Operation::Or(false, vec![ + Operation::And(vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact("split".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("this".to_string()) }), + Operation::Or(false, vec![ + Operation::Query(Query { prefix: false, kind: QueryKind::exact_with_typo(1, "word".to_string()) }), + Operation::Query(Query { prefix: false, kind: QueryKind::exact("world".to_string()) }), + ]), + ]), + ])), + candidates: &candidates_2 & &facet_candidates, + bucket_candidates: candidates_2 & &facet_candidates, + }; + + assert_eq!(criteria.next().unwrap(), Some(expected_2)); + } + +} diff --git a/milli/src/search/criteria/words.rs b/milli/src/search/criteria/words.rs new file mode 100644 index 000000000..1827cd1ed --- /dev/null +++ b/milli/src/search/criteria/words.rs @@ -0,0 +1,128 @@ +use std::collections::HashMap; +use std::mem::take; + +use log::debug; +use roaring::RoaringBitmap; + +use crate::search::query_tree::Operation; +use super::{resolve_query_tree, Candidates, Criterion, 
+
+pub struct Words<'t> {
+    ctx: &'t dyn Context,
+    query_trees: Vec<Operation>,
+    candidates: Candidates,
+    bucket_candidates: RoaringBitmap,
+    parent: Option<Box<dyn Criterion + 't>>,
+    candidates_cache: HashMap<(Operation, u8), RoaringBitmap>,
+}
+
+impl<'t> Words<'t> {
+    pub fn initial(
+        ctx: &'t dyn Context,
+        query_tree: Option<Operation>,
+        candidates: Option<RoaringBitmap>,
+    ) -> Self
+    {
+        Words {
+            ctx,
+            query_trees: query_tree.map(explode_query_tree).unwrap_or_default(),
+            candidates: candidates.map_or_else(Candidates::default, Candidates::Allowed),
+            bucket_candidates: RoaringBitmap::new(),
+            parent: None,
+            candidates_cache: HashMap::default(),
+        }
+    }
+
+    pub fn new(ctx: &'t dyn Context, parent: Box<dyn Criterion + 't>) -> Self {
+        Words {
+            ctx,
+            query_trees: Vec::default(),
+            candidates: Candidates::default(),
+            bucket_candidates: RoaringBitmap::new(),
+            parent: Some(parent),
+            candidates_cache: HashMap::default(),
+        }
+    }
+}
+
+impl<'t> Criterion for Words<'t> {
+    fn next(&mut self) -> anyhow::Result<Option<CriterionResult>> {
+        use Candidates::{Allowed, Forbidden};
+        loop {
+            debug!("Words at iteration {} ({:?})", self.query_trees.len(), self.candidates);
+
+            match (self.query_trees.pop(), &mut self.candidates) {
+                (query_tree, Allowed(candidates)) if candidates.is_empty() => {
+                    self.query_trees = Vec::new();
+                    return Ok(Some(CriterionResult {
+                        query_tree,
+                        candidates: take(&mut self.candidates).into_inner(),
+                        bucket_candidates: take(&mut self.bucket_candidates),
+                    }));
+                },
+                (Some(qt), Allowed(candidates)) => {
+                    let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache)?;
+                    found_candidates.intersect_with(&candidates);
+                    candidates.difference_with(&found_candidates);
+
+                    let bucket_candidates = match self.parent {
+                        Some(_) => take(&mut self.bucket_candidates),
+                        None => found_candidates.clone(),
+                    };
+
+                    return Ok(Some(CriterionResult {
+                        query_tree: Some(qt),
+                        candidates: found_candidates,
+                        bucket_candidates,
+                    }));
+                },
+                (Some(qt), Forbidden(candidates)) => {
+                    let mut found_candidates = resolve_query_tree(self.ctx, &qt, &mut self.candidates_cache)?;
+                    found_candidates.difference_with(&candidates);
+                    candidates.union_with(&found_candidates);
+
+                    let bucket_candidates = match self.parent {
+                        Some(_) => take(&mut self.bucket_candidates),
+                        None => found_candidates.clone(),
+                    };
+
+                    return Ok(Some(CriterionResult {
+                        query_tree: Some(qt),
+                        candidates: found_candidates,
+                        bucket_candidates,
+                    }));
+                },
+                (None, Allowed(_)) => {
+                    let candidates = take(&mut self.candidates).into_inner();
+                    return Ok(Some(CriterionResult {
+                        query_tree: None,
+                        candidates: candidates.clone(),
+                        bucket_candidates: candidates,
+                    }));
+                },
+                (None, Forbidden(_)) => {
+                    match self.parent.as_mut() {
+                        Some(parent) => {
+                            match parent.next()? {
+                                Some(CriterionResult { query_tree, candidates, bucket_candidates }) => {
+                                    self.query_trees = query_tree.map(explode_query_tree).unwrap_or_default();
+                                    self.candidates = Candidates::Allowed(candidates);
+                                    self.bucket_candidates.union_with(&bucket_candidates);
+                                },
+                                None => return Ok(None),
+                            }
+                        },
+                        None => return Ok(None),
+                    }
+                },
+            }
+        }
+    }
+}
+
+fn explode_query_tree(query_tree: Operation) -> Vec<Operation> {
+    match query_tree {
+        Operation::Or(true, ops) => ops,
+        otherwise => vec![otherwise],
+    }
+}
diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs
index e5672982e..8570cefaa 100644
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@@ -1,27 +1,21 @@
 use std::borrow::Cow;
-use std::collections::{HashMap, HashSet};
 use std::fmt;
 use std::time::Instant;
 
-use anyhow::{bail, Context};
 use fst::{IntoStreamer, Streamer, Set};
-use levenshtein_automata::DFA;
-use levenshtein_automata::LevenshteinAutomatonBuilder as LevBuilder;
+use levenshtein_automata::{DFA, LevenshteinAutomatonBuilder as LevBuilder};
 use log::debug;
 use meilisearch_tokenizer::{AnalyzerConfig, Analyzer};
 use once_cell::sync::Lazy;
-use ordered_float::OrderedFloat;
 use roaring::bitmap::RoaringBitmap;
 
-use crate::facet::FacetType;
-use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetLevelValueI64Codec};
-use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetI64Codec};
-use crate::mdfs::Mdfs;
-use crate::query_tokens::{query_tokens, QueryToken};
-use crate::{Index, FieldId, DocumentId, Criterion};
+use crate::search::criteria::{Criterion, CriterionResult};
+use crate::{Index, DocumentId};
+pub use self::facet::FacetIter;
 pub use self::facet::{FacetCondition, FacetDistribution, FacetNumberOperator, FacetStringOperator};
-pub use self::facet::{FacetIter};
+pub use self::query_tree::MatchingWords;
 use self::query_tree::QueryTreeBuilder;
 
 // Building these factories is not free.
 static LEVDIST0: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(0, true));
@@ -30,6 +24,7 @@ static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true));
 
 mod facet;
 mod query_tree;
+mod criteria;
 
 pub struct Search<'a> {
     query: Option<String>,
@@ -65,208 +60,23 @@ impl<'a> Search<'a> {
         self
     }
 
-    /// Extracts the query words from the query string and returns the DFAs accordingly.
-    /// TODO introduce settings for the number of typos regarding the words lengths.
-    fn generate_query_dfas(query: &str) -> Vec<(String, bool, DFA)> {
-        let (lev0, lev1, lev2) = (&LEVDIST0, &LEVDIST1, &LEVDIST2);
-
-        let stop_words = Set::default();
-        let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(&stop_words));
-        let analyzed = analyzer.analyze(query);
-        let tokens = analyzed.tokens();
-        let words: Vec<_> = query_tokens(tokens).collect();
-
-        let ends_with_whitespace = query.chars().last().map_or(false, char::is_whitespace);
-        let number_of_words = words.len();
-
-        words.into_iter().enumerate().map(|(i, word)| {
-            let (word, quoted) = match word {
-                QueryToken::Free(token) => (token.text().to_string(), token.text().len() <= 3),
-                QueryToken::Quoted(token) => (token.text().to_string(), true),
-            };
-            let is_last = i + 1 == number_of_words;
-            let is_prefix = is_last && !ends_with_whitespace && !quoted;
-            let lev = match word.len() {
-                0..=4 => if quoted { lev0 } else { lev0 },
-                5..=8 => if quoted { lev0 } else { lev1 },
-                _ => if quoted { lev0 } else { lev2 },
-            };
-
-            let dfa = if is_prefix {
-                lev.build_prefix_dfa(&word)
-            } else {
-                lev.build_dfa(&word)
-            };
-
-            (word, is_prefix, dfa)
-        })
-        .collect()
-    }
-
-    /// Fetch the words from the given FST related to the given DFAs along with
-    /// the associated documents ids.
-    fn fetch_words_docids(
-        &self,
-        fst: &fst::Set<Cow<[u8]>>,
-        dfas: Vec<(String, bool, DFA)>,
-    ) -> anyhow::Result<Vec<(HashMap<String, (u8, RoaringBitmap)>, RoaringBitmap)>>
-    {
-        // A Vec storing all the derived words from the original query words, associated
-        // with the distance from the original word and the docids where the words appears.
-        let mut derived_words = Vec::<(HashMap::<String, (u8, RoaringBitmap)>, RoaringBitmap)>::with_capacity(dfas.len());
-
-        for (_word, _is_prefix, dfa) in dfas {
-
-            let mut acc_derived_words = HashMap::new();
-            let mut unions_docids = RoaringBitmap::new();
-            let mut stream = fst.search_with_state(&dfa).into_stream();
-            while let Some((word, state)) = stream.next() {
-
-                let word = std::str::from_utf8(word)?;
-                let docids = self.index.word_docids.get(self.rtxn, word)?.unwrap();
-                let distance = dfa.distance(state);
-                unions_docids.union_with(&docids);
-                acc_derived_words.insert(word.to_string(), (distance.to_u8(), docids));
-            }
-            derived_words.push((acc_derived_words, unions_docids));
-        }
-
-        Ok(derived_words)
-    }
-
-    /// Returns the set of docids that contains all of the query words.
-    fn compute_candidates(
-        derived_words: &[(HashMap<String, (u8, RoaringBitmap)>, RoaringBitmap)],
-    ) -> RoaringBitmap
-    {
-        // We sort the derived words by inverse popularity, this way intersections are faster.
-        let mut derived_words: Vec<_> = derived_words.iter().collect();
-        derived_words.sort_unstable_by_key(|(_, docids)| docids.len());
-
-        // we do a union between all the docids of each of the derived words,
-        // we got N unions (the number of original query words), we then intersect them.
-        let mut candidates = RoaringBitmap::new();
-
-        for (i, (_, union_docids)) in derived_words.iter().enumerate() {
-            if i == 0 {
-                candidates = union_docids.clone();
-            } else {
-                candidates.intersect_with(&union_docids);
-            }
-        }
-
-        candidates
-    }
-
-    fn facet_ordered(
-        &self,
-        field_id: FieldId,
-        facet_type: FacetType,
-        ascending: bool,
-        mut documents_ids: RoaringBitmap,
-        limit: usize,
-    ) -> anyhow::Result<Vec<DocumentId>>
-    {
-        let mut output: Vec<_> = match facet_type {
-            FacetType::Float => {
-                if documents_ids.len() <= 1000 {
-                    let db = self.index.field_id_docid_facet_values.remap_key_type::<FieldDocIdFacetF64Codec>();
-                    let mut docids_values = Vec::with_capacity(documents_ids.len() as usize);
-                    for docid in documents_ids.iter() {
-                        let left = (field_id, docid, f64::MIN);
-                        let right = (field_id, docid, f64::MAX);
-                        let mut iter = db.range(self.rtxn, &(left..=right))?;
-                        let entry = if ascending { iter.next() } else { iter.last() };
-                        if let Some(((_, _, value), ())) = entry.transpose()? {
-                            docids_values.push((docid, OrderedFloat(value)));
-                        }
-                    }
-                    docids_values.sort_unstable_by_key(|(_, value)| *value);
-                    let iter = docids_values.into_iter().map(|(id, _)| id);
-                    if ascending {
-                        iter.take(limit).collect()
-                    } else {
-                        iter.rev().take(limit).collect()
-                    }
-                } else {
-                    let facet_fn = if ascending {
-                        FacetIter::<f64, FacetLevelValueF64Codec>::new_reducing
-                    } else {
-                        FacetIter::<f64, FacetLevelValueF64Codec>::new_reverse_reducing
-                    };
-                    let mut limit_tmp = limit;
-                    let mut output = Vec::new();
-                    for result in facet_fn(self.rtxn, self.index, field_id, documents_ids.clone())? {
-                        let (_val, docids) = result?;
-                        limit_tmp = limit_tmp.saturating_sub(docids.len() as usize);
-                        output.push(docids);
-                        if limit_tmp == 0 { break }
-                    }
-                    output.into_iter().flatten().take(limit).collect()
-                }
-            },
-            FacetType::Integer => {
-                if documents_ids.len() <= 1000 {
-                    let db = self.index.field_id_docid_facet_values.remap_key_type::<FieldDocIdFacetI64Codec>();
-                    let mut docids_values = Vec::with_capacity(documents_ids.len() as usize);
-                    for docid in documents_ids.iter() {
-                        let left = (field_id, docid, i64::MIN);
-                        let right = (field_id, docid, i64::MAX);
-                        let mut iter = db.range(self.rtxn, &(left..=right))?;
-                        let entry = if ascending { iter.next() } else { iter.last() };
-                        if let Some(((_, _, value), ())) = entry.transpose()? {
-                            docids_values.push((docid, value));
-                        }
-                    }
-                    docids_values.sort_unstable_by_key(|(_, value)| *value);
-                    let iter = docids_values.into_iter().map(|(id, _)| id);
-                    if ascending {
-                        iter.take(limit).collect()
-                    } else {
-                        iter.rev().take(limit).collect()
-                    }
-                } else {
-                    let facet_fn = if ascending {
-                        FacetIter::<i64, FacetLevelValueI64Codec>::new_reducing
-                    } else {
-                        FacetIter::<i64, FacetLevelValueI64Codec>::new_reverse_reducing
-                    };
-                    let mut limit_tmp = limit;
-                    let mut output = Vec::new();
-                    for result in facet_fn(self.rtxn, self.index, field_id, documents_ids.clone())? {
-                        let (_val, docids) = result?;
-                        limit_tmp = limit_tmp.saturating_sub(docids.len() as usize);
-                        output.push(docids);
-                        if limit_tmp == 0 { break }
-                    }
-                    output.into_iter().flatten().take(limit).collect()
-                }
-            },
-            FacetType::String => bail!("criteria facet type must be a number"),
-        };
-
-        // if there isn't enough documents to return we try to complete that list
-        // with documents that are maybe not faceted under this field and therefore
-        // not returned by the previous facet iteration.
-        if output.len() < limit {
-            output.iter().for_each(|n| { documents_ids.remove(*n); });
-            let remaining = documents_ids.iter().take(limit - output.len());
-            output.extend(remaining);
-        }
-
-        Ok(output)
-    }
-
     pub fn execute(&self) -> anyhow::Result<SearchResult> {
-        let limit = self.limit;
-        let fst = self.index.words_fst(self.rtxn)?;
-
-        // Construct the DFAs related to the query words.
-        let derived_words = match self.query.as_deref().map(Self::generate_query_dfas) {
-            Some(dfas) if !dfas.is_empty() => Some(self.fetch_words_docids(&fst, dfas)?),
-            _otherwise => None,
+        // We create the query tree by splitting the query into tokens.
+        let before = Instant::now();
+        let query_tree = match self.query.as_ref() {
+            Some(query) => {
+                let builder = QueryTreeBuilder::new(self.rtxn, self.index);
+                let stop_words = &Set::default();
+                let analyzer = Analyzer::new(AnalyzerConfig::default_with_stopwords(stop_words));
+                let result = analyzer.analyze(query);
+                let tokens = result.tokens();
+                builder.build(tokens)?
+            },
+            None => None,
         };
 
+        debug!("query tree: {:?} took {:.02?}", query_tree, before.elapsed());
+
         // We create the original candidates with the facet conditions results.
         let before = Instant::now();
         let facet_candidates = match &self.facet_condition {
@@ -276,100 +86,42 @@
 
         debug!("facet candidates: {:?} took {:.02?}", facet_candidates, before.elapsed());
 
-        let order_by_facet = {
-            let criteria = self.index.criteria(self.rtxn)?;
-            let result = criteria.into_iter().flat_map(|criterion| {
-                match criterion {
-                    Criterion::Asc(fid) => Some((fid, true)),
-                    Criterion::Desc(fid) => Some((fid, false)),
-                    _ => None
-                }
-            }).next();
-            match result {
-                Some((attr_name, is_ascending)) => {
-                    let field_id_map = self.index.fields_ids_map(self.rtxn)?;
-                    let fid = field_id_map.id(&attr_name).with_context(|| format!("unknown field: {:?}", attr_name))?;
-                    let faceted_fields = self.index.faceted_fields_ids(self.rtxn)?;
-                    let ftype = *faceted_fields.get(&fid)
-                        .with_context(|| format!("{:?} not found in the faceted fields.", attr_name))
-                        .expect("corrupted data: ");
-                    Some((fid, ftype, is_ascending))
-                },
-                None => None,
-            }
+        let matching_words = match query_tree.as_ref() {
+            Some(query_tree) => MatchingWords::from_query_tree(&query_tree),
+            None => MatchingWords::default(),
         };
 
-        let before = Instant::now();
-        let (candidates, derived_words) = match (facet_candidates, derived_words) {
-            (Some(mut facet_candidates), Some(derived_words)) => {
-                let words_candidates = Self::compute_candidates(&derived_words);
-                facet_candidates.intersect_with(&words_candidates);
-                (facet_candidates, derived_words)
-            },
-            (None, Some(derived_words)) => {
-                (Self::compute_candidates(&derived_words), derived_words)
-            },
-            (Some(facet_candidates), None) => {
-                // If the query is not set or results in no DFAs but
-                // there is some facet conditions we return a placeholder.
-                let documents_ids = match order_by_facet {
-                    Some((fid, ftype, is_ascending)) => {
-                        self.facet_ordered(fid, ftype, is_ascending, facet_candidates.clone(), limit)?
-                    },
-                    None => facet_candidates.iter().take(limit).collect(),
-                };
-                return Ok(SearchResult {
-                    documents_ids,
-                    candidates: facet_candidates,
-                    ..Default::default()
-                })
-            },
-            (None, None) => {
-                // If the query is not set or results in no DFAs we return a placeholder.
-                let all_docids = self.index.documents_ids(self.rtxn)?;
-                let documents_ids = match order_by_facet {
-                    Some((fid, ftype, is_ascending)) => {
-                        self.facet_ordered(fid, ftype, is_ascending, all_docids.clone(), limit)?
-                    },
-                    None => all_docids.iter().take(limit).collect(),
-                };
-                return Ok(SearchResult { documents_ids, candidates: all_docids, ..Default::default() })
-            },
-        };
+        let criteria_builder = criteria::CriteriaBuilder::new(self.rtxn, self.index)?;
+        let mut criteria = criteria_builder.build(query_tree, facet_candidates)?;
 
-        debug!("candidates: {:?} took {:.02?}", candidates, before.elapsed());
+        let mut offset = self.offset;
+        let mut limit = self.limit;
+        let mut documents_ids = Vec::new();
+        let mut initial_candidates = RoaringBitmap::new();
+        while let Some(CriterionResult { candidates, bucket_candidates, .. }) = criteria.next()? {
 
-        // The mana depth first search is a revised DFS that explore
-        // solutions in the order of their proximities.
-        let mut mdfs = Mdfs::new(self.index, self.rtxn, &derived_words, candidates.clone());
-        let mut documents = Vec::new();
+            debug!("Number of candidates found {}", candidates.len());
 
-        // We execute the Mdfs iterator until we find enough documents.
-        while documents.iter().map(RoaringBitmap::len).sum::<u64>() < limit as u64 {
-            match mdfs.next().transpose()? {
-                Some((proximity, answer)) => {
-                    debug!("answer with a proximity of {}: {:?}", proximity, answer);
-                    documents.push(answer);
-                },
-                None => break,
+            let mut len = candidates.len() as usize;
+            let mut candidates = candidates.into_iter();
+
+            initial_candidates.union_with(&bucket_candidates);
+
+            if offset != 0 {
+                let skipped = len.min(offset);
+                candidates.by_ref().take(skipped).for_each(drop);
+                offset = offset.saturating_sub(skipped);
+                len = len.saturating_sub(skipped);
             }
+
+            if len != 0 {
+                documents_ids.extend(candidates.take(limit));
+                limit = limit.saturating_sub(len.min(limit));
+            }
+
+            if limit == 0 { break }
         }
 
-        let found_words = derived_words.into_iter().flat_map(|(w, _)| w).map(|(w, _)| w).collect();
-        let documents_ids = match order_by_facet {
-            Some((fid, ftype, order)) => {
-                let mut ordered_documents = Vec::new();
-                for documents_ids in documents {
-                    let docids = self.facet_ordered(fid, ftype, order, documents_ids, limit)?;
-                    ordered_documents.push(docids);
-                    if ordered_documents.iter().map(Vec::len).sum::<usize>() >= limit { break }
-                }
-                ordered_documents.into_iter().flatten().take(limit).collect()
-            },
-            None => documents.into_iter().flatten().take(limit).collect(),
-        };
-
-        Ok(SearchResult { found_words, candidates, documents_ids })
+        Ok(SearchResult { matching_words, candidates: initial_candidates, documents_ids })
     }
 }
 
@@ -387,28 +139,21 @@ impl fmt::Debug for Search<'_> {
 
 #[derive(Default)]
 pub struct SearchResult {
-    pub found_words: HashSet<String>,
+    pub matching_words: MatchingWords,
     pub candidates: RoaringBitmap,
     // TODO those documents ids should be associated with their criteria scores.
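+    // (they could become, e.g., a Vec<(DocumentId, Score)> so that callers can merge
+    // buckets coming from several criteria; `Score` here is hypothetical)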
     pub documents_ids: Vec<DocumentId>,
 }
 
-pub fn word_typos(word: &str, is_prefix: bool, max_typo: u8, fst: &fst::Set<Cow<[u8]>>) -> anyhow::Result<Vec<(String, u8)>> {
-    let dfa = {
-        let lev = match max_typo {
-            0 => &LEVDIST0,
-            1 => &LEVDIST1,
-            _ => &LEVDIST2,
-        };
-
-        if is_prefix {
-            lev.build_prefix_dfa(&word)
-        } else {
-            lev.build_dfa(&word)
-        }
-    };
-
+pub fn word_derivations(
+    word: &str,
+    is_prefix: bool,
+    max_typo: u8,
+    fst: &fst::Set<Cow<[u8]>>,
+) -> anyhow::Result<Vec<(String, u8)>>
+{
     let mut derived_words = Vec::new();
+    let dfa = build_dfa(word, max_typo, is_prefix);
     let mut stream = fst.search_with_state(&dfa).into_stream();
 
     while let Some((word, state)) = stream.next() {
@@ -419,3 +164,17 @@
 }
+
+fn build_dfa(word: &str, typos: u8, is_prefix: bool) -> DFA {
+    let lev = match typos {
+        0 => &LEVDIST0,
+        1 => &LEVDIST1,
+        _ => &LEVDIST2,
+    };
+
+    if is_prefix {
+        lev.build_prefix_dfa(word)
+    } else {
+        lev.build_dfa(word)
+    }
+}
diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index 00905db2e..114032eb8 100644
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -1,14 +1,13 @@
-#![allow(unused)]
-
-use std::borrow::Cow;
-use std::collections::BTreeMap;
+use std::collections::HashSet;
 use std::{fmt, cmp, mem};
 
+use levenshtein_automata::{DFA, Distance};
 use meilisearch_tokenizer::{TokenKind, tokenizer::TokenStream};
 use roaring::RoaringBitmap;
 use slice_group_by::GroupBy;
 
 use crate::Index;
+use super::build_dfa;
 
 type IsOptionalWord = bool;
 type IsPrefix = bool;
@@ -81,6 +80,13 @@ impl Operation {
             Self::Consecutive(ops)
         }
     }
+
+    pub fn query(&self) -> Option<&Query> {
+        match self {
+            Operation::Query(query) => Some(query),
+            _ => None,
+        }
+    }
 }
 
 #[derive(Clone, Eq, PartialEq, Hash)]
@@ -96,14 +102,26 @@ pub enum QueryKind {
 }
 
 impl QueryKind {
-    fn exact(word: String) -> Self {
+    pub fn exact(word: String) -> Self {
         QueryKind::Exact { original_typo: 0, word }
     }
 
-    fn tolerant(typo: u8, word: String) -> Self {
+    pub fn exact_with_typo(original_typo: u8, word: String) -> Self {
+        QueryKind::Exact { original_typo, word }
+    }
+
+    pub fn tolerant(typo: u8, word: String) -> Self {
         QueryKind::Tolerant { typo, word }
     }
 
+    pub fn is_tolerant(&self) -> bool {
+        matches!(self, QueryKind::Tolerant { .. })
+    }
+
+    pub fn is_exact(&self) -> bool {
+        matches!(self, QueryKind::Exact { .. })
+    }
+
     pub fn typo(&self) -> u8 {
         match self {
             QueryKind::Tolerant { typo, .. } => *typo,
@@ -266,69 +284,45 @@ fn synonyms(ctx: &impl Context, word: &[&str]) -> heed::Result<Option<Vec<Vec<String>>>>
 
+#[derive(Default)]
 pub struct MatchingWords {
-    inner: BTreeMap<String, IsPrefix>,
+    dfas: Vec<(DFA, u8)>,
 }
 
 impl MatchingWords {
     /// List all words which can be considered as a match for the query tree.
-    pub fn from_query_tree(tree: &Operation, fst: &fst::Set<Cow<[u8]>>) -> Self {
-        Self { inner: fetch_words(tree, fst).into_iter().collect() }
+    pub fn from_query_tree(tree: &Operation) -> Self {
+        Self {
+            dfas: fetch_queries(tree).into_iter().map(|(w, t, p)| (build_dfa(w, t, p), t)).collect()
+        }
     }
 
     /// Returns true if the word matches.
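+    /// A word matches when at least one of the query DFAs accepts it within the
+    /// typo budget recorded for that query, e.g. `wordl` for a 1-typo-tolerant `world`.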
-    pub fn is_match(&self, word: &str) -> bool {
-        fn first_char(s: &str) -> Option<&str> {
-            s.chars().next().map(|c| &s[..c.len_utf8()])
-        }
-
-        match first_char(word) {
-            Some(first) => {
-                let left = first.to_owned();
-                let right = word.to_owned();
-                self.inner.range(left..=right).any(|(w, is_prefix)| *is_prefix || *w == word)
-            },
-            None => false
-        }
+    pub fn matches(&self, word: &str) -> bool {
+        self.dfas.iter().any(|(dfa, typo)| match dfa.eval(word) {
+            Distance::Exact(t) => t <= *typo,
+            Distance::AtLeast(_) => false,
+        })
     }
 }
 
-type FetchedWords = Vec<(String, IsPrefix)>;
-
 /// Lists all words which can be considered as a match for the query tree.
-fn fetch_words(tree: &Operation, fst: &fst::Set<Cow<[u8]>>) -> FetchedWords {
-    fn resolve_branch(tree: &[Operation], fst: &fst::Set<Cow<[u8]>>) -> FetchedWords {
-        tree.iter().map(|op| resolve_ops(op, fst)).flatten().collect()
-    }
-
-    fn resolve_query(query: &Query, fst: &fst::Set<Cow<[u8]>>) -> FetchedWords {
-        match query.kind.clone() {
-            QueryKind::Exact { word, .. } => vec![(word, query.prefix)],
-            QueryKind::Tolerant { typo, word } => {
-                if let Ok(words) = super::word_typos(&word, query.prefix, typo, fst) {
-                    words.into_iter().map(|(w, _)| (w, query.prefix)).collect()
-                } else {
-                    vec![(word, query.prefix)]
-                }
-            }
-        }
-    }
-
-    fn resolve_ops(tree: &Operation, fst: &fst::Set<Cow<[u8]>>) -> FetchedWords {
+fn fetch_queries(tree: &Operation) -> HashSet<(&str, u8, IsPrefix)> {
+    fn resolve_ops<'a>(tree: &'a Operation, out: &mut HashSet<(&'a str, u8, IsPrefix)>) {
         match tree {
             Operation::Or(_, ops) | Operation::And(ops) | Operation::Consecutive(ops) => {
-                resolve_branch(ops.as_slice(), fst)
+                ops.as_slice().iter().for_each(|op| resolve_ops(op, out));
            },
-            Operation::Query(ops) => {
-                resolve_query(ops, fst)
+            Operation::Query(Query { prefix, kind }) => {
+                let typo = if kind.is_exact() { 0 } else { kind.typo() };
+                out.insert((kind.word(), typo, *prefix));
            },
        }
    }
 
-    let mut words = resolve_ops(tree, fst);
-    words.sort_unstable();
-    words.dedup();
-    words
+    let mut queries = HashSet::new();
+    resolve_ops(tree, &mut queries);
+    queries
 }
 
 /// Main function that creates the final query tree from the primitive query.
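Since highlighting now goes through these DFAs, it accepts exactly the same derivations as the search itself, instead of requiring every derived word to be materialized in a `BTreeMap` as `is_match` did. Below is a minimal, self-contained sketch of the acceptance rule that `matches` applies, assuming only the `levenshtein_automata` crate; `LevBuilder::new(1, true)` mirrors the `LEVDIST1` factory, and the candidate words are illustrative:

```rust
use levenshtein_automata::{Distance, LevenshteinAutomatonBuilder as LevBuilder};

fn main() {
    // one typo allowed; `true` makes a transposition count as a single typo,
    // like the LEVDIST1 factory used by build_dfa
    let lev1 = LevBuilder::new(1, true);
    let dfa = lev1.build_dfa("world");

    for candidate in &["world", "wordl", "word", "wrodl"] {
        let accepted = match dfa.eval(candidate) {
            // accept when the measured distance fits in the typo budget
            Distance::Exact(typos) => typos <= 1,
            Distance::AtLeast(_) => false,
        };
        // "world", "wordl" and "word" are accepted; "wrodl" (two typos) is not
        println!("{} -> {}", candidate, accepted);
    }
}
```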
@@ -537,7 +531,10 @@ pub fn maximum_proximity(operation: &Operation) -> usize {
     use Operation::{Or, And, Query, Consecutive};
     match operation {
         Or(_, ops) => ops.iter().map(maximum_proximity).max().unwrap_or(0),
-        And(ops) => ops.len().saturating_sub(1) * 8,
+        And(ops) => {
+            ops.iter().map(maximum_proximity).sum::<usize>()
+                + ops.len().saturating_sub(1) * 7
+        },
         Query(_) | Consecutive(_) => 0,
     }
 }
@@ -547,7 +544,7 @@ mod test {
     use std::collections::HashMap;
 
     use fst::Set;
-    use maplit::hashmap;
+    use maplit::{hashmap, hashset};
     use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
     use rand::{Rng, SeedableRng, rngs::StdRng};
@@ -958,26 +955,26 @@ mod test {
         let context = TestContext::default();
         let query_tree = context.build(false, true, tokens).unwrap().unwrap();
 
-        let expected = vec![
-            ("city".to_string(), false),
-            ("earth".to_string(), false),
-            ("nature".to_string(), false),
-            ("new".to_string(), false),
-            ("nyc".to_string(), false),
-            ("split".to_string(), false),
-            ("word".to_string(), false),
-            ("word".to_string(), true),
-            ("world".to_string(), true),
-            ("york".to_string(), false),
-
-        ];
+        let expected = hashset!{
+            ("word", 0, false),
+            ("nyc", 0, false),
+            ("wordsplit", 2, false),
+            ("wordsplitnycworld", 2, true),
+            ("nature", 0, false),
+            ("new", 0, false),
+            ("city", 0, false),
+            ("world", 1, true),
+            ("york", 0, false),
+            ("split", 0, false),
+            ("nycworld", 1, true),
+            ("earth", 0, false),
+            ("wordsplitnyc", 2, false),
+        };
 
         let mut keys = context.postings.keys().collect::<Vec<_>>();
         keys.sort_unstable();
-        let set = fst::Set::from_iter(keys).unwrap().map_data(|v| Cow::Owned(v)).unwrap();
-
-        let words = fetch_words(&query_tree, &set);
+        let words = fetch_queries(&query_tree);
         assert_eq!(expected, words);
     }
 }
diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs
index 96d1098f9..05767080a 100644
--- a/milli/src/update/index_documents/store.rs
+++ b/milli/src/update/index_documents/store.rs
@@ -13,7 +13,7 @@ use grenad::{Reader, FileFuse, Writer, Sorter, CompressionType};
 use heed::BytesEncode;
 use linked_hash_map::LinkedHashMap;
 use log::{debug, info};
-use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
+use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind, token::SeparatorKind};
 use ordered_float::OrderedFloat;
 use roaring::RoaringBitmap;
 use serde_json::Value;
@@ -274,13 +274,15 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
         self.insert_words_pairs_proximities_docids(words_pair_proximities, document_id)?;
 
         // We store document_id associated with all the words the record contains.
-        for (word, _) in words_positions.drain() {
-            self.insert_word_docid(&word, document_id)?;
+        for (word, _) in words_positions.iter() {
+            self.insert_word_docid(word, document_id)?;
         }
 
         self.documents_writer.insert(document_id.to_be_bytes(), record)?;
         Self::write_docid_word_positions(&mut self.docid_word_positions_writer, document_id, words_positions)?;
 
+        words_positions.clear();
+
         // We store document_id associated with all the field id and values.
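+        // (facet values can still be drained here: unlike words_positions above,
+        // they are not reused after this loop)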
         for (field, values) in facet_values.drain() {
             for value in values {
@@ -471,14 +473,11 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
             };
 
             let analyzed = self.analyzer.analyze(&content);
-            let tokens = analyzed
-                .tokens()
-                .filter(|t| t.is_word())
-                .map(|t| t.text().to_string());
+            let tokens = process_tokens(analyzed.tokens());
 
-            for (pos, word) in tokens.enumerate().take(MAX_POSITION) {
+            for (pos, token) in tokens.take_while(|(pos, _)| *pos < MAX_POSITION) {
                 let position = (attr as usize * MAX_POSITION + pos) as u32;
-                words_positions.entry(word).or_insert_with(SmallVec32::new).push(position);
+                words_positions.entry(token.text().to_string()).or_insert_with(SmallVec32::new).push(position);
             }
         }
     }
@@ -609,6 +608,36 @@ enum FacetValue {
     Integer(i64),
 }
 
+/// Takes an iterator of tokens and computes their relative positions depending on the separator kinds:
+/// if it's a `Hard` separator we add an additional relative proximity of 8 between words,
+/// otherwise we keep the standard proximity of 1 between words.
+fn process_tokens<'a>(tokens: impl Iterator<Item = Token<'a>>) -> impl Iterator<Item = (usize, Token<'a>)> {
+    tokens
+        .skip_while(|token| token.is_separator().is_some())
+        .scan((0, None), |(offset, prev_kind), token| {
+            match token.kind {
+                TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => {
+                    *offset += match *prev_kind {
+                        Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
+                        Some(_) => 1,
+                        None => 0,
+                    };
+                    *prev_kind = Some(token.kind)
+                }
+                TokenKind::Separator(SeparatorKind::Hard) => {
+                    *prev_kind = Some(token.kind);
+                }
+                TokenKind::Separator(SeparatorKind::Soft)
+                    if *prev_kind != Some(TokenKind::Separator(SeparatorKind::Hard)) => {
+                    *prev_kind = Some(token.kind);
+                }
+                _ => (),
+            }
+            Some((*offset, token))
+        })
+        .filter(|(_, t)| t.is_word())
+}
+
 fn parse_facet_value(ftype: FacetType, value: &Value) -> anyhow::Result<SmallVec8<FacetValue>> {
     use FacetValue::*;
 
diff --git a/milli/src/update/words_prefixes.rs b/milli/src/update/words_prefixes.rs
index f7c898c89..70b82b217 100644
--- a/milli/src/update/words_prefixes.rs
+++ b/milli/src/update/words_prefixes.rs
@@ -41,7 +41,7 @@ impl<'t, 'u, 'i> WordsPrefixes<'t, 'u, 'i> {
             chunk_fusing_shrink_size: None,
             max_nb_chunks: None,
             max_memory: None,
-            threshold: 0.01, // 1%
+            threshold: 0.1 / 100.0, // 0.1%
             max_prefix_length: 4,
             _update_id: update_id,
         }
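To make the new proximity rule concrete, here is a small, self-contained mimic of the offset computation performed by `process_tokens`; `Kind` stands in for the tokenizer's `TokenKind`/`SeparatorKind`, and the token list is illustrative:

```rust
#[derive(Clone, Copy, PartialEq)]
enum Kind { Word, Soft, Hard }

// Mimics the offset rule: +8 after a hard separator, +1 after anything else,
// and separators themselves never produce a position.
fn offsets(tokens: &[Kind]) -> Vec<(usize, usize)> {
    let mut out = Vec::new();
    let mut offset = 0;
    let mut prev = None;
    for (i, &kind) in tokens.iter().enumerate() {
        match kind {
            Kind::Word => {
                offset += match prev {
                    Some(Kind::Hard) => 8,
                    Some(_) => 1,
                    None => 0,
                };
                prev = Some(kind);
                out.push((offset, i));
            }
            Kind::Hard => prev = Some(kind),
            // a soft separator never downgrades a preceding hard one
            Kind::Soft if prev != Some(Kind::Hard) => prev = Some(kind),
            _ => (),
        }
    }
    out
}

fn main() {
    // "new york. city": the hard separator puts `city` 8 positions after `york`
    let tokens = [Kind::Word, Kind::Word, Kind::Hard, Kind::Word];
    assert_eq!(offsets(&tokens), vec![(0, 0), (1, 1), (9, 3)]);
}
```

With this rule, words separated by a sentence boundary end up eight positions apart, so the proximity criterion naturally ranks same-sentence matches above cross-sentence ones.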