diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs
index 5e741c7f3..dc236dd0d 100644
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@@ -367,6 +367,7 @@ pub fn word_derivations<'c>(
     match cache.entry((word.to_string(), is_prefix, max_typo)) {
         Entry::Occupied(entry) => Ok(entry.into_mut()),
         Entry::Vacant(entry) => {
+            // println!("word derivations {word} {is_prefix} {max_typo}");
             let mut derived_words = Vec::new();
             if max_typo == 0 {
                 if is_prefix {
diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs
index efc5a6dcc..f83f01074 100644
--- a/milli/src/search/new/graph_based_ranking_rule.rs
+++ b/milli/src/search/new/graph_based_ranking_rule.rs
@@ -318,9 +318,10 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase
         let mut used_words = HashSet::new();
         let mut used_phrases = HashSet::new();
         for condition in used_conditions.iter() {
-            let condition = graph.conditions_interner.get(condition);
-            used_words.extend(G::words_used_by_condition(ctx, condition)?);
-            used_phrases.extend(G::phrases_used_by_condition(ctx, condition)?);
+            let (ws, ps) =
+                condition_docids_cache.get_condition_used_words_and_phrases(condition);
+            used_words.extend(ws);
+            used_phrases.extend(ps);
         }
         // 2. Remove the unused words and phrases from all the nodes in the graph
         let mut nodes_to_remove = vec![];
diff --git a/milli/src/search/new/interner.rs b/milli/src/search/new/interner.rs
index c26e18524..b8f54d087 100644
--- a/milli/src/search/new/interner.rs
+++ b/milli/src/search/new/interner.rs
@@ -30,7 +30,7 @@ impl<T> Interned<T> {
 #[derive(Clone)]
 pub struct DedupInterner<T> {
     stable_store: Vec<T>,
-    lookup: FxHashMap<T, Interned<T>>,
+    lookup: FxHashMap<T, Interned<T>>, // TODO: Arc
 }
 impl<T> Default for DedupInterner<T> {
     fn default() -> Self {
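Note on the `DedupInterner` touched above: it hands out small, copyable `Interned<T>` handles for deduplicated values, so the rest of the search code can hash and compare terms, phrases, and conditions without cloning them. A minimal sketch of the idea (simplified to `String` keys; the real type is generic over `T` and uses `FxHashMap`, and the `// TODO: Arc` comment presumably aims at removing the double allocation of each key):

```rust
use std::collections::HashMap;

/// Simplified dedup interner: equal values are stored once and
/// identified by a small, stable index.
#[derive(Default)]
struct DedupInterner {
    stable_store: Vec<String>,
    lookup: HashMap<String, u32>,
}

impl DedupInterner {
    fn insert(&mut self, s: &str) -> u32 {
        if let Some(&id) = self.lookup.get(s) {
            return id; // already interned: same value, same id
        }
        let id = self.stable_store.len() as u32;
        self.stable_store.push(s.to_owned());
        self.lookup.insert(s.to_owned(), id); // value stored twice, hence the Arc TODO
        id
    }

    fn get(&self, id: u32) -> &str {
        &self.stable_store[id as usize]
    }
}

fn main() {
    let mut interner = DedupInterner::default();
    let a = interner.insert("sunflower");
    let b = interner.insert("sunflower");
    assert_eq!(a, b);
    assert_eq!(interner.get(a), "sunflower");
}
```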
diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs
index c4e494242..44e26a9ea 100644
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@@ -287,368 +287,3 @@ impl<'a> Search<'a> {
         todo!()
     }
 }
-
-#[cfg(test)]
-mod tests {
-    // use crate::allocator::ALLOC;
-    use std::fs::File;
-    use std::io::{BufRead, BufReader, Cursor, Seek};
-    use std::time::Instant;
-
-    use big_s::S;
-    use heed::EnvOpenOptions;
-    use maplit::hashset;
-
-    use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
-    // use crate::search::new::logger::detailed::DetailedSearchLogger;
-    use crate::search::new::logger::DefaultSearchLogger;
-    use crate::search::new::{execute_search, SearchContext};
-    use crate::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
-    use crate::{Criterion, Index, Object, Search, TermsMatchingStrategy};
-
-    #[test]
-    fn search_wiki_new() {
-        let mut options = EnvOpenOptions::new();
-        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
-
-        let index = Index::new(options, "data_wiki").unwrap();
-        let txn = index.read_txn().unwrap();
-
-        println!("nbr docids: {}", index.documents_ids(&txn).unwrap().len());
-
-        loop {
-            let start = Instant::now();
-
-            // let mut logger = crate::search::new::logger::detailed::DetailedSearchLogger::new("log");
-            let mut ctx = SearchContext::new(&index, &txn);
-            let results = execute_search(
-                &mut ctx,
-                "released from prison by the government",
-                // "which a the releases from poison by the government",
-                // "sun flower s are the best",
-                // "zero config",
-                TermsMatchingStrategy::Last,
-                None,
-                0,
-                20,
-                &mut DefaultSearchLogger,
-                &mut DefaultSearchLogger,
-                // &mut logger,
-            )
-            .unwrap();
-
-            // logger.write_d2_description(&mut ctx);
-
-            let elapsed = start.elapsed();
-            println!("{}us", elapsed.as_micros());
-
-            let _documents = index
-                .documents(&txn, results.documents_ids.iter().copied())
-                .unwrap()
-                .into_iter()
-                .map(|(id, obkv)| {
-                    let mut object = serde_json::Map::default();
-                    for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
-                        let value = obkv.get(fid).unwrap();
-                        let value: serde_json::Value = serde_json::from_slice(value).unwrap();
-                        object.insert(fid_name.to_owned(), value);
-                    }
-                    (id, serde_json::to_string_pretty(&object).unwrap())
-                })
-                .collect::<Vec<_>>();
-
-            println!("{}us: {:?}", elapsed.as_micros(), results);
-        }
-        // for (id, document) in documents {
-        //     println!("{id}:");
-        //     // println!("{document}");
-        // }
-    }
-
-    #[test]
-    fn search_wiki_old() {
-        let mut options = EnvOpenOptions::new();
-        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
-
-        let index = Index::new(options, "data_wiki").unwrap();
-
-        let txn = index.read_txn().unwrap();
-
-        let rr = index.criteria(&txn).unwrap();
-        println!("{rr:?}");
-
-        let start = Instant::now();
-
-        let mut s = Search::new(&txn, &index);
-        s.query(
-            // "which a the releases from poison by the government",
-            // "sun flower s are the best",
-            "zero config",
-        );
-        s.terms_matching_strategy(TermsMatchingStrategy::Last);
-        // s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlyIterative);
-        let docs = s.execute().unwrap();
-
-        let elapsed = start.elapsed();
-
-        let documents = index
-            .documents(&txn, docs.documents_ids.iter().copied())
-            .unwrap()
-            .into_iter()
-            .map(|(id, obkv)| {
-                let mut object = serde_json::Map::default();
-                for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
-                    let value = obkv.get(fid).unwrap();
-                    let value: serde_json::Value = serde_json::from_slice(value).unwrap();
-                    object.insert(fid_name.to_owned(), value);
-                }
-                (id, serde_json::to_string_pretty(&object).unwrap())
-            })
-            .collect::<Vec<_>>();
-
-        println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids);
-        for (id, _document) in documents {
-            println!("{id}:");
-            // println!("{document}");
-        }
-    }
-    #[test]
-    fn search_movies_new() {
-        let mut options = EnvOpenOptions::new();
-        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
-
-        let index = Index::new(options, "data_movies").unwrap();
-        let txn = index.read_txn().unwrap();
-
-        // let primary_key = index.primary_key(&txn).unwrap().unwrap();
-        // let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap();
-        // loop {
-        let start = Instant::now();
-
-        let mut logger = crate::search::new::logger::detailed::DetailedSearchLogger::new("log");
-        let mut ctx = SearchContext::new(&index, &txn);
-        let results = execute_search(
-            &mut ctx,
-            "releases from poison by the government",
-            TermsMatchingStrategy::Last,
-            None,
-            0,
-            20,
-            &mut DefaultSearchLogger,
-            &mut logger,
-        )
-        .unwrap();
-
-        logger.write_d2_description(&mut ctx);
-
-        let elapsed = start.elapsed();
-
-        // let ids = index
-        //     .documents(&txn, results.iter().copied())
-        //     .unwrap()
-        //     .into_iter()
-        //     .map(|x| {
-        //         let obkv = &x.1;
-        //         let id = obkv.get(primary_key).unwrap();
-        //         let id: serde_json::Value = serde_json::from_slice(id).unwrap();
-        //         id.as_str().unwrap().to_owned()
-        //     })
-        //     .collect::<Vec<_>>();
-
-        println!("{}us: {results:?}", elapsed.as_micros());
-        // println!("external ids: {ids:?}");
-        // }
-    }
-
-    #[test]
-    fn search_movies_old() {
-        let mut options = EnvOpenOptions::new();
-        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
-
-        let index = Index::new(options, "data_movies").unwrap();
-
-        let txn = index.read_txn().unwrap();
-
-        let rr = index.criteria(&txn).unwrap();
-        println!("{rr:?}");
-
-        let primary_key = index.primary_key(&txn).unwrap().unwrap();
-        let primary_key = index.fields_ids_map(&txn).unwrap().id(primary_key).unwrap();
-
-        let start = Instant::now();
-
-        let mut s = Search::new(&txn, &index);
-        s.query("which a the releases from poison by the government");
-        s.terms_matching_strategy(TermsMatchingStrategy::Last);
-        s.criterion_implementation_strategy(crate::CriterionImplementationStrategy::OnlySetBased);
-        let docs = s.execute().unwrap();
-
-        let elapsed = start.elapsed();
-
-        let ids = index
-            .documents(&txn, docs.documents_ids.iter().copied())
-            .unwrap()
-            .into_iter()
-            .map(|x| {
-                let obkv = &x.1;
-                let id = obkv.get(primary_key).unwrap();
-                let id: serde_json::Value = serde_json::from_slice(id).unwrap();
-                id.as_str().unwrap().to_owned()
-            })
-            .collect::<Vec<_>>();
-
-        println!("{}us: {:?}", elapsed.as_micros(), docs.documents_ids);
-        println!("external ids: {ids:?}");
-    }
-
-    #[test]
-    fn _settings_movies() {
-        let mut options = EnvOpenOptions::new();
-        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
-
-        let index = Index::new(options, "data_movies").unwrap();
-        let mut wtxn = index.write_txn().unwrap();
-
-        let config = IndexerConfig::default();
-        let mut builder = Settings::new(&mut wtxn, &index, &config);
-
-        builder.set_min_word_len_one_typo(5);
-        builder.set_min_word_len_two_typos(100);
-        builder.set_sortable_fields(hashset! { S("release_date") });
-        builder.set_criteria(vec![
-            Criterion::Words,
-            Criterion::Typo,
-            Criterion::Proximity,
-            Criterion::Asc("release_date".to_owned()),
-        ]);
-
-        builder.execute(|_| (), || false).unwrap();
-        wtxn.commit().unwrap();
-    }
-
-    #[test]
-    fn _index_movies() {
-        let mut options = EnvOpenOptions::new();
-        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
-
-        let index = Index::new(options, "data_movies").unwrap();
-        let mut wtxn = index.write_txn().unwrap();
-
-        let primary_key = "id";
-        let searchable_fields = vec!["title", "overview"];
-        let filterable_fields = vec!["release_date", "genres"];
-
-        let config = IndexerConfig::default();
-        let mut builder = Settings::new(&mut wtxn, &index, &config);
-        builder.set_primary_key(primary_key.to_owned());
-        let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect();
-        builder.set_searchable_fields(searchable_fields);
-        let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect();
-        builder.set_filterable_fields(filterable_fields);
-
-        builder.set_min_word_len_one_typo(5);
-        builder.set_min_word_len_two_typos(100);
-        builder.set_criteria(vec![Criterion::Words, Criterion::Proximity]);
-        builder.execute(|_| (), || false).unwrap();
-
-        let config = IndexerConfig::default();
-        let indexing_config = IndexDocumentsConfig::default();
-        let builder =
-            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false)
-                .unwrap();
-
-        let documents = documents_from(
-            "/Users/meilisearch/Documents/milli2/benchmarks/datasets/movies.json",
-            "json",
-        );
-        let (builder, user_error) = builder.add_documents(documents).unwrap();
-        user_error.unwrap();
-        builder.execute().unwrap();
-        wtxn.commit().unwrap();
-
-        index.prepare_for_closing().wait();
-    }
-    #[test]
-    fn _index_wiki() {
-        let mut options = EnvOpenOptions::new();
-        options.map_size(100 * 1024 * 1024 * 1024); // 100 GB
-
-        let index = Index::new(options, "data_wiki").unwrap();
-        let mut wtxn = index.write_txn().unwrap();
-
-        // let primary_key = "id";
-        let searchable_fields = vec!["body", "title", "url"];
-        // let filterable_fields = vec![];
-        let config = IndexerConfig::default();
-        let mut builder = Settings::new(&mut wtxn, &index, &config);
-        // builder.set_primary_key(primary_key.to_owned());
-        let searchable_fields = searchable_fields.iter().map(|s| s.to_string()).collect();
-        builder.set_searchable_fields(searchable_fields);
-        // let filterable_fields = filterable_fields.iter().map(|s| s.to_string()).collect();
-        // builder.set_filterable_fields(filterable_fields);
-
-        // builder.set_min_word_len_one_typo(5);
-        // builder.set_min_word_len_two_typos(100);
-        builder.set_criteria(vec![Criterion::Words, Criterion::Typo, Criterion::Proximity]);
-        builder.execute(|_| (), || false).unwrap();
-
-        let config = IndexerConfig::default();
-        let indexing_config =
-            IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() };
-        let builder =
-            IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false)
-                .unwrap();
-
-        let documents = documents_from(
-            "/Users/meilisearch/Documents/milli2/benchmarks/datasets/smol-wiki-articles.csv",
-            "csv",
-        );
-        let (builder, user_error) = builder.add_documents(documents).unwrap();
-        user_error.unwrap();
-        builder.execute().unwrap();
-        wtxn.commit().unwrap();
-
-        index.prepare_for_closing().wait();
-    }
-
-    fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader<impl BufRead + Seek> {
-        let reader = File::open(filename)
-            .unwrap_or_else(|_| panic!("could not find the dataset in: {}", filename));
-        let reader = BufReader::new(reader);
-        let documents = match filetype {
-            "csv" => documents_from_csv(reader).unwrap(),
-            "json" => documents_from_json(reader).unwrap(),
-            "jsonl" => documents_from_jsonl(reader).unwrap(),
-            otherwise => panic!("invalid update format {:?}", otherwise),
-        };
-        DocumentsBatchReader::from_reader(Cursor::new(documents)).unwrap()
-    }
-
-    fn documents_from_jsonl(reader: impl BufRead) -> crate::Result<Vec<u8>> {
-        let mut documents = DocumentsBatchBuilder::new(Vec::new());
-
-        for result in serde_json::Deserializer::from_reader(reader).into_iter::<Object>() {
-            let object = result.unwrap();
-            documents.append_json_object(&object)?;
-        }
-
-        documents.into_inner().map_err(Into::into)
-    }
-
-    fn documents_from_json(reader: impl BufRead) -> crate::Result<Vec<u8>> {
-        let mut documents = DocumentsBatchBuilder::new(Vec::new());
-
-        documents.append_json_array(reader)?;
-
-        documents.into_inner().map_err(Into::into)
-    }
-
-    fn documents_from_csv(reader: impl BufRead) -> crate::Result<Vec<u8>> {
-        let csv = csv::Reader::from_reader(reader);
-
-        let mut documents = DocumentsBatchBuilder::new(Vec::new());
-        documents.append_csv(csv)?;
-
-        documents.into_inner().map_err(Into::into)
-    }
-}
diff --git a/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs b/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs
index a3d2ae419..15d82a2be 100644
--- a/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs
+++ b/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs
@@ -1,19 +1,28 @@
 use std::marker::PhantomData;
 
-use fxhash::FxHashMap;
+use fxhash::{FxHashMap, FxHashSet};
 use roaring::RoaringBitmap;
 
 use super::{RankingRuleGraph, RankingRuleGraphTrait};
 use crate::search::new::interner::Interned;
+use crate::search::new::query_term::Phrase;
 use crate::search::new::SearchContext;
 use crate::Result;
 
 // TODO: give a generation to each universe, then be able to get the exact
 // delta of docids between two universes of different generations!
 
+#[derive(Default)]
+pub struct ComputedCondition {
+    docids: RoaringBitmap,
+    universe_len: u64,
+    used_words: FxHashSet<Interned<String>>,
+    used_phrases: FxHashSet<Interned<Phrase>>,
+}
+
 /// A cache storing the document ids associated with each ranking rule edge
 pub struct ConditionDocIdsCache<G: RankingRuleGraphTrait> {
-    pub cache: FxHashMap<Interned<G::Condition>, (u64, RoaringBitmap)>,
+    pub cache: FxHashMap<Interned<G::Condition>, ComputedCondition>,
     _phantom: PhantomData<G>,
 }
 impl<G: RankingRuleGraphTrait> Default for ConditionDocIdsCache<G> {
@@ -22,6 +31,14 @@ impl<G: RankingRuleGraphTrait> Default for ConditionDocIdsCache<G> {
     }
 }
 impl<G: RankingRuleGraphTrait> ConditionDocIdsCache<G> {
+    pub fn get_condition_used_words_and_phrases(
+        &mut self,
+        interned_condition: Interned<G::Condition>,
+    ) -> (&FxHashSet<Interned<String>>, &FxHashSet<Interned<Phrase>>) {
+        let ComputedCondition { used_words, used_phrases, .. } = &self.cache[&interned_condition];
+        (used_words, used_phrases)
+    }
+
     /// Retrieve the document ids for the given edge condition.
     ///
     /// If the cache does not yet contain these docids, they are computed
@@ -30,14 +47,14 @@ impl<G: RankingRuleGraphTrait> ConditionDocIdsCache<G> {
         &'s mut self,
         ctx: &mut SearchContext<'ctx>,
         interned_condition: Interned<G::Condition>,
-        graph: &RankingRuleGraph<G>,
-        // TODO: maybe universe doesn't belong here
+        graph: &mut RankingRuleGraph<G>,
         universe: &RoaringBitmap,
     ) -> Result<&'s RoaringBitmap> {
         if self.cache.contains_key(&interned_condition) {
             // TODO compare length of universe compared to the one in self
             // if it is smaller, then update the value
-            let (universe_len, docids) = self.cache.entry(interned_condition).or_default();
+            let ComputedCondition { docids, universe_len, .. } =
+                self.cache.entry(interned_condition).or_default();
             if *universe_len == universe.len() {
                 return Ok(docids);
             } else {
@@ -46,12 +63,13 @@ impl<G: RankingRuleGraphTrait> ConditionDocIdsCache<G> {
                 return Ok(docids);
             }
         }
-        // TODO: maybe universe doesn't belong here
-        let condition = graph.conditions_interner.get(interned_condition);
-        // TODO: faster way to do this?
-        let docids = G::resolve_condition(ctx, condition, universe)?;
-        let _ = self.cache.insert(interned_condition, (universe.len(), docids));
-        let (_, docids) = &self.cache[&interned_condition];
+        let condition = graph.conditions_interner.get_mut(interned_condition);
+        let (docids, used_words, used_phrases) = G::resolve_condition(ctx, condition, universe)?;
+        let _ = self.cache.insert(
+            interned_condition,
+            ComputedCondition { docids, universe_len: universe.len(), used_words, used_phrases },
+        );
+        let ComputedCondition { docids, .. } = &self.cache[&interned_condition];
         Ok(docids)
     }
 }
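The `ComputedCondition` refactor above keeps the same caching invariant as the old `(u64, RoaringBitmap)` tuple: an entry memoizes the docids of a condition together with the length of the universe it was computed against, and is only handed back unchanged while that length still matches. A rough sketch of the pattern with hypothetical toy types (`u16` condition ids instead of `Interned<G::Condition>`; not the crate's exact code):

```rust
use std::collections::HashMap;

use roaring::RoaringBitmap;

/// Sketch: memoize per-condition docids, tagged with the universe size
/// they were computed against.
#[derive(Default)]
struct DocidsCache {
    cache: HashMap<u16, (u64, RoaringBitmap)>,
}

impl DocidsCache {
    fn get(
        &mut self,
        condition: u16,
        universe: &RoaringBitmap,
        compute: impl FnOnce(&RoaringBitmap) -> RoaringBitmap,
    ) -> &RoaringBitmap {
        let (universe_len, docids) =
            self.cache.entry(condition).or_insert_with(|| (universe.len(), compute(universe)));
        if *universe_len != universe.len() {
            // The universe changed since this entry was computed: restrict the
            // cached bitmap to the new universe instead of recomputing it.
            *docids &= universe;
            *universe_len = universe.len();
        }
        docids
    }
}
```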
diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs
index 528ff3107..baeb8bb71 100644
--- a/milli/src/search/new/ranking_rule_graph/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/mod.rs
@@ -15,11 +15,11 @@ mod proximity;
 /// Implementation of the `typo` ranking rule
 mod typo;
 
-use std::collections::HashSet;
 use std::hash::Hash;
 
 pub use condition_docids_cache::ConditionDocIdsCache;
 pub use dead_ends_cache::DeadEndsCache;
+use fxhash::FxHashSet;
 pub use proximity::{ProximityCondition, ProximityGraph};
 use roaring::RoaringBitmap;
 pub use typo::{TypoCondition, TypoGraph};
@@ -80,23 +80,13 @@ pub trait RankingRuleGraphTrait: Sized {
         condition: &Self::Condition,
     ) -> Result<String>;
 
-    fn words_used_by_condition<'ctx>(
-        ctx: &mut SearchContext<'ctx>,
-        condition: &Self::Condition,
-    ) -> Result<HashSet<Interned<String>>>;
-
-    fn phrases_used_by_condition<'ctx>(
-        ctx: &mut SearchContext<'ctx>,
-        condition: &Self::Condition,
-    ) -> Result<HashSet<Interned<Phrase>>>;
-
     /// Compute the document ids associated with the given edge condition,
     /// restricted to the given universe.
     fn resolve_condition<'ctx>(
         ctx: &mut SearchContext<'ctx>,
         condition: &Self::Condition,
         universe: &RoaringBitmap,
-    ) -> Result<RoaringBitmap>;
+    ) -> Result<(RoaringBitmap, FxHashSet<Interned<String>>, FxHashSet<Interned<Phrase>>)>;
 
     /// Return the costs and conditions of the edges going from the source node to the destination node
     fn build_edges<'ctx>(
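With `words_used_by_condition`/`phrases_used_by_condition` gone, `resolve_condition` becomes the single source of truth: each graph reports, along with the docids, exactly the words and phrases that contributed to a non-empty match. A toy illustration of the shape of the new contract (hypothetical `WordId` ids and candidate list, not the trait itself):

```rust
use fxhash::FxHashSet;
use roaring::RoaringBitmap;

type WordId = u32;

// Toy version of the new resolve_condition contract: return the docids
// together with the words that actually matched inside the universe.
fn resolve_toy_condition(
    universe: &RoaringBitmap,
    candidates: &[(WordId, WordId, RoaringBitmap)],
) -> (RoaringBitmap, FxHashSet<WordId>) {
    let mut docids = RoaringBitmap::new();
    let mut used_words = FxHashSet::default();
    for (left, right, pair_docids) in candidates {
        let matched = universe & pair_docids;
        if !matched.is_empty() {
            // Only words that matched something are reported as "used",
            // so the ranking rule can prune the rest from the query graph.
            used_words.insert(*left);
            used_words.insert(*right);
            docids |= matched;
        }
    }
    (docids, used_words)
}
```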
diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs
index d0977d732..097120b49 100644
--- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs
@@ -1,56 +1,18 @@
 #![allow(clippy::too_many_arguments)]
 
-use std::collections::BTreeMap;
-
-use heed::RoTxn;
-
 use super::ProximityCondition;
-use crate::search::new::db_cache::DatabaseCache;
 use crate::search::new::interner::{DedupInterner, Interned};
 use crate::search::new::query_graph::QueryNodeData;
-use crate::search::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm};
-use crate::search::new::ranking_rule_graph::proximity::WordPair;
+use crate::search::new::query_term::LocatedQueryTerm;
 use crate::search::new::{QueryNode, SearchContext};
 use crate::Result;
 
-fn last_word_of_term_iter<'t>(
-    t: &'t QueryTerm,
-    phrase_interner: &'t DedupInterner<Phrase>,
-) -> impl Iterator<Item = (Option<Interned<Phrase>>, Interned<String>)> + 't {
-    t.all_single_words_except_prefix_db().map(|w| (None, w)).chain(t.all_phrases().flat_map(
-        move |p| {
-            let phrase = phrase_interner.get(p);
-            phrase.words.last().unwrap().map(|last| (Some(p), last))
-        },
-    ))
-}
-fn first_word_of_term_iter<'t>(
-    t: &'t QueryTerm,
-    phrase_interner: &'t DedupInterner<Phrase>,
-) -> impl Iterator<Item = (Interned<String>, Option<Interned<Phrase>>)> + 't {
-    t.all_single_words_except_prefix_db().map(|w| (w, None)).chain(t.all_phrases().flat_map(
-        move |p| {
-            let phrase = phrase_interner.get(p);
-            phrase.words.first().unwrap().map(|first| (first, Some(p)))
-        },
-    ))
-}
-
 pub fn build_edges<'ctx>(
-    ctx: &mut SearchContext<'ctx>,
+    _ctx: &mut SearchContext<'ctx>,
     conditions_interner: &mut DedupInterner<ProximityCondition>,
     from_node: &QueryNode,
     to_node: &QueryNode,
 ) -> Result<Vec<(u8, Option<Interned<ProximityCondition>>)>> {
-    let SearchContext {
-        index,
-        txn,
-        db_cache,
-        word_interner,
-        phrase_interner,
-        term_interner,
-        term_docids: _,
-    } = ctx;
-
     let right_term = match &to_node.data {
         QueryNodeData::End => return Ok(vec![(0, None)]),
         QueryNodeData::Deleted | QueryNodeData::Start => return Ok(vec![]),
@@ -59,13 +21,11 @@ pub fn build_edges<'ctx>(
     let LocatedQueryTerm { value: right_term_interned, positions: right_positions } = right_term;
 
-    let (right_term, right_start_position, right_ngram_length) =
-        (term_interner.get(*right_term_interned), *right_positions.start(), right_positions.len());
+    let (right_start_position, right_ngram_length) =
+        (*right_positions.start(), right_positions.len());
 
-    let (left_term, left_end_position) = match &from_node.data {
-        QueryNodeData::Term(LocatedQueryTerm { value, positions }) => {
-            (term_interner.get(*value), *positions.end())
-        }
+    let (left_term_interned, left_end_position) = match &from_node.data {
+        QueryNodeData::Term(LocatedQueryTerm { value, positions }) => (*value, *positions.end()),
         QueryNodeData::Deleted => return Ok(vec![]),
         QueryNodeData::Start => {
             return Ok(vec![(
@@ -94,175 +54,24 @@ pub fn build_edges<'ctx>(
         )]);
     }
 
-    let mut cost_word_pairs = BTreeMap::<u8, Vec<WordPair>>::new();
-
-    if let Some(right_prefix) = right_term.use_prefix_db {
-        for (left_phrase, left_word) in last_word_of_term_iter(left_term, phrase_interner) {
-            add_prefix_edges(
-                index,
-                txn,
-                db_cache,
-                word_interner,
-                right_ngram_length,
-                left_word,
-                right_prefix,
-                &mut cost_word_pairs,
-                left_phrase,
-            )?;
-        }
-    }
-
-    // TODO: add safeguard in case the cartesian product is too large!
-    // even if we restrict the word derivations to a maximum of 100, the size of the
-    // cartesian product could reach a maximum of 10_000 derivations, which is way too much.
-    // Maybe prioritise the product of zero typo derivations, then the product of zero-typo/one-typo
-    // + one-typo/zero-typo, then one-typo/one-typo, then ... until an arbitrary limit has been
-    // reached
-
-    for (left_phrase, left_word) in last_word_of_term_iter(left_term, phrase_interner) {
-        for (right_word, right_phrase) in first_word_of_term_iter(right_term, phrase_interner) {
-            add_non_prefix_edges(
-                index,
-                txn,
-                db_cache,
-                word_interner,
-                right_ngram_length,
-                left_word,
-                right_word,
-                &mut cost_word_pairs,
-                &[left_phrase, right_phrase].iter().copied().flatten().collect::<Vec<_>>(),
-            )?;
-        }
-    }
-
-    let mut new_edges = cost_word_pairs
-        .into_iter()
-        .map(|(cost, word_pairs)| {
-            (
-                cost,
-                Some(
-                    conditions_interner
-                        .insert(ProximityCondition::Pairs { pairs: word_pairs.into_boxed_slice() }),
-                ),
-            )
-        })
-        .collect::<Vec<_>>();
-    new_edges.push((
-        8 + (right_ngram_length - 1) as u8,
+    let mut conditions = vec![];
+    for cost in right_ngram_length..(7 + right_ngram_length) {
+        let cost = cost as u8;
+        conditions.push((
+            cost,
+            Some(conditions_interner.insert(ProximityCondition::Uninit {
+                left_term: left_term_interned,
+                right_term: *right_term_interned,
+                right_term_ngram_len: right_ngram_length as u8,
+                cost,
+            })),
+        ))
+    }
+
+    conditions.push((
+        (7 + right_ngram_length) as u8,
         Some(conditions_interner.insert(ProximityCondition::Term { term: *right_term_interned })),
     ));
 
-    Ok(new_edges)
-}
-
-fn add_prefix_edges<'ctx>(
-    index: &mut &crate::Index,
-    txn: &'ctx RoTxn,
-    db_cache: &mut DatabaseCache<'ctx>,
-    word_interner: &mut DedupInterner<String>,
-    right_ngram_length: usize,
-    left_word: Interned<String>,
-    right_prefix: Interned<String>,
-    cost_proximity_word_pairs: &mut BTreeMap<u8, Vec<WordPair>>,
-    left_phrase: Option<Interned<Phrase>>,
-) -> Result<()> {
-    for proximity in 1..=(8 - right_ngram_length) {
-        let cost = (proximity + right_ngram_length - 1) as u8;
-        // TODO: if we had access to the universe here, we could already check whether
-        // the bitmap corresponding to this word pair is disjoint with the universe or not
-        if db_cache
-            .get_word_prefix_pair_proximity_docids(
-                index,
-                txn,
-                word_interner,
-                left_word,
-                right_prefix,
-                proximity as u8,
-            )?
-            .is_some()
-        {
-            cost_proximity_word_pairs.entry(cost).or_default().push(WordPair::WordPrefix {
-                phrases: left_phrase.into_iter().collect(),
-                left: left_word,
-                right_prefix,
-                proximity: proximity as u8,
-            });
-        }
-
-        // No swapping when computing the proximity between a phrase and a word
-        if left_phrase.is_none()
-            && db_cache
-                .get_prefix_word_pair_proximity_docids(
-                    index,
-                    txn,
-                    word_interner,
-                    right_prefix,
-                    left_word,
-                    proximity as u8 - 1,
-                )?
-                .is_some()
-        {
-            cost_proximity_word_pairs.entry(cost).or_default().push(WordPair::WordPrefixSwapped {
-                left_prefix: right_prefix,
-                right: left_word,
-                proximity: proximity as u8 - 1,
-            });
-        }
-    }
-    Ok(())
-}
-
-fn add_non_prefix_edges<'ctx>(
-    index: &mut &crate::Index,
-    txn: &'ctx RoTxn,
-    db_cache: &mut DatabaseCache<'ctx>,
-    word_interner: &mut DedupInterner<String>,
-    right_ngram_length: usize,
-    word1: Interned<String>,
-    word2: Interned<String>,
-    cost_proximity_word_pairs: &mut BTreeMap<u8, Vec<WordPair>>,
-    phrases: &[Interned<Phrase>],
-) -> Result<()> {
-    for proximity in 1..=(8 - right_ngram_length) {
-        let cost = (proximity + right_ngram_length - 1) as u8;
-        if db_cache
-            .get_word_pair_proximity_docids(
-                index,
-                txn,
-                word_interner,
-                word1,
-                word2,
-                proximity as u8,
-            )?
-            .is_some()
-        {
-            cost_proximity_word_pairs.entry(cost).or_default().push(WordPair::Words {
-                phrases: phrases.to_vec(),
-                left: word1,
-                right: word2,
-                proximity: proximity as u8,
-            });
-        }
-        if proximity > 1
-            // no swapping when either term is a phrase
-            && phrases.is_empty()
-            && db_cache
-                .get_word_pair_proximity_docids(
-                    index,
-                    txn,
-                    word_interner,
-                    word2,
-                    word1,
-                    proximity as u8 - 1,
-                )?
-                .is_some()
-        {
-            cost_proximity_word_pairs.entry(cost).or_default().push(WordPair::Words {
-                phrases: vec![],
-                left: word2,
-                right: word1,
-                proximity: proximity as u8 - 1,
-            });
-        }
-    }
-    Ok(())
+    Ok(conditions)
 }
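The rewritten `build_edges` above no longer materializes `WordPair`s up front; it just enumerates one lazily resolved `Uninit` condition per proximity cost, plus a final `Term` condition that only requires the right term to exist. The cost arithmetic is easy to check: for a plain right word (`right_ngram_length == 1`) it yields costs 1 through 7 plus the cost-8 fallback, and for a 2-gram everything shifts up by one. A small verification, mirroring the loop above:

```rust
// Mirrors the loop in build_edges: one Uninit condition per cost,
// then a final Term condition as the most expensive fallback edge.
fn proximity_costs(right_ngram_length: usize) -> Vec<u8> {
    let mut costs: Vec<u8> =
        (right_ngram_length..(7 + right_ngram_length)).map(|cost| cost as u8).collect();
    costs.push((7 + right_ngram_length) as u8); // the `Term` fallback
    costs
}

fn main() {
    assert_eq!(proximity_costs(1), vec![1, 2, 3, 4, 5, 6, 7, 8]);
    assert_eq!(proximity_costs(2), vec![2, 3, 4, 5, 6, 7, 8, 9]);
}
```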
diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
index bf5278f8d..eabdb2cb1 100644
--- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
@@ -1,6 +1,15 @@
+#![allow(clippy::too_many_arguments)]
+
+use std::iter::FromIterator;
+
+use fxhash::FxHashSet;
+use heed::RoTxn;
 use roaring::RoaringBitmap;
 
-use super::{ProximityCondition, WordPair};
+use super::ProximityCondition;
+use crate::search::new::db_cache::DatabaseCache;
+use crate::search::new::interner::{DedupInterner, Interned};
+use crate::search::new::query_term::{Phrase, QueryTerm};
 use crate::search::new::SearchContext;
 use crate::{CboRoaringBitmapCodec, Result};
 
@@ -8,7 +17,7 @@ pub fn compute_docids<'ctx>(
     ctx: &mut SearchContext<'ctx>,
     condition: &ProximityCondition,
     universe: &RoaringBitmap,
-) -> Result<RoaringBitmap> {
+) -> Result<(RoaringBitmap, FxHashSet<Interned<String>>, FxHashSet<Interned<Phrase>>)> {
     let SearchContext {
         index,
         txn,
@@ -18,96 +27,238 @@ pub fn compute_docids<'ctx>(
         phrase_interner,
         term_interner,
     } = ctx;
-    let pairs = match condition {
-        ProximityCondition::Term { term } => {
-            return term_docids
-                .get_query_term_docids(
-                    index,
-                    txn,
-                    db_cache,
-                    word_interner,
-                    term_interner,
-                    phrase_interner,
-                    *term,
-                )
-                .cloned()
+
+    let (left_term, right_term, right_term_ngram_len, cost) = match condition {
+        ProximityCondition::Uninit { left_term, right_term, right_term_ngram_len, cost } => {
+            (*left_term, *right_term, *right_term_ngram_len, *cost)
+        }
+        ProximityCondition::Term { term } => {
+            let term_v = term_interner.get(*term);
+            return Ok((
+                term_docids
+                    .get_query_term_docids(
+                        index,
+                        txn,
+                        db_cache,
+                        word_interner,
+                        term_interner,
+                        phrase_interner,
+                        *term,
+                    )?
+                    .clone(),
+                FxHashSet::from_iter(term_v.all_single_words_except_prefix_db()),
+                FxHashSet::from_iter(term_v.all_phrases()),
+            ));
         }
-        ProximityCondition::Pairs { pairs } => pairs,
     };
-    let mut pair_docids = RoaringBitmap::new();
-    for pair in pairs.iter() {
-        let pair = match pair {
-            WordPair::Words { phrases, left, right, proximity } => {
-                let mut docids = db_cache
-                    .get_word_pair_proximity_docids(
-                        index,
-                        txn,
-                        word_interner,
-                        *left,
-                        *right,
-                        *proximity,
-                    )?
-                    .map(CboRoaringBitmapCodec::deserialize_from)
-                    .transpose()?
-                    .unwrap_or_default();
-                if !docids.is_empty() {
-                    for phrase in phrases {
-                        docids &= ctx.term_docids.get_phrase_docids(
-                            index,
-                            txn,
-                            db_cache,
-                            word_interner,
-                            &ctx.phrase_interner,
-                            *phrase,
-                        )?;
-                    }
-                }
-                docids
-            }
-            WordPair::WordPrefix { phrases, left, right_prefix, proximity } => {
-                let mut docids = db_cache
-                    .get_word_prefix_pair_proximity_docids(
-                        index,
-                        txn,
-                        word_interner,
-                        *left,
-                        *right_prefix,
-                        *proximity,
-                    )?
-                    .map(CboRoaringBitmapCodec::deserialize_from)
-                    .transpose()?
-                    .unwrap_or_default();
-                if !docids.is_empty() {
-                    for phrase in phrases {
-                        docids &= ctx.term_docids.get_phrase_docids(
-                            index,
-                            txn,
-                            db_cache,
-                            word_interner,
-                            &ctx.phrase_interner,
-                            *phrase,
-                        )?;
-                    }
-                }
-                docids
-            }
-            WordPair::WordPrefixSwapped { left_prefix, right, proximity } => db_cache
-                .get_prefix_word_pair_proximity_docids(
-                    index,
-                    txn,
-                    word_interner,
-                    *left_prefix,
-                    *right,
-                    *proximity,
-                )?
-                .map(CboRoaringBitmapCodec::deserialize_from)
-                .transpose()?
-                .unwrap_or_default(),
-        };
-        // TODO: deserialize bitmap within a universe
-        let bitmap = universe & pair;
-        pair_docids |= bitmap;
+
+    let left_term = term_interner.get(left_term);
+    let right_term = term_interner.get(right_term);
+
+    // e.g. for the simple words `sun .. flower`
+    // the cost is 5
+    // the forward proximity is 5
+    // the backward proximity is 4
+    //
+    // for the 2gram `the sunflower`
+    // the cost is 5
+    // the forward proximity is 4
+    // the backward proximity is 3
+    let forward_proximity = 1 + cost - right_term_ngram_len;
+    let backward_proximity = cost - right_term_ngram_len;
+
+    let mut used_words = FxHashSet::default();
+    let mut used_phrases = FxHashSet::default();
+
+    let mut docids = RoaringBitmap::new();
+
+    if let Some(right_prefix) = right_term.use_prefix_db {
+        for (left_phrase, left_word) in last_word_of_term_iter(left_term, phrase_interner) {
+            compute_prefix_edges(
+                index,
+                txn,
+                db_cache,
+                word_interner,
+                left_word,
+                right_prefix,
+                left_phrase,
+                forward_proximity,
+                backward_proximity,
+                &mut docids,
+                universe,
+                &mut used_words,
+                &mut used_phrases,
+            )?;
+        }
     }
-    Ok(pair_docids)
+
+    // TODO: add safeguard in case the cartesian product is too large!
+    // even if we restrict the word derivations to a maximum of 100, the size of the
+    // cartesian product could reach a maximum of 10_000 derivations, which is way too much.
+    // Maybe prioritise the product of zero typo derivations, then the product of zero-typo/one-typo
+    // + one-typo/zero-typo, then one-typo/one-typo, then ... until an arbitrary limit has been
+    // reached
+
+    for (left_phrase, left_word) in last_word_of_term_iter(left_term, phrase_interner) {
+        for (right_word, right_phrase) in first_word_of_term_iter(right_term, phrase_interner) {
+            compute_non_prefix_edges(
+                index,
+                txn,
+                db_cache,
+                word_interner,
+                left_word,
+                right_word,
+                &[left_phrase, right_phrase].iter().copied().flatten().collect::<Vec<_>>(),
+                forward_proximity,
+                backward_proximity,
+                &mut docids,
+                universe,
+                &mut used_words,
+                &mut used_phrases,
+            )?;
+        }
+    }
+
+    Ok((docids, used_words, used_phrases))
+}
+
+fn compute_prefix_edges<'ctx>(
+    index: &mut &crate::Index,
+    txn: &'ctx RoTxn,
+    db_cache: &mut DatabaseCache<'ctx>,
+    word_interner: &mut DedupInterner<String>,
+    left_word: Interned<String>,
+    right_prefix: Interned<String>,
+    left_phrase: Option<Interned<Phrase>>,
+    forward_proximity: u8,
+    backward_proximity: u8,
+    docids: &mut RoaringBitmap,
+    universe: &RoaringBitmap,
+    used_words: &mut FxHashSet<Interned<String>>,
+    used_phrases: &mut FxHashSet<Interned<Phrase>>,
+) -> Result<()> {
+    if let Some(phrase) = left_phrase {
+        // TODO: compute the phrase, take the intersection between
+        // the phrase and the docids
+        used_phrases.insert(phrase); // This is not fully correct
+    }
+
+    if let Some(new_docids) = db_cache.get_word_prefix_pair_proximity_docids(
+        index,
+        txn,
+        word_interner,
+        left_word,
+        right_prefix,
+        forward_proximity,
+    )? {
+        let new_docids = universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
+        if !new_docids.is_empty() {
+            used_words.insert(left_word);
+            used_words.insert(right_prefix);
+            *docids |= new_docids;
+        }
+    }
+
+    // No swapping when computing the proximity between a phrase and a word
+    if left_phrase.is_none() {
+        if let Some(new_docids) = db_cache.get_prefix_word_pair_proximity_docids(
+            index,
+            txn,
+            word_interner,
+            right_prefix,
+            left_word,
+            backward_proximity,
+        )? {
+            let new_docids = universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
+            if !new_docids.is_empty() {
+                used_words.insert(left_word);
+                used_words.insert(right_prefix);
+                *docids |= new_docids;
+            }
+        }
+    }
+
+    Ok(())
+}
+
+fn compute_non_prefix_edges<'ctx>(
+    index: &mut &crate::Index,
+    txn: &'ctx RoTxn,
+    db_cache: &mut DatabaseCache<'ctx>,
+    word_interner: &mut DedupInterner<String>,
+    word1: Interned<String>,
+    word2: Interned<String>,
+    phrases: &[Interned<Phrase>],
+    forward_proximity: u8,
+    backward_proximity: u8,
+    docids: &mut RoaringBitmap,
+    universe: &RoaringBitmap,
+    used_words: &mut FxHashSet<Interned<String>>,
+    used_phrases: &mut FxHashSet<Interned<Phrase>>,
+) -> Result<()> {
+    if !phrases.is_empty() {
+        // TODO: compute the docids associated with these phrases
+        // take their intersection with the new docids
+        used_phrases.extend(phrases); // This is not fully correct
+    }
+    if let Some(new_docids) = db_cache.get_word_pair_proximity_docids(
+        index,
+        txn,
+        word_interner,
+        word1,
+        word2,
+        forward_proximity,
+    )? {
+        let new_docids = universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
+        if !new_docids.is_empty() {
+            used_words.insert(word1);
+            used_words.insert(word2);
+            *docids |= new_docids;
+        }
+    }
+    if backward_proximity >= 1
+        // no swapping when either term is a phrase
+        && phrases.is_empty()
+    {
+        if let Some(new_docids) = db_cache.get_word_pair_proximity_docids(
+            index,
+            txn,
+            word_interner,
+            word2,
+            word1,
+            backward_proximity,
+        )? {
+            let new_docids = universe & CboRoaringBitmapCodec::deserialize_from(new_docids)?;
+            if !new_docids.is_empty() {
+                used_words.insert(word1);
+                used_words.insert(word2);
+                *docids |= new_docids;
+            }
+        }
+    }
+
+    Ok(())
+}
+
+fn last_word_of_term_iter<'t>(
+    t: &'t QueryTerm,
+    phrase_interner: &'t DedupInterner<Phrase>,
+) -> impl Iterator<Item = (Option<Interned<Phrase>>, Interned<String>)> + 't {
+    t.all_single_words_except_prefix_db().map(|w| (None, w)).chain(t.all_phrases().flat_map(
+        move |p| {
+            let phrase = phrase_interner.get(p);
+            phrase.words.last().unwrap().map(|last| (Some(p), last))
+        },
+    ))
+}
+fn first_word_of_term_iter<'t>(
+    t: &'t QueryTerm,
+    phrase_interner: &'t DedupInterner<Phrase>,
+) -> impl Iterator<Item = (Interned<String>, Option<Interned<Phrase>>)> + 't {
+    t.all_single_words_except_prefix_db().map(|w| (w, None)).chain(t.all_phrases().flat_map(
+        move |p| {
+            let phrase = phrase_interner.get(p);
+            phrase.words.first().unwrap().map(|first| (first, Some(p)))
+        },
+    ))
 }
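The forward/backward proximity arithmetic introduced above is worth double-checking against the comment's own examples: for the simple words `sun .. flower` at cost 5 the forward proximity is 5 and the backward one 4, while for the 2-gram `the sunflower` at the same cost they drop to 4 and 3. The backward direction is always one tighter, since it models the swapped word order:

```rust
// forward: proximity of (left, right) in query order;
// backward: proximity of the swapped pair, always one tighter.
fn proximities(cost: u8, right_term_ngram_len: u8) -> (u8, u8) {
    (1 + cost - right_term_ngram_len, cost - right_term_ngram_len)
}

fn main() {
    assert_eq!(proximities(5, 1), (5, 4)); // `sun .. flower`
    assert_eq!(proximities(5, 2), (4, 3)); // the 2gram `the sunflower`
}
```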
diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs
index 568a2c2b0..7b8a066ab 100644
--- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs
@@ -1,9 +1,7 @@
 pub mod build;
 pub mod compute_docids;
 
-use std::collections::HashSet;
-use std::iter::FromIterator;
-
+use fxhash::FxHashSet;
 use roaring::RoaringBitmap;
 
 use super::{DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
@@ -13,31 +11,17 @@ use crate::search::new::query_term::{Phrase, QueryTerm};
 use crate::search::new::{QueryGraph, QueryNode, SearchContext};
 use crate::Result;
 
-#[derive(Debug, Clone, PartialEq, Eq, Hash)]
-pub enum WordPair {
-    Words {
-        phrases: Vec<Interned<Phrase>>,
-        left: Interned<String>,
-        right: Interned<String>,
-        proximity: u8,
-    },
-    WordPrefix {
-        phrases: Vec<Interned<Phrase>>,
-        left: Interned<String>,
-        right_prefix: Interned<String>,
-        proximity: u8,
-    },
-    WordPrefixSwapped {
-        left_prefix: Interned<String>,
-        right: Interned<String>,
-        proximity: u8,
-    },
-}
-
 #[derive(Clone, PartialEq, Eq, Hash)]
 pub enum ProximityCondition {
-    Term { term: Interned<QueryTerm> },
-    Pairs { pairs: Box<[WordPair]> },
+    Uninit {
+        left_term: Interned<QueryTerm>,
+        right_term: Interned<QueryTerm>,
+        right_term_ngram_len: u8,
+        cost: u8,
+    },
+    Term {
+        term: Interned<QueryTerm>,
+    },
 }
 
 pub enum ProximityGraph {}
@@ -49,7 +33,8 @@ impl RankingRuleGraphTrait for ProximityGraph {
         ctx: &mut SearchContext<'ctx>,
         condition: &Self::Condition,
         universe: &RoaringBitmap,
-    ) -> Result<RoaringBitmap> {
+    ) -> Result<(roaring::RoaringBitmap, FxHashSet<Interned<String>>, FxHashSet<Interned<Phrase>>)>
+    {
         compute_docids::compute_docids(ctx, condition, universe)
     }
 
@@ -79,107 +64,14 @@ impl RankingRuleGraphTrait for ProximityGraph {
         condition: &Self::Condition,
     ) -> Result<String> {
         match condition {
+            ProximityCondition::Uninit { cost, .. } => {
+                // TODO
+                Ok(format!("{cost}: cost"))
+            }
             ProximityCondition::Term { term } => {
                 let term = ctx.term_interner.get(*term);
                 Ok(format!("{} : exists", ctx.word_interner.get(term.original)))
             }
-            ProximityCondition::Pairs { pairs } => {
-                let mut s = String::new();
-                for pair in pairs.iter() {
-                    match pair {
-                        WordPair::Words { phrases, left, right, proximity } => {
-                            let left = ctx.word_interner.get(*left);
-                            let right = ctx.word_interner.get(*right);
-                            if !phrases.is_empty() {
-                                s.push_str(&format!("{} phrases + ", phrases.len()));
-                            }
-                            s.push_str(&format!("\"{left} {right}\": {proximity}\n"));
-                        }
-                        WordPair::WordPrefix { phrases, left, right_prefix, proximity } => {
-                            let left = ctx.word_interner.get(*left);
-                            let right = ctx.word_interner.get(*right_prefix);
-                            if !phrases.is_empty() {
-                                s.push_str(&format!("{} phrases + ", phrases.len()));
-                            }
-                            s.push_str(&format!("\"{left} {right}...\" : {proximity}\n"));
-                        }
-                        WordPair::WordPrefixSwapped { left_prefix, right, proximity } => {
-                            let left = ctx.word_interner.get(*left_prefix);
-                            let right = ctx.word_interner.get(*right);
-                            s.push_str(&format!("\"{left}... {right}\" : {proximity}\n"));
-                        }
-                    }
-                }
-                Ok(s)
-            }
-        }
-    }
-
-    fn words_used_by_condition<'ctx>(
-        ctx: &mut SearchContext<'ctx>,
-        condition: &Self::Condition,
-    ) -> Result<HashSet<Interned<String>>> {
-        match condition {
-            ProximityCondition::Term { term } => {
-                let term = ctx.term_interner.get(*term);
-                Ok(HashSet::from_iter(term.all_single_words_except_prefix_db()))
-            }
-            ProximityCondition::Pairs { pairs } => {
-                let mut set = HashSet::new();
-                for pair in pairs.iter() {
-                    match pair {
-                        WordPair::Words { phrases: _, left, right, proximity: _ } => {
-                            set.insert(*left);
-                            set.insert(*right);
-                        }
-                        WordPair::WordPrefix { phrases: _, left, right_prefix, proximity: _ } => {
-                            set.insert(*left);
-                            // TODO: this is not correct, there should be another trait method for collecting the prefixes
-                            // to be used with the prefix DBs
-                            set.insert(*right_prefix);
-                        }
-                        WordPair::WordPrefixSwapped { left_prefix, right, proximity: _ } => {
-                            // TODO: this is not correct, there should be another trait method for collecting the prefixes
-                            // to be used with the prefix DBs
-                            set.insert(*left_prefix);
-                            set.insert(*right);
-                        }
-                    }
-                }
-                Ok(set)
-            }
-        }
-    }
-
-    fn phrases_used_by_condition<'ctx>(
-        ctx: &mut SearchContext<'ctx>,
-        condition: &Self::Condition,
-    ) -> Result<HashSet<Interned<Phrase>>> {
-        match condition {
-            ProximityCondition::Term { term } => {
-                let term = ctx.term_interner.get(*term);
-                Ok(HashSet::from_iter(term.all_phrases()))
-            }
-            ProximityCondition::Pairs { pairs } => {
-                let mut set = HashSet::new();
-                for pair in pairs.iter() {
-                    match pair {
-                        WordPair::Words { phrases, left: _, right: _, proximity: _ } => {
-                            set.extend(phrases.iter().copied());
-                        }
-                        WordPair::WordPrefix {
-                            phrases,
-                            left: _,
-                            right_prefix: _,
-                            proximity: _,
-                        } => {
-                            set.extend(phrases.iter().copied());
-                        }
-                        WordPair::WordPrefixSwapped { left_prefix: _, right: _, proximity: _ } => {}
-                    }
-                }
-                Ok(set)
-            }
         }
     }
 }
diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs
index 32b905244..4ef0d15d1 100644
--- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs
@@ -1,7 +1,8 @@
-use std::collections::HashSet;
+// use std::collections::HashSet;
 use std::fmt::Write;
 use std::iter::FromIterator;
 
+use fxhash::FxHashSet;
 use roaring::RoaringBitmap;
 
 use super::{DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait};
@@ -26,7 +27,7 @@ impl RankingRuleGraphTrait for TypoGraph {
         ctx: &mut SearchContext<'ctx>,
         condition: &Self::Condition,
         universe: &RoaringBitmap,
-    ) -> Result<RoaringBitmap> {
+    ) -> Result<(RoaringBitmap, FxHashSet<Interned<String>>, FxHashSet<Interned<Phrase>>)> {
         let SearchContext {
             index,
             txn,
@@ -48,7 +49,12 @@ impl RankingRuleGraphTrait for TypoGraph {
             condition.term,
         )?;
 
-        Ok(docids)
+        let term = term_interner.get(condition.term);
+        Ok((
+            docids,
+            FxHashSet::from_iter(term.all_single_words_except_prefix_db()),
+            FxHashSet::from_iter(term.all_phrases()),
+        ))
     }
 
     fn build_edges<'ctx>(
@@ -202,21 +208,21 @@ impl RankingRuleGraphTrait for TypoGraph {
         Ok(s)
     }
 
-    fn words_used_by_condition<'ctx>(
-        ctx: &mut SearchContext<'ctx>,
-        condition: &Self::Condition,
-    ) -> Result<HashSet<Interned<String>>> {
-        let TypoCondition { term, .. } = condition;
-        let term = ctx.term_interner.get(*term);
-        Ok(HashSet::from_iter(term.all_single_words_except_prefix_db()))
-    }
+    // fn words_used_by_condition<'ctx>(
+    //     ctx: &mut SearchContext<'ctx>,
+    //     condition: &Self::Condition,
+    // ) -> Result<HashSet<Interned<String>>> {
+    //     let TypoCondition { term, .. } = condition;
+    //     let term = ctx.term_interner.get(*term);
+    //     Ok(HashSet::from_iter(term.all_single_words_except_prefix_db()))
+    // }
 
-    fn phrases_used_by_condition<'ctx>(
-        ctx: &mut SearchContext<'ctx>,
-        condition: &Self::Condition,
-    ) -> Result<HashSet<Interned<Phrase>>> {
-        let TypoCondition { term, .. } = condition;
-        let term = ctx.term_interner.get(*term);
-        Ok(HashSet::from_iter(term.all_phrases()))
-    }
+    // fn phrases_used_by_condition<'ctx>(
+    //     ctx: &mut SearchContext<'ctx>,
+    //     condition: &Self::Condition,
+    // ) -> Result<HashSet<Interned<Phrase>>> {
+    //     let TypoCondition { term, .. } = condition;
+    //     let term = ctx.term_interner.get(*term);
+    //     Ok(HashSet::from_iter(term.all_phrases()))
+    // }
 }
diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs
index 7549cfff7..32434248c 100644
--- a/milli/src/search/new/ranking_rules.rs
+++ b/milli/src/search/new/ranking_rules.rs
@@ -125,7 +125,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
     let mut results = vec![];
     let mut cur_offset = 0usize;
 
-    /// Add the candidates to the results. Take `distinct`, `from`, `limit`, and `cur_offset`
+    /// Add the candidates to the results. Take `distinct`, `from`, `length`, and `cur_offset`
     /// into account and inform the logger.
     macro_rules! maybe_add_to_results {
         ($candidates:expr) => {
@@ -181,6 +181,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
             cur_offset += len as usize;
         };
     }
+
     while results.len() < length {
         // The universe for this bucket is zero or one element, so we don't need to sort
         // anything, just extend the results and go back to the parent ranking rule.
diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs
index f5f8c0895..ff8a9bf2f 100644
--- a/milli/src/search/new/words.rs
+++ b/milli/src/search/new/words.rs
@@ -9,9 +9,9 @@ use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext};
 use crate::{Result, TermsMatchingStrategy};
 
 pub struct Words {
-    exhausted: bool,
+    exhausted: bool, // TODO: remove
     query_graph: Option<QueryGraph>,
-    iterating: bool,
+    iterating: bool, // TODO: remove
     positions_to_remove: Vec<i8>,
     terms_matching_strategy: TermsMatchingStrategy,
 }