diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index 4232cadaa..f2f8f12c5 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -121,18 +121,18 @@ impl<'transaction> DatabaseCache<'transaction> { &mut self, index: &Index, txn: &'transaction RoTxn, - word1: &str, - prefix2: &str, + left_prefix: &str, + right: &str, proximity: u8, ) -> Result<Option<&'transaction [u8]>> { - let key = (proximity, prefix2.to_owned(), word1.to_owned()); + let key = (proximity, left_prefix.to_owned(), right.to_owned()); match self.prefix_word_pair_proximity_docids.entry(key) { Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()), Entry::Vacant(entry) => { let bitmap_ptr = index .prefix_word_pair_proximity_docids .remap_data_type::<ByteSlice>() - .get(txn, &(proximity, prefix2, word1))?; + .get(txn, &(proximity, left_prefix, right))?; entry.insert(bitmap_ptr); Ok(bitmap_ptr) } diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index a9f4ee045..c2415837d 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -194,7 +194,7 @@ impl DetailedSearchLogger { for event in self.events.iter() { match event { SearchEvents::RankingRuleStartIteration { ranking_rule_idx, time, ..
} => { - let elapsed = time.duration_since(prev_time); + let _elapsed = time.duration_since(prev_time); prev_time = *time; let parent_activated_id = activated_id(&timestamp); timestamp.push(0); @@ -216,7 +216,7 @@ impl DetailedSearchLogger { }}").unwrap(); } SearchEvents::RankingRuleNextBucket { ranking_rule_idx, time, universe, candidates } => { - let elapsed = time.duration_since(prev_time); + let _elapsed = time.duration_since(prev_time); prev_time = *time; let old_activated_id = activated_id(&timestamp); // writeln!(&mut file, "time.{old_activated_id}: {:.2}", elapsed.as_micros() as f64 / 1000.0).unwrap(); @@ -227,7 +227,7 @@ impl DetailedSearchLogger { .unwrap(); } SearchEvents::RankingRuleSkipBucket { ranking_rule_idx, candidates, time } => { - let elapsed = time.duration_since(prev_time); + let _elapsed = time.duration_since(prev_time); prev_time = *time; let old_activated_id = activated_id(&timestamp); // writeln!(&mut file, "time.{old_activated_id}: {:.2}", elapsed.as_micros() as f64 / 1000.0).unwrap(); @@ -239,7 +239,7 @@ impl DetailedSearchLogger { .unwrap(); } SearchEvents::RankingRuleEndIteration { ranking_rule_idx, time, .. } => { - let elapsed = time.duration_since(prev_time); + let _elapsed = time.duration_since(prev_time); prev_time = *time; let cur_activated_id = activated_id(&timestamp); // writeln!(&mut file, "time.{cur_activated_id}: {:.2}", elapsed.as_micros() as f64 / 1000.0).unwrap(); @@ -332,7 +332,7 @@ results.{random} {{ writeln!(&mut file, "}}").unwrap(); } - fn query_node_d2_desc(node_idx: usize, node: &QueryNode, distances: &[u64], file: &mut File) { + fn query_node_d2_desc(node_idx: usize, node: &QueryNode, _distances: &[u64], file: &mut File) { match &node { QueryNode::Term(LocatedQueryTerm { value, ..
}) => { match value { diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index e09fe2300..94120cd8a 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -1,18 +1,22 @@ -pub mod db_cache; -pub mod graph_based_ranking_rule; -pub mod logger; -pub mod query_graph; -pub mod query_term; -pub mod ranking_rule_graph; -pub mod ranking_rules; -pub mod resolve_query_graph; -pub mod sort; -pub mod words; +mod db_cache; +mod graph_based_ranking_rule; +mod logger; +mod query_graph; +mod query_term; +mod ranking_rule_graph; +mod ranking_rules; +mod resolve_query_graph; +mod sort; +mod words; use charabia::Tokenize; use heed::RoTxn; -pub use query_graph::*; -pub use ranking_rules::*; + +use query_graph::{QueryGraph, QueryNode}; +pub use ranking_rules::{ + execute_search, RankingRule, RankingRuleOutput, RankingRuleOutputIter, + RankingRuleOutputIterWrapper, RankingRuleQueryTrait, +}; use roaring::RoaringBitmap; use self::db_cache::DatabaseCache; diff --git a/milli/src/search/new/ranking_rule_graph/paths_map.rs b/milli/src/search/new/ranking_rule_graph/paths_map.rs index 3b01508c9..b9d089efc 100644 --- a/milli/src/search/new/ranking_rule_graph/paths_map.rs +++ b/milli/src/search/new/ranking_rule_graph/paths_map.rs @@ -6,6 +6,10 @@ use roaring::RoaringBitmap; use super::cheapest_paths::Path; +// What is PathsMap used for? 
+// For the empty_prefixes field in the EmptyPathsCache only :/ +// but it could be used for more, like efficient computing of a set of paths + #[derive(Debug, Clone)] pub struct PathsMap { diff --git a/milli/src/search/new/ranking_rule_graph/proximity/build.rs b/milli/src/search/new/ranking_rule_graph/proximity/build.rs index 0d7e68272..fbe3c8169 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs @@ -13,16 +13,14 @@ use crate::{Index, Result}; pub fn visit_from_node(from_node: &QueryNode) -> Result> { Ok(Some(match from_node { - QueryNode::Term(LocatedQueryTerm { value: value1, positions: pos1 }) => { - match value1 { - QueryTerm::Word { derivations } => (derivations.clone(), *pos1.end()), - QueryTerm::Phrase { phrase: phrase1 } => { - // TODO: remove second unwrap - let original = phrase1.words.last().unwrap().as_ref().unwrap().clone(); + QueryNode::Term(LocatedQueryTerm { value: value1, positions: pos1 }) => match value1 { + QueryTerm::Word { derivations } => (derivations.clone(), *pos1.end()), + QueryTerm::Phrase { phrase: phrase1 } => { + if let Some(original) = phrase1.words.last().unwrap().as_ref() { ( WordDerivations { original: original.clone(), - zero_typo: vec![original], + zero_typo: vec![original.to_owned()], one_typo: vec![], two_typos: vec![], use_prefix_db: false, @@ -31,9 +29,12 @@ pub fn visit_from_node(from_node: &QueryNode) -> Result ( WordDerivations { original: String::new(), @@ -68,26 +69,27 @@ pub fn visit_to_node<'transaction, 'from_data>( let (derivations2, pos2, ngram_len2) = match value2 { QueryTerm::Word { derivations } => (derivations.clone(), *pos2.start(), pos2.len()), QueryTerm::Phrase { phrase: phrase2 } => { - // TODO: remove second unwrap - let original = phrase2.words.last().unwrap().as_ref().unwrap().clone(); - ( - WordDerivations { - original: original.clone(), - zero_typo: vec![original], - one_typo: vec![], - two_typos: vec![], - 
use_prefix_db: false, - synonyms: vec![], - split_words: None, - }, - *pos2.start(), - 1, - ) + if let Some(original) = phrase2.words.first().unwrap().as_ref() { + ( + WordDerivations { + original: original.clone(), + zero_typo: vec![original.to_owned()], + one_typo: vec![], + two_typos: vec![], + use_prefix_db: false, + synonyms: vec![], + split_words: None, + }, + *pos2.start(), + 1, + ) + } else { + // No word pairs if the phrase does not have a regular word as its first term + return Ok(vec![]); + } } }; - // TODO: here we would actually do it for each combination of word1 and word2 - // and take the union of them if pos1 + 1 != pos2 { // TODO: how should this actually be handled? // We want to effectively ignore this pair of terms @@ -130,19 +132,37 @@ pub fn visit_to_node<'transaction, 'from_data>( right_prefix: original_word_2.to_owned(), }); } + if db_cache + .get_prefix_word_pair_proximity_docids( + index, + txn, + original_word_2.as_str(), + word1.as_str(), + proximity as u8 - 1, + )? + .is_some() + { + cost_proximity_word_pairs + .entry(cost) + .or_default() + .entry(proximity as u8) + .or_default() + .push(WordPair::WordPrefixSwapped { + left_prefix: original_word_2.to_owned(), + right: word1.to_owned(), + }); + } } } } let derivations2 = derivations2.all_derivations_except_prefix_db(); - // TODO: safeguard in case the cartesian product is too large? + // TODO: add safeguard in case the cartesian product is too large? let product_derivations = derivations1.cartesian_product(derivations2); for (word1, word2) in product_derivations { for proximity in 1..=(8 - ngram_len2) { let cost = (proximity + ngram_len2 - 1) as u8; - // TODO: do the opposite way with a proximity penalty as well! - // search for (word2, word1, proximity-1), I guess? if db_cache .get_word_pair_proximity_docids(index, txn, word1, word2, proximity as u8)? 
.is_some() @@ -154,6 +174,18 @@ pub fn visit_to_node<'transaction, 'from_data>( .or_default() .push(WordPair::Words { left: word1.to_owned(), right: word2.to_owned() }); } + if proximity > 1 + && db_cache + .get_word_pair_proximity_docids(index, txn, word2, word1, proximity as u8 - 1)? + .is_some() + { + cost_proximity_word_pairs + .entry(cost) + .or_default() + .entry(proximity as u8 - 1) + .or_default() + .push(WordPair::Words { left: word2.to_owned(), right: word1.to_owned() }); + } } } let mut new_edges = cost_proximity_word_pairs diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index 908f50ef6..34f7deea1 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -20,11 +20,8 @@ pub fn compute_docids<'transaction>( } WordPair::WordPrefix { left, right_prefix } => db_cache .get_word_prefix_pair_proximity_docids(index, txn, left, right_prefix, *proximity), - WordPair::WordsSwapped { left, right } => { - db_cache.get_word_pair_proximity_docids(index, txn, left, right, *proximity) - } - WordPair::WordPrefixSwapped { left, right_prefix } => db_cache - .get_prefix_word_pair_proximity_docids(index, txn, left, right_prefix, *proximity), + WordPair::WordPrefixSwapped { left_prefix, right } => db_cache + .get_prefix_word_pair_proximity_docids(index, txn, left_prefix, right, *proximity), }?; let bitmap = bytes.map(CboRoaringBitmapCodec::deserialize_from).transpose()?.unwrap_or_default(); diff --git a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs index 5b3869ea8..c0dbbefa9 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs @@ -18,9 +18,8 @@ use crate::{Index, Result}; #[derive(Debug, Clone)] pub enum WordPair { Words { left: 
String, right: String }, - WordsSwapped { left: String, right: String }, WordPrefix { left: String, right_prefix: String }, - WordPrefixSwapped { left: String, right_prefix: String }, + WordPrefixSwapped { left_prefix: String, right: String }, } #[derive(Clone)] diff --git a/milli/src/search/new/ranking_rule_graph/resolve_paths.rs b/milli/src/search/new/ranking_rule_graph/resolve_paths.rs index f3394206b..94a51756e 100644 --- a/milli/src/search/new/ranking_rule_graph/resolve_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/resolve_paths.rs @@ -24,9 +24,13 @@ impl RankingRuleGraph { mut paths: Vec>, ) -> Result { paths.sort_unstable(); + // let mut needs_filtering_empty_edges = false; + // let mut needs_filtering_empty_prefix = false; + // let mut needs_filtering_empty_couple_edges = false; let mut needs_filtering = false; let mut path_bitmaps = vec![]; 'path_loop: loop { + // TODO: distinguish between empty_edges, empty_prefix, and empty_couple_edges filtering if needs_filtering { for path in paths.iter_mut() { if empty_paths_cache.path_is_empty(path) { @@ -61,11 +65,13 @@ impl RankingRuleGraph { self.remove_edge(edge_index); edge_docids_cache.cache.remove(&edge_index); needs_filtering = true; + // needs_filtering_empty_edges = true; // 3. 
continue executing this function again on the remaining paths continue 'path_loop; } else { path_bitmap &= edge_docids; if path_bitmap.is_disjoint(universe) { + // needs_filtering_empty_prefix = true; needs_filtering = true; empty_paths_cache.forbid_prefix(&visited_edges); // if the intersection between this edge and any @@ -76,6 +82,7 @@ impl RankingRuleGraph { { let intersection = edge_docids & edge_docids2; if intersection.is_disjoint(universe) { + // needs_filtering_empty_couple_edges = true; empty_paths_cache .forbid_couple_edges(*edge_index2, edge_index); } diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs index 9b3bcb38c..bc0523b31 100644 --- a/milli/src/search/new/ranking_rules.rs +++ b/milli/src/search/new/ranking_rules.rs @@ -98,20 +98,6 @@ pub struct RankingRuleOutput { pub candidates: RoaringBitmap, } -#[allow(unused)] -pub fn get_start_universe<'transaction>( - index: &Index, - txn: &'transaction RoTxn, - db_cache: &mut DatabaseCache<'transaction>, - query_graph: &QueryGraph, - term_matching_strategy: TermsMatchingStrategy, - // filters: Filters, -) -> Result { - // TODO: actually compute the universe from the query graph - let universe = index.documents_ids(txn).unwrap(); - Ok(universe) -} - // TODO: can make it generic over the query type (either query graph or placeholder) fairly easily #[allow(clippy::too_many_arguments)] pub fn execute_search<'transaction>(