From 2da86b31a6097d0b4bb44de54d0ebe69fcd1315f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 6 Jun 2023 11:31:26 +0200 Subject: [PATCH] Remove comments and add documentation --- milli/src/search/new/distinct.rs | 1 - milli/src/search/new/exact_attribute.rs | 2 +- milli/src/search/new/interner.rs | 2 +- milli/src/search/new/limits.rs | 1 - milli/src/search/new/query_graph.rs | 4 +- milli/src/search/new/query_term/mod.rs | 4 - .../src/search/new/query_term/parse_query.rs | 3 - .../new/ranking_rule_graph/cheapest_paths.rs | 109 +++++++++++++++--- .../condition_docids_cache.rs | 6 +- .../new/ranking_rule_graph/dead_ends_cache.rs | 3 +- .../search/new/ranking_rule_graph/fid/mod.rs | 9 +- .../new/ranking_rule_graph/position/mod.rs | 7 +- .../proximity/compute_docids.rs | 16 +-- milli/src/search/new/resolve_query_graph.rs | 10 -- milli/src/search/new/sort.rs | 4 - milli/src/search/new/tests/distinct.rs | 2 +- milli/src/search/new/tests/proximity.rs | 16 +-- milli/src/search/new/tests/proximity_typo.rs | 7 +- milli/src/search/new/tests/typo.rs | 4 +- 19 files changed, 117 insertions(+), 93 deletions(-)
diff --git a/milli/src/search/new/distinct.rs b/milli/src/search/new/distinct.rs index fbb7550a9..fff96bd5d 100644 --- a/milli/src/search/new/distinct.rs +++ b/milli/src/search/new/distinct.rs @@ -26,7 +26,6 @@ pub fn apply_distinct_rule( ctx: &mut SearchContext, field_id: u16, candidates: &RoaringBitmap, - // TODO: add a universe here, such that the `excluded` are a subset of the universe? ) -> Result { let mut excluded = RoaringBitmap::new(); let mut remaining = RoaringBitmap::new();
diff --git a/milli/src/search/new/exact_attribute.rs b/milli/src/search/new/exact_attribute.rs index dc9c95d3d..6e0381295 100644 --- a/milli/src/search/new/exact_attribute.rs +++ b/milli/src/search/new/exact_attribute.rs @@ -206,7 +206,7 @@ impl State { )?; intersection &= &candidates; if !intersection.is_empty() { - // TODO: although not really worth it in terms of performance, + // Although not really worth it in terms of performance, // it would be good to put this in cache for the sake of consistency let candidates_with_exact_word_count = if count_all_positions < u8::MAX as usize { ctx.index
diff --git a/milli/src/search/new/interner.rs b/milli/src/search/new/interner.rs index ebf18f38c..c2d325a86 100644 --- a/milli/src/search/new/interner.rs +++ b/milli/src/search/new/interner.rs @@ -32,7 +32,7 @@ impl Interned { #[derive(Clone)] pub struct DedupInterner { stable_store: Vec, - lookup: FxHashMap>, // TODO: Arc + lookup: FxHashMap>, } impl Default for DedupInterner { fn default() -> Self {
diff --git a/milli/src/search/new/limits.rs b/milli/src/search/new/limits.rs index 33a5a4a6c..d08946424 100644 --- a/milli/src/search/new/limits.rs +++ b/milli/src/search/new/limits.rs @@ -1,5 +1,4 @@ /// Maximum number of tokens we consider in a single search. -// TODO: Loic, find proper value here so we don't overflow the interner. pub const MAX_TOKEN_COUNT: usize = 1_000; /// Maximum number of prefixes that can be derived from a single word.
diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index dc25d1bc3..114eb8c4e 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -92,7 +92,7 @@ impl QueryGraph { /// which contains ngrams.
pub fn from_query( ctx: &mut SearchContext, - // NOTE: the terms here must be consecutive + // The terms here must be consecutive terms: &[LocatedQueryTerm], ) -> Result<(QueryGraph, Vec)> { let mut new_located_query_terms = terms.to_vec(); @@ -103,7 +103,7 @@ impl QueryGraph { let root_node = 0; let end_node = 1; - // TODO: we could consider generalizing to 4,5,6,7,etc. ngrams + // We could consider generalizing to 4,5,6,7,etc. ngrams let (mut prev2, mut prev1, mut prev0): (Vec, Vec, Vec) = (vec![], vec![], vec![root_node]);
diff --git a/milli/src/search/new/query_term/mod.rs b/milli/src/search/new/query_term/mod.rs index fb749a797..8db843037 100644 --- a/milli/src/search/new/query_term/mod.rs +++ b/milli/src/search/new/query_term/mod.rs @@ -132,7 +132,6 @@ impl QueryTermSubset { if full_query_term.ngram_words.is_some() { return None; } - // TODO: included in subset if let Some(phrase) = full_query_term.zero_typo.phrase { self.zero_typo_subset.contains_phrase(phrase).then_some(ExactTerm::Phrase(phrase)) } else if let Some(word) = full_query_term.zero_typo.exact { @@ -182,7 +181,6 @@ impl QueryTermSubset { let word = match &self.zero_typo_subset { NTypoTermSubset::All => Some(use_prefix_db), NTypoTermSubset::Subset { words, phrases: _ } => { - // TODO: use a subset of prefix words instead if words.contains(&use_prefix_db) { Some(use_prefix_db) } else { @@ -204,7 +202,6 @@ impl QueryTermSubset { ctx: &mut SearchContext, ) -> Result> { let mut result = BTreeSet::default(); - // TODO: a compute_partially funtion if !self.one_typo_subset.is_empty() || !self.two_typo_subset.is_empty() { self.original.compute_fully_if_needed(ctx)?; } @@ -300,7 +297,6 @@ impl QueryTermSubset { let mut result = BTreeSet::default(); if !self.one_typo_subset.is_empty() { - // TODO: compute less than fully if possible self.original.compute_fully_if_needed(ctx)?; } let original = ctx.term_interner.get_mut(self.original);
diff --git a/milli/src/search/new/query_term/parse_query.rs b/milli/src/search/new/query_term/parse_query.rs index 6f146b208..5e97d6578 100644 --- a/milli/src/search/new/query_term/parse_query.rs +++ b/milli/src/search/new/query_term/parse_query.rs @@ -139,7 +139,6 @@ pub fn number_of_typos_allowed<'ctx>( let min_len_one_typo = ctx.index.min_word_len_one_typo(ctx.txn)?; let min_len_two_typos = ctx.index.min_word_len_two_typos(ctx.txn)?; - // TODO: should `exact_words` also disable prefix search, ngrams, split words, or synonyms? let exact_words = ctx.index.exact_words(ctx.txn)?; Ok(Box::new(move |word: &str| { @@ -250,8 +249,6 @@ impl PhraseBuilder { } else { // token has kind Word let word = ctx.word_interner.insert(token.lemma().to_string()); - // TODO: in a phrase, check that every word exists - // otherwise return an empty term self.words.push(Some(word)); } }
diff --git a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs index 8fd943e6e..e93a91d29 100644 --- a/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs +++ b/milli/src/search/new/ranking_rule_graph/cheapest_paths.rs @@ -1,5 +1,48 @@ -#![allow(clippy::too_many_arguments)] +/** Implements a "PathVisitor" which finds all paths of a certain cost from the START to END node of a ranking rule graph. A path is a list of conditions. A condition is the data associated with an edge, given by the ranking rule. Some edges don't have a condition associated with them; they are "unconditional". These kinds of edges are used to "skip" a node.
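For example (purely illustrative, not the output of any real ranking rule): in the proximity graph, a path from START to END could be the list of conditions [`"sun" within proximity 1 of "flower"`, `"flower" within proximity 2 of "wilting"`], and the cost of that path would be the sum of the costs of its edges, here 1 + 2 = 3.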
+
The algorithm uses a depth-first search. It benefits from two main optimisations: - The list of all possible costs to go from any node to the END node is precomputed - The `DeadEndsCache` reduces the number of valid paths drastically, by making some edges untraversable depending on what other edges were selected.
These two optimisations are meant to avoid traversing edges that wouldn't lead to a valid path. In practically all cases, we avoid the exponential complexity that is inherent to depth-first search in a large ranking rule graph.
The DeadEndsCache is a sort of prefix tree which associates a list of forbidden conditions to a list of traversed conditions. For example, the DeadEndsCache could say the following: - Immediately, from the start, the conditions `[a,b]` are forbidden - if we take the condition `c`, then the conditions `[e]` are also forbidden - and if after that, we take `f`, then `[h,i]` are also forbidden - etc. - if we take `g`, then `[f]` is also forbidden - etc. - etc. As we traverse the graph, we also traverse the `DeadEndsCache` and keep a list of forbidden conditions in memory. Then, we know to avoid all edges which have a condition that is forbidden.
When a path is found from START to END, we give it to the `visit` closure. This closure takes a mutable reference to the `DeadEndsCache`. This means that the caller can update this cache. Therefore, we must handle the case where the DeadEndsCache has been updated. This means potentially backtracking up to the point where the traversed conditions are all allowed by the new DeadEndsCache.
The algorithm also implements the `TermsMatchingStrategy` logic. Some edges are augmented with a list of "nodes_to_skip". Skipping a node means "reaching this node through an unconditional edge". If we have already traversed (i.e. not skipped) a node that is in this list, then we know that we can't traverse this edge. Otherwise, we traverse the edge but make sure to skip any future node that was present in the "nodes_to_skip" list.
The caller can decide to stop the path finding algorithm by returning a `ControlFlow::Break` from the `visit` closure. */ use std::collections::{BTreeSet, VecDeque}; use std::iter::FromIterator; use std::ops::ControlFlow; @@ -12,30 +55,41 @@ use crate::search::new::query_graph::QueryNode; use crate::search::new::small_bitmap::SmallBitmap; use crate::Result;
+/// Closure which processes a path found by the `PathVisitor` type VisitFn<'f, G> = &'f mut dyn FnMut( + // the path as a list of conditions &[Interned<::Condition>], &mut RankingRuleGraph, + // a mutable reference to the DeadEndsCache, to update it in case the given + // path doesn't resolve to any valid document ids &mut DeadEndsCache<::Condition>, ) -> Result>;
+/// A structure which is kept but not updated during the traversal of the graph. +/// It can however be updated by the `visit` closure once a valid path has been found. struct VisitorContext<'a, G: RankingRuleGraphTrait> { graph: &'a mut RankingRuleGraph, all_costs_from_node: &'a MappedInterner>, dead_ends_cache: &'a mut DeadEndsCache, }
+/// The internal state of the traversal algorithm struct VisitorState { + /// Budget from the current node to the end node remaining_cost: u64, - + /// Previously visited conditions, in order. path: Vec>, - + /// Previously visited conditions, as an efficient and compact set. visited_conditions: SmallBitmap, + /// Previously visited (i.e. not skipped) nodes, as an efficient and compact set.
visited_nodes: SmallBitmap, - + /// The conditions that cannot be visited anymore forbidden_conditions: SmallBitmap, - forbidden_conditions_to_nodes: SmallBitmap, + /// The nodes that cannot be visited anymore (they must be skipped) + nodes_to_skip: SmallBitmap, }
+/// See module documentation pub struct PathVisitor<'a, G: RankingRuleGraphTrait> { state: VisitorState, ctx: VisitorContext<'a, G>, } @@ -56,14 +110,13 @@ impl<'a, G: RankingRuleGraphTrait> PathVisitor<'a, G> { forbidden_conditions: SmallBitmap::for_interned_values_in( &graph.conditions_interner, ), - forbidden_conditions_to_nodes: SmallBitmap::for_interned_values_in( - &graph.query_graph.nodes, - ), + nodes_to_skip: SmallBitmap::for_interned_values_in(&graph.query_graph.nodes), }, ctx: VisitorContext { graph, all_costs_from_node, dead_ends_cache }, } }
+ /// See module documentation pub fn visit_paths(mut self, visit: VisitFn) -> Result<()> { let _ = self.state.visit_node(self.ctx.graph.query_graph.root_node, visit, &mut self.ctx)?; @@ -72,22 +125,31 @@ impl<'a, G: RankingRuleGraphTrait> PathVisitor<'a, G> { } impl VisitorState { + /// Visits a node: traverses all its valid conditional and unconditional edges. + /// + /// Returns ControlFlow::Break if the path finding algorithm should stop. + /// Returns whether a valid path was found from this node otherwise. fn visit_node( &mut self, from_node: Interned, visit: VisitFn, ctx: &mut VisitorContext, ) -> Result> { + // `any_valid` records whether any valid path was found from this point + // if a valid path was found, then we know that the DeadEndsCache may have been updated, + // and we will need to do more work to potentially backtrack let mut any_valid = false; let edges = ctx.graph.edges_of_node.get(from_node).clone(); for edge_idx in edges.iter() { + // could be None if the edge was deleted let Some(edge) = ctx.graph.edges_store.get(edge_idx).clone() else { continue }; if self.remaining_cost < edge.cost as u64 { continue; } self.remaining_cost -= edge.cost as u64; + let cf = match edge.condition { Some(condition) => self.visit_condition( condition, @@ -119,6 +181,10 @@ impl VisitorState { Ok(ControlFlow::Continue(any_valid)) }
+ /// Visits an unconditional edge. + /// + /// Returns ControlFlow::Break if the path finding algorithm should stop. + /// Returns whether a valid path was found from this node otherwise. fn visit_no_condition( &mut self, dest_node: Interned, @@ -134,20 +200,29 @@ impl VisitorState { { return Ok(ControlFlow::Continue(false)); } + // We've reached the END node! if dest_node == ctx.graph.query_graph.end_node { let control_flow = visit(&self.path, ctx.graph, ctx.dead_ends_cache)?; + // We could change the return type of the visit closure such that the caller + // tells us whether the dead ends cache was updated or not. + // Alternatively, maybe the DeadEndsCache should have a generation number + // attached to it, so that we don't need to play with these booleans at all. match control_flow { ControlFlow::Continue(_) => Ok(ControlFlow::Continue(true)), ControlFlow::Break(_) => Ok(ControlFlow::Break(())), } } else { - let old_fbct = self.forbidden_conditions_to_nodes.clone(); - self.forbidden_conditions_to_nodes.union(edge_new_nodes_to_skip); + let old_nodes_to_skip = self.nodes_to_skip.clone(); + self.nodes_to_skip.union(edge_new_nodes_to_skip); let cf = self.visit_node(dest_node, visit, ctx)?; - self.forbidden_conditions_to_nodes = old_fbct; + self.nodes_to_skip = old_nodes_to_skip; Ok(cf) } }
+ /// Visits a conditional edge. + /// + /// Returns ControlFlow::Break if the path finding algorithm should stop.
+ /// Returns whether a valid path was found from this node otherwise. fn visit_condition( &mut self, condition: Interned, @@ -159,7 +234,7 @@ impl VisitorState { assert!(dest_node != ctx.graph.query_graph.end_node); if self.forbidden_conditions.contains(condition) - || self.forbidden_conditions_to_nodes.contains(dest_node) + || self.nodes_to_skip.contains(dest_node) || edge_new_nodes_to_skip.intersects(&self.visited_nodes) { return Ok(ControlFlow::Continue(false)); @@ -180,19 +255,19 @@ impl VisitorState { self.visited_nodes.insert(dest_node); self.visited_conditions.insert(condition); - let old_fc = self.forbidden_conditions.clone(); + let old_forb_cond = self.forbidden_conditions.clone(); if let Some(next_forbidden) = ctx.dead_ends_cache.forbidden_conditions_after_prefix(self.path.iter().copied()) { self.forbidden_conditions.union(&next_forbidden); } - let old_fctn = self.forbidden_conditions_to_nodes.clone(); - self.forbidden_conditions_to_nodes.union(edge_new_nodes_to_skip); + let old_nodes_to_skip = self.nodes_to_skip.clone(); + self.nodes_to_skip.union(edge_new_nodes_to_skip); let cf = self.visit_node(dest_node, visit, ctx)?; - self.forbidden_conditions_to_nodes = old_fctn; - self.forbidden_conditions = old_fc; + self.nodes_to_skip = old_nodes_to_skip; + self.forbidden_conditions = old_forb_cond; self.visited_conditions.remove(condition); self.visited_nodes.remove(dest_node);
diff --git a/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs b/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs index d0fcd8bd8..5d199c82a 100644 --- a/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/condition_docids_cache.rs @@ -9,12 +9,8 @@ use crate::search::new::query_term::LocatedQueryTermSubset; use crate::search::new::SearchContext; use crate::Result; -// TODO: give a generation to each universe, then be able to get the exact -// delta of docids between two universes of different generations! - /// A cache storing the document ids associated with each ranking rule edge pub struct ConditionDocIdsCache { - // TOOD: should be a mapped interner? pub cache: FxHashMap, ComputedCondition>, _phantom: PhantomData, } @@ -54,7 +50,7 @@ impl ConditionDocIdsCache { } let condition = graph.conditions_interner.get_mut(interned_condition); let computed = G::resolve_condition(ctx, condition, universe)?; - // TODO: if computed.universe_len != universe.len() ? + // Can we put an assert here for computed.universe_len == universe.len()?
let _ = self.cache.insert(interned_condition, computed); let computed = &self.cache[&interned_condition]; Ok(computed) diff --git a/milli/src/search/new/ranking_rule_graph/dead_ends_cache.rs b/milli/src/search/new/ranking_rule_graph/dead_ends_cache.rs index 4bbf91fcd..bac25da82 100644 --- a/milli/src/search/new/ranking_rule_graph/dead_ends_cache.rs +++ b/milli/src/search/new/ranking_rule_graph/dead_ends_cache.rs @@ -2,6 +2,7 @@ use crate::search::new::interner::{FixedSizeInterner, Interned}; use crate::search::new::small_bitmap::SmallBitmap; pub struct DeadEndsCache { + // conditions and next could/should be part of the same vector conditions: Vec>, next: Vec, pub forbidden: SmallBitmap, @@ -27,7 +28,7 @@ impl DeadEndsCache { self.forbidden.insert(condition); } - pub fn advance(&mut self, condition: Interned) -> Option<&mut Self> { + fn advance(&mut self, condition: Interned) -> Option<&mut Self> { if let Some(idx) = self.conditions.iter().position(|c| *c == condition) { Some(&mut self.next[idx]) } else { diff --git a/milli/src/search/new/ranking_rule_graph/fid/mod.rs b/milli/src/search/new/ranking_rule_graph/fid/mod.rs index 0f2cceaec..e3ccf23fa 100644 --- a/milli/src/search/new/ranking_rule_graph/fid/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/fid/mod.rs @@ -69,14 +69,9 @@ impl RankingRuleGraphTrait for FidGraph { let mut edges = vec![]; for fid in all_fields { - // TODO: We can improve performances and relevancy by storing - // the term subsets associated to each field ids fetched. edges.push(( - fid as u32 * term.term_ids.len() as u32, // TODO improve the fid score i.e. fid^10. - conditions_interner.insert(FidCondition { - term: term.clone(), // TODO remove this ugly clone - fid, - }), + fid as u32 * term.term_ids.len() as u32, + conditions_interner.insert(FidCondition { term: term.clone(), fid }), )); } diff --git a/milli/src/search/new/ranking_rule_graph/position/mod.rs b/milli/src/search/new/ranking_rule_graph/position/mod.rs index 9b0b6478f..c2e3b9012 100644 --- a/milli/src/search/new/ranking_rule_graph/position/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/position/mod.rs @@ -94,14 +94,9 @@ impl RankingRuleGraphTrait for PositionGraph { let mut edges = vec![]; for (cost, positions) in positions_for_costs { - // TODO: We can improve performances and relevancy by storing - // the term subsets associated to each position fetched edges.push(( cost, - conditions_interner.insert(PositionCondition { - term: term.clone(), // TODO remove this ugly clone - positions, - }), + conditions_interner.insert(PositionCondition { term: term.clone(), positions }), )); } diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index 057779a22..29a1876b4 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -65,13 +65,6 @@ pub fn compute_docids( } } - // TODO: add safeguard in case the cartesian product is too large! - // even if we restrict the word derivations to a maximum of 100, the size of the - // caterisan product could reach a maximum of 10_000 derivations, which is way too much. - // Maybe prioritise the product of zero typo derivations, then the product of zero-typo/one-typo - // + one-typo/zero-typo, then one-typo/one-typo, then ... until an arbitrary limit has been - // reached - for (left_phrase, left_word) in last_words_of_term_derivations(ctx, &left_term.term_subset)? 
{ // Before computing the edges, check that the left word and left phrase // aren't disjoint with the universe, but only do it if there is more than @@ -111,8 +104,6 @@ pub fn compute_docids( Ok(ComputedCondition { docids, universe_len: universe.len(), - // TODO: think about whether we want to reduce the subset, - // we probably should! start_term_subset: Some(left_term.clone()), end_term_subset: right_term.clone(), }) @@ -203,12 +194,7 @@ fn compute_non_prefix_edges( *docids |= new_docids; } } - if backward_proximity >= 1 - // TODO: for now, we don't do any swapping when either term is a phrase - // but maybe we should. We'd need to look at the first/last word of the phrase - // depending on the context. - && left_phrase.is_none() && right_phrase.is_none() - { + if backward_proximity >= 1 && left_phrase.is_none() && right_phrase.is_none() { if let Some(new_docids) = ctx.get_db_word_pair_proximity_docids(word2, word1, backward_proximity)? { diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index 797db5875..d992cd22f 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -33,8 +33,6 @@ pub fn compute_query_term_subset_docids( ctx: &mut SearchContext, term: &QueryTermSubset, ) -> Result { - // TODO Use the roaring::MultiOps trait - let mut docids = RoaringBitmap::new(); for word in term.all_single_words_except_prefix_db(ctx)? { if let Some(word_docids) = ctx.word_docids(word)? { @@ -59,8 +57,6 @@ pub fn compute_query_term_subset_docids_within_field_id( term: &QueryTermSubset, fid: u16, ) -> Result { - // TODO Use the roaring::MultiOps trait - let mut docids = RoaringBitmap::new(); for word in term.all_single_words_except_prefix_db(ctx)? { if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(word.interned(), fid)? { @@ -71,7 +67,6 @@ pub fn compute_query_term_subset_docids_within_field_id( for phrase in term.all_phrases(ctx)? { // There may be false positives when resolving a phrase, so we're not // guaranteed that all of its words are within a single fid. - // TODO: fix this? if let Some(word) = phrase.words(ctx).iter().flatten().next() { if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(*word, fid)? { docids |= ctx.get_phrase_docids(phrase)? & word_fid_docids; @@ -95,7 +90,6 @@ pub fn compute_query_term_subset_docids_within_position( term: &QueryTermSubset, position: u16, ) -> Result { - // TODO Use the roaring::MultiOps trait let mut docids = RoaringBitmap::new(); for word in term.all_single_words_except_prefix_db(ctx)? { if let Some(word_position_docids) = @@ -108,7 +102,6 @@ pub fn compute_query_term_subset_docids_within_position( for phrase in term.all_phrases(ctx)? { // It's difficult to know the expected position of the words in the phrase, // so instead we just check the first one. - // TODO: fix this? if let Some(word) = phrase.words(ctx).iter().flatten().next() { if let Some(word_position_docids) = ctx.get_db_word_position_docids(*word, position)? { docids |= ctx.get_phrase_docids(phrase)? 
& word_position_docids @@ -132,9 +125,6 @@ pub fn compute_query_graph_docids( q: &QueryGraph, universe: &RoaringBitmap, ) -> Result { - // TODO: there must be a faster way to compute this big - // roaring bitmap expression - let mut nodes_resolved = SmallBitmap::for_interned_values_in(&q.nodes); let mut path_nodes_docids = q.nodes.map(|_| RoaringBitmap::new()); diff --git a/milli/src/search/new/sort.rs b/milli/src/search/new/sort.rs index 53144d00d..3f57b2aa5 100644 --- a/milli/src/search/new/sort.rs +++ b/milli/src/search/new/sort.rs @@ -141,10 +141,6 @@ impl<'ctx, Query: RankingRuleQueryTrait> RankingRule<'ctx, Query> for Sort<'ctx, universe: &RoaringBitmap, ) -> Result>> { let iter = self.iter.as_mut().unwrap(); - // TODO: we should make use of the universe in the function below - // good for correctness, but ideally iter.next_bucket would take the current universe into account, - // as right now it could return buckets that don't intersect with the universe, meaning we will make many - // unneeded calls. if let Some(mut bucket) = iter.next_bucket()? { bucket.candidates &= universe; Ok(Some(bucket)) diff --git a/milli/src/search/new/tests/distinct.rs b/milli/src/search/new/tests/distinct.rs index 2c147d514..c54600f27 100644 --- a/milli/src/search/new/tests/distinct.rs +++ b/milli/src/search/new/tests/distinct.rs @@ -527,7 +527,7 @@ fn test_distinct_all_candidates() { let SearchResult { documents_ids, candidates, .. } = s.execute().unwrap(); let candidates = candidates.iter().collect::>(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 26, 4, 7, 17, 23, 1, 19, 25, 8, 20, 24]"); - // TODO: this is incorrect! + // This is incorrect, but unfortunately impossible to do better efficiently. insta::assert_snapshot!(format!("{candidates:?}"), @"[1, 4, 7, 8, 14, 17, 19, 20, 23, 24, 25, 26]"); } diff --git a/milli/src/search/new/tests/proximity.rs b/milli/src/search/new/tests/proximity.rs index 401508866..6e4181a95 100644 --- a/milli/src/search/new/tests/proximity.rs +++ b/milli/src/search/new/tests/proximity.rs @@ -122,11 +122,11 @@ fn create_edge_cases_index() -> TempIndex { sta stb stc ste stf stg sth sti stj stk stl stm stn sto stp stq str stst stt stu stv stw stx sty stz " }, - // The next 5 documents lay out a trap with the split word, phrase search, or synonym `sun flower`. - // If the search query is "sunflower", the split word "Sun Flower" will match some documents. + // The next 5 documents lay out a trap with the split word, phrase search, or synonym `sun flower`. + // If the search query is "sunflower", the split word "Sun Flower" will match some documents. // If the query is `sunflower wilting`, then we should make sure that - // the sprximity condition `flower wilting: sprx N` also comes with the condition - // `sun wilting: sprx N+1`. TODO: this is not the exact condition we use for now. + // the proximity condition `flower wilting: sprx N` also comes with the condition + // `sun wilting: sprx N+1`, but this is not the exact condition we use for now. // We only check that the phrase `sun flower` exists and `flower wilting: sprx N`, which // is better than nothing but not the best. 
{ "id": 2, "text": "Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat." }, { "id": 3, - // This document matches the query `sunflower wilting`, but the sprximity condition + // This document matches the query `sunflower wilting`, but the proximity condition // between `sunflower` and `wilting` cannot be through the split-word `Sun Flower` // which would reduce to only `flower` and `wilting` being in sprximity. "text": "A flower wilting under the sun, unlike a sunflower" }, @@ -299,7 +299,7 @@ fn test_proximity_split_word() { let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 5, 1, 3]"); let texts = collect_field_values(&index, &txn, "text", &documents_ids); - // TODO: "2" and "4" should be swapped ideally + // "2" and "4" should be swapped ideally insta::assert_debug_snapshot!(texts, @r###" [ "\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"", @@ -316,7 +316,7 @@ fn test_proximity_split_word() { let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 1]"); let texts = collect_field_values(&index, &txn, "text", &documents_ids); - // TODO: "2" and "4" should be swapped ideally + // "2" and "4" should be swapped ideally insta::assert_debug_snapshot!(texts, @r###" [ "\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"", @@ -341,7 +341,7 @@ fn test_proximity_split_word() { let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 4, 1]"); let texts = collect_field_values(&index, &txn, "text", &documents_ids); - // TODO: "2" and "4" should be swapped ideally + // "2" and "4" should be swapped ideally insta::assert_debug_snapshot!(texts, @r###" [ "\"Sun Flower sounds like the title of a painting, maybe about a flower wilting under the heat.\"",
diff --git a/milli/src/search/new/tests/proximity_typo.rs b/milli/src/search/new/tests/proximity_typo.rs index ab98f99c0..b459b178b 100644 --- a/milli/src/search/new/tests/proximity_typo.rs +++ b/milli/src/search/new/tests/proximity_typo.rs @@ -2,9 +2,8 @@ This module tests the interactions between the proximity and typo ranking rules. The proximity ranking rule should transform the query graph such that it -only contains the word pairs that it used to compute its bucket. - -TODO: This is not currently implemented. +only contains the word pairs that it used to compute its bucket, but this is not currently +implemented. */ use crate::index::tests::TempIndex; @@ -64,7 +63,7 @@ fn test_trap_basic() { let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1]"); let texts = collect_field_values(&index, &txn, "text", &documents_ids); - // TODO: this is incorrect, 1 should come before 0 + // This is incorrect: 1 should come before 0 insta::assert_debug_snapshot!(texts, @r###" [ "\"summer. holiday. sommer holidty\"",
diff --git a/milli/src/search/new/tests/typo.rs b/milli/src/search/new/tests/typo.rs index 8fd9de5fc..536f6653d 100644 --- a/milli/src/search/new/tests/typo.rs +++ b/milli/src/search/new/tests/typo.rs @@ -571,8 +571,8 @@ fn test_typo_synonyms() { s.terms_matching_strategy(TermsMatchingStrategy::All); s.query("the fast brownish fox jumps over the lackadaisical dog"); - // TODO: is this correct? interaction of ngrams + synonyms means that the - // multi-word synonyms end up having a typo cost.
This is probably not what we want. + // The interaction of ngrams + synonyms means that the multi-word synonyms end up having a typo cost. + // This is probably not what we want. let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[21, 0, 22]"); let texts = collect_field_values(&index, &txn, "text", &documents_ids);
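To make the dead-ends mechanism described in the new `cheapest_paths.rs` module documentation concrete, here is a minimal standalone sketch of the same idea. It is not milli's implementation: `ToyDeadEndsCache`, the plain `usize` condition ids, and the `HashSet`s are illustrative stand-ins for the interned conditions and `SmallBitmap`s used in the patch, but it models the same prefix tree and the same `forbidden_conditions_after_prefix` lookup that `visit_condition` performs while walking the graph.

```rust
use std::collections::HashSet;

/// A simplified model of the `DeadEndsCache`: a prefix tree which maps a
/// sequence of traversed conditions to the set of conditions that become
/// forbidden once that prefix has been traversed.
#[derive(Default)]
struct ToyDeadEndsCache {
    // `conditions[i]` is the condition leading to the subtree `next[i]`.
    conditions: Vec<usize>,
    next: Vec<ToyDeadEndsCache>,
    // Conditions forbidden after traversing the prefix leading to this node.
    forbidden: HashSet<usize>,
}

impl ToyDeadEndsCache {
    /// Record that `condition` is forbidden after traversing `prefix`.
    fn forbid_after_prefix(&mut self, prefix: &[usize], condition: usize) {
        match prefix.split_first() {
            None => {
                self.forbidden.insert(condition);
            }
            Some((&first, rest)) => {
                let idx = match self.conditions.iter().position(|&c| c == first) {
                    Some(idx) => idx,
                    None => {
                        self.conditions.push(first);
                        self.next.push(ToyDeadEndsCache::default());
                        self.conditions.len() - 1
                    }
                };
                self.next[idx].forbid_after_prefix(rest, condition);
            }
        }
    }

    /// The union of all `forbidden` sets found along `prefix`: every condition
    /// that an edge must not carry once this prefix has been traversed.
    fn forbidden_conditions_after_prefix(&self, prefix: &[usize]) -> HashSet<usize> {
        let mut forbidden = self.forbidden.clone();
        let mut cursor = self;
        for c in prefix {
            match cursor.conditions.iter().position(|x| x == c) {
                Some(idx) => {
                    cursor = &cursor.next[idx];
                    forbidden.extend(cursor.forbidden.iter().copied());
                }
                None => break,
            }
        }
        forbidden
    }
}

fn main() {
    // Encode the example from the module documentation:
    // - immediately, from the start, conditions a=0 and b=1 are forbidden
    // - if we take condition c=2, then e=4 is also forbidden
    // - and if after that we take f=5, then h=7 and i=8 are also forbidden
    let mut cache = ToyDeadEndsCache::default();
    cache.forbid_after_prefix(&[], 0);
    cache.forbid_after_prefix(&[], 1);
    cache.forbid_after_prefix(&[2], 4);
    cache.forbid_after_prefix(&[2, 5], 7);
    cache.forbid_after_prefix(&[2, 5], 8);

    // While the visitor's current path is [2, 5], every condition in this set
    // must be avoided, like the `forbidden_conditions` bitmap in `VisitorState`.
    let forbidden = cache.forbidden_conditions_after_prefix(&[2, 5]);
    assert_eq!(forbidden, HashSet::from([0, 1, 4, 7, 8]));
}
```

Keeping the forbidden sets per prefix rather than in one global set is what makes backtracking cheap: when the visitor pops a condition off its path, it simply restores the previous set, as `visit_condition` does with `old_forb_cond` and `old_nodes_to_skip`.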