From 16fefd364ead573faad167750fad89dadb999cc3 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 27 Mar 2023 11:04:04 +0200 Subject: [PATCH] Add TODO notes --- milli/src/search/new/mod.rs | 2 ++ milli/src/search/new/query_graph.rs | 3 +++ milli/src/search/new/query_term.rs | 5 +++++ milli/src/search/new/small_bitmap.rs | 3 +++ milli/src/search/new/sort.rs | 18 ++++++++++++++++++ 5 files changed, 31 insertions(+) diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 45cad378a..194ffb035 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -11,6 +11,7 @@ mod resolve_query_graph; // TODO: documentation + comments mod small_bitmap; // TODO: documentation + comments +// implementation is currently an adaptation of the previous implementation to fit with the new model mod sort; // TODO: documentation + comments mod words; @@ -42,6 +43,7 @@ pub struct SearchContext<'ctx> { pub word_interner: DedupInterner, pub phrase_interner: DedupInterner, pub term_interner: DedupInterner, + // think about memory usage of that field (roaring bitmaps in a hashmap) pub term_docids: QueryTermDocIdsCache, } impl<'ctx> SearchContext<'ctx> { diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index 863ec0045..0f06b9b95 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -119,6 +119,8 @@ impl QueryGraph { impl QueryGraph { /// Build the query graph from the parsed user search query. + /// + /// The ngrams are made at this point. pub fn from_query(ctx: &mut SearchContext, terms: Vec) -> Result { let nbr_typos = number_of_typos_allowed(ctx)?; @@ -219,6 +221,7 @@ impl QueryGraph { } /// Remove the given nodes and all their edges from the query graph. + /// TODO: need to check where this is used, and if this is correct. pub fn remove_nodes(&mut self, nodes: &[Interned]) { for &node_id in nodes { let node = &self.nodes.get(node_id); diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs index e239d4669..0850b2181 100644 --- a/milli/src/search/new/query_term.rs +++ b/milli/src/search/new/query_term.rs @@ -414,6 +414,7 @@ impl QueryTerm { #[derive(Clone)] pub struct LocatedQueryTerm { pub value: Interned, + // TODO: consider changing to u8, or even a u16 pub positions: RangeInclusive, } @@ -425,6 +426,8 @@ impl LocatedQueryTerm { } /// Convert the tokenised search query into a list of located query terms. +// TODO: checking if the positions are correct for phrases, separators, ngrams +// hard-limit the number of tokens that are considered pub fn located_query_terms_from_string( ctx: &mut SearchContext, query: NormalizedTokenIter<&[u8]>, @@ -484,6 +487,7 @@ pub fn located_query_terms_from_string( } } else { let word = token.lemma(); + // eagerly compute all derivations let term = query_term_from_word(ctx, word, nbr_typos(word), true)?; let located_term = LocatedQueryTerm { value: ctx.term_interner.insert(term), @@ -507,6 +511,7 @@ pub fn located_query_terms_from_string( quoted = !quoted; } // if there is a quote or a hard separator we close the phrase. + // TODO: limit phrase size? if !phrase.is_empty() && (quote_count > 0 || separator_kind == SeparatorKind::Hard) { let located_query_term = LocatedQueryTerm { diff --git a/milli/src/search/new/small_bitmap.rs b/milli/src/search/new/small_bitmap.rs index 503bd72f5..24541eb6c 100644 --- a/milli/src/search/new/small_bitmap.rs +++ b/milli/src/search/new/small_bitmap.rs @@ -16,6 +16,9 @@ impl SmallBitmap { pub fn for_interned_values_in(interner: &FixedSizeInterner) -> Self { Self::new(interner.len()) } + // universe_length not stored anywhere, only used to decide between tiny/small + // universe_length: passed 63, actual length will be rounded up 64 + // passed 66, actual 64 * xs.len() as u16 = 128, passed sized rounded up to the next 64 pub fn new(universe_length: u16) -> Self { if universe_length <= 64 { Self { internal: SmallBitmapInternal::Tiny(0), _phantom: PhantomData } diff --git a/milli/src/search/new/sort.rs b/milli/src/search/new/sort.rs index 6277149bd..04152f0f0 100644 --- a/milli/src/search/new/sort.rs +++ b/milli/src/search/new/sort.rs @@ -28,6 +28,21 @@ impl<'ctx, Query> RankingRuleOutputIter<'ctx, Query> for RankingRuleOutputIterWr } } +// `Query` type parameter: the same as the type parameter to bucket_sort +// implements RankingRuleQuery trait, either querygraph or placeholdersearch +// The sort ranking rule doesn't need the query parameter, it is doing the same thing +// whether we're doing a querygraph or placeholder search. +// +// Query Stored anyway because every ranking rule must return a query from next_bucket +// --- +// "Mismatch" between new/old impl.: +// - old impl: roaring bitmap as input, ranking rule iterates other all the buckets +// - new impl: still works like that but it shouldn't, because the universe may change for every call to next_bucket, itself due to: +// 1. elements that were already returned by the ranking rule are subtracted from the universe, also done in the old impl (subtracted from the candidates) +// 2. NEW in the new impl.: distinct rule might have been applied btwn calls to next_bucket +// new impl ignores docs removed in (2), which is a missed perf opt issue, see `next_bucket` +// this perf problem is P2 +// mostly happens when many documents map to the same distinct attribute value. pub struct Sort<'ctx, Query> { field_name: String, field_id: Option, @@ -127,6 +142,9 @@ impl<'ctx, Query: RankingRuleQueryTrait> RankingRule<'ctx, Query> for Sort<'ctx, ) -> Result>> { let iter = self.iter.as_mut().unwrap(); // TODO: we should make use of the universe in the function below + // good for correctness, but ideally iter.next_bucket would take the current universe into account, + // as right now it could return buckets that don't intersect with the universe, meaning we will make many + // unneeded calls. if let Some(mut bucket) = iter.next_bucket()? { bucket.candidates &= universe; Ok(Some(bucket))