From 16fefd364ead573faad167750fad89dadb999cc3 Mon Sep 17 00:00:00 2001
From: Louis Dureuil <louis@meilisearch.com>
Date: Mon, 27 Mar 2023 11:04:04 +0200
Subject: [PATCH] Add TODO notes

---
 milli/src/search/new/mod.rs          |  2 ++
 milli/src/search/new/query_graph.rs  |  3 +++
 milli/src/search/new/query_term.rs   |  5 +++++
 milli/src/search/new/small_bitmap.rs |  3 +++
 milli/src/search/new/sort.rs         | 18 ++++++++++++++++++
 5 files changed, 31 insertions(+)
diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs
index 45cad378a..194ffb035 100644
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@@ -11,6 +11,7 @@ mod resolve_query_graph;
 // TODO: documentation + comments
 mod small_bitmap;
 // TODO: documentation + comments
+// implementation is currently an adaptation of the previous implementation to fit with the new model
 mod sort;
 // TODO: documentation + comments
 mod words;
@@ -42,6 +43,7 @@ pub struct SearchContext<'ctx> {
     pub word_interner: DedupInterner<String>,
     pub phrase_interner: DedupInterner<Phrase>,
     pub term_interner: DedupInterner<QueryTerm>,
+    // think about memory usage of that field (roaring bitmaps in a hashmap)
     pub term_docids: QueryTermDocIdsCache,
 }
 impl<'ctx> SearchContext<'ctx> {
diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs
index 863ec0045..0f06b9b95 100644
--- a/milli/src/search/new/query_graph.rs
+++ b/milli/src/search/new/query_graph.rs
@@ -119,6 +119,8 @@ impl QueryGraph {
 
 impl QueryGraph {
     /// Build the query graph from the parsed user search query.
+    ///
+    /// The ngrams are made at this point.
     pub fn from_query(ctx: &mut SearchContext, terms: Vec<LocatedQueryTerm>) -> Result<QueryGraph> {
         let nbr_typos = number_of_typos_allowed(ctx)?;
 
@@ -219,6 +221,7 @@ impl QueryGraph {
     }
 
     /// Remove the given nodes and all their edges from the query graph.
+    /// TODO: need to check where this is used, and if this is correct.
     pub fn remove_nodes(&mut self, nodes: &[Interned<QueryNode>]) {
         for &node_id in nodes {
             let node = &self.nodes.get(node_id);
diff --git a/milli/src/search/new/query_term.rs b/milli/src/search/new/query_term.rs
index e239d4669..0850b2181 100644
--- a/milli/src/search/new/query_term.rs
+++ b/milli/src/search/new/query_term.rs
@@ -414,6 +414,7 @@ impl QueryTerm {
 #[derive(Clone)]
 pub struct LocatedQueryTerm {
     pub value: Interned<QueryTerm>,
+    // TODO: consider changing to u8, or even a u16
     pub positions: RangeInclusive<i8>,
 }
 
@@ -425,6 +426,8 @@ impl LocatedQueryTerm {
 }
 
 /// Convert the tokenised search query into a list of located query terms.
+// TODO: checking if the positions are correct for phrases, separators, ngrams
+// hard-limit the number of tokens that are considered
 pub fn located_query_terms_from_string(
     ctx: &mut SearchContext,
     query: NormalizedTokenIter<&[u8]>,
@@ -484,6 +487,7 @@ pub fn located_query_terms_from_string(
                     }
                 } else {
                     let word = token.lemma();
+                    // eagerly compute all derivations
                     let term = query_term_from_word(ctx, word, nbr_typos(word), true)?;
                     let located_term = LocatedQueryTerm {
                         value: ctx.term_interner.insert(term),
@@ -507,6 +511,7 @@ pub fn located_query_terms_from_string(
                     quoted = !quoted;
                 }
                 // if there is a quote or a hard separator we close the phrase.
+                // TODO: limit phrase size?
                 if !phrase.is_empty() && (quote_count > 0 || separator_kind == SeparatorKind::Hard)
                 {
                     let located_query_term = LocatedQueryTerm {
diff --git a/milli/src/search/new/small_bitmap.rs b/milli/src/search/new/small_bitmap.rs
index 503bd72f5..24541eb6c 100644
--- a/milli/src/search/new/small_bitmap.rs
+++ b/milli/src/search/new/small_bitmap.rs
@@ -16,6 +16,9 @@ impl<T> SmallBitmap<T> {
     pub fn for_interned_values_in(interner: &FixedSizeInterner<T>) -> Self {
         Self::new(interner.len())
     }
+    // universe_length not stored anywhere, only used to decide between tiny/small
+    // universe_length: passed 63, actual length will be rounded up 64
+    // passed 66, actual 64 * xs.len() as u16 = 128, passed sized rounded up to the next 64
     pub fn new(universe_length: u16) -> Self {
         if universe_length <= 64 {
             Self { internal: SmallBitmapInternal::Tiny(0), _phantom: PhantomData }
diff --git a/milli/src/search/new/sort.rs b/milli/src/search/new/sort.rs
index 6277149bd..04152f0f0 100644
--- a/milli/src/search/new/sort.rs
+++ b/milli/src/search/new/sort.rs
@@ -28,6 +28,21 @@ impl<'ctx, Query> RankingRuleOutputIter<'ctx, Query> for RankingRuleOutputIterWr
     }
 }
 
+// `Query` type parameter: the same as the type parameter to bucket_sort
+// implements RankingRuleQuery trait, either querygraph or placeholdersearch
+// The sort ranking rule doesn't need the query parameter, it is doing the same thing
+// whether we're doing a querygraph or placeholder search.
+//
+// Query Stored anyway because every ranking rule must return a query from next_bucket
+// ---
+// "Mismatch" between new/old impl.:
+// - old impl: roaring bitmap as input, ranking rule iterates other all the buckets
+// - new impl: still works like that but it shouldn't, because the universe may change for every call to next_bucket, itself due to:
+//    1. elements that were already returned by the ranking rule are subtracted from the universe, also done in the old impl (subtracted from the candidates)
+//    2. NEW in the new impl.: distinct rule might have been applied btwn calls to next_bucket
+// new impl ignores docs removed in (2), which is a missed perf opt issue, see `next_bucket`
+// this perf problem is P2
+// mostly happens when many documents map to the same distinct attribute value.
 pub struct Sort<'ctx, Query> {
     field_name: String,
     field_id: Option<FieldId>,
@@ -127,6 +142,9 @@ impl<'ctx, Query: RankingRuleQueryTrait> RankingRule<'ctx, Query> for Sort<'ctx,
     ) -> Result<Option<RankingRuleOutput<Query>>> {
         let iter = self.iter.as_mut().unwrap();
         // TODO: we should make use of the universe in the function below
+        // good for correctness, but ideally iter.next_bucket would take the current universe into account,
+        // as right now it could return buckets that don't intersect with the universe, meaning we will make many
+        // unneeded calls.
         if let Some(mut bucket) = iter.next_bucket()? {
             bucket.candidates &= universe;
             Ok(Some(bucket))