From 1b514517f52efb833d4dc98361d70827d760ad6b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= <loic.lecrenier@me.com>
Date: Mon, 1 May 2023 16:26:01 +0200
Subject: [PATCH] Fix bug in computation of query term at a position

---
 .../search/new/graph_based_ranking_rule.rs    | 25 ------
 .../new/ranking_rule_graph/position/mod.rs    |  8 +-
 milli/src/search/new/resolve_query_graph.rs   | 10 +--
 milli/src/search/new/tests/integration.rs     | 78 +++++++++++++++++++
 milli/src/search/new/tests/mod.rs             |  3 +-
 5 files changed, 84 insertions(+), 40 deletions(-)
 create mode 100644 milli/src/search/new/tests/integration.rs
diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs
index 379a0b2ab..d8f6836e7 100644
--- a/milli/src/search/new/graph_based_ranking_rule.rs
+++ b/milli/src/search/new/graph_based_ranking_rule.rs
@@ -100,8 +100,6 @@ impl<G: RankingRuleGraphTrait> GraphBasedRankingRule<G> {
     }
 }
 
-static mut COUNT_PATHS: usize = 0;
-
 /// The internal state of a graph-based ranking rule during iteration
 pub struct GraphBasedRankingRuleState<G: RankingRuleGraphTrait> {
     /// The current graph
@@ -219,34 +217,11 @@ impl<'ctx, G: RankingRuleGraphTrait> RankingRule<'ctx, QueryGraph> for GraphBase
         // the number of future candidate paths given by that same function.
 
         let mut subpaths_docids: Vec<(Interned<G::Condition>, RoaringBitmap)> = vec![];
-        let mut at_least_one = false;
 
-        // unsafe {
-        //     if COUNT_PATHS >= 1489 && COUNT_PATHS < 1491 {
-        //         println!("COUNT_PATHS {COUNT_PATHS} COST {cost}, NODES {COUNT_VISITED_NODES}, UNIVERSE {}", universe.len());
-        //         // let all_costs = all_costs.get(graph.query_graph.root_node);
-        //         // println!("{all_costs:?}");
-        //         dead_ends_cache.debug_print(0);
-        //         println!("{universe:?}");
-
-        //         println!("==================");
-        //     }
-        // }
         let mut nodes_with_removed_outgoing_conditions = BTreeSet::new();
         let visitor = PathVisitor::new(cost, graph, all_costs, dead_ends_cache);
 
         visitor.visit_paths(&mut |path, graph, dead_ends_cache| {
-            unsafe {
-                COUNT_PATHS += 1;
-            }
-            // if self.id == "position" {
-            // at_least_one = true;
-            //     print!(".");
-            // }
-            // if self.id == "fid" {
-            at_least_one = true;
-            //     print!("!");
-            // }
             considered_paths.push(path.to_vec());
             // If the universe is empty, stop exploring the graph, since no docids will ever be found anymore.
             if universe.is_empty() {
diff --git a/milli/src/search/new/ranking_rule_graph/position/mod.rs b/milli/src/search/new/ranking_rule_graph/position/mod.rs
index c3d33174b..d4640097e 100644
--- a/milli/src/search/new/ranking_rule_graph/position/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/position/mod.rs
@@ -35,7 +35,6 @@ impl RankingRuleGraphTrait for PositionGraph {
                     *position,
                 )?;
         }
-
         Ok(ComputedCondition {
             docids,
             universe_len: universe.len(),
@@ -91,15 +90,12 @@ impl RankingRuleGraphTrait for PositionGraph {
             };
             positions_for_costs.entry(cost).or_default().push(position);
         }
-        println!(
-            "positions for cost {} : {positions_for_costs:?}",
-            term.term_subset.description(ctx)
-        );
+
         let mut edges = vec![];
 
         for (cost, positions) in positions_for_costs {
             // TODO: We can improve performances and relevancy by storing
-            //       the term subsets associated to each position fetched.
+            //       the term subsets associated to each position fetched
             edges.push((
                 cost,
                 conditions_interner.insert(PositionCondition {
diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs
index 131dcd856..2bcb4d3ac 100644
--- a/milli/src/search/new/resolve_query_graph.rs
+++ b/milli/src/search/new/resolve_query_graph.rs
@@ -69,16 +69,14 @@ pub fn compute_query_term_subset_docids_within_field_id(
     }
 
     for phrase in term.all_phrases(ctx)? {
-        let mut phrase_docids = ctx.get_phrase_docids(phrase)?.clone();
         // There may be false positives when resolving a phrase, so we're not
         // guaranteed that all of its words are within a single fid.
         // TODO: fix this?
         if let Some(word) = phrase.words(ctx).iter().flatten().next() {
             if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(*word, fid)? {
-                phrase_docids &= word_fid_docids;
+                docids |= ctx.get_phrase_docids(phrase)? & word_fid_docids;
             }
         }
-        docids |= phrase_docids;
     }
 
     if let Some(word_prefix) = term.use_prefix_db(ctx) {
@@ -98,7 +96,6 @@ pub fn compute_query_term_subset_docids_within_position(
     position: u16,
 ) -> Result<RoaringBitmap> {
     // TODO Use the roaring::MultiOps trait
-
     let mut docids = RoaringBitmap::new();
     for word in term.all_single_words_except_prefix_db(ctx)? {
         if let Some(word_position_docids) =
@@ -109,16 +106,14 @@ pub fn compute_query_term_subset_docids_within_position(
     }
 
     for phrase in term.all_phrases(ctx)? {
-        let mut phrase_docids = ctx.get_phrase_docids(phrase)?.clone();
         // It's difficult to know the expected position of the words in the phrase,
         // so instead we just check the first one.
         // TODO: fix this?
         if let Some(word) = phrase.words(ctx).iter().flatten().next() {
             if let Some(word_position_docids) = ctx.get_db_word_position_docids(*word, position)? {
-                phrase_docids &= word_position_docids;
+                docids |= ctx.get_phrase_docids(phrase)? & word_position_docids
             }
         }
-        docids |= phrase_docids;
     }
 
     if let Some(word_prefix) = term.use_prefix_db(ctx) {
@@ -128,7 +123,6 @@ pub fn compute_query_term_subset_docids_within_position(
             docids |= word_position_docids;
         }
     }
-
     Ok(docids)
 }
 
diff --git a/milli/src/search/new/tests/integration.rs b/milli/src/search/new/tests/integration.rs
new file mode 100644
index 000000000..153a80343
--- /dev/null
+++ b/milli/src/search/new/tests/integration.rs
@@ -0,0 +1,78 @@
+use std::io::Cursor;
+
+use big_s::S;
+use heed::EnvOpenOptions;
+use maplit::{hashmap, hashset};
+
+use crate::{
+    db_snap,
+    documents::{DocumentsBatchBuilder, DocumentsBatchReader},
+    update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings},
+    Criterion, Index, Object,
+};
+pub const CONTENT: &str = include_str!("../../../../tests/assets/test_set.ndjson");
+
+pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
+    let path = tempfile::tempdir().unwrap();
+    let mut options = EnvOpenOptions::new();
+    options.map_size(10 * 1024 * 1024); // 10 MB
+    let index = Index::new(options, &path).unwrap();
+
+    let mut wtxn = index.write_txn().unwrap();
+    let config = IndexerConfig::default();
+
+    let mut builder = Settings::new(&mut wtxn, &index, &config);
+
+    builder.set_criteria(criteria.to_vec());
+    builder.set_filterable_fields(hashset! {
+        S("tag"),
+        S("asc_desc_rank"),
+        S("_geo"),
+        S("opt1"),
+        S("opt1.opt2"),
+        S("tag_in")
+    });
+    builder.set_sortable_fields(hashset! {
+        S("tag"),
+        S("asc_desc_rank"),
+    });
+    builder.set_synonyms(hashmap! {
+        S("hello") => vec![S("good morning")],
+        S("world") => vec![S("earth")],
+        S("america") => vec![S("the united states")],
+    });
+    builder.set_searchable_fields(vec![S("title"), S("description")]);
+    builder.execute(|_| (), || false).unwrap();
+
+    // index documents
+    let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() };
+    let indexing_config = IndexDocumentsConfig::default();
+
+    let builder =
+        IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| (), || false).unwrap();
+    let mut documents_builder = DocumentsBatchBuilder::new(Vec::new());
+    let reader = Cursor::new(CONTENT.as_bytes());
+
+    for result in serde_json::Deserializer::from_reader(reader).into_iter::<Object>() {
+        let object = result.unwrap();
+        documents_builder.append_json_object(&object).unwrap();
+    }
+
+    let vector = documents_builder.into_inner().unwrap();
+
+    // index documents
+    let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap();
+    let (builder, user_error) = builder.add_documents(content).unwrap();
+    user_error.unwrap();
+    builder.execute().unwrap();
+
+    wtxn.commit().unwrap();
+
+    index
+}
+
+#[test]
+fn snapshot_integration_dataset() {
+    let index = setup_search_index_with_criteria(&[Criterion::Attribute]);
+    db_snap!(index, word_position_docids, @"3c9347a767bceef3beb31465f1e5f3ae");
+}
diff --git a/milli/src/search/new/tests/mod.rs b/milli/src/search/new/tests/mod.rs
index 2ad806a87..906aeda83 100644
--- a/milli/src/search/new/tests/mod.rs
+++ b/milli/src/search/new/tests/mod.rs
@@ -3,16 +3,17 @@ pub mod attribute_position;
 pub mod distinct;
 pub mod exactness;
 pub mod geo_sort;
+pub mod integration;
 #[cfg(feature = "default")]
 pub mod language;
 pub mod ngram_split_words;
 pub mod proximity;
 pub mod proximity_typo;
 pub mod sort;
+pub mod stop_words;
 pub mod typo;
 pub mod typo_proximity;
 pub mod words_tms;
-pub mod stop_words;
 
 fn collect_field_values(
     index: &crate::Index,