From b5691802a32fd63697a83882124ecf645ae4df86 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= <loic.lecrenier@me.com>
Date: Wed, 5 Apr 2023 14:10:01 +0200
Subject: [PATCH] Add new tests and fix construction of query graph from paths

---
 milli/src/search/new/interner.rs             |   3 +
 milli/src/search/new/query_graph.rs          | 101 ++++++++++---------
 milli/src/search/new/tests/proximity_typo.rs |   9 +-
 milli/src/search/new/tests/sort.rs           |  12 +--
 milli/src/search/new/tests/typo_proximity.rs |   8 +-
 5 files changed, 73 insertions(+), 60 deletions(-)
diff --git a/milli/src/search/new/interner.rs b/milli/src/search/new/interner.rs
index e9bfbef86..ebf18f38c 100644
--- a/milli/src/search/new/interner.rs
+++ b/milli/src/search/new/interner.rs
@@ -176,6 +176,9 @@ impl<T> Interner<T> {
     pub fn iter_mut(&mut self) -> impl Iterator<Item = (Interned<T>, &mut T)> {
         self.stable_store.iter_mut().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x))
     }
+    pub fn freeze(self) -> FixedSizeInterner<T> {
+        FixedSizeInterner { stable_store: self.stable_store }
+    }
 }
 
 /// A store of values of type `T`, each linked to a value of type `From`
diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs
index 33e178494..2662ef730 100644
--- a/milli/src/search/new/query_graph.rs
+++ b/milli/src/search/new/query_graph.rs
@@ -4,7 +4,7 @@ use super::query_term::{
 };
 use super::small_bitmap::SmallBitmap;
 use super::SearchContext;
-use crate::search::new::interner::DedupInterner;
+use crate::search::new::interner::Interner;
 use crate::Result;
 use std::cmp::Ordering;
 use std::collections::BTreeMap;
@@ -345,29 +345,13 @@ impl QueryGraph {
     Build a query graph from a list of paths
 
     The paths are composed of source and dest terms.
-    If the source term is `None`, then the last dest term is used
-    as the predecessor of the dest term. If the source is Some(_),
-    then an edge is built between the last dest term and the source,
-    and between the source and new dest term.
 
-    Note that the resulting graph will not correspond to a perfect
-    representation of the set of paths.
     For example, consider the following paths:
     ```txt
     PATH 1 :  a -> b1 -> c1 -> d -> e1
     PATH 2 :  a -> b2 -> c2 -> d -> e2
     ```
     Then the resulting graph will be:
-    ```txt
-              ┌────┐  ┌────┐             ┌────┐
-           ┌──│ b1 │──│ c1 │─┐        ┌──│ e1 │
-    ┌────┐ │  └────┘  └────┘ │ ┌────┐ │  └────┘
-    │ a  │─┤                 ├─│ d  │─┤
-    └────┘ │  ┌────┐  ┌────┐ │ └────┘ │  ┌────┐
-           └──│ b2 │──│ c2 │─┘        └──│ e2 │
-              └────┘  └────┘             └────┘
-    ```
-    which is different from the fully correct representation:
     ```txt
               ┌────┐  ┌────┐   ┌────┐   ┌────┐
            ┌──│ b1 │──│ c1 │───│ d  │───│ e1 │
@@ -383,21 +367,51 @@ impl QueryGraph {
     pub fn build_from_paths(
         paths: Vec<Vec<(Option<LocatedQueryTermSubset>, LocatedQueryTermSubset)>>,
     ) -> Self {
-        let mut node_data = DedupInterner::default();
-        let root_node = node_data.insert(QueryNodeData::Start);
-        let end_node = node_data.insert(QueryNodeData::End);
+        let mut node_data = Interner::default();
+        let root_node = node_data.push(QueryNodeData::Start);
+        let end_node = node_data.push(QueryNodeData::End);
+
+        let mut paths_with_single_terms = vec![];
+
+        for path in paths {
+            let mut processed_path = vec![];
+            let mut prev_dest_term: Option<LocatedQueryTermSubset> = None;
+            for (start_term, dest_term) in path {
+                if let Some(prev_dest_term) = prev_dest_term.take() {
+                    if let Some(mut start_term) = start_term {
+                        if start_term.term_ids == prev_dest_term.term_ids {
+                            start_term.term_subset.intersect(&prev_dest_term.term_subset);
+                            processed_path.push(start_term);
+                        } else {
+                            processed_path.push(prev_dest_term);
+                            processed_path.push(start_term);
+                        }
+                    } else {
+                        processed_path.push(prev_dest_term);
+                    }
+                } else if let Some(start_term) = start_term {
+                    processed_path.push(start_term);
+                }
+                prev_dest_term = Some(dest_term);
+            }
+            if let Some(prev_dest_term) = prev_dest_term {
+                processed_path.push(prev_dest_term);
+            }
+            paths_with_single_terms.push(processed_path);
+        }
+
+        // TODO: make a prefix tree of the processed paths to avoid uselessly duplicating nodes
 
         let mut paths_with_ids = vec![];
-        for path in paths {
+        for path in paths_with_single_terms {
             let mut path_with_ids = vec![];
-            for node in path {
-                let (start_term, end_term) = node;
-                let src_node_id = start_term.map(|x| node_data.insert(QueryNodeData::Term(x)));
-                let dest_node_id = node_data.insert(QueryNodeData::Term(end_term));
-                path_with_ids.push((src_node_id, dest_node_id));
+            for term in path {
+                let id = node_data.push(QueryNodeData::Term(term));
+                path_with_ids.push(Interned::from_raw(id.into_raw()));
             }
             paths_with_ids.push(path_with_ids);
         }
+
         let nodes_data = node_data.freeze();
         let nodes_data_len = nodes_data.len();
         let mut nodes = nodes_data.map_move(|n| QueryNode {
@@ -406,31 +420,22 @@ impl QueryGraph {
             successors: SmallBitmap::new(nodes_data_len),
         });
 
-        let root_node = Interned::from_raw(root_node.into_raw());
-        let end_node = Interned::from_raw(end_node.into_raw());
+        let root_node = Interned::<QueryNode>::from_raw(root_node.into_raw());
+        let end_node = Interned::<QueryNode>::from_raw(end_node.into_raw());
 
         for path in paths_with_ids {
-            let mut prev_node = root_node;
-            for node in path {
-                let (start_term, dest_term) = node;
-                let end_term = Interned::from_raw(dest_term.into_raw());
-                let src = if let Some(start_term) = start_term {
-                    // TODO: this is incorrect! should take the intersection
-                    // between the prev node and the start term if they refer to the same
-                    // original query term!
-                    let start_term = Interned::from_raw(start_term.into_raw());
-                    nodes.get_mut(prev_node).successors.insert(start_term);
-                    nodes.get_mut(start_term).predecessors.insert(prev_node);
-                    start_term
-                } else {
-                    prev_node
-                };
-                nodes.get_mut(src).successors.insert(end_term);
-                nodes.get_mut(end_term).predecessors.insert(src);
-                prev_node = end_term;
+            let mut prev_node_id = root_node;
+            for node_id in path {
+                let prev_node = nodes.get_mut(prev_node_id);
+                prev_node.successors.insert(node_id);
+                let node = nodes.get_mut(node_id);
+                node.predecessors.insert(prev_node_id);
+                prev_node_id = node_id;
             }
-            nodes.get_mut(prev_node).successors.insert(end_node);
-            nodes.get_mut(end_node).predecessors.insert(prev_node);
+            let prev_node = nodes.get_mut(prev_node_id);
+            prev_node.successors.insert(end_node);
+            let node = nodes.get_mut(end_node);
+            node.predecessors.insert(prev_node_id);
         }
 
         QueryGraph { root_node, end_node, nodes }
diff --git a/milli/src/search/new/tests/proximity_typo.rs b/milli/src/search/new/tests/proximity_typo.rs
index 3bf869d1d..9f9601e3f 100644
--- a/milli/src/search/new/tests/proximity_typo.rs
+++ b/milli/src/search/new/tests/proximity_typo.rs
@@ -3,6 +3,8 @@ This module tests the interactions between the proximity and typo ranking rules.
 
 The proximity ranking rule should transform the query graph such that it
 only contains the word pairs that it used to compute its bucket.
+
+TODO: This is not currently implemented.
 */
 
 use crate::{
@@ -61,8 +63,13 @@ fn test_trap_basic() {
     s.terms_matching_strategy(TermsMatchingStrategy::All);
     s.query("summer holiday");
     let SearchResult { documents_ids, .. } = s.execute().unwrap();
-    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 0, 3, 2]");
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1]");
     let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    // TODO: this is incorrect, 1 should come before 0
     insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"summer. holiday. sommer holidty\"",
+        "\"summer. holiday. sommer holiday\"",
+    ]
     "###);
 }
diff --git a/milli/src/search/new/tests/sort.rs b/milli/src/search/new/tests/sort.rs
index d3a952a24..d2201f55b 100644
--- a/milli/src/search/new/tests/sort.rs
+++ b/milli/src/search/new/tests/sort.rs
@@ -271,13 +271,13 @@ fn test_sort() {
         "false",
         "true",
         "true",
-        "__does_not_exist___",
+        "__does_not_exist__",
         "null",
         "[null,null,\"\"]",
         "\"\"",
         "{\"sub\":0}",
-        "__does_not_exist___",
-        "__does_not_exist___",
+        "__does_not_exist__",
+        "__does_not_exist__",
     ]
     "###);
 
@@ -304,13 +304,13 @@ fn test_sort() {
         "false",
         "\"1\"",
         "\"0\"",
-        "__does_not_exist___",
+        "__does_not_exist__",
         "null",
         "[null,null,\"\"]",
         "\"\"",
         "{\"sub\":0}",
-        "__does_not_exist___",
-        "__does_not_exist___",
+        "__does_not_exist__",
+        "__does_not_exist__",
     ]
     "###);
 }
diff --git a/milli/src/search/new/tests/typo_proximity.rs b/milli/src/search/new/tests/typo_proximity.rs
index ba6371544..220fc69e1 100644
--- a/milli/src/search/new/tests/typo_proximity.rs
+++ b/milli/src/search/new/tests/typo_proximity.rs
@@ -59,8 +59,7 @@ fn create_index() -> TempIndex {
                 "id": 3,
                 "text": "beautiful sommer"
             },
-            // The next two documents lay out an even more complex trap, which the current implementation
-            // fails to handle properly.
+            // The next two documents lay out an even more complex trap.
             // With the user query `delicious sweet dessert`, the typo ranking rule will return one bucket of:
             // - id 4: delicitous + sweet + dessert
             // - id 5: beautiful + sweet + desgert
@@ -114,13 +113,12 @@ fn test_trap_complex2() {
     s.terms_matching_strategy(TermsMatchingStrategy::All);
     s.query("delicious sweet dessert");
     let SearchResult { documents_ids, .. } = s.execute().unwrap();
-    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5]");
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[5, 4]");
     let texts = collect_field_values(&index, &txn, "text", &documents_ids);
-    // TODO: this is incorrect. 5 should appear before 4
     insta::assert_debug_snapshot!(texts, @r###"
     [
-        "\"delicitous. sweet. dessert. delicitous sweet desgert\"",
         "\"delicious. sweet desgert. delicious sweet desgert\"",
+        "\"delicitous. sweet. dessert. delicitous sweet desgert\"",
     ]
     "###);
 }