Add new tests and fix construction of query graph from paths
parent 6e50f23896
commit b5691802a3
@@ -176,6 +176,9 @@ impl<T> Interner<T> {
+    pub fn iter_mut(&mut self) -> impl Iterator<Item = (Interned<T>, &mut T)> {
+        self.stable_store.iter_mut().enumerate().map(|(i, x)| (Interned::from_raw(i as u16), x))
+    }
     pub fn freeze(self) -> FixedSizeInterner<T> {
         FixedSizeInterner { stable_store: self.stable_store }
     }
 }

 /// A store of values of type `T`, each linked to a value of type `From`
@@ -4,7 +4,7 @@ use super::query_term::{
 };
 use super::small_bitmap::SmallBitmap;
 use super::SearchContext;
-use crate::search::new::interner::DedupInterner;
+use crate::search::new::interner::Interner;
 use crate::Result;
 use std::cmp::Ordering;
 use std::collections::BTreeMap;
@@ -345,29 +345,13 @@ impl QueryGraph {
Build a query graph from a list of paths

The paths are composed of source and dest terms.
If the source term is `None`, then the last dest term is used
as the predecessor of the dest term. If the source is `Some(_)`,
then an edge is built between the last dest term and the source,
and between the source and the new dest term.

Note that the resulting graph will not correspond to a perfect
representation of the set of paths.
For example, consider the following paths:
```txt
PATH 1 : a -> b1 -> c1 -> d -> e1
PATH 2 : a -> b2 -> c2 -> d -> e2
```
Then the resulting graph will be:
```txt
           ┌────┐  ┌────┐               ┌────┐
        ┌──│ b1 │──│ c1 │─┐          ┌──│ e1 │
┌────┐  │  └────┘  └────┘ │  ┌────┐  │  └────┘
│ a  │──┤                 ├──│ d  │──┤
└────┘  │  ┌────┐  ┌────┐ │  └────┘  │  ┌────┐
        └──│ b2 │──│ c2 │─┘          └──│ e2 │
           └────┘  └────┘               └────┘
```
which is different from the fully correct representation:
```txt
           ┌────┐  ┌────┐   ┌────┐   ┌────┐
        ┌──│ b1 │──│ c1 │───│ d  │───│ e1 │
```
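For intuition about why this merged form is an over-approximation, here is a minimal, self-contained sketch (plain string terms and a `BTreeMap` adjacency, not the Meilisearch interner or query types): once every distinct term collapses into a single node, the shared `d` node accumulates successors from both paths, so walks like `a -> b1 -> c1 -> d -> e2` become representable even though they were never given.

```rust
use std::collections::{BTreeMap, BTreeSet};

fn main() {
    // The two example paths from the comment above.
    let paths: Vec<Vec<&str>> = vec![
        vec!["a", "b1", "c1", "d", "e1"], // PATH 1
        vec!["a", "b2", "c2", "d", "e2"], // PATH 2
    ];

    // One node per distinct term: edges from different paths get merged.
    let mut successors: BTreeMap<&str, BTreeSet<&str>> = BTreeMap::new();
    for path in &paths {
        for pair in path.windows(2) {
            successors.entry(pair[0]).or_default().insert(pair[1]);
        }
    }

    // The shared `d` node now has *both* e1 and e2 as successors, so the graph
    // also contains a -> b1 -> c1 -> d -> e2 and a -> b2 -> c2 -> d -> e1,
    // which were never part of the input set of paths.
    println!("{:?}", successors["d"]); // {"e1", "e2"}
}
```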
@@ -383,21 +367,51 @@ impl QueryGraph {
     pub fn build_from_paths(
         paths: Vec<Vec<(Option<LocatedQueryTermSubset>, LocatedQueryTermSubset)>>,
     ) -> Self {
-        let mut node_data = DedupInterner::default();
-        let root_node = node_data.insert(QueryNodeData::Start);
-        let end_node = node_data.insert(QueryNodeData::End);
+        let mut node_data = Interner::default();
+        let root_node = node_data.push(QueryNodeData::Start);
+        let end_node = node_data.push(QueryNodeData::End);
+
+        let mut paths_with_single_terms = vec![];
+
+        for path in paths {
+            let mut processed_path = vec![];
+            let mut prev_dest_term: Option<LocatedQueryTermSubset> = None;
+            for (start_term, dest_term) in path {
+                if let Some(prev_dest_term) = prev_dest_term.take() {
+                    if let Some(mut start_term) = start_term {
+                        if start_term.term_ids == prev_dest_term.term_ids {
+                            start_term.term_subset.intersect(&prev_dest_term.term_subset);
+                            processed_path.push(start_term);
+                        } else {
+                            processed_path.push(prev_dest_term);
+                            processed_path.push(start_term);
+                        }
+                    } else {
+                        processed_path.push(prev_dest_term);
+                    }
+                } else if let Some(start_term) = start_term {
+                    processed_path.push(start_term);
+                }
+                prev_dest_term = Some(dest_term);
+            }
+            if let Some(prev_dest_term) = prev_dest_term {
+                processed_path.push(prev_dest_term);
+            }
+            paths_with_single_terms.push(processed_path);
+        }
+
+        // TODO: make a prefix tree of the processed paths to avoid uselessly duplicating nodes
+
         let mut paths_with_ids = vec![];
-        for path in paths {
+        for path in paths_with_single_terms {
             let mut path_with_ids = vec![];
-            for node in path {
-                let (start_term, end_term) = node;
-                let src_node_id = start_term.map(|x| node_data.insert(QueryNodeData::Term(x)));
-                let dest_node_id = node_data.insert(QueryNodeData::Term(end_term));
-                path_with_ids.push((src_node_id, dest_node_id));
+            for term in path {
+                let id = node_data.push(QueryNodeData::Term(term));
+                path_with_ids.push(Interned::from_raw(id.into_raw()));
             }
             paths_with_ids.push(path_with_ids);
         }

         let nodes_data = node_data.freeze();
         let nodes_data_len = nodes_data.len();
         let mut nodes = nodes_data.map_move(|n| QueryNode {
@@ -406,31 +420,22 @@ impl QueryGraph {
             successors: SmallBitmap::new(nodes_data_len),
         });

-        let root_node = Interned::from_raw(root_node.into_raw());
-        let end_node = Interned::from_raw(end_node.into_raw());
+        let root_node = Interned::<QueryNode>::from_raw(root_node.into_raw());
+        let end_node = Interned::<QueryNode>::from_raw(end_node.into_raw());

         for path in paths_with_ids {
-            let mut prev_node = root_node;
-            for node in path {
-                let (start_term, dest_term) = node;
-                let end_term = Interned::from_raw(dest_term.into_raw());
-                let src = if let Some(start_term) = start_term {
-                    // TODO: this is incorrect! should take the intersection
-                    // between the prev node and the start term if they refer to the same
-                    // original query term!
-                    let start_term = Interned::from_raw(start_term.into_raw());
-                    nodes.get_mut(prev_node).successors.insert(start_term);
-                    nodes.get_mut(start_term).predecessors.insert(prev_node);
-                    start_term
-                } else {
-                    prev_node
-                };
-                nodes.get_mut(src).successors.insert(end_term);
-                nodes.get_mut(end_term).predecessors.insert(src);
-                prev_node = end_term;
+            let mut prev_node_id = root_node;
+            for node_id in path {
+                let prev_node = nodes.get_mut(prev_node_id);
+                prev_node.successors.insert(node_id);
+                let node = nodes.get_mut(node_id);
+                node.predecessors.insert(prev_node_id);
+                prev_node_id = node_id;
             }
-            nodes.get_mut(prev_node).successors.insert(end_node);
-            nodes.get_mut(end_node).predecessors.insert(prev_node);
+            let prev_node = nodes.get_mut(prev_node_id);
+            prev_node.successors.insert(end_node);
+            let node = nodes.get_mut(end_node);
+            node.predecessors.insert(prev_node_id);
         }

         QueryGraph { root_node, end_node, nodes }
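To make the new shape of `build_from_paths` easier to follow, here is a hedged, self-contained sketch of the same two steps with simplified types: `String` terms, `usize` node ids and `BTreeSet` adjacency stand in for the interner, `LocatedQueryTermSubset` and `SmallBitmap`; the `Node`, `new_node` and `build` names are illustrative, not Meilisearch APIs.

```rust
use std::collections::BTreeSet;

/// Illustrative node type: a label plus explicit predecessor/successor sets
/// (the real code stores `QueryNodeData` and `SmallBitmap`s in an interner).
struct Node {
    label: String,
    predecessors: BTreeSet<usize>,
    successors: BTreeSet<usize>,
}

fn new_node(nodes: &mut Vec<Node>, label: &str) -> usize {
    nodes.push(Node {
        label: label.to_owned(),
        predecessors: BTreeSet::new(),
        successors: BTreeSet::new(),
    });
    nodes.len() - 1
}

fn build(paths: Vec<Vec<(Option<String>, String)>>) -> Vec<Node> {
    let mut nodes = Vec::new();
    let start = new_node(&mut nodes, "START");
    let end = new_node(&mut nodes, "END");

    for path in paths {
        // Step 1: flatten each path of (source, dest) pairs into one term sequence.
        // The real code intersects a source with the previous dest when they share
        // term ids; with plain strings we just skip an exact duplicate.
        let mut terms: Vec<String> = Vec::new();
        for (source, dest) in path {
            if let Some(source) = source {
                if terms.last() != Some(&source) {
                    terms.push(source);
                }
            }
            terms.push(dest);
        }

        // Step 2: push a fresh node per term and chain it between START and END,
        // recording both successors and predecessors.
        let mut prev = start;
        for term in terms {
            let id = new_node(&mut nodes, &term);
            nodes[prev].successors.insert(id);
            nodes[id].predecessors.insert(prev);
            prev = id;
        }
        nodes[prev].successors.insert(end);
        nodes[end].predecessors.insert(prev);
    }
    nodes
}

fn main() {
    let graph = build(vec![vec![
        (None, "the".to_owned()),
        (Some("quick".to_owned()), "brown".to_owned()),
    ]]);
    for (id, node) in graph.iter().enumerate() {
        println!("{id}: {} -> {:?}", node.label, node.successors);
    }
}
```

As in the diff, every term gets a fresh node for every path it appears in, which is why the TODO above suggests building a prefix tree of the processed paths to avoid duplicating shared prefixes.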
@@ -3,6 +3,8 @@ This module tests the interactions between the proximity and typo ranking rules.

The proximity ranking rule should transform the query graph such that it
only contains the word pairs that it used to compute its bucket.

TODO: This is not currently implemented.
*/

use crate::{
@@ -61,8 +63,13 @@ fn test_trap_basic() {
     s.terms_matching_strategy(TermsMatchingStrategy::All);
     s.query("summer holiday");
     let SearchResult { documents_ids, .. } = s.execute().unwrap();
-    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 0, 3, 2]");
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    // TODO: this is incorrect, 1 should come before 0
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"summer. holiday. sommer holidty\"",
+        "\"summer. holiday. sommer holiday\"",
+    ]
+    "###);
 }
@@ -271,13 +271,13 @@ fn test_sort() {
         "false",
         "true",
         "true",
-        "__does_not_exist___",
+        "__does_not_exist__",
         "null",
         "[null,null,\"\"]",
         "\"\"",
         "{\"sub\":0}",
-        "__does_not_exist___",
-        "__does_not_exist___",
+        "__does_not_exist__",
+        "__does_not_exist__",
     ]
     "###);
@@ -304,13 +304,13 @@ fn test_sort() {
         "false",
         "\"1\"",
         "\"0\"",
-        "__does_not_exist___",
+        "__does_not_exist__",
        "null",
         "[null,null,\"\"]",
         "\"\"",
         "{\"sub\":0}",
-        "__does_not_exist___",
-        "__does_not_exist___",
+        "__does_not_exist__",
+        "__does_not_exist__",
     ]
     "###);
 }
@@ -59,8 +59,7 @@ fn create_index() -> TempIndex {
             "id": 3,
             "text": "beautiful sommer"
         },
-        // The next two documents lay out an even more complex trap, which the current implementation
-        // fails to handle properly.
+        // The next two documents lay out an even more complex trap.
         // With the user query `delicious sweet dessert`, the typo ranking rule will return one bucket of:
         // - id 4: delicitous + sweet + dessert
         // - id 5: beautiful + sweet + desgert
@@ -114,13 +113,12 @@ fn test_trap_complex2() {
     s.terms_matching_strategy(TermsMatchingStrategy::All);
     s.query("delicious sweet dessert");
     let SearchResult { documents_ids, .. } = s.execute().unwrap();
-    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5]");
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[5, 4]");
     let texts = collect_field_values(&index, &txn, "text", &documents_ids);
-    // TODO: this is incorrect. 5 should appear before 4
     insta::assert_debug_snapshot!(texts, @r###"
     [
-        "\"delicitous. sweet. dessert. delicitous sweet desgert\"",
         "\"delicious. sweet desgert. delicious sweet desgert\"",
+        "\"delicitous. sweet. dessert. delicitous sweet desgert\"",
     ]
     "###);
 }