From 6e50f23896418c5d363a69cc13813fc6bad548f8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= <loic.lecrenier@me.com>
Date: Wed, 5 Apr 2023 13:33:23 +0200
Subject: [PATCH] Add more search tests

---
 milli/src/search/new/tests/mod.rs            |   2 +
 milli/src/search/new/tests/proximity.rs      |   2 +-
 milli/src/search/new/tests/proximity_typo.rs |  68 ++++++++++
 milli/src/search/new/tests/typo_proximity.rs | 126 +++++++++++++++++++
 4 files changed, 197 insertions(+), 1 deletion(-)
 create mode 100644 milli/src/search/new/tests/proximity_typo.rs
 create mode 100644 milli/src/search/new/tests/typo_proximity.rs

diff --git a/milli/src/search/new/tests/mod.rs b/milli/src/search/new/tests/mod.rs
index 0fd5013db..898276858 100644
--- a/milli/src/search/new/tests/mod.rs
+++ b/milli/src/search/new/tests/mod.rs
@@ -3,8 +3,10 @@ pub mod distinct;
 pub mod language;
 pub mod ngram_split_words;
 pub mod proximity;
+pub mod proximity_typo;
 pub mod sort;
 pub mod typo;
+pub mod typo_proximity;
 pub mod words_tms;
 
 fn collect_field_values(
diff --git a/milli/src/search/new/tests/proximity.rs b/milli/src/search/new/tests/proximity.rs
index 44ff94f1d..880f933f0 100644
--- a/milli/src/search/new/tests/proximity.rs
+++ b/milli/src/search/new/tests/proximity.rs
@@ -1,7 +1,7 @@
 /*!
 This module tests the Proximity ranking rule:
 
-1. A sprximity of >7 always has the same cost.
+1. A proximity of >7 always has the same cost.
 
 2. Phrase terms can be in sprximity to other terms via their start and end words,
 but we need to make sure that the phrase exists in the document that meets this
diff --git a/milli/src/search/new/tests/proximity_typo.rs b/milli/src/search/new/tests/proximity_typo.rs
new file mode 100644
index 000000000..3bf869d1d
--- /dev/null
+++ b/milli/src/search/new/tests/proximity_typo.rs
@@ -0,0 +1,68 @@
+/*!
+This module tests the interactions between the proximity and typo ranking rules.
+
+The proximity ranking rule should transform the query graph such that it
+only contains the word pairs that it used to compute its bucket.
+*/
+
+use crate::{
+    index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search,
+    SearchResult, TermsMatchingStrategy,
+};
+
+fn create_index() -> TempIndex {
+    let index = TempIndex::new();
+
+    index
+        .update_settings(|s| {
+            s.set_primary_key("id".to_owned());
+            s.set_searchable_fields(vec!["text".to_owned()]);
+            s.set_criteria(vec![Criterion::Words, Criterion::Proximity, Criterion::Typo]);
+        })
+        .unwrap();
+
+    index
+        .add_documents(documents!([
+            // Basic trap.
+            //
+            // We have one document with the perfect word pair: `sommer holiday`
+            // and another with the perfect word pair: `sommer holidty`.
+            //
+            // The proximity ranking rule will put them both in the same bucket, and it
+            // should minify the query graph to make it represent:
+            // EITHER:
+            //    sommer + holiday
+            // OR:
+            //    sommer + holidty
+            //
+            // Such that the child typo ranking rule does not find any match
+            // for its zero-typo bucket `summer + holiday`, even though both documents
+            // contain these two exact words.
+            {
+                "id": 0,
+                "text": "summer. holiday. sommer holidty"
+            },
+            {
+                "id": 1,
+                "text": "summer. holiday. sommer holiday"
+            },
+
+        ]))
+        .unwrap();
+    index
+}
+
+#[test]
+fn test_trap_basic() {
+    let index = create_index();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("summer holiday");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 0, 3, 2]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    "###);
+}
diff --git a/milli/src/search/new/tests/typo_proximity.rs b/milli/src/search/new/tests/typo_proximity.rs
new file mode 100644
index 000000000..ba6371544
--- /dev/null
+++ b/milli/src/search/new/tests/typo_proximity.rs
@@ -0,0 +1,126 @@
+/*!
+This module tests the interactions between the typo and proximity ranking rules.
+
+The typo ranking rule should transform the query graph such that it only contains
+the combinations of word derivations that it used to compute its bucket.
+
+The proximity ranking rule should then look for proximities only between those specific derivations.
+For example, given the search query `beautiful summer` and the dataset:
+```text
+{ "id": 0, "text": "beautigul summer...... beautiful day in the summer" }
+{ "id": 1, "text": "beautiful summer" }
+```
+Then the document with id `1` should be returned before `0`.
+The proximity ranking rule is not allowed to look for the proximity between `beautigul` and `summer`
+because the typo ranking rule before it only used the derivation `beautiful`.
+*/
+
+use crate::{
+    index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search,
+    SearchResult, TermsMatchingStrategy,
+};
+
+fn create_index() -> TempIndex {
+    let index = TempIndex::new();
+
+    index
+        .update_settings(|s| {
+            s.set_primary_key("id".to_owned());
+            s.set_searchable_fields(vec!["text".to_owned()]);
+            s.set_criteria(vec![Criterion::Words, Criterion::Typo, Criterion::Proximity]);
+        })
+        .unwrap();
+
+    index
+        .add_documents(documents!([
+            // trap explained in the module documentation
+            {
+                "id": 0,
+                "text": "beautigul summer. beautiful x y z summer"
+            },
+            {
+                "id": 1,
+                "text": "beautiful summer"
+            },
+            // the next 2 documents set up a more complicated trap
+            // with the query `beautiful summer`, we will have:
+            // 1. documents with no typos, id 0 and 1
+            // 2. documents with 1 typo: id 2 and 3, those are interpreted as EITHER
+            //      - id 2: "beautigul + summer" ; OR
+            //      - id 3: "beautiful + sommer"
+            // To sort these two documents, the proximity ranking rule must use only the
+            // word pairs: `beautigul -- summer` and `beautiful -- sommer` even though
+            // all variations of `beautiful` and `summer` were used by the typo ranking rule.
+            {
+                "id": 2,
+                "text": "beautigul sommer. beautigul x summer"
+            },
+            {
+                "id": 3,
+                "text": "beautiful sommer"
+            },
+            // The next two documents lay out an even more complex trap, which the current implementation
+            // fails to handle properly.
+            // With the user query `delicious sweet dessert`, the typo ranking rule will return one bucket of:
+            // - id 4: delicitous + sweet + dessert
+            // - id 5: delicious + sweet + desgert
+            // The word pairs that the proximity ranking rule is allowed to use are
+            // EITHER:
+            //      delicitous -- sweet AND sweet -- dessert
+            // OR
+            //      delicious -- sweet AND sweet -- desgert
+            // So the word pair to use for the terms `sweet` and `dessert` depends on the
+            // word pairs explored before them.
+            {
+                "id": 4,
+                "text": "delicitous. sweet. dessert. delicitous sweet desgert",
+            },
+            {
+                "id": 5,
+                "text": "delicious. sweet desgert. delicious sweet desgert",
+            },
+        ]))
+        .unwrap();
+    index
+}
+
+#[test]
+fn test_trap_basic_and_complex1() {
+    let index = create_index();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("beautiful summer");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 0, 3, 2]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"beautiful summer\"",
+        "\"beautigul summer. beautiful x y z summer\"",
+        "\"beautiful sommer\"",
+        "\"beautigul sommer. beautigul x summer\"",
+    ]
+    "###);
+}
+
+#[test]
+fn test_trap_complex2() {
+    let index = create_index();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("delicious sweet dessert");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    // TODO: this is incorrect. 5 should appear before 4
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"delicitous. sweet. dessert. delicitous sweet desgert\"",
+        "\"delicious. sweet desgert. delicious sweet desgert\"",
+    ]
+    "###);
+}