Add more search tests

2024-11-26 20:15:07 +08:00 · 2023-04-05 13:33:23 +02:00 · 2023-04-05 13:33:23 +02:00 · 6e50f23896
commit 6e50f23896
parent 4c8a0179ba
4 changed files with 197 additions and 1 deletions
--- a/milli/src/search/new/tests/mod.rs
+++ b/milli/src/search/new/tests/mod.rs
@ -3,8 +3,10 @@ pub mod distinct;
 pub mod language;
 pub mod ngram_split_words;
 pub mod proximity;
+pub mod proximity_typo;
 pub mod sort;
 pub mod typo;
+pub mod typo_proximity;
 pub mod words_tms;

 fn collect_field_values(
--- a/milli/src/search/new/tests/proximity.rs
+++ b/milli/src/search/new/tests/proximity.rs
@ -1,7 +1,7 @@
 /*!
 This module tests the Proximity ranking rule:

-1. A sprximity of >7 always has the same cost.
+1. A proximity of >7 always has the same cost.

 2. Phrase terms can be in sprximity to other terms via their start and end words,
 but we need to make sure that the phrase exists in the document that meets this
--- a/milli/src/search/new/tests/proximity_typo.rs
+++ b/milli/src/search/new/tests/proximity_typo.rs
@ -0,0 +1,68 @@
+/*!
+This module tests the interactions between the proximity and typo ranking rules.
+
+The proximity ranking rule should transform the query graph such that it
+only contains the word pairs that it used to compute its bucket.
+*/
+
+use crate::{
+    index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search,
+    SearchResult, TermsMatchingStrategy,
+};
+
+fn create_index() -> TempIndex {
+    let index = TempIndex::new();
+
+    index
+        .update_settings(|s| {
+            s.set_primary_key("id".to_owned());
+            s.set_searchable_fields(vec!["text".to_owned()]);
+            s.set_criteria(vec![Criterion::Words, Criterion::Proximity, Criterion::Typo]);
+        })
+        .unwrap();
+
+    index
+        .add_documents(documents!([
+            // Basic trap.
+            //
+            // We have one document with the perfect word pair: `sommer - holiday`
+            // and another with the perfect word pair: `sommer holidty`.
+            //
+            // The proximity ranking rule will put them both in the same bucket, and it
+            // should minify the query graph to make it represent:
+            // EITHER:
+            //    sommer + holiday
+            // OR:
+            //    sommer + holidty
+            //
+            // Such that the child typo ranking rule does not find any match
+            // for its zero-typo bucket `summer + holiday`, even though both documents
+            // contain these two exact words.
+            {
+                "id": 0,
+                "text": "summer. holiday. sommer holidty"
+            },
+            {
+                "id": 1,
+                "text": "summer. holiday. sommer holiday"
+            },
+
+        ]))
+        .unwrap();
+    index
+}
+
+#[test]
+fn test_trap_basic() {
+    let index = create_index();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("summer holiday");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 0, 3, 2]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    "###);
+}
--- a/milli/src/search/new/tests/typo_proximity.rs
+++ b/milli/src/search/new/tests/typo_proximity.rs
@ -0,0 +1,126 @@
+/*!
+This module tests the interactions between the typo and proximity ranking rules.
+
+The typo ranking rule should transform the query graph such that it only contains
+the combinations of word derivations that it used to compute its bucket.
+
+The proximity ranking rule should then look for proximities only between those specific derivations.
+For example, given the the search query `beautiful summer` and the dataset:
+```text
+{ "id": 0, "text": "beautigul summer...... beautiful day in the summer" }
+{ "id": 1, "text": "beautiful summer" }
+```
+Then the document with id `1` should be returned before `0`.
+The proximity ranking rule is not allowed to look for the proximity between `beautigul` and `summer`
+because the typo ranking rule before it only used the derivation `beautiful`.
+*/
+
+use crate::{
+    index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search,
+    SearchResult, TermsMatchingStrategy,
+};
+
+fn create_index() -> TempIndex {
+    let index = TempIndex::new();
+
+    index
+        .update_settings(|s| {
+            s.set_primary_key("id".to_owned());
+            s.set_searchable_fields(vec!["text".to_owned()]);
+            s.set_criteria(vec![Criterion::Words, Criterion::Typo, Criterion::Proximity]);
+        })
+        .unwrap();
+
+    index
+        .add_documents(documents!([
+            // trap explained in the module documentation
+            {
+                "id": 0,
+                "text": "beautigul summer. beautiful x y z summer"
+            },
+            {
+                "id": 1,
+                "text": "beautiful summer"
+            },
+            // the next 2 documents set up a more complicated trap
+            // with the query `beautiful summer`, we will have:
+            // 1. documents with no typos, id 0 and 1
+            // 2. documents with 1 typos: id 2 and 3, those are interpreted as EITHER
+            //      - id 2: "beautigul + summer" ; OR
+            //      - id 3: "beautiful + sommer"
+            // To sort these two documents, the proximity ranking rule must use only the
+            // word pairs: `beautigul -- summer` and `beautiful -- sommer` even though
+            // all variations of `beautiful` and `sommer` were used by the typo ranking rule.
+            {
+                "id": 2,
+                "text": "beautigul sommer. beautigul x summer"
+            },
+            {
+                "id": 3,
+                "text": "beautiful sommer"
+            },
+            // The next two documents lay out an even more complex trap, which the current implementation
+            // fails to handle properly.
+            // With the user query `delicious sweet dessert`, the typo ranking rule will return one bucket of:
+            // - id 4: delicitous + sweet + dessert
+            // - id 5: beautiful + sweet + desgert
+            // The word pairs that the proximity ranking rules is allowed to use are
+            // EITHER:
+            //      delicitous -- sweet AND sweet -- dessert
+            // OR
+            //      delicious -- sweet AND sweet -- desgert
+            // So the word pair to use for the terms `summer` and `dessert` depend on the
+            // word pairs explored before them.
+            {
+                "id": 4,
+                "text": "delicitous. sweet. dessert. delicitous sweet desgert",
+            },
+            {
+                "id": 5,
+                "text": "delicious. sweet desgert. delicious sweet desgert",
+            },
+        ]))
+        .unwrap();
+    index
+}
+
+#[test]
+fn test_trap_basic_and_complex1() {
+    let index = create_index();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("beautiful summer");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[1, 0, 3, 2]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"beautiful summer\"",
+        "\"beautigul summer. beautiful x y z summer\"",
+        "\"beautiful sommer\"",
+        "\"beautigul sommer. beautigul x summer\"",
+    ]
+    "###);
+}
+
+#[test]
+fn test_trap_complex2() {
+    let index = create_index();
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::All);
+    s.query("delicious sweet dessert");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    // TODO: this is incorrect. 5 should appear before 4
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"delicitous. sweet. dessert. delicitous sweet desgert\"",
+        "\"delicious. sweet desgert. delicious sweet desgert\"",
+    ]
+    "###);
+}