Add tests for "exactness" rr, make correct universe computation

2024-11-26 12:05:05 +08:00 · 2023-04-24 16:57:12 +02:00 · 2023-04-24 16:57:12 +02:00 · 8f2e971879
commit 8f2e971879
parent d1fdbb63da
3 changed files with 533 additions and 9 deletions
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@ -51,6 +51,7 @@ use resolve_query_graph::compute_query_graph_docids;
 use sort::Sort;

 use self::interner::Interned;
+use self::query_term::ExactTerm;

 /// A structure used throughout the execution of a search query.
 pub struct SearchContext<'ctx> {
@ -119,6 +120,75 @@ fn resolve_maximally_reduced_query_graph(
    Ok(docids)
 }

+fn resolve_docids_containing_any_exact_word(
+    ctx: &mut SearchContext,
+    universe: &RoaringBitmap,
+    query_graph: &QueryGraph,
+) -> Result<RoaringBitmap> {
+    let mut docids = RoaringBitmap::new();
+    for (_, node) in query_graph.nodes.iter() {
+        let term = match &node.data {
+            query_graph::QueryNodeData::Term(term) => term,
+            query_graph::QueryNodeData::Deleted
+            | query_graph::QueryNodeData::Start
+            | query_graph::QueryNodeData::End => {
+                continue;
+            }
+        };
+        if term.term_ids.len() != 1 {
+            continue;
+        }
+        let Some(exact_term) = term.term_subset.exact_term(ctx) else {
+            continue
+        };
+        let exact_term_docids = match exact_term {
+            ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(phrase)? & universe,
+            ExactTerm::Word(word) => {
+                if let Some(word_docids) = ctx.word_docids(Word::Original(word))? {
+                    word_docids & universe
+                } else {
+                    continue;
+                }
+            }
+        };
+        docids |= exact_term_docids;
+    }
+    Ok(docids)
+}
+
+fn resolve_universe(
+    ctx: &mut SearchContext,
+    initial_universe: &RoaringBitmap,
+    query_graph: &QueryGraph,
+    method: UniverseResolutionMethod,
+    matching_strategy: TermsMatchingStrategy,
+    logger: &mut dyn SearchLogger<QueryGraph>,
+) -> Result<RoaringBitmap> {
+    match method {
+        UniverseResolutionMethod::TermMatchingStrategyOnly => {
+            resolve_maximally_reduced_query_graph(
+                ctx,
+                initial_universe,
+                query_graph,
+                matching_strategy,
+                logger,
+            )
+        }
+        UniverseResolutionMethod::TermMatchingStrategyAndExactness => {
+            let mut resolved_universe = resolve_maximally_reduced_query_graph(
+                ctx,
+                initial_universe,
+                query_graph,
+                matching_strategy,
+                logger,
+            )?;
+            resolved_universe |=
+                resolve_docids_containing_any_exact_word(ctx, initial_universe, query_graph)?;
+            Ok(resolved_universe)
+        }
+    }
+}
+
 /// Return the list of initialised ranking rules to be used for a placeholder search.
 fn get_ranking_rules_for_placeholder_search<'ctx>(
    ctx: &SearchContext<'ctx>,
@ -163,12 +233,17 @@ fn get_ranking_rules_for_placeholder_search<'ctx>(
    Ok(ranking_rules)
 }

+enum UniverseResolutionMethod {
+    TermMatchingStrategyOnly,
+    TermMatchingStrategyAndExactness,
+}
+
 /// Return the list of initialised ranking rules to be used for a query graph search.
 fn get_ranking_rules_for_query_graph_search<'ctx>(
    ctx: &SearchContext<'ctx>,
    sort_criteria: &Option<Vec<AscDesc>>,
    terms_matching_strategy: TermsMatchingStrategy,
-) -> Result<Vec<BoxRankingRule<'ctx, QueryGraph>>> {
+) -> Result<(Vec<BoxRankingRule<'ctx, QueryGraph>>, UniverseResolutionMethod)> {
    // query graph search
    let mut words = false;
    let mut typo = false;
@ -179,10 +254,12 @@ fn get_ranking_rules_for_query_graph_search<'ctx>(
    let mut asc = HashSet::new();
    let mut desc = HashSet::new();

+    let mut universe_resolution_method = UniverseResolutionMethod::TermMatchingStrategyOnly;
+
    let mut ranking_rules: Vec<BoxRankingRule<QueryGraph>> = vec![];
    let settings_ranking_rules = ctx.index.criteria(ctx.txn)?;
    for rr in settings_ranking_rules {
-        // Add Words before any of: typo, proximity, attribute, exactness
+        // Add Words before any of: typo, proximity, attribute
        match rr {
            crate::Criterion::Typo | crate::Criterion::Attribute | crate::Criterion::Proximity => {
                if !words {
@ -236,6 +313,11 @@ fn get_ranking_rules_for_query_graph_search<'ctx>(
                ranking_rules.push(Box::new(ExactAttribute::new()));
                ranking_rules.push(Box::new(Exactness::new()));
                exactness = true;
+
+                if !words {
+                    universe_resolution_method =
+                        UniverseResolutionMethod::TermMatchingStrategyAndExactness;
+                }
            }
            crate::Criterion::Asc(field_name) => {
                if asc.contains(&field_name) {
@ -253,7 +335,7 @@ fn get_ranking_rules_for_query_graph_search<'ctx>(
            }
        }
    }
-    Ok(ranking_rules)
+    Ok((ranking_rules, universe_resolution_method))
 }

 fn resolve_sort_criteria<'ctx, Query: RankingRuleQueryTrait>(
@ -335,19 +417,18 @@ pub fn execute_search(

        check_sort_criteria(ctx, sort_criteria.as_ref())?;

-        // TODO: if the exactness criterion is the first one, then
-        // use a different strategy to find the universe (union of any term)
-        universe = resolve_maximally_reduced_query_graph(
+        let (ranking_rules, universe_resolution_method) =
+            get_ranking_rules_for_query_graph_search(ctx, sort_criteria, terms_matching_strategy)?;
+
+        universe = resolve_universe(
            ctx,
            &universe,
            &graph,
+            universe_resolution_method,
            terms_matching_strategy,
            query_graph_logger,
        )?;

-        let ranking_rules =
-            get_ranking_rules_for_query_graph_search(ctx, sort_criteria, terms_matching_strategy)?;
-
        bucket_sort(ctx, ranking_rules, &graph, &universe, from, length, query_graph_logger)?
    } else {
        let ranking_rules = get_ranking_rules_for_placeholder_search(ctx, sort_criteria)?;
--- a/milli/src/search/new/tests/exactness.rs
+++ b/milli/src/search/new/tests/exactness.rs
@ -0,0 +1,442 @@
+/*!
+This module tests the following properties about the exactness ranking rule:
+
+- it sorts documents as follows:
+    1. documents which have an attribute which is equal to the whole query
+    2. documents which have an attribute which starts with the whole query
+    3. documents which contain the most exact words from the query
+
+- the set of all candidates when `exactness` precedes `words` is the union of:
+    1. the same set of candidates that would be returned normally
+    2. the set of documents that contain at least one exact word from the query
+
+- if it is placed after `words`, then it will only sort documents by:
+    1. those that have an attribute which is equal to the whole remaining query, if this query does not have any "gap"
+    2. those that have an attribute which starts with the whole remaining query, if this query does not have any "gap"
+    3. those that contain the most exact words from the remaining query
+*/
+
+use crate::{
+    index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search,
+    SearchResult, TermsMatchingStrategy,
+};
+
+fn create_index_exact_words_simple_ordered() -> TempIndex {
+    let index = TempIndex::new();
+
+    index
+        .update_settings(|s| {
+            s.set_primary_key("id".to_owned());
+            s.set_searchable_fields(vec!["text".to_owned()]);
+            s.set_criteria(vec![Criterion::Exactness]);
+        })
+        .unwrap();
+
+    index
+        .add_documents(documents!([
+            {
+                "id": 0,
+                "text": "",
+            },
+            {
+                "id": 1,
+                "text": "the",
+            },
+            {
+                "id": 2,
+                "text": "the quick",
+            },
+            {
+                "id": 3,
+                "text": "the quick brown",
+            },
+            {
+                "id": 4,
+                "text": "the quick brown fox",
+            },
+            {
+                "id": 5,
+                "text": "the quick brown fox jumps",
+            },
+
+            {
+                "id": 6,
+                "text": "the quick brown fox jumps over",
+            },
+            {
+                "id": 7,
+                "text": "the quick brown fox jumps over the",
+            },
+            {
+                "id": 8,
+                "text": "the quick brown fox jumps over the lazy",
+            },
+            {
+                "id": 9,
+                "text": "the quick brown fox jumps over the lazy dog",
+            },
+        ]))
+        .unwrap();
+    index
+}
+
+fn create_index_exact_words_simple_reversed() -> TempIndex {
+    let index = TempIndex::new();
+
+    index
+        .update_settings(|s| {
+            s.set_primary_key("id".to_owned());
+            s.set_searchable_fields(vec!["text".to_owned()]);
+            s.set_criteria(vec![Criterion::Exactness]);
+        })
+        .unwrap();
+
+    index
+        .add_documents(documents!([
+            {
+                "id": 0,
+                "text": "",
+            },
+            {
+                "id": 1,
+                "text": "dog",
+            },
+            {
+                "id": 2,
+                "text": "lazy dog",
+            },
+            {
+                "id": 3,
+                "text": "the lazy dog",
+            },
+            {
+                "id": 4,
+                "text": "over the lazy dog",
+            },
+            {
+                "id": 5,
+                "text": "jumps over the lazy dog",
+            },
+            {
+                "id": 6,
+                "text": "fox jumps over the lazy dog",
+            },
+            {
+                "id": 7,
+                "text": "brown fox jumps over the lazy dog",
+            },
+            {
+                "id": 8,
+                "text": "quick brown fox jumps over the lazy dog",
+            },
+            {
+                "id": 9,
+                "text": "the quick brown fox jumps over the lazy dog",
+            }
+        ]))
+        .unwrap();
+    index
+}
+
+fn create_index_exact_words_simple_random() -> TempIndex {
+    let index = TempIndex::new();
+
+    index
+        .update_settings(|s| {
+            s.set_primary_key("id".to_owned());
+            s.set_searchable_fields(vec!["text".to_owned()]);
+            s.set_criteria(vec![Criterion::Exactness]);
+        })
+        .unwrap();
+
+    index
+        .add_documents(documents!([
+            {
+                "id": 0,
+                "text": "",
+            },
+            {
+                "id": 1,
+                "text": "over",
+            },
+            {
+                "id": 2,
+                "text": "jump dog",
+            },
+            {
+                "id": 3,
+                "text": "brown the lazy",
+            },
+            {
+                "id": 4,
+                "text": "jump dog quick the",
+            },
+            {
+                "id": 5,
+                "text": "fox the lazy dog brown",
+            },
+            {
+                "id": 6,
+                "text": "jump fox quick lazy the dog",
+            },
+            {
+                "id": 7,
+                "text": "the dog brown over jumps quick lazy",
+            },
+            {
+                "id": 8,
+                "text": "the jumps dog quick over brown lazy fox",
+            }
+        ]))
+        .unwrap();
+    index
+}
+
+fn create_index_attribute_starts_with() -> TempIndex {
+    let index = TempIndex::new();
+
+    index
+        .update_settings(|s| {
+            s.set_primary_key("id".to_owned());
+            s.set_searchable_fields(vec!["text".to_owned()]);
+            s.set_criteria(vec![Criterion::Exactness]);
+        })
+        .unwrap();
+
+    index
+        .add_documents(documents!([
+            {
+                "id": 0,
+                "text": "what a lovely view from this balcony, I love it",
+            },
+            {
+                "id": 1,
+                "text": "this balcony is overlooking the sea",
+            },
+            {
+                "id": 2,
+                "text": "this balcony",
+            },
+            {
+                "id": 3,
+                "text": "over looking the sea is a beautiful balcony",
+            },
+            {
+                "id": 4,
+                "text": "a beautiful balcony is overlooking the sea",
+            },
+            {
+                "id": 5,
+                "text": "overlooking the sea is a beautiful balcony, I love it",
+            },
+            {
+                "id": 6,
+                "text": "overlooking the sea is a beautiful balcony",
+            },
+            {
+                "id": 7,
+                "text": "overlooking",
+            },
+        ]))
+        .unwrap();
+    index
+}
+
+#[test]
+fn test_exactness_simple_ordered() {
+    let index = create_index_exact_words_simple_ordered();
+
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::Last);
+    s.query("the quick brown fox jumps over the lazy dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 8, 6, 7, 5, 4, 3, 2, 1]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"the quick brown fox jumps over the lazy dog\"",
+        "\"the quick brown fox jumps over the lazy\"",
+        "\"the quick brown fox jumps over\"",
+        "\"the quick brown fox jumps over the\"",
+        "\"the quick brown fox jumps\"",
+        "\"the quick brown fox\"",
+        "\"the quick brown\"",
+        "\"the quick\"",
+        "\"the\"",
+    ]
+    "###);
+}
+
+#[test]
+fn test_exactness_simple_reversed() {
+    let index = create_index_exact_words_simple_reversed();
+
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::Last);
+    s.query("the quick brown fox jumps over the lazy dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 8, 7, 6, 5, 4, 3, 2, 1]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"the quick brown fox jumps over the lazy dog\"",
+        "\"quick brown fox jumps over the lazy dog\"",
+        "\"brown fox jumps over the lazy dog\"",
+        "\"fox jumps over the lazy dog\"",
+        "\"jumps over the lazy dog\"",
+        "\"over the lazy dog\"",
+        "\"the lazy dog\"",
+        "\"lazy dog\"",
+        "\"dog\"",
+    ]
+    "###);
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::Last);
+    s.query("the quick brown fox jumps over the lazy dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 8, 7, 6, 5, 4, 3, 2, 1]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"the quick brown fox jumps over the lazy dog\"",
+        "\"quick brown fox jumps over the lazy dog\"",
+        "\"brown fox jumps over the lazy dog\"",
+        "\"fox jumps over the lazy dog\"",
+        "\"jumps over the lazy dog\"",
+        "\"over the lazy dog\"",
+        "\"the lazy dog\"",
+        "\"lazy dog\"",
+        "\"dog\"",
+    ]
+    "###);
+}
+
+#[test]
+fn test_exactness_simple_random() {
+    let index = create_index_exact_words_simple_random();
+
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::Last);
+    s.query("the quick brown fox jumps over the lazy dog");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[8, 7, 5, 6, 3, 4, 1, 2]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"the jumps dog quick over brown lazy fox\"",
+        "\"the dog brown over jumps quick lazy\"",
+        "\"fox the lazy dog brown\"",
+        "\"jump fox quick lazy the dog\"",
+        "\"brown the lazy\"",
+        "\"jump dog quick the\"",
+        "\"over\"",
+        "\"jump dog\"",
+    ]
+    "###);
+}
+
+#[test]
+fn test_exactness_attribute_starts_with_simple() {
+    let index = create_index_attribute_starts_with();
+
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::Last);
+    s.query("this balcony");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 1, 0, 3, 4, 5, 6]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"this balcony\"",
+        "\"this balcony is overlooking the sea\"",
+        "\"what a lovely view from this balcony, I love it\"",
+        "\"over looking the sea is a beautiful balcony\"",
+        "\"a beautiful balcony is overlooking the sea\"",
+        "\"overlooking the sea is a beautiful balcony, I love it\"",
+        "\"overlooking the sea is a beautiful balcony\"",
+    ]
+    "###);
+}
+
+#[test]
+fn test_exactness_attribute_starts_with_phrase() {
+    let index = create_index_attribute_starts_with();
+
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::Last);
+    s.query("\"overlooking the sea\" is a beautiful balcony");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[5, 6, 4, 3, 1, 0, 2]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    // TODO: this is incorrect, the first document returned here should actually be the second one
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"overlooking the sea is a beautiful balcony, I love it\"",
+        "\"overlooking the sea is a beautiful balcony\"",
+        "\"a beautiful balcony is overlooking the sea\"",
+        "\"over looking the sea is a beautiful balcony\"",
+        "\"this balcony is overlooking the sea\"",
+        "\"what a lovely view from this balcony, I love it\"",
+        "\"this balcony\"",
+    ]
+    "###);
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::Last);
+    s.query("overlooking the sea is a beautiful balcony");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6, 5, 4, 3, 1, 0, 2, 7]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    // TODO: this is correct, so the exactness ranking rule probably has a bug in the handling of phrases
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"overlooking the sea is a beautiful balcony\"",
+        "\"overlooking the sea is a beautiful balcony, I love it\"",
+        "\"a beautiful balcony is overlooking the sea\"",
+        "\"over looking the sea is a beautiful balcony\"",
+        "\"this balcony is overlooking the sea\"",
+        "\"what a lovely view from this balcony, I love it\"",
+        "\"this balcony\"",
+        "\"overlooking\"",
+    ]
+    "###);
+}
+
+#[test]
+fn test_exactness_all_candidates_with_typo() {
+    let index = create_index_attribute_starts_with();
+
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.terms_matching_strategy(TermsMatchingStrategy::Last);
+    s.query("overlocking the sea is a beautiful balcony");
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 4, 5, 6, 1, 0, 2, 7]");
+    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
+    // "overlooking" is returned here because the term matching strategy allows it
+    // but it has the worst exactness score (0 exact words)
+    insta::assert_debug_snapshot!(texts, @r###"
+    [
+        "\"over looking the sea is a beautiful balcony\"",
+        "\"a beautiful balcony is overlooking the sea\"",
+        "\"overlooking the sea is a beautiful balcony, I love it\"",
+        "\"overlooking the sea is a beautiful balcony\"",
+        "\"this balcony is overlooking the sea\"",
+        "\"what a lovely view from this balcony, I love it\"",
+        "\"this balcony\"",
+        "\"overlooking\"",
+    ]
+    "###);
+}
--- a/milli/src/search/new/tests/mod.rs
+++ b/milli/src/search/new/tests/mod.rs
@ -1,6 +1,7 @@
 pub mod attribute_fid;
 pub mod attribute_position;
 pub mod distinct;
+pub mod exactness;
 #[cfg(feature = "default")]
 pub mod language;
 pub mod ngram_split_words;