Add tests for "exactness" rr, make correct universe computation

2024-11-30 00:55:00 +08:00 · 2023-04-24 16:57:12 +02:00 · 2023-04-24 16:57:12 +02:00 · 8f2e971879
commit 8f2e971879
parent d1fdbb63da
3 changed files with 533 additions and 9 deletions
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@ -51,6 +51,7 @@ use resolve_query_graph::compute_query_graph_docids;
 use sort::Sort;
 use self::interner::Interned;
 use self::query_term::ExactTerm;
 /// A structure used throughout the execution of a search query.
 pub struct SearchContext<'ctx> {
@ -119,6 +120,75 @@ fn resolve_maximally_reduced_query_graph(
    Ok(docids)
 }
 fn resolve_docids_containing_any_exact_word(
    ctx: &mut SearchContext,
    universe: &RoaringBitmap,
    query_graph: &QueryGraph,
 ) -> Result<RoaringBitmap> {
    let mut docids = RoaringBitmap::new();
    for (_, node) in query_graph.nodes.iter() {
        let term = match &node.data {
            query_graph::QueryNodeData::Term(term) => term,
            query_graph::QueryNodeData::Deleted
            | query_graph::QueryNodeData::Start
            | query_graph::QueryNodeData::End => {
                continue;
            }
        };
        if term.term_ids.len() != 1 {
            continue;
        }
        let Some(exact_term) = term.term_subset.exact_term(ctx) else {
            continue
        };
        let exact_term_docids = match exact_term {
            ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(phrase)? & universe,
            ExactTerm::Word(word) => {
                if let Some(word_docids) = ctx.word_docids(Word::Original(word))? {
                    word_docids & universe
                } else {
                    continue;
                }
            }
        };
        docids |= exact_term_docids;
    }
    Ok(docids)
 }
 fn resolve_universe(
    ctx: &mut SearchContext,
    initial_universe: &RoaringBitmap,
    query_graph: &QueryGraph,
    method: UniverseResolutionMethod,
    matching_strategy: TermsMatchingStrategy,
    logger: &mut dyn SearchLogger<QueryGraph>,
 ) -> Result<RoaringBitmap> {
    match method {
        UniverseResolutionMethod::TermMatchingStrategyOnly => {
            resolve_maximally_reduced_query_graph(
                ctx,
                initial_universe,
                query_graph,
                matching_strategy,
                logger,
            )
        }
        UniverseResolutionMethod::TermMatchingStrategyAndExactness => {
            let mut resolved_universe = resolve_maximally_reduced_query_graph(
                ctx,
                initial_universe,
                query_graph,
                matching_strategy,
                logger,
            )?;
            resolved_universe |=
                resolve_docids_containing_any_exact_word(ctx, initial_universe, query_graph)?;
            Ok(resolved_universe)
        }
    }
 }
 /// Return the list of initialised ranking rules to be used for a placeholder search.
 fn get_ranking_rules_for_placeholder_search<'ctx>(
    ctx: &SearchContext<'ctx>,
@ -163,12 +233,17 @@ fn get_ranking_rules_for_placeholder_search<'ctx>(
    Ok(ranking_rules)
 }
 enum UniverseResolutionMethod {
    TermMatchingStrategyOnly,
    TermMatchingStrategyAndExactness,
 }
 /// Return the list of initialised ranking rules to be used for a query graph search.
 fn get_ranking_rules_for_query_graph_search<'ctx>(
    ctx: &SearchContext<'ctx>,
    sort_criteria: &Option<Vec<AscDesc>>,
    terms_matching_strategy: TermsMatchingStrategy,
-) -> Result<Vec<BoxRankingRule<'ctx, QueryGraph>>> {
+) -> Result<(Vec<BoxRankingRule<'ctx, QueryGraph>>, UniverseResolutionMethod)> {
    // query graph search
    let mut words = false;
    let mut typo = false;
@ -179,10 +254,12 @@ fn get_ranking_rules_for_query_graph_search<'ctx>(
    let mut asc = HashSet::new();
    let mut desc = HashSet::new();
    let mut universe_resolution_method = UniverseResolutionMethod::TermMatchingStrategyOnly;
    let mut ranking_rules: Vec<BoxRankingRule<QueryGraph>> = vec![];
    let settings_ranking_rules = ctx.index.criteria(ctx.txn)?;
    for rr in settings_ranking_rules {
-        // Add Words before any of: typo, proximity, attribute, exactness
+        // Add Words before any of: typo, proximity, attribute
        match rr {
            crate::Criterion::Typo | crate::Criterion::Attribute | crate::Criterion::Proximity => {
                if !words {
@ -236,6 +313,11 @@ fn get_ranking_rules_for_query_graph_search<'ctx>(
                ranking_rules.push(Box::new(ExactAttribute::new()));
                ranking_rules.push(Box::new(Exactness::new()));
                exactness = true;
                if !words {
                    universe_resolution_method =
                        UniverseResolutionMethod::TermMatchingStrategyAndExactness;
                }
            }
            crate::Criterion::Asc(field_name) => {
                if asc.contains(&field_name) {
@ -253,7 +335,7 @@ fn get_ranking_rules_for_query_graph_search<'ctx>(
            }
        }
    }
-    Ok(ranking_rules)
+    Ok((ranking_rules, universe_resolution_method))
 }
 fn resolve_sort_criteria<'ctx, Query: RankingRuleQueryTrait>(
@ -335,19 +417,18 @@ pub fn execute_search(
        check_sort_criteria(ctx, sort_criteria.as_ref())?;
-        // TODO: if the exactness criterion is the first one, then
+        let (ranking_rules, universe_resolution_method) =
-        // use a different strategy to find the universe (union of any term)
+            get_ranking_rules_for_query_graph_search(ctx, sort_criteria, terms_matching_strategy)?;
-        universe = resolve_maximally_reduced_query_graph(
+
        universe = resolve_universe(
            ctx,
            &universe,
            &graph,
            universe_resolution_method,
            terms_matching_strategy,
            query_graph_logger,
        )?;
        let ranking_rules =
            get_ranking_rules_for_query_graph_search(ctx, sort_criteria, terms_matching_strategy)?;
        bucket_sort(ctx, ranking_rules, &graph, &universe, from, length, query_graph_logger)?
    } else {
        let ranking_rules = get_ranking_rules_for_placeholder_search(ctx, sort_criteria)?;
--- a/milli/src/search/new/tests/exactness.rs
+++ b/milli/src/search/new/tests/exactness.rs
@ -0,0 +1,442 @@
 /*!
 This module tests the following properties about the exactness ranking rule:
 - it sorts documents as follows:
    1. documents which have an attribute which is equal to the whole query
    2. documents which have an attribute which start with the whole query
    3. documents which contain the most exact words from the query
 - the set of all candidates when `exactness` precedes `word` is the union of:
    1. the same set of candidates that would be returned normally
    2. the set of documents that contain at least one exact word from the query
 - if it is placed after `word`, then it will only sort documents by:
    1. those that have an attribute which is equal to the whole remaining query, if this query does not have any "gap"
    2. those that have an attribute which start with the whole remaining query, if this query does not have any "gap"
    3. those that contain the most exact words from the remaining query
 */
 use crate::{
    index::tests::TempIndex, search::new::tests::collect_field_values, Criterion, Search,
    SearchResult, TermsMatchingStrategy,
 };
 fn create_index_exact_words_simple_ordered() -> TempIndex {
    let index = TempIndex::new();
    index
        .update_settings(|s| {
            s.set_primary_key("id".to_owned());
            s.set_searchable_fields(vec!["text".to_owned()]);
            s.set_criteria(vec![Criterion::Exactness]);
        })
        .unwrap();
    index
        .add_documents(documents!([
            {
                "id": 0,
                "text": "",
            },
            {
                "id": 1,
                "text": "the",
            },
            {
                "id": 2,
                "text": "the quick",
            },
            {
                "id": 3,
                "text": "the quick brown",
            },
            {
                "id": 4,
                "text": "the quick brown fox",
            },
            {
                "id": 5,
                "text": "the quick brown fox jumps",
            },
            {
                "id": 6,
                "text": "the quick brown fox jumps over",
            },
            {
                "id": 7,
                "text": "the quick brown fox jumps over the",
            },
            {
                "id": 8,
                "text": "the quick brown fox jumps over the lazy",
            },
            {
                "id": 9,
                "text": "the quick brown fox jumps over the lazy dog",
            },
        ]))
        .unwrap();
    index
 }
 fn create_index_exact_words_simple_reversed() -> TempIndex {
    let index = TempIndex::new();
    index
        .update_settings(|s| {
            s.set_primary_key("id".to_owned());
            s.set_searchable_fields(vec!["text".to_owned()]);
            s.set_criteria(vec![Criterion::Exactness]);
        })
        .unwrap();
    index
        .add_documents(documents!([
            {
                "id": 0,
                "text": "",
            },
            {
                "id": 1,
                "text": "dog",
            },
            {
                "id": 2,
                "text": "lazy dog",
            },
            {
                "id": 3,
                "text": "the lazy dog",
            },
            {
                "id": 4,
                "text": "over the lazy dog",
            },
            {
                "id": 5,
                "text": "jumps over the lazy dog",
            },
            {
                "id": 6,
                "text": "fox jumps over the lazy dog",
            },
            {
                "id": 7,
                "text": "brown fox jumps over the lazy dog",
            },
            {
                "id": 8,
                "text": "quick brown fox jumps over the lazy dog",
            },
            {
                "id": 9,
                "text": "the quick brown fox jumps over the lazy dog",
            }
        ]))
        .unwrap();
    index
 }
 fn create_index_exact_words_simple_random() -> TempIndex {
    let index = TempIndex::new();
    index
        .update_settings(|s| {
            s.set_primary_key("id".to_owned());
            s.set_searchable_fields(vec!["text".to_owned()]);
            s.set_criteria(vec![Criterion::Exactness]);
        })
        .unwrap();
    index
        .add_documents(documents!([
            {
                "id": 0,
                "text": "",
            },
            {
                "id": 1,
                "text": "over",
            },
            {
                "id": 2,
                "text": "jump dog",
            },
            {
                "id": 3,
                "text": "brown the lazy",
            },
            {
                "id": 4,
                "text": "jump dog quick the",
            },
            {
                "id": 5,
                "text": "fox the lazy dog brown",
            },
            {
                "id": 6,
                "text": "jump fox quick lazy the dog",
            },
            {
                "id": 7,
                "text": "the dog brown over jumps quick lazy",
            },
            {
                "id": 8,
                "text": "the jumps dog quick over brown lazy fox",
            }
        ]))
        .unwrap();
    index
 }
 fn create_index_attribute_starts_with() -> TempIndex {
    let index = TempIndex::new();
    index
        .update_settings(|s| {
            s.set_primary_key("id".to_owned());
            s.set_searchable_fields(vec!["text".to_owned()]);
            s.set_criteria(vec![Criterion::Exactness]);
        })
        .unwrap();
    index
        .add_documents(documents!([
            {
                "id": 0,
                "text": "what a lovely view from this balcony, I love it",
            },
            {
                "id": 1,
                "text": "this balcony is overlooking the sea",
            },
            {
                "id": 2,
                "text": "this balcony",
            },
            {
                "id": 3,
                "text": "over looking the sea is a beautiful balcony",
            },
            {
                "id": 4,
                "text": "a beautiful balcony is overlooking the sea",
            },
            {
                "id": 5,
                "text": "overlooking the sea is a beautiful balcony, I love it",
            },
            {
                "id": 6,
                "text": "overlooking the sea is a beautiful balcony",
            },
            {
                "id": 7,
                "text": "overlooking",
            },
        ]))
        .unwrap();
    index
 }
 #[test]
 fn test_exactness_simple_ordered() {
    let index = create_index_exact_words_simple_ordered();
    let txn = index.read_txn().unwrap();
    let mut s = Search::new(&txn, &index);
    s.terms_matching_strategy(TermsMatchingStrategy::Last);
    s.query("the quick brown fox jumps over the lazy dog");
    let SearchResult { documents_ids, .. } = s.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 8, 6, 7, 5, 4, 3, 2, 1]");
    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
    insta::assert_debug_snapshot!(texts, @r###"
    [
        "\"the quick brown fox jumps over the lazy dog\"",
        "\"the quick brown fox jumps over the lazy\"",
        "\"the quick brown fox jumps over\"",
        "\"the quick brown fox jumps over the\"",
        "\"the quick brown fox jumps\"",
        "\"the quick brown fox\"",
        "\"the quick brown\"",
        "\"the quick\"",
        "\"the\"",
    ]
    "###);
 }
 #[test]
 fn test_exactness_simple_reversed() {
    let index = create_index_exact_words_simple_reversed();
    let txn = index.read_txn().unwrap();
    let mut s = Search::new(&txn, &index);
    s.terms_matching_strategy(TermsMatchingStrategy::Last);
    s.query("the quick brown fox jumps over the lazy dog");
    let SearchResult { documents_ids, .. } = s.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 8, 7, 6, 5, 4, 3, 2, 1]");
    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
    insta::assert_debug_snapshot!(texts, @r###"
    [
        "\"the quick brown fox jumps over the lazy dog\"",
        "\"quick brown fox jumps over the lazy dog\"",
        "\"brown fox jumps over the lazy dog\"",
        "\"fox jumps over the lazy dog\"",
        "\"jumps over the lazy dog\"",
        "\"over the lazy dog\"",
        "\"the lazy dog\"",
        "\"lazy dog\"",
        "\"dog\"",
    ]
    "###);
    let mut s = Search::new(&txn, &index);
    s.terms_matching_strategy(TermsMatchingStrategy::Last);
    s.query("the quick brown fox jumps over the lazy dog");
    let SearchResult { documents_ids, .. } = s.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[9, 8, 7, 6, 5, 4, 3, 2, 1]");
    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
    insta::assert_debug_snapshot!(texts, @r###"
    [
        "\"the quick brown fox jumps over the lazy dog\"",
        "\"quick brown fox jumps over the lazy dog\"",
        "\"brown fox jumps over the lazy dog\"",
        "\"fox jumps over the lazy dog\"",
        "\"jumps over the lazy dog\"",
        "\"over the lazy dog\"",
        "\"the lazy dog\"",
        "\"lazy dog\"",
        "\"dog\"",
    ]
    "###);
 }
 #[test]
 fn test_exactness_simple_random() {
    let index = create_index_exact_words_simple_random();
    let txn = index.read_txn().unwrap();
    let mut s = Search::new(&txn, &index);
    s.terms_matching_strategy(TermsMatchingStrategy::Last);
    s.query("the quick brown fox jumps over the lazy dog");
    let SearchResult { documents_ids, .. } = s.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[8, 7, 5, 6, 3, 4, 1, 2]");
    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
    insta::assert_debug_snapshot!(texts, @r###"
    [
        "\"the jumps dog quick over brown lazy fox\"",
        "\"the dog brown over jumps quick lazy\"",
        "\"fox the lazy dog brown\"",
        "\"jump fox quick lazy the dog\"",
        "\"brown the lazy\"",
        "\"jump dog quick the\"",
        "\"over\"",
        "\"jump dog\"",
    ]
    "###);
 }
 #[test]
 fn test_exactness_attribute_starts_with_simple() {
    let index = create_index_attribute_starts_with();
    let txn = index.read_txn().unwrap();
    let mut s = Search::new(&txn, &index);
    s.terms_matching_strategy(TermsMatchingStrategy::Last);
    s.query("this balcony");
    let SearchResult { documents_ids, .. } = s.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 1, 0, 3, 4, 5, 6]");
    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
    insta::assert_debug_snapshot!(texts, @r###"
    [
        "\"this balcony\"",
        "\"this balcony is overlooking the sea\"",
        "\"what a lovely view from this balcony, I love it\"",
        "\"over looking the sea is a beautiful balcony\"",
        "\"a beautiful balcony is overlooking the sea\"",
        "\"overlooking the sea is a beautiful balcony, I love it\"",
        "\"overlooking the sea is a beautiful balcony\"",
    ]
    "###);
 }
 #[test]
 fn test_exactness_attribute_starts_with_phrase() {
    let index = create_index_attribute_starts_with();
    let txn = index.read_txn().unwrap();
    let mut s = Search::new(&txn, &index);
    s.terms_matching_strategy(TermsMatchingStrategy::Last);
    s.query("\"overlooking the sea\" is a beautiful balcony");
    let SearchResult { documents_ids, .. } = s.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[5, 6, 4, 3, 1, 0, 2]");
    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
    // TODO: this is incorrect, the first document returned here should actually be the second one
    insta::assert_debug_snapshot!(texts, @r###"
    [
        "\"overlooking the sea is a beautiful balcony, I love it\"",
        "\"overlooking the sea is a beautiful balcony\"",
        "\"a beautiful balcony is overlooking the sea\"",
        "\"over looking the sea is a beautiful balcony\"",
        "\"this balcony is overlooking the sea\"",
        "\"what a lovely view from this balcony, I love it\"",
        "\"this balcony\"",
    ]
    "###);
    let mut s = Search::new(&txn, &index);
    s.terms_matching_strategy(TermsMatchingStrategy::Last);
    s.query("overlooking the sea is a beautiful balcony");
    let SearchResult { documents_ids, .. } = s.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6, 5, 4, 3, 1, 0, 2, 7]");
    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
    // TODO: this is correct, so the exactness ranking rule probably has a bug in the handling of phrases
    insta::assert_debug_snapshot!(texts, @r###"
    [
        "\"overlooking the sea is a beautiful balcony\"",
        "\"overlooking the sea is a beautiful balcony, I love it\"",
        "\"a beautiful balcony is overlooking the sea\"",
        "\"over looking the sea is a beautiful balcony\"",
        "\"this balcony is overlooking the sea\"",
        "\"what a lovely view from this balcony, I love it\"",
        "\"this balcony\"",
        "\"overlooking\"",
    ]
    "###);
 }
 #[test]
 fn test_exactness_all_candidates_with_typo() {
    let index = create_index_attribute_starts_with();
    let txn = index.read_txn().unwrap();
    let mut s = Search::new(&txn, &index);
    s.terms_matching_strategy(TermsMatchingStrategy::Last);
    s.query("overlocking the sea is a beautiful balcony");
    let SearchResult { documents_ids, .. } = s.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 4, 5, 6, 1, 0, 2, 7]");
    let texts = collect_field_values(&index, &txn, "text", &documents_ids);
    // "overlooking" is returned here because the term matching strategy allows it
    // but it has the worst exactness score (0 exact words)
    insta::assert_debug_snapshot!(texts, @r###"
    [
        "\"over looking the sea is a beautiful balcony\"",
        "\"a beautiful balcony is overlooking the sea\"",
        "\"overlooking the sea is a beautiful balcony, I love it\"",
        "\"overlooking the sea is a beautiful balcony\"",
        "\"this balcony is overlooking the sea\"",
        "\"what a lovely view from this balcony, I love it\"",
        "\"this balcony\"",
        "\"overlooking\"",
    ]
    "###);
 }
--- a/milli/src/search/new/tests/mod.rs
+++ b/milli/src/search/new/tests/mod.rs
@ -1,6 +1,7 @@
 pub mod attribute_fid;
 pub mod attribute_position;
 pub mod distinct;
 pub mod exactness;
 #[cfg(feature = "default")]
 pub mod language;
 pub mod ngram_split_words;