Avoid a prefix-related worst-case scenario in the proximity criterion

2024-11-23 10:37:41 +08:00 · 2022-12-08 11:53:21 +01:00 · 2022-12-08 11:53:21 +01:00 · 777b387dc4
commit 777b387dc4
parent a8defb585b
4 changed files with 108 additions and 50 deletions
--- a/milli/src/search/criteria/mod.rs
+++ b/milli/src/search/criteria/mod.rs
@ -17,6 +17,7 @@ use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind};
 use super::CriterionImplementationStrategy;
 use crate::search::criteria::geo::Geo;
 use crate::search::{word_derivations, Distinct, WordDerivationsCache};
+use crate::update::{MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB, MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB};
 use crate::{AscDesc as AscDescName, DocumentId, FieldId, Index, Member, Result};

 mod asc_desc;
@ -653,14 +654,30 @@ fn query_pair_proximity_docids(
    match (&left.kind, &right.kind) {
        (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => {
            if prefix {
-                match word_prefix_pair_overall_proximity_docids(
-                    ctx,
-                    left.as_str(),
-                    right.as_str(),
-                    proximity,
-                )? {
-                    Some(docids) => Ok(docids),
-                    None => {
+                // There are three distinct cases which we need to distinguish regarding the prefix `right`:
+                //
+                // 1. `right` is not in any prefix cache because it is not the prefix of many words
+                //     (and thus, it doesn't have many word derivations)
+                // 2. `right` is in the prefix cache but cannot be found in the "word prefix pair proximity" databases either
+                //     because it is too long or because the given proximity is too high.
+                // 3. `right` is in the prefix cache and can be found in the "word prefix pair proximity" databases
+                //
+                // The three cases are handled as follows:
+                // 1. We manually retrieve all the word derivations of `right` and check the `word_pair_proximity`
+                //    database for each of them.
+                // 2. It would be too expensive to apply the same strategy as (1), therefore, we "disable" the
+                //    proximity ranking rule for the prefixes of the right word. This is done as follows:
+                //    1. Only find the documents where left is in proximity to the exact (ie non-prefix) right word
+                //    2. Otherwise, assume that their proximity in all the documents in which they coexist is >= 8
+                //
+                // 3. Query the prefix proximity databases.
+                match (
+                    ctx.in_prefix_cache(right),
+                    right.len() <= MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB
+                        && proximity <= MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB,
+                ) {
+                    // Case 1: not in prefix cache
+                    (false, _) => {
                        let r_words = word_derivations(right, true, 0, ctx.words_fst(), wdcache)?;
                        all_word_pair_overall_proximity_docids(
                            ctx,
@ -669,40 +686,91 @@ fn query_pair_proximity_docids(
                            proximity,
                        )
                    }
+                    // Case 2: in prefix cache but either the prefix length or the proximity makes it impossible to
+                    // query the prefix proximity databases.
+                    (true, false) => {
+                        // To "save" the relevancy a little bit, we still find the documents where the
+                        // exact (i.e. non-prefix) right word is in the given proximity to the left word.
+                        Ok(word_pair_overall_proximity_docids(
+                            ctx,
+                            left.as_str(),
+                            right.as_str(),
+                            proximity,
+                        )?
+                        .unwrap_or_default())
+                    }
+                    // Case 3: in prefix cache, short enough, and proximity is low enough
+                    (true, true) => Ok(word_prefix_pair_overall_proximity_docids(
+                        ctx,
+                        left.as_str(),
+                        right.as_str(),
+                        proximity,
+                    )?
+                    .unwrap_or_default()),
                }
            } else {
-                Ok(ctx
-                    .word_pair_proximity_docids(left.as_str(), right.as_str(), proximity)?
-                    .unwrap_or_default())
+                Ok(word_pair_overall_proximity_docids(
+                    ctx,
+                    left.as_str(),
+                    right.as_str(),
+                    proximity,
+                )?
+                .unwrap_or_default())
            }
        }
        (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }) => {
            let l_words =
                word_derivations(left, false, *typo, ctx.words_fst(), wdcache)?.to_owned();
            if prefix {
-                let mut docids = RoaringBitmap::new();
-                for (left, _) in l_words {
-                    let current_docids = match word_prefix_pair_overall_proximity_docids(
-                        ctx,
-                        left.as_str(),
-                        right.as_str(),
-                        proximity,
-                    )? {
-                        Some(docids) => Ok(docids),
-                        None => {
-                            let r_words =
-                                word_derivations(right, true, 0, ctx.words_fst(), wdcache)?;
-                            all_word_pair_overall_proximity_docids(
+                // The logic here is almost identical to the one in the previous match branch.
+                // The difference is that we fetch the docids for each derivation of the left word.
+                match (
+                    ctx.in_prefix_cache(right),
+                    right.len() <= MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB
+                        && proximity <= MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB,
+                ) {
+                    // Case 1: not in prefix cache
+                    (false, _) => {
+                        let mut docids = RoaringBitmap::new();
+                        let r_words = word_derivations(right, true, 0, ctx.words_fst(), wdcache)?;
+                        for (left, _) in l_words {
+                            docids |= all_word_pair_overall_proximity_docids(
                                ctx,
                                &[(left, 0)],
                                r_words,
                                proximity,
-                            )
+                            )?;
                        }
-                    }?;
-                    docids |= current_docids;
+                        Ok(docids)
+                    }
+                    // Case 2: in prefix cache but either the prefix length or the proximity makes it impossible to
+                    // query the prefix proximity databases.
+                    (true, false) => {
+                        // To "save" the relevancy a little bit, we still find the documents where the
+                        // exact (i.e. non-prefix) right word is in proximity to any derivation of the left word.
+                        let mut candidates = RoaringBitmap::new();
+                        for (left, _) in l_words {
+                            candidates |= ctx
+                                .word_pair_proximity_docids(&left, right, proximity)?
+                                .unwrap_or_default();
+                        }
+                        Ok(candidates)
+                    }
+                    // Case 3: in prefix cache, short enough, and proximity is low enough
+                    (true, true) => {
+                        let mut docids = RoaringBitmap::new();
+                        for (left, _) in l_words {
+                            docids |= word_prefix_pair_overall_proximity_docids(
+                                ctx,
+                                left.as_str(),
+                                right.as_str(),
+                                proximity,
+                            )?
+                            .unwrap_or_default();
+                        }
+                        Ok(docids)
+                    }
                }
-                Ok(docids)
            } else {
                all_word_pair_overall_proximity_docids(ctx, &l_words, &[(right, 0)], proximity)
            }
--- a/milli/src/search/criteria/proximity.rs
+++ b/milli/src/search/criteria/proximity.rs
@ -590,3 +590,6 @@ fn resolve_plane_sweep_candidates(

    Ok(candidates)
 }
+
+#[cfg(test)]
+mod tests {}
--- a/milli/src/update/mod.rs
+++ b/milli/src/update/mod.rs
@ -7,7 +7,10 @@ pub use self::index_documents::{
    DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod,
 };
 pub use self::indexer_config::IndexerConfig;
-pub use self::prefix_word_pairs::PrefixWordPairsProximityDocids;
+pub use self::prefix_word_pairs::{
+    PrefixWordPairsProximityDocids, MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB,
+    MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB,
+};
 pub use self::settings::{Setting, Settings};
 pub use self::update_step::UpdateIndexingStep;
 pub use self::word_prefix_docids::WordPrefixDocids;
--- a/milli/src/update/prefix_word_pairs/mod.rs
+++ b/milli/src/update/prefix_word_pairs/mod.rs
@ -14,6 +14,9 @@ mod word_prefix;
 pub use prefix_word::index_prefix_word_database;
 pub use word_prefix::index_word_prefix_database;

+pub const MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB: u8 = 4;
+pub const MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB: usize = 2;
+
 pub struct PrefixWordPairsProximityDocids<'t, 'u, 'i> {
    wtxn: &'t mut heed::RwTxn<'i, 'u>,
    index: &'i Index,
@ -32,31 +35,12 @@ impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> {
        Self {
            wtxn,
            index,
-            max_proximity: 4,
-            max_prefix_length: 2,
+            max_proximity: MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB,
+            max_prefix_length: MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB,
            chunk_compression_type,
            chunk_compression_level,
        }
    }
-    /// Set the maximum proximity required to make a prefix be part of the words prefixes
-    /// database. If two words are too far from the threshold the associated documents will
-    /// not be part of the prefix database.
-    ///
-    /// Default value is 4. This value must be lower or equal than 7 and will be clamped
-    /// to this bound otherwise.
-    pub fn max_proximity(&mut self, value: u8) -> &mut Self {
-        self.max_proximity = value.max(7);
-        self
-    }
-    /// Set the maximum length the prefix of a word pair is allowed to have to be part of the words
-    /// prefixes database. If the prefix length is higher than the threshold, the associated documents
-    /// will not be part of the prefix database.
-    ///
-    /// Default value is 2.
-    pub fn max_prefix_length(&mut self, value: usize) -> &mut Self {
-        self.max_prefix_length = value;
-        self
-    }

    #[logging_timer::time("WordPrefixPairProximityDocids::{}")]
    pub fn execute<'a>(