Fix code visibility issue + unimplemented detail in proximity rule

2025-02-20 17:45:54 +08:00 · 2023-03-05 20:07:37 +01:00 · 2023-03-05 20:07:37 +01:00 · c4979a2fda
commit c4979a2fda
parent 23931f8a4f
9 changed files with 99 additions and 70 deletions
--- a/milli/src/search/new/db_cache.rs
+++ b/milli/src/search/new/db_cache.rs
@ -121,18 +121,18 @@ impl<'transaction> DatabaseCache<'transaction> {
        &mut self,
        index: &Index,
        txn: &'transaction RoTxn,
-        word1: &str,
-        prefix2: &str,
+        left_prefix: &str,
+        right: &str,
        proximity: u8,
    ) -> Result<Option<&'transaction [u8]>> {
-        let key = (proximity, prefix2.to_owned(), word1.to_owned());
+        let key = (proximity, left_prefix.to_owned(), right.to_owned());
        match self.prefix_word_pair_proximity_docids.entry(key) {
            Entry::Occupied(bitmap_ptr) => Ok(*bitmap_ptr.get()),
            Entry::Vacant(entry) => {
                let bitmap_ptr = index
                    .prefix_word_pair_proximity_docids
                    .remap_data_type::<ByteSlice>()
-                    .get(txn, &(proximity, prefix2, word1))?;
+                    .get(txn, &(proximity, left_prefix, right))?;
                entry.insert(bitmap_ptr);
                Ok(bitmap_ptr)
            }
--- a/milli/src/search/new/logger/detailed.rs
+++ b/milli/src/search/new/logger/detailed.rs
@ -194,7 +194,7 @@ impl DetailedSearchLogger {
        for event in self.events.iter() {
            match event {
                SearchEvents::RankingRuleStartIteration { ranking_rule_idx, time, .. } => {
-                    let elapsed = time.duration_since(prev_time);
+                    let _elapsed = time.duration_since(prev_time);
                    prev_time = *time;
                    let parent_activated_id = activated_id(&timestamp);
                    timestamp.push(0);
@ -216,7 +216,7 @@ impl DetailedSearchLogger {
 }}").unwrap();
                }
                SearchEvents::RankingRuleNextBucket { ranking_rule_idx, time, universe, candidates } => {
-                    let elapsed = time.duration_since(prev_time);
+                    let _elapsed = time.duration_since(prev_time);
                    prev_time = *time;
                    let old_activated_id = activated_id(&timestamp);
                    // writeln!(&mut file, "time.{old_activated_id}: {:.2}", elapsed.as_micros() as f64 / 1000.0).unwrap();
@ -227,7 +227,7 @@ impl DetailedSearchLogger {
                        .unwrap();
                }
                SearchEvents::RankingRuleSkipBucket { ranking_rule_idx, candidates, time } => {
-                    let elapsed = time.duration_since(prev_time);
+                    let _elapsed = time.duration_since(prev_time);
                    prev_time = *time;
                    let old_activated_id = activated_id(&timestamp);
                    // writeln!(&mut file, "time.{old_activated_id}: {:.2}", elapsed.as_micros() as f64 / 1000.0).unwrap();
@ -239,7 +239,7 @@ impl DetailedSearchLogger {
                        .unwrap();
                }
                SearchEvents::RankingRuleEndIteration { ranking_rule_idx, time, .. } => {
-                    let elapsed = time.duration_since(prev_time);
+                    let _elapsed = time.duration_since(prev_time);
                    prev_time = *time;
                    let cur_activated_id = activated_id(&timestamp);
                    // writeln!(&mut file, "time.{cur_activated_id}: {:.2}", elapsed.as_micros() as f64 / 1000.0).unwrap();
@ -332,7 +332,7 @@ results.{random} {{
        writeln!(&mut file, "}}").unwrap();
    }
    
-    fn query_node_d2_desc(node_idx: usize, node: &QueryNode, distances: &[u64], file: &mut File) {
+    fn query_node_d2_desc(node_idx: usize, node: &QueryNode, _distances: &[u64], file: &mut File) {
        match &node {
            QueryNode::Term(LocatedQueryTerm { value, .. }) => {
                match value {
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@ -1,18 +1,22 @@
-pub mod db_cache;
-pub mod graph_based_ranking_rule;
-pub mod logger;
-pub mod query_graph;
-pub mod query_term;
-pub mod ranking_rule_graph;
-pub mod ranking_rules;
-pub mod resolve_query_graph;
-pub mod sort;
-pub mod words;
+mod db_cache;
+mod graph_based_ranking_rule;
+mod logger;
+mod query_graph;
+mod query_term;
+mod ranking_rule_graph;
+mod ranking_rules;
+mod resolve_query_graph;
+mod sort;
+mod words;

 use charabia::Tokenize;
 use heed::RoTxn;
-pub use query_graph::*;
-pub use ranking_rules::*;
+
+use query_graph::{QueryGraph, QueryNode};
+pub use ranking_rules::{
+    execute_search, RankingRule, RankingRuleOutput, RankingRuleOutputIter,
+    RankingRuleOutputIterWrapper, RankingRuleQueryTrait,
+};
 use roaring::RoaringBitmap;

 use self::db_cache::DatabaseCache;
--- a/milli/src/search/new/ranking_rule_graph/paths_map.rs
+++ b/milli/src/search/new/ranking_rule_graph/paths_map.rs
@ -6,6 +6,10 @@ use roaring::RoaringBitmap;

 use super::cheapest_paths::Path;

+// What is PathsMap used for?
+// For the empty_prefixes field in the EmptyPathsCache only :/
+// but it could be used for more, like efficient computing of a set of paths
+

 #[derive(Debug, Clone)]
 pub struct PathsMap<V> {
--- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs
@ -13,16 +13,14 @@ use crate::{Index, Result};

 pub fn visit_from_node(from_node: &QueryNode) -> Result<Option<(WordDerivations, i8)>> {
    Ok(Some(match from_node {
-        QueryNode::Term(LocatedQueryTerm { value: value1, positions: pos1 }) => {
-            match value1 {
-                QueryTerm::Word { derivations } => (derivations.clone(), *pos1.end()),
-                QueryTerm::Phrase { phrase: phrase1 } => {
-                    // TODO: remove second unwrap
-                    let original = phrase1.words.last().unwrap().as_ref().unwrap().clone();
+        QueryNode::Term(LocatedQueryTerm { value: value1, positions: pos1 }) => match value1 {
+            QueryTerm::Word { derivations } => (derivations.clone(), *pos1.end()),
+            QueryTerm::Phrase { phrase: phrase1 } => {
+                if let Some(original) = phrase1.words.last().unwrap().as_ref() {
                    (
                        WordDerivations {
                            original: original.clone(),
-                            zero_typo: vec![original],
+                            zero_typo: vec![original.to_owned()],
                            one_typo: vec![],
                            two_typos: vec![],
                            use_prefix_db: false,
@ -31,9 +29,12 @@ pub fn visit_from_node(from_node: &QueryNode) -> Result<Option<(WordDerivations,
                        },
                        *pos1.end(),
                    )
+                } else {
+                    // No word pairs if the phrase does not have a regular word as its last term
+                    return Ok(None);
                }
            }
-        }
+        },
        QueryNode::Start => (
            WordDerivations {
                original: String::new(),
@ -68,26 +69,27 @@ pub fn visit_to_node<'transaction, 'from_data>(
    let (derivations2, pos2, ngram_len2) = match value2 {
        QueryTerm::Word { derivations } => (derivations.clone(), *pos2.start(), pos2.len()),
        QueryTerm::Phrase { phrase: phrase2 } => {
-            // TODO: remove second unwrap
-            let original = phrase2.words.last().unwrap().as_ref().unwrap().clone();
-            (
-                WordDerivations {
-                    original: original.clone(),
-                    zero_typo: vec![original],
-                    one_typo: vec![],
-                    two_typos: vec![],
-                    use_prefix_db: false,
-                    synonyms: vec![],
-                    split_words: None,
-                },
-                *pos2.start(),
-                1,
-            )
+            if let Some(original) = phrase2.words.first().unwrap().as_ref() {
+                (
+                    WordDerivations {
+                        original: original.clone(),
+                        zero_typo: vec![original.to_owned()],
+                        one_typo: vec![],
+                        two_typos: vec![],
+                        use_prefix_db: false,
+                        synonyms: vec![],
+                        split_words: None,
+                    },
+                    *pos2.start(),
+                    1,
+                )
+            } else {
+                // No word pairs if the phrase does not have a regular word as its first term
+                return Ok(vec![]);
+            }
        }
    };

-    // TODO: here we would actually do it for each combination of word1 and word2
-    // and take the union of them
    if pos1 + 1 != pos2 {
        // TODO: how should this actually be handled?
        // We want to effectively ignore this pair of terms
@ -130,19 +132,37 @@ pub fn visit_to_node<'transaction, 'from_data>(
                            right_prefix: original_word_2.to_owned(),
                        });
                }
+                if db_cache
+                    .get_prefix_word_pair_proximity_docids(
+                        index,
+                        txn,
+                        original_word_2.as_str(),
+                        word1.as_str(),
+                        proximity as u8 - 1,
+                    )?
+                    .is_some()
+                {
+                    cost_proximity_word_pairs
+                        .entry(cost)
+                        .or_default()
+                        .entry(proximity as u8)
+                        .or_default()
+                        .push(WordPair::WordPrefixSwapped {
+                            left_prefix: original_word_2.to_owned(),
+                            right: word1.to_owned(),
+                        });
+                }
            }
        }
    }

    let derivations2 = derivations2.all_derivations_except_prefix_db();
-    // TODO: safeguard in case the cartesian product is too large?
+    // TODO: add safeguard in case the cartesian product is too large?
    let product_derivations = derivations1.cartesian_product(derivations2);

    for (word1, word2) in product_derivations {
        for proximity in 1..=(8 - ngram_len2) {
            let cost = (proximity + ngram_len2 - 1) as u8;
-            // TODO: do the opposite way with a proximity penalty as well!
-            //       search for (word2, word1, proximity-1), I guess?
            if db_cache
                .get_word_pair_proximity_docids(index, txn, word1, word2, proximity as u8)?
                .is_some()
@ -154,6 +174,18 @@ pub fn visit_to_node<'transaction, 'from_data>(
                    .or_default()
                    .push(WordPair::Words { left: word1.to_owned(), right: word2.to_owned() });
            }
+            if proximity > 1
+                && db_cache
+                    .get_word_pair_proximity_docids(index, txn, word2, word1, proximity as u8 - 1)?
+                    .is_some()
+            {
+                cost_proximity_word_pairs
+                    .entry(cost)
+                    .or_default()
+                    .entry(proximity as u8 - 1)
+                    .or_default()
+                    .push(WordPair::Words { left: word2.to_owned(), right: word1.to_owned() });
+            }
        }
    }
    let mut new_edges = cost_proximity_word_pairs
--- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
@ -20,11 +20,8 @@ pub fn compute_docids<'transaction>(
            }
            WordPair::WordPrefix { left, right_prefix } => db_cache
                .get_word_prefix_pair_proximity_docids(index, txn, left, right_prefix, *proximity),
-            WordPair::WordsSwapped { left, right } => {
-                db_cache.get_word_pair_proximity_docids(index, txn, left, right, *proximity)
-            }
-            WordPair::WordPrefixSwapped { left, right_prefix } => db_cache
-                .get_prefix_word_pair_proximity_docids(index, txn, left, right_prefix, *proximity),
+            WordPair::WordPrefixSwapped { left_prefix, right } => db_cache
+                .get_prefix_word_pair_proximity_docids(index, txn, left_prefix, right, *proximity),
        }?;
        let bitmap =
            bytes.map(CboRoaringBitmapCodec::deserialize_from).transpose()?.unwrap_or_default();
--- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs
@ -18,9 +18,8 @@ use crate::{Index, Result};
 #[derive(Debug, Clone)]
 pub enum WordPair {
    Words { left: String, right: String },
-    WordsSwapped { left: String, right: String },
    WordPrefix { left: String, right_prefix: String },
-    WordPrefixSwapped { left: String, right_prefix: String },
+    WordPrefixSwapped { left_prefix: String, right: String },
 }

 #[derive(Clone)]
--- a/milli/src/search/new/ranking_rule_graph/resolve_paths.rs
+++ b/milli/src/search/new/ranking_rule_graph/resolve_paths.rs
@ -24,9 +24,13 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
        mut paths: Vec<Vec<u32>>,
    ) -> Result<RoaringBitmap> {
        paths.sort_unstable();
+        // let mut needs_filtering_empty_edges = false;
+        // let mut needs_filtering_empty_prefix = false;
+        // let mut needs_filtering_empty_couple_edges = false;
        let mut needs_filtering = false;
        let mut path_bitmaps = vec![];
        'path_loop: loop {
+            // TODO: distinguish between empty_edges, empty_prefix, and empty_couple_edges filtering
            if needs_filtering {
                for path in paths.iter_mut() {
                    if empty_paths_cache.path_is_empty(path) {
@ -61,11 +65,13 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
                            self.remove_edge(edge_index);
                            edge_docids_cache.cache.remove(&edge_index);
                            needs_filtering = true;
+                            // needs_filtering_empty_edges = true;
                            // 3. continue executing this function again on the remaining paths
                            continue 'path_loop;
                        } else {
                            path_bitmap &= edge_docids;
                            if path_bitmap.is_disjoint(universe) {
+                                // needs_filtering_empty_prefix = true;
                                needs_filtering = true;
                                empty_paths_cache.forbid_prefix(&visited_edges);
                                // if the intersection between this edge and any
@ -76,6 +82,7 @@ impl<G: RankingRuleGraphTrait> RankingRuleGraph<G> {
                                {
                                    let intersection = edge_docids & edge_docids2;
                                    if intersection.is_disjoint(universe) {
+                                        // needs_filtering_empty_couple_edges = true;
                                        empty_paths_cache
                                            .forbid_couple_edges(*edge_index2, edge_index);
                                    }
--- a/milli/src/search/new/ranking_rules.rs
+++ b/milli/src/search/new/ranking_rules.rs
@ -98,20 +98,6 @@ pub struct RankingRuleOutput<Q> {
    pub candidates: RoaringBitmap,
 }

-#[allow(unused)]
-pub fn get_start_universe<'transaction>(
-    index: &Index,
-    txn: &'transaction RoTxn,
-    db_cache: &mut DatabaseCache<'transaction>,
-    query_graph: &QueryGraph,
-    term_matching_strategy: TermsMatchingStrategy,
-    // filters: Filters,
-) -> Result<RoaringBitmap> {
-    // TODO: actually compute the universe from the query graph
-    let universe = index.documents_ids(txn).unwrap();
-    Ok(universe)
-}
-
 // TODO: can make it generic over the query type (either query graph or placeholder) fairly easily
 #[allow(clippy::too_many_arguments)]
 pub fn execute_search<'transaction>(