From 8a13ed7e3fd0df9c163fdbdc7d7b56e6a5b0fbe4 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 4 Apr 2023 17:12:07 +0200 Subject: [PATCH] Add exactness ranking rules --- milli/src/search/new/exact_attribute.rs | 175 ++++++++++++++++++ .../search/new/graph_based_ranking_rule.rs | 10 +- milli/src/search/new/mod.rs | 12 +- .../new/ranking_rule_graph/exactness/mod.rs | 107 +++++++++++ .../src/search/new/ranking_rule_graph/mod.rs | 3 + 5 files changed, 301 insertions(+), 6 deletions(-) create mode 100644 milli/src/search/new/exact_attribute.rs create mode 100644 milli/src/search/new/ranking_rule_graph/exactness/mod.rs diff --git a/milli/src/search/new/exact_attribute.rs b/milli/src/search/new/exact_attribute.rs new file mode 100644 index 000000000..bb6299e28 --- /dev/null +++ b/milli/src/search/new/exact_attribute.rs @@ -0,0 +1,175 @@ +use heed::BytesDecode; +use roaring::MultiOps; + +use super::query_graph::QueryGraph; +use super::ranking_rules::{RankingRule, RankingRuleOutput}; +use crate::search::new::query_graph::QueryNodeData; +use crate::search::new::query_term::ExactTerm; +use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger}; + +/// FIXME: +/// +/// - A lot of work done in next_bucket that start_iteration could do. +/// - Consider calling the graph based rule directly from this one. +/// - currently we did exact term, don't forget about prefix +/// - some tests +pub struct ExactAttribute { + query_graph: Option, +} + +impl ExactAttribute { + pub fn new() -> Self { + Self { query_graph: None } + } +} + +impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { + fn id(&self) -> String { + "exact_attribute".to_owned() + } + + fn start_iteration( + &mut self, + _ctx: &mut SearchContext<'ctx>, + _logger: &mut dyn SearchLogger, + _universe: &roaring::RoaringBitmap, + query: &QueryGraph, + ) -> Result<()> { + self.query_graph = Some(query.clone()); + Ok(()) + } + + fn next_bucket( + &mut self, + ctx: &mut SearchContext<'ctx>, + _logger: &mut dyn SearchLogger, + universe: &roaring::RoaringBitmap, + ) -> Result>> { + // iterate on the nodes of the graph, retain LocatedQueryTermSubset + let query_graph = self.query_graph.as_ref().unwrap(); + let mut exact_term_position_ids: Vec<(ExactTerm, u16, u8)> = + Vec::with_capacity(query_graph.nodes.len() as usize); + for (_, node) in query_graph.nodes.iter() { + match &node.data { + QueryNodeData::Term(term) => { + let exact_term = if let Some(exact_term) = term.term_subset.exact_term(ctx) { + exact_term + } else { + // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules + return Ok(Some(RankingRuleOutput { + query: query_graph.clone(), + candidates: universe.clone(), + })); + }; + exact_term_position_ids.push(( + exact_term, + *term.positions.start(), + *term.term_ids.start(), + )) + } + QueryNodeData::Deleted | QueryNodeData::Start | QueryNodeData::End => continue, + } + } + + exact_term_position_ids.sort_by_key(|(_, _, id)| *id); + // bail if there is a "hole" (missing word) in remaining query graph + let mut previous_id = 0; + for (_, _, id) in exact_term_position_ids.iter().copied() { + if id < previous_id || id - previous_id > 1 { + // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules + return Ok(Some(RankingRuleOutput { + query: query_graph.clone(), + candidates: universe.clone(), + })); + } else { + previous_id = id; + } + } + + // sample query: "sunflower are pretty" + // sunflower at pos 0 in attr A + // are at pos 1 in attr B + // pretty at pos 2 in attr C + // We want to eliminate such document + + // first check that for each term, there exists some attribute that has this term at the correct position + //"word-position-docids"; + let mut candidates = universe.clone(); + let words_positions: Vec<(Vec<_>, _)> = exact_term_position_ids + .iter() + .copied() + .map(|(term, position, _)| (term.interned_words(ctx).collect(), position)) + .collect(); + for (words, position) in &words_positions { + if candidates.is_empty() { + // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules + return Ok(Some(RankingRuleOutput { + query: query_graph.clone(), + candidates: universe.clone(), + })); + } + + 'words: for (offset, word) in words.iter().enumerate() { + let offset = offset as u16; + let word = if let Some(word) = word { + word + } else { + continue 'words; + }; + let word_position_docids = CboRoaringBitmapCodec::bytes_decode( + ctx.get_db_word_position_docids(*word, position + offset)?.unwrap_or_default(), + ) + .unwrap_or_default(); + candidates &= word_position_docids; + } + } + + let candidates = candidates; + + if candidates.is_empty() { + // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules + return Ok(Some(RankingRuleOutput { + query: query_graph.clone(), + candidates: universe.clone(), + })); + } + + let searchable_fields_ids = ctx.index.searchable_fields_ids(ctx.txn)?.unwrap_or_default(); + + let mut candidates_per_attributes = Vec::with_capacity(searchable_fields_ids.len()); + + // then check that there exists at least one attribute that has all of the terms + for fid in searchable_fields_ids { + let mut intersection = MultiOps::intersection( + words_positions + .iter() + .flat_map(|(words, ..)| words.iter()) + // ignore stop words words in phrases + .flatten() + .map(|word| -> Result<_> { + Ok(ctx + .get_db_word_fid_docids(*word, fid)? + .map(CboRoaringBitmapCodec::bytes_decode) + .unwrap_or_default() + .unwrap_or_default()) + }), + )?; + intersection &= &candidates; + if !intersection.is_empty() { + candidates_per_attributes.push(intersection); + } + } + // note we could have "false positives" where there both exist different attributes that collectively + // have the terms in the correct order and a single attribute that have all the terms, but in the incorrect order. + + let candidates = MultiOps::union(candidates_per_attributes.into_iter()); + Ok(Some(RankingRuleOutput { query: query_graph.clone(), candidates })) + } + + fn end_iteration( + &mut self, + _ctx: &mut SearchContext<'ctx>, + _logger: &mut dyn SearchLogger, + ) { + } +} diff --git a/milli/src/search/new/graph_based_ranking_rule.rs b/milli/src/search/new/graph_based_ranking_rule.rs index b8c58c726..28b4ed1f4 100644 --- a/milli/src/search/new/graph_based_ranking_rule.rs +++ b/milli/src/search/new/graph_based_ranking_rule.rs @@ -44,8 +44,8 @@ use super::interner::{Interned, MappedInterner}; use super::logger::SearchLogger; use super::query_graph::QueryNode; use super::ranking_rule_graph::{ - ConditionDocIdsCache, DeadEndsCache, ProximityGraph, RankingRuleGraph, RankingRuleGraphTrait, - TypoGraph, + ConditionDocIdsCache, DeadEndsCache, ExactnessGraph, ProximityGraph, RankingRuleGraph, + RankingRuleGraphTrait, TypoGraph, }; use super::small_bitmap::SmallBitmap; use super::{QueryGraph, RankingRule, RankingRuleOutput, SearchContext}; @@ -65,6 +65,12 @@ impl GraphBasedRankingRule { Self::new_with_id("typo".to_owned(), terms_matching_strategy) } } +pub type Exactness = GraphBasedRankingRule; +impl GraphBasedRankingRule { + pub fn new() -> Self { + Self::new_with_id("exactness".to_owned(), None) + } +} /// A generic graph-based ranking rule pub struct GraphBasedRankingRule { diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 4d561d25b..779e589b3 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -9,8 +9,9 @@ mod query_term; mod ranking_rule_graph; mod ranking_rules; mod resolve_query_graph; -// TODO: documentation + comments mod small_bitmap; + +mod exact_attribute; // TODO: documentation + comments // implementation is currently an adaptation of the previous implementation to fit with the new model mod sort; @@ -33,6 +34,8 @@ use resolve_query_graph::PhraseDocIdsCache; use roaring::RoaringBitmap; use words::Words; +use self::exact_attribute::ExactAttribute; +use self::graph_based_ranking_rule::Exactness; use self::interner::Interner; use self::ranking_rules::{BoxRankingRule, RankingRule}; use self::resolve_query_graph::compute_query_graph_docids; @@ -150,7 +153,7 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( let mut proximity = false; let mut sort = false; let attribute = false; - let exactness = false; + let mut exactness = false; let mut asc = HashSet::new(); let mut desc = HashSet::new(); @@ -211,8 +214,9 @@ fn get_ranking_rules_for_query_graph_search<'ctx>( if exactness { continue; } - // todo!(); - // exactness = false; + ranking_rules.push(Box::new(ExactAttribute::new())); + ranking_rules.push(Box::new(Exactness::new())); + exactness = true; } crate::Criterion::Asc(field_name) => { if asc.contains(&field_name) { diff --git a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs new file mode 100644 index 000000000..a1e19a015 --- /dev/null +++ b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs @@ -0,0 +1,107 @@ +use roaring::RoaringBitmap; + +use super::{ComputedCondition, DeadEndsCache, RankingRuleGraph, RankingRuleGraphTrait}; +use crate::search::new::interner::{DedupInterner, Interned, MappedInterner}; +use crate::search::new::query_graph::{QueryGraph, QueryNode}; +use crate::search::new::query_term::{ExactTerm, LocatedQueryTermSubset}; +use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger}; + +/// - Exactness as first ranking rule: TermsMatchingStrategy? prefer a document that matches 1 word exactly and no other +/// word than a doc that matches 9 words non exactly but none exactly +/// - `TermsMatchingStrategy` as a word + exactness optimization: we could consider +/// +/// "naive vision" +/// condition from one node to another: +/// - word exactly present: cost 0 +/// - word typo/ngram/prefix/missing: cost 1, not remove from query graph, edge btwn the two nodes, return the universe without condition when resolving, destination query term is inside +/// +/// Three strategies: +/// 1. ExactAttribute: word position / word_fid_docid +/// 2. AttributeStart: +/// 3. AttributeContainsExact => implementable via `RankingRuleGraphTrait` + +#[derive(Clone, PartialEq, Eq, Hash)] +pub enum ExactnessCondition { + ExactInAttribute(LocatedQueryTermSubset), + Skip(LocatedQueryTermSubset), +} + +pub enum ExactnessGraph {} + +fn compute_docids( + ctx: &mut SearchContext, + dest_node: &LocatedQueryTermSubset, + universe: &RoaringBitmap, +) -> Result { + let exact_term = if let Some(exact_term) = dest_node.term_subset.exact_term(ctx) { + exact_term + } else { + return Ok(Default::default()); + }; + let mut candidates = match exact_term { + ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(phrase)?.clone(), + ExactTerm::Word(word) => { + if let Some(word_candidates) = ctx.get_db_word_docids(word)? { + CboRoaringBitmapCodec::deserialize_from(word_candidates)? + } else { + return Ok(Default::default()); + } + } + }; + // TODO: synonyms? + candidates &= universe; + Ok(candidates) +} + +impl RankingRuleGraphTrait for ExactnessGraph { + type Condition = ExactnessCondition; + + fn resolve_condition( + ctx: &mut SearchContext, + condition: &Self::Condition, + universe: &RoaringBitmap, + ) -> Result { + let (docids, dest_node) = match condition { + ExactnessCondition::ExactInAttribute(dest_node) => { + (compute_docids(ctx, dest_node, universe)?, dest_node) + } + ExactnessCondition::Skip(dest_node) => (universe.clone(), dest_node), + }; + Ok(ComputedCondition { + docids, + universe_len: universe.len(), + start_term_subset: None, + end_term_subset: dest_node.clone(), + }) + } + + fn build_edges( + _ctx: &mut SearchContext, + conditions_interner: &mut DedupInterner, + _source_node: Option<&LocatedQueryTermSubset>, + dest_node: &LocatedQueryTermSubset, + ) -> Result)>> { + let exact_condition = ExactnessCondition::ExactInAttribute(dest_node.clone()); + let exact_condition = conditions_interner.insert(exact_condition); + + let skip_condition = ExactnessCondition::Skip(dest_node.clone()); + let skip_condition = conditions_interner.insert(skip_condition); + Ok(vec![(0, exact_condition), (1, skip_condition)]) + } + + fn log_state( + graph: &RankingRuleGraph, + paths: &[Vec>], + dead_ends_cache: &DeadEndsCache, + universe: &RoaringBitmap, + costs: &MappedInterner>, + cost: u64, + logger: &mut dyn SearchLogger, + ) { + todo!() + } + + fn label_for_condition(ctx: &mut SearchContext, condition: &Self::Condition) -> Result { + todo!() + } +} diff --git a/milli/src/search/new/ranking_rule_graph/mod.rs b/milli/src/search/new/ranking_rule_graph/mod.rs index 7c40008c8..936c3e942 100644 --- a/milli/src/search/new/ranking_rule_graph/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/mod.rs @@ -10,6 +10,8 @@ mod cheapest_paths; mod condition_docids_cache; mod dead_ends_cache; +/// Implementation of the `exactness` ranking rule +mod exactness; /// Implementation of the `proximity` ranking rule mod proximity; /// Implementation of the `typo` ranking rule @@ -20,6 +22,7 @@ use std::hash::Hash; pub use cheapest_paths::PathVisitor; pub use condition_docids_cache::ConditionDocIdsCache; pub use dead_ends_cache::DeadEndsCache; +pub use exactness::{ExactnessCondition, ExactnessGraph}; pub use proximity::{ProximityCondition, ProximityGraph}; use roaring::RoaringBitmap; pub use typo::{TypoCondition, TypoGraph};