From f513cf930acd3cb403cb58dc5bffcc278a5a214f Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 5 Apr 2023 14:42:51 +0200 Subject: [PATCH] Exact attribute with state --- milli/src/search/new/exact_attribute.rs | 166 +++++++++++++++++------- 1 file changed, 122 insertions(+), 44 deletions(-) diff --git a/milli/src/search/new/exact_attribute.rs b/milli/src/search/new/exact_attribute.rs index bb6299e28..fa837272b 100644 --- a/milli/src/search/new/exact_attribute.rs +++ b/milli/src/search/new/exact_attribute.rs @@ -1,5 +1,5 @@ use heed::BytesDecode; -use roaring::MultiOps; +use roaring::{MultiOps, RoaringBitmap}; use super::query_graph::QueryGraph; use super::ranking_rules::{RankingRule, RankingRuleOutput}; @@ -7,19 +7,18 @@ use crate::search::new::query_graph::QueryNodeData; use crate::search::new::query_term::ExactTerm; use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger}; -/// FIXME: +/// A ranking rule that produces 3 disjoint buckets: /// -/// - A lot of work done in next_bucket that start_iteration could do. -/// - Consider calling the graph based rule directly from this one. -/// - currently we did exact term, don't forget about prefix -/// - some tests +/// 1. Documents from the universe whose value is exactly the query. +/// 2. Documents from the universe not in (1) whose value starts with the query. +/// 3. Documents from the universe not in (1) or (2). pub struct ExactAttribute { - query_graph: Option, + state: State, } impl ExactAttribute { pub fn new() -> Self { - Self { query_graph: None } + Self { state: Default::default() } } } @@ -30,23 +29,69 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { fn start_iteration( &mut self, - _ctx: &mut SearchContext<'ctx>, + ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger, - _universe: &roaring::RoaringBitmap, + universe: &roaring::RoaringBitmap, query: &QueryGraph, ) -> Result<()> { - self.query_graph = Some(query.clone()); + self.state = State::start_iteration(ctx, universe, query)?; + Ok(()) } fn next_bucket( &mut self, - ctx: &mut SearchContext<'ctx>, + _ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger, universe: &roaring::RoaringBitmap, ) -> Result>> { - // iterate on the nodes of the graph, retain LocatedQueryTermSubset - let query_graph = self.query_graph.as_ref().unwrap(); + let state = std::mem::take(&mut self.state); + let (state, output) = State::next(state, universe); + self.state = state; + + Ok(output) + } + + fn end_iteration( + &mut self, + _ctx: &mut SearchContext<'ctx>, + _logger: &mut dyn SearchLogger, + ) { + self.state = Default::default(); + } +} + +/// Inner state of the ranking rule. +#[derive(Default)] +enum State { + /// State between two iterations + #[default] + Uninitialized, + /// The next call to `next` will output the documents in the universe that have an attribute that is the exact query + ExactAttribute(QueryGraph, Vec), + /// The next call to `next` will output the documents in the universe that have an attribute that starts with the exact query, + /// but isn't the exact query. + AttributeStarts(QueryGraph, Vec), + /// The next calls to `next` will output the input universe. + Empty(QueryGraph), +} + +/// The candidates sorted by attributes +/// +/// Each of the bitmap in a single `FieldCandidates` struct applies to the same field. +struct FieldCandidates { + /// The candidates that start with all the words of the query in the field + start_with_exact: RoaringBitmap, + /// The candidates that have the same number of words as the query in the field + exact_word_count: RoaringBitmap, +} + +impl State { + fn start_iteration( + ctx: &mut SearchContext<'_>, + universe: &RoaringBitmap, + query_graph: &QueryGraph, + ) -> Result { let mut exact_term_position_ids: Vec<(ExactTerm, u16, u8)> = Vec::with_capacity(query_graph.nodes.len() as usize); for (_, node) in query_graph.nodes.iter() { @@ -55,11 +100,7 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { let exact_term = if let Some(exact_term) = term.term_subset.exact_term(ctx) { exact_term } else { - // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules - return Ok(Some(RankingRuleOutput { - query: query_graph.clone(), - candidates: universe.clone(), - })); + continue; }; exact_term_position_ids.push(( exact_term, @@ -73,14 +114,17 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { exact_term_position_ids.sort_by_key(|(_, _, id)| *id); // bail if there is a "hole" (missing word) in remaining query graph + if let Some((_, _, first_id)) = exact_term_position_ids.first() { + if *first_id != 0 { + return Ok(State::Empty(query_graph.clone())); + } + } else { + return Ok(State::Empty(query_graph.clone())); + } let mut previous_id = 0; for (_, _, id) in exact_term_position_ids.iter().copied() { if id < previous_id || id - previous_id > 1 { - // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules - return Ok(Some(RankingRuleOutput { - query: query_graph.clone(), - candidates: universe.clone(), - })); + return Ok(State::Empty(query_graph.clone())); } else { previous_id = id; } @@ -102,11 +146,7 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { .collect(); for (words, position) in &words_positions { if candidates.is_empty() { - // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules - return Ok(Some(RankingRuleOutput { - query: query_graph.clone(), - candidates: universe.clone(), - })); + return Ok(State::Empty(query_graph.clone())); } 'words: for (offset, word) in words.iter().enumerate() { @@ -116,8 +156,11 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { } else { continue 'words; }; + // Note: Since the position is stored bucketed in word_position_docids, for queries with a lot of + // longer phrases we'll be losing on precision here. + let bucketed_position = crate::bucketed_position(position + offset); let word_position_docids = CboRoaringBitmapCodec::bytes_decode( - ctx.get_db_word_position_docids(*word, position + offset)?.unwrap_or_default(), + ctx.get_db_word_position_docids(*word, bucketed_position)?.unwrap_or_default(), ) .unwrap_or_default(); candidates &= word_position_docids; @@ -127,16 +170,12 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { let candidates = candidates; if candidates.is_empty() { - // FIXME: Use `None` or some function indicating that we're passing down the bucket to our child rules - return Ok(Some(RankingRuleOutput { - query: query_graph.clone(), - candidates: universe.clone(), - })); + return Ok(State::Empty(query_graph.clone())); } let searchable_fields_ids = ctx.index.searchable_fields_ids(ctx.txn)?.unwrap_or_default(); - let mut candidates_per_attributes = Vec::with_capacity(searchable_fields_ids.len()); + let mut candidates_per_attribute = Vec::with_capacity(searchable_fields_ids.len()); // then check that there exists at least one attribute that has all of the terms for fid in searchable_fields_ids { @@ -156,20 +195,59 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for ExactAttribute { )?; intersection &= &candidates; if !intersection.is_empty() { - candidates_per_attributes.push(intersection); + let candidates_with_exact_word_count = ctx + .index + .field_id_word_count_docids + .get(ctx.txn, &(fid, exact_term_position_ids.len() as u8))? + .unwrap_or_default(); + candidates_per_attribute.push(FieldCandidates { + start_with_exact: intersection, + exact_word_count: candidates_with_exact_word_count, + }); } } // note we could have "false positives" where there both exist different attributes that collectively // have the terms in the correct order and a single attribute that have all the terms, but in the incorrect order. - let candidates = MultiOps::union(candidates_per_attributes.into_iter()); - Ok(Some(RankingRuleOutput { query: query_graph.clone(), candidates })) + Ok(State::ExactAttribute(query_graph.clone(), candidates_per_attribute)) } - fn end_iteration( - &mut self, - _ctx: &mut SearchContext<'ctx>, - _logger: &mut dyn SearchLogger, - ) { + fn next( + state: State, + universe: &RoaringBitmap, + ) -> (State, Option>) { + let (state, output) = match state { + State::Uninitialized => (state, None), + State::ExactAttribute(query_graph, candidates_per_attribute) => { + let mut candidates = MultiOps::union(candidates_per_attribute.iter().map( + |FieldCandidates { start_with_exact, exact_word_count }| { + start_with_exact & exact_word_count + }, + )); + candidates &= universe; + ( + State::AttributeStarts(query_graph.clone(), candidates_per_attribute), + Some(RankingRuleOutput { query: query_graph, candidates }), + ) + } + State::AttributeStarts(query_graph, candidates_per_attribute) => { + let mut candidates = MultiOps::union(candidates_per_attribute.into_iter().map( + |FieldCandidates { mut start_with_exact, exact_word_count }| { + start_with_exact -= exact_word_count; + start_with_exact + }, + )); + candidates &= universe; + ( + State::Empty(query_graph.clone()), + Some(RankingRuleOutput { query: query_graph, candidates }), + ) + } + State::Empty(query_graph) => ( + State::Empty(query_graph.clone()), + Some(RankingRuleOutput { query: query_graph, candidates: universe.clone() }), + ), + }; + (state, output) } }