From 701d44bd91e7230b0f9f123a1bc4492410bac43d Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 15 Jun 2023 17:37:16 +0200 Subject: [PATCH] Store the scores for each bucket Remove optimization where ranking rules are not executed on buckets of a single document when the score needs to be computed --- milli/src/search/new/bucket_sort.rs | 63 +++++++++++++++++++++++------ milli/src/search/new/mod.rs | 19 +++++++-- 2 files changed, 67 insertions(+), 15 deletions(-) diff --git a/milli/src/search/new/bucket_sort.rs b/milli/src/search/new/bucket_sort.rs index 5144a0a28..168f8ba89 100644 --- a/milli/src/search/new/bucket_sort.rs +++ b/milli/src/search/new/bucket_sort.rs @@ -3,14 +3,18 @@ use roaring::RoaringBitmap; use super::logger::SearchLogger; use super::ranking_rules::{BoxRankingRule, RankingRuleQueryTrait}; use super::SearchContext; +use crate::score_details::{ScoreDetails, ScoringStrategy}; use crate::search::new::distinct::{apply_distinct_rule, distinct_single_docid, DistinctOutput}; use crate::Result; pub struct BucketSortOutput { pub docids: Vec, + pub scores: Vec>, pub all_candidates: RoaringBitmap, } +// TODO: would probably be good to regroup some of these inside of a struct? +#[allow(clippy::too_many_arguments)] pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( ctx: &mut SearchContext<'ctx>, mut ranking_rules: Vec>, @@ -18,6 +22,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( universe: &RoaringBitmap, from: usize, length: usize, + scoring_strategy: ScoringStrategy, logger: &mut dyn SearchLogger, ) -> Result { logger.initial_query(query); @@ -31,7 +36,11 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( }; if universe.len() < from as u64 { - return Ok(BucketSortOutput { docids: vec![], all_candidates: universe.clone() }); + return Ok(BucketSortOutput { + docids: vec![], + scores: vec![], + all_candidates: universe.clone(), + }); } if ranking_rules.is_empty() { if let Some(distinct_fid) = distinct_fid { @@ -49,22 +58,32 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( } let mut all_candidates = universe - excluded; all_candidates.extend(results.iter().copied()); - return Ok(BucketSortOutput { docids: results, all_candidates }); + return Ok(BucketSortOutput { + scores: vec![Default::default(); results.len()], + docids: results, + all_candidates, + }); } else { - let docids = universe.iter().skip(from).take(length).collect(); - return Ok(BucketSortOutput { docids, all_candidates: universe.clone() }); + let docids: Vec = universe.iter().skip(from).take(length).collect(); + return Ok(BucketSortOutput { + scores: vec![Default::default(); docids.len()], + docids, + all_candidates: universe.clone(), + }); }; } let ranking_rules_len = ranking_rules.len(); logger.start_iteration_ranking_rule(0, ranking_rules[0].as_ref(), query, universe); + ranking_rules[0].start_iteration(ctx, logger, universe, query)?; + let mut ranking_rule_scores: Vec = vec![]; + let mut ranking_rule_universes: Vec = vec![RoaringBitmap::default(); ranking_rules_len]; ranking_rule_universes[0] = universe.clone(); - let mut cur_ranking_rule_index = 0; /// Finish iterating over the current ranking rule, yielding @@ -89,11 +108,15 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( } else { cur_ranking_rule_index -= 1; } + if ranking_rule_scores.len() > cur_ranking_rule_index { + ranking_rule_scores.pop(); + } }; } let mut all_candidates = universe.clone(); let mut valid_docids = vec![]; + let mut valid_scores = vec![]; let mut cur_offset = 0usize; macro_rules! maybe_add_to_results { @@ -104,21 +127,26 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( length, logger, &mut valid_docids, + &mut valid_scores, &mut all_candidates, &mut ranking_rule_universes, &mut ranking_rules, cur_ranking_rule_index, &mut cur_offset, distinct_fid, + &ranking_rule_scores, $candidates, )?; }; } while valid_docids.len() < length { - // The universe for this bucket is zero or one element, so we don't need to sort - // anything, just extend the results and go back to the parent ranking rule. - if ranking_rule_universes[cur_ranking_rule_index].len() <= 1 { + // The universe for this bucket is zero, so we don't need to sort + // anything, just go back to the parent ranking rule. + if ranking_rule_universes[cur_ranking_rule_index].is_empty() + || (scoring_strategy == ScoringStrategy::Skip + && ranking_rule_universes[cur_ranking_rule_index].len() == 1) + { let bucket = std::mem::take(&mut ranking_rule_universes[cur_ranking_rule_index]); maybe_add_to_results!(bucket); back!(); @@ -130,6 +158,8 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( continue; }; + ranking_rule_scores.push(next_bucket.score); + logger.next_bucket_ranking_rule( cur_ranking_rule_index, ranking_rules[cur_ranking_rule_index].as_ref(), @@ -143,10 +173,11 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( ranking_rule_universes[cur_ranking_rule_index] -= &next_bucket.candidates; if cur_ranking_rule_index == ranking_rules_len - 1 - || next_bucket.candidates.len() <= 1 + || (scoring_strategy == ScoringStrategy::Skip && next_bucket.candidates.len() <= 1) || cur_offset + (next_bucket.candidates.len() as usize) < from { maybe_add_to_results!(next_bucket.candidates); + ranking_rule_scores.pop(); continue; } @@ -166,7 +197,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( )?; } - Ok(BucketSortOutput { docids: valid_docids, all_candidates }) + Ok(BucketSortOutput { docids: valid_docids, scores: valid_scores, all_candidates }) } /// Add the candidates to the results. Take `distinct`, `from`, `length`, and `cur_offset` @@ -179,14 +210,18 @@ fn maybe_add_to_results<'ctx, Q: RankingRuleQueryTrait>( logger: &mut dyn SearchLogger, valid_docids: &mut Vec, + valid_scores: &mut Vec>, all_candidates: &mut RoaringBitmap, ranking_rule_universes: &mut [RoaringBitmap], ranking_rules: &mut [BoxRankingRule<'ctx, Q>], + cur_ranking_rule_index: usize, cur_offset: &mut usize, + distinct_fid: Option, + ranking_rule_scores: &[ScoreDetails], candidates: RoaringBitmap, ) -> Result<()> { // First apply the distinct rule on the candidates, reducing the universes if necessary @@ -231,13 +266,17 @@ fn maybe_add_to_results<'ctx, Q: RankingRuleQueryTrait>( let candidates = candidates.iter().take(length - valid_docids.len()).copied().collect::>(); logger.add_to_results(&candidates); - valid_docids.extend(&candidates); + valid_docids.extend_from_slice(&candidates); + valid_scores + .extend(std::iter::repeat(ranking_rule_scores.to_owned()).take(candidates.len())); } } else { // if we have passed the offset already, add some of the documents (up to the limit) let candidates = candidates.iter().take(length - valid_docids.len()).collect::>(); logger.add_to_results(&candidates); - valid_docids.extend(&candidates); + valid_docids.extend_from_slice(&candidates); + valid_scores + .extend(std::iter::repeat(ranking_rule_scores.to_owned()).take(candidates.len())); } *cur_offset += candidates.len() as usize; diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index a28f42f35..9609384d5 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -44,6 +44,7 @@ use self::geo_sort::GeoSort; pub use self::geo_sort::Strategy as GeoSortStrategy; use self::graph_based_ranking_rule::Words; use self::interner::Interned; +use crate::score_details::{ScoreDetails, ScoringStrategy}; use crate::search::new::distinct::apply_distinct_rule; use crate::{AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError}; @@ -411,7 +412,16 @@ pub fn execute_search( universe = resolve_universe(ctx, &universe, &graph, terms_matching_strategy, query_graph_logger)?; - bucket_sort(ctx, ranking_rules, &graph, &universe, from, length, query_graph_logger)? + bucket_sort( + ctx, + ranking_rules, + &graph, + &universe, + from, + length, + ScoringStrategy::Skip, + query_graph_logger, + )? } else { let ranking_rules = get_ranking_rules_for_placeholder_search(ctx, sort_criteria, geo_strategy)?; @@ -422,17 +432,20 @@ pub fn execute_search( &universe, from, length, + ScoringStrategy::Skip, placeholder_search_logger, )? }; - let BucketSortOutput { docids, mut all_candidates } = bucket_sort_output; + let BucketSortOutput { docids, scores, mut all_candidates } = bucket_sort_output; + + let fields_ids_map = ctx.index.fields_ids_map(ctx.txn)?; // The candidates is the universe unless the exhaustive number of hits // is requested and a distinct attribute is set. if exhaustive_number_hits { if let Some(f) = ctx.index.distinct_field(ctx.txn)? { - if let Some(distinct_fid) = ctx.index.fields_ids_map(ctx.txn)?.id(f) { + if let Some(distinct_fid) = fields_ids_map.id(f) { all_candidates = apply_distinct_rule(ctx, distinct_fid, &all_candidates)?.remaining; } }