Store the scores for each bucket

Remove optimization where ranking rules are not executed on buckets of a single document
when the score needs to be computed
This commit is contained in:
Louis Dureuil 2023-06-15 17:37:16 +02:00
parent c621a250a7
commit 701d44bd91
No known key found for this signature in database
2 changed files with 67 additions and 15 deletions

View File

@ -3,14 +3,18 @@ use roaring::RoaringBitmap;
use super::logger::SearchLogger; use super::logger::SearchLogger;
use super::ranking_rules::{BoxRankingRule, RankingRuleQueryTrait}; use super::ranking_rules::{BoxRankingRule, RankingRuleQueryTrait};
use super::SearchContext; use super::SearchContext;
use crate::score_details::{ScoreDetails, ScoringStrategy};
use crate::search::new::distinct::{apply_distinct_rule, distinct_single_docid, DistinctOutput}; use crate::search::new::distinct::{apply_distinct_rule, distinct_single_docid, DistinctOutput};
use crate::Result; use crate::Result;
pub struct BucketSortOutput { pub struct BucketSortOutput {
pub docids: Vec<u32>, pub docids: Vec<u32>,
pub scores: Vec<Vec<ScoreDetails>>,
pub all_candidates: RoaringBitmap, pub all_candidates: RoaringBitmap,
} }
// TODO: would probably be good to regroup some of these inside of a struct?
#[allow(clippy::too_many_arguments)]
pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>( pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
ctx: &mut SearchContext<'ctx>, ctx: &mut SearchContext<'ctx>,
mut ranking_rules: Vec<BoxRankingRule<'ctx, Q>>, mut ranking_rules: Vec<BoxRankingRule<'ctx, Q>>,
@ -18,6 +22,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
universe: &RoaringBitmap, universe: &RoaringBitmap,
from: usize, from: usize,
length: usize, length: usize,
scoring_strategy: ScoringStrategy,
logger: &mut dyn SearchLogger<Q>, logger: &mut dyn SearchLogger<Q>,
) -> Result<BucketSortOutput> { ) -> Result<BucketSortOutput> {
logger.initial_query(query); logger.initial_query(query);
@ -31,7 +36,11 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
}; };
if universe.len() < from as u64 { if universe.len() < from as u64 {
return Ok(BucketSortOutput { docids: vec![], all_candidates: universe.clone() }); return Ok(BucketSortOutput {
docids: vec![],
scores: vec![],
all_candidates: universe.clone(),
});
} }
if ranking_rules.is_empty() { if ranking_rules.is_empty() {
if let Some(distinct_fid) = distinct_fid { if let Some(distinct_fid) = distinct_fid {
@ -49,22 +58,32 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
} }
let mut all_candidates = universe - excluded; let mut all_candidates = universe - excluded;
all_candidates.extend(results.iter().copied()); all_candidates.extend(results.iter().copied());
return Ok(BucketSortOutput { docids: results, all_candidates }); return Ok(BucketSortOutput {
scores: vec![Default::default(); results.len()],
docids: results,
all_candidates,
});
} else { } else {
let docids = universe.iter().skip(from).take(length).collect(); let docids: Vec<u32> = universe.iter().skip(from).take(length).collect();
return Ok(BucketSortOutput { docids, all_candidates: universe.clone() }); return Ok(BucketSortOutput {
scores: vec![Default::default(); docids.len()],
docids,
all_candidates: universe.clone(),
});
}; };
} }
let ranking_rules_len = ranking_rules.len(); let ranking_rules_len = ranking_rules.len();
logger.start_iteration_ranking_rule(0, ranking_rules[0].as_ref(), query, universe); logger.start_iteration_ranking_rule(0, ranking_rules[0].as_ref(), query, universe);
ranking_rules[0].start_iteration(ctx, logger, universe, query)?; ranking_rules[0].start_iteration(ctx, logger, universe, query)?;
let mut ranking_rule_scores: Vec<ScoreDetails> = vec![];
let mut ranking_rule_universes: Vec<RoaringBitmap> = let mut ranking_rule_universes: Vec<RoaringBitmap> =
vec![RoaringBitmap::default(); ranking_rules_len]; vec![RoaringBitmap::default(); ranking_rules_len];
ranking_rule_universes[0] = universe.clone(); ranking_rule_universes[0] = universe.clone();
let mut cur_ranking_rule_index = 0; let mut cur_ranking_rule_index = 0;
/// Finish iterating over the current ranking rule, yielding /// Finish iterating over the current ranking rule, yielding
@ -89,11 +108,15 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
} else { } else {
cur_ranking_rule_index -= 1; cur_ranking_rule_index -= 1;
} }
if ranking_rule_scores.len() > cur_ranking_rule_index {
ranking_rule_scores.pop();
}
}; };
} }
let mut all_candidates = universe.clone(); let mut all_candidates = universe.clone();
let mut valid_docids = vec![]; let mut valid_docids = vec![];
let mut valid_scores = vec![];
let mut cur_offset = 0usize; let mut cur_offset = 0usize;
macro_rules! maybe_add_to_results { macro_rules! maybe_add_to_results {
@ -104,21 +127,26 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
length, length,
logger, logger,
&mut valid_docids, &mut valid_docids,
&mut valid_scores,
&mut all_candidates, &mut all_candidates,
&mut ranking_rule_universes, &mut ranking_rule_universes,
&mut ranking_rules, &mut ranking_rules,
cur_ranking_rule_index, cur_ranking_rule_index,
&mut cur_offset, &mut cur_offset,
distinct_fid, distinct_fid,
&ranking_rule_scores,
$candidates, $candidates,
)?; )?;
}; };
} }
while valid_docids.len() < length { while valid_docids.len() < length {
// The universe for this bucket is zero or one element, so we don't need to sort // The universe for this bucket is zero, so we don't need to sort
// anything, just extend the results and go back to the parent ranking rule. // anything, just go back to the parent ranking rule.
if ranking_rule_universes[cur_ranking_rule_index].len() <= 1 { if ranking_rule_universes[cur_ranking_rule_index].is_empty()
|| (scoring_strategy == ScoringStrategy::Skip
&& ranking_rule_universes[cur_ranking_rule_index].len() == 1)
{
let bucket = std::mem::take(&mut ranking_rule_universes[cur_ranking_rule_index]); let bucket = std::mem::take(&mut ranking_rule_universes[cur_ranking_rule_index]);
maybe_add_to_results!(bucket); maybe_add_to_results!(bucket);
back!(); back!();
@ -130,6 +158,8 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
continue; continue;
}; };
ranking_rule_scores.push(next_bucket.score);
logger.next_bucket_ranking_rule( logger.next_bucket_ranking_rule(
cur_ranking_rule_index, cur_ranking_rule_index,
ranking_rules[cur_ranking_rule_index].as_ref(), ranking_rules[cur_ranking_rule_index].as_ref(),
@ -143,10 +173,11 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
ranking_rule_universes[cur_ranking_rule_index] -= &next_bucket.candidates; ranking_rule_universes[cur_ranking_rule_index] -= &next_bucket.candidates;
if cur_ranking_rule_index == ranking_rules_len - 1 if cur_ranking_rule_index == ranking_rules_len - 1
|| next_bucket.candidates.len() <= 1 || (scoring_strategy == ScoringStrategy::Skip && next_bucket.candidates.len() <= 1)
|| cur_offset + (next_bucket.candidates.len() as usize) < from || cur_offset + (next_bucket.candidates.len() as usize) < from
{ {
maybe_add_to_results!(next_bucket.candidates); maybe_add_to_results!(next_bucket.candidates);
ranking_rule_scores.pop();
continue; continue;
} }
@ -166,7 +197,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
)?; )?;
} }
Ok(BucketSortOutput { docids: valid_docids, all_candidates }) Ok(BucketSortOutput { docids: valid_docids, scores: valid_scores, all_candidates })
} }
/// Add the candidates to the results. Take `distinct`, `from`, `length`, and `cur_offset` /// Add the candidates to the results. Take `distinct`, `from`, `length`, and `cur_offset`
@ -179,14 +210,18 @@ fn maybe_add_to_results<'ctx, Q: RankingRuleQueryTrait>(
logger: &mut dyn SearchLogger<Q>, logger: &mut dyn SearchLogger<Q>,
valid_docids: &mut Vec<u32>, valid_docids: &mut Vec<u32>,
valid_scores: &mut Vec<Vec<ScoreDetails>>,
all_candidates: &mut RoaringBitmap, all_candidates: &mut RoaringBitmap,
ranking_rule_universes: &mut [RoaringBitmap], ranking_rule_universes: &mut [RoaringBitmap],
ranking_rules: &mut [BoxRankingRule<'ctx, Q>], ranking_rules: &mut [BoxRankingRule<'ctx, Q>],
cur_ranking_rule_index: usize, cur_ranking_rule_index: usize,
cur_offset: &mut usize, cur_offset: &mut usize,
distinct_fid: Option<u16>, distinct_fid: Option<u16>,
ranking_rule_scores: &[ScoreDetails],
candidates: RoaringBitmap, candidates: RoaringBitmap,
) -> Result<()> { ) -> Result<()> {
// First apply the distinct rule on the candidates, reducing the universes if necessary // First apply the distinct rule on the candidates, reducing the universes if necessary
@ -231,13 +266,17 @@ fn maybe_add_to_results<'ctx, Q: RankingRuleQueryTrait>(
let candidates = let candidates =
candidates.iter().take(length - valid_docids.len()).copied().collect::<Vec<_>>(); candidates.iter().take(length - valid_docids.len()).copied().collect::<Vec<_>>();
logger.add_to_results(&candidates); logger.add_to_results(&candidates);
valid_docids.extend(&candidates); valid_docids.extend_from_slice(&candidates);
valid_scores
.extend(std::iter::repeat(ranking_rule_scores.to_owned()).take(candidates.len()));
} }
} else { } else {
// if we have passed the offset already, add some of the documents (up to the limit) // if we have passed the offset already, add some of the documents (up to the limit)
let candidates = candidates.iter().take(length - valid_docids.len()).collect::<Vec<u32>>(); let candidates = candidates.iter().take(length - valid_docids.len()).collect::<Vec<u32>>();
logger.add_to_results(&candidates); logger.add_to_results(&candidates);
valid_docids.extend(&candidates); valid_docids.extend_from_slice(&candidates);
valid_scores
.extend(std::iter::repeat(ranking_rule_scores.to_owned()).take(candidates.len()));
} }
*cur_offset += candidates.len() as usize; *cur_offset += candidates.len() as usize;

View File

@ -44,6 +44,7 @@ use self::geo_sort::GeoSort;
pub use self::geo_sort::Strategy as GeoSortStrategy; pub use self::geo_sort::Strategy as GeoSortStrategy;
use self::graph_based_ranking_rule::Words; use self::graph_based_ranking_rule::Words;
use self::interner::Interned; use self::interner::Interned;
use crate::score_details::{ScoreDetails, ScoringStrategy};
use crate::search::new::distinct::apply_distinct_rule; use crate::search::new::distinct::apply_distinct_rule;
use crate::{AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError}; use crate::{AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError};
@ -411,7 +412,16 @@ pub fn execute_search(
universe = universe =
resolve_universe(ctx, &universe, &graph, terms_matching_strategy, query_graph_logger)?; resolve_universe(ctx, &universe, &graph, terms_matching_strategy, query_graph_logger)?;
bucket_sort(ctx, ranking_rules, &graph, &universe, from, length, query_graph_logger)? bucket_sort(
ctx,
ranking_rules,
&graph,
&universe,
from,
length,
ScoringStrategy::Skip,
query_graph_logger,
)?
} else { } else {
let ranking_rules = let ranking_rules =
get_ranking_rules_for_placeholder_search(ctx, sort_criteria, geo_strategy)?; get_ranking_rules_for_placeholder_search(ctx, sort_criteria, geo_strategy)?;
@ -422,17 +432,20 @@ pub fn execute_search(
&universe, &universe,
from, from,
length, length,
ScoringStrategy::Skip,
placeholder_search_logger, placeholder_search_logger,
)? )?
}; };
let BucketSortOutput { docids, mut all_candidates } = bucket_sort_output; let BucketSortOutput { docids, scores, mut all_candidates } = bucket_sort_output;
let fields_ids_map = ctx.index.fields_ids_map(ctx.txn)?;
// The candidates is the universe unless the exhaustive number of hits // The candidates is the universe unless the exhaustive number of hits
// is requested and a distinct attribute is set. // is requested and a distinct attribute is set.
if exhaustive_number_hits { if exhaustive_number_hits {
if let Some(f) = ctx.index.distinct_field(ctx.txn)? { if let Some(f) = ctx.index.distinct_field(ctx.txn)? {
if let Some(distinct_fid) = ctx.index.fields_ids_map(ctx.txn)?.id(f) { if let Some(distinct_fid) = fields_ids_map.id(f) {
all_candidates = apply_distinct_rule(ctx, distinct_fid, &all_candidates)?.remaining; all_candidates = apply_distinct_rule(ctx, distinct_fid, &all_candidates)?.remaining;
} }
} }