From d0e9d65025410fe14bff4c283a416a8da1d89a97 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Fri, 7 Apr 2023 11:09:01 +0200
Subject: [PATCH] Fix distinct attribute bugs

---
 milli/src/search/new/bucket_sort.rs    | 172 ++++++++++++++++---------
 milli/src/search/new/distinct.rs       |   9 +-
 milli/src/search/new/mod.rs            |  14 +-
 milli/src/search/new/tests/distinct.rs |   2 +-
 4 files changed, 121 insertions(+), 76 deletions(-)

diff --git a/milli/src/search/new/bucket_sort.rs b/milli/src/search/new/bucket_sort.rs
index 712825c31..6413ff811 100644
--- a/milli/src/search/new/bucket_sort.rs
+++ b/milli/src/search/new/bucket_sort.rs
@@ -6,6 +6,11 @@ use super::SearchContext;
 use crate::search::new::distinct::{apply_distinct_rule, distinct_single_docid, DistinctOutput};
 use crate::Result;
 
+pub struct BucketSortOutput {
+    pub docids: Vec<u32>,
+    pub all_candidates: RoaringBitmap,
+}
+
 pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
     ctx: &mut SearchContext<'ctx>,
     mut ranking_rules: Vec<BoxRankingRule<'ctx, Q>>,
@@ -14,7 +19,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
     from: usize,
     length: usize,
     logger: &mut dyn SearchLogger<Q>,
-) -> Result<Vec<u32>> {
+) -> Result<BucketSortOutput> {
     logger.initial_query(query);
     logger.ranking_rules(&ranking_rules);
     logger.initial_universe(universe);
@@ -26,7 +31,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
     };
 
     if universe.len() < from as u64 {
-        return Ok(vec![]);
+        return Ok(BucketSortOutput { docids: vec![], all_candidates: universe.clone() });
     }
     if ranking_rules.is_empty() {
         if let Some(distinct_fid) = distinct_fid {
@@ -42,9 +47,12 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
                 distinct_single_docid(ctx.index, ctx.txn, distinct_fid, docid, &mut excluded)?;
                 results.push(docid);
             }
-            return Ok(results);
+            let mut all_candidates = universe - excluded;
+            all_candidates.extend(results.iter().copied());
+            return Ok(BucketSortOutput { docids: results, all_candidates });
         } else {
-            return Ok(universe.iter().skip(from).take(length).collect());
+            let docids = universe.iter().skip(from).take(length).collect();
+            return Ok(BucketSortOutput { docids, all_candidates: universe.clone() });
         };
     }
 
@@ -61,7 +69,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
 
     /// Finish iterating over the current ranking rule, yielding
     /// control to the parent (or finishing the search if not possible).
-    /// Update the candidates accordingly and inform the logger.
+    /// Update the universes accordingly and inform the logger.
     macro_rules! back {
         () => {
             assert!(ranking_rule_universes[cur_ranking_rule_index].is_empty());
@@ -80,72 +88,35 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
         };
     }
 
-    let mut results = vec![];
+    let mut all_candidates = RoaringBitmap::new();
+    let mut valid_docids = vec![];
     let mut cur_offset = 0usize;
 
-    /// Add the candidates to the results. Take `distinct`, `from`, `length`, and `cur_offset`
-    /// into account and inform the logger.
     macro_rules! maybe_add_to_results {
         ($candidates:expr) => {
-            // First apply the distinct rule on the candidates, reducing the universes if necessary
-            let candidates = if let Some(distinct_fid) = distinct_fid {
-                let DistinctOutput { remaining, excluded } =
-                    apply_distinct_rule(ctx, distinct_fid, $candidates)?;
-                for universe in ranking_rule_universes.iter_mut() {
-                    *universe -= &excluded;
-                }
-                remaining
-            } else {
-                $candidates.clone()
-            };
-            let len = candidates.len();
-            // if the candidates are empty, there is nothing to do;
-            if !candidates.is_empty() {
-                // if we still haven't reached the first document to return
-                if cur_offset < from {
-                    // and if no document from this bucket can be returned
-                    if cur_offset + (candidates.len() as usize) < from {
-                        // then just skip the bucket
-                        logger.skip_bucket_ranking_rule(
-                            cur_ranking_rule_index,
-                            ranking_rules[cur_ranking_rule_index].as_ref(),
-                            &candidates,
-                        );
-                    } else {
-                        // otherwise, skip some of the documents and add some of the rest, in order of ids
-                        let all_candidates = candidates.iter().collect::<Vec<_>>();
-                        let (skipped_candidates, candidates) =
-                            all_candidates.split_at(from - cur_offset);
-                        logger.skip_bucket_ranking_rule(
-                            cur_ranking_rule_index,
-                            ranking_rules[cur_ranking_rule_index].as_ref(),
-                            &skipped_candidates.into_iter().collect(),
-                        );
-                        let candidates = candidates
-                            .iter()
-                            .take(length - results.len())
-                            .copied()
-                            .collect::<Vec<_>>();
-                        logger.add_to_results(&candidates);
-                        results.extend(&candidates);
-                    }
-                } else {
-                    // if we have passed the offset already, add some of the documents (up to the limit)
-                    let candidates =
-                        candidates.iter().take(length - results.len()).collect::<Vec<u32>>();
-                    logger.add_to_results(&candidates);
-                    results.extend(&candidates);
-                }
-            }
-            cur_offset += len as usize;
+            maybe_add_to_results(
+                ctx,
+                from,
+                length,
+                logger,
+                &mut valid_docids,
+                &mut all_candidates,
+                &mut ranking_rule_universes,
+                &mut ranking_rules,
+                cur_ranking_rule_index,
+                &mut cur_offset,
+                distinct_fid,
+                $candidates,
+            )?;
         };
     }
 
-    while results.len() < length {
+    while valid_docids.len() < length {
         // The universe for this bucket is zero or one element, so we don't need to sort
         // anything, just extend the results and go back to the parent ranking rule.
         if ranking_rule_universes[cur_ranking_rule_index].len() <= 1 {
-            maybe_add_to_results!(&ranking_rule_universes[cur_ranking_rule_index]);
-            ranking_rule_universes[cur_ranking_rule_index].clear();
+            let bucket = std::mem::take(&mut ranking_rule_universes[cur_ranking_rule_index]);
+            maybe_add_to_results!(bucket);
             back!();
             continue;
         }
@@ -171,7 +142,7 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
             || next_bucket.candidates.len() <= 1
             || cur_offset + (next_bucket.candidates.len() as usize) < from
         {
-            maybe_add_to_results!(&next_bucket.candidates);
+            maybe_add_to_results!(next_bucket.candidates);
             continue;
         }
 
@@ -191,5 +162,80 @@ pub fn bucket_sort<'ctx, Q: RankingRuleQueryTrait>(
         )?;
     }
 
-    Ok(results)
+    all_candidates |= &ranking_rule_universes[0];
+
+    Ok(BucketSortOutput { docids: valid_docids, all_candidates })
+}
+
+/// Add the candidates to the results. Take `distinct`, `from`, `length`, and `cur_offset`
+/// into account and inform the logger.
+#[allow(clippy::too_many_arguments)]
+fn maybe_add_to_results<'ctx, Q: RankingRuleQueryTrait>(
+    ctx: &mut SearchContext<'ctx>,
+    from: usize,
+    length: usize,
+    logger: &mut dyn SearchLogger<Q>,
+
+    valid_docids: &mut Vec<u32>,
+    all_candidates: &mut RoaringBitmap,
+
+    ranking_rule_universes: &mut [RoaringBitmap],
+    ranking_rules: &mut [BoxRankingRule<'ctx, Q>],
+    cur_ranking_rule_index: usize,
+
+    cur_offset: &mut usize,
+    distinct_fid: Option<u16>,
+    candidates: RoaringBitmap,
+) -> Result<()> {
+    // First apply the distinct rule on the candidates, reducing the universes if necessary
+    let candidates = if let Some(distinct_fid) = distinct_fid {
+        let DistinctOutput { remaining, excluded } =
+            apply_distinct_rule(ctx, distinct_fid, &candidates)?;
+        for universe in ranking_rule_universes.iter_mut() {
+            *universe -= &excluded;
+        }
+        remaining
+    } else {
+        candidates.clone()
+    };
+    *all_candidates |= &candidates;
+    // if the candidates are empty, there is nothing to do;
+    if candidates.is_empty() {
+        return Ok(());
+    }
+
+    // if we still haven't reached the first document to return
+    if *cur_offset < from {
+        // and if no document from this bucket can be returned
+        if *cur_offset + (candidates.len() as usize) < from {
+            // then just skip the bucket
+            logger.skip_bucket_ranking_rule(
+                cur_ranking_rule_index,
+                ranking_rules[cur_ranking_rule_index].as_ref(),
+                &candidates,
+            );
+        } else {
+            // otherwise, skip some of the documents and add some of the rest, in order of ids
+            let all_candidates = candidates.iter().collect::<Vec<_>>();
+            let (skipped_candidates, candidates) = all_candidates.split_at(from - *cur_offset);
+
+            logger.skip_bucket_ranking_rule(
+                cur_ranking_rule_index,
+                ranking_rules[cur_ranking_rule_index].as_ref(),
+                &skipped_candidates.iter().collect(),
+            );
+            let candidates =
+                candidates.iter().take(length - valid_docids.len()).copied().collect::<Vec<_>>();
+            logger.add_to_results(&candidates);
+            valid_docids.extend(&candidates);
+        }
+    } else {
+        // if we have passed the offset already, add some of the documents (up to the limit)
+        let candidates = candidates.iter().take(length - valid_docids.len()).collect::<Vec<u32>>();
+        logger.add_to_results(&candidates);
+        valid_docids.extend(&candidates);
+    }
+
+    *cur_offset += candidates.len() as usize;
+    Ok(())
 }
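A note on the `all_candidates` bookkeeping in the no-ranking-rules branch above: `distinct_single_docid` adds every document sharing a facet value with the kept document to `excluded`, including the kept document itself, which is why the returned docids must be re-added after computing `universe - excluded`. A minimal standalone sketch of that set arithmetic (roaring bitmaps with made-up docids; illustration only, not milli code):

    use roaring::RoaringBitmap;

    fn main() {
        // Universe of matching documents; suppose 1 and 2 share a distinct
        // value, as do 3 and 4, and 1 and 3 are the kept representatives.
        let universe: RoaringBitmap = [1u32, 2, 3, 4].into_iter().collect();
        // distinct_single_docid excludes every docid of each visited facet
        // value, so the kept representatives 1 and 3 end up excluded too.
        let excluded: RoaringBitmap = [1u32, 2, 3, 4].into_iter().collect();
        let results = vec![1u32, 3];

        // The patch's computation: subtract, then re-add the kept docids.
        let mut all_candidates = &universe - &excluded;
        all_candidates.extend(results.iter().copied());
        assert_eq!(all_candidates.iter().collect::<Vec<u32>>(), vec![1, 3]);
    }

The `std::mem::take` in the single-element fast path serves the same bookkeeping: it hands the bucket to `maybe_add_to_results` by value while leaving the universe empty, so the assertion in `back!` still holds.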
diff --git a/milli/src/search/new/distinct.rs b/milli/src/search/new/distinct.rs
index 7b77adf49..fbb7550a9 100644
--- a/milli/src/search/new/distinct.rs
+++ b/milli/src/search/new/distinct.rs
@@ -61,12 +61,9 @@ pub fn distinct_single_docid(
     }
     for item in facet_number_values(docid, field_id, index, txn)? {
         let ((_, _, facet_value), _) = item?;
-        if let Some(facet_docids) = facet_value_docids(
-            index.facet_id_string_docids.remap_types(),
-            txn,
-            field_id,
-            facet_value,
-        )? {
+        if let Some(facet_docids) =
+            facet_value_docids(index.facet_id_f64_docids.remap_types(), txn, field_id, facet_value)?
+        {
             *excluded |= facet_docids;
         }
     }
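This one-hunk fix is the core of the distinct bug: values produced by `facet_number_values` were looked up in `facet_id_string_docids`, the string-keyed facet database, so documents sharing a numeric distinct value were not excluded from later buckets. A standalone illustration of the key-space mismatch, with plain `HashMap`s standing in for the two heed databases (hypothetical field ids, values, and docids; not milli's API):

    use std::collections::HashMap;

    fn main() {
        // String-keyed and f64-keyed facet databases (f64 stored as bits,
        // since f64 itself is not hashable).
        let mut string_docids: HashMap<(u16, String), Vec<u32>> = HashMap::new();
        let mut f64_docids: HashMap<(u16, u64), Vec<u32>> = HashMap::new();
        string_docids.insert((0, "blue".to_string()), vec![1, 2]);
        f64_docids.insert((0, 4.0f64.to_bits()), vec![3, 4]);

        // A numeric facet value resolved against the wrong (string) database
        // finds nothing, so nothing gets excluded: the pre-patch behavior.
        assert!(string_docids.get(&(0, 4.0f64.to_string())).is_none());
        // Resolved against the f64 database, it finds the docids to exclude.
        assert_eq!(f64_docids.get(&(0, 4.0f64.to_bits())), Some(&vec![3, 4]));
    }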
diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs
index b307b2434..3beda526b 100644
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@@ -24,6 +24,7 @@ mod tests;
 
 use std::collections::HashSet;
 
+use bucket_sort::bucket_sort;
 use charabia::TokenizerBuilder;
 use db_cache::DatabaseCache;
 use graph_based_ranking_rule::{Proximity, Typo};
@@ -34,11 +35,11 @@ pub use logger::{DefaultSearchLogger, SearchLogger};
 use query_graph::{QueryGraph, QueryNode};
 use query_term::{located_query_terms_from_string, Phrase, QueryTerm};
 use ranking_rules::{PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait};
-use bucket_sort::bucket_sort;
 use resolve_query_graph::PhraseDocIdsCache;
 use roaring::RoaringBitmap;
 use words::Words;
 
+use self::bucket_sort::BucketSortOutput;
 use self::exact_attribute::ExactAttribute;
 use self::graph_based_ranking_rule::Exactness;
 use self::interner::Interner;
@@ -297,7 +298,7 @@ pub fn execute_search(
         ctx.index.documents_ids(ctx.txn)?
     };
 
-    let documents_ids = if let Some(query) = query {
+    let bucket_sort_output = if let Some(query) = query {
         // We make sure that the analyzer is aware of the stop words
         // this ensures that the query builder is able to properly remove them.
         let mut tokbuilder = TokenizerBuilder::new();
@@ -344,13 +345,14 @@ pub fn execute_search(
         )?
     };
 
+    let BucketSortOutput { docids, mut all_candidates } = bucket_sort_output;
+
     // The candidates is the universe unless the exhaustive number of hits
     // is requested and a distinct attribute is set.
-    let mut candidates = universe;
     if exhaustive_number_hits {
         if let Some(f) = ctx.index.distinct_field(ctx.txn)? {
             if let Some(distinct_fid) = ctx.index.fields_ids_map(ctx.txn)?.id(f) {
-                candidates = apply_distinct_rule(ctx, distinct_fid, &candidates)?.remaining;
+                all_candidates = apply_distinct_rule(ctx, distinct_fid, &all_candidates)?.remaining;
             }
         }
     }
@@ -358,8 +360,8 @@ pub fn execute_search(
 
     Ok(SearchResult {
         // TODO: correct matching words
        matching_words: MatchingWords::default(),
-        candidates,
-        documents_ids,
+        candidates: all_candidates,
+        documents_ids: docids,
     })
 }
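After this change, `SearchResult::candidates` is no longer simply the universe: it is the `all_candidates` set accumulated during bucket sorting, and when an exhaustive hit count is requested it is further reduced by `apply_distinct_rule` to one representative per distinct value. A standalone sketch of why that reduction matters for the reported hit count (a `HashSet` of seen values standing in for the real distinct rule; made-up data):

    use std::collections::HashSet;

    fn main() {
        // (docid, distinct value) pairs for every candidate document.
        let candidates = [(1u32, "a"), (2, "a"), (3, "b"), (4, "b"), (5, "c")];

        // Keep one representative per distinct value, as apply_distinct_rule
        // does; the exhaustive hit count is the number of groups, not the
        // number of candidate documents.
        let mut seen = HashSet::new();
        let remaining: Vec<u32> = candidates
            .iter()
            .filter(|(_, value)| seen.insert(*value))
            .map(|(docid, _)| *docid)
            .collect();

        assert_eq!(candidates.len(), 5); // estimated count without distinct
        assert_eq!(remaining, vec![1, 3, 5]); // exhaustive count with distinct
    }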
diff --git a/milli/src/search/new/tests/distinct.rs b/milli/src/search/new/tests/distinct.rs
index 4073cf585..74e0cdca0 100644
--- a/milli/src/search/new/tests/distinct.rs
+++ b/milli/src/search/new/tests/distinct.rs
@@ -531,7 +531,7 @@ fn test_distinct_all_candidates() {
     let candidates = candidates.iter().collect::<Vec<_>>();
     insta::assert_snapshot!(format!("{documents_ids:?}"), @"[14, 26, 4, 7, 17, 23, 1, 19, 25, 8, 20, 24]");
     // TODO: this is incorrect!
-    insta::assert_snapshot!(format!("{candidates:?}"), @"[0, 2, 5, 8, 9, 15, 18, 20, 21, 24, 25, 26]");
+    insta::assert_snapshot!(format!("{candidates:?}"), @"[1, 4, 7, 8, 14, 17, 19, 20, 23, 24, 25, 26]");
 }
 
 #[test]
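As a sanity check, the corrected `candidates` snapshot is exactly the `documents_ids` snapshot sorted in ascending order: every returned document is a candidate, and in this test, where all distinct representatives fit in one page, nothing else is. A standalone check using only the values from the snapshots above:

    fn main() {
        // documents_ids and candidates from the updated snapshots above.
        let mut documents_ids = vec![14u32, 26, 4, 7, 17, 23, 1, 19, 25, 8, 20, 24];
        documents_ids.sort_unstable();
        assert_eq!(documents_ids, vec![1, 4, 7, 8, 14, 17, 19, 20, 23, 24, 25, 26]);
    }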