diff --git a/milli/src/search/new/distinct.rs b/milli/src/search/new/distinct.rs
new file mode 100644
index 000000000..9ee1746df
--- /dev/null
+++ b/milli/src/search/new/distinct.rs
@@ -0,0 +1,119 @@
+use heed::{
+    types::{ByteSlice, Str, Unit},
+    Database, RoPrefix, RoTxn,
+};
+use roaring::RoaringBitmap;
+
+const FID_SIZE: usize = 2;
+const DOCID_SIZE: usize = 4;
+
+use crate::{
+    heed_codec::{
+        facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetCodec},
+        ByteSliceRefCodec,
+    },
+    Index, Result, SearchContext,
+};
+
+pub struct DistinctOutput {
+    pub remaining: RoaringBitmap,
+    pub excluded: RoaringBitmap,
+}
+
+pub fn apply_distinct_rule<'search>(
+    ctx: &mut SearchContext<'search>,
+    field_id: u16,
+    candidates: &RoaringBitmap,
+) -> Result<DistinctOutput> {
+    let mut excluded = RoaringBitmap::new();
+    let mut remaining = RoaringBitmap::new();
+    for docid in candidates {
+        if excluded.contains(docid) {
+            continue;
+        }
+        distinct_single_docid(ctx.index, ctx.txn, field_id, docid, &mut excluded)?;
+        remaining.push(docid);
+    }
+    Ok(DistinctOutput { remaining, excluded })
+}
+
+fn distinct_single_docid(
+    index: &Index,
+    txn: &RoTxn,
+    field_id: u16,
+    docid: u32,
+    excluded: &mut RoaringBitmap,
+) -> Result<()> {
+    for item in facet_string_values(docid, field_id, index, txn)? {
+        let ((_, _, facet_value), _) = item?;
+        if let Some(facet_docids) = facet_value_docids(
+            index.facet_id_string_docids.remap_types(),
+            txn,
+            field_id,
+            facet_value,
+        )? {
+            *excluded |= facet_docids;
+        }
+    }
+    for item in facet_number_values(docid, field_id, index, txn)? {
+        let ((_, _, facet_value), _) = item?;
+        if let Some(facet_docids) = facet_value_docids(
+            index.facet_id_f64_docids.remap_types(),
+            txn,
+            field_id,
+            facet_value,
+        )? {
+            *excluded |= facet_docids;
+        }
+    }
+    Ok(())
+}
+
+fn facet_value_docids(
+    database: Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
+    txn: &RoTxn,
+    field_id: u16,
+    facet_value: &[u8],
+) -> heed::Result<Option<RoaringBitmap>> {
+    database
+        .get(txn, &FacetGroupKey { field_id, level: 0, left_bound: facet_value })
+        .map(|opt| opt.map(|v| v.bitmap))
+}
+fn facet_number_values<'a>(
+    id: u32,
+    distinct: u16,
+    index: &Index,
+    txn: &'a RoTxn,
+) -> Result<RoPrefix<'a, FieldDocIdFacetCodec<ByteSliceRefCodec>, Unit>> {
+    let key = facet_values_prefix_key(distinct, id);
+
+    let iter = index
+        .field_id_docid_facet_f64s
+        .remap_key_type::<ByteSlice>()
+        .prefix_iter(txn, &key)?
+        .remap_key_type();
+
+    Ok(iter)
+}
+
+fn facet_string_values<'a>(
+    docid: u32,
+    distinct: u16,
+    index: &Index,
+    txn: &'a RoTxn,
+) -> Result<RoPrefix<'a, FieldDocIdFacetCodec<ByteSliceRefCodec>, Str>> {
+    let key = facet_values_prefix_key(distinct, docid);
+
+    let iter = index
+        .field_id_docid_facet_strings
+        .remap_key_type::<ByteSlice>()
+        .prefix_iter(txn, &key)?
+        .remap_types();
+
+    Ok(iter)
+}
+
+#[allow(clippy::drop_non_drop)]
+fn facet_values_prefix_key(distinct: u16, id: u32) -> [u8; FID_SIZE + DOCID_SIZE] {
+    concat_arrays::concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes())
+}
diff --git a/milli/src/search/new/logger/detailed.rs b/milli/src/search/new/logger/detailed.rs index 5929f406c..ef33bdbf9 100644 --- a/milli/src/search/new/logger/detailed.rs +++ b/milli/src/search/new/logger/detailed.rs @@ -432,27 +432,23 @@ results.{random} {{ file: &mut File, ) { match &node { - QueryNode::Term(LocatedQueryTerm { value, .. }) => match ctx - .query_term_interner - .get(*value) - { + QueryNode::Term(LocatedQueryTerm { value, ..
}) => match value { QueryTerm::Phrase { phrase } => { let phrase = ctx.phrase_interner.get(*phrase); let phrase_str = phrase.description(&ctx.word_interner); writeln!(file, "{node_idx} : \"{phrase_str}\"").unwrap(); } - QueryTerm::Word { - derivations: - WordDerivations { - original, - zero_typo, - one_typo, - two_typos, - use_prefix_db, - synonyms, - split_words, - }, - } => { + QueryTerm::Word { derivations } => { + let WordDerivations { + original, + zero_typo, + one_typo, + two_typos, + use_prefix_db, + synonyms, + split_words, + } = ctx.derivations_interner.get(*derivations); + let original = ctx.word_interner.get(*original); writeln!( file, @@ -596,12 +592,13 @@ shape: class" graph.edges_store[edge_idx as usize].as_ref().unwrap(); let source_node = &graph.query_graph.nodes[*source_node as usize]; let source_node_desc = match source_node { - QueryNode::Term(term) => match ctx.query_term_interner.get(term.value) { + QueryNode::Term(term) => match term.value { QueryTerm::Phrase { phrase } => { - let phrase = ctx.phrase_interner.get(*phrase); + let phrase = ctx.phrase_interner.get(phrase); phrase.description(&ctx.word_interner) } QueryTerm::Word { derivations } => { + let derivations = ctx.derivations_interner.get(derivations); ctx.word_interner.get(derivations.original).to_owned() } }, @@ -611,12 +608,13 @@ shape: class" }; let dest_node = &graph.query_graph.nodes[*dest_node as usize]; let dest_node_desc = match dest_node { - QueryNode::Term(term) => match ctx.query_term_interner.get(term.value) { + QueryNode::Term(term) => match term.value { QueryTerm::Phrase { phrase } => { - let phrase = ctx.phrase_interner.get(*phrase); + let phrase = ctx.phrase_interner.get(phrase); phrase.description(&ctx.word_interner) } QueryTerm::Word { derivations } => { + let derivations = ctx.derivations_interner.get(derivations); ctx.word_interner.get(derivations.original).to_owned() } }, diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 
3975dd4b6..125e2b1e0 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -1,11 +1,11 @@ mod db_cache; +mod distinct; mod graph_based_ranking_rule; mod interner; mod logger; mod query_graph; mod query_term; mod ranking_rule_graph; - mod ranking_rules; mod resolve_query_graph; // TODO: documentation + comments diff --git a/milli/src/search/new/ranking_rules.rs b/milli/src/search/new/ranking_rules.rs index d65610f9d..350eed58f 100644 --- a/milli/src/search/new/ranking_rules.rs +++ b/milli/src/search/new/ranking_rules.rs @@ -3,6 +3,7 @@ use roaring::RoaringBitmap; use super::logger::SearchLogger; use super::{QueryGraph, SearchContext}; // use crate::search::new::sort::Sort; +use crate::search::new::distinct::{apply_distinct_rule, DistinctOutput}; use crate::Result; /// An internal trait implemented by only [`PlaceholderQuery`] and [`QueryGraph`] @@ -80,6 +81,12 @@ pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>( logger.ranking_rules(&ranking_rules); + let distinct_fid = if let Some(field) = ctx.index.distinct_field(ctx.txn)? { + ctx.index.fields_ids_map(ctx.txn)?.id(field) + } else { + None + }; + if universe.len() < from as u64 { return Ok(vec![]); } @@ -88,8 +95,9 @@ pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>( logger.start_iteration_ranking_rule(0, ranking_rules[0], query_graph, universe); ranking_rules[0].start_iteration(ctx, logger, universe, query_graph)?; - let mut candidates: Vec = vec![RoaringBitmap::default(); ranking_rules_len]; - candidates[0] = universe.clone(); + let mut ranking_rule_universes: Vec = + vec![RoaringBitmap::default(); ranking_rules_len]; + ranking_rule_universes[0] = universe.clone(); let mut cur_ranking_rule_index = 0; @@ -98,13 +106,13 @@ pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>( /// Update the candidates accordingly and inform the logger. macro_rules! 
back { () => { - assert!(candidates[cur_ranking_rule_index].is_empty()); + assert!(ranking_rule_universes[cur_ranking_rule_index].is_empty()); logger.end_iteration_ranking_rule( cur_ranking_rule_index, ranking_rules[cur_ranking_rule_index], - &candidates[cur_ranking_rule_index], + &ranking_rule_universes[cur_ranking_rule_index], ); - candidates[cur_ranking_rule_index].clear(); + ranking_rule_universes[cur_ranking_rule_index].clear(); ranking_rules[cur_ranking_rule_index].end_iteration(ctx, logger); if cur_ranking_rule_index == 0 { break; @@ -117,22 +125,35 @@ pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>( let mut results = vec![]; let mut cur_offset = 0usize; - /// Add the candidates to the results. Take the `from`, `limit`, and `cur_offset` + /// Add the candidates to the results. Take `distinct`, `from`, `limit`, and `cur_offset` /// into account and inform the logger. macro_rules! maybe_add_to_results { ($candidates:expr) => { - let candidates = $candidates; + // First apply the distinct rule on the candidates, reducing the universes if necessary + let candidates = if let Some(distinct_fid) = distinct_fid { + let DistinctOutput { remaining, excluded } = apply_distinct_rule(ctx, distinct_fid, $candidates)?; + for universe in ranking_rule_universes.iter_mut() { + *universe -= &excluded; + } + remaining + } else { + $candidates.clone() + }; let len = candidates.len(); // if the candidates are empty, there is nothing to do; if !candidates.is_empty() { + // if we still haven't reached the first document to return if cur_offset < from { + // and if no document from this bucket can be returned if cur_offset + (candidates.len() as usize) < from { + // then just skip the bucket logger.skip_bucket_ranking_rule( cur_ranking_rule_index, ranking_rules[cur_ranking_rule_index], &candidates, ); } else { + // otherwise, skip some of the documents and add some of the rest, in order of ids let all_candidates = candidates.iter().collect::>(); let (skipped_candidates, 
candidates) = all_candidates.split_at(from - cur_offset); @@ -150,6 +171,7 @@ pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>( results.extend(&candidates); } } else { + // if we have passed the offset already, add some of the documents (up to the limit) let candidates = candidates.iter().take(length - results.len()).collect::>(); logger.add_to_results(&candidates); @@ -162,14 +184,14 @@ pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>( while results.len() < length { // The universe for this bucket is zero or one element, so we don't need to sort // anything, just extend the results and go back to the parent ranking rule. - if candidates[cur_ranking_rule_index].len() <= 1 { - maybe_add_to_results!(&candidates[cur_ranking_rule_index]); - candidates[cur_ranking_rule_index].clear(); + if ranking_rule_universes[cur_ranking_rule_index].len() <= 1 { + maybe_add_to_results!(&ranking_rule_universes[cur_ranking_rule_index]); + ranking_rule_universes[cur_ranking_rule_index].clear(); back!(); continue; } - let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(ctx, logger, &candidates[cur_ranking_rule_index])? else { + let Some(next_bucket) = ranking_rules[cur_ranking_rule_index].next_bucket(ctx, logger, &ranking_rule_universes[cur_ranking_rule_index])? 
else { back!(); continue; }; @@ -177,12 +199,12 @@ pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>( logger.next_bucket_ranking_rule( cur_ranking_rule_index, ranking_rules[cur_ranking_rule_index], - &candidates[cur_ranking_rule_index], + &ranking_rule_universes[cur_ranking_rule_index], &next_bucket.candidates, ); - assert!(candidates[cur_ranking_rule_index].is_superset(&next_bucket.candidates)); - candidates[cur_ranking_rule_index] -= &next_bucket.candidates; + assert!(ranking_rule_universes[cur_ranking_rule_index].is_superset(&next_bucket.candidates)); + ranking_rule_universes[cur_ranking_rule_index] -= &next_bucket.candidates; if cur_ranking_rule_index == ranking_rules_len - 1 || next_bucket.candidates.len() <= 1 @@ -193,12 +215,12 @@ pub fn bucket_sort<'search, Q: RankingRuleQueryTrait>( } cur_ranking_rule_index += 1; - candidates[cur_ranking_rule_index] = next_bucket.candidates.clone(); + ranking_rule_universes[cur_ranking_rule_index] = next_bucket.candidates.clone(); logger.start_iteration_ranking_rule( cur_ranking_rule_index, ranking_rules[cur_ranking_rule_index], &next_bucket.query, - &candidates[cur_ranking_rule_index], + &ranking_rule_universes[cur_ranking_rule_index], ); ranking_rules[cur_ranking_rule_index].start_iteration( ctx,