use heed::types::{ByteSlice, Str, Unit}; use heed::{Database, RoPrefix, RoTxn}; use roaring::RoaringBitmap; const FID_SIZE: usize = 2; const DOCID_SIZE: usize = 4; use crate::heed_codec::facet::{ FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetCodec, }; use crate::heed_codec::ByteSliceRefCodec; use crate::{Index, Result, SearchContext}; pub struct DistinctOutput { pub remaining: RoaringBitmap, pub excluded: RoaringBitmap, } /// Return a [`DistinctOutput`] containing: /// - `remaining`: a set of docids built such that exactly one element from `candidates` /// is kept for each distinct value inside the given field. If the field does not exist, it /// is considered unique. /// - `excluded`: the set of document ids that contain a value for the given field that occurs /// in the given candidates. pub fn apply_distinct_rule( ctx: &mut SearchContext, field_id: u16, candidates: &RoaringBitmap, // TODO: add a universe here, such that the `excluded` are a subset of the universe? ) -> Result { let mut excluded = RoaringBitmap::new(); let mut remaining = RoaringBitmap::new(); for docid in candidates { if excluded.contains(docid) { continue; } distinct_single_docid(ctx.index, ctx.txn, field_id, docid, &mut excluded)?; remaining.push(docid); } Ok(DistinctOutput { remaining, excluded }) } /// Apply the distinct rule defined by [`apply_distinct_rule`] for a single document id. pub fn distinct_single_docid( index: &Index, txn: &RoTxn, field_id: u16, docid: u32, excluded: &mut RoaringBitmap, ) -> Result<()> { for item in facet_string_values(docid, field_id, index, txn)? { let ((_, _, facet_value), _) = item?; if let Some(facet_docids) = facet_value_docids( index.facet_id_string_docids.remap_types(), txn, field_id, facet_value, )? { *excluded |= facet_docids; } } for item in facet_number_values(docid, field_id, index, txn)? { let ((_, _, facet_value), _) = item?; if let Some(facet_docids) = facet_value_docids(index.facet_id_f64_docids.remap_types(), txn, field_id, facet_value)? { *excluded |= facet_docids; } } Ok(()) } /// Return all the docids containing the given value in the given field fn facet_value_docids( database: Database, FacetGroupValueCodec>, txn: &RoTxn, field_id: u16, facet_value: &[u8], ) -> heed::Result> { database .get(txn, &FacetGroupKey { field_id, level: 0, left_bound: facet_value }) .map(|opt| opt.map(|v| v.bitmap)) } /// Return an iterator over each number value in the given field of the given document. fn facet_number_values<'a>( docid: u32, field_id: u16, index: &Index, txn: &'a RoTxn, ) -> Result, Unit>> { let key = facet_values_prefix_key(field_id, docid); let iter = index .field_id_docid_facet_f64s .remap_key_type::() .prefix_iter(txn, &key)? .remap_key_type(); Ok(iter) } /// Return an iterator over each string value in the given field of the given document. fn facet_string_values<'a>( docid: u32, field_id: u16, index: &Index, txn: &'a RoTxn, ) -> Result, Str>> { let key = facet_values_prefix_key(field_id, docid); let iter = index .field_id_docid_facet_strings .remap_key_type::() .prefix_iter(txn, &key)? .remap_types(); Ok(iter) } #[allow(clippy::drop_non_drop)] fn facet_values_prefix_key(distinct: u16, id: u32) -> [u8; FID_SIZE + DOCID_SIZE] { concat_arrays::concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes()) }