Make the search always return the facets ordered by count

This commit is contained in:
Kerollmops 2023-05-29 11:52:57 +02:00 committed by Clément Renault
parent bd3c026406
commit f42bef2f66
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
2 changed files with 55 additions and 44 deletions

View File

@ -240,42 +240,49 @@ impl<'a> FacetDistribution<'a> {
} }
fn facet_values(&self, field_id: FieldId) -> heed::Result<BTreeMap<String, u64>> { fn facet_values(&self, field_id: FieldId) -> heed::Result<BTreeMap<String, u64>> {
use FacetType::{Number, String}; // use FacetType::{Number, String};
match self.candidates { let candidates = match self.candidates.as_ref() {
Some(ref candidates) => { Some(candidates) => candidates.clone(),
// Classic search, candidates were specified, we must return facet values only related None => todo!("fetch candidates"),
// to those candidates. We also enter here for facet strings for performance reasons. };
let mut distribution = BTreeMap::new();
if candidates.len() <= CANDIDATES_THRESHOLD { let mut distribution = BTreeMap::new();
self.facet_distribution_from_documents(
field_id, let number_distribution = facet_distribution_iter::count_iterate_over_facet_distribution(
Number, self.rtxn,
candidates, self.index
&mut distribution, .facet_id_f64_docids
)?; .remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(),
self.facet_distribution_from_documents( field_id,
field_id, &candidates,
String, )?;
candidates,
&mut distribution, for (count, facet_key, _) in number_distribution {
)?; let facet_key = OrderedF64Codec::bytes_decode(facet_key).unwrap();
} else { distribution.insert(facet_key.to_string(), count);
self.facet_numbers_distribution_from_facet_levels(
field_id,
candidates,
&mut distribution,
)?;
self.facet_strings_distribution_from_facet_levels(
field_id,
candidates,
&mut distribution,
)?;
}
Ok(distribution)
}
None => self.facet_values_from_raw_facet_database(field_id),
} }
let string_distribution = facet_distribution_iter::count_iterate_over_facet_distribution(
self.rtxn,
self.index
.facet_id_string_docids
.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(),
field_id,
&candidates,
)?;
for (count, facet_key, any_docid) in string_distribution {
let facet_key = StrRefCodec::bytes_decode(facet_key).unwrap();
let key: (FieldId, _, &str) = (field_id, any_docid, facet_key);
let original_string =
self.index.field_id_docid_facet_strings.get(self.rtxn, &key)?.unwrap().to_owned();
distribution.insert(original_string, count);
}
Ok(distribution)
} }
pub fn compute_stats(&self) -> Result<BTreeMap<String, (f64, f64)>> { pub fn compute_stats(&self) -> Result<BTreeMap<String, (f64, f64)>> {

View File

@ -1,5 +1,5 @@
use std::cmp::Reverse; use std::cmp::Reverse;
use std::collections::{BTreeMap, BinaryHeap}; use std::collections::BinaryHeap;
use std::ops::ControlFlow; use std::ops::ControlFlow;
use heed::Result; use heed::Result;
@ -46,15 +46,12 @@ where
} }
} }
pub fn count_iterate_over_facet_distribution<'t, CB>( pub fn count_iterate_over_facet_distribution<'t>(
rtxn: &'t heed::RoTxn<'t>, rtxn: &'t heed::RoTxn<'t>,
db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>, db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
field_id: u16, field_id: u16,
candidates: &RoaringBitmap, candidates: &RoaringBitmap,
) -> Result<Vec<(u64, &'t [u8])>> ) -> Result<Vec<(u64, &'t [u8], u32)>> {
where
CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
{
#[derive(Debug, PartialOrd, Ord, PartialEq, Eq)] #[derive(Debug, PartialOrd, Ord, PartialEq, Eq)]
struct LevelEntry<'t> { struct LevelEntry<'t> {
/// The number of candidates in this entry. /// The number of candidates in this entry.
@ -65,6 +62,8 @@ where
left_bound: &'t [u8], left_bound: &'t [u8],
/// The number of keys we must look for after `left_bound`. /// The number of keys we must look for after `left_bound`.
group_size: u8, group_size: u8,
/// Any docid in the set of matching documents. Used to find the original facet string.
any_docid: u32,
} }
// Represents the list of keys that we must explore. // Represents the list of keys that we must explore.
@ -88,20 +87,23 @@ where
if key.field_id != field_id { if key.field_id != field_id {
break; break;
} }
let count = value.bitmap.intersection_len(&candidates); let intersection = value.bitmap & candidates;
let count = intersection.len();
if count != 0 { if count != 0 {
heap.push(LevelEntry { heap.push(LevelEntry {
count, count,
level: Reverse(key.level), level: Reverse(key.level),
left_bound: key.left_bound, left_bound: key.left_bound,
group_size: value.size, group_size: value.size,
any_docid: intersection.min().unwrap(),
}); });
} }
} }
while let Some(LevelEntry { count, level, left_bound, group_size }) = heap.pop() { while let Some(LevelEntry { count, level, left_bound, group_size, any_docid }) = heap.pop()
{
if let Reverse(0) = level { if let Reverse(0) = level {
results.push((count, left_bound)); results.push((count, left_bound, any_docid));
// TODO better just call the user callback and ask for a ControlFlow // TODO better just call the user callback and ask for a ControlFlow
if results.len() == 20 { if results.len() == 20 {
break; break;
@ -116,13 +118,15 @@ where
if key.field_id != field_id { if key.field_id != field_id {
break; break;
} }
let count = value.bitmap.intersection_len(&candidates); let intersection = value.bitmap & candidates;
let count = intersection.len();
if count != 0 { if count != 0 {
heap.push(LevelEntry { heap.push(LevelEntry {
count, count,
level: Reverse(key.level), level: Reverse(key.level),
left_bound: key.left_bound, left_bound: key.left_bound,
group_size: value.size, group_size: value.size,
any_docid: intersection.min().unwrap(),
}); });
} }
} }