diff --git a/Cargo.lock b/Cargo.lock index de41e6a19..fcca06546 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -673,6 +673,7 @@ dependencies = [ "criterion", "crossbeam-channel", "csv", + "either", "flate2", "fst", "fxhash", diff --git a/Cargo.toml b/Cargo.toml index a6f0ff911..3fcdfff03 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,6 +10,7 @@ bstr = "0.2.13" byteorder = "1.3.4" crossbeam-channel = "0.5.0" csv = "1.1.3" +either = "1.6.1" flate2 = "1.0.17" fst = "0.4.4" fxhash = "0.2.1" diff --git a/http-ui/Cargo.lock b/http-ui/Cargo.lock index b3136fd77..662225c77 100644 --- a/http-ui/Cargo.lock +++ b/http-ui/Cargo.lock @@ -999,6 +999,7 @@ dependencies = [ "byteorder", "crossbeam-channel", "csv", + "either", "flate2", "fst", "fxhash", diff --git a/src/search/facet/mod.rs b/src/search/facet/mod.rs index 6b2d90c8e..41212e83e 100644 --- a/src/search/facet/mod.rs +++ b/src/search/facet/mod.rs @@ -1,9 +1,11 @@ use std::fmt::Debug; use std::ops::Bound::{self, Included, Excluded, Unbounded}; -use heed::types::DecodeIgnore; +use either::Either::{self, Left, Right}; +use heed::types::{DecodeIgnore, ByteSlice}; use heed::{BytesEncode, BytesDecode}; -use heed::{Database, RoRange, LazyDecode}; +use heed::{Database, RoRange, RoRevRange, LazyDecode}; +use log::debug; use num_traits::Bounded; use roaring::RoaringBitmap; @@ -76,16 +78,78 @@ where } } +struct FacetRevRange<'t, T: 't, KC> { + iter: RoRevRange<'t, KC, LazyDecode>, + end: Bound, +} + +impl<'t, T: 't, KC> FacetRevRange<'t, T, KC> +where + KC: for<'a> BytesEncode<'a, EItem = (FieldId, u8, T, T)>, + T: PartialOrd + Copy + Bounded, +{ + fn new( + rtxn: &'t heed::RoTxn, + db: Database, + field_id: FieldId, + level: u8, + left: Bound, + right: Bound, + ) -> heed::Result> + { + let left_bound = match left { + Included(left) => Included((field_id, level, left, T::min_value())), + Excluded(left) => Excluded((field_id, level, left, T::min_value())), + Unbounded => Included((field_id, level, T::min_value(), T::min_value())), + }; + let right_bound = Included((field_id, level, T::max_value(), T::max_value())); + let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?; + Ok(FacetRevRange { iter, end: right }) + } +} + +impl<'t, T, KC> Iterator for FacetRevRange<'t, T, KC> +where + KC: for<'a> BytesEncode<'a, EItem = (FieldId, u8, T, T)>, + KC: BytesDecode<'t, DItem = (FieldId, u8, T, T)>, + T: PartialOrd + Copy, +{ + type Item = heed::Result<((FieldId, u8, T, T), RoaringBitmap)>; + + fn next(&mut self) -> Option { + loop { + match self.iter.next() { + Some(Ok(((fid, level, left, right), docids))) => { + let must_be_returned = match self.end { + Included(end) => right <= end, + Excluded(end) => right < end, + Unbounded => true, + }; + if must_be_returned { + match docids.decode() { + Ok(docids) => return Some(Ok(((fid, level, left, right), docids))), + Err(e) => return Some(Err(e)), + } + } + continue; + }, + Some(Err(e)) => return Some(Err(e)), + None => return None, + } + } + } +} + pub struct FacetIter<'t, T: 't, KC> { rtxn: &'t heed::RoTxn<'t>, db: Database, field_id: FieldId, - documents_ids: RoaringBitmap, - level_iters: Vec>, + level_iters: Vec<(RoaringBitmap, Either, FacetRevRange<'t, T, KC>>)>, } impl<'t, T, KC> FacetIter<'t, T, KC> where + KC: heed::BytesDecode<'t, DItem = (FieldId, u8, T, T)>, KC: for<'a> BytesEncode<'a, EItem = (FieldId, u8, T, T)>, T: PartialOrd + Copy + Bounded, { @@ -97,8 +161,31 @@ where ) -> heed::Result> { let db = index.facet_field_id_value_docids.remap_key_type::(); - let level_0_iter = FacetRange::new(rtxn, db, field_id, 0, Unbounded, Unbounded)?; - Ok(FacetIter { rtxn, db, field_id, documents_ids, level_iters: vec![level_0_iter] }) + let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); + let highest_iter = FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; + Ok(FacetIter { rtxn, db, field_id, level_iters: vec![(documents_ids, Left(highest_iter))] }) + } + + pub fn new_reverse( + rtxn: &'t heed::RoTxn, + index: &'t Index, + field_id: FieldId, + documents_ids: RoaringBitmap, + ) -> heed::Result> + { + let db = index.facet_field_id_value_docids.remap_key_type::(); + let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); + let highest_iter = FacetRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; + Ok(FacetIter { rtxn, db, field_id, level_iters: vec![(documents_ids, Right(highest_iter))] }) + } + + fn highest_level(rtxn: &'t heed::RoTxn, db: Database, fid: FieldId) -> heed::Result> { + let level = db.remap_types::() + .prefix_iter(rtxn, &[fid][..])? + .remap_key_type::() + .last().transpose()? + .map(|((_, level, _, _), _)| level); + Ok(level) } } @@ -106,35 +193,54 @@ impl<'t, T: 't, KC> Iterator for FacetIter<'t, T, KC> where KC: heed::BytesDecode<'t, DItem = (FieldId, u8, T, T)>, KC: for<'x> heed::BytesEncode<'x, EItem = (FieldId, u8, T, T)>, - T: PartialOrd + Copy + Bounded, + T: PartialOrd + Copy + Bounded + Debug, { type Item = heed::Result<(T, RoaringBitmap)>; fn next(&mut self) -> Option { - loop { - let last = self.level_iters.last_mut()?; + 'outer: loop { + let (documents_ids, last) = self.level_iters.last_mut()?; + let is_ascending = last.is_left(); for result in last { + // If the last iterator must find an empty set of documents it means + // that we found all the documents in the sub level iterations already, + // we can pop this level iterator. + if documents_ids.is_empty() { + break; + } + match result { Ok(((_fid, level, left, right), mut docids)) => { - if level == 0 { - docids.intersect_with(&self.documents_ids); - if !docids.is_empty() { - self.documents_ids.difference_with(&docids); + + docids.intersect_with(&documents_ids); + if !docids.is_empty() { + documents_ids.difference_with(&docids); + + if level == 0 { + debug!("found {:?} at {:?}", docids, left); return Some(Ok((left, docids))); } - } else if !docids.is_disjoint(&self.documents_ids) { - let result = FacetRange::new( - self.rtxn, - self.db, - self.field_id, - level - 1, - Included(left), - Included(right), + + let rtxn = self.rtxn; + let db = self.db; + let fid = self.field_id; + let left = Included(left); + let right = Included(right); + + debug!("calling with {:?} to {:?} (level {}) to find {:?}", + left, right, level - 1, docids, ); + + let result = if is_ascending { + FacetRange::new(rtxn, db, fid, level - 1, left, right).map(Left) + } else { + FacetRevRange::new(rtxn, db, fid, level - 1, left, right).map(Right) + }; + match result { Ok(iter) => { - self.level_iters.push(iter); - break; + self.level_iters.push((docids, iter)); + continue 'outer; }, Err(e) => return Some(Err(e)), } diff --git a/src/search/mod.rs b/src/search/mod.rs index 078cf2dab..3ec05f485 100644 --- a/src/search/mod.rs +++ b/src/search/mod.rs @@ -17,6 +17,7 @@ use crate::query_tokens::{QueryTokens, QueryToken}; use crate::{Index, FieldId, DocumentId, Criterion}; pub use self::facet::{FacetCondition, FacetNumberOperator, FacetStringOperator}; +pub use self::facet::{FacetIter}; // Building these factories is not free. static LEVDIST0: Lazy = Lazy::new(|| LevBuilder::new(0, true)); @@ -151,7 +152,7 @@ impl<'a> Search<'a> { &self, field_id: FieldId, facet_type: FacetType, - order: Order, + ascending: bool, documents_ids: RoaringBitmap, limit: usize, ) -> anyhow::Result> @@ -160,34 +161,30 @@ impl<'a> Search<'a> { let mut output = Vec::new(); match facet_type { FacetType::Float => { - facet_number_recurse::( - self.rtxn, - self.index, - field_id, - order, - documents_ids, - |_val, docids| { - limit_tmp = limit_tmp.saturating_sub(docids.len() as usize); - debug!("Facet ordered iteration find {:?}", docids); - output.push(docids); - limit_tmp != 0 // Returns `true` if we must continue iterating - } - )?; + let facet_fn = if ascending { + FacetIter::::new + } else { + FacetIter::::new_reverse + }; + for result in facet_fn(self.rtxn, self.index, field_id, documents_ids)? { + let (_val, docids) = result?; + limit_tmp = limit_tmp.saturating_sub(docids.len() as usize); + output.push(docids); + if limit_tmp == 0 { break } + } }, FacetType::Integer => { - facet_number_recurse::( - self.rtxn, - self.index, - field_id, - order, - documents_ids, - |_val, docids| { - limit_tmp = limit_tmp.saturating_sub(docids.len() as usize); - debug!("Facet ordered iteration find {:?}", docids); - output.push(docids); - limit_tmp != 0 // Returns `true` if we must continue iterating - } - )?; + let facet_fn = if ascending { + FacetIter::::new + } else { + FacetIter::::new_reverse + }; + for result in facet_fn(self.rtxn, self.index, field_id, documents_ids)? { + let (_val, docids) = result?; + limit_tmp = limit_tmp.saturating_sub(docids.len() as usize); + output.push(docids); + if limit_tmp == 0 { break } + } }, FacetType::String => bail!("criteria facet type must be a number"), } @@ -214,16 +211,16 @@ impl<'a> Search<'a> { let criteria = self.index.criteria(self.rtxn)?; let result = criteria.into_iter().flat_map(|criterion| { match criterion { - Criterion::Asc(fid) => Some((fid, Order::Asc)), - Criterion::Desc(fid) => Some((fid, Order::Desc)), + Criterion::Asc(fid) => Some((fid, true)), + Criterion::Desc(fid) => Some((fid, false)), _ => None } }).next(); match result { - Some((fid, order)) => { + Some((fid, is_ascending)) => { let faceted_fields = self.index.faceted_fields(self.rtxn)?; let ftype = *faceted_fields.get(&fid).context("unknown field id")?; - Some((fid, ftype, order)) + Some((fid, ftype, is_ascending)) }, None => None, } @@ -244,7 +241,9 @@ impl<'a> Search<'a> { // If the query is not set or results in no DFAs but // there is some facet conditions we return a placeholder. let documents_ids = match order_by_facet { - Some((fid, ftype, order)) => self.facet_ordered(fid, ftype, order, facet_candidates, limit)?, + Some((fid, ftype, is_ascending)) => { + self.facet_ordered(fid, ftype, is_ascending, facet_candidates, limit)? + }, None => facet_candidates.iter().take(limit).collect(), }; return Ok(SearchResult { documents_ids, ..Default::default() }) @@ -253,7 +252,9 @@ impl<'a> Search<'a> { // If the query is not set or results in no DFAs we return a placeholder. let documents_ids = self.index.documents_ids(self.rtxn)?; let documents_ids = match order_by_facet { - Some((fid, ftype, order)) => self.facet_ordered(fid, ftype, order, documents_ids, limit)?, + Some((fid, ftype, is_ascending)) => { + self.facet_ordered(fid, ftype, is_ascending, documents_ids, limit)? + }, None => documents_ids.iter().take(limit).collect(), }; return Ok(SearchResult { documents_ids, ..Default::default() })