Reintroduce filter range search and facet extractors

This commit is contained in:
Loïc Lecrenier 2022-08-31 08:27:16 +02:00 committed by Loïc Lecrenier
parent 22d80eeaf9
commit 39a4a0a362
5 changed files with 92 additions and 231 deletions

View File

@ -15,7 +15,7 @@ use super::get_last_facet_value;
pub fn find_docids_of_facet_within_bounds<'t, BoundCodec>( pub fn find_docids_of_facet_within_bounds<'t, BoundCodec>(
rtxn: &'t heed::RoTxn<'t>, rtxn: &'t heed::RoTxn<'t>,
db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>, db: &'t heed::Database<FacetKeyCodec<BoundCodec>, FacetGroupValueCodec>,
field_id: u16, field_id: u16,
left: &'t Bound<<BoundCodec as BytesEncode<'t>>::EItem>, left: &'t Bound<<BoundCodec as BytesEncode<'t>>::EItem>,
right: &'t Bound<<BoundCodec as BytesEncode<'t>>::EItem>, right: &'t Bound<<BoundCodec as BytesEncode<'t>>::EItem>,
@ -48,13 +48,13 @@ where
} }
Bound::Unbounded => Bound::Unbounded, Bound::Unbounded => Bound::Unbounded,
}; };
let db = db.remap_key_type::<FacetKeyCodec<MyByteSlice>>();
let mut docids = RoaringBitmap::new(); let mut docids = RoaringBitmap::new();
let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, docids: &mut docids }; let mut f = FacetRangeSearch { rtxn, db: &db, field_id, left, right, docids: &mut docids };
let highest_level = get_highest_level(rtxn, db, field_id)?; let highest_level = get_highest_level(rtxn, &db, field_id)?;
if let Some(first_bound) = get_first_facet_value::<MyByteSlice>(rtxn, db, field_id)? { if let Some(first_bound) = get_first_facet_value::<MyByteSlice>(rtxn, &db, field_id)? {
let last_bound = get_last_facet_value::<MyByteSlice>(rtxn, db, field_id)?.unwrap(); let last_bound = get_last_facet_value::<MyByteSlice>(rtxn, &db, field_id)?.unwrap();
f.run(highest_level, first_bound, Bound::Included(last_bound), usize::MAX)?; f.run(highest_level, first_bound, Bound::Included(last_bound), usize::MAX)?;
Ok(docids) Ok(docids)
} else { } else {

View File

@ -1,22 +1,17 @@
use std::collections::HashSet;
use std::fmt::{Debug, Display};
use std::ops::Bound::{self, Excluded, Included};
use std::ops::RangeBounds;
use either::Either; use either::Either;
pub use filter_parser::{Condition, Error as FPError, FilterCondition, Span, Token}; pub use filter_parser::{Condition, Error as FPError, FilterCondition, Span, Token};
use heed::types::DecodeIgnore; use heed::types::DecodeIgnore;
use heed::LazyDecode;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use std::collections::HashSet;
use std::fmt::{Debug, Display};
use std::ops::Bound::{self, Excluded, Included};
// use super::FacetNumberRange;
use crate::error::{Error, UserError}; use crate::error::{Error, UserError};
use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec; use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec;
use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec}; use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec};
// use crate::heed_codec::facet::FacetLevelValueF64Codec; use crate::{distance_between_two_points, lat_lng_to_xyz, FieldId, Index, Result};
use crate::{
distance_between_two_points, lat_lng_to_xyz, CboRoaringBitmapCodec, FieldId, Index, Result, use super::facet_range_search;
};
/// The maximum number of filters the filter AST can process. /// The maximum number of filters the filter AST can process.
const MAX_FILTER_DEPTH: usize = 2000; const MAX_FILTER_DEPTH: usize = 2000;
@ -147,158 +142,15 @@ impl<'a> Filter<'a> {
} }
} }
// Free-standing stub with an empty body: it accepts a read transaction, the
// facet number database (keys encoded with `FacetKeyCodec<OrderedF64Codec>`,
// values with `FacetGroupValueCodec`) and a field id, performs no work and
// returns `()`.
// NOTE(review): it shares its name with the `Filter::explore_facet_number_levels`
// associated function that does the actual range exploration — presumably a
// leftover placeholder; confirm before relying on it.
fn explore_facet_number_levels(
rtxn: &heed::RoTxn,
db: heed::Database<FacetKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>,
field_id: FieldId,
) {
}
impl<'a> Filter<'a> { impl<'a> Filter<'a> {
/// Aggregates the documents ids that are part of the specified range automatically pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result<RoaringBitmap> {
/// going deeper through the levels. // to avoid doing this for each recursive call we're going to do it ONCE ahead of time
fn explore_facet_number_levels( let soft_deleted_documents = index.soft_deleted_documents_ids(rtxn)?;
rtxn: &heed::RoTxn, let filterable_fields = index.filterable_fields(rtxn)?;
db: heed::Database<FacetKeyCodec<OrderedF64Codec>, CboRoaringBitmapCodec>,
field_id: FieldId,
level: u8,
left: Bound<f64>,
right: Bound<f64>,
output: &mut RoaringBitmap,
) -> Result<()> {
// level must be > 0, I'll create a separate function for level 0
// if level == 0 {
// call that function
//}
match (left, right) {
// If the request is an exact value we must go directly to the deepest level.
(Included(l), Included(r)) if l == r && level > 0 => {
return Self::explore_facet_number_levels(
rtxn, db, field_id, 0, left, right, output,
);
}
// lower TO upper when lower > upper must return no result
(Included(l), Included(r)) if l > r => return Ok(()),
(Included(l), Excluded(r)) if l >= r => return Ok(()),
(Excluded(l), Excluded(r)) if l >= r => return Ok(()),
(Excluded(l), Included(r)) if l >= r => return Ok(()),
(_, _) => (),
}
let range_start_key = FacetKey {
field_id,
level,
left_bound: match left {
Included(l) => l,
Excluded(l) => l,
Bound::Unbounded => f64::MIN,
},
};
let mut range_iter = db
.remap_data_type::<LazyDecode<FacetGroupValueCodec>>()
.range(rtxn, &(range_start_key..))?;
let (mut previous_facet_key, mut previous_value) = range_iter.next().unwrap()?; // and finally we delete all the soft_deleted_documents, again, only once at the very end
while let Some(el) = range_iter.next() { self.inner_evaluate(rtxn, index, &filterable_fields)
let (facet_key, value) = el?; .map(|result| result - soft_deleted_documents)
let range = (Included(previous_facet_key.left_bound), Excluded(facet_key.left_bound));
// if the current range intersects with the query range, then go deeper
// what does it mean for two ranges to intersect?
let gte_left = match left {
Included(l) => previous_facet_key.left_bound >= l,
Excluded(l) => previous_facet_key.left_bound > l, // TODO: not true?
Bound::Unbounded => true,
};
let lte_right = match right {
Included(r) => facet_key.left_bound <= r,
Excluded(r) => facet_key.left_bound < r,
Bound::Unbounded => true,
};
}
// at this point, previous_facet_key and previous_value are the last groups in the level
// we must also check whether we should visit this group
todo!();
// let mut left_found = None;
// let mut right_found = None;
// // We must create a custom iterator to be able to iterate over the
// // requested range as the range iterator cannot express some conditions.
// let iter = FacetNumberRange::new(rtxn, db, field_id, level, left, right)?;
// debug!("Iterating between {:?} and {:?} (level {})", left, right, level);
// for (i, result) in iter.enumerate() {
// let ((_fid, level, l, r), docids) = result?;
// debug!("{:?} to {:?} (level {}) found {} documents", l, r, level, docids.len());
// *output |= docids;
// // We save the leftest and rightest bounds we actually found at this level.
// if i == 0 {
// left_found = Some(l);
// }
// right_found = Some(r);
// }
// // Can we go deeper?
// let deeper_level = match level.checked_sub(1) {
// Some(level) => level,
// None => return Ok(()),
// };
// // We must refine the left and right bounds of this range by retrieving the
// // missing part in a deeper level.
// match left_found.zip(right_found) {
// Some((left_found, right_found)) => {
// // If the bound is satisfied we avoid calling this function again.
// if !matches!(left, Included(l) if l == left_found) {
// let sub_right = Excluded(left_found);
// debug!(
// "calling left with {:?} to {:?} (level {})",
// left, sub_right, deeper_level
// );
// Self::explore_facet_number_levels(
// rtxn,
// db,
// field_id,
// deeper_level,
// left,
// sub_right,
// output,
// )?;
// }
// if !matches!(right, Included(r) if r == right_found) {
// let sub_left = Excluded(right_found);
// debug!(
// "calling right with {:?} to {:?} (level {})",
// sub_left, right, deeper_level
// );
// Self::explore_facet_number_levels(
// rtxn,
// db,
// field_id,
// deeper_level,
// sub_left,
// right,
// output,
// )?;
// }
// }
// None => {
// // If we found nothing at this level it means that we must find
// // the same bounds but at a deeper, more precise level.
// Self::explore_facet_number_levels(
// rtxn,
// db,
// field_id,
// deeper_level,
// left,
// right,
// output,
// )?;
// }
// }
// Ok(())
} }
fn evaluate_operator( fn evaluate_operator(
@ -337,15 +189,15 @@ impl<'a> Filter<'a> {
Some(n) => { Some(n) => {
let n = Included(n); let n = Included(n);
let mut output = RoaringBitmap::new(); let mut output = RoaringBitmap::new();
// Self::explore_facet_number_levels( Self::explore_facet_number_levels(
// rtxn, rtxn,
// numbers_db, numbers_db,
// field_id, field_id,
// 0, 0,
// n, n,
// n, n,
// &mut output, &mut output,
// )?; )?;
output output
} }
None => RoaringBitmap::new(), None => RoaringBitmap::new(),
@ -381,29 +233,53 @@ impl<'a> Filter<'a> {
match biggest_level { match biggest_level {
Some(level) => { Some(level) => {
let mut output = RoaringBitmap::new(); let mut output = RoaringBitmap::new();
// Self::explore_facet_number_levels( Self::explore_facet_number_levels(
// rtxn, rtxn,
// numbers_db, numbers_db,
// field_id, field_id,
// level, level,
// left, left,
// right, right,
// &mut output, &mut output,
// )?; )?;
Ok(output) Ok(output)
} }
None => Ok(RoaringBitmap::new()), None => Ok(RoaringBitmap::new()),
} }
} }
pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result<RoaringBitmap> { /// Aggregates the documents ids that are part of the specified range automatically
// to avoid doing this for each recursive call we're going to do it ONCE ahead of time /// going deeper through the levels.
let soft_deleted_documents = index.soft_deleted_documents_ids(rtxn)?; fn explore_facet_number_levels(
let filterable_fields = index.filterable_fields(rtxn)?; rtxn: &heed::RoTxn,
db: heed::Database<FacetKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>,
field_id: FieldId,
level: u8,
left: Bound<f64>,
right: Bound<f64>,
output: &mut RoaringBitmap,
) -> Result<()> {
match (left, right) {
// If the request is an exact value we must go directly to the deepest level.
(Included(l), Included(r)) if l == r && level > 0 => {
return Self::explore_facet_number_levels(
rtxn, db, field_id, 0, left, right, output,
);
}
// lower TO upper when lower > upper must return no result
(Included(l), Included(r)) if l > r => return Ok(()),
(Included(l), Excluded(r)) if l >= r => return Ok(()),
(Excluded(l), Excluded(r)) if l >= r => return Ok(()),
(Excluded(l), Included(r)) if l >= r => return Ok(()),
(_, _) => (),
}
let x = facet_range_search::find_docids_of_facet_within_bounds::<OrderedF64Codec>(
rtxn, &db, field_id, &left, &right,
)?;
// TODO: the facet range search should take a mutable roaring bitmap as argument
*output = x;
// and finally we delete all the soft_deleted_documents, again, only once at the very end Ok(())
self.inner_evaluate(rtxn, index, &filterable_fields)
.map(|result| result - soft_deleted_documents)
} }
fn inner_evaluate( fn inner_evaluate(

View File

@ -2,22 +2,20 @@ use std::collections::btree_map::Entry;
use fst::IntoStreamer; use fst::IntoStreamer;
use heed::types::{ByteSlice, Str}; use heed::types::{ByteSlice, Str};
use heed::{BytesDecode, BytesEncode, Database}; use heed::Database;
use obkv::Key;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use serde_json::Value; use serde_json::Value;
use time::OffsetDateTime; use time::OffsetDateTime;
use super::{ClearDocuments, Facets}; use super::{ClearDocuments, Facets};
use crate::error::{InternalError, SerializationError, UserError}; use crate::error::{InternalError, UserError};
// use crate::heed_codec::facet::FacetStringZeroBoundsValueCodec;
use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice}; use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice};
use crate::heed_codec::CboRoaringBitmapCodec; use crate::heed_codec::CboRoaringBitmapCodec;
use crate::index::{db_name, main_key}; use crate::index::{db_name, main_key};
use crate::{ use crate::{
fields_ids_map, DocumentId, ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, DocumentId, ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result,
FieldsIdsMap, Index, Result, RoaringBitmapCodec, SmallString32, BEU32, RoaringBitmapCodec, SmallString32, BEU32,
}; };
pub struct DeleteDocuments<'t, 'u, 'i> { pub struct DeleteDocuments<'t, 'u, 'i> {

View File

@ -6,6 +6,8 @@ use heed::{BytesDecode, BytesEncode};
use super::helpers::{ use super::helpers::{
create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters, create_sorter, merge_cbo_roaring_bitmaps, sorter_into_reader, GrenadParameters,
}; };
use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec;
use crate::heed_codec::facet::new::{FacetKey, FacetKeyCodec};
use crate::heed_codec::facet::FieldDocIdFacetF64Codec; use crate::heed_codec::facet::FieldDocIdFacetF64Codec;
use crate::Result; use crate::Result;
@ -31,14 +33,13 @@ pub fn extract_facet_number_docids<R: io::Read + io::Seek>(
let mut cursor = docid_fid_facet_number.into_cursor()?; let mut cursor = docid_fid_facet_number.into_cursor()?;
while let Some((key_bytes, _)) = cursor.move_on_next()? { while let Some((key_bytes, _)) = cursor.move_on_next()? {
todo!() let (field_id, document_id, number) =
// let (field_id, document_id, number) = FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap();
// FieldDocIdFacetF64Codec::bytes_decode(key_bytes).unwrap();
// let key = (field_id, 0, number, number); let key = FacetKey { field_id, level: 0, left_bound: number };
// // let key_bytes = FacetLevelValueF64Codec::bytes_encode(&key).unwrap(); let key_bytes = FacetKeyCodec::<OrderedF64Codec>::bytes_encode(&key).unwrap();
// facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?; facet_number_docids_sorter.insert(key_bytes, document_id.to_ne_bytes())?;
} }
sorter_into_reader(facet_number_docids_sorter, indexer) sorter_into_reader(facet_number_docids_sorter, indexer)

View File

@ -1,13 +1,11 @@
use std::fs::File;
use std::iter::FromIterator;
use std::{io, str};
use roaring::RoaringBitmap;
use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters};
use crate::heed_codec::facet::new::str_ref::StrRefCodec;
use crate::heed_codec::facet::new::{FacetKey, FacetKeyCodec};
use crate::update::index_documents::merge_cbo_roaring_bitmaps; use crate::update::index_documents::merge_cbo_roaring_bitmaps;
// use crate::heed_codec::facet::{encode_prefix_string, FacetStringLevelZeroCodec};
use crate::{FieldId, Result}; use crate::{FieldId, Result};
use heed::BytesEncode;
use std::fs::File;
use std::io;
/// Extracts the facet strings and the document ids where each facet string appears. /// Extracts the facet strings and the document ids where each facet string appears.
/// ///
@ -22,38 +20,26 @@ pub fn extract_facet_string_docids<R: io::Read + io::Seek>(
let mut facet_string_docids_sorter = create_sorter( let mut facet_string_docids_sorter = create_sorter(
grenad::SortAlgorithm::Stable, grenad::SortAlgorithm::Stable,
merge_cbo_roaring_bitmaps, // TODO: check merge_cbo_roaring_bitmaps, // TODO: check that it is correct
indexer.chunk_compression_type, indexer.chunk_compression_type,
indexer.chunk_compression_level, indexer.chunk_compression_level,
indexer.max_nb_chunks, indexer.max_nb_chunks,
max_memory, max_memory,
); );
let mut key_buffer = Vec::new();
let mut value_buffer = Vec::new();
let mut cursor = docid_fid_facet_string.into_cursor()?; let mut cursor = docid_fid_facet_string.into_cursor()?;
while let Some((key, original_value_bytes)) = cursor.move_on_next()? { while let Some((key, _original_value_bytes)) = cursor.move_on_next()? {
let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); let (field_id_bytes, bytes) = try_split_array_at(key).unwrap();
let field_id = FieldId::from_be_bytes(field_id_bytes); let field_id = FieldId::from_be_bytes(field_id_bytes);
let (document_id_bytes, normalized_value_bytes) = try_split_array_at(bytes).unwrap();
let document_id = u32::from_be_bytes(document_id_bytes);
let original_value = str::from_utf8(original_value_bytes)?;
key_buffer.clear(); let (document_id_bytes, normalized_value_bytes) =
// TODO try_split_array_at::<_, 4>(bytes).unwrap();
// FacetStringLevelZeroCodec::serialize_into(
// field_id,
// str::from_utf8(normalized_value_bytes)?,
// &mut key_buffer,
// );
value_buffer.clear(); let normalised_value = std::str::from_utf8(normalized_value_bytes)?;
// TODO let key = FacetKey { field_id, level: 0, left_bound: normalised_value };
// encode_prefix_string(original_value, &mut value_buffer)?; let key_bytes = FacetKeyCodec::<StrRefCodec>::bytes_encode(&key).unwrap();
let bitmap = RoaringBitmap::from_iter(Some(document_id));
bitmap.serialize_into(&mut value_buffer)?;
facet_string_docids_sorter.insert(&key_buffer, &value_buffer)?; facet_string_docids_sorter.insert(&key_bytes, &document_id_bytes)?;
} }
sorter_into_reader(facet_string_docids_sorter, indexer) sorter_into_reader(facet_string_docids_sorter, indexer)