mirror of https://github.com/meilisearch/meilisearch.git
synced 2024-11-23 10:37:41 +08:00

commit 0227254a65 (parent 03a01166ba)
Return the original string values for the inverted facet index database
@@ -627,14 +627,14 @@ fn facet_values_docids(
         FacetType::String => {
             wtr.write_record(&["facet_string", "documents_count", "documents_ids"])?;
             for result in facet_values_iter(rtxn, index.facet_id_string_docids, field_id)? {
-                let ((_fid, value), docids) = result?;
+                let ((_fid, normalized), (_original, docids)) = result?;
                 let count = docids.len();
                 let docids = if debug {
                     format!("{:?}", docids)
                 } else {
                     format!("{:?}", docids.iter().collect::<Vec<_>>())
                 };
-                wtr.write_record(&[value.to_string(), count.to_string(), docids])?;
+                wtr.write_record(&[normalized.to_string(), count.to_string(), docids])?;
             }
         }
     }
@@ -0,0 +1,80 @@
+use std::borrow::Cow;
+use std::convert::TryInto;
+use std::{marker, str};
+
+use super::try_split_at;
+
+/// A codec that encodes a string in front of the value.
+///
+/// The usecase is for the facet string levels algorithm where we must know the
+/// original string of a normalized facet value, the original values are stored
+/// in the value to not break the lexicographical ordering of the LMDB keys.
+pub struct FacetStringLevelZeroValueCodec<C>(marker::PhantomData<C>);
+
+impl<'a, C> heed::BytesDecode<'a> for FacetStringLevelZeroValueCodec<C>
+where
+    C: heed::BytesDecode<'a>,
+{
+    type DItem = (&'a str, C::DItem);
+
+    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+        let (string_len, bytes) = try_split_at(bytes, 2)?;
+        let string_len = string_len.try_into().ok().map(u16::from_be_bytes)?;
+
+        let (string, bytes) = try_split_at(bytes, string_len as usize)?;
+        let string = str::from_utf8(string).ok()?;
+
+        C::bytes_decode(bytes).map(|item| (string, item))
+    }
+}
+
+impl<'a, C> heed::BytesEncode<'a> for FacetStringLevelZeroValueCodec<C>
+where
+    C: heed::BytesEncode<'a>,
+{
+    type EItem = (&'a str, C::EItem);
+
+    fn bytes_encode((string, value): &'a Self::EItem) -> Option<Cow<[u8]>> {
+        let string_len: u16 = string.len().try_into().ok()?;
+        let value_bytes = C::bytes_encode(&value)?;
+
+        let mut bytes = Vec::with_capacity(2 + string.len() + value_bytes.len());
+        bytes.extend_from_slice(&string_len.to_be_bytes());
+        bytes.extend_from_slice(string.as_bytes());
+        bytes.extend_from_slice(&value_bytes[..]);
+
+        Some(Cow::Owned(bytes))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use heed::types::Unit;
+    use heed::{BytesDecode, BytesEncode};
+    use roaring::RoaringBitmap;
+
+    use super::*;
+    use crate::CboRoaringBitmapCodec;
+
+    #[test]
+    fn deserialize_roaring_bitmaps() {
+        let string = "abc";
+        let docids: RoaringBitmap = (0..100).chain(3500..4398).collect();
+        let key = (string, docids.clone());
+        let bytes =
+            FacetStringLevelZeroValueCodec::<CboRoaringBitmapCodec>::bytes_encode(&key).unwrap();
+        let (out_string, out_docids) =
+            FacetStringLevelZeroValueCodec::<CboRoaringBitmapCodec>::bytes_decode(&bytes).unwrap();
+        assert_eq!((out_string, out_docids), (string, docids));
+    }
+
+    #[test]
+    fn deserialize_unit() {
+        let string = "def";
+        let key = (string, ());
+        let bytes = FacetStringLevelZeroValueCodec::<Unit>::bytes_encode(&key).unwrap();
+        let (out_string, out_unit) =
+            FacetStringLevelZeroValueCodec::<Unit>::bytes_decode(&bytes).unwrap();
+        assert_eq!((out_string, out_unit), (string, ()));
+    }
+}
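To make the new value layout easier to follow, here is a standalone, illustrative round trip of the same framing (a sketch, not code from this commit): a big-endian u16 length prefix, the original string bytes, then whatever the inner codec serialized (for example a roaring bitmap).

// Illustrative only: the layout assumed by the codec above.
// [ string_len: u16 (big endian) ][ original string bytes ][ inner codec bytes ]
fn encode_value(original: &str, inner: &[u8]) -> Vec<u8> {
    let len = original.len() as u16; // the real codec bails out if the length does not fit
    let mut out = Vec::with_capacity(2 + original.len() + inner.len());
    out.extend_from_slice(&len.to_be_bytes());
    out.extend_from_slice(original.as_bytes());
    out.extend_from_slice(inner);
    out
}

fn decode_value(bytes: &[u8]) -> Option<(&str, &[u8])> {
    if bytes.len() < 2 {
        return None;
    }
    let (len, rest) = bytes.split_at(2);
    let len = u16::from_be_bytes([len[0], len[1]]) as usize;
    if rest.len() < len {
        return None;
    }
    let (string, inner) = rest.split_at(len);
    Some((std::str::from_utf8(string).ok()?, inner))
}

fn main() {
    let bytes = encode_value("Tata", &[0xAA, 0xBB]);
    assert_eq!(decode_value(&bytes), Some(("Tata", &[0xAA, 0xBB][..])));
}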
@@ -2,7 +2,9 @@ use std::borrow::Cow;
 use std::convert::TryInto;
 use std::{marker, str};

-/// A codec that encodes two strings in front of the value.
+use super::try_split_at;
+
+/// A codec that optionally encodes two strings in front of the value.
 ///
 /// The usecase is for the facet string levels algorithm where we must
 /// know the origin of a group, the group left and right bounds are stored
@@ -79,16 +81,6 @@ where
     }
 }
-
-/// Tries to split a slice in half at the given middle point,
-/// `None` if the slice is too short.
-fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> {
-    if slice.len() >= mid {
-        Some(slice.split_at(mid))
-    } else {
-        None
-    }
-}

 #[cfg(test)]
 mod tests {
     use heed::types::Unit;
@@ -1,6 +1,7 @@
 mod facet_level_value_f64_codec;
 mod facet_level_value_u32_codec;
 mod facet_string_level_zero_codec;
+mod facet_string_level_zero_value_codec;
 mod facet_string_zero_bounds_value_codec;
 mod field_doc_id_facet_f64_codec;
 mod field_doc_id_facet_string_codec;

@@ -8,6 +9,17 @@ mod field_doc_id_facet_string_codec;
 pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec;
 pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec;
 pub use self::facet_string_level_zero_codec::FacetStringLevelZeroCodec;
+pub use self::facet_string_level_zero_value_codec::FacetStringLevelZeroValueCodec;
 pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec;
 pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec;
 pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec;
+
+/// Tries to split a slice in half at the given middle point,
+/// `None` if the slice is too short.
+pub fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> {
+    if slice.len() >= mid {
+        Some(slice.split_at(mid))
+    } else {
+        None
+    }
+}
@@ -11,8 +11,8 @@ use roaring::RoaringBitmap;
 use crate::error::{FieldIdMapMissingEntry, InternalError, UserError};
 use crate::fields_ids_map::FieldsIdsMap;
 use crate::heed_codec::facet::{
-    FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FieldDocIdFacetF64Codec,
-    FieldDocIdFacetStringCodec,
+    FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec,
+    FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
 };
 use crate::{
     default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion,
@@ -90,8 +90,9 @@ pub struct Index {

     /// Maps the facet field id, level and the number with the docids that corresponds to it.
     pub facet_id_f64_docids: Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
-    /// Maps the facet field id and the string with the docids that corresponds to it.
-    pub facet_id_string_docids: Database<FacetStringLevelZeroCodec, CboRoaringBitmapCodec>,
+    /// Maps the facet field id and the string with the original string and docids that corresponds to it.
+    pub facet_id_string_docids:
+        Database<FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>>,

     /// Maps the document id, the facet field id and the numbers.
     pub field_id_docid_facet_f64s: Database<FieldDocIdFacetF64Codec, Unit>,
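As a rough sketch of what this schema change means (plain in-memory types, not the crate's real signatures): the key is still ordered by the normalized facet string, while the original spelling now travels in the value next to the documents ids.

use std::collections::BTreeMap;

fn main() {
    // (field_id, normalized) -> (original, docids), mimicking the new
    // facet_id_string_docids layout with ordinary std types.
    let mut facet_id_string_docids: BTreeMap<(u16, &str), (&str, Vec<u32>)> = BTreeMap::new();
    facet_id_string_docids.insert((1, "tata"), ("Tata", vec![0, 3]));
    facet_id_string_docids.insert((1, "titi"), ("Titi", vec![1]));

    // Iteration order is still driven by the normalized string (the key)...
    for ((_field_id, normalized), (original, docids)) in &facet_id_string_docids {
        // ...but the original casing is available for display.
        println!("{} -> {} ({} documents)", normalized, original, docids.len());
    }
}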
@@ -1,5 +1,6 @@
 use std::mem::size_of;

+use concat_arrays::concat_arrays;
 use heed::types::{ByteSlice, Str, Unit};
 use roaring::RoaringBitmap;

@@ -43,7 +44,10 @@ pub struct FacetDistinctIter<'a> {

 impl<'a> FacetDistinctIter<'a> {
     fn facet_string_docids(&self, key: &str) -> heed::Result<Option<RoaringBitmap>> {
-        self.index.facet_id_string_docids.get(self.txn, &(self.distinct, key))
+        self.index
+            .facet_id_string_docids
+            .get(self.txn, &(self.distinct, key))
+            .map(|result| result.map(|(_original, docids)| docids))
     }

     fn facet_number_docids(&self, key: f64) -> heed::Result<Option<RoaringBitmap>> {
@@ -116,10 +120,7 @@ impl<'a> FacetDistinctIter<'a> {
 }

 fn facet_values_prefix_key(distinct: FieldId, id: DocumentId) -> [u8; FID_SIZE + DOCID_SIZE] {
-    let mut key = [0; FID_SIZE + DOCID_SIZE];
-    key[0..FID_SIZE].copy_from_slice(&distinct.to_be_bytes());
-    key[FID_SIZE..].copy_from_slice(&id.to_be_bytes());
-    key
+    concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes())
 }

 fn facet_number_values<'a>(
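The prefix-key helper now relies on the concat_arrays! macro instead of manual copy_from_slice calls; a minimal standalone sketch of the same idea, with a 2-byte field id and a 4-byte document id assumed purely for illustration (the real FID_SIZE and DOCID_SIZE come from the crate):

use concat_arrays::concat_arrays;

// Hypothetical sizes, for illustration only.
fn facet_values_prefix_key(distinct: u16, id: u32) -> [u8; 2 + 4] {
    // The macro concatenates the two fixed-size byte arrays into one array.
    concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes())
}

fn main() {
    assert_eq!(facet_values_prefix_key(1, 2), [0, 1, 0, 0, 0, 2]);
}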
@@ -47,7 +47,7 @@ mod test {

         let mut documents = Vec::new();

-        let txts = ["toto", "titi", "tata"];
+        let txts = ["Toto", "Titi", "Tata"];
         let cats = (1..10).map(|i| i.to_string()).collect::<Vec<_>>();
         let cat_ints = (1..10).collect::<Vec<_>>();

@@ -90,7 +90,6 @@ mod test {

         addition.index_documents_method(IndexDocumentsMethod::ReplaceDocuments);
         addition.update_format(UpdateFormat::Json);

         addition.execute(JSON.to_string().as_bytes(), |_, _| ()).unwrap();

         let fields_map = index.fields_ids_map(&txn).unwrap();
@@ -23,7 +23,7 @@ const MAX_VALUES_BY_FACET: usize = 1000;

 /// Threshold on the number of candidates that will make
 /// the system to choose between one algorithm or another.
-const CANDIDATES_THRESHOLD: u64 = 35_000;
+const CANDIDATES_THRESHOLD: u64 = 3000;

 pub struct FacetDistribution<'a> {
     facets: Option<HashSet<String>>,
@@ -72,6 +72,7 @@ impl<'a> FacetDistribution<'a> {
             FacetType::Number => {
                 let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect();

+                let distribution_prelength = distribution.len();
                 let db = self.index.field_id_docid_facet_f64s;
                 for docid in candidates.into_iter() {
                     key_buffer.truncate(mem::size_of::<FieldId>());
|
|||||||
for result in iter {
|
for result in iter {
|
||||||
let ((_, _, value), ()) = result?;
|
let ((_, _, value), ()) = result?;
|
||||||
*distribution.entry(value.to_string()).or_insert(0) += 1;
|
*distribution.entry(value.to_string()).or_insert(0) += 1;
|
||||||
|
if distribution.len() - distribution_prelength == self.max_values_by_facet {
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -106,6 +110,10 @@ impl<'a> FacetDistribution<'a> {
                         .entry(normalized_value)
                         .or_insert_with(|| (original_value, 0));
                     *count += 1;
+
+                    if normalized_distribution.len() == self.max_values_by_facet {
+                        break;
+                    }
                 }
             }

@@ -154,10 +162,10 @@ impl<'a> FacetDistribution<'a> {
             FacetStringIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?;

         for result in iter {
-            let (value, mut docids) = result?;
+            let (_normalized, original, mut docids) = result?;
             docids &= candidates;
             if !docids.is_empty() {
-                distribution.insert(value.to_string(), docids.len());
+                distribution.insert(original.to_string(), docids.len());
             }
             if distribution.len() == self.max_values_by_facet {
                 break;
@@ -193,14 +201,20 @@ impl<'a> FacetDistribution<'a> {
             .prefix_iter(self.rtxn, &field_id.to_be_bytes())?
             .remap_key_type::<FacetStringLevelZeroCodec>();

+        let mut normalized_distribution = BTreeMap::new();
         for result in iter {
-            let ((_, value), docids) = result?;
-            distribution.insert(value.to_string(), docids.len());
+            let ((_, normalized_value), (original_value, docids)) = result?;
+            normalized_distribution.insert(normalized_value, (original_value, docids.len()));
             if distribution.len() == self.max_values_by_facet {
                 break;
             }
         }

+        let iter = normalized_distribution
+            .into_iter()
+            .map(|(_normalized, (original, count))| (original.to_string(), count));
+        distribution.extend(iter);
+
         Ok(distribution)
     }

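A small standalone sketch of the accumulation strategy used above (plain types, not the crate's API): entries are first collected under their normalized key, so duplicates collapse and ordering stays stable, then re-emitted under their original spelling.

use std::collections::BTreeMap;

fn main() {
    // (normalized, (original, count)) pairs as they would come out of the database.
    let rows = vec![("tata", ("Tata", 2u64)), ("titi", ("Titi", 5))];

    let mut normalized_distribution = BTreeMap::new();
    for (normalized, (original, count)) in rows {
        normalized_distribution.insert(normalized, (original, count));
    }

    // The final distribution is reported with the original strings.
    let distribution: BTreeMap<String, u64> = normalized_distribution
        .into_iter()
        .map(|(_normalized, (original, count))| (original.to_string(), count))
        .collect();

    assert_eq!(distribution.get("Tata"), Some(&2));
}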
@@ -135,7 +135,8 @@ use heed::{Database, LazyDecode, RoRange};
 use roaring::RoaringBitmap;

 use crate::heed_codec::facet::{
-    FacetLevelValueU32Codec, FacetStringLevelZeroCodec, FacetStringZeroBoundsValueCodec,
+    FacetLevelValueU32Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec,
+    FacetStringZeroBoundsValueCodec,
 };
 use crate::heed_codec::CboRoaringBitmapCodec;
 use crate::{FieldId, Index};
@@ -209,7 +210,11 @@ impl<'t> Iterator for FacetStringGroupRange<'t> {
 ///
 /// It yields the facet string and the roaring bitmap associated with it.
 pub struct FacetStringLevelZeroRange<'t> {
-    iter: RoRange<'t, FacetStringLevelZeroCodec, CboRoaringBitmapCodec>,
+    iter: RoRange<
+        't,
+        FacetStringLevelZeroCodec,
+        FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>,
+    >,
 }

 impl<'t> FacetStringLevelZeroRange<'t> {
@@ -252,18 +257,23 @@ impl<'t> FacetStringLevelZeroRange<'t> {
         let iter = db
             .remap_key_type::<ByteSlice>()
             .range(rtxn, &(left_bound, right_bound))?
-            .remap_types::<FacetStringLevelZeroCodec, CboRoaringBitmapCodec>();
+            .remap_types::<
+                FacetStringLevelZeroCodec,
+                FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>
+            >();

         Ok(FacetStringLevelZeroRange { iter })
     }
 }

 impl<'t> Iterator for FacetStringLevelZeroRange<'t> {
-    type Item = heed::Result<(&'t str, RoaringBitmap)>;
+    type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>;

     fn next(&mut self) -> Option<Self::Item> {
         match self.iter.next() {
-            Some(Ok(((_fid, value), docids))) => Some(Ok((value, docids))),
+            Some(Ok(((_fid, normalized), (original, docids)))) => {
+                Some(Ok((normalized, original, docids)))
+            }
             Some(Err(e)) => Some(Err(e)),
             None => None,
         }
@@ -326,7 +336,7 @@ impl<'t> FacetStringIter<'t> {
 }

 impl<'t> Iterator for FacetStringIter<'t> {
-    type Item = heed::Result<(&'t str, RoaringBitmap)>;
+    type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>;

     fn next(&mut self) -> Option<Self::Item> {
         'outer: loop {
@@ -377,11 +387,11 @@ impl<'t> Iterator for FacetStringIter<'t> {
                     // level zero only
                     for result in last {
                         match result {
-                            Ok((value, mut docids)) => {
+                            Ok((normalized, original, mut docids)) => {
                                 docids &= &*documents_ids;
                                 if !docids.is_empty() {
                                     *documents_ids -= &docids;
-                                    return Some(Ok((value, docids)));
+                                    return Some(Ok((normalized, original, docids)));
                                 }
                             }
                             Err(e) => return Some(Err(e)),
@@ -17,7 +17,9 @@ use self::Operator::*;
 use super::parser::{FilterParser, Rule, PREC_CLIMBER};
 use super::FacetNumberRange;
 use crate::error::UserError;
-use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetStringLevelZeroCodec};
+use crate::heed_codec::facet::{
+    FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec,
+};
 use crate::{CboRoaringBitmapCodec, FieldId, FieldsIdsMap, Index, Result};

 #[derive(Debug, Clone, PartialEq)]
@@ -363,7 +365,10 @@ impl FilterCondition {
         rtxn: &heed::RoTxn,
         index: &Index,
         numbers_db: heed::Database<FacetLevelValueF64Codec, CboRoaringBitmapCodec>,
-        strings_db: heed::Database<FacetStringLevelZeroCodec, CboRoaringBitmapCodec>,
+        strings_db: heed::Database<
+            FacetStringLevelZeroCodec,
+            FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>,
+        >,
         field_id: FieldId,
         operator: &Operator,
     ) -> Result<RoaringBitmap> {
@@ -374,7 +379,8 @@ impl FilterCondition {
             GreaterThan(val) => (Excluded(*val), Included(f64::MAX)),
             GreaterThanOrEqual(val) => (Included(*val), Included(f64::MAX)),
             Equal(number, string) => {
-                let string_docids = strings_db.get(rtxn, &(field_id, &string))?.unwrap_or_default();
+                let (_original_value, string_docids) =
+                    strings_db.get(rtxn, &(field_id, &string))?.unwrap_or_default();
                 let number_docids = match number {
                     Some(n) => {
                         let n = Included(*n);
@@ -9,6 +9,7 @@ use serde_json::Value;

 use super::ClearDocuments;
 use crate::error::{FieldIdMapMissingEntry, InternalError, UserError};
+use crate::heed_codec::facet::FacetStringLevelZeroValueCodec;
 use crate::heed_codec::CboRoaringBitmapCodec;
 use crate::index::{db_name, main_key};
 use crate::{DocumentId, ExternalDocumentsIds, FieldId, Index, Result, SmallString32, BEU32};
@@ -374,13 +375,13 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
         drop(iter);

         // We delete the documents ids that are under the facet field id values.
-        remove_docids_from_facet_field_id_value_docids(
+        remove_docids_from_facet_field_id_number_docids(
             self.wtxn,
             facet_id_f64_docids,
             &self.documents_ids,
         )?;

-        remove_docids_from_facet_field_id_value_docids(
+        remove_docids_from_facet_field_id_string_docids(
             self.wtxn,
             facet_id_string_docids,
             &self.documents_ids,
@@ -447,7 +448,33 @@ where
     Ok(())
 }

-fn remove_docids_from_facet_field_id_value_docids<'a, C>(
+fn remove_docids_from_facet_field_id_string_docids<'a, C>(
+    wtxn: &'a mut heed::RwTxn,
+    db: &heed::Database<C, FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>>,
+    to_remove: &RoaringBitmap,
+) -> heed::Result<()>
+where
+    C: heed::BytesDecode<'a> + heed::BytesEncode<'a>,
+{
+    let mut iter = db.remap_key_type::<ByteSlice>().iter_mut(wtxn)?;
+    while let Some(result) = iter.next() {
+        let (bytes, (original_value, mut docids)) = result?;
+        let previous_len = docids.len();
+        docids -= to_remove;
+        if docids.is_empty() {
+            // safety: we don't keep references from inside the LMDB database.
+            unsafe { iter.del_current()? };
+        } else if docids.len() != previous_len {
+            let bytes = bytes.to_owned();
+            // safety: we don't keep references from inside the LMDB database.
+            unsafe { iter.put_current(&bytes, &(original_value, docids))? };
+        }
+    }
+
+    Ok(())
+}
+
+fn remove_docids_from_facet_field_id_number_docids<'a, C>(
     wtxn: &'a mut heed::RwTxn,
     db: &heed::Database<C, CboRoaringBitmapCodec>,
     to_remove: &RoaringBitmap,
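The new helper walks every (original, docids) value, subtracts the deleted ids, and either rewrites the entry or drops it when it becomes empty; the same logic on plain in-memory types, as an illustrative sketch:

use std::collections::{BTreeMap, BTreeSet};

fn main() {
    let to_remove = BTreeSet::from([1u32, 2]);

    // key -> (original string, docids), standing in for the LMDB database.
    let mut db: BTreeMap<&str, (&str, BTreeSet<u32>)> = BTreeMap::new();
    db.insert("tata", ("Tata", BTreeSet::from([1, 2])));
    db.insert("titi", ("Titi", BTreeSet::from([2, 3])));

    // Subtract the deleted documents, dropping entries that become empty.
    db.retain(|_key, (_original, docids)| {
        docids.retain(|id| !to_remove.contains(id));
        !docids.is_empty()
    });

    assert!(!db.contains_key("tata")); // emptied, so removed
    assert_eq!(db["titi"].1.len(), 1); // still references document 3
}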
@@ -12,7 +12,7 @@ use roaring::RoaringBitmap;
 use crate::error::InternalError;
 use crate::heed_codec::facet::{
     FacetLevelValueF64Codec, FacetLevelValueU32Codec, FacetStringLevelZeroCodec,
-    FacetStringZeroBoundsValueCodec,
+    FacetStringLevelZeroValueCodec, FacetStringZeroBoundsValueCodec,
 };
 use crate::heed_codec::CboRoaringBitmapCodec;
 use crate::update::index_documents::{
@@ -75,7 +75,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
             )?;

             // Compute and store the faceted strings documents ids.
-            let string_documents_ids = compute_faceted_documents_ids(
+            let string_documents_ids = compute_faceted_strings_documents_ids(
                 self.wtxn,
                 self.index.facet_id_string_docids.remap_key_type::<ByteSlice>(),
                 field_id,
@@ -96,7 +96,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> {
             clear_field_number_levels(self.wtxn, self.index.facet_id_f64_docids, field_id)?;

             // Compute and store the faceted numbers documents ids.
-            let number_documents_ids = compute_faceted_documents_ids(
+            let number_documents_ids = compute_faceted_numbers_documents_ids(
                 self.wtxn,
                 self.index.facet_id_f64_docids.remap_key_type::<ByteSlice>(),
                 field_id,
@@ -237,13 +237,26 @@ fn write_number_entry(
     Ok(())
 }

-fn compute_faceted_documents_ids(
+fn compute_faceted_strings_documents_ids(
+    rtxn: &heed::RoTxn,
+    db: heed::Database<ByteSlice, FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>>,
+    field_id: FieldId,
+) -> Result<RoaringBitmap> {
+    let mut documents_ids = RoaringBitmap::new();
+    for result in db.prefix_iter(rtxn, &field_id.to_be_bytes())? {
+        let (_key, (_original_value, docids)) = result?;
+        documents_ids |= docids;
+    }
+
+    Ok(documents_ids)
+}
+
+fn compute_faceted_numbers_documents_ids(
     rtxn: &heed::RoTxn,
     db: heed::Database<ByteSlice, CboRoaringBitmapCodec>,
     field_id: FieldId,
 ) -> Result<RoaringBitmap> {
     let mut documents_ids = RoaringBitmap::new();

     for result in db.prefix_iter(rtxn, &field_id.to_be_bytes())? {
         let (_key, docids) = result?;
         documents_ids |= docids;
@@ -265,7 +278,10 @@ fn clear_field_string_levels<'t>(

 fn compute_facet_string_levels<'t>(
     rtxn: &'t heed::RoTxn,
-    db: heed::Database<FacetStringLevelZeroCodec, CboRoaringBitmapCodec>,
+    db: heed::Database<
+        FacetStringLevelZeroCodec,
+        FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>,
+    >,
     compression_type: CompressionType,
     compression_level: Option<u32>,
     shrink_size: Option<u64>,
@@ -299,7 +315,7 @@ fn compute_facet_string_levels<'t>(
     // Because we know the size of the level 0 we can use a range iterator that starts
     // at the first value of the level and goes to the last by simply counting.
     for (i, result) in db.range(rtxn, &((field_id, "")..))?.take(first_level_size).enumerate() {
-        let ((_field_id, value), docids) = result?;
+        let ((_field_id, value), (_original_value, docids)) = result?;

         if i == 0 {
             left = (i as u32, value);
@@ -2,8 +2,11 @@ use std::borrow::Cow;
 use std::result::Result as StdResult;

 use fst::IntoStreamer;
+use heed::{BytesDecode, BytesEncode};
 use roaring::RoaringBitmap;

+use crate::error::SerializationError;
+use crate::heed_codec::facet::FacetStringLevelZeroValueCodec;
 use crate::heed_codec::CboRoaringBitmapCodec;
 use crate::Result;

@@ -69,6 +72,26 @@ pub fn roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result<Vec<u8>
     Ok(vec)
 }

+/// Uses the FacetStringLevelZeroValueCodec to merge the values.
+pub fn tuple_string_cbo_roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result<Vec<u8>> {
+    let (head, tail) = values.split_first().unwrap();
+    let (head_string, mut head_rb) =
+        FacetStringLevelZeroValueCodec::<CboRoaringBitmapCodec>::bytes_decode(&head[..])
+            .ok_or(SerializationError::Decoding { db_name: None })?;
+
+    for value in tail {
+        let (_string, rb) =
+            FacetStringLevelZeroValueCodec::<CboRoaringBitmapCodec>::bytes_decode(&value[..])
+                .ok_or(SerializationError::Decoding { db_name: None })?;
+        head_rb |= rb;
+    }
+
+    FacetStringLevelZeroValueCodec::<CboRoaringBitmapCodec>::bytes_encode(&(head_string, head_rb))
+        .map(|cow| cow.into_owned())
+        .ok_or(SerializationError::Encoding { db_name: None })
+        .map_err(Into::into)
+}
+
 pub fn cbo_roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result<Vec<u8>> {
     let (head, tail) = values.split_first().unwrap();
     let mut head = CboRoaringBitmapCodec::deserialize_from(&head[..])?;
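The merge rule, restated as a standalone sketch with plain types (not the crate's codecs): when several values collide under one key, the first original string is kept and all the document id sets are unioned.

use std::collections::BTreeSet;

// Each value carries (original string, docids); merging keeps the first
// original spelling and unions the document ids, mirroring the
// tuple_string_cbo_roaring_bitmap_merge function above.
fn merge(values: &[(String, BTreeSet<u32>)]) -> (String, BTreeSet<u32>) {
    let (head, tail) = values.split_first().unwrap();
    let (head_string, mut head_ids) = head.clone();
    for (_string, ids) in tail {
        head_ids.extend(ids.iter().copied());
    }
    (head_string, head_ids)
}

fn main() {
    let merged = merge(&[
        ("Tata".to_string(), BTreeSet::from([1, 2])),
        ("TATA".to_string(), BTreeSet::from([3])),
    ]);
    assert_eq!(merged.0, "Tata");
    assert_eq!(merged.1.len(), 3);
}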
@@ -20,6 +20,7 @@ use serde::{Deserialize, Serialize};

 pub use self::merge_function::{
     cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge,
+    tuple_string_cbo_roaring_bitmap_merge,
 };
 use self::store::{Readers, Store};
 pub use self::transform::{Transform, TransformOutput};
@@ -655,7 +656,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
             self.wtxn,
             *self.index.facet_id_string_docids.as_polymorph(),
             facet_field_strings_docids_readers,
-            cbo_roaring_bitmap_merge,
+            tuple_string_cbo_roaring_bitmap_merge,
             write_method,
         )?;

@@ -22,12 +22,13 @@ use tempfile::tempfile;

 use super::merge_function::{
     cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge,
+    tuple_string_cbo_roaring_bitmap_merge,
 };
 use super::{create_sorter, create_writer, writer_into_reader, MergeFn};
 use crate::error::{Error, InternalError, SerializationError};
 use crate::heed_codec::facet::{
-    FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FieldDocIdFacetF64Codec,
-    FieldDocIdFacetStringCodec,
+    FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec,
+    FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
 };
 use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
 use crate::update::UpdateIndexingStep;
@@ -153,7 +154,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
             max_memory,
         );
         let facet_field_strings_docids_sorter = create_sorter(
-            cbo_roaring_bitmap_merge,
+            tuple_string_cbo_roaring_bitmap_merge,
             chunk_compression_type,
             chunk_compression_level,
             chunk_fusing_shrink_size,
@@ -528,17 +529,18 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> {
         Error: From<E>,
     {
         let mut key_buffer = Vec::new();
-        let mut data_buffer = Vec::new();

         for ((field_id, normalized_value), (original_value, docids)) in iter {
             key_buffer.clear();
-            data_buffer.clear();

             FacetStringLevelZeroCodec::serialize_into(field_id, &normalized_value, &mut key_buffer);
-            CboRoaringBitmapCodec::serialize_into(&docids, &mut data_buffer);
+
+            let data = (original_value.as_str(), docids);
+            let data = FacetStringLevelZeroValueCodec::<CboRoaringBitmapCodec>::bytes_encode(&data)
+                .ok_or(SerializationError::Encoding { db_name: Some("facet-id-string-docids") })?;

             if lmdb_key_valid_size(&key_buffer) {
-                sorter.insert(&key_buffer, &data_buffer)?;
+                sorter.insert(&key_buffer, &data)?;
             } else {
                 warn!("facet value {:?} is too large to be saved", original_value);
             }