Further unify facet databases of f64s and strings

Loïc Lecrenier 2022-09-08 13:28:17 +02:00 committed by Loïc Lecrenier
parent 3baa34d842
commit cb8442a119
5 changed files with 63 additions and 129 deletions

View File

@@ -1,13 +1,15 @@
-use std::borrow::Cow;
-use std::convert::TryInto;
-
-use crate::facet::value_encoding::f64_into_bytes;
 use crate::{try_split_array_at, DocumentId, FieldId};
+use heed::{BytesDecode, BytesEncode};
+use std::borrow::Cow;
+use std::marker::PhantomData;
 
-pub struct FieldDocIdFacetF64Codec;
+pub struct FieldDocIdFacetCodec<C>(PhantomData<C>);
 
-impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetF64Codec {
-    type DItem = (FieldId, DocumentId, f64);
+impl<'a, C> BytesDecode<'a> for FieldDocIdFacetCodec<C>
+where
+    C: BytesDecode<'a>,
+{
+    type DItem = (FieldId, DocumentId, C::DItem);
 
     fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
         let (field_id_bytes, bytes) = try_split_array_at(bytes)?;
@@ -16,22 +18,24 @@ impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetF64Codec {
         let (document_id_bytes, bytes) = try_split_array_at(bytes)?;
         let document_id = u32::from_be_bytes(document_id_bytes);
 
-        let value = bytes[8..16].try_into().map(f64::from_be_bytes).ok()?;
+        let value = C::bytes_decode(&bytes[8..])?;
 
         Some((field_id, document_id, value))
     }
 }
 
-impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetF64Codec {
-    type EItem = (FieldId, DocumentId, f64);
+impl<'a, C> BytesEncode<'a> for FieldDocIdFacetCodec<C>
+where
+    C: BytesEncode<'a>,
+{
+    type EItem = (FieldId, DocumentId, C::EItem);
 
-    fn bytes_encode((field_id, document_id, value): &Self::EItem) -> Option<Cow<[u8]>> {
+    fn bytes_encode((field_id, document_id, value): &'a Self::EItem) -> Option<Cow<[u8]>> {
         let mut bytes = Vec::with_capacity(2 + 4 + 8 + 8);
         bytes.extend_from_slice(&field_id.to_be_bytes());
         bytes.extend_from_slice(&document_id.to_be_bytes());
 
-        let value_bytes = f64_into_bytes(*value)?;
+        let value_bytes = C::bytes_encode(value)?;
         bytes.extend_from_slice(&value_bytes);
-        bytes.extend_from_slice(&value.to_be_bytes());
 
         Some(Cow::Owned(bytes))
     }
 }

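The file above turns the concrete FieldDocIdFacetF64Codec into a generic FieldDocIdFacetCodec<C> that delegates the value part of the key to any heed codec C. As a rough illustration of the key layout it handles, here is a minimal standalone sketch (simplified FieldId/DocumentId aliases and hypothetical helper names, no heed involved). Note that the real decode path in this commit still skips an extra 8 bytes before handing the tail to C, a leftover of the old f64 layout that this sketch leaves out.

// Standalone sketch: the shared key layout is
//   field_id (2 bytes BE) | document_id (4 bytes BE) | value bytes,
// where the value bytes are produced/consumed by the value codec C.
// FieldId/DocumentId aliases and helper names below are illustrative only.
use std::convert::TryInto;

type FieldId = u16;
type DocumentId = u32;

fn encode_key(field_id: FieldId, document_id: DocumentId, value_bytes: &[u8]) -> Vec<u8> {
    let mut bytes = Vec::with_capacity(2 + 4 + value_bytes.len());
    bytes.extend_from_slice(&field_id.to_be_bytes());
    bytes.extend_from_slice(&document_id.to_be_bytes());
    bytes.extend_from_slice(value_bytes); // what C::bytes_encode would contribute
    bytes
}

fn decode_key(bytes: &[u8]) -> Option<(FieldId, DocumentId, &[u8])> {
    let field_id = u16::from_be_bytes(bytes.get(0..2)?.try_into().ok()?);
    let document_id = u32::from_be_bytes(bytes.get(2..6)?.try_into().ok()?);
    // The remaining bytes would be handed to C::bytes_decode.
    Some((field_id, document_id, &bytes[6..]))
}

fn main() {
    let key = encode_key(3, 42, "hello".as_bytes());
    let (field_id, document_id, value) = decode_key(&key).unwrap();
    assert_eq!((field_id, document_id, value), (3, 42, "hello".as_bytes()));
    println!("field {field_id}, document {document_id}, value {:?}", value);
}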
View File

@@ -1,50 +0,0 @@
-use std::borrow::Cow;
-use std::str;
-
-use crate::{try_split_array_at, DocumentId, FieldId};
-
-pub struct FieldDocIdFacetStringCodec;
-
-impl FieldDocIdFacetStringCodec {
-    pub fn serialize_into(
-        field_id: FieldId,
-        document_id: DocumentId,
-        normalized_value: &str,
-        out: &mut Vec<u8>,
-    ) {
-        out.reserve(2 + 4 + normalized_value.len());
-        out.extend_from_slice(&field_id.to_be_bytes());
-        out.extend_from_slice(&document_id.to_be_bytes());
-        out.extend_from_slice(normalized_value.as_bytes());
-    }
-}
-
-impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetStringCodec {
-    type DItem = (FieldId, DocumentId, &'a str);
-
-    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
-        let (field_id_bytes, bytes) = try_split_array_at(bytes)?;
-        let field_id = u16::from_be_bytes(field_id_bytes);
-
-        let (document_id_bytes, bytes) = try_split_array_at(bytes)?;
-        let document_id = u32::from_be_bytes(document_id_bytes);
-
-        let normalized_value = str::from_utf8(bytes).ok()?;
-        Some((field_id, document_id, normalized_value))
-    }
-}
-
-impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetStringCodec {
-    type EItem = (FieldId, DocumentId, &'a str);
-
-    fn bytes_encode((field_id, document_id, normalized_value): &Self::EItem) -> Option<Cow<[u8]>> {
-        let mut bytes = Vec::new();
-        FieldDocIdFacetStringCodec::serialize_into(
-            *field_id,
-            *document_id,
-            normalized_value,
-            &mut bytes,
-        );
-        Some(Cow::Owned(bytes))
-    }
-}

View File

@@ -1,5 +1,4 @@
-mod field_doc_id_facet_f64_codec;
-mod field_doc_id_facet_string_codec;
+mod field_doc_id_facet_codec;
 mod ordered_f64_codec;
 mod str_ref;
@@ -7,16 +6,19 @@ use std::borrow::Cow;
 use std::convert::TryFrom;
 use std::marker::PhantomData;
 
-use heed::types::OwnedType;
+use heed::types::{DecodeIgnore, OwnedType};
 use heed::{BytesDecode, BytesEncode};
 use roaring::RoaringBitmap;
 
-pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec;
-pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec;
+pub use self::field_doc_id_facet_codec::FieldDocIdFacetCodec;
 pub use self::ordered_f64_codec::OrderedF64Codec;
 pub use self::str_ref::StrRefCodec;
 use crate::{CboRoaringBitmapCodec, BEU16};
 
+pub type FieldDocIdFacetF64Codec = FieldDocIdFacetCodec<OrderedF64Codec>;
+pub type FieldDocIdFacetStringCodec = FieldDocIdFacetCodec<StrRefCodec>;
+pub type FieldDocIdFacetIgnoreCodec = FieldDocIdFacetCodec<DecodeIgnore>;
+
 pub type FieldIdCodec = OwnedType<BEU16>;
 
 /// Tries to split a slice in half at the given middle point,

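In the facet module above, the old codec names are kept as aliases over the new generic codec, plus a value-ignoring variant. Below is a compile-only sketch of that aliasing pattern with toy stand-ins for the value codecs (the stand-in definitions are illustrative, not milli's); the point is that one generic type now backs all three names, so existing call sites and database definitions keep compiling unchanged.

#![allow(dead_code)]
// Toy stand-ins for the real value codecs; only the shape of the pattern matters here.
use std::marker::PhantomData;

struct OrderedF64Codec;
struct StrRefCodec;
struct DecodeIgnore;

// One generic codec, parameterized by the codec used for the facet value.
struct FieldDocIdFacetCodec<C>(PhantomData<C>);

// The old concrete names survive as type aliases over the generic codec.
type FieldDocIdFacetF64Codec = FieldDocIdFacetCodec<OrderedF64Codec>;
type FieldDocIdFacetStringCodec = FieldDocIdFacetCodec<StrRefCodec>;
// Value-ignoring variant, useful when only (field_id, document_id) are needed.
type FieldDocIdFacetIgnoreCodec = FieldDocIdFacetCodec<DecodeIgnore>;

fn main() {
    // Nothing to run; all three aliases share one implementation.
    let _f64: FieldDocIdFacetF64Codec = FieldDocIdFacetCodec(PhantomData);
    let _string: FieldDocIdFacetStringCodec = FieldDocIdFacetCodec(PhantomData);
    let _ignore: FieldDocIdFacetIgnoreCodec = FieldDocIdFacetCodec(PhantomData);
}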
View File

@@ -15,7 +15,7 @@ use log::debug;
 use once_cell::sync::Lazy;
 use roaring::bitmap::RoaringBitmap;
 
-pub use self::facet::{FacetDistribution, /* FacetNumberIter,*/ Filter, DEFAULT_VALUES_PER_FACET,};
+pub use self::facet::{FacetDistribution, Filter, DEFAULT_VALUES_PER_FACET};
 use self::fst_utils::{Complement, Intersection, StartsWith, Union};
 pub use self::matches::{
     FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWord, MatchingWords,

View File

@@ -1,7 +1,7 @@
 use std::collections::btree_map::Entry;
 
 use fst::IntoStreamer;
-use heed::types::{ByteSlice, Str};
+use heed::types::{ByteSlice, DecodeIgnore, Str};
 use heed::Database;
 use roaring::RoaringBitmap;
 use serde::{Deserialize, Serialize};
@@ -11,11 +11,13 @@ use time::OffsetDateTime;
 
 use super::{ClearDocuments, FacetsUpdateBulk};
 use crate::error::{InternalError, UserError};
 use crate::facet::FacetType;
-use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec};
+use crate::heed_codec::facet::{
+    ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetIgnoreCodec,
+};
 use crate::heed_codec::CboRoaringBitmapCodec;
 use crate::index::{db_name, main_key};
 use crate::{
-    DocumentId, ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result,
+    ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result,
     RoaringBitmapCodec, SmallString32, BEU32,
 };
@@ -187,10 +189,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             word_position_docids,
             word_prefix_position_docids,
             facet_id_f64_docids: _,
-            facet_id_exists_docids,
             facet_id_string_docids: _,
-            field_id_docid_facet_f64s,
-            field_id_docid_facet_strings,
+            field_id_docid_facet_f64s: _,
+            field_id_docid_facet_strings: _,
+            facet_id_exists_docids,
             documents,
         } = self.index;
@@ -449,6 +451,21 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
                 fields_ids_map.clone(),
                 facet_type,
             )?;
+
+            for field_id in self.index.faceted_fields_ids(self.wtxn)? {
+                // Remove docids from the number faceted documents ids
+                let mut docids =
+                    self.index.faceted_documents_ids(self.wtxn, field_id, facet_type)?;
+                docids -= &self.to_delete_docids;
+                self.index.put_faceted_documents_ids(self.wtxn, field_id, facet_type, &docids)?;
+                remove_docids_from_field_id_docid_facet_value(
+                    &self.index,
+                    self.wtxn,
+                    facet_type,
+                    field_id,
+                    &self.to_delete_docids,
+                )?;
+            }
         }
 
         // We delete the documents ids that are under the facet field id values.
@@ -458,47 +475,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             &self.to_delete_docids,
         )?;
 
-        // Remove the documents ids from the faceted documents ids.
-        for field_id in self.index.faceted_fields_ids(self.wtxn)? {
-            // Remove docids from the number faceted documents ids
-            let mut docids =
-                self.index.faceted_documents_ids(self.wtxn, field_id, FacetType::Number)?;
-            docids -= &self.to_delete_docids;
-            self.index.put_faceted_documents_ids(
-                self.wtxn,
-                field_id,
-                FacetType::Number,
-                &docids,
-            )?;
-
-            remove_docids_from_field_id_docid_facet_value(
-                self.wtxn,
-                field_id_docid_facet_f64s,
-                field_id,
-                &self.to_delete_docids,
-                |(_fid, docid, _value)| docid,
-            )?;
-
-            // Remove docids from the string faceted documents ids
-            let mut docids =
-                self.index.faceted_documents_ids(self.wtxn, field_id, FacetType::String)?;
-            docids -= &self.to_delete_docids;
-            self.index.put_faceted_documents_ids(
-                self.wtxn,
-                field_id,
-                FacetType::String,
-                &docids,
-            )?;
-
-            remove_docids_from_field_id_docid_facet_value(
-                self.wtxn,
-                field_id_docid_facet_strings,
-                field_id,
-                &self.to_delete_docids,
-                |(_fid, docid, _value)| docid,
-            )?;
-        }
-
         Ok(DocumentDeletionResult {
             deleted_documents: self.to_delete_docids.len(),
             remaining_documents: documents_ids.len(),
@@ -564,26 +540,28 @@ fn remove_from_word_docids(
     Ok(())
 }
 
-fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F, DC, V>(
+fn remove_docids_from_field_id_docid_facet_value<'i, 'a>(
+    index: &'i Index,
     wtxn: &'a mut heed::RwTxn,
-    db: &heed::Database<C, DC>,
+    facet_type: FacetType,
     field_id: FieldId,
     to_remove: &RoaringBitmap,
-    convert: F,
-) -> heed::Result<()>
-where
-    C: heed::BytesDecode<'a, DItem = K>,
-    DC: heed::BytesDecode<'a, DItem = V>,
-    F: Fn(K) -> DocumentId,
-{
+) -> heed::Result<()> {
+    let db = match facet_type {
+        FacetType::String => {
+            index.field_id_docid_facet_strings.remap_types::<ByteSlice, DecodeIgnore>()
+        }
+        FacetType::Number => {
+            index.field_id_docid_facet_f64s.remap_types::<ByteSlice, DecodeIgnore>()
+        }
+    };
    let mut iter = db
-        .remap_key_type::<ByteSlice>()
         .prefix_iter_mut(wtxn, &field_id.to_be_bytes())?
-        .remap_key_type::<C>();
+        .remap_key_type::<FieldDocIdFacetIgnoreCodec>();
 
     while let Some(result) = iter.next() {
-        let (key, _) = result?;
-        if to_remove.contains(convert(key)) {
+        let ((_, docid, _), _) = result?;
+        if to_remove.contains(docid) {
             // safety: we don't keep references from inside the LMDB database.
             unsafe { iter.del_current()? };
         }
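The rewritten remove_docids_from_field_id_docid_facet_value helper now picks the right database from the facet type, iterates the entries sharing the field_id prefix, decodes only the document id (the value is skipped thanks to FieldDocIdFacetIgnoreCodec), and deletes the entries whose document id is being removed. As a rough in-memory analogue of that filtering logic (a BTreeMap stands in for the LMDB database; names and key layout are simplified, this is not the heed code):

// In-memory analogue of the prefix-scan deletion above. A BTreeMap with
// field_id | document_id | value keys stands in for the LMDB database;
// the real code uses heed's prefix_iter_mut + del_current instead of
// collecting the keys first. All names here are illustrative.
use std::collections::{BTreeMap, BTreeSet};
use std::convert::TryInto;

type FieldId = u16;
type DocumentId = u32;

fn remove_docids_for_field(
    db: &mut BTreeMap<Vec<u8>, ()>,
    field_id: FieldId,
    to_remove: &BTreeSet<DocumentId>,
) {
    let prefix = field_id.to_be_bytes().to_vec();
    let doomed: Vec<Vec<u8>> = db
        .range(prefix.clone()..)
        .take_while(|(key, _)| key.starts_with(&prefix))
        .filter(|(key, _)| {
            // Decode only the document id (bytes 2..6) and ignore the value,
            // which is what decoding through FieldDocIdFacetIgnoreCodec achieves.
            let docid = u32::from_be_bytes(key[2..6].try_into().unwrap());
            to_remove.contains(&docid)
        })
        .map(|(key, _)| key.clone())
        .collect();
    for key in doomed {
        db.remove(&key);
    }
}

fn main() {
    let mut db = BTreeMap::new();
    for (field_id, docid, value) in [(1u16, 10u32, "red"), (1, 11, "blue"), (2, 10, "3.5")] {
        let mut key = field_id.to_be_bytes().to_vec();
        key.extend_from_slice(&docid.to_be_bytes());
        key.extend_from_slice(value.as_bytes());
        db.insert(key, ());
    }
    remove_docids_for_field(&mut db, 1, &BTreeSet::from([10u32]));
    // (1, 10, "red") is gone; (1, 11, "blue") stays and field 2 is untouched.
    assert_eq!(db.len(), 2);
    println!("{} entries left", db.len());
}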