From cb8442a119c7bb8e7acaeeb433cf7124597d8b36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 8 Sep 2022 13:28:17 +0200 Subject: [PATCH] Further unify facet databases of f64s and strings --- ...4_codec.rs => field_doc_id_facet_codec.rs} | 30 +++--- .../facet/field_doc_id_facet_string_codec.rs | 50 ---------- milli/src/heed_codec/facet/mod.rs | 12 ++- milli/src/search/mod.rs | 2 +- milli/src/update/delete_documents.rs | 98 +++++++------------ 5 files changed, 63 insertions(+), 129 deletions(-) rename milli/src/heed_codec/facet/{field_doc_id_facet_f64_codec.rs => field_doc_id_facet_codec.rs} (54%) delete mode 100644 milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs diff --git a/milli/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs b/milli/src/heed_codec/facet/field_doc_id_facet_codec.rs similarity index 54% rename from milli/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs rename to milli/src/heed_codec/facet/field_doc_id_facet_codec.rs index 22159601c..7c636e98a 100644 --- a/milli/src/heed_codec/facet/field_doc_id_facet_f64_codec.rs +++ b/milli/src/heed_codec/facet/field_doc_id_facet_codec.rs @@ -1,13 +1,15 @@ -use std::borrow::Cow; -use std::convert::TryInto; - -use crate::facet::value_encoding::f64_into_bytes; use crate::{try_split_array_at, DocumentId, FieldId}; +use heed::{BytesDecode, BytesEncode}; +use std::borrow::Cow; +use std::marker::PhantomData; -pub struct FieldDocIdFacetF64Codec; +pub struct FieldDocIdFacetCodec(PhantomData); -impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetF64Codec { - type DItem = (FieldId, DocumentId, f64); +impl<'a, C> BytesDecode<'a> for FieldDocIdFacetCodec +where + C: BytesDecode<'a>, +{ + type DItem = (FieldId, DocumentId, C::DItem); fn bytes_decode(bytes: &'a [u8]) -> Option { let (field_id_bytes, bytes) = try_split_array_at(bytes)?; @@ -16,22 +18,24 @@ impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetF64Codec { let (document_id_bytes, bytes) = try_split_array_at(bytes)?; let document_id = u32::from_be_bytes(document_id_bytes); - let value = bytes[8..16].try_into().map(f64::from_be_bytes).ok()?; + let value = C::bytes_decode(&bytes[8..])?; Some((field_id, document_id, value)) } } -impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetF64Codec { - type EItem = (FieldId, DocumentId, f64); +impl<'a, C> BytesEncode<'a> for FieldDocIdFacetCodec +where + C: BytesEncode<'a>, +{ + type EItem = (FieldId, DocumentId, C::EItem); - fn bytes_encode((field_id, document_id, value): &Self::EItem) -> Option> { + fn bytes_encode((field_id, document_id, value): &'a Self::EItem) -> Option> { let mut bytes = Vec::with_capacity(2 + 4 + 8 + 8); bytes.extend_from_slice(&field_id.to_be_bytes()); bytes.extend_from_slice(&document_id.to_be_bytes()); - let value_bytes = f64_into_bytes(*value)?; + let value_bytes = C::bytes_encode(value)?; bytes.extend_from_slice(&value_bytes); - bytes.extend_from_slice(&value.to_be_bytes()); Some(Cow::Owned(bytes)) } } diff --git a/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs b/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs deleted file mode 100644 index 178bb21c1..000000000 --- a/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs +++ /dev/null @@ -1,50 +0,0 @@ -use std::borrow::Cow; -use std::str; - -use crate::{try_split_array_at, DocumentId, FieldId}; - -pub struct FieldDocIdFacetStringCodec; - -impl FieldDocIdFacetStringCodec { - pub fn serialize_into( - field_id: FieldId, - document_id: DocumentId, - normalized_value: &str, - out: &mut Vec, - ) { - out.reserve(2 + 4 + normalized_value.len()); - out.extend_from_slice(&field_id.to_be_bytes()); - out.extend_from_slice(&document_id.to_be_bytes()); - out.extend_from_slice(normalized_value.as_bytes()); - } -} - -impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetStringCodec { - type DItem = (FieldId, DocumentId, &'a str); - - fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id_bytes, bytes) = try_split_array_at(bytes)?; - let field_id = u16::from_be_bytes(field_id_bytes); - - let (document_id_bytes, bytes) = try_split_array_at(bytes)?; - let document_id = u32::from_be_bytes(document_id_bytes); - - let normalized_value = str::from_utf8(bytes).ok()?; - Some((field_id, document_id, normalized_value)) - } -} - -impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetStringCodec { - type EItem = (FieldId, DocumentId, &'a str); - - fn bytes_encode((field_id, document_id, normalized_value): &Self::EItem) -> Option> { - let mut bytes = Vec::new(); - FieldDocIdFacetStringCodec::serialize_into( - *field_id, - *document_id, - normalized_value, - &mut bytes, - ); - Some(Cow::Owned(bytes)) - } -} diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 2e9f0b212..8db8b7df1 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -1,5 +1,4 @@ -mod field_doc_id_facet_f64_codec; -mod field_doc_id_facet_string_codec; +mod field_doc_id_facet_codec; mod ordered_f64_codec; mod str_ref; @@ -7,16 +6,19 @@ use std::borrow::Cow; use std::convert::TryFrom; use std::marker::PhantomData; -use heed::types::OwnedType; +use heed::types::{DecodeIgnore, OwnedType}; use heed::{BytesDecode, BytesEncode}; use roaring::RoaringBitmap; -pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; -pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; +pub use self::field_doc_id_facet_codec::FieldDocIdFacetCodec; pub use self::ordered_f64_codec::OrderedF64Codec; pub use self::str_ref::StrRefCodec; use crate::{CboRoaringBitmapCodec, BEU16}; +pub type FieldDocIdFacetF64Codec = FieldDocIdFacetCodec; +pub type FieldDocIdFacetStringCodec = FieldDocIdFacetCodec; +pub type FieldDocIdFacetIgnoreCodec = FieldDocIdFacetCodec; + pub type FieldIdCodec = OwnedType; /// Tries to split a slice in half at the given middle point, diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index e6651737c..f62a37c1b 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -15,7 +15,7 @@ use log::debug; use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; -pub use self::facet::{FacetDistribution, /* FacetNumberIter,*/ Filter, DEFAULT_VALUES_PER_FACET,}; +pub use self::facet::{FacetDistribution, Filter, DEFAULT_VALUES_PER_FACET}; use self::fst_utils::{Complement, Intersection, StartsWith, Union}; pub use self::matches::{ FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWord, MatchingWords, diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 14ef5fd6a..a56a61026 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -1,7 +1,7 @@ use std::collections::btree_map::Entry; use fst::IntoStreamer; -use heed::types::{ByteSlice, Str}; +use heed::types::{ByteSlice, DecodeIgnore, Str}; use heed::Database; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; @@ -11,11 +11,13 @@ use time::OffsetDateTime; use super::{ClearDocuments, FacetsUpdateBulk}; use crate::error::{InternalError, UserError}; use crate::facet::FacetType; -use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::facet::{ + ByteSliceRef, FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetIgnoreCodec, +}; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; use crate::{ - DocumentId, ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, + ExternalDocumentsIds, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, RoaringBitmapCodec, SmallString32, BEU32, }; @@ -187,10 +189,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { word_position_docids, word_prefix_position_docids, facet_id_f64_docids: _, - facet_id_exists_docids, facet_id_string_docids: _, - field_id_docid_facet_f64s, - field_id_docid_facet_strings, + field_id_docid_facet_f64s: _, + field_id_docid_facet_strings: _, + facet_id_exists_docids, documents, } = self.index; @@ -449,6 +451,21 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { fields_ids_map.clone(), facet_type, )?; + for field_id in self.index.faceted_fields_ids(self.wtxn)? { + // Remove docids from the number faceted documents ids + let mut docids = + self.index.faceted_documents_ids(self.wtxn, field_id, facet_type)?; + docids -= &self.to_delete_docids; + self.index.put_faceted_documents_ids(self.wtxn, field_id, facet_type, &docids)?; + + remove_docids_from_field_id_docid_facet_value( + &self.index, + self.wtxn, + facet_type, + field_id, + &self.to_delete_docids, + )?; + } } // We delete the documents ids that are under the facet field id values. @@ -458,47 +475,6 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { &self.to_delete_docids, )?; - // Remove the documents ids from the faceted documents ids. - for field_id in self.index.faceted_fields_ids(self.wtxn)? { - // Remove docids from the number faceted documents ids - let mut docids = - self.index.faceted_documents_ids(self.wtxn, field_id, FacetType::Number)?; - docids -= &self.to_delete_docids; - self.index.put_faceted_documents_ids( - self.wtxn, - field_id, - FacetType::Number, - &docids, - )?; - - remove_docids_from_field_id_docid_facet_value( - self.wtxn, - field_id_docid_facet_f64s, - field_id, - &self.to_delete_docids, - |(_fid, docid, _value)| docid, - )?; - - // Remove docids from the string faceted documents ids - let mut docids = - self.index.faceted_documents_ids(self.wtxn, field_id, FacetType::String)?; - docids -= &self.to_delete_docids; - self.index.put_faceted_documents_ids( - self.wtxn, - field_id, - FacetType::String, - &docids, - )?; - - remove_docids_from_field_id_docid_facet_value( - self.wtxn, - field_id_docid_facet_strings, - field_id, - &self.to_delete_docids, - |(_fid, docid, _value)| docid, - )?; - } - Ok(DocumentDeletionResult { deleted_documents: self.to_delete_docids.len(), remaining_documents: documents_ids.len(), @@ -564,26 +540,28 @@ fn remove_from_word_docids( Ok(()) } -fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F, DC, V>( +fn remove_docids_from_field_id_docid_facet_value<'i, 'a>( + index: &'i Index, wtxn: &'a mut heed::RwTxn, - db: &heed::Database, + facet_type: FacetType, field_id: FieldId, to_remove: &RoaringBitmap, - convert: F, -) -> heed::Result<()> -where - C: heed::BytesDecode<'a, DItem = K>, - DC: heed::BytesDecode<'a, DItem = V>, - F: Fn(K) -> DocumentId, -{ +) -> heed::Result<()> { + let db = match facet_type { + FacetType::String => { + index.field_id_docid_facet_strings.remap_types::() + } + FacetType::Number => { + index.field_id_docid_facet_f64s.remap_types::() + } + }; let mut iter = db - .remap_key_type::() .prefix_iter_mut(wtxn, &field_id.to_be_bytes())? - .remap_key_type::(); + .remap_key_type::(); while let Some(result) = iter.next() { - let (key, _) = result?; - if to_remove.contains(convert(key)) { + let ((_, docid, _), _) = result?; + if to_remove.contains(docid) { // safety: we don't keep references from inside the LMDB database. unsafe { iter.del_current()? }; }