From 15a4c05379ee473f6b7a8c1447596307e59e63b5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Tue, 2 May 2023 09:34:28 +0200
Subject: [PATCH] Store the facet string values in multiple FSTs

---
 milli/src/heed_codec/fst_set_codec.rs | 23 +++++++++++++++
 milli/src/heed_codec/mod.rs           |  2 ++
 milli/src/index.rs                    |  7 ++++-
 milli/src/update/clear_documents.rs   |  1 +
 milli/src/update/delete_documents.rs  |  1 +
 milli/src/update/facet/mod.rs         | 42 +++++++++++++++++++++++++--
 6 files changed, 73 insertions(+), 3 deletions(-)
 create mode 100644 milli/src/heed_codec/fst_set_codec.rs

diff --git a/milli/src/heed_codec/fst_set_codec.rs b/milli/src/heed_codec/fst_set_codec.rs
new file mode 100644
index 000000000..a8d35ebda
--- /dev/null
+++ b/milli/src/heed_codec/fst_set_codec.rs
@@ -0,0 +1,23 @@
+use fst::Set;
+use std::borrow::Cow;
+
+use heed::{BytesDecode, BytesEncode};
+
+/// A codec for values of type `Set<&[u8]>`.
+pub struct FstSetCodec;
+
+impl<'a> BytesEncode<'a> for FstSetCodec {
+    type EItem = Set<Vec<u8>>;
+
+    fn bytes_encode(item: &'a Self::EItem) -> Option<Cow<'a, [u8]>> {
+        Some(Cow::Borrowed(item.as_fst().as_bytes()))
+    }
+}
+
+impl<'a> BytesDecode<'a> for FstSetCodec {
+    type DItem = Set<&'a [u8]>;
+
+    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+        Some(Set::new(bytes).ok()?)
+    }
+}
diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs
index c54168a36..666f68e28 100644
--- a/milli/src/heed_codec/mod.rs
+++ b/milli/src/heed_codec/mod.rs
@@ -2,6 +2,7 @@ mod beu32_str_codec;
 mod byte_slice_ref;
 pub mod facet;
 mod field_id_word_count_codec;
+mod fst_set_codec;
 mod obkv_codec;
 mod roaring_bitmap;
 mod roaring_bitmap_length;
@@ -15,6 +16,7 @@ pub use str_ref::StrRefCodec;
 
 pub use self::beu32_str_codec::BEU32StrCodec;
 pub use self::field_id_word_count_codec::FieldIdWordCountCodec;
+pub use self::fst_set_codec::FstSetCodec;
 pub use self::obkv_codec::ObkvCodec;
 pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec};
 pub use self::roaring_bitmap_length::{
diff --git a/milli/src/index.rs b/milli/src/index.rs
index a22901993..5c32f75f5 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -21,7 +21,7 @@ use crate::heed_codec::facet::{
     FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
     FieldIdCodec, OrderedF64Codec,
 };
-use crate::heed_codec::{ScriptLanguageCodec, StrBEU16Codec, StrRefCodec};
+use crate::heed_codec::{FstSetCodec, ScriptLanguageCodec, StrBEU16Codec, StrRefCodec};
 use crate::readable_slices::ReadableSlices;
 use crate::{
     default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
@@ -94,6 +94,7 @@ pub mod db_name {
     pub const FACET_ID_IS_NULL_DOCIDS: &str = "facet-id-is-null-docids";
     pub const FACET_ID_IS_EMPTY_DOCIDS: &str = "facet-id-is-empty-docids";
     pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids";
+    pub const FACET_ID_STRING_FST: &str = "facet-id-string-fst";
     pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
     pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
     pub const VECTOR_ID_DOCID: &str = "vector-id-docids";
@@ -154,6 +155,8 @@ pub struct Index {
     pub facet_id_f64_docids: Database<FacetGroupKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>,
     /// Maps the facet field id
and ranges of strings with the docids that corresponds to them.
     pub facet_id_string_docids: Database<FacetGroupKeyCodec<StrRefCodec>, FacetGroupValueCodec>,
+    /// Maps the facet field id of the string facets with an FST containing all the facets values.
+    pub facet_id_string_fst: Database<OwnedType<BEU16>, FstSetCodec>,
 
     /// Maps the document id, the facet field id and the numbers.
     pub field_id_docid_facet_f64s: Database<FieldDocIdFacetF64Codec, Unit>,
@@ -206,6 +209,7 @@ impl Index {
         let facet_id_f64_docids = env.create_database(&mut wtxn, Some(FACET_ID_F64_DOCIDS))?;
         let facet_id_string_docids =
             env.create_database(&mut wtxn, Some(FACET_ID_STRING_DOCIDS))?;
+        let facet_id_string_fst = env.create_database(&mut wtxn, Some(FACET_ID_STRING_FST))?;
         let facet_id_exists_docids =
             env.create_database(&mut wtxn, Some(FACET_ID_EXISTS_DOCIDS))?;
         let facet_id_is_null_docids =
@@ -240,6 +244,7 @@ impl Index {
             field_id_word_count_docids,
             facet_id_f64_docids,
             facet_id_string_docids,
+            facet_id_string_fst,
             facet_id_exists_docids,
             facet_id_is_null_docids,
             facet_id_is_empty_docids,
diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs
index f4a2d43fe..37c0f32b2 100644
--- a/milli/src/update/clear_documents.rs
+++ b/milli/src/update/clear_documents.rs
@@ -34,6 +34,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
             script_language_docids,
             facet_id_f64_docids,
             facet_id_string_docids,
+            facet_id_string_fst: _,
             facet_id_exists_docids,
             facet_id_is_null_docids,
             facet_id_is_empty_docids,
diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs
index 766f0e16e..c9124e591 100644
--- a/milli/src/update/delete_documents.rs
+++ b/milli/src/update/delete_documents.rs
@@ -237,6 +237,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             word_prefix_fid_docids,
             facet_id_f64_docids: _,
             facet_id_string_docids: _,
+            facet_id_string_fst: _,
             field_id_docid_facet_f64s: _,
             field_id_docid_facet_strings: _,
             script_language_docids,
diff --git a/milli/src/update/facet/mod.rs b/milli/src/update/facet/mod.rs
index
2fd748d4d..4073ab8e5 100644
--- a/milli/src/update/facet/mod.rs
+++ b/milli/src/update/facet/mod.rs
@@ -78,15 +78,16 @@ pub const FACET_MIN_LEVEL_SIZE: u8 = 5;
 
 use std::fs::File;
 
+use heed::types::DecodeIgnore;
 use log::debug;
 use time::OffsetDateTime;
 
 use self::incremental::FacetsUpdateIncremental;
 use super::FacetsUpdateBulk;
 use crate::facet::FacetType;
-use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec};
+use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
 use crate::heed_codec::ByteSliceRefCodec;
-use crate::{Index, Result};
+use crate::{Index, Result, BEU16};
 
 pub mod bulk;
 pub mod delete;
@@ -157,6 +158,43 @@ impl<'i> FacetsUpdate<'i> {
             );
             incremental_update.execute(wtxn)?;
         }
+
+        // We compute one FST per string facet field, using only the level 0 keys
+        let mut text_fsts = vec![];
+        let mut current_fst: Option<(u16, fst::SetBuilder<Vec<u8>>)> = None;
+        let database = self.index.facet_id_string_docids.remap_data_type::<DecodeIgnore>();
+        for result in database.iter(&wtxn)? {
+            let (facet_group_key, _) = result?;
+            if let FacetGroupKey { field_id, level: 0, left_bound } = facet_group_key {
+                current_fst = match current_fst.take() {
+                    Some((fid, fst_builder)) if fid != field_id => {
+                        let fst = fst_builder.into_set();
+                        text_fsts.push((fid, fst));
+                        Some((field_id, fst::SetBuilder::memory()))
+                    }
+                    Some((field_id, fst_builder)) => Some((field_id, fst_builder)),
+                    None => Some((field_id, fst::SetBuilder::memory())),
+                };
+
+                if let Some((_, fst_builder)) = current_fst.as_mut() {
+                    fst_builder.insert(left_bound)?;
+                }
+            }
+        }
+
+        if let Some((field_id, fst_builder)) = current_fst {
+            let fst = fst_builder.into_set();
+            text_fsts.push((field_id, fst));
+        }
+
+        // We remove all of the previous FSTs that were in this database
+        self.index.facet_id_string_fst.clear(wtxn)?;
+
+        // We write those FSTs in LMDB now
+        for (field_id, fst) in text_fsts {
+            self.index.facet_id_string_fst.put(wtxn, &BEU16::new(field_id), &fst)?;
+        }
+
         Ok(())
     }
 }