Store the facet string values in multiple FSTs

This commit is contained in:
Clément Renault 2023-05-02 09:34:28 +02:00 committed by Louis Dureuil
parent 9deeec88e0
commit 15a4c05379
No known key found for this signature in database
6 changed files with 73 additions and 3 deletions

View File

@ -0,0 +1,23 @@
use fst::Set;
use std::borrow::Cow;
use heed::{BytesDecode, BytesEncode};
/// A codec for values of type `Set<&[u8]>`.
pub struct FstSetCodec;
impl<'a> BytesEncode<'a> for FstSetCodec {
type EItem = Set<Vec<u8>>;
fn bytes_encode(item: &'a Self::EItem) -> Option<Cow<'a, [u8]>> {
Some(Cow::Borrowed(item.as_fst().as_bytes()))
}
}
impl<'a> BytesDecode<'a> for FstSetCodec {
type DItem = Set<&'a [u8]>;
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
Some(Set::new(bytes).ok()?)
}
}

View File

@ -2,6 +2,7 @@ mod beu32_str_codec;
mod byte_slice_ref; mod byte_slice_ref;
pub mod facet; pub mod facet;
mod field_id_word_count_codec; mod field_id_word_count_codec;
mod fst_set_codec;
mod obkv_codec; mod obkv_codec;
mod roaring_bitmap; mod roaring_bitmap;
mod roaring_bitmap_length; mod roaring_bitmap_length;
@ -15,6 +16,7 @@ pub use str_ref::StrRefCodec;
pub use self::beu32_str_codec::BEU32StrCodec; pub use self::beu32_str_codec::BEU32StrCodec;
pub use self::field_id_word_count_codec::FieldIdWordCountCodec; pub use self::field_id_word_count_codec::FieldIdWordCountCodec;
pub use self::fst_set_codec::FstSetCodec;
pub use self::obkv_codec::ObkvCodec; pub use self::obkv_codec::ObkvCodec;
pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec}; pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec};
pub use self::roaring_bitmap_length::{ pub use self::roaring_bitmap_length::{

View File

@ -21,7 +21,7 @@ use crate::heed_codec::facet::{
FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
FieldIdCodec, OrderedF64Codec, FieldIdCodec, OrderedF64Codec,
}; };
use crate::heed_codec::{ScriptLanguageCodec, StrBEU16Codec, StrRefCodec}; use crate::heed_codec::{FstSetCodec, ScriptLanguageCodec, StrBEU16Codec, StrRefCodec};
use crate::readable_slices::ReadableSlices; use crate::readable_slices::ReadableSlices;
use crate::{ use crate::{
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
@ -94,6 +94,7 @@ pub mod db_name {
pub const FACET_ID_IS_NULL_DOCIDS: &str = "facet-id-is-null-docids"; pub const FACET_ID_IS_NULL_DOCIDS: &str = "facet-id-is-null-docids";
pub const FACET_ID_IS_EMPTY_DOCIDS: &str = "facet-id-is-empty-docids"; pub const FACET_ID_IS_EMPTY_DOCIDS: &str = "facet-id-is-empty-docids";
pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids"; pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids";
pub const FACET_ID_STRING_FST: &str = "facet-id-string-fst";
pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s"; pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings"; pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
pub const VECTOR_ID_DOCID: &str = "vector-id-docids"; pub const VECTOR_ID_DOCID: &str = "vector-id-docids";
@ -154,6 +155,8 @@ pub struct Index {
pub facet_id_f64_docids: Database<FacetGroupKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>, pub facet_id_f64_docids: Database<FacetGroupKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>,
/// Maps the facet field id and ranges of strings with the docids that corresponds to them. /// Maps the facet field id and ranges of strings with the docids that corresponds to them.
pub facet_id_string_docids: Database<FacetGroupKeyCodec<StrRefCodec>, FacetGroupValueCodec>, pub facet_id_string_docids: Database<FacetGroupKeyCodec<StrRefCodec>, FacetGroupValueCodec>,
/// Maps the facet field id of the string facets with an FST containing all the facets values.
pub facet_id_string_fst: Database<OwnedType<BEU16>, FstSetCodec>,
/// Maps the document id, the facet field id and the numbers. /// Maps the document id, the facet field id and the numbers.
pub field_id_docid_facet_f64s: Database<FieldDocIdFacetF64Codec, Unit>, pub field_id_docid_facet_f64s: Database<FieldDocIdFacetF64Codec, Unit>,
@ -206,6 +209,7 @@ impl Index {
let facet_id_f64_docids = env.create_database(&mut wtxn, Some(FACET_ID_F64_DOCIDS))?; let facet_id_f64_docids = env.create_database(&mut wtxn, Some(FACET_ID_F64_DOCIDS))?;
let facet_id_string_docids = let facet_id_string_docids =
env.create_database(&mut wtxn, Some(FACET_ID_STRING_DOCIDS))?; env.create_database(&mut wtxn, Some(FACET_ID_STRING_DOCIDS))?;
let facet_id_string_fst = env.create_database(&mut wtxn, Some(FACET_ID_STRING_FST))?;
let facet_id_exists_docids = let facet_id_exists_docids =
env.create_database(&mut wtxn, Some(FACET_ID_EXISTS_DOCIDS))?; env.create_database(&mut wtxn, Some(FACET_ID_EXISTS_DOCIDS))?;
let facet_id_is_null_docids = let facet_id_is_null_docids =
@ -240,6 +244,7 @@ impl Index {
field_id_word_count_docids, field_id_word_count_docids,
facet_id_f64_docids, facet_id_f64_docids,
facet_id_string_docids, facet_id_string_docids,
facet_id_string_fst,
facet_id_exists_docids, facet_id_exists_docids,
facet_id_is_null_docids, facet_id_is_null_docids,
facet_id_is_empty_docids, facet_id_is_empty_docids,

View File

@ -34,6 +34,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
script_language_docids, script_language_docids,
facet_id_f64_docids, facet_id_f64_docids,
facet_id_string_docids, facet_id_string_docids,
facet_id_string_fst: _,
facet_id_exists_docids, facet_id_exists_docids,
facet_id_is_null_docids, facet_id_is_null_docids,
facet_id_is_empty_docids, facet_id_is_empty_docids,

View File

@ -237,6 +237,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
word_prefix_fid_docids, word_prefix_fid_docids,
facet_id_f64_docids: _, facet_id_f64_docids: _,
facet_id_string_docids: _, facet_id_string_docids: _,
facet_id_string_fst: _,
field_id_docid_facet_f64s: _, field_id_docid_facet_f64s: _,
field_id_docid_facet_strings: _, field_id_docid_facet_strings: _,
script_language_docids, script_language_docids,

View File

@ -78,15 +78,16 @@ pub const FACET_MIN_LEVEL_SIZE: u8 = 5;
use std::fs::File; use std::fs::File;
use heed::types::DecodeIgnore;
use log::debug; use log::debug;
use time::OffsetDateTime; use time::OffsetDateTime;
use self::incremental::FacetsUpdateIncremental; use self::incremental::FacetsUpdateIncremental;
use super::FacetsUpdateBulk; use super::FacetsUpdateBulk;
use crate::facet::FacetType; use crate::facet::FacetType;
use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec}; use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
use crate::heed_codec::ByteSliceRefCodec; use crate::heed_codec::ByteSliceRefCodec;
use crate::{Index, Result}; use crate::{Index, Result, BEU16};
pub mod bulk; pub mod bulk;
pub mod delete; pub mod delete;
@ -157,6 +158,43 @@ impl<'i> FacetsUpdate<'i> {
); );
incremental_update.execute(wtxn)?; incremental_update.execute(wtxn)?;
} }
// We compute one FST by string facet
let mut text_fsts = vec![];
let mut current_fst: Option<(u16, fst::SetBuilder<Vec<u8>>)> = None;
let database = self.index.facet_id_string_docids.remap_data_type::<DecodeIgnore>();
for result in database.iter(&wtxn)? {
let (facet_group_key, _) = result?;
if let FacetGroupKey { field_id, level: 0, left_bound } = facet_group_key {
current_fst = match current_fst.take() {
Some((fid, fst_builder)) if fid != field_id => {
let fst = fst_builder.into_set();
text_fsts.push((field_id, fst));
Some((field_id, fst::SetBuilder::memory()))
}
Some((field_id, fst_builder)) => Some((field_id, fst_builder)),
None => Some((field_id, fst::SetBuilder::memory())),
};
if let Some((_, fst_builder)) = current_fst.as_mut() {
fst_builder.insert(left_bound)?;
}
}
}
if let Some((field_id, fst_builder)) = current_fst {
let fst = fst_builder.into_set();
text_fsts.push((field_id, fst));
}
// We remove all of the previous FSTs that were in this database
self.index.facet_id_string_fst.clear(wtxn)?;
// We write those FSTs in LMDB now
for (field_id, fst) in text_fsts {
self.index.facet_id_string_fst.put(wtxn, &BEU16::new(field_id), &fst)?;
}
Ok(()) Ok(())
} }
} }