mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-01-19 09:35:51 +08:00
Store the facet string values in multiple FSTs
This commit is contained in:
parent
9deeec88e0
commit
15a4c05379
23
milli/src/heed_codec/fst_set_codec.rs
Normal file
23
milli/src/heed_codec/fst_set_codec.rs
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
use fst::Set;
|
||||||
|
use std::borrow::Cow;
|
||||||
|
|
||||||
|
use heed::{BytesDecode, BytesEncode};
|
||||||
|
|
||||||
|
/// A codec for values of type `Set<&[u8]>`.
|
||||||
|
pub struct FstSetCodec;
|
||||||
|
|
||||||
|
impl<'a> BytesEncode<'a> for FstSetCodec {
|
||||||
|
type EItem = Set<Vec<u8>>;
|
||||||
|
|
||||||
|
fn bytes_encode(item: &'a Self::EItem) -> Option<Cow<'a, [u8]>> {
|
||||||
|
Some(Cow::Borrowed(item.as_fst().as_bytes()))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> BytesDecode<'a> for FstSetCodec {
|
||||||
|
type DItem = Set<&'a [u8]>;
|
||||||
|
|
||||||
|
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||||
|
Some(Set::new(bytes).ok()?)
|
||||||
|
}
|
||||||
|
}
|
@ -2,6 +2,7 @@ mod beu32_str_codec;
|
|||||||
mod byte_slice_ref;
|
mod byte_slice_ref;
|
||||||
pub mod facet;
|
pub mod facet;
|
||||||
mod field_id_word_count_codec;
|
mod field_id_word_count_codec;
|
||||||
|
mod fst_set_codec;
|
||||||
mod obkv_codec;
|
mod obkv_codec;
|
||||||
mod roaring_bitmap;
|
mod roaring_bitmap;
|
||||||
mod roaring_bitmap_length;
|
mod roaring_bitmap_length;
|
||||||
@ -15,6 +16,7 @@ pub use str_ref::StrRefCodec;
|
|||||||
|
|
||||||
pub use self::beu32_str_codec::BEU32StrCodec;
|
pub use self::beu32_str_codec::BEU32StrCodec;
|
||||||
pub use self::field_id_word_count_codec::FieldIdWordCountCodec;
|
pub use self::field_id_word_count_codec::FieldIdWordCountCodec;
|
||||||
|
pub use self::fst_set_codec::FstSetCodec;
|
||||||
pub use self::obkv_codec::ObkvCodec;
|
pub use self::obkv_codec::ObkvCodec;
|
||||||
pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec};
|
pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec};
|
||||||
pub use self::roaring_bitmap_length::{
|
pub use self::roaring_bitmap_length::{
|
||||||
|
@ -21,7 +21,7 @@ use crate::heed_codec::facet::{
|
|||||||
FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
|
FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
|
||||||
FieldIdCodec, OrderedF64Codec,
|
FieldIdCodec, OrderedF64Codec,
|
||||||
};
|
};
|
||||||
use crate::heed_codec::{ScriptLanguageCodec, StrBEU16Codec, StrRefCodec};
|
use crate::heed_codec::{FstSetCodec, ScriptLanguageCodec, StrBEU16Codec, StrRefCodec};
|
||||||
use crate::readable_slices::ReadableSlices;
|
use crate::readable_slices::ReadableSlices;
|
||||||
use crate::{
|
use crate::{
|
||||||
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
|
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
|
||||||
@ -94,6 +94,7 @@ pub mod db_name {
|
|||||||
pub const FACET_ID_IS_NULL_DOCIDS: &str = "facet-id-is-null-docids";
|
pub const FACET_ID_IS_NULL_DOCIDS: &str = "facet-id-is-null-docids";
|
||||||
pub const FACET_ID_IS_EMPTY_DOCIDS: &str = "facet-id-is-empty-docids";
|
pub const FACET_ID_IS_EMPTY_DOCIDS: &str = "facet-id-is-empty-docids";
|
||||||
pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids";
|
pub const FACET_ID_STRING_DOCIDS: &str = "facet-id-string-docids";
|
||||||
|
pub const FACET_ID_STRING_FST: &str = "facet-id-string-fst";
|
||||||
pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
|
pub const FIELD_ID_DOCID_FACET_F64S: &str = "field-id-docid-facet-f64s";
|
||||||
pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
|
pub const FIELD_ID_DOCID_FACET_STRINGS: &str = "field-id-docid-facet-strings";
|
||||||
pub const VECTOR_ID_DOCID: &str = "vector-id-docids";
|
pub const VECTOR_ID_DOCID: &str = "vector-id-docids";
|
||||||
@ -154,6 +155,8 @@ pub struct Index {
|
|||||||
pub facet_id_f64_docids: Database<FacetGroupKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>,
|
pub facet_id_f64_docids: Database<FacetGroupKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>,
|
||||||
/// Maps the facet field id and ranges of strings with the docids that correspond to them.
|
/// Maps the facet field id and ranges of strings with the docids that correspond to them.
|
||||||
pub facet_id_string_docids: Database<FacetGroupKeyCodec<StrRefCodec>, FacetGroupValueCodec>,
|
pub facet_id_string_docids: Database<FacetGroupKeyCodec<StrRefCodec>, FacetGroupValueCodec>,
|
||||||
|
/// Maps the facet field id of the string facets with an FST containing all the facet values.
|
||||||
|
pub facet_id_string_fst: Database<OwnedType<BEU16>, FstSetCodec>,
|
||||||
|
|
||||||
/// Maps the document id, the facet field id and the numbers.
|
/// Maps the document id, the facet field id and the numbers.
|
||||||
pub field_id_docid_facet_f64s: Database<FieldDocIdFacetF64Codec, Unit>,
|
pub field_id_docid_facet_f64s: Database<FieldDocIdFacetF64Codec, Unit>,
|
||||||
@ -206,6 +209,7 @@ impl Index {
|
|||||||
let facet_id_f64_docids = env.create_database(&mut wtxn, Some(FACET_ID_F64_DOCIDS))?;
|
let facet_id_f64_docids = env.create_database(&mut wtxn, Some(FACET_ID_F64_DOCIDS))?;
|
||||||
let facet_id_string_docids =
|
let facet_id_string_docids =
|
||||||
env.create_database(&mut wtxn, Some(FACET_ID_STRING_DOCIDS))?;
|
env.create_database(&mut wtxn, Some(FACET_ID_STRING_DOCIDS))?;
|
||||||
|
let facet_id_string_fst = env.create_database(&mut wtxn, Some(FACET_ID_STRING_FST))?;
|
||||||
let facet_id_exists_docids =
|
let facet_id_exists_docids =
|
||||||
env.create_database(&mut wtxn, Some(FACET_ID_EXISTS_DOCIDS))?;
|
env.create_database(&mut wtxn, Some(FACET_ID_EXISTS_DOCIDS))?;
|
||||||
let facet_id_is_null_docids =
|
let facet_id_is_null_docids =
|
||||||
@ -240,6 +244,7 @@ impl Index {
|
|||||||
field_id_word_count_docids,
|
field_id_word_count_docids,
|
||||||
facet_id_f64_docids,
|
facet_id_f64_docids,
|
||||||
facet_id_string_docids,
|
facet_id_string_docids,
|
||||||
|
facet_id_string_fst,
|
||||||
facet_id_exists_docids,
|
facet_id_exists_docids,
|
||||||
facet_id_is_null_docids,
|
facet_id_is_null_docids,
|
||||||
facet_id_is_empty_docids,
|
facet_id_is_empty_docids,
|
||||||
|
@ -34,6 +34,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
|||||||
script_language_docids,
|
script_language_docids,
|
||||||
facet_id_f64_docids,
|
facet_id_f64_docids,
|
||||||
facet_id_string_docids,
|
facet_id_string_docids,
|
||||||
|
facet_id_string_fst: _,
|
||||||
facet_id_exists_docids,
|
facet_id_exists_docids,
|
||||||
facet_id_is_null_docids,
|
facet_id_is_null_docids,
|
||||||
facet_id_is_empty_docids,
|
facet_id_is_empty_docids,
|
||||||
|
@ -237,6 +237,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
word_prefix_fid_docids,
|
word_prefix_fid_docids,
|
||||||
facet_id_f64_docids: _,
|
facet_id_f64_docids: _,
|
||||||
facet_id_string_docids: _,
|
facet_id_string_docids: _,
|
||||||
|
facet_id_string_fst: _,
|
||||||
field_id_docid_facet_f64s: _,
|
field_id_docid_facet_f64s: _,
|
||||||
field_id_docid_facet_strings: _,
|
field_id_docid_facet_strings: _,
|
||||||
script_language_docids,
|
script_language_docids,
|
||||||
|
@ -78,15 +78,16 @@ pub const FACET_MIN_LEVEL_SIZE: u8 = 5;
|
|||||||
|
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
|
|
||||||
|
use heed::types::DecodeIgnore;
|
||||||
use log::debug;
|
use log::debug;
|
||||||
use time::OffsetDateTime;
|
use time::OffsetDateTime;
|
||||||
|
|
||||||
use self::incremental::FacetsUpdateIncremental;
|
use self::incremental::FacetsUpdateIncremental;
|
||||||
use super::FacetsUpdateBulk;
|
use super::FacetsUpdateBulk;
|
||||||
use crate::facet::FacetType;
|
use crate::facet::FacetType;
|
||||||
use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec};
|
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
|
||||||
use crate::heed_codec::ByteSliceRefCodec;
|
use crate::heed_codec::ByteSliceRefCodec;
|
||||||
use crate::{Index, Result};
|
use crate::{Index, Result, BEU16};
|
||||||
|
|
||||||
pub mod bulk;
|
pub mod bulk;
|
||||||
pub mod delete;
|
pub mod delete;
|
||||||
@ -157,6 +158,43 @@ impl<'i> FacetsUpdate<'i> {
|
|||||||
);
|
);
|
||||||
incremental_update.execute(wtxn)?;
|
incremental_update.execute(wtxn)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// We compute one FST by string facet
|
||||||
|
let mut text_fsts = vec![];
|
||||||
|
let mut current_fst: Option<(u16, fst::SetBuilder<Vec<u8>>)> = None;
|
||||||
|
let database = self.index.facet_id_string_docids.remap_data_type::<DecodeIgnore>();
|
||||||
|
for result in database.iter(&wtxn)? {
|
||||||
|
let (facet_group_key, _) = result?;
|
||||||
|
if let FacetGroupKey { field_id, level: 0, left_bound } = facet_group_key {
|
||||||
|
current_fst = match current_fst.take() {
|
||||||
|
Some((fid, fst_builder)) if fid != field_id => {
|
||||||
|
let fst = fst_builder.into_set();
|
||||||
|
text_fsts.push((field_id, fst));
|
||||||
|
Some((field_id, fst::SetBuilder::memory()))
|
||||||
|
}
|
||||||
|
Some((field_id, fst_builder)) => Some((field_id, fst_builder)),
|
||||||
|
None => Some((field_id, fst::SetBuilder::memory())),
|
||||||
|
};
|
||||||
|
|
||||||
|
if let Some((_, fst_builder)) = current_fst.as_mut() {
|
||||||
|
fst_builder.insert(left_bound)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some((field_id, fst_builder)) = current_fst {
|
||||||
|
let fst = fst_builder.into_set();
|
||||||
|
text_fsts.push((field_id, fst));
|
||||||
|
}
|
||||||
|
|
||||||
|
// We remove all of the previous FSTs that were in this database
|
||||||
|
self.index.facet_id_string_fst.clear(wtxn)?;
|
||||||
|
|
||||||
|
// We write those FSTs in LMDB now
|
||||||
|
for (field_id, fst) in text_fsts {
|
||||||
|
self.index.facet_id_string_fst.put(wtxn, &BEU16::new(field_id), &fst)?;
|
||||||
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user