diff --git a/crates/milli/src/update/del_add.rs b/crates/milli/src/update/del_add.rs index 97ff86f2a..6825e2bd3 100644 --- a/crates/milli/src/update/del_add.rs +++ b/crates/milli/src/update/del_add.rs @@ -81,6 +81,17 @@ pub enum DelAddOperation { DeletionAndAddition, } +impl DelAddOperation { + /// Merge two DelAddOperation enum variants. + pub fn merge(self, other: Self) -> Self { + match (self, other) { + (Self::Deletion, Self::Deletion) => Self::Deletion, + (Self::Addition, Self::Addition) => Self::Addition, + _ => Self::DeletionAndAddition, + } + } +} + /// Creates a Kv> from two Kv /// /// putting each deletion obkv's keys under an DelAdd::Deletion diff --git a/crates/milli/src/update/facet/bulk.rs b/crates/milli/src/update/facet/bulk.rs index 1ab8740ed..5de0ff4ed 100644 --- a/crates/milli/src/update/facet/bulk.rs +++ b/crates/milli/src/update/facet/bulk.rs @@ -6,7 +6,7 @@ use heed::types::Bytes; use heed::{BytesDecode, BytesEncode, Error, PutFlags, RoTxn, RwTxn}; use roaring::RoaringBitmap; -use super::{FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; +use super::{clear_facet_levels, FACET_GROUP_SIZE, FACET_MIN_LEVEL_SIZE}; use crate::facet::FacetType; use crate::heed_codec::facet::{ FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec, @@ -97,9 +97,7 @@ pub(crate) struct FacetsUpdateBulkInner { impl FacetsUpdateBulkInner { pub fn update(mut self, wtxn: &mut RwTxn<'_>, field_ids: &[u16]) -> Result<()> { self.update_level0(wtxn)?; - for &field_id in field_ids.iter() { - self.clear_levels(wtxn, field_id)?; - } + clear_facet_levels(wtxn, &self.db.remap_data_type(), field_ids)?; for &field_id in field_ids.iter() { let level_readers = self.compute_levels_for_field_id(field_id, wtxn)?; @@ -114,14 +112,6 @@ impl FacetsUpdateBulkInner { Ok(()) } - fn clear_levels(&self, wtxn: &mut heed::RwTxn<'_>, field_id: FieldId) -> Result<()> { - let left = FacetGroupKey::<&[u8]> { field_id, level: 1, left_bound: &[] }; - let right = FacetGroupKey::<&[u8]> { field_id, level: u8::MAX, left_bound: &[] }; - let range = left..=right; - self.db.delete_range(wtxn, &range).map(drop)?; - Ok(()) - } - fn update_level0(&mut self, wtxn: &mut RwTxn<'_>) -> Result<()> { let delta_data = match self.delta_data.take() { Some(x) => x, @@ -365,8 +355,6 @@ impl FacetsUpdateBulkInner { mod tests { use std::iter::once; - use big_s::S; - use maplit::hashset; use roaring::RoaringBitmap; use crate::documents::mmap_from_objects; @@ -374,7 +362,7 @@ mod tests { use crate::heed_codec::StrRefCodec; use crate::index::tests::TempIndex; use crate::update::facet::test_helpers::{ordered_string, FacetIndex}; - use crate::{db_snap, milli_snap}; + use crate::{db_snap, milli_snap, FilterableAttributesRule}; #[test] fn insert() { @@ -474,7 +462,8 @@ mod tests { index .update_settings(|settings| { settings.set_primary_key("id".to_owned()); - settings.set_filterable_fields(hashset! 
{ S("id") }); + settings + .set_filterable_fields(vec![FilterableAttributesRule::Field("id".to_string())]); }) .unwrap(); diff --git a/crates/milli/src/update/facet/mod.rs b/crates/milli/src/update/facet/mod.rs index dbacf6248..027bb355e 100644 --- a/crates/milli/src/update/facet/mod.rs +++ b/crates/milli/src/update/facet/mod.rs @@ -89,6 +89,7 @@ use time::OffsetDateTime; use tracing::debug; use self::incremental::FacetsUpdateIncremental; +use super::settings::{InnerIndexSettings, InnerIndexSettingsDiff}; use super::{FacetsUpdateBulk, MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps}; use crate::facet::FacetType; use crate::heed_codec::facet::{ @@ -147,7 +148,11 @@ impl<'i> FacetsUpdate<'i> { } } - pub fn execute(self, wtxn: &mut heed::RwTxn<'_>) -> Result<()> { + pub fn execute( + self, + wtxn: &mut heed::RwTxn<'_>, + new_settings: &InnerIndexSettings, + ) -> Result<()> { if self.data_size == 0 { return Ok(()); } @@ -156,8 +161,7 @@ impl<'i> FacetsUpdate<'i> { // See self::comparison_bench::benchmark_facet_indexing if self.data_size >= (self.database.len(wtxn)? / 500) { - let field_ids = - self.index.faceted_fields_ids(wtxn)?.iter().copied().collect::>(); + let field_ids = facet_levels_field_ids(new_settings); let bulk_update = FacetsUpdateBulk::new( self.index, field_ids, @@ -291,6 +295,53 @@ fn index_facet_search( Ok(()) } +/// Clear all the levels greater than 0 for given field ids. +pub fn clear_facet_levels<'a, I>( + wtxn: &mut heed::RwTxn<'_>, + db: &heed::Database, DecodeIgnore>, + field_ids: I, +) -> Result<()> +where + I: IntoIterator, +{ + for field_id in field_ids { + let field_id = *field_id; + let left = FacetGroupKey::<&[u8]> { field_id, level: 1, left_bound: &[] }; + let right = FacetGroupKey::<&[u8]> { field_id, level: u8::MAX, left_bound: &[] }; + let range = left..=right; + db.delete_range(wtxn, &range).map(drop)?; + } + Ok(()) +} + +pub fn clear_facet_levels_based_on_settings_diff( + wtxn: &mut heed::RwTxn<'_>, + index: &Index, + settings_diff: &InnerIndexSettingsDiff, +) -> Result<()> { + let new_field_ids: BTreeSet<_> = facet_levels_field_ids(&settings_diff.new); + let old_field_ids: BTreeSet<_> = facet_levels_field_ids(&settings_diff.old); + + let field_ids_to_clear: Vec<_> = old_field_ids.difference(&new_field_ids).copied().collect(); + clear_facet_levels(wtxn, &index.facet_id_string_docids.remap_types(), &field_ids_to_clear)?; + clear_facet_levels(wtxn, &index.facet_id_f64_docids.remap_types(), &field_ids_to_clear)?; + Ok(()) +} + +fn facet_levels_field_ids(settings: &InnerIndexSettings) -> B +where + B: FromIterator, +{ + settings + .fields_ids_map + .iter_id_metadata() + .filter(|(_, metadata)| { + metadata.require_facet_level_database(&settings.filterable_attributes_rules) + }) + .map(|(id, _)| id) + .collect() +} + #[cfg(test)] pub(crate) mod test_helpers { use std::cell::Cell; diff --git a/crates/milli/src/update/index_documents/enrich.rs b/crates/milli/src/update/index_documents/enrich.rs index c35701961..1f15dd570 100644 --- a/crates/milli/src/update/index_documents/enrich.rs +++ b/crates/milli/src/update/index_documents/enrich.rs @@ -95,12 +95,7 @@ pub fn enrich_documents_batch( // If the settings specifies that a _geo field must be used therefore we must check the // validity of it in all the documents of this batch and this is when we return `Some`. 
let geo_field_id = match documents_batch_index.id(RESERVED_GEO_FIELD_NAME) { - Some(geo_field_id) - if index.sortable_fields(rtxn)?.contains(RESERVED_GEO_FIELD_NAME) - || index.filterable_fields(rtxn)?.contains(RESERVED_GEO_FIELD_NAME) => - { - Some(geo_field_id) - } + Some(geo_field_id) if index.is_geo_enabled(rtxn)? => Some(geo_field_id), _otherwise => None, }; diff --git a/crates/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/crates/milli/src/update/index_documents/extract/extract_docid_word_positions.rs index 606ae6b54..d502e69cc 100644 --- a/crates/milli/src/update/index_documents/extract/extract_docid_word_positions.rs +++ b/crates/milli/src/update/index_documents/extract/extract_docid_word_positions.rs @@ -150,9 +150,14 @@ fn searchable_fields_changed( obkv: &KvReader, settings_diff: &InnerIndexSettingsDiff, ) -> bool { - let searchable_fields = &settings_diff.new.searchable_fields_ids; for (field_id, field_bytes) in obkv.iter() { - if searchable_fields.contains(&field_id) { + let Some(metadata) = settings_diff.new.fields_ids_map.metadata(field_id) else { + // If the field id is not in the fields ids map, skip it. + // This happens for the vectors sub-fields. for example: + // "_vectors": { "manual": [1, 2, 3]} -> "_vectors.manual" is not registered. + continue; + }; + if metadata.is_searchable() { let del_add = KvReaderDelAdd::from_slice(field_bytes); match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) { // if both fields are None, check the next field. @@ -200,8 +205,14 @@ fn tokens_from_document<'a>( buffers.obkv_buffer.clear(); let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer); for (field_id, field_bytes) in obkv.iter() { + let Some(metadata) = settings.fields_ids_map.metadata(field_id) else { + // If the field id is not in the fields ids map, skip it. + // This happens for the vectors sub-fields. for example: + // "_vectors": { "manual": [1, 2, 3]} -> "_vectors.manual" is not registered. + continue; + }; // if field is searchable. - if settings.searchable_fields_ids.contains(&field_id) { + if metadata.is_searchable() { // extract deletion or addition only. if let Some(field_bytes) = KvReaderDelAdd::from_slice(field_bytes).get(del_add) { // parse json. @@ -216,7 +227,7 @@ fn tokens_from_document<'a>( buffers.field_buffer.clear(); if let Some(field) = json_to_string(&value, &mut buffers.field_buffer) { // create an iterator of token with their positions. 
- let locales = settings.localized_searchable_fields_ids.locales(field_id); + let locales = metadata.locales(&settings.localized_attributes_rules); let tokens = process_tokens(tokenizer.tokenize_with_allow_list(field, locales)) .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes); diff --git a/crates/milli/src/update/index_documents/extract/extract_facet_string_docids.rs b/crates/milli/src/update/index_documents/extract/extract_facet_string_docids.rs index d330ea5a0..994125c50 100644 --- a/crates/milli/src/update/index_documents/extract/extract_facet_string_docids.rs +++ b/crates/milli/src/update/index_documents/extract/extract_facet_string_docids.rs @@ -12,12 +12,11 @@ use heed::BytesEncode; use super::helpers::{create_sorter, sorter_into_reader, try_split_array_at, GrenadParameters}; use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec}; use crate::heed_codec::{BEU16StrCodec, StrRefCodec}; -use crate::localized_attributes_rules::LocalizedFieldIds; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::index_documents::helpers::{ MergeDeladdBtreesetString, MergeDeladdCboRoaringBitmaps, }; -use crate::update::settings::InnerIndexSettingsDiff; +use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff}; use crate::{FieldId, Result, MAX_FACET_VALUE_LENGTH}; /// Extracts the facet string and the documents ids where this facet string appear. @@ -33,13 +32,10 @@ pub fn extract_facet_string_docids( if settings_diff.settings_update_only() { extract_facet_string_docids_settings(docid_fid_facet_string, indexer, settings_diff) } else { - let localized_field_ids = &settings_diff.new.localized_faceted_fields_ids; - let facet_search = settings_diff.new.facet_search; extract_facet_string_docids_document_update( docid_fid_facet_string, indexer, - localized_field_ids, - facet_search, + &settings_diff.new, ) } } @@ -52,8 +48,7 @@ pub fn extract_facet_string_docids( fn extract_facet_string_docids_document_update( docid_fid_facet_string: grenad::Reader, indexer: GrenadParameters, - localized_field_ids: &LocalizedFieldIds, - facet_search: bool, + settings: &InnerIndexSettings, ) -> Result<(grenad::Reader>, grenad::Reader>)> { let max_memory = indexer.max_memory_by_thread(); @@ -92,6 +87,14 @@ fn extract_facet_string_docids_document_update( let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); let field_id = FieldId::from_be_bytes(field_id_bytes); + let Some(metadata) = settings.fields_ids_map.metadata(field_id) else { + unreachable!("metadata not found for field_id: {}", field_id) + }; + + if !metadata.is_faceted(&settings.filterable_attributes_rules) { + continue; + } + let (document_id_bytes, normalized_value_bytes) = try_split_array_at::<_, 4>(bytes).unwrap(); let document_id = u32::from_be_bytes(document_id_bytes); @@ -99,8 +102,10 @@ fn extract_facet_string_docids_document_update( let normalized_value = str::from_utf8(normalized_value_bytes)?; // Facet search normalization - if facet_search { - let locales = localized_field_ids.locales(field_id); + let features = + metadata.filterable_attributes_features(&settings.filterable_attributes_rules); + if features.is_facet_searchable() { + let locales = metadata.locales(&settings.localized_attributes_rules); let hyper_normalized_value = normalize_facet_string(normalized_value, locales); let set = BTreeSet::from_iter(std::iter::once(normalized_value)); @@ -178,8 +183,15 @@ fn extract_facet_string_docids_settings( let (field_id_bytes, bytes) = try_split_array_at(key).unwrap(); let 
field_id = FieldId::from_be_bytes(field_id_bytes); - let old_locales = settings_diff.old.localized_faceted_fields_ids.locales(field_id); - let new_locales = settings_diff.new.localized_faceted_fields_ids.locales(field_id); + let Some(old_metadata) = settings_diff.old.fields_ids_map.metadata(field_id) else { + unreachable!("old metadata not found for field_id: {}", field_id) + }; + let Some(new_metadata) = settings_diff.new.fields_ids_map.metadata(field_id) else { + unreachable!("new metadata not found for field_id: {}", field_id) + }; + + let old_locales = old_metadata.locales(&settings_diff.old.localized_attributes_rules); + let new_locales = new_metadata.locales(&settings_diff.new.localized_attributes_rules); let are_same_locales = old_locales == new_locales; let reindex_facet_search = @@ -197,10 +209,15 @@ fn extract_facet_string_docids_settings( // Facet search normalization if settings_diff.new.facet_search { + let new_filterable_features = new_metadata + .filterable_attributes_features(&settings_diff.new.filterable_attributes_rules); let new_hyper_normalized_value = normalize_facet_string(normalized_value, new_locales); let old_hyper_normalized_value; + let old_filterable_features = old_metadata + .filterable_attributes_features(&settings_diff.old.filterable_attributes_rules); let old_hyper_normalized_value = if !settings_diff.old.facet_search || deladd_reader.get(DelAdd::Deletion).is_none() + || !old_filterable_features.is_facet_searchable() { // if the facet search is disabled in the old settings or if no facet string is deleted, // we don't need to normalize the facet string. @@ -215,7 +232,9 @@ fn extract_facet_string_docids_settings( let set = BTreeSet::from_iter(std::iter::once(normalized_value)); // if the facet string is the same, we can put the deletion and addition in the same obkv. - if old_hyper_normalized_value == Some(&new_hyper_normalized_value) { + if old_hyper_normalized_value == Some(&new_hyper_normalized_value) + && new_filterable_features.is_facet_searchable() + { // nothing to do if we delete and re-add the value. 
if is_same_value { continue; @@ -249,7 +268,9 @@ fn extract_facet_string_docids_settings( } // addition - if deladd_reader.get(DelAdd::Addition).is_some() { + if new_filterable_features.is_facet_searchable() + && deladd_reader.get(DelAdd::Addition).is_some() + { // insert new value let val = SerdeJson::bytes_encode(&set).map_err(heed::Error::Encoding)?; buffer.clear(); diff --git a/crates/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs b/crates/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs index 88c02fe70..de87c5a7c 100644 --- a/crates/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs +++ b/crates/milli/src/update/index_documents/extract/extract_fid_docid_facet_values.rs @@ -76,9 +76,9 @@ pub fn extract_fid_docid_facet_values( let mut strings_key_buffer = Vec::new(); let old_faceted_fids: BTreeSet<_> = - settings_diff.old.faceted_fields_ids.iter().copied().collect(); + settings_diff.list_faceted_fields_from_fid_map(DelAdd::Deletion); let new_faceted_fids: BTreeSet<_> = - settings_diff.new.faceted_fields_ids.iter().copied().collect(); + settings_diff.list_faceted_fields_from_fid_map(DelAdd::Addition); if !settings_diff.settings_update_only || settings_diff.reindex_facets() { let mut cursor = obkv_documents.into_cursor()?; diff --git a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs index 9103e8324..560b73834 100644 --- a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -15,8 +15,9 @@ use serde_json::Value; use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; use crate::constants::RESERVED_VECTORS_FIELD_NAME; use crate::error::FaultSource; +use crate::fields_ids_map::metadata::FieldIdMapWithMetadata; use crate::index::IndexEmbeddingConfig; -use crate::prompt::{FieldsIdsMapWithMetadata, Prompt}; +use crate::prompt::Prompt; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::settings::InnerIndexSettingsDiff; use crate::vector::error::{EmbedErrorKind, PossibleEmbeddingMistakes, UnusedVectorsDistribution}; @@ -190,12 +191,8 @@ pub fn extract_vector_points( let reindex_vectors = settings_diff.reindex_vectors(); let old_fields_ids_map = &settings_diff.old.fields_ids_map; - let old_fields_ids_map = - FieldsIdsMapWithMetadata::new(old_fields_ids_map, &settings_diff.old.searchable_fields_ids); let new_fields_ids_map = &settings_diff.new.fields_ids_map; - let new_fields_ids_map = - FieldsIdsMapWithMetadata::new(new_fields_ids_map, &settings_diff.new.searchable_fields_ids); // the vector field id may have changed let old_vectors_fid = old_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME); @@ -383,7 +380,7 @@ pub fn extract_vector_points( ); continue; } - regenerate_prompt(obkv, prompt, &new_fields_ids_map)? + regenerate_prompt(obkv, prompt, new_fields_ids_map)? } }, // prompt regeneration is only triggered for existing embedders @@ -400,7 +397,7 @@ pub fn extract_vector_points( regenerate_if_prompt_changed( obkv, (old_prompt, prompt), - (&old_fields_ids_map, &new_fields_ids_map), + (old_fields_ids_map, new_fields_ids_map), )? 
} else { // we can simply ignore user provided vectors as they are not regenerated and are @@ -416,7 +413,7 @@ pub fn extract_vector_points( prompt, (add_to_user_provided, remove_from_user_provided), (old, new), - (&old_fields_ids_map, &new_fields_ids_map), + (old_fields_ids_map, new_fields_ids_map), document_id, embedder_name, embedder_is_manual, @@ -486,10 +483,7 @@ fn extract_vector_document_diff( prompt: &Prompt, (add_to_user_provided, remove_from_user_provided): (&mut RoaringBitmap, &mut RoaringBitmap), (old, new): (VectorState, VectorState), - (old_fields_ids_map, new_fields_ids_map): ( - &FieldsIdsMapWithMetadata, - &FieldsIdsMapWithMetadata, - ), + (old_fields_ids_map, new_fields_ids_map): (&FieldIdMapWithMetadata, &FieldIdMapWithMetadata), document_id: impl Fn() -> Value, embedder_name: &str, embedder_is_manual: bool, @@ -611,10 +605,7 @@ fn extract_vector_document_diff( fn regenerate_if_prompt_changed( obkv: &obkv::KvReader, (old_prompt, new_prompt): (&Prompt, &Prompt), - (old_fields_ids_map, new_fields_ids_map): ( - &FieldsIdsMapWithMetadata, - &FieldsIdsMapWithMetadata, - ), + (old_fields_ids_map, new_fields_ids_map): (&FieldIdMapWithMetadata, &FieldIdMapWithMetadata), ) -> Result { let old_prompt = old_prompt .render_kvdeladd(obkv, DelAdd::Deletion, old_fields_ids_map) @@ -630,7 +621,7 @@ fn regenerate_if_prompt_changed( fn regenerate_prompt( obkv: &obkv::KvReader, prompt: &Prompt, - new_fields_ids_map: &FieldsIdsMapWithMetadata, + new_fields_ids_map: &FieldIdMapWithMetadata, ) -> Result { let prompt = prompt.render_kvdeladd(obkv, DelAdd::Addition, new_fields_ids_map)?; diff --git a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs index 86f2ed4af..19ab1ff34 100644 --- a/crates/milli/src/update/index_documents/mod.rs +++ b/crates/milli/src/update/index_documents/mod.rs @@ -26,6 +26,7 @@ use typed_chunk::{write_typed_chunk_into_index, ChunkAccumulator, TypedChunk}; pub use self::enrich::{extract_finite_float_from_value, DocumentId}; pub use self::helpers::*; pub use self::transform::{Transform, TransformOutput}; +use super::facet::clear_facet_levels_based_on_settings_diff; use super::new::StdResult; use crate::documents::{obkv_to_object, DocumentsBatchReader}; use crate::error::{Error, InternalError}; @@ -215,9 +216,8 @@ where flattened_documents, } = output; - // update the internal facet and searchable list, + // update the searchable list, // because they might have changed due to the nested documents flattening. - settings_diff.new.recompute_facets(self.wtxn, self.index)?; settings_diff.new.recompute_searchables(self.wtxn, self.index)?; let settings_diff = Arc::new(settings_diff); @@ -465,6 +465,11 @@ where } } + // If the settings are only being updated, we may have to clear some of the facet levels. 
+ if settings_diff.settings_update_only() { + clear_facet_levels_based_on_settings_diff(self.wtxn, self.index, &settings_diff)?; + } + Ok(()) }).map_err(InternalError::from)??; @@ -765,18 +770,19 @@ mod tests { use bumpalo::Bump; use fst::IntoStreamer; use heed::RwTxn; - use maplit::hashset; + use maplit::{btreeset, hashset}; use super::*; use crate::constants::RESERVED_GEO_FIELD_NAME; use crate::documents::mmap_from_objects; + use crate::filterable_attributes_rules::filtered_matching_field_names; use crate::index::tests::TempIndex; use crate::index::IndexEmbeddingConfig; use crate::progress::Progress; use crate::search::TermsMatchingStrategy; use crate::update::new::indexer; use crate::update::Setting; - use crate::{all_obkv_to_json, db_snap, Filter, Search, UserError}; + use crate::{all_obkv_to_json, db_snap, Filter, FilterableAttributesRule, Search, UserError}; #[test] fn simple_document_replacement() { @@ -1006,7 +1012,9 @@ mod tests { index .update_settings(|settings| { - settings.set_filterable_fields(hashset!(S(RESERVED_GEO_FIELD_NAME))); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + RESERVED_GEO_FIELD_NAME.to_string(), + )]); }) .unwrap(); } @@ -1018,7 +1026,9 @@ mod tests { index .update_settings(|settings| { - settings.set_filterable_fields(hashset!(S(RESERVED_GEO_FIELD_NAME))); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + RESERVED_GEO_FIELD_NAME.to_string(), + )]); }) .unwrap(); @@ -1234,15 +1244,24 @@ mod tests { let searchable_fields = vec![S("title"), S("nested.object"), S("nested.machin")]; settings.set_searchable_fields(searchable_fields); - let faceted_fields = hashset!(S("title"), S("nested.object"), S("nested.machin")); + let faceted_fields = vec![ + FilterableAttributesRule::Field("title".to_string()), + FilterableAttributesRule::Field("nested.object".to_string()), + FilterableAttributesRule::Field("nested.machin".to_string()), + ]; settings.set_filterable_fields(faceted_fields); }) .unwrap(); let rtxn = index.read_txn().unwrap(); - let facets = index.faceted_fields(&rtxn).unwrap(); - assert_eq!(facets, hashset!(S("title"), S("nested.object"), S("nested.machin"))); + let filterable_fields = index.filterable_attributes_rules(&rtxn).unwrap(); + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let facets = + filtered_matching_field_names(&filterable_fields, &fields_ids_map, &|features| { + features.is_filterable() + }); + assert_eq!(facets, btreeset!("title", "nested.object", "nested.machin")); // testing the simple query search let mut search = crate::Search::new(&rtxn, &index); @@ -1438,7 +1457,9 @@ mod tests { index .update_settings(|settings| { - settings.set_filterable_fields(hashset!(String::from("dog"))); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + "dog".to_string(), + )]); }) .unwrap(); @@ -1457,9 +1478,14 @@ mod tests { let rtxn = index.read_txn().unwrap(); - let hidden = index.faceted_fields(&rtxn).unwrap(); + let filterable_fields = index.filterable_attributes_rules(&rtxn).unwrap(); + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let facets = + filtered_matching_field_names(&filterable_fields, &fields_ids_map, &|features| { + features.is_filterable() + }); - assert_eq!(hidden, hashset!(S("dog"), S("dog.race"), S("dog.race.bernese mountain"))); + assert_eq!(facets, btreeset!("dog", "dog.race", "dog.race.bernese mountain")); for (s, i) in [("zeroth", 0), ("first", 1), ("second", 2), ("third", 3)] { let mut search = crate::Search::new(&rtxn, &index); @@ 
-1480,9 +1506,14 @@ mod tests { let rtxn = index.read_txn().unwrap(); - let facets = index.faceted_fields(&rtxn).unwrap(); + let filterable_fields = index.filterable_attributes_rules(&rtxn).unwrap(); + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let facets = + filtered_matching_field_names(&filterable_fields, &fields_ids_map, &|features| { + features.is_filterable() + }); - assert_eq!(facets, hashset!()); + assert_eq!(facets, btreeset!()); // update the settings to test the sortable index @@ -1506,10 +1537,6 @@ mod tests { let rtxn = index.read_txn().unwrap(); - let facets = index.faceted_fields(&rtxn).unwrap(); - - assert_eq!(facets, hashset!(S("dog.race"), S("dog.race.bernese mountain"))); - let mut search = crate::Search::new(&rtxn, &index); search.sort_criteria(vec![crate::AscDesc::Asc(crate::Member::Field(S( "dog.race.bernese mountain", @@ -1717,8 +1744,13 @@ mod tests { let check_ok = |index: &Index| { let rtxn = index.read_txn().unwrap(); - let facets = index.faceted_fields(&rtxn).unwrap(); - assert_eq!(facets, hashset!(S("colour"), S("colour.green"), S("colour.green.blue"))); + let filterable_fields = index.filterable_attributes_rules(&rtxn).unwrap(); + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let facets = + filtered_matching_field_names(&filterable_fields, &fields_ids_map, &|features| { + features.is_filterable() + }); + assert_eq!(facets, btreeset!("colour", "colour.green", "colour.green.blue")); let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap(); let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); @@ -1738,7 +1770,7 @@ mod tests { assert_eq!(bitmap_colour_blue.into_iter().collect::>(), vec![7]); }; - let faceted_fields = hashset!(S("colour")); + let faceted_fields = vec![FilterableAttributesRule::Field("colour".to_string())]; let index = TempIndex::new(); index.add_documents(content()).unwrap(); @@ -1823,8 +1855,13 @@ mod tests { let check_ok = |index: &Index| { let rtxn = index.read_txn().unwrap(); - let facets = index.faceted_fields(&rtxn).unwrap(); - assert_eq!(facets, hashset!(S("colour"), S("colour.green"), S("colour.green.blue"))); + let filterable_fields = index.filterable_attributes_rules(&rtxn).unwrap(); + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let facets = + filtered_matching_field_names(&filterable_fields, &fields_ids_map, &|features| { + features.is_filterable() + }); + assert_eq!(facets, btreeset!("colour", "colour.green", "colour.green.blue")); let colour_id = index.fields_ids_map(&rtxn).unwrap().id("colour").unwrap(); let colour_green_id = index.fields_ids_map(&rtxn).unwrap().id("colour.green").unwrap(); @@ -1844,7 +1881,7 @@ mod tests { assert_eq!(bitmap_colour_blue.into_iter().collect::>(), vec![3]); }; - let faceted_fields = hashset!(S("colour")); + let faceted_fields = vec![FilterableAttributesRule::Field("colour".to_string())]; let index = TempIndex::new(); index.add_documents(content()).unwrap(); @@ -1887,8 +1924,13 @@ mod tests { let check_ok = |index: &Index| { let rtxn = index.read_txn().unwrap(); - let facets = index.faceted_fields(&rtxn).unwrap(); - assert_eq!(facets, hashset!(S("tags"), S("tags.green"), S("tags.green.blue"))); + let filterable_fields = index.filterable_attributes_rules(&rtxn).unwrap(); + let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let facets = + filtered_matching_field_names(&filterable_fields, &fields_ids_map, &|features| { + features.is_filterable() + }); + assert_eq!(facets, btreeset!("tags", 
"tags.green", "tags.green.blue")); let tags_id = index.fields_ids_map(&rtxn).unwrap().id("tags").unwrap(); let tags_green_id = index.fields_ids_map(&rtxn).unwrap().id("tags.green").unwrap(); @@ -1907,7 +1949,7 @@ mod tests { assert_eq!(bitmap_tags_blue.into_iter().collect::>(), vec![12]); }; - let faceted_fields = hashset!(S("tags")); + let faceted_fields = vec![FilterableAttributesRule::Field("tags".to_string())]; let index = TempIndex::new(); index.add_documents(content()).unwrap(); @@ -2259,7 +2301,9 @@ mod tests { index .update_settings(|settings| { - settings.set_filterable_fields(hashset! { S("title") }); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + "title".to_string(), + )]); }) .unwrap(); @@ -3115,7 +3159,10 @@ mod tests { index .update_settings_using_wtxn(&mut wtxn, |settings| { settings.set_primary_key(S("docid")); - settings.set_filterable_fields(hashset! { S("label"), S("label2") }); + settings.set_filterable_fields(vec![ + FilterableAttributesRule::Field("label".to_string()), + FilterableAttributesRule::Field("label2".to_string()), + ]); }) .unwrap(); wtxn.commit().unwrap(); @@ -3294,7 +3341,9 @@ mod tests { index .update_settings_using_wtxn(&mut wtxn, |settings| { settings.set_primary_key(S("id")); - settings.set_filterable_fields(hashset!(S(RESERVED_GEO_FIELD_NAME))); + settings.set_filterable_fields(vec![FilterableAttributesRule::Field( + RESERVED_GEO_FIELD_NAME.to_string(), + )]); settings.set_sortable_fields(hashset!(S(RESERVED_GEO_FIELD_NAME))); }) .unwrap(); diff --git a/crates/milli/src/update/index_documents/transform.rs b/crates/milli/src/update/index_documents/transform.rs index d87524a34..b2ee21cbf 100644 --- a/crates/milli/src/update/index_documents/transform.rs +++ b/crates/milli/src/update/index_documents/transform.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use std::collections::btree_map::Entry as BEntry; use std::collections::hash_map::Entry as HEntry; -use std::collections::{BTreeMap, HashMap, HashSet}; +use std::collections::{BTreeMap, HashMap}; use std::fs::File; use std::io::{Read, Seek}; @@ -18,8 +18,10 @@ use super::helpers::{ ObkvsMergeAdditionsAndDeletions, }; use super::{create_writer, IndexDocumentsMethod, IndexerConfig, KeepFirst}; +use crate::attribute_patterns::PatternMatch; use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; +use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; use crate::index::{db_name, main_key}; use crate::update::del_add::{ into_del_add_obkv, into_del_add_obkv_conditional_operation, DelAdd, DelAddOperation, @@ -31,9 +33,7 @@ use crate::update::{AvailableIds, UpdateIndexingStep}; use crate::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; use crate::vector::settings::WriteBackToDocuments; use crate::vector::ArroyWrapper; -use crate::{ - is_faceted_by, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, -}; +use crate::{FieldDistribution, FieldId, FieldIdMapMissingEntry, Index, Result}; pub struct TransformOutput { pub primary_key: String, @@ -52,7 +52,7 @@ pub struct TransformOutput { /// containing all those documents. 
pub struct Transform<'a, 'i> { pub index: &'i Index, - fields_ids_map: FieldsIdsMap, + fields_ids_map: FieldIdMapWithMetadata, indexer_settings: &'a IndexerConfig, pub index_documents_method: IndexDocumentsMethod, @@ -84,7 +84,7 @@ pub enum Operation { /// /// If new fields are present in the addition, they are added to the index field ids map. fn create_fields_mapping( - index_field_map: &mut FieldsIdsMap, + index_field_map: &mut FieldIdMapWithMetadata, batch_field_map: &DocumentsBatchIndex, ) -> Result<HashMap<FieldId, FieldId>> { batch_field_map @@ -141,10 +141,13 @@ impl<'a, 'i> Transform<'a, 'i> { true, ); let documents_ids = index.documents_ids(wtxn)?; + let fields_ids_map = index.fields_ids_map(wtxn)?; + let builder = MetadataBuilder::from_index(index, wtxn)?; + let fields_ids_map = FieldIdMapWithMetadata::new(fields_ids_map, builder); Ok(Transform { index, - fields_ids_map: index.fields_ids_map(wtxn)?, + fields_ids_map, indexer_settings, available_documents_ids: AvailableIds::new(&documents_ids), original_sorter, @@ -354,7 +357,7 @@ impl<'a, 'i> Transform<'a, 'i> { documents_seen: documents_count, }); - self.index.put_fields_ids_map(wtxn, &self.fields_ids_map)?; + self.index.put_fields_ids_map(wtxn, self.fields_ids_map.as_fields_ids_map())?; self.index.put_primary_key(wtxn, &primary_key)?; self.documents_count += documents_count; // Now that we have a valid sorter that contains the user id and the obkv we @@ -371,7 +374,7 @@ impl<'a, 'i> Transform<'a, 'i> { )] fn flatten_from_fields_ids_map( obkv: &KvReader<FieldId>, - fields_ids_map: &mut FieldsIdsMap, + fields_ids_map: &mut FieldIdMapWithMetadata, ) -> Result<Option<Vec<u8>>> { if obkv .iter() @@ -657,7 +660,6 @@ impl<'a, 'i> Transform<'a, 'i> { fn rebind_existing_document( old_obkv: &KvReader<FieldId>, settings_diff: &InnerIndexSettingsDiff, - modified_faceted_fields: &HashSet<String>, mut injected_vectors: serde_json::Map<String, Value>, old_vectors_fid: Option<FieldId>, original_obkv_buffer: Option<&mut Vec<u8>>, @@ -667,23 +669,26 @@ impl<'a, 'i> Transform<'a, 'i> { let is_primary_key = |id: FieldId| -> bool { settings_diff.primary_key_id == Some(id) }; // If only a faceted field has been added, keep only this field. - let global_facet_settings_changed = settings_diff.global_facet_settings_changed(); let facet_fids_changed = settings_diff.facet_fids_changed(); - let necessary_faceted_field = - |id: FieldId| -> bool { + + let necessary_faceted_field = |id: FieldId| -> Option<DelAddOperation> { + if facet_fids_changed { let field_name = settings_diff.new.fields_ids_map.name(id).unwrap(); - if global_facet_settings_changed { - settings_diff.new.user_defined_faceted_fields.iter().any(|long| { - is_faceted_by(long, field_name) || is_faceted_by(field_name, long) - }) - } else if facet_fids_changed { - modified_faceted_fields.iter().any(|long| { - is_faceted_by(long, field_name) || is_faceted_by(field_name, long) - }) - } else { - false + // if the faceted fields changed, we need to keep all the field that are + // faceted in the old or new settings. + match ( + settings_diff.old.match_faceted_field(field_name), + settings_diff.new.match_faceted_field(field_name), + ) { + (PatternMatch::NoMatch, PatternMatch::NoMatch) => None, + (PatternMatch::NoMatch, _) => Some(DelAddOperation::Addition), + (_, PatternMatch::NoMatch) => Some(DelAddOperation::Deletion), + (_, _) => Some(DelAddOperation::DeletionAndAddition), } - }; + } else { + None + } + }; // Alway provide all fields when vectors are involved because // we need the fields for the prompt/templating.
@@ -734,12 +739,22 @@ impl<'a, 'i> Transform<'a, 'i> { } } - if is_primary_key(id) || necessary_faceted_field(id) || reindex_vectors { + if is_primary_key(id) || reindex_vectors { operations.insert(id, DelAddOperation::DeletionAndAddition); obkv_writer.insert(id, val)?; - } else if let Some(operation) = settings_diff.reindex_searchable_id(id) { - operations.insert(id, operation); - obkv_writer.insert(id, val)?; + } else { + let facet_operation = necessary_faceted_field(id); + let searchable_operation = settings_diff.reindex_searchable_id(id); + let operation = facet_operation + // TODO: replace `zip.map` with `zip_with` once stable + .zip(searchable_operation) + .map(|(op1, op2)| op1.merge(op2)) + .or(facet_operation) + .or(searchable_operation); + if let Some(operation) = operation { + operations.insert(id, operation); + obkv_writer.insert(id, val)?; + } } } if !injected_vectors.is_empty() { @@ -856,7 +871,6 @@ impl<'a, 'i> Transform<'a, 'i> { }; if original_sorter.is_some() || flattened_sorter.is_some() { - let modified_faceted_fields = settings_diff.modified_faceted_fields(); let mut original_obkv_buffer = Vec::new(); let mut flattened_obkv_buffer = Vec::new(); let mut document_sorter_key_buffer = Vec::new(); @@ -897,7 +911,6 @@ impl<'a, 'i> Transform<'a, 'i> { Self::rebind_existing_document( old_obkv, &settings_diff, - &modified_faceted_fields, injected_vectors, old_vectors_fid, Some(&mut original_obkv_buffer).filter(|_| original_sorter.is_some()), diff --git a/crates/milli/src/update/index_documents/typed_chunk.rs b/crates/milli/src/update/index_documents/typed_chunk.rs index 0809d9601..10dbdc834 100644 --- a/crates/milli/src/update/index_documents/typed_chunk.rs +++ b/crates/milli/src/update/index_documents/typed_chunk.rs @@ -365,7 +365,7 @@ pub(crate) fn write_typed_chunk_into_index( let merger = builder.build(); let indexer = FacetsUpdate::new(index, FacetType::Number, merger, None, data_size); - indexer.execute(wtxn)?; + indexer.execute(wtxn, &settings_diff.new)?; is_merged_database = true; } TypedChunk::FieldIdFacetStringDocids(_) => { @@ -401,7 +401,7 @@ pub(crate) fn write_typed_chunk_into_index( Some(normalized_facet_id_string_merger), data_size, ); - indexer.execute(wtxn)?; + indexer.execute(wtxn, &settings_diff.new)?; is_merged_database = true; } TypedChunk::FieldIdFacetExistsDocids(_) => { diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs index 0d0648fc8..d38fdf138 100644 --- a/crates/milli/src/update/settings.rs +++ b/crates/milli/src/update/settings.rs @@ -6,17 +6,20 @@ use std::sync::Arc; use charabia::{Normalize, Tokenizer, TokenizerBuilder}; use deserr::{DeserializeError, Deserr}; -use itertools::{EitherOrBoth, Itertools}; +use itertools::{merge_join_by, EitherOrBoth, Itertools}; use roaring::RoaringBitmap; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use time::OffsetDateTime; -use super::del_add::DelAddOperation; +use super::del_add::{DelAdd, DelAddOperation}; use super::index_documents::{IndexDocumentsConfig, Transform}; use super::IndexerConfig; -use crate::constants::{RESERVED_GEO_FIELD_NAME, RESERVED_VECTORS_FIELD_NAME}; +use crate::attribute_patterns::PatternMatch; +use crate::constants::RESERVED_GEO_FIELD_NAME; use crate::criterion::Criterion; use crate::error::UserError; +use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; +use crate::filterable_attributes_rules::match_faceted_field; use crate::index::{ IndexEmbeddingConfig, PrefixSearch, DEFAULT_MIN_WORD_LEN_ONE_TYPO, 
DEFAULT_MIN_WORD_LEN_TWO_TYPOS, @@ -31,7 +34,7 @@ use crate::vector::settings::{ WriteBackToDocuments, }; use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs}; -use crate::{FieldId, FieldsIdsMap, Index, LocalizedAttributesRule, LocalizedFieldIds, Result}; +use crate::{FieldId, FilterableAttributesRule, Index, LocalizedAttributesRule, Result}; #[derive(Debug, Clone, PartialEq, Eq, Copy)] pub enum Setting { @@ -155,7 +158,7 @@ pub struct Settings<'a, 't, 'i> { searchable_fields: Setting>, displayed_fields: Setting>, - filterable_fields: Setting>, + filterable_fields: Setting>, sortable_fields: Setting>, criteria: Setting>, stop_words: Setting>, @@ -241,8 +244,8 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { self.filterable_fields = Setting::Reset; } - pub fn set_filterable_fields(&mut self, names: HashSet) { - self.filterable_fields = Setting::Set(names); + pub fn set_filterable_fields(&mut self, rules: Vec) { + self.filterable_fields = Setting::Set(rules); } pub fn set_sortable_fields(&mut self, names: HashSet) { @@ -516,7 +519,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { } /// Updates the index's searchable attributes. - fn update_searchable(&mut self) -> Result { + fn update_user_defined_searchable_attributes(&mut self) -> Result { match self.searchable_fields { Setting::Set(ref fields) => { // Check to see if the searchable fields changed before doing anything else @@ -529,26 +532,10 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { return Ok(false); } - // Since we're updating the settings we can only add new fields at the end of the field id map - let mut fields_ids_map = self.index.fields_ids_map(self.wtxn)?; // fields are deduplicated, only the first occurrence is taken into account let names = fields.iter().unique().map(String::as_str).collect::>(); - // Add all the searchable attributes to the field map, and then add the - // remaining fields from the old field map to the new one - for name in names.iter() { - // The fields ids map won't change the field id of already present elements thus only the - // new fields will be inserted. 
- fields_ids_map.insert(name).ok_or(UserError::AttributeLimitReached)?; - } - - self.index.put_all_searchable_fields_from_fields_ids_map( - self.wtxn, - &names, - &fields_ids_map.nested_ids(RESERVED_VECTORS_FIELD_NAME), - &fields_ids_map, - )?; - self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; + self.index.put_user_defined_searchable_fields(self.wtxn, &names)?; Ok(true) } Setting::Reset => Ok(self.index.delete_all_searchable_fields(self.wtxn)?), @@ -760,14 +747,10 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { fn update_filterable(&mut self) -> Result<()> { match self.filterable_fields { Setting::Set(ref fields) => { - let mut new_facets = HashSet::new(); - for name in fields { - new_facets.insert(name.clone()); - } - self.index.put_filterable_fields(self.wtxn, &new_facets)?; + self.index.put_filterable_attributes_rules(self.wtxn, fields)?; } Setting::Reset => { - self.index.delete_filterable_fields(self.wtxn)?; + self.index.delete_filterable_attributes_rules(self.wtxn)?; } Setting::NotSet => (), } @@ -1257,7 +1240,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { self.update_separator_tokens()?; self.update_dictionary()?; self.update_synonyms()?; - self.update_searchable()?; + self.update_user_defined_searchable_attributes()?; self.update_exact_attributes()?; self.update_proximity_precision()?; self.update_prefix_search()?; @@ -1267,7 +1250,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { let embedding_config_updates = self.update_embedding_configs()?; let mut new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn, None)?; - new_inner_settings.recompute_facets(self.wtxn, self.index)?; + new_inner_settings.recompute_searchables(self.wtxn, self.index)?; let primary_key_id = self .index @@ -1319,8 +1302,8 @@ impl InnerIndexSettingsDiff { settings_update_only: bool, ) -> Self { let only_additional_fields = match ( - &old_settings.user_defined_searchable_fields, - &new_settings.user_defined_searchable_fields, + &old_settings.user_defined_searchable_attributes, + &new_settings.user_defined_searchable_attributes, ) { (None, None) | (Some(_), None) | (None, Some(_)) => None, // None means * (Some(old), Some(new)) => { @@ -1342,14 +1325,14 @@ impl InnerIndexSettingsDiff { || old_settings.dictionary != new_settings.dictionary || old_settings.proximity_precision != new_settings.proximity_precision || old_settings.prefix_search != new_settings.prefix_search - || old_settings.localized_searchable_fields_ids - != new_settings.localized_searchable_fields_ids + || old_settings.localized_attributes_rules + != new_settings.localized_attributes_rules }; let cache_exact_attributes = old_settings.exact_attributes != new_settings.exact_attributes; - let cache_user_defined_searchables = old_settings.user_defined_searchable_fields - != new_settings.user_defined_searchable_fields; + let cache_user_defined_searchables = old_settings.user_defined_searchable_attributes + != new_settings.user_defined_searchable_attributes; // if the user-defined searchables changed, then we need to reindex prompts. if cache_user_defined_searchables { @@ -1432,30 +1415,70 @@ impl InnerIndexSettingsDiff { } } + /// List the faceted fields from the inner fid map. + /// This is used to list the faceted fields when we are reindexing, + /// but it can't be used in document addition because the field id map must be exhaustive. 
+ pub fn list_faceted_fields_from_fid_map(&self, del_add: DelAdd) -> BTreeSet { + let settings = match del_add { + DelAdd::Deletion => &self.old, + DelAdd::Addition => &self.new, + }; + + settings + .fields_ids_map + .iter_id_metadata() + .filter(|(_, metadata)| metadata.is_faceted(&settings.filterable_attributes_rules)) + .map(|(id, _)| id) + .collect() + } + pub fn facet_fids_changed(&self) -> bool { - let existing_fields = &self.new.existing_fields; - if existing_fields.iter().any(|field| field.contains('.')) { - return true; + for eob in merge_join_by( + self.old.fields_ids_map.iter().filter(|(_, _, metadata)| { + metadata.is_faceted(&self.old.filterable_attributes_rules) + }), + self.new.fields_ids_map.iter().filter(|(_, _, metadata)| { + metadata.is_faceted(&self.new.filterable_attributes_rules) + }), + |(old_fid, _, _), (new_fid, _, _)| old_fid.cmp(new_fid), + ) { + match eob { + // If there is a difference, we need to reindex facet databases. + EitherOrBoth::Left(_) | EitherOrBoth::Right(_) => return true, + // If the field is faceted in both old and new settings, we check the facet-searchable and facet level database. + EitherOrBoth::Both((_, _, old_metadata), (_, _, new_metadata)) => { + // Check if the field is facet-searchable in the old and new settings. + // If there is a difference, we need to reindex facet-search database. + let old_filterable_features = old_metadata + .filterable_attributes_features(&self.old.filterable_attributes_rules); + let new_filterable_features = new_metadata + .filterable_attributes_features(&self.new.filterable_attributes_rules); + let is_old_facet_searchable = + old_filterable_features.is_facet_searchable() && self.old.facet_search; + let is_new_facet_searchable = + new_filterable_features.is_facet_searchable() && self.new.facet_search; + if is_old_facet_searchable != is_new_facet_searchable { + return true; + } + + // Check if the field needs a facet level database in the old and new settings. + // If there is a difference, we need to reindex facet level databases. + let old_facet_level_database = old_metadata + .require_facet_level_database(&self.old.filterable_attributes_rules); + let new_facet_level_database = new_metadata + .require_facet_level_database(&self.new.filterable_attributes_rules); + if old_facet_level_database != new_facet_level_database { + return true; + } + } + } } - let old_faceted_fields = &self.old.user_defined_faceted_fields; - if old_faceted_fields.iter().any(|field| field.contains('.')) { - return true; - } - - // If there is new faceted fields we indicate that we must reindex as we must - // index new fields as facets. It means that the distinct attribute, - // an Asc/Desc criterion or a filtered attribute as be added or removed. 
- let new_faceted_fields = &self.new.user_defined_faceted_fields; - if new_faceted_fields.iter().any(|field| field.contains('.')) { - return true; - } - - (existing_fields - old_faceted_fields) != (existing_fields - new_faceted_fields) + false } pub fn global_facet_settings_changed(&self) -> bool { - self.old.localized_faceted_fields_ids != self.new.localized_faceted_fields_ids + self.old.localized_attributes_rules != self.new.localized_attributes_rules || self.old.facet_search != self.new.facet_search } @@ -1475,10 +1498,6 @@ impl InnerIndexSettingsDiff { self.old.geo_fields_ids != self.new.geo_fields_ids || (!self.settings_update_only && self.new.geo_fields_ids.is_some()) } - - pub fn modified_faceted_fields(&self) -> HashSet<String> { - &self.old.user_defined_faceted_fields ^ &self.new.user_defined_faceted_fields - } } #[derive(Clone)] @@ -1486,20 +1505,17 @@ pub(crate) struct InnerIndexSettings { pub stop_words: Option<fst::Set<Vec<u8>>>, pub allowed_separators: Option<BTreeSet<String>>, pub dictionary: Option<BTreeSet<String>>, - pub fields_ids_map: FieldsIdsMap, - pub user_defined_faceted_fields: HashSet<String>, - pub user_defined_searchable_fields: Option<Vec<String>>, - pub faceted_fields_ids: HashSet<FieldId>, - pub searchable_fields_ids: Vec<FieldId>, + pub fields_ids_map: FieldIdMapWithMetadata, + pub localized_attributes_rules: Vec<LocalizedAttributesRule>, + pub filterable_attributes_rules: Vec<FilterableAttributesRule>, + pub asc_desc_fields: HashSet<String>, + pub distinct_field: Option<String>, + pub user_defined_searchable_attributes: Option<Vec<String>>, + pub sortable_fields: HashSet<String>, pub exact_attributes: HashSet<FieldId>, pub proximity_precision: ProximityPrecision, pub embedding_configs: EmbeddingConfigs, - pub existing_fields: HashSet<String>, pub geo_fields_ids: Option<(FieldId, FieldId)>, - pub non_searchable_fields_ids: Vec<FieldId>, - pub non_faceted_fields_ids: Vec<FieldId>, - pub localized_searchable_fields_ids: LocalizedFieldIds, - pub localized_faceted_fields_ids: LocalizedFieldIds, pub prefix_search: PrefixSearch, pub facet_search: bool, } @@ -1515,12 +1531,6 @@ impl InnerIndexSettings { let allowed_separators = index.allowed_separators(rtxn)?; let dictionary = index.dictionary(rtxn)?; let mut fields_ids_map = index.fields_ids_map(rtxn)?; - let user_defined_searchable_fields = index.user_defined_searchable_fields(rtxn)?; - let user_defined_searchable_fields = - user_defined_searchable_fields.map(|sf| sf.into_iter().map(String::from).collect()); - let user_defined_faceted_fields = index.user_defined_faceted_fields(rtxn)?; - let mut searchable_fields_ids = index.searchable_fields_ids(rtxn)?; - let mut faceted_fields_ids = index.faceted_fields_ids(rtxn)?; let exact_attributes = index.exact_attributes_ids(rtxn)?; let proximity_precision = index.proximity_precision(rtxn)?.unwrap_or_default(); let embedding_configs = match embedding_configs { @@ -1529,87 +1539,57 @@ impl InnerIndexSettings { }; let prefix_search = index.prefix_search(rtxn)?.unwrap_or_default(); let facet_search = index.facet_search(rtxn)?; - let existing_fields: HashSet<_> = index - .field_distribution(rtxn)? - .into_iter() - .filter_map(|(field, count)| (count != 0).then_some(field)) - .collect(); - // index.fields_ids_map($a)? ==>> fields_ids_map let geo_fields_ids = match fields_ids_map.id(RESERVED_GEO_FIELD_NAME) { - Some(gfid) => { - let is_sortable = index.sortable_fields_ids(rtxn)?.contains(&gfid); - let is_filterable = index.filterable_fields_ids(rtxn)?.contains(&gfid); + Some(_) if index.is_geo_enabled(rtxn)?
=> { // if `_geo` is faceted then we get the `lat` and `lng` - if is_sortable || is_filterable { - let field_ids = fields_ids_map - .insert("_geo.lat") - .zip(fields_ids_map.insert("_geo.lng")) - .ok_or(UserError::AttributeLimitReached)?; - Some(field_ids) - } else { - None - } + let field_ids = fields_ids_map + .insert("_geo.lat") + .zip(fields_ids_map.insert("_geo.lng")) + .ok_or(UserError::AttributeLimitReached)?; + Some(field_ids) } - None => None, + _ => None, }; - let localized_attributes_rules = index.localized_attributes_rules(rtxn)?; - let localized_searchable_fields_ids = LocalizedFieldIds::new( - &localized_attributes_rules, - &fields_ids_map, - searchable_fields_ids.iter().cloned(), - ); - let localized_faceted_fields_ids = LocalizedFieldIds::new( - &localized_attributes_rules, - &fields_ids_map, - faceted_fields_ids.iter().cloned(), - ); - - let vectors_fids = fields_ids_map.nested_ids(RESERVED_VECTORS_FIELD_NAME); - searchable_fields_ids.retain(|id| !vectors_fids.contains(id)); - faceted_fields_ids.retain(|id| !vectors_fids.contains(id)); + let localized_attributes_rules = + index.localized_attributes_rules(rtxn)?.unwrap_or_default(); + let filterable_attributes_rules = index.filterable_attributes_rules(rtxn)?; + let sortable_fields = index.sortable_fields(rtxn)?; + let asc_desc_fields = index.asc_desc_fields(rtxn)?; + let distinct_field = index.distinct_field(rtxn)?.map(|f| f.to_string()); + let user_defined_searchable_attributes = index + .user_defined_searchable_fields(rtxn)? + .map(|fields| fields.into_iter().map(|f| f.to_string()).collect()); + let builder = MetadataBuilder::from_index(index, rtxn)?; + let fields_ids_map = FieldIdMapWithMetadata::new(fields_ids_map, builder); Ok(Self { stop_words, allowed_separators, dictionary, fields_ids_map, - user_defined_faceted_fields, - user_defined_searchable_fields, - faceted_fields_ids, - searchable_fields_ids, + localized_attributes_rules, + filterable_attributes_rules, + asc_desc_fields, + distinct_field, + user_defined_searchable_attributes, + sortable_fields, exact_attributes, proximity_precision, embedding_configs, - existing_fields, geo_fields_ids, - non_searchable_fields_ids: vectors_fids.clone(), - non_faceted_fields_ids: vectors_fids.clone(), - localized_searchable_fields_ids, - localized_faceted_fields_ids, prefix_search, facet_search, }) } - // find and insert the new field ids - pub fn recompute_facets(&mut self, wtxn: &mut heed::RwTxn<'_>, index: &Index) -> Result<()> { - let new_facets = self - .fields_ids_map - .iter() - .filter(|(fid, _field)| !self.non_faceted_fields_ids.contains(fid)) - .filter(|(_fid, field)| crate::is_faceted(field, &self.user_defined_faceted_fields)) - .map(|(_fid, field)| field.to_string()) - .collect(); - index.put_faceted_fields(wtxn, &new_facets)?; - - self.faceted_fields_ids = index.faceted_fields_ids(wtxn)?; - let localized_attributes_rules = index.localized_attributes_rules(wtxn)?; - self.localized_faceted_fields_ids = LocalizedFieldIds::new( - &localized_attributes_rules, - &self.fields_ids_map, - self.faceted_fields_ids.iter().cloned(), - ); - Ok(()) + pub fn match_faceted_field(&self, field: &str) -> PatternMatch { + match_faceted_field( + field, + &self.filterable_attributes_rules, + &self.sortable_fields, + &self.asc_desc_fields, + &self.distinct_field, + ) } // find and insert the new field ids @@ -1619,7 +1599,7 @@ impl InnerIndexSettings { index: &Index, ) -> Result<()> { let searchable_fields = self - .user_defined_searchable_fields + .user_defined_searchable_attributes 
.as_ref() .map(|searchable| searchable.iter().map(|s| s.as_str()).collect::<Vec<_>>()); @@ -1628,17 +1608,9 @@ impl InnerIndexSettings { index.put_all_searchable_fields_from_fields_ids_map( wtxn, &searchable_fields, - &self.non_searchable_fields_ids, &self.fields_ids_map, )?; } - self.searchable_fields_ids = index.searchable_fields_ids(wtxn)?; - let localized_attributes_rules = index.localized_attributes_rules(wtxn)?; - self.localized_searchable_fields_ids = LocalizedFieldIds::new( - &localized_attributes_rules, - &self.fields_ids_map, - self.searchable_fields_ids.iter().cloned(), - ); Ok(()) }
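
As a quick illustration of the merge semantics added to `DelAddOperation` and of how `rebind_existing_document` now combines the facet-driven and searchable-driven reindex operations, here is a minimal, self-contained sketch. The enum is re-declared locally for the example (it is not the real import); only the `merge` match and the `zip`/`or` combination pattern mirror what the patch does.

```rust
// Local stand-in for milli's `DelAddOperation`; only `merge` and the
// combination pattern below mirror the patch.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum DelAddOperation {
    Deletion,
    Addition,
    DeletionAndAddition,
}

impl DelAddOperation {
    /// Identical variants are kept; any mix widens to `DeletionAndAddition`.
    fn merge(self, other: Self) -> Self {
        match (self, other) {
            (Self::Deletion, Self::Deletion) => Self::Deletion,
            (Self::Addition, Self::Addition) => Self::Addition,
            _ => Self::DeletionAndAddition,
        }
    }
}

fn main() {
    // A field that became faceted (Addition) and must also be reindexed for
    // search (DeletionAndAddition) ends up needing both sides.
    let facet_operation = Some(DelAddOperation::Addition);
    let searchable_operation = Some(DelAddOperation::DeletionAndAddition);

    // Same combination as in `rebind_existing_document`: merge when both
    // sides apply, otherwise fall back to whichever side is present.
    let operation = facet_operation
        .zip(searchable_operation)
        .map(|(a, b)| a.merge(b))
        .or(facet_operation)
        .or(searchable_operation);
    assert_eq!(operation, Some(DelAddOperation::DeletionAndAddition));

    // If neither side requires the field, no operation is produced and the
    // field is simply skipped (nothing is written to the obkv).
    let neither: Option<DelAddOperation> = None;
    let skipped = neither.zip(neither).map(|(a, b)| a.merge(b)).or(neither).or(neither);
    assert_eq!(skipped, None);
}
```

Merging widens to `DeletionAndAddition` whenever the two sides disagree, which is the conservative choice: the value is both removed from the old version of the document and re-extracted for the new one.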