diff --git a/crates/milli/src/update/new/document_change.rs b/crates/milli/src/update/new/document_change.rs index 2de9f384b..c790b4d32 100644 --- a/crates/milli/src/update/new/document_change.rs +++ b/crates/milli/src/update/new/document_change.rs @@ -4,10 +4,10 @@ use heed::RoTxn; use super::document::{ Document as _, DocumentFromDb, DocumentFromVersions, MergedDocument, Versions, }; -use super::extract::perm_json_p; use super::vector_document::{ MergedVectorDocument, VectorDocumentFromDb, VectorDocumentFromVersions, }; +use crate::attribute_patterns::PatternMatch; use crate::documents::FieldIdMapper; use crate::vector::EmbeddingConfigs; use crate::{DocumentId, Index, Result}; @@ -173,7 +173,7 @@ impl<'doc> Update<'doc> { /// Otherwise `false`. pub fn has_changed_for_fields<'t, Mapper: FieldIdMapper>( &self, - fields: Option<&[&str]>, + selector: &mut impl FnMut(&str) -> PatternMatch, rtxn: &'t RoTxn, index: &'t Index, mapper: &'t Mapper, @@ -185,7 +185,7 @@ impl<'doc> Update<'doc> { for entry in self.only_changed_fields().iter_top_level_fields() { let (key, updated_value) = entry?; - if perm_json_p::select_field(key, fields, &[]) == perm_json_p::Selection::Skip { + if selector(key) == PatternMatch::NoMatch { continue; } @@ -229,7 +229,7 @@ impl<'doc> Update<'doc> { for entry in current.iter_top_level_fields() { let (key, _) = entry?; - if perm_json_p::select_field(key, fields, &[]) == perm_json_p::Selection::Skip { + if selector(key) == PatternMatch::NoMatch { continue; } current_selected_field_count += 1; diff --git a/crates/milli/src/update/new/extract/faceted/extract_facets.rs b/crates/milli/src/update/new/extract/faceted/extract_facets.rs index 41b6a12a2..3201e23f9 100644 --- a/crates/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/crates/milli/src/update/new/extract/faceted/extract_facets.rs @@ -5,12 +5,13 @@ use std::ops::DerefMut as _; use bumpalo::collections::Vec as BVec; use bumpalo::Bump; use hashbrown::HashMap; -use heed::RoTxn; use serde_json::Value; use super::super::cache::BalancedCaches; use super::facet_document::extract_document_facets; use super::FacetKind; +use crate::fields_ids_map::metadata::Metadata; +use crate::filterable_attributes_rules::match_faceted_field; use crate::heed_codec::facet::OrderedF64Codec; use crate::update::del_add::DelAdd; use crate::update::new::channel::FieldIdDocidFacetSender; @@ -23,13 +24,17 @@ use crate::update::new::steps::IndexingStep; use crate::update::new::thread_local::{FullySend, ThreadLocal}; use crate::update::new::DocumentChange; use crate::update::GrenadParameters; -use crate::{DocumentId, FieldId, Index, Result, MAX_FACET_VALUE_LENGTH}; +use crate::{DocumentId, FieldId, FilterableAttributesRule, Result, MAX_FACET_VALUE_LENGTH}; pub struct FacetedExtractorData<'a, 'b> { - attributes_to_extract: &'a [&'a str], sender: &'a FieldIdDocidFacetSender<'a, 'b>, grenad_parameters: &'a GrenadParameters, buckets: usize, + filterable_attributes: Vec, + sortable_fields: HashSet, + asc_desc_fields: HashSet, + distinct_field: Option, + is_geo_enabled: bool, } impl<'a, 'b, 'extractor> Extractor<'extractor> for FacetedExtractorData<'a, 'b> { @@ -52,7 +57,11 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for FacetedExtractorData<'a, 'b> let change = change?; FacetedDocidsExtractor::extract_document_change( context, - self.attributes_to_extract, + &self.filterable_attributes, + &self.sortable_fields, + &self.asc_desc_fields, + &self.distinct_field, + self.is_geo_enabled, change, self.sender, )? @@ -64,13 +73,18 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for FacetedExtractorData<'a, 'b> pub struct FacetedDocidsExtractor; impl FacetedDocidsExtractor { + #[allow(clippy::too_many_arguments)] fn extract_document_change( context: &DocumentChangeContext>, - attributes_to_extract: &[&str], + filterable_attributes: &[FilterableAttributesRule], + sortable_fields: &HashSet, + asc_desc_fields: &HashSet, + distinct_field: &Option, + is_geo_enabled: bool, document_change: DocumentChange, sender: &FieldIdDocidFacetSender, ) -> Result<()> { - let index = &context.index; + let index = context.index; let rtxn = &context.rtxn; let mut new_fields_ids_map = context.new_fields_ids_map.borrow_mut_or_yield(); let mut cached_sorter = context.data.borrow_mut_or_yield(); @@ -78,11 +92,15 @@ impl FacetedDocidsExtractor { let docid = document_change.docid(); let res = match document_change { DocumentChange::Deletion(inner) => extract_document_facets( - attributes_to_extract, inner.current(rtxn, index, context.db_fields_ids_map)?, inner.external_document_id(), new_fields_ids_map.deref_mut(), - &mut |fid, depth, value| { + filterable_attributes, + sortable_fields, + asc_desc_fields, + distinct_field, + is_geo_enabled, + &mut |fid, meta, depth, value| { Self::facet_fn_with_options( &context.doc_alloc, cached_sorter.deref_mut(), @@ -91,6 +109,8 @@ impl FacetedDocidsExtractor { DelAddFacetValue::insert_del, docid, fid, + meta, + filterable_attributes, depth, value, ) @@ -98,7 +118,15 @@ impl FacetedDocidsExtractor { ), DocumentChange::Update(inner) => { if !inner.has_changed_for_fields( - Some(attributes_to_extract), + &mut |field_name| { + match_faceted_field( + field_name, + filterable_attributes, + sortable_fields, + asc_desc_fields, + distinct_field, + ) + }, rtxn, index, context.db_fields_ids_map, @@ -107,11 +135,15 @@ impl FacetedDocidsExtractor { } extract_document_facets( - attributes_to_extract, inner.current(rtxn, index, context.db_fields_ids_map)?, inner.external_document_id(), new_fields_ids_map.deref_mut(), - &mut |fid, depth, value| { + filterable_attributes, + sortable_fields, + asc_desc_fields, + distinct_field, + is_geo_enabled, + &mut |fid, meta, depth, value| { Self::facet_fn_with_options( &context.doc_alloc, cached_sorter.deref_mut(), @@ -120,6 +152,8 @@ impl FacetedDocidsExtractor { DelAddFacetValue::insert_del, docid, fid, + meta, + filterable_attributes, depth, value, ) @@ -127,11 +161,15 @@ impl FacetedDocidsExtractor { )?; extract_document_facets( - attributes_to_extract, inner.merged(rtxn, index, context.db_fields_ids_map)?, inner.external_document_id(), new_fields_ids_map.deref_mut(), - &mut |fid, depth, value| { + filterable_attributes, + sortable_fields, + asc_desc_fields, + distinct_field, + is_geo_enabled, + &mut |fid, meta, depth, value| { Self::facet_fn_with_options( &context.doc_alloc, cached_sorter.deref_mut(), @@ -140,6 +178,8 @@ impl FacetedDocidsExtractor { DelAddFacetValue::insert_add, docid, fid, + meta, + filterable_attributes, depth, value, ) @@ -147,11 +187,15 @@ impl FacetedDocidsExtractor { ) } DocumentChange::Insertion(inner) => extract_document_facets( - attributes_to_extract, inner.inserted(), inner.external_document_id(), new_fields_ids_map.deref_mut(), - &mut |fid, depth, value| { + filterable_attributes, + sortable_fields, + asc_desc_fields, + distinct_field, + is_geo_enabled, + &mut |fid, meta, depth, value| { Self::facet_fn_with_options( &context.doc_alloc, cached_sorter.deref_mut(), @@ -160,6 +204,8 @@ impl FacetedDocidsExtractor { DelAddFacetValue::insert_add, docid, fid, + meta, + filterable_attributes, depth, value, ) @@ -180,9 +226,18 @@ impl FacetedDocidsExtractor { facet_fn: impl Fn(&mut DelAddFacetValue<'doc>, FieldId, BVec<'doc, u8>, FacetKind), docid: DocumentId, fid: FieldId, + meta: Metadata, + filterable_attributes: &[FilterableAttributesRule], depth: perm_json_p::Depth, value: &Value, ) -> Result<()> { + // if the field is not faceted, do nothing + if !meta.is_faceted(filterable_attributes) { + return Ok(()); + } + + let features = meta.filterable_attributes_features(filterable_attributes); + let mut buffer = BVec::new_in(doc_alloc); // Exists // key: fid @@ -246,7 +301,9 @@ impl FacetedDocidsExtractor { } // Null // key: fid - Value::Null if depth == perm_json_p::Depth::OnBaseKey => { + Value::Null + if depth == perm_json_p::Depth::OnBaseKey && features.is_filterable_null() => + { buffer.clear(); buffer.push(FacetKind::Null as u8); buffer.extend_from_slice(&fid.to_be_bytes()); @@ -254,19 +311,29 @@ impl FacetedDocidsExtractor { } // Empty // key: fid - Value::Array(a) if a.is_empty() && depth == perm_json_p::Depth::OnBaseKey => { + Value::Array(a) + if a.is_empty() + && depth == perm_json_p::Depth::OnBaseKey + && features.is_filterable_empty() => + { buffer.clear(); buffer.push(FacetKind::Empty as u8); buffer.extend_from_slice(&fid.to_be_bytes()); cache_fn(cached_sorter, &buffer, docid) } - Value::String(_) if depth == perm_json_p::Depth::OnBaseKey => { + Value::String(_) + if depth == perm_json_p::Depth::OnBaseKey && features.is_filterable_empty() => + { buffer.clear(); buffer.push(FacetKind::Empty as u8); buffer.extend_from_slice(&fid.to_be_bytes()); cache_fn(cached_sorter, &buffer, docid) } - Value::Object(o) if o.is_empty() && depth == perm_json_p::Depth::OnBaseKey => { + Value::Object(o) + if o.is_empty() + && depth == perm_json_p::Depth::OnBaseKey + && features.is_filterable_empty() => + { buffer.clear(); buffer.push(FacetKind::Empty as u8); buffer.extend_from_slice(&fid.to_be_bytes()); @@ -276,10 +343,6 @@ impl FacetedDocidsExtractor { _ => Ok(()), } } - - fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { - index.user_defined_faceted_fields(rtxn) - } } struct DelAddFacetValue<'doc> { @@ -399,9 +462,11 @@ impl FacetedDocidsExtractor { { let index = indexing_context.index; let rtxn = index.read_txn()?; - let attributes_to_extract = Self::attributes_to_extract(&rtxn, index)?; - let attributes_to_extract: Vec<_> = - attributes_to_extract.iter().map(|s| s.as_ref()).collect(); + let filterable_attributes = index.filterable_attributes_rules(&rtxn)?; + let sortable_fields = index.sortable_fields(&rtxn)?; + let asc_desc_fields = index.asc_desc_fields(&rtxn)?; + let distinct_field = index.distinct_field(&rtxn)?.map(|s| s.to_string()); + let is_geo_enabled = index.is_geo_enabled(&rtxn)?; let datastore = ThreadLocal::new(); { @@ -410,10 +475,14 @@ impl FacetedDocidsExtractor { let _entered = span.enter(); let extractor = FacetedExtractorData { - attributes_to_extract: &attributes_to_extract, grenad_parameters: indexing_context.grenad_parameters, buckets: rayon::current_num_threads(), sender, + filterable_attributes, + sortable_fields, + asc_desc_fields, + distinct_field, + is_geo_enabled, }; extract( document_changes, diff --git a/crates/milli/src/update/new/extract/faceted/facet_document.rs b/crates/milli/src/update/new/extract/faceted/facet_document.rs index 8d582d103..e74131402 100644 --- a/crates/milli/src/update/new/extract/faceted/facet_document.rs +++ b/crates/milli/src/update/new/extract/faceted/facet_document.rs @@ -1,46 +1,80 @@ +use std::collections::HashSet; + use serde_json::Value; -use crate::constants::RESERVED_GEO_FIELD_NAME; +use crate::attribute_patterns::PatternMatch; +use crate::fields_ids_map::metadata::Metadata; use crate::update::new::document::Document; use crate::update::new::extract::geo::extract_geo_coordinates; use crate::update::new::extract::perm_json_p; -use crate::{FieldId, GlobalFieldsIdsMap, InternalError, Result, UserError}; +use crate::{ + FieldId, FilterableAttributesRule, GlobalFieldsIdsMap, InternalError, Result, UserError, +}; +use crate::filterable_attributes_rules::match_faceted_field; + +#[allow(clippy::too_many_arguments)] pub fn extract_document_facets<'doc>( - attributes_to_extract: &[&str], document: impl Document<'doc>, external_document_id: &str, field_id_map: &mut GlobalFieldsIdsMap, - facet_fn: &mut impl FnMut(FieldId, perm_json_p::Depth, &Value) -> Result<()>, + filterable_attributes: &[FilterableAttributesRule], + sortable_fields: &HashSet, + asc_desc_fields: &HashSet, + distinct_field: &Option, + is_geo_enabled: bool, + facet_fn: &mut impl FnMut(FieldId, Metadata, perm_json_p::Depth, &Value) -> Result<()>, ) -> Result<()> { + // return the match result for the given field name. + let match_field = |field_name: &str| -> PatternMatch { + match_faceted_field( + field_name, + filterable_attributes, + sortable_fields, + asc_desc_fields, + distinct_field, + ) + }; + + // extract the field if it is faceted (facet searchable, filterable, sortable) + let mut extract_field = |name: &str, depth: perm_json_p::Depth, value: &Value| -> Result<()> { + match field_id_map.id_with_metadata_or_insert(name) { + Some((field_id, meta)) => { + facet_fn(field_id, meta, depth, value)?; + + Ok(()) + } + None => Err(UserError::AttributeLimitReached.into()), + } + }; + for res in document.iter_top_level_fields() { let (field_name, value) = res?; + let selection = match_field(field_name); - let mut tokenize_field = - |name: &str, depth: perm_json_p::Depth, value: &Value| match field_id_map - .id_or_insert(name) - { - Some(field_id) => facet_fn(field_id, depth, value), - None => Err(UserError::AttributeLimitReached.into()), - }; + // extract the field if it matches a pattern and if it is faceted (facet searchable, filterable, sortable) + let mut match_and_extract = |name: &str, depth: perm_json_p::Depth, value: &Value| { + let selection = match_field(name); + if selection == PatternMatch::Match { + extract_field(name, depth, value)?; + } - // if the current field is searchable or contains a searchable attribute - let selection = perm_json_p::select_field(field_name, Some(attributes_to_extract), &[]); - if selection != perm_json_p::Selection::Skip { + Ok(selection) + }; + + if selection != PatternMatch::NoMatch { // parse json. match serde_json::value::to_value(value).map_err(InternalError::SerdeJson)? { Value::Object(object) => { perm_json_p::seek_leaf_values_in_object( &object, - Some(attributes_to_extract), - &[], // skip no attributes field_name, perm_json_p::Depth::OnBaseKey, - &mut tokenize_field, + &mut match_and_extract, )?; - if selection == perm_json_p::Selection::Select { - tokenize_field( + if selection == PatternMatch::Match { + extract_field( field_name, perm_json_p::Depth::OnBaseKey, &Value::Object(object), @@ -50,36 +84,34 @@ pub fn extract_document_facets<'doc>( Value::Array(array) => { perm_json_p::seek_leaf_values_in_array( &array, - Some(attributes_to_extract), - &[], // skip no attributes field_name, perm_json_p::Depth::OnBaseKey, - &mut tokenize_field, + &mut match_and_extract, )?; - if selection == perm_json_p::Selection::Select { - tokenize_field( + if selection == PatternMatch::Match { + extract_field( field_name, perm_json_p::Depth::OnBaseKey, &Value::Array(array), )?; } } - value => tokenize_field(field_name, perm_json_p::Depth::OnBaseKey, &value)?, + value => extract_field(field_name, perm_json_p::Depth::OnBaseKey, &value)?, } } } - if attributes_to_extract.contains(&RESERVED_GEO_FIELD_NAME) { + if is_geo_enabled { if let Some(geo_value) = document.geo_field()? { if let Some([lat, lng]) = extract_geo_coordinates(external_document_id, geo_value)? { - let (lat_fid, lng_fid) = field_id_map - .id_or_insert("_geo.lat") - .zip(field_id_map.id_or_insert("_geo.lng")) + let ((lat_fid, lat_meta), (lng_fid, lng_meta)) = field_id_map + .id_with_metadata_or_insert("_geo.lat") + .zip(field_id_map.id_with_metadata_or_insert("_geo.lng")) .ok_or(UserError::AttributeLimitReached)?; - facet_fn(lat_fid, perm_json_p::Depth::OnBaseKey, &lat.into())?; - facet_fn(lng_fid, perm_json_p::Depth::OnBaseKey, &lng.into())?; + facet_fn(lat_fid, lat_meta, perm_json_p::Depth::OnBaseKey, &lat.into())?; + facet_fn(lng_fid, lng_meta, perm_json_p::Depth::OnBaseKey, &lng.into())?; } } } diff --git a/crates/milli/src/update/new/extract/geo/mod.rs b/crates/milli/src/update/new/extract/geo/mod.rs index f2af0b229..d51fd9d36 100644 --- a/crates/milli/src/update/new/extract/geo/mod.rs +++ b/crates/milli/src/update/new/extract/geo/mod.rs @@ -9,7 +9,6 @@ use heed::RoTxn; use serde_json::value::RawValue; use serde_json::Value; -use crate::constants::RESERVED_GEO_FIELD_NAME; use crate::error::GeoError; use crate::update::new::document::Document; use crate::update::new::indexer::document_changes::{DocumentChangeContext, Extractor}; @@ -29,9 +28,7 @@ impl GeoExtractor { index: &Index, grenad_parameters: GrenadParameters, ) -> Result> { - let is_sortable = index.sortable_fields(rtxn)?.contains(RESERVED_GEO_FIELD_NAME); - let is_filterable = index.filterable_fields(rtxn)?.contains(RESERVED_GEO_FIELD_NAME); - if is_sortable || is_filterable { + if index.is_geo_enabled(rtxn)? { Ok(Some(GeoExtractor { grenad_parameters })) } else { Ok(None) diff --git a/crates/milli/src/update/new/extract/mod.rs b/crates/milli/src/update/new/extract/mod.rs index aa0a3d333..a8264ba4a 100644 --- a/crates/milli/src/update/new/extract/mod.rs +++ b/crates/milli/src/update/new/extract/mod.rs @@ -5,7 +5,6 @@ mod geo; mod searchable; mod vectors; -use bumpalo::Bump; pub use cache::{ merge_caches_sorted, transpose_and_freeze_caches, BalancedCaches, DelAddRoaringBitmap, }; @@ -15,27 +14,11 @@ pub use geo::*; pub use searchable::*; pub use vectors::EmbeddingExtractor; -use super::indexer::document_changes::{DocumentChanges, IndexingContext}; -use super::steps::IndexingStep; -use super::thread_local::{FullySend, ThreadLocal}; -use crate::Result; - -pub trait DocidsExtractor { - fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>( - document_changes: &DC, - indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>, - extractor_allocs: &'extractor mut ThreadLocal>, - step: IndexingStep, - ) -> Result>> - where - MSP: Fn() -> bool + Sync; -} - /// TODO move in permissive json pointer pub mod perm_json_p { use serde_json::{Map, Value}; - use crate::Result; + use crate::{attribute_patterns::PatternMatch, Result}; const SPLIT_SYMBOL: char = '.'; /// Returns `true` if the `selector` match the `key`. @@ -68,11 +51,9 @@ pub mod perm_json_p { pub fn seek_leaf_values_in_object( value: &Map, - selectors: Option<&[&str]>, - skip_selectors: &[&str], base_key: &str, base_depth: Depth, - seeker: &mut impl FnMut(&str, Depth, &Value) -> Result<()>, + seeker: &mut impl FnMut(&str, Depth, &Value) -> Result, ) -> Result<()> { if value.is_empty() { seeker(base_key, base_depth, &Value::Object(Map::with_capacity(0)))?; @@ -85,40 +66,16 @@ pub mod perm_json_p { format!("{}{}{}", base_key, SPLIT_SYMBOL, key) }; - // here if the user only specified `doggo` we need to iterate in all the fields of `doggo` - // so we check the contained_in on both side - let selection = select_field(&base_key, selectors, skip_selectors); - if selection != Selection::Skip { + let selection = seeker(&base_key, Depth::OnBaseKey, value)?; + if selection != PatternMatch::NoMatch { match value { Value::Object(object) => { - if selection == Selection::Select { - seeker(&base_key, Depth::OnBaseKey, value)?; - } - - seek_leaf_values_in_object( - object, - selectors, - skip_selectors, - &base_key, - Depth::OnBaseKey, - seeker, - ) + seek_leaf_values_in_object(object, &base_key, Depth::OnBaseKey, seeker) } Value::Array(array) => { - if selection == Selection::Select { - seeker(&base_key, Depth::OnBaseKey, value)?; - } - - seek_leaf_values_in_array( - array, - selectors, - skip_selectors, - &base_key, - Depth::OnBaseKey, - seeker, - ) + seek_leaf_values_in_array(array, &base_key, Depth::OnBaseKey, seeker) } - value => seeker(&base_key, Depth::OnBaseKey, value), + _ => Ok(()), }?; } } @@ -128,11 +85,9 @@ pub mod perm_json_p { pub fn seek_leaf_values_in_array( values: &[Value], - selectors: Option<&[&str]>, - skip_selectors: &[&str], base_key: &str, base_depth: Depth, - seeker: &mut impl FnMut(&str, Depth, &Value) -> Result<()>, + seeker: &mut impl FnMut(&str, Depth, &Value) -> Result, ) -> Result<()> { if values.is_empty() { seeker(base_key, base_depth, &Value::Array(vec![]))?; @@ -140,61 +95,16 @@ pub mod perm_json_p { for value in values { match value { - Value::Object(object) => seek_leaf_values_in_object( - object, - selectors, - skip_selectors, - base_key, - Depth::InsideArray, - seeker, - ), - Value::Array(array) => seek_leaf_values_in_array( - array, - selectors, - skip_selectors, - base_key, - Depth::InsideArray, - seeker, - ), - value => seeker(base_key, Depth::InsideArray, value), + Value::Object(object) => { + seek_leaf_values_in_object(object, base_key, Depth::InsideArray, seeker) + } + Value::Array(array) => { + seek_leaf_values_in_array(array, base_key, Depth::InsideArray, seeker) + } + value => seeker(base_key, Depth::InsideArray, value).map(|_| ()), }?; } Ok(()) } - - pub fn select_field( - field_name: &str, - selectors: Option<&[&str]>, - skip_selectors: &[&str], - ) -> Selection { - if skip_selectors.iter().any(|skip_selector| { - contained_in(skip_selector, field_name) || contained_in(field_name, skip_selector) - }) { - Selection::Skip - } else if let Some(selectors) = selectors { - let mut selection = Selection::Skip; - for selector in selectors { - if contained_in(field_name, selector) { - selection = Selection::Select; - break; - } else if contained_in(selector, field_name) { - selection = Selection::Parent; - } - } - selection - } else { - Selection::Select - } - } - - #[derive(Debug, Clone, Copy, PartialEq, Eq)] - pub enum Selection { - /// The field is a parent of the of a nested field that must be selected - Parent, - /// The field must be selected - Select, - /// The field must be skipped - Skip, - } } diff --git a/crates/milli/src/update/new/facet_search_builder.rs b/crates/milli/src/update/new/facet_search_builder.rs index d1ff6096d..6e9ffa1ed 100644 --- a/crates/milli/src/update/new/facet_search_builder.rs +++ b/crates/milli/src/update/new/facet_search_builder.rs @@ -9,12 +9,14 @@ use heed::{BytesDecode, BytesEncode, RoTxn, RwTxn}; use super::fst_merger_builder::FstMergerBuilder; use super::KvReaderDelAdd; +use crate::attribute_patterns::PatternMatch; use crate::heed_codec::facet::FacetGroupKey; use crate::update::del_add::{DelAdd, KvWriterDelAdd}; use crate::update::{create_sorter, MergeDeladdBtreesetString}; use crate::{ - BEU16StrCodec, FieldId, GlobalFieldsIdsMap, Index, LocalizedAttributesRule, Result, - MAX_FACET_VALUE_LENGTH, + BEU16StrCodec, FieldId, FieldIdMapMissingEntry, FilterableAttributesFeatures, + FilterableAttributesRule, GlobalFieldsIdsMap, Index, InternalError, LocalizedAttributesRule, + Result, MAX_FACET_VALUE_LENGTH, }; pub struct FacetSearchBuilder<'indexer> { @@ -22,6 +24,7 @@ pub struct FacetSearchBuilder<'indexer> { normalized_facet_string_docids_sorter: Sorter, global_fields_ids_map: GlobalFieldsIdsMap<'indexer>, localized_attributes_rules: Vec, + filterable_attributes_rules: Vec, // Buffered data below buffer: Vec, localized_field_ids: HashMap>>, @@ -31,6 +34,7 @@ impl<'indexer> FacetSearchBuilder<'indexer> { pub fn new( global_fields_ids_map: GlobalFieldsIdsMap<'indexer>, localized_attributes_rules: Vec, + filterable_attributes_rules: Vec, ) -> Self { let registered_facets = HashMap::new(); let normalized_facet_string_docids_sorter = create_sorter( @@ -49,6 +53,7 @@ impl<'indexer> FacetSearchBuilder<'indexer> { buffer: Vec::new(), global_fields_ids_map, localized_attributes_rules, + filterable_attributes_rules, localized_field_ids: HashMap::new(), } } @@ -60,6 +65,13 @@ impl<'indexer> FacetSearchBuilder<'indexer> { ) -> Result<()> { let FacetGroupKey { field_id, level: _level, left_bound } = facet_key; + let filterable_attributes_features = self.filterable_attributes_features(field_id)?; + + // if facet search is disabled, we don't need to register the facet + if !filterable_attributes_features.is_facet_searchable() { + return Ok(()); + }; + if deladd == DelAdd::Addition { self.registered_facets.entry(field_id).and_modify(|count| *count += 1).or_insert(1); } @@ -83,6 +95,24 @@ impl<'indexer> FacetSearchBuilder<'indexer> { Ok(()) } + fn filterable_attributes_features( + &mut self, + field_id: u16, + ) -> Result { + let Some(filterable_attributes_features) = + self.global_fields_ids_map.metadata(field_id).map(|metadata| { + metadata.filterable_attributes_features(&self.filterable_attributes_rules) + }) + else { + return Err(InternalError::FieldIdMapMissingEntry(FieldIdMapMissingEntry::FieldId { + field_id, + process: "facet_search_builder::register_from_key", + }) + .into()); + }; + Ok(filterable_attributes_features) + } + fn locales(&mut self, field_id: FieldId) -> Option<&[Language]> { if let Entry::Vacant(e) = self.localized_field_ids.entry(field_id) { let Some(field_name) = self.global_fields_ids_map.name(field_id) else { @@ -92,7 +122,7 @@ impl<'indexer> FacetSearchBuilder<'indexer> { let locales = self .localized_attributes_rules .iter() - .find(|rule| rule.match_str(field_name)) + .find(|rule| rule.match_str(field_name) == PatternMatch::Match) .map(|rule| rule.locales.clone()); e.insert(locales); diff --git a/crates/milli/src/update/new/indexer/post_processing.rs b/crates/milli/src/update/new/indexer/post_processing.rs index 201ab9ec9..4ea749a85 100644 --- a/crates/milli/src/update/new/indexer/post_processing.rs +++ b/crates/milli/src/update/new/indexer/post_processing.rs @@ -33,10 +33,8 @@ where { let index = indexing_context.index; indexing_context.progress.update_progress(IndexingStep::PostProcessingFacets); - if index.facet_search(wtxn)? { - compute_facet_search_database(index, wtxn, global_fields_ids_map)?; - } - compute_facet_level_database(index, wtxn, facet_field_ids_delta)?; + compute_facet_level_database(index, wtxn, facet_field_ids_delta, &global_fields_ids_map)?; + compute_facet_search_database(index, wtxn, global_fields_ids_map)?; indexing_context.progress.update_progress(IndexingStep::PostProcessingWords); if let Some(prefix_delta) = compute_word_fst(index, wtxn)? { compute_prefix_database(index, wtxn, prefix_delta, indexing_context.grenad_parameters)?; @@ -116,10 +114,18 @@ fn compute_facet_search_database( global_fields_ids_map: GlobalFieldsIdsMap, ) -> Result<()> { let rtxn = index.read_txn()?; + + // if the facet search is not enabled, we can skip the rest of the function + if !index.facet_search(wtxn)? { + return Ok(()); + } + let localized_attributes_rules = index.localized_attributes_rules(&rtxn)?; + let filterable_attributes_rules = index.filterable_attributes_rules(&rtxn)?; let mut facet_search_builder = FacetSearchBuilder::new( global_fields_ids_map, localized_attributes_rules.unwrap_or_default(), + filterable_attributes_rules, ); let previous_facet_id_string_docids = index @@ -164,8 +170,19 @@ fn compute_facet_level_database( index: &Index, wtxn: &mut RwTxn, mut facet_field_ids_delta: FacetFieldIdsDelta, + global_fields_ids_map: &GlobalFieldsIdsMap, ) -> Result<()> { + let rtxn = index.read_txn()?; + let filterable_attributes_rules = index.filterable_attributes_rules(&rtxn)?; for (fid, delta) in facet_field_ids_delta.consume_facet_string_delta() { + // skip field ids that should not be facet leveled + let Some(metadata) = global_fields_ids_map.metadata(fid) else { + continue; + }; + if !metadata.require_facet_level_database(&filterable_attributes_rules) { + continue; + } + let span = tracing::trace_span!(target: "indexing::facet_field_ids", "string"); let _entered = span.enter(); match delta { diff --git a/crates/milli/src/update/new/indexer/write.rs b/crates/milli/src/update/new/indexer/write.rs index 723e018a1..a8bd3217f 100644 --- a/crates/milli/src/update/new/indexer/write.rs +++ b/crates/milli/src/update/new/indexer/write.rs @@ -137,7 +137,6 @@ pub(super) fn update_index( index.put_primary_key(wtxn, new_primary_key.name())?; } let mut inner_index_settings = InnerIndexSettings::from_index(index, wtxn, Some(embedders))?; - inner_index_settings.recompute_facets(wtxn, index)?; inner_index_settings.recompute_searchables(wtxn, index)?; index.put_field_distribution(wtxn, &field_distribution)?; index.put_documents_ids(wtxn, &document_ids)?;