diff --git a/crates/milli/src/fields_ids_map/global.rs b/crates/milli/src/fields_ids_map/global.rs index 2ffc45eb7..e5f1212df 100644 --- a/crates/milli/src/fields_ids_map/global.rs +++ b/crates/milli/src/fields_ids_map/global.rs @@ -105,6 +105,11 @@ impl<'indexing> GlobalFieldsIdsMap<'indexing> { self.local.name(id) } + + /// Get the metadata of a field based on its id. + pub fn metadata(&self, id: FieldId) -> Option { + self.local.metadata(id).or_else(|| self.global.read().unwrap().metadata(id)) + } } impl<'indexing> MutFieldIdMapper for GlobalFieldsIdsMap<'indexing> { diff --git a/crates/milli/src/fields_ids_map/metadata.rs b/crates/milli/src/fields_ids_map/metadata.rs index 65a1111fa..fd333c3c6 100644 --- a/crates/milli/src/fields_ids_map/metadata.rs +++ b/crates/milli/src/fields_ids_map/metadata.rs @@ -5,14 +5,29 @@ use charabia::Language; use heed::RoTxn; use super::FieldsIdsMap; -use crate::{FieldId, Index, LocalizedAttributesRule, Result}; +use crate::attribute_patterns::{match_field_legacy, PatternMatch}; +use crate::constants::{RESERVED_GEO_FIELD_NAME, RESERVED_VECTORS_FIELD_NAME}; +use crate::{ + is_faceted_by, FieldId, FilterableAttributesFeatures, FilterableAttributesRule, Index, + LocalizedAttributesRule, Result, Weight, +}; #[derive(Debug, Clone, Copy)] pub struct Metadata { - pub searchable: bool, - pub filterable: bool, + /// The weight as defined in the FieldidsWeightsMap of the searchable attribute if it is searchable. + pub searchable: Option, + /// The field is part of the sortable attributes. pub sortable: bool, - localized_attributes_rule_id: Option, + /// The field is defined as the distinct attribute. + pub distinct: bool, + /// The field has been defined as asc/desc in the ranking rules. + pub asc_desc: bool, + /// The field is a geo field. + pub geo: bool, + /// The id of the localized attributes rule if the field is localized. + pub localized_attributes_rule_id: Option, + /// The id of the filterable attributes rule if the field is filterable. + pub filterable_attributes_rule_id: Option, } #[derive(Debug, Clone)] @@ -106,76 +121,215 @@ impl Metadata { let rule = rules.get((localized_attributes_rule_id - 1) as usize).unwrap(); Some(rule.locales()) } + + pub fn filterable_attributes<'rules>( + &self, + rules: &'rules [FilterableAttributesRule], + ) -> Option<&'rules FilterableAttributesRule> { + let filterable_attributes_rule_id = self.filterable_attributes_rule_id?.get(); + // - 1: `filterable_attributes_rule_id` is NonZero + let rule = rules.get((filterable_attributes_rule_id - 1) as usize).unwrap(); + Some(rule) + } + + pub fn filterable_attributes_features( + &self, + rules: &[FilterableAttributesRule], + ) -> FilterableAttributesFeatures { + self.filterable_attributes(rules) + .map(|rule| rule.features()) + // if there is no filterable attributes rule, return no features + .unwrap_or_else(FilterableAttributesFeatures::no_features) + } + + pub fn is_sortable(&self) -> bool { + self.sortable + } + + pub fn is_searchable(&self) -> bool { + self.searchable.is_some() + } + + pub fn searchable_weight(&self) -> Option { + self.searchable + } + + pub fn is_distinct(&self) -> bool { + self.distinct + } + + pub fn is_asc_desc(&self) -> bool { + self.asc_desc + } + + pub fn is_geo(&self) -> bool { + self.geo + } + + /// Returns `true` if the field is part of the facet databases. (sortable, distinct, asc_desc, filterable or facet searchable) + pub fn is_faceted(&self, rules: &[FilterableAttributesRule]) -> bool { + if self.is_distinct() || self.is_sortable() || self.is_asc_desc() { + return true; + } + + let features = self.filterable_attributes_features(rules); + if features.is_filterable() || features.is_facet_searchable() { + return true; + } + + false + } + + pub fn require_facet_level_database(&self, rules: &[FilterableAttributesRule]) -> bool { + let features = self.filterable_attributes_features(rules); + + self.is_sortable() || self.is_asc_desc() || features.is_filterable_comparison() + } } #[derive(Debug, Clone)] pub struct MetadataBuilder { - searchable_attributes: Vec, - filterable_attributes: HashSet, + searchable_attributes: Option>, + filterable_attributes: Vec, sortable_attributes: HashSet, localized_attributes: Option>, + distinct_attribute: Option, + asc_desc_attributes: HashSet, } impl MetadataBuilder { pub fn from_index(index: &Index, rtxn: &RoTxn) -> Result { - let searchable_attributes = - index.searchable_fields(rtxn)?.into_iter().map(|s| s.to_string()).collect(); - let filterable_attributes = index.filterable_fields(rtxn)?; + let searchable_attributes = match index.user_defined_searchable_fields(rtxn)? { + Some(fields) if fields.contains(&"*") => None, + None => None, + Some(fields) => Some(fields.into_iter().map(|s| s.to_string()).collect()), + }; + let filterable_attributes = index.filterable_attributes_rules(rtxn)?; let sortable_attributes = index.sortable_fields(rtxn)?; let localized_attributes = index.localized_attributes_rules(rtxn)?; + let distinct_attribute = index.distinct_field(rtxn)?.map(|s| s.to_string()); + let asc_desc_attributes = index.asc_desc_fields(rtxn)?; Ok(Self { searchable_attributes, filterable_attributes, sortable_attributes, localized_attributes, + distinct_attribute, + asc_desc_attributes, }) } + #[cfg(test)] + /// Build a new `MetadataBuilder` from the given parameters. + /// + /// This is used for testing, prefer using `MetadataBuilder::from_index` instead. pub fn new( - searchable_attributes: Vec, - filterable_attributes: HashSet, + searchable_attributes: Option>, + filterable_attributes: Vec, sortable_attributes: HashSet, localized_attributes: Option>, + distinct_attribute: Option, + asc_desc_attributes: HashSet, ) -> Self { + let searchable_attributes = match searchable_attributes { + Some(fields) if fields.iter().any(|f| f == "*") => None, + None => None, + Some(fields) => Some(fields), + }; + Self { searchable_attributes, filterable_attributes, sortable_attributes, localized_attributes, + distinct_attribute, + asc_desc_attributes, } } pub fn metadata_for_field(&self, field: &str) -> Metadata { - let searchable = self - .searchable_attributes + if is_faceted_by(field, RESERVED_VECTORS_FIELD_NAME) { + // Vectors fields are not searchable, filterable, distinct or asc_desc + return Metadata { + searchable: None, + sortable: false, + distinct: false, + asc_desc: false, + geo: false, + localized_attributes_rule_id: None, + filterable_attributes_rule_id: None, + }; + } + + // A field is sortable if it is faceted by a sortable attribute + let sortable = self + .sortable_attributes .iter() - .any(|attribute| attribute == "*" || attribute == field); + .any(|pattern| match_field_legacy(pattern, field) == PatternMatch::Match); - let filterable = self.filterable_attributes.contains(field); + let filterable_attributes_rule_id = self + .filterable_attributes + .iter() + .position(|attribute| attribute.match_str(field) == PatternMatch::Match) + // saturating_add(1): make `id` `NonZero` + .map(|id| NonZeroU16::new(id.saturating_add(1).try_into().unwrap()).unwrap()); - let sortable = self.sortable_attributes.contains(field); + if match_field_legacy(RESERVED_GEO_FIELD_NAME, field) == PatternMatch::Match { + // Geo fields are not searchable, distinct or asc_desc + return Metadata { + searchable: None, + sortable, + distinct: false, + asc_desc: false, + geo: true, + localized_attributes_rule_id: None, + filterable_attributes_rule_id, + }; + } + + let searchable = match &self.searchable_attributes { + // A field is searchable if it is faceted by a searchable attribute + Some(attributes) => attributes + .iter() + .enumerate() + .find(|(_i, pattern)| is_faceted_by(field, pattern)) + .map(|(i, _)| i as u16), + None => Some(0), + }; + + let distinct = + self.distinct_attribute.as_ref().is_some_and(|distinct_field| field == distinct_field); + let asc_desc = self.asc_desc_attributes.contains(field); let localized_attributes_rule_id = self .localized_attributes .iter() .flat_map(|v| v.iter()) - .position(|rule| rule.match_str(field)) + .position(|rule| rule.match_str(field) == PatternMatch::Match) // saturating_add(1): make `id` `NonZero` .map(|id| NonZeroU16::new(id.saturating_add(1).try_into().unwrap()).unwrap()); - Metadata { searchable, filterable, sortable, localized_attributes_rule_id } + Metadata { + searchable, + sortable, + distinct, + asc_desc, + geo: false, + localized_attributes_rule_id, + filterable_attributes_rule_id, + } } - pub fn searchable_attributes(&self) -> &[String] { - self.searchable_attributes.as_slice() + pub fn searchable_attributes(&self) -> Option<&[String]> { + self.searchable_attributes.as_deref() } pub fn sortable_attributes(&self) -> &HashSet { &self.sortable_attributes } - pub fn filterable_attributes(&self) -> &HashSet { + pub fn filterable_attributes(&self) -> &[FilterableAttributesRule] { &self.filterable_attributes } diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index 186d55809..75f4a8c17 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -1,6 +1,5 @@ use std::borrow::Cow; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; -use std::convert::TryInto; use std::fs::File; use std::path::Path; @@ -10,10 +9,11 @@ use roaring::RoaringBitmap; use rstar::RTree; use serde::{Deserialize, Serialize}; -use crate::constants::{self, RESERVED_VECTORS_FIELD_NAME}; +use crate::constants::{self, RESERVED_GEO_FIELD_NAME, RESERVED_VECTORS_FIELD_NAME}; use crate::database_stats::DatabaseStats; use crate::documents::PrimaryKey; use crate::error::{InternalError, UserError}; +use crate::fields_ids_map::metadata::FieldIdMapWithMetadata; use crate::fields_ids_map::FieldsIdsMap; use crate::heed_codec::facet::{ FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, @@ -27,8 +27,9 @@ use crate::vector::{ArroyStats, ArroyWrapper, Embedding, EmbeddingConfig}; use crate::{ default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec, - FieldidsWeightsMap, GeoPoint, LocalizedAttributesRule, ObkvCodec, Result, RoaringBitmapCodec, - RoaringBitmapLenCodec, Search, U8StrStrCodec, Weight, BEU16, BEU32, BEU64, + FieldidsWeightsMap, FilterableAttributesRule, GeoPoint, LocalizedAttributesRule, ObkvCodec, + Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, Weight, BEU16, BEU32, + BEU64, }; pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5; @@ -738,8 +739,7 @@ impl Index { &self, wtxn: &mut RwTxn<'_>, user_fields: &[&str], - non_searchable_fields_ids: &[FieldId], - fields_ids_map: &FieldsIdsMap, + fields_ids_map: &FieldIdMapWithMetadata, ) -> Result<()> { // We can write the user defined searchable fields as-is. self.put_user_defined_searchable_fields(wtxn, user_fields)?; @@ -747,29 +747,17 @@ impl Index { let mut weights = FieldidsWeightsMap::default(); // Now we generate the real searchable fields: - // 1. Take the user defined searchable fields as-is to keep the priority defined by the attributes criterion. - // 2. Iterate over the user defined searchable fields. - // 3. If a user defined field is a subset of a field defined in the fields_ids_map - // (ie doggo.name is a subset of doggo) right after doggo and with the same weight. let mut real_fields = Vec::new(); - - for (id, field_from_map) in fields_ids_map.iter() { - for (weight, user_field) in user_fields.iter().enumerate() { - if crate::is_faceted_by(field_from_map, user_field) - && !real_fields.contains(&field_from_map) - && !non_searchable_fields_ids.contains(&id) - { - real_fields.push(field_from_map); - - let weight: u16 = - weight.try_into().map_err(|_| UserError::AttributeLimitReached)?; - weights.insert(id, weight); - } + for (id, field_from_map, metadata) in fields_ids_map.iter() { + if let Some(weight) = metadata.searchable_weight() { + real_fields.push(field_from_map); + weights.insert(id, weight); } } self.put_searchable_fields(wtxn, &real_fields)?; self.put_fieldids_weights_map(wtxn, &weights)?; + Ok(()) } diff --git a/crates/milli/src/prompt/fields.rs b/crates/milli/src/prompt/fields.rs index ab15c31b0..ffafffd63 100644 --- a/crates/milli/src/prompt/fields.rs +++ b/crates/milli/src/prompt/fields.rs @@ -7,14 +7,14 @@ use liquid::model::{ }; use liquid::{ObjectView, ValueView}; -use super::{FieldMetadata, FieldsIdsMapWithMetadata}; +use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, Metadata}; use crate::GlobalFieldsIdsMap; #[derive(Debug, Clone, Copy)] pub struct FieldValue<'a, D: ObjectView> { name: &'a str, document: &'a D, - metadata: FieldMetadata, + metadata: Metadata, } impl<'a, D: ObjectView> ValueView for FieldValue<'a, D> { @@ -67,7 +67,10 @@ impl<'a, D: ObjectView> FieldValue<'a, D> { } pub fn is_searchable(&self) -> &bool { - &self.metadata.searchable + match self.metadata.is_searchable() { + true => &true, + false => &false, + } } pub fn is_empty(&self) -> bool { @@ -125,15 +128,11 @@ pub struct BorrowedFields<'a, 'map, D: ObjectView> { } impl<'a, D: ObjectView> OwnedFields<'a, D> { - pub fn new(document: &'a D, field_id_map: &'a FieldsIdsMapWithMetadata<'a>) -> Self { + pub fn new(document: &'a D, field_id_map: &'a FieldIdMapWithMetadata) -> Self { Self( std::iter::repeat(document) .zip(field_id_map.iter()) - .map(|(document, (fid, name))| FieldValue { - document, - name, - metadata: field_id_map.metadata(fid).unwrap_or_default(), - }) + .map(|(document, (_fid, name, metadata))| FieldValue { document, name, metadata }) .collect(), ) } @@ -187,7 +186,7 @@ impl<'a, 'map, D: ObjectView> ArrayView for BorrowedFields<'a, 'map, D> { let fv = self.doc_alloc.alloc(FieldValue { name: self.doc_alloc.alloc_str(&k), document: self.document, - metadata: FieldMetadata { searchable: metadata.searchable }, + metadata, }); fv as _ })) @@ -207,7 +206,7 @@ impl<'a, 'map, D: ObjectView> ArrayView for BorrowedFields<'a, 'map, D> { let fv = self.doc_alloc.alloc(FieldValue { name: self.doc_alloc.alloc_str(&key), document: self.document, - metadata: FieldMetadata { searchable: metadata.searchable }, + metadata, }); Some(fv as _) } diff --git a/crates/milli/src/prompt/mod.rs b/crates/milli/src/prompt/mod.rs index 3eb91611e..a5cb8de48 100644 --- a/crates/milli/src/prompt/mod.rs +++ b/crates/milli/src/prompt/mod.rs @@ -5,11 +5,9 @@ mod fields; mod template_checker; use std::cell::RefCell; -use std::collections::BTreeMap; use std::convert::TryFrom; use std::fmt::Debug; use std::num::NonZeroUsize; -use std::ops::Deref; use bumpalo::Bump; use document::ParseableDocument; @@ -18,8 +16,9 @@ use fields::{BorrowedFields, OwnedFields}; use self::context::Context; use self::document::Document; +use crate::fields_ids_map::metadata::FieldIdMapWithMetadata; use crate::update::del_add::DelAdd; -use crate::{FieldId, FieldsIdsMap, GlobalFieldsIdsMap}; +use crate::GlobalFieldsIdsMap; pub struct Prompt { template: liquid::Template, @@ -145,9 +144,9 @@ impl Prompt { &self, document: &obkv::KvReaderU16, side: DelAdd, - field_id_map: &FieldsIdsMapWithMetadata, + field_id_map: &FieldIdMapWithMetadata, ) -> Result { - let document = Document::new(document, side, field_id_map); + let document = Document::new(document, side, field_id_map.as_fields_ids_map()); let fields = OwnedFields::new(&document, field_id_map); let context = Context::new(&document, &fields); @@ -172,40 +171,6 @@ fn truncate(s: &mut String, max_bytes: usize) { } } -pub struct FieldsIdsMapWithMetadata<'a> { - fields_ids_map: &'a FieldsIdsMap, - metadata: BTreeMap, -} - -impl<'a> FieldsIdsMapWithMetadata<'a> { - pub fn new(fields_ids_map: &'a FieldsIdsMap, searchable_fields_ids: &'_ [FieldId]) -> Self { - let mut metadata: BTreeMap = - fields_ids_map.ids().map(|id| (id, Default::default())).collect(); - for searchable_field_id in searchable_fields_ids { - let Some(metadata) = metadata.get_mut(searchable_field_id) else { continue }; - metadata.searchable = true; - } - Self { fields_ids_map, metadata } - } - - pub fn metadata(&self, field_id: FieldId) -> Option { - self.metadata.get(&field_id).copied() - } -} - -impl<'a> Deref for FieldsIdsMapWithMetadata<'a> { - type Target = FieldsIdsMap; - - fn deref(&self) -> &Self::Target { - self.fields_ids_map - } -} - -#[derive(Debug, Default, Clone, Copy)] -pub struct FieldMetadata { - pub searchable: bool, -} - #[cfg(test)] mod test { use super::Prompt; diff --git a/crates/milli/src/search/facet/facet_distribution.rs b/crates/milli/src/search/facet/facet_distribution.rs index ee0fad535..b165d4e80 100644 --- a/crates/milli/src/search/facet/facet_distribution.rs +++ b/crates/milli/src/search/facet/facet_distribution.rs @@ -1,4 +1,4 @@ -use std::collections::{BTreeMap, HashMap, HashSet}; +use std::collections::{BTreeMap, BTreeSet, HashMap}; use std::fmt::Display; use std::ops::ControlFlow; use std::{fmt, mem}; @@ -9,8 +9,9 @@ use indexmap::IndexMap; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; -use crate::error::UserError; +use crate::attribute_patterns::match_field_legacy; use crate::facet::FacetType; +use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, Metadata, MetadataBuilder}; use crate::heed_codec::facet::{ FacetGroupKeyCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, OrderedF64Codec, }; @@ -18,7 +19,7 @@ use crate::heed_codec::{BytesRefCodec, StrRefCodec}; use crate::search::facet::facet_distribution_iter::{ count_iterate_over_facet_distribution, lexicographically_iterate_over_facet_distribution, }; -use crate::{FieldId, Index, Result}; +use crate::{Error, FieldId, FilterableAttributesRule, Index, PatternMatch, Result, UserError}; /// The default number of values by facets that will /// be fetched from the key-value store. @@ -287,37 +288,23 @@ impl<'a> FacetDistribution<'a> { } pub fn compute_stats(&self) -> Result> { - let fields_ids_map = self.index.fields_ids_map(self.rtxn)?; - let filterable_fields = self.index.filterable_fields(self.rtxn)?; let candidates = if let Some(candidates) = self.candidates.clone() { candidates } else { return Ok(Default::default()); }; - let fields = match &self.facets { - Some(facets) => { - let invalid_fields: HashSet<_> = facets - .iter() - .map(|(name, _)| name) - .filter(|facet| !crate::is_faceted(facet, &filterable_fields)) - .collect(); - if !invalid_fields.is_empty() { - return Err(UserError::InvalidFacetsDistribution { - invalid_facets_name: invalid_fields.into_iter().cloned().collect(), - valid_facets_name: filterable_fields.into_iter().collect(), - } - .into()); - } else { - facets.iter().map(|(name, _)| name).cloned().collect() - } - } - None => filterable_fields, - }; + let fields_ids_map = self.index.fields_ids_map(self.rtxn)?; + let fields_ids_map = FieldIdMapWithMetadata::new( + fields_ids_map, + MetadataBuilder::from_index(self.index, self.rtxn)?, + ); + let filterable_attributes_rules = self.index.filterable_attributes_rules(self.rtxn)?; + self.check_faceted_fields(&fields_ids_map, &filterable_attributes_rules)?; let mut distribution = BTreeMap::new(); - for (fid, name) in fields_ids_map.iter() { - if crate::is_faceted(name, &fields) { + for (fid, name, metadata) in fields_ids_map.iter() { + if self.select_field(name, &metadata, &filterable_attributes_rules) { let min_value = if let Some(min_value) = crate::search::facet::facet_min_value( self.index, self.rtxn, @@ -348,31 +335,16 @@ impl<'a> FacetDistribution<'a> { pub fn execute(&self) -> Result>> { let fields_ids_map = self.index.fields_ids_map(self.rtxn)?; - let filterable_fields = self.index.filterable_fields(self.rtxn)?; - - let fields = match self.facets { - Some(ref facets) => { - let invalid_fields: HashSet<_> = facets - .iter() - .map(|(name, _)| name) - .filter(|facet| !crate::is_faceted(facet, &filterable_fields)) - .collect(); - if !invalid_fields.is_empty() { - return Err(UserError::InvalidFacetsDistribution { - invalid_facets_name: invalid_fields.into_iter().cloned().collect(), - valid_facets_name: filterable_fields.into_iter().collect(), - } - .into()); - } else { - facets.iter().map(|(name, _)| name).cloned().collect() - } - } - None => filterable_fields, - }; + let fields_ids_map = FieldIdMapWithMetadata::new( + fields_ids_map, + MetadataBuilder::from_index(self.index, self.rtxn)?, + ); + let filterable_attributes_rules = self.index.filterable_attributes_rules(self.rtxn)?; + self.check_faceted_fields(&fields_ids_map, &filterable_attributes_rules)?; let mut distribution = BTreeMap::new(); - for (fid, name) in fields_ids_map.iter() { - if crate::is_faceted(name, &fields) { + for (fid, name, metadata) in fields_ids_map.iter() { + if self.select_field(name, &metadata, &filterable_attributes_rules) { let order_by = self .facets .as_ref() @@ -385,6 +357,61 @@ impl<'a> FacetDistribution<'a> { Ok(distribution) } + + /// Select a field if it is faceted and in the facets. + fn select_field( + &self, + name: &str, + metadata: &Metadata, + filterable_attributes_rules: &[FilterableAttributesRule], + ) -> bool { + match &self.facets { + Some(facets) => { + // The list of facets provided by the user is a legacy pattern ("dog.age" must be selected with "dog"). + facets.keys().any(|key| match_field_legacy(key, name) == PatternMatch::Match) + } + None => metadata.is_faceted(filterable_attributes_rules), + } + } + + /// Check if the fields in the facets are valid faceted fields. + fn check_faceted_fields( + &self, + fields_ids_map: &FieldIdMapWithMetadata, + filterable_attributes_rules: &[FilterableAttributesRule], + ) -> Result<()> { + let mut invalid_facets = BTreeSet::new(); + if let Some(facets) = &self.facets { + for (field, _) in facets { + let is_valid_faceted_field = + fields_ids_map.id_with_metadata(field).map_or(false, |(_, metadata)| { + metadata.is_faceted(filterable_attributes_rules) + }); + if !is_valid_faceted_field { + invalid_facets.insert(field.to_string()); + } + } + } + + if !invalid_facets.is_empty() { + let valid_facets_name = fields_ids_map + .iter() + .filter_map(|(_, name, metadata)| { + if metadata.is_faceted(filterable_attributes_rules) { + Some(name.to_string()) + } else { + None + } + }) + .collect(); + return Err(Error::UserError(UserError::InvalidFacetsDistribution { + invalid_facets_name: invalid_facets, + valid_facets_name, + })); + } + + Ok(()) + } } impl fmt::Debug for FacetDistribution<'_> {