diff --git a/milli/src/fieldids_weights_map.rs b/milli/src/fieldids_weights_map.rs
new file mode 100644
index 000000000..255f6ab80
--- /dev/null
+++ b/milli/src/fieldids_weights_map.rs
@@ -0,0 +1,28 @@
+use std::collections::HashMap;
+
+use serde::{Deserialize, Serialize};
+
+use crate::{FieldId, Weight};
+
+#[derive(Debug, Default, Serialize, Deserialize)]
+pub struct FieldidsWeightsMap {
+    map: HashMap<FieldId, Weight>,
+}
+
+impl FieldidsWeightsMap {
+    pub fn insert(&mut self, fid: FieldId, weight: Weight) -> Option<Weight> {
+        self.map.insert(fid, weight)
+    }
+
+    pub fn remove(&mut self, fid: FieldId) -> Option<Weight> {
+        self.map.remove(&fid)
+    }
+
+    pub fn weight(&self, fid: FieldId) -> Option<Weight> {
+        self.map.get(&fid).copied()
+    }
+
+    pub fn max_weight(&self) -> Option<Weight> {
+        self.map.values().copied().max()
+    }
+}
diff --git a/milli/src/index.rs b/milli/src/index.rs
index 27b273393..b6b07404b 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -1,5 +1,6 @@
 use std::borrow::Cow;
 use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
+use std::convert::TryInto;
 use std::fs::File;
 use std::path::Path;
@@ -25,8 +26,9 @@ use crate::proximity::ProximityPrecision;
 use crate::vector::EmbeddingConfig;
 use crate::{
     default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
-    FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, GeoPoint, ObkvCodec,
-    Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec, BEU16, BEU32, BEU64,
+    FacetDistribution, FieldDistribution, FieldId, FieldIdWordCountCodec, FieldidsWeightsMap,
+    GeoPoint, ObkvCodec, Result, RoaringBitmapCodec, RoaringBitmapLenCodec, Search, U8StrStrCodec,
+    BEU16, BEU32, BEU64,
 };
 
 pub const DEFAULT_MIN_WORD_LEN_ONE_TYPO: u8 = 5;
@@ -42,6 +44,7 @@ pub mod main_key {
     pub const SORTABLE_FIELDS_KEY: &str = "sortable-fields";
     pub const FIELD_DISTRIBUTION_KEY: &str = "fields-distribution";
     pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map";
+    pub const FIELDIDS_WEIGHTS_MAP_KEY: &str = "fieldids-weights-map";
     pub const GEO_FACETED_DOCUMENTS_IDS_KEY: &str = "geo-faceted-documents-ids";
     pub const GEO_RTREE_KEY: &str = "geo-rtree";
     pub const PRIMARY_KEY_KEY: &str = "primary-key";
@@ -414,6 +417,32 @@ impl Index {
             .unwrap_or_default())
     }
 
+    /* fieldids weights map */
+    // This maps the fields ids to their weights.
+    // Their weights is defined by the ordering of the searchable attributes.
+
+    /// Writes the fieldids weights map which associates the field ids to their weights
+    pub(crate) fn put_fieldids_weights_map(
+        &self,
+        wtxn: &mut RwTxn,
+        map: &FieldidsWeightsMap,
+    ) -> heed::Result<()> {
+        self.main.remap_types::<Str, SerdeJson<FieldidsWeightsMap>>().put(
+            wtxn,
+            main_key::FIELDIDS_WEIGHTS_MAP_KEY,
+            map,
+        )
+    }
+
+    /// Get the fieldids weights map which associates the field ids to their weights
+    pub fn fieldids_weights_map(&self, rtxn: &RoTxn) -> heed::Result<FieldidsWeightsMap> {
+        Ok(self
+            .main
+            .remap_types::<Str, SerdeJson<FieldidsWeightsMap>>()
+            .get(rtxn, main_key::FIELDIDS_WEIGHTS_MAP_KEY)?
+            .unwrap_or_default())
+    }
+
     /* geo rtree */
 
     /// Writes the provided `rtree` which associates coordinates to documents ids.
@@ -578,10 +607,12 @@ impl Index {
         wtxn: &mut RwTxn,
         user_fields: &[&str],
         fields_ids_map: &FieldsIdsMap,
-    ) -> heed::Result<()> {
+    ) -> Result<()> {
         // We can write the user defined searchable fields as-is.
         self.put_user_defined_searchable_fields(wtxn, user_fields)?;
 
+        let mut weights = self.fieldids_weights_map(&wtxn)?;
+
         // Now we generate the real searchable fields:
         // 1. Take the user defined searchable fields as-is to keep the priority defined by the attributes criterion.
         // 2. Iterate over the user defined searchable fields.
         // 3. If a user defined field is a subset of a field defined in the fields_ids_map
         // (ie doggo.name is a subset of doggo) then we push it at the end of the fields.
         let mut real_fields = user_fields.to_vec();
 
-        for field_from_map in fields_ids_map.names() {
-            for user_field in user_fields {
+        for (id, field_from_map) in fields_ids_map.iter() {
+            for (weight, user_field) in user_fields.iter().enumerate() {
                 if crate::is_faceted_by(field_from_map, user_field)
                     && !user_fields.contains(&field_from_map)
                 {
                     real_fields.push(field_from_map);
+
+                    let weight: u16 =
+                        weight.try_into().map_err(|_| UserError::AttributeLimitReached)?;
+                    weights.insert(id, weight);
                 }
             }
         }
 
-        self.put_searchable_fields(wtxn, &real_fields)
+        self.put_searchable_fields(wtxn, &real_fields)?;
+        self.put_fieldids_weights_map(wtxn, &weights)?;
+
+        Ok(())
     }
 
     pub(crate) fn delete_all_searchable_fields(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
@@ -623,28 +660,31 @@ impl Index {
     }
 
     /// Returns the searchable fields, those are the fields that are indexed,
-    /// if the searchable fields aren't there it means that **all** the fields are indexed.
-    pub fn searchable_fields<'t>(&self, rtxn: &'t RoTxn) -> heed::Result<Option<Vec<&'t str>>> {
+    pub fn searchable_fields<'t>(&self, rtxn: &'t RoTxn) -> heed::Result<Vec<Cow<'t, str>>> {
         self.main
             .remap_types::<Str, SerdeBincode<Vec<&'t str>>>()
-            .get(rtxn, main_key::SEARCHABLE_FIELDS_KEY)
+            .get(rtxn, main_key::SEARCHABLE_FIELDS_KEY)?
+            .map(|fields| Ok(fields.into_iter().map(|field| Cow::Borrowed(field)).collect()))
+            .unwrap_or_else(|| {
+                Ok(self
+                    .fields_ids_map(rtxn)?
+                    .names()
+                    .map(|field| Cow::Owned(field.to_string()))
+                    .collect())
+            })
     }
 
     /// Identical to `searchable_fields`, but returns the ids instead.
-    pub fn searchable_fields_ids(&self, rtxn: &RoTxn) -> Result<Option<Vec<FieldId>>> {
-        match self.searchable_fields(rtxn)? {
-            Some(fields) => {
-                let fields_ids_map = self.fields_ids_map(rtxn)?;
-                let mut fields_ids = Vec::new();
-                for name in fields {
-                    if let Some(field_id) = fields_ids_map.id(name) {
-                        fields_ids.push(field_id);
-                    }
-                }
-                Ok(Some(fields_ids))
+    pub fn searchable_fields_ids(&self, rtxn: &RoTxn) -> Result<Vec<FieldId>> {
+        let fields = self.searchable_fields(rtxn)?;
+        let fields_ids_map = self.fields_ids_map(rtxn)?;
+        let mut fields_ids = Vec::new();
+        for name in fields {
+            if let Some(field_id) = fields_ids_map.id(&name) {
+                fields_ids.push(field_id);
             }
-            None => Ok(None),
         }
+        Ok(fields_ids)
     }
 
     /// Writes the searchable fields, when this list is specified, only these are indexed.
@@ -1710,10 +1750,14 @@ pub(crate) mod tests {
         ]))
         .unwrap();
 
-        db_snap!(index, field_distribution, 1);
+        db_snap!(index, field_distribution, @r###"
+        age              1      |
+        id               2      |
+        name             2      |
+        "###);
 
         db_snap!(index, word_docids,
-        @r###"
+            @r###"
         1                [0, ]
         2                [1, ]
         20               [1, ]
         "###
         );
 
-        db_snap!(index, field_distribution);
-
-        db_snap!(index, field_distribution,
-            @r###"
-        age              1      |
-        id               2      |
-        name             2      |
-        "###
-        );
-
-        // snapshot_index!(&index, "1", include: "^field_distribution$");
-
         // we add all the documents a second time. we are supposed to get the same
         // field_distribution in the end
         index
@@ -1820,7 +1852,7 @@ pub(crate) mod tests {
         // ensure we get the right real searchable fields + user defined searchable fields
         let rtxn = index.read_txn().unwrap();
 
-        let real = index.searchable_fields(&rtxn).unwrap().unwrap();
+        let real = index.searchable_fields(&rtxn).unwrap();
         assert_eq!(real, &["doggo", "name", "doggo.name", "doggo.age"]);
 
         let user_defined = index.user_defined_searchable_fields(&rtxn).unwrap().unwrap();
@@ -1840,7 +1872,7 @@ pub(crate) mod tests {
         // ensure we get the right real searchable fields + user defined searchable fields
         let rtxn = index.read_txn().unwrap();
 
-        let real = index.searchable_fields(&rtxn).unwrap().unwrap();
+        let real = index.searchable_fields(&rtxn).unwrap();
         assert_eq!(real, &["doggo", "name"]);
         let user_defined = index.user_defined_searchable_fields(&rtxn).unwrap().unwrap();
         assert_eq!(user_defined, &["doggo", "name"]);
@@ -1856,7 +1888,7 @@ pub(crate) mod tests {
         // ensure we get the right real searchable fields + user defined searchable fields
         let rtxn = index.read_txn().unwrap();
 
-        let real = index.searchable_fields(&rtxn).unwrap().unwrap();
+        let real = index.searchable_fields(&rtxn).unwrap();
         assert_eq!(real, &["doggo", "name", "doggo.name", "doggo.age"]);
 
         let user_defined = index.user_defined_searchable_fields(&rtxn).unwrap().unwrap();
diff --git a/milli/src/lib.rs b/milli/src/lib.rs
index a1e240464..881633b5c 100644
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@@ -28,6 +28,7 @@ pub mod vector;
 #[cfg(test)]
 #[macro_use]
 pub mod snapshot_tests;
+mod fieldids_weights_map;
 
 use std::collections::{BTreeMap, HashMap};
 use std::convert::{TryFrom, TryInto};
@@ -52,6 +53,7 @@ pub use self::error::{
     Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError,
 };
 pub use self::external_documents_ids::ExternalDocumentsIds;
+pub use self::fieldids_weights_map::FieldidsWeightsMap;
 pub use self::fields_ids_map::FieldsIdsMap;
 pub use self::heed_codec::{
     BEU16StrCodec, BEU32StrCodec, BoRoaringBitmapCodec, BoRoaringBitmapLenCodec,
@@ -77,6 +79,7 @@ pub type FastMap4<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher32>>;
 pub type FastMap8<K, V> = HashMap<K, V, BuildHasherDefault<FxHasher64>>;
 pub type FieldDistribution = BTreeMap<String, u64>;
 pub type FieldId = u16;
+pub type Weight = u16;
 pub type Object = serde_json::Map<String, Value>;
 pub type Position = u32;
 pub type RelativePosition = u16;
diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs
index 62c921a1d..a99000f60 100644
--- a/milli/src/search/new/db_cache.rs
+++ b/milli/src/search/new/db_cache.rs
@@ -315,11 +315,7 @@ impl<'ctx> SearchContext<'ctx> {
                 .map_err(heed::Error::Decoding)?
             } else {
                 // Compute the distance at the attribute level and store it in the cache.
-                let fids = if let Some(fids) = self.index.searchable_fields_ids(self.txn)? {
-                    fids
-                } else {
-                    self.index.fields_ids_map(self.txn)?.ids().collect()
-                };
+                let fids = self.index.searchable_fields_ids(self.txn)?;
                 let mut docids = RoaringBitmap::new();
                 for fid in fids {
                     // for each field, intersect left word bitmap and right word bitmap,
@@ -408,11 +404,7 @@ impl<'ctx> SearchContext<'ctx> {
         let prefix_docids = match proximity_precision {
             ProximityPrecision::ByAttribute => {
                 // Compute the distance at the attribute level and store it in the cache.
-                let fids = if let Some(fids) = self.index.searchable_fields_ids(self.txn)? {
-                    fids
-                } else {
-                    self.index.fields_ids_map(self.txn)?.ids().collect()
-                };
+                let fids = self.index.searchable_fields_ids(self.txn)?;
                 let mut prefix_docids = RoaringBitmap::new();
                 // for each field, intersect left word bitmap and right word bitmap,
                 // then merge the result in a global bitmap before storing it in the cache.
diff --git a/milli/src/search/new/exact_attribute.rs b/milli/src/search/new/exact_attribute.rs
index 7932f0c2a..41b70ae39 100644
--- a/milli/src/search/new/exact_attribute.rs
+++ b/milli/src/search/new/exact_attribute.rs
@@ -184,13 +184,7 @@ impl State {
             return Ok(State::Empty(query_graph.clone()));
         }
 
-        let searchable_fields_ids = {
-            if let Some(fids) = ctx.index.searchable_fields_ids(ctx.txn)? {
-                fids
-            } else {
-                ctx.index.fields_ids_map(ctx.txn)?.ids().collect()
-            }
-        };
+        let searchable_fields_ids = ctx.index.searchable_fields_ids(ctx.txn)?;
 
         let mut candidates_per_attribute = Vec::with_capacity(searchable_fields_ids.len());
         // then check that there exists at least one attribute that has all of the terms
diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs
index 617068ef8..acbb3638b 100644
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@@ -96,27 +96,22 @@ impl<'ctx> SearchContext<'ctx> {
                 contains_wildcard = true;
                 continue;
             }
-            let searchable_contains_name =
-                searchable_names.as_ref().map(|sn| sn.iter().any(|name| name == field_name));
+            let searchable_contains_name = searchable_names.iter().any(|name| name == field_name);
            let fid = match (fids_map.id(field_name), searchable_contains_name) {
                 // The Field id exist and the field is searchable
-                (Some(fid), Some(true)) | (Some(fid), None) => fid,
+                (Some(fid), true) => fid,
                 // The field is searchable but the Field id doesn't exist => Internal Error
-                (None, Some(true)) => {
+                (None, true) => {
                     return Err(FieldIdMapMissingEntry::FieldName {
                         field_name: field_name.to_string(),
                         process: "search",
                     }
                     .into())
                 }
-                // The field is not searchable, but the searchableAttributes are set to * => ignore field
-                (None, None) => continue,
                 // The field is not searchable => User error
-                (_fid, Some(false)) => {
-                    let (valid_fields, hidden_fields) = match searchable_names {
-                        Some(sn) => self.index.remove_hidden_fields(self.txn, sn)?,
-                        None => self.index.remove_hidden_fields(self.txn, fids_map.names())?,
-                    };
+                (_fid, false) => {
+                    let (valid_fields, hidden_fields) =
+                        self.index.remove_hidden_fields(self.txn, searchable_names)?;
 
                     let field = field_name.to_string();
                     return Err(UserError::InvalidSearchableAttribute {
diff --git a/milli/src/search/new/ranking_rule_graph/fid/mod.rs b/milli/src/search/new/ranking_rule_graph/fid/mod.rs
index 8f3e0cc82..cf65249de 100644
--- a/milli/src/search/new/ranking_rule_graph/fid/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/fid/mod.rs
@@ -77,17 +77,7 @@ impl RankingRuleGraphTrait for FidGraph {
         }
 
         // always lookup the max_fid if we don't already and add an artificial condition for max scoring
-        let max_fid: Option<u16> = {
-            if let Some(max_fid) = ctx
-                .index
-                .searchable_fields_ids(ctx.txn)?
-                .map(|field_ids| field_ids.into_iter().max())
-            {
-                max_fid
-            } else {
-                ctx.index.fields_ids_map(ctx.txn)?.ids().max()
-            }
-        };
+        let max_fid: Option<u16> = ctx.index.searchable_fields_ids(ctx.txn)?.into_iter().max();
 
         if let Some(max_fid) = max_fid {
             if !all_fields.contains(&max_fid) {
diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
index 6af5bba6d..d97b6639e 100644
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -186,7 +186,7 @@ fn searchable_fields_changed(
 ) -> bool {
     let searchable_fields = &settings_diff.new.searchable_fields_ids;
     for (field_id, field_bytes) in obkv.iter() {
-        if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
+        if searchable_fields.contains(&field_id) {
             let del_add = KvReaderDelAdd::new(field_bytes);
             match (del_add.get(DelAdd::Deletion), del_add.get(DelAdd::Addition)) {
                 // if both fields are None, check the next field.
@@ -298,7 +298,7 @@ fn lang_safe_tokens_from_document<'a>(
 
 /// Extract words mapped with their positions of a document.
fn tokens_from_document<'a>(
     obkv: &KvReader<'a, FieldId>,
-    searchable_fields: &Option<Vec<FieldId>>,
+    searchable_fields: &[FieldId],
     tokenizer: &Tokenizer,
     max_positions_per_attributes: u32,
     del_add: DelAdd,
@@ -309,7 +309,7 @@ fn tokens_from_document<'a>(
     let mut document_writer = KvWriterU16::new(&mut buffers.obkv_buffer);
     for (field_id, field_bytes) in obkv.iter() {
         // if field is searchable.
-        if searchable_fields.as_ref().map_or(true, |sf| sf.contains(&field_id)) {
+        if searchable_fields.contains(&field_id) {
             // extract deletion or addition only.
             if let Some(field_bytes) = KvReaderDelAdd::new(field_bytes).get(del_add) {
                 // parse json.
diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs
index 1997e966e..c0742a74a 100644
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -468,14 +468,9 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
             Setting::Set(ref fields) => {
                 // Check to see if the searchable fields changed before doing anything else
                 let old_fields = self.index.searchable_fields(self.wtxn)?;
-                let did_change = match old_fields {
-                    // If old_fields is Some, let's check to see if the fields actually changed
-                    Some(old_fields) => {
-                        let new_fields = fields.iter().map(String::as_str).collect::<Vec<_>>();
-                        new_fields != old_fields
-                    }
-                    // If old_fields is None, the fields have changed (because they are being set)
-                    None => true,
+                let did_change = {
+                    let new_fields = fields.iter().map(String::as_str).collect::<Vec<_>>();
+                    new_fields != old_fields
                 };
                 if !did_change {
                     return Ok(false);
@@ -1172,7 +1167,7 @@ pub(crate) struct InnerIndexSettings {
     pub user_defined_faceted_fields: HashSet<String>,
     pub user_defined_searchable_fields: Option<Vec<String>>,
     pub faceted_fields_ids: HashSet<FieldId>,
-    pub searchable_fields_ids: Option<Vec<FieldId>>,
+    pub searchable_fields_ids: Vec<FieldId>,
     pub exact_attributes: HashSet<String>,
     pub proximity_precision: ProximityPrecision,
     pub embedding_configs: EmbeddingConfigs,
@@ -1517,6 +1512,7 @@ mod tests {
     use big_s::S;
     use heed::types::Bytes;
     use maplit::{btreemap, btreeset, hashset};
+    use meili_snap::snapshot;
 
     use super::*;
     use crate::error::Error;
@@ -1576,7 +1572,7 @@ mod tests {
         // Check that the searchable field have been reset and documents are found now.
         let rtxn = index.read_txn().unwrap();
         let searchable_fields = index.searchable_fields(&rtxn).unwrap();
-        assert_eq!(searchable_fields, None);
+        snapshot!(format!("{searchable_fields:?}"), @r###"["name", "id", "age"]"###);
         let result = index.search(&rtxn).query("23").execute().unwrap();
         assert_eq!(result.documents_ids.len(), 1);
         let documents = index.documents(&rtxn, result.documents_ids).unwrap();