From fe17c0f52e22b30fb6aec06fb233fbd4afbccf8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 22 May 2024 16:05:55 +0200 Subject: [PATCH] Construct the minimal OBKVs according to the settings diff --- meilisearch/tests/search/mod.rs | 20 +++ milli/src/lib.rs | 3 +- milli/src/update/index_documents/transform.rs | 137 +++++++++++------- milli/src/update/settings.rs | 26 ++-- 4 files changed, 122 insertions(+), 64 deletions(-) diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs index f601e2b03..b02c10319 100644 --- a/meilisearch/tests/search/mod.rs +++ b/meilisearch/tests/search/mod.rs @@ -680,6 +680,26 @@ async fn search_facet_distribution() { }, ) .await; + + index.update_settings(json!({"filterableAttributes": ["doggos.name"]})).await; + index.wait_task(5).await; + + index + .search( + json!({ + "facets": ["doggos.name"] + }), + |response, code| { + assert_eq!(code, 200, "{}", response); + let dist = response["facetDistribution"].as_object().unwrap(); + assert_eq!(dist.len(), 1); + assert_eq!( + dist["doggos.name"], + json!({ "bobby": 1, "buddy": 1, "gros bill": 1, "turbo": 1, "fast": 1}) + ); + }, + ) + .await; } #[actix_rt::test] diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 881633b5c..c74aa10e8 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -354,8 +354,7 @@ pub fn is_faceted(field: &str, faceted_fields: impl IntoIterator bool { - field.starts_with(facet) - && field[facet.len()..].chars().next().map(|c| c == '.').unwrap_or(true) + field.starts_with(facet) && field[facet.len()..].chars().next().map_or(true, |c| c == '.') } pub fn normalize_facet(original: &str) -> String { diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index aef4d1583..733e74800 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use std::collections::btree_map::Entry as BEntry; use std::collections::hash_map::Entry as HEntry; -use std::collections::HashMap; +use std::collections::{HashMap, HashSet}; use std::fs::File; use std::io::{Read, Seek}; @@ -20,13 +20,13 @@ use super::{IndexDocumentsMethod, IndexerConfig}; use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::index::{db_name, main_key}; -use crate::update::del_add::{ - del_add_from_two_obkvs, into_del_add_obkv, DelAdd, DelAddOperation, KvReaderDelAdd, -}; +use crate::update::del_add::{into_del_add_obkv, DelAdd, DelAddOperation, KvReaderDelAdd}; use crate::update::index_documents::GrenadParameters; use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff}; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; -use crate::{FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result}; +use crate::{ + is_faceted_by, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, +}; pub struct TransformOutput { pub primary_key: String, @@ -808,11 +808,15 @@ impl<'a, 'i> Transform<'a, 'i> { })?; let old_inner_settings = InnerIndexSettings::from_index(self.index, wtxn)?; + let fields_ids_map = self.fields_ids_map; + let primary_key_id = self.index.primary_key(wtxn)?.and_then(|name| fields_ids_map.id(name)); let mut new_inner_settings = old_inner_settings.clone(); - new_inner_settings.fields_ids_map = self.fields_ids_map; + new_inner_settings.fields_ids_map = fields_ids_map; + let settings_diff = InnerIndexSettingsDiff { old: old_inner_settings, new: new_inner_settings, + primary_key_id, embedding_configs_updated: false, settings_update_only: false, }; @@ -837,37 +841,66 @@ impl<'a, 'i> Transform<'a, 'i> { fn rebind_existing_document( old_obkv: KvReader, settings_diff: &InnerIndexSettingsDiff, - original_obkv_buffer: &mut Vec, - flattened_obkv_buffer: &mut Vec, + modified_faceted_fields: &HashSet, + original_obkv_buffer: Option<&mut Vec>, + flattened_obkv_buffer: Option<&mut Vec>, ) -> Result<()> { - // TODO do a XOR of the faceted fields - // TODO if reindex_searchable returns true store all searchables else none - // TODO no longer useful after Tamo's PR - let mut old_fields_ids_map = settings_diff.old.fields_ids_map.clone(); - let mut new_fields_ids_map = settings_diff.new.fields_ids_map.clone(); + // Always keep the primary key. + let is_primary_key = |id: FieldId| -> bool { settings_diff.primary_key_id == Some(id) }; + + // If only the `searchableAttributes` has been changed, keep only the searchable fields. + let must_reindex_searchables = settings_diff.reindex_searchable(); + let necessary_searchable_field = |id: FieldId| -> bool { + must_reindex_searchables + && (settings_diff.old.searchable_fields_ids.contains(&id) + || settings_diff.new.searchable_fields_ids.contains(&id)) + }; + + // If only a faceted field has been added, keep only this field. + let must_reindex_facets = settings_diff.reindex_facets(); + let necessary_faceted_field = |id: FieldId| -> bool { + let field_name = settings_diff.new.fields_ids_map.name(id).unwrap(); + must_reindex_facets + && modified_faceted_fields + .iter() + .any(|long| is_faceted_by(long, field_name) || is_faceted_by(field_name, long)) + }; + + // Alway provide all fields when vectors are involved because + // we need the fields for the prompt/templating. + let reindex_vectors = settings_diff.reindex_vectors(); + let mut obkv_writer = KvWriter::<_, FieldId>::memory(); - // We iterate over the new `FieldsIdsMap` ids in order and construct the new obkv. - for (id, name) in new_fields_ids_map.iter() { - if let Some(val) = old_fields_ids_map.id(name).and_then(|id| old_obkv.get(id)) { + for (id, val) in old_obkv.iter() { + if is_primary_key(id) + || necessary_searchable_field(id) + || necessary_faceted_field(id) + || reindex_vectors + { obkv_writer.insert(id, val)?; } } let data = obkv_writer.into_inner()?; - let new_obkv = KvReader::::new(&data); + let obkv = KvReader::::new(&data); - // take the non-flattened version if flatten_from_fields_ids_map returns None. - let old_flattened = Self::flatten_from_fields_ids_map(&old_obkv, &mut old_fields_ids_map)?; - let old_flattened = - old_flattened.as_deref().map_or_else(|| old_obkv, KvReader::::new); - let new_flattened = Self::flatten_from_fields_ids_map(&new_obkv, &mut new_fields_ids_map)?; - let new_flattened = - new_flattened.as_deref().map_or_else(|| new_obkv, KvReader::::new); + if let Some(original_obkv_buffer) = original_obkv_buffer { + original_obkv_buffer.clear(); + into_del_add_obkv(obkv, DelAddOperation::DeletionAndAddition, original_obkv_buffer)?; + } - original_obkv_buffer.clear(); - flattened_obkv_buffer.clear(); + if let Some(flattened_obkv_buffer) = flattened_obkv_buffer { + // take the non-flattened version if flatten_from_fields_ids_map returns None. + let mut fields_ids_map = settings_diff.new.fields_ids_map.clone(); + let flattened = Self::flatten_from_fields_ids_map(&obkv, &mut fields_ids_map)?; + let flattened = flattened.as_deref().map_or(obkv, KvReader::new); - del_add_from_two_obkvs(&old_obkv, &new_obkv, original_obkv_buffer)?; - del_add_from_two_obkvs(&old_flattened, &new_flattened, flattened_obkv_buffer)?; + flattened_obkv_buffer.clear(); + into_del_add_obkv( + flattened, + DelAddOperation::DeletionAndAddition, + flattened_obkv_buffer, + )?; + } Ok(()) } @@ -924,30 +957,34 @@ impl<'a, 'i> Transform<'a, 'i> { None }; - let mut original_obkv_buffer = Vec::new(); - let mut flattened_obkv_buffer = Vec::new(); - let mut document_sorter_key_buffer = Vec::new(); - for result in self.index.external_documents_ids().iter(wtxn)? { - let (external_id, docid) = result?; - let old_obkv = self.index.documents.get(wtxn, &docid)?.ok_or( - InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, - )?; + if original_sorter.is_some() || flattened_sorter.is_some() { + let modified_faceted_fields = settings_diff.modified_faceted_fields(); + let mut original_obkv_buffer = Vec::new(); + let mut flattened_obkv_buffer = Vec::new(); + let mut document_sorter_key_buffer = Vec::new(); + for result in self.index.external_documents_ids().iter(wtxn)? { + let (external_id, docid) = result?; + let old_obkv = self.index.documents.get(wtxn, &docid)?.ok_or( + InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, + )?; - Self::rebind_existing_document( - old_obkv, - &settings_diff, - &mut original_obkv_buffer, - &mut flattened_obkv_buffer, - )?; + Self::rebind_existing_document( + old_obkv, + &settings_diff, + &modified_faceted_fields, + Some(&mut original_obkv_buffer).filter(|_| original_sorter.is_some()), + Some(&mut flattened_obkv_buffer).filter(|_| flattened_sorter.is_some()), + )?; - document_sorter_key_buffer.clear(); - document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes()); - document_sorter_key_buffer.extend_from_slice(external_id.as_bytes()); - if let Some(original_sorter) = original_sorter.as_mut() { - original_sorter.insert(&document_sorter_key_buffer, &original_obkv_buffer)?; - } - if let Some(flattened_sorter) = flattened_sorter.as_mut() { - flattened_sorter.insert(docid.to_be_bytes(), &flattened_obkv_buffer)?; + if let Some(original_sorter) = original_sorter.as_mut() { + document_sorter_key_buffer.clear(); + document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes()); + document_sorter_key_buffer.extend_from_slice(external_id.as_bytes()); + original_sorter.insert(&document_sorter_key_buffer, &original_obkv_buffer)?; + } + if let Some(flattened_sorter) = flattened_sorter.as_mut() { + flattened_sorter.insert(docid.to_be_bytes(), &flattened_obkv_buffer)?; + } } } diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 1529e1fe6..0fd39ce77 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1067,10 +1067,17 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { // 3. Keep the old vectors but reattempt indexing on a prompt change: only actually changed prompt will need embedding + storage let embedding_configs_updated = self.update_embedding_configs()?; - let new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn)?; + let mut new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn)?; + new_inner_settings.recompute_facets(self.wtxn, self.index)?; + + let primary_key_id = self + .index + .primary_key(self.wtxn)? + .and_then(|name| new_inner_settings.fields_ids_map.id(name)); let inner_settings_diff = InnerIndexSettingsDiff { old: old_inner_settings, new: new_inner_settings, + primary_key_id, embedding_configs_updated, settings_update_only: true, }; @@ -1086,10 +1093,9 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { pub struct InnerIndexSettingsDiff { pub(crate) old: InnerIndexSettings, pub(crate) new: InnerIndexSettings, - + pub(crate) primary_key_id: Option, // TODO: compare directly the embedders. pub(crate) embedding_configs_updated: bool, - pub(crate) settings_update_only: bool, } @@ -1127,15 +1133,7 @@ impl InnerIndexSettingsDiff { return true; } - let faceted_updated = - (existing_fields - old_faceted_fields) != (existing_fields - new_faceted_fields); - - self.old - .fields_ids_map - .iter() - .zip(self.new.fields_ids_map.iter()) - .any(|(old, new)| old != new) - || faceted_updated + (existing_fields - old_faceted_fields) != (existing_fields - new_faceted_fields) } pub fn reindex_vectors(&self) -> bool { @@ -1145,6 +1143,10 @@ impl InnerIndexSettingsDiff { pub fn settings_update_only(&self) -> bool { self.settings_update_only } + + pub fn modified_faceted_fields(&self) -> HashSet { + &self.old.user_defined_faceted_fields ^ &self.new.user_defined_faceted_fields + } } #[derive(Clone)]