From d18c1f77d7453b2842851a947621cbe5687fdbb5 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 12 Jun 2024 14:04:54 +0200 Subject: [PATCH] Update embedder configs with a finer granularity - no longer clear vector DB between any two embedder changes --- milli/src/update/settings.rs | 278 +++++++++++++++++++++-------------- 1 file changed, 171 insertions(+), 107 deletions(-) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 08b12d178..5421b64a7 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -23,7 +23,10 @@ use crate::proximity::ProximityPrecision; use crate::update::index_documents::IndexDocumentsMethod; use crate::update::{IndexDocuments, UpdateIndexingStep}; use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME; -use crate::vector::settings::{check_set, check_unset, EmbedderSource, EmbeddingSettings}; +use crate::vector::settings::{ + check_set, check_unset, EmbedderAction, EmbedderSource, EmbeddingSettings, ReindexAction, + WriteBackToDocuments, +}; use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs}; use crate::{FieldId, FieldsIdsMap, Index, Result}; @@ -924,111 +927,177 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { Ok(changed) } - fn update_embedding_configs(&mut self) -> Result { - let update = match std::mem::take(&mut self.embedder_settings) { - Setting::Set(configs) => { - let mut changed = false; + fn update_embedding_configs(&mut self) -> Result> { + match std::mem::take(&mut self.embedder_settings) { + Setting::Set(configs) => self.update_embedding_configs_set(configs), + Setting::Reset => { + // all vectors should be written back to documents let old_configs = self.index.embedding_configs(self.wtxn)?; - let old_configs: BTreeMap, RoaringBitmap)> = - old_configs - .into_iter() - .map( - |IndexEmbeddingConfig { name, config, user_provided: user_defined }| { - (name, (Setting::Set(config.into()), user_defined)) - }, - ) - .collect(); - - let mut new_configs = BTreeMap::new(); - for joined in old_configs + let remove_all: Result> = old_configs .into_iter() - .merge_join_by(configs.into_iter(), |(left, _), (right, _)| left.cmp(right)) - { - match joined { - // updated config - EitherOrBoth::Both((name, (mut old, user_provided)), (_, new)) => { - changed |= EmbeddingSettings::apply_and_need_reindex(&mut old, new); - if changed { - tracing::debug!( - embedder = name, - user_provided = user_provided.len(), - "need reindex" - ); - } else { - tracing::debug!(embedder = name, "skip reindex"); - } - let new = validate_embedding_settings(old, &name)?; - new_configs.insert(name, (new, user_provided)); - } - // unchanged config - EitherOrBoth::Left((name, setting)) => { - new_configs.insert(name, setting); - } - // new config - EitherOrBoth::Right((name, mut setting)) => { - // apply the default source in case the source was not set so that it gets validated - crate::vector::settings::EmbeddingSettings::apply_default_source( - &mut setting, - ); - crate::vector::settings::EmbeddingSettings::apply_default_openai_model( - &mut setting, - ); - let setting = validate_embedding_settings(setting, &name)?; - changed = true; - new_configs.insert(name, (setting, RoaringBitmap::new())); - } - } - } - let new_configs: Vec = new_configs - .into_iter() - .filter_map(|(name, (config, user_provided))| match config { - Setting::Set(config) => Some(IndexEmbeddingConfig { + .map(|IndexEmbeddingConfig { name, config: _, user_provided }| -> Result<_> { + let embedder_id = + self.index.embedder_category_id.get(self.wtxn, &name)?.ok_or( + crate::InternalError::DatabaseMissingEntry { + db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID, + key: None, + }, + )?; + Ok(( name, - config: config.into(), - user_provided, - }), - Setting::Reset => None, - Setting::NotSet => Some(IndexEmbeddingConfig { - name, - config: EmbeddingSettings::default().into(), - user_provided, - }), + EmbedderAction::WriteBackToDocuments(WriteBackToDocuments { + embedder_id, + user_provided, + }), + )) }) .collect(); + let remove_all = remove_all?; + self.index.embedder_category_id.clear(self.wtxn)?; - for (index, index_embedding_config) in new_configs.iter().enumerate() { - self.index.embedder_category_id.put_with_flags( - self.wtxn, - heed::PutFlags::APPEND, - &index_embedding_config.name, - &index - .try_into() - .map_err(|_| UserError::TooManyEmbedders(new_configs.len()))?, - )?; - } - - if new_configs.is_empty() { - self.index.delete_embedding_configs(self.wtxn)?; - } else { - self.index.put_embedding_configs(self.wtxn, new_configs)?; - } - changed - } - Setting::Reset => { self.index.delete_embedding_configs(self.wtxn)?; - true + Ok(remove_all) } - Setting::NotSet => false, - }; - - // if any changes force a reindexing - // clear the vector database. - if update { - self.index.vector_arroy.clear(self.wtxn)?; + Setting::NotSet => Ok(Default::default()), } + } - Ok(update) + fn update_embedding_configs_set( + &mut self, + configs: BTreeMap>, + ) -> Result> { + use crate::vector::settings::SettingsDiff; + + let old_configs = self.index.embedding_configs(self.wtxn)?; + let old_configs: BTreeMap = old_configs + .into_iter() + .map(|IndexEmbeddingConfig { name, config, user_provided }| { + (name, (config.into(), user_provided)) + }) + .collect(); + let mut updated_configs = BTreeMap::new(); + let mut embedder_actions = BTreeMap::new(); + for joined in old_configs + .into_iter() + .merge_join_by(configs.into_iter(), |(left, _), (right, _)| left.cmp(right)) + { + match joined { + // updated config + EitherOrBoth::Both((name, (old, user_provided)), (_, new)) => { + let settings_diff = SettingsDiff::from_settings(old, new); + match settings_diff { + SettingsDiff::Remove => { + tracing::debug!( + embedder = name, + user_provided = user_provided.len(), + "removing embedder" + ); + let embedder_id = + self.index.embedder_category_id.get(self.wtxn, &name)?.ok_or( + crate::InternalError::DatabaseMissingEntry { + db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID, + key: None, + }, + )?; + // free id immediately + self.index.embedder_category_id.delete(self.wtxn, &name)?; + embedder_actions.insert( + name, + EmbedderAction::WriteBackToDocuments(WriteBackToDocuments { + embedder_id, + user_provided, + }), + ); + } + SettingsDiff::Reindex { action, updated_settings } => { + tracing::debug!( + embedder = name, + user_provided = user_provided.len(), + ?action, + "reindex embedder" + ); + embedder_actions.insert(name.clone(), EmbedderAction::Reindex(action)); + let new = + validate_embedding_settings(Setting::Set(updated_settings), &name)?; + updated_configs.insert(name, (new, user_provided)); + } + SettingsDiff::UpdateWithoutReindex { updated_settings } => { + tracing::debug!( + embedder = name, + user_provided = user_provided.len(), + "update without reindex embedder" + ); + let new = + validate_embedding_settings(Setting::Set(updated_settings), &name)?; + updated_configs.insert(name, (new, user_provided)); + } + } + } + // unchanged config + EitherOrBoth::Left((name, (setting, user_provided))) => { + tracing::debug!(embedder = name, "unchanged embedder"); + updated_configs.insert(name, (Setting::Set(setting), user_provided)); + } + // new config + EitherOrBoth::Right((name, mut setting)) => { + tracing::debug!(embedder = name, "new embedder"); + // apply the default source in case the source was not set so that it gets validated + crate::vector::settings::EmbeddingSettings::apply_default_source(&mut setting); + crate::vector::settings::EmbeddingSettings::apply_default_openai_model( + &mut setting, + ); + let setting = validate_embedding_settings(setting, &name)?; + embedder_actions + .insert(name.clone(), EmbedderAction::Reindex(ReindexAction::FullReindex)); + updated_configs.insert(name, (setting, RoaringBitmap::new())); + } + } + } + let mut free_indices: [bool; u8::MAX as usize] = [true; u8::MAX as usize]; + for res in self.index.embedder_category_id.iter(self.wtxn)? { + let (_name, id) = res?; + free_indices[id as usize] = false; + } + let mut free_indices = free_indices.iter_mut().enumerate(); + let mut find_free_index = + move || free_indices.find(|(_, free)| **free).map(|(index, _)| index as u8); + for (name, action) in embedder_actions.iter() { + match action { + EmbedderAction::Reindex(ReindexAction::RegeneratePrompts) => { + /* cannot be a new embedder, so has to have an id already */ + } + EmbedderAction::Reindex(ReindexAction::FullReindex) => { + if self.index.embedder_category_id.get(self.wtxn, name)?.is_none() { + let id = find_free_index() + .ok_or(UserError::TooManyEmbedders(updated_configs.len()))?; + tracing::debug!(embedder = name, id, "assigning free id to new embedder"); + self.index.embedder_category_id.put(self.wtxn, name, &id)?; + } + } + EmbedderAction::WriteBackToDocuments(_) => { /* already removed */ } + } + } + let updated_configs: Vec = updated_configs + .into_iter() + .filter_map(|(name, (config, user_provided))| match config { + Setting::Set(config) => { + Some(IndexEmbeddingConfig { name, config: config.into(), user_provided }) + } + Setting::Reset => None, + Setting::NotSet => Some(IndexEmbeddingConfig { + name, + config: EmbeddingSettings::default().into(), + user_provided, + }), + }) + .collect(); + if updated_configs.is_empty() { + self.index.delete_embedding_configs(self.wtxn)?; + } else { + self.index.put_embedding_configs(self.wtxn, updated_configs)?; + } + Ok(embedder_actions) } fn update_search_cutoff(&mut self) -> Result { @@ -1082,13 +1151,8 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { self.update_searchable()?; self.update_exact_attributes()?; self.update_proximity_precision()?; - // TODO: very rough approximation of the needs for reindexing where any change will result in - // a full reindexing. - // What can be done instead: - // 1. Only change the distance on a distance change - // 2. Only change the name -> embedder mapping on a name change - // 3. Keep the old vectors but reattempt indexing on a prompt change: only actually changed prompt will need embedding + storage - let embedding_configs_updated = self.update_embedding_configs()?; + + let embedding_config_updates = self.update_embedding_configs()?; let mut new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn)?; new_inner_settings.recompute_facets(self.wtxn, self.index)?; @@ -1102,7 +1166,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { old_inner_settings, new_inner_settings, primary_key_id, - embedding_configs_updated, + embedding_config_updates, settings_update_only, ); @@ -1119,7 +1183,7 @@ pub struct InnerIndexSettingsDiff { pub(crate) new: InnerIndexSettings, pub(crate) primary_key_id: Option, // TODO: compare directly the embedders. - pub(crate) embedding_configs_updated: bool, + pub(crate) embedding_config_updates: BTreeMap, pub(crate) settings_update_only: bool, /// The set of only the additional searchable fields. /// If any other searchable field has been modified, is set to None. @@ -1140,7 +1204,7 @@ impl InnerIndexSettingsDiff { old_settings: InnerIndexSettings, new_settings: InnerIndexSettings, primary_key_id: Option, - embedding_configs_updated: bool, + embedding_config_updates: BTreeMap, settings_update_only: bool, ) -> Self { let only_additional_fields = match ( @@ -1177,7 +1241,7 @@ impl InnerIndexSettingsDiff { old: old_settings, new: new_settings, primary_key_id, - embedding_configs_updated, + embedding_config_updates, settings_update_only, only_additional_fields, cache_reindex_searchable_without_user_defined, @@ -1244,7 +1308,7 @@ impl InnerIndexSettingsDiff { } pub fn reindex_vectors(&self) -> bool { - self.embedding_configs_updated + !self.embedding_config_updates.is_empty() } pub fn settings_update_only(&self) -> bool {