mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-25 19:45:05 +08:00
Update embedder configs with a finer granularity
- no longer clear vector DB between any two embedder changes
This commit is contained in:
parent
d0b05ae691
commit
d18c1f77d7
@ -23,7 +23,10 @@ use crate::proximity::ProximityPrecision;
|
|||||||
use crate::update::index_documents::IndexDocumentsMethod;
|
use crate::update::index_documents::IndexDocumentsMethod;
|
||||||
use crate::update::{IndexDocuments, UpdateIndexingStep};
|
use crate::update::{IndexDocuments, UpdateIndexingStep};
|
||||||
use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME;
|
use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME;
|
||||||
use crate::vector::settings::{check_set, check_unset, EmbedderSource, EmbeddingSettings};
|
use crate::vector::settings::{
|
||||||
|
check_set, check_unset, EmbedderAction, EmbedderSource, EmbeddingSettings, ReindexAction,
|
||||||
|
WriteBackToDocuments,
|
||||||
|
};
|
||||||
use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs};
|
use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs};
|
||||||
use crate::{FieldId, FieldsIdsMap, Index, Result};
|
use crate::{FieldId, FieldsIdsMap, Index, Result};
|
||||||
|
|
||||||
@ -924,111 +927,177 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
|||||||
Ok(changed)
|
Ok(changed)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn update_embedding_configs(&mut self) -> Result<bool> {
|
fn update_embedding_configs(&mut self) -> Result<BTreeMap<String, EmbedderAction>> {
|
||||||
let update = match std::mem::take(&mut self.embedder_settings) {
|
match std::mem::take(&mut self.embedder_settings) {
|
||||||
Setting::Set(configs) => {
|
Setting::Set(configs) => self.update_embedding_configs_set(configs),
|
||||||
let mut changed = false;
|
Setting::Reset => {
|
||||||
|
// all vectors should be written back to documents
|
||||||
let old_configs = self.index.embedding_configs(self.wtxn)?;
|
let old_configs = self.index.embedding_configs(self.wtxn)?;
|
||||||
let old_configs: BTreeMap<String, (Setting<EmbeddingSettings>, RoaringBitmap)> =
|
let remove_all: Result<BTreeMap<String, EmbedderAction>> = old_configs
|
||||||
old_configs
|
|
||||||
.into_iter()
|
|
||||||
.map(
|
|
||||||
|IndexEmbeddingConfig { name, config, user_provided: user_defined }| {
|
|
||||||
(name, (Setting::Set(config.into()), user_defined))
|
|
||||||
},
|
|
||||||
)
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
let mut new_configs = BTreeMap::new();
|
|
||||||
for joined in old_configs
|
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.merge_join_by(configs.into_iter(), |(left, _), (right, _)| left.cmp(right))
|
.map(|IndexEmbeddingConfig { name, config: _, user_provided }| -> Result<_> {
|
||||||
{
|
let embedder_id =
|
||||||
match joined {
|
self.index.embedder_category_id.get(self.wtxn, &name)?.ok_or(
|
||||||
// updated config
|
crate::InternalError::DatabaseMissingEntry {
|
||||||
EitherOrBoth::Both((name, (mut old, user_provided)), (_, new)) => {
|
db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID,
|
||||||
changed |= EmbeddingSettings::apply_and_need_reindex(&mut old, new);
|
key: None,
|
||||||
if changed {
|
},
|
||||||
tracing::debug!(
|
)?;
|
||||||
embedder = name,
|
Ok((
|
||||||
user_provided = user_provided.len(),
|
|
||||||
"need reindex"
|
|
||||||
);
|
|
||||||
} else {
|
|
||||||
tracing::debug!(embedder = name, "skip reindex");
|
|
||||||
}
|
|
||||||
let new = validate_embedding_settings(old, &name)?;
|
|
||||||
new_configs.insert(name, (new, user_provided));
|
|
||||||
}
|
|
||||||
// unchanged config
|
|
||||||
EitherOrBoth::Left((name, setting)) => {
|
|
||||||
new_configs.insert(name, setting);
|
|
||||||
}
|
|
||||||
// new config
|
|
||||||
EitherOrBoth::Right((name, mut setting)) => {
|
|
||||||
// apply the default source in case the source was not set so that it gets validated
|
|
||||||
crate::vector::settings::EmbeddingSettings::apply_default_source(
|
|
||||||
&mut setting,
|
|
||||||
);
|
|
||||||
crate::vector::settings::EmbeddingSettings::apply_default_openai_model(
|
|
||||||
&mut setting,
|
|
||||||
);
|
|
||||||
let setting = validate_embedding_settings(setting, &name)?;
|
|
||||||
changed = true;
|
|
||||||
new_configs.insert(name, (setting, RoaringBitmap::new()));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
let new_configs: Vec<IndexEmbeddingConfig> = new_configs
|
|
||||||
.into_iter()
|
|
||||||
.filter_map(|(name, (config, user_provided))| match config {
|
|
||||||
Setting::Set(config) => Some(IndexEmbeddingConfig {
|
|
||||||
name,
|
name,
|
||||||
config: config.into(),
|
EmbedderAction::WriteBackToDocuments(WriteBackToDocuments {
|
||||||
user_provided,
|
embedder_id,
|
||||||
}),
|
user_provided,
|
||||||
Setting::Reset => None,
|
}),
|
||||||
Setting::NotSet => Some(IndexEmbeddingConfig {
|
))
|
||||||
name,
|
|
||||||
config: EmbeddingSettings::default().into(),
|
|
||||||
user_provided,
|
|
||||||
}),
|
|
||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
|
let remove_all = remove_all?;
|
||||||
|
|
||||||
self.index.embedder_category_id.clear(self.wtxn)?;
|
self.index.embedder_category_id.clear(self.wtxn)?;
|
||||||
for (index, index_embedding_config) in new_configs.iter().enumerate() {
|
|
||||||
self.index.embedder_category_id.put_with_flags(
|
|
||||||
self.wtxn,
|
|
||||||
heed::PutFlags::APPEND,
|
|
||||||
&index_embedding_config.name,
|
|
||||||
&index
|
|
||||||
.try_into()
|
|
||||||
.map_err(|_| UserError::TooManyEmbedders(new_configs.len()))?,
|
|
||||||
)?;
|
|
||||||
}
|
|
||||||
|
|
||||||
if new_configs.is_empty() {
|
|
||||||
self.index.delete_embedding_configs(self.wtxn)?;
|
|
||||||
} else {
|
|
||||||
self.index.put_embedding_configs(self.wtxn, new_configs)?;
|
|
||||||
}
|
|
||||||
changed
|
|
||||||
}
|
|
||||||
Setting::Reset => {
|
|
||||||
self.index.delete_embedding_configs(self.wtxn)?;
|
self.index.delete_embedding_configs(self.wtxn)?;
|
||||||
true
|
Ok(remove_all)
|
||||||
}
|
}
|
||||||
Setting::NotSet => false,
|
Setting::NotSet => Ok(Default::default()),
|
||||||
};
|
|
||||||
|
|
||||||
// if any changes force a reindexing
|
|
||||||
// clear the vector database.
|
|
||||||
if update {
|
|
||||||
self.index.vector_arroy.clear(self.wtxn)?;
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
Ok(update)
|
fn update_embedding_configs_set(
|
||||||
|
&mut self,
|
||||||
|
configs: BTreeMap<String, Setting<EmbeddingSettings>>,
|
||||||
|
) -> Result<BTreeMap<String, EmbedderAction>> {
|
||||||
|
use crate::vector::settings::SettingsDiff;
|
||||||
|
|
||||||
|
let old_configs = self.index.embedding_configs(self.wtxn)?;
|
||||||
|
let old_configs: BTreeMap<String, (EmbeddingSettings, RoaringBitmap)> = old_configs
|
||||||
|
.into_iter()
|
||||||
|
.map(|IndexEmbeddingConfig { name, config, user_provided }| {
|
||||||
|
(name, (config.into(), user_provided))
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
let mut updated_configs = BTreeMap::new();
|
||||||
|
let mut embedder_actions = BTreeMap::new();
|
||||||
|
for joined in old_configs
|
||||||
|
.into_iter()
|
||||||
|
.merge_join_by(configs.into_iter(), |(left, _), (right, _)| left.cmp(right))
|
||||||
|
{
|
||||||
|
match joined {
|
||||||
|
// updated config
|
||||||
|
EitherOrBoth::Both((name, (old, user_provided)), (_, new)) => {
|
||||||
|
let settings_diff = SettingsDiff::from_settings(old, new);
|
||||||
|
match settings_diff {
|
||||||
|
SettingsDiff::Remove => {
|
||||||
|
tracing::debug!(
|
||||||
|
embedder = name,
|
||||||
|
user_provided = user_provided.len(),
|
||||||
|
"removing embedder"
|
||||||
|
);
|
||||||
|
let embedder_id =
|
||||||
|
self.index.embedder_category_id.get(self.wtxn, &name)?.ok_or(
|
||||||
|
crate::InternalError::DatabaseMissingEntry {
|
||||||
|
db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID,
|
||||||
|
key: None,
|
||||||
|
},
|
||||||
|
)?;
|
||||||
|
// free id immediately
|
||||||
|
self.index.embedder_category_id.delete(self.wtxn, &name)?;
|
||||||
|
embedder_actions.insert(
|
||||||
|
name,
|
||||||
|
EmbedderAction::WriteBackToDocuments(WriteBackToDocuments {
|
||||||
|
embedder_id,
|
||||||
|
user_provided,
|
||||||
|
}),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
SettingsDiff::Reindex { action, updated_settings } => {
|
||||||
|
tracing::debug!(
|
||||||
|
embedder = name,
|
||||||
|
user_provided = user_provided.len(),
|
||||||
|
?action,
|
||||||
|
"reindex embedder"
|
||||||
|
);
|
||||||
|
embedder_actions.insert(name.clone(), EmbedderAction::Reindex(action));
|
||||||
|
let new =
|
||||||
|
validate_embedding_settings(Setting::Set(updated_settings), &name)?;
|
||||||
|
updated_configs.insert(name, (new, user_provided));
|
||||||
|
}
|
||||||
|
SettingsDiff::UpdateWithoutReindex { updated_settings } => {
|
||||||
|
tracing::debug!(
|
||||||
|
embedder = name,
|
||||||
|
user_provided = user_provided.len(),
|
||||||
|
"update without reindex embedder"
|
||||||
|
);
|
||||||
|
let new =
|
||||||
|
validate_embedding_settings(Setting::Set(updated_settings), &name)?;
|
||||||
|
updated_configs.insert(name, (new, user_provided));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// unchanged config
|
||||||
|
EitherOrBoth::Left((name, (setting, user_provided))) => {
|
||||||
|
tracing::debug!(embedder = name, "unchanged embedder");
|
||||||
|
updated_configs.insert(name, (Setting::Set(setting), user_provided));
|
||||||
|
}
|
||||||
|
// new config
|
||||||
|
EitherOrBoth::Right((name, mut setting)) => {
|
||||||
|
tracing::debug!(embedder = name, "new embedder");
|
||||||
|
// apply the default source in case the source was not set so that it gets validated
|
||||||
|
crate::vector::settings::EmbeddingSettings::apply_default_source(&mut setting);
|
||||||
|
crate::vector::settings::EmbeddingSettings::apply_default_openai_model(
|
||||||
|
&mut setting,
|
||||||
|
);
|
||||||
|
let setting = validate_embedding_settings(setting, &name)?;
|
||||||
|
embedder_actions
|
||||||
|
.insert(name.clone(), EmbedderAction::Reindex(ReindexAction::FullReindex));
|
||||||
|
updated_configs.insert(name, (setting, RoaringBitmap::new()));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let mut free_indices: [bool; u8::MAX as usize] = [true; u8::MAX as usize];
|
||||||
|
for res in self.index.embedder_category_id.iter(self.wtxn)? {
|
||||||
|
let (_name, id) = res?;
|
||||||
|
free_indices[id as usize] = false;
|
||||||
|
}
|
||||||
|
let mut free_indices = free_indices.iter_mut().enumerate();
|
||||||
|
let mut find_free_index =
|
||||||
|
move || free_indices.find(|(_, free)| **free).map(|(index, _)| index as u8);
|
||||||
|
for (name, action) in embedder_actions.iter() {
|
||||||
|
match action {
|
||||||
|
EmbedderAction::Reindex(ReindexAction::RegeneratePrompts) => {
|
||||||
|
/* cannot be a new embedder, so has to have an id already */
|
||||||
|
}
|
||||||
|
EmbedderAction::Reindex(ReindexAction::FullReindex) => {
|
||||||
|
if self.index.embedder_category_id.get(self.wtxn, name)?.is_none() {
|
||||||
|
let id = find_free_index()
|
||||||
|
.ok_or(UserError::TooManyEmbedders(updated_configs.len()))?;
|
||||||
|
tracing::debug!(embedder = name, id, "assigning free id to new embedder");
|
||||||
|
self.index.embedder_category_id.put(self.wtxn, name, &id)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
EmbedderAction::WriteBackToDocuments(_) => { /* already removed */ }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let updated_configs: Vec<IndexEmbeddingConfig> = updated_configs
|
||||||
|
.into_iter()
|
||||||
|
.filter_map(|(name, (config, user_provided))| match config {
|
||||||
|
Setting::Set(config) => {
|
||||||
|
Some(IndexEmbeddingConfig { name, config: config.into(), user_provided })
|
||||||
|
}
|
||||||
|
Setting::Reset => None,
|
||||||
|
Setting::NotSet => Some(IndexEmbeddingConfig {
|
||||||
|
name,
|
||||||
|
config: EmbeddingSettings::default().into(),
|
||||||
|
user_provided,
|
||||||
|
}),
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
if updated_configs.is_empty() {
|
||||||
|
self.index.delete_embedding_configs(self.wtxn)?;
|
||||||
|
} else {
|
||||||
|
self.index.put_embedding_configs(self.wtxn, updated_configs)?;
|
||||||
|
}
|
||||||
|
Ok(embedder_actions)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn update_search_cutoff(&mut self) -> Result<bool> {
|
fn update_search_cutoff(&mut self) -> Result<bool> {
|
||||||
@ -1082,13 +1151,8 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
|||||||
self.update_searchable()?;
|
self.update_searchable()?;
|
||||||
self.update_exact_attributes()?;
|
self.update_exact_attributes()?;
|
||||||
self.update_proximity_precision()?;
|
self.update_proximity_precision()?;
|
||||||
// TODO: very rough approximation of the needs for reindexing where any change will result in
|
|
||||||
// a full reindexing.
|
let embedding_config_updates = self.update_embedding_configs()?;
|
||||||
// What can be done instead:
|
|
||||||
// 1. Only change the distance on a distance change
|
|
||||||
// 2. Only change the name -> embedder mapping on a name change
|
|
||||||
// 3. Keep the old vectors but reattempt indexing on a prompt change: only actually changed prompt will need embedding + storage
|
|
||||||
let embedding_configs_updated = self.update_embedding_configs()?;
|
|
||||||
|
|
||||||
let mut new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn)?;
|
let mut new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn)?;
|
||||||
new_inner_settings.recompute_facets(self.wtxn, self.index)?;
|
new_inner_settings.recompute_facets(self.wtxn, self.index)?;
|
||||||
@ -1102,7 +1166,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
|||||||
old_inner_settings,
|
old_inner_settings,
|
||||||
new_inner_settings,
|
new_inner_settings,
|
||||||
primary_key_id,
|
primary_key_id,
|
||||||
embedding_configs_updated,
|
embedding_config_updates,
|
||||||
settings_update_only,
|
settings_update_only,
|
||||||
);
|
);
|
||||||
|
|
||||||
@ -1119,7 +1183,7 @@ pub struct InnerIndexSettingsDiff {
|
|||||||
pub(crate) new: InnerIndexSettings,
|
pub(crate) new: InnerIndexSettings,
|
||||||
pub(crate) primary_key_id: Option<FieldId>,
|
pub(crate) primary_key_id: Option<FieldId>,
|
||||||
// TODO: compare directly the embedders.
|
// TODO: compare directly the embedders.
|
||||||
pub(crate) embedding_configs_updated: bool,
|
pub(crate) embedding_config_updates: BTreeMap<String, EmbedderAction>,
|
||||||
pub(crate) settings_update_only: bool,
|
pub(crate) settings_update_only: bool,
|
||||||
/// The set of only the additional searchable fields.
|
/// The set of only the additional searchable fields.
|
||||||
/// If any other searchable field has been modified, is set to None.
|
/// If any other searchable field has been modified, is set to None.
|
||||||
@ -1140,7 +1204,7 @@ impl InnerIndexSettingsDiff {
|
|||||||
old_settings: InnerIndexSettings,
|
old_settings: InnerIndexSettings,
|
||||||
new_settings: InnerIndexSettings,
|
new_settings: InnerIndexSettings,
|
||||||
primary_key_id: Option<FieldId>,
|
primary_key_id: Option<FieldId>,
|
||||||
embedding_configs_updated: bool,
|
embedding_config_updates: BTreeMap<String, EmbedderAction>,
|
||||||
settings_update_only: bool,
|
settings_update_only: bool,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
let only_additional_fields = match (
|
let only_additional_fields = match (
|
||||||
@ -1177,7 +1241,7 @@ impl InnerIndexSettingsDiff {
|
|||||||
old: old_settings,
|
old: old_settings,
|
||||||
new: new_settings,
|
new: new_settings,
|
||||||
primary_key_id,
|
primary_key_id,
|
||||||
embedding_configs_updated,
|
embedding_config_updates,
|
||||||
settings_update_only,
|
settings_update_only,
|
||||||
only_additional_fields,
|
only_additional_fields,
|
||||||
cache_reindex_searchable_without_user_defined,
|
cache_reindex_searchable_without_user_defined,
|
||||||
@ -1244,7 +1308,7 @@ impl InnerIndexSettingsDiff {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn reindex_vectors(&self) -> bool {
|
pub fn reindex_vectors(&self) -> bool {
|
||||||
self.embedding_configs_updated
|
!self.embedding_config_updates.is_empty()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn settings_update_only(&self) -> bool {
|
pub fn settings_update_only(&self) -> bool {
|
||||||
|
Loading…
Reference in New Issue
Block a user