From 3bc8f81abc3f8d57060c1571d6801e50f43ce33f Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 12 Jun 2024 18:11:11 +0200 Subject: [PATCH] user_provided => regenerate --- index-scheduler/src/batch.rs | 6 +- meilisearch/src/routes/indexes/documents.rs | 5 +- meilisearch/src/search.rs | 3 +- .../extract/extract_vector_points.rs | 56 ++++++++++--------- milli/src/update/index_documents/transform.rs | 6 +- milli/src/vector/parsed_vectors.rs | 34 +++++------ 6 files changed, 62 insertions(+), 48 deletions(-) diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index 30ff54a62..cd5525eea 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -958,10 +958,10 @@ impl IndexScheduler { .is_some_and(|conf| conf.user_provided.contains(id)); let embeddings = ExplicitVectors { - embeddings: VectorOrArrayOfVectors::from_array_of_vectors( - embeddings, + embeddings: Some( + VectorOrArrayOfVectors::from_array_of_vectors(embeddings), ), - user_provided, + regenerate: !user_provided, }; vectors.insert( embedder_name, diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index 70623bb35..bfbe20207 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -625,7 +625,10 @@ fn some_documents<'a, 't: 'a>( .iter() .find(|conf| conf.name == name) .is_some_and(|conf| conf.user_provided.contains(key)); - let embeddings = ExplicitVectors { embeddings: vector.into(), user_provided }; + let embeddings = ExplicitVectors { + embeddings: Some(vector.into()), + regenerate: !user_provided, + }; vectors.insert( name, serde_json::to_value(embeddings).map_err(MeilisearchHttpError::from)?, diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index ce712f17f..60f684ede 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -1072,7 +1072,8 @@ fn make_hits( .iter() .find(|conf| conf.name == name) .is_some_and(|conf| conf.user_provided.contains(id)); - let embeddings = ExplicitVectors { embeddings: vector.into(), user_provided }; + let embeddings = + ExplicitVectors { embeddings: Some(vector.into()), regenerate: !user_provided }; vectors.insert(name, serde_json::to_value(embeddings)?); } document.insert("_vectors".into(), vectors.into()); diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index fdf8649f4..0a27a28bd 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -260,28 +260,33 @@ pub fn extract_vector_points( // 2. an existing embedder changed so that it must regenerate all generated embeddings. // For a new embedder, there can be `_vectors.embedder` embeddings to add to the DB VectorState::Inline(vectors) => { - if vectors.is_user_provided() { + if !vectors.must_regenerate() { add_to_user_provided.insert(docid); } - let add_vectors = vectors.into_array_of_vectors(); - if add_vectors.len() > usize::from(u8::MAX) { - return Err(crate::Error::UserError(crate::UserError::TooManyVectors( - document_id().to_string(), - add_vectors.len(), - ))); + match vectors.into_array_of_vectors() { + Some(add_vectors) => { + if add_vectors.len() > usize::from(u8::MAX) { + return Err(crate::Error::UserError( + crate::UserError::TooManyVectors( + document_id().to_string(), + add_vectors.len(), + ), + )); + } + VectorStateDelta::NowManual(add_vectors) + } + None => VectorStateDelta::NoChange, } - - VectorStateDelta::NowManual(add_vectors) } // this happens only when an existing embedder changed. We cannot regenerate userProvided vectors - VectorState::InDb => VectorStateDelta::NoChange, + VectorState::Manual => VectorStateDelta::NoChange, // generated vectors must be regenerated VectorState::Generated => regenerate_prompt(obkv, prompt, new_fields_ids_map)?, }, // prompt regeneration is only triggered for existing embedders ExtractionAction::SettingsRegeneratePrompts { old_prompt } => { - if !old.is_user_provided() { + if old.must_regenerate() { regenerate_if_prompt_changed( obkv, (old_prompt, prompt), @@ -362,31 +367,32 @@ fn extract_vector_document_diff( (old_fields_ids_map, new_fields_ids_map): (&FieldsIdsMap, &FieldsIdsMap), document_id: impl Fn() -> Value, ) -> Result { - match (old.is_user_provided(), new.is_user_provided()) { + match (old.must_regenerate(), new.must_regenerate()) { (true, true) | (false, false) => {} (true, false) => { - remove_from_user_provided.insert(docid); + add_to_user_provided.insert(docid); } (false, true) => { - add_to_user_provided.insert(docid); + remove_from_user_provided.insert(docid); } } let delta = match (old, new) { // regardless of the previous state, if a document now contains inline _vectors, they must // be extracted manually - (_old, VectorState::Inline(new)) => { - let add_vectors = new.into_array_of_vectors(); + (_old, VectorState::Inline(new)) => match new.into_array_of_vectors() { + Some(add_vectors) => { + if add_vectors.len() > usize::from(u8::MAX) { + return Err(crate::Error::UserError(crate::UserError::TooManyVectors( + document_id().to_string(), + add_vectors.len(), + ))); + } - if add_vectors.len() > usize::from(u8::MAX) { - return Err(crate::Error::UserError(crate::UserError::TooManyVectors( - document_id().to_string(), - add_vectors.len(), - ))); + VectorStateDelta::NowManual(add_vectors) } - - VectorStateDelta::NowManual(add_vectors) - } + None => VectorStateDelta::NoChange, + }, // no `_vectors` anywhere, we check for document removal and otherwise we regenerate the prompt if the // document changed (VectorState::Generated, VectorState::Generated) => { @@ -437,7 +443,7 @@ fn extract_vector_document_diff( VectorStateDelta::NowRemoved } } - (_old, VectorState::InDb) => { + (_old, VectorState::Manual) => { // Do we keep this document? let document_is_kept = obkv .iter() diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index b2fe04a4c..467a2810a 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -1068,8 +1068,10 @@ impl<'a, 'i> Transform<'a, 'i> { Some(Ok(( name.to_string(), serde_json::to_value(ExplicitVectors { - embeddings: VectorOrArrayOfVectors::from_array_of_vectors(vectors), - user_provided: true, + embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors( + vectors, + )), + regenerate: false, }) .unwrap(), ))) diff --git a/milli/src/vector/parsed_vectors.rs b/milli/src/vector/parsed_vectors.rs index 9007e03e4..92d6cb382 100644 --- a/milli/src/vector/parsed_vectors.rs +++ b/milli/src/vector/parsed_vectors.rs @@ -18,18 +18,20 @@ pub enum Vectors { } impl Vectors { - pub fn is_user_provided(&self) -> bool { + pub fn must_regenerate(&self) -> bool { match self { - Vectors::ImplicitlyUserProvided(_) => true, - Vectors::Explicit(ExplicitVectors { user_provided, .. }) => *user_provided, + Vectors::ImplicitlyUserProvided(_) => false, + Vectors::Explicit(ExplicitVectors { regenerate, .. }) => *regenerate, } } - pub fn into_array_of_vectors(self) -> Vec { + pub fn into_array_of_vectors(self) -> Option> { match self { - Vectors::ImplicitlyUserProvided(embeddings) - | Vectors::Explicit(ExplicitVectors { embeddings, user_provided: _ }) => { - embeddings.into_array_of_vectors().unwrap_or_default() + Vectors::ImplicitlyUserProvided(embeddings) => { + Some(embeddings.into_array_of_vectors().unwrap_or_default()) + } + Vectors::Explicit(ExplicitVectors { embeddings, regenerate: _ }) => { + embeddings.map(|embeddings| embeddings.into_array_of_vectors().unwrap_or_default()) } } } @@ -38,22 +40,22 @@ impl Vectors { #[derive(serde::Serialize, serde::Deserialize, Debug)] #[serde(rename_all = "camelCase")] pub struct ExplicitVectors { - pub embeddings: VectorOrArrayOfVectors, - pub user_provided: bool, + pub embeddings: Option, + pub regenerate: bool, } pub enum VectorState { Inline(Vectors), - InDb, + Manual, Generated, } impl VectorState { - pub fn is_user_provided(&self) -> bool { + pub fn must_regenerate(&self) -> bool { match self { - VectorState::Inline(vectors) => vectors.is_user_provided(), - VectorState::InDb => true, - VectorState::Generated => false, + VectorState::Inline(vectors) => vectors.must_regenerate(), + VectorState::Manual => false, + VectorState::Generated => true, } } } @@ -96,7 +98,7 @@ impl ParsedVectorsDiff { .flatten().map_or(BTreeMap::default(), |del| del.into_iter().map(|(name, vec)| (name, VectorState::Inline(vec))).collect()); for embedding_config in embedders_configs { if embedding_config.user_provided.contains(docid) { - old.entry(embedding_config.name.to_string()).or_insert(VectorState::InDb); + old.entry(embedding_config.name.to_string()).or_insert(VectorState::Manual); } } @@ -121,7 +123,7 @@ impl ParsedVectorsDiff { let old = self.old.remove(embedder_name).unwrap_or(VectorState::Generated); let state_from_old = match old { // assume a userProvided is still userProvided - VectorState::InDb => VectorState::InDb, + VectorState::Manual => VectorState::Manual, // generated is still generated VectorState::Generated => VectorState::Generated, // weird case that shouldn't happen were the previous docs version is inline,