user_provided => regenerate

This commit is contained in:
Louis Dureuil 2024-06-12 18:11:11 +02:00
parent a89eea233b
commit 3bc8f81abc
No known key found for this signature in database
6 changed files with 62 additions and 48 deletions

View File

@ -958,10 +958,10 @@ impl IndexScheduler {
.is_some_and(|conf| conf.user_provided.contains(id)); .is_some_and(|conf| conf.user_provided.contains(id));
let embeddings = ExplicitVectors { let embeddings = ExplicitVectors {
embeddings: VectorOrArrayOfVectors::from_array_of_vectors( embeddings: Some(
embeddings, VectorOrArrayOfVectors::from_array_of_vectors(embeddings),
), ),
user_provided, regenerate: !user_provided,
}; };
vectors.insert( vectors.insert(
embedder_name, embedder_name,

View File

@ -625,7 +625,10 @@ fn some_documents<'a, 't: 'a>(
.iter() .iter()
.find(|conf| conf.name == name) .find(|conf| conf.name == name)
.is_some_and(|conf| conf.user_provided.contains(key)); .is_some_and(|conf| conf.user_provided.contains(key));
let embeddings = ExplicitVectors { embeddings: vector.into(), user_provided }; let embeddings = ExplicitVectors {
embeddings: Some(vector.into()),
regenerate: !user_provided,
};
vectors.insert( vectors.insert(
name, name,
serde_json::to_value(embeddings).map_err(MeilisearchHttpError::from)?, serde_json::to_value(embeddings).map_err(MeilisearchHttpError::from)?,

View File

@ -1072,7 +1072,8 @@ fn make_hits(
.iter() .iter()
.find(|conf| conf.name == name) .find(|conf| conf.name == name)
.is_some_and(|conf| conf.user_provided.contains(id)); .is_some_and(|conf| conf.user_provided.contains(id));
let embeddings = ExplicitVectors { embeddings: vector.into(), user_provided }; let embeddings =
ExplicitVectors { embeddings: Some(vector.into()), regenerate: !user_provided };
vectors.insert(name, serde_json::to_value(embeddings)?); vectors.insert(name, serde_json::to_value(embeddings)?);
} }
document.insert("_vectors".into(), vectors.into()); document.insert("_vectors".into(), vectors.into());

View File

@ -260,28 +260,33 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
// 2. an existing embedder changed so that it must regenerate all generated embeddings. // 2. an existing embedder changed so that it must regenerate all generated embeddings.
// For a new embedder, there can be `_vectors.embedder` embeddings to add to the DB // For a new embedder, there can be `_vectors.embedder` embeddings to add to the DB
VectorState::Inline(vectors) => { VectorState::Inline(vectors) => {
if vectors.is_user_provided() { if !vectors.must_regenerate() {
add_to_user_provided.insert(docid); add_to_user_provided.insert(docid);
} }
let add_vectors = vectors.into_array_of_vectors();
if add_vectors.len() > usize::from(u8::MAX) { match vectors.into_array_of_vectors() {
return Err(crate::Error::UserError(crate::UserError::TooManyVectors( Some(add_vectors) => {
document_id().to_string(), if add_vectors.len() > usize::from(u8::MAX) {
add_vectors.len(), return Err(crate::Error::UserError(
))); crate::UserError::TooManyVectors(
document_id().to_string(),
add_vectors.len(),
),
));
}
VectorStateDelta::NowManual(add_vectors)
}
None => VectorStateDelta::NoChange,
} }
VectorStateDelta::NowManual(add_vectors)
} }
// this happens only when an existing embedder changed. We cannot regenerate userProvided vectors // this happens only when an existing embedder changed. We cannot regenerate userProvided vectors
VectorState::InDb => VectorStateDelta::NoChange, VectorState::Manual => VectorStateDelta::NoChange,
// generated vectors must be regenerated // generated vectors must be regenerated
VectorState::Generated => regenerate_prompt(obkv, prompt, new_fields_ids_map)?, VectorState::Generated => regenerate_prompt(obkv, prompt, new_fields_ids_map)?,
}, },
// prompt regeneration is only triggered for existing embedders // prompt regeneration is only triggered for existing embedders
ExtractionAction::SettingsRegeneratePrompts { old_prompt } => { ExtractionAction::SettingsRegeneratePrompts { old_prompt } => {
if !old.is_user_provided() { if old.must_regenerate() {
regenerate_if_prompt_changed( regenerate_if_prompt_changed(
obkv, obkv,
(old_prompt, prompt), (old_prompt, prompt),
@ -362,31 +367,32 @@ fn extract_vector_document_diff(
(old_fields_ids_map, new_fields_ids_map): (&FieldsIdsMap, &FieldsIdsMap), (old_fields_ids_map, new_fields_ids_map): (&FieldsIdsMap, &FieldsIdsMap),
document_id: impl Fn() -> Value, document_id: impl Fn() -> Value,
) -> Result<VectorStateDelta> { ) -> Result<VectorStateDelta> {
match (old.is_user_provided(), new.is_user_provided()) { match (old.must_regenerate(), new.must_regenerate()) {
(true, true) | (false, false) => {} (true, true) | (false, false) => {}
(true, false) => { (true, false) => {
remove_from_user_provided.insert(docid); add_to_user_provided.insert(docid);
} }
(false, true) => { (false, true) => {
add_to_user_provided.insert(docid); remove_from_user_provided.insert(docid);
} }
} }
let delta = match (old, new) { let delta = match (old, new) {
// regardless of the previous state, if a document now contains inline _vectors, they must // regardless of the previous state, if a document now contains inline _vectors, they must
// be extracted manually // be extracted manually
(_old, VectorState::Inline(new)) => { (_old, VectorState::Inline(new)) => match new.into_array_of_vectors() {
let add_vectors = new.into_array_of_vectors(); Some(add_vectors) => {
if add_vectors.len() > usize::from(u8::MAX) {
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
document_id().to_string(),
add_vectors.len(),
)));
}
if add_vectors.len() > usize::from(u8::MAX) { VectorStateDelta::NowManual(add_vectors)
return Err(crate::Error::UserError(crate::UserError::TooManyVectors(
document_id().to_string(),
add_vectors.len(),
)));
} }
None => VectorStateDelta::NoChange,
VectorStateDelta::NowManual(add_vectors) },
}
// no `_vectors` anywhere, we check for document removal and otherwise we regenerate the prompt if the // no `_vectors` anywhere, we check for document removal and otherwise we regenerate the prompt if the
// document changed // document changed
(VectorState::Generated, VectorState::Generated) => { (VectorState::Generated, VectorState::Generated) => {
@ -437,7 +443,7 @@ fn extract_vector_document_diff(
VectorStateDelta::NowRemoved VectorStateDelta::NowRemoved
} }
} }
(_old, VectorState::InDb) => { (_old, VectorState::Manual) => {
// Do we keep this document? // Do we keep this document?
let document_is_kept = obkv let document_is_kept = obkv
.iter() .iter()

View File

@ -1068,8 +1068,10 @@ impl<'a, 'i> Transform<'a, 'i> {
Some(Ok(( Some(Ok((
name.to_string(), name.to_string(),
serde_json::to_value(ExplicitVectors { serde_json::to_value(ExplicitVectors {
embeddings: VectorOrArrayOfVectors::from_array_of_vectors(vectors), embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors(
user_provided: true, vectors,
)),
regenerate: false,
}) })
.unwrap(), .unwrap(),
))) )))

View File

@ -18,18 +18,20 @@ pub enum Vectors {
} }
impl Vectors { impl Vectors {
pub fn is_user_provided(&self) -> bool { pub fn must_regenerate(&self) -> bool {
match self { match self {
Vectors::ImplicitlyUserProvided(_) => true, Vectors::ImplicitlyUserProvided(_) => false,
Vectors::Explicit(ExplicitVectors { user_provided, .. }) => *user_provided, Vectors::Explicit(ExplicitVectors { regenerate, .. }) => *regenerate,
} }
} }
pub fn into_array_of_vectors(self) -> Vec<Embedding> { pub fn into_array_of_vectors(self) -> Option<Vec<Embedding>> {
match self { match self {
Vectors::ImplicitlyUserProvided(embeddings) Vectors::ImplicitlyUserProvided(embeddings) => {
| Vectors::Explicit(ExplicitVectors { embeddings, user_provided: _ }) => { Some(embeddings.into_array_of_vectors().unwrap_or_default())
embeddings.into_array_of_vectors().unwrap_or_default() }
Vectors::Explicit(ExplicitVectors { embeddings, regenerate: _ }) => {
embeddings.map(|embeddings| embeddings.into_array_of_vectors().unwrap_or_default())
} }
} }
} }
@ -38,22 +40,22 @@ impl Vectors {
#[derive(serde::Serialize, serde::Deserialize, Debug)] #[derive(serde::Serialize, serde::Deserialize, Debug)]
#[serde(rename_all = "camelCase")] #[serde(rename_all = "camelCase")]
pub struct ExplicitVectors { pub struct ExplicitVectors {
pub embeddings: VectorOrArrayOfVectors, pub embeddings: Option<VectorOrArrayOfVectors>,
pub user_provided: bool, pub regenerate: bool,
} }
pub enum VectorState { pub enum VectorState {
Inline(Vectors), Inline(Vectors),
InDb, Manual,
Generated, Generated,
} }
impl VectorState { impl VectorState {
pub fn is_user_provided(&self) -> bool { pub fn must_regenerate(&self) -> bool {
match self { match self {
VectorState::Inline(vectors) => vectors.is_user_provided(), VectorState::Inline(vectors) => vectors.must_regenerate(),
VectorState::InDb => true, VectorState::Manual => false,
VectorState::Generated => false, VectorState::Generated => true,
} }
} }
} }
@ -96,7 +98,7 @@ impl ParsedVectorsDiff {
.flatten().map_or(BTreeMap::default(), |del| del.into_iter().map(|(name, vec)| (name, VectorState::Inline(vec))).collect()); .flatten().map_or(BTreeMap::default(), |del| del.into_iter().map(|(name, vec)| (name, VectorState::Inline(vec))).collect());
for embedding_config in embedders_configs { for embedding_config in embedders_configs {
if embedding_config.user_provided.contains(docid) { if embedding_config.user_provided.contains(docid) {
old.entry(embedding_config.name.to_string()).or_insert(VectorState::InDb); old.entry(embedding_config.name.to_string()).or_insert(VectorState::Manual);
} }
} }
@ -121,7 +123,7 @@ impl ParsedVectorsDiff {
let old = self.old.remove(embedder_name).unwrap_or(VectorState::Generated); let old = self.old.remove(embedder_name).unwrap_or(VectorState::Generated);
let state_from_old = match old { let state_from_old = match old {
// assume a userProvided is still userProvided // assume a userProvided is still userProvided
VectorState::InDb => VectorState::InDb, VectorState::Manual => VectorState::Manual,
// generated is still generated // generated is still generated
VectorState::Generated => VectorState::Generated, VectorState::Generated => VectorState::Generated,
// weird case that shouldn't happen where the previous docs version is inline, // weird case that shouldn't happen where the previous docs version is inline,