From 1daaed163a90b4550f6e6f4abc1d4e544a9f0630 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 27 Jun 2024 11:01:52 +0200 Subject: [PATCH] Make _vectors.:embedding.regenerate mandatory + tests + error messages --- meilisearch-types/src/error.rs | 3 +- meilisearch/tests/vector/mod.rs | 205 +++++++++++++++++++++++++++++ milli/src/error.rs | 2 + milli/src/vector/parsed_vectors.rs | 142 +++++++++++++++++--- 4 files changed, 336 insertions(+), 16 deletions(-) diff --git a/meilisearch-types/src/error.rs b/meilisearch-types/src/error.rs index 086396d7d..f529238e4 100644 --- a/meilisearch-types/src/error.rs +++ b/meilisearch-types/src/error.rs @@ -398,7 +398,8 @@ impl ErrorCode for milli::Error { UserError::CriterionError(_) => Code::InvalidSettingsRankingRules, UserError::InvalidGeoField { .. } => Code::InvalidDocumentGeoField, UserError::InvalidVectorDimensions { .. } => Code::InvalidVectorDimensions, - UserError::InvalidVectorsMapType { .. } => Code::InvalidVectorsType, + UserError::InvalidVectorsMapType { .. } + | UserError::InvalidVectorsEmbedderConf { .. } => Code::InvalidVectorsType, UserError::TooManyVectors(_, _) => Code::TooManyVectors, UserError::SortError(_) => Code::InvalidSearchSort, UserError::InvalidMinTypoWordLenSetting(_, _) => { diff --git a/meilisearch/tests/vector/mod.rs b/meilisearch/tests/vector/mod.rs index 4172ef444..dcefe2460 100644 --- a/meilisearch/tests/vector/mod.rs +++ b/meilisearch/tests/vector/mod.rs @@ -190,6 +190,211 @@ async fn generate_default_user_provided_documents(server: &Server) -> Index { index } +#[actix_rt::test] +async fn user_provided_embeddings_error() { + let server = Server::new().await; + let index = generate_default_user_provided_documents(&server).await; + + // First case, we forget to specify the `regenerate` + let documents = + json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [0, 0, 0] }}}); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": 2, + "indexUid": "doggo", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Bad embedder configuration in the document with id: `\"0\"`. Missing field `regenerate` inside `.manual`", + "code": "invalid_vectors_type", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + // Second case, we don't specify anything + let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": {}}}); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": 3, + "indexUid": "doggo", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Bad embedder configuration in the document with id: `\"0\"`. Missing field `regenerate` inside `.manual`", + "code": "invalid_vectors_type", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + // Third case, we specify something wrong in place of regenerate + let documents = + json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "regenerate": "yes please" }}}); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": 4, + "indexUid": "doggo", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.regenerate`: expected a boolean, but found a string: `\"yes please\"`", + "code": "invalid_vectors_type", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let documents = + json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": true }}}); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": 5, + "indexUid": "doggo", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings`: expected null or an array, but found a boolean: `true`", + "code": "invalid_vectors_type", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let documents = + json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [true] }}}); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": 6, + "indexUid": "doggo", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[0]`: expected a number or an array, but found a boolean: `true`", + "code": "invalid_vectors_type", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let documents = + json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [[true]] }}}); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": 7, + "indexUid": "doggo", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Bad embedder configuration in the document with id: `\"0\"`. Invalid value type at `.manual.embeddings[0][0]`: expected a number, but found a boolean: `true`", + "code": "invalid_vectors_type", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vectors_type" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let documents = json!({"id": 0, "name": "kefir", "_vectors": { "manual": { "embeddings": [23, 0.1, -12], "regenerate": true }}}); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r###" + { + "uid": 8, + "indexUid": "doggo", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); +} + #[actix_rt::test] async fn clear_documents() { let server = Server::new().await; diff --git a/milli/src/error.rs b/milli/src/error.rs index 8210d92e0..8e03fde4e 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -119,6 +119,8 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco InvalidVectorDimensions { expected: usize, found: usize }, #[error("The `_vectors` field in the document with id: `{document_id}` is not an object. Was expecting an object with a key for each embedder with manually provided vectors, but instead got `{value}`")] InvalidVectorsMapType { document_id: String, value: Value }, + #[error("Bad embedder configuration in the document with id: `{document_id}`. {error}")] + InvalidVectorsEmbedderConf { document_id: String, error: deserr::errors::JsonError }, #[error("{0}")] InvalidFilter(String), #[error("Invalid type for filter subexpression: expected: {}, found: {1}.", .0.join(", "))] diff --git a/milli/src/vector/parsed_vectors.rs b/milli/src/vector/parsed_vectors.rs index 92d6cb382..f934953fd 100644 --- a/milli/src/vector/parsed_vectors.rs +++ b/milli/src/vector/parsed_vectors.rs @@ -1,5 +1,6 @@ use std::collections::{BTreeMap, BTreeSet}; +use deserr::{take_cf_content, DeserializeError, Deserr, Sequence}; use obkv::KvReader; use serde_json::{from_slice, Value}; @@ -10,13 +11,44 @@ use crate::{DocumentId, FieldId, InternalError, UserError}; pub const RESERVED_VECTORS_FIELD_NAME: &str = "_vectors"; -#[derive(serde::Serialize, serde::Deserialize, Debug)] +#[derive(serde::Serialize, Debug)] #[serde(untagged)] pub enum Vectors { ImplicitlyUserProvided(VectorOrArrayOfVectors), Explicit(ExplicitVectors), } +impl Deserr for Vectors { + fn deserialize_from_value( + value: deserr::Value, + location: deserr::ValuePointerRef, + ) -> Result { + match value { + deserr::Value::Sequence(_) | deserr::Value::Null => { + Ok(Vectors::ImplicitlyUserProvided(VectorOrArrayOfVectors::deserialize_from_value( + value, location, + )?)) + } + deserr::Value::Map(_) => { + Ok(Vectors::Explicit(ExplicitVectors::deserialize_from_value(value, location)?)) + } + + value => Err(take_cf_content(E::error( + None, + deserr::ErrorKind::IncorrectValueKind { + actual: value, + accepted: &[ + deserr::ValueKind::Sequence, + deserr::ValueKind::Map, + deserr::ValueKind::Null, + ], + }, + location, + ))), + } + } +} + impl Vectors { pub fn must_regenerate(&self) -> bool { match self { @@ -37,9 +69,11 @@ impl Vectors { } } -#[derive(serde::Serialize, serde::Deserialize, Debug)] +#[derive(serde::Serialize, Deserr, Debug)] #[serde(rename_all = "camelCase")] pub struct ExplicitVectors { + #[serde(default)] + #[deserr(default)] pub embeddings: Option, pub regenerate: bool, } @@ -149,13 +183,20 @@ impl ParsedVectorsDiff { pub struct ParsedVectors(pub BTreeMap); +impl Deserr for ParsedVectors { + fn deserialize_from_value( + value: deserr::Value, + location: deserr::ValuePointerRef, + ) -> Result { + let value = >::deserialize_from_value(value, location)?; + Ok(ParsedVectors(value)) + } +} + impl ParsedVectors { pub fn from_bytes(value: &[u8]) -> Result { - let Ok(value) = from_slice(value) else { - let value = from_slice(value).map_err(Error::InternalSerdeJson)?; - return Err(Error::InvalidMap(value)); - }; - Ok(ParsedVectors(value)) + let value: serde_json::Value = from_slice(value).map_err(Error::InternalSerdeJson)?; + deserr::deserialize(value).map_err(|error| Error::InvalidEmbedderConf { error }) } pub fn retain_not_embedded_vectors(&mut self, embedders: &BTreeSet) { @@ -165,6 +206,7 @@ impl ParsedVectors { pub enum Error { InvalidMap(Value), + InvalidEmbedderConf { error: deserr::errors::JsonError }, InternalSerdeJson(serde_json::Error), } @@ -174,6 +216,12 @@ impl Error { Error::InvalidMap(value) => { crate::Error::UserError(UserError::InvalidVectorsMapType { document_id, value }) } + Error::InvalidEmbedderConf { error } => { + crate::Error::UserError(UserError::InvalidVectorsEmbedderConf { + document_id, + error, + }) + } Error::InternalSerdeJson(error) => { crate::Error::InternalError(InternalError::SerdeJson(error)) } @@ -194,13 +242,73 @@ fn to_vector_map( } /// Represents either a vector or an array of multiple vectors. -#[derive(serde::Serialize, serde::Deserialize, Debug)] +#[derive(serde::Serialize, Debug)] #[serde(transparent)] pub struct VectorOrArrayOfVectors { #[serde(with = "either::serde_untagged_optional")] inner: Option, Embedding>>, } +impl Deserr for VectorOrArrayOfVectors { + fn deserialize_from_value( + value: deserr::Value, + location: deserr::ValuePointerRef, + ) -> Result { + match value { + deserr::Value::Null => Ok(VectorOrArrayOfVectors { inner: None }), + deserr::Value::Sequence(seq) => { + let mut iter = seq.into_iter(); + let location = location.push_index(0); + match iter.next().map(|v| v.into_value()) { + None => { + // With the strange way serde serialize the `Either`, we must send the left part + // otherwise it'll consider we returned [[]] + Ok(VectorOrArrayOfVectors { inner: Some(either::Either::Left(Vec::new())) }) + } + Some(val @ deserr::Value::Sequence(_)) => { + let first = Embedding::deserialize_from_value(val, location)?; + let mut collect = vec![first]; + let mut tail = iter + .map(|v| Embedding::deserialize_from_value(v.into_value(), location)) + .collect::, _>>()?; + collect.append(&mut tail); + + Ok(VectorOrArrayOfVectors { inner: Some(either::Either::Left(collect)) }) + } + Some( + val @ deserr::Value::Integer(_) + | val @ deserr::Value::NegativeInteger(_) + | val @ deserr::Value::Float(_), + ) => { + let first = ::deserialize_from_value(val, location)?; + let mut embedding = iter + .map(|v| ::deserialize_from_value(v.into_value(), location)) + .collect::, _>>()?; + embedding.insert(0, first); + Ok(VectorOrArrayOfVectors { inner: Some(either::Either::Right(embedding)) }) + } + Some(value) => Err(take_cf_content(E::error( + None, + deserr::ErrorKind::IncorrectValueKind { + actual: value, + accepted: &[deserr::ValueKind::Sequence, deserr::ValueKind::Float], + }, + location, + ))), + } + } + value => Err(take_cf_content(E::error( + None, + deserr::ErrorKind::IncorrectValueKind { + actual: value, + accepted: &[deserr::ValueKind::Sequence, deserr::ValueKind::Null], + }, + location, + ))), + } + } +} + impl VectorOrArrayOfVectors { pub fn into_array_of_vectors(self) -> Option> { match self.inner? { @@ -234,15 +342,19 @@ impl From> for VectorOrArrayOfVectors { mod test { use super::VectorOrArrayOfVectors; + fn embedding_from_str(s: &str) -> Result { + let value: serde_json::Value = serde_json::from_str(s).unwrap(); + deserr::deserialize(value) + } + #[test] fn array_of_vectors() { - let null: VectorOrArrayOfVectors = serde_json::from_str("null").unwrap(); - let empty: VectorOrArrayOfVectors = serde_json::from_str("[]").unwrap(); - let one: VectorOrArrayOfVectors = serde_json::from_str("[0.1]").unwrap(); - let two: VectorOrArrayOfVectors = serde_json::from_str("[0.1, 0.2]").unwrap(); - let one_vec: VectorOrArrayOfVectors = serde_json::from_str("[[0.1, 0.2]]").unwrap(); - let two_vecs: VectorOrArrayOfVectors = - serde_json::from_str("[[0.1, 0.2], [0.3, 0.4]]").unwrap(); + let null = embedding_from_str("null").unwrap(); + let empty = embedding_from_str("[]").unwrap(); + let one = embedding_from_str("[0.1]").unwrap(); + let two = embedding_from_str("[0.1, 0.2]").unwrap(); + let one_vec = embedding_from_str("[[0.1, 0.2]]").unwrap(); + let two_vecs = embedding_from_str("[[0.1, 0.2], [0.3, 0.4]]").unwrap(); insta::assert_json_snapshot!(null.into_array_of_vectors(), @"null"); insta::assert_json_snapshot!(empty.into_array_of_vectors(), @"[]");