From cc5dca8321736805b881bcb8679f566300a8f9e8 Mon Sep 17 00:00:00 2001
From: Tamo
Date: Tue, 4 Jun 2024 16:41:33 +0200
Subject: [PATCH] fix two bugs and add a dump test

---
 index-scheduler/src/batch.rs             |  26 +--
 meilisearch/src/routes/indexes/search.rs |   4 +-
 meilisearch/src/search.rs                |  12 +-
 meilisearch/tests/dumps/mod.rs           | 206 +++++++++++++++++++++++
 4 files changed, 234 insertions(+), 14 deletions(-)

diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs
index 181ac49a3..d59a657c9 100644
--- a/index-scheduler/src/batch.rs
+++ b/index-scheduler/src/batch.rs
@@ -909,6 +909,7 @@ impl IndexScheduler {
 
                     let fields_ids_map = index.fields_ids_map(&rtxn)?;
                     let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect();
+                    let embedding_configs = index.embedding_configs(&rtxn)?;
 
                     // 3.1. Dump the documents
                     for ret in index.all_documents(&rtxn)? {
@@ -951,16 +952,21 @@ impl IndexScheduler {
                             };
 
                             for (embedder_name, embeddings) in embeddings {
-                                // don't change the entry if it already exists, because it was user-provided
-                                vectors.entry(embedder_name).or_insert_with(|| {
-                                    let embeddings = ExplicitVectors {
-                                        embeddings: VectorOrArrayOfVectors::from_array_of_vectors(
-                                            embeddings,
-                                        ),
-                                        user_provided: false,
-                                    };
-                                    serde_json::to_value(embeddings).unwrap()
-                                });
+                                let user_provided = embedding_configs
+                                    .iter()
+                                    .find(|conf| conf.name == embedder_name)
+                                    .is_some_and(|conf| conf.user_defined.contains(id));
+
+                                let embeddings = ExplicitVectors {
+                                    embeddings: VectorOrArrayOfVectors::from_array_of_vectors(
+                                        embeddings,
+                                    ),
+                                    user_provided,
+                                };
+                                vectors.insert(
+                                    embedder_name,
+                                    serde_json::to_value(embeddings).unwrap(),
+                                );
                             }
                         }
diff --git a/meilisearch/src/routes/indexes/search.rs b/meilisearch/src/routes/indexes/search.rs
index 91c8c8178..ae6402cf6 100644
--- a/meilisearch/src/routes/indexes/search.rs
+++ b/meilisearch/src/routes/indexes/search.rs
@@ -52,7 +52,7 @@ pub struct SearchQueryGet {
     #[deserr(default, error = DeserrQueryParamError)]
     attributes_to_retrieve: Option<CS<String>>,
     #[deserr(default, error = DeserrQueryParamError)]
-    retrieve_vectors: bool,
+    retrieve_vectors: Param<bool>,
     #[deserr(default, error = DeserrQueryParamError)]
     attributes_to_crop: Option<CS<String>>,
     #[deserr(default = Param(DEFAULT_CROP_LENGTH()), error = DeserrQueryParamError)]
@@ -155,7 +155,7 @@ impl From<SearchQueryGet> for SearchQuery {
             page: other.page.as_deref().copied(),
             hits_per_page: other.hits_per_page.as_deref().copied(),
             attributes_to_retrieve: other.attributes_to_retrieve.map(|o| o.into_iter().collect()),
-            retrieve_vectors: other.retrieve_vectors,
+            retrieve_vectors: other.retrieve_vectors.0,
             attributes_to_crop: other.attributes_to_crop.map(|o| o.into_iter().collect()),
             crop_length: other.crop_length.0,
             attributes_to_highlight: other.attributes_to_highlight.map(|o| o.into_iter().collect()),
diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs
index 1ab42a79f..d80910f09 100644
--- a/meilisearch/src/search.rs
+++ b/meilisearch/src/search.rs
@@ -1051,6 +1051,7 @@ fn make_hits(
     formatter_builder.highlight_prefix(format.highlight_pre_tag);
     formatter_builder.highlight_suffix(format.highlight_post_tag);
     let mut documents = Vec::new();
+    let embedding_configs = index.embedding_configs(&rtxn)?;
     let documents_iter = index.documents(rtxn, documents_ids)?;
     for ((id, obkv), score) in documents_iter.into_iter().zip(document_scores.into_iter()) {
         // First generate a document with all the displayed fields
@@ -1066,12 +1067,19 @@ fn make_hits(
         if retrieve_vectors {
             let mut vectors = serde_json::Map::new();
             for (name, mut vector) in index.embeddings(&rtxn, id)? {
+                let user_defined = embedding_configs
+                    .iter()
+                    .find(|conf| conf.name == name)
+                    .is_some_and(|conf| conf.user_defined.contains(id));
+                let mut embedding = serde_json::Map::new();
+                embedding.insert("userDefined".to_string(), user_defined.into());
                 if vector.len() == 1 {
                     let vector = vector.pop().unwrap();
-                    vectors.insert(name.into(), vector.into());
+                    embedding.insert("embedding".to_string(), vector.into());
                 } else {
-                    vectors.insert(name.into(), vector.into());
+                    embedding.insert("embedding".to_string(), vector.into());
                 }
+                vectors.insert(name.into(), embedding.into());
             }
             document.insert("_vectors".into(), vectors.into());
         }
diff --git a/meilisearch/tests/dumps/mod.rs b/meilisearch/tests/dumps/mod.rs
index c8f8ca105..dfac2e806 100644
--- a/meilisearch/tests/dumps/mod.rs
+++ b/meilisearch/tests/dumps/mod.rs
@@ -1938,3 +1938,209 @@ async fn import_dump_v6_containing_experimental_features() {
         })
         .await;
 }
+
+// In this test we must generate the dump ourselves to ensure the
+// `userDefined` vectors are set correctly
+#[actix_rt::test]
+async fn generate_and_import_dump_containing_vectors() {
+    let temp = tempfile::tempdir().unwrap();
+    let mut opt = default_settings(temp.path());
+    let server = Server::new_with_options(opt.clone()).await.unwrap();
+    let (code, _) = server.set_features(json!({"vectorStore": true})).await;
+    snapshot!(code, @r###"
+    {
+      "vectorStore": true,
+      "metrics": false,
+      "logsRoute": false
+    }
+    "###);
+    let index = server.index("pets");
+    let (response, code) = index
+        .update_settings(json!(
+            {
+              "embedders": {
+                "doggo_embedder": {
+                  "source": "huggingFace",
+                  "model": "sentence-transformers/all-MiniLM-L6-v2",
+                  "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e",
+                  "documentTemplate": "{{doc.doggo}}",
+                }
+              }
+            }
+        ))
+        .await;
+    snapshot!(code, @"202 Accepted");
+    let response = index.wait_task(response.uid()).await;
+    snapshot!(response);
+    let (response, code) = index
+        .add_documents(
+            json!([
+                {"id": 0, "doggo": "kefir", "_vectors": { "doggo_embedder": vec![0; 384] }},
+                {"id": 1, "doggo": "echo", "_vectors": { "doggo_embedder": { "userProvided": true, "embeddings": vec![1; 384] }}},
+                {"id": 2, "doggo": "intel", "_vectors": { "doggo_embedder": { "userProvided": false, "embeddings": vec![2; 384] }}},
+                {"id": 3, "doggo": "bill", "_vectors": { "doggo_embedder": { "userProvided": false }}},
+                {"id": 4, "doggo": "max" },
+            ]),
+            None,
+        )
+        .await;
+    snapshot!(code, @"202 Accepted");
+    let response = index.wait_task(response.uid()).await;
+    snapshot!(response);
+
+    let (response, code) = server.create_dump().await;
+    snapshot!(code, @"202 Accepted");
+    let response = index.wait_task(response.uid()).await;
+    snapshot!(response["status"], @r###""succeeded""###);
+
+    // ========= We made a dump, now we clear the DB and try to import our dump
+    drop(server);
+    tokio::fs::remove_dir_all(&opt.db_path).await.unwrap();
+    let dump_name = format!("{}.dump", response["details"]["dumpUid"].as_str().unwrap());
+    let dump_path = opt.dump_dir.join(dump_name);
+    assert!(dump_path.exists(), "path: `{}`", dump_path.display());
+
+    opt.import_dump = Some(dump_path);
+    // NOTE: We shouldn't have to change the database path, but I lost an hour
+    // to a « bad path » error and this fixed it.
+    opt.db_path = temp.path().join("data.ms");
+
+    let mut server = Server::new_auth_with_options(opt, temp).await;
+    server.use_api_key("MASTER_KEY");
+
+    let (indexes, code) = server.list_indexes(None, None).await;
+    assert_eq!(code, 200, "{indexes}");
+
+    snapshot!(indexes["results"].as_array().unwrap().len(), @"1");
+    snapshot!(indexes["results"][0]["uid"], @r###""pets""###);
+    snapshot!(indexes["results"][0]["primaryKey"], @r###""id""###);
+
+    let (response, code) = server.get_features().await;
+    meili_snap::snapshot!(code, @"200 OK");
+    meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
+    {
+      "vectorStore": true,
+      "metrics": false,
+      "logsRoute": false
+    }
+    "###);
+
+    let index = server.index("pets");
+
+    let (response, code) = index.settings().await;
+    meili_snap::snapshot!(code, @"200 OK");
+    meili_snap::snapshot!(meili_snap::json_string!(response), @r###"
+    {
+      "displayedAttributes": [
+        "*"
+      ],
+      "searchableAttributes": [
+        "*"
+      ],
+      "filterableAttributes": [],
+      "sortableAttributes": [],
+      "rankingRules": [
+        "words",
+        "typo",
+        "proximity",
+        "attribute",
+        "sort",
+        "exactness"
+      ],
+      "stopWords": [],
+      "nonSeparatorTokens": [],
+      "separatorTokens": [],
+      "dictionary": [],
+      "synonyms": {},
+      "distinctAttribute": null,
+      "proximityPrecision": "byWord",
+      "typoTolerance": {
+        "enabled": true,
+        "minWordSizeForTypos": {
+          "oneTypo": 5,
+          "twoTypos": 9
+        },
+        "disableOnWords": [],
+        "disableOnAttributes": []
+      },
+      "faceting": {
+        "maxValuesPerFacet": 100,
+        "sortFacetValuesBy": {
+          "*": "alpha"
+        }
+      },
+      "pagination": {
+        "maxTotalHits": 1000
+      },
+      "embedders": {
+        "doggo_embedder": {
+          "source": "huggingFace",
+          "model": "sentence-transformers/all-MiniLM-L6-v2",
+          "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e",
+          "documentTemplate": "{{doc.doggo}}"
+        }
+      },
+      "searchCutoffMs": null
+    }
+    "###);
+
+    index
+        .search(json!({"retrieveVectors": true}), |response, code| {
+            snapshot!(code, @"200 OK");
+            snapshot!(json_string!(response["hits"], { "[]._vectors.doggo_embedder.embedding" => "[vector]" }), @r###"
+            [
+              {
+                "id": 0,
+                "doggo": "kefir",
+                "_vectors": {
+                  "doggo_embedder": {
+                    "userDefined": true,
+                    "embedding": "[vector]"
+                  }
+                }
+              },
+              {
+                "id": 1,
+                "doggo": "echo",
+                "_vectors": {
+                  "doggo_embedder": {
+                    "userDefined": true,
+                    "embedding": "[vector]"
+                  }
+                }
+              },
+              {
+                "id": 2,
+                "doggo": "intel",
+                "_vectors": {
+                  "doggo_embedder": {
+                    "userDefined": false,
+                    "embedding": "[vector]"
+                  }
+                }
+              },
+              {
+                "id": 3,
+                "doggo": "bill",
+                "_vectors": {
+                  "doggo_embedder": {
+                    "userDefined": false,
+                    "embedding": "[vector]"
+                  }
+                }
+              },
+              {
+                "id": 4,
+                "doggo": "max",
+                "_vectors": {
+                  "doggo_embedder": {
+                    "userDefined": false,
+                    "embedding": "[vector]"
+                  }
+                }
+              }
+            ]
+            "###);
+        })
+        .await;
+}