From 3493093c4f4df2889c5fc895fd372f7e5ea2cf50 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 11 Jun 2024 16:03:45 +0200 Subject: [PATCH] add a batch of tests --- index-scheduler/src/lib.rs | 176 +++++++++++++++++++++++++-- meilisearch/tests/vector/mod.rs | 78 ++++++++++++ meilisearch/tests/vector/settings.rs | 161 ++++++++++++++++++++++++ 3 files changed, 407 insertions(+), 8 deletions(-) create mode 100644 meilisearch/tests/vector/settings.rs diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index c5ae1c31f..e2a6f03a0 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -2016,6 +2016,7 @@ mod tests { // Wait for one successful batch. #[track_caller] fn advance_one_successful_batch(&mut self) { + self.index_scheduler.assert_internally_consistent(); self.advance_till([Start, BatchCreated]); loop { match self.advance() { @@ -2025,12 +2026,16 @@ mod tests { // the batch went successfully, we can stop the loop and go on with the next states. ProcessBatchSucceeded => break, AbortedIndexation => panic!("The batch was aborted.\n{}", snapshot_index_scheduler(&self.index_scheduler)), - ProcessBatchFailed => panic!("The batch failed.\n{}", snapshot_index_scheduler(&self.index_scheduler)), + ProcessBatchFailed => { + while self.advance() != Start {} + panic!("The batch failed.\n{}", snapshot_index_scheduler(&self.index_scheduler)) + }, breakpoint => panic!("Encountered an impossible breakpoint `{:?}`, this is probably an issue with the test suite.", breakpoint), } } self.advance_till([AfterProcessing]); + self.index_scheduler.assert_internally_consistent(); } // Wait for one failed batch. 
@@ -5012,7 +5017,6 @@ mod tests { false, ) .unwrap(); - index_scheduler.assert_internally_consistent(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_settings_task_vectors"); @@ -5105,7 +5109,6 @@ mod tests { false, ) .unwrap(); - index_scheduler.assert_internally_consistent(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after adding Intel"); @@ -5180,7 +5183,6 @@ mod tests { false, ) .unwrap(); - index_scheduler.assert_internally_consistent(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "Intel to kefir"); @@ -5303,9 +5305,7 @@ mod tests { false, ) .unwrap(); - index_scheduler.assert_internally_consistent(); handle.advance_one_successful_batch(); - index_scheduler.assert_internally_consistent(); let index = index_scheduler.index("doggos").unwrap(); let rtxn = index.read_txn().unwrap(); @@ -5452,9 +5452,7 @@ mod tests { false, ) .unwrap(); - index_scheduler.assert_internally_consistent(); handle.advance_one_successful_batch(); - index_scheduler.assert_internally_consistent(); // the document with the id 3 should have its original embedding updated let rtxn = index.read_txn().unwrap(); @@ -5481,4 +5479,166 @@ mod tests { assert!(!embedding.is_empty()); } + + #[test] + fn delete_document_containing_vector() { + // 1. Add an embedder + // 2. Push two documents containing a simple vector + // 3. Delete the document with the id 1 + // 4. The user defined roaring bitmap shouldn't contain the id of the deleted document anymore + // 5. Clear the index + // 6. The user defined roaring bitmap shouldn't contain the id of the remaining document either + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let setting = meilisearch_types::settings::Settings::<Unchecked> { + embedders: Setting::Set(maplit::btreemap! 
{ + S("manual") => Setting::Set(EmbeddingSettings { + source: Setting::Set(milli::vector::settings::EmbedderSource::UserProvided), + dimensions: Setting::Set(3), + ..Default::default() + }) + }), + ..Default::default() + }; + index_scheduler + .register( + KindWithContent::SettingsUpdate { + index_uid: S("doggos"), + new_settings: Box::new(setting), + is_deletion: false, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + let content = serde_json::json!( + [ + { + "id": 0, + "doggo": "kefir", + "_vectors": { + "manual": vec![0, 0, 0], + } + }, + { + "id": 1, + "doggo": "intel", + "_vectors": { + "manual": vec![1, 1, 1], + } + }, + ] + ); + + let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0_u128).unwrap(); + let documents_count = + read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file) + .unwrap(); + snapshot!(documents_count, @"2"); + file.persist().unwrap(); + + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: None, + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: false, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + index_scheduler + .register( + KindWithContent::DocumentDeletion { + index_uid: S("doggos"), + documents_ids: vec![S("1")], + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::<Vec<_>>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::<Vec<_>>(); + snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"}]"###); + let conf = 
index.embedding_configs(&rtxn).unwrap(); + // TODO: Here the user provided vectors should NOT contain 1 + snapshot!(format!("{conf:#?}"), @r###" + [ + IndexEmbeddingConfig { + name: "manual", + config: EmbeddingConfig { + embedder_options: UserProvided( + EmbedderOptions { + dimensions: 3, + distribution: None, + }, + ), + prompt: PromptData { + template: "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}", + }, + }, + user_provided: RoaringBitmap<[0, 1]>, + }, + ] + "###); + let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap(); + let embeddings = index.embeddings(&rtxn, docid).unwrap(); + let embedding = &embeddings["manual"]; + assert!(!embedding.is_empty(), "{embedding:?}"); + + index_scheduler + .register(KindWithContent::DocumentClear { index_uid: S("doggos") }, None, false) + .unwrap(); + handle.advance_one_successful_batch(); + + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::<Vec<_>>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::<Vec<_>>(); + snapshot!(serde_json::to_string(&documents).unwrap(), @"[]"); + let conf = index.embedding_configs(&rtxn).unwrap(); + // TODO: Here the user provided vectors should contain nothing + snapshot!(format!("{conf:#?}"), @r###" + [ + IndexEmbeddingConfig { + name: "manual", + config: EmbeddingConfig { + embedder_options: UserProvided( + EmbedderOptions { + dimensions: 3, + distribution: None, + }, + ), + prompt: PromptData { + template: "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}", + }, + }, + user_provided: RoaringBitmap<[0, 1]>, + }, + ] + "###); + } } diff --git a/meilisearch/tests/vector/mod.rs b/meilisearch/tests/vector/mod.rs index b4350116f..55dc186d5 100644 --- 
a/meilisearch/tests/vector/mod.rs +++ b/meilisearch/tests/vector/mod.rs @@ -1,5 +1,8 @@ +mod settings; + use meili_snap::{json_string, snapshot}; +use crate::common::index::Index; use crate::common::{GetAllDocumentsOptions, Server}; use crate::json; @@ -147,3 +150,78 @@ async fn add_remove_user_provided() { } "###); } + +async fn generate_default_user_provided_documents(server: &Server) -> Index { + let index = server.index("doggo"); + let (value, code) = server.set_features(json!({"vectorStore": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r###" + { + "vectorStore": true, + "metrics": false, + "logsRoute": false + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + } + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await; + + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0] }}, + {"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1] }}, + {"id": 2, "name": "billou", "_vectors": { "manual": [[2, 2, 2], [2, 2, 3]] }}, + {"id": 3, "name": "intel", "_vectors": { "manual": { "userProvided": true, "embeddings": [3, 3, 3] }}}, + {"id": 4, "name": "max", "_vectors": { "manual": { "userProvided": true, "embeddings": [[4, 4, 4], [4, 4, 5]] }}}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(value.uid()).await; + + index +} + +#[actix_rt::test] +async fn clear_documents() { + let server = Server::new().await; + let index = generate_default_user_provided_documents(&server).await; + + let (value, _code) = index.clear_all_documents().await; + index.wait_task(value.uid()).await; + + // Make sure the documents DB has been cleared + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), 
@r###" + { + "results": [], + "offset": 0, + "limit": 20, + "total": 0 + } + "###); + + // Make sure the arroy DB has been cleared + let (documents, _code) = index.search_post(json!({ "vector": [1, 1, 1] })).await; + snapshot!(json_string!(documents), @r###" + { + "hits": [], + "query": "", + "processingTimeMs": 0, + "limit": 20, + "offset": 0, + "estimatedTotalHits": 0, + "semanticHitCount": 0 + } + "###); +} diff --git a/meilisearch/tests/vector/settings.rs b/meilisearch/tests/vector/settings.rs new file mode 100644 index 000000000..6b93f001e --- /dev/null +++ b/meilisearch/tests/vector/settings.rs @@ -0,0 +1,161 @@ +use meili_snap::{json_string, snapshot}; + +use crate::common::{GetAllDocumentsOptions, Server}; +use crate::json; +use crate::vector::generate_default_user_provided_documents; + +#[actix_rt::test] +async fn update_embedder() { + let server = Server::new().await; + let index = server.index("doggo"); + let (value, code) = server.set_features(json!({"vectorStore": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r###" + { + "vectorStore": true, + "metrics": false, + "logsRoute": false + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { "manual": {}}, + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await; + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 2, + } + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + + let ret = server.wait_task(response.uid()).await; + snapshot!(ret, @r###" + { + "uid": 1, + "indexUid": "doggo", + "status": "failed", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 2 + } + } + }, + "error": { + "message": "`.embedders.manual`: Field `model` unavailable for source `userProvided` (only available for sources: `huggingFace`, `openAi`, `ollama`). 
Available fields: `source`, `dimensions`, `distribution`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); +} + +#[actix_rt::test] +async fn reset_embedder_documents() { + let server = Server::new().await; + let index = generate_default_user_provided_documents(&server).await; + + let (response, code) = index.delete_settings().await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await; + + // Make sure the documents are still present + let (documents, _code) = index.get_all_documents(Default::default()).await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir" + }, + { + "id": 1, + "name": "echo" + }, + { + "id": 2, + "name": "billou" + }, + { + "id": 3, + "name": "intel" + }, + { + "id": 4, + "name": "max" + } + ], + "offset": 0, + "limit": 20, + "total": 5 + } + "###); + + // Make sure we are still able to retrieve their vectors + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": {} + }, + { + "id": 1, + "name": "echo", + "_vectors": {} + }, + { + "id": 2, + "name": "billou", + "_vectors": {} + }, + { + "id": 3, + "name": "intel", + "_vectors": {} + }, + { + "id": 4, + "name": "max", + "_vectors": {} + } + ], + "offset": 0, + "limit": 20, + "total": 5 + } + "###); + + // Make sure the arroy DB has been cleared + let (documents, _code) = index.search_post(json!({ "vector": [1, 1, 1] })).await; + snapshot!(json_string!(documents), @r###" + { + "message": "Cannot find embedder with name `default`.", + "code": "invalid_embedder", + "type": "invalid_request", + "link": 
"https://docs.meilisearch.com/errors#invalid_embedder" + } + "###); +}