From 84e498299bcc492ab91ecfefb989fdbd8ef897d8 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 22 May 2024 15:27:09 +0200 Subject: [PATCH] Remove the vectors from the documents database --- Cargo.lock | 2 + index-scheduler/Cargo.toml | 2 + index-scheduler/src/lib.rs | 287 +++++++++++++++++- .../documents after initial push.snap | 4 + .../documents after setting an embedder.snap | 4 + meilisearch-types/src/settings.rs | 2 +- milli/Cargo.toml | 2 +- milli/src/index.rs | 18 +- .../extract/extract_vector_points.rs | 46 ++- .../src/update/index_documents/extract/mod.rs | 4 + milli/src/update/index_documents/mod.rs | 9 +- .../src/update/index_documents/typed_chunk.rs | 20 +- milli/src/update/settings.rs | 40 ++- milli/src/vector/parsed_vectors.rs | 18 +- 14 files changed, 407 insertions(+), 51 deletions(-) create mode 100644 index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after initial push.snap create mode 100644 index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after setting an embedder.snap diff --git a/Cargo.lock b/Cargo.lock index b62a61f92..3b28a00e3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2455,6 +2455,7 @@ name = "index-scheduler" version = "1.9.0" dependencies = [ "anyhow", + "arroy", "big_s", "bincode", "crossbeam", @@ -2465,6 +2466,7 @@ dependencies = [ "file-store", "flate2", "insta", + "maplit", "meili-snap", "meilisearch-auth", "meilisearch-types", diff --git a/index-scheduler/Cargo.toml b/index-scheduler/Cargo.toml index 21fa34733..8959bb070 100644 --- a/index-scheduler/Cargo.toml +++ b/index-scheduler/Cargo.toml @@ -40,7 +40,9 @@ ureq = "2.9.7" uuid = { version = "1.6.1", features = ["serde", "v4"] } [dev-dependencies] +arroy = "0.3.1" big_s = "1.0.2" crossbeam = "0.8.4" insta = { version = "1.34.0", features = ["json", "redactions"] } +maplit = "1.0.2" meili-snap = { path = "../meili-snap" } diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 
8a1c2f540..ebeac30b3 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -1459,11 +1459,11 @@ impl IndexScheduler { // TODO: consider using a type alias or a struct embedder/template pub fn embedders( &self, - embedding_configs: Vec<(String, milli::vector::EmbeddingConfig)>, + embedding_configs: Vec<(String, milli::vector::EmbeddingConfig, RoaringBitmap)>, ) -> Result { let res: Result<_> = embedding_configs .into_iter() - .map(|(name, milli::vector::EmbeddingConfig { embedder_options, prompt })| { + .map(|(name, milli::vector::EmbeddingConfig { embedder_options, prompt }, _)| { let prompt = Arc::new(prompt.try_into().map_err(meilisearch_types::milli::Error::from)?); // optimistically return existing embedder @@ -1748,6 +1748,9 @@ mod tests { use meilisearch_types::milli::update::IndexDocumentsMethod::{ ReplaceDocuments, UpdateDocuments, }; + use meilisearch_types::milli::update::Setting; + use meilisearch_types::milli::vector::settings::EmbeddingSettings; + use meilisearch_types::settings::{Checked, Unchecked}; use meilisearch_types::tasks::IndexSwap; use meilisearch_types::VERSION_FILE_NAME; use tempfile::{NamedTempFile, TempDir}; @@ -3052,7 +3055,9 @@ mod tests { let rtxn = index.read_txn().unwrap(); let configs = index.embedding_configs(&rtxn).unwrap(); - let (_, embedding_config) = configs.first().unwrap(); + let (name, embedding_config, user_provided) = configs.first().unwrap(); + insta::assert_snapshot!(name, @"default"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); insta::assert_json_snapshot!(embedding_config.embedder_options); } @@ -5017,13 +5022,15 @@ mod tests { let configs = index.embedding_configs(&rtxn).unwrap(); // for consistency with the below #[allow(clippy::get_first)] - let (name, fakerest_config) = configs.get(0).unwrap(); - insta::assert_json_snapshot!(name, @r###""A_fakerest""###); + let (name, fakerest_config, user_provided) = configs.get(0).unwrap(); + insta::assert_snapshot!(name, 
@"A_fakerest"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); insta::assert_json_snapshot!(fakerest_config.embedder_options); let fakerest_name = name.clone(); - let (name, simple_hf_config) = configs.get(1).unwrap(); - insta::assert_json_snapshot!(name, @r###""B_small_hf""###); + let (name, simple_hf_config, user_provided) = configs.get(1).unwrap(); + insta::assert_snapshot!(name, @"B_small_hf"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); insta::assert_json_snapshot!(simple_hf_config.embedder_options); let simple_hf_name = name.clone(); @@ -5091,6 +5098,18 @@ mod tests { let index = index_scheduler.index("doggos").unwrap(); let rtxn = index.read_txn().unwrap(); + // Ensure the document has been inserted into the relevant bitmap + let configs = index.embedding_configs(&rtxn).unwrap(); + // for consistency with the below + #[allow(clippy::get_first)] + let (name, _config, user_defined) = configs.get(0).unwrap(); + insta::assert_snapshot!(name, @"A_fakerest"); + insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>"); + + let (name, _config, user_defined) = configs.get(1).unwrap(); + insta::assert_snapshot!(name, @"B_small_hf"); + insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>"); + let embeddings = index.embeddings(&rtxn, 0).unwrap(); assert_json_snapshot!(embeddings[&simple_hf_name][0] == lab_embed, @"true"); @@ -5153,6 +5172,18 @@ mod tests { let index = index_scheduler.index("doggos").unwrap(); let rtxn = index.read_txn().unwrap(); + // Ensure the document has been inserted into the relevant bitmap + let configs = index.embedding_configs(&rtxn).unwrap(); + // for consistency with the below + #[allow(clippy::get_first)] + let (name, _config, user_defined) = configs.get(0).unwrap(); + insta::assert_snapshot!(name, @"A_fakerest"); + insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>"); + + let (name, _config, user_defined) = configs.get(1).unwrap(); +
insta::assert_snapshot!(name, @"B_small_hf"); + insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>"); + let embeddings = index.embeddings(&rtxn, 0).unwrap(); // automatically changed to patou @@ -5176,4 +5207,246 @@ mod tests { } } } + + #[test] + fn import_vectors_first_and_embedder_later() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let content = serde_json::json!( + [ + { + "id": 0, + "doggo": "kefir", + }, + { + "id": 1, + "doggo": "intel", + "_vectors": { + "my_doggo_embedder": vec![1; 384], + "unknown embedder": vec![1, 2, 3], + } + }, + { + "id": 2, + "doggo": "max", + "_vectors": { + "my_doggo_embedder": { + "userProvided": true, + "embeddings": vec![2; 384], + }, + "unknown embedder": vec![4, 5], + }, + }, + { + "id": 3, + "doggo": "marcel", + "_vectors": { + "my_doggo_embedder": { + "userProvided": false, + "embeddings": vec![3; 384], + }, + }, + }, + { + "id": 4, + "doggo": "sora", + "_vectors": { + "my_doggo_embedder": { + "userProvided": false, + }, + }, + }, + ] + ); + + let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0 as u128).unwrap(); + let documents_count = + read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file) + .unwrap(); + snapshot!(documents_count, @"5"); + file.persist().unwrap(); + + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: None, + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + handle.advance_one_successful_batch(); + index_scheduler.assert_internally_consistent(); + + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + 
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + snapshot!(serde_json::to_string(&documents).unwrap(), name: "documents after initial push"); + + let mut setting = meilisearch_types::settings::Settings::::default(); + setting.embedders = Setting::Set(maplit::btreemap! { + S("my_doggo_embedder") => Setting::Set(EmbeddingSettings { + source: Setting::Set(milli::vector::settings::EmbedderSource::HuggingFace), + model: Setting::Set(S("sentence-transformers/all-MiniLM-L6-v2")), + revision: Setting::Set(S("e4ce9877abf3edfe10b0d82785e83bdcb973e22e")), + document_template: Setting::Set(S("{{doc.doggo}}")), + .. EmbeddingSettings::default() + }) + }); + index_scheduler + .register( + KindWithContent::SettingsUpdate { + index_uid: S("doggos"), + new_settings: Box::new(setting), + is_deletion: false, + allow_index_creation: false, + }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + handle.advance_one_successful_batch(); + index_scheduler.assert_internally_consistent(); + + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + // all the vectors linked to the newly specified embedder have been removed + // Only the unknown embedders stay in the document DB + snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"unknown embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"unknown embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel"},{"id":4,"doggo":"sora"}]"###); + let conf = index.embedding_configs(&rtxn).unwrap(); + // even though we specified the vector for the ID 3, it shouldn't be marked + // as user provided
since we explicitly marked it as NOT user provided. + snapshot!(format!("{conf:#?}"), @r###" + [ + ( + "my_doggo_embedder", + EmbeddingConfig { + embedder_options: HuggingFace( + EmbedderOptions { + model: "sentence-transformers/all-MiniLM-L6-v2", + revision: Some( + "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + ), + distribution: None, + }, + ), + prompt: PromptData { + template: "{{doc.doggo}}", + }, + }, + RoaringBitmap<[1, 2]>, + ), + ] + "###); + let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap(); + let embeddings = index.embeddings(&rtxn, docid).unwrap(); + let embedding = &embeddings["my_doggo_embedder"]; + assert!(!embedding.is_empty(), "{embedding:?}"); + + // the document with the id 3 should keep its original embedding + let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap(); + let mut embeddings = Vec::new(); + + 'vectors: for i in 0..=u8::MAX { + let reader = arroy::Reader::open(&rtxn, 0 | (i as u16), index.vector_arroy) + .map(Some) + .or_else(|e| match e { + arroy::Error::MissingMetadata => Ok(None), + e => Err(e), + }) + .transpose(); + + let Some(reader) = reader else { + break 'vectors; + }; + + let embedding = reader.unwrap().item_vector(&rtxn, docid).unwrap(); + if let Some(embedding) = embedding { + embeddings.push(embedding) + } else { + break 'vectors; + } + } + + snapshot!(embeddings.len(), @"1"); + assert!(embeddings[0].iter().all(|i| *i == 3.0), "{:?}", embeddings[0]); + + // If we update marcel it should regenerate its embedding automatically + + let content = serde_json::json!( + [ + { + "id": 3, + "doggo": "marvel", + }, + { + "id": 4, + "doggo": "sorry", + }, + ] + ); + + let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(1 as u128).unwrap(); + let documents_count = + read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file) + .unwrap(); + snapshot!(documents_count, @"2"); + file.persist().unwrap(); + + index_scheduler + .register( +
KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: None, + method: UpdateDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + handle.advance_one_successful_batch(); + index_scheduler.assert_internally_consistent(); + + // the document with the id 3 should have its original embedding updated + let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap(); + let embeddings = index.embeddings(&rtxn, docid).unwrap(); + let embedding = &embeddings["my_doggo_embedder"]; + + assert!(!embedding.is_empty()); + /// TODO: it shouldn’t be equal to 3.0 + assert!(embedding[0].iter().all(|i| *i == 3.0), "{:?}", embedding[0]); + + // the document with the id 4 should generate an embedding + // let docid = index.external_documents_ids.get(&rtxn, "4").unwrap().unwrap(); + // let embeddings = index.embeddings(&rtxn, docid).unwrap(); + // dbg!(&embeddings); + // let embedding = &embeddings["my_doggo_embedder"]; + + // assert!(!embedding.is_empty()); + // assert!(embedding[0]); + } } diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after initial push.snap b/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after initial push.snap new file mode 100644 index 000000000..433a190f9 --- /dev/null +++ b/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after initial push.snap @@ -0,0 +1,4 @@ +--- +source: index-scheduler/src/lib.rs +--- 
+[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"my_doggo_embedder":[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0],"unknown 
embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"my_doggo_embedder":{"embeddings":[2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0],"userProvided":true},"unknown 
embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel","_vectors":{"my_doggo_embedder":{"embeddings":[3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0],"userProvided":false}}},{"id":4,"doggo":"sora","_vectors":{"my_doggo_embedder":{"embeddings":null,"userProvided":false}}}] diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after setting an embedder.snap b/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after setting an 
embedder.snap new file mode 100644 index 000000000..853be8b0a --- /dev/null +++ b/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after setting an embedder.snap @@ -0,0 +1,4 @@ +--- +source: index-scheduler/src/lib.rs +--- +[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"unknown embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"unknown embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel"},{"id":4,"doggo":"sora"}] diff --git a/meilisearch-types/src/settings.rs b/meilisearch-types/src/settings.rs index 223d71658..d1d82be68 100644 --- a/meilisearch-types/src/settings.rs +++ b/meilisearch-types/src/settings.rs @@ -672,7 +672,7 @@ pub fn settings( let embedders: BTreeMap<_, _> = index .embedding_configs(rtxn)? .into_iter() - .map(|(name, config)| (name, Setting::Set(config.into()))) + .map(|(name, config, _)| (name, Setting::Set(config.into()))) .collect(); let embedders = if embedders.is_empty() { Setting::NotSet } else { Setting::Set(embedders) }; diff --git a/milli/Cargo.toml b/milli/Cargo.toml index f23694d10..7fba2af1e 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -44,7 +44,7 @@ once_cell = "1.19.0" ordered-float = "4.2.0" rand_pcg = { version = "0.3.1", features = ["serde1"] } rayon = "1.8.0" -roaring = "0.10.2" +roaring = { version = "0.10.2", features = ["serde"] } rstar = { version = "0.11.0", features = ["serde"] } serde = { version = "1.0.195", features = ["derive"] } serde_json = { version = "1.0.111", features = ["preserve_order"] } diff --git a/milli/src/index.rs b/milli/src/index.rs index ef4936ed1..569a9a692 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1572,16 +1572,18 @@ impl Index { Ok(script_language) } + /// Put the embedding configs: + /// 1. The name of the embedder + /// 2. The configuration option for this embedder + /// 3. 
The list of documents with a user provided embedding pub(crate) fn put_embedding_configs( &self, wtxn: &mut RwTxn<'_>, - configs: Vec<(String, EmbeddingConfig)>, + configs: Vec<(String, EmbeddingConfig, RoaringBitmap)>, ) -> heed::Result<()> { - self.main.remap_types::>>().put( - wtxn, - main_key::EMBEDDING_CONFIGS, - &configs, - ) + self.main + .remap_types::>>() + .put(wtxn, main_key::EMBEDDING_CONFIGS, &configs) } pub(crate) fn delete_embedding_configs(&self, wtxn: &mut RwTxn<'_>) -> heed::Result { @@ -1591,10 +1593,10 @@ impl Index { pub fn embedding_configs( &self, rtxn: &RoTxn<'_>, - ) -> Result> { + ) -> Result> { Ok(self .main - .remap_types::>>() + .remap_types::>>() .get(rtxn, main_key::EMBEDDING_CONFIGS)? .unwrap_or_default()) } diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index 76ec90d65..d97d1403c 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -10,16 +10,16 @@ use bytemuck::cast_slice; use grenad::Writer; use itertools::EitherOrBoth; use ordered_float::OrderedFloat; +use roaring::RoaringBitmap; use serde_json::Value; use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; use crate::prompt::Prompt; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; -use crate::update::index_documents::helpers::try_split_at; use crate::update::settings::InnerIndexSettingsDiff; use crate::vector::parsed_vectors::{ParsedVectorsDiff, RESERVED_VECTORS_FIELD_NAME}; use crate::vector::Embedder; -use crate::{DocumentId, Result, ThreadPoolNoAbort}; +use crate::{try_split_array_at, DocumentId, Result, ThreadPoolNoAbort}; /// The length of the elements that are always in the buffer when inserting new values. 
const TRUNCATE_SIZE: usize = size_of::(); @@ -35,6 +35,8 @@ pub struct ExtractedVectorPoints { // embedder pub embedder_name: String, pub embedder: Arc, + pub user_defined: RoaringBitmap, + pub remove_from_user_defined: RoaringBitmap, } enum VectorStateDelta { @@ -80,6 +82,11 @@ struct EmbedderVectorExtractor { prompts_writer: Writer>, // (docid) -> () remove_vectors_writer: Writer>, + + // The docids of the documents that contains a user defined embedding + user_defined: RoaringBitmap, + // The docids of the documents that contains an auto-generated embedding + remove_from_user_defined: RoaringBitmap, } /// Extracts the embedding vector contained in each document under the `_vectors` field. @@ -134,6 +141,8 @@ pub fn extract_vector_points( manual_vectors_writer, prompts_writer, remove_vectors_writer, + user_defined: RoaringBitmap::new(), + remove_from_user_defined: RoaringBitmap::new(), }); } @@ -141,13 +150,15 @@ pub fn extract_vector_points( let mut cursor = obkv_documents.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? 
{ // this must always be serialized as (docid, external_docid); + const SIZE_OF_DOCUMENTID: usize = std::mem::size_of::(); let (docid_bytes, external_id_bytes) = - try_split_at(key, std::mem::size_of::()).unwrap(); + try_split_array_at::(key).unwrap(); debug_assert!(from_utf8(external_id_bytes).is_ok()); + let docid = DocumentId::from_be_bytes(docid_bytes); let obkv = obkv::KvReader::new(value); key_buffer.clear(); - key_buffer.extend_from_slice(docid_bytes); + key_buffer.extend_from_slice(docid_bytes.as_slice()); // since we only need the primary key when we throw an error we create this getter to // lazily get it when needed @@ -163,10 +174,22 @@ pub fn extract_vector_points( manual_vectors_writer, prompts_writer, remove_vectors_writer, + user_defined, + remove_from_user_defined, } in extractors.iter_mut() { let delta = match parsed_vectors.remove(embedder_name) { (Some(old), Some(new)) => { + match (old.is_user_provided(), new.is_user_provided()) { + (true, true) | (false, false) => (), + (true, false) => { + remove_from_user_defined.insert(docid); + } + (false, true) => { + user_defined.insert(docid); + } + } + // no autogeneration let del_vectors = old.into_array_of_vectors(); let add_vectors = new.into_array_of_vectors(); @@ -187,6 +210,7 @@ pub fn extract_vector_points( .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) .any(|deladd| deladd.get(DelAdd::Addition).is_some()); if document_is_kept { + remove_from_user_defined.insert(docid); // becomes autogenerated VectorStateDelta::NowGenerated(prompt.render( obkv, @@ -198,6 +222,11 @@ pub fn extract_vector_points( } } (None, Some(new)) => { + if new.is_user_provided() { + user_defined.insert(docid); + } else { + remove_from_user_defined.insert(docid); + } // was possibly autogenerated, remove all vectors for that document let add_vectors = new.into_array_of_vectors(); if add_vectors.len() > usize::from(u8::MAX) { @@ -239,6 +268,7 @@ pub fn extract_vector_points( VectorStateDelta::NoChange } } else { + 
remove_from_user_defined.remove(docid); VectorStateDelta::NowRemoved } } @@ -265,18 +295,18 @@ pub fn extract_vector_points( manual_vectors_writer, prompts_writer, remove_vectors_writer, + user_defined, + remove_from_user_defined, } in extractors { results.push(ExtractedVectorPoints { - // docid, _index -> KvWriterDelAdd -> Vector manual_vectors: writer_into_reader(manual_vectors_writer)?, - // docid -> () remove_vectors: writer_into_reader(remove_vectors_writer)?, - // docid -> prompt prompts: writer_into_reader(prompts_writer)?, - embedder, embedder_name, + user_defined, + remove_from_user_defined, }) } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 18340a3ae..80214e7c8 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -238,6 +238,8 @@ fn send_original_documents_data( prompts, embedder_name, embedder, + user_defined, + remove_from_user_defined: auto_generated, } in extracted_vectors { let embeddings = match extract_embeddings( @@ -262,6 +264,8 @@ fn send_original_documents_data( expected_dimension: embedder.dimensions(), manual_vectors, embedder_name, + user_defined, + remove_from_user_defined: auto_generated, })); } } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 2420463b4..a03e4333e 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -501,6 +501,8 @@ where embeddings, manual_vectors, embedder_name, + user_defined, + remove_from_user_defined, } => { dimension.insert(embedder_name.clone(), expected_dimension); TypedChunk::VectorPoints { @@ -509,6 +511,8 @@ where expected_dimension, manual_vectors, embedder_name, + user_defined, + remove_from_user_defined, } } otherwise => otherwise, @@ -2616,10 +2620,11 @@ mod tests { let rtxn = index.read_txn().unwrap(); let mut embedding_configs = index.embedding_configs(&rtxn).unwrap(); - 
let (embedder_name, embedder) = embedding_configs.pop().unwrap(); + let (embedder_name, embedder, user_defined) = embedding_configs.pop().unwrap(); + insta::assert_snapshot!(embedder_name, @"manual"); + insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0, 1, 2]>"); let embedder = std::sync::Arc::new(crate::vector::Embedder::new(embedder.embedder_options).unwrap()); - assert_eq!("manual", embedder_name); let res = index .search(&rtxn) .semantic(embedder_name, embedder, Some([0.0, 1.0, 2.0].to_vec())) diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 2fbe91685..2c4e17858 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -90,6 +90,8 @@ pub(crate) enum TypedChunk { expected_dimension: usize, manual_vectors: grenad::Reader>, embedder_name: String, + user_defined: RoaringBitmap, + remove_from_user_defined: RoaringBitmap, }, ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>), } @@ -155,7 +157,7 @@ pub(crate) fn write_typed_chunk_into_index( let mut iter = merger.into_stream_merger_iter()?; let embedders: BTreeSet<_> = - index.embedding_configs(wtxn)?.into_iter().map(|(k, _v)| k).collect(); + index.embedding_configs(wtxn)?.into_iter().map(|(name, _, _)| name).collect(); let mut vectors_buffer = Vec::new(); while let Some((key, reader)) = iter.next()? 
{ let mut writer: KvWriter<_, FieldId> = KvWriter::memory(); @@ -181,7 +183,7 @@ pub(crate) fn write_typed_chunk_into_index( // if the `_vectors` field cannot be parsed as map of vectors, just write it as-is break 'vectors Some(addition); }; - vectors.retain_user_provided_vectors(&embedders); + vectors.retain_not_embedded_vectors(&embedders); let crate::vector::parsed_vectors::ParsedVectors(vectors) = vectors; if vectors.is_empty() { // skip writing empty `_vectors` map @@ -619,6 +621,8 @@ pub(crate) fn write_typed_chunk_into_index( let mut remove_vectors_builder = MergerBuilder::new(keep_first as MergeFn); let mut manual_vectors_builder = MergerBuilder::new(keep_first as MergeFn); let mut embeddings_builder = MergerBuilder::new(keep_first as MergeFn); + let mut user_defined = RoaringBitmap::new(); + let mut remove_from_user_defined = RoaringBitmap::new(); let mut params = None; for typed_chunk in typed_chunks { let TypedChunk::VectorPoints { @@ -627,6 +631,8 @@ pub(crate) fn write_typed_chunk_into_index( embeddings, expected_dimension, embedder_name, + user_defined: ud, + remove_from_user_defined: rud, } = typed_chunk else { unreachable!(); @@ -639,11 +645,21 @@ pub(crate) fn write_typed_chunk_into_index( if let Some(embeddings) = embeddings { embeddings_builder.push(embeddings.into_cursor()?); } + user_defined |= ud; + remove_from_user_defined |= rud; } // typed chunks has always at least 1 chunk. 
let Some((expected_dimension, embedder_name)) = params else { unreachable!() }; + let mut embedding_configs = index.embedding_configs(&wtxn)?; + let (_name, _conf, ud) = + embedding_configs.iter_mut().find(|config| config.0 == embedder_name).unwrap(); + *ud -= remove_from_user_defined; + *ud |= user_defined; + + index.put_embedding_configs(wtxn, embedding_configs)?; + let embedder_index = index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or( InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None }, )?; diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 68c31fabb..64998bcc3 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -6,6 +6,7 @@ use std::sync::Arc; use charabia::{Normalize, Tokenizer, TokenizerBuilder}; use deserr::{DeserializeError, Deserr}; use itertools::{EitherOrBoth, Itertools}; +use roaring::RoaringBitmap; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use time::OffsetDateTime; @@ -926,8 +927,13 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { Setting::Set(configs) => { let mut changed = false; let old_configs = self.index.embedding_configs(self.wtxn)?; - let old_configs: BTreeMap> = - old_configs.into_iter().map(|(k, v)| (k, Setting::Set(v.into()))).collect(); + let old_configs: BTreeMap, RoaringBitmap)> = + old_configs + .into_iter() + .map(|(name, setting, user_defined)| { + (name, (Setting::Set(setting.into()), user_defined)) + }) + .collect(); let mut new_configs = BTreeMap::new(); for joined in old_configs @@ -936,15 +942,19 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { { match joined { // updated config - EitherOrBoth::Both((name, mut old), (_, new)) => { + EitherOrBoth::Both((name, (mut old, user_defined)), (_, new)) => { changed |= EmbeddingSettings::apply_and_need_reindex(&mut old, new); if changed { - tracing::debug!(embedder = name, "need reindex"); + tracing::debug!( + embedder = name, + documents = user_defined.len(), + "need reindex" + ); } 
else { tracing::debug!(embedder = name, "skip reindex"); } let new = validate_embedding_settings(old, &name)?; - new_configs.insert(name, new); + new_configs.insert(name, (new, user_defined)); } // unchanged config EitherOrBoth::Left((name, setting)) => { @@ -961,21 +971,23 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { ); let setting = validate_embedding_settings(setting, &name)?; changed = true; - new_configs.insert(name, setting); + new_configs.insert(name, (setting, RoaringBitmap::new())); } } } - let new_configs: Vec<(String, EmbeddingConfig)> = new_configs + let new_configs: Vec<(String, EmbeddingConfig, RoaringBitmap)> = new_configs .into_iter() - .filter_map(|(name, setting)| match setting { - Setting::Set(value) => Some((name, value.into())), + .filter_map(|(name, (setting, user_defined))| match setting { + Setting::Set(settings) => Some((name, settings.into(), user_defined)), Setting::Reset => None, - Setting::NotSet => Some((name, EmbeddingSettings::default().into())), + Setting::NotSet => { + Some((name, EmbeddingSettings::default().into(), user_defined)) + } }) .collect(); self.index.embedder_category_id.clear(self.wtxn)?; - for (index, (embedder_name, _)) in new_configs.iter().enumerate() { + for (index, (embedder_name, _, _)) in new_configs.iter().enumerate() { self.index.embedder_category_id.put_with_flags( self.wtxn, heed::PutFlags::APPEND, @@ -1359,10 +1371,12 @@ impl InnerIndexSettings { } } -fn embedders(embedding_configs: Vec<(String, EmbeddingConfig)>) -> Result { +fn embedders( + embedding_configs: Vec<(String, EmbeddingConfig, RoaringBitmap)>, +) -> Result { let res: Result<_> = embedding_configs .into_iter() - .map(|(name, EmbeddingConfig { embedder_options, prompt })| { + .map(|(name, EmbeddingConfig { embedder_options, prompt }, _)| { let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?); let embedder = Arc::new( diff --git a/milli/src/vector/parsed_vectors.rs b/milli/src/vector/parsed_vectors.rs index 2c61baa9e..62c418149 
100644 --- a/milli/src/vector/parsed_vectors.rs +++ b/milli/src/vector/parsed_vectors.rs @@ -17,6 +17,13 @@ pub enum Vectors { } impl Vectors { + pub fn is_user_provided(&self) -> bool { + match self { + Vectors::ImplicitlyUserProvided(_) => true, + Vectors::Explicit(ExplicitVectors { user_provided, .. }) => *user_provided, + } + } + pub fn into_array_of_vectors(self) -> Vec { match self { Vectors::ImplicitlyUserProvided(embeddings) @@ -89,15 +96,8 @@ impl ParsedVectors { Ok(ParsedVectors(value)) } - pub fn retain_user_provided_vectors(&mut self, embedders: &BTreeSet) { - self.0.retain(|k, v| match v { - Vectors::ImplicitlyUserProvided(_) => true, - Vectors::Explicit(ExplicitVectors { embeddings: _, user_provided }) => { - *user_provided - // if the embedder is not in the config, then never touch it - || !embedders.contains(k) - } - }); + pub fn retain_not_embedded_vectors(&mut self, embedders: &BTreeSet) { + self.0.retain(|k, _v| !embedders.contains(k)) } }