From 4148fbbe8557dd3fa1d5a6d67d14665eca816e4c Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 21 May 2024 11:47:05 +0200 Subject: [PATCH 01/44] provide a method to get all the nested fields ids from a name --- milli/src/fields_ids_map.rs | 38 +++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/milli/src/fields_ids_map.rs b/milli/src/fields_ids_map.rs index 9c1c87f82..f9d7c3704 100644 --- a/milli/src/fields_ids_map.rs +++ b/milli/src/fields_ids_map.rs @@ -41,6 +41,16 @@ impl FieldsIdsMap { } } + /// Get the ids of a field and all its nested fields based on its name. + pub fn nested_ids(&self, name: &str) -> Vec { + self.names_ids + .range(name.to_string()..) + .take_while(|(key, _)| key.starts_with(name)) + .filter(|(key, _)| crate::is_faceted_by(key, name)) + .map(|(_name, id)| *id) + .collect() + } + /// Get the id of a field based on its name. pub fn id(&self, name: &str) -> Option { self.names_ids.get(name).copied() @@ -126,4 +136,32 @@ mod tests { assert_eq!(iter.next(), Some((3, "title"))); assert_eq!(iter.next(), None); } + + #[test] + fn nested_fields() { + let mut map = FieldsIdsMap::new(); + + assert_eq!(map.insert("id"), Some(0)); + assert_eq!(map.insert("doggo"), Some(1)); + assert_eq!(map.insert("doggo.name"), Some(2)); + assert_eq!(map.insert("doggolution"), Some(3)); + assert_eq!(map.insert("doggo.breed.name"), Some(4)); + assert_eq!(map.insert("description"), Some(5)); + + insta::assert_debug_snapshot!(map.nested_ids("doggo"), @r###" + [ + 1, + 4, + 2, + ] + "###); + + insta::assert_debug_snapshot!(map.nested_ids("doggo.breed"), @r###" + [ + 4, + ] + "###); + + insta::assert_debug_snapshot!(map.nested_ids("_vector"), @"[]"); + } } From 7a84697570c4f03f903328d0da7145941a1ef445 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 21 May 2024 17:08:45 +0200 Subject: [PATCH 02/44] never store the _vectors as searchable or faceted fields --- milli/src/fieldids_weights_map.rs | 10 ++- milli/src/index.rs | 101 
+++++++++++++++++++++++++++++- milli/src/update/settings.rs | 22 +++++-- 3 files changed, 124 insertions(+), 9 deletions(-) diff --git a/milli/src/fieldids_weights_map.rs b/milli/src/fieldids_weights_map.rs index a737632a4..2bf828711 100644 --- a/milli/src/fieldids_weights_map.rs +++ b/milli/src/fieldids_weights_map.rs @@ -4,7 +4,7 @@ use std::collections::HashMap; use serde::{Deserialize, Serialize}; -use crate::{FieldId, FieldsIdsMap, Weight}; +use crate::{vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME, FieldId, FieldsIdsMap, Weight}; #[derive(Debug, Default, Serialize, Deserialize)] pub struct FieldidsWeightsMap { @@ -23,7 +23,13 @@ impl FieldidsWeightsMap { /// Should only be called in the case there are NO searchable attributes. /// All the fields will be inserted in the order of the fields ids map with a weight of 0. pub fn from_field_id_map_without_searchable(fid_map: &FieldsIdsMap) -> Self { - FieldidsWeightsMap { map: fid_map.ids().map(|fid| (fid, 0)).collect() } + FieldidsWeightsMap { + map: fid_map + .iter() + .filter(|(_fid, name)| !crate::is_faceted_by(name, RESERVED_VECTORS_FIELD_NAME)) + .map(|(fid, _name)| (fid, 0)) + .collect(), + } } /// Removes a field id from the map, returning the associated weight previously in the map. diff --git a/milli/src/index.rs b/milli/src/index.rs index 3c502d541..ef4936ed1 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -23,6 +23,7 @@ use crate::heed_codec::{ }; use crate::order_by_map::OrderByMap; use crate::proximity::ProximityPrecision; +use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME; use crate::vector::{Embedding, EmbeddingConfig}; use crate::{ default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, @@ -644,6 +645,7 @@ impl Index { &self, wtxn: &mut RwTxn, user_fields: &[&str], + non_searchable_fields_ids: &[FieldId], fields_ids_map: &FieldsIdsMap, ) -> Result<()> { // We can write the user defined searchable fields as-is. 
@@ -662,6 +664,7 @@ impl Index { for (weight, user_field) in user_fields.iter().enumerate() { if crate::is_faceted_by(field_from_map, user_field) && !real_fields.contains(&field_from_map) + && !non_searchable_fields_ids.contains(&id) { real_fields.push(field_from_map); @@ -708,6 +711,7 @@ impl Index { Ok(self .fields_ids_map(rtxn)? .names() + .filter(|name| !crate::is_faceted_by(name, RESERVED_VECTORS_FIELD_NAME)) .map(|field| Cow::Owned(field.to_string())) .collect()) }) @@ -1669,15 +1673,17 @@ pub(crate) mod tests { use big_s::S; use heed::{EnvOpenOptions, RwTxn}; - use maplit::hashset; + use maplit::{btreemap, hashset}; use tempfile::TempDir; use crate::documents::DocumentsBatchReader; use crate::error::{Error, InternalError}; use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; use crate::update::{ - self, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings, + self, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting, + Settings, }; + use crate::vector::settings::{EmbedderSource, EmbeddingSettings}; use crate::{db_snap, obkv_to_json, Filter, Index, Search, SearchResult}; pub(crate) struct TempIndex { @@ -2783,4 +2789,95 @@ pub(crate) mod tests { ] "###); } + + #[test] + fn vectors_are_never_indexed_as_searchable_or_filterable() { + let index = TempIndex::new(); + + index + .add_documents(documents!([ + { "id": 0, "_vectors": { "doggo": [2345] } }, + { "id": 1, "_vectors": { "doggo": [6789] } }, + ])) + .unwrap(); + + db_snap!(index, fields_ids_map, @r###" + 0 id | + 1 _vectors | + 2 _vectors.doggo | + "###); + db_snap!(index, searchable_fields, @r###"["id"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 0 0 | + "###); + + let rtxn = index.read_txn().unwrap(); + let mut search = index.search(&rtxn); + let results = search.query("2345").execute().unwrap(); + assert!(results.candidates.is_empty()); + drop(rtxn); + + index + .update_settings(|settings| 
{ + settings.set_searchable_fields(vec![S("_vectors"), S("_vectors.doggo")]); + settings.set_filterable_fields(hashset![S("_vectors"), S("_vectors.doggo")]); + }) + .unwrap(); + + db_snap!(index, fields_ids_map, @r###" + 0 id | + 1 _vectors | + 2 _vectors.doggo | + "###); + db_snap!(index, searchable_fields, @"[]"); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + "###); + + let rtxn = index.read_txn().unwrap(); + let mut search = index.search(&rtxn); + let results = search.query("2345").execute().unwrap(); + assert!(results.candidates.is_empty()); + + let mut search = index.search(&rtxn); + let results = search + .filter(Filter::from_str("_vectors.doggo = 6789").unwrap().unwrap()) + .execute() + .unwrap(); + assert!(results.candidates.is_empty()); + + index + .update_settings(|settings| { + settings.set_embedder_settings(btreemap! { + S("doggo") => Setting::Set(EmbeddingSettings { + dimensions: Setting::Set(1), + source: Setting::Set(EmbedderSource::UserProvided), + ..EmbeddingSettings::default()}), + }); + }) + .unwrap(); + + db_snap!(index, fields_ids_map, @r###" + 0 id | + 1 _vectors | + 2 _vectors.doggo | + "###); + db_snap!(index, searchable_fields, @"[]"); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + "###); + + let rtxn = index.read_txn().unwrap(); + let mut search = index.search(&rtxn); + let results = search.query("2345").execute().unwrap(); + assert!(results.candidates.is_empty()); + + let mut search = index.search(&rtxn); + let results = search + .filter(Filter::from_str("_vectors.doggo = 6789").unwrap().unwrap()) + .execute() + .unwrap(); + assert!(results.candidates.is_empty()); + } } diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index be9b6b74e..68c31fabb 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -19,6 +19,7 @@ use crate::order_by_map::OrderByMap; use crate::proximity::ProximityPrecision; use crate::update::index_documents::IndexDocumentsMethod; use 
crate::update::{IndexDocuments, UpdateIndexingStep}; +use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME; use crate::vector::settings::{check_set, check_unset, EmbedderSource, EmbeddingSettings}; use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs}; use crate::{FieldId, FieldsIdsMap, Index, Result}; @@ -490,6 +491,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { self.index.put_all_searchable_fields_from_fields_ids_map( self.wtxn, &names, + &fields_ids_map.nested_ids(RESERVED_VECTORS_FIELD_NAME), &fields_ids_map, )?; self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; @@ -1252,6 +1254,8 @@ pub(crate) struct InnerIndexSettings { pub embedding_configs: EmbeddingConfigs, pub existing_fields: HashSet, pub geo_fields_ids: Option<(FieldId, FieldId)>, + pub non_searchable_fields_ids: Vec, + pub non_faceted_fields_ids: Vec, } impl InnerIndexSettings { @@ -1265,8 +1269,8 @@ impl InnerIndexSettings { let user_defined_searchable_fields = user_defined_searchable_fields.map(|sf| sf.into_iter().map(String::from).collect()); let user_defined_faceted_fields = index.user_defined_faceted_fields(rtxn)?; - let searchable_fields_ids = index.searchable_fields_ids(rtxn)?; - let faceted_fields_ids = index.faceted_fields_ids(rtxn)?; + let mut searchable_fields_ids = index.searchable_fields_ids(rtxn)?; + let mut faceted_fields_ids = index.faceted_fields_ids(rtxn)?; let exact_attributes = index.exact_attributes_ids(rtxn)?; let proximity_precision = index.proximity_precision(rtxn)?.unwrap_or_default(); let embedding_configs = embedders(index.embedding_configs(rtxn)?)?; @@ -1294,6 +1298,10 @@ impl InnerIndexSettings { None => None, }; + let vectors_fids = fields_ids_map.nested_ids(RESERVED_VECTORS_FIELD_NAME); + searchable_fields_ids.retain(|id| !vectors_fids.contains(id)); + faceted_fields_ids.retain(|id| !vectors_fids.contains(id)); + Ok(Self { stop_words, allowed_separators, @@ -1308,6 +1316,8 @@ impl InnerIndexSettings { embedding_configs, existing_fields, 
geo_fields_ids, + non_searchable_fields_ids: vectors_fids.clone(), + non_faceted_fields_ids: vectors_fids.clone(), }) } @@ -1315,9 +1325,10 @@ impl InnerIndexSettings { pub fn recompute_facets(&mut self, wtxn: &mut heed::RwTxn, index: &Index) -> Result<()> { let new_facets = self .fields_ids_map - .names() - .filter(|&field| crate::is_faceted(field, &self.user_defined_faceted_fields)) - .map(|field| field.to_string()) + .iter() + .filter(|(fid, _field)| !self.non_faceted_fields_ids.contains(fid)) + .filter(|(_fid, field)| crate::is_faceted(field, &self.user_defined_faceted_fields)) + .map(|(_fid, field)| field.to_string()) .collect(); index.put_faceted_fields(wtxn, &new_facets)?; @@ -1337,6 +1348,7 @@ impl InnerIndexSettings { index.put_all_searchable_fields_from_fields_ids_map( wtxn, &searchable_fields, + &self.non_searchable_fields_ids, &self.fields_ids_map, )?; } From 84e498299bcc492ab91ecfefb989fdbd8ef897d8 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 22 May 2024 15:27:09 +0200 Subject: [PATCH 03/44] Remove the vectors from the documents database --- Cargo.lock | 2 + index-scheduler/Cargo.toml | 2 + index-scheduler/src/lib.rs | 287 +++++++++++++++++- .../documents after initial push.snap | 4 + .../documents after setting an embedder.snap | 4 + meilisearch-types/src/settings.rs | 2 +- milli/Cargo.toml | 2 +- milli/src/index.rs | 18 +- .../extract/extract_vector_points.rs | 46 ++- .../src/update/index_documents/extract/mod.rs | 4 + milli/src/update/index_documents/mod.rs | 9 +- .../src/update/index_documents/typed_chunk.rs | 20 +- milli/src/update/settings.rs | 40 ++- milli/src/vector/parsed_vectors.rs | 18 +- 14 files changed, 407 insertions(+), 51 deletions(-) create mode 100644 index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after initial push.snap create mode 100644 index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after setting an embedder.snap diff --git a/Cargo.lock b/Cargo.lock 
index b62a61f92..3b28a00e3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2455,6 +2455,7 @@ name = "index-scheduler" version = "1.9.0" dependencies = [ "anyhow", + "arroy", "big_s", "bincode", "crossbeam", @@ -2465,6 +2466,7 @@ dependencies = [ "file-store", "flate2", "insta", + "maplit", "meili-snap", "meilisearch-auth", "meilisearch-types", diff --git a/index-scheduler/Cargo.toml b/index-scheduler/Cargo.toml index 21fa34733..8959bb070 100644 --- a/index-scheduler/Cargo.toml +++ b/index-scheduler/Cargo.toml @@ -40,7 +40,9 @@ ureq = "2.9.7" uuid = { version = "1.6.1", features = ["serde", "v4"] } [dev-dependencies] +arroy = "0.3.1" big_s = "1.0.2" crossbeam = "0.8.4" insta = { version = "1.34.0", features = ["json", "redactions"] } +maplit = "1.0.2" meili-snap = { path = "../meili-snap" } diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 8a1c2f540..ebeac30b3 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -1459,11 +1459,11 @@ impl IndexScheduler { // TODO: consider using a type alias or a struct embedder/template pub fn embedders( &self, - embedding_configs: Vec<(String, milli::vector::EmbeddingConfig)>, + embedding_configs: Vec<(String, milli::vector::EmbeddingConfig, RoaringBitmap)>, ) -> Result { let res: Result<_> = embedding_configs .into_iter() - .map(|(name, milli::vector::EmbeddingConfig { embedder_options, prompt })| { + .map(|(name, milli::vector::EmbeddingConfig { embedder_options, prompt }, _)| { let prompt = Arc::new(prompt.try_into().map_err(meilisearch_types::milli::Error::from)?); // optimistically return existing embedder @@ -1748,6 +1748,9 @@ mod tests { use meilisearch_types::milli::update::IndexDocumentsMethod::{ ReplaceDocuments, UpdateDocuments, }; + use meilisearch_types::milli::update::Setting; + use meilisearch_types::milli::vector::settings::EmbeddingSettings; + use meilisearch_types::settings::{Checked, Unchecked}; use meilisearch_types::tasks::IndexSwap; use 
meilisearch_types::VERSION_FILE_NAME; use tempfile::{NamedTempFile, TempDir}; @@ -3052,7 +3055,9 @@ mod tests { let rtxn = index.read_txn().unwrap(); let configs = index.embedding_configs(&rtxn).unwrap(); - let (_, embedding_config) = configs.first().unwrap(); + let (name, embedding_config, user_provided) = configs.first().unwrap(); + insta::assert_snapshot!(name, @"default"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); insta::assert_json_snapshot!(embedding_config.embedder_options); } @@ -5017,13 +5022,15 @@ mod tests { let configs = index.embedding_configs(&rtxn).unwrap(); // for consistency with the below #[allow(clippy::get_first)] - let (name, fakerest_config) = configs.get(0).unwrap(); - insta::assert_json_snapshot!(name, @r###""A_fakerest""###); + let (name, fakerest_config, user_provided) = configs.get(0).unwrap(); + insta::assert_snapshot!(name, @"A_fakerest"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); insta::assert_json_snapshot!(fakerest_config.embedder_options); let fakerest_name = name.clone(); - let (name, simple_hf_config) = configs.get(1).unwrap(); - insta::assert_json_snapshot!(name, @r###""B_small_hf""###); + let (name, simple_hf_config, user_provided) = configs.get(1).unwrap(); + insta::assert_snapshot!(name, @"B_small_hf"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); insta::assert_json_snapshot!(simple_hf_config.embedder_options); let simple_hf_name = name.clone(); @@ -5091,6 +5098,18 @@ mod tests { let index = index_scheduler.index("doggos").unwrap(); let rtxn = index.read_txn().unwrap(); + // Ensure the document have been inserted into the relevant bitamp + let configs = index.embedding_configs(&rtxn).unwrap(); + // for consistency with the below + #[allow(clippy::get_first)] + let (name, _config, user_defined) = configs.get(0).unwrap(); + insta::assert_snapshot!(name, @"A_fakerest"); + insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>"); + + let 
(name, _config, user_defined) = configs.get(1).unwrap(); + insta::assert_snapshot!(name, @"B_small_hf"); + insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>"); + let embeddings = index.embeddings(&rtxn, 0).unwrap(); assert_json_snapshot!(embeddings[&simple_hf_name][0] == lab_embed, @"true"); @@ -5153,6 +5172,18 @@ mod tests { let index = index_scheduler.index("doggos").unwrap(); let rtxn = index.read_txn().unwrap(); + // Ensure the document have been inserted into the relevant bitamp + let configs = index.embedding_configs(&rtxn).unwrap(); + // for consistency with the below + #[allow(clippy::get_first)] + let (name, _config, user_defined) = configs.get(0).unwrap(); + insta::assert_snapshot!(name, @"A_fakerest"); + insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>"); + + let (name, _config, user_defined) = configs.get(1).unwrap(); + insta::assert_snapshot!(name, @"B_small_hf"); + insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>"); + let embeddings = index.embeddings(&rtxn, 0).unwrap(); // automatically changed to patou @@ -5176,4 +5207,246 @@ mod tests { } } } + + #[test] + fn import_vectors_first_and_embedder_later() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let content = serde_json::json!( + [ + { + "id": 0, + "doggo": "kefir", + }, + { + "id": 1, + "doggo": "intel", + "_vectors": { + "my_doggo_embedder": vec![1; 384], + "unknown embedder": vec![1, 2, 3], + } + }, + { + "id": 2, + "doggo": "max", + "_vectors": { + "my_doggo_embedder": { + "userProvided": true, + "embeddings": vec![2; 384], + }, + "unknown embedder": vec![4, 5], + }, + }, + { + "id": 3, + "doggo": "marcel", + "_vectors": { + "my_doggo_embedder": { + "userProvided": false, + "embeddings": vec![3; 384], + }, + }, + }, + { + "id": 4, + "doggo": "sora", + "_vectors": { + "my_doggo_embedder": { + "userProvided": false, + }, + }, + }, + ] + ); + + let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0 as 
u128).unwrap(); + let documents_count = + read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file) + .unwrap(); + snapshot!(documents_count, @"5"); + file.persist().unwrap(); + + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: None, + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + handle.advance_one_successful_batch(); + index_scheduler.assert_internally_consistent(); + + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + snapshot!(serde_json::to_string(&documents).unwrap(), name: "documents after initial push"); + + let mut setting = meilisearch_types::settings::Settings::::default(); + setting.embedders = Setting::Set(maplit::btreemap! { + S("my_doggo_embedder") => Setting::Set(EmbeddingSettings { + source: Setting::Set(milli::vector::settings::EmbedderSource::HuggingFace), + model: Setting::Set(S("sentence-transformers/all-MiniLM-L6-v2")), + revision: Setting::Set(S("e4ce9877abf3edfe10b0d82785e83bdcb973e22e")), + document_template: Setting::Set(S("{{doc.doggo}}")), + .. 
EmbeddingSettings::default() + }) + }); + index_scheduler + .register( + KindWithContent::SettingsUpdate { + index_uid: S("doggos"), + new_settings: Box::new(setting), + is_deletion: false, + allow_index_creation: false, + }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + handle.advance_one_successful_batch(); + index_scheduler.assert_internally_consistent(); + + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + // the all the vectors linked to the new specified embedder have been removed + // Only the unknown embedders stays in the document DB + snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"unknown embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"unknown embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel"},{"id":4,"doggo":"sora"}]"###); + let conf = index.embedding_configs(&rtxn).unwrap(); + // even though we specified the vector for the ID 3, it shouldn't be marked + // as user provided since we explicitely marked it as NOT user provided. 
+ snapshot!(format!("{conf:#?}"), @r###" + [ + ( + "my_doggo_embedder", + EmbeddingConfig { + embedder_options: HuggingFace( + EmbedderOptions { + model: "sentence-transformers/all-MiniLM-L6-v2", + revision: Some( + "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + ), + distribution: None, + }, + ), + prompt: PromptData { + template: "{{doc.doggo}}", + }, + }, + RoaringBitmap<[1, 2]>, + ), + ] + "###); + let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap(); + let embeddings = index.embeddings(&rtxn, docid).unwrap(); + let embedding = &embeddings["my_doggo_embedder"]; + assert!(!embedding.is_empty(), "{embedding:?}"); + + // the document with the id 3 should keep its original embedding + let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap(); + let mut embeddings = Vec::new(); + + 'vectors: for i in 0..=u8::MAX { + let reader = arroy::Reader::open(&rtxn, 0 | (i as u16), index.vector_arroy) + .map(Some) + .or_else(|e| match e { + arroy::Error::MissingMetadata => Ok(None), + e => Err(e), + }) + .transpose(); + + let Some(reader) = reader else { + break 'vectors; + }; + + let embedding = reader.unwrap().item_vector(&rtxn, docid).unwrap(); + if let Some(embedding) = embedding { + embeddings.push(embedding) + } else { + break 'vectors; + } + } + + snapshot!(embeddings.len(), @"1"); + assert!(embeddings[0].iter().all(|i| *i == 3.0), "{:?}", embeddings[0]); + + // If we update marcel it should regenerate its embedding automatically + + let content = serde_json::json!( + [ + { + "id": 3, + "doggo": "marvel", + }, + { + "id": 4, + "doggo": "sorry", + }, + ] + ); + + let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(1 as u128).unwrap(); + let documents_count = + read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file) + .unwrap(); + snapshot!(documents_count, @"2"); + file.persist().unwrap(); + + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + 
primary_key: None, + method: UpdateDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + handle.advance_one_successful_batch(); + index_scheduler.assert_internally_consistent(); + + // the document with the id 3 should have its original embedding updated + let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap(); + let embeddings = index.embeddings(&rtxn, docid).unwrap(); + let embedding = &embeddings["my_doggo_embedder"]; + + assert!(!embedding.is_empty()); + /// TODO: it shouldn’t be equal to 3.0 + assert!(embedding[0].iter().all(|i| *i == 3.0), "{:?}", embedding[0]); + + // the document with the id 4 should generate an embedding + // let docid = index.external_documents_ids.get(&rtxn, "4").unwrap().unwrap(); + // let embeddings = index.embeddings(&rtxn, docid).unwrap(); + // dbg!(&embeddings); + // let embedding = &embeddings["my_doggo_embedder"]; + + // assert!(!embedding.is_empty()); + // assert!(embedding[0]); + } } diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after initial push.snap b/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after initial push.snap new file mode 100644 index 000000000..433a190f9 --- /dev/null +++ b/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after initial push.snap @@ -0,0 +1,4 @@ +--- +source: index-scheduler/src/lib.rs +--- 
+[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"my_doggo_embedder":[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0],"unknown 
embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"my_doggo_embedder":{"embeddings":[2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0],"userProvided":true},"unknown 
embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel","_vectors":{"my_doggo_embedder":{"embeddings":[3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0],"userProvided":false}}},{"id":4,"doggo":"sora","_vectors":{"my_doggo_embedder":{"embeddings":null,"userProvided":false}}}] diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after setting an embedder.snap b/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after setting an 
embedder.snap new file mode 100644 index 000000000..853be8b0a --- /dev/null +++ b/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after setting an embedder.snap @@ -0,0 +1,4 @@ +--- +source: index-scheduler/src/lib.rs +--- +[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"unknown embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"unknown embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel"},{"id":4,"doggo":"sora"}] diff --git a/meilisearch-types/src/settings.rs b/meilisearch-types/src/settings.rs index 223d71658..d1d82be68 100644 --- a/meilisearch-types/src/settings.rs +++ b/meilisearch-types/src/settings.rs @@ -672,7 +672,7 @@ pub fn settings( let embedders: BTreeMap<_, _> = index .embedding_configs(rtxn)? .into_iter() - .map(|(name, config)| (name, Setting::Set(config.into()))) + .map(|(name, config, _)| (name, Setting::Set(config.into()))) .collect(); let embedders = if embedders.is_empty() { Setting::NotSet } else { Setting::Set(embedders) }; diff --git a/milli/Cargo.toml b/milli/Cargo.toml index f23694d10..7fba2af1e 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -44,7 +44,7 @@ once_cell = "1.19.0" ordered-float = "4.2.0" rand_pcg = { version = "0.3.1", features = ["serde1"] } rayon = "1.8.0" -roaring = "0.10.2" +roaring = { version = "0.10.2", features = ["serde"] } rstar = { version = "0.11.0", features = ["serde"] } serde = { version = "1.0.195", features = ["derive"] } serde_json = { version = "1.0.111", features = ["preserve_order"] } diff --git a/milli/src/index.rs b/milli/src/index.rs index ef4936ed1..569a9a692 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1572,16 +1572,18 @@ impl Index { Ok(script_language) } + /// Put the embedding configs: + /// 1. The name of the embedder + /// 2. The configuration option for this embedder + /// 3. 
The list of documents with a user provided embedding pub(crate) fn put_embedding_configs( &self, wtxn: &mut RwTxn<'_>, - configs: Vec<(String, EmbeddingConfig)>, + configs: Vec<(String, EmbeddingConfig, RoaringBitmap)>, ) -> heed::Result<()> { - self.main.remap_types::>>().put( - wtxn, - main_key::EMBEDDING_CONFIGS, - &configs, - ) + self.main + .remap_types::>>() + .put(wtxn, main_key::EMBEDDING_CONFIGS, &configs) } pub(crate) fn delete_embedding_configs(&self, wtxn: &mut RwTxn<'_>) -> heed::Result { @@ -1591,10 +1593,10 @@ impl Index { pub fn embedding_configs( &self, rtxn: &RoTxn<'_>, - ) -> Result> { + ) -> Result> { Ok(self .main - .remap_types::>>() + .remap_types::>>() .get(rtxn, main_key::EMBEDDING_CONFIGS)? .unwrap_or_default()) } diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index 76ec90d65..d97d1403c 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -10,16 +10,16 @@ use bytemuck::cast_slice; use grenad::Writer; use itertools::EitherOrBoth; use ordered_float::OrderedFloat; +use roaring::RoaringBitmap; use serde_json::Value; use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; use crate::prompt::Prompt; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; -use crate::update::index_documents::helpers::try_split_at; use crate::update::settings::InnerIndexSettingsDiff; use crate::vector::parsed_vectors::{ParsedVectorsDiff, RESERVED_VECTORS_FIELD_NAME}; use crate::vector::Embedder; -use crate::{DocumentId, Result, ThreadPoolNoAbort}; +use crate::{try_split_array_at, DocumentId, Result, ThreadPoolNoAbort}; /// The length of the elements that are always in the buffer when inserting new values. 
const TRUNCATE_SIZE: usize = size_of::(); @@ -35,6 +35,8 @@ pub struct ExtractedVectorPoints { // embedder pub embedder_name: String, pub embedder: Arc, + pub user_defined: RoaringBitmap, + pub remove_from_user_defined: RoaringBitmap, } enum VectorStateDelta { @@ -80,6 +82,11 @@ struct EmbedderVectorExtractor { prompts_writer: Writer>, // (docid) -> () remove_vectors_writer: Writer>, + + // The docids of the documents that contains a user defined embedding + user_defined: RoaringBitmap, + // The docids of the documents that contains an auto-generated embedding + remove_from_user_defined: RoaringBitmap, } /// Extracts the embedding vector contained in each document under the `_vectors` field. @@ -134,6 +141,8 @@ pub fn extract_vector_points( manual_vectors_writer, prompts_writer, remove_vectors_writer, + user_defined: RoaringBitmap::new(), + remove_from_user_defined: RoaringBitmap::new(), }); } @@ -141,13 +150,15 @@ pub fn extract_vector_points( let mut cursor = obkv_documents.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? 
{ // this must always be serialized as (docid, external_docid); + const SIZE_OF_DOCUMENTID: usize = std::mem::size_of::(); let (docid_bytes, external_id_bytes) = - try_split_at(key, std::mem::size_of::()).unwrap(); + try_split_array_at::(key).unwrap(); debug_assert!(from_utf8(external_id_bytes).is_ok()); + let docid = DocumentId::from_be_bytes(docid_bytes); let obkv = obkv::KvReader::new(value); key_buffer.clear(); - key_buffer.extend_from_slice(docid_bytes); + key_buffer.extend_from_slice(docid_bytes.as_slice()); // since we only need the primary key when we throw an error we create this getter to // lazily get it when needed @@ -163,10 +174,22 @@ pub fn extract_vector_points( manual_vectors_writer, prompts_writer, remove_vectors_writer, + user_defined, + remove_from_user_defined, } in extractors.iter_mut() { let delta = match parsed_vectors.remove(embedder_name) { (Some(old), Some(new)) => { + match (old.is_user_provided(), new.is_user_provided()) { + (true, true) | (false, false) => (), + (true, false) => { + remove_from_user_defined.insert(docid); + } + (false, true) => { + user_defined.insert(docid); + } + } + // no autogeneration let del_vectors = old.into_array_of_vectors(); let add_vectors = new.into_array_of_vectors(); @@ -187,6 +210,7 @@ pub fn extract_vector_points( .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) .any(|deladd| deladd.get(DelAdd::Addition).is_some()); if document_is_kept { + remove_from_user_defined.insert(docid); // becomes autogenerated VectorStateDelta::NowGenerated(prompt.render( obkv, @@ -198,6 +222,11 @@ pub fn extract_vector_points( } } (None, Some(new)) => { + if new.is_user_provided() { + user_defined.insert(docid); + } else { + remove_from_user_defined.insert(docid); + } // was possibly autogenerated, remove all vectors for that document let add_vectors = new.into_array_of_vectors(); if add_vectors.len() > usize::from(u8::MAX) { @@ -239,6 +268,7 @@ pub fn extract_vector_points( VectorStateDelta::NoChange } } else { + 
remove_from_user_defined.remove(docid); VectorStateDelta::NowRemoved } } @@ -265,18 +295,18 @@ pub fn extract_vector_points( manual_vectors_writer, prompts_writer, remove_vectors_writer, + user_defined, + remove_from_user_defined, } in extractors { results.push(ExtractedVectorPoints { - // docid, _index -> KvWriterDelAdd -> Vector manual_vectors: writer_into_reader(manual_vectors_writer)?, - // docid -> () remove_vectors: writer_into_reader(remove_vectors_writer)?, - // docid -> prompt prompts: writer_into_reader(prompts_writer)?, - embedder, embedder_name, + user_defined, + remove_from_user_defined, }) } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 18340a3ae..80214e7c8 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -238,6 +238,8 @@ fn send_original_documents_data( prompts, embedder_name, embedder, + user_defined, + remove_from_user_defined: auto_generated, } in extracted_vectors { let embeddings = match extract_embeddings( @@ -262,6 +264,8 @@ fn send_original_documents_data( expected_dimension: embedder.dimensions(), manual_vectors, embedder_name, + user_defined, + remove_from_user_defined: auto_generated, })); } } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 2420463b4..a03e4333e 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -501,6 +501,8 @@ where embeddings, manual_vectors, embedder_name, + user_defined, + remove_from_user_defined, } => { dimension.insert(embedder_name.clone(), expected_dimension); TypedChunk::VectorPoints { @@ -509,6 +511,8 @@ where expected_dimension, manual_vectors, embedder_name, + user_defined, + remove_from_user_defined, } } otherwise => otherwise, @@ -2616,10 +2620,11 @@ mod tests { let rtxn = index.read_txn().unwrap(); let mut embedding_configs = index.embedding_configs(&rtxn).unwrap(); - 
let (embedder_name, embedder) = embedding_configs.pop().unwrap(); + let (embedder_name, embedder, user_defined) = embedding_configs.pop().unwrap(); + insta::assert_snapshot!(embedder_name, @"manual"); + insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0, 1, 2]>"); let embedder = std::sync::Arc::new(crate::vector::Embedder::new(embedder.embedder_options).unwrap()); - assert_eq!("manual", embedder_name); let res = index .search(&rtxn) .semantic(embedder_name, embedder, Some([0.0, 1.0, 2.0].to_vec())) diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 2fbe91685..2c4e17858 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -90,6 +90,8 @@ pub(crate) enum TypedChunk { expected_dimension: usize, manual_vectors: grenad::Reader>, embedder_name: String, + user_defined: RoaringBitmap, + remove_from_user_defined: RoaringBitmap, }, ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>), } @@ -155,7 +157,7 @@ pub(crate) fn write_typed_chunk_into_index( let mut iter = merger.into_stream_merger_iter()?; let embedders: BTreeSet<_> = - index.embedding_configs(wtxn)?.into_iter().map(|(k, _v)| k).collect(); + index.embedding_configs(wtxn)?.into_iter().map(|(name, _, _)| name).collect(); let mut vectors_buffer = Vec::new(); while let Some((key, reader)) = iter.next()? 
{ let mut writer: KvWriter<_, FieldId> = KvWriter::memory(); @@ -181,7 +183,7 @@ pub(crate) fn write_typed_chunk_into_index( // if the `_vectors` field cannot be parsed as map of vectors, just write it as-is break 'vectors Some(addition); }; - vectors.retain_user_provided_vectors(&embedders); + vectors.retain_not_embedded_vectors(&embedders); let crate::vector::parsed_vectors::ParsedVectors(vectors) = vectors; if vectors.is_empty() { // skip writing empty `_vectors` map @@ -619,6 +621,8 @@ pub(crate) fn write_typed_chunk_into_index( let mut remove_vectors_builder = MergerBuilder::new(keep_first as MergeFn); let mut manual_vectors_builder = MergerBuilder::new(keep_first as MergeFn); let mut embeddings_builder = MergerBuilder::new(keep_first as MergeFn); + let mut user_defined = RoaringBitmap::new(); + let mut remove_from_user_defined = RoaringBitmap::new(); let mut params = None; for typed_chunk in typed_chunks { let TypedChunk::VectorPoints { @@ -627,6 +631,8 @@ pub(crate) fn write_typed_chunk_into_index( embeddings, expected_dimension, embedder_name, + user_defined: ud, + remove_from_user_defined: rud, } = typed_chunk else { unreachable!(); @@ -639,11 +645,21 @@ pub(crate) fn write_typed_chunk_into_index( if let Some(embeddings) = embeddings { embeddings_builder.push(embeddings.into_cursor()?); } + user_defined |= ud; + remove_from_user_defined |= rud; } // typed chunks has always at least 1 chunk. 
let Some((expected_dimension, embedder_name)) = params else { unreachable!() }; + let mut embedding_configs = index.embedding_configs(&wtxn)?; + let (_name, _conf, ud) = + embedding_configs.iter_mut().find(|config| config.0 == embedder_name).unwrap(); + *ud -= remove_from_user_defined; + *ud |= user_defined; + + index.put_embedding_configs(wtxn, embedding_configs)?; + let embedder_index = index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or( InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None }, )?; diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 68c31fabb..64998bcc3 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -6,6 +6,7 @@ use std::sync::Arc; use charabia::{Normalize, Tokenizer, TokenizerBuilder}; use deserr::{DeserializeError, Deserr}; use itertools::{EitherOrBoth, Itertools}; +use roaring::RoaringBitmap; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use time::OffsetDateTime; @@ -926,8 +927,13 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { Setting::Set(configs) => { let mut changed = false; let old_configs = self.index.embedding_configs(self.wtxn)?; - let old_configs: BTreeMap> = - old_configs.into_iter().map(|(k, v)| (k, Setting::Set(v.into()))).collect(); + let old_configs: BTreeMap, RoaringBitmap)> = + old_configs + .into_iter() + .map(|(name, setting, user_defined)| { + (name, (Setting::Set(setting.into()), user_defined)) + }) + .collect(); let mut new_configs = BTreeMap::new(); for joined in old_configs @@ -936,15 +942,19 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { { match joined { // updated config - EitherOrBoth::Both((name, mut old), (_, new)) => { + EitherOrBoth::Both((name, (mut old, user_defined)), (_, new)) => { changed |= EmbeddingSettings::apply_and_need_reindex(&mut old, new); if changed { - tracing::debug!(embedder = name, "need reindex"); + tracing::debug!( + embedder = name, + documents = user_defined.len(), + "need reindex" + ); } 
else { tracing::debug!(embedder = name, "skip reindex"); } let new = validate_embedding_settings(old, &name)?; - new_configs.insert(name, new); + new_configs.insert(name, (new, user_defined)); } // unchanged config EitherOrBoth::Left((name, setting)) => { @@ -961,21 +971,23 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { ); let setting = validate_embedding_settings(setting, &name)?; changed = true; - new_configs.insert(name, setting); + new_configs.insert(name, (setting, RoaringBitmap::new())); } } } - let new_configs: Vec<(String, EmbeddingConfig)> = new_configs + let new_configs: Vec<(String, EmbeddingConfig, RoaringBitmap)> = new_configs .into_iter() - .filter_map(|(name, setting)| match setting { - Setting::Set(value) => Some((name, value.into())), + .filter_map(|(name, (setting, user_defined))| match setting { + Setting::Set(settings) => Some((name, settings.into(), user_defined)), Setting::Reset => None, - Setting::NotSet => Some((name, EmbeddingSettings::default().into())), + Setting::NotSet => { + Some((name, EmbeddingSettings::default().into(), user_defined)) + } }) .collect(); self.index.embedder_category_id.clear(self.wtxn)?; - for (index, (embedder_name, _)) in new_configs.iter().enumerate() { + for (index, (embedder_name, _, _)) in new_configs.iter().enumerate() { self.index.embedder_category_id.put_with_flags( self.wtxn, heed::PutFlags::APPEND, @@ -1359,10 +1371,12 @@ impl InnerIndexSettings { } } -fn embedders(embedding_configs: Vec<(String, EmbeddingConfig)>) -> Result { +fn embedders( + embedding_configs: Vec<(String, EmbeddingConfig, RoaringBitmap)>, +) -> Result { let res: Result<_> = embedding_configs .into_iter() - .map(|(name, EmbeddingConfig { embedder_options, prompt })| { + .map(|(name, EmbeddingConfig { embedder_options, prompt }, _)| { let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?); let embedder = Arc::new( diff --git a/milli/src/vector/parsed_vectors.rs b/milli/src/vector/parsed_vectors.rs index 2c61baa9e..62c418149 
100644 --- a/milli/src/vector/parsed_vectors.rs +++ b/milli/src/vector/parsed_vectors.rs @@ -17,6 +17,13 @@ pub enum Vectors { } impl Vectors { + pub fn is_user_provided(&self) -> bool { + match self { + Vectors::ImplicitlyUserProvided(_) => true, + Vectors::Explicit(ExplicitVectors { user_provided, .. }) => *user_provided, + } + } + pub fn into_array_of_vectors(self) -> Vec { match self { Vectors::ImplicitlyUserProvided(embeddings) @@ -89,15 +96,8 @@ impl ParsedVectors { Ok(ParsedVectors(value)) } - pub fn retain_user_provided_vectors(&mut self, embedders: &BTreeSet) { - self.0.retain(|k, v| match v { - Vectors::ImplicitlyUserProvided(_) => true, - Vectors::Explicit(ExplicitVectors { embeddings: _, user_provided }) => { - *user_provided - // if the embedder is not in the config, then never touch it - || !embedders.contains(k) - } - }); + pub fn retain_not_embedded_vectors(&mut self, embedders: &BTreeSet) { + self.0.retain(|k, _v| !embedders.contains(k)) } } From 30d66abf8d92e0e5da0206b29457f2ab2f972b10 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 28 May 2024 18:07:53 +0200 Subject: [PATCH 04/44] fix the test --- index-scheduler/src/lib.rs | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index ebeac30b3..29b7c861f 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -1750,7 +1750,7 @@ mod tests { }; use meilisearch_types::milli::update::Setting; use meilisearch_types::milli::vector::settings::EmbeddingSettings; - use meilisearch_types::settings::{Checked, Unchecked}; + use meilisearch_types::settings::Unchecked; use meilisearch_types::tasks::IndexSwap; use meilisearch_types::VERSION_FILE_NAME; use tempfile::{NamedTempFile, TempDir}; @@ -5432,21 +5432,29 @@ mod tests { index_scheduler.assert_internally_consistent(); // the document with the id 3 should have its original embedding updated + let rtxn = index.read_txn().unwrap(); let docid = 
index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap(); + let doc = index.documents(&rtxn, Some(docid)).unwrap()[0]; + let doc = obkv_to_json(&field_ids, &field_ids_map, doc.1).unwrap(); + snapshot!(json_string!(doc), @r###" + { + "id": 3, + "doggo": "marvel" + } + "###); + let embeddings = index.embeddings(&rtxn, docid).unwrap(); let embedding = &embeddings["my_doggo_embedder"]; assert!(!embedding.is_empty()); - /// TODO: it shouldn’t be equal to 3.0 - assert!(embedding[0].iter().all(|i| *i == 3.0), "{:?}", embedding[0]); + assert!(!embedding[0].iter().all(|i| *i == 3.0), "{:?}", embedding[0]); // the document with the id 4 should generate an embedding - // let docid = index.external_documents_ids.get(&rtxn, "4").unwrap().unwrap(); - // let embeddings = index.embeddings(&rtxn, docid).unwrap(); - // dbg!(&embeddings); - // let embedding = &embeddings["my_doggo_embedder"]; + let docid = index.external_documents_ids.get(&rtxn, "4").unwrap().unwrap(); + let embeddings = index.embeddings(&rtxn, docid).unwrap(); + dbg!(&embeddings); + let embedding = &embeddings["my_doggo_embedder"]; - // assert!(!embedding.is_empty()); - // assert!(embedding[0]); + assert!(!embedding.is_empty()); } } From 04f6523f3c90e16068c8b540853c24a2e19ea597 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 29 May 2024 17:22:58 +0200 Subject: [PATCH 05/44] expose a new parameter to retrieve the embedders at search time --- index-scheduler/src/lib.rs | 42 ++++++++++--------- meilisearch-types/src/error.rs | 2 + .../src/analytics/segment_analytics.rs | 3 ++ .../src/routes/indexes/facet_search.rs | 1 + meilisearch/src/routes/indexes/search.rs | 3 ++ meilisearch/src/routes/indexes/similar.rs | 10 ++--- meilisearch/src/search.rs | 35 +++++++++++++++- meilisearch/tests/search/hybrid.rs | 6 +-- meilisearch/tests/similar/mod.rs | 8 ++-- milli/src/vector/rest.rs | 2 + 10 files changed, 79 insertions(+), 33 deletions(-) diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 
29b7c861f..c76a207f5 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -5045,25 +5045,25 @@ mod tests { // add one doc, specifying vectors let doc = serde_json::json!( - { - "id": 0, - "doggo": "Intel", - "breed": "beagle", - "_vectors": { - &fakerest_name: { - // this will never trigger regeneration, which is good because we can't actually generate with - // this embedder - "userProvided": true, - "embeddings": beagle_embed, - }, - &simple_hf_name: { - // this will be regenerated on updates - "userProvided": false, - "embeddings": lab_embed, - }, - "noise": [0.1, 0.2, 0.3] - } - } + { + "id": 0, + "doggo": "Intel", + "breed": "beagle", + "_vectors": { + &fakerest_name: { + // this will never trigger regeneration, which is good because we can't actually generate with + // this embedder + "userProvided": true, + "embeddings": beagle_embed, + }, + &simple_hf_name: { + // this will be regenerated on updates + "userProvided": false, + "embeddings": lab_embed, + }, + "noise": [0.1, 0.2, 0.3] + } + } ); let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0u128).unwrap(); @@ -5163,7 +5163,9 @@ mod tests { snapshot!(snapshot_index_scheduler(&index_scheduler), name: "Intel to kefir"); - handle.advance_one_successful_batch(); + println!("HEEEEERE"); + // handle.advance_one_successful_batch(); + handle.advance_one_failed_batch(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "Intel to kefir succeeds"); { diff --git a/meilisearch-types/src/error.rs b/meilisearch-types/src/error.rs index 150c56b9d..63543fb1b 100644 --- a/meilisearch-types/src/error.rs +++ b/meilisearch-types/src/error.rs @@ -240,9 +240,11 @@ InvalidSearchAttributesToSearchOn , InvalidRequest , BAD_REQUEST ; InvalidSearchAttributesToCrop , InvalidRequest , BAD_REQUEST ; InvalidSearchAttributesToHighlight , InvalidRequest , BAD_REQUEST ; InvalidSimilarAttributesToRetrieve , InvalidRequest , BAD_REQUEST ; +InvalidSimilarRetrieveVectors , InvalidRequest , 
BAD_REQUEST ; InvalidSearchAttributesToRetrieve , InvalidRequest , BAD_REQUEST ; InvalidSearchRankingScoreThreshold , InvalidRequest , BAD_REQUEST ; InvalidSimilarRankingScoreThreshold , InvalidRequest , BAD_REQUEST ; +InvalidSearchRetrieveVectors , InvalidRequest , BAD_REQUEST ; InvalidSearchCropLength , InvalidRequest , BAD_REQUEST ; InvalidSearchCropMarker , InvalidRequest , BAD_REQUEST ; InvalidSearchFacets , InvalidRequest , BAD_REQUEST ; diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index aed29e612..3eb74c7d1 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -662,6 +662,7 @@ impl SearchAggregator { page, hits_per_page, attributes_to_retrieve: _, + retrieve_vectors: _, attributes_to_crop: _, crop_length, attributes_to_highlight: _, @@ -1079,6 +1080,7 @@ impl MultiSearchAggregator { page: _, hits_per_page: _, attributes_to_retrieve: _, + retrieve_vectors: _, attributes_to_crop: _, crop_length: _, attributes_to_highlight: _, @@ -1646,6 +1648,7 @@ impl SimilarAggregator { offset, limit, attributes_to_retrieve: _, + retrieve_vectors: _, show_ranking_score, show_ranking_score_details, filter, diff --git a/meilisearch/src/routes/indexes/facet_search.rs b/meilisearch/src/routes/indexes/facet_search.rs index 10b371f2d..2e9cf6e1b 100644 --- a/meilisearch/src/routes/indexes/facet_search.rs +++ b/meilisearch/src/routes/indexes/facet_search.rs @@ -115,6 +115,7 @@ impl From for SearchQuery { page: None, hits_per_page: None, attributes_to_retrieve: None, + retrieve_vectors: false, attributes_to_crop: None, crop_length: DEFAULT_CROP_LENGTH(), attributes_to_highlight: None, diff --git a/meilisearch/src/routes/indexes/search.rs b/meilisearch/src/routes/indexes/search.rs index 348d8295c..91c8c8178 100644 --- a/meilisearch/src/routes/indexes/search.rs +++ b/meilisearch/src/routes/indexes/search.rs @@ -51,6 +51,8 @@ pub struct SearchQueryGet { 
hits_per_page: Option>, #[deserr(default, error = DeserrQueryParamError)] attributes_to_retrieve: Option>, + #[deserr(default, error = DeserrQueryParamError)] + retrieve_vectors: bool, #[deserr(default, error = DeserrQueryParamError)] attributes_to_crop: Option>, #[deserr(default = Param(DEFAULT_CROP_LENGTH()), error = DeserrQueryParamError)] @@ -153,6 +155,7 @@ impl From for SearchQuery { page: other.page.as_deref().copied(), hits_per_page: other.hits_per_page.as_deref().copied(), attributes_to_retrieve: other.attributes_to_retrieve.map(|o| o.into_iter().collect()), + retrieve_vectors: other.retrieve_vectors, attributes_to_crop: other.attributes_to_crop.map(|o| o.into_iter().collect()), crop_length: other.crop_length.0, attributes_to_highlight: other.attributes_to_highlight.map(|o| o.into_iter().collect()), diff --git a/meilisearch/src/routes/indexes/similar.rs b/meilisearch/src/routes/indexes/similar.rs index 518fedab7..54ea912ec 100644 --- a/meilisearch/src/routes/indexes/similar.rs +++ b/meilisearch/src/routes/indexes/similar.rs @@ -4,11 +4,7 @@ use deserr::actix_web::{AwebJson, AwebQueryParameter}; use index_scheduler::IndexScheduler; use meilisearch_types::deserr::query_params::Param; use meilisearch_types::deserr::{DeserrJsonError, DeserrQueryParamError}; -use meilisearch_types::error::deserr_codes::{ - InvalidEmbedder, InvalidSimilarAttributesToRetrieve, InvalidSimilarFilter, InvalidSimilarId, - InvalidSimilarLimit, InvalidSimilarOffset, InvalidSimilarRankingScoreThreshold, - InvalidSimilarShowRankingScore, InvalidSimilarShowRankingScoreDetails, -}; +use meilisearch_types::error::deserr_codes::*; use meilisearch_types::error::{ErrorCode as _, ResponseError}; use meilisearch_types::index_uid::IndexUid; use meilisearch_types::keys::actions; @@ -122,6 +118,8 @@ pub struct SimilarQueryGet { limit: Param, #[deserr(default, error = DeserrQueryParamError)] attributes_to_retrieve: Option>, + #[deserr(default, error = DeserrQueryParamError)] + retrieve_vectors: 
Param, #[deserr(default, error = DeserrQueryParamError)] filter: Option, #[deserr(default, error = DeserrQueryParamError)] @@ -156,6 +154,7 @@ impl TryFrom for SimilarQuery { offset, limit, attributes_to_retrieve, + retrieve_vectors, filter, show_ranking_score, show_ranking_score_details, @@ -180,6 +179,7 @@ impl TryFrom for SimilarQuery { filter, embedder, attributes_to_retrieve: attributes_to_retrieve.map(|o| o.into_iter().collect()), + retrieve_vectors: retrieve_vectors.0, show_ranking_score: show_ranking_score.0, show_ranking_score_details: show_ranking_score_details.0, ranking_score_threshold: ranking_score_threshold.map(|x| x.0), diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index 05b3c1aff..1ab42a79f 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -59,6 +59,8 @@ pub struct SearchQuery { pub hits_per_page: Option, #[deserr(default, error = DeserrJsonError)] pub attributes_to_retrieve: Option>, + #[deserr(default, error = DeserrJsonError)] + pub retrieve_vectors: bool, #[deserr(default, error = DeserrJsonError)] pub attributes_to_crop: Option>, #[deserr(default, error = DeserrJsonError, default = DEFAULT_CROP_LENGTH())] @@ -141,6 +143,7 @@ impl fmt::Debug for SearchQuery { page, hits_per_page, attributes_to_retrieve, + retrieve_vectors, attributes_to_crop, crop_length, attributes_to_highlight, @@ -173,6 +176,9 @@ impl fmt::Debug for SearchQuery { if let Some(q) = q { debug.field("q", &q); } + if *retrieve_vectors { + debug.field("retrieve_vectors", &retrieve_vectors); + } if let Some(v) = vector { if v.len() < 10 { debug.field("vector", &v); @@ -370,6 +376,8 @@ pub struct SearchQueryWithIndex { pub hits_per_page: Option, #[deserr(default, error = DeserrJsonError)] pub attributes_to_retrieve: Option>, + #[deserr(default, error = DeserrJsonError)] + pub retrieve_vectors: bool, #[deserr(default, error = DeserrJsonError)] pub attributes_to_crop: Option>, #[deserr(default, error = DeserrJsonError, default = 
DEFAULT_CROP_LENGTH())] @@ -413,6 +421,7 @@ impl SearchQueryWithIndex { page, hits_per_page, attributes_to_retrieve, + retrieve_vectors, attributes_to_crop, crop_length, attributes_to_highlight, @@ -440,6 +449,7 @@ impl SearchQueryWithIndex { page, hits_per_page, attributes_to_retrieve, + retrieve_vectors, attributes_to_crop, crop_length, attributes_to_highlight, @@ -478,6 +488,8 @@ pub struct SimilarQuery { pub embedder: Option, #[deserr(default, error = DeserrJsonError)] pub attributes_to_retrieve: Option>, + #[deserr(default, error = DeserrJsonError)] + pub retrieve_vectors: bool, #[deserr(default, error = DeserrJsonError, default)] pub show_ranking_score: bool, #[deserr(default, error = DeserrJsonError, default)] @@ -847,6 +859,7 @@ pub fn perform_search( page, hits_per_page, attributes_to_retrieve, + retrieve_vectors, attributes_to_crop, crop_length, attributes_to_highlight, @@ -870,6 +883,7 @@ pub fn perform_search( let format = AttributesFormat { attributes_to_retrieve, + retrieve_vectors, attributes_to_highlight, attributes_to_crop, crop_length, @@ -953,6 +967,7 @@ pub fn perform_search( struct AttributesFormat { attributes_to_retrieve: Option>, + retrieve_vectors: bool, attributes_to_highlight: Option>, attributes_to_crop: Option>, crop_length: usize, @@ -1000,6 +1015,9 @@ fn make_hits( .intersection(&displayed_ids) .cloned() .collect(); + let is_vectors_displayed = + fields_ids_map.id("_vectors").is_some_and(|fid| displayed_ids.contains(&fid)); + let retrieve_vectors = format.retrieve_vectors && is_vectors_displayed; let attr_to_highlight = format.attributes_to_highlight.unwrap_or_default(); let attr_to_crop = format.attributes_to_crop.unwrap_or_default(); let formatted_options = compute_formatted_options( @@ -1034,7 +1052,7 @@ fn make_hits( formatter_builder.highlight_suffix(format.highlight_post_tag); let mut documents = Vec::new(); let documents_iter = index.documents(rtxn, documents_ids)?; - for ((_id, obkv), score) in 
documents_iter.into_iter().zip(document_scores.into_iter()) { + for ((id, obkv), score) in documents_iter.into_iter().zip(document_scores.into_iter()) { // First generate a document with all the displayed fields let displayed_document = make_document(&displayed_ids, &fields_ids_map, obkv)?; @@ -1045,6 +1063,19 @@ fn make_hits( let mut document = permissive_json_pointer::select_values(&displayed_document, attributes_to_retrieve); + if retrieve_vectors { + let mut vectors = serde_json::Map::new(); + for (name, mut vector) in index.embeddings(&rtxn, id)? { + if vector.len() == 1 { + let vector = vector.pop().unwrap(); + vectors.insert(name.into(), vector.into()); + } else { + vectors.insert(name.into(), vector.into()); + } + } + document.insert("_vectors".into(), vectors.into()); + } + let (matches_position, formatted) = format_fields( &displayed_document, &fields_ids_map, @@ -1125,6 +1156,7 @@ pub fn perform_similar( filter: _, embedder: _, attributes_to_retrieve, + retrieve_vectors, show_ranking_score, show_ranking_score_details, ranking_score_threshold, @@ -1171,6 +1203,7 @@ pub fn perform_similar( let format = AttributesFormat { attributes_to_retrieve, + retrieve_vectors, attributes_to_highlight: None, attributes_to_crop: None, crop_length: DEFAULT_CROP_LENGTH(), diff --git a/meilisearch/tests/search/hybrid.rs b/meilisearch/tests/search/hybrid.rs index 9c50df6e1..0c8b4534c 100644 --- a/meilisearch/tests/search/hybrid.rs +++ b/meilisearch/tests/search/hybrid.rs @@ -124,7 +124,7 @@ async fn simple_search() { let (response, code) = index .search_post( - json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.2}}), + json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.2}, "retrieveVectors": true}), ) .await; snapshot!(code, @"200 OK"); @@ -133,7 +133,7 @@ async fn simple_search() { let (response, code) = index .search_post( - json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.5}, "showRankingScore": 
true}), + json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.5}, "showRankingScore": true, "retrieveVectors": true}), ) .await; snapshot!(code, @"200 OK"); @@ -142,7 +142,7 @@ async fn simple_search() { let (response, code) = index .search_post( - json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.8}, "showRankingScore": true}), + json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.8}, "showRankingScore": true, "retrieveVectors": true}), ) .await; snapshot!(code, @"200 OK"); diff --git a/meilisearch/tests/similar/mod.rs b/meilisearch/tests/similar/mod.rs index bde23b67f..a2378eb58 100644 --- a/meilisearch/tests/similar/mod.rs +++ b/meilisearch/tests/similar/mod.rs @@ -557,7 +557,7 @@ async fn limit_and_offset() { index.wait_task(value.uid()).await; index - .similar(json!({"id": 143, "limit": 1}), |response, code| { + .similar(json!({"id": 143, "limit": 1, "retrieveVectors": true}), |response, code| { snapshot!(code, @"200 OK"); snapshot!(json_string!(response["hits"]), @r###" [ @@ -567,9 +567,9 @@ async fn limit_and_offset() { "id": "522681", "_vectors": { "manual": [ - 0.1, - 0.6, - 0.8 + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 ] } } diff --git a/milli/src/vector/rest.rs b/milli/src/vector/rest.rs index 60f54782e..e7fc509b3 100644 --- a/milli/src/vector/rest.rs +++ b/milli/src/vector/rest.rs @@ -163,6 +163,7 @@ impl Embedder { text_chunks: Vec>, threads: &ThreadPoolNoAbort, ) -> Result>>, EmbedError> { + dbg!(&text_chunks); threads .install(move || { text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect() @@ -230,6 +231,7 @@ where input_value } [input] => { + dbg!(&options); let mut body = options.query.clone(); body.as_object_mut() From 9eb6f522ea62e6dd06cedd8ee553b0d7101e1d1a Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 30 May 2024 11:50:30 +0200 Subject: [PATCH 06/44] wraps the index embedding config in a struct --- Cargo.lock | 4 +- 
index-scheduler/src/lib.rs | 77 +++++++++++-------- meilisearch-types/src/settings.rs | 3 +- milli/src/index.rs | 25 +++--- milli/src/update/index_documents/mod.rs | 4 +- .../src/update/index_documents/typed_chunk.rs | 18 +++-- milli/src/update/settings.rs | 56 ++++++++------ 7 files changed, 112 insertions(+), 75 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3b28a00e3..b00e94072 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5310,9 +5310,9 @@ dependencies = [ [[package]] name = "tracing-actix-web" -version = "0.7.10" +version = "0.7.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa069bd1503dd526ee793bb3fce408895136c95fc86d2edb2acf1c646d7f0684" +checksum = "4ee9e39a66d9b615644893ffc1704d2a89b5b315b7fd0228ad3182ca9a306b19" dependencies = [ "actix-web", "mutually_exclusive_features", diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index c76a207f5..d007acd2c 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -53,6 +53,7 @@ use meilisearch_types::heed::byteorder::BE; use meilisearch_types::heed::types::{SerdeBincode, SerdeJson, Str, I128}; use meilisearch_types::heed::{self, Database, Env, PutFlags, RoTxn, RwTxn}; use meilisearch_types::milli::documents::DocumentsBatchBuilder; +use meilisearch_types::milli::index::IndexEmbeddingConfig; use meilisearch_types::milli::update::IndexerConfig; use meilisearch_types::milli::vector::{Embedder, EmbedderOptions, EmbeddingConfigs}; use meilisearch_types::milli::{self, CboRoaringBitmapCodec, Index, RoaringBitmapCodec, BEU32}; @@ -1459,33 +1460,39 @@ impl IndexScheduler { // TODO: consider using a type alias or a struct embedder/template pub fn embedders( &self, - embedding_configs: Vec<(String, milli::vector::EmbeddingConfig, RoaringBitmap)>, + embedding_configs: Vec, ) -> Result { let res: Result<_> = embedding_configs .into_iter() - .map(|(name, milli::vector::EmbeddingConfig { embedder_options, prompt }, _)| { - let prompt = - 
Arc::new(prompt.try_into().map_err(meilisearch_types::milli::Error::from)?); - // optimistically return existing embedder - { - let embedders = self.embedders.read().unwrap(); - if let Some(embedder) = embedders.get(&embedder_options) { - return Ok((name, (embedder.clone(), prompt))); + .map( + |IndexEmbeddingConfig { + name, + config: milli::vector::EmbeddingConfig { embedder_options, prompt }, + .. + }| { + let prompt = + Arc::new(prompt.try_into().map_err(meilisearch_types::milli::Error::from)?); + // optimistically return existing embedder + { + let embedders = self.embedders.read().unwrap(); + if let Some(embedder) = embedders.get(&embedder_options) { + return Ok((name, (embedder.clone(), prompt))); + } } - } - // add missing embedder - let embedder = Arc::new( - Embedder::new(embedder_options.clone()) - .map_err(meilisearch_types::milli::vector::Error::from) - .map_err(meilisearch_types::milli::Error::from)?, - ); - { - let mut embedders = self.embedders.write().unwrap(); - embedders.insert(embedder_options, embedder.clone()); - } - Ok((name, (embedder, prompt))) - }) + // add missing embedder + let embedder = Arc::new( + Embedder::new(embedder_options.clone()) + .map_err(meilisearch_types::milli::vector::Error::from) + .map_err(meilisearch_types::milli::Error::from)?, + ); + { + let mut embedders = self.embedders.write().unwrap(); + embedders.insert(embedder_options, embedder.clone()); + } + Ok((name, (embedder, prompt))) + }, + ) .collect(); res.map(EmbeddingConfigs::new) } @@ -3055,10 +3062,10 @@ mod tests { let rtxn = index.read_txn().unwrap(); let configs = index.embedding_configs(&rtxn).unwrap(); - let (name, embedding_config, user_provided) = configs.first().unwrap(); + let IndexEmbeddingConfig { name, config, user_defined } = configs.first().unwrap(); insta::assert_snapshot!(name, @"default"); - insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); - insta::assert_json_snapshot!(embedding_config.embedder_options); + 
insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>"); + insta::assert_json_snapshot!(config.embedder_options); } #[test] @@ -5022,15 +5029,17 @@ mod tests { let configs = index.embedding_configs(&rtxn).unwrap(); // for consistency with the below #[allow(clippy::get_first)] - let (name, fakerest_config, user_provided) = configs.get(0).unwrap(); + let IndexEmbeddingConfig { name, config: fakerest_config, user_defined } = + configs.get(0).unwrap(); insta::assert_snapshot!(name, @"A_fakerest"); - insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>"); insta::assert_json_snapshot!(fakerest_config.embedder_options); let fakerest_name = name.clone(); - let (name, simple_hf_config, user_provided) = configs.get(1).unwrap(); + let IndexEmbeddingConfig { name, config: simple_hf_config, user_defined } = + configs.get(1).unwrap(); insta::assert_snapshot!(name, @"B_small_hf"); - insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>"); insta::assert_json_snapshot!(simple_hf_config.embedder_options); let simple_hf_name = name.clone(); @@ -5102,11 +5111,11 @@ mod tests { let configs = index.embedding_configs(&rtxn).unwrap(); // for consistency with the below #[allow(clippy::get_first)] - let (name, _config, user_defined) = configs.get(0).unwrap(); + let IndexEmbeddingConfig { name, config: _, user_defined } = configs.get(0).unwrap(); insta::assert_snapshot!(name, @"A_fakerest"); insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>"); - let (name, _config, user_defined) = configs.get(1).unwrap(); + let IndexEmbeddingConfig { name, config: _, user_defined } = configs.get(1).unwrap(); insta::assert_snapshot!(name, @"B_small_hf"); insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>"); @@ -5178,11 +5187,13 @@ mod tests { let configs = index.embedding_configs(&rtxn).unwrap(); // for 
consistency with the below #[allow(clippy::get_first)] - let (name, _config, user_defined) = configs.get(0).unwrap(); + let IndexEmbeddingConfig { name, config: _, user_defined } = + configs.get(0).unwrap(); insta::assert_snapshot!(name, @"A_fakerest"); insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>"); - let (name, _config, user_defined) = configs.get(1).unwrap(); + let IndexEmbeddingConfig { name, config: _, user_defined } = + configs.get(1).unwrap(); insta::assert_snapshot!(name, @"B_small_hf"); insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>"); diff --git a/meilisearch-types/src/settings.rs b/meilisearch-types/src/settings.rs index d1d82be68..8a9708d29 100644 --- a/meilisearch-types/src/settings.rs +++ b/meilisearch-types/src/settings.rs @@ -8,6 +8,7 @@ use std::str::FromStr; use deserr::{DeserializeError, Deserr, ErrorKind, MergeWithError, ValuePointerRef}; use fst::IntoStreamer; +use milli::index::IndexEmbeddingConfig; use milli::proximity::ProximityPrecision; use milli::update::Setting; use milli::{Criterion, CriterionError, Index, DEFAULT_VALUES_PER_FACET}; @@ -672,7 +673,7 @@ pub fn settings( let embedders: BTreeMap<_, _> = index .embedding_configs(rtxn)? .into_iter() - .map(|(name, config, _)| (name, Setting::Set(config.into()))) + .map(|IndexEmbeddingConfig { name, config, .. 
}| (name, Setting::Set(config.into()))) .collect(); let embedders = if embedders.is_empty() { Setting::NotSet } else { Setting::Set(embedders) }; diff --git a/milli/src/index.rs b/milli/src/index.rs index 569a9a692..a47c07e08 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -9,6 +9,7 @@ use heed::types::*; use heed::{CompactionOption, Database, RoTxn, RwTxn, Unspecified}; use roaring::RoaringBitmap; use rstar::RTree; +use serde::{Deserialize, Serialize}; use time::OffsetDateTime; use crate::documents::PrimaryKey; @@ -1579,24 +1580,23 @@ impl Index { pub(crate) fn put_embedding_configs( &self, wtxn: &mut RwTxn<'_>, - configs: Vec<(String, EmbeddingConfig, RoaringBitmap)>, + configs: Vec, ) -> heed::Result<()> { - self.main - .remap_types::>>() - .put(wtxn, main_key::EMBEDDING_CONFIGS, &configs) + self.main.remap_types::>>().put( + wtxn, + main_key::EMBEDDING_CONFIGS, + &configs, + ) } pub(crate) fn delete_embedding_configs(&self, wtxn: &mut RwTxn<'_>) -> heed::Result { self.main.remap_key_type::().delete(wtxn, main_key::EMBEDDING_CONFIGS) } - pub fn embedding_configs( - &self, - rtxn: &RoTxn<'_>, - ) -> Result> { + pub fn embedding_configs(&self, rtxn: &RoTxn<'_>) -> Result> { Ok(self .main - .remap_types::>>() + .remap_types::>>() .get(rtxn, main_key::EMBEDDING_CONFIGS)? 
.unwrap_or_default()) } @@ -1668,6 +1668,13 @@ impl Index { } } +#[derive(Debug, Deserialize, Serialize)] +pub struct IndexEmbeddingConfig { + pub name: String, + pub config: EmbeddingConfig, + pub user_defined: RoaringBitmap, +} + #[cfg(test)] pub(crate) mod tests { use std::collections::HashSet; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index a03e4333e..2dc93f67a 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -785,6 +785,7 @@ mod tests { use super::*; use crate::documents::documents_batch_reader_from_objects; use crate::index::tests::TempIndex; + use crate::index::IndexEmbeddingConfig; use crate::search::TermsMatchingStrategy; use crate::update::Setting; use crate::{db_snap, Filter, Search}; @@ -2620,7 +2621,8 @@ mod tests { let rtxn = index.read_txn().unwrap(); let mut embedding_configs = index.embedding_configs(&rtxn).unwrap(); - let (embedder_name, embedder, user_defined) = embedding_configs.pop().unwrap(); + let IndexEmbeddingConfig { name: embedder_name, config: embedder, user_defined } = + embedding_configs.pop().unwrap(); insta::assert_snapshot!(embedder_name, @"manual"); insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0, 1, 2]>"); let embedder = diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 2c4e17858..078010554 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -20,6 +20,7 @@ use super::MergeFn; use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind}; use crate::facet::FacetType; use crate::index::db_name::DOCUMENTS; +use crate::index::IndexEmbeddingConfig; use crate::proximity::MAX_DISTANCE; use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd}; use crate::update::facet::FacetsUpdate; @@ -156,8 +157,11 @@ pub(crate) fn write_typed_chunk_into_index( let 
mut docids = index.documents_ids(wtxn)?; let mut iter = merger.into_stream_merger_iter()?; - let embedders: BTreeSet<_> = - index.embedding_configs(wtxn)?.into_iter().map(|(name, _, _)| name).collect(); + let embedders: BTreeSet<_> = index + .embedding_configs(wtxn)? + .into_iter() + .map(|IndexEmbeddingConfig { name, .. }| name) + .collect(); let mut vectors_buffer = Vec::new(); while let Some((key, reader)) = iter.next()? { let mut writer: KvWriter<_, FieldId> = KvWriter::memory(); @@ -653,10 +657,12 @@ pub(crate) fn write_typed_chunk_into_index( let Some((expected_dimension, embedder_name)) = params else { unreachable!() }; let mut embedding_configs = index.embedding_configs(&wtxn)?; - let (_name, _conf, ud) = - embedding_configs.iter_mut().find(|config| config.0 == embedder_name).unwrap(); - *ud -= remove_from_user_defined; - *ud |= user_defined; + let index_embedder_config = embedding_configs + .iter_mut() + .find(|IndexEmbeddingConfig { name, .. }| name == &embedder_name) + .unwrap(); + index_embedder_config.user_defined -= remove_from_user_defined; + index_embedder_config.user_defined |= user_defined; index.put_embedding_configs(wtxn, embedding_configs)?; diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 64998bcc3..6b07e614e 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -15,7 +15,9 @@ use super::index_documents::{IndexDocumentsConfig, Transform}; use super::IndexerConfig; use crate::criterion::Criterion; use crate::error::UserError; -use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; +use crate::index::{ + IndexEmbeddingConfig, DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS, +}; use crate::order_by_map::OrderByMap; use crate::proximity::ProximityPrecision; use crate::update::index_documents::IndexDocumentsMethod; @@ -930,8 +932,8 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { let old_configs: BTreeMap, RoaringBitmap)> = old_configs .into_iter() - 
.map(|(name, setting, user_defined)| { - (name, (Setting::Set(setting.into()), user_defined)) + .map(|IndexEmbeddingConfig { name, config, user_defined }| { + (name, (Setting::Set(config.into()), user_defined)) }) .collect(); @@ -975,23 +977,27 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { } } } - let new_configs: Vec<(String, EmbeddingConfig, RoaringBitmap)> = new_configs + let new_configs: Vec = new_configs .into_iter() - .filter_map(|(name, (setting, user_defined))| match setting { - Setting::Set(settings) => Some((name, settings.into(), user_defined)), - Setting::Reset => None, - Setting::NotSet => { - Some((name, EmbeddingSettings::default().into(), user_defined)) + .filter_map(|(name, (config, user_defined))| match config { + Setting::Set(config) => { + Some(IndexEmbeddingConfig { name, config: config.into(), user_defined }) } + Setting::Reset => None, + Setting::NotSet => Some(IndexEmbeddingConfig { + name, + config: EmbeddingSettings::default().into(), + user_defined, + }), }) .collect(); self.index.embedder_category_id.clear(self.wtxn)?; - for (index, (embedder_name, _, _)) in new_configs.iter().enumerate() { + for (index, index_embedding_config) in new_configs.iter().enumerate() { self.index.embedder_category_id.put_with_flags( self.wtxn, heed::PutFlags::APPEND, - embedder_name, + &index_embedding_config.name, &index .try_into() .map_err(|_| UserError::TooManyEmbedders(new_configs.len()))?, @@ -1371,21 +1377,25 @@ impl InnerIndexSettings { } } -fn embedders( - embedding_configs: Vec<(String, EmbeddingConfig, RoaringBitmap)>, -) -> Result { +fn embedders(embedding_configs: Vec) -> Result { let res: Result<_> = embedding_configs .into_iter() - .map(|(name, EmbeddingConfig { embedder_options, prompt }, _)| { - let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?); + .map( + |IndexEmbeddingConfig { + name, + config: EmbeddingConfig { embedder_options, prompt }, + .. 
+ }| { + let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?); - let embedder = Arc::new( - Embedder::new(embedder_options.clone()) - .map_err(crate::vector::Error::from) - .map_err(crate::Error::from)?, - ); - Ok((name, (embedder, prompt))) - }) + let embedder = Arc::new( + Embedder::new(embedder_options.clone()) + .map_err(crate::vector::Error::from) + .map_err(crate::Error::from)?, + ); + Ok((name, (embedder, prompt))) + }, + ) .collect(); res.map(EmbeddingConfigs::new) } From a73ccc78a6e9db032d0195dd3347e56cbd8f2735 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 30 May 2024 12:16:06 +0200 Subject: [PATCH 07/44] forward the embedding config to the extractors --- .../index_documents/extract/extract_vector_points.rs | 2 ++ milli/src/update/index_documents/extract/mod.rs | 12 +++++++++++- milli/src/update/index_documents/mod.rs | 2 ++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index d97d1403c..3eb761bce 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -14,6 +14,7 @@ use roaring::RoaringBitmap; use serde_json::Value; use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; +use crate::index::IndexEmbeddingConfig; use crate::prompt::Prompt; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::settings::InnerIndexSettingsDiff; @@ -96,6 +97,7 @@ struct EmbedderVectorExtractor { pub fn extract_vector_points( obkv_documents: grenad::Reader, indexer: GrenadParameters, + embedders_configs: &[IndexEmbeddingConfig], settings_diff: &InnerIndexSettingsDiff, ) -> Result> { let reindex_vectors = settings_diff.reindex_vectors(); diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 
80214e7c8..6399b40f8 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -30,6 +30,7 @@ use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids use self::extract_word_position_docids::extract_word_position_docids; use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters}; use super::{helpers, TypedChunk}; +use crate::index::IndexEmbeddingConfig; use crate::update::settings::InnerIndexSettingsDiff; use crate::{FieldId, Result, ThreadPoolNoAbortBuilder}; @@ -43,6 +44,7 @@ pub(crate) fn data_from_obkv_documents( indexer: GrenadParameters, lmdb_writer_sx: Sender>, primary_key_id: FieldId, + embedders_configs: Arc>, settings_diff: Arc, max_positions_per_attributes: Option, ) -> Result<()> { @@ -55,6 +57,7 @@ pub(crate) fn data_from_obkv_documents( original_documents_chunk, indexer, lmdb_writer_sx.clone(), + embedders_configs.clone(), settings_diff.clone(), ) }) @@ -210,6 +213,7 @@ fn send_original_documents_data( original_documents_chunk: Result>>, indexer: GrenadParameters, lmdb_writer_sx: Sender>, + embedders_configs: Arc>, settings_diff: Arc, ) -> Result<()> { let original_documents_chunk = @@ -226,11 +230,17 @@ fn send_original_documents_data( if index_vectors { let settings_diff = settings_diff.clone(); + let embedders_configs = embedders_configs.clone(); let original_documents_chunk = original_documents_chunk.clone(); let lmdb_writer_sx = lmdb_writer_sx.clone(); rayon::spawn(move || { - match extract_vector_points(original_documents_chunk.clone(), indexer, &settings_diff) { + match extract_vector_points( + original_documents_chunk.clone(), + indexer, + &embedders_configs, + &settings_diff, + ) { Ok(extracted_vectors) => { for ExtractedVectorPoints { manual_vectors, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 2dc93f67a..907554753 100644 --- a/milli/src/update/index_documents/mod.rs +++ 
b/milli/src/update/index_documents/mod.rs @@ -286,6 +286,7 @@ where settings_diff.new.recompute_searchables(self.wtxn, self.index)?; let settings_diff = Arc::new(settings_diff); + let embedders_configs = Arc::new(self.index.embedding_configs(self.wtxn)?); let backup_pool; let pool = match self.indexer_config.thread_pool { @@ -399,6 +400,7 @@ where pool_params, lmdb_writer_sx.clone(), primary_key_id, + embedders_configs.clone(), settings_diff_cloned, max_positions_per_attributes, ) From 5d50850e12f72a07221184c7d9962f511a6dc791 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 3 Jun 2024 16:04:14 +0200 Subject: [PATCH 08/44] always push the user defined vectors in arroy --- index-scheduler/src/lib.rs | 14 +- ..._scheduler__tests__import_vectors-15.snap} | 4 - ..._scheduler__tests__import_vectors-22.snap} | 4 - ...x_scheduler__tests__import_vectors-5.snap} | 0 ...x_scheduler__tests__import_vectors-8.snap} | 0 ..._scheduler__tests__settings_update-5.snap} | 0 .../documents after setting an embedder.snap | 4 - meilisearch/tests/search/hybrid.rs | 40 +-- meilisearch/tests/search/mod.rs | 1 + meilisearch/tests/similar/mod.rs | 217 ++++++++-------- ...__attribute_fid__attribute_fid_ngrams.snap | 244 ------------------ .../1/field_distribution.snap | 7 - .../field_distribution.snap | 7 - .../extract/extract_vector_points.rs | 75 +++--- milli/src/vector/parsed_vectors.rs | 22 +- 15 files changed, 189 insertions(+), 450 deletions(-) rename index-scheduler/src/snapshots/{index_scheduler__tests__import_vectors-9.snap => index_scheduler__tests__import_vectors-15.snap} (67%) rename index-scheduler/src/snapshots/{index_scheduler__tests__import_vectors-12.snap => index_scheduler__tests__import_vectors-22.snap} (67%) rename index-scheduler/src/snapshots/{index_scheduler__tests__import_vectors-4.snap => index_scheduler__tests__import_vectors-5.snap} (100%) rename index-scheduler/src/snapshots/{index_scheduler__tests__import_vectors-6.snap => 
index_scheduler__tests__import_vectors-8.snap} (100%) rename index-scheduler/src/snapshots/{index_scheduler__tests__settings_update-3.snap => index_scheduler__tests__settings_update-5.snap} (100%) delete mode 100644 index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after setting an embedder.snap delete mode 100644 milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_fid__attribute_fid_ngrams.snap delete mode 100644 milli/src/snapshots/index.rs/initial_field_distribution/1/field_distribution.snap delete mode 100644 milli/src/snapshots/index.rs/initial_field_distribution/field_distribution.snap diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index d007acd2c..f69736297 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -5173,8 +5173,8 @@ mod tests { snapshot!(snapshot_index_scheduler(&index_scheduler), name: "Intel to kefir"); println!("HEEEEERE"); - // handle.advance_one_successful_batch(); - handle.advance_one_failed_batch(); + handle.advance_one_successful_batch(); + // handle.advance_one_failed_batch(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "Intel to kefir succeeds"); { @@ -5351,9 +5351,9 @@ mod tests { // as user provided since we explicitely marked it as NOT user provided. 
snapshot!(format!("{conf:#?}"), @r###" [ - ( - "my_doggo_embedder", - EmbeddingConfig { + IndexEmbeddingConfig { + name: "my_doggo_embedder", + config: EmbeddingConfig { embedder_options: HuggingFace( EmbedderOptions { model: "sentence-transformers/all-MiniLM-L6-v2", @@ -5367,8 +5367,8 @@ mod tests { template: "{{doc.doggo}}", }, }, - RoaringBitmap<[1, 2]>, - ), + user_defined: RoaringBitmap<[1, 2]>, + }, ] "###); let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap(); diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-9.snap b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-15.snap similarity index 67% rename from index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-9.snap rename to index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-15.snap index 002a42e59..540835dfb 100644 --- a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-9.snap +++ b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-15.snap @@ -6,10 +6,6 @@ expression: doc "doggo": "Intel", "breed": "beagle", "_vectors": { - "A_fakerest": { - "embeddings": "[vector]", - "userProvided": true - }, "noise": [ 0.1, 0.2, diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-12.snap b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-22.snap similarity index 67% rename from index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-12.snap rename to index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-22.snap index 718ea229c..bc35d84f6 100644 --- a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-12.snap +++ b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-22.snap @@ -6,10 +6,6 @@ expression: doc "doggo": "kefir", "breed": "patou", "_vectors": { - "A_fakerest": { - "embeddings": "[vector]", - "userProvided": true - }, "noise": [ 0.1, 0.2, diff --git 
a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-4.snap b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-5.snap similarity index 100% rename from index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-4.snap rename to index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-5.snap diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-6.snap b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-8.snap similarity index 100% rename from index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-6.snap rename to index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-8.snap diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-3.snap b/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-5.snap similarity index 100% rename from index-scheduler/src/snapshots/index_scheduler__tests__settings_update-3.snap rename to index-scheduler/src/snapshots/index_scheduler__tests__settings_update-5.snap diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after setting an embedder.snap b/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after setting an embedder.snap deleted file mode 100644 index 853be8b0a..000000000 --- a/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after setting an embedder.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: index-scheduler/src/lib.rs ---- -[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"unknown embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"unknown embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel"},{"id":4,"doggo":"sora"}] diff --git a/meilisearch/tests/search/hybrid.rs b/meilisearch/tests/search/hybrid.rs index 0c8b4534c..1e415bc63 100644 --- a/meilisearch/tests/search/hybrid.rs +++ b/meilisearch/tests/search/hybrid.rs 
@@ -204,7 +204,7 @@ async fn distribution_shift() { let server = Server::new().await; let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await; - let search = json!({"q": "Captain", "vector": [1.0, 1.0], "showRankingScore": true, "hybrid": {"semanticRatio": 1.0}}); + let search = json!({"q": "Captain", "vector": [1.0, 1.0], "showRankingScore": true, "hybrid": {"semanticRatio": 1.0}, "retrieveVectors": true}); let (response, code) = index.search_post(search.clone()).await; snapshot!(code, @"200 OK"); snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###); @@ -239,20 +239,23 @@ async fn highlighter() { let (response, code) = index .search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.2}, - "attributesToHighlight": [ - "desc" + "retrieveVectors": true, + "attributesToHighlight": [ + "desc", + "_vectors", ], - "highlightPreTag": "**BEGIN**", - "highlightPostTag": "**END**" + "highlightPreTag": "**BEGIN**", + "highlightPostTag": "**END**", })) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}}},{"title":"Captain Planet","desc":"He's not part of 
the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}}}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"}},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"}}]"###); snapshot!(response["semanticHitCount"], @"0"); let (response, code) = index .search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.8}, + "retrieveVectors": true, "showRankingScore": true, "attributesToHighlight": [ "desc" @@ -262,13 +265,14 @@ async fn highlighter() { })) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a 
**BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"3"); // no highlighting on full semantic let (response, code) = index .search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 1.0}, + "retrieveVectors": true, "showRankingScore": true, "attributesToHighlight": [ "desc" @@ -278,7 +282,7 @@ async fn highlighter() { })) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel 
ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"3"); } @@ -361,7 +365,7 @@ async fn single_document() { let (response, code) = index .search_post( - json!({"vector": [1.0, 3.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true}), + json!({"vector": [1.0, 3.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true}), ) .await; @@ -377,7 +381,7 @@ async fn query_combination() { // search without query and vector, but with hybrid => still placeholder let (response, code) = index - .search_post(json!({"hybrid": {"semanticRatio": 1.0}, "showRankingScore": true})) + .search_post(json!({"hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true})) .await; snapshot!(code, @"200 OK"); @@ -386,7 +390,7 @@ async fn query_combination() { // same with a different semantic ratio let (response, code) = index - .search_post(json!({"hybrid": {"semanticRatio": 0.76}, "showRankingScore": true})) + .search_post(json!({"hybrid": {"semanticRatio": 0.76}, "showRankingScore": true, "retrieveVectors": true})) .await; 
snapshot!(code, @"200 OK"); @@ -395,7 +399,7 @@ async fn query_combination() { // wrong vector dimensions let (response, code) = index - .search_post(json!({"vector": [1.0, 0.0, 1.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true})) + .search_post(json!({"vector": [1.0, 0.0, 1.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true})) .await; snapshot!(code, @"400 Bad Request"); @@ -410,7 +414,7 @@ async fn query_combination() { // full vector let (response, code) = index - .search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true})) + .search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true})) .await; snapshot!(code, @"200 OK"); @@ -419,7 +423,7 @@ async fn query_combination() { // full keyword, without a query let (response, code) = index - .search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true})) + .search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true, "retrieveVectors": true})) .await; snapshot!(code, @"200 OK"); @@ -428,7 +432,7 @@ async fn query_combination() { // query + vector, full keyword => keyword let (response, code) = index - .search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true})) + .search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true, "retrieveVectors": true})) .await; snapshot!(code, @"200 OK"); @@ -437,7 +441,7 @@ async fn query_combination() { // query + vector, no hybrid keyword => let (response, code) = index - .search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "showRankingScore": true})) + .search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "showRankingScore": true, "retrieveVectors": true})) .await; snapshot!(code, @"400 Bad Request"); @@ -453,7 +457,7 @@ 
async fn query_combination() { // full vector, without a vector => error let (response, code) = index .search_post( - json!({"q": "Captain", "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true}), + json!({"q": "Captain", "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true}), ) .await; @@ -470,7 +474,7 @@ async fn query_combination() { // hybrid without a vector => full keyword let (response, code) = index .search_post( - json!({"q": "Planet", "hybrid": {"semanticRatio": 0.99}, "showRankingScore": true}), + json!({"q": "Planet", "hybrid": {"semanticRatio": 0.99}, "showRankingScore": true, "retrieveVectors": true}), ) .await; diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs index b65c0dc42..955b324a6 100644 --- a/meilisearch/tests/search/mod.rs +++ b/meilisearch/tests/search/mod.rs @@ -1337,6 +1337,7 @@ async fn experimental_feature_vector_store() { .search_post(json!({ "vector": [1.0, 2.0, 3.0], "showRankingScore": true, + "retrieveVectors": true, })) .await; diff --git a/meilisearch/tests/similar/mod.rs b/meilisearch/tests/similar/mod.rs index a2378eb58..f2af91588 100644 --- a/meilisearch/tests/similar/mod.rs +++ b/meilisearch/tests/similar/mod.rs @@ -78,7 +78,7 @@ async fn basic() { index.wait_task(value.uid()).await; index - .similar(json!({"id": 143}), |response, code| { + .similar(json!({"id": 143, "retrieveVectors": true}), |response, code| { snapshot!(code, @"200 OK"); snapshot!(json_string!(response["hits"]), @r###" [ @@ -88,9 +88,9 @@ async fn basic() { "id": "522681", "_vectors": { "manual": [ - 0.1, - 0.6, - 0.8 + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 ] } }, @@ -100,9 +100,9 @@ async fn basic() { "id": "299537", "_vectors": { "manual": [ - 0.6, - 0.8, - -0.2 + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 ] } }, @@ -112,9 +112,9 @@ async fn basic() { "id": "166428", "_vectors": { "manual": [ - 0.7, - 0.7, - -0.4 + 0.699999988079071, + 
0.699999988079071, + -0.4000000059604645 ] } }, @@ -124,8 +124,8 @@ async fn basic() { "id": "287947", "_vectors": { "manual": [ - 0.8, - 0.4, + 0.800000011920929, + 0.4000000059604645, -0.5 ] } @@ -136,7 +136,7 @@ async fn basic() { .await; index - .similar(json!({"id": "299537"}), |response, code| { + .similar(json!({"id": "299537", "retrieveVectors": true}), |response, code| { snapshot!(code, @"200 OK"); snapshot!(json_string!(response["hits"]), @r###" [ @@ -146,9 +146,9 @@ async fn basic() { "id": "166428", "_vectors": { "manual": [ - 0.7, - 0.7, - -0.4 + 0.699999988079071, + 0.699999988079071, + -0.4000000059604645 ] } }, @@ -158,8 +158,8 @@ async fn basic() { "id": "287947", "_vectors": { "manual": [ - 0.8, - 0.4, + 0.800000011920929, + 0.4000000059604645, -0.5 ] } @@ -170,9 +170,9 @@ async fn basic() { "id": "522681", "_vectors": { "manual": [ - 0.1, - 0.6, - 0.8 + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 ] } }, @@ -183,8 +183,8 @@ async fn basic() { "_vectors": { "manual": [ -0.5, - 0.3, - 0.85 + 0.30000001192092896, + 0.8500000238418579 ] } } @@ -456,71 +456,77 @@ async fn filter() { index.wait_task(value.uid()).await; index - .similar(json!({"id": 522681, "filter": "release_year = 2019"}), |response, code| { - snapshot!(code, @"200 OK"); - snapshot!(json_string!(response["hits"]), @r###" - [ - { - "title": "Captain Marvel", - "release_year": 2019, - "id": "299537", - "_vectors": { - "manual": [ - 0.6, - 0.8, - -0.2 - ] - } - }, - { - "title": "How to Train Your Dragon: The Hidden World", - "release_year": 2019, - "id": "166428", - "_vectors": { - "manual": [ - 0.7, - 0.7, - -0.4 - ] - } - }, - { - "title": "Shazam!", - "release_year": 2019, - "id": "287947", - "_vectors": { - "manual": [ - 0.8, - 0.4, - -0.5 - ] - } - } - ] - "###); - }) + .similar( + json!({"id": 522681, "filter": "release_year = 2019", "retrieveVectors": true}), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + 
[ + { + "title": "Captain Marvel", + "release_year": 2019, + "id": "299537", + "_vectors": { + "manual": [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + } + }, + { + "title": "How to Train Your Dragon: The Hidden World", + "release_year": 2019, + "id": "166428", + "_vectors": { + "manual": [ + 0.699999988079071, + 0.699999988079071, + -0.4000000059604645 + ] + } + }, + { + "title": "Shazam!", + "release_year": 2019, + "id": "287947", + "_vectors": { + "manual": [ + 0.800000011920929, + 0.4000000059604645, + -0.5 + ] + } + } + ] + "###); + }, + ) .await; index - .similar(json!({"id": 522681, "filter": "release_year < 2000"}), |response, code| { - snapshot!(code, @"200 OK"); - snapshot!(json_string!(response["hits"]), @r###" - [ - { - "title": "All Quiet on the Western Front", - "release_year": 1930, - "id": "143", - "_vectors": { - "manual": [ - -0.5, - 0.3, - 0.85 - ] - } - } - ] - "###); - }) + .similar( + json!({"id": 522681, "filter": "release_year < 2000", "retrieveVectors": true}), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "title": "All Quiet on the Western Front", + "release_year": 1930, + "id": "143", + "_vectors": { + "manual": [ + -0.5, + 0.30000001192092896, + 0.8500000238418579 + ] + } + } + ] + "###); + }, + ) .await; } @@ -579,24 +585,27 @@ async fn limit_and_offset() { .await; index - .similar(json!({"id": 143, "limit": 1, "offset": 1}), |response, code| { - snapshot!(code, @"200 OK"); - snapshot!(json_string!(response["hits"]), @r###" - [ - { - "title": "Captain Marvel", - "release_year": 2019, - "id": "299537", - "_vectors": { - "manual": [ - 0.6, - 0.8, - -0.2 - ] - } - } - ] - "###); - }) + .similar( + json!({"id": 143, "limit": 1, "offset": 1, "retrieveVectors": true}), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "title": "Captain Marvel", + "release_year": 2019, + "id": "299537", + 
"_vectors": { + "manual": [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + } + } + ] + "###); + }, + ) .await; } diff --git a/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_fid__attribute_fid_ngrams.snap b/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_fid__attribute_fid_ngrams.snap deleted file mode 100644 index 930a21626..000000000 --- a/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_fid__attribute_fid_ngrams.snap +++ /dev/null @@ -1,244 +0,0 @@ ---- -source: milli/src/search/new/tests/attribute_fid.rs -expression: "format!(\"{document_ids_scores:#?}\")" ---- -[ - ( - 2, - [ - Fid( - Rank { - rank: 19, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 91, - max_rank: 91, - }, - ), - ], - ), - ( - 6, - [ - Fid( - Rank { - rank: 15, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 81, - max_rank: 91, - }, - ), - ], - ), - ( - 5, - [ - Fid( - Rank { - rank: 14, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 79, - max_rank: 91, - }, - ), - ], - ), - ( - 4, - [ - Fid( - Rank { - rank: 13, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 77, - max_rank: 91, - }, - ), - ], - ), - ( - 3, - [ - Fid( - Rank { - rank: 12, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 83, - max_rank: 91, - }, - ), - ], - ), - ( - 9, - [ - Fid( - Rank { - rank: 11, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 75, - max_rank: 91, - }, - ), - ], - ), - ( - 8, - [ - Fid( - Rank { - rank: 10, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 79, - max_rank: 91, - }, - ), - ], - ), - ( - 7, - [ - Fid( - Rank { - rank: 10, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 73, - max_rank: 91, - }, - ), - ], - ), - ( - 11, - [ - Fid( - Rank { - rank: 7, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 77, - max_rank: 91, - }, - ), - ], - ), - ( - 10, - [ - Fid( - Rank { - rank: 6, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 
81, - max_rank: 91, - }, - ), - ], - ), - ( - 13, - [ - Fid( - Rank { - rank: 6, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 81, - max_rank: 91, - }, - ), - ], - ), - ( - 12, - [ - Fid( - Rank { - rank: 6, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 78, - max_rank: 91, - }, - ), - ], - ), - ( - 14, - [ - Fid( - Rank { - rank: 5, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 75, - max_rank: 91, - }, - ), - ], - ), - ( - 0, - [ - Fid( - Rank { - rank: 1, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 91, - max_rank: 91, - }, - ), - ], - ), -] diff --git a/milli/src/snapshots/index.rs/initial_field_distribution/1/field_distribution.snap b/milli/src/snapshots/index.rs/initial_field_distribution/1/field_distribution.snap deleted file mode 100644 index 1d1d629e6..000000000 --- a/milli/src/snapshots/index.rs/initial_field_distribution/1/field_distribution.snap +++ /dev/null @@ -1,7 +0,0 @@ ---- -source: milli/src/index.rs ---- -age 1 | -id 2 | -name 2 | - diff --git a/milli/src/snapshots/index.rs/initial_field_distribution/field_distribution.snap b/milli/src/snapshots/index.rs/initial_field_distribution/field_distribution.snap deleted file mode 100644 index 1d1d629e6..000000000 --- a/milli/src/snapshots/index.rs/initial_field_distribution/field_distribution.snap +++ /dev/null @@ -1,7 +0,0 @@ ---- -source: milli/src/index.rs ---- -age 1 | -id 2 | -name 2 | - diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index 3eb761bce..1e56bec83 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -8,7 +8,6 @@ use std::sync::Arc; use bytemuck::cast_slice; use grenad::Writer; -use itertools::EitherOrBoth; use ordered_float::OrderedFloat; use roaring::RoaringBitmap; use serde_json::Value; @@ -50,7 +49,7 @@ enum VectorStateDelta { // Note: changing the 
value of the manually specified vector **should not record** this delta WasGeneratedNowManual(Vec>), - ManualDelta(Vec>, Vec>), + ManualDelta(Vec>), // Add the vector computed from the specified prompt // Remove any previous vector @@ -59,14 +58,12 @@ enum VectorStateDelta { } impl VectorStateDelta { - fn into_values(self) -> (bool, String, (Vec>, Vec>)) { + fn into_values(self) -> (bool, String, Vec>) { match self { VectorStateDelta::NoChange => Default::default(), VectorStateDelta::NowRemoved => (true, Default::default(), Default::default()), - VectorStateDelta::WasGeneratedNowManual(add) => { - (true, Default::default(), (Default::default(), add)) - } - VectorStateDelta::ManualDelta(del, add) => (false, Default::default(), (del, add)), + VectorStateDelta::WasGeneratedNowManual(add) => (true, Default::default(), add), + VectorStateDelta::ManualDelta(add) => (false, Default::default(), add), VectorStateDelta::NowGenerated(prompt) => (true, prompt, Default::default()), } } @@ -166,8 +163,14 @@ pub fn extract_vector_points( // lazily get it when needed let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() }; - let mut parsed_vectors = ParsedVectorsDiff::new(obkv, old_vectors_fid, new_vectors_fid) - .map_err(|error| error.to_crate_error(document_id().to_string()))?; + let mut parsed_vectors = ParsedVectorsDiff::new( + docid, + embedders_configs, + obkv, + old_vectors_fid, + new_vectors_fid, + ) + .map_err(|error| error.to_crate_error(document_id().to_string()))?; for EmbedderVectorExtractor { embedder_name, @@ -182,7 +185,7 @@ pub fn extract_vector_points( { let delta = match parsed_vectors.remove(embedder_name) { (Some(old), Some(new)) => { - match (old.is_user_provided(), new.is_user_provided()) { + match (old.map_or(true, |old| old.is_user_provided()), new.is_user_provided()) { (true, true) | (false, false) => (), (true, false) => { remove_from_user_defined.insert(docid); @@ -193,7 +196,6 @@ pub fn extract_vector_points( } // no 
autogeneration - let del_vectors = old.into_array_of_vectors(); let add_vectors = new.into_array_of_vectors(); if add_vectors.len() > usize::from(u8::MAX) { @@ -203,15 +205,15 @@ pub fn extract_vector_points( ))); } - VectorStateDelta::ManualDelta(del_vectors, add_vectors) + VectorStateDelta::ManualDelta(add_vectors) } - (Some(_old), None) => { + (Some(old), None) => { // Do we keep this document? let document_is_kept = obkv .iter() .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) .any(|deladd| deladd.get(DelAdd::Addition).is_some()); - if document_is_kept { + if document_is_kept && old.is_some() { remove_from_user_defined.insert(docid); // becomes autogenerated VectorStateDelta::NowGenerated(prompt.render( @@ -219,6 +221,8 @@ pub fn extract_vector_points( DelAdd::Addition, new_fields_ids_map, )?) + } else if document_is_kept && old.is_none() { + VectorStateDelta::NoChange } else { VectorStateDelta::NowRemoved } @@ -315,8 +319,8 @@ pub fn extract_vector_points( Ok(results) } -/// Computes the diff between both Del and Add numbers and -/// only inserts the parts that differ in the sorter. +/// We cannot compute the diff between both Del and Add vectors. +/// We'll push every vector and compute the difference later in TypedChunk. fn push_vectors_diff( remove_vectors_writer: &mut Writer>, prompts_writer: &mut Writer>, @@ -325,7 +329,7 @@ fn push_vectors_diff( delta: VectorStateDelta, reindex_vectors: bool, ) -> Result<()> { - let (must_remove, prompt, (mut del_vectors, mut add_vectors)) = delta.into_values(); + let (must_remove, prompt, mut add_vectors) = delta.into_values(); if must_remove // TODO: the below condition works because we erase the vec database when a embedding setting changes. // When vector pipeline will be optimized, this should be removed. 
@@ -340,44 +344,25 @@ fn push_vectors_diff( } // We sort and dedup the vectors - del_vectors.sort_unstable_by(|a, b| compare_vectors(a, b)); add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b)); - del_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq()); add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq()); - let merged_vectors_iter = - itertools::merge_join_by(del_vectors, add_vectors, |del, add| compare_vectors(del, add)); + // let merged_vectors_iter = + // itertools::merge_join_by(del_vectors, add_vectors, |del, add| compare_vectors(del, add)); // insert vectors into the writer - for (i, eob) in merged_vectors_iter.into_iter().enumerate().take(u16::MAX as usize) { + for (i, vector) in add_vectors.into_iter().enumerate().take(u16::MAX as usize) { // Generate the key by extending the unique index to it. key_buffer.truncate(TRUNCATE_SIZE); let index = u16::try_from(i).unwrap(); key_buffer.extend_from_slice(&index.to_be_bytes()); - match eob { - EitherOrBoth::Both(_, _) => (), // no need to touch anything - EitherOrBoth::Left(vector) => { - // TODO: the below condition works because we erase the vec database when a embedding setting changes. - // When vector pipeline will be optimized, this should be removed. - if !reindex_vectors { - // We insert only the Del part of the Obkv to inform - // that we only want to remove all those vectors. - let mut obkv = KvWriterDelAdd::memory(); - obkv.insert(DelAdd::Deletion, cast_slice(&vector))?; - let bytes = obkv.into_inner()?; - manual_vectors_writer.insert(&key_buffer, bytes)?; - } - } - EitherOrBoth::Right(vector) => { - // We insert only the Add part of the Obkv to inform - // that we only want to remove all those vectors. 
- let mut obkv = KvWriterDelAdd::memory(); - obkv.insert(DelAdd::Addition, cast_slice(&vector))?; - let bytes = obkv.into_inner()?; - manual_vectors_writer.insert(&key_buffer, bytes)?; - } - } + // We insert only the Add part of the Obkv to inform + // that we only want to remove all those vectors. + let mut obkv = KvWriterDelAdd::memory(); + obkv.insert(DelAdd::Addition, cast_slice(&vector))?; + let bytes = obkv.into_inner()?; + manual_vectors_writer.insert(&key_buffer, bytes)?; } Ok(()) diff --git a/milli/src/vector/parsed_vectors.rs b/milli/src/vector/parsed_vectors.rs index 62c418149..672e27cc5 100644 --- a/milli/src/vector/parsed_vectors.rs +++ b/milli/src/vector/parsed_vectors.rs @@ -4,8 +4,9 @@ use obkv::KvReader; use serde_json::{from_slice, Value}; use super::Embedding; +use crate::index::IndexEmbeddingConfig; use crate::update::del_add::{DelAdd, KvReaderDelAdd}; -use crate::{FieldId, InternalError, UserError}; +use crate::{DocumentId, FieldId, InternalError, UserError}; pub const RESERVED_VECTORS_FIELD_NAME: &str = "_vectors"; @@ -42,17 +43,19 @@ pub struct ExplicitVectors { } pub struct ParsedVectorsDiff { - pub old: Option>, + pub old: BTreeMap>, pub new: Option>, } impl ParsedVectorsDiff { pub fn new( + docid: DocumentId, + embedders_configs: &[IndexEmbeddingConfig], documents_diff: KvReader<'_, FieldId>, old_vectors_fid: Option, new_vectors_fid: Option, ) -> Result { - let old = match old_vectors_fid + let mut old = match old_vectors_fid .and_then(|vectors_fid| documents_diff.get(vectors_fid)) .map(KvReaderDelAdd::new) .map(|obkv| to_vector_map(obkv, DelAdd::Deletion)) @@ -68,7 +71,13 @@ impl ParsedVectorsDiff { return Err(error); } } - .flatten(); + .flatten().map_or(BTreeMap::default(), |del| del.into_iter().map(|(name, vec)| (name, Some(vec))).collect()); + for embedding_config in embedders_configs { + if embedding_config.user_defined.contains(docid) { + old.entry(embedding_config.name.to_string()).or_insert(None); + } + } + let new = 
new_vectors_fid .and_then(|vectors_fid| documents_diff.get(vectors_fid)) .map(KvReaderDelAdd::new) @@ -78,8 +87,9 @@ impl ParsedVectorsDiff { Ok(Self { old, new }) } - pub fn remove(&mut self, embedder_name: &str) -> (Option, Option) { - let old = self.old.as_mut().and_then(|old| old.remove(embedder_name)); + /// Return (Some(None), _) in case the vector is user defined and contained in the database. + pub fn remove(&mut self, embedder_name: &str) -> (Option>, Option) { + let old = self.old.remove(embedder_name); let new = self.new.as_mut().and_then(|new| new.remove(embedder_name)); (old, new) } From cc5dca8321736805b881bcb8679f566300a8f9e8 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 4 Jun 2024 16:41:33 +0200 Subject: [PATCH 09/44] fix two bug and add a dump test --- index-scheduler/src/batch.rs | 26 +-- meilisearch/src/routes/indexes/search.rs | 4 +- meilisearch/src/search.rs | 12 +- meilisearch/tests/dumps/mod.rs | 206 +++++++++++++++++++++++ 4 files changed, 234 insertions(+), 14 deletions(-) diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index 181ac49a3..d59a657c9 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -909,6 +909,7 @@ impl IndexScheduler { let fields_ids_map = index.fields_ids_map(&rtxn)?; let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); + let embedding_configs = index.embedding_configs(&rtxn)?; // 3.1. Dump the documents for ret in index.all_documents(&rtxn)? 
{ @@ -951,16 +952,21 @@ impl IndexScheduler { }; for (embedder_name, embeddings) in embeddings { - // don't change the entry if it already exists, because it was user-provided - vectors.entry(embedder_name).or_insert_with(|| { - let embeddings = ExplicitVectors { - embeddings: VectorOrArrayOfVectors::from_array_of_vectors( - embeddings, - ), - user_provided: false, - }; - serde_json::to_value(embeddings).unwrap() - }); + let user_provided = embedding_configs + .iter() + .find(|conf| conf.name == embedder_name) + .is_some_and(|conf| conf.user_defined.contains(id)); + + let embeddings = ExplicitVectors { + embeddings: VectorOrArrayOfVectors::from_array_of_vectors( + embeddings, + ), + user_provided, + }; + vectors.insert( + embedder_name, + serde_json::to_value(embeddings).unwrap(), + ); } } diff --git a/meilisearch/src/routes/indexes/search.rs b/meilisearch/src/routes/indexes/search.rs index 91c8c8178..ae6402cf6 100644 --- a/meilisearch/src/routes/indexes/search.rs +++ b/meilisearch/src/routes/indexes/search.rs @@ -52,7 +52,7 @@ pub struct SearchQueryGet { #[deserr(default, error = DeserrQueryParamError)] attributes_to_retrieve: Option>, #[deserr(default, error = DeserrQueryParamError)] - retrieve_vectors: bool, + retrieve_vectors: Param, #[deserr(default, error = DeserrQueryParamError)] attributes_to_crop: Option>, #[deserr(default = Param(DEFAULT_CROP_LENGTH()), error = DeserrQueryParamError)] @@ -155,7 +155,7 @@ impl From for SearchQuery { page: other.page.as_deref().copied(), hits_per_page: other.hits_per_page.as_deref().copied(), attributes_to_retrieve: other.attributes_to_retrieve.map(|o| o.into_iter().collect()), - retrieve_vectors: other.retrieve_vectors, + retrieve_vectors: other.retrieve_vectors.0, attributes_to_crop: other.attributes_to_crop.map(|o| o.into_iter().collect()), crop_length: other.crop_length.0, attributes_to_highlight: other.attributes_to_highlight.map(|o| o.into_iter().collect()), diff --git a/meilisearch/src/search.rs 
b/meilisearch/src/search.rs index 1ab42a79f..d80910f09 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -1051,6 +1051,7 @@ fn make_hits( formatter_builder.highlight_prefix(format.highlight_pre_tag); formatter_builder.highlight_suffix(format.highlight_post_tag); let mut documents = Vec::new(); + let embedding_configs = index.embedding_configs(&rtxn)?; let documents_iter = index.documents(rtxn, documents_ids)?; for ((id, obkv), score) in documents_iter.into_iter().zip(document_scores.into_iter()) { // First generate a document with all the displayed fields @@ -1066,12 +1067,19 @@ fn make_hits( if retrieve_vectors { let mut vectors = serde_json::Map::new(); for (name, mut vector) in index.embeddings(&rtxn, id)? { + let user_defined = embedding_configs + .iter() + .find(|conf| conf.name == name) + .is_some_and(|conf| conf.user_defined.contains(id)); + let mut embedding = serde_json::Map::new(); + embedding.insert("userDefined".to_string(), user_defined.into()); if vector.len() == 1 { let vector = vector.pop().unwrap(); - vectors.insert(name.into(), vector.into()); + embedding.insert("embedding".to_string(), vector.into()); } else { - vectors.insert(name.into(), vector.into()); + embedding.insert("embedding".to_string(), vector.into()); } + vectors.insert(name.into(), embedding.into()); } document.insert("_vectors".into(), vectors.into()); } diff --git a/meilisearch/tests/dumps/mod.rs b/meilisearch/tests/dumps/mod.rs index c8f8ca105..dfac2e806 100644 --- a/meilisearch/tests/dumps/mod.rs +++ b/meilisearch/tests/dumps/mod.rs @@ -1938,3 +1938,209 @@ async fn import_dump_v6_containing_experimental_features() { }) .await; } + +// In this test we must generate the dump ourselves to ensure the +// `user defined` vectors are well set +#[actix_rt::test] +async fn generate_and_import_dump_containing_vectors() { + let temp = tempfile::tempdir().unwrap(); + let mut opt = default_settings(temp.path()); + let server = 
Server::new_with_options(opt.clone()).await.unwrap(); + let (code, _) = server.set_features(json!({"vectorStore": true})).await; + snapshot!(code, @r###" + { + "vectorStore": true, + "metrics": false, + "logsRoute": false + } + "###); + let index = server.index("pets"); + let (response, code) = index + .update_settings(json!( + { + "embedders": { + "doggo_embedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "documentTemplate": "{{doc.doggo}}", + } + } + } + )) + .await; + snapshot!(code, @"202 Accepted"); + let response = index.wait_task(response.uid()).await; + snapshot!(response); + let (response, code) = index + .add_documents( + json!([ + {"id": 0, "doggo": "kefir", "_vectors": { "doggo_embedder": vec![0; 384] }}, + {"id": 1, "doggo": "echo", "_vectors": { "doggo_embedder": { "userProvided": true, "embeddings": vec![1; 384] }}}, + {"id": 2, "doggo": "intel", "_vectors": { "doggo_embedder": { "userProvided": false, "embeddings": vec![2; 384] }}}, + {"id": 3, "doggo": "bill", "_vectors": { "doggo_embedder": { "userProvided": false }}}, + {"id": 4, "doggo": "max" }, + ]), + None, + ) + .await; + snapshot!(code, @"202 Accepted"); + let response = index.wait_task(response.uid()).await; + snapshot!(response); + + let (response, code) = server.create_dump().await; + snapshot!(code, @"202 Accepted"); + let response = index.wait_task(response.uid()).await; + snapshot!(response["status"], @r###""succeeded""###); + + // ========= We made a dump, now we should clear the DB except and try to import our dump + drop(server); + tokio::fs::remove_dir_all(&opt.db_path).await.unwrap(); + let dump_name = format!("{}.dump", response["details"]["dumpUid"].as_str().unwrap()); + let dump_path = opt.dump_dir.join(dump_name); + assert!(dump_path.exists(), "path: `{}`", dump_path.display()); + + opt.import_dump = Some(dump_path); + // NOTE: We shouldn't have to change the database path but 
I lost one hour + // because of a « bad path » error and that fixed it. + opt.db_path = temp.path().join("data.ms"); + + let mut server = Server::new_auth_with_options(opt, temp).await; + server.use_api_key("MASTER_KEY"); + + let (indexes, code) = server.list_indexes(None, None).await; + assert_eq!(code, 200, "{indexes}"); + + snapshot!(indexes["results"].as_array().unwrap().len(), @"1"); + snapshot!(indexes["results"][0]["uid"], @r###""pets""###); + snapshot!(indexes["results"][0]["primaryKey"], @r###""id""###); + + let (response, code) = server.get_features().await; + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "vectorStore": true, + "metrics": false, + "logsRoute": false + } + "###); + + let index = server.index("pets"); + + let (response, code) = index.settings().await; + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "displayedAttributes": [ + "*" + ], + "searchableAttributes": [ + "*" + ], + "filterableAttributes": [], + "sortableAttributes": [], + "rankingRules": [ + "words", + "typo", + "proximity", + "attribute", + "sort", + "exactness" + ], + "stopWords": [], + "nonSeparatorTokens": [], + "separatorTokens": [], + "dictionary": [], + "synonyms": {}, + "distinctAttribute": null, + "proximityPrecision": "byWord", + "typoTolerance": { + "enabled": true, + "minWordSizeForTypos": { + "oneTypo": 5, + "twoTypos": 9 + }, + "disableOnWords": [], + "disableOnAttributes": [] + }, + "faceting": { + "maxValuesPerFacet": 100, + "sortFacetValuesBy": { + "*": "alpha" + } + }, + "pagination": { + "maxTotalHits": 1000 + }, + "embedders": { + "doggo_embedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "documentTemplate": "{{doc.doggo}}" + } + }, + "searchCutoffMs": null + } + "###); + + index + .search(json!({"retrieveVectors": true}), 
|response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"], { "[]._vectors.doggo_embedder.embedding" => "[vector]" }), @r###" + [ + { + "id": 0, + "doggo": "kefir", + "_vectors": { + "doggo_embedder": { + "userDefined": true, + "embedding": "[vector]" + } + } + }, + { + "id": 1, + "doggo": "echo", + "_vectors": { + "doggo_embedder": { + "userDefined": true, + "embedding": "[vector]" + } + } + }, + { + "id": 2, + "doggo": "intel", + "_vectors": { + "doggo_embedder": { + "userDefined": false, + "embedding": "[vector]" + } + } + }, + { + "id": 3, + "doggo": "bill", + "_vectors": { + "doggo_embedder": { + "userDefined": false, + "embedding": "[vector]" + } + } + }, + { + "id": 4, + "doggo": "max", + "_vectors": { + "doggo_embedder": { + "userDefined": false, + "embedding": "[vector]" + } + } + } + ] + "###); + }) + .await; +} From caad40964a94ee0b751d61544d8874abfb3f75d7 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 4 Jun 2024 17:27:31 +0200 Subject: [PATCH 10/44] implements the analytics --- meilisearch/src/analytics/segment_analytics.rs | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 3eb74c7d1..6e91b99b0 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -622,6 +622,7 @@ pub struct SearchAggregator { // Whether a non-default embedder was specified embedder: bool, hybrid: bool, + retrieve_vectors: bool, // every time a search is done, we increment the counter linked to the used settings matching_strategy: HashMap, @@ -662,7 +663,7 @@ impl SearchAggregator { page, hits_per_page, attributes_to_retrieve: _, - retrieve_vectors: _, + retrieve_vectors, attributes_to_crop: _, crop_length, attributes_to_highlight: _, @@ -729,6 +730,7 @@ impl SearchAggregator { if let Some(ref vector) = vector { ret.max_vector_size = vector.len(); } + 
ret.retrieve_vectors |= retrieve_vectors; if query.is_finite_pagination() { let limit = hits_per_page.unwrap_or_else(DEFAULT_SEARCH_LIMIT); @@ -804,6 +806,7 @@ impl SearchAggregator { attributes_to_search_on_total_number_of_uses, max_terms_number, max_vector_size, + retrieve_vectors, matching_strategy, max_limit, max_offset, @@ -874,6 +877,7 @@ impl SearchAggregator { // vector self.max_vector_size = self.max_vector_size.max(max_vector_size); + self.retrieve_vectors |= retrieve_vectors; self.semantic_ratio |= semantic_ratio; self.hybrid |= hybrid; self.embedder |= embedder; @@ -930,6 +934,7 @@ impl SearchAggregator { attributes_to_search_on_total_number_of_uses, max_terms_number, max_vector_size, + retrieve_vectors, matching_strategy, max_limit, max_offset, @@ -992,6 +997,7 @@ impl SearchAggregator { }, "vector": { "max_vector_size": max_vector_size, + "retrieve_vectors": retrieve_vectors, }, "hybrid": { "enabled": hybrid, @@ -1625,6 +1631,7 @@ pub struct SimilarAggregator { // Whether a non-default embedder was specified embedder: bool, + retrieve_vectors: bool, // pagination max_limit: usize, @@ -1648,7 +1655,7 @@ impl SimilarAggregator { offset, limit, attributes_to_retrieve: _, - retrieve_vectors: _, + retrieve_vectors, show_ranking_score, show_ranking_score_details, filter, @@ -1693,6 +1700,7 @@ impl SimilarAggregator { ret.ranking_score_threshold = ranking_score_threshold.is_some(); ret.embedder = embedder.is_some(); + ret.retrieve_vectors = *retrieve_vectors; ret } @@ -1725,6 +1733,7 @@ impl SimilarAggregator { show_ranking_score_details, embedder, ranking_score_threshold, + retrieve_vectors, } = other; if self.timestamp.is_none() { @@ -1754,6 +1763,7 @@ impl SimilarAggregator { } self.embedder |= embedder; + self.retrieve_vectors |= retrieve_vectors; // pagination self.max_limit = self.max_limit.max(max_limit); @@ -1788,6 +1798,7 @@ impl SimilarAggregator { show_ranking_score_details, embedder, ranking_score_threshold, + retrieve_vectors, } = self; if 
total_received == 0 { @@ -1814,6 +1825,9 @@ impl SimilarAggregator { "avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64), "most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), }, + "vector": { + "retrieve_vectors": retrieve_vectors, + }, "hybrid": { "embedder": embedder, }, From 6b29676e7eaaa56bb6ddee2efe5b34636723e538 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 4 Jun 2024 17:38:28 +0200 Subject: [PATCH 11/44] update snapshots --- .../1.snap | 25 +++ .../2.snap | 19 ++ meilisearch/tests/search/hybrid.rs | 30 +-- meilisearch/tests/search/mod.rs | 65 ++++--- meilisearch/tests/similar/mod.rs | 182 +++++++++++------- 5 files changed, 211 insertions(+), 110 deletions(-) create mode 100644 meilisearch/tests/dumps/snapshots/mod.rs/generate_and_import_dump_containing_vectors/1.snap create mode 100644 meilisearch/tests/dumps/snapshots/mod.rs/generate_and_import_dump_containing_vectors/2.snap diff --git a/meilisearch/tests/dumps/snapshots/mod.rs/generate_and_import_dump_containing_vectors/1.snap b/meilisearch/tests/dumps/snapshots/mod.rs/generate_and_import_dump_containing_vectors/1.snap new file mode 100644 index 000000000..4b05d417a --- /dev/null +++ b/meilisearch/tests/dumps/snapshots/mod.rs/generate_and_import_dump_containing_vectors/1.snap @@ -0,0 +1,25 @@ +--- +source: meilisearch/tests/dumps/mod.rs +--- +{ + "uid": 0, + "indexUid": "pets", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "doggo_embedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "documentTemplate": "{{doc.doggo}}" + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" +} diff --git 
a/meilisearch/tests/dumps/snapshots/mod.rs/generate_and_import_dump_containing_vectors/2.snap b/meilisearch/tests/dumps/snapshots/mod.rs/generate_and_import_dump_containing_vectors/2.snap new file mode 100644 index 000000000..43971924b --- /dev/null +++ b/meilisearch/tests/dumps/snapshots/mod.rs/generate_and_import_dump_containing_vectors/2.snap @@ -0,0 +1,19 @@ +--- +source: meilisearch/tests/dumps/mod.rs +--- +{ + "uid": 1, + "indexUid": "pets", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 5, + "indexedDocuments": 5 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" +} diff --git a/meilisearch/tests/search/hybrid.rs b/meilisearch/tests/search/hybrid.rs index 1e415bc63..713dbe3bb 100644 --- a/meilisearch/tests/search/hybrid.rs +++ b/meilisearch/tests/search/hybrid.rs @@ -128,7 +128,7 @@ async fn simple_search() { ) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]}},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]}}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}}},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}}}]"###); snapshot!(response["semanticHitCount"], @"0"); let (response, code) = index @@ -137,7 +137,7 @@ async fn simple_search() { ) .await; snapshot!(code, @"200 OK"); - 
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"2"); let (response, code) = index @@ -146,7 +146,7 @@ async fn simple_search() { ) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic 
Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"3"); } @@ -207,7 +207,7 @@ async fn distribution_shift() { let search = json!({"q": "Captain", "vector": [1.0, 1.0], "showRankingScore": true, "hybrid": {"semanticRatio": 1.0}, "retrieveVectors": true}); let (response, code) = index.search_post(search.clone()).await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":0.9472135901451112}]"###); let (response, code) = index .update_settings(json!({ @@ -228,7 +228,7 @@ async fn distribution_shift() { let (response, code) = index.search_post(search).await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam 
ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.19161224365234375},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.1920928955078125e-7},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.1920928955078125e-7}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_rankingScore":0.19161224365234375},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":1.1920928955078125e-7},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":1.1920928955078125e-7}]"###); } #[actix_rt::test] @@ -249,7 +249,7 @@ async fn highlighter() { })) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"}},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"}}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"}},{"title":"Shazam!","desc":"a Captain 
Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"}},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"}}]"###); snapshot!(response["semanticHitCount"], @"0"); let (response, code) = index @@ -265,7 +265,7 @@ async fn highlighter() { })) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic 
Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"3"); // no highlighting on full semantic @@ -282,7 +282,7 @@ async fn highlighter() { })) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel 
ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"3"); } @@ -370,7 +370,7 @@ async fn single_document() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"][0], @r###"{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0}"###); + snapshot!(response["hits"][0], @r###"{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":1.0}"###); snapshot!(response["semanticHitCount"], @"1"); } @@ -385,7 +385,7 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":1.0}]"###); + snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_rankingScore":1.0}]"###); snapshot!(response["semanticHitCount"], @"null"); // same with a different semantic ratio @@ -394,7 +394,7 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of 
the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":1.0}]"###); + snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_rankingScore":1.0}]"###); snapshot!(response["semanticHitCount"], @"null"); // wrong vector dimensions @@ -418,7 +418,7 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.7773500680923462},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.7236068248748779},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.6581138968467712}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_rankingScore":0.7773500680923462},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":0.7236068248748779},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":0.6581138968467712}]"###); snapshot!(response["semanticHitCount"], @"3"); // full keyword, without a 
query @@ -427,7 +427,7 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":1.0}]"###); + snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_rankingScore":1.0}]"###); snapshot!(response["semanticHitCount"], @"null"); // query + vector, full keyword => keyword @@ -436,7 +436,7 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9848484848484848},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9242424242424242}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":0.9848484848484848},{"title":"Captain Marvel","desc":"a Shazam 
ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":0.9242424242424242}]"###); snapshot!(response["semanticHitCount"], @"null"); // query + vector, no hybrid keyword => @@ -479,6 +479,6 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9242424242424242}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":0.9242424242424242}]"###); snapshot!(response["semanticHitCount"], @"0"); } diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs index 955b324a6..2a2b23fd5 100644 --- a/meilisearch/tests/search/mod.rs +++ b/meilisearch/tests/search/mod.rs @@ -1349,11 +1349,14 @@ async fn experimental_feature_vector_store() { "title": "Shazam!", "id": "287947", "_vectors": { - "manual": [ - 1.0, - 2.0, - 3.0 - ] + "manual": { + "userDefined": true, + "embedding": [ + 1.0, + 2.0, + 3.0 + ] + } }, "_rankingScore": 1.0 }, @@ -1361,11 +1364,14 @@ async fn experimental_feature_vector_store() { "title": "Captain Marvel", "id": "299537", "_vectors": { - "manual": [ - 1.0, - 2.0, - 54.0 - ] + "manual": { + "userDefined": true, + "embedding": [ + 1.0, + 2.0, + 54.0 + ] + } }, "_rankingScore": 0.9129111766815186 }, @@ -1373,11 +1379,14 @@ async fn experimental_feature_vector_store() { "title": "Gläss", "id": "450465", "_vectors": { - "manual": [ - -100.0, - 340.0, - 90.0 - ] + "manual": { + "userDefined": true, + "embedding": [ + -100.0, + 340.0, + 90.0 + ] + } }, "_rankingScore": 0.8106412887573242 }, 
@@ -1385,11 +1394,14 @@ async fn experimental_feature_vector_store() { "title": "How to Train Your Dragon: The Hidden World", "id": "166428", "_vectors": { - "manual": [ - -100.0, - 231.0, - 32.0 - ] + "manual": { + "userDefined": true, + "embedding": [ + -100.0, + 231.0, + 32.0 + ] + } }, "_rankingScore": 0.7412010431289673 }, @@ -1397,11 +1409,14 @@ async fn experimental_feature_vector_store() { "title": "Escape Room", "id": "522681", "_vectors": { - "manual": [ - 10.0, - -23.0, - 32.0 - ] + "manual": { + "userDefined": true, + "embedding": [ + 10.0, + -23.0, + 32.0 + ] + } }, "_rankingScore": 0.6972063183784485 } diff --git a/meilisearch/tests/similar/mod.rs b/meilisearch/tests/similar/mod.rs index f2af91588..7c9f4fff0 100644 --- a/meilisearch/tests/similar/mod.rs +++ b/meilisearch/tests/similar/mod.rs @@ -87,11 +87,14 @@ async fn basic() { "release_year": 2019, "id": "522681", "_vectors": { - "manual": [ - 0.10000000149011612, - 0.6000000238418579, - 0.800000011920929 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + } } }, { @@ -99,11 +102,14 @@ async fn basic() { "release_year": 2019, "id": "299537", "_vectors": { - "manual": [ - 0.6000000238418579, - 0.800000011920929, - -0.20000000298023224 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + } } }, { @@ -111,11 +117,14 @@ async fn basic() { "release_year": 2019, "id": "166428", "_vectors": { - "manual": [ - 0.699999988079071, - 0.699999988079071, - -0.4000000059604645 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.699999988079071, + 0.699999988079071, + -0.4000000059604645 + ] + } } }, { @@ -123,11 +132,14 @@ async fn basic() { "release_year": 2019, "id": "287947", "_vectors": { - "manual": [ - 0.800000011920929, - 0.4000000059604645, - -0.5 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.800000011920929, + 
0.4000000059604645, + -0.5 + ] + } } } ] @@ -145,11 +157,14 @@ async fn basic() { "release_year": 2019, "id": "166428", "_vectors": { - "manual": [ - 0.699999988079071, - 0.699999988079071, - -0.4000000059604645 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.699999988079071, + 0.699999988079071, + -0.4000000059604645 + ] + } } }, { @@ -157,11 +172,14 @@ async fn basic() { "release_year": 2019, "id": "287947", "_vectors": { - "manual": [ - 0.800000011920929, - 0.4000000059604645, - -0.5 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.800000011920929, + 0.4000000059604645, + -0.5 + ] + } } }, { @@ -169,11 +187,14 @@ async fn basic() { "release_year": 2019, "id": "522681", "_vectors": { - "manual": [ - 0.10000000149011612, - 0.6000000238418579, - 0.800000011920929 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + } } }, { @@ -181,11 +202,14 @@ async fn basic() { "release_year": 1930, "id": "143", "_vectors": { - "manual": [ - -0.5, - 0.30000001192092896, - 0.8500000238418579 - ] + "manual": { + "userDefined": true, + "embedding": [ + -0.5, + 0.30000001192092896, + 0.8500000238418579 + ] + } } } ] @@ -467,11 +491,14 @@ async fn filter() { "release_year": 2019, "id": "299537", "_vectors": { - "manual": [ - 0.6000000238418579, - 0.800000011920929, - -0.20000000298023224 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + } } }, { @@ -479,11 +506,14 @@ async fn filter() { "release_year": 2019, "id": "166428", "_vectors": { - "manual": [ - 0.699999988079071, - 0.699999988079071, - -0.4000000059604645 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.699999988079071, + 0.699999988079071, + -0.4000000059604645 + ] + } } }, { @@ -491,11 +521,14 @@ async fn filter() { "release_year": 2019, "id": "287947", "_vectors": { - "manual": [ - 0.800000011920929, - 0.4000000059604645, - -0.5 
- ] + "manual": { + "userDefined": true, + "embedding": [ + 0.800000011920929, + 0.4000000059604645, + -0.5 + ] + } } } ] @@ -516,11 +549,14 @@ async fn filter() { "release_year": 1930, "id": "143", "_vectors": { - "manual": [ - -0.5, - 0.30000001192092896, - 0.8500000238418579 - ] + "manual": { + "userDefined": true, + "embedding": [ + -0.5, + 0.30000001192092896, + 0.8500000238418579 + ] + } } } ] @@ -572,11 +608,14 @@ async fn limit_and_offset() { "release_year": 2019, "id": "522681", "_vectors": { - "manual": [ - 0.10000000149011612, - 0.6000000238418579, - 0.800000011920929 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + } } } ] @@ -596,11 +635,14 @@ async fn limit_and_offset() { "release_year": 2019, "id": "299537", "_vectors": { - "manual": [ - 0.6000000238418579, - 0.800000011920929, - -0.20000000298023224 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + } } } ] From b867829ef1d067d41217512619fa123a6269a3ab Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 4 Jun 2024 18:18:24 +0200 Subject: [PATCH 12/44] remove useless dbg --- index-scheduler/src/lib.rs | 1 - milli/src/vector/rest.rs | 1 - 2 files changed, 2 deletions(-) diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index f69736297..57eccbe66 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -5465,7 +5465,6 @@ mod tests { // the document with the id 4 should generate an embedding let docid = index.external_documents_ids.get(&rtxn, "4").unwrap().unwrap(); let embeddings = index.embeddings(&rtxn, docid).unwrap(); - dbg!(&embeddings); let embedding = &embeddings["my_doggo_embedder"]; assert!(!embedding.is_empty()); diff --git a/milli/src/vector/rest.rs b/milli/src/vector/rest.rs index e7fc509b3..fd771a228 100644 --- a/milli/src/vector/rest.rs +++ b/milli/src/vector/rest.rs @@ -163,7 +163,6 @@ impl Embedder 
{ text_chunks: Vec>, threads: &ThreadPoolNoAbort, ) -> Result>>, EmbedError> { - dbg!(&text_chunks); threads .install(move || { text_chunks.into_par_iter().map(move |chunk| self.embed(chunk)).collect() From d92c173fdc1abcce46ec2489bac90448f4bc6673 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 5 Jun 2024 11:05:19 +0200 Subject: [PATCH 13/44] update the new similar tests --- meilisearch/tests/similar/mod.rs | 140 +++++++++++++++++++------------ 1 file changed, 85 insertions(+), 55 deletions(-) diff --git a/meilisearch/tests/similar/mod.rs b/meilisearch/tests/similar/mod.rs index 7c9f4fff0..2b70b3df5 100644 --- a/meilisearch/tests/similar/mod.rs +++ b/meilisearch/tests/similar/mod.rs @@ -252,7 +252,7 @@ async fn ranking_score_threshold() { index .similar( - json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0}), + json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0, "retrieveVectors": true}), |response, code| { snapshot!(code, @"200 OK"); meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"4"); @@ -263,11 +263,14 @@ async fn ranking_score_threshold() { "release_year": 2019, "id": "522681", "_vectors": { - "manual": [ - 0.1, - 0.6, - 0.8 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + } }, "_rankingScore": 0.890957772731781 }, @@ -276,11 +279,14 @@ async fn ranking_score_threshold() { "release_year": 2019, "id": "299537", "_vectors": { - "manual": [ - 0.6, - 0.8, - -0.2 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + } }, "_rankingScore": 0.39060014486312866 }, @@ -289,11 +295,14 @@ async fn ranking_score_threshold() { "release_year": 2019, "id": "166428", "_vectors": { - "manual": [ - 0.7, - 0.7, - -0.4 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.699999988079071, + 0.699999988079071, + -0.4000000059604645 + ] + } }, 
"_rankingScore": 0.2819308042526245 }, @@ -302,11 +311,14 @@ async fn ranking_score_threshold() { "release_year": 2019, "id": "287947", "_vectors": { - "manual": [ - 0.8, - 0.4, - -0.5 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.800000011920929, + 0.4000000059604645, + -0.5 + ] + } }, "_rankingScore": 0.1662663221359253 } @@ -318,7 +330,7 @@ async fn ranking_score_threshold() { index .similar( - json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.2}), + json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.2, "retrieveVectors": true}), |response, code| { snapshot!(code, @"200 OK"); meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"3"); @@ -329,11 +341,14 @@ async fn ranking_score_threshold() { "release_year": 2019, "id": "522681", "_vectors": { - "manual": [ - 0.1, - 0.6, - 0.8 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + } }, "_rankingScore": 0.890957772731781 }, @@ -342,11 +357,14 @@ async fn ranking_score_threshold() { "release_year": 2019, "id": "299537", "_vectors": { - "manual": [ - 0.6, - 0.8, - -0.2 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + } }, "_rankingScore": 0.39060014486312866 }, @@ -355,11 +373,14 @@ async fn ranking_score_threshold() { "release_year": 2019, "id": "166428", "_vectors": { - "manual": [ - 0.7, - 0.7, - -0.4 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.699999988079071, + 0.699999988079071, + -0.4000000059604645 + ] + } }, "_rankingScore": 0.2819308042526245 } @@ -371,7 +392,7 @@ async fn ranking_score_threshold() { index .similar( - json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.3}), + json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.3, "retrieveVectors": true}), |response, code| { snapshot!(code, @"200 OK"); 
meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"2"); @@ -382,11 +403,14 @@ async fn ranking_score_threshold() { "release_year": 2019, "id": "522681", "_vectors": { - "manual": [ - 0.1, - 0.6, - 0.8 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + } }, "_rankingScore": 0.890957772731781 }, @@ -395,11 +419,14 @@ async fn ranking_score_threshold() { "release_year": 2019, "id": "299537", "_vectors": { - "manual": [ - 0.6, - 0.8, - -0.2 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + } }, "_rankingScore": 0.39060014486312866 } @@ -411,7 +438,7 @@ async fn ranking_score_threshold() { index .similar( - json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.6}), + json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.6, "retrieveVectors": true}), |response, code| { snapshot!(code, @"200 OK"); meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"1"); @@ -422,11 +449,14 @@ async fn ranking_score_threshold() { "release_year": 2019, "id": "522681", "_vectors": { - "manual": [ - 0.1, - 0.6, - 0.8 - ] + "manual": { + "userDefined": true, + "embedding": [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + } }, "_rankingScore": 0.890957772731781 } @@ -438,7 +468,7 @@ async fn ranking_score_threshold() { index .similar( - json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.9}), + json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.9, "retrieveVectors": true}), |response, code| { snapshot!(code, @"200 OK"); snapshot!(json_string!(response["hits"]), @"[]"); From 376b3a19a755da0158be9f632b4d3c913a8297b2 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 5 Jun 2024 11:21:06 +0200 Subject: [PATCH 14/44] makes clippy and fmt happy --- meilisearch/src/search.rs | 6 +++--- 
milli/src/fieldids_weights_map.rs | 3 ++- milli/src/update/index_documents/typed_chunk.rs | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index d80910f09..9b72ed596 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -1051,7 +1051,7 @@ fn make_hits( formatter_builder.highlight_prefix(format.highlight_pre_tag); formatter_builder.highlight_suffix(format.highlight_post_tag); let mut documents = Vec::new(); - let embedding_configs = index.embedding_configs(&rtxn)?; + let embedding_configs = index.embedding_configs(rtxn)?; let documents_iter = index.documents(rtxn, documents_ids)?; for ((id, obkv), score) in documents_iter.into_iter().zip(document_scores.into_iter()) { // First generate a document with all the displayed fields @@ -1066,7 +1066,7 @@ fn make_hits( if retrieve_vectors { let mut vectors = serde_json::Map::new(); - for (name, mut vector) in index.embeddings(&rtxn, id)? { + for (name, mut vector) in index.embeddings(rtxn, id)? 
{ let user_defined = embedding_configs .iter() .find(|conf| conf.name == name) @@ -1079,7 +1079,7 @@ fn make_hits( } else { embedding.insert("embedding".to_string(), vector.into()); } - vectors.insert(name.into(), embedding.into()); + vectors.insert(name, embedding.into()); } document.insert("_vectors".into(), vectors.into()); } diff --git a/milli/src/fieldids_weights_map.rs b/milli/src/fieldids_weights_map.rs index 2bf828711..13f2f8afc 100644 --- a/milli/src/fieldids_weights_map.rs +++ b/milli/src/fieldids_weights_map.rs @@ -4,7 +4,8 @@ use std::collections::HashMap; use serde::{Deserialize, Serialize}; -use crate::{vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME, FieldId, FieldsIdsMap, Weight}; +use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME; +use crate::{FieldId, FieldsIdsMap, Weight}; #[derive(Debug, Default, Serialize, Deserialize)] pub struct FieldidsWeightsMap { diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 078010554..ab9ef0525 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -656,7 +656,7 @@ pub(crate) fn write_typed_chunk_into_index( // typed chunks has always at least 1 chunk. let Some((expected_dimension, embedder_name)) = params else { unreachable!() }; - let mut embedding_configs = index.embedding_configs(&wtxn)?; + let mut embedding_configs = index.embedding_configs(wtxn)?; let index_embedder_config = embedding_configs .iter_mut() .find(|IndexEmbeddingConfig { name, .. 
}| name == &embedder_name) From 400cf3eb92be992e8056d3acddb8ffa086396051 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 5 Jun 2024 11:49:03 +0200 Subject: [PATCH 15/44] add api error test on the new retrieveVectors parameter --- meilisearch/tests/search/errors.rs | 68 +++++++++++++++++++++++++++++ meilisearch/tests/similar/errors.rs | 51 ++++++++++++++++++++++ 2 files changed, 119 insertions(+) diff --git a/meilisearch/tests/search/errors.rs b/meilisearch/tests/search/errors.rs index 53d516c44..75977b190 100644 --- a/meilisearch/tests/search/errors.rs +++ b/meilisearch/tests/search/errors.rs @@ -167,6 +167,74 @@ async fn search_bad_hits_per_page() { "###); } +#[actix_rt::test] +async fn search_bad_attributes_to_retrieve() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index.search_post(json!({"attributesToRetrieve": "doggo"})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value type at `.attributesToRetrieve`: expected an array, but found a string: `\"doggo\"`", + "code": "invalid_search_attributes_to_retrieve", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_attributes_to_retrieve" + } + "###); + // Can't make the `attributes_to_retrieve` fail with a get search since it'll accept anything as an array of strings. 
+} + +#[actix_rt::test] +async fn search_bad_retrieve_vectors() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index.search_post(json!({"retrieveVectors": "doggo"})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found a string: `\"doggo\"`", + "code": "invalid_search_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_retrieve_vectors" + } + "###); + + let (response, code) = index.search_post(json!({"retrieveVectors": [true]})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found an array: `[true]`", + "code": "invalid_search_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_retrieve_vectors" + } + "###); + + let (response, code) = index.search_get("retrieveVectors=").await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value in parameter `retrieveVectors`: could not parse `` as a boolean, expected either `true` or `false`", + "code": "invalid_search_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_retrieve_vectors" + } + "###); + + let (response, code) = index.search_get("retrieveVectors=doggo").await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value in parameter `retrieveVectors`: could not parse `doggo` as a boolean, expected either `true` or `false`", + "code": "invalid_search_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_retrieve_vectors" + } + "###); +} + #[actix_rt::test] async fn 
search_bad_attributes_to_crop() { let server = Server::new().await; diff --git a/meilisearch/tests/similar/errors.rs b/meilisearch/tests/similar/errors.rs index 7765b9a85..546554882 100644 --- a/meilisearch/tests/similar/errors.rs +++ b/meilisearch/tests/similar/errors.rs @@ -756,3 +756,54 @@ async fn filter_reserved_geo_point_string() { }) .await; } + +#[actix_rt::test] +async fn similar_bad_retrieve_vectors() { + let server = Server::new().await; + server.set_features(json!({"vectorStore": true})).await; + let index = server.index("test"); + + let (response, code) = index.similar_post(json!({"retrieveVectors": "doggo"})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found a string: `\"doggo\"`", + "code": "invalid_similar_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_similar_retrieve_vectors" + } + "###); + + let (response, code) = index.similar_post(json!({"retrieveVectors": [true]})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found an array: `[true]`", + "code": "invalid_similar_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_similar_retrieve_vectors" + } + "###); + + let (response, code) = index.similar_get("retrieveVectors=").await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value in parameter `retrieveVectors`: could not parse `` as a boolean, expected either `true` or `false`", + "code": "invalid_similar_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_similar_retrieve_vectors" + } + "###); + + let (response, code) = index.similar_get("retrieveVectors=doggo").await; + 
snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value in parameter `retrieveVectors`: could not parse `doggo` as a boolean, expected either `true` or `false`", + "code": "invalid_similar_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_similar_retrieve_vectors" + } + "###); +} From 49fa41ce6590862be2b0739c343c0b78861c5d97 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 5 Jun 2024 12:09:14 +0200 Subject: [PATCH 16/44] apply first round of review comments --- index-scheduler/src/lib.rs | 2 -- meilisearch/src/search.rs | 10 ++++------ meilisearch/tests/dumps/mod.rs | 2 +- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 57eccbe66..f98e419a1 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -5172,9 +5172,7 @@ mod tests { snapshot!(snapshot_index_scheduler(&index_scheduler), name: "Intel to kefir"); - println!("HEEEEERE"); handle.advance_one_successful_batch(); - // handle.advance_one_failed_batch(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "Intel to kefir succeeds"); { diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index 9b72ed596..c749dff86 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -1073,12 +1073,10 @@ fn make_hits( .is_some_and(|conf| conf.user_defined.contains(id)); let mut embedding = serde_json::Map::new(); embedding.insert("userDefined".to_string(), user_defined.into()); - if vector.len() == 1 { - let vector = vector.pop().unwrap(); - embedding.insert("embedding".to_string(), vector.into()); - } else { - embedding.insert("embedding".to_string(), vector.into()); - } + match vector.as_mut_slice() { + [one] => embedding.insert("embedding".to_string(), std::mem::take(one).into()), + _ => embedding.insert("embedding".to_string(), vector.into()), + }; vectors.insert(name, embedding.into()); 
} document.insert("_vectors".into(), vectors.into()); diff --git a/meilisearch/tests/dumps/mod.rs b/meilisearch/tests/dumps/mod.rs index dfac2e806..b657fc1ee 100644 --- a/meilisearch/tests/dumps/mod.rs +++ b/meilisearch/tests/dumps/mod.rs @@ -1993,7 +1993,7 @@ async fn generate_and_import_dump_containing_vectors() { let response = index.wait_task(response.uid()).await; snapshot!(response["status"], @r###""succeeded""###); - // ========= We made a dump, now we should clear the DB except and try to import our dump + // ========= We made a dump, now we should clear the DB and try to import our dump drop(server); tokio::fs::remove_dir_all(&opt.db_path).await.unwrap(); let dump_name = format!("{}.dump", response["details"]["dumpUid"].as_str().unwrap()); From b7349910d9285ec485bb59f9712f7d211604b10f Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 5 Jun 2024 15:19:22 +0200 Subject: [PATCH 17/44] implements mor review comments --- index-scheduler/src/lib.rs | 2 +- .../extract/extract_vector_points.rs | 39 +++++++++---------- .../src/update/index_documents/extract/mod.rs | 8 ++-- milli/src/update/index_documents/mod.rs | 8 ++-- .../src/update/index_documents/typed_chunk.rs | 8 ++-- milli/src/vector/rest.rs | 1 - 6 files changed, 31 insertions(+), 35 deletions(-) diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index f98e419a1..1f5a1fdcd 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -5012,7 +5012,7 @@ mod tests { insta::assert_json_snapshot!(task.details); } - handle.advance_n_successful_batches(1); + handle.advance_one_successful_batch(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "settings_update_processed_vectors"); { diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index 1e56bec83..88c42864e 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ 
b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -35,8 +35,8 @@ pub struct ExtractedVectorPoints { // embedder pub embedder_name: String, pub embedder: Arc, - pub user_defined: RoaringBitmap, - pub remove_from_user_defined: RoaringBitmap, + pub user_provided: RoaringBitmap, + pub remove_from_user_provided: RoaringBitmap, } enum VectorStateDelta { @@ -82,9 +82,9 @@ struct EmbedderVectorExtractor { remove_vectors_writer: Writer>, // The docids of the documents that contains a user defined embedding - user_defined: RoaringBitmap, + user_provided: RoaringBitmap, // The docids of the documents that contains an auto-generated embedding - remove_from_user_defined: RoaringBitmap, + remove_from_user_provided: RoaringBitmap, } /// Extracts the embedding vector contained in each document under the `_vectors` field. @@ -140,8 +140,8 @@ pub fn extract_vector_points( manual_vectors_writer, prompts_writer, remove_vectors_writer, - user_defined: RoaringBitmap::new(), - remove_from_user_defined: RoaringBitmap::new(), + user_provided: RoaringBitmap::new(), + remove_from_user_provided: RoaringBitmap::new(), }); } @@ -179,8 +179,8 @@ pub fn extract_vector_points( manual_vectors_writer, prompts_writer, remove_vectors_writer, - user_defined, - remove_from_user_defined, + user_provided, + remove_from_user_provided, } in extractors.iter_mut() { let delta = match parsed_vectors.remove(embedder_name) { @@ -188,10 +188,10 @@ pub fn extract_vector_points( match (old.map_or(true, |old| old.is_user_provided()), new.is_user_provided()) { (true, true) | (false, false) => (), (true, false) => { - remove_from_user_defined.insert(docid); + remove_from_user_provided.insert(docid); } (false, true) => { - user_defined.insert(docid); + user_provided.insert(docid); } } @@ -214,7 +214,7 @@ pub fn extract_vector_points( .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) .any(|deladd| deladd.get(DelAdd::Addition).is_some()); if document_is_kept && old.is_some() { - 
remove_from_user_defined.insert(docid); + remove_from_user_provided.insert(docid); // becomes autogenerated VectorStateDelta::NowGenerated(prompt.render( obkv, @@ -229,9 +229,9 @@ pub fn extract_vector_points( } (None, Some(new)) => { if new.is_user_provided() { - user_defined.insert(docid); + user_provided.insert(docid); } else { - remove_from_user_defined.insert(docid); + remove_from_user_provided.insert(docid); } // was possibly autogenerated, remove all vectors for that document let add_vectors = new.into_array_of_vectors(); @@ -274,7 +274,7 @@ pub fn extract_vector_points( VectorStateDelta::NoChange } } else { - remove_from_user_defined.remove(docid); + remove_from_user_provided.remove(docid); VectorStateDelta::NowRemoved } } @@ -301,8 +301,8 @@ pub fn extract_vector_points( manual_vectors_writer, prompts_writer, remove_vectors_writer, - user_defined, - remove_from_user_defined, + user_provided, + remove_from_user_provided, } in extractors { results.push(ExtractedVectorPoints { @@ -311,8 +311,8 @@ pub fn extract_vector_points( prompts: writer_into_reader(prompts_writer)?, embedder, embedder_name, - user_defined, - remove_from_user_defined, + user_provided, + remove_from_user_provided, }) } @@ -347,9 +347,6 @@ fn push_vectors_diff( add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b)); add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq()); - // let merged_vectors_iter = - // itertools::merge_join_by(del_vectors, add_vectors, |del, add| compare_vectors(del, add)); - // insert vectors into the writer for (i, vector) in add_vectors.into_iter().enumerate().take(u16::MAX as usize) { // Generate the key by extending the unique index to it. 
diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 6399b40f8..2babe330f 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -248,8 +248,8 @@ fn send_original_documents_data( prompts, embedder_name, embedder, - user_defined, - remove_from_user_defined: auto_generated, + user_provided, + remove_from_user_provided, } in extracted_vectors { let embeddings = match extract_embeddings( @@ -274,8 +274,8 @@ fn send_original_documents_data( expected_dimension: embedder.dimensions(), manual_vectors, embedder_name, - user_defined, - remove_from_user_defined: auto_generated, + user_provided, + remove_from_user_provided, })); } } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 907554753..07c10bb45 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -503,8 +503,8 @@ where embeddings, manual_vectors, embedder_name, - user_defined, - remove_from_user_defined, + user_provided, + remove_from_user_provided, } => { dimension.insert(embedder_name.clone(), expected_dimension); TypedChunk::VectorPoints { @@ -513,8 +513,8 @@ where expected_dimension, manual_vectors, embedder_name, - user_defined, - remove_from_user_defined, + user_provided, + remove_from_user_provided, } } otherwise => otherwise, diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index ab9ef0525..05c849809 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -91,8 +91,8 @@ pub(crate) enum TypedChunk { expected_dimension: usize, manual_vectors: grenad::Reader>, embedder_name: String, - user_defined: RoaringBitmap, - remove_from_user_defined: RoaringBitmap, + user_provided: RoaringBitmap, + remove_from_user_provided: RoaringBitmap, }, ScriptLanguageDocids(HashMap<(Script, 
Language), (RoaringBitmap, RoaringBitmap)>), } @@ -635,8 +635,8 @@ pub(crate) fn write_typed_chunk_into_index( embeddings, expected_dimension, embedder_name, - user_defined: ud, - remove_from_user_defined: rud, + user_provided: ud, + remove_from_user_provided: rud, } = typed_chunk else { unreachable!(); diff --git a/milli/src/vector/rest.rs b/milli/src/vector/rest.rs index fd771a228..60f54782e 100644 --- a/milli/src/vector/rest.rs +++ b/milli/src/vector/rest.rs @@ -230,7 +230,6 @@ where input_value } [input] => { - dbg!(&options); let mut body = options.query.clone(); body.as_object_mut() From d85ab23b82276a72ff812fab0a32ba70ccf958ec Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 5 Jun 2024 15:38:49 +0200 Subject: [PATCH 18/44] rename all occurences of user_defined to user_provided for consistency --- index-scheduler/src/batch.rs | 2 +- index-scheduler/src/lib.rs | 25 +++++++++--------- meilisearch/src/search.rs | 6 ++--- milli/src/index.rs | 2 +- milli/src/update/index_documents/mod.rs | 4 +-- .../src/update/index_documents/typed_chunk.rs | 12 ++++----- milli/src/update/settings.rs | 26 +++++++++++-------- milli/src/vector/parsed_vectors.rs | 2 +- 8 files changed, 42 insertions(+), 37 deletions(-) diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index d59a657c9..30ff54a62 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -955,7 +955,7 @@ impl IndexScheduler { let user_provided = embedding_configs .iter() .find(|conf| conf.name == embedder_name) - .is_some_and(|conf| conf.user_defined.contains(id)); + .is_some_and(|conf| conf.user_provided.contains(id)); let embeddings = ExplicitVectors { embeddings: VectorOrArrayOfVectors::from_array_of_vectors( diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 1f5a1fdcd..8d6237408 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -3062,9 +3062,9 @@ mod tests { let rtxn = index.read_txn().unwrap(); let configs = 
index.embedding_configs(&rtxn).unwrap(); - let IndexEmbeddingConfig { name, config, user_defined } = configs.first().unwrap(); + let IndexEmbeddingConfig { name, config, user_provided } = configs.first().unwrap(); insta::assert_snapshot!(name, @"default"); - insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); insta::assert_json_snapshot!(config.embedder_options); } @@ -5029,17 +5029,17 @@ mod tests { let configs = index.embedding_configs(&rtxn).unwrap(); // for consistency with the below #[allow(clippy::get_first)] - let IndexEmbeddingConfig { name, config: fakerest_config, user_defined } = + let IndexEmbeddingConfig { name, config: fakerest_config, user_provided } = configs.get(0).unwrap(); insta::assert_snapshot!(name, @"A_fakerest"); - insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); insta::assert_json_snapshot!(fakerest_config.embedder_options); let fakerest_name = name.clone(); - let IndexEmbeddingConfig { name, config: simple_hf_config, user_defined } = + let IndexEmbeddingConfig { name, config: simple_hf_config, user_provided } = configs.get(1).unwrap(); insta::assert_snapshot!(name, @"B_small_hf"); - insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); insta::assert_json_snapshot!(simple_hf_config.embedder_options); let simple_hf_name = name.clone(); @@ -5111,13 +5111,14 @@ mod tests { let configs = index.embedding_configs(&rtxn).unwrap(); // for consistency with the below #[allow(clippy::get_first)] - let IndexEmbeddingConfig { name, config: _, user_defined } = configs.get(0).unwrap(); + let IndexEmbeddingConfig { name, config: _, user_provided: user_defined } = + configs.get(0).unwrap(); insta::assert_snapshot!(name, @"A_fakerest"); insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>"); - 
let IndexEmbeddingConfig { name, config: _, user_defined } = configs.get(1).unwrap(); + let IndexEmbeddingConfig { name, config: _, user_provided } = configs.get(1).unwrap(); insta::assert_snapshot!(name, @"B_small_hf"); - insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); let embeddings = index.embeddings(&rtxn, 0).unwrap(); @@ -5185,15 +5186,15 @@ mod tests { let configs = index.embedding_configs(&rtxn).unwrap(); // for consistency with the below #[allow(clippy::get_first)] - let IndexEmbeddingConfig { name, config: _, user_defined } = + let IndexEmbeddingConfig { name, config: _, user_provided: user_defined } = configs.get(0).unwrap(); insta::assert_snapshot!(name, @"A_fakerest"); insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>"); - let IndexEmbeddingConfig { name, config: _, user_defined } = + let IndexEmbeddingConfig { name, config: _, user_provided } = configs.get(1).unwrap(); insta::assert_snapshot!(name, @"B_small_hf"); - insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); let embeddings = index.embeddings(&rtxn, 0).unwrap(); diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index c749dff86..a0c05b09e 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -1067,12 +1067,12 @@ fn make_hits( if retrieve_vectors { let mut vectors = serde_json::Map::new(); for (name, mut vector) in index.embeddings(rtxn, id)? 
{ - let user_defined = embedding_configs + let user_provided = embedding_configs .iter() .find(|conf| conf.name == name) - .is_some_and(|conf| conf.user_defined.contains(id)); + .is_some_and(|conf| conf.user_provided.contains(id)); let mut embedding = serde_json::Map::new(); - embedding.insert("userDefined".to_string(), user_defined.into()); + embedding.insert("userProvided".to_string(), user_provided.into()); match vector.as_mut_slice() { [one] => embedding.insert("embedding".to_string(), std::mem::take(one).into()), _ => embedding.insert("embedding".to_string(), vector.into()), diff --git a/milli/src/index.rs b/milli/src/index.rs index a47c07e08..d325d6fa4 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1672,7 +1672,7 @@ impl Index { pub struct IndexEmbeddingConfig { pub name: String, pub config: EmbeddingConfig, - pub user_defined: RoaringBitmap, + pub user_provided: RoaringBitmap, } #[cfg(test)] diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 07c10bb45..a533f1984 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -2623,10 +2623,10 @@ mod tests { let rtxn = index.read_txn().unwrap(); let mut embedding_configs = index.embedding_configs(&rtxn).unwrap(); - let IndexEmbeddingConfig { name: embedder_name, config: embedder, user_defined } = + let IndexEmbeddingConfig { name: embedder_name, config: embedder, user_provided } = embedding_configs.pop().unwrap(); insta::assert_snapshot!(embedder_name, @"manual"); - insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0, 1, 2]>"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[0, 1, 2]>"); let embedder = std::sync::Arc::new(crate::vector::Embedder::new(embedder.embedder_options).unwrap()); let res = index diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 05c849809..0cb5e58af 100644 --- 
a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -625,8 +625,8 @@ pub(crate) fn write_typed_chunk_into_index( let mut remove_vectors_builder = MergerBuilder::new(keep_first as MergeFn); let mut manual_vectors_builder = MergerBuilder::new(keep_first as MergeFn); let mut embeddings_builder = MergerBuilder::new(keep_first as MergeFn); - let mut user_defined = RoaringBitmap::new(); - let mut remove_from_user_defined = RoaringBitmap::new(); + let mut user_provided = RoaringBitmap::new(); + let mut remove_from_user_provided = RoaringBitmap::new(); let mut params = None; for typed_chunk in typed_chunks { let TypedChunk::VectorPoints { @@ -649,8 +649,8 @@ pub(crate) fn write_typed_chunk_into_index( if let Some(embeddings) = embeddings { embeddings_builder.push(embeddings.into_cursor()?); } - user_defined |= ud; - remove_from_user_defined |= rud; + user_provided |= ud; + remove_from_user_provided |= rud; } // typed chunks has always at least 1 chunk. @@ -661,8 +661,8 @@ pub(crate) fn write_typed_chunk_into_index( .iter_mut() .find(|IndexEmbeddingConfig { name, .. 
}| name == &embedder_name) .unwrap(); - index_embedder_config.user_defined -= remove_from_user_defined; - index_embedder_config.user_defined |= user_defined; + index_embedder_config.user_provided -= remove_from_user_provided; + index_embedder_config.user_provided |= user_provided; index.put_embedding_configs(wtxn, embedding_configs)?; diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 6b07e614e..08b12d178 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -932,9 +932,11 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { let old_configs: BTreeMap, RoaringBitmap)> = old_configs .into_iter() - .map(|IndexEmbeddingConfig { name, config, user_defined }| { - (name, (Setting::Set(config.into()), user_defined)) - }) + .map( + |IndexEmbeddingConfig { name, config, user_provided: user_defined }| { + (name, (Setting::Set(config.into()), user_defined)) + }, + ) .collect(); let mut new_configs = BTreeMap::new(); @@ -944,19 +946,19 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { { match joined { // updated config - EitherOrBoth::Both((name, (mut old, user_defined)), (_, new)) => { + EitherOrBoth::Both((name, (mut old, user_provided)), (_, new)) => { changed |= EmbeddingSettings::apply_and_need_reindex(&mut old, new); if changed { tracing::debug!( embedder = name, - documents = user_defined.len(), + user_provided = user_provided.len(), "need reindex" ); } else { tracing::debug!(embedder = name, "skip reindex"); } let new = validate_embedding_settings(old, &name)?; - new_configs.insert(name, (new, user_defined)); + new_configs.insert(name, (new, user_provided)); } // unchanged config EitherOrBoth::Left((name, setting)) => { @@ -979,15 +981,17 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { } let new_configs: Vec = new_configs .into_iter() - .filter_map(|(name, (config, user_defined))| match config { - Setting::Set(config) => { - Some(IndexEmbeddingConfig { name, config: config.into(), user_defined }) - } + .filter_map(|(name, (config, 
user_provided))| match config { + Setting::Set(config) => Some(IndexEmbeddingConfig { + name, + config: config.into(), + user_provided, + }), Setting::Reset => None, Setting::NotSet => Some(IndexEmbeddingConfig { name, config: EmbeddingSettings::default().into(), - user_defined, + user_provided, }), }) .collect(); diff --git a/milli/src/vector/parsed_vectors.rs b/milli/src/vector/parsed_vectors.rs index 672e27cc5..4e9e60520 100644 --- a/milli/src/vector/parsed_vectors.rs +++ b/milli/src/vector/parsed_vectors.rs @@ -73,7 +73,7 @@ impl ParsedVectorsDiff { } .flatten().map_or(BTreeMap::default(), |del| del.into_iter().map(|(name, vec)| (name, Some(vec))).collect()); for embedding_config in embedders_configs { - if embedding_config.user_defined.contains(docid) { + if embedding_config.user_provided.contains(docid) { old.entry(embedding_config.name.to_string()).or_insert(None); } } From 31a793d226154dcecff10f8e761582b775665dd5 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 5 Jun 2024 16:10:56 +0200 Subject: [PATCH 19/44] fix the regeneration of the embeddings in the search --- index-scheduler/src/lib.rs | 2 +- meilisearch/src/search.rs | 12 +- meilisearch/tests/dumps/mod.rs | 25 +-- meilisearch/tests/search/hybrid.rs | 30 +-- meilisearch/tests/search/mod.rs | 70 +++--- meilisearch/tests/similar/mod.rs | 336 ++++++++++++++++------------- milli/src/vector/parsed_vectors.rs | 16 ++ 7 files changed, 281 insertions(+), 210 deletions(-) diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 8d6237408..de263c50d 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -5366,7 +5366,7 @@ mod tests { template: "{{doc.doggo}}", }, }, - user_defined: RoaringBitmap<[1, 2]>, + user_provided: RoaringBitmap<[1, 2]>, }, ] "###); diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index a0c05b09e..ce712f17f 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -15,6 +15,7 @@ use meilisearch_types::error::{Code, 
ResponseError}; use meilisearch_types::heed::RoTxn; use meilisearch_types::index_uid::IndexUid; use meilisearch_types::milli::score_details::{ScoreDetails, ScoringStrategy}; +use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors; use meilisearch_types::milli::vector::Embedder; use meilisearch_types::milli::{FacetValueHit, OrderBy, SearchForFacetValues, TimeBudget}; use meilisearch_types::settings::DEFAULT_PAGINATION_MAX_TOTAL_HITS; @@ -1066,18 +1067,13 @@ fn make_hits( if retrieve_vectors { let mut vectors = serde_json::Map::new(); - for (name, mut vector) in index.embeddings(rtxn, id)? { + for (name, vector) in index.embeddings(rtxn, id)? { let user_provided = embedding_configs .iter() .find(|conf| conf.name == name) .is_some_and(|conf| conf.user_provided.contains(id)); - let mut embedding = serde_json::Map::new(); - embedding.insert("userProvided".to_string(), user_provided.into()); - match vector.as_mut_slice() { - [one] => embedding.insert("embedding".to_string(), std::mem::take(one).into()), - _ => embedding.insert("embedding".to_string(), vector.into()), - }; - vectors.insert(name, embedding.into()); + let embeddings = ExplicitVectors { embeddings: vector.into(), user_provided }; + vectors.insert(name, serde_json::to_value(embeddings)?); } document.insert("_vectors".into(), vectors.into()); } diff --git a/meilisearch/tests/dumps/mod.rs b/meilisearch/tests/dumps/mod.rs index b657fc1ee..6f93d94a7 100644 --- a/meilisearch/tests/dumps/mod.rs +++ b/meilisearch/tests/dumps/mod.rs @@ -1940,8 +1940,9 @@ async fn import_dump_v6_containing_experimental_features() { } // In this test we must generate the dump ourselves to ensure the -// `user defined` vectors are well set +// `user provided` vectors are well set #[actix_rt::test] +#[cfg_attr(target_os = "windows", ignore)] async fn generate_and_import_dump_containing_vectors() { let temp = tempfile::tempdir().unwrap(); let mut opt = default_settings(temp.path()); @@ -2087,15 +2088,15 @@ async fn 
generate_and_import_dump_containing_vectors() { index .search(json!({"retrieveVectors": true}), |response, code| { snapshot!(code, @"200 OK"); - snapshot!(json_string!(response["hits"], { "[]._vectors.doggo_embedder.embedding" => "[vector]" }), @r###" + snapshot!(json_string!(response["hits"], { "[]._vectors.doggo_embedder.embeddings" => "[vector]" }), @r###" [ { "id": 0, "doggo": "kefir", "_vectors": { "doggo_embedder": { - "userDefined": true, - "embedding": "[vector]" + "embeddings": "[vector]", + "userProvided": true } } }, @@ -2104,8 +2105,8 @@ async fn generate_and_import_dump_containing_vectors() { "doggo": "echo", "_vectors": { "doggo_embedder": { - "userDefined": true, - "embedding": "[vector]" + "embeddings": "[vector]", + "userProvided": true } } }, @@ -2114,8 +2115,8 @@ async fn generate_and_import_dump_containing_vectors() { "doggo": "intel", "_vectors": { "doggo_embedder": { - "userDefined": false, - "embedding": "[vector]" + "embeddings": "[vector]", + "userProvided": false } } }, @@ -2124,8 +2125,8 @@ async fn generate_and_import_dump_containing_vectors() { "doggo": "bill", "_vectors": { "doggo_embedder": { - "userDefined": false, - "embedding": "[vector]" + "embeddings": "[vector]", + "userProvided": false } } }, @@ -2134,8 +2135,8 @@ async fn generate_and_import_dump_containing_vectors() { "doggo": "max", "_vectors": { "doggo_embedder": { - "userDefined": false, - "embedding": "[vector]" + "embeddings": "[vector]", + "userProvided": false } } } diff --git a/meilisearch/tests/search/hybrid.rs b/meilisearch/tests/search/hybrid.rs index 713dbe3bb..b8a4110ad 100644 --- a/meilisearch/tests/search/hybrid.rs +++ b/meilisearch/tests/search/hybrid.rs @@ -128,7 +128,7 @@ async fn simple_search() { ) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}}},{"title":"Captain 
Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}}}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}}},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}}}]"###); snapshot!(response["semanticHitCount"], @"0"); let (response, code) = index @@ -137,7 +137,7 @@ async fn simple_search() { ) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel 
ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"2"); let (response, code) = index @@ -146,7 +146,7 @@ async fn simple_search() { ) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"3"); } @@ -207,7 +207,7 @@ async fn distribution_shift() { let search = json!({"q": "Captain", "vector": [1.0, 1.0], "showRankingScore": true, "hybrid": {"semanticRatio": 1.0}, "retrieveVectors": true}); let (response, code) = index.search_post(search.clone()).await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's 
not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":0.9472135901451112}]"###); let (response, code) = index .update_settings(json!({ @@ -228,7 +228,7 @@ async fn distribution_shift() { let (response, code) = index.search_post(search).await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_rankingScore":0.19161224365234375},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":1.1920928955078125e-7},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":1.1920928955078125e-7}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_rankingScore":0.19161224365234375},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic 
Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":1.1920928955078125e-7},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":1.1920928955078125e-7}]"###); } #[actix_rt::test] @@ -249,7 +249,7 @@ async fn highlighter() { })) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"}},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"}}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"}},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"}}]"###); snapshot!(response["semanticHitCount"], @"0"); let 
(response, code) = index @@ -265,7 +265,7 @@ async fn highlighter() { })) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"3"); // no highlighting on full semantic @@ -282,7 +282,7 @@ async fn highlighter() { })) .await; 
snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"3"); } @@ -370,7 +370,7 @@ async fn single_document() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"][0], @r###"{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":1.0}"###); + 
snapshot!(response["hits"][0], @r###"{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":1.0}"###); snapshot!(response["semanticHitCount"], @"1"); } @@ -385,7 +385,7 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_rankingScore":1.0}]"###); + snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_rankingScore":1.0}]"###); snapshot!(response["semanticHitCount"], @"null"); // same with a different semantic ratio @@ -394,7 +394,7 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam 
ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_rankingScore":1.0}]"###); + snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_rankingScore":1.0}]"###); snapshot!(response["semanticHitCount"], @"null"); // wrong vector dimensions @@ -418,7 +418,7 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_rankingScore":0.7773500680923462},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":0.7236068248748779},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":0.6581138968467712}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_rankingScore":0.7773500680923462},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":0.7236068248748779},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":0.6581138968467712}]"###); snapshot!(response["semanticHitCount"], @"3"); // full 
keyword, without a query @@ -427,7 +427,7 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_rankingScore":1.0}]"###); + snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_rankingScore":1.0}]"###); snapshot!(response["semanticHitCount"], @"null"); // query + vector, full keyword => keyword @@ -436,7 +436,7 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":0.9848484848484848},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"userDefined":true,"embedding":[2.0,3.0]}},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"userDefined":true,"embedding":[1.0,3.0]}},"_rankingScore":0.9242424242424242}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel 
Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":0.9848484848484848},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":0.9242424242424242}]"###); snapshot!(response["semanticHitCount"], @"null"); // query + vector, no hybrid keyword => @@ -479,6 +479,6 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"userDefined":true,"embedding":[1.0,2.0]}},"_rankingScore":0.9242424242424242}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":0.9242424242424242}]"###); snapshot!(response["semanticHitCount"], @"0"); } diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs index 2a2b23fd5..9e19fa4e8 100644 --- a/meilisearch/tests/search/mod.rs +++ b/meilisearch/tests/search/mod.rs @@ -1350,12 +1350,14 @@ async fn experimental_feature_vector_store() { "id": "287947", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 1.0, - 2.0, - 3.0 - ] + "embeddings": [ + [ + 1.0, + 2.0, + 3.0 + ] + ], + "userProvided": true } }, "_rankingScore": 1.0 @@ -1365,12 +1367,14 @@ async fn experimental_feature_vector_store() { "id": "299537", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 1.0, - 2.0, - 54.0 - ] + "embeddings": [ + [ + 1.0, + 2.0, + 54.0 + ] + ], + "userProvided": true } }, "_rankingScore": 0.9129111766815186 @@ -1380,12 +1384,14 @@ async fn 
experimental_feature_vector_store() { "id": "450465", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - -100.0, - 340.0, - 90.0 - ] + "embeddings": [ + [ + -100.0, + 340.0, + 90.0 + ] + ], + "userProvided": true } }, "_rankingScore": 0.8106412887573242 @@ -1395,12 +1401,14 @@ async fn experimental_feature_vector_store() { "id": "166428", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - -100.0, - 231.0, - 32.0 - ] + "embeddings": [ + [ + -100.0, + 231.0, + 32.0 + ] + ], + "userProvided": true } }, "_rankingScore": 0.7412010431289673 @@ -1410,12 +1418,14 @@ async fn experimental_feature_vector_store() { "id": "522681", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 10.0, - -23.0, - 32.0 - ] + "embeddings": [ + [ + 10.0, + -23.0, + 32.0 + ] + ], + "userProvided": true } }, "_rankingScore": 0.6972063183784485 diff --git a/meilisearch/tests/similar/mod.rs b/meilisearch/tests/similar/mod.rs index 2b70b3df5..0a568553c 100644 --- a/meilisearch/tests/similar/mod.rs +++ b/meilisearch/tests/similar/mod.rs @@ -88,12 +88,14 @@ async fn basic() { "id": "522681", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.10000000149011612, - 0.6000000238418579, - 0.800000011920929 - ] + "embeddings": [ + [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + ], + "userProvided": true } } }, @@ -103,12 +105,14 @@ async fn basic() { "id": "299537", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.6000000238418579, - 0.800000011920929, - -0.20000000298023224 - ] + "embeddings": [ + [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + ], + "userProvided": true } } }, @@ -118,12 +122,14 @@ async fn basic() { "id": "166428", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.699999988079071, - 0.699999988079071, - -0.4000000059604645 - ] + "embeddings": [ + [ + 0.699999988079071, + 0.699999988079071, + -0.4000000059604645 + ] + ], + 
"userProvided": true } } }, @@ -133,12 +139,14 @@ async fn basic() { "id": "287947", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.800000011920929, - 0.4000000059604645, - -0.5 - ] + "embeddings": [ + [ + 0.800000011920929, + 0.4000000059604645, + -0.5 + ] + ], + "userProvided": true } } } @@ -158,12 +166,14 @@ async fn basic() { "id": "166428", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.699999988079071, - 0.699999988079071, - -0.4000000059604645 - ] + "embeddings": [ + [ + 0.699999988079071, + 0.699999988079071, + -0.4000000059604645 + ] + ], + "userProvided": true } } }, @@ -173,12 +183,14 @@ async fn basic() { "id": "287947", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.800000011920929, - 0.4000000059604645, - -0.5 - ] + "embeddings": [ + [ + 0.800000011920929, + 0.4000000059604645, + -0.5 + ] + ], + "userProvided": true } } }, @@ -188,12 +200,14 @@ async fn basic() { "id": "522681", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.10000000149011612, - 0.6000000238418579, - 0.800000011920929 - ] + "embeddings": [ + [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + ], + "userProvided": true } } }, @@ -203,12 +217,14 @@ async fn basic() { "id": "143", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - -0.5, - 0.30000001192092896, - 0.8500000238418579 - ] + "embeddings": [ + [ + -0.5, + 0.30000001192092896, + 0.8500000238418579 + ] + ], + "userProvided": true } } } @@ -264,12 +280,14 @@ async fn ranking_score_threshold() { "id": "522681", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.10000000149011612, - 0.6000000238418579, - 0.800000011920929 - ] + "embeddings": [ + [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + ], + "userProvided": true } }, "_rankingScore": 0.890957772731781 @@ -280,12 +298,14 @@ async fn ranking_score_threshold() { "id": "299537", "_vectors": { "manual": { - 
"userDefined": true, - "embedding": [ - 0.6000000238418579, - 0.800000011920929, - -0.20000000298023224 - ] + "embeddings": [ + [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + ], + "userProvided": true } }, "_rankingScore": 0.39060014486312866 @@ -296,12 +316,14 @@ async fn ranking_score_threshold() { "id": "166428", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.699999988079071, - 0.699999988079071, - -0.4000000059604645 - ] + "embeddings": [ + [ + 0.699999988079071, + 0.699999988079071, + -0.4000000059604645 + ] + ], + "userProvided": true } }, "_rankingScore": 0.2819308042526245 @@ -312,12 +334,14 @@ async fn ranking_score_threshold() { "id": "287947", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.800000011920929, - 0.4000000059604645, - -0.5 - ] + "embeddings": [ + [ + 0.800000011920929, + 0.4000000059604645, + -0.5 + ] + ], + "userProvided": true } }, "_rankingScore": 0.1662663221359253 @@ -342,12 +366,14 @@ async fn ranking_score_threshold() { "id": "522681", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.10000000149011612, - 0.6000000238418579, - 0.800000011920929 - ] + "embeddings": [ + [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + ], + "userProvided": true } }, "_rankingScore": 0.890957772731781 @@ -358,12 +384,14 @@ async fn ranking_score_threshold() { "id": "299537", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.6000000238418579, - 0.800000011920929, - -0.20000000298023224 - ] + "embeddings": [ + [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + ], + "userProvided": true } }, "_rankingScore": 0.39060014486312866 @@ -374,12 +402,14 @@ async fn ranking_score_threshold() { "id": "166428", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.699999988079071, - 0.699999988079071, - -0.4000000059604645 - ] + "embeddings": [ + [ + 0.699999988079071, + 0.699999988079071, + 
-0.4000000059604645 + ] + ], + "userProvided": true } }, "_rankingScore": 0.2819308042526245 @@ -404,12 +434,14 @@ async fn ranking_score_threshold() { "id": "522681", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.10000000149011612, - 0.6000000238418579, - 0.800000011920929 - ] + "embeddings": [ + [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + ], + "userProvided": true } }, "_rankingScore": 0.890957772731781 @@ -420,12 +452,14 @@ async fn ranking_score_threshold() { "id": "299537", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.6000000238418579, - 0.800000011920929, - -0.20000000298023224 - ] + "embeddings": [ + [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + ], + "userProvided": true } }, "_rankingScore": 0.39060014486312866 @@ -450,12 +484,14 @@ async fn ranking_score_threshold() { "id": "522681", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.10000000149011612, - 0.6000000238418579, - 0.800000011920929 - ] + "embeddings": [ + [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + ], + "userProvided": true } }, "_rankingScore": 0.890957772731781 @@ -522,12 +558,14 @@ async fn filter() { "id": "299537", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.6000000238418579, - 0.800000011920929, - -0.20000000298023224 - ] + "embeddings": [ + [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + ], + "userProvided": true } } }, @@ -537,12 +575,14 @@ async fn filter() { "id": "166428", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.699999988079071, - 0.699999988079071, - -0.4000000059604645 - ] + "embeddings": [ + [ + 0.699999988079071, + 0.699999988079071, + -0.4000000059604645 + ] + ], + "userProvided": true } } }, @@ -552,12 +592,14 @@ async fn filter() { "id": "287947", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.800000011920929, - 
0.4000000059604645, - -0.5 - ] + "embeddings": [ + [ + 0.800000011920929, + 0.4000000059604645, + -0.5 + ] + ], + "userProvided": true } } } @@ -580,12 +622,14 @@ async fn filter() { "id": "143", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - -0.5, - 0.30000001192092896, - 0.8500000238418579 - ] + "embeddings": [ + [ + -0.5, + 0.30000001192092896, + 0.8500000238418579 + ] + ], + "userProvided": true } } } @@ -639,12 +683,14 @@ async fn limit_and_offset() { "id": "522681", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.10000000149011612, - 0.6000000238418579, - 0.800000011920929 - ] + "embeddings": [ + [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + ], + "userProvided": true } } } @@ -666,12 +712,14 @@ async fn limit_and_offset() { "id": "299537", "_vectors": { "manual": { - "userDefined": true, - "embedding": [ - 0.6000000238418579, - 0.800000011920929, - -0.20000000298023224 - ] + "embeddings": [ + [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + ], + "userProvided": true } } } diff --git a/milli/src/vector/parsed_vectors.rs b/milli/src/vector/parsed_vectors.rs index 4e9e60520..501bd2ad2 100644 --- a/milli/src/vector/parsed_vectors.rs +++ b/milli/src/vector/parsed_vectors.rs @@ -160,6 +160,22 @@ impl VectorOrArrayOfVectors { pub fn from_array_of_vectors(array_of_vec: Vec) -> Self { Self { inner: Some(either::Either::Left(array_of_vec)) } } + + pub fn from_vector(vec: Embedding) -> Self { + Self { inner: Some(either::Either::Right(vec)) } + } +} + +impl From for VectorOrArrayOfVectors { + fn from(vec: Embedding) -> Self { + Self::from_vector(vec) + } +} + +impl From> for VectorOrArrayOfVectors { + fn from(vec: Vec) -> Self { + Self::from_array_of_vectors(vec) + } } #[cfg(test)] From ea61e5cbec610b8025dd3448162a5bb769e603d0 Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 5 Jun 2024 16:26:03 +0200 Subject: [PATCH 20/44] makes clippy happy x2 --- index-scheduler/src/lib.rs | 
28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index de263c50d..50fc619d8 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -5271,7 +5271,7 @@ mod tests { ] ); - let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0 as u128).unwrap(); + let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0_u128).unwrap(); let documents_count = read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file) .unwrap(); @@ -5307,16 +5307,18 @@ mod tests { .collect::>(); snapshot!(serde_json::to_string(&documents).unwrap(), name: "documents after initial push"); - let mut setting = meilisearch_types::settings::Settings::::default(); - setting.embedders = Setting::Set(maplit::btreemap! { - S("my_doggo_embedder") => Setting::Set(EmbeddingSettings { - source: Setting::Set(milli::vector::settings::EmbedderSource::HuggingFace), - model: Setting::Set(S("sentence-transformers/all-MiniLM-L6-v2")), - revision: Setting::Set(S("e4ce9877abf3edfe10b0d82785e83bdcb973e22e")), - document_template: Setting::Set(S("{{doc.doggo}}")), - .. EmbeddingSettings::default() - }) - }); + let setting = meilisearch_types::settings::Settings:: { + embedders: Setting::Set(maplit::btreemap! 
{ + S("my_doggo_embedder") => Setting::Set(EmbeddingSettings { + source: Setting::Set(milli::vector::settings::EmbedderSource::HuggingFace), + model: Setting::Set(S("sentence-transformers/all-MiniLM-L6-v2")), + revision: Setting::Set(S("e4ce9877abf3edfe10b0d82785e83bdcb973e22e")), + document_template: Setting::Set(S("{{doc.doggo}}")), + ..Default::default() + }) + }), + ..Default::default() + }; index_scheduler .register( KindWithContent::SettingsUpdate { @@ -5380,7 +5382,7 @@ mod tests { let mut embeddings = Vec::new(); 'vectors: for i in 0..=u8::MAX { - let reader = arroy::Reader::open(&rtxn, 0 | (i as u16), index.vector_arroy) + let reader = arroy::Reader::open(&rtxn, i as u16, index.vector_arroy) .map(Some) .or_else(|e| match e { arroy::Error::MissingMetadata => Ok(None), @@ -5418,7 +5420,7 @@ mod tests { ] ); - let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(1 as u128).unwrap(); + let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(1_u128).unwrap(); let documents_count = read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file) .unwrap(); From 6607875f49b3047a3fe6d8771a700d21f36a0b9e Mon Sep 17 00:00:00 2001 From: Tamo Date: Wed, 5 Jun 2024 23:40:29 +0200 Subject: [PATCH 21/44] add the retrieveVectors parameter to the get and fetch documents route --- meilisearch-types/src/error.rs | 1 + meilisearch/src/routes/indexes/documents.rs | 80 ++++-- meilisearch/tests/common/index.rs | 37 +-- meilisearch/tests/common/mod.rs | 2 +- meilisearch/tests/documents/errors.rs | 24 ++ meilisearch/tests/documents/get_documents.rs | 268 ++++++++++++++++--- 6 files changed, 325 insertions(+), 87 deletions(-) diff --git a/meilisearch-types/src/error.rs b/meilisearch-types/src/error.rs index 63543fb1b..ae2a753db 100644 --- a/meilisearch-types/src/error.rs +++ b/meilisearch-types/src/error.rs @@ -222,6 +222,7 @@ InvalidApiKeyUid , InvalidRequest , BAD_REQUEST ; InvalidContentType , InvalidRequest , UNSUPPORTED_MEDIA_TYPE ; 
InvalidDocumentCsvDelimiter , InvalidRequest , BAD_REQUEST ; InvalidDocumentFields , InvalidRequest , BAD_REQUEST ; +InvalidDocumentRetrieveVectors , InvalidRequest , BAD_REQUEST ; MissingDocumentFilter , InvalidRequest , BAD_REQUEST ; InvalidDocumentFilter , InvalidRequest , BAD_REQUEST ; InvalidDocumentGeoField , InvalidRequest , BAD_REQUEST ; diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index 43fab1dae..97ded8069 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -16,6 +16,7 @@ use meilisearch_types::error::{Code, ResponseError}; use meilisearch_types::heed::RoTxn; use meilisearch_types::index_uid::IndexUid; use meilisearch_types::milli::update::IndexDocumentsMethod; +use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors; use meilisearch_types::milli::DocumentId; use meilisearch_types::star_or::OptionStarOrList; use meilisearch_types::tasks::KindWithContent; @@ -94,6 +95,8 @@ pub fn configure(cfg: &mut web::ServiceConfig) { pub struct GetDocument { #[deserr(default, error = DeserrQueryParamError)] fields: OptionStarOrList, + #[deserr(default, error = DeserrQueryParamError)] + retrieve_vectors: Param, } pub async fn get_document( @@ -109,11 +112,12 @@ pub async fn get_document( analytics.get_fetch_documents(&DocumentFetchKind::PerDocumentId, &req); - let GetDocument { fields } = params.into_inner(); + let GetDocument { fields, retrieve_vectors } = params.into_inner(); let attributes_to_retrieve = fields.merge_star_and_none(); let index = index_scheduler.index(&index_uid)?; - let document = retrieve_document(&index, &document_id, attributes_to_retrieve)?; + let document = + retrieve_document(&index, &document_id, attributes_to_retrieve, retrieve_vectors.0)?; debug!(returns = ?document, "Get document"); Ok(HttpResponse::Ok().json(document)) } @@ -153,6 +157,8 @@ pub struct BrowseQueryGet { limit: Param, #[deserr(default, error = 
DeserrQueryParamError)] fields: OptionStarOrList, + #[deserr(default, error = DeserrQueryParamError)] + retrieve_vectors: Param, #[deserr(default, error = DeserrQueryParamError)] filter: Option, } @@ -166,6 +172,8 @@ pub struct BrowseQuery { limit: usize, #[deserr(default, error = DeserrJsonError)] fields: Option>, + #[deserr(default, error = DeserrJsonError)] + retrieve_vectors: bool, #[deserr(default, error = DeserrJsonError)] filter: Option, } @@ -201,7 +209,7 @@ pub async fn get_documents( ) -> Result { debug!(parameters = ?params, "Get documents GET"); - let BrowseQueryGet { limit, offset, fields, filter } = params.into_inner(); + let BrowseQueryGet { limit, offset, fields, retrieve_vectors, filter } = params.into_inner(); let filter = match filter { Some(f) => match serde_json::from_str(&f) { @@ -215,6 +223,7 @@ pub async fn get_documents( offset: offset.0, limit: limit.0, fields: fields.merge_star_and_none(), + retrieve_vectors: retrieve_vectors.0, filter, }; @@ -236,10 +245,11 @@ fn documents_by_query( query: BrowseQuery, ) -> Result { let index_uid = IndexUid::try_from(index_uid.into_inner())?; - let BrowseQuery { offset, limit, fields, filter } = query; + let BrowseQuery { offset, limit, fields, retrieve_vectors, filter } = query; let index = index_scheduler.index(&index_uid)?; - let (total, documents) = retrieve_documents(&index, offset, limit, filter, fields)?; + let (total, documents) = + retrieve_documents(&index, offset, limit, filter, fields, retrieve_vectors)?; let ret = PaginationView::new(offset, limit, total as usize, documents); @@ -579,13 +589,33 @@ fn some_documents<'a, 't: 'a>( index: &'a Index, rtxn: &'t RoTxn, doc_ids: impl IntoIterator + 'a, + retrieve_vectors: bool, ) -> Result> + 'a, ResponseError> { let fields_ids_map = index.fields_ids_map(rtxn)?; let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); + let embedding_configs = index.embedding_configs(rtxn)?; Ok(index.iter_documents(rtxn, doc_ids)?.map(move |ret| { 
- ret.map_err(ResponseError::from).and_then(|(_key, document)| -> Result<_, ResponseError> { - Ok(milli::obkv_to_json(&all_fields, &fields_ids_map, document)?) + ret.map_err(ResponseError::from).and_then(|(key, document)| -> Result<_, ResponseError> { + let mut document = milli::obkv_to_json(&all_fields, &fields_ids_map, document)?; + + if retrieve_vectors { + let mut vectors = serde_json::Map::new(); + for (name, vector) in index.embeddings(rtxn, key)? { + let user_provided = embedding_configs + .iter() + .find(|conf| conf.name == name) + .is_some_and(|conf| conf.user_provided.contains(key)); + let embeddings = ExplicitVectors { embeddings: vector.into(), user_provided }; + vectors.insert( + name, + serde_json::to_value(embeddings).map_err(MeilisearchHttpError::from)?, + ); + } + document.insert("_vectors".into(), vectors.into()); + } + + Ok(document) }) })) } @@ -596,6 +626,7 @@ fn retrieve_documents>( limit: usize, filter: Option, attributes_to_retrieve: Option>, + retrieve_vectors: bool, ) -> Result<(u64, Vec), ResponseError> { let rtxn = index.read_txn()?; let filter = &filter; @@ -620,53 +651,58 @@ fn retrieve_documents>( let (it, number_of_documents) = { let number_of_documents = candidates.len(); ( - some_documents(index, &rtxn, candidates.into_iter().skip(offset).take(limit))?, + some_documents( + index, + &rtxn, + candidates.into_iter().skip(offset).take(limit), + retrieve_vectors, + )?, number_of_documents, ) }; - let documents: Result, ResponseError> = it + let documents: Vec<_> = it .map(|document| { Ok(match &attributes_to_retrieve { Some(attributes_to_retrieve) => permissive_json_pointer::select_values( &document?, - attributes_to_retrieve.iter().map(|s| s.as_ref()), + attributes_to_retrieve + .iter() + .map(|s| s.as_ref()) + .chain(retrieve_vectors.then_some("_vectors")), ), None => document?, }) }) - .collect(); + .collect::>()?; - Ok((number_of_documents, documents?)) + Ok((number_of_documents, documents)) } fn retrieve_document>( index: &Index, 
doc_id: &str, attributes_to_retrieve: Option>, + retrieve_vectors: bool, ) -> Result { let txn = index.read_txn()?; - let fields_ids_map = index.fields_ids_map(&txn)?; - let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); - let internal_id = index .external_documents_ids() .get(&txn, doc_id)? .ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?; - let document = index - .documents(&txn, std::iter::once(internal_id))? - .into_iter() + let document = some_documents(index, &txn, Some(internal_id), retrieve_vectors)? .next() - .map(|(_, d)| d) - .ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?; + .ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))??; - let document = meilisearch_types::milli::obkv_to_json(&all_fields, &fields_ids_map, document)?; let document = match &attributes_to_retrieve { Some(attributes_to_retrieve) => permissive_json_pointer::select_values( &document, - attributes_to_retrieve.iter().map(|s| s.as_ref()), + attributes_to_retrieve + .iter() + .map(|s| s.as_ref()) + .chain(retrieve_vectors.then_some("_vectors")), ), None => document, }; diff --git a/meilisearch/tests/common/index.rs b/meilisearch/tests/common/index.rs index 3ac33b4e9..f81fe8c8a 100644 --- a/meilisearch/tests/common/index.rs +++ b/meilisearch/tests/common/index.rs @@ -182,14 +182,10 @@ impl Index<'_> { self.service.get(url).await } - pub async fn get_document( - &self, - id: u64, - options: Option, - ) -> (Value, StatusCode) { + pub async fn get_document(&self, id: u64, options: Option) -> (Value, StatusCode) { let mut url = format!("/indexes/{}/documents/{}", urlencode(self.uid.as_ref()), id); - if let Some(fields) = options.and_then(|o| o.fields) { - let _ = write!(url, "?fields={}", fields.join(",")); + if let Some(options) = options { + write!(url, "?{}", yaup::to_string(&options).unwrap()).unwrap(); } self.service.get(url).await } @@ -205,18 +201,11 @@ impl Index<'_> { } pub async fn 
get_all_documents(&self, options: GetAllDocumentsOptions) -> (Value, StatusCode) { - let mut url = format!("/indexes/{}/documents?", urlencode(self.uid.as_ref())); - if let Some(limit) = options.limit { - let _ = write!(url, "limit={}&", limit); - } - - if let Some(offset) = options.offset { - let _ = write!(url, "offset={}&", offset); - } - - if let Some(attributes_to_retrieve) = options.attributes_to_retrieve { - let _ = write!(url, "fields={}&", attributes_to_retrieve.join(",")); - } + let url = format!( + "/indexes/{}/documents?{}", + urlencode(self.uid.as_ref()), + yaup::to_string(&options).unwrap() + ); self.service.get(url).await } @@ -435,13 +424,11 @@ impl Index<'_> { } } -pub struct GetDocumentOptions { - pub fields: Option>, -} - -#[derive(Debug, Default)] +#[derive(Debug, Default, serde::Serialize)] +#[serde(rename_all = "camelCase")] pub struct GetAllDocumentsOptions { pub limit: Option, pub offset: Option, - pub attributes_to_retrieve: Option>, + pub fields: Option>, + pub retrieve_vectors: bool, } diff --git a/meilisearch/tests/common/mod.rs b/meilisearch/tests/common/mod.rs index 3117dd185..317e5e171 100644 --- a/meilisearch/tests/common/mod.rs +++ b/meilisearch/tests/common/mod.rs @@ -6,7 +6,7 @@ pub mod service; use std::fmt::{self, Display}; #[allow(unused)] -pub use index::{GetAllDocumentsOptions, GetDocumentOptions}; +pub use index::GetAllDocumentsOptions; use meili_snap::json_string; use serde::{Deserialize, Serialize}; #[allow(unused)] diff --git a/meilisearch/tests/documents/errors.rs b/meilisearch/tests/documents/errors.rs index cd2d89813..cd1be4dc4 100644 --- a/meilisearch/tests/documents/errors.rs +++ b/meilisearch/tests/documents/errors.rs @@ -795,3 +795,27 @@ async fn fetch_document_by_filter() { } "###); } + +#[actix_rt::test] +async fn retrieve_vectors() { + let server = Server::new().await; + let index = server.index("doggo"); + let (response, _code) = index.get_all_documents_raw("?retrieveVectors=tamo").await; + 
snapshot!(json_string!(response), @r###" + { + "message": "Invalid value in parameter `retrieveVectors`: could not parse `tamo` as a boolean, expected either `true` or `false`", + "code": "invalid_document_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_document_retrieve_vectors" + } + "###); + let (response, _code) = index.get_document(0, Some(json!({"retrieveVectors": "tamo"}))).await; + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value in parameter `retrieveVectors`: could not parse `tamo` as a boolean, expected either `true` or `false`", + "code": "invalid_document_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_document_retrieve_vectors" + } + "###); +} diff --git a/meilisearch/tests/documents/get_documents.rs b/meilisearch/tests/documents/get_documents.rs index 3b0629fcb..1ade00b06 100644 --- a/meilisearch/tests/documents/get_documents.rs +++ b/meilisearch/tests/documents/get_documents.rs @@ -4,7 +4,7 @@ use meili_snap::*; use urlencoding::encode as urlencode; use crate::common::encoder::Encoder; -use crate::common::{GetAllDocumentsOptions, GetDocumentOptions, Server, Value}; +use crate::common::{GetAllDocumentsOptions, Server, Value}; use crate::json; // TODO: partial test since we are testing error, amd error is not yet fully implemented in @@ -59,8 +59,7 @@ async fn get_document() { }) ); - let (response, code) = - index.get_document(0, Some(GetDocumentOptions { fields: Some(vec!["id"]) })).await; + let (response, code) = index.get_document(0, Some(json!({ "fields": ["id"] }))).await; assert_eq!(code, 200); assert_eq!( response, @@ -69,9 +68,8 @@ async fn get_document() { }) ); - let (response, code) = index - .get_document(0, Some(GetDocumentOptions { fields: Some(vec!["nested.content"]) })) - .await; + let (response, code) = + index.get_document(0, Some(json!({ "fields": ["nested.content"] }))).await; assert_eq!(code, 200); 
assert_eq!( response, @@ -211,7 +209,7 @@ async fn test_get_all_documents_attributes_to_retrieve() { let (response, code) = index .get_all_documents(GetAllDocumentsOptions { - attributes_to_retrieve: Some(vec!["name"]), + fields: Some(vec!["name"]), ..Default::default() }) .await; @@ -225,9 +223,21 @@ async fn test_get_all_documents_attributes_to_retrieve() { assert_eq!(response["limit"], json!(20)); assert_eq!(response["total"], json!(77)); + let (response, code) = index + .get_all_documents(GetAllDocumentsOptions { fields: Some(vec![]), ..Default::default() }) + .await; + assert_eq!(code, 200); + assert_eq!(response["results"].as_array().unwrap().len(), 20); + for results in response["results"].as_array().unwrap() { + assert_eq!(results.as_object().unwrap().keys().count(), 0); + } + assert_eq!(response["offset"], json!(0)); + assert_eq!(response["limit"], json!(20)); + assert_eq!(response["total"], json!(77)); + let (response, code) = index .get_all_documents(GetAllDocumentsOptions { - attributes_to_retrieve: Some(vec![]), + fields: Some(vec!["wrong"]), ..Default::default() }) .await; @@ -242,22 +252,7 @@ async fn test_get_all_documents_attributes_to_retrieve() { let (response, code) = index .get_all_documents(GetAllDocumentsOptions { - attributes_to_retrieve: Some(vec!["wrong"]), - ..Default::default() - }) - .await; - assert_eq!(code, 200); - assert_eq!(response["results"].as_array().unwrap().len(), 20); - for results in response["results"].as_array().unwrap() { - assert_eq!(results.as_object().unwrap().keys().count(), 0); - } - assert_eq!(response["offset"], json!(0)); - assert_eq!(response["limit"], json!(20)); - assert_eq!(response["total"], json!(77)); - - let (response, code) = index - .get_all_documents(GetAllDocumentsOptions { - attributes_to_retrieve: Some(vec!["name", "tags"]), + fields: Some(vec!["name", "tags"]), ..Default::default() }) .await; @@ -270,10 +265,7 @@ async fn test_get_all_documents_attributes_to_retrieve() { } let (response, code) = 
index - .get_all_documents(GetAllDocumentsOptions { - attributes_to_retrieve: Some(vec!["*"]), - ..Default::default() - }) + .get_all_documents(GetAllDocumentsOptions { fields: Some(vec!["*"]), ..Default::default() }) .await; assert_eq!(code, 200); assert_eq!(response["results"].as_array().unwrap().len(), 20); @@ -283,7 +275,7 @@ async fn test_get_all_documents_attributes_to_retrieve() { let (response, code) = index .get_all_documents(GetAllDocumentsOptions { - attributes_to_retrieve: Some(vec!["*", "wrong"]), + fields: Some(vec!["*", "wrong"]), ..Default::default() }) .await; @@ -316,12 +308,10 @@ async fn get_document_s_nested_attributes_to_retrieve() { assert_eq!(code, 202); index.wait_task(1).await; - let (response, code) = - index.get_document(0, Some(GetDocumentOptions { fields: Some(vec!["content"]) })).await; + let (response, code) = index.get_document(0, Some(json!({ "fields": ["content"] }))).await; assert_eq!(code, 200); assert_eq!(response, json!({})); - let (response, code) = - index.get_document(1, Some(GetDocumentOptions { fields: Some(vec!["content"]) })).await; + let (response, code) = index.get_document(1, Some(json!({ "fields": ["content"] }))).await; assert_eq!(code, 200); assert_eq!( response, @@ -333,9 +323,7 @@ async fn get_document_s_nested_attributes_to_retrieve() { }) ); - let (response, code) = index - .get_document(0, Some(GetDocumentOptions { fields: Some(vec!["content.truc"]) })) - .await; + let (response, code) = index.get_document(0, Some(json!({ "fields": ["content.truc"] }))).await; assert_eq!(code, 200); assert_eq!( response, @@ -343,9 +331,7 @@ async fn get_document_s_nested_attributes_to_retrieve() { "content.truc": "foobar", }) ); - let (response, code) = index - .get_document(1, Some(GetDocumentOptions { fields: Some(vec!["content.truc"]) })) - .await; + let (response, code) = index.get_document(1, Some(json!({ "fields": ["content.truc"] }))).await; assert_eq!(code, 200); assert_eq!( response, @@ -540,3 +526,207 @@ async fn 
get_document_by_filter() { } "###); } + +#[actix_rt::test] +async fn get_document_with_vectors() { + let server = Server::new().await; + let index = server.index("doggo"); + let (value, code) = server.set_features(json!({"vectorStore": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r###" + { + "vectorStore": true, + "metrics": false, + "logsRoute": false + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + } + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await; + + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0] }}, + {"id": 1, "name": "echo", "_vectors": { "manual": null }}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(value.uid()).await; + + // by default you shouldn't see the `_vectors` object + let (documents, _code) = index.get_all_documents(Default::default()).await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir" + }, + { + "id": 1, + "name": "echo" + } + ], + "offset": 0, + "limit": 20, + "total": 2 + } + "###); + let (documents, _code) = index.get_document(0, None).await; + snapshot!(json_string!(documents), @r###" + { + "id": 0, + "name": "kefir" + } + "###); + + // if we try to retrieve the vectors with the `fields` parameter they + // still shouldn't be displayed + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { + fields: Some(vec!["name", "_vectors"]), + ..Default::default() + }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "name": "kefir" + }, + { + "name": "echo" + } + ], + "offset": 0, + "limit": 20, + "total": 2 + } + "###); + let (documents, _code) = + index.get_document(0, Some(json!({"fields": ["name", "_vectors"]}))).await; + 
snapshot!(json_string!(documents), @r###" + { + "name": "kefir" + } + "###); + + // If we specify the retrieve vectors boolean and nothing else we should get the vectors + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "userProvided": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": {} + } + ], + "offset": 0, + "limit": 20, + "total": 2 + } + "###); + let (documents, _code) = index.get_document(0, Some(json!({"retrieveVectors": true}))).await; + snapshot!(json_string!(documents), @r###" + { + "id": 0, + "name": "kefir", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "userProvided": true + } + } + } + "###); + + // If we specify the retrieve vectors boolean and exclude vectors form the `fields` we should still get the vectors + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { + retrieve_vectors: true, + fields: Some(vec!["name"]), + ..Default::default() + }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "name": "kefir", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "userProvided": true + } + } + }, + { + "name": "echo", + "_vectors": {} + } + ], + "offset": 0, + "limit": 20, + "total": 2 + } + "###); + let (documents, _code) = + index.get_document(0, Some(json!({"retrieveVectors": true, "fields": ["name"]}))).await; + snapshot!(json_string!(documents), @r###" + { + "name": "kefir", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "userProvided": true + } + } + } + "###); +} From 2cdcb703d9995f1f5f59f9dc60c47830bea2bdb9 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 6 Jun 2024 10:41:16 +0200 Subject: [PATCH 22/44] fix the 
deletion of vectors and add a test --- meilisearch/tests/integration.rs | 1 + meilisearch/tests/vector/mod.rs | 149 ++++++++++++++++++ .../extract/extract_vector_points.rs | 3 +- 3 files changed, 152 insertions(+), 1 deletion(-) create mode 100644 meilisearch/tests/vector/mod.rs diff --git a/meilisearch/tests/integration.rs b/meilisearch/tests/integration.rs index bb77ecc63..78da9825a 100644 --- a/meilisearch/tests/integration.rs +++ b/meilisearch/tests/integration.rs @@ -13,6 +13,7 @@ mod snapshot; mod stats; mod swap_indexes; mod tasks; +mod vector; // Tests are isolated by features in different modules to allow better readability, test // targetability, and improved incremental compilation times. diff --git a/meilisearch/tests/vector/mod.rs b/meilisearch/tests/vector/mod.rs new file mode 100644 index 000000000..b4350116f --- /dev/null +++ b/meilisearch/tests/vector/mod.rs @@ -0,0 +1,149 @@ +use meili_snap::{json_string, snapshot}; + +use crate::common::{GetAllDocumentsOptions, Server}; +use crate::json; + +#[actix_rt::test] +async fn add_remove_user_provided() { + let server = Server::new().await; + let index = server.index("doggo"); + let (value, code) = server.set_features(json!({"vectorStore": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r###" + { + "vectorStore": true, + "metrics": false, + "logsRoute": false + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + } + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await; + + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0] }}, + {"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1] }}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(value.uid()).await; + + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { 
retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "userProvided": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "manual": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "userProvided": true + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 2 + } + "###); + + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [10, 10, 10] }}, + {"id": 1, "name": "echo", "_vectors": { "manual": null }}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(value.uid()).await; + + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "manual": { + "embeddings": [ + [ + 10.0, + 10.0, + 10.0 + ] + ], + "userProvided": true + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": {} + } + ], + "offset": 0, + "limit": 20, + "total": 2 + } + "###); + + let (value, code) = index.delete_document(0).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(value.uid()).await; + + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 1, + "name": "echo", + "_vectors": {} + } + ], + "offset": 0, + "limit": 20, + "total": 1 + } + "###); +} diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index 88c42864e..964cb35e8 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ 
b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -63,7 +63,8 @@ impl VectorStateDelta { VectorStateDelta::NoChange => Default::default(), VectorStateDelta::NowRemoved => (true, Default::default(), Default::default()), VectorStateDelta::WasGeneratedNowManual(add) => (true, Default::default(), add), - VectorStateDelta::ManualDelta(add) => (false, Default::default(), add), + // We always delete the previous vectors + VectorStateDelta::ManualDelta(add) => (true, Default::default(), add), VectorStateDelta::NowGenerated(prompt) => (true, prompt, Default::default()), } } From 63dded3961863a319c2c14e40f6506241574ca9d Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 6 Jun 2024 11:29:16 +0200 Subject: [PATCH 23/44] implements the new analytics for the get documents routes --- meilisearch/src/analytics/mod.rs | 4 ++-- meilisearch/src/analytics/segment_analytics.rs | 17 +++++++++++++---- meilisearch/src/routes/indexes/documents.rs | 7 ++++++- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/meilisearch/src/analytics/mod.rs b/meilisearch/src/analytics/mod.rs index 3468ad2c7..6863dc57b 100644 --- a/meilisearch/src/analytics/mod.rs +++ b/meilisearch/src/analytics/mod.rs @@ -74,8 +74,8 @@ pub enum DocumentDeletionKind { #[derive(Copy, Clone, Debug, PartialEq, Eq)] pub enum DocumentFetchKind { - PerDocumentId, - Normal { with_filter: bool, limit: usize, offset: usize }, + PerDocumentId { retrieve_vectors: bool }, + Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool }, } pub trait Analytics: Sync + Send { diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 6e91b99b0..56a781c47 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -1542,6 +1542,9 @@ pub struct DocumentsFetchAggregator { // if a filter was used per_filter: bool, + #[serde(rename = "vector.retrieve_vectors")] + retrieve_vectors: bool, 
+ // pagination #[serde(rename = "pagination.max_limit")] max_limit: usize, @@ -1551,18 +1554,21 @@ pub struct DocumentsFetchAggregator { impl DocumentsFetchAggregator { pub fn from_query(query: &DocumentFetchKind, request: &HttpRequest) -> Self { - let (limit, offset) = match query { - DocumentFetchKind::PerDocumentId => (1, 0), - DocumentFetchKind::Normal { limit, offset, .. } => (*limit, *offset), + let (limit, offset, retrieve_vectors) = match query { + DocumentFetchKind::PerDocumentId { retrieve_vectors } => (1, 0, *retrieve_vectors), + DocumentFetchKind::Normal { limit, offset, retrieve_vectors, .. } => { + (*limit, *offset, *retrieve_vectors) + } }; Self { timestamp: Some(OffsetDateTime::now_utc()), user_agents: extract_user_agents(request).into_iter().collect(), total_received: 1, - per_document_id: matches!(query, DocumentFetchKind::PerDocumentId), + per_document_id: matches!(query, DocumentFetchKind::PerDocumentId { .. }), per_filter: matches!(query, DocumentFetchKind::Normal { with_filter, .. 
} if *with_filter), max_limit: limit, max_offset: offset, + retrieve_vectors, } } @@ -1576,6 +1582,7 @@ impl DocumentsFetchAggregator { per_filter, max_limit, max_offset, + retrieve_vectors, } = other; if self.timestamp.is_none() { @@ -1591,6 +1598,8 @@ impl DocumentsFetchAggregator { self.max_limit = self.max_limit.max(max_limit); self.max_offset = self.max_offset.max(max_offset); + + self.retrieve_vectors |= retrieve_vectors; } pub fn into_event(self, user: &User, event_name: &str) -> Option { diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index 97ded8069..81e297d54 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -110,7 +110,10 @@ pub async fn get_document( debug!(parameters = ?params, "Get document"); let index_uid = IndexUid::try_from(index_uid)?; - analytics.get_fetch_documents(&DocumentFetchKind::PerDocumentId, &req); + analytics.get_fetch_documents( + &DocumentFetchKind::PerDocumentId { retrieve_vectors: params.retrieve_vectors.0 }, + &req, + ); let GetDocument { fields, retrieve_vectors } = params.into_inner(); let attributes_to_retrieve = fields.merge_star_and_none(); @@ -193,6 +196,7 @@ pub async fn documents_by_query_post( with_filter: body.filter.is_some(), limit: body.limit, offset: body.offset, + retrieve_vectors: body.retrieve_vectors, }, &req, ); @@ -232,6 +236,7 @@ pub async fn get_documents( with_filter: query.filter.is_some(), limit: query.limit, offset: query.offset, + retrieve_vectors: query.retrieve_vectors, }, &req, ); From 734d1c53ad2488aba411554cabe5232f8cdb1d5a Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 6 Jun 2024 16:31:07 +0200 Subject: [PATCH 24/44] fix a panic in yaup --- meilisearch/tests/common/index.rs | 2 +- meilisearch/tests/documents/get_documents.rs | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/meilisearch/tests/common/index.rs b/meilisearch/tests/common/index.rs index 
f81fe8c8a..114ede9b8 100644 --- a/meilisearch/tests/common/index.rs +++ b/meilisearch/tests/common/index.rs @@ -429,6 +429,6 @@ impl Index<'_> { pub struct GetAllDocumentsOptions { pub limit: Option, pub offset: Option, - pub fields: Option>, pub retrieve_vectors: bool, + pub fields: Option>, } diff --git a/meilisearch/tests/documents/get_documents.rs b/meilisearch/tests/documents/get_documents.rs index 1ade00b06..3bf3727c4 100644 --- a/meilisearch/tests/documents/get_documents.rs +++ b/meilisearch/tests/documents/get_documents.rs @@ -223,9 +223,7 @@ async fn test_get_all_documents_attributes_to_retrieve() { assert_eq!(response["limit"], json!(20)); assert_eq!(response["total"], json!(77)); - let (response, code) = index - .get_all_documents(GetAllDocumentsOptions { fields: Some(vec![]), ..Default::default() }) - .await; + let (response, code) = index.get_all_documents_raw("?fields=").await; assert_eq!(code, 200); assert_eq!(response["results"].as_array().unwrap().len(), 20); for results in response["results"].as_array().unwrap() { From 0502b175017119610dac4034eb3524d7b551912f Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 10 Jun 2024 10:52:49 +0200 Subject: [PATCH 25/44] log the state of the index-scheduler in all failed tests --- index-scheduler/src/lib.rs | 29 ++++++++++++++++++++--------- 1 file changed, 20 insertions(+), 9 deletions(-) diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 50fc619d8..c5ae1c31f 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -1836,6 +1836,7 @@ mod tests { assert_eq!(breakpoint, (Init, false)); let index_scheduler_handle = IndexSchedulerHandle { _tempdir: tempdir, + index_scheduler: index_scheduler.private_clone(), test_breakpoint_rcv: receiver, last_breakpoint: breakpoint.0, }; @@ -1924,6 +1925,7 @@ mod tests { pub struct IndexSchedulerHandle { _tempdir: TempDir, + index_scheduler: IndexScheduler, test_breakpoint_rcv: crossbeam::channel::Receiver<(Breakpoint, bool)>, 
last_breakpoint: Breakpoint, } @@ -1941,9 +1943,13 @@ mod tests { { Ok(b) => b, Err(RecvTimeoutError::Timeout) => { - panic!("The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint.") + let state = snapshot_index_scheduler(&self.index_scheduler); + panic!("The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint.\n{state}") + } + Err(RecvTimeoutError::Disconnected) => { + let state = snapshot_index_scheduler(&self.index_scheduler); + panic!("The scheduler crashed.\n{state}") } - Err(RecvTimeoutError::Disconnected) => panic!("The scheduler crashed."), }; // if we've already encountered a breakpoint we're supposed to be stuck on the false // and we expect the same variant with the true to come now. @@ -1962,9 +1968,13 @@ mod tests { { Ok(b) => b, Err(RecvTimeoutError::Timeout) => { - panic!("The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint.") + let state = snapshot_index_scheduler(&self.index_scheduler); + panic!("The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint.\n{state}") + } + Err(RecvTimeoutError::Disconnected) => { + let state = snapshot_index_scheduler(&self.index_scheduler); + panic!("The scheduler crashed.\n{state}") } - Err(RecvTimeoutError::Disconnected) => panic!("The scheduler crashed."), }; assert!(!b, "Found the breakpoint handle in a bad state. 
Check your test suite"); @@ -1978,9 +1988,10 @@ mod tests { fn advance_till(&mut self, breakpoints: impl IntoIterator) { for breakpoint in breakpoints { let b = self.advance(); + let state = snapshot_index_scheduler(&self.index_scheduler); assert_eq!( b, breakpoint, - "Was expecting the breakpoint `{:?}` but instead got `{:?}`.", + "Was expecting the breakpoint `{:?}` but instead got `{:?}`.\n{state}", breakpoint, b ); } @@ -2013,8 +2024,8 @@ mod tests { InsideProcessBatch => (), // the batch went successfully, we can stop the loop and go on with the next states. ProcessBatchSucceeded => break, - AbortedIndexation => panic!("The batch was aborted."), - ProcessBatchFailed => panic!("The batch failed."), + AbortedIndexation => panic!("The batch was aborted.\n{}", snapshot_index_scheduler(&self.index_scheduler)), + ProcessBatchFailed => panic!("The batch failed.\n{}", snapshot_index_scheduler(&self.index_scheduler)), breakpoint => panic!("Encountered an impossible breakpoint `{:?}`, this is probably an issue with the test suite.", breakpoint), } } @@ -2033,8 +2044,8 @@ mod tests { InsideProcessBatch => (), // the batch went failed, we can stop the loop and go on with the next states. ProcessBatchFailed => break, - ProcessBatchSucceeded => panic!("The batch succeeded. (and it wasn't supposed to sorry)"), - AbortedIndexation => panic!("The batch was aborted."), + ProcessBatchSucceeded => panic!("The batch succeeded. 
(and it wasn't supposed to sorry)\n{}", snapshot_index_scheduler(&self.index_scheduler)), + AbortedIndexation => panic!("The batch was aborted.\n{}", snapshot_index_scheduler(&self.index_scheduler)), breakpoint => panic!("Encountered an impossible breakpoint `{:?}`, this is probably an issue with the test suite.", breakpoint), } } From 600e97d9dcec39588bd4c305a607fc0025620e15 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 10 Jun 2024 18:26:12 +0200 Subject: [PATCH 26/44] gate the retrieveVectors parameter behind the vectors feature flag --- meilisearch/src/routes/indexes/documents.rs | 24 +++++++++--- meilisearch/src/routes/indexes/search.rs | 8 ++-- meilisearch/tests/documents/errors.rs | 43 +++++++++++++++++++++ meilisearch/tests/search/mod.rs | 41 ++++++++++++++------ 4 files changed, 96 insertions(+), 20 deletions(-) diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index 81e297d54..70623bb35 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -110,14 +110,18 @@ pub async fn get_document( debug!(parameters = ?params, "Get document"); let index_uid = IndexUid::try_from(index_uid)?; - analytics.get_fetch_documents( - &DocumentFetchKind::PerDocumentId { retrieve_vectors: params.retrieve_vectors.0 }, - &req, - ); - let GetDocument { fields, retrieve_vectors } = params.into_inner(); let attributes_to_retrieve = fields.merge_star_and_none(); + let features = index_scheduler.features(); + if retrieve_vectors.0 { + features.check_vector("Passing `retrieveVectors` as a parameter")?; + } + analytics.get_fetch_documents( + &DocumentFetchKind::PerDocumentId { retrieve_vectors: retrieve_vectors.0 }, + &req, + ); + let index = index_scheduler.index(&index_uid)?; let document = retrieve_document(&index, &document_id, attributes_to_retrieve, retrieve_vectors.0)?; @@ -191,6 +195,11 @@ pub async fn documents_by_query_post( let body = body.into_inner(); debug!(parameters 
= ?body, "Get documents POST"); + let features = index_scheduler.features(); + if body.retrieve_vectors { + features.check_vector("Passing `retrieveVectors` as a parameter")?; + } + analytics.post_fetch_documents( &DocumentFetchKind::Normal { with_filter: body.filter.is_some(), @@ -215,6 +224,11 @@ pub async fn get_documents( let BrowseQueryGet { limit, offset, fields, retrieve_vectors, filter } = params.into_inner(); + let features = index_scheduler.features(); + if retrieve_vectors.0 { + features.check_vector("Passing `retrieveVectors` as a parameter")?; + } + let filter = match filter { Some(f) => match serde_json::from_str(&f) { Ok(v) => Some(v), diff --git a/meilisearch/src/routes/indexes/search.rs b/meilisearch/src/routes/indexes/search.rs index ae6402cf6..6fdff4568 100644 --- a/meilisearch/src/routes/indexes/search.rs +++ b/meilisearch/src/routes/indexes/search.rs @@ -290,11 +290,13 @@ pub fn search_kind( features: RoFeatures, ) -> Result { if query.vector.is_some() { - features.check_vector("Passing `vector` as a query parameter")?; + features.check_vector("Passing `vector` as a parameter")?; } - if query.hybrid.is_some() { - features.check_vector("Passing `hybrid` as a query parameter")?; + features.check_vector("Passing `hybrid` as a parameter")?; + } + if query.retrieve_vectors { + features.check_vector("Passing `retrieveVectors` as a parameter")?; } // regardless of anything, always do a keyword search when we don't have a vector and the query is whitespace or missing diff --git a/meilisearch/tests/documents/errors.rs b/meilisearch/tests/documents/errors.rs index cd1be4dc4..8e9a3a696 100644 --- a/meilisearch/tests/documents/errors.rs +++ b/meilisearch/tests/documents/errors.rs @@ -800,6 +800,8 @@ async fn fetch_document_by_filter() { async fn retrieve_vectors() { let server = Server::new().await; let index = server.index("doggo"); + + // GET ALL DOCUMENTS BY QUERY let (response, _code) = index.get_all_documents_raw("?retrieveVectors=tamo").await; 
snapshot!(json_string!(response), @r###" { @@ -809,6 +811,38 @@ async fn retrieve_vectors() { "link": "https://docs.meilisearch.com/errors#invalid_document_retrieve_vectors" } "###); + let (response, _code) = index.get_all_documents_raw("?retrieveVectors=true").await; + snapshot!(json_string!(response), @r###" + { + "message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677", + "code": "feature_not_enabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#feature_not_enabled" + } + "###); + + // FETCH ALL DOCUMENTS BY POST + let (response, _code) = + index.get_document_by_filter(json!({ "retrieveVectors": "tamo" })).await; + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found a string: `\"tamo\"`", + "code": "invalid_document_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_document_retrieve_vectors" + } + "###); + let (response, _code) = index.get_document_by_filter(json!({ "retrieveVectors": true })).await; + snapshot!(json_string!(response), @r###" + { + "message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. 
See https://github.com/meilisearch/product/discussions/677", + "code": "feature_not_enabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#feature_not_enabled" + } + "###); + + // GET A SINGLE DOCUMENT let (response, _code) = index.get_document(0, Some(json!({"retrieveVectors": "tamo"}))).await; snapshot!(json_string!(response), @r###" { @@ -818,4 +852,13 @@ async fn retrieve_vectors() { "link": "https://docs.meilisearch.com/errors#invalid_document_retrieve_vectors" } "###); + let (response, _code) = index.get_document(0, Some(json!({"retrieveVectors": true}))).await; + snapshot!(json_string!(response), @r###" + { + "message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677", + "code": "feature_not_enabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#feature_not_enabled" + } + "###); } diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs index 9e19fa4e8..19e495edd 100644 --- a/meilisearch/tests/search/mod.rs +++ b/meilisearch/tests/search/mod.rs @@ -1290,21 +1290,38 @@ async fn experimental_feature_vector_store() { index.add_documents(json!(documents), None).await; index.wait_task(0).await; - let (response, code) = index - .search_post(json!({ + index + .search(json!({ "vector": [1.0, 2.0, 3.0], "showRankingScore": true - })) + }), |response, code|{ + meili_snap::snapshot!(code, @"400 Bad Request"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "message": "Passing `vector` as a parameter requires enabling the `vector store` experimental feature. 
See https://github.com/meilisearch/product/discussions/677", + "code": "feature_not_enabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#feature_not_enabled" + } + "###); + }) + .await; + index + .search(json!({ + "retrieveVectors": true, + "showRankingScore": true + }), |response, code|{ + meili_snap::snapshot!(code, @"400 Bad Request"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677", + "code": "feature_not_enabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#feature_not_enabled" + } + "###); + }) .await; - meili_snap::snapshot!(code, @"400 Bad Request"); - meili_snap::snapshot!(meili_snap::json_string!(response), @r###" - { - "message": "Passing `vector` as a query parameter requires enabling the `vector store` experimental feature. 
See https://github.com/meilisearch/product/discussions/677", - "code": "feature_not_enabled", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#feature_not_enabled" - } - "###); let (response, code) = server.set_features(json!({"vectorStore": true})).await; meili_snap::snapshot!(code, @"200 OK"); From 7cef2299cf0642d846246e6687193484f8f7fc03 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 11 Jun 2024 09:42:45 +0200 Subject: [PATCH 27/44] Fix behavior when removing a document --- milli/src/update/clear_documents.rs | 7 +++++++ .../index_documents/extract/extract_vector_points.rs | 1 + 2 files changed, 8 insertions(+) diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 3490b55e4..9eca378a5 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -64,6 +64,13 @@ impl<'t, 'i> ClearDocuments<'t, 'i> { self.index.delete_geo_rtree(self.wtxn)?; self.index.delete_geo_faceted_documents_ids(self.wtxn)?; + // Remove all user-provided bits from the configs + let mut configs = self.index.embedding_configs(self.wtxn)?; + for config in configs.iter_mut() { + config.user_provided.clear(); + } + self.index.put_embedding_configs(self.wtxn, configs)?; + // Clear the other databases. 
external_documents_ids.clear(self.wtxn)?; word_docids.clear(self.wtxn)?; diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index 964cb35e8..48e3e697a 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -225,6 +225,7 @@ pub fn extract_vector_points( } else if document_is_kept && old.is_none() { VectorStateDelta::NoChange } else { + remove_from_user_provided.insert(docid); VectorStateDelta::NowRemoved } } From 3493093c4f4df2889c5fc895fd372f7e5ea2cf50 Mon Sep 17 00:00:00 2001 From: Tamo Date: Tue, 11 Jun 2024 16:03:45 +0200 Subject: [PATCH 28/44] add a batch of tests --- index-scheduler/src/lib.rs | 176 +++++++++++++++++++++++++-- meilisearch/tests/vector/mod.rs | 78 ++++++++++++ meilisearch/tests/vector/settings.rs | 161 ++++++++++++++++++++++++ 3 files changed, 407 insertions(+), 8 deletions(-) create mode 100644 meilisearch/tests/vector/settings.rs diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index c5ae1c31f..e2a6f03a0 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -2016,6 +2016,7 @@ mod tests { // Wait for one successful batch. #[track_caller] fn advance_one_successful_batch(&mut self) { + self.index_scheduler.assert_internally_consistent(); self.advance_till([Start, BatchCreated]); loop { match self.advance() { @@ -2025,12 +2026,16 @@ mod tests { // the batch went successfully, we can stop the loop and go on with the next states. 
ProcessBatchSucceeded => break, AbortedIndexation => panic!("The batch was aborted.\n{}", snapshot_index_scheduler(&self.index_scheduler)), - ProcessBatchFailed => panic!("The batch failed.\n{}", snapshot_index_scheduler(&self.index_scheduler)), + ProcessBatchFailed => { + while self.advance() != Start {} + panic!("The batch failed.\n{}", snapshot_index_scheduler(&self.index_scheduler)) + }, breakpoint => panic!("Encountered an impossible breakpoint `{:?}`, this is probably an issue with the test suite.", breakpoint), } } self.advance_till([AfterProcessing]); + self.index_scheduler.assert_internally_consistent(); } // Wait for one failed batch. @@ -5012,7 +5017,6 @@ mod tests { false, ) .unwrap(); - index_scheduler.assert_internally_consistent(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_settings_task_vectors"); @@ -5105,7 +5109,6 @@ mod tests { false, ) .unwrap(); - index_scheduler.assert_internally_consistent(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after adding Intel"); @@ -5180,7 +5183,6 @@ mod tests { false, ) .unwrap(); - index_scheduler.assert_internally_consistent(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "Intel to kefir"); @@ -5303,9 +5305,7 @@ mod tests { false, ) .unwrap(); - index_scheduler.assert_internally_consistent(); handle.advance_one_successful_batch(); - index_scheduler.assert_internally_consistent(); let index = index_scheduler.index("doggos").unwrap(); let rtxn = index.read_txn().unwrap(); @@ -5452,9 +5452,7 @@ mod tests { false, ) .unwrap(); - index_scheduler.assert_internally_consistent(); handle.advance_one_successful_batch(); - index_scheduler.assert_internally_consistent(); // the document with the id 3 should have its original embedding updated let rtxn = index.read_txn().unwrap(); @@ -5481,4 +5479,166 @@ mod tests { assert!(!embedding.is_empty()); } + + #[test] + fn delete_document_containing_vector() { + // 1. Add an embedder + // 2. 
Push two documents containing a simple vector + // 3. Delete the second document (id 1) + // 4. The user defined roaring bitmap shouldn't contain the id of the deleted document anymore + // 5. Clear the index + // 6. The user defined roaring bitmap shouldn't contain the id of the remaining document + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let setting = meilisearch_types::settings::Settings:: { + embedders: Setting::Set(maplit::btreemap! { + S("manual") => Setting::Set(EmbeddingSettings { + source: Setting::Set(milli::vector::settings::EmbedderSource::UserProvided), + dimensions: Setting::Set(3), + ..Default::default() + }) + }), + ..Default::default() + }; + index_scheduler + .register( + KindWithContent::SettingsUpdate { + index_uid: S("doggos"), + new_settings: Box::new(setting), + is_deletion: false, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + let content = serde_json::json!( + [ + { + "id": 0, + "doggo": "kefir", + "_vectors": { + "manual": vec![0, 0, 0], + } + }, + { + "id": 1, + "doggo": "intel", + "_vectors": { + "manual": vec![1, 1, 1], + } + }, + ] + ); + + let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0_u128).unwrap(); + let documents_count = + read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file) + .unwrap(); + snapshot!(documents_count, @"2"); + file.persist().unwrap(); + + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: None, + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: false, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + index_scheduler + .register( + KindWithContent::DocumentDeletion { + index_uid: S("doggos"), + documents_ids: vec![S("1")], + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + let index = 
index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"}]"###); + let conf = index.embedding_configs(&rtxn).unwrap(); + // TODO: Here the user provided vectors should NOT contain 1 + snapshot!(format!("{conf:#?}"), @r###" + [ + IndexEmbeddingConfig { + name: "manual", + config: EmbeddingConfig { + embedder_options: UserProvided( + EmbedderOptions { + dimensions: 3, + distribution: None, + }, + ), + prompt: PromptData { + template: "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}", + }, + }, + user_provided: RoaringBitmap<[0, 1]>, + }, + ] + "###); + let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap(); + let embeddings = index.embeddings(&rtxn, docid).unwrap(); + let embedding = &embeddings["manual"]; + assert!(!embedding.is_empty(), "{embedding:?}"); + + index_scheduler + .register(KindWithContent::DocumentClear { index_uid: S("doggos") }, None, false) + .unwrap(); + handle.advance_one_successful_batch(); + + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + snapshot!(serde_json::to_string(&documents).unwrap(), @"[]"); + let conf = index.embedding_configs(&rtxn).unwrap(); + // TODO: Here the user provided vectors should contain nothing + snapshot!(format!("{conf:#?}"), @r###" + [ + IndexEmbeddingConfig { + name: "manual", + 
config: EmbeddingConfig { + embedder_options: UserProvided( + EmbedderOptions { + dimensions: 3, + distribution: None, + }, + ), + prompt: PromptData { + template: "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}", + }, + }, + user_provided: RoaringBitmap<[0, 1]>, + }, + ] + "###); + } } diff --git a/meilisearch/tests/vector/mod.rs b/meilisearch/tests/vector/mod.rs index b4350116f..55dc186d5 100644 --- a/meilisearch/tests/vector/mod.rs +++ b/meilisearch/tests/vector/mod.rs @@ -1,5 +1,8 @@ +mod settings; + use meili_snap::{json_string, snapshot}; +use crate::common::index::Index; use crate::common::{GetAllDocumentsOptions, Server}; use crate::json; @@ -147,3 +150,78 @@ async fn add_remove_user_provided() { } "###); } + +async fn generate_default_user_provided_documents(server: &Server) -> Index { + let index = server.index("doggo"); + let (value, code) = server.set_features(json!({"vectorStore": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r###" + { + "vectorStore": true, + "metrics": false, + "logsRoute": false + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + } + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await; + + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0] }}, + {"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1] }}, + {"id": 2, "name": "billou", "_vectors": { "manual": [[2, 2, 2], [2, 2, 3]] }}, + {"id": 3, "name": "intel", "_vectors": { "manual": { "userProvided": true, "embeddings": [3, 3, 3] }}}, + {"id": 4, "name": "max", "_vectors": { "manual": { "userProvided": true, "embeddings": [[4, 4, 4], [4, 4, 5]] }}}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(value.uid()).await; + + index +} + +#[actix_rt::test] +async fn 
clear_documents() { + let server = Server::new().await; + let index = generate_default_user_provided_documents(&server).await; + + let (value, _code) = index.clear_all_documents().await; + index.wait_task(value.uid()).await; + + // Make sure the documents DB has been cleared + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [], + "offset": 0, + "limit": 20, + "total": 0 + } + "###); + + // Make sure the arroy DB has been cleared + let (documents, _code) = index.search_post(json!({ "vector": [1, 1, 1] })).await; + snapshot!(json_string!(documents), @r###" + { + "hits": [], + "query": "", + "processingTimeMs": 0, + "limit": 20, + "offset": 0, + "estimatedTotalHits": 0, + "semanticHitCount": 0 + } + "###); +} diff --git a/meilisearch/tests/vector/settings.rs b/meilisearch/tests/vector/settings.rs new file mode 100644 index 000000000..6b93f001e --- /dev/null +++ b/meilisearch/tests/vector/settings.rs @@ -0,0 +1,161 @@ +use meili_snap::{json_string, snapshot}; + +use crate::common::{GetAllDocumentsOptions, Server}; +use crate::json; +use crate::vector::generate_default_user_provided_documents; + +#[actix_rt::test] +async fn update_embedder() { + let server = Server::new().await; + let index = server.index("doggo"); + let (value, code) = server.set_features(json!({"vectorStore": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r###" + { + "vectorStore": true, + "metrics": false, + "logsRoute": false + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { "manual": {}}, + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await; + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 2, + } + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + + let ret = 
server.wait_task(response.uid()).await; + snapshot!(ret, @r###" + { + "uid": 1, + "indexUid": "doggo", + "status": "failed", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 2 + } + } + }, + "error": { + "message": "`.embedders.manual`: Field `model` unavailable for source `userProvided` (only available for sources: `huggingFace`, `openAi`, `ollama`). Available fields: `source`, `dimensions`, `distribution`", + "code": "invalid_settings_embedders", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); +} + +#[actix_rt::test] +async fn reset_embedder_documents() { + let server = Server::new().await; + let index = generate_default_user_provided_documents(&server).await; + + let (response, code) = index.delete_settings().await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await; + + // Make sure the documents are still present + let (documents, _code) = index.get_all_documents(Default::default()).await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir" + }, + { + "id": 1, + "name": "echo" + }, + { + "id": 2, + "name": "billou" + }, + { + "id": 3, + "name": "intel" + }, + { + "id": 4, + "name": "max" + } + ], + "offset": 0, + "limit": 20, + "total": 5 + } + "###); + + // Make sure we are still able to retrieve their vectors + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": {} + }, + { + "id": 1, + "name": "echo", + "_vectors": {} + }, + { + "id": 2, + "name": "billou", + "_vectors": {} + }, + { + "id": 3, + "name": "intel", + "_vectors": {} + }, 
+ { + "id": 4, + "name": "max", + "_vectors": {} + } + ], + "offset": 0, + "limit": 20, + "total": 5 + } + "###); + + // Make sure the arroy DB has been cleared + let (documents, _code) = index.search_post(json!({ "vector": [1, 1, 1] })).await; + snapshot!(json_string!(documents), @r###" + { + "message": "Cannot find embedder with name `default`.", + "code": "invalid_embedder", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_embedder" + } + "###); +} From b368105272a926b85e4848dc86d75c5b25edaf8d Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 12 Jun 2024 14:02:12 +0200 Subject: [PATCH 29/44] Add EmbedderConfigs::into_inner --- milli/src/vector/mod.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/milli/src/vector/mod.rs b/milli/src/vector/mod.rs index 553c8c3c1..c43fa8bd2 100644 --- a/milli/src/vector/mod.rs +++ b/milli/src/vector/mod.rs @@ -152,6 +152,10 @@ impl EmbeddingConfigs { &self.0 } + pub fn into_inner(self) -> HashMap, Arc)> { + self.0 + } + /// Get the name of the default embedder configuration. 
/// /// The default embedder is determined as follows: From e9bf4eb10056ed96bfc1964717d44fd7c54e4487 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 12 Jun 2024 14:02:49 +0200 Subject: [PATCH 30/44] Reformulate ParsedVectorsDiff in terms of VectorState --- milli/src/vector/parsed_vectors.rs | 78 ++++++++++++++++++++++++------ 1 file changed, 64 insertions(+), 14 deletions(-) diff --git a/milli/src/vector/parsed_vectors.rs b/milli/src/vector/parsed_vectors.rs index 501bd2ad2..9007e03e4 100644 --- a/milli/src/vector/parsed_vectors.rs +++ b/milli/src/vector/parsed_vectors.rs @@ -42,9 +42,31 @@ pub struct ExplicitVectors { pub user_provided: bool, } +pub enum VectorState { + Inline(Vectors), + InDb, + Generated, +} + +impl VectorState { + pub fn is_user_provided(&self) -> bool { + match self { + VectorState::Inline(vectors) => vectors.is_user_provided(), + VectorState::InDb => true, + VectorState::Generated => false, + } + } +} + +pub enum VectorsState { + NoVectorsFid, + NoVectorsFieldInDocument, + Vectors(BTreeMap), +} + pub struct ParsedVectorsDiff { - pub old: BTreeMap>, - pub new: Option>, + old: BTreeMap, + new: VectorsState, } impl ParsedVectorsDiff { @@ -71,26 +93,54 @@ impl ParsedVectorsDiff { return Err(error); } } - .flatten().map_or(BTreeMap::default(), |del| del.into_iter().map(|(name, vec)| (name, Some(vec))).collect()); + .flatten().map_or(BTreeMap::default(), |del| del.into_iter().map(|(name, vec)| (name, VectorState::Inline(vec))).collect()); for embedding_config in embedders_configs { if embedding_config.user_provided.contains(docid) { - old.entry(embedding_config.name.to_string()).or_insert(None); + old.entry(embedding_config.name.to_string()).or_insert(VectorState::InDb); } } - let new = new_vectors_fid - .and_then(|vectors_fid| documents_diff.get(vectors_fid)) - .map(KvReaderDelAdd::new) - .map(|obkv| to_vector_map(obkv, DelAdd::Addition)) - .transpose()? 
- .flatten(); + let new = 'new: { + let Some(new_vectors_fid) = new_vectors_fid else { + break 'new VectorsState::NoVectorsFid; + }; + let Some(bytes) = documents_diff.get(new_vectors_fid) else { + break 'new VectorsState::NoVectorsFieldInDocument; + }; + let obkv = KvReaderDelAdd::new(bytes); + match to_vector_map(obkv, DelAdd::Addition)? { + Some(new) => VectorsState::Vectors(new), + None => VectorsState::NoVectorsFieldInDocument, + } + }; + Ok(Self { old, new }) } - /// Return (Some(None), _) in case the vector is user defined and contained in the database. - pub fn remove(&mut self, embedder_name: &str) -> (Option>, Option) { - let old = self.old.remove(embedder_name); - let new = self.new.as_mut().and_then(|new| new.remove(embedder_name)); + pub fn remove(&mut self, embedder_name: &str) -> (VectorState, VectorState) { + let old = self.old.remove(embedder_name).unwrap_or(VectorState::Generated); + let state_from_old = match old { + // assume a userProvided is still userProvided + VectorState::InDb => VectorState::InDb, + // generated is still generated + VectorState::Generated => VectorState::Generated, + // weird case that shouldn't happen where the previous docs version is inline, + // but it was removed in the new version + // Since it is not in the new version, we switch to generated + VectorState::Inline(_) => VectorState::Generated, + }; + let new = match &mut self.new { + VectorsState::Vectors(new) => { + new.remove(embedder_name).map(VectorState::Inline).unwrap_or(state_from_old) + } + _ => + // if no `_vectors` field is present in the new document, + // the state depends on the previous version of the document + { + state_from_old + } + }; + (old, new) } } From d0b05ae691681b7e490b4303bbd56daaf71a0845 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 12 Jun 2024 14:03:40 +0200 Subject: [PATCH 31/44] Add EmbedderAction to settings --- milli/src/vector/settings.rs | 300 ++++++++++++++++++++++++++++------- 1 file changed, 240 insertions(+), 60 
deletions(-) diff --git a/milli/src/vector/settings.rs b/milli/src/vector/settings.rs index e786a7164..edbed462c 100644 --- a/milli/src/vector/settings.rs +++ b/milli/src/vector/settings.rs @@ -1,4 +1,5 @@ use deserr::Deserr; +use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use super::rest::InputType; @@ -72,6 +73,245 @@ pub fn check_unset( } } +/// Indicates what action should take place during a reindexing operation for an embedder +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum ReindexAction { + /// An indexing operation should take place for this embedder, keeping existing vectors + /// and checking whether the document template changed or not + RegeneratePrompts, + /// An indexing operation should take place for all documents for this embedder, removing existing vectors + /// (except userProvided ones) + FullReindex, +} + +pub enum SettingsDiff { + Remove, + Reindex { action: ReindexAction, updated_settings: EmbeddingSettings }, + UpdateWithoutReindex { updated_settings: EmbeddingSettings }, +} + +pub enum EmbedderAction { + WriteBackToDocuments(WriteBackToDocuments), + Reindex(ReindexAction), +} + +pub struct WriteBackToDocuments { + pub embedder_id: u8, + pub user_provided: RoaringBitmap, +} + +impl SettingsDiff { + pub fn should_reindex(&self) -> bool { + match self { + SettingsDiff::Remove { .. } | SettingsDiff::Reindex { .. } => true, + SettingsDiff::UpdateWithoutReindex { .. 
} => false, + } + } + + pub fn from_settings(old: EmbeddingSettings, new: Setting) -> Self { + match new { + Setting::Set(new) => { + let EmbeddingSettings { + mut source, + mut model, + mut revision, + mut api_key, + mut dimensions, + mut document_template, + mut url, + mut query, + mut input_field, + mut path_to_embeddings, + mut embedding_object, + mut input_type, + mut distribution, + } = old; + + let EmbeddingSettings { + source: new_source, + model: new_model, + revision: new_revision, + api_key: new_api_key, + dimensions: new_dimensions, + document_template: new_document_template, + url: new_url, + query: new_query, + input_field: new_input_field, + path_to_embeddings: new_path_to_embeddings, + embedding_object: new_embedding_object, + input_type: new_input_type, + distribution: new_distribution, + } = new; + + let mut reindex_action = None; + + // **Warning**: do not use short-circuiting || here, we want all these operations applied + if source.apply(new_source) { + ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); + // when the source changes, we need to reapply the default settings for the new source + apply_default_for_source( + &source, + &mut model, + &mut revision, + &mut dimensions, + &mut url, + &mut query, + &mut input_field, + &mut path_to_embeddings, + &mut embedding_object, + &mut input_type, + &mut document_template, + ) + } + if model.apply(new_model) { + ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); + } + if revision.apply(new_revision) { + ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); + } + if dimensions.apply(new_dimensions) { + ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); + } + if url.apply(new_url) { + ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); + } + if query.apply(new_query) { + ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); + } + if 
input_field.apply(new_input_field) { + ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); + } + if path_to_embeddings.apply(new_path_to_embeddings) { + ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); + } + if embedding_object.apply(new_embedding_object) { + ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); + } + if input_type.apply(new_input_type) { + ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); + } + if document_template.apply(new_document_template) { + ReindexAction::push_action( + &mut reindex_action, + ReindexAction::RegeneratePrompts, + ); + } + + distribution.apply(new_distribution); + api_key.apply(new_api_key); + + let updated_settings = EmbeddingSettings { + source, + model, + revision, + api_key, + dimensions, + document_template, + url, + query, + input_field, + path_to_embeddings, + embedding_object, + input_type, + distribution, + }; + + match reindex_action { + Some(action) => Self::Reindex { action, updated_settings }, + None => Self::UpdateWithoutReindex { updated_settings }, + } + } + Setting::Reset => Self::Remove, + Setting::NotSet => Self::UpdateWithoutReindex { updated_settings: old }, + } + } +} + +impl ReindexAction { + fn push_action(this: &mut Option, other: Self) { + *this = match (*this, other) { + (_, ReindexAction::FullReindex) => Some(ReindexAction::FullReindex), + (Some(ReindexAction::FullReindex), _) => Some(ReindexAction::FullReindex), + (_, ReindexAction::RegeneratePrompts) => Some(ReindexAction::RegeneratePrompts), + } + } +} + +#[allow(clippy::too_many_arguments)] // private function +fn apply_default_for_source( + source: &Setting, + model: &mut Setting, + revision: &mut Setting, + dimensions: &mut Setting, + url: &mut Setting, + query: &mut Setting, + input_field: &mut Setting>, + path_to_embeddings: &mut Setting>, + embedding_object: &mut Setting>, + input_type: &mut Setting, + document_template: &mut Setting, +) { 
+ match source { + Setting::Set(EmbedderSource::HuggingFace) => { + *model = Setting::Reset; + *revision = Setting::Reset; + *dimensions = Setting::NotSet; + *url = Setting::NotSet; + *query = Setting::NotSet; + *input_field = Setting::NotSet; + *path_to_embeddings = Setting::NotSet; + *embedding_object = Setting::NotSet; + *input_type = Setting::NotSet; + } + Setting::Set(EmbedderSource::Ollama) => { + *model = Setting::Reset; + *revision = Setting::NotSet; + *dimensions = Setting::Reset; + *url = Setting::NotSet; + *query = Setting::NotSet; + *input_field = Setting::NotSet; + *path_to_embeddings = Setting::NotSet; + *embedding_object = Setting::NotSet; + *input_type = Setting::NotSet; + } + Setting::Set(EmbedderSource::OpenAi) | Setting::Reset => { + *model = Setting::Reset; + *revision = Setting::NotSet; + *dimensions = Setting::NotSet; + *url = Setting::NotSet; + *query = Setting::NotSet; + *input_field = Setting::NotSet; + *path_to_embeddings = Setting::NotSet; + *embedding_object = Setting::NotSet; + *input_type = Setting::NotSet; + } + Setting::Set(EmbedderSource::Rest) => { + *model = Setting::NotSet; + *revision = Setting::NotSet; + *dimensions = Setting::Reset; + *url = Setting::Reset; + *query = Setting::Reset; + *input_field = Setting::Reset; + *path_to_embeddings = Setting::Reset; + *embedding_object = Setting::Reset; + *input_type = Setting::Reset; + } + Setting::Set(EmbedderSource::UserProvided) => { + *model = Setting::NotSet; + *revision = Setting::NotSet; + *dimensions = Setting::Reset; + *url = Setting::NotSet; + *query = Setting::NotSet; + *input_field = Setting::NotSet; + *path_to_embeddings = Setting::NotSet; + *embedding_object = Setting::NotSet; + *input_type = Setting::NotSet; + *document_template = Setting::NotSet; + } + Setting::NotSet => {} + } +} + pub fn check_set( key: &Setting, field: &'static str, @@ -210,66 +450,6 @@ impl EmbeddingSettings { *model = Setting::Set(openai::EmbeddingModel::default().name().to_owned()) } } - - 
pub(crate) fn apply_and_need_reindex( - old: &mut Setting, - new: Setting, - ) -> bool { - match (old, new) { - ( - Setting::Set(EmbeddingSettings { - source: old_source, - model: old_model, - revision: old_revision, - api_key: old_api_key, - dimensions: old_dimensions, - document_template: old_document_template, - url: old_url, - query: old_query, - input_field: old_input_field, - path_to_embeddings: old_path_to_embeddings, - embedding_object: old_embedding_object, - input_type: old_input_type, - distribution: old_distribution, - }), - Setting::Set(EmbeddingSettings { - source: new_source, - model: new_model, - revision: new_revision, - api_key: new_api_key, - dimensions: new_dimensions, - document_template: new_document_template, - url: new_url, - query: new_query, - input_field: new_input_field, - path_to_embeddings: new_path_to_embeddings, - embedding_object: new_embedding_object, - input_type: new_input_type, - distribution: new_distribution, - }), - ) => { - let mut needs_reindex = false; - - needs_reindex |= old_source.apply(new_source); - needs_reindex |= old_model.apply(new_model); - needs_reindex |= old_revision.apply(new_revision); - needs_reindex |= old_dimensions.apply(new_dimensions); - needs_reindex |= old_document_template.apply(new_document_template); - needs_reindex |= old_url.apply(new_url); - needs_reindex |= old_query.apply(new_query); - needs_reindex |= old_input_field.apply(new_input_field); - needs_reindex |= old_path_to_embeddings.apply(new_path_to_embeddings); - needs_reindex |= old_embedding_object.apply(new_embedding_object); - needs_reindex |= old_input_type.apply(new_input_type); - - old_distribution.apply(new_distribution); - old_api_key.apply(new_api_key); - needs_reindex - } - (Setting::Reset, Setting::Reset) | (_, Setting::NotSet) => false, - _ => true, - } - } } #[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)] From d18c1f77d7453b2842851a947621cbe5687fdbb5 Mon Sep 17 00:00:00 2001 From: Louis 
Dureuil Date: Wed, 12 Jun 2024 14:04:54 +0200 Subject: [PATCH 32/44] Update embedder configs with a finer granularity - no longer clear vector DB between any two embedder changes --- milli/src/update/settings.rs | 278 +++++++++++++++++++++-------------- 1 file changed, 171 insertions(+), 107 deletions(-) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 08b12d178..5421b64a7 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -23,7 +23,10 @@ use crate::proximity::ProximityPrecision; use crate::update::index_documents::IndexDocumentsMethod; use crate::update::{IndexDocuments, UpdateIndexingStep}; use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME; -use crate::vector::settings::{check_set, check_unset, EmbedderSource, EmbeddingSettings}; +use crate::vector::settings::{ + check_set, check_unset, EmbedderAction, EmbedderSource, EmbeddingSettings, ReindexAction, + WriteBackToDocuments, +}; use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs}; use crate::{FieldId, FieldsIdsMap, Index, Result}; @@ -924,111 +927,177 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { Ok(changed) } - fn update_embedding_configs(&mut self) -> Result { - let update = match std::mem::take(&mut self.embedder_settings) { - Setting::Set(configs) => { - let mut changed = false; + fn update_embedding_configs(&mut self) -> Result> { + match std::mem::take(&mut self.embedder_settings) { + Setting::Set(configs) => self.update_embedding_configs_set(configs), + Setting::Reset => { + // all vectors should be written back to documents let old_configs = self.index.embedding_configs(self.wtxn)?; - let old_configs: BTreeMap, RoaringBitmap)> = - old_configs - .into_iter() - .map( - |IndexEmbeddingConfig { name, config, user_provided: user_defined }| { - (name, (Setting::Set(config.into()), user_defined)) - }, - ) - .collect(); - - let mut new_configs = BTreeMap::new(); - for joined in old_configs + let remove_all: Result> = old_configs 
.into_iter() - .merge_join_by(configs.into_iter(), |(left, _), (right, _)| left.cmp(right)) - { - match joined { - // updated config - EitherOrBoth::Both((name, (mut old, user_provided)), (_, new)) => { - changed |= EmbeddingSettings::apply_and_need_reindex(&mut old, new); - if changed { - tracing::debug!( - embedder = name, - user_provided = user_provided.len(), - "need reindex" - ); - } else { - tracing::debug!(embedder = name, "skip reindex"); - } - let new = validate_embedding_settings(old, &name)?; - new_configs.insert(name, (new, user_provided)); - } - // unchanged config - EitherOrBoth::Left((name, setting)) => { - new_configs.insert(name, setting); - } - // new config - EitherOrBoth::Right((name, mut setting)) => { - // apply the default source in case the source was not set so that it gets validated - crate::vector::settings::EmbeddingSettings::apply_default_source( - &mut setting, - ); - crate::vector::settings::EmbeddingSettings::apply_default_openai_model( - &mut setting, - ); - let setting = validate_embedding_settings(setting, &name)?; - changed = true; - new_configs.insert(name, (setting, RoaringBitmap::new())); - } - } - } - let new_configs: Vec = new_configs - .into_iter() - .filter_map(|(name, (config, user_provided))| match config { - Setting::Set(config) => Some(IndexEmbeddingConfig { + .map(|IndexEmbeddingConfig { name, config: _, user_provided }| -> Result<_> { + let embedder_id = + self.index.embedder_category_id.get(self.wtxn, &name)?.ok_or( + crate::InternalError::DatabaseMissingEntry { + db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID, + key: None, + }, + )?; + Ok(( name, - config: config.into(), - user_provided, - }), - Setting::Reset => None, - Setting::NotSet => Some(IndexEmbeddingConfig { - name, - config: EmbeddingSettings::default().into(), - user_provided, - }), + EmbedderAction::WriteBackToDocuments(WriteBackToDocuments { + embedder_id, + user_provided, + }), + )) }) .collect(); + let remove_all = remove_all?; + 
self.index.embedder_category_id.clear(self.wtxn)?; - for (index, index_embedding_config) in new_configs.iter().enumerate() { - self.index.embedder_category_id.put_with_flags( - self.wtxn, - heed::PutFlags::APPEND, - &index_embedding_config.name, - &index - .try_into() - .map_err(|_| UserError::TooManyEmbedders(new_configs.len()))?, - )?; - } - - if new_configs.is_empty() { - self.index.delete_embedding_configs(self.wtxn)?; - } else { - self.index.put_embedding_configs(self.wtxn, new_configs)?; - } - changed - } - Setting::Reset => { self.index.delete_embedding_configs(self.wtxn)?; - true + Ok(remove_all) } - Setting::NotSet => false, - }; - - // if any changes force a reindexing - // clear the vector database. - if update { - self.index.vector_arroy.clear(self.wtxn)?; + Setting::NotSet => Ok(Default::default()), } + } - Ok(update) + fn update_embedding_configs_set( + &mut self, + configs: BTreeMap>, + ) -> Result> { + use crate::vector::settings::SettingsDiff; + + let old_configs = self.index.embedding_configs(self.wtxn)?; + let old_configs: BTreeMap = old_configs + .into_iter() + .map(|IndexEmbeddingConfig { name, config, user_provided }| { + (name, (config.into(), user_provided)) + }) + .collect(); + let mut updated_configs = BTreeMap::new(); + let mut embedder_actions = BTreeMap::new(); + for joined in old_configs + .into_iter() + .merge_join_by(configs.into_iter(), |(left, _), (right, _)| left.cmp(right)) + { + match joined { + // updated config + EitherOrBoth::Both((name, (old, user_provided)), (_, new)) => { + let settings_diff = SettingsDiff::from_settings(old, new); + match settings_diff { + SettingsDiff::Remove => { + tracing::debug!( + embedder = name, + user_provided = user_provided.len(), + "removing embedder" + ); + let embedder_id = + self.index.embedder_category_id.get(self.wtxn, &name)?.ok_or( + crate::InternalError::DatabaseMissingEntry { + db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID, + key: None, + }, + )?; + // free id immediately 
+ self.index.embedder_category_id.delete(self.wtxn, &name)?; + embedder_actions.insert( + name, + EmbedderAction::WriteBackToDocuments(WriteBackToDocuments { + embedder_id, + user_provided, + }), + ); + } + SettingsDiff::Reindex { action, updated_settings } => { + tracing::debug!( + embedder = name, + user_provided = user_provided.len(), + ?action, + "reindex embedder" + ); + embedder_actions.insert(name.clone(), EmbedderAction::Reindex(action)); + let new = + validate_embedding_settings(Setting::Set(updated_settings), &name)?; + updated_configs.insert(name, (new, user_provided)); + } + SettingsDiff::UpdateWithoutReindex { updated_settings } => { + tracing::debug!( + embedder = name, + user_provided = user_provided.len(), + "update without reindex embedder" + ); + let new = + validate_embedding_settings(Setting::Set(updated_settings), &name)?; + updated_configs.insert(name, (new, user_provided)); + } + } + } + // unchanged config + EitherOrBoth::Left((name, (setting, user_provided))) => { + tracing::debug!(embedder = name, "unchanged embedder"); + updated_configs.insert(name, (Setting::Set(setting), user_provided)); + } + // new config + EitherOrBoth::Right((name, mut setting)) => { + tracing::debug!(embedder = name, "new embedder"); + // apply the default source in case the source was not set so that it gets validated + crate::vector::settings::EmbeddingSettings::apply_default_source(&mut setting); + crate::vector::settings::EmbeddingSettings::apply_default_openai_model( + &mut setting, + ); + let setting = validate_embedding_settings(setting, &name)?; + embedder_actions + .insert(name.clone(), EmbedderAction::Reindex(ReindexAction::FullReindex)); + updated_configs.insert(name, (setting, RoaringBitmap::new())); + } + } + } + let mut free_indices: [bool; u8::MAX as usize] = [true; u8::MAX as usize]; + for res in self.index.embedder_category_id.iter(self.wtxn)? 
{ + let (_name, id) = res?; + free_indices[id as usize] = false; + } + let mut free_indices = free_indices.iter_mut().enumerate(); + let mut find_free_index = + move || free_indices.find(|(_, free)| **free).map(|(index, _)| index as u8); + for (name, action) in embedder_actions.iter() { + match action { + EmbedderAction::Reindex(ReindexAction::RegeneratePrompts) => { + /* cannot be a new embedder, so has to have an id already */ + } + EmbedderAction::Reindex(ReindexAction::FullReindex) => { + if self.index.embedder_category_id.get(self.wtxn, name)?.is_none() { + let id = find_free_index() + .ok_or(UserError::TooManyEmbedders(updated_configs.len()))?; + tracing::debug!(embedder = name, id, "assigning free id to new embedder"); + self.index.embedder_category_id.put(self.wtxn, name, &id)?; + } + } + EmbedderAction::WriteBackToDocuments(_) => { /* already removed */ } + } + } + let updated_configs: Vec = updated_configs + .into_iter() + .filter_map(|(name, (config, user_provided))| match config { + Setting::Set(config) => { + Some(IndexEmbeddingConfig { name, config: config.into(), user_provided }) + } + Setting::Reset => None, + Setting::NotSet => Some(IndexEmbeddingConfig { + name, + config: EmbeddingSettings::default().into(), + user_provided, + }), + }) + .collect(); + if updated_configs.is_empty() { + self.index.delete_embedding_configs(self.wtxn)?; + } else { + self.index.put_embedding_configs(self.wtxn, updated_configs)?; + } + Ok(embedder_actions) } fn update_search_cutoff(&mut self) -> Result { @@ -1082,13 +1151,8 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { self.update_searchable()?; self.update_exact_attributes()?; self.update_proximity_precision()?; - // TODO: very rough approximation of the needs for reindexing where any change will result in - // a full reindexing. - // What can be done instead: - // 1. Only change the distance on a distance change - // 2. Only change the name -> embedder mapping on a name change - // 3. 
Keep the old vectors but reattempt indexing on a prompt change: only actually changed prompt will need embedding + storage - let embedding_configs_updated = self.update_embedding_configs()?; + + let embedding_config_updates = self.update_embedding_configs()?; let mut new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn)?; new_inner_settings.recompute_facets(self.wtxn, self.index)?; @@ -1102,7 +1166,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { old_inner_settings, new_inner_settings, primary_key_id, - embedding_configs_updated, + embedding_config_updates, settings_update_only, ); @@ -1119,7 +1183,7 @@ pub struct InnerIndexSettingsDiff { pub(crate) new: InnerIndexSettings, pub(crate) primary_key_id: Option, // TODO: compare directly the embedders. - pub(crate) embedding_configs_updated: bool, + pub(crate) embedding_config_updates: BTreeMap, pub(crate) settings_update_only: bool, /// The set of only the additional searchable fields. /// If any other searchable field has been modified, is set to None. 
@@ -1140,7 +1204,7 @@ impl InnerIndexSettingsDiff { old_settings: InnerIndexSettings, new_settings: InnerIndexSettings, primary_key_id: Option, - embedding_configs_updated: bool, + embedding_config_updates: BTreeMap, settings_update_only: bool, ) -> Self { let only_additional_fields = match ( @@ -1177,7 +1241,7 @@ impl InnerIndexSettingsDiff { old: old_settings, new: new_settings, primary_key_id, - embedding_configs_updated, + embedding_config_updates, settings_update_only, only_additional_fields, cache_reindex_searchable_without_user_defined, @@ -1244,7 +1308,7 @@ impl InnerIndexSettingsDiff { } pub fn reindex_vectors(&self) -> bool { - self.embedding_configs_updated + !self.embedding_config_updates.is_empty() } pub fn settings_update_only(&self) -> bool { From d1dd7e5d097dea50d85d49c3a14ffbef62f46bb7 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 12 Jun 2024 14:05:52 +0200 Subject: [PATCH 33/44] In transform for removed embedders, write back their user provided vectors in documents, and clear the writers --- milli/src/update/index_documents/transform.rs | 118 +++++++++++++++++- 1 file changed, 114 insertions(+), 4 deletions(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index c34b7876a..f58ffebf0 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use std::collections::btree_map::Entry as BEntry; use std::collections::hash_map::Entry as HEntry; -use std::collections::{HashMap, HashSet}; +use std::collections::{BTreeMap, HashMap, HashSet}; use std::fs::File; use std::io::{Read, Seek}; @@ -27,6 +27,7 @@ use crate::update::del_add::{ use crate::update::index_documents::GrenadParameters; use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff}; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; +use crate::vector::settings::{EmbedderAction, WriteBackToDocuments}; use 
crate::{ is_faceted_by, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, }; @@ -808,13 +809,13 @@ impl<'a, 'i> Transform<'a, 'i> { let mut new_inner_settings = old_inner_settings.clone(); new_inner_settings.fields_ids_map = fields_ids_map; - let embedding_configs_updated = false; + let embedding_config_updates = Default::default(); let settings_update_only = false; let settings_diff = InnerIndexSettingsDiff::new( old_inner_settings, new_inner_settings, primary_key_id, - embedding_configs_updated, + embedding_config_updates, settings_update_only, ); @@ -835,10 +836,13 @@ impl<'a, 'i> Transform<'a, 'i> { /// Rebind the field_ids of the provided document to their values /// based on the field_ids_maps difference between the old and the new settings, /// then fill the provided buffers with delta documents using KvWritterDelAdd. + #[allow(clippy::too_many_arguments)] // need the vectors + fid, feel free to create a struct xo xo fn rebind_existing_document( old_obkv: KvReader, settings_diff: &InnerIndexSettingsDiff, modified_faceted_fields: &HashSet, + mut injected_vectors: serde_json::Map, + old_vectors_fid: Option, original_obkv_buffer: Option<&mut Vec>, flattened_obkv_buffer: Option<&mut Vec>, ) -> Result<()> { @@ -863,7 +867,36 @@ impl<'a, 'i> Transform<'a, 'i> { let mut operations = HashMap::new(); let mut obkv_writer = KvWriter::<_, FieldId>::memory(); - for (id, val) in old_obkv.iter() { + 'write_fid: for (id, val) in old_obkv.iter() { + if !injected_vectors.is_empty() { + 'inject_vectors: { + let Some(vectors_fid) = old_vectors_fid else { break 'inject_vectors }; + + if id != vectors_fid { + break 'inject_vectors; + } + + let existing_vectors: std::result::Result< + serde_json::Map, + serde_json::Error, + > = serde_json::from_slice(val); + + let mut existing_vectors = match existing_vectors { + Ok(existing_vectors) => existing_vectors, + Err(error) => { + tracing::error!(%error, "Unexpected `_vectors` field that is not a map. 
Treating as an empty map"); + Default::default() + } + }; + + existing_vectors.append(&mut injected_vectors); + + operations.insert(id, DelAddOperation::DeletionAndAddition); + obkv_writer.insert(id, serde_json::to_vec(&existing_vectors).unwrap())?; + continue 'write_fid; + } + } + if is_primary_key(id) || necessary_faceted_field(id) || reindex_vectors { operations.insert(id, DelAddOperation::DeletionAndAddition); obkv_writer.insert(id, val)?; @@ -937,6 +970,35 @@ impl<'a, 'i> Transform<'a, 'i> { None }; + let readers: Result< + BTreeMap<&str, (Vec>, &RoaringBitmap)>, + > = settings_diff + .embedding_config_updates + .iter() + .filter_map(|(name, action)| { + if let EmbedderAction::WriteBackToDocuments(WriteBackToDocuments { + embedder_id, + user_provided, + }) = action + { + let readers: Result> = + self.index.arroy_readers(wtxn, *embedder_id).collect(); + match readers { + Ok(readers) => Some(Ok((name.as_str(), (readers, user_provided)))), + Err(error) => Some(Err(error)), + } + } else { + None + } + }) + .collect(); + let readers = readers?; + + let old_vectors_fid = settings_diff + .old + .fields_ids_map + .id(crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME); + // We initialize the sorter with the user indexing settings. 
let mut flattened_sorter = if settings_diff.reindex_searchable() || settings_diff.reindex_facets() { @@ -963,10 +1025,41 @@ impl<'a, 'i> Transform<'a, 'i> { InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, )?; + let injected_vectors: std::result::Result< + serde_json::Map, + arroy::Error, + > = readers + .iter() + .filter_map(|(name, (readers, user_provided))| { + if !user_provided.contains(docid) { + return None; + } + let mut vectors = Vec::new(); + for reader in readers { + let Some(vector) = reader.item_vector(wtxn, docid).transpose() else { + break; + }; + + match vector { + Ok(vector) => vectors.push(vector), + Err(error) => return Some(Err(error)), + } + } + if vectors.is_empty() { + return None; + } + Some(Ok((name.to_string(), serde_json::to_value(vectors).unwrap()))) + }) + .collect(); + + let injected_vectors = injected_vectors?; + Self::rebind_existing_document( old_obkv, &settings_diff, &modified_faceted_fields, + injected_vectors, + old_vectors_fid, Some(&mut original_obkv_buffer).filter(|_| original_sorter.is_some()), Some(&mut flattened_obkv_buffer).filter(|_| flattened_sorter.is_some()), )?; @@ -983,6 +1076,23 @@ impl<'a, 'i> Transform<'a, 'i> { } } + let mut writers = Vec::new(); + + // delete all vectors from the embedders that need removal + for (_, (readers, _)) in readers { + for reader in readers { + let dimensions = reader.dimensions(); + let arroy_index = reader.index(); + drop(reader); + let writer = arroy::Writer::new(self.index.vector_arroy, arroy_index, dimensions); + writers.push(writer); + } + } + + for writer in writers { + writer.clear(wtxn)?; + } + let grenad_params = GrenadParameters { chunk_compression_type: self.indexer_settings.chunk_compression_type, chunk_compression_level: self.indexer_settings.chunk_compression_level, From f5cf01e7d1efc5c383837826eabc4d887a957374 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 12 Jun 2024 14:11:29 +0200 Subject: [PATCH 34/44] Rework extraction to use 
EmbedderAction --- .../extract/extract_vector_points.rs | 461 ++++++++++++------ .../src/update/index_documents/extract/mod.rs | 4 +- milli/src/update/index_documents/mod.rs | 4 +- .../src/update/index_documents/typed_chunk.rs | 10 +- 4 files changed, 318 insertions(+), 161 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index 48e3e697a..fdf8649f4 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -17,9 +17,10 @@ use crate::index::IndexEmbeddingConfig; use crate::prompt::Prompt; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::settings::InnerIndexSettingsDiff; -use crate::vector::parsed_vectors::{ParsedVectorsDiff, RESERVED_VECTORS_FIELD_NAME}; +use crate::vector::parsed_vectors::{ParsedVectorsDiff, VectorState, RESERVED_VECTORS_FIELD_NAME}; +use crate::vector::settings::{EmbedderAction, ReindexAction}; use crate::vector::Embedder; -use crate::{try_split_array_at, DocumentId, Result, ThreadPoolNoAbort}; +use crate::{try_split_array_at, DocumentId, FieldId, FieldsIdsMap, Result, ThreadPoolNoAbort}; /// The length of the elements that are always in the buffer when inserting new values. 
const TRUNCATE_SIZE: usize = size_of::(); @@ -35,7 +36,7 @@ pub struct ExtractedVectorPoints { // embedder pub embedder_name: String, pub embedder: Arc, - pub user_provided: RoaringBitmap, + pub add_to_user_provided: RoaringBitmap, pub remove_from_user_provided: RoaringBitmap, } @@ -44,12 +45,7 @@ enum VectorStateDelta { // Remove all vectors, generated or manual, from this document NowRemoved, - // Add the manually specified vectors, passed in the other grenad - // Remove any previously generated vectors - // Note: changing the value of the manually specified vector **should not record** this delta - WasGeneratedNowManual(Vec>), - - ManualDelta(Vec>), + NowManual(Vec>), // Add the vector computed from the specified prompt // Remove any previous vector @@ -62,9 +58,8 @@ impl VectorStateDelta { match self { VectorStateDelta::NoChange => Default::default(), VectorStateDelta::NowRemoved => (true, Default::default(), Default::default()), - VectorStateDelta::WasGeneratedNowManual(add) => (true, Default::default(), add), // We always delete the previous vectors - VectorStateDelta::ManualDelta(add) => (true, Default::default(), add), + VectorStateDelta::NowManual(add) => (true, Default::default(), add), VectorStateDelta::NowGenerated(prompt) => (true, prompt, Default::default()), } } @@ -75,19 +70,29 @@ struct EmbedderVectorExtractor { embedder: Arc, prompt: Arc, - // (docid, _index) -> KvWriterDelAdd -> Vector - manual_vectors_writer: Writer>, // (docid) -> (prompt) prompts_writer: Writer>, // (docid) -> () remove_vectors_writer: Writer>, - + // (docid, _index) -> KvWriterDelAdd -> Vector + manual_vectors_writer: Writer>, // The docids of the documents that contains a user defined embedding - user_provided: RoaringBitmap, + add_to_user_provided: RoaringBitmap, + + action: ExtractionAction, +} + +struct DocumentOperation { // The docids of the documents that contains an auto-generated embedding remove_from_user_provided: RoaringBitmap, } +enum ExtractionAction { + 
SettingsFullReindex, + SettingsRegeneratePrompts { old_prompt: Arc }, + DocumentOperation(DocumentOperation), +} + /// Extracts the embedding vector contained in each document under the `_vectors` field. /// /// Returns the generated grenad reader containing the docid as key associated to the Vec @@ -104,46 +109,109 @@ pub fn extract_vector_points( let new_fields_ids_map = &settings_diff.new.fields_ids_map; // the vector field id may have changed let old_vectors_fid = old_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME); - // filter the old vector fid if the settings has been changed forcing reindexing. - let old_vectors_fid = old_vectors_fid.filter(|_| !reindex_vectors); let new_vectors_fid = new_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME); let mut extractors = Vec::new(); - for (embedder_name, (embedder, prompt)) in - settings_diff.new.embedding_configs.clone().into_iter() - { - // (docid, _index) -> KvWriterDelAdd -> Vector - let manual_vectors_writer = create_writer( - indexer.chunk_compression_type, - indexer.chunk_compression_level, - tempfile::tempfile()?, - ); - // (docid) -> (prompt) - let prompts_writer = create_writer( - indexer.chunk_compression_type, - indexer.chunk_compression_level, - tempfile::tempfile()?, - ); + let mut configs = settings_diff.new.embedding_configs.clone().into_inner(); + let old_configs = &settings_diff.old.embedding_configs; - // (docid) -> () - let remove_vectors_writer = create_writer( - indexer.chunk_compression_type, - indexer.chunk_compression_level, - tempfile::tempfile()?, - ); + if reindex_vectors { + for (name, action) in settings_diff.embedding_config_updates.iter() { + match action { + EmbedderAction::WriteBackToDocuments(_) => continue, // already deleted + EmbedderAction::Reindex(action) => { + let Some((embedder_name, (embedder, prompt))) = configs.remove_entry(name) + else { + tracing::error!(embedder = name, "Requested embedder config not found"); + continue; + }; - extractors.push(EmbedderVectorExtractor { - 
embedder_name, - embedder, - prompt, - manual_vectors_writer, - prompts_writer, - remove_vectors_writer, - user_provided: RoaringBitmap::new(), - remove_from_user_provided: RoaringBitmap::new(), - }); + // (docid, _index) -> KvWriterDelAdd -> Vector + let manual_vectors_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + + // (docid) -> (prompt) + let prompts_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + + // (docid) -> () + let remove_vectors_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + + let action = match action { + ReindexAction::FullReindex => ExtractionAction::SettingsFullReindex, + ReindexAction::RegeneratePrompts => { + let Some((_, old_prompt)) = old_configs.get(name) else { + tracing::error!(embedder = name, "Old embedder config not found"); + continue; + }; + + ExtractionAction::SettingsRegeneratePrompts { old_prompt } + } + }; + + extractors.push(EmbedderVectorExtractor { + embedder_name, + embedder, + prompt, + prompts_writer, + remove_vectors_writer, + manual_vectors_writer, + add_to_user_provided: RoaringBitmap::new(), + action, + }); + } + } + } + } else { + // document operation + + for (embedder_name, (embedder, prompt)) in configs.into_iter() { + // (docid, _index) -> KvWriterDelAdd -> Vector + let manual_vectors_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + + // (docid) -> (prompt) + let prompts_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + + // (docid) -> () + let remove_vectors_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + + extractors.push(EmbedderVectorExtractor { + embedder_name, 
+ embedder, + prompt, + prompts_writer, + remove_vectors_writer, + manual_vectors_writer, + add_to_user_provided: RoaringBitmap::new(), + action: ExtractionAction::DocumentOperation(DocumentOperation { + remove_from_user_provided: RoaringBitmap::new(), + }), + }); + } } let mut key_buffer = Vec::new(); @@ -177,111 +245,66 @@ pub fn extract_vector_points( embedder_name, embedder: _, prompt, - manual_vectors_writer, prompts_writer, remove_vectors_writer, - user_provided, - remove_from_user_provided, + manual_vectors_writer, + add_to_user_provided, + action, } in extractors.iter_mut() { - let delta = match parsed_vectors.remove(embedder_name) { - (Some(old), Some(new)) => { - match (old.map_or(true, |old| old.is_user_provided()), new.is_user_provided()) { - (true, true) | (false, false) => (), - (true, false) => { - remove_from_user_provided.insert(docid); + let (old, new) = parsed_vectors.remove(embedder_name); + let delta = match action { + ExtractionAction::SettingsFullReindex => match old { + // A full reindex can be triggered either by: + // 1. a new embedder + // 2. an existing embedder changed so that it must regenerate all generated embeddings. 
+ // For a new embedder, there can be `_vectors.embedder` embeddings to add to the DB + VectorState::Inline(vectors) => { + if vectors.is_user_provided() { + add_to_user_provided.insert(docid); } - (false, true) => { - user_provided.insert(docid); + let add_vectors = vectors.into_array_of_vectors(); + + if add_vectors.len() > usize::from(u8::MAX) { + return Err(crate::Error::UserError(crate::UserError::TooManyVectors( + document_id().to_string(), + add_vectors.len(), + ))); } + + VectorStateDelta::NowManual(add_vectors) } - - // no autogeneration - let add_vectors = new.into_array_of_vectors(); - - if add_vectors.len() > usize::from(u8::MAX) { - return Err(crate::Error::UserError(crate::UserError::TooManyVectors( - document_id().to_string(), - add_vectors.len(), - ))); - } - - VectorStateDelta::ManualDelta(add_vectors) - } - (Some(old), None) => { - // Do we keep this document? - let document_is_kept = obkv - .iter() - .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) - .any(|deladd| deladd.get(DelAdd::Addition).is_some()); - if document_is_kept && old.is_some() { - remove_from_user_provided.insert(docid); - // becomes autogenerated - VectorStateDelta::NowGenerated(prompt.render( + // this happens only when an existing embedder changed. We cannot regenerate userProvided vectors + VectorState::InDb => VectorStateDelta::NoChange, + // generated vectors must be regenerated + VectorState::Generated => regenerate_prompt(obkv, prompt, new_fields_ids_map)?, + }, + // prompt regeneration is only triggered for existing embedders + ExtractionAction::SettingsRegeneratePrompts { old_prompt } => { + if !old.is_user_provided() { + regenerate_if_prompt_changed( obkv, - DelAdd::Addition, - new_fields_ids_map, - )?) - } else if document_is_kept && old.is_none() { + (old_prompt, prompt), + (&old_fields_ids_map, &new_fields_ids_map), + )? 
+ } else { + // we can simply ignore user provided vectors as they are not regenerated and are + // already in the DB since this is an existing embedder VectorStateDelta::NoChange - } else { - remove_from_user_provided.insert(docid); - VectorStateDelta::NowRemoved - } - } - (None, Some(new)) => { - if new.is_user_provided() { - user_provided.insert(docid); - } else { - remove_from_user_provided.insert(docid); - } - // was possibly autogenerated, remove all vectors for that document - let add_vectors = new.into_array_of_vectors(); - if add_vectors.len() > usize::from(u8::MAX) { - return Err(crate::Error::UserError(crate::UserError::TooManyVectors( - document_id().to_string(), - add_vectors.len(), - ))); - } - - VectorStateDelta::WasGeneratedNowManual(add_vectors) - } - (None, None) => { - // Do we keep this document? - let document_is_kept = obkv - .iter() - .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) - .any(|deladd| deladd.get(DelAdd::Addition).is_some()); - - if document_is_kept { - // Don't give up if the old prompt was failing - let old_prompt = Some(&prompt) - // TODO: this filter works because we erase the vec database when a embedding setting changes. - // When vector pipeline will be optimized, this should be removed. 
- .filter(|_| !settings_diff.reindex_vectors()) - .map(|p| { - p.render(obkv, DelAdd::Deletion, old_fields_ids_map) - .unwrap_or_default() - }); - let new_prompt = - prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?; - if old_prompt.as_ref() != Some(&new_prompt) { - let old_prompt = old_prompt.unwrap_or_default(); - tracing::trace!( - "🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}" - ); - VectorStateDelta::NowGenerated(new_prompt) - } else { - tracing::trace!("⏭️ Prompt unmodified, skipping"); - VectorStateDelta::NoChange - } - } else { - remove_from_user_provided.remove(docid); - VectorStateDelta::NowRemoved } } + ExtractionAction::DocumentOperation(DocumentOperation { + remove_from_user_provided, + }) => extract_vector_document_diff( + docid, + obkv, + prompt, + (add_to_user_provided, remove_from_user_provided), + (old, new), + (&old_fields_ids_map, &new_fields_ids_map), + document_id, + )?, }; - // and we finally push the unique vectors into the writer push_vectors_diff( remove_vectors_writer, @@ -289,7 +312,6 @@ pub fn extract_vector_points( manual_vectors_writer, &mut key_buffer, delta, - reindex_vectors, )?; } } @@ -300,20 +322,30 @@ pub fn extract_vector_points( embedder_name, embedder, prompt: _, - manual_vectors_writer, prompts_writer, remove_vectors_writer, - user_provided, - remove_from_user_provided, + action, + manual_vectors_writer, + add_to_user_provided, } in extractors { + let remove_from_user_provided = + if let ExtractionAction::DocumentOperation(DocumentOperation { + remove_from_user_provided, + }) = action + { + remove_from_user_provided + } else { + Default::default() + }; + results.push(ExtractedVectorPoints { manual_vectors: writer_into_reader(manual_vectors_writer)?, remove_vectors: writer_into_reader(remove_vectors_writer)?, prompts: writer_into_reader(prompts_writer)?, embedder, embedder_name, - user_provided, + add_to_user_provided, remove_from_user_provided, }) } @@ -321,6 +353,136 @@ pub fn 
extract_vector_points( Ok(results) } +fn extract_vector_document_diff( + docid: DocumentId, + obkv: obkv::KvReader<'_, FieldId>, + prompt: &Prompt, + (add_to_user_provided, remove_from_user_provided): (&mut RoaringBitmap, &mut RoaringBitmap), + (old, new): (VectorState, VectorState), + (old_fields_ids_map, new_fields_ids_map): (&FieldsIdsMap, &FieldsIdsMap), + document_id: impl Fn() -> Value, +) -> Result { + match (old.is_user_provided(), new.is_user_provided()) { + (true, true) | (false, false) => {} + (true, false) => { + remove_from_user_provided.insert(docid); + } + (false, true) => { + add_to_user_provided.insert(docid); + } + } + + let delta = match (old, new) { + // regardless of the previous state, if a document now contains inline _vectors, they must + // be extracted manually + (_old, VectorState::Inline(new)) => { + let add_vectors = new.into_array_of_vectors(); + + if add_vectors.len() > usize::from(u8::MAX) { + return Err(crate::Error::UserError(crate::UserError::TooManyVectors( + document_id().to_string(), + add_vectors.len(), + ))); + } + + VectorStateDelta::NowManual(add_vectors) + } + // no `_vectors` anywhere, we check for document removal and otherwise we regenerate the prompt if the + // document changed + (VectorState::Generated, VectorState::Generated) => { + // Do we keep this document? 
+ let document_is_kept = obkv + .iter() + .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) + .any(|deladd| deladd.get(DelAdd::Addition).is_some()); + + if document_is_kept { + // Don't give up if the old prompt was failing + let old_prompt = Some(&prompt).map(|p| { + p.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or_default() + }); + let new_prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?; + if old_prompt.as_ref() != Some(&new_prompt) { + let old_prompt = old_prompt.unwrap_or_default(); + tracing::trace!( + "🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}" + ); + VectorStateDelta::NowGenerated(new_prompt) + } else { + tracing::trace!("⏭️ Prompt unmodified, skipping"); + VectorStateDelta::NoChange + } + } else { + VectorStateDelta::NowRemoved + } + } + // when the vectors are no longer user-provided, + // we generate the prompt unconditionally + (_not_generated, VectorState::Generated) => { + // Do we keep this document? + let document_is_kept = obkv + .iter() + .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) + .any(|deladd| deladd.get(DelAdd::Addition).is_some()); + if document_is_kept { + // becomes autogenerated + VectorStateDelta::NowGenerated(prompt.render( + obkv, + DelAdd::Addition, + new_fields_ids_map, + )?) + } else { + // make sure the document is always removed from user provided on removal + remove_from_user_provided.insert(docid); + VectorStateDelta::NowRemoved + } + } + (_old, VectorState::InDb) => { + // Do we keep this document? 
+ let document_is_kept = obkv + .iter() + .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) + .any(|deladd| deladd.get(DelAdd::Addition).is_some()); + if document_is_kept { + // if the new version of documents has the vectors in the DB, + // then they are user-provided and nothing possibly changed + VectorStateDelta::NoChange + } else { + // make sure the document is always removed from user provided on removal + remove_from_user_provided.insert(docid); + VectorStateDelta::NowRemoved + } + } + }; + + Ok(delta) +} + +fn regenerate_if_prompt_changed( + obkv: obkv::KvReader<'_, FieldId>, + (old_prompt, new_prompt): (&Prompt, &Prompt), + (old_fields_ids_map, new_fields_ids_map): (&FieldsIdsMap, &FieldsIdsMap), +) -> Result { + let old_prompt = + old_prompt.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or(Default::default()); + let new_prompt = new_prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?; + + if new_prompt == old_prompt { + return Ok(VectorStateDelta::NoChange); + } + Ok(VectorStateDelta::NowGenerated(new_prompt)) +} + +fn regenerate_prompt( + obkv: obkv::KvReader<'_, FieldId>, + prompt: &Prompt, + new_fields_ids_map: &FieldsIdsMap, +) -> Result { + let prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?; + + Ok(VectorStateDelta::NowGenerated(prompt)) +} + /// We cannot compute the diff between both Del and Add vectors. /// We'll push every vector and compute the difference later in TypedChunk. fn push_vectors_diff( @@ -329,14 +491,9 @@ fn push_vectors_diff( manual_vectors_writer: &mut Writer>, key_buffer: &mut Vec, delta: VectorStateDelta, - reindex_vectors: bool, ) -> Result<()> { let (must_remove, prompt, mut add_vectors) = delta.into_values(); - if must_remove - // TODO: the below condition works because we erase the vec database when a embedding setting changes. - // When vector pipeline will be optimized, this should be removed. 
- && !reindex_vectors - { + if must_remove { key_buffer.truncate(TRUNCATE_SIZE); remove_vectors_writer.insert(&key_buffer, [])?; } diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 2babe330f..9da3983fc 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -248,7 +248,7 @@ fn send_original_documents_data( prompts, embedder_name, embedder, - user_provided, + add_to_user_provided, remove_from_user_provided, } in extracted_vectors { @@ -274,7 +274,7 @@ fn send_original_documents_data( expected_dimension: embedder.dimensions(), manual_vectors, embedder_name, - user_provided, + add_to_user_provided, remove_from_user_provided, })); } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index a533f1984..3586c9c6d 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -503,7 +503,7 @@ where embeddings, manual_vectors, embedder_name, - user_provided, + add_to_user_provided, remove_from_user_provided, } => { dimension.insert(embedder_name.clone(), expected_dimension); @@ -513,7 +513,7 @@ where expected_dimension, manual_vectors, embedder_name, - user_provided, + add_to_user_provided, remove_from_user_provided, } } diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 0cb5e58af..4737c6b42 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -91,7 +91,7 @@ pub(crate) enum TypedChunk { expected_dimension: usize, manual_vectors: grenad::Reader>, embedder_name: String, - user_provided: RoaringBitmap, + add_to_user_provided: RoaringBitmap, remove_from_user_provided: RoaringBitmap, }, ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>), @@ -625,7 +625,7 @@ pub(crate) fn write_typed_chunk_into_index( let mut 
remove_vectors_builder = MergerBuilder::new(keep_first as MergeFn); let mut manual_vectors_builder = MergerBuilder::new(keep_first as MergeFn); let mut embeddings_builder = MergerBuilder::new(keep_first as MergeFn); - let mut user_provided = RoaringBitmap::new(); + let mut add_to_user_provided = RoaringBitmap::new(); let mut remove_from_user_provided = RoaringBitmap::new(); let mut params = None; for typed_chunk in typed_chunks { @@ -635,7 +635,7 @@ pub(crate) fn write_typed_chunk_into_index( embeddings, expected_dimension, embedder_name, - user_provided: ud, + add_to_user_provided: aud, remove_from_user_provided: rud, } = typed_chunk else { @@ -649,7 +649,7 @@ pub(crate) fn write_typed_chunk_into_index( if let Some(embeddings) = embeddings { embeddings_builder.push(embeddings.into_cursor()?); } - user_provided |= ud; + add_to_user_provided |= aud; remove_from_user_provided |= rud; } @@ -662,7 +662,7 @@ pub(crate) fn write_typed_chunk_into_index( .find(|IndexEmbeddingConfig { name, .. }| name == &embedder_name) .unwrap(); index_embedder_config.user_provided -= remove_from_user_provided; - index_embedder_config.user_provided |= user_provided; + index_embedder_config.user_provided |= add_to_user_provided; index.put_embedding_configs(wtxn, embedding_configs)?; From fca9fe39b35ab513ec3d505a58c34d476ee443e4 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 12 Jun 2024 14:49:38 +0200 Subject: [PATCH 35/44] Update test snapshots --- index-scheduler/src/lib.rs | 6 ++---- meilisearch/tests/vector/settings.rs | 9 ++------- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index e2a6f03a0..fd7f29f6c 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -5579,7 +5579,6 @@ mod tests { .collect::>(); snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"}]"###); let conf = index.embedding_configs(&rtxn).unwrap(); - // TODO: Here the user provided vectors 
should NOT contains 1 snapshot!(format!("{conf:#?}"), @r###" [ IndexEmbeddingConfig { @@ -5595,7 +5594,7 @@ mod tests { template: "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}", }, }, - user_provided: RoaringBitmap<[0, 1]>, + user_provided: RoaringBitmap<[0]>, }, ] "###); @@ -5620,7 +5619,6 @@ mod tests { .collect::>(); snapshot!(serde_json::to_string(&documents).unwrap(), @"[]"); let conf = index.embedding_configs(&rtxn).unwrap(); - // TODO: Here the user provided vectors should contains nothing snapshot!(format!("{conf:#?}"), @r###" [ IndexEmbeddingConfig { @@ -5636,7 +5634,7 @@ mod tests { template: "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}", }, }, - user_provided: RoaringBitmap<[0, 1]>, + user_provided: RoaringBitmap<[]>, }, ] "###); diff --git a/meilisearch/tests/vector/settings.rs b/meilisearch/tests/vector/settings.rs index 6b93f001e..e11f4368f 100644 --- a/meilisearch/tests/vector/settings.rs +++ b/meilisearch/tests/vector/settings.rs @@ -43,7 +43,7 @@ async fn update_embedder() { { "uid": 1, "indexUid": "doggo", - "status": "failed", + "status": "succeeded", "type": "settingsUpdate", "canceledBy": null, "details": { @@ -54,12 +54,7 @@ async fn update_embedder() { } } }, - "error": { - "message": "`.embedders.manual`: Field `model` unavailable for source `userProvided` (only available for sources: `huggingFace`, `openAi`, `ollama`). 
Available fields: `source`, `dimensions`, `distribution`", - "code": "invalid_settings_embedders", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_settings_embedders" - }, + "error": null, "duration": "[duration]", "enqueuedAt": "[date]", "startedAt": "[date]", From 34fabed214d92e607ed862e0b51a5fe8c3e93199 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 12 Jun 2024 17:09:34 +0200 Subject: [PATCH 36/44] Add test for vector writeback --- index-scheduler/src/lib.rs | 167 +++++++++++++++++++++++++++++++++++++ 1 file changed, 167 insertions(+) diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index fd7f29f6c..4278d15b3 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -5639,4 +5639,171 @@ mod tests { ] "###); } + + #[test] + fn delete_embedder_with_user_provided_vectors() { + // 1. Add two embedders + // 2. Push two documents containing a simple vector + // 3. The documents must not contain the vectors after the update as they are in the vectors db + // 3. Delete the embedders + // 4. The documents contain the vectors again + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let setting = meilisearch_types::settings::Settings:: { + embedders: Setting::Set(maplit::btreemap! 
{ + S("manual") => Setting::Set(EmbeddingSettings { + source: Setting::Set(milli::vector::settings::EmbedderSource::UserProvided), + dimensions: Setting::Set(3), + ..Default::default() + }), + S("my_doggo_embedder") => Setting::Set(EmbeddingSettings { + source: Setting::Set(milli::vector::settings::EmbedderSource::HuggingFace), + model: Setting::Set(S("sentence-transformers/all-MiniLM-L6-v2")), + revision: Setting::Set(S("e4ce9877abf3edfe10b0d82785e83bdcb973e22e")), + document_template: Setting::Set(S("{{doc.doggo}}")), + ..Default::default() + }), + }), + ..Default::default() + }; + index_scheduler + .register( + KindWithContent::SettingsUpdate { + index_uid: S("doggos"), + new_settings: Box::new(setting), + is_deletion: false, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + let content = serde_json::json!( + [ + { + "id": 0, + "doggo": "kefir", + "_vectors": { + "manual": vec![0, 0, 0], + "my_doggo_embedder": vec![1; 384], + } + }, + { + "id": 1, + "doggo": "intel", + "_vectors": { + "manual": vec![1, 1, 1], + } + }, + ] + ); + + let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0_u128).unwrap(); + let documents_count = + read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file) + .unwrap(); + snapshot!(documents_count, @"2"); + file.persist().unwrap(); + + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: None, + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: false, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + { + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| 
obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel"}]"###); + } + + { + let setting = meilisearch_types::settings::Settings:: { + embedders: Setting::Set(maplit::btreemap! { + S("manual") => Setting::Reset, + }), + ..Default::default() + }; + index_scheduler + .register( + KindWithContent::SettingsUpdate { + index_uid: S("doggos"), + new_settings: Box::new(setting), + is_deletion: false, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + } + + { + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir","_vectors":{"manual":{"embeddings":[[0.0,0.0,0.0]],"userProvided":true}}},{"id":1,"doggo":"intel","_vectors":{"manual":{"embeddings":[[1.0,1.0,1.0]],"userProvided":true}}}]"###); + } + + { + let setting = meilisearch_types::settings::Settings:: { + embedders: Setting::Reset, + ..Default::default() + }; + index_scheduler + .register( + KindWithContent::SettingsUpdate { + index_uid: S("doggos"), + new_settings: Box::new(setting), + is_deletion: false, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + } + + { + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, 
&field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + + /// FIXME: redaction + snapshot!(json_string!(serde_json::to_string(&documents).unwrap(), { "[]._vectors.doggo_embedder.embeddings" => "[vector]" }), @r###""[{\"id\":0,\"doggo\":\"kefir\",\"_vectors\":{\"manual\":{\"embeddings\":[[0.0,0.0,0.0]],\"userProvided\":true},\"my_doggo_embedder\":{\"embeddings\":[[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0]],\"userProvided\":true}}},{\"id\":1,\"doggo\":\"intel\",\"_vectors\":{\"manual\":{\"embeddin
gs\":[[1.0,1.0,1.0]],\"userProvided\":true}}}]""###); + } + } } From a89eea233bbd8d5bda76e6f1a195485639a31bc4 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 12 Jun 2024 17:10:19 +0200 Subject: [PATCH 37/44] Fix vectors injection --- milli/src/update/index_documents/transform.rs | 52 ++++++++++++++----- 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index f58ffebf0..b2fe04a4c 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -27,6 +27,7 @@ use crate::update::del_add::{ use crate::update::index_documents::GrenadParameters; use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff}; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; +use crate::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; use crate::vector::settings::{EmbedderAction, WriteBackToDocuments}; use crate::{ is_faceted_by, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, @@ -872,28 +873,35 @@ impl<'a, 'i> Transform<'a, 'i> { 'inject_vectors: { let Some(vectors_fid) = old_vectors_fid else { break 'inject_vectors }; - if id != vectors_fid { + if id < vectors_fid { break 'inject_vectors; } - let existing_vectors: std::result::Result< - serde_json::Map, - serde_json::Error, - > = serde_json::from_slice(val); + let mut existing_vectors = if id == vectors_fid { + let existing_vectors: std::result::Result< + serde_json::Map, + serde_json::Error, + > = serde_json::from_slice(val); - let mut existing_vectors = match existing_vectors { - Ok(existing_vectors) => existing_vectors, - Err(error) => { - tracing::error!(%error, "Unexpected `_vectors` field that is not a map. 
Treating as an empty map"); - Default::default() + match existing_vectors { + Ok(existing_vectors) => existing_vectors, + Err(error) => { + tracing::error!(%error, "Unexpected `_vectors` field that is not a map. Treating as an empty map"); + Default::default() + } } + } else { + Default::default() }; existing_vectors.append(&mut injected_vectors); - operations.insert(id, DelAddOperation::DeletionAndAddition); - obkv_writer.insert(id, serde_json::to_vec(&existing_vectors).unwrap())?; - continue 'write_fid; + operations.insert(vectors_fid, DelAddOperation::DeletionAndAddition); + obkv_writer + .insert(vectors_fid, serde_json::to_vec(&existing_vectors).unwrap())?; + if id == vectors_fid { + continue 'write_fid; + } } } @@ -905,6 +913,15 @@ impl<'a, 'i> Transform<'a, 'i> { obkv_writer.insert(id, val)?; } } + if !injected_vectors.is_empty() { + 'inject_vectors: { + let Some(vectors_fid) = old_vectors_fid else { break 'inject_vectors }; + + operations.insert(vectors_fid, DelAddOperation::DeletionAndAddition); + obkv_writer.insert(vectors_fid, serde_json::to_vec(&injected_vectors).unwrap())?; + } + } + let data = obkv_writer.into_inner()?; let obkv = KvReader::::new(&data); @@ -1048,7 +1065,14 @@ impl<'a, 'i> Transform<'a, 'i> { if vectors.is_empty() { return None; } - Some(Ok((name.to_string(), serde_json::to_value(vectors).unwrap()))) + Some(Ok(( + name.to_string(), + serde_json::to_value(ExplicitVectors { + embeddings: VectorOrArrayOfVectors::from_array_of_vectors(vectors), + user_provided: true, + }) + .unwrap(), + ))) }) .collect(); From 3bc8f81abc3f8d57060c1571d6801e50f43ce33f Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 12 Jun 2024 18:11:11 +0200 Subject: [PATCH 38/44] user_provided => regenerate --- index-scheduler/src/batch.rs | 6 +- meilisearch/src/routes/indexes/documents.rs | 5 +- meilisearch/src/search.rs | 3 +- .../extract/extract_vector_points.rs | 56 ++++++++++--------- milli/src/update/index_documents/transform.rs | 6 +- 
milli/src/vector/parsed_vectors.rs | 34 +++++------ 6 files changed, 62 insertions(+), 48 deletions(-) diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index 30ff54a62..cd5525eea 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -958,10 +958,10 @@ impl IndexScheduler { .is_some_and(|conf| conf.user_provided.contains(id)); let embeddings = ExplicitVectors { - embeddings: VectorOrArrayOfVectors::from_array_of_vectors( - embeddings, + embeddings: Some( + VectorOrArrayOfVectors::from_array_of_vectors(embeddings), ), - user_provided, + regenerate: !user_provided, }; vectors.insert( embedder_name, diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index 70623bb35..bfbe20207 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -625,7 +625,10 @@ fn some_documents<'a, 't: 'a>( .iter() .find(|conf| conf.name == name) .is_some_and(|conf| conf.user_provided.contains(key)); - let embeddings = ExplicitVectors { embeddings: vector.into(), user_provided }; + let embeddings = ExplicitVectors { + embeddings: Some(vector.into()), + regenerate: !user_provided, + }; vectors.insert( name, serde_json::to_value(embeddings).map_err(MeilisearchHttpError::from)?, diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index ce712f17f..60f684ede 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -1072,7 +1072,8 @@ fn make_hits( .iter() .find(|conf| conf.name == name) .is_some_and(|conf| conf.user_provided.contains(id)); - let embeddings = ExplicitVectors { embeddings: vector.into(), user_provided }; + let embeddings = + ExplicitVectors { embeddings: Some(vector.into()), regenerate: !user_provided }; vectors.insert(name, serde_json::to_value(embeddings)?); } document.insert("_vectors".into(), vectors.into()); diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs 
b/milli/src/update/index_documents/extract/extract_vector_points.rs index fdf8649f4..0a27a28bd 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -260,28 +260,33 @@ pub fn extract_vector_points( // 2. an existing embedder changed so that it must regenerate all generated embeddings. // For a new embedder, there can be `_vectors.embedder` embeddings to add to the DB VectorState::Inline(vectors) => { - if vectors.is_user_provided() { + if !vectors.must_regenerate() { add_to_user_provided.insert(docid); } - let add_vectors = vectors.into_array_of_vectors(); - if add_vectors.len() > usize::from(u8::MAX) { - return Err(crate::Error::UserError(crate::UserError::TooManyVectors( - document_id().to_string(), - add_vectors.len(), - ))); + match vectors.into_array_of_vectors() { + Some(add_vectors) => { + if add_vectors.len() > usize::from(u8::MAX) { + return Err(crate::Error::UserError( + crate::UserError::TooManyVectors( + document_id().to_string(), + add_vectors.len(), + ), + )); + } + VectorStateDelta::NowManual(add_vectors) + } + None => VectorStateDelta::NoChange, } - - VectorStateDelta::NowManual(add_vectors) } // this happens only when an existing embedder changed. 
We cannot regenerate userProvided vectors - VectorState::InDb => VectorStateDelta::NoChange, + VectorState::Manual => VectorStateDelta::NoChange, // generated vectors must be regenerated VectorState::Generated => regenerate_prompt(obkv, prompt, new_fields_ids_map)?, }, // prompt regeneration is only triggered for existing embedders ExtractionAction::SettingsRegeneratePrompts { old_prompt } => { - if !old.is_user_provided() { + if old.must_regenerate() { regenerate_if_prompt_changed( obkv, (old_prompt, prompt), @@ -362,31 +367,32 @@ fn extract_vector_document_diff( (old_fields_ids_map, new_fields_ids_map): (&FieldsIdsMap, &FieldsIdsMap), document_id: impl Fn() -> Value, ) -> Result { - match (old.is_user_provided(), new.is_user_provided()) { + match (old.must_regenerate(), new.must_regenerate()) { (true, true) | (false, false) => {} (true, false) => { - remove_from_user_provided.insert(docid); + add_to_user_provided.insert(docid); } (false, true) => { - add_to_user_provided.insert(docid); + remove_from_user_provided.insert(docid); } } let delta = match (old, new) { // regardless of the previous state, if a document now contains inline _vectors, they must // be extracted manually - (_old, VectorState::Inline(new)) => { - let add_vectors = new.into_array_of_vectors(); + (_old, VectorState::Inline(new)) => match new.into_array_of_vectors() { + Some(add_vectors) => { + if add_vectors.len() > usize::from(u8::MAX) { + return Err(crate::Error::UserError(crate::UserError::TooManyVectors( + document_id().to_string(), + add_vectors.len(), + ))); + } - if add_vectors.len() > usize::from(u8::MAX) { - return Err(crate::Error::UserError(crate::UserError::TooManyVectors( - document_id().to_string(), - add_vectors.len(), - ))); + VectorStateDelta::NowManual(add_vectors) } - - VectorStateDelta::NowManual(add_vectors) - } + None => VectorStateDelta::NoChange, + }, // no `_vectors` anywhere, we check for document removal and otherwise we regenerate the prompt if the // document 
changed (VectorState::Generated, VectorState::Generated) => { @@ -437,7 +443,7 @@ fn extract_vector_document_diff( VectorStateDelta::NowRemoved } } - (_old, VectorState::InDb) => { + (_old, VectorState::Manual) => { // Do we keep this document? let document_is_kept = obkv .iter() diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index b2fe04a4c..467a2810a 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -1068,8 +1068,10 @@ impl<'a, 'i> Transform<'a, 'i> { Some(Ok(( name.to_string(), serde_json::to_value(ExplicitVectors { - embeddings: VectorOrArrayOfVectors::from_array_of_vectors(vectors), - user_provided: true, + embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors( + vectors, + )), + regenerate: false, }) .unwrap(), ))) diff --git a/milli/src/vector/parsed_vectors.rs b/milli/src/vector/parsed_vectors.rs index 9007e03e4..92d6cb382 100644 --- a/milli/src/vector/parsed_vectors.rs +++ b/milli/src/vector/parsed_vectors.rs @@ -18,18 +18,20 @@ pub enum Vectors { } impl Vectors { - pub fn is_user_provided(&self) -> bool { + pub fn must_regenerate(&self) -> bool { match self { - Vectors::ImplicitlyUserProvided(_) => true, - Vectors::Explicit(ExplicitVectors { user_provided, .. }) => *user_provided, + Vectors::ImplicitlyUserProvided(_) => false, + Vectors::Explicit(ExplicitVectors { regenerate, .. 
}) => *regenerate, } } - pub fn into_array_of_vectors(self) -> Vec { + pub fn into_array_of_vectors(self) -> Option> { match self { - Vectors::ImplicitlyUserProvided(embeddings) - | Vectors::Explicit(ExplicitVectors { embeddings, user_provided: _ }) => { - embeddings.into_array_of_vectors().unwrap_or_default() + Vectors::ImplicitlyUserProvided(embeddings) => { + Some(embeddings.into_array_of_vectors().unwrap_or_default()) + } + Vectors::Explicit(ExplicitVectors { embeddings, regenerate: _ }) => { + embeddings.map(|embeddings| embeddings.into_array_of_vectors().unwrap_or_default()) } } } @@ -38,22 +40,22 @@ impl Vectors { #[derive(serde::Serialize, serde::Deserialize, Debug)] #[serde(rename_all = "camelCase")] pub struct ExplicitVectors { - pub embeddings: VectorOrArrayOfVectors, - pub user_provided: bool, + pub embeddings: Option, + pub regenerate: bool, } pub enum VectorState { Inline(Vectors), - InDb, + Manual, Generated, } impl VectorState { - pub fn is_user_provided(&self) -> bool { + pub fn must_regenerate(&self) -> bool { match self { - VectorState::Inline(vectors) => vectors.is_user_provided(), - VectorState::InDb => true, - VectorState::Generated => false, + VectorState::Inline(vectors) => vectors.must_regenerate(), + VectorState::Manual => false, + VectorState::Generated => true, } } } @@ -96,7 +98,7 @@ impl ParsedVectorsDiff { .flatten().map_or(BTreeMap::default(), |del| del.into_iter().map(|(name, vec)| (name, VectorState::Inline(vec))).collect()); for embedding_config in embedders_configs { if embedding_config.user_provided.contains(docid) { - old.entry(embedding_config.name.to_string()).or_insert(VectorState::InDb); + old.entry(embedding_config.name.to_string()).or_insert(VectorState::Manual); } } @@ -121,7 +123,7 @@ impl ParsedVectorsDiff { let old = self.old.remove(embedder_name).unwrap_or(VectorState::Generated); let state_from_old = match old { // assume a userProvided is still userProvided - VectorState::InDb => VectorState::InDb, + 
VectorState::Manual => VectorState::Manual, // generated is still generated VectorState::Generated => VectorState::Generated, // weird case that shouldn't happen were the previous docs version is inline, From bc547dad6fb5ecf9cc9f7ccb896b817f1b9eadde Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 12 Jun 2024 18:12:56 +0200 Subject: [PATCH 39/44] Update dump file --- dump/tests/assets/v6-with-vectors.dump | Bin 17539 -> 19136 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/dump/tests/assets/v6-with-vectors.dump b/dump/tests/assets/v6-with-vectors.dump index 9f8ed2ba11d233ed5a1fadd25a18e3d25c7e04a1..8c0505772efde9409111f4aaba1a74714ee89f9a 100644 GIT binary patch literal 19136 zcmV)4K+3-#iwFQs#%X2%1MQvPk{rjC-f24=4$Bw5vaiSE*c%xbW@cqpRv}_9Agwnf zd$poPu57Qo!EMYm$gu`9(EO1IGx#39b@;9P7JdVNzwcxZKoG=`S^%At+ zndi@U&iT%%qffVAJl$TNKG~jMZ_e&N-fpgMF1J@l|NZLX{DU`iq?G1b5BP7NoBx|i z{5!?((?PCt$+fgO%?By>+Uo}g=}le0>yDeN>&>MuDl(^>E*?5x96Ml<1KIaC;#DJ*8lUr{_nv*{2uqm zw;msDzWB*@bF#hs@a@@wyYxS|@^$?$xs?9F!544h2JbrFbNz4q;IDrAqldrv@lSsG z%YT3L#pe3@^5|6x`R5P+)5C`k)5S-B{nx+weEIm{59)gGu|D)y|MhhrdO?eS@Q1$B z{qd^f?R)?4p#P=3?Eg!f_Wu7}9A6&(c6)q%arv=+wucX{FK@Q@4xenVFHeuJ4j(+; zoL%YPXBVGc{o>;0dVBkd#OC7i`lFl2k5A8kvE}#OyTAHF-_ic~KF86gr{^czFSb{2 z@jKA_*1iw^pXS!~`u|RjJ^p{MkLURR?b(65^#2?9Kew9G9{<0KV~_ve?_+5Fhi~&c z?$Uqsn`4jv-s@wh|IcyY zo3{hsAOH1I_xayklK#&<)jj@u7sr$io?M?k+5UO%c`2!{wI9`G zt@E-v05@{vUmMwY|PRJ^$=YazN7e=YRHj zpZ|L|#~%N^*T+lx{~QLsK|64l{=b6%+44UB_fC#I!h64um-PQFc>g=~KUWFiOa8yL zzW4v{;`s9LdXueO`Z+R(?{>aBJF0Wtp`sL>Q*OIPZ+?>T}es^(sa&>s`@cP-)3;joioFW-}3w6gQipb?qa;Y#&vLlpP#>Q zo42oheP#9T$D89Vn#(z#Y`*yA=ImyB_0jh7FM0j&K~5>w(f^LW{^%KR{-JJSbN2M} z&EZ!%&C|_ir{|k%-?fztdv%Vaw``w0+MWm@`0`J-k2g1G*Zg{QadXL? 
zAAWxGnNIQ<-=uYUa&fXf)2lyv`0%G6KKg8X|Iy}Zdw+X=|F`+5>F<}@-|p6O&i!~( zm-S@bF7u2VZ^}|2R6|JlP(6_0@ytul$yk&`baATp!<_pKyjh`^rlZvj5|o z>x;*ae|8l@@TW&^{zm-wWP80i*<5e_!0}(&$A9nU*vEhG_3^Fo-y5_8zeoI6r+xhQ zPL6&2_kJHQ>Hk~s{@+3W=lPrc|NH*GcXE7rcyp>4Wpn{L2s?eE$mlYX=FPrl`yH9ctUsFx`MGZ=lG8J|9Sndq$#Dn{=buBkN@B6<6H6n8?*y=>i^69pCacm@A3b; zI9|>FN$Zr$I`8>EfAYsm`u`TZ{~h|Dvwm+c`~NiW>p$=0`0~*3f160EZ?BH^yY%z& z^7LeTesCKU408C_Q~tk7NBQ3VXaBSR+5hZ+_CNcd{m=eq|Fi#jFMj^+tFL~r3m@|H z@Zp>plAE(Tf`=~$j|j4%LWy79Pe*0#Yi&#EiU)fx(>=Y`%CgKgEp6_#EN$hrl-IR& z{Ugn5PRly`oq4HioAOfDNuRF0=o0mrb?sAKCeASNQY(|**h}kWD)T(+rK-Slp36ej zX~~OTnM$uU>$O%_-aF|vg{+ERb&WdZI?Xjt3tzjo)R(EQIZt)2)68pAORM78o^&a- zDwgHrOP$xwWRqoT(3|hw&qp=)dCpT>OD&}?jfIf~_)?a8 z`6x}j=)Ut>wO%Rtl6%$?W&KO{T1w|!X--qtpA~)gIxj6wmK&v+!rJNDs&B2YiofT^ z+?A|*=YIRDn=iGeqS#$a$Y-?zebP5*{qzZ^ebB}e-OJ18+P}kR3S$|mfuB(`rGO=}BwzoWIZN;oh z*Pf*27i(P-Wu~nEuZ`!FlNO|D+w|_WYAq37{bR{mhB`O4)GMzWmsIqYa<0`2GfTKu z?VP^2X@80@MQ6~;ui7z5nXcO_C;`=8>jc`9Wm&VAx=W%~y;1vF+hpPGNt>#TT_oe$ z9iMSpwMJUSRJ2uT&0YrWP18y*2u#-v+=~@l$jfQ(brH?yTOf{^N_9riwwsLK2?7J3vPQ6OV^(Ry}E>T;72unM)I-T{`iKIalbzf@i z<=Q16)0(6xDK5Gc-Q1Ll_OeJy;wyC8InOJa*hn2&g{5v3FIpE;Jz^c z*59tHSXh&!`{{f%Ybmw0x{9^UND$v3mDB$wiG!}z=1=#ov*;!5PuD4|g!9rT35JZ0 zmeeoF#*}HQ1)Y_s^HM`>B>>tUsU|x+YoC{_Ta?)9#w?Dx7k!fitMt^gTGllQnXH_? 
zwURgE3ucMnB=zX)gf`9WmkpJczfTgLHH0F~1sT&y$totPtEES;=xQWQUHFwh5hc#Rg8Y z1ClW>Ws|ee70G94(QWfH23yh~UC`HAWV9%4DiUC6f005>v26N%k%&%(1!;a!i#8`o zoFqn;C5H$~owbZw6J2r*DVG(LF3!431l>}jjn-bu0$@XQ(_Y?L3L?v1WvKN2?C)Sr z3#KC)_I!9!?MV_4XKSbZNl#9t68p!osmo|x|u1IQ`e&9(E2sJ#`t3G86VOT%H7oX z-Z`;;(z=FxcXmVqwMxn7sTHop&N?kx{AH5TY2&nvt39N?SSr?7FqwcB=WD6hR^!|P zCRl^z#~X&P*I|q#*|ae)UWnKcoe;w&&C_YaOiPc7-?@@WLOof*vF2T8lq_WFtyDypBJ88f!g#iwP3m}c+1OcXCnXLIku?(bSoBHW zA3`aU$6KpZxJZ(XY`9syrYn*YnY~WB1W8)M$)V(ESC$o_5FqPNdGBJ+r*-H1Cf-_Q z&{#WtvHY`iTd!pMIGaVSmAX0WnQ$E zJ>){52{$F#Er2}x+@?743#+)mJ!$$0OsB%&Xs5Ih#c`8Hca?i_tgF2+IJSc zRKamUNv1a%n-(9I;goC$dCK;BYk_G3o#1gf z^x?g9GxE0>C3!JjAs0?S zk|t@awp+Gn_O}YN=#mJE8aTt4SYn_Rn{dNfHUw*-59#t`D>^W4@y!Y{N@QxmgXc9O z1Ax9fnlS;PM6v}TSdboeUK?moU%Kd8GB{4|F2bOMF)w_JtSTa=t;ratOig+K7w|tJ?sodMPI%;`0|%XQnbJl z6d7N@pApug|H>#fmMau=kvo!IAu#0{*KqcNKeGA~tb#+0yEX7lK%kka=A2FfSFj-x zQVW!UuDrK&f(6AEvUs}U6}ITJ+_5KH!C zN$q_c6cK1OVwKs^G1N;o!jE8Z;HEf5mZGghCo(*-C{iRMzCy%fwX#K)h~WC9YeX3l zi^%x@~6dp#*%zFyo1cq1q706M@LZ)q=a4lXFT}ahU>ZTh;JSUoHn?7I*P zeo25tP|YASJKVuZ$^}<@<%Ot6L5>gH%g0RY(r`3FHj zEwC(e4alG+>X^=gKIX^;kbsapuyfuZ-3?9)Z$zXKKD2_}R)i&y3_+6{A&#uHf+51J zFcpHpgbJtyaVjc+gT;Y&b}sy?EQWmivN|5iyEtqjGy_S3%4n7yve36{ zeP!)s9M+Kw(c;P`YTp{`IUs}XLY`TkWksQi;ekMq_7K=5L0bcY$%$dl!8uM}=C3D(CEp=ML zlnE_Ee`rLCl3cq6G9$Pp{gL#PZ$l_ca;Q)R!;CosGHGAPV9kgXQP6#(13@F10pR42 z6>X~wC5}M;LpSHEDPRma^3J6;Lr&gdCsG(-Lq; zxJD$I$Z2&HC=XJ4AR+l^on0%4nHo3SnQep&;ni9M_kbg9uEGhKe_xDID1dOuj<$y| z3;qhD$Otb4$>WT8xETaba%^4=5Cl~hoF0g%M5sNHpb{Lcq~B!w9nTTC2^&Zj6c7x1 z&I-$RBMjs$hi3p_jb$dwge{O+)i05&lC6-e zPvlb-c?a%Rd?z>umM~x$Ac+-f#EoRj{80mfj;GcKy>`U4lG|)kn0kOD5&>Em!HYgV z*;qOm)?(D20mKx?Y=1RuUU>M4 z6hKBFZFbfG!jwWe8LbeXNJ7X0>JAbDQ+wosv|Exn;&Pd*W~}cr`couJ^*Cy>) zh1@l1#f0bM!;YjC#mcDY`rF8eB%*iS$rPWB9F#tXg%=8wO^qB`lP{MR6|L#eDQf_c zCUzr?Qbw{X=xioGSmvEg*R>!nXj^R~Qo{S8Y1+BT#!yxUr=h(n0BGNEVk?{_oXAv9 zb~Il2iKtK?(_6wX~%I*~uT0mSxY`@PY)AVxJ@Ttc#HU=Mn`@BK}6i zq^~l!grwoGfaSyw3am(eL`p*jX42L>9U}x$LQ7?`GT{koJ`YEXN9@>JDk*JbIc9P~ 
zILS=PKD-HOt3)VFYQt_ff!QISGAZBd}?WaI2alTH%| zP>V?FCt+mU$YK&zN+n4#MfjlqPFSCAMD5TkT&^TiXhQ23+AZrd$x#Qsj7*3nLZ6kG zE_L`NNhayULWtL*jG}ud&;YeaO2d9oq^S5^y$WWwUaf2kWg^n_c_8sZi~{BtmPZTWX3yW#CiAB|vAApSsW3MMR63(qB^oQBro0VWKF- zjykC9x)G^oiZE})yy&fNT84yT6ra^b%P4_RoH?rDl?u6ReT7uWg(N_k>^sSLDPV+h z0QA{KG`**Vvyp-e7-l%)vIu?~r3t)NB3NoxpF+-&=rnVJdH4qBsM$K^m$c(uh?5sy_@e{p`oHC}nPvj4YBJ&7Isp3lRlFJo!n2bUWrh*=I8-@h>lw6byMDyDtnccPH zKu6u|O4Q*ne+7+WBFKV7_EeuEeAAb&XtlRfZaGuR%l<9p#JUeKb^&u#;#9r`b?9a*iwtiNi>h z6DP{_5Lr_;nTK2~xK%-fN_A$q#~DvT5VuA}i(+*q(h`_1b9_|sKV|_9B4-y#@Jvv! zwE2aYtWnM)nd{Pe*h`X}IzOmiQQ=4@SHK~*w{t_oDJ7!j!h)n}REbCmyQqjw3Q2*c zU~ei!0Y;c^1d=l z3Jr!?)v77pp{y=VElU_j#g#(l=s>|+8+Yq+sos(k0<>FuI%0{xY}u)=K>LfOoeYL< zInJkWkI0pj&agFv??nIP)MmpWSO=^vg=4)IS4Yhkph?v=vJK6V5^2Ac{mmoJ5>{|z zQ+Cmu;Ed23y9j_OYm^TfNqBi?5K^EGtq1bdXcK7jGSV#*2uW{~AC)<0IS~$|Sb8J< z2rHuvOO7_zj7fF|ipU#$L(O6ouaPgxjg;zYcy#h?4I?WcsR$%O7HYw|5v?xS?IG;i zet;i_r9}Zr>IYFFlT)pfTn17`Lga2h1Ty@Q5hTx03AJU`T~f$Iytvzn(aa-QkRe~4 z9HJ0EMb4TCZUJME9S^u0m`}!+*283j3uyPV7Udge^Q0=N?az1cIkQT5b8&`?! 
zKw=&1I7fV|?Psy{HCBlMFmHuxSBIoI1v zz^GdQ{uSnt?#mckOJ61E&^5E2#mh{152qlJYSTTBq~)|2X$Mf4f>9y!rNtK%1#nZXzcLT>?}onM?ydQtLzqTp@Mqsm-XM-BmxDYYuI@LX-Pl^PYdH&aJ6`H`_=T6Io?*tEHtv76H75b zrKD7Gf&w*mKNdjXRxhj*S^eY%%`S%Ojwp~Sg0p-FMiw|H135c_kjPa)y+F5gy$>)| zNSDpHe!-X!W}Qo`0_lKSel~&0A+P?e7`H?kN;g2F2_!p_k_{7C$c|9MoI+4Q+}xaHCi--nN7o}*zRjo*5#Uw7!?_^#OYd$JZBL5HMC97m5 z(o%r;@rbfxvG&>dD6r8C{mxZUEve*haW$%&PQ!SP1tuC^4RCnR$vx`a_$$duTu1{* z>Khb0u&NG8{zTX~LLbdjk$9b8H*)pt_{h7GiYA1Z$$A)xcX%DL1ZG>6kf%is zl_IuQ62loN1{x^YCCQ##NtD0wqg1{K)~1MfCTicVSyHcxY`_ftPgbXqG_Gk7+OlCX zY?O7BdI?FJiAQB}cJ-bMGu1>UhpZ!en&2kM+f%VwsW~dNBzrGJfSDXcJ*@~~OEL2- zlhpCep`vIhnat$ebENCPFsBG)NjQEX?K^L3*6w28Oo=He(p5`cH;1GhAkZgGS69 z;!fpO$cH+tEO%N4&Xk#!wnWfy$v{K%8e}}`Q4`)5mUxaRfPfT8qPU}zp5Qtv?oPIWT;f_wl;TOL%kxrkA4#G{ zf{S|7M0#LeT{tGuBN^09X{{C~4kdlg^VE3&ST+W}q!dz~Hb>Tonjtiq1W_bhawS#m zJO#xDiK@QAkz}TdDN=8+Nri zft$<(q#y$wC9Z_GnSwV{>k72dAZ)IrNDI|CgK3hQl|-)LnpdtQN+1h_ZHf>BoXbLL zQ8XOMZ|euQMgYSz{+++2ih#gY9CA!dOF;04v!H5ewu z_9-Hs^tSuui;u9G5HLnWRXy5Ni6Ri`M{k;sPXe7M=$0vvfw0Ho!ljU?i9py zZ!5V5CrfExpm3C3u0~*by{*$g0>Z_^1UG&k86sATh|1`6hRqoy8eN~{kqD~WdUJK= z2DpnqHa!3ml!ZoIE&-f_lq82E+_Yc@MtX^hNDy_vBxPg`%uejE>7(`~H79%H!fl5- z}q44BbT2ToJb2@-t>DV3%kg(>(z zj=wej2np~|-Ox3e(?5KOn=TT(@i};2Quag@NgG6ag$PIkt}VxDan+m~12qoB zVdUb{CXhTxobcvkOm*$pW`BjgRel-t#x-SU7$^EZ*iM6GjbJTbp0dC9~sD@Mj? 
z`V>`nylTynH7QOxQU`+XH3eTy>uWxt5G2(TmmDGONE$PSo;-{<$r%YDD)?nlzN=`} zOoR*dy+$w_HB6Zd%HQa-qhoT+%PkS@USzk~APPKd(8dc7S)hz*5@`=ur-;Z{!I&nt@aRx+$slb%_)`RC?G;R3>0&gwZ5}fMEu4m8f;% zwzb*Pjf}X4DF8KA12owCk+z#0PQVp~xE54*ctsL!%A(6I(I`(P4FraD*anfMw0N(C zc@#mr1q%i!MO-pgYw+`mSq$295ShGw5MmNIL=l2xqX>)u?(^VF=b*5!RC#4L01@V) z;CW{gK_K0sA>PYm60lpbQiNIBZ(;6EI6|Mn;L-q_E!`L$i|z{k5h$YST6?r3pn@}$ ziB4M6q&OOnI`5SAR^V#5#@P_$8QNCRW@apOy^8g|;X=^2`aiL7QM=*}ldXZm(C z?#chjQ50woE7)~3M(NIhO1e3Pvb>l>|7h5u5YYTtTqVw*-ic&Ha%&xAacO3l2^dF2 zN(RB?7rk7f*aaC`IzUc-3eYRiM{x2|pW(p!fc1o1UJA`A(2sKC;1XbB(Q3qVs zNk|pspor3YSwhO_i6P7dTLq}Erk?#29}2JLZ2#L&Raz+AXA}TglOstlKdHH0|lV0 z5b%w}G#R;UYWk>Dk_9AlvA8ALB13v-0cFdtMigjU?5wO4MxhO)e`Z)}5Q+lALZKsx z1}V8ju2z?^K(v!hpf(g{x6ye?(!4P2H!>a+L9oxzm)nSlfcW$t2r0{5g|-G*pdJBv zbkqqdP-Q0exh=xc+_;c_lmjhsABYMzpCCjQE);}7&?E>#=IBfU7j!o}n?*6OeT!BS zDr(H$$Jm`y}bu-)g~DooVY7KdLW9+3mBRIqUALRV1>5`d)(3=Jd49Q!L{$xRay z%WRGkqmdYSh#6I|5p=#NyE7W_8sSo+Ll9!6{r@gIEhQqAIQ1xLaTnP+a+kJ>*$Bm? ztPYMSBfUfi;Ed=}tD4Q#%Fs-qhES)Uf$h$*O6VtUSl6^hvKLE7vvzf=V32_=kWgRf ztcPC|=B-G$A?Fa~23d!IWrlPkZvc%n_PE15lt}{e)sR&wILL!~31P`#;4YAr2|3*~ zgaDf;!XkpF(&)yPW2N;mN1Hk?6%!oj+6_jk8PXJecwnU+Z>TTQ3#20B9mPI~4ls}~ zL&$JAVb?6+uTTISeaK|yq)<~{qgR~YQ3qHGgx7Xl)Og{>PdD=?qP2#pN}JG52Wja?{Qc75U`l`}y=9z7x8+ZoEj`kX(IX(a;8n5-0t zhFy`>LgZm?3{I^lL;^x-Jy-|07uYwLXGbaDL~!KZwbcf>?nutf1#n_rU8ALd-d^^D zEVC@2edgj`#oN~Sd*)!n()2A-`F1vzeo{!6ee`6KIwgAnZCN{KS{JyKiY5*`JHv#7 zXRL$UgJ+vBxCj;disbYX^=iFmVGcr}aeNu=mUORGn&<=^gqs2h0O+ysB-=@h4YhZ{ zk&XgVP|*fV>WCKE)oT^~uP~%aGXc4bPGhIOX*+MH}~>dB|A_B$}3hR zSYcHDbvH20+|3u84uB%T0)MPUu90pH%x!}E+5?7R<5+VuTY}t5GJ*09+=UOqc&Zd+2p6fuXCxf@ZdMcZ|m1`9AlK7#)3sA0r>I01THCubS#-KkLs=t0{h zCTS|hm0gLy!Z4zGzs<0y{E!mt;^wz}K>j2%apRGzKmx%V>G& z5R)go#TtcD=7uy3eTS!8289xW5EB{9fZ+k_J7xwXQu5{N&Oq1&5j4?FPeRImfy9Yt zk@!OKY1RW}rIeulEF z4zO1|88uY6N46UYij@g}P>7@1D-(S|KwuC^V9rD@u3lf<3hk^hSv{C>Sz&$2HK zzo9JXUz|pORNBCL@CKrJl)piGLBvOpXzuCPCwq_=$rCOzLp2-Od&Z8CN(UTG)@=Md zy-g7IZ|MvROF-M8?8;!?F#rcbq866+MW+^MT!Zi4O)m}SJd@C9o=gBA-#EdH51PN& 
z&blMx(|duHZbS|)(RMfDP02jxu%Z(M92ixd`2tHGlNlgCxxhes!xF$fxiLCtL)|)< zsgrLC6TajChg35O+;tTVjj;y?H#r!N2}M8l4Fej;S&5WQDEwF~Eu( zC~zr5^G!ps7|TGb1a*0=l|6fRWuJ*yV6`#HWTn%ONa z4it!+{Em8+XuBhFpt9OrC@*6k0y#ITNi$;#2Ca~Ef=4Eh(~gEKrc5!zB|)^_wMQT( zJ!-g1^utp*>IU4!UX13Ac%(N)TZIo?o zGDs}3YfIXV7KBCML>$EL5uX-#c#E@m^z+83I9zFNG|U!;OhnEiJ7YmEB5Dcti7wwR zzPxB6hIUC8kz?vntTvzC0T&Qs^kXtCi3wBe#vsyqADT4*Nw7aCCdi_E)PjNr2S9>~ z?#L&Skn~tEE|3KrfDJ@IhDJs`J1)KNDpnX47`h`nzJPM%DHkS?)H(!G%LvtZK{`f1 z2jk#odO8Z`zK=0K?!1BXMuB7SoswsC(QFFwG^Q}F&e;;MleW0$Mg{C6rGL}v`_Kd? z8YU{b^y+(=(LZ`4XfWW^q0+Lyt}d@I8zTh=LMD2K2107zR|q)>pY4$|gMqiu3$l*M zCDRmy-I>G{y~R}SrZFwEBL*alVI{wbHtw&!l8O@cB zMj&LoPdW)-kAQ-LY8gd6pg8wLLuf=#kejt7o=#N8fRzPqCqn|5Jme_V7DoS{6xKY< z0`+CaD|iHk?t26dPAd@e!I>J~l49YscqA8{tp+PJxDX>12GYb}q{(xzRw{%*jC;6Uuj4nXdUhFi4S#e9BtRa=>3GIY9J8@9w$v#>CwdvV zlxSclzdX?5N)--tOu(|lg}|L5@|M=sQ3k5Ga5^>6qcIhS5VCm20F}Klw@hn9KC)1Z zscBd1r15xi(S0JPanEB@7D%MKdB;{mcd0HEE9t}=^9j0nithLxmXRSJB=^C_yPkit zVzY-zhjk*gwK&bYu#KY;g1$6!uaZlHgB;}!F!Gwa9*PQ=T?T+c;TqGQvx8F#*-T7= z4ExVV-$nM6>5E7N1Js5Q$wjyFJ+;miH@o7sl zCr#0B;B>*~?15aF9bO|%1r4WpE_I3wapzl!Ly^l}OlMLiB?VZ&gGM(4o|!d40A;(Z z<((jdO*C-sMvOD=lS<1(K6H$2KrCq+H~S#~*y1T7)NMU`Jwgc3J-k{npOz={zZT93 zjXJI^xv2$DGSPw%ku>F3GFeo=r$HmrJ7^~tx?)`H$j@s7TlZum;_3kDtQ=Vk#ym$g zf@UEZn+o9HWj#s6l8LI@j6Un6X`tE1O?xc_g{lt8emBF7v22tUItw=Wx@_P`%T_zf95r_) z%9BAPc<_w8SQG(JSvRJr!&m6L6Y32_baZ(}A}mv&;zfq?=4s^bJ!{uv>$Sn*@jcrM ze2SwRRbAU}cbQNeDq&eXbdniRG0VqSP2z}IDrB<5qA}D7#1AlA2Sa9NVh~`Q2*zMhQ#8UoF|UsLGAT!!otV~5s@mYs6CIqe#tPET-#x3Q?s{=Z>e4Rd zi4D;kPW-x<4nMWgQ%1!O3z^dxEO5{D!MWP;Q&cH|@(K-G7#omUco!%Vm9FXWD)keawjt&g~Ap2eY$w;cuGYf8=J#{Q{-QZLv$FC#XZ*W49&t$mZ z2w719?1n!DvULRB7(QSU7{J6Y2RF=0asHo{=V>IP8CJ~jr;7cW$K3J)8I=Z|Y1e*H z=|JaVV6;%EVHt=X{7`^Q$6~ZvXZY}pj>GLvwj3)RC=0e^ih@7qWB981Y&7RCK_{n2 z3%|247XbqF_-2p;o!(LUvY&|=Bx~nrh>;nHLM-(B7hn^Gb#&i z&KXliGL3ogqnMf#88K(m;5C-p3JiqNi6D~rsE@wW?pYF;JK9VfhGrNs07gpac`ztl z$EWzNtE|y4%qSNONH&|FdUvpF8mR)F2*AX%k+p+=P|YkQ`U?ViOXj3WDrEZXRTP*_ 
zbi})jPsU+nEr~NmXP`BQZm>1D2GLU*$)t%%_mDT8YqUXf3wlS@2|5O6syo3(OA3jI zfx<{%P)G4Y8OB^SU^AUZ-DKIQj(e6@sbf4g^6xb!5x{rNx7k-X-Pm&&39^wA`k0&Sv5kGS6g5auej0%;D(a5V z1QV0eNjBb^P+&vM(V)pOWR?Cp5ISRJ9aV=mVvfPI)9Oq`Y&@#%#}h0uq1oTG(vT8# zQ2sWeGb&((9$2?W;_Kj9Fbgnw-tibkGyAcch14oDTO+VwW+_Q+61Jn%0_zL-MX56K z+EicQ&{I~R$=KQrze4*&VSEsiD1(#pfhS>t9LhaK57|tkUVseX3Rxs(%Lji6q+KbC z>dr{Rghc|W&e5F8Q262hWo_+JJ(X;Uu2u=23l4=wnkcY0P?n$1(P=-5EpeMLiH+h; zfml5tg720DvjHDP6Z|7oRPrP&Dcq<81%%3SJTIjIS{N)#$s_P6CJHytgqyVwb~D^~ z-a&~imfiW2+4DnM4PyyorMV>>vUE-HrHMAu?37N_M4C%aU!!@sMay$~feAclB51*k zwIqp5>L|`v7%59-lIdXbPGc@o_tX*+ZX>S34O4mpdbsxw)?jmx;rYU4*r4oT%cS&{ zFjvsHAsE7R7}~aY@)1Nw+T2FF3WM4jPo9~e8i!P@W;R0blf@fDCSlYuM~ro-IX4uL*>a2iAXVAyvH2S*h9*#Xu~6AldKca-;qjl;S2!GZ$he4bcSyoW`U& zr?4b(boY!|RPd2`@Cy+tld_|Y0*eJnW~J#mB12+%Sn>1>muc6C-rZZ;W*TEQrwE^!bw-4F&y*S#g)$}Ow{RQ8)ZiM1NHRGvQ0(mH zK*q$l|0nWq5bKu4aI=A^nVJj*5!(A06X-D(J)6XN3TLlPC|Kt3rOX!Msa76r$gHA)W&B_TV5a9(S{!>k zZ-QsGM0Yw}mo%tV;_W<*-WLY0)7^u-8xm8MLLupzqfa_g8ss}&9cIj2@mOm{W)}M2 zN*)ueXU1mJsFYyXM((`M9>>mH3CZ->9)~rgB|33HOreX(Zcy#eN_%j)Xz+3&dd)j} zD%~cNXUQ;*eiY+`PiA8vsOl#A_|aL+klFRl#{?--=58=dga3;?Vhlw=X-YtIumClB z(@Tk<3;lhx2QeaG@O5IY4zAvf6k|qNckY7NVK)E?qym$+FmHEPQpGazG%9kcV=@%; zc)+6#uh(|-dhFqx^G)%P98wcJGKemO@rV@Dl$$$6sANRnC(JeYZPRUxK)c`MQSVFi zzaUnqp%;-fRl&1TCiR8GG!SF*#H$|rnQ*vg8p{6y8?Adv6 zCp~9BR>8@L-rPlv#a?5pCO@8pXL!MT8dC(I+I!yJZpb^M8z!Ex0|%`|Jky*3;NfII z;I>lvn zRpsQFBVp2c!UaOh{dUOJRYV7H4|DbGw4U_H33*QjlB1<4=@dx8w>|x z8i)WNKt71ERr5TCx0+@HWi!u&+_suvP-b<|lIxN=bQ>xgE~D0EWUg^(OwMk8P(sY4 zpSn@GGjgLrjRZBFR1;=y>?bdAYP6BMQ5*&m($&_t1R+ReI7CIFdD z!=x+6^8mF<_sBd(&_;tF)OYz4E~kxAON8Y_8uXW!(dP(`gg230oL^%=C{-%=q|+X3 zaJIU;4Lcr>K#2?iYs=V%AkSHHt{qstwU#+s++ zBWS5!%6_!n7?#P61s?7@%4cwf>1>3NG5+L%GbugT#O9@013X07KB8y(nAy`Kcr|B@+r3w)y~sxHH@H#Oc=?ZF$0r=t)IK3 zn|Je|BNig~f|n;1=c;kx*-p&UNhu}_*#=V3@htA#>>`_^vNij;STTz<9{EMSD)-%7 zqhzWi95y7yjX6y9psSSh9EGRxa4|xYw0LwDp2j15a*Vq$Xy4IB@ZL1HPcKZWTbvSh z$1>1uJcy*ZN0n?PEbp)(2r+J~FnK55NKJ{RYit9p$fH|L0U+!MSY+MfiD&~RX#xdi 
z_sllpcrp(3{J=Oz%NX8UaUt}Jj8XOsN~3nfB#vmOhTg_g^XMF`@o*<{WALn|-BTIJ zS+5@S=aG$Va{|)3)n7VF z?tc(jTz%ubk-mgHIC$JQDmUPIWM|mi?Cm^Gxy)L9!( zop<@hk6#@Qlq_GpGIj00maL5tMa`M@zU(A_S?C_pqPkhdD|wb*{^E>iUiR>pO<@XZ zu4L!z_Q6>zrP&t=_Q1r_=9}kuL%V+qf;34<6&0rxfNCW}9Ri1ImIXKdcfdN`vm%pV zK*GfhtkD(>Yspc<8;F^TP(HMn z0wA)_^G9c2-*Ui9KbN#&L+(af^jVnb1XbzIgY9i{z0ENB_N}^)CnT`oo7Sh*A-#y5 zt#|NH7nf;flN@hSN_BZ$kYNV=p0%9B!Wo_b54@EGyFpO+B@rIv02cx0Q?yF~Wsj^z zhn+@r)G^T3X&6xo#yXWdKkIiI;A~ki(gr$8rcjuV(#`&m)MKfZMo*%Z@*vWy)cA%H ztmNbA0)A*;(hKmYgpp=$Eq&ak!kC&6AG|D&V7 zcA+evgb$bxy-tl_leG9)wPb+pg$!zhC0fS{Y;&WC)om;(wE{aESk#O_o>SARZ|lDO zJ0jo(x!CiiK|FFipF3igk5}eXV?8PER)fREpr6r)H%qczTo0A&ga=^Vm{P9ax*Tq6 zxd%} zJy!SDpjcl<-T=Vq8JPmH z>locxB4Rm~@&b&*pcDWnFGiz#q94W3T~32JyxcHi5GbwkdXpUI8XTZy*f>gKICllJ zo^t`KO$6{PSWvaUjF0@DEF!TKy7w$v&e)XU9~oljmS^D4<6c%F?iuiS=o#x zP2%FEqjMg<@^p<4IXS(dyOeqBFzY$5kbNdm4sGIQR%WONr$dDK?(h3ij_y4N>MigM zZ8kV|x=nvq>v0gwC|*wYfH*J10I8%W&1kT|o6CDrMsvKOdg~)8YO?>ioVa1I0+#XJ#^2uj3Vq=@JtToCz1zja?Bd&P=j$u(frj$Y5czf>edNOc9WJh@X~RFD-UW%8*S z|9HM%M`VrvhUO!Wvv&lEh^J4Rb@BxeT$%?pn(3#T6B*W7HEPUmK)e^Y48j7f@?0)QMVd3634*#t(_Y3h}p1@Y1Y zx9CirgD?Ok*{8JsnUe2mJ|$!tIcNdBD;mT8F{M7Ql;P(BCS(XS<_5L$+usiv+LS-49=aj!Z}w;n@ZxO`w0DhYE%Cuc@LT z7uF3-QR^Tf_y<#R4uN)? zz!D@S)M2hgxiq!gJCcitcHQ;mpf!E%!&JBHuRoyOa*HLCUW*YF;s=hY!Y7nC^4~h| zCTulD-JQTdgKajK)wjb{FsAgnG`7gh1uMg@Wo#KRoNRPwx1{581Lao zzd1{aDc|1ry=i_I_%9)PP&&9fwu0b;d7G+j>}tH zsw#16r9gS>liV4kNqv$>O(CI}HQm^Cp}3~!S=+m^vHlOBop&Z*wy!9porv7N*F&g- z$(!zvud9>gUGTZUYp9E}eJfh4Xz^oeLWisM+ndi@B%0g?ZEgbprE95T+k@R71lrag zi0XTFuEO*>cWP7r1I6(ZR$qFQ$^Pc5>?>tnG5h5jvJpdnNKfG_Y0^;h%MOn9MG?@G zC(-6n4%Do?T$U3!UQU0LHIX>jY`;}sN!;BeZuF5A(BG}HFG-_X2S?Zx)sSop?U)}H zKB3)VNe*unGs9a;kNs@6?+8$^h1|Zk=d4H06ywJSCyW%@92&m+i-Q%Ej?fVJ)E?At zLy0;M$4f|feTAOO!TBg6QvE_TWfjioWxK>9om-8_u_}!^G!Y(FlZ9v{N zDiU;Ma4cNwySb38ITSXLR~I_S00klT?`TjxBIj&^NGK=WSb;W?Y*M5LOR+k@ NpNa&;8T)6>_&2*9=-lvq|7(Va!JGr;MUM{hpe~0gbSnv1S*PM0Aea8>+ ze(TFmu_AK)O`T)jCrQ1q@sDiXx$}>3|K-%bpN045>VMqf)!kdCKOR25?(auF-oV@? 
ztLyAn;ZwC=Ul-q=U)K}R+QWT!Uq7GcBkxPtipb~Z=Sz=6*YsasA4@|sGb8T@7Zf3) zeeXQq9{-+3PWdjLH@=Q{mazL|<{r=g&i(&lI6PP1_jcGvuoV8u<=es6-QCCA{af+j z?tN(}J3ajIF}iy147=}RrTMAn%GJ@AzPUxnwX*zIeY!n9qrLC6MqBTr!Q{W)^6$ja zyrni|;>`B8*R1v?P2aAWBH3lj7vI%3^;sqF>hvmb%k$=Cw$XXr+B81NjR!l|sqEcK z^R~2S14sTbjX&iv>J!y<%H`XaR-F#NuXmTWZMIzN>|6M}>R(#s^eh)kTgtZ6;~nj< zR3q*8Y~}>#mMq`Br(6+834N?fTWVcdi)Vz&K^p#Ceo&p_``A}H0*Ut~J+ku4`J!+5 zZ{;sjlbF3N7?oH1-0Dl^W>Y1rWd3kd?ls5Oj!mTK(!Vp+`kZzBn(gwu z@x1AJU$mf;*=Y^=twGOH-79igU%zf+WJo4dhFHl~ilTc^BwBWO(L)P;#mez`3@7A_VUyq-zHq9hg#K}CJ;B>vo>+8Js zjbRuyTadcfYMR8R->#_yR`q8UwuO4D&64>!Mf{pi>EFh$*1Opf`IS+JBIPL`u5T~W+vD6 zm~{PDCfIy59(|Rwg?YZ&D)QFA52$M8EyxvX(yoMz!Bz5deV?9Kl~-lFsY%#YH=mWH zhF`kXy~g(b4=&d9uf1cvwuR7M-?PpY4thmxV5n&xHttlH_2|H0yo=t}^(kK&z(rmO z4J}g_#*Uu6gpW1_jiJOWfbBl)N#elCVNKK!rW@952 z@Rj}DYE{L?)*aJ_SQ=%IBz4JO{ji|A^?m8jCBrY-f9&pX^atw}sYc^xR=}?`JGshM zOgD|9SzW@$R1hnllB}z%c_BybcACbOSZ#twh7T%Ojm%2h)*4$hHf>&CTX62b@L}Q& zGJ+=t3uOa-8%oyimZ^7Fq!=U@T|%`fH6}@2I=`Oc>LeE-INHg*z#nrvabL5lAM&^5s43jXM!0ylS7G!MysHvay z05d3E(KusrVh}VC12IUZ*?^xLON5lxDwU)r>qJ|P$Tg^@= zI`i1}ljsvj<*U~PeL1IA8rYK}<+)YGc-pjndyTPa&Xr6kJwsHfQ=Ks-< zzk$IYd9gNsRqC}*$JT6UQRjVP>D>%Mv=uO4dLrGztj$)F38HY%K*E5jE4kGJrYdRD z^Q`{9h)D`%_>&rSrh@VBtcfM>%PN`5K%dJ)`nqJekhDoMN|4Jp>8}`#`2$Z2Ff^KC zBN36i`qfRsg>2}uL^h$sJ%>Oh&cj?AH8SHx9A48&POoJ(_qLi~(`;Ug-?#Cz*-iJ> z>>p3IL~UxX%Z;)13j^;}NDQ3aUB?D#jG9W zXoDGfyM+Q;-Ml)2_L%3$!ebCBms4EgG82ZJ0D7`K^zEFig(gWVv!S6lN&^#2K21xq zZ3FyZ!CZnO5zVd=!O)I91WJLR%rm_qu_7aT6fCSV%}wD@)Y@DX#I%`oN*020VgAC+ z5H6ZFnHFg0STA8*F=A>6xax~H= zF-EUfQdOJ_%PdX-;Gox~_r!}P6P}f%6Qa&L8;@7mMR9_*&kk05cV&!_UjqER2T9UI zL8oHO9TQ*9Elm$@*xEp=C)2MRES_x7YaNPc4# zQ!tm&9~DPbZKv(h&|-k=#wcAqz_Mv1mXFI=j&$+}GK@bM^~{Hi%4`IT2VVwIK(3sg z>cLeqlQJyJ0P!9W92sQ;byHq-Uo;eo6X$6v5*^pVl}OAnVajJ|mpGLWfyo>iTa3CK zpS4liZa@l&T%IA;#$f=xhVzg%7ldsg^7(@qJ(cnHNw)epO`8(H~623t4V`S*p0??r07g<7xOkvg)V^Yg@Kapkb8Q3)5tDVF_G~7TvA! 
z;6FaRRp$A?am+g>6#{RE%uE)iC>S>vd#bW?LE7)^*(C(0C?q*`56G-DDq`i<%kf5*Jrt`??`Er9Cm^uZZ;IVWD1L-39DL_$zXPDY1 zGn+?~m$MWhb~OS8K}mx4n8h#Y9unHw2aI#WJ3$c4T_Vo zDJI;j@tRz@Cngl8#7D7FhUMO9hVe*OUV-J`*QJVWfIiq}d&Nu_==s&ebk+i~jax^a zBzMxn51#@}%T7l6Zk4M~ACalUO@F;6Z^eIaBbek9L~dWV2%G6|7*nhSCK?K3wk4Z^ z`AsL8h`KACw1#`{6bwV;Bv%kDXhw!HX5}=?2*`r~P&89JCtiCBNgbB01DXm#j5b^~ zwA%2ELDq2M{9CKoD$RJCsQJQUvArF{Qd=Y7RC`cjYZS|(7#0F7U5PU9WsjiaO5J?> zAyx2p4BI{^`+<=u7=;Orl5T~RYQNiq875N0e76Ikx%yGhx7}x6yERTK-Nt%MKT*8*{VNejntbklb+%7 zs;2o59gaQWP9rD@8fr))F)xl{3(UVOz=I;oSE2uS9JMBG*{Nn&Kmb8Nb#OwlU45IH z-PmeIlzpO{d`%5EVEV^#y}=aV<)Jbv6eKA~y%1=>EWFKDQlT%2p33a}UDp|kLx)tm zwiYakq3%jkCMT>y-ymQ};Vby1VdSy{A+6gfT#t^~Gl*T@2Z1z7TMA<%tYHomS37x#~0Ja%?W#{LIL~>L)SGyKsSb*V#p%Ubm)>(fr?6xjRdSj&L)nPB~PI)2rQ&IW>8JY zj$n4F(xHJ|S03mK_)}`*|FHK{-_{|rFlp?uWg{wTq-k^8Xo{}P{@N7)%OhcwqG6tr zS(&Q_Uh<(+bk(src{9B@#Fd0Ji(me^qD3A0RkXI-`vbxXJ{0AcT3-ZNR-szw|N8e0a1K zUs-r_Vh}xx8b(nD(N8p+ah3w9SYCpk*0y{O%6gN)tmIz}i9Y3pvUjk|;Xo?wl z(8{$DKo`k$mKrv*nz2m7rse`&Li*U>Bce{+Oob8-1fLYT{gghjct#vtg$>dZG)k7k zT&$mLbu{P;2x1oC{+&ZRn26`8Lg`{k$xZ)=!xV2qopQEiN1L`3#~Z(t7QPRvEUSkJ zoq+8bybGs?-$_7v&PrziIo?Bep;hb`kZXSw)ucOSf{K+`0%!znmA8z3kYQ_EB;}Ri z4espCQo@BKLr!qwF-%dAB?68e15~_PXBx?f%9W<|D;W1P2IUxux2|G`*zC>51Up-d zU&oka1fC=!8P+!F&x>L4#yCD5xLdc`1=_yO2=sRVZG+WHs;Be%~e=XbdP|TMf6`7HAAtSTsHatt7Rb=l> z@a3G`ivP`kQAz+PKJzYe`@Y9%`0yX6v4_ryd~NVleu`NVM>HI@60UbjR<`8MXT;Fe zc)M>7Vaj-Rbmiu#Ve3a?g<)#`WOba=7*gC8i($g@1Nfb@5Z$3&&6Giq!0D(JQ_e}7 z%k0?F^z1s+Lt$bsrd{H}6-Ck7r6i(F+^LpSSo9@-tGx?bfFgo5!=nf!V^lQ!C@8Gb zT7sT*wGh3aC8(ZKUubG{>>o)wMO^1@&%4M()^s+IRMyd<7Boaht(PQVSixD4+-)=z zcB%TphT2bPF9)}v>N>!FhRo&x!84dWp9+Er68+Y}GoP z$NVWPrIqIJe=B-a5_$QQ?pEWv-8n3 zNvjA)z~RcZD!XVWvZ0ZAiiwj95l4{lh>@?PX?p@mlkX25@Fm&n%|ya}bMuIk_Ci_6 zL$#xt2E<8q;Irn(a9SfXfXRXa#)oHdnc-oJ_IDdkgY`y6tc#c@>JS{gdjqIDUJMJ`la!ii3C6hc- z@=!k|g_|^Ho6&0%kfbOncAVy}k>5@TO-d?D4b|How7vD)*n7o6okAp_)ea(e^&H3@ z%%*mXzOf}@YttTB% z|G(Ir-2n$M^bjTyHQG!$`rjq1kx1h{EZ5F%v?OF0{3yFx6yaQmJW_O`d7uLcW=SYqVhapJ+TDEXjH*D=5%u~8Gf*sY 
zWvju?19bej0w-weziSEFArMOOL9EDS@4CB5k@vBxk>6M zX-5&hG5)|Sw@~*92{5&w_*K%cCzx3{uV<*$)*%fzqs)$u%*Wu|M@wzR=^v6VbvX3* zMJUwRB!wc39;v)rIF=O_caGhsXrz8%v zLu0>FI?ssDKKqGgj~lfZB3Wt(CK))D+9bIQvF=u}s6~LGOnNeO%?%PoiX0@Z)YdD4 zYdnTy$h3a_dlHi1mC-63=T#aMO01FaPyCtK!&&i@$>Trh4?%R=ku4a*|GdOE8nyn} z2}G@10s36k)nw`3xAbDO%9bd;Ln6a8$$11gJEU^){VBC0;m;wGKA@yA8C~d?Mg2Hj zDQTrP)E$O_+R!1yppr@S=)zO8iKGsNP?5u?)Z~fuI$B;~{cHODE(z+fsr|g;PAIIR;hZ5gH)<|WI+SFma(F)XQO>8Myb5KVB;E?)B zR}-n?@A$$p$I)iBlB24XzTFnUKXmf}pkIm_<=}YjEim>Q#%y*ezNx=}!0t$?QapV@ zcjL0m@uL*?!g5%Lq#i-c?A)wk71?dSjGnu8D3nAq}CWt zi~t+LfOhc`3V*TVu1*@Aeu@Zcj8aJkW^hVNDndr3R=Y}(iLn;=6zNQQkwVRGq!Vgw zIoE+FaKnzRbIC-dgQEaZX_)Qn*QU#}%@8?4t^kctk5nc3a^_`mva*!KWIx0`;Igh8 zXljbQF&+gXHddIoWC9WNcK_XK$P5lt)%ObNbikCH%lt&A*mmVM-9sS~{ZwunrsbsE zlx|WrAiyAz(7k}rRi2U>#UN+_s6ZW`0A>}cAdMPi!u_rhazm(3oloh7n8jWKBTe6= z$VjKlf=TbYIXiqw)Eo7SEMEgeDVPzg7^Aod4$=xcTRsj}Nnk*543(FH-Ab*;-8&3B zf(80etli1|C9rGAq2QzRb@Z_RqJGC}J%&MDSBBFL1UtyTArFoNA_~dMAUZ}$uMCm)~y{QGtfDdk{0!OQu4bE9)0V7W(f&rVp9Z(gks$`c?aeLjpb z#;~MZIiiqSL>Ffb4moL+aH0Onc#9Yg!&%w_qBlCOuULR^XTnO0K=H}w!G>oBaaXuY zv@DR?-2h5LZE}LG0W(kumrgVE<%?7uNFnx7DD*t(q041dL;JegIbwQ~ZkWX17-v~I z`AO{K0#3rBY{s0nw%iU(i*1L~et?LL0#GlJXLrIdSl(~Fw?n&G9HZ`_NJ(+%2Jj0V z@+|`ThXFokoYHk)iKStU^(gTY+^qDN00)nAB{VQEc6Yql1oq~LMpPCshBH3e20Xg^ z<$28BvkHx3E$9TsXwJkc_4wUsQ?mh3ZN!@-VMRETu6Dwjz~{LZW+k3UCQWjdX2E4^ z63~M~;~;)NLm1R*3CZI#xBe3ZwAebu2Wcoy`#+t0uHf4v#MwlkXeprz227v?A~H!s z%G)i*P*^>~5)=`9!xZL)P!`k*>rcx6lCpw#JDm6n6>!N}7jSHb{Ax_5hMf}PN`+X1 zBuaYdjkPF%3$;dg@xWgw_LYDGf>em4R(Ow~m<|9rcARF{RqOCBU>htS{w%&KeVQU8 zW!1Z}rQ(&t%de>yheb*%*(obZA>m9+x{_mtMs5JwB~D^KJJ5%eF5~sIs}T)Br56{T z!By0{42N8P;53_N52~}_kHm~wJxo@`x^p5|0aLm{v##_b8D+nie0beN&iAp{wp-$U zZ2Z1Aa06YP!XD)slS{2EieEC^)1Fr$yZU4sNBM6Q4?$-~3s9PfLczm-O=8`%8Y2+BIA^KT z4iT-~1^%Ka|9lC=e~*&+C~L|R|PugRz0}B z?3puP36)|6J)MB`S{jc%jEG<~+e};(74x%+W{b^Z_<{T5>Ix^u z+lQnm7piLpN_l@Hn5*srNTohal{gD?nS#oj(hxV-$c8WSqJjYvYV`S17bVk^gC4D= z$DRNO;D)r>YY#XpTpTheVguzTkD-DA3(JhC7OzEbuboMAFdOES164M44S@0yHtN1` 
z1FNGMZY}51l59{x!_QHIFHV!$cDa2Ba~*+0c>#%Vbf zH+%RHi2x3hPNL-COPGl73t<`ZL5KR5PttdbpcoUu<#}enfSn^D-WyOQ2kb15V_+z|+p}+?Dh+^|n}FG7VMzU5IFiQz?fc{A?rkN-5Fj-r{DdneDTb!Xy zJ)R->kQzb?4Y-**q;vj$QlOBcKwcHWO1B6T8SG$FL4im%eDa7i!8S8H2g&#xJOxQ3 z%{m}<9iGH+dqTtj&_agR^i!2ma5gXP^}6M_Az+S79~b;_0FYV;PFjd}2C^9UFK7%3 zU9GkL;7{j^h)|3xM54WiILYA_Lc#`@&~-ChgiP_lR)Qf!G5Dr2mc-=YEb5t5!&Ffj z+lDfU$o;V?XY5>og?en;7s72p_$@cJMf7a%lnpD)8t4?KWs%Xn=q;0Hr9lC2@kpOq z=h%p=zoUTY%H3>PU3f?$m+2(TwG-=zVL~V(Mna>kUr(?}xI7Rhbli`-^O!^NTFBj=)3PXl2jQ`7~2STZWX|EZEz1BBf@lnM`v(uMO`36feL|Y$%e@~K8H-Xxhd>d-pCHf zw|t{|Qjmm)fHicP(?p!G0$gC=RtBpYieH1al63a3eLUAI=I`ziIr#axe*M=RAAFui zuUV$IhcoYaZBJd@T2ytD2X;jx^=0=j?RY?oAdg4secUcER=4CS-i; zyF>(qka4S>QT}zqRQaoYz>$Kjqwegs%eV-t4R24Or;43;v( zq}hl@f^ui#s@YNs6=F#~pJp2?K7X>_U&C6g1cOols*I?hEZ*2~*1xb%nI2&4j*G9y zp9wiBX6(&BlfHkVv$=v}(?&;y2jyxAL{%2LhK5T!ZWJ{pHz=S0^><5!!&{gn`SPw9 zTG;Cd1Wbmfbs&pFHY*a5kz@0TGFcHpXuEPyqvFZE@@Gpo4*nzBp}?3>Cp188YKq>_ zgE-!DCD1HXG1$rptW6OYYEd2@tfudea!D}J!W&RxGdfu}+>mw1s?kbwRmUZa0hK^d z8}OJ!+zGJ%&=^BET!mGj`EzkAj6A>8SzIC}RzZqwfRU zXf+X+TG`}O*GW?@pcA`|HDG;@!`$V4_W$7Uf3#*V1dTwq0S5gFJBgx{wRlK%Q9eD; zcJ?n5ni0IXU0_&%rt#EAV%n5Ny;x0jkkoGlW}+5N(jEA}&6^Zn5lQ<`q8cJX)T}wp zrrH<3GKy7m+oO1qIBuY5S17D6gkl(4ZV{g)ennP+{3+grUIRr#vw*=x625AJ#K5u% zsK&%~Qh-x`B)UHQBy$8ymF+4$aEk(#C=R9t1#9yYen%M$B)*~ zwU(qtD*b0m|nckJ@=uCp6Tp{v@q1!I19$`6EN3T)Hf!9nx) zS)4E@B7RYsW8z^usJ4#Gf&T1D8tgB)08m$LeoG7NLRty=-(FnfA@P($WH=;cV}rr%!Lf|3o->JMU`iEd5Kw|e zvL3HG67UQZm5VbXOw}9fXWY5aci2m!nKW6~8-Z9?CTkA(pd17m@W?7wFc9KyBen(8qxuD_wS&+s$cXI) z1Iw)GDV?Os)e@)fmC-O62gCy)o`(&KLOCGUbt{12Km6u!BfnTYC!)!=O^pE_{B43= zqJii584g9r%*5389FA<&jiI#i815UJB&?YY_Pdm(w$Twp*NtC!_&2_%>w0W0ASei0Z(6j(Tv>U) zNUKsNA;xdUZ<8~22E`bAT(FKqB45C~8MLJigD(S33xbyU}u3rfgT=6xU@ht3on=fe+P1tHm7kuycl z#Tl!4q(q?LA;*+ti7!blv`#uLgnVMK<`@u;@U^x#w1=P5<{P=Dw_a?M74z|S+-MB+ zuHQzeQhl0KpuxfY3<(ju=+_;;{9Ut?pG{qt02U65t*&#kgHj{=aPh^PHCy`7qZY`mq#8Nsy4AHT zhaPlbu%sN6+juQniG&}_t5Uc5ShvnGUL~~~8{I-N1clVddL@J{_K~OzMUzA67H+np 
z{E|E9z}UVNHpH+{Ja19pa*r3>Fkkj3Hm@>CpWEk>#j>@z{o{B_70VrfWe4^^gy;j4DCX^2F#jZOW2#1n~NuRd+9+hO9jju$!1VIP74yhaH~OmVUxq&$?!K4EeD_EgBKf{!k!VN=xZPG zTkQ7(mj@RItlR>3IcaW}5SE-~!R3m5ctBHrkHJ#;&i{;fw2>3%zrfeHQM)>&dURpe zQ>o?-MQPaZBvet6L$&pUvG@DC#0DaTGlIPy?VjZjUbIOl)b`TKy`_=!loS9Dg^K0O zQ_Cr>i&=qq9nZh*_Ml{4b!}QAy+xsqJz=I249HEw9vtxy<>UOOa%CVC7@=^y-un?5 zd?2gc7}UfvLL5M$d4A~uzn|d~+`^=J%18Ub5p-`&!!TKB4phRoR#sV(b1nLAl(*a{?8}UQ5pV

?5&|K+bKfN?Il8*@tFQ1&$%q=BW^gEU?60)HcZZbC2arZg6l{D?Pv!@$Lbgo?68?6u4i7Yc6f{vRrj?}!1m8S(<%FdLeEdOr+o_5S~q3M%&Pj~bZ0{9Z%(GLMOmk%m0 zDV}HXhuAH0p{j+0`YsV5%|ZpHIYRBPmNX%sz>Yp4V)GmU>=L;IHqhfBt~)rpWqe0^ zs}hl?20jwrOCo+j14Y}D#SbRJ)(&5O79l2RPx5$`3fw4Ce{0dXi$oj@+}Wv5(@&@1 z&|dQ1>O9ukxkDu|8AT+69W_IO%LEh)^%&xxxTX`gp}`aCP)|wXIPY_nX12<0NHmIR zs~H<}B&q7v=;7io;x0~5V1qEt$#~K4l;9v(sQ?K4olM1mgcfZU)&xjBP<&KSnAn(K z9tI!Np*1WF32xYcUex1pRR;u8YkSy|G}N}oFt?gCB&tEgO+L62zdE3>LvP@VWUQ%j zUecJ`f+O532Ndhgs2y-UkWpc$I>5K&!_nePFRNc^NL)GH!P6G2v0zc{6!*(c#dw`U z2_)0E{~_h3$S{m=ljktm&2$NlYr8?pmL=juL6w};jDR{fq)AFtM837Ai>Q$2Rb0`^ zOz|$em6AwFZ8atp@fWo#7>>{p!wQz<4{``+ns&kPbO@y4z`Sq$@IV6?FL6udF=Hk6 z>Wpfqm4kbOXgoZxiZPY^v}6@gXsap(#V`-2!ssWoq=Fs9)M&xE)6m+&8_EP%=&pha z0E#FXxN{WAUS_ep4VEJZ309uyQviXdE#)D_+rWjYS8k`J$PzM_dXhV6B*>IFI}lxN4F*9*Z^C>9XJ@io!Ojqs z6j~;)dE7|&%*}&HT3tcSf17_AUGR2L?sxF|*p9o@E7r@Ti-Mr-2up2_s+dN*8)j=^ zMdsFCx&!pcpbzcFi78Wb#zNgrj57R{;VfbcOPemK4`V8GR??YB>yC)6cm~&?MgBXp zk_~_Zn>`(gSj3eGj?-rzOT;1?Pt>=Dj`0`|E;shNZ4r+pt$5G}t!C zax|m!B=%UrYJ)lfPuV>VGh%ZmO2Z(uQO{M> za4QZ5tycl@t40JWs2Y_+IfW`#IQi;b(mTW9y3KS9lXS{612RPIh4!CH(A z2ssKyG^}o@k3qK(0=8}eI4w#%qY$*7yEk#}0A(@2Qv!>`HHcJtL3Yh6!fIia@jvE9+rk;ll9U#9KI z+S+(p+Q9%y$B-onBWfe`;JFD`bbfEH%FC+0;5&B>o?d0@WfJa!mjy*G31;y@iVD1I zJSZJKWGrQ-1A(B2w3*GbCuA^SF4IcXI(}R6a8|R?zZr`_KCF)!eVt&6390iQ<5F2v#qyAL` zGz}H8u0=-V_uCmFWDd*|Wr>Tt1TBH@%hWHW7Z4<|Ml55*Iw{7~6!#(=Sv8}BcL!Dx zetK5~gWA@R5PmKFwb^VWgjPzAqeZrlcyW4In~8bb)YQer;OLr^FzkX00eADTK4C zpWS}tK%B%WpqCfoN+N)gQ$F=br_}vb{k^8%W`%2!1rJ4$J+JtOUyF;dA|M=$<;kwl z;|@7%qSVZPfb@1EhXWWYm;WaUtyxC3hyUG-0LuoL*O&!f%gYgRJ-`{!wg{3;H*|C0 zh-cd(;)4Alz5?XKvb4c7DyD}dF_AFt$6i+c)C=R(iEYR{DYovlQyUYUt#08Y#eaXr z1BbZN{|o9Hx*97Oo``aJdk*WApme@Zqns8d+tQgh9$mypZCcctL>G+hm~k&IhDd9u zTNr;nk+;pGsi_=}aSV-iR#bEYBhSLhK-h3U5Q4@vOoRAAAf{p6NG@vGRMUJhFFDQ1 zYQGb!HauFq=_fjXC6@Ii&o4+j0CVg~&tD9tVHNBiPSJ~$2l*EyS&FFMnuFx7qb>G* z-bHO$@UMnz<7Tu@@TEc;PnZ z<~Y=!U#Mm%bpd3o210ML;+wHe6*mYtR19c|7ZKlTCxgsC%(k}a7XxOx9BndpdQLP? 
zi{gQQ(9Pdb36qgZ*3mrS!0j-&;fF6!$=)15GYIdLIR=D9B_OGdfd`WJ35$~EyhNFc zusRuxvJ>Qo%5oSEd8FL`^dKzH?g>O98Pj&&ChlMocL%iwwa#zg!kKqeep?qMbKoyV zW=VUVpUZMEds=&|F&l%;Wx-gmBr}302F4x!4y|lbqryd@x^0mpmq%s-b<*|5bjVo5 zFAU%MF+%j9#wB9aOGBK@Yey()5A6(^Kpa3R8(M6e8Hy+2TRd;$)5us^^?s8JwI@(% z%-(<*P{eYa$qZ2*qv2I*zwow-gMYPd<8{0yr-JsGB%H^x$4_KEIhR2OFLHTyB~zKk z&S!<=cu8=f$2oZdm(xqLoMlL96lKAK^6^a0gzjR!CWA5O?WM9%srpXbD@v9ZVWt*l!+H9*{cOU^c5jn2J+?W5A-w%M! z9bMwmyJ*Dr?fTP}O^5NfTOeqN(xKt5&8^YL1!PilsnI@Y)+^5yfVR>uaYb;0L{(-Z zRW^DPh+XOpx8E3$`7zzW&Ni*8YNLfQ znUb>Ihe0_gUF-jX_kge1ZdsiBReK??z zXFuLiggN!L0T+@Torw$id;vA=O z9>CGhuTN;PsCRE>TV{c!voB*HRs)^fe6B9TVh7IeIE21|%Sf7Ou=TeEO3Ne(dYGgO z=LTv8skQ|=JQ|7_yJ(GPQPrBKl8Aap{6jrP^*0*!4GcK1nz?MyV9m{3#>R_k{zPHk|tC-VrNDp#z20zZKz32J43 z31iH*V47NEj6edMoI7H;2?g@K)!60p-*F-z9@Adhyku~W0oC9$Q7uIvt9)m-c8H8* zi{GGL@SX;-3k2VHRrw#s1+a7hbl<}fTw20A_Ia9~%9BJd)Qp8_0No5o0As&%@W7o| zV0O_)mDmQQ?bm7geV{C&`C3luD%^>w`aZ;odf zw;E9sICQO)&{8bO-ETJ0eZQO*IXdI;?%Hy#d7C-YDXVDR4FqeB^>>HxzC{Eu2AKHb z&g|3WxnD+je1c|51e!3nQIM_fB)XPW6h{fmG_^P6RxvBVZ(lV;qSi++Rq$uHf58WP zvKqiL(^-VA%v)R^l!)X4b>$F$Fcd2wriy$x-2z7~%4CBqB^7DnAXTT1)<4=thqgl? 
zN+jEGYq-}fS03;@;_vp-X3Y~63XN1Kdg@G$;iU%NrAe*?X;Mn?ATXQLKK}H;Tp=E3H$6OZBuTi?5aj9|X zN!o9T);U`v%-#5Dh{lxGFDZATcCHjd0p!`$A+fc&S8b=A!6G6p|C%pxR%KJs$c+3I zreZ9LB(oNb>sFuw$&?>zpg(O~^ak95QO4Q2^pE^~?3A>z5C^R{A zlY zJ8Ot6_qI0V+xntGG~&;A^YCwJ-N%d{@@A){rujokC(y{aezGuu?<+I*ZY~#7EDVAy zh=oZciR?M+Rq5+}H-Mp2mEJ`B#S>ZD)!6^&tND&F%Q6$olBAK%(gCW)vrW~_r2ztv0} zi}_djZ>8bBx;K#&KBX$n0mGGkE z-R@KSnrGOGeOSm&i`-dTp2Xr#8JrI66u%veXbe6*!d@hJCGCP|tbGB#w|g}$Xb2%_ zwB0XJ)ZG36DkY=1czg1AXy<}c=uuvs(a+s5_(DT7`DRNEyjJosXK^XcCoZBKsZ*oq zQ~to%f##|)X!Zc@+q~z~$^`FDZ6casrCvnL7SR23Z~zs4>KLaZMK&%3PJUdE9SDoX z+*N^LI^nfRrSMI5kf48jLpvPA{KNU0)R zT!Lo5svY{?QCW4DMwcCF?X`tyasnLN2!fV%uLzhB5ewWEK&{0EYolg-QXR5F$Ubik zG&oXz6W17VSGMCZb=*~uEiZ**1D%t~x%3ta_Q(G>#-Eze_g4j;!C82jh7Ei)m*SD2 z^LC|G!8wwIJ+XR}=eM~I!j456r-h5RF&q?2+Utg>K!G2H6*P^!!lg6hgT9B@a1#=l zcMAHGKATb)F^j-x@>OU1{^lpB@8Gk2{@rtVcs4$v$FqL!AoABh-=cg|c)alcFaF=c zGK{OsB;SQGhZBD<|M~wPVf#hH))2o-a5FnQcQc#!vvvKvl6-qE0UcK`L9JE|dHk*z zSan}ctizGVG7M!9ob*U7@FZJtnEdqrvfWDqqVLbd9wGlRb>r;LC2r?v zE8Gd+N0P_Z{FBx+pMGAPJjVcS$dx}V3B6@MV6{ZgKj1wO9s1jTF5WM=Ha?!N_YdDU z{XD(p{$FZHZ!b7L=)^u@W$?@0Y=i$oE$a8AVHs2Y$?`Jc2f4t${mTqN#b$58Q|Ht9Q z_rpW()W7$cgO!omSrYcw2lhIXogwCW=2jBTdZauDuMxEIw%wg=Vuk$P&W^)zL%q+v zp}lh#x0Zw1%W80c|Bs?ej32*0yL*56cm{;^{^FYp_(_Dj#QukT0KN{);!HQsPDZ~3Ay^Y{@QuIT=VzG7ri?@?TCB%x!@D%PbS_UsR`cr*FNf> zF;&0rAEqyKt|-)h$-MpT@BSmqXQL4UlCIbrzUK4y=d<-KKio{<;|KYUFYP=j=HhwB z5t;iVTyOpTIs56!9+lbuy8T_&ynB3^SoMi`@6z_;2b=`tcID@t^PeK~PVu*Yz#8k1 z@XZf#JV?n$Pxi#!4h)Et0@0V+!*6kr@1>sq_EZ=I?5_7yxcuGyEB{&fe~>!skN@<~ zPsisEGdc6OtT&M&a1Qy$R;S@%+#K6)XhP`}}7ofJUbaQaVrV^?6IL zpE2+DSv%j(p1tgc_N?#q@baA2{=-W$Jtt$%2jeSA?-nfakZUU5xKLpF@!8!m8(Wx^ z-}x%kRAiK%`(erX9Z>51hXQzi<8n|IJiGer?zhUYv>0c1Cl|LTSJ!tZXUAo_Rv7KP z>G{dup5618|9bM%Yac_b6sNhq(g!}de|dTF>N}S6C+Er)zqXold-<}}SNYlNxt%}b zVXn_!&eyM=z07HQ)ehUM{p{73SM#sGw}oum_6M$`k9@2H{CHk>XSct*ee%P;{)GB3 zP56)X-w*Ke{(lsj{jm_e!w(<~e@gv-j`Xc)vLpS9$#bKhPaPhk)OQ*0pguiJN)=}6b~c5 z5W?`=YtAa1=u+tOPmctXcW7L`{4bgR_jRQNpm4cqjyt544^FV2rc6wayM 
z0M^~UAl1~Hhw~RF|2oI^uct~}@2~imH0NKR!{oR==M)!libdf1`0Xhkmhj<_n}a0a zmip`E#ntbSlDoT4ZeM%^QSkQpo5RKBMWn#bUtPcX$d`Nj-@g6)?~$qbqWh7A<2@3! zj$=ac`y@(-Hs)K*)ZdY)_9tlhUxcCEUQ|y^?^}P*9;_pI*3kR*K!=v++AWOt!xr?% zE%@fu+F#_sQ*~1;?00OzIE8#G?(dZ-UcEYee18$&o5RVQi`!f6$f?4~^OMi6?@wOdUtIcciZFE$U0WwF58A!M$=%Io z-n{D*-S4?}tZWe^dy$KJ^|Wkzb_!`Q$-T^F2ml9XU&Xuc3}Z2aoc5 z$4KnoVI;oMNI%+%M?3N3uoK^|V2j>CL-u_`M*AozO}?06{8y?(40KE57bkFUqq Q|Bu)I10^F{h5+IK0As3ms{jB1 From 3f212a82029d995303ce5372f62396c2ec0da976 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Wed, 12 Jun 2024 18:13:34 +0200 Subject: [PATCH 40/44] Update tests --- ...__test__import_dump_v6_with_vectors-6.snap | 2 +- ...__test__import_dump_v6_with_vectors-7.snap | 2 +- index-scheduler/src/lib.rs | 20 ++--- .../documents after initial push.snap | 2 +- meilisearch/tests/documents/get_documents.rs | 8 +- meilisearch/tests/dumps/mod.rs | 16 ++-- meilisearch/tests/search/hybrid.rs | 30 +++---- meilisearch/tests/search/mod.rs | 10 +-- meilisearch/tests/similar/mod.rs | 48 +++++------ meilisearch/tests/vector/mod.rs | 10 +-- meilisearch/tests/vector/settings.rs | 80 +++++++++++++++++-- 11 files changed, 149 insertions(+), 79 deletions(-) diff --git a/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-6.snap b/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-6.snap index 0aad0ea97..a9c76227a 100644 --- a/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-6.snap +++ b/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-6.snap @@ -780,7 +780,7 @@ expression: document 1.3484878540039063 ] ], - "userProvided": false + "regenerate": true } } } diff --git a/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-7.snap b/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-7.snap index f2a5e1d69..e5d28e450 100644 --- a/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-7.snap +++ 
b/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-7.snap @@ -779,7 +779,7 @@ expression: document 1.04031240940094 ] ], - "userProvided": false + "regenerate": true } } } diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 4278d15b3..88997b715 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -5077,12 +5077,12 @@ mod tests { &fakerest_name: { // this will never trigger regeneration, which is good because we can't actually generate with // this embedder - "userProvided": true, + "regenerate": false, "embeddings": beagle_embed, }, &simple_hf_name: { // this will be regenerated on updates - "userProvided": false, + "regenerate": true, "embeddings": lab_embed, }, "noise": [0.1, 0.2, 0.3] @@ -5211,9 +5211,9 @@ mod tests { let embeddings = index.embeddings(&rtxn, 0).unwrap(); - // automatically changed to patou + // automatically changed to patou because set to regenerate assert_json_snapshot!(embeddings[&simple_hf_name][0] == patou_embed, @"true"); - // remained beagle because set to userProvided + // remained beagle assert_json_snapshot!(embeddings[&fakerest_name][0] == beagle_embed, @"true"); let doc = index.documents(&rtxn, std::iter::once(0)).unwrap()[0].1; @@ -5256,7 +5256,7 @@ mod tests { "doggo": "max", "_vectors": { "my_doggo_embedder": { - "userProvided": true, + "regenerate": false, "embeddings": vec![2; 384], }, "unknown embedder": vec![4, 5], @@ -5267,7 +5267,7 @@ mod tests { "doggo": "marcel", "_vectors": { "my_doggo_embedder": { - "userProvided": false, + "regenerate": true, "embeddings": vec![3; 384], }, }, @@ -5277,7 +5277,7 @@ mod tests { "doggo": "sora", "_vectors": { "my_doggo_embedder": { - "userProvided": false, + "regenerate": true, }, }, }, @@ -5768,7 +5768,7 @@ mod tests { .unwrap() .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) .collect::>(); - snapshot!(serde_json::to_string(&documents).unwrap(), 
@r###"[{"id":0,"doggo":"kefir","_vectors":{"manual":{"embeddings":[[0.0,0.0,0.0]],"userProvided":true}}},{"id":1,"doggo":"intel","_vectors":{"manual":{"embeddings":[[1.0,1.0,1.0]],"userProvided":true}}}]"###); + snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir","_vectors":{"manual":{"embeddings":[[0.0,0.0,0.0]],"regenerate":false}}},{"id":1,"doggo":"intel","_vectors":{"manual":{"embeddings":[[1.0,1.0,1.0]],"regenerate":false}}}]"###); } { @@ -5802,8 +5802,8 @@ mod tests { .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) .collect::>(); - /// FIXME: redaction - snapshot!(json_string!(serde_json::to_string(&documents).unwrap(), { "[]._vectors.doggo_embedder.embeddings" => "[vector]" }), @r###""[{\"id\":0,\"doggo\":\"kefir\",\"_vectors\":{\"manual\":{\"embeddings\":[[0.0,0.0,0.0]],\"userProvided\":true},\"my_doggo_embedder\":{\"embeddings\":[[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1
.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0]],\"userProvided\":true}}},{\"id\":1,\"doggo\":\"intel\",\"_vectors\":{\"manual\":{\"embeddings\":[[1.0,1.0,1.0]],\"userProvided\":true}}}]""###); + // FIXME: redaction + snapshot!(json_string!(serde_json::to_string(&documents).unwrap(), { "[]._vectors.doggo_embedder.embeddings" => "[vector]" }), @r###""[{\"id\":0,\"doggo\":\"kefir\",\"_vectors\":{\"manual\":{\"embeddings\":[[0.0,0.0,0.0]],\"regenerate\":false},\"my_doggo_embedder\":{\"embeddings\":[[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0]],\"regenerate\":false}}},{\"id\":1,\"doggo\":\"intel\",\"_vectors\":{\"manual\":{\"embeddings\":[[1.0,1.0,1.0]],\"regenerate\":false}}}]""###); } } } diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after initial push.snap b/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after initial push.snap index 433a190f9..d2473d00a 100644 --- a/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after initial push.snap +++ b/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after initial push.snap @@ -1,4 +1,4 @@ --- source: index-scheduler/src/lib.rs --- 
-[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"my_doggo_embedder":[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0],"unknown 
embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"my_doggo_embedder":{"embeddings":[2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0],"userProvided":true},"unknown 
embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel","_vectors":{"my_doggo_embedder":{"embeddings":[3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0],"userProvided":false}}},{"id":4,"doggo":"sora","_vectors":{"my_doggo_embedder":{"embeddings":null,"userProvided":false}}}] 
+[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"my_doggo_embedder":[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0],"unknown 
embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"my_doggo_embedder":{"embeddings":[2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0],"regenerate":false},"unknown 
embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel","_vectors":{"my_doggo_embedder":{"embeddings":[3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0],"regenerate":true}}},{"id":4,"doggo":"sora","_vectors":{"my_doggo_embedder":{"embeddings":null,"regenerate":true}}}] diff --git a/meilisearch/tests/documents/get_documents.rs b/meilisearch/tests/documents/get_documents.rs index 3bf3727c4..efe4cf8e9 100644 --- a/meilisearch/tests/documents/get_documents.rs +++ b/meilisearch/tests/documents/get_documents.rs @@ -637,7 
+637,7 @@ async fn get_document_with_vectors() { 0.0 ] ], - "userProvided": true + "regenerate": false } } }, @@ -666,7 +666,7 @@ async fn get_document_with_vectors() { 0.0 ] ], - "userProvided": true + "regenerate": false } } } @@ -694,7 +694,7 @@ async fn get_document_with_vectors() { 0.0 ] ], - "userProvided": true + "regenerate": false } } }, @@ -722,7 +722,7 @@ async fn get_document_with_vectors() { 0.0 ] ], - "userProvided": true + "regenerate": false } } } diff --git a/meilisearch/tests/dumps/mod.rs b/meilisearch/tests/dumps/mod.rs index 6f93d94a7..fa402cb41 100644 --- a/meilisearch/tests/dumps/mod.rs +++ b/meilisearch/tests/dumps/mod.rs @@ -1977,9 +1977,9 @@ async fn generate_and_import_dump_containing_vectors() { .add_documents( json!([ {"id": 0, "doggo": "kefir", "_vectors": { "doggo_embedder": vec![0; 384] }}, - {"id": 1, "doggo": "echo", "_vectors": { "doggo_embedder": { "userProvided": true, "embeddings": vec![1; 384] }}}, - {"id": 2, "doggo": "intel", "_vectors": { "doggo_embedder": { "userProvided": false, "embeddings": vec![2; 384] }}}, - {"id": 3, "doggo": "bill", "_vectors": { "doggo_embedder": { "userProvided": false }}}, + {"id": 1, "doggo": "echo", "_vectors": { "doggo_embedder": { "regenerate": false, "embeddings": vec![1; 384] }}}, + {"id": 2, "doggo": "intel", "_vectors": { "doggo_embedder": { "regenerate": true, "embeddings": vec![2; 384] }}}, + {"id": 3, "doggo": "bill", "_vectors": { "doggo_embedder": { "regenerate": true }}}, {"id": 4, "doggo": "max" }, ]), None, @@ -2096,7 +2096,7 @@ async fn generate_and_import_dump_containing_vectors() { "_vectors": { "doggo_embedder": { "embeddings": "[vector]", - "userProvided": true + "regenerate": false } } }, @@ -2106,7 +2106,7 @@ async fn generate_and_import_dump_containing_vectors() { "_vectors": { "doggo_embedder": { "embeddings": "[vector]", - "userProvided": true + "regenerate": false } } }, @@ -2116,7 +2116,7 @@ async fn generate_and_import_dump_containing_vectors() { "_vectors": { 
"doggo_embedder": { "embeddings": "[vector]", - "userProvided": false + "regenerate": true } } }, @@ -2126,7 +2126,7 @@ async fn generate_and_import_dump_containing_vectors() { "_vectors": { "doggo_embedder": { "embeddings": "[vector]", - "userProvided": false + "regenerate": true } } }, @@ -2136,7 +2136,7 @@ async fn generate_and_import_dump_containing_vectors() { "_vectors": { "doggo_embedder": { "embeddings": "[vector]", - "userProvided": false + "regenerate": true } } } diff --git a/meilisearch/tests/search/hybrid.rs b/meilisearch/tests/search/hybrid.rs index b8a4110ad..be6e0b1c8 100644 --- a/meilisearch/tests/search/hybrid.rs +++ b/meilisearch/tests/search/hybrid.rs @@ -128,7 +128,7 @@ async fn simple_search() { ) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}}},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}}}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}}},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}}}]"###); snapshot!(response["semanticHitCount"], @"0"); let (response, code) = index @@ -137,7 +137,7 @@ async fn simple_search() { ) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam 
ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"2"); let (response, code) = index @@ -146,7 +146,7 @@ async fn simple_search() { ) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam 
ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"3"); } @@ -207,7 +207,7 @@ async fn distribution_shift() { let search = json!({"q": "Captain", "vector": [1.0, 1.0], "showRankingScore": true, "hybrid": {"semanticRatio": 1.0}, "retrieveVectors": true}); let (response, code) = index.search_post(search.clone()).await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.9472135901451112}]"###); let (response, code) = index 
.update_settings(json!({ @@ -228,7 +228,7 @@ async fn distribution_shift() { let (response, code) = index.search_post(search).await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_rankingScore":0.19161224365234375},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":1.1920928955078125e-7},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":1.1920928955078125e-7}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.19161224365234375},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":1.1920928955078125e-7},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.1920928955078125e-7}]"###); } #[actix_rt::test] @@ -249,7 +249,7 @@ async fn highlighter() { })) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"}},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic 
Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"}}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"}},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"}}]"###); snapshot!(response["semanticHitCount"], @"0"); let (response, code) = index @@ -265,7 +265,7 @@ async fn highlighter() { })) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** 
ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"3"); // no highlighting on full semantic @@ -282,7 +282,7 @@ async fn highlighter() { })) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a 
Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"3"); } @@ -370,7 +370,7 @@ async fn single_document() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"][0], @r###"{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":1.0}"###); + snapshot!(response["hits"][0], @r###"{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.0}"###); snapshot!(response["semanticHitCount"], @"1"); } @@ -385,7 +385,7 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_rankingScore":1.0}]"###); + snapshot!(response["hits"], 
@r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":1.0}]"###); snapshot!(response["semanticHitCount"], @"null"); // same with a different semantic ratio @@ -394,7 +394,7 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_rankingScore":1.0}]"###); + snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":1.0}]"###); snapshot!(response["semanticHitCount"], @"null"); // wrong vector dimensions @@ -418,7 +418,7 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam 
ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_rankingScore":0.7773500680923462},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":0.7236068248748779},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":0.6581138968467712}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.7773500680923462},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.7236068248748779},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.6581138968467712}]"###); snapshot!(response["semanticHitCount"], @"3"); // full keyword, without a query @@ -427,7 +427,7 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_rankingScore":1.0}]"###); + snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of 
the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":1.0}]"###); snapshot!(response["semanticHitCount"], @"null"); // query + vector, full keyword => keyword @@ -436,7 +436,7 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":0.9848484848484848},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"userProvided":true}},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"userProvided":true}},"_rankingScore":0.9242424242424242}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.9848484848484848},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.9242424242424242}]"###); snapshot!(response["semanticHitCount"], @"null"); // query + vector, no hybrid keyword => @@ -479,6 +479,6 @@ async fn query_combination() { .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic 
Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"userProvided":true}},"_rankingScore":0.9242424242424242}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.9242424242424242}]"###); snapshot!(response["semanticHitCount"], @"0"); } diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs index 19e495edd..c2c1b9fd7 100644 --- a/meilisearch/tests/search/mod.rs +++ b/meilisearch/tests/search/mod.rs @@ -1374,7 +1374,7 @@ async fn experimental_feature_vector_store() { 3.0 ] ], - "userProvided": true + "regenerate": false } }, "_rankingScore": 1.0 @@ -1391,7 +1391,7 @@ async fn experimental_feature_vector_store() { 54.0 ] ], - "userProvided": true + "regenerate": false } }, "_rankingScore": 0.9129111766815186 @@ -1408,7 +1408,7 @@ async fn experimental_feature_vector_store() { 90.0 ] ], - "userProvided": true + "regenerate": false } }, "_rankingScore": 0.8106412887573242 @@ -1425,7 +1425,7 @@ async fn experimental_feature_vector_store() { 32.0 ] ], - "userProvided": true + "regenerate": false } }, "_rankingScore": 0.7412010431289673 @@ -1442,7 +1442,7 @@ async fn experimental_feature_vector_store() { 32.0 ] ], - "userProvided": true + "regenerate": false } }, "_rankingScore": 0.6972063183784485 diff --git a/meilisearch/tests/similar/mod.rs b/meilisearch/tests/similar/mod.rs index 0a568553c..60a0203ed 100644 --- a/meilisearch/tests/similar/mod.rs +++ b/meilisearch/tests/similar/mod.rs @@ -95,7 +95,7 @@ async fn basic() { 0.800000011920929 ] ], - "userProvided": true + "regenerate": false } } }, @@ -112,7 +112,7 @@ async fn basic() { -0.20000000298023224 ] ], - "userProvided": true + "regenerate": false } } }, @@ -129,7 +129,7 @@ async fn basic() { -0.4000000059604645 ] ], - "userProvided": true + "regenerate": false } } }, @@ -146,7 +146,7 @@ async fn basic() { 
-0.5 ] ], - "userProvided": true + "regenerate": false } } } @@ -173,7 +173,7 @@ async fn basic() { -0.4000000059604645 ] ], - "userProvided": true + "regenerate": false } } }, @@ -190,7 +190,7 @@ async fn basic() { -0.5 ] ], - "userProvided": true + "regenerate": false } } }, @@ -207,7 +207,7 @@ async fn basic() { 0.800000011920929 ] ], - "userProvided": true + "regenerate": false } } }, @@ -224,7 +224,7 @@ async fn basic() { 0.8500000238418579 ] ], - "userProvided": true + "regenerate": false } } } @@ -287,7 +287,7 @@ async fn ranking_score_threshold() { 0.800000011920929 ] ], - "userProvided": true + "regenerate": false } }, "_rankingScore": 0.890957772731781 @@ -305,7 +305,7 @@ async fn ranking_score_threshold() { -0.20000000298023224 ] ], - "userProvided": true + "regenerate": false } }, "_rankingScore": 0.39060014486312866 @@ -323,7 +323,7 @@ async fn ranking_score_threshold() { -0.4000000059604645 ] ], - "userProvided": true + "regenerate": false } }, "_rankingScore": 0.2819308042526245 @@ -341,7 +341,7 @@ async fn ranking_score_threshold() { -0.5 ] ], - "userProvided": true + "regenerate": false } }, "_rankingScore": 0.1662663221359253 @@ -373,7 +373,7 @@ async fn ranking_score_threshold() { 0.800000011920929 ] ], - "userProvided": true + "regenerate": false } }, "_rankingScore": 0.890957772731781 @@ -391,7 +391,7 @@ async fn ranking_score_threshold() { -0.20000000298023224 ] ], - "userProvided": true + "regenerate": false } }, "_rankingScore": 0.39060014486312866 @@ -409,7 +409,7 @@ async fn ranking_score_threshold() { -0.4000000059604645 ] ], - "userProvided": true + "regenerate": false } }, "_rankingScore": 0.2819308042526245 @@ -441,7 +441,7 @@ async fn ranking_score_threshold() { 0.800000011920929 ] ], - "userProvided": true + "regenerate": false } }, "_rankingScore": 0.890957772731781 @@ -459,7 +459,7 @@ async fn ranking_score_threshold() { -0.20000000298023224 ] ], - "userProvided": true + "regenerate": false } }, "_rankingScore": 0.39060014486312866 
@@ -491,7 +491,7 @@ async fn ranking_score_threshold() { 0.800000011920929 ] ], - "userProvided": true + "regenerate": false } }, "_rankingScore": 0.890957772731781 @@ -565,7 +565,7 @@ async fn filter() { -0.20000000298023224 ] ], - "userProvided": true + "regenerate": false } } }, @@ -582,7 +582,7 @@ async fn filter() { -0.4000000059604645 ] ], - "userProvided": true + "regenerate": false } } }, @@ -599,7 +599,7 @@ async fn filter() { -0.5 ] ], - "userProvided": true + "regenerate": false } } } @@ -629,7 +629,7 @@ async fn filter() { 0.8500000238418579 ] ], - "userProvided": true + "regenerate": false } } } @@ -690,7 +690,7 @@ async fn limit_and_offset() { 0.800000011920929 ] ], - "userProvided": true + "regenerate": false } } } @@ -719,7 +719,7 @@ async fn limit_and_offset() { -0.20000000298023224 ] ], - "userProvided": true + "regenerate": false } } } diff --git a/meilisearch/tests/vector/mod.rs b/meilisearch/tests/vector/mod.rs index 55dc186d5..8d619a15a 100644 --- a/meilisearch/tests/vector/mod.rs +++ b/meilisearch/tests/vector/mod.rs @@ -59,7 +59,7 @@ async fn add_remove_user_provided() { 0.0 ] ], - "userProvided": true + "regenerate": false } } }, @@ -75,7 +75,7 @@ async fn add_remove_user_provided() { 1.0 ] ], - "userProvided": true + "regenerate": false } } } @@ -112,7 +112,7 @@ async fn add_remove_user_provided() { 10.0 ] ], - "userProvided": true + "regenerate": false } } }, @@ -180,8 +180,8 @@ async fn generate_default_user_provided_documents(server: &Server) -> Index { {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0] }}, {"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1] }}, {"id": 2, "name": "billou", "_vectors": { "manual": [[2, 2, 2], [2, 2, 3]] }}, - {"id": 3, "name": "intel", "_vectors": { "manual": { "userProvided": true, "embeddings": [3, 3, 3] }}}, - {"id": 4, "name": "max", "_vectors": { "manual": { "userProvided": true, "embeddings": [[4, 4, 4], [4, 4, 5]] }}}, + {"id": 3, "name": "intel", "_vectors": { "manual": { 
"regenerate": false, "embeddings": [3, 3, 3] }}}, + {"id": 4, "name": "max", "_vectors": { "manual": { "regenerate": false, "embeddings": [[4, 4, 4], [4, 4, 5]] }}}, ]); let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); diff --git a/meilisearch/tests/vector/settings.rs b/meilisearch/tests/vector/settings.rs index e11f4368f..3fe161f9b 100644 --- a/meilisearch/tests/vector/settings.rs +++ b/meilisearch/tests/vector/settings.rs @@ -79,23 +79,93 @@ async fn reset_embedder_documents() { "results": [ { "id": 0, - "name": "kefir" + "name": "kefir", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": false + } + } }, { "id": 1, - "name": "echo" + "name": "echo", + "_vectors": { + "manual": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + } + } }, { "id": 2, - "name": "billou" + "name": "billou", + "_vectors": { + "manual": { + "embeddings": [ + [ + 2.0, + 2.0, + 2.0 + ], + [ + 2.0, + 2.0, + 3.0 + ] + ], + "regenerate": false + } + } }, { "id": 3, - "name": "intel" + "name": "intel", + "_vectors": { + "manual": { + "embeddings": [ + [ + 3.0, + 3.0, + 3.0 + ] + ], + "regenerate": false + } + } }, { "id": 4, - "name": "max" + "name": "max", + "_vectors": { + "manual": { + "embeddings": [ + [ + 4.0, + 4.0, + 4.0 + ], + [ + 4.0, + 4.0, + 5.0 + ] + ], + "regenerate": false + } + } } ], "offset": 0, From e35ef31738fdf2cd473ce55986c8e99d04966b69 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 13 Jun 2024 14:20:48 +0200 Subject: [PATCH 41/44] Small changes following review --- .../index_documents/extract/extract_vector_points.rs | 12 +++++++++--- milli/src/update/index_documents/transform.rs | 6 +++++- milli/src/update/settings.rs | 1 - milli/src/vector/settings.rs | 7 ------- 4 files changed, 14 insertions(+), 12 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs 
b/milli/src/update/index_documents/extract/extract_vector_points.rs index 0a27a28bd..736c21c9f 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -422,8 +422,11 @@ fn extract_vector_document_diff( VectorStateDelta::NowRemoved } } - // when the vectors are no longer user-provided, - // we generate the prompt unconditionally + // inline to the left is not supposed to be possible because the embedder is not new, so `_vectors` was removed from + // the previous version of the document. + // Manual -> Generated is also not possible without an Inline to the right (which is handled above) + // Generated -> Generated is handled above, so not possible + // As a result, this code is unreachable (_not_generated, VectorState::Generated) => { // Do we keep this document? let document_is_kept = obkv @@ -443,7 +446,10 @@ fn extract_vector_document_diff( VectorStateDelta::NowRemoved } } - (_old, VectorState::Manual) => { + // inline to the left is not possible because the embedder is not new, and so `_vectors` was removed from the previous + // version of the document. + // however the Rust type system cannot know that. + (_manual, VectorState::Manual) => { // Do we keep this document? let document_is_kept = obkv .iter() diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 467a2810a..997ab64ff 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -866,6 +866,7 @@ impl<'a, 'i> Transform<'a, 'i> { // The operations that we must perform on the different fields. 
let mut operations = HashMap::new(); + let mut error_seen = false; let mut obkv_writer = KvWriter::<_, FieldId>::memory(); 'write_fid: for (id, val) in old_obkv.iter() { @@ -886,7 +887,10 @@ impl<'a, 'i> Transform<'a, 'i> { match existing_vectors { Ok(existing_vectors) => existing_vectors, Err(error) => { - tracing::error!(%error, "Unexpected `_vectors` field that is not a map. Treating as an empty map"); + if !error_seen { + tracing::error!(%error, "Unexpected `_vectors` field that is not a map. Treating as an empty map"); + error_seen = true; + } Default::default() } } diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 5421b64a7..b792cde52 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1182,7 +1182,6 @@ pub struct InnerIndexSettingsDiff { pub(crate) old: InnerIndexSettings, pub(crate) new: InnerIndexSettings, pub(crate) primary_key_id: Option, - // TODO: compare directly the embedders. pub(crate) embedding_config_updates: BTreeMap, pub(crate) settings_update_only: bool, /// The set of only the additional searchable fields. diff --git a/milli/src/vector/settings.rs b/milli/src/vector/settings.rs index edbed462c..9c7fb09b1 100644 --- a/milli/src/vector/settings.rs +++ b/milli/src/vector/settings.rs @@ -101,13 +101,6 @@ pub struct WriteBackToDocuments { } impl SettingsDiff { - pub fn should_reindex(&self) -> bool { - match self { - SettingsDiff::Remove { .. } | SettingsDiff::Reindex { .. } => true, - SettingsDiff::UpdateWithoutReindex { .. 
} => false, - } - } - pub fn from_settings(old: EmbeddingSettings, new: Setting) -> Self { match new { Setting::Set(new) => { From b9b938c902b68c125786f56ddbc7b90087a332c3 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 13 Jun 2024 17:13:36 +0200 Subject: [PATCH 42/44] Change `retrieveVectors` behavior: - when the feature is disabled, documents are never modified - when the feature is enabled and `retrieveVectors` is disabled, `_vectors` is removed from documents - when the feature is enabled and `retrieveVectors` is enabled, vectors from the vectors DB are merged with `_vectors` in documents Additionally `_vectors` is never displayed when the `displayedAttributes` list does not contain either `*` or `_vectors` - fixed an issue where `_vectors` was not injected when all vectors in the dataset were always generated --- meilisearch/src/routes/indexes/documents.rs | 83 +++++++++---------- meilisearch/src/routes/indexes/search.rs | 24 +++--- meilisearch/src/routes/indexes/similar.rs | 12 ++- meilisearch/src/routes/multi_search.rs | 13 +-- meilisearch/src/search.rs | 92 ++++++++++++++++++--- 5 files changed, 150 insertions(+), 74 deletions(-) diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index bfbe20207..1f413ec7d 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -40,7 +40,7 @@ use crate::extractors::sequential_extractor::SeqHandler; use crate::routes::{ get_task_id, is_dry_run, PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT, }; -use crate::search::parse_filter; +use crate::search::{parse_filter, RetrieveVectors}; use crate::Opt; static ACCEPTED_CONTENT_TYPE: Lazy> = Lazy::new(|| { @@ -110,21 +110,20 @@ pub async fn get_document( debug!(parameters = ?params, "Get document"); let index_uid = IndexUid::try_from(index_uid)?; - let GetDocument { fields, retrieve_vectors } = params.into_inner(); + let GetDocument { fields,
retrieve_vectors: param_retrieve_vectors } = params.into_inner(); let attributes_to_retrieve = fields.merge_star_and_none(); let features = index_scheduler.features(); - if retrieve_vectors.0 { - features.check_vector("Passing `retrieveVectors` as a parameter")?; - } + let retrieve_vectors = RetrieveVectors::new(param_retrieve_vectors.0, features)?; + analytics.get_fetch_documents( - &DocumentFetchKind::PerDocumentId { retrieve_vectors: retrieve_vectors.0 }, + &DocumentFetchKind::PerDocumentId { retrieve_vectors: param_retrieve_vectors.0 }, &req, ); let index = index_scheduler.index(&index_uid)?; let document = - retrieve_document(&index, &document_id, attributes_to_retrieve, retrieve_vectors.0)?; + retrieve_document(&index, &document_id, attributes_to_retrieve, retrieve_vectors)?; debug!(returns = ?document, "Get document"); Ok(HttpResponse::Ok().json(document)) } @@ -195,11 +194,6 @@ pub async fn documents_by_query_post( let body = body.into_inner(); debug!(parameters = ?body, "Get documents POST"); - let features = index_scheduler.features(); - if body.retrieve_vectors { - features.check_vector("Passing `retrieveVectors` as a parameter")?; - } - analytics.post_fetch_documents( &DocumentFetchKind::Normal { with_filter: body.filter.is_some(), @@ -224,11 +218,6 @@ pub async fn get_documents( let BrowseQueryGet { limit, offset, fields, retrieve_vectors, filter } = params.into_inner(); - let features = index_scheduler.features(); - if retrieve_vectors.0 { - features.check_vector("Passing `retrieveVectors` as a parameter")?; - } - let filter = match filter { Some(f) => match serde_json::from_str(&f) { Ok(v) => Some(v), @@ -266,6 +255,9 @@ fn documents_by_query( let index_uid = IndexUid::try_from(index_uid.into_inner())?; let BrowseQuery { offset, limit, fields, retrieve_vectors, filter } = query; + let features = index_scheduler.features(); + let retrieve_vectors = RetrieveVectors::new(retrieve_vectors, features)?; + let index = index_scheduler.index(&index_uid)?; let 
(total, documents) = retrieve_documents(&index, offset, limit, filter, fields, retrieve_vectors)?; @@ -608,7 +600,7 @@ fn some_documents<'a, 't: 'a>( index: &'a Index, rtxn: &'t RoTxn, doc_ids: impl IntoIterator + 'a, - retrieve_vectors: bool, + retrieve_vectors: RetrieveVectors, ) -> Result> + 'a, ResponseError> { let fields_ids_map = index.fields_ids_map(rtxn)?; let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); @@ -617,24 +609,32 @@ fn some_documents<'a, 't: 'a>( Ok(index.iter_documents(rtxn, doc_ids)?.map(move |ret| { ret.map_err(ResponseError::from).and_then(|(key, document)| -> Result<_, ResponseError> { let mut document = milli::obkv_to_json(&all_fields, &fields_ids_map, document)?; - - if retrieve_vectors { - let mut vectors = serde_json::Map::new(); - for (name, vector) in index.embeddings(rtxn, key)? { - let user_provided = embedding_configs - .iter() - .find(|conf| conf.name == name) - .is_some_and(|conf| conf.user_provided.contains(key)); - let embeddings = ExplicitVectors { - embeddings: Some(vector.into()), - regenerate: !user_provided, - }; - vectors.insert( - name, - serde_json::to_value(embeddings).map_err(MeilisearchHttpError::from)?, - ); + match retrieve_vectors { + RetrieveVectors::Ignore => {} + RetrieveVectors::Hide => { + document.remove("_vectors"); + } + RetrieveVectors::Retrieve => { + let mut vectors = match document.remove("_vectors") { + Some(Value::Object(map)) => map, + _ => Default::default(), + }; + for (name, vector) in index.embeddings(rtxn, key)? 
{ + let user_provided = embedding_configs + .iter() + .find(|conf| conf.name == name) + .is_some_and(|conf| conf.user_provided.contains(key)); + let embeddings = ExplicitVectors { + embeddings: Some(vector.into()), + regenerate: !user_provided, + }; + vectors.insert( + name, + serde_json::to_value(embeddings).map_err(MeilisearchHttpError::from)?, + ); + } + document.insert("_vectors".into(), vectors.into()); } - document.insert("_vectors".into(), vectors.into()); } Ok(document) @@ -648,7 +648,7 @@ fn retrieve_documents>( limit: usize, filter: Option, attributes_to_retrieve: Option>, - retrieve_vectors: bool, + retrieve_vectors: RetrieveVectors, ) -> Result<(u64, Vec), ResponseError> { let rtxn = index.read_txn()?; let filter = &filter; @@ -688,10 +688,9 @@ fn retrieve_documents>( Ok(match &attributes_to_retrieve { Some(attributes_to_retrieve) => permissive_json_pointer::select_values( &document?, - attributes_to_retrieve - .iter() - .map(|s| s.as_ref()) - .chain(retrieve_vectors.then_some("_vectors")), + attributes_to_retrieve.iter().map(|s| s.as_ref()).chain( + (retrieve_vectors == RetrieveVectors::Retrieve).then_some("_vectors"), + ), ), None => document?, }) @@ -705,7 +704,7 @@ fn retrieve_document>( index: &Index, doc_id: &str, attributes_to_retrieve: Option>, - retrieve_vectors: bool, + retrieve_vectors: RetrieveVectors, ) -> Result { let txn = index.read_txn()?; @@ -724,7 +723,7 @@ fn retrieve_document>( attributes_to_retrieve .iter() .map(|s| s.as_ref()) - .chain(retrieve_vectors.then_some("_vectors")), + .chain((retrieve_vectors == RetrieveVectors::Retrieve).then_some("_vectors")), ), None => document, }; diff --git a/meilisearch/src/routes/indexes/search.rs b/meilisearch/src/routes/indexes/search.rs index 6fdff4568..421cf2940 100644 --- a/meilisearch/src/routes/indexes/search.rs +++ b/meilisearch/src/routes/indexes/search.rs @@ -20,9 +20,9 @@ use crate::extractors::sequential_extractor::SeqHandler; use crate::metrics::MEILISEARCH_DEGRADED_SEARCH_REQUESTS; 
use crate::search::{ add_search_rules, perform_search, HybridQuery, MatchingStrategy, RankingScoreThreshold, - SearchKind, SearchQuery, SemanticRatio, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, - DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, - DEFAULT_SEARCH_OFFSET, DEFAULT_SEMANTIC_RATIO, + RetrieveVectors, SearchKind, SearchQuery, SemanticRatio, DEFAULT_CROP_LENGTH, + DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, + DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, DEFAULT_SEMANTIC_RATIO, }; use crate::search_queue::SearchQueue; @@ -225,10 +225,12 @@ pub async fn search_with_url_query( let features = index_scheduler.features(); let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)?; - + let retrieve_vector = RetrieveVectors::new(query.retrieve_vectors, features)?; let _permit = search_queue.try_get_search_permit().await?; - let search_result = - tokio::task::spawn_blocking(move || perform_search(&index, query, search_kind)).await?; + let search_result = tokio::task::spawn_blocking(move || { + perform_search(&index, query, search_kind, retrieve_vector) + }) + .await?; if let Ok(ref search_result) = search_result { aggregate.succeed(search_result); } @@ -265,10 +267,13 @@ pub async fn search_with_post( let features = index_scheduler.features(); let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)?; + let retrieve_vectors = RetrieveVectors::new(query.retrieve_vectors, features)?; let _permit = search_queue.try_get_search_permit().await?; - let search_result = - tokio::task::spawn_blocking(move || perform_search(&index, query, search_kind)).await?; + let search_result = tokio::task::spawn_blocking(move || { + perform_search(&index, query, search_kind, retrieve_vectors) + }) + .await?; if let Ok(ref search_result) = search_result { aggregate.succeed(search_result); if search_result.degraded { @@ -295,9 +300,6 @@ pub fn search_kind( if query.hybrid.is_some() 
{ features.check_vector("Passing `hybrid` as a parameter")?; } - if query.retrieve_vectors { - features.check_vector("Passing `retrieveVectors` as a parameter")?; - } // regardless of anything, always do a keyword search when we don't have a vector and the query is whitespace or missing if query.vector.is_none() { diff --git a/meilisearch/src/routes/indexes/similar.rs b/meilisearch/src/routes/indexes/similar.rs index 54ea912ec..1dd83b09b 100644 --- a/meilisearch/src/routes/indexes/similar.rs +++ b/meilisearch/src/routes/indexes/similar.rs @@ -17,8 +17,8 @@ use crate::analytics::{Analytics, SimilarAggregator}; use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; use crate::search::{ - add_search_rules, perform_similar, RankingScoreThresholdSimilar, SearchKind, SimilarQuery, - SimilarResult, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, + add_search_rules, perform_similar, RankingScoreThresholdSimilar, RetrieveVectors, SearchKind, + SimilarQuery, SimilarResult, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, }; pub fn configure(cfg: &mut web::ServiceConfig) { @@ -93,6 +93,8 @@ async fn similar( features.check_vector("Using the similar API")?; + let retrieve_vectors = RetrieveVectors::new(query.retrieve_vectors, features)?; + // Tenant token search_rules. if let Some(search_rules) = index_scheduler.filters().get_index_search_rules(&index_uid) { add_search_rules(&mut query.filter, search_rules); @@ -103,8 +105,10 @@ async fn similar( let (embedder_name, embedder) = SearchKind::embedder(&index_scheduler, &index, query.embedder.as_deref(), None)?; - tokio::task::spawn_blocking(move || perform_similar(&index, query, embedder_name, embedder)) - .await? + tokio::task::spawn_blocking(move || { + perform_similar(&index, query, embedder_name, embedder, retrieve_vectors) + }) + .await? 
} #[derive(Debug, deserr::Deserr)] diff --git a/meilisearch/src/routes/multi_search.rs b/meilisearch/src/routes/multi_search.rs index a83dc4bc0..1d697dac6 100644 --- a/meilisearch/src/routes/multi_search.rs +++ b/meilisearch/src/routes/multi_search.rs @@ -15,7 +15,7 @@ use crate::extractors::authentication::{AuthenticationError, GuardedData}; use crate::extractors::sequential_extractor::SeqHandler; use crate::routes::indexes::search::search_kind; use crate::search::{ - add_search_rules, perform_search, SearchQueryWithIndex, SearchResultWithIndex, + add_search_rules, perform_search, RetrieveVectors, SearchQueryWithIndex, SearchResultWithIndex, }; use crate::search_queue::SearchQueue; @@ -83,11 +83,14 @@ pub async fn multi_search_with_post( let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features) .with_index(query_index)?; + let retrieve_vector = + RetrieveVectors::new(query.retrieve_vectors, features).with_index(query_index)?; - let search_result = - tokio::task::spawn_blocking(move || perform_search(&index, query, search_kind)) - .await - .with_index(query_index)?; + let search_result = tokio::task::spawn_blocking(move || { + perform_search(&index, query, search_kind, retrieve_vector) + }) + .await + .with_index(query_index)?; search_results.push(SearchResultWithIndex { index_uid: index_uid.into_inner(), diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index 60f684ede..9632e3f5d 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -823,6 +823,7 @@ pub fn perform_search( index: &Index, query: SearchQuery, search_kind: SearchKind, + retrieve_vectors: RetrieveVectors, ) -> Result { let before_search = Instant::now(); let rtxn = index.read_txn()?; @@ -860,7 +861,8 @@ pub fn perform_search( page, hits_per_page, attributes_to_retrieve, - retrieve_vectors, + // use the enum passed as parameter + retrieve_vectors: _, attributes_to_crop, crop_length, attributes_to_highlight, @@ -968,7 +970,7 @@ pub fn 
perform_search( struct AttributesFormat { attributes_to_retrieve: Option>, - retrieve_vectors: bool, + retrieve_vectors: RetrieveVectors, attributes_to_highlight: Option>, attributes_to_crop: Option>, crop_length: usize, @@ -981,6 +983,36 @@ struct AttributesFormat { show_ranking_score_details: bool, } +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum RetrieveVectors { + /// Do not touch the `_vectors` field + /// + /// this is the behavior when the vectorStore feature is disabled + Ignore, + /// Remove the `_vectors` field + /// + /// this is the behavior when the vectorStore feature is enabled, and `retrieveVectors` is `false` + Hide, + /// Retrieve vectors from the DB and merge them into the `_vectors` field + /// + /// this is the behavior when the vectorStore feature is enabled, and `retrieveVectors` is `true` + Retrieve, +} + +impl RetrieveVectors { + pub fn new( + retrieve_vector: bool, + features: index_scheduler::RoFeatures, + ) -> Result { + match (retrieve_vector, features.check_vector("Passing `retrieveVectors` as a parameter")) { + (true, Ok(())) => Ok(Self::Retrieve), + (true, Err(error)) => Err(error), + (false, Ok(())) => Ok(Self::Hide), + (false, Err(_)) => Ok(Self::Ignore), + } + } +} + fn make_hits( index: &Index, rtxn: &RoTxn<'_>, @@ -990,10 +1022,32 @@ fn make_hits( document_scores: Vec>, ) -> Result, MeilisearchHttpError> { let fields_ids_map = index.fields_ids_map(rtxn).unwrap(); - let displayed_ids = index - .displayed_fields_ids(rtxn)? 
- .map(|fields| fields.into_iter().collect::>()) - .unwrap_or_else(|| fields_ids_map.iter().map(|(id, _)| id).collect()); + let displayed_ids = + index.displayed_fields_ids(rtxn)?.map(|fields| fields.into_iter().collect::>()); + + let vectors_fid = fields_ids_map.id(milli::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME); + + let vectors_is_hidden = match (&displayed_ids, vectors_fid) { + // displayed_ids is a wildcard, so `_vectors` can be displayed regardless of its fid + (None, _) => false, + // displayed_ids is a finite list, and `_vectors` cannot be part of it because it is not an existing field + (Some(_), None) => true, + // displayed_ids is a finite list, so hide if `_vectors` is not part of it (NOTE(review): the arm below returns `map.contains(..)` un-negated, which looks inverted relative to this comment; confirm) + (Some(map), Some(vectors_fid)) => map.contains(&vectors_fid), + }; + + let retrieve_vectors = if let RetrieveVectors::Retrieve = format.retrieve_vectors { + if vectors_is_hidden { + RetrieveVectors::Hide + } else { + RetrieveVectors::Retrieve + } + } else { + format.retrieve_vectors + }; + + let displayed_ids = + displayed_ids.unwrap_or_else(|| fields_ids_map.iter().map(|(id, _)| id).collect()); let fids = |attrs: &BTreeSet| { let mut ids = BTreeSet::new(); for attr in attrs { @@ -1016,9 +1070,7 @@ fn make_hits( .intersection(&displayed_ids) .cloned() .collect(); - let is_vectors_displayed = - fields_ids_map.id("_vectors").is_some_and(|fid| displayed_ids.contains(&fid)); - let retrieve_vectors = format.retrieve_vectors && is_vectors_displayed; + let attr_to_highlight = format.attributes_to_highlight.unwrap_or_default(); let attr_to_crop = format.attributes_to_crop.unwrap_or_default(); let formatted_options = compute_formatted_options( @@ -1058,15 +1110,30 @@ fn make_hits( // First generate a document with all the displayed fields let displayed_document = make_document(&displayed_ids, &fields_ids_map, obkv)?; + let add_vectors_fid = + vectors_fid.filter(|_fid| retrieve_vectors == RetrieveVectors::Retrieve); + // select the attributes to retrieve let
attributes_to_retrieve = to_retrieve_ids .iter() + // skip the vectors_fid if RetrieveVectors::Hide + .filter(|fid| match vectors_fid { + Some(vectors_fid) => { + !(retrieve_vectors == RetrieveVectors::Hide && **fid == vectors_fid) + } + None => true, + }) + // need to retrieve the existing `_vectors` field if the `RetrieveVectors::Retrieve` + .chain(add_vectors_fid.iter()) .map(|&fid| fields_ids_map.name(fid).expect("Missing field name")); let mut document = permissive_json_pointer::select_values(&displayed_document, attributes_to_retrieve); - if retrieve_vectors { - let mut vectors = serde_json::Map::new(); + if retrieve_vectors == RetrieveVectors::Retrieve { + let mut vectors = match document.remove("_vectors") { + Some(Value::Object(map)) => map, + _ => Default::default(), + }; for (name, vector) in index.embeddings(rtxn, id)? { let user_provided = embedding_configs .iter() @@ -1148,6 +1215,7 @@ pub fn perform_similar( query: SimilarQuery, embedder_name: String, embedder: Arc, + retrieve_vectors: RetrieveVectors, ) -> Result { let before_search = Instant::now(); let rtxn = index.read_txn()?; @@ -1159,7 +1227,7 @@ pub fn perform_similar( filter: _, embedder: _, attributes_to_retrieve, - retrieve_vectors, + retrieve_vectors: _, show_ranking_score, show_ranking_score_details, ranking_score_threshold, From 09d9b63e1c0c1369e2c92b66e329d21e837f49d3 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 13 Jun 2024 17:16:41 +0200 Subject: [PATCH 43/44] - test case where all vectors were generated - update tests following changes in behavior from previous commit --- meilisearch/tests/search/hybrid.rs | 82 ++++++++++++++++++++++++++++ meilisearch/tests/vector/settings.rs | 82 ++++++++++++++-------------- 2 files changed, 124 insertions(+), 40 deletions(-) diff --git a/meilisearch/tests/search/hybrid.rs b/meilisearch/tests/search/hybrid.rs index be6e0b1c8..31b2940d8 100644 --- a/meilisearch/tests/search/hybrid.rs +++ b/meilisearch/tests/search/hybrid.rs @@ -482,3 
+482,85 @@ async fn query_combination() { snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.9242424242424242}]"###); snapshot!(response["semanticHitCount"], @"0"); } + +#[actix_rt::test] +async fn retrieve_vectors() { + let server = Server::new().await; + let index = index_with_documents_hf(&server, &SIMPLE_SEARCH_DOCUMENTS).await; + + let (response, code) = index + .search_post( + json!({"q": "Captain", "hybrid": {"semanticRatio": 0.2}, "retrieveVectors": true}), + ) + .await; + snapshot!(code, @"200 OK"); + insta::assert_json_snapshot!(response["hits"], {"[]._vectors.default.embeddings" => "[vectors]"}, @r###" + [ + { + "title": "Captain Planet", + "desc": "He's not part of the Marvel Cinematic Universe", + "id": "2", + "_vectors": { + "default": { + "embeddings": "[vectors]", + "regenerate": true + } + } + }, + { + "title": "Captain Marvel", + "desc": "a Shazam ersatz", + "id": "3", + "_vectors": { + "default": { + "embeddings": "[vectors]", + "regenerate": true + } + } + }, + { + "title": "Shazam!", + "desc": "a Captain Marvel ersatz", + "id": "1", + "_vectors": { + "default": { + "embeddings": "[vectors]", + "regenerate": true + } + } + } + ] + "###); + + // remove `_vectors` from displayed attributes + let (response, code) = + index.update_settings(json!({ "displayedAttributes": ["id", "title", "desc"]} )).await; + assert_eq!(202, code, "{:?}", response); + index.wait_task(response.uid()).await; + + let (response, code) = index + .search_post( + json!({"q": "Captain", "hybrid": {"semanticRatio": 0.2}, "retrieveVectors": true}), + ) + .await; + snapshot!(code, @"200 OK"); + insta::assert_json_snapshot!(response["hits"], {"[]._vectors.default.embeddings" => "[vectors]"}, @r###" + [ + { + "title": "Captain Planet", + "desc": "He's not part of the Marvel Cinematic Universe", + "id": "2" + }, + { + 
"title": "Captain Marvel", + "desc": "a Shazam ersatz", + "id": "3" + }, + { + "title": "Shazam!", + "desc": "a Captain Marvel ersatz", + "id": "1" + } + ] + "###); +} diff --git a/meilisearch/tests/vector/settings.rs b/meilisearch/tests/vector/settings.rs index 3fe161f9b..e53ceb383 100644 --- a/meilisearch/tests/vector/settings.rs +++ b/meilisearch/tests/vector/settings.rs @@ -73,7 +73,48 @@ async fn reset_embedder_documents() { server.wait_task(response.uid()).await; // Make sure the documents are still present - let (documents, _code) = index.get_all_documents(Default::default()).await; + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { + limit: None, + offset: None, + retrieve_vectors: false, + fields: None, + }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir" + }, + { + "id": 1, + "name": "echo" + }, + { + "id": 2, + "name": "billou" + }, + { + "id": 3, + "name": "intel" + }, + { + "id": 4, + "name": "max" + } + ], + "offset": 0, + "limit": 20, + "total": 5 + } + "###); + + // Make sure we are still able to retrieve their vectors + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; snapshot!(json_string!(documents), @r###" { "results": [ @@ -174,45 +215,6 @@ async fn reset_embedder_documents() { } "###); - // Make sure we are still able to retrieve their vectors - let (documents, _code) = index - .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) - .await; - snapshot!(json_string!(documents), @r###" - { - "results": [ - { - "id": 0, - "name": "kefir", - "_vectors": {} - }, - { - "id": 1, - "name": "echo", - "_vectors": {} - }, - { - "id": 2, - "name": "billou", - "_vectors": {} - }, - { - "id": 3, - "name": "intel", - "_vectors": {} - }, - { - "id": 4, - "name": "max", - "_vectors": {} - } - ], - "offset": 0, - "limit": 20, - "total": 5 - } - "###); - 
// Make sure the arroy DB has been cleared let (documents, _code) = index.search_post(json!({ "vector": [1, 1, 1] })).await; snapshot!(json_string!(documents), @r###" From 0a8f50695eac018f2664d996e024bf33d4e19d6f Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Thu, 13 Jun 2024 17:47:44 +0200 Subject: [PATCH 44/44] Fixes for Rust v1.79 --- dump/src/reader/v3/settings.rs | 1 + dump/src/reader/v4/settings.rs | 1 + dump/src/reader/v5/tasks.rs | 1 + milli/Cargo.toml | 4 ++-- milli/src/search/new/logger/visual.rs | 11 ++++------- milli/src/update/index_documents/transform.rs | 4 +--- xtask/Cargo.toml | 2 +- 7 files changed, 11 insertions(+), 13 deletions(-) diff --git a/dump/src/reader/v3/settings.rs b/dump/src/reader/v3/settings.rs index 0027bf4ff..3288bb1e7 100644 --- a/dump/src/reader/v3/settings.rs +++ b/dump/src/reader/v3/settings.rs @@ -152,6 +152,7 @@ impl Settings { } #[derive(Debug, Clone, Deserialize)] +#[allow(dead_code)] // otherwise rustc complains that the fields go unused #[cfg_attr(test, derive(serde::Serialize))] #[serde(deny_unknown_fields)] #[serde(rename_all = "camelCase")] diff --git a/dump/src/reader/v4/settings.rs b/dump/src/reader/v4/settings.rs index 964cd1152..78d9118ff 100644 --- a/dump/src/reader/v4/settings.rs +++ b/dump/src/reader/v4/settings.rs @@ -182,6 +182,7 @@ impl Settings { } } +#[allow(dead_code)] // otherwise rustc complains that the fields go unused #[derive(Debug, Clone, Deserialize)] #[cfg_attr(test, derive(serde::Serialize))] #[serde(deny_unknown_fields)] diff --git a/dump/src/reader/v5/tasks.rs b/dump/src/reader/v5/tasks.rs index 528a870fc..8dfb2d0b0 100644 --- a/dump/src/reader/v5/tasks.rs +++ b/dump/src/reader/v5/tasks.rs @@ -200,6 +200,7 @@ impl std::ops::Deref for IndexUid { } } +#[allow(dead_code)] // otherwise rustc complains that the fields go unused #[derive(Debug)] #[cfg_attr(test, derive(serde::Serialize))] #[cfg_attr(test, serde(rename_all = "camelCase"))] diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 
7fba2af1e..a4aa4ef95 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -71,10 +71,10 @@ csv = "1.3.0" candle-core = { version = "0.4.1" } candle-transformers = { version = "0.4.1" } candle-nn = { version = "0.4.1" } -tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.15.2", version = "0.15.2", default_features = false, features = [ +tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.15.2", version = "0.15.2", default-features = false, features = [ "onig", ] } -hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", default_features = false, features = [ +hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", default-features = false, features = [ "online", ] } tiktoken-rs = "0.5.8" diff --git a/milli/src/search/new/logger/visual.rs b/milli/src/search/new/logger/visual.rs index 8df56da89..2bffdd8d9 100644 --- a/milli/src/search/new/logger/visual.rs +++ b/milli/src/search/new/logger/visual.rs @@ -22,7 +22,7 @@ pub enum SearchEvents { RankingRuleStartIteration { ranking_rule_idx: usize, universe_len: u64 }, RankingRuleNextBucket { ranking_rule_idx: usize, universe_len: u64, bucket_len: u64 }, RankingRuleSkipBucket { ranking_rule_idx: usize, bucket_len: u64 }, - RankingRuleEndIteration { ranking_rule_idx: usize, universe_len: u64 }, + RankingRuleEndIteration { ranking_rule_idx: usize }, ExtendResults { new: Vec }, ProximityGraph { graph: RankingRuleGraph }, ProximityPaths { paths: Vec>> }, @@ -123,12 +123,9 @@ impl SearchLogger for VisualSearchLogger { &mut self, ranking_rule_idx: usize, _ranking_rule: &dyn RankingRule, - universe: &RoaringBitmap, + _universe: &RoaringBitmap, ) { - self.events.push(SearchEvents::RankingRuleEndIteration { - ranking_rule_idx, - universe_len: universe.len(), - }); + self.events.push(SearchEvents::RankingRuleEndIteration { ranking_rule_idx }); self.location.pop(); } fn add_to_results(&mut self, docids: &[u32]) { @@ -326,7 +323,7 
@@ impl<'ctx> DetailedLoggerFinish<'ctx> { assert!(ranking_rule_idx == self.rr_action_counter.len() - 1); self.write_skip_bucket(bucket_len)?; } - SearchEvents::RankingRuleEndIteration { ranking_rule_idx, universe_len: _ } => { + SearchEvents::RankingRuleEndIteration { ranking_rule_idx } => { assert!(ranking_rule_idx == self.rr_action_counter.len() - 1); self.write_end_iteration()?; } diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 997ab64ff..1dff29a90 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -53,7 +53,6 @@ pub struct Transform<'a, 'i> { fields_ids_map: FieldsIdsMap, indexer_settings: &'a IndexerConfig, - pub autogenerate_docids: bool, pub index_documents_method: IndexDocumentsMethod, available_documents_ids: AvailableDocumentsIds, @@ -107,7 +106,7 @@ impl<'a, 'i> Transform<'a, 'i> { index: &'i Index, indexer_settings: &'a IndexerConfig, index_documents_method: IndexDocumentsMethod, - autogenerate_docids: bool, + _autogenerate_docids: bool, ) -> Result { // We must choose the appropriate merge function for when two or more documents // with the same user id must be merged or fully replaced in the same batch. @@ -141,7 +140,6 @@ impl<'a, 'i> Transform<'a, 'i> { index, fields_ids_map: index.fields_ids_map(wtxn)?, indexer_settings, - autogenerate_docids, available_documents_ids: AvailableDocumentsIds::from_documents_ids(&documents_ids), original_sorter, flattened_sorter, diff --git a/xtask/Cargo.toml b/xtask/Cargo.toml index 562dfddb3..a618b06a5 100644 --- a/xtask/Cargo.toml +++ b/xtask/Cargo.toml @@ -21,7 +21,7 @@ reqwest = { version = "0.11.23", features = [ "stream", "json", "rustls-tls", -], default_features = false } +], default-features = false } serde = { version = "1.0.195", features = ["derive"] } serde_json = "1.0.111" sha2 = "0.10.8"