From 5d50850e12f72a07221184c7d9962f511a6dc791 Mon Sep 17 00:00:00 2001 From: Tamo Date: Mon, 3 Jun 2024 16:04:14 +0200 Subject: [PATCH] always push the user defined vectors in arroy --- index-scheduler/src/lib.rs | 14 +- ..._scheduler__tests__import_vectors-15.snap} | 4 - ..._scheduler__tests__import_vectors-22.snap} | 4 - ...x_scheduler__tests__import_vectors-5.snap} | 0 ...x_scheduler__tests__import_vectors-8.snap} | 0 ..._scheduler__tests__settings_update-5.snap} | 0 .../documents after setting an embedder.snap | 4 - meilisearch/tests/search/hybrid.rs | 40 +-- meilisearch/tests/search/mod.rs | 1 + meilisearch/tests/similar/mod.rs | 217 ++++++++-------- ...__attribute_fid__attribute_fid_ngrams.snap | 244 ------------------ .../1/field_distribution.snap | 7 - .../field_distribution.snap | 7 - .../extract/extract_vector_points.rs | 75 +++--- milli/src/vector/parsed_vectors.rs | 22 +- 15 files changed, 189 insertions(+), 450 deletions(-) rename index-scheduler/src/snapshots/{index_scheduler__tests__import_vectors-9.snap => index_scheduler__tests__import_vectors-15.snap} (67%) rename index-scheduler/src/snapshots/{index_scheduler__tests__import_vectors-12.snap => index_scheduler__tests__import_vectors-22.snap} (67%) rename index-scheduler/src/snapshots/{index_scheduler__tests__import_vectors-4.snap => index_scheduler__tests__import_vectors-5.snap} (100%) rename index-scheduler/src/snapshots/{index_scheduler__tests__import_vectors-6.snap => index_scheduler__tests__import_vectors-8.snap} (100%) rename index-scheduler/src/snapshots/{index_scheduler__tests__settings_update-3.snap => index_scheduler__tests__settings_update-5.snap} (100%) delete mode 100644 index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after setting an embedder.snap delete mode 100644 milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_fid__attribute_fid_ngrams.snap delete mode 100644 milli/src/snapshots/index.rs/initial_field_distribution/1/field_distribution.snap delete mode 100644 milli/src/snapshots/index.rs/initial_field_distribution/field_distribution.snap diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index d007acd2c..f69736297 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -5173,8 +5173,8 @@ mod tests { snapshot!(snapshot_index_scheduler(&index_scheduler), name: "Intel to kefir"); println!("HEEEEERE"); - // handle.advance_one_successful_batch(); - handle.advance_one_failed_batch(); + handle.advance_one_successful_batch(); + // handle.advance_one_failed_batch(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "Intel to kefir succeeds"); { @@ -5351,9 +5351,9 @@ mod tests { // as user provided since we explicitely marked it as NOT user provided. snapshot!(format!("{conf:#?}"), @r###" [ - ( - "my_doggo_embedder", - EmbeddingConfig { + IndexEmbeddingConfig { + name: "my_doggo_embedder", + config: EmbeddingConfig { embedder_options: HuggingFace( EmbedderOptions { model: "sentence-transformers/all-MiniLM-L6-v2", @@ -5367,8 +5367,8 @@ mod tests { template: "{{doc.doggo}}", }, }, - RoaringBitmap<[1, 2]>, - ), + user_defined: RoaringBitmap<[1, 2]>, + }, ] "###); let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap(); diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-9.snap b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-15.snap similarity index 67% rename from index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-9.snap rename to index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-15.snap index 002a42e59..540835dfb 100644 --- a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-9.snap +++ b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-15.snap @@ -6,10 +6,6 @@ expression: doc "doggo": "Intel", "breed": "beagle", "_vectors": { - "A_fakerest": { - "embeddings": "[vector]", - "userProvided": true - }, "noise": [ 0.1, 0.2, diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-12.snap b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-22.snap similarity index 67% rename from index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-12.snap rename to index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-22.snap index 718ea229c..bc35d84f6 100644 --- a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-12.snap +++ b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-22.snap @@ -6,10 +6,6 @@ expression: doc "doggo": "kefir", "breed": "patou", "_vectors": { - "A_fakerest": { - "embeddings": "[vector]", - "userProvided": true - }, "noise": [ 0.1, 0.2, diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-4.snap b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-5.snap similarity index 100% rename from index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-4.snap rename to index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-5.snap diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-6.snap b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-8.snap similarity index 100% rename from index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-6.snap rename to index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-8.snap diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-3.snap b/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-5.snap similarity index 100% rename from index-scheduler/src/snapshots/index_scheduler__tests__settings_update-3.snap rename to index-scheduler/src/snapshots/index_scheduler__tests__settings_update-5.snap diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after setting an embedder.snap b/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after setting an embedder.snap deleted file mode 100644 index 853be8b0a..000000000 --- a/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after setting an embedder.snap +++ /dev/null @@ -1,4 +0,0 @@ ---- -source: index-scheduler/src/lib.rs ---- -[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"unknown embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"unknown embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel"},{"id":4,"doggo":"sora"}] diff --git a/meilisearch/tests/search/hybrid.rs b/meilisearch/tests/search/hybrid.rs index 0c8b4534c..1e415bc63 100644 --- a/meilisearch/tests/search/hybrid.rs +++ b/meilisearch/tests/search/hybrid.rs @@ -204,7 +204,7 @@ async fn distribution_shift() { let server = Server::new().await; let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await; - let search = json!({"q": "Captain", "vector": [1.0, 1.0], "showRankingScore": true, "hybrid": {"semanticRatio": 1.0}}); + let search = json!({"q": "Captain", "vector": [1.0, 1.0], "showRankingScore": true, "hybrid": {"semanticRatio": 1.0}, "retrieveVectors": true}); let (response, code) = index.search_post(search.clone()).await; snapshot!(code, @"200 OK"); snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###); @@ -239,20 +239,23 @@ async fn highlighter() { let (response, code) = index .search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.2}, - "attributesToHighlight": [ - "desc" + "retrieveVectors": true, + "attributesToHighlight": [ + "desc", + "_vectors", ], - "highlightPreTag": "**BEGIN**", - "highlightPostTag": "**END**" + "highlightPreTag": "**BEGIN**", + "highlightPostTag": "**END**", })) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}}},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}}}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"}},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"}}]"###); snapshot!(response["semanticHitCount"], @"0"); let (response, code) = index .search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.8}, + "retrieveVectors": true, "showRankingScore": true, "attributesToHighlight": [ "desc" @@ -262,13 +265,14 @@ async fn highlighter() { })) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"3"); // no highlighting on full semantic let (response, code) = index .search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 1.0}, + "retrieveVectors": true, "showRankingScore": true, "attributesToHighlight": [ "desc" @@ -278,7 +282,7 @@ async fn highlighter() { })) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"3"); } @@ -361,7 +365,7 @@ async fn single_document() { let (response, code) = index .search_post( - json!({"vector": [1.0, 3.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true}), + json!({"vector": [1.0, 3.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true}), ) .await; @@ -377,7 +381,7 @@ async fn query_combination() { // search without query and vector, but with hybrid => still placeholder let (response, code) = index - .search_post(json!({"hybrid": {"semanticRatio": 1.0}, "showRankingScore": true})) + .search_post(json!({"hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true})) .await; snapshot!(code, @"200 OK"); @@ -386,7 +390,7 @@ async fn query_combination() { // same with a different semantic ratio let (response, code) = index - .search_post(json!({"hybrid": {"semanticRatio": 0.76}, "showRankingScore": true})) + .search_post(json!({"hybrid": {"semanticRatio": 0.76}, "showRankingScore": true, "retrieveVectors": true})) .await; snapshot!(code, @"200 OK"); @@ -395,7 +399,7 @@ async fn query_combination() { // wrong vector dimensions let (response, code) = index - .search_post(json!({"vector": [1.0, 0.0, 1.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true})) + .search_post(json!({"vector": [1.0, 0.0, 1.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true})) .await; snapshot!(code, @"400 Bad Request"); @@ -410,7 +414,7 @@ async fn query_combination() { // full vector let (response, code) = index - .search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true})) + .search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true})) .await; snapshot!(code, @"200 OK"); @@ -419,7 +423,7 @@ async fn query_combination() { // full keyword, without a query let (response, code) = index - .search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true})) + .search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true, "retrieveVectors": true})) .await; snapshot!(code, @"200 OK"); @@ -428,7 +432,7 @@ async fn query_combination() { // query + vector, full keyword => keyword let (response, code) = index - .search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true})) + .search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true, "retrieveVectors": true})) .await; snapshot!(code, @"200 OK"); @@ -437,7 +441,7 @@ async fn query_combination() { // query + vector, no hybrid keyword => let (response, code) = index - .search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "showRankingScore": true})) + .search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "showRankingScore": true, "retrieveVectors": true})) .await; snapshot!(code, @"400 Bad Request"); @@ -453,7 +457,7 @@ async fn query_combination() { // full vector, without a vector => error let (response, code) = index .search_post( - json!({"q": "Captain", "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true}), + json!({"q": "Captain", "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true}), ) .await; @@ -470,7 +474,7 @@ async fn query_combination() { // hybrid without a vector => full keyword let (response, code) = index .search_post( - json!({"q": "Planet", "hybrid": {"semanticRatio": 0.99}, "showRankingScore": true}), + json!({"q": "Planet", "hybrid": {"semanticRatio": 0.99}, "showRankingScore": true, "retrieveVectors": true}), ) .await; diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs index b65c0dc42..955b324a6 100644 --- a/meilisearch/tests/search/mod.rs +++ b/meilisearch/tests/search/mod.rs @@ -1337,6 +1337,7 @@ async fn experimental_feature_vector_store() { .search_post(json!({ "vector": [1.0, 2.0, 3.0], "showRankingScore": true, + "retrieveVectors": true, })) .await; diff --git a/meilisearch/tests/similar/mod.rs b/meilisearch/tests/similar/mod.rs index a2378eb58..f2af91588 100644 --- a/meilisearch/tests/similar/mod.rs +++ b/meilisearch/tests/similar/mod.rs @@ -78,7 +78,7 @@ async fn basic() { index.wait_task(value.uid()).await; index - .similar(json!({"id": 143}), |response, code| { + .similar(json!({"id": 143, "retrieveVectors": true}), |response, code| { snapshot!(code, @"200 OK"); snapshot!(json_string!(response["hits"]), @r###" [ @@ -88,9 +88,9 @@ async fn basic() { "id": "522681", "_vectors": { "manual": [ - 0.1, - 0.6, - 0.8 + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 ] } }, @@ -100,9 +100,9 @@ async fn basic() { "id": "299537", "_vectors": { "manual": [ - 0.6, - 0.8, - -0.2 + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 ] } }, @@ -112,9 +112,9 @@ async fn basic() { "id": "166428", "_vectors": { "manual": [ - 0.7, - 0.7, - -0.4 + 0.699999988079071, + 0.699999988079071, + -0.4000000059604645 ] } }, @@ -124,8 +124,8 @@ async fn basic() { "id": "287947", "_vectors": { "manual": [ - 0.8, - 0.4, + 0.800000011920929, + 0.4000000059604645, -0.5 ] } @@ -136,7 +136,7 @@ async fn basic() { .await; index - .similar(json!({"id": "299537"}), |response, code| { + .similar(json!({"id": "299537", "retrieveVectors": true}), |response, code| { snapshot!(code, @"200 OK"); snapshot!(json_string!(response["hits"]), @r###" [ @@ -146,9 +146,9 @@ async fn basic() { "id": "166428", "_vectors": { "manual": [ - 0.7, - 0.7, - -0.4 + 0.699999988079071, + 0.699999988079071, + -0.4000000059604645 ] } }, @@ -158,8 +158,8 @@ async fn basic() { "id": "287947", "_vectors": { "manual": [ - 0.8, - 0.4, + 0.800000011920929, + 0.4000000059604645, -0.5 ] } @@ -170,9 +170,9 @@ async fn basic() { "id": "522681", "_vectors": { "manual": [ - 0.1, - 0.6, - 0.8 + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 ] } }, @@ -183,8 +183,8 @@ async fn basic() { "_vectors": { "manual": [ -0.5, - 0.3, - 0.85 + 0.30000001192092896, + 0.8500000238418579 ] } } @@ -456,71 +456,77 @@ async fn filter() { index.wait_task(value.uid()).await; index - .similar(json!({"id": 522681, "filter": "release_year = 2019"}), |response, code| { - snapshot!(code, @"200 OK"); - snapshot!(json_string!(response["hits"]), @r###" - [ - { - "title": "Captain Marvel", - "release_year": 2019, - "id": "299537", - "_vectors": { - "manual": [ - 0.6, - 0.8, - -0.2 - ] - } - }, - { - "title": "How to Train Your Dragon: The Hidden World", - "release_year": 2019, - "id": "166428", - "_vectors": { - "manual": [ - 0.7, - 0.7, - -0.4 - ] - } - }, - { - "title": "Shazam!", - "release_year": 2019, - "id": "287947", - "_vectors": { - "manual": [ - 0.8, - 0.4, - -0.5 - ] - } - } - ] - "###); - }) + .similar( + json!({"id": 522681, "filter": "release_year = 2019", "retrieveVectors": true}), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "title": "Captain Marvel", + "release_year": 2019, + "id": "299537", + "_vectors": { + "manual": [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + } + }, + { + "title": "How to Train Your Dragon: The Hidden World", + "release_year": 2019, + "id": "166428", + "_vectors": { + "manual": [ + 0.699999988079071, + 0.699999988079071, + -0.4000000059604645 + ] + } + }, + { + "title": "Shazam!", + "release_year": 2019, + "id": "287947", + "_vectors": { + "manual": [ + 0.800000011920929, + 0.4000000059604645, + -0.5 + ] + } + } + ] + "###); + }, + ) .await; index - .similar(json!({"id": 522681, "filter": "release_year < 2000"}), |response, code| { - snapshot!(code, @"200 OK"); - snapshot!(json_string!(response["hits"]), @r###" - [ - { - "title": "All Quiet on the Western Front", - "release_year": 1930, - "id": "143", - "_vectors": { - "manual": [ - -0.5, - 0.3, - 0.85 - ] - } - } - ] - "###); - }) + .similar( + json!({"id": 522681, "filter": "release_year < 2000", "retrieveVectors": true}), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "title": "All Quiet on the Western Front", + "release_year": 1930, + "id": "143", + "_vectors": { + "manual": [ + -0.5, + 0.30000001192092896, + 0.8500000238418579 + ] + } + } + ] + "###); + }, + ) .await; } @@ -579,24 +585,27 @@ async fn limit_and_offset() { .await; index - .similar(json!({"id": 143, "limit": 1, "offset": 1}), |response, code| { - snapshot!(code, @"200 OK"); - snapshot!(json_string!(response["hits"]), @r###" - [ - { - "title": "Captain Marvel", - "release_year": 2019, - "id": "299537", - "_vectors": { - "manual": [ - 0.6, - 0.8, - -0.2 - ] - } - } - ] - "###); - }) + .similar( + json!({"id": 143, "limit": 1, "offset": 1, "retrieveVectors": true}), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "title": "Captain Marvel", + "release_year": 2019, + "id": "299537", + "_vectors": { + "manual": [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + } + } + ] + "###); + }, + ) .await; } diff --git a/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_fid__attribute_fid_ngrams.snap b/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_fid__attribute_fid_ngrams.snap deleted file mode 100644 index 930a21626..000000000 --- a/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_fid__attribute_fid_ngrams.snap +++ /dev/null @@ -1,244 +0,0 @@ ---- -source: milli/src/search/new/tests/attribute_fid.rs -expression: "format!(\"{document_ids_scores:#?}\")" ---- -[ - ( - 2, - [ - Fid( - Rank { - rank: 19, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 91, - max_rank: 91, - }, - ), - ], - ), - ( - 6, - [ - Fid( - Rank { - rank: 15, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 81, - max_rank: 91, - }, - ), - ], - ), - ( - 5, - [ - Fid( - Rank { - rank: 14, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 79, - max_rank: 91, - }, - ), - ], - ), - ( - 4, - [ - Fid( - Rank { - rank: 13, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 77, - max_rank: 91, - }, - ), - ], - ), - ( - 3, - [ - Fid( - Rank { - rank: 12, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 83, - max_rank: 91, - }, - ), - ], - ), - ( - 9, - [ - Fid( - Rank { - rank: 11, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 75, - max_rank: 91, - }, - ), - ], - ), - ( - 8, - [ - Fid( - Rank { - rank: 10, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 79, - max_rank: 91, - }, - ), - ], - ), - ( - 7, - [ - Fid( - Rank { - rank: 10, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 73, - max_rank: 91, - }, - ), - ], - ), - ( - 11, - [ - Fid( - Rank { - rank: 7, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 77, - max_rank: 91, - }, - ), - ], - ), - ( - 10, - [ - Fid( - Rank { - rank: 6, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 81, - max_rank: 91, - }, - ), - ], - ), - ( - 13, - [ - Fid( - Rank { - rank: 6, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 81, - max_rank: 91, - }, - ), - ], - ), - ( - 12, - [ - Fid( - Rank { - rank: 6, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 78, - max_rank: 91, - }, - ), - ], - ), - ( - 14, - [ - Fid( - Rank { - rank: 5, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 75, - max_rank: 91, - }, - ), - ], - ), - ( - 0, - [ - Fid( - Rank { - rank: 1, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 91, - max_rank: 91, - }, - ), - ], - ), -] diff --git a/milli/src/snapshots/index.rs/initial_field_distribution/1/field_distribution.snap b/milli/src/snapshots/index.rs/initial_field_distribution/1/field_distribution.snap deleted file mode 100644 index 1d1d629e6..000000000 --- a/milli/src/snapshots/index.rs/initial_field_distribution/1/field_distribution.snap +++ /dev/null @@ -1,7 +0,0 @@ ---- -source: milli/src/index.rs ---- -age 1 | -id 2 | -name 2 | - diff --git a/milli/src/snapshots/index.rs/initial_field_distribution/field_distribution.snap b/milli/src/snapshots/index.rs/initial_field_distribution/field_distribution.snap deleted file mode 100644 index 1d1d629e6..000000000 --- a/milli/src/snapshots/index.rs/initial_field_distribution/field_distribution.snap +++ /dev/null @@ -1,7 +0,0 @@ ---- -source: milli/src/index.rs ---- -age 1 | -id 2 | -name 2 | - diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index 3eb761bce..1e56bec83 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -8,7 +8,6 @@ use std::sync::Arc; use bytemuck::cast_slice; use grenad::Writer; -use itertools::EitherOrBoth; use ordered_float::OrderedFloat; use roaring::RoaringBitmap; use serde_json::Value; @@ -50,7 +49,7 @@ enum VectorStateDelta { // Note: changing the value of the manually specified vector **should not record** this delta WasGeneratedNowManual(Vec>), - ManualDelta(Vec>, Vec>), + ManualDelta(Vec>), // Add the vector computed from the specified prompt // Remove any previous vector @@ -59,14 +58,12 @@ enum VectorStateDelta { } impl VectorStateDelta { - fn into_values(self) -> (bool, String, (Vec>, Vec>)) { + fn into_values(self) -> (bool, String, Vec>) { match self { VectorStateDelta::NoChange => Default::default(), VectorStateDelta::NowRemoved => (true, Default::default(), Default::default()), - VectorStateDelta::WasGeneratedNowManual(add) => { - (true, Default::default(), (Default::default(), add)) - } - VectorStateDelta::ManualDelta(del, add) => (false, Default::default(), (del, add)), + VectorStateDelta::WasGeneratedNowManual(add) => (true, Default::default(), add), + VectorStateDelta::ManualDelta(add) => (false, Default::default(), add), VectorStateDelta::NowGenerated(prompt) => (true, prompt, Default::default()), } } @@ -166,8 +163,14 @@ pub fn extract_vector_points( // lazily get it when needed let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() }; - let mut parsed_vectors = ParsedVectorsDiff::new(obkv, old_vectors_fid, new_vectors_fid) - .map_err(|error| error.to_crate_error(document_id().to_string()))?; + let mut parsed_vectors = ParsedVectorsDiff::new( + docid, + embedders_configs, + obkv, + old_vectors_fid, + new_vectors_fid, + ) + .map_err(|error| error.to_crate_error(document_id().to_string()))?; for EmbedderVectorExtractor { embedder_name, @@ -182,7 +185,7 @@ pub fn extract_vector_points( { let delta = match parsed_vectors.remove(embedder_name) { (Some(old), Some(new)) => { - match (old.is_user_provided(), new.is_user_provided()) { + match (old.map_or(true, |old| old.is_user_provided()), new.is_user_provided()) { (true, true) | (false, false) => (), (true, false) => { remove_from_user_defined.insert(docid); @@ -193,7 +196,6 @@ pub fn extract_vector_points( } // no autogeneration - let del_vectors = old.into_array_of_vectors(); let add_vectors = new.into_array_of_vectors(); if add_vectors.len() > usize::from(u8::MAX) { @@ -203,15 +205,15 @@ pub fn extract_vector_points( ))); } - VectorStateDelta::ManualDelta(del_vectors, add_vectors) + VectorStateDelta::ManualDelta(add_vectors) } - (Some(_old), None) => { + (Some(old), None) => { // Do we keep this document? let document_is_kept = obkv .iter() .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) .any(|deladd| deladd.get(DelAdd::Addition).is_some()); - if document_is_kept { + if document_is_kept && old.is_some() { remove_from_user_defined.insert(docid); // becomes autogenerated VectorStateDelta::NowGenerated(prompt.render( @@ -219,6 +221,8 @@ pub fn extract_vector_points( DelAdd::Addition, new_fields_ids_map, )?) + } else if document_is_kept && old.is_none() { + VectorStateDelta::NoChange } else { VectorStateDelta::NowRemoved } @@ -315,8 +319,8 @@ pub fn extract_vector_points( Ok(results) } -/// Computes the diff between both Del and Add numbers and -/// only inserts the parts that differ in the sorter. +/// We cannot compute the diff between both Del and Add vectors. +/// We'll push every vector and compute the difference later in TypedChunk. fn push_vectors_diff( remove_vectors_writer: &mut Writer>, prompts_writer: &mut Writer>, @@ -325,7 +329,7 @@ fn push_vectors_diff( delta: VectorStateDelta, reindex_vectors: bool, ) -> Result<()> { - let (must_remove, prompt, (mut del_vectors, mut add_vectors)) = delta.into_values(); + let (must_remove, prompt, mut add_vectors) = delta.into_values(); if must_remove // TODO: the below condition works because we erase the vec database when a embedding setting changes. // When vector pipeline will be optimized, this should be removed. @@ -340,44 +344,25 @@ fn push_vectors_diff( } // We sort and dedup the vectors - del_vectors.sort_unstable_by(|a, b| compare_vectors(a, b)); add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b)); - del_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq()); add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq()); - let merged_vectors_iter = - itertools::merge_join_by(del_vectors, add_vectors, |del, add| compare_vectors(del, add)); + // let merged_vectors_iter = + // itertools::merge_join_by(del_vectors, add_vectors, |del, add| compare_vectors(del, add)); // insert vectors into the writer - for (i, eob) in merged_vectors_iter.into_iter().enumerate().take(u16::MAX as usize) { + for (i, vector) in add_vectors.into_iter().enumerate().take(u16::MAX as usize) { // Generate the key by extending the unique index to it. key_buffer.truncate(TRUNCATE_SIZE); let index = u16::try_from(i).unwrap(); key_buffer.extend_from_slice(&index.to_be_bytes()); - match eob { - EitherOrBoth::Both(_, _) => (), // no need to touch anything - EitherOrBoth::Left(vector) => { - // TODO: the below condition works because we erase the vec database when a embedding setting changes. - // When vector pipeline will be optimized, this should be removed. - if !reindex_vectors { - // We insert only the Del part of the Obkv to inform - // that we only want to remove all those vectors. - let mut obkv = KvWriterDelAdd::memory(); - obkv.insert(DelAdd::Deletion, cast_slice(&vector))?; - let bytes = obkv.into_inner()?; - manual_vectors_writer.insert(&key_buffer, bytes)?; - } - } - EitherOrBoth::Right(vector) => { - // We insert only the Add part of the Obkv to inform - // that we only want to remove all those vectors. - let mut obkv = KvWriterDelAdd::memory(); - obkv.insert(DelAdd::Addition, cast_slice(&vector))?; - let bytes = obkv.into_inner()?; - manual_vectors_writer.insert(&key_buffer, bytes)?; - } - } + // We insert only the Add part of the Obkv to inform + // that we only want to remove all those vectors. + let mut obkv = KvWriterDelAdd::memory(); + obkv.insert(DelAdd::Addition, cast_slice(&vector))?; + let bytes = obkv.into_inner()?; + manual_vectors_writer.insert(&key_buffer, bytes)?; } Ok(()) diff --git a/milli/src/vector/parsed_vectors.rs b/milli/src/vector/parsed_vectors.rs index 62c418149..672e27cc5 100644 --- a/milli/src/vector/parsed_vectors.rs +++ b/milli/src/vector/parsed_vectors.rs @@ -4,8 +4,9 @@ use obkv::KvReader; use serde_json::{from_slice, Value}; use super::Embedding; +use crate::index::IndexEmbeddingConfig; use crate::update::del_add::{DelAdd, KvReaderDelAdd}; -use crate::{FieldId, InternalError, UserError}; +use crate::{DocumentId, FieldId, InternalError, UserError}; pub const RESERVED_VECTORS_FIELD_NAME: &str = "_vectors"; @@ -42,17 +43,19 @@ pub struct ExplicitVectors { } pub struct ParsedVectorsDiff { - pub old: Option>, + pub old: BTreeMap>, pub new: Option>, } impl ParsedVectorsDiff { pub fn new( + docid: DocumentId, + embedders_configs: &[IndexEmbeddingConfig], documents_diff: KvReader<'_, FieldId>, old_vectors_fid: Option, new_vectors_fid: Option, ) -> Result { - let old = match old_vectors_fid + let mut old = match old_vectors_fid .and_then(|vectors_fid| documents_diff.get(vectors_fid)) .map(KvReaderDelAdd::new) .map(|obkv| to_vector_map(obkv, DelAdd::Deletion)) @@ -68,7 +71,13 @@ impl ParsedVectorsDiff { return Err(error); } } - .flatten(); + .flatten().map_or(BTreeMap::default(), |del| del.into_iter().map(|(name, vec)| (name, Some(vec))).collect()); + for embedding_config in embedders_configs { + if embedding_config.user_defined.contains(docid) { + old.entry(embedding_config.name.to_string()).or_insert(None); + } + } + let new = new_vectors_fid .and_then(|vectors_fid| documents_diff.get(vectors_fid)) .map(KvReaderDelAdd::new) @@ -78,8 +87,9 @@ impl ParsedVectorsDiff { Ok(Self { old, new }) } - pub fn remove(&mut self, embedder_name: &str) -> (Option, Option) { - let old = self.old.as_mut().and_then(|old| old.remove(embedder_name)); + /// Return (Some(None), _) in case the vector is user defined and contained in the database. + pub fn remove(&mut self, embedder_name: &str) -> (Option>, Option) { + let old = self.old.remove(embedder_name); let new = self.new.as_mut().and_then(|new| new.remove(embedder_name)); (old, new) }