always push the user defined vectors in arroy

This commit is contained in:
Tamo 2024-06-03 16:04:14 +02:00
parent a73ccc78a6
commit 5d50850e12
15 changed files with 189 additions and 450 deletions

View File

@ -5173,8 +5173,8 @@ mod tests {
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "Intel to kefir"); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "Intel to kefir");
println!("HEEEEERE"); println!("HEEEEERE");
// handle.advance_one_successful_batch(); handle.advance_one_successful_batch();
handle.advance_one_failed_batch(); // handle.advance_one_failed_batch();
snapshot!(snapshot_index_scheduler(&index_scheduler), name: "Intel to kefir succeeds"); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "Intel to kefir succeeds");
{ {
@ -5351,9 +5351,9 @@ mod tests {
// as user provided since we explicitly marked it as NOT user provided. // as user provided since we explicitly marked it as NOT user provided.
snapshot!(format!("{conf:#?}"), @r###" snapshot!(format!("{conf:#?}"), @r###"
[ [
( IndexEmbeddingConfig {
"my_doggo_embedder", name: "my_doggo_embedder",
EmbeddingConfig { config: EmbeddingConfig {
embedder_options: HuggingFace( embedder_options: HuggingFace(
EmbedderOptions { EmbedderOptions {
model: "sentence-transformers/all-MiniLM-L6-v2", model: "sentence-transformers/all-MiniLM-L6-v2",
@ -5367,8 +5367,8 @@ mod tests {
template: "{{doc.doggo}}", template: "{{doc.doggo}}",
}, },
}, },
RoaringBitmap<[1, 2]>, user_defined: RoaringBitmap<[1, 2]>,
), },
] ]
"###); "###);
let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap(); let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap();

View File

@ -6,10 +6,6 @@ expression: doc
"doggo": "Intel", "doggo": "Intel",
"breed": "beagle", "breed": "beagle",
"_vectors": { "_vectors": {
"A_fakerest": {
"embeddings": "[vector]",
"userProvided": true
},
"noise": [ "noise": [
0.1, 0.1,
0.2, 0.2,

View File

@ -6,10 +6,6 @@ expression: doc
"doggo": "kefir", "doggo": "kefir",
"breed": "patou", "breed": "patou",
"_vectors": { "_vectors": {
"A_fakerest": {
"embeddings": "[vector]",
"userProvided": true
},
"noise": [ "noise": [
0.1, 0.1,
0.2, 0.2,

View File

@ -1,4 +0,0 @@
---
source: index-scheduler/src/lib.rs
---
[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"unknown embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"unknown embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel"},{"id":4,"doggo":"sora"}]

View File

@ -204,7 +204,7 @@ async fn distribution_shift() {
let server = Server::new().await; let server = Server::new().await;
let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await; let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await;
let search = json!({"q": "Captain", "vector": [1.0, 1.0], "showRankingScore": true, "hybrid": {"semanticRatio": 1.0}}); let search = json!({"q": "Captain", "vector": [1.0, 1.0], "showRankingScore": true, "hybrid": {"semanticRatio": 1.0}, "retrieveVectors": true});
let (response, code) = index.search_post(search.clone()).await; let (response, code) = index.search_post(search.clone()).await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###);
@ -239,20 +239,23 @@ async fn highlighter() {
let (response, code) = index let (response, code) = index
.search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0], .search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0],
"hybrid": {"semanticRatio": 0.2}, "hybrid": {"semanticRatio": 0.2},
"retrieveVectors": true,
"attributesToHighlight": [ "attributesToHighlight": [
"desc" "desc",
"_vectors",
], ],
"highlightPreTag": "**BEGIN**", "highlightPreTag": "**BEGIN**",
"highlightPostTag": "**END**" "highlightPostTag": "**END**",
})) }))
.await; .await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}}},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}}}]"###); snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"}},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"}}]"###);
snapshot!(response["semanticHitCount"], @"0"); snapshot!(response["semanticHitCount"], @"0");
let (response, code) = index let (response, code) = index
.search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0], .search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0],
"hybrid": {"semanticRatio": 0.8}, "hybrid": {"semanticRatio": 0.8},
"retrieveVectors": true,
"showRankingScore": true, "showRankingScore": true,
"attributesToHighlight": [ "attributesToHighlight": [
"desc" "desc"
@ -262,13 +265,14 @@ async fn highlighter() {
})) }))
.await; .await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###);
snapshot!(response["semanticHitCount"], @"3"); snapshot!(response["semanticHitCount"], @"3");
// no highlighting on full semantic // no highlighting on full semantic
let (response, code) = index let (response, code) = index
.search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0], .search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0],
"hybrid": {"semanticRatio": 1.0}, "hybrid": {"semanticRatio": 1.0},
"retrieveVectors": true,
"showRankingScore": true, "showRankingScore": true,
"attributesToHighlight": [ "attributesToHighlight": [
"desc" "desc"
@ -278,7 +282,7 @@ async fn highlighter() {
})) }))
.await; .await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###);
snapshot!(response["semanticHitCount"], @"3"); snapshot!(response["semanticHitCount"], @"3");
} }
@ -361,7 +365,7 @@ async fn single_document() {
let (response, code) = index let (response, code) = index
.search_post( .search_post(
json!({"vector": [1.0, 3.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true}), json!({"vector": [1.0, 3.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true}),
) )
.await; .await;
@ -377,7 +381,7 @@ async fn query_combination() {
// search without query and vector, but with hybrid => still placeholder // search without query and vector, but with hybrid => still placeholder
let (response, code) = index let (response, code) = index
.search_post(json!({"hybrid": {"semanticRatio": 1.0}, "showRankingScore": true})) .search_post(json!({"hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true}))
.await; .await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
@ -386,7 +390,7 @@ async fn query_combination() {
// same with a different semantic ratio // same with a different semantic ratio
let (response, code) = index let (response, code) = index
.search_post(json!({"hybrid": {"semanticRatio": 0.76}, "showRankingScore": true})) .search_post(json!({"hybrid": {"semanticRatio": 0.76}, "showRankingScore": true, "retrieveVectors": true}))
.await; .await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
@ -395,7 +399,7 @@ async fn query_combination() {
// wrong vector dimensions // wrong vector dimensions
let (response, code) = index let (response, code) = index
.search_post(json!({"vector": [1.0, 0.0, 1.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true})) .search_post(json!({"vector": [1.0, 0.0, 1.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true}))
.await; .await;
snapshot!(code, @"400 Bad Request"); snapshot!(code, @"400 Bad Request");
@ -410,7 +414,7 @@ async fn query_combination() {
// full vector // full vector
let (response, code) = index let (response, code) = index
.search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true})) .search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true}))
.await; .await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
@ -419,7 +423,7 @@ async fn query_combination() {
// full keyword, without a query // full keyword, without a query
let (response, code) = index let (response, code) = index
.search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true})) .search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true, "retrieveVectors": true}))
.await; .await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
@ -428,7 +432,7 @@ async fn query_combination() {
// query + vector, full keyword => keyword // query + vector, full keyword => keyword
let (response, code) = index let (response, code) = index
.search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true})) .search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true, "retrieveVectors": true}))
.await; .await;
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
@ -437,7 +441,7 @@ async fn query_combination() {
// query + vector, no hybrid keyword => // query + vector, no hybrid keyword =>
let (response, code) = index let (response, code) = index
.search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "showRankingScore": true})) .search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "showRankingScore": true, "retrieveVectors": true}))
.await; .await;
snapshot!(code, @"400 Bad Request"); snapshot!(code, @"400 Bad Request");
@ -453,7 +457,7 @@ async fn query_combination() {
// full vector, without a vector => error // full vector, without a vector => error
let (response, code) = index let (response, code) = index
.search_post( .search_post(
json!({"q": "Captain", "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true}), json!({"q": "Captain", "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true}),
) )
.await; .await;
@ -470,7 +474,7 @@ async fn query_combination() {
// hybrid without a vector => full keyword // hybrid without a vector => full keyword
let (response, code) = index let (response, code) = index
.search_post( .search_post(
json!({"q": "Planet", "hybrid": {"semanticRatio": 0.99}, "showRankingScore": true}), json!({"q": "Planet", "hybrid": {"semanticRatio": 0.99}, "showRankingScore": true, "retrieveVectors": true}),
) )
.await; .await;

View File

@ -1337,6 +1337,7 @@ async fn experimental_feature_vector_store() {
.search_post(json!({ .search_post(json!({
"vector": [1.0, 2.0, 3.0], "vector": [1.0, 2.0, 3.0],
"showRankingScore": true, "showRankingScore": true,
"retrieveVectors": true,
})) }))
.await; .await;

View File

@ -78,7 +78,7 @@ async fn basic() {
index.wait_task(value.uid()).await; index.wait_task(value.uid()).await;
index index
.similar(json!({"id": 143}), |response, code| { .similar(json!({"id": 143, "retrieveVectors": true}), |response, code| {
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###" snapshot!(json_string!(response["hits"]), @r###"
[ [
@ -88,9 +88,9 @@ async fn basic() {
"id": "522681", "id": "522681",
"_vectors": { "_vectors": {
"manual": [ "manual": [
0.1, 0.10000000149011612,
0.6, 0.6000000238418579,
0.8 0.800000011920929
] ]
} }
}, },
@ -100,9 +100,9 @@ async fn basic() {
"id": "299537", "id": "299537",
"_vectors": { "_vectors": {
"manual": [ "manual": [
0.6, 0.6000000238418579,
0.8, 0.800000011920929,
-0.2 -0.20000000298023224
] ]
} }
}, },
@ -112,9 +112,9 @@ async fn basic() {
"id": "166428", "id": "166428",
"_vectors": { "_vectors": {
"manual": [ "manual": [
0.7, 0.699999988079071,
0.7, 0.699999988079071,
-0.4 -0.4000000059604645
] ]
} }
}, },
@ -124,8 +124,8 @@ async fn basic() {
"id": "287947", "id": "287947",
"_vectors": { "_vectors": {
"manual": [ "manual": [
0.8, 0.800000011920929,
0.4, 0.4000000059604645,
-0.5 -0.5
] ]
} }
@ -136,7 +136,7 @@ async fn basic() {
.await; .await;
index index
.similar(json!({"id": "299537"}), |response, code| { .similar(json!({"id": "299537", "retrieveVectors": true}), |response, code| {
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###" snapshot!(json_string!(response["hits"]), @r###"
[ [
@ -146,9 +146,9 @@ async fn basic() {
"id": "166428", "id": "166428",
"_vectors": { "_vectors": {
"manual": [ "manual": [
0.7, 0.699999988079071,
0.7, 0.699999988079071,
-0.4 -0.4000000059604645
] ]
} }
}, },
@ -158,8 +158,8 @@ async fn basic() {
"id": "287947", "id": "287947",
"_vectors": { "_vectors": {
"manual": [ "manual": [
0.8, 0.800000011920929,
0.4, 0.4000000059604645,
-0.5 -0.5
] ]
} }
@ -170,9 +170,9 @@ async fn basic() {
"id": "522681", "id": "522681",
"_vectors": { "_vectors": {
"manual": [ "manual": [
0.1, 0.10000000149011612,
0.6, 0.6000000238418579,
0.8 0.800000011920929
] ]
} }
}, },
@ -183,8 +183,8 @@ async fn basic() {
"_vectors": { "_vectors": {
"manual": [ "manual": [
-0.5, -0.5,
0.3, 0.30000001192092896,
0.85 0.8500000238418579
] ]
} }
} }
@ -456,7 +456,9 @@ async fn filter() {
index.wait_task(value.uid()).await; index.wait_task(value.uid()).await;
index index
.similar(json!({"id": 522681, "filter": "release_year = 2019"}), |response, code| { .similar(
json!({"id": 522681, "filter": "release_year = 2019", "retrieveVectors": true}),
|response, code| {
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###" snapshot!(json_string!(response["hits"]), @r###"
[ [
@ -466,9 +468,9 @@ async fn filter() {
"id": "299537", "id": "299537",
"_vectors": { "_vectors": {
"manual": [ "manual": [
0.6, 0.6000000238418579,
0.8, 0.800000011920929,
-0.2 -0.20000000298023224
] ]
} }
}, },
@ -478,9 +480,9 @@ async fn filter() {
"id": "166428", "id": "166428",
"_vectors": { "_vectors": {
"manual": [ "manual": [
0.7, 0.699999988079071,
0.7, 0.699999988079071,
-0.4 -0.4000000059604645
] ]
} }
}, },
@ -490,19 +492,22 @@ async fn filter() {
"id": "287947", "id": "287947",
"_vectors": { "_vectors": {
"manual": [ "manual": [
0.8, 0.800000011920929,
0.4, 0.4000000059604645,
-0.5 -0.5
] ]
} }
} }
] ]
"###); "###);
}) },
)
.await; .await;
index index
.similar(json!({"id": 522681, "filter": "release_year < 2000"}), |response, code| { .similar(
json!({"id": 522681, "filter": "release_year < 2000", "retrieveVectors": true}),
|response, code| {
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###" snapshot!(json_string!(response["hits"]), @r###"
[ [
@ -513,14 +518,15 @@ async fn filter() {
"_vectors": { "_vectors": {
"manual": [ "manual": [
-0.5, -0.5,
0.3, 0.30000001192092896,
0.85 0.8500000238418579
] ]
} }
} }
] ]
"###); "###);
}) },
)
.await; .await;
} }
@ -579,7 +585,9 @@ async fn limit_and_offset() {
.await; .await;
index index
.similar(json!({"id": 143, "limit": 1, "offset": 1}), |response, code| { .similar(
json!({"id": 143, "limit": 1, "offset": 1, "retrieveVectors": true}),
|response, code| {
snapshot!(code, @"200 OK"); snapshot!(code, @"200 OK");
snapshot!(json_string!(response["hits"]), @r###" snapshot!(json_string!(response["hits"]), @r###"
[ [
@ -589,14 +597,15 @@ async fn limit_and_offset() {
"id": "299537", "id": "299537",
"_vectors": { "_vectors": {
"manual": [ "manual": [
0.6, 0.6000000238418579,
0.8, 0.800000011920929,
-0.2 -0.20000000298023224
] ]
} }
} }
] ]
"###); "###);
}) },
)
.await; .await;
} }

View File

@ -1,244 +0,0 @@
---
source: milli/src/search/new/tests/attribute_fid.rs
expression: "format!(\"{document_ids_scores:#?}\")"
---
[
(
2,
[
Fid(
Rank {
rank: 19,
max_rank: 19,
},
),
Position(
Rank {
rank: 91,
max_rank: 91,
},
),
],
),
(
6,
[
Fid(
Rank {
rank: 15,
max_rank: 19,
},
),
Position(
Rank {
rank: 81,
max_rank: 91,
},
),
],
),
(
5,
[
Fid(
Rank {
rank: 14,
max_rank: 19,
},
),
Position(
Rank {
rank: 79,
max_rank: 91,
},
),
],
),
(
4,
[
Fid(
Rank {
rank: 13,
max_rank: 19,
},
),
Position(
Rank {
rank: 77,
max_rank: 91,
},
),
],
),
(
3,
[
Fid(
Rank {
rank: 12,
max_rank: 19,
},
),
Position(
Rank {
rank: 83,
max_rank: 91,
},
),
],
),
(
9,
[
Fid(
Rank {
rank: 11,
max_rank: 19,
},
),
Position(
Rank {
rank: 75,
max_rank: 91,
},
),
],
),
(
8,
[
Fid(
Rank {
rank: 10,
max_rank: 19,
},
),
Position(
Rank {
rank: 79,
max_rank: 91,
},
),
],
),
(
7,
[
Fid(
Rank {
rank: 10,
max_rank: 19,
},
),
Position(
Rank {
rank: 73,
max_rank: 91,
},
),
],
),
(
11,
[
Fid(
Rank {
rank: 7,
max_rank: 19,
},
),
Position(
Rank {
rank: 77,
max_rank: 91,
},
),
],
),
(
10,
[
Fid(
Rank {
rank: 6,
max_rank: 19,
},
),
Position(
Rank {
rank: 81,
max_rank: 91,
},
),
],
),
(
13,
[
Fid(
Rank {
rank: 6,
max_rank: 19,
},
),
Position(
Rank {
rank: 81,
max_rank: 91,
},
),
],
),
(
12,
[
Fid(
Rank {
rank: 6,
max_rank: 19,
},
),
Position(
Rank {
rank: 78,
max_rank: 91,
},
),
],
),
(
14,
[
Fid(
Rank {
rank: 5,
max_rank: 19,
},
),
Position(
Rank {
rank: 75,
max_rank: 91,
},
),
],
),
(
0,
[
Fid(
Rank {
rank: 1,
max_rank: 19,
},
),
Position(
Rank {
rank: 91,
max_rank: 91,
},
),
],
),
]

View File

@ -1,7 +0,0 @@
---
source: milli/src/index.rs
---
age 1 |
id 2 |
name 2 |

View File

@ -1,7 +0,0 @@
---
source: milli/src/index.rs
---
age 1 |
id 2 |
name 2 |

View File

@ -8,7 +8,6 @@ use std::sync::Arc;
use bytemuck::cast_slice; use bytemuck::cast_slice;
use grenad::Writer; use grenad::Writer;
use itertools::EitherOrBoth;
use ordered_float::OrderedFloat; use ordered_float::OrderedFloat;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use serde_json::Value; use serde_json::Value;
@ -50,7 +49,7 @@ enum VectorStateDelta {
// Note: changing the value of the manually specified vector **should not record** this delta // Note: changing the value of the manually specified vector **should not record** this delta
WasGeneratedNowManual(Vec<Vec<f32>>), WasGeneratedNowManual(Vec<Vec<f32>>),
ManualDelta(Vec<Vec<f32>>, Vec<Vec<f32>>), ManualDelta(Vec<Vec<f32>>),
// Add the vector computed from the specified prompt // Add the vector computed from the specified prompt
// Remove any previous vector // Remove any previous vector
@ -59,14 +58,12 @@ enum VectorStateDelta {
} }
impl VectorStateDelta { impl VectorStateDelta {
fn into_values(self) -> (bool, String, (Vec<Vec<f32>>, Vec<Vec<f32>>)) { fn into_values(self) -> (bool, String, Vec<Vec<f32>>) {
match self { match self {
VectorStateDelta::NoChange => Default::default(), VectorStateDelta::NoChange => Default::default(),
VectorStateDelta::NowRemoved => (true, Default::default(), Default::default()), VectorStateDelta::NowRemoved => (true, Default::default(), Default::default()),
VectorStateDelta::WasGeneratedNowManual(add) => { VectorStateDelta::WasGeneratedNowManual(add) => (true, Default::default(), add),
(true, Default::default(), (Default::default(), add)) VectorStateDelta::ManualDelta(add) => (false, Default::default(), add),
}
VectorStateDelta::ManualDelta(del, add) => (false, Default::default(), (del, add)),
VectorStateDelta::NowGenerated(prompt) => (true, prompt, Default::default()), VectorStateDelta::NowGenerated(prompt) => (true, prompt, Default::default()),
} }
} }
@ -166,7 +163,13 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
// lazily get it when needed // lazily get it when needed
let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() }; let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() };
let mut parsed_vectors = ParsedVectorsDiff::new(obkv, old_vectors_fid, new_vectors_fid) let mut parsed_vectors = ParsedVectorsDiff::new(
docid,
embedders_configs,
obkv,
old_vectors_fid,
new_vectors_fid,
)
.map_err(|error| error.to_crate_error(document_id().to_string()))?; .map_err(|error| error.to_crate_error(document_id().to_string()))?;
for EmbedderVectorExtractor { for EmbedderVectorExtractor {
@ -182,7 +185,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
{ {
let delta = match parsed_vectors.remove(embedder_name) { let delta = match parsed_vectors.remove(embedder_name) {
(Some(old), Some(new)) => { (Some(old), Some(new)) => {
match (old.is_user_provided(), new.is_user_provided()) { match (old.map_or(true, |old| old.is_user_provided()), new.is_user_provided()) {
(true, true) | (false, false) => (), (true, true) | (false, false) => (),
(true, false) => { (true, false) => {
remove_from_user_defined.insert(docid); remove_from_user_defined.insert(docid);
@ -193,7 +196,6 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
} }
// no autogeneration // no autogeneration
let del_vectors = old.into_array_of_vectors();
let add_vectors = new.into_array_of_vectors(); let add_vectors = new.into_array_of_vectors();
if add_vectors.len() > usize::from(u8::MAX) { if add_vectors.len() > usize::from(u8::MAX) {
@ -203,15 +205,15 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
))); )));
} }
VectorStateDelta::ManualDelta(del_vectors, add_vectors) VectorStateDelta::ManualDelta(add_vectors)
} }
(Some(_old), None) => { (Some(old), None) => {
// Do we keep this document? // Do we keep this document?
let document_is_kept = obkv let document_is_kept = obkv
.iter() .iter()
.map(|(_, deladd)| KvReaderDelAdd::new(deladd)) .map(|(_, deladd)| KvReaderDelAdd::new(deladd))
.any(|deladd| deladd.get(DelAdd::Addition).is_some()); .any(|deladd| deladd.get(DelAdd::Addition).is_some());
if document_is_kept { if document_is_kept && old.is_some() {
remove_from_user_defined.insert(docid); remove_from_user_defined.insert(docid);
// becomes autogenerated // becomes autogenerated
VectorStateDelta::NowGenerated(prompt.render( VectorStateDelta::NowGenerated(prompt.render(
@ -219,6 +221,8 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
DelAdd::Addition, DelAdd::Addition,
new_fields_ids_map, new_fields_ids_map,
)?) )?)
} else if document_is_kept && old.is_none() {
VectorStateDelta::NoChange
} else { } else {
VectorStateDelta::NowRemoved VectorStateDelta::NowRemoved
} }
@ -315,8 +319,8 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
Ok(results) Ok(results)
} }
/// Computes the diff between both Del and Add numbers and /// We cannot compute the diff between both Del and Add vectors.
/// only inserts the parts that differ in the sorter. /// We'll push every vector and compute the difference later in TypedChunk.
fn push_vectors_diff( fn push_vectors_diff(
remove_vectors_writer: &mut Writer<BufWriter<File>>, remove_vectors_writer: &mut Writer<BufWriter<File>>,
prompts_writer: &mut Writer<BufWriter<File>>, prompts_writer: &mut Writer<BufWriter<File>>,
@ -325,7 +329,7 @@ fn push_vectors_diff(
delta: VectorStateDelta, delta: VectorStateDelta,
reindex_vectors: bool, reindex_vectors: bool,
) -> Result<()> { ) -> Result<()> {
let (must_remove, prompt, (mut del_vectors, mut add_vectors)) = delta.into_values(); let (must_remove, prompt, mut add_vectors) = delta.into_values();
if must_remove if must_remove
// TODO: the below condition works because we erase the vec database when an embedding setting changes. // TODO: the below condition works because we erase the vec database when an embedding setting changes.
// When vector pipeline will be optimized, this should be removed. // When vector pipeline will be optimized, this should be removed.
@ -340,36 +344,19 @@ fn push_vectors_diff(
} }
// We sort and dedup the vectors // We sort and dedup the vectors
del_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b)); add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
del_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq());
add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq()); add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq());
let merged_vectors_iter = // let merged_vectors_iter =
itertools::merge_join_by(del_vectors, add_vectors, |del, add| compare_vectors(del, add)); // itertools::merge_join_by(del_vectors, add_vectors, |del, add| compare_vectors(del, add));
// insert vectors into the writer // insert vectors into the writer
for (i, eob) in merged_vectors_iter.into_iter().enumerate().take(u16::MAX as usize) { for (i, vector) in add_vectors.into_iter().enumerate().take(u16::MAX as usize) {
// Generate the key by extending the unique index to it. // Generate the key by extending the unique index to it.
key_buffer.truncate(TRUNCATE_SIZE); key_buffer.truncate(TRUNCATE_SIZE);
let index = u16::try_from(i).unwrap(); let index = u16::try_from(i).unwrap();
key_buffer.extend_from_slice(&index.to_be_bytes()); key_buffer.extend_from_slice(&index.to_be_bytes());
match eob {
EitherOrBoth::Both(_, _) => (), // no need to touch anything
EitherOrBoth::Left(vector) => {
// TODO: the below condition works because we erase the vec database when a embedding setting changes.
// When vector pipeline will be optimized, this should be removed.
if !reindex_vectors {
// We insert only the Del part of the Obkv to inform
// that we only want to remove all those vectors.
let mut obkv = KvWriterDelAdd::memory();
obkv.insert(DelAdd::Deletion, cast_slice(&vector))?;
let bytes = obkv.into_inner()?;
manual_vectors_writer.insert(&key_buffer, bytes)?;
}
}
EitherOrBoth::Right(vector) => {
// We insert only the Add part of the Obkv to inform // We insert only the Add part of the Obkv to inform
// that we only want to add all those vectors. // that we only want to add all those vectors.
let mut obkv = KvWriterDelAdd::memory(); let mut obkv = KvWriterDelAdd::memory();
@ -377,8 +364,6 @@ fn push_vectors_diff(
let bytes = obkv.into_inner()?; let bytes = obkv.into_inner()?;
manual_vectors_writer.insert(&key_buffer, bytes)?; manual_vectors_writer.insert(&key_buffer, bytes)?;
} }
}
}
Ok(()) Ok(())
} }

View File

@ -4,8 +4,9 @@ use obkv::KvReader;
use serde_json::{from_slice, Value}; use serde_json::{from_slice, Value};
use super::Embedding; use super::Embedding;
use crate::index::IndexEmbeddingConfig;
use crate::update::del_add::{DelAdd, KvReaderDelAdd}; use crate::update::del_add::{DelAdd, KvReaderDelAdd};
use crate::{FieldId, InternalError, UserError}; use crate::{DocumentId, FieldId, InternalError, UserError};
pub const RESERVED_VECTORS_FIELD_NAME: &str = "_vectors"; pub const RESERVED_VECTORS_FIELD_NAME: &str = "_vectors";
@ -42,17 +43,19 @@ pub struct ExplicitVectors {
} }
pub struct ParsedVectorsDiff { pub struct ParsedVectorsDiff {
pub old: Option<BTreeMap<String, Vectors>>, pub old: BTreeMap<String, Option<Vectors>>,
pub new: Option<BTreeMap<String, Vectors>>, pub new: Option<BTreeMap<String, Vectors>>,
} }
impl ParsedVectorsDiff { impl ParsedVectorsDiff {
pub fn new( pub fn new(
docid: DocumentId,
embedders_configs: &[IndexEmbeddingConfig],
documents_diff: KvReader<'_, FieldId>, documents_diff: KvReader<'_, FieldId>,
old_vectors_fid: Option<FieldId>, old_vectors_fid: Option<FieldId>,
new_vectors_fid: Option<FieldId>, new_vectors_fid: Option<FieldId>,
) -> Result<Self, Error> { ) -> Result<Self, Error> {
let old = match old_vectors_fid let mut old = match old_vectors_fid
.and_then(|vectors_fid| documents_diff.get(vectors_fid)) .and_then(|vectors_fid| documents_diff.get(vectors_fid))
.map(KvReaderDelAdd::new) .map(KvReaderDelAdd::new)
.map(|obkv| to_vector_map(obkv, DelAdd::Deletion)) .map(|obkv| to_vector_map(obkv, DelAdd::Deletion))
@ -68,7 +71,13 @@ impl ParsedVectorsDiff {
return Err(error); return Err(error);
} }
} }
.flatten(); .flatten().map_or(BTreeMap::default(), |del| del.into_iter().map(|(name, vec)| (name, Some(vec))).collect());
for embedding_config in embedders_configs {
if embedding_config.user_defined.contains(docid) {
old.entry(embedding_config.name.to_string()).or_insert(None);
}
}
let new = new_vectors_fid let new = new_vectors_fid
.and_then(|vectors_fid| documents_diff.get(vectors_fid)) .and_then(|vectors_fid| documents_diff.get(vectors_fid))
.map(KvReaderDelAdd::new) .map(KvReaderDelAdd::new)
@ -78,8 +87,9 @@ impl ParsedVectorsDiff {
Ok(Self { old, new }) Ok(Self { old, new })
} }
pub fn remove(&mut self, embedder_name: &str) -> (Option<Vectors>, Option<Vectors>) { /// Return (Some(None), _) in case the vector is user defined and contained in the database.
let old = self.old.as_mut().and_then(|old| old.remove(embedder_name)); pub fn remove(&mut self, embedder_name: &str) -> (Option<Option<Vectors>>, Option<Vectors>) {
let old = self.old.remove(embedder_name);
let new = self.new.as_mut().and_then(|new| new.remove(embedder_name)); let new = self.new.as_mut().and_then(|new| new.remove(embedder_name));
(old, new) (old, new)
} }